diff --git a/.bazelrc b/.bazelrc index 4ffb73bd4..37cdc3d88 100644 --- a/.bazelrc +++ b/.bazelrc @@ -19,6 +19,9 @@ build --incompatible_depset_is_not_iterable=false # Sets the default Apple platform to macOS. build --apple_platform_type=macos +# Allow debugging with XCODE +build --apple_generate_dsym + # Android configs. build:android --crosstool_top=//external:android/crosstool build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain diff --git a/Dockerfile b/Dockerfile index 8adcd6c24..ef7b4d43a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,6 +35,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libopencv-highgui-dev \ libopencv-imgproc-dev \ libopencv-video-dev \ + libopencv-calib3d-dev \ + libopencv-features2d-dev \ software-properties-common && \ add-apt-repository -y ppa:openjdk-r/ppa && \ apt-get update && apt-get install -y openjdk-8-jdk && \ diff --git a/README.md b/README.md index f5a0bceef..1b6385a89 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,13 @@ * [Face Detection](mediapipe/docs/face_detection_mobile_gpu.md) * [Hair Segmentation](mediapipe/docs/hair_segmentation_mobile_gpu.md) * [Object Detection](mediapipe/docs/object_detection_mobile_gpu.md) +* [Object Detection and Tracking](mediapipe/docs/object_tracking_mobile_gpu.md) ![hand_tracking](mediapipe/docs/images/mobile/hand_tracking_3d_android_gpu_small.gif) ![multi-hand_tracking](mediapipe/docs/images/mobile/multi_hand_tracking_android_gpu_small.gif) ![face_detection](mediapipe/docs/images/mobile/face_detection_android_gpu_small.gif) ![hair_segmentation](mediapipe/docs/images/mobile/hair_segmentation_android_gpu_small.gif) -![object_detection](mediapipe/docs/images/mobile/object_detection_android_gpu_small.gif) +![object_tracking](mediapipe/docs/images/mobile/object_tracking_android_gpu_small.gif) ## Installation Follow these [instructions](mediapipe/docs/install.md). 
diff --git a/WORKSPACE b/WORKSPACE index c09ca03a5..35d1a7e36 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -116,6 +116,13 @@ http_archive( "https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT, "https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT, ], + # Patch https://github.com/tensorflow/tensorflow/commit/e3a7bdbebb99352351a19e2e403136166aa52934 + patches = [ + "@//third_party:org_tensorflow_e3a7bdbebb99352351a19e2e403136166aa52934.diff" + ], + patch_args = [ + "-p1", + ], strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT, sha256 = _TENSORFLOW_SHA256, ) @@ -123,8 +130,22 @@ http_archive( load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace") tf_workspace(tf_repo_name = "org_tensorflow") +http_archive( + name = "ceres_solver", + url = "https://github.com/ceres-solver/ceres-solver/archive/1.14.0.zip", + patches = [ + "@//third_party:ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff" + ], + patch_args = [ + "-p1", + ], + strip_prefix = "ceres-solver-1.14.0", + sha256 = "5ba6d0db4e784621fda44a50c58bb23b0892684692f0c623e2063f9c19f192f1" +) + # Please run # $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \ +# libopencv-calib3d-dev libopencv-features2d-dev \ # libopencv-imgproc-dev libopencv-video-dev new_local_repository( name = "linux_opencv", @@ -292,3 +313,4 @@ http_archive( strip_prefix = "google-toolbox-for-mac-2.2.1", build_file = "@//third_party:google_toolbox_for_mac.BUILD", ) + diff --git a/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen b/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen index 51a68cc8e..a283b56fc 100644 --- a/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen +++ b/mediapipe/MediaPipe.tulsiproj/Configs/MediaPipe.tulsigen @@ -9,6 +9,7 @@ "mediapipe/examples/ios/facedetectiongpu/BUILD", "mediapipe/examples/ios/handdetectiongpu/BUILD", "mediapipe/examples/ios/handtrackinggpu/BUILD", + 
"mediapipe/examples/ios/multihandtrackinggpu/BUILD", "mediapipe/examples/ios/objectdetectioncpu/BUILD", "mediapipe/examples/ios/objectdetectiongpu/BUILD" ], @@ -18,6 +19,7 @@ "//mediapipe/examples/ios/facedetectiongpu:FaceDetectionGpuApp", "//mediapipe/examples/ios/handdetectiongpu:HandDetectionGpuApp", "//mediapipe/examples/ios/handtrackinggpu:HandTrackingGpuApp", + "//mediapipe/examples/ios/multihandtrackinggpu:MultiHandTrackingGpuApp", "//mediapipe/examples/ios/objectdetectioncpu:ObjectDetectionCpuApp", "//mediapipe/examples/ios/objectdetectiongpu:ObjectDetectionGpuApp", "//mediapipe/objc:mediapipe_framework_ios" @@ -84,6 +86,8 @@ "mediapipe/examples/ios/handdetectiongpu/Base.lproj", "mediapipe/examples/ios/handtrackinggpu", "mediapipe/examples/ios/handtrackinggpu/Base.lproj", + "mediapipe/examples/ios/multihandtrackinggpu", + "mediapipe/examples/ios/multihandtrackinggpu/Base.lproj", "mediapipe/examples/ios/objectdetectioncpu", "mediapipe/examples/ios/objectdetectioncpu/Base.lproj", "mediapipe/examples/ios/objectdetectiongpu", diff --git a/mediapipe/MediaPipe.tulsiproj/project.tulsiconf b/mediapipe/MediaPipe.tulsiproj/project.tulsiconf index bb40e553a..24d8764f0 100644 --- a/mediapipe/MediaPipe.tulsiproj/project.tulsiconf +++ b/mediapipe/MediaPipe.tulsiproj/project.tulsiconf @@ -16,6 +16,7 @@ "mediapipe/examples/ios/facedetectiongpu", "mediapipe/examples/ios/handdetectiongpu", "mediapipe/examples/ios/handtrackinggpu", + "mediapipe/examples/ios/multihandtrackinggpu", "mediapipe/examples/ios/objectdetectioncpu", "mediapipe/examples/ios/objectdetectiongpu" ], diff --git a/mediapipe/calculators/core/BUILD b/mediapipe/calculators/core/BUILD index 6de3c3828..aa8a94033 100644 --- a/mediapipe/calculators/core/BUILD +++ b/mediapipe/calculators/core/BUILD @@ -165,6 +165,7 @@ cc_library( deps = [ "//mediapipe/framework:calculator_framework", "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:status", ], alwayslink = 1, ) diff --git 
a/mediapipe/calculators/core/add_header_calculator.cc b/mediapipe/calculators/core/add_header_calculator.cc index 341813817..393c12225 100644 --- a/mediapipe/calculators/core/add_header_calculator.cc +++ b/mediapipe/calculators/core/add_header_calculator.cc @@ -13,11 +13,12 @@ // limitations under the License. #include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" #include "mediapipe/framework/port/logging.h" namespace mediapipe { -// Attach the header from one stream to another stream. +// Attach the header from a stream or side input to another stream. // // The header stream (tag HEADER) must not have any packets in it. // @@ -25,17 +26,53 @@ namespace mediapipe { // calculator to not need a header or to accept a separate stream with // a header, that would be more future proof. // +// Example usage 1: +// node { +// calculator: "AddHeaderCalculator" +// input_stream: "DATA:audio" +// input_stream: "HEADER:audio_header" +// output_stream: "audio_with_header" +// } +// +// Example usage 2: +// node { +// calculator: "AddHeaderCalculator" +// input_stream: "DATA:audio" +// input_side_packet: "HEADER:audio_header" +// output_stream: "audio_with_header" +// } +// class AddHeaderCalculator : public CalculatorBase { public: static ::mediapipe::Status GetContract(CalculatorContract* cc) { - cc->Inputs().Tag("HEADER").SetNone(); + bool has_side_input = false; + bool has_header_stream = false; + if (cc->InputSidePackets().HasTag("HEADER")) { + cc->InputSidePackets().Tag("HEADER").SetAny(); + has_side_input = true; + } + if (cc->Inputs().HasTag("HEADER")) { + cc->Inputs().Tag("HEADER").SetNone(); + has_header_stream = true; + } + if (has_side_input == has_header_stream) { + return mediapipe::InvalidArgumentError( + "Header must be provided via exactly one of side input and input " + "stream"); + } cc->Inputs().Tag("DATA").SetAny(); cc->Outputs().Index(0).SetSameAs(&cc->Inputs().Tag("DATA")); return 
::mediapipe::OkStatus(); } ::mediapipe::Status Open(CalculatorContext* cc) override { - const Packet& header = cc->Inputs().Tag("HEADER").Header(); + Packet header; + if (cc->InputSidePackets().HasTag("HEADER")) { + header = cc->InputSidePackets().Tag("HEADER"); + } + if (cc->Inputs().HasTag("HEADER")) { + header = cc->Inputs().Tag("HEADER").Header(); + } if (!header.IsEmpty()) { cc->Outputs().Index(0).SetHeader(header); } diff --git a/mediapipe/calculators/core/add_header_calculator_test.cc b/mediapipe/calculators/core/add_header_calculator_test.cc index b67cf04dc..01ea986f1 100644 --- a/mediapipe/calculators/core/add_header_calculator_test.cc +++ b/mediapipe/calculators/core/add_header_calculator_test.cc @@ -14,8 +14,10 @@ #include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/calculator_runner.h" +#include "mediapipe/framework/port/canonical_errors.h" #include "mediapipe/framework/port/gmock.h" #include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/status.h" #include "mediapipe/framework/port/status_matchers.h" #include "mediapipe/framework/timestamp.h" #include "mediapipe/framework/tool/validate_type.h" @@ -24,7 +26,7 @@ namespace mediapipe { class AddHeaderCalculatorTest : public ::testing::Test {}; -TEST_F(AddHeaderCalculatorTest, Works) { +TEST_F(AddHeaderCalculatorTest, HeaderStream) { CalculatorGraphConfig::Node node; node.set_calculator("AddHeaderCalculator"); node.add_input_stream("HEADER:header_stream"); @@ -96,4 +98,62 @@ TEST_F(AddHeaderCalculatorTest, NoPacketsOnHeaderStream) { ASSERT_FALSE(runner.Run().ok()); } +TEST_F(AddHeaderCalculatorTest, InputSidePacket) { + CalculatorGraphConfig::Node node; + node.set_calculator("AddHeaderCalculator"); + node.add_input_stream("DATA:data_stream"); + node.add_output_stream("merged_stream"); + node.add_input_side_packet("HEADER:header"); + + CalculatorRunner runner(node); + + // Set header and add 5 packets. 
+ runner.MutableSidePackets()->Tag("HEADER") = + Adopt(new std::string("my_header")); + for (int i = 0; i < 5; ++i) { + Packet packet = Adopt(new int(i)).At(Timestamp(i * 1000)); + runner.MutableInputs()->Tag("DATA").packets.push_back(packet); + } + + // Run calculator. + MP_ASSERT_OK(runner.Run()); + + ASSERT_EQ(1, runner.Outputs().NumEntries()); + + // Test output. + EXPECT_EQ(std::string("my_header"), + runner.Outputs().Index(0).header.Get()); + const std::vector& output_packets = runner.Outputs().Index(0).packets; + ASSERT_EQ(5, output_packets.size()); + for (int i = 0; i < 5; ++i) { + const int val = output_packets[i].Get(); + EXPECT_EQ(i, val); + EXPECT_EQ(Timestamp(i * 1000), output_packets[i].Timestamp()); + } +} + +TEST_F(AddHeaderCalculatorTest, UsingBothSideInputAndStream) { + CalculatorGraphConfig::Node node; + node.set_calculator("AddHeaderCalculator"); + node.add_input_stream("HEADER:header_stream"); + node.add_input_stream("DATA:data_stream"); + node.add_output_stream("merged_stream"); + node.add_input_side_packet("HEADER:header"); + + CalculatorRunner runner(node); + + // Set both the side packet and the stream header, then add 5 packets. + runner.MutableSidePackets()->Tag("HEADER") = + Adopt(new std::string("my_header")); + runner.MutableInputs()->Tag("HEADER").header = + Adopt(new std::string("my_header")); + for (int i = 0; i < 5; ++i) { + Packet packet = Adopt(new int(i)).At(Timestamp(i * 1000)); + runner.MutableInputs()->Tag("DATA").packets.push_back(packet); + } + + // Run should fail because header can only be provided one way. 
+ EXPECT_EQ(runner.Run().code(), ::mediapipe::InvalidArgumentError("").code()); +} + } // namespace mediapipe diff --git a/mediapipe/calculators/core/packet_resampler_calculator.cc b/mediapipe/calculators/core/packet_resampler_calculator.cc index 2966e4589..e9c12880f 100644 --- a/mediapipe/calculators/core/packet_resampler_calculator.cc +++ b/mediapipe/calculators/core/packet_resampler_calculator.cc @@ -330,22 +330,27 @@ void PacketResamplerCalculator::UpdateNextOutputTimestampWithJitter() { return ::mediapipe::OkStatus(); } - LOG_IF(WARNING, frame_time_usec_ < - (cc->InputTimestamp() - last_packet_.Timestamp()).Value()) - << "Adding jitter is meaningless when upsampling."; - - const int64 curr_diff = - (next_output_timestamp_ - cc->InputTimestamp()).Value(); - const int64 last_diff = - (next_output_timestamp_ - last_packet_.Timestamp()).Value(); - if (curr_diff * last_diff > 0) { - return ::mediapipe::OkStatus(); + if (frame_time_usec_ < + (cc->InputTimestamp() - last_packet_.Timestamp()).Value()) { + LOG_FIRST_N(WARNING, 2) + << "Adding jitter is not very useful when upsampling."; + } + + while (true) { + const int64 last_diff = + (next_output_timestamp_ - last_packet_.Timestamp()).Value(); + RET_CHECK_GT(last_diff, 0.0); + const int64 curr_diff = + (next_output_timestamp_ - cc->InputTimestamp()).Value(); + if (curr_diff > 0.0) { + break; + } + OutputWithinLimits(cc, (std::abs(curr_diff) > last_diff + ? last_packet_ + : cc->Inputs().Get(input_data_id_).Value()) + .At(next_output_timestamp_)); + UpdateNextOutputTimestampWithJitter(); } - OutputWithinLimits(cc, (std::abs(curr_diff) > std::abs(last_diff) - ? 
last_packet_ - : cc->Inputs().Get(input_data_id_).Value()) - .At(next_output_timestamp_)); - UpdateNextOutputTimestampWithJitter(); return ::mediapipe::OkStatus(); } diff --git a/mediapipe/calculators/image/image_cropping_calculator.cc b/mediapipe/calculators/image/image_cropping_calculator.cc index a9277e871..792c24466 100644 --- a/mediapipe/calculators/image/image_cropping_calculator.cc +++ b/mediapipe/calculators/image/image_cropping_calculator.cc @@ -501,8 +501,11 @@ void ImageCroppingCalculator::GetOutputDimensions(CalculatorContext* cc, row_max = std::max(row_max, transformed_points_[i * 2 + 1]); } - *dst_width = std::round((col_max - col_min) * src_width); - *dst_height = std::round((row_max - row_min) * src_height); + int width = static_cast(std::round((col_max - col_min) * src_width)); + int height = static_cast(std::round((row_max - row_min) * src_height)); + // Minimum output dimension 1x1 prevents creation of textures with 0x0. + *dst_width = std::max(1, width); + *dst_height = std::max(1, height); } } // namespace mediapipe diff --git a/mediapipe/calculators/util/BUILD b/mediapipe/calculators/util/BUILD index d3a2e341d..b603efd1c 100644 --- a/mediapipe/calculators/util/BUILD +++ b/mediapipe/calculators/util/BUILD @@ -983,3 +983,31 @@ cc_test( "//mediapipe/framework/port:parse_text_proto", ], ) + +cc_library( + name = "detections_to_timed_box_list_calculator", + srcs = ["detections_to_timed_box_list_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/util/tracking:box_tracker", + ], + alwayslink = 1, +) + +cc_library( + name = "detection_unique_id_calculator", + srcs = ["detection_unique_id_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + 
"//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + ], + alwayslink = 1, +) diff --git a/mediapipe/calculators/util/detection_unique_id_calculator.cc b/mediapipe/calculators/util/detection_unique_id_calculator.cc new file mode 100644 index 000000000..2069f1677 --- /dev/null +++ b/mediapipe/calculators/util/detection_unique_id_calculator.cc @@ -0,0 +1,110 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/port/status.h" + +namespace mediapipe { + +namespace { + +constexpr char kDetectionsTag[] = "DETECTIONS"; +constexpr char kDetectionListTag[] = "DETECTION_LIST"; + +// Each detection processed by DetectionUniqueIDCalculator will be assigned an +// unique id that starts from 1. If a detection already has an ID other than 0, +// the ID will be overwritten. +static int64 detection_id = 0; + +inline int GetNextDetectionId() { return ++detection_id; } + +} // namespace + +// Assign a unique id to detections. +// Note that the calculator will consume the input vector of Detection or +// DetectionList. So the input stream can not be connected to other calculators. 
+// +// Example config: +// node { +// calculator: "DetectionUniqueIdCalculator" +// input_stream: "DETECTIONS:detections" +// output_stream: "DETECTIONS:output_detections" +// } +class DetectionUniqueIdCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kDetectionListTag) || + cc->Inputs().HasTag(kDetectionsTag)) + << "None of the input streams are provided."; + + if (cc->Inputs().HasTag(kDetectionListTag)) { + RET_CHECK(cc->Outputs().HasTag(kDetectionListTag)); + cc->Inputs().Tag(kDetectionListTag).Set(); + cc->Outputs().Tag(kDetectionListTag).Set(); + } + if (cc->Inputs().HasTag(kDetectionsTag)) { + RET_CHECK(cc->Outputs().HasTag(kDetectionsTag)); + cc->Inputs().Tag(kDetectionsTag).Set>(); + cc->Outputs().Tag(kDetectionsTag).Set>(); + } + + return ::mediapipe::OkStatus(); + } + + ::mediapipe::Status Open(CalculatorContext* cc) override { + cc->SetOffset(::mediapipe::TimestampDiff(0)); + return ::mediapipe::OkStatus(); + } + ::mediapipe::Status Process(CalculatorContext* cc) override; +}; +REGISTER_CALCULATOR(DetectionUniqueIdCalculator); + +::mediapipe::Status DetectionUniqueIdCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().HasTag(kDetectionListTag) && + !cc->Inputs().Tag(kDetectionListTag).IsEmpty()) { + auto result = + cc->Inputs().Tag(kDetectionListTag).Value().Consume(); + if (result.ok()) { + auto detection_list = std::move(result).ValueOrDie(); + for (Detection& detection : *detection_list->mutable_detection()) { + detection.set_detection_id(GetNextDetectionId()); + } + cc->Outputs() + .Tag(kDetectionListTag) + .Add(detection_list.release(), cc->InputTimestamp()); + } + } + + if (cc->Inputs().HasTag(kDetectionsTag) && + !cc->Inputs().Tag(kDetectionsTag).IsEmpty()) { + auto result = cc->Inputs() + .Tag(kDetectionsTag) + .Value() + .Consume>(); + if (result.ok()) { + auto detections = std::move(result).ValueOrDie(); + for (Detection& detection : 
*detections) { + detection.set_detection_id(GetNextDetectionId()); + } + cc->Outputs() + .Tag(kDetectionsTag) + .Add(detections.release(), cc->InputTimestamp()); + } + } + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/util/detections_to_render_data_calculator.cc b/mediapipe/calculators/util/detections_to_render_data_calculator.cc index 098334ec7..1077f7687 100644 --- a/mediapipe/calculators/util/detections_to_render_data_calculator.cc +++ b/mediapipe/calculators/util/detections_to_render_data_calculator.cc @@ -39,6 +39,8 @@ constexpr char kKeypointLabel[] = "KEYPOINT"; // The ratio of detection label font height to the height of detection bounding // box. constexpr double kLabelToBoundingBoxRatio = 0.1; +// Preserve 2 decimal digits. +constexpr float kNumScoreDecimalDigitsMultipler = 100; } // namespace @@ -235,18 +237,26 @@ void DetectionsToRenderDataCalculator::AddLabels( std::string label_str = detection.label().empty() ? absl::StrCat(detection.label_id(i)) : detection.label(i); + const float rounded_score = + std::round(detection.score(i) * kNumScoreDecimalDigitsMultipler) / + kNumScoreDecimalDigitsMultipler; std::string label_and_score = - absl::StrCat(label_str, options.text_delimiter(), detection.score(i), + absl::StrCat(label_str, options.text_delimiter(), rounded_score, options.text_delimiter()); label_and_scores.push_back(label_and_score); } std::vector labels; + if (options.render_detection_id()) { + const std::string detection_id_str = + absl::StrCat("Id: ", detection.detection_id()); + labels.push_back(detection_id_str); + } if (options.one_label_per_line()) { - labels.swap(label_and_scores); + labels.insert(labels.end(), label_and_scores.begin(), + label_and_scores.end()); } else { labels.push_back(absl::StrJoin(label_and_scores, "")); } - // Add the render annotations for "label(_id),score". 
for (int i = 0; i < labels.size(); ++i) { auto label = labels.at(i); diff --git a/mediapipe/calculators/util/detections_to_render_data_calculator.proto b/mediapipe/calculators/util/detections_to_render_data_calculator.proto index 245f48fea..acc847dcd 100644 --- a/mediapipe/calculators/util/detections_to_render_data_calculator.proto +++ b/mediapipe/calculators/util/detections_to_render_data_calculator.proto @@ -53,4 +53,7 @@ message DetectionsToRenderDataCalculatorOptions { // instances of this calculator are present in the graph, this value // should be unique among them. optional string scene_class = 7 [default = "DETECTION"]; + + // If true, renders the detection id in the first line before the labels. + optional bool render_detection_id = 8 [default = false]; } diff --git a/mediapipe/calculators/util/detections_to_timed_box_list_calculator.cc b/mediapipe/calculators/util/detections_to_timed_box_list_calculator.cc new file mode 100644 index 000000000..b0a177e58 --- /dev/null +++ b/mediapipe/calculators/util/detections_to_timed_box_list_calculator.cc @@ -0,0 +1,110 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/formats/location_data.pb.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/util/tracking/box_tracker.h" + +namespace mediapipe { + +namespace { + +constexpr char kDetectionsTag[] = "DETECTIONS"; +constexpr char kDetectionListTag[] = "DETECTION_LIST"; +constexpr char kBoxesTag[] = "BOXES"; + +} // namespace + +// A calculator that converts Detection proto to TimedBoxList proto for +// tracking. +// +// Please note that only Location Data formats of RELATIVE_BOUNDING_BOX are +// supported. +// +// Example config: +// node { +// calculator: "DetectionsToTimedBoxListCalculator" +// input_stream: "DETECTIONS:detections" +// output_stream: "BOXES:boxes" +// } +class DetectionsToTimedBoxListCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kDetectionListTag) || + cc->Inputs().HasTag(kDetectionsTag)) + << "None of the input streams are provided."; + if (cc->Inputs().HasTag(kDetectionListTag)) { + cc->Inputs().Tag(kDetectionListTag).Set(); + } + if (cc->Inputs().HasTag(kDetectionsTag)) { + cc->Inputs().Tag(kDetectionsTag).Set>(); + } + cc->Outputs().Tag(kBoxesTag).Set(); + return ::mediapipe::OkStatus(); + } + + ::mediapipe::Status Open(CalculatorContext* cc) override { + cc->SetOffset(TimestampDiff(0)); + return ::mediapipe::OkStatus(); + } + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + void ConvertDetectionToTimedBox(const Detection& detection, + TimedBoxProto* box, CalculatorContext* cc); +}; +REGISTER_CALCULATOR(DetectionsToTimedBoxListCalculator); + +::mediapipe::Status DetectionsToTimedBoxListCalculator::Process( + CalculatorContext* cc) { + auto output_timed_box_list = absl::make_unique(); + + if (cc->Inputs().HasTag(kDetectionListTag)) { + const auto& detection_list = + 
cc->Inputs().Tag(kDetectionListTag).Get(); + for (const auto& detection : detection_list.detection()) { + TimedBoxProto* box = output_timed_box_list->add_box(); + ConvertDetectionToTimedBox(detection, box, cc); + } + } + if (cc->Inputs().HasTag(kDetectionsTag)) { + const auto& detections = + cc->Inputs().Tag(kDetectionsTag).Get>(); + for (const auto& detection : detections) { + TimedBoxProto* box = output_timed_box_list->add_box(); + ConvertDetectionToTimedBox(detection, box, cc); + } + } + + cc->Outputs().Tag(kBoxesTag).Add(output_timed_box_list.release(), + cc->InputTimestamp()); + return ::mediapipe::OkStatus(); +} + +void DetectionsToTimedBoxListCalculator::ConvertDetectionToTimedBox( + const Detection& detection, TimedBoxProto* box, CalculatorContext* cc) { + const auto& relative_bounding_box = + detection.location_data().relative_bounding_box(); + box->set_left(relative_bounding_box.xmin()); + box->set_right(relative_bounding_box.xmin() + relative_bounding_box.width()); + box->set_top(relative_bounding_box.ymin()); + box->set_bottom(relative_bounding_box.ymin() + + relative_bounding_box.height()); + box->set_id(detection.detection_id()); + box->set_time_msec(cc->InputTimestamp().Microseconds() / 1000); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/video/BUILD b/mediapipe/calculators/video/BUILD index d0f93a104..b82743d79 100644 --- a/mediapipe/calculators/video/BUILD +++ b/mediapipe/calculators/video/BUILD @@ -37,6 +37,67 @@ proto_library( deps = ["//mediapipe/framework:calculator_proto"], ) +proto_library( + name = "motion_analysis_calculator_proto", + srcs = ["motion_analysis_calculator.proto"], + deps = [ + "//mediapipe/framework:calculator_proto", + "//mediapipe/util/tracking:motion_analysis_proto", + ], +) + +proto_library( + name = "flow_packager_calculator_proto", + srcs = ["flow_packager_calculator.proto"], + deps = [ + "//mediapipe/framework:calculator_proto", + "//mediapipe/util/tracking:flow_packager_proto", + ], +) + 
+proto_library( + name = "box_tracker_calculator_proto", + srcs = ["box_tracker_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_proto", + "//mediapipe/util/tracking:box_tracker_proto", + ], +) + +mediapipe_cc_proto_library( + name = "motion_analysis_calculator_cc_proto", + srcs = ["motion_analysis_calculator.proto"], + cc_deps = [ + "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/util/tracking:motion_analysis_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":motion_analysis_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = "flow_packager_calculator_cc_proto", + srcs = ["flow_packager_calculator.proto"], + cc_deps = [ + "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/util/tracking:flow_packager_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":flow_packager_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = "box_tracker_calculator_cc_proto", + srcs = ["box_tracker_calculator.proto"], + cc_deps = [ + "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/util/tracking:box_tracker_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":box_tracker_calculator_proto"], +) + mediapipe_cc_proto_library( name = "flow_to_image_calculator_cc_proto", srcs = ["flow_to_image_calculator.proto"], @@ -131,6 +192,94 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "motion_analysis_calculator", + srcs = ["motion_analysis_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":motion_analysis_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:image_frame_opencv", + "//mediapipe/framework/formats:video_stream_header", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + 
"//mediapipe/util/tracking:camera_motion", + "//mediapipe/util/tracking:camera_motion_cc_proto", + "//mediapipe/util/tracking:frame_selection_cc_proto", + "//mediapipe/util/tracking:motion_analysis", + "//mediapipe/util/tracking:motion_estimation", + "//mediapipe/util/tracking:motion_models", + "//mediapipe/util/tracking:region_flow_cc_proto", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + +cc_library( + name = "flow_packager_calculator", + srcs = ["flow_packager_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":flow_packager_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/util/tracking:camera_motion_cc_proto", + "//mediapipe/util/tracking:flow_packager", + "//mediapipe/util/tracking:region_flow_cc_proto", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + ], + alwayslink = 1, +) + +cc_library( + name = "box_tracker_calculator", + srcs = ["box_tracker_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":box_tracker_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/formats:image_frame_opencv", + "//mediapipe/framework/formats:video_stream_header", # fixdeps: keep -- required for exobazel build. 
+ "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:parse_text_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/framework/tool:options_util", + "//mediapipe/util/tracking", + "//mediapipe/util/tracking:box_tracker", + "//mediapipe/util/tracking:tracking_visualization_utilities", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + +cc_library( + name = "tracked_detection_manager_calculator", + srcs = ["tracked_detection_manager_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/port:status", + "//mediapipe/util/tracking", + "//mediapipe/util/tracking:box_tracker", + "//mediapipe/util/tracking:tracked_detection", + "//mediapipe/util/tracking:tracked_detection_manager", + "//mediapipe/util/tracking:tracking_visualization_utilities", + "@com_google_absl//absl/container:node_hash_map", + ], + alwayslink = 1, +) + filegroup( name = "test_videos", srcs = [ @@ -201,3 +350,64 @@ cc_test( "//mediapipe/framework/port:parse_text_proto", ], ) + +MEDIAPIPE_DEPS = [ + "//mediapipe/calculators/video:box_tracker_calculator", + "//mediapipe/calculators/video:flow_packager_calculator", + "//mediapipe/calculators/video:motion_analysis_calculator", + "//mediapipe/framework/stream_handler:fixed_size_input_stream_handler", + "//mediapipe/framework/stream_handler:sync_set_input_stream_handler", +] + +mediapipe_binary_graph( + name = "parallel_tracker_binarypb", + graph = "testdata/parallel_tracker_graph.pbtxt", + output_name = "testdata/parallel_tracker.binarypb", + visibility = ["//visibility:public"], + deps = MEDIAPIPE_DEPS, +) + +mediapipe_binary_graph( + name = "tracker_binarypb", + graph = 
"testdata/tracker_graph.pbtxt", + output_name = "testdata/tracker.binarypb", + visibility = ["//visibility:public"], + deps = MEDIAPIPE_DEPS, +) + +cc_test( + name = "tracking_graph_test", + size = "small", + srcs = ["tracking_graph_test.cc"], + copts = ["-DPARALLEL_INVOKER_ACTIVE"] + select({ + "//mediapipe:apple": [], + "//mediapipe:android": [], + "//conditions:default": [], + }), + data = [ + ":testdata/lenna.png", + ":testdata/parallel_tracker.binarypb", + ":testdata/tracker.binarypb", + ], + deps = [ + ":box_tracker_calculator", + ":box_tracker_calculator_cc_proto", + ":flow_packager_calculator", + ":motion_analysis_calculator", + "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:packet", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/formats:image_frame", + "//mediapipe/framework/port:advanced_proto", + "//mediapipe/framework/port:core_proto", + "//mediapipe/framework/port:file_helpers", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:opencv_highgui", + "//mediapipe/framework/port:status", + "//mediapipe/framework/stream_handler:fixed_size_input_stream_handler", + "//mediapipe/framework/stream_handler:sync_set_input_stream_handler", + "//mediapipe/util/tracking:box_tracker_cc_proto", + "//mediapipe/util/tracking:tracking_cc_proto", + ], +) diff --git a/mediapipe/calculators/video/box_tracker_calculator.cc b/mediapipe/calculators/video/box_tracker_calculator.cc new file mode 100644 index 000000000..9963d864a --- /dev/null +++ b/mediapipe/calculators/video/box_tracker_calculator.cc @@ -0,0 +1,1271 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "mediapipe/calculators/video/box_tracker_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/image_frame_opencv.h" +#include "mediapipe/framework/formats/video_stream_header.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/parse_text_proto.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/framework/tool/options_util.h" +#include "mediapipe/util/tracking/box_tracker.h" +#include "mediapipe/util/tracking/tracking.h" +#include "mediapipe/util/tracking/tracking_visualization_utilities.h" + +namespace mediapipe { + +const char kOptionsTag[] = "OPTIONS"; + +// A calculator to track box positions over time. +// This calculator works in two modes: +// a) Streaming mode, forward tracking only uses per frame TRACKING TrackingData +// supplied by tracking. For faster processing use TRACK_TIME to explicitly +// request tracking results at higher FPS than supplied by TRACKING. +// b) Batch mode: Tracks from tracking chunk files as specified by CACHE_DIR +// side packet (forward and backward with multiple key framing support). +// NOTE: When using batch mode you might need some external logic +// to clear out the caching directory between runs / files; or otherwise +// stale chunk files might be used. 
+// +// Initial positions can be either supplied via calculator options or +// INITIAL_POS (not supported on mobile) side packet, but not both. + +// Input stream: +// TRACKING: Input tracking data (proto TrackingData, required if CACHE_DIR +// is not defined) +// TRACK_TIME: Timestamps that tracking results should be generated at. +// Optional. Results generated at a TRACK_TIME w/o corresponding +// TRACKING packet will be queued up and returned when the next +// TRACKING input is observed. For those packets also no +// visualization output will be generated. +// Can be Packet of any type. +// VIDEO: Optional input video stream tracked boxes are rendered over. +// START: Optional input stream with PreStream packet to begin processing. +// Typical use case: When used in batch mode have +// FlowPackagerCalculator emit a COMPLETE packet to indicate caching +// is completed. +// START_POS: Optional initial positions to be tracked as TimedBoxProtoList. +// Timestamp of the box is used, so box timestamp does not have to +// be monotonic. Assign monotonic increasing timestamps for +// START_POS, e.g. 1,2,3 per request. +// Supplied starting positions are 'fast forwarded', i.e. quickly +// tracked towards current track head, i.e. last received +// TrackingData and added to current set of tracked boxes. +// This is recommended to be used with SyncSetInputStreamHandler. +// START_POS_PROTO_STRING: Same as START_POS, but is in the form of serialized +// protobuffer std::string. When both START_POS and +// START_POS_PROTO_STRING are present, START_POS is used. Suggest +// to specify only one of them. +// RESTART_POS: Same as START_POS, but exclusively for receiving detection +// results from reacquisition. +// CANCEL_OBJECT_ID: Optional id of box to be removed. This is recommended +// to be used with SyncSetInputStreamHandler. 
+// RA_TRACK: Performs random access tracking within the specified +// tracking cache, which is specified in the options of this +// calculator BoxTrackerCalculatorOptions. Input is of type +// TimedBoxProtoList. +// Assumed to be supplied as pair +// [start0, stop0, start1, stop1, ...] of boxes, +// (that is list size() % 2 == 0), where position, id and time +// is used for start, and only time for stop; that is position +// is ignored. +// Assign monotonically increasing packet timestamps for RA_TRACK, +// e.g. 1,2,3; however the timestamp in TimedBoxProtoList +// can be in arbitrary order. +// Use with SyncSetInputStreamHandler in streaming mode only. +// RA_TRACK_PROTO_STRING: Same as RA_TRACK, but is in the form of serialized +// protobuffer std::string. When both RA_TRACK and +// RA_TRACK_PROTO_STRING are present, RA_TRACK is used. Suggest +// to specify only one of them. +// +// Output streams: +// VIZ: Optional output video stream with rendered box positions +// (requires VIDEO to be present) +// BOXES: Optional output stream of type TimedBoxProtoList for each +// initialized result. +// RA_BOXES: Optional output stream of type TimedBoxProtoList for each +// request in RA_TRACK. Same timestamp as request is used. +// +// Input side packets: +// INITIAL_POS: Optional initial positions to be tracked as text format proto +// of type TimedBoxProtoList. Can not be combined with initial +// position option. NOT SUPPORTED ON MOBILE. +// CACHE_DIR: Optional caching directory tracking chunk files are read +// from. 
+// +// +class BoxTrackerCalculator : public CalculatorBase { + public: + ~BoxTrackerCalculator() override = default; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + + protected: + void RenderStates(const std::vector& states, cv::Mat* mat); + void RenderInternalStates(const std::vector& states, + cv::Mat* mat); + + // MotionBox and corresponding PathSegment of results; used in streaming mode. + struct MotionBoxPath { + MotionBoxPath(MotionBox&& box_, PathSegment&& path_, bool reacq_ = false) + : box(std::move(box_)), path(std::move(path_)), reacquisition(reacq_) {} + + MotionBoxPath() = default; + + // Trims the state for forward/backward tracking. + void Trim(const int cache_size, const bool forward) { + if (forward) { + // Trims the box's states queue. + box.TrimFront(cache_size); + // Trims the path queue. + int trim_count = path.size() - cache_size; + while (trim_count-- > 0) { + path.pop_front(); + } + } else { // backward + // Trims the box's states queue. + box.TrimBack(cache_size); + // Trims the path queue. + int trim_count = path.size() - cache_size; + while (trim_count-- > 0) { + path.pop_back(); + } + } + } + + MotionBox box; + PathSegment path; + bool reacquisition; + }; + + // MotionBoxPath per unique id that we are tracking. + typedef std::unordered_map MotionBoxMap; + + // Performs tracking of all MotionBoxes in box_map by one frame forward or + // backward to or from data_frame_num using passed TrackingData. + // Specify destination timestamp and frame duration TrackingData was + // computed for. Used in streaming mode. + // Returns list of ids that failed. 
+ void StreamTrack(const TrackingData& data, int data_frame_num, + int64 dst_timestamp_ms, int64 duration_ms, bool forward, + MotionBoxMap* box_map, std::vector* failed_ids); + + // Fast forwards specified boxes from starting position to current play head + // and outputs successful boxes to box_map. + // Specify the timestamp boxes are tracked from via timestamp in each + // TimedBox. + void FastForwardStartPos(const TimedBoxProtoList& start_pos_list, + MotionBoxMap* box_map); + + // Performs random access tracking from box_list (start,stop) tuples and + // outputs results. + void OutputRandomAccessTrack(const TimedBoxProtoList& box_list, + CalculatorContext* cc); + + private: + BoxTrackerCalculatorOptions options_; + + TimedBoxProtoList initial_pos_; + + // Keeps tracks boxes that have already been initialized. + std::unordered_set initialized_ids_; + + // Non empty for batch mode tracking. + std::string cache_dir_; + // Ids to be tracked in batch_mode. + std::unordered_set batch_track_ids_; + + int frame_num_ = 0; + + // Boxes that are tracked in streaming mode. + MotionBoxMap streaming_motion_boxes_; + + std::unordered_map> last_tracked_boxes_; + int frame_num_since_reset_ = 0; + + // Cache used during streaming mode for fast forward tracking. + std::deque> tracking_data_cache_; + + // Indicator to track if box_tracker_ has started tracking. + bool tracking_issued_ = false; + std::unique_ptr box_tracker_; + + // If set, renders tracking data into VIZ stream. + bool visualize_tracking_data_ = false; + + // If set, renders the box state and internal box state into VIZ stream. + bool visualize_state_ = false; + bool visualize_internal_state_ = false; + + // Timestamps for every tracking data input frame. + std::deque track_timestamps_; + + // For pruning track_timestamps_ queue. + static const int kTrackTimestampsMinQueueSize; + + // The long-running index of the head of track_timestamps_. 
+ int track_timestamps_base_index_ = 0; + + // For pruning MotionBoxPath's state/path queues. + static const int kMotionBoxPathMinQueueSize; + + // Queued track time requests. + std::vector queued_track_requests_; + + // Add smooth transition between re-acquisition and previous tracked boxes. + // `result_box` is the tracking result of one specific timestamp. The smoothed + // result will be updated in place. + // `subframe_alpha` is from 0 to 1 (0, 1 repressents previous and current + // frame with TRACKING_DATA). Any frames with TRACK_TIME should interpolate in + // between. + void AddSmoothTransitionToOutputBox(int box_id, TimedBox* result_box, + float subframe_alpha = 1.0f); + + std::deque::iterator GetRandomAccessTimestampPos( + const TimedBoxProto& start, bool forward_track); + + std::deque>::iterator + GetRandomAccessStartData( + const std::deque::iterator& timestamp_pos); + + MotionBoxMap PrepareRandomAccessTrack( + const TimedBoxProto& start, int init_frame, bool forward_track, + const std::deque>::iterator& + start_data); + + bool RunForwardTrack( + const std::deque>::iterator& + start_data, + int init_frame, MotionBoxMap* single_map, int64 end_time_msec); + + bool RunBackwardTrack( + const std::deque>::iterator& + start_data, + int init_frame, MotionBoxMap* single_map, int64 end_time_msec); + + void ObtainResultOfRandomAccessTrack( + const MotionBoxMap& single_map, const TimedBoxProto& start, + int64 end_time_msec, + const std::unique_ptr& result_list); +}; + +REGISTER_CALCULATOR(BoxTrackerCalculator); + +// At least 2 timestamps need to be present in track_timestamps_ or streaming +// logic's duration calculation will break. +const int BoxTrackerCalculator::kTrackTimestampsMinQueueSize = 2; + +// At least 2: the newly added state, and one from the history. +const int BoxTrackerCalculator::kMotionBoxPathMinQueueSize = 2; + +namespace { + +// Convert box position according to rotation angle in degrees. 
+void ConvertCoordinateForRotation(float in_top, float in_left, float in_bottom, + float in_right, int rotation, float* out_top, + float* out_left, float* out_bottom, + float* out_right) { + CHECK(out_top != nullptr); + CHECK(out_left != nullptr); + CHECK(out_bottom != nullptr); + CHECK(out_right != nullptr); + const float in_center_x = (in_left + in_right) * 0.5f; + const float in_center_y = (in_top + in_bottom) * 0.5f; + const float in_width = in_right - in_left; + const float in_height = in_bottom - in_top; + CHECK_GT(in_width, 0); + CHECK_GT(in_height, 0); + float out_center_x; + float out_center_y; + float out_width; + float out_height; + switch (rotation) { + case 0: + out_center_x = in_center_x; + out_center_y = in_center_y; + out_width = in_width; + out_height = in_height; + break; + case -270: // FALL_THROUGH_INTENDED + case 90: + out_center_x = 1 - in_center_y; + out_center_y = in_center_x; + out_width = in_height; + out_height = in_width; + break; + case -180: // FALL_THROUGH_INTENDED + case 180: + out_center_x = 1 - in_center_x; + out_center_y = 1 - in_center_y; + out_width = in_width; + out_height = in_height; + break; + case -90: // FALL_THROUGH_INTENDED + case 270: + out_center_x = in_center_y; + out_center_y = 1 - in_center_x; + out_width = in_height; + out_height = in_width; + break; + default: + LOG(ERROR) << "invalid rotation " << rotation; + out_center_x = in_center_x; + out_center_y = in_center_y; + out_width = in_width; + out_height = in_height; + break; + } + *out_top = out_center_y - out_height * 0.5f; + *out_left = out_center_x - out_width * 0.5f; + *out_bottom = out_center_y + out_height * 0.5f; + *out_right = out_center_x + out_width * 0.5f; +} + +void AddStateToPath(const MotionBoxState& state, int64 time_msec, + PathSegment* path) { + CHECK(path); + TimedBox result; + TimedBoxFromMotionBoxState(state, &result); + result.time_msec = time_msec; + + const auto insert_pos = std::lower_bound(path->begin(), path->end(), result); + // Do not 
duplicate box positions. + if (insert_pos == path->end() || insert_pos->time_msec != time_msec) { + path->insert(insert_pos, + InternalTimedBox(result, new MotionBoxState(state))); + } else { + LOG(ERROR) << "Box at time " << time_msec << " already present; ignoring"; + } +} + +} // namespace. + +::mediapipe::Status BoxTrackerCalculator::GetContract(CalculatorContract* cc) { + if (cc->Inputs().HasTag("TRACKING")) { + cc->Inputs().Tag("TRACKING").Set(); + } + + if (cc->Inputs().HasTag("TRACK_TIME")) { + RET_CHECK(cc->Inputs().HasTag("TRACKING")) + << "TRACK_TIME needs TRACKING input"; + cc->Inputs().Tag("TRACK_TIME").SetAny(); + } + + if (cc->Inputs().HasTag("VIDEO")) { + cc->Inputs().Tag("VIDEO").Set(); + } + + if (cc->Inputs().HasTag("START")) { + // Actual packet content does not matter. + cc->Inputs().Tag("START").SetAny(); + } + + if (cc->Inputs().HasTag("START_POS")) { + cc->Inputs().Tag("START_POS").Set(); + } + + if (cc->Inputs().HasTag("START_POS_PROTO_STRING")) { + cc->Inputs().Tag("START_POS_PROTO_STRING").Set(); + } + + if (cc->Inputs().HasTag("RESTART_POS")) { + cc->Inputs().Tag("RESTART_POS").Set(); + } + + if (cc->Inputs().HasTag("CANCEL_OBJECT_ID")) { + cc->Inputs().Tag("CANCEL_OBJECT_ID").Set(); + } + + if (cc->Inputs().HasTag("RA_TRACK")) { + cc->Inputs().Tag("RA_TRACK").Set(); + } + + if (cc->Inputs().HasTag("RA_TRACK_PROTO_STRING")) { + cc->Inputs().Tag("RA_TRACK_PROTO_STRING").Set(); + } + + if (cc->Outputs().HasTag("VIZ")) { + RET_CHECK(cc->Inputs().HasTag("VIDEO")) + << "Output stream VIZ requires VIDEO to be present."; + cc->Outputs().Tag("VIZ").Set(); + } + + if (cc->Outputs().HasTag("BOXES")) { + cc->Outputs().Tag("BOXES").Set(); + } + + if (cc->Outputs().HasTag("RA_BOXES")) { + cc->Outputs().Tag("RA_BOXES").Set(); + } + +#if defined(__ANDROID__) || defined(__APPLE__) || defined(__EMSCRIPTEN__) + RET_CHECK(!cc->InputSidePackets().HasTag("INITIAL_POS")) + << "Unsupported on mobile"; +#else + if (cc->InputSidePackets().HasTag("INITIAL_POS")) 
{ + cc->InputSidePackets().Tag("INITIAL_POS").Set(); + } +#endif // defined(__ANDROID__) || defined(__APPLE__) || defined(__EMSCRIPTEN__) + + if (cc->InputSidePackets().HasTag("CACHE_DIR")) { + cc->InputSidePackets().Tag("CACHE_DIR").Set(); + } + + RET_CHECK(cc->Inputs().HasTag("TRACKING") != + cc->InputSidePackets().HasTag("CACHE_DIR")) + << "Either TRACKING or CACHE_DIR needs to be specified."; + + if (cc->InputSidePackets().HasTag(kOptionsTag)) { + cc->InputSidePackets().Tag(kOptionsTag).Set(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status BoxTrackerCalculator::Open(CalculatorContext* cc) { + options_ = tool::RetrieveOptions(cc->Options(), + cc->InputSidePackets(), kOptionsTag); + + RET_CHECK(!cc->InputSidePackets().HasTag("INITIAL_POS") || + !options_.has_initial_position()) + << "Can not specify initial position as side packet and via options"; + + if (options_.has_initial_position()) { + initial_pos_ = options_.initial_position(); + } + +#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(__EMSCRIPTEN__) + if (cc->InputSidePackets().HasTag("INITIAL_POS")) { + LOG(INFO) << "Parsing: " + << cc->InputSidePackets().Tag("INITIAL_POS").Get(); + initial_pos_ = ParseTextProtoOrDie( + cc->InputSidePackets().Tag("INITIAL_POS").Get()); + } +#endif // !defined(__ANDROID__) && !defined(__APPLE__) && + // !defined(__EMSCRIPTEN__) + + // Compile list of ids to be tracked. + for (const auto& pos : initial_pos_.box()) { + RET_CHECK(pos.id() >= 0) << "Requires id to be set"; + batch_track_ids_.insert(pos.id()); + } + + visualize_tracking_data_ = + options_.visualize_tracking_data() && cc->Outputs().HasTag("VIZ"); + visualize_state_ = options_.visualize_state() && cc->Outputs().HasTag("VIZ"); + visualize_internal_state_ = + options_.visualize_internal_state() && cc->Outputs().HasTag("VIZ"); + + // Force recording of internal state for rendering. 
+ if (visualize_internal_state_) { + options_.mutable_tracker_options() + ->mutable_track_step_options() + ->set_return_internal_state(true); + } + + if (visualize_state_ || visualize_internal_state_) { + options_.mutable_tracker_options()->set_record_path_states(true); + } + + if (cc->InputSidePackets().HasTag("CACHE_DIR")) { + cache_dir_ = cc->InputSidePackets().Tag("CACHE_DIR").Get(); + RET_CHECK(!cache_dir_.empty()); + box_tracker_.reset(new BoxTracker(cache_dir_, options_.tracker_options())); + } else { + // Check that all boxes have a unique id. + RET_CHECK(initial_pos_.box_size() == batch_track_ids_.size()) + << "In streaming mode every box must be given its unique id"; + } + + if (options_.streaming_track_data_cache_size() > 0) { + RET_CHECK(!cc->InputSidePackets().HasTag("CACHE_DIR")) + << "Streaming mode not compatible with cache dir."; + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status BoxTrackerCalculator::Process(CalculatorContext* cc) { + // Batch mode, issue tracking requests. + if (box_tracker_ && !tracking_issued_) { + for (const auto& pos : initial_pos_.box()) { + box_tracker_->NewBoxTrack(TimedBox::FromProto(pos), pos.id()); + } + tracking_issued_ = true; + } + + const Timestamp& timestamp = cc->InputTimestamp(); + if (timestamp == Timestamp::PreStream()) { + // Indicator packet. + return ::mediapipe::OkStatus(); + } + + InputStream* track_stream = cc->Inputs().HasTag("TRACKING") + ? &(cc->Inputs().Tag("TRACKING")) + : nullptr; + InputStream* track_time_stream = cc->Inputs().HasTag("TRACK_TIME") + ? &(cc->Inputs().Tag("TRACK_TIME")) + : nullptr; + + // Cache tracking data if possible. 
+ if (track_stream && !track_stream->IsEmpty()) { + const TrackingData& track_data = track_stream->Get(); + const int track_cache_size = options_.streaming_track_data_cache_size(); + if (track_cache_size > 0) { + tracking_data_cache_.push_back(std::make_pair(timestamp, track_data)); + while (tracking_data_cache_.size() > track_cache_size) { + tracking_data_cache_.pop_front(); + } + } + track_timestamps_.push_back(timestamp); + int trim_count = track_timestamps_.size() - + std::max(track_cache_size, kTrackTimestampsMinQueueSize); + if (trim_count > 0) { + track_timestamps_base_index_ += trim_count; + + while (trim_count-- > 0) { + track_timestamps_.pop_front(); + } + } + } + + InputStream* start_pos_stream = cc->Inputs().HasTag("START_POS") + ? &(cc->Inputs().Tag("START_POS")) + : nullptr; + + MotionBoxMap fast_forward_boxes; + if (start_pos_stream && !start_pos_stream->IsEmpty()) { + // Try to fast forward boxes to current tracking head. + const TimedBoxProtoList& start_pos_list = + start_pos_stream->Get(); + FastForwardStartPos(start_pos_list, &fast_forward_boxes); + } + + InputStream* start_pos_proto_string_stream = + cc->Inputs().HasTag("START_POS_PROTO_STRING") + ? &(cc->Inputs().Tag("START_POS_PROTO_STRING")) + : nullptr; + if (start_pos_stream == nullptr || start_pos_stream->IsEmpty()) { + if (start_pos_proto_string_stream && + !start_pos_proto_string_stream->IsEmpty()) { + auto start_pos_list_str = + start_pos_proto_string_stream->Get(); + TimedBoxProtoList start_pos_list; + start_pos_list.ParseFromString(start_pos_list_str); + FastForwardStartPos(start_pos_list, &fast_forward_boxes); + } + } + + InputStream* restart_pos_stream = cc->Inputs().HasTag("RESTART_POS") + ? 
&(cc->Inputs().Tag("RESTART_POS")) + : nullptr; + + if (restart_pos_stream && !restart_pos_stream->IsEmpty()) { + const TimedBoxProtoList& restart_pos_list = + restart_pos_stream->Get(); + FastForwardStartPos(restart_pos_list, &fast_forward_boxes); + } + + InputStream* cancel_object_id_stream = + cc->Inputs().HasTag("CANCEL_OBJECT_ID") + ? &(cc->Inputs().Tag("CANCEL_OBJECT_ID")) + : nullptr; + if (cancel_object_id_stream && !cancel_object_id_stream->IsEmpty()) { + const int cancel_object_id = cancel_object_id_stream->Get(); + if (streaming_motion_boxes_.erase(cancel_object_id) == 0) { + LOG(WARNING) << "box id " << cancel_object_id << " does not exist."; + } + } + + cv::Mat input_view; + cv::Mat viz_view; + std::unique_ptr viz_frame; + + TrackingData track_data_to_render; + + if (cc->Outputs().HasTag("VIZ")) { + InputStream* video_stream = &(cc->Inputs().Tag("VIDEO")); + if (!video_stream->IsEmpty()) { + input_view = formats::MatView(&video_stream->Get()); + + viz_frame.reset(new ImageFrame()); + viz_frame->CopyFrom(video_stream->Get(), 16); + viz_view = formats::MatView(viz_frame.get()); + } + } + + // Results to be output or rendered, list of TimedBox for every id that are + // present at this frame. + TimedBoxProtoList box_track_list; + + CHECK(box_tracker_ || track_stream) + << "Expected either batch or streaming mode"; + + // Corresponding list of box states for rendering. For each id present at + // this frame stores closest 1-2 states. + std::vector> box_state_list; + int64 timestamp_msec = timestamp.Value() / 1000; + + if (box_tracker_) { // Batch mode. + // Ensure tracking has terminated. + box_tracker_->WaitForAllOngoingTracks(); + + // Cycle through ids. + for (int id : batch_track_ids_) { + TimedBox result; + std::vector states; + std::vector* states_ptr = + (visualize_state_ || visualize_internal_state_) ? 
&states : nullptr; + + if (box_tracker_->GetTimedPosition(id, timestamp_msec, &result, + states_ptr)) { + TimedBoxProto proto = result.ToProto(); + proto.set_id(id); + *box_track_list.add_box() = std::move(proto); + + if (states_ptr) { + box_state_list.push_back(*states_ptr); + } + } + } + + if (visualize_tracking_data_) { + constexpr int kVizId = -1; + box_tracker_->GetTrackingData(kVizId, timestamp_msec, + &track_data_to_render); + } + } else { + // Streaming mode. + // If track data is available advance all boxes by new data. + if (!track_stream->IsEmpty()) { + const TrackingData& track_data = track_stream->Get(); + + if (visualize_tracking_data_) { + track_data_to_render = track_data; + } + + const int64 time_ms = track_timestamps_.back().Value() / 1000; + const int64 duration_ms = + track_timestamps_.size() > 1 + ? time_ms - track_timestamps_.rbegin()[1].Value() / 1000 + : 0; + + std::vector failed_boxes; + StreamTrack(track_data, frame_num_, time_ms, duration_ms, + true, // forward. + &streaming_motion_boxes_, &failed_boxes); + + // Add fast forward boxes. + if (!fast_forward_boxes.empty()) { + for (const auto& box : fast_forward_boxes) { + streaming_motion_boxes_.emplace(box.first, box.second); + } + fast_forward_boxes.clear(); + } + + // Remove failed boxes. + for (int id : failed_boxes) { + streaming_motion_boxes_.erase(id); + } + + // Init new boxes once data from previous time to current is available. 
+ for (const auto& pos : initial_pos_.box()) { + if (timestamp_msec - pos.time_msec() >= 0 && + initialized_ids_.find(pos.id()) == initialized_ids_.end()) { + MotionBoxState init_state; + MotionBoxStateFromTimedBox(TimedBox::FromProto(pos), &init_state); + + InitializeInliersOutliersInMotionBoxState(track_data, &init_state); + InitializePnpHomographyInMotionBoxState( + track_data, options_.tracker_options().track_step_options(), + &init_state); + + TrackStepOptions track_step_options = + options_.tracker_options().track_step_options(); + ChangeTrackingDegreesBasedOnStartPos(pos, &track_step_options); + MotionBox init_box(track_step_options); + + // Init at previous frame. + init_box.ResetAtFrame(frame_num_, init_state); + + PathSegment init_path; + AddStateToPath(init_state, timestamp_msec, &init_path); + + streaming_motion_boxes_.emplace( + pos.id(), MotionBoxPath(std::move(init_box), std::move(init_path), + pos.reacquisition())); + initialized_ids_.insert(pos.id()); + } + } + + ++frame_num_; + } else { + // Track stream is empty, if anything is requested on track_time_stream + // queue up requests. + if (track_time_stream && !track_time_stream->IsEmpty()) { + queued_track_requests_.push_back(timestamp); + } + } + + // Can output be generated? + if (!track_stream->IsEmpty()) { + ++frame_num_since_reset_; + + // Generate results for queued up request. 
+ if (cc->Outputs().HasTag("BOXES") && !queued_track_requests_.empty()) { + for (int j = 0; j < queued_track_requests_.size(); ++j) { + const Timestamp& past_time = queued_track_requests_[j]; + RET_CHECK(past_time.Value() < timestamp.Value()) + << "Inconsistency, queued up requests should occur in past"; + std::unique_ptr past_box_list( + new TimedBoxProtoList()); + + for (auto& motion_box_path : streaming_motion_boxes_) { + TimedBox result_box; + TimedBoxAtTime(motion_box_path.second.path, + past_time.Value() / 1000, &result_box, nullptr); + + const float subframe_alpha = + static_cast(j + 1) / (queued_track_requests_.size() + 1); + AddSmoothTransitionToOutputBox(motion_box_path.first, &result_box, + subframe_alpha); + + TimedBoxProto proto = result_box.ToProto(); + proto.set_id(motion_box_path.first); + proto.set_reacquisition(motion_box_path.second.reacquisition); + *past_box_list->add_box() = std::move(proto); + } + + // Output for every time. + cc->Outputs().Tag("BOXES").Add(past_box_list.release(), past_time); + } + + queued_track_requests_.clear(); + } + + // Generate result at current frame. + for (auto& motion_box_path : streaming_motion_boxes_) { + TimedBox result_box; + MotionBoxState result_state; + TimedBoxAtTime(motion_box_path.second.path, timestamp_msec, &result_box, + &result_state); + + AddSmoothTransitionToOutputBox(motion_box_path.first, &result_box); + + TimedBoxProto proto = result_box.ToProto(); + proto.set_id(motion_box_path.first); + proto.set_reacquisition(motion_box_path.second.reacquisition); + *box_track_list.add_box() = std::move(proto); + + if (visualize_state_ || visualize_internal_state_) { + box_state_list.push_back({result_state}); + } + } + } + // end streaming mode case. + } + + // Save a snapshot of latest tracking results before override with fast + // forwarded start pos. + if (!fast_forward_boxes.empty()) { + frame_num_since_reset_ = 0; + last_tracked_boxes_.clear(); + // Add any remaining fast forward boxes. 
For example occurs if START_POS is + // specified with non-matching TRACKING mode + for (const auto& reset_box : fast_forward_boxes) { + const auto tracked_box_iter = + streaming_motion_boxes_.find(reset_box.first); + if (tracked_box_iter != streaming_motion_boxes_.end()) { + if (!reset_box.second.path.empty() && + !tracked_box_iter->second.path.empty()) { + last_tracked_boxes_[reset_box.first] = + std::make_pair(tracked_box_iter->second.path.back(), + reset_box.second.path.back()); + } + } + + // Override previous tracking with reset start pos. + streaming_motion_boxes_[reset_box.first] = reset_box.second; + } + } + + if (viz_frame) { + if (visualize_tracking_data_) { + RenderTrackingData(track_data_to_render, &viz_view); + } + + if (visualize_state_) { + for (const auto& state_vec : box_state_list) { + RenderStates(state_vec, &viz_view); + } + } + + if (visualize_internal_state_) { + for (const auto& state_vec : box_state_list) { + RenderInternalStates(state_vec, &viz_view); + } + } + + for (const auto& box : box_track_list.box()) { + RenderBox(box, &viz_view); + } + } + + // Handle random access track requests. + InputStream* ra_track_stream = cc->Inputs().HasTag("RA_TRACK") + ? &(cc->Inputs().Tag("RA_TRACK")) + : nullptr; + + if (ra_track_stream && !ra_track_stream->IsEmpty()) { + RET_CHECK(!box_tracker_) << "Random access only for streaming mode " + << "implemented."; + const TimedBoxProtoList& box_list = + ra_track_stream->Get(); + RET_CHECK(box_list.box_size() % 2 == 0) + << "Expect even number of (start,end) tuples but get " + << box_list.box_size(); + OutputRandomAccessTrack(box_list, cc); + } + + InputStream* ra_track_proto_string_stream = + cc->Inputs().HasTag("RA_TRACK_PROTO_STRING") + ? 
&(cc->Inputs().Tag("RA_TRACK_PROTO_STRING")) + : nullptr; + if (ra_track_stream == nullptr || ra_track_stream->IsEmpty()) { + if (ra_track_proto_string_stream && + !ra_track_proto_string_stream->IsEmpty()) { + RET_CHECK(!box_tracker_) << "Random access only for streaming mode " + << "implemented."; + auto box_list_str = ra_track_proto_string_stream->Get(); + TimedBoxProtoList box_list; + box_list.ParseFromString(box_list_str); + RET_CHECK(box_list.box_size() % 2 == 0) + << "Expect even number of (start,end) tuples but get " + << box_list.box_size(); + OutputRandomAccessTrack(box_list, cc); + } + } + + // Always output in batch, only output in streaming if tracking data + // is present (might be in fast forward mode instead). + if (cc->Outputs().HasTag("BOXES") && + (box_tracker_ || !track_stream->IsEmpty())) { + std::unique_ptr boxes(new TimedBoxProtoList()); + *boxes = std::move(box_track_list); + cc->Outputs().Tag("BOXES").Add(boxes.release(), timestamp); + } + + if (viz_frame) { + cc->Outputs().Tag("VIZ").Add(viz_frame.release(), timestamp); + } + + return ::mediapipe::OkStatus(); +} + +void BoxTrackerCalculator::AddSmoothTransitionToOutputBox( + int box_id, TimedBox* result_box, float subframe_alpha) { + if (options_.start_pos_transition_frames() > 0 && + frame_num_since_reset_ <= options_.start_pos_transition_frames()) { + const auto& box_iter = last_tracked_boxes_.find(box_id); + if (box_iter != last_tracked_boxes_.end()) { + // We first compute the blend of last tracked box with reset box at the + // same timestamp as blend_start = alpha * reset_box + (1 - alpha) * + // last_tracked_box. 
Then apply the motion from current tracking to reset + // pos to the blended start pos as: result_box = blend_start + + // (current_box - reset_box) With some derivation, we can get result_box = + // (1 - alpha) * (last_track - reset_box) + current_box + TimedBox tmp_box = TimedBox::Blend(box_iter->second.first, + box_iter->second.second, 1.0, -1.0); + const float alpha = (frame_num_since_reset_ - 1.0f + subframe_alpha) / + options_.start_pos_transition_frames(); + *result_box = TimedBox::Blend(tmp_box, *result_box, 1.0 - alpha, 1.0); + } + } +} + +void BoxTrackerCalculator::OutputRandomAccessTrack( + const TimedBoxProtoList& box_list, CalculatorContext* cc) { + std::unique_ptr result_list(new TimedBoxProtoList()); + + for (int i = 0; i < box_list.box_size(); i += 2) { + const TimedBoxProto start = box_list.box(i); + int64 end_time_msec = box_list.box(i + 1).time_msec(); + const bool forward_track = start.time_msec() < end_time_msec; + + if (track_timestamps_.empty()) { + LOG(WARNING) << "No tracking data cached yet."; + continue; + } + + // Performing the range check in msec (b/138399787) + const int64 tracking_start_timestamp_msec = + track_timestamps_.front().Microseconds() / 1000; + const int64 tracking_end_timestamp_msec = + track_timestamps_.back().Microseconds() / 1000; + if (start.time_msec() < tracking_start_timestamp_msec) { + LOG(WARNING) << "Request start timestamp " << start.time_msec() + << " too old. First frame in the window: " + << tracking_start_timestamp_msec; + continue; + } + if (start.time_msec() > tracking_end_timestamp_msec) { + LOG(WARNING) << "Request start timestamp " << start.time_msec() + << " too new. Last frame in the window: " + << tracking_end_timestamp_msec; + continue; + } + if (end_time_msec < tracking_start_timestamp_msec) { + LOG(WARNING) << "Request end timestamp " << end_time_msec + << " too old. 
First frame in the window: " + << tracking_start_timestamp_msec; + continue; + } + if (end_time_msec > tracking_end_timestamp_msec) { + LOG(WARNING) << "Request end timestamp " << end_time_msec + << " too new. Last frame in the window: " + << tracking_end_timestamp_msec; + continue; + } + + std::deque::iterator timestamp_pos = + GetRandomAccessTimestampPos(start, forward_track); + + if (timestamp_pos == track_timestamps_.end()) { + LOG(ERROR) << "Random access outside cached range"; + continue; + } + + // Locate start of tracking data. + std::deque>::iterator start_data = + GetRandomAccessStartData(timestamp_pos); + + // TODO: Interpolate random access tracking start_data instead + // of dropping the request in the case of missing processed frame. + if (start_data == tracking_data_cache_.end()) { + LOG(ERROR) << "Random access starts at unprocessed frame."; + continue; + } + + const int init_frame = timestamp_pos - track_timestamps_.begin() + + track_timestamps_base_index_; + CHECK_GE(init_frame, 0); + + MotionBoxMap single_map = + PrepareRandomAccessTrack(start, init_frame, forward_track, start_data); + bool track_error = forward_track + ? 
RunForwardTrack(start_data, init_frame, + &single_map, end_time_msec) + : RunBackwardTrack(start_data, init_frame, + &single_map, end_time_msec); + + if (track_error) { + LOG(ERROR) << "Could not track box."; + continue; + } + + ObtainResultOfRandomAccessTrack(single_map, start, end_time_msec, + result_list); + } + + cc->Outputs() + .Tag("RA_BOXES") + .Add(result_list.release(), cc->InputTimestamp()); +} + +std::deque::iterator +BoxTrackerCalculator::GetRandomAccessTimestampPos(const TimedBoxProto& start, + bool forward_track) { + std::deque::iterator timestamp_pos; + Timestamp timestamp(start.time_msec() * 1000); + if (forward_track) { + timestamp_pos = std::upper_bound(track_timestamps_.begin(), + track_timestamps_.end(), timestamp); + } else { + timestamp_pos = std::lower_bound(track_timestamps_.begin(), + track_timestamps_.end(), timestamp); + } + return timestamp_pos; +} + +std::deque>::iterator +BoxTrackerCalculator::GetRandomAccessStartData( + const std::deque::iterator& timestamp_pos) { + std::deque>::iterator start_data = + std::find_if(tracking_data_cache_.begin(), tracking_data_cache_.end(), + [timestamp_pos]( + const std::pair& item) -> bool { + return item.first == *timestamp_pos; + }); + return start_data; +} + +BoxTrackerCalculator::MotionBoxMap +BoxTrackerCalculator::PrepareRandomAccessTrack( + const TimedBoxProto& start, int init_frame, bool forward_track, + const std::deque>::iterator& + start_data) { + MotionBoxMap single_map; + // Init state at request time. 
+ MotionBoxState init_state; + MotionBoxStateFromTimedBox(TimedBox::FromProto(start), &init_state); + + InitializeInliersOutliersInMotionBoxState(start_data->second, &init_state); + InitializePnpHomographyInMotionBoxState( + start_data->second, options_.tracker_options().track_step_options(), + &init_state); + + TrackStepOptions track_step_options = + options_.tracker_options().track_step_options(); + ChangeTrackingDegreesBasedOnStartPos(start, &track_step_options); + MotionBox init_box(track_step_options); + init_box.ResetAtFrame(init_frame - (forward_track ? 1 : 0), init_state); + + PathSegment init_path; + + // Avoid duplicating start time in case TrackingData has same value. + // Note: For backward tracking we always arrive at an earlier frame, so + // no duplication can happen, see StreamTrack for details. + if (start.time_msec() != start_data->first.Value() / 1000 || !forward_track) { + AddStateToPath(init_state, start.time_msec(), &init_path); + } + + single_map.emplace(start.id(), + MotionBoxPath(std::move(init_box), std::move(init_path))); + return single_map; +} + +bool BoxTrackerCalculator::RunForwardTrack( + const std::deque>::iterator& start_data, + int init_frame, MotionBoxMap* single_map, int64 end_time_msec) { + int curr_frame = init_frame; + for (auto cache_pos = start_data; cache_pos != tracking_data_cache_.end(); + ++cache_pos, ++curr_frame) { + std::vector failed_box; + const int64 dst_time_msec = cache_pos->first.Value() / 1000; + const int64 curr_duration = + (cache_pos == tracking_data_cache_.begin()) + ? 
0 + : (cache_pos[0].first.Value() - cache_pos[-1].first.Value()) / 1000; + StreamTrack(cache_pos->second, curr_frame, dst_time_msec, curr_duration, + true, // forward + single_map, &failed_box); + if (!failed_box.empty()) { + return true; + } + if (dst_time_msec > end_time_msec) { + return false; + } + } + return false; +} + +bool BoxTrackerCalculator::RunBackwardTrack( + const std::deque>::iterator& start_data, + int init_frame, MotionBoxMap* single_map, int64 end_time_msec) { + int curr_frame = init_frame; + for (auto cache_pos = start_data; cache_pos != tracking_data_cache_.begin(); + --cache_pos, --curr_frame) { + std::vector failed_box; + const int64 dst_time_msec = cache_pos[-1].first.Value() / 1000; + const int64 curr_duration = + (cache_pos[0].first.Value() - cache_pos[-1].first.Value()) / 1000; + StreamTrack(cache_pos->second, curr_frame, dst_time_msec, curr_duration, + false, // backward + single_map, &failed_box); + if (!failed_box.empty()) { + return true; + } + if (dst_time_msec < end_time_msec) { + return false; + } + } + return false; +} + +void BoxTrackerCalculator::ObtainResultOfRandomAccessTrack( + const MotionBoxMap& single_map, const TimedBoxProto& start, + int64 end_time_msec, + const std::unique_ptr& result_list) { + const MotionBoxPath& result_path = single_map.find(start.id())->second; + TimedBox result_box; + TimedBoxAtTime(result_path.path, end_time_msec, &result_box, nullptr); + TimedBoxProto proto = result_box.ToProto(); + proto.set_id(start.id()); + *result_list->add_box() = std::move(proto); +} + +void BoxTrackerCalculator::RenderStates( + const std::vector& states, cv::Mat* mat) { + for (int k = 0; k < states.size(); ++k) { + const bool print_stats = k == 0; + RenderState(states[k], print_stats, mat); + } +} + +void BoxTrackerCalculator::RenderInternalStates( + const std::vector& states, cv::Mat* mat) { + for (const MotionBoxState& state : states) { + RenderInternalState(state.internal(), mat); + } +} + +void 
BoxTrackerCalculator::StreamTrack(const TrackingData& data, + int data_frame_num, + int64 dst_timestamp_ms, + int64 duration_ms, bool forward, + MotionBoxMap* box_map, + std::vector* failed_ids) { + CHECK(box_map); + CHECK(failed_ids); + + // Track all existing boxes by one frame. + MotionVectorFrame mvf; // Holds motion from current to previous frame. + MotionVectorFrameFromTrackingData(data, &mvf); + + if (forward) { + MotionVectorFrame mvf_inverted; + InvertMotionVectorFrame(mvf, &mvf_inverted); + std::swap(mvf, mvf_inverted); + } + + if (duration_ms > 0) { + mvf.duration_ms = duration_ms; + } + + const int from_frame = data_frame_num - (forward ? 1 : 0); + const int to_frame = forward ? from_frame + 1 : from_frame - 1; + + for (auto& motion_box : *box_map) { + if (!motion_box.second.box.TrackStep(from_frame, // from frame. + mvf, forward)) { + failed_ids->push_back(motion_box.first); + LOG(INFO) << "lost track. pushed failed id: " << motion_box.first; + } else { + // Store result. + PathSegment& path = motion_box.second.path; + const MotionBoxState& result_state = + motion_box.second.box.StateAtFrame(to_frame); + AddStateToPath(result_state, dst_timestamp_ms, &path); + // motion_box has got new tracking state/path. Now trimming it. + const int cache_size = + std::max(options_.streaming_track_data_cache_size(), + kMotionBoxPathMinQueueSize); + motion_box.second.Trim(cache_size, forward); + } + } +} + +void BoxTrackerCalculator::FastForwardStartPos( + const TimedBoxProtoList& start_pos_list, MotionBoxMap* box_map) { + for (const TimedBoxProto& start_pos : start_pos_list.box()) { + Timestamp timestamp(start_pos.time_msec() * 1000); + // Locate corresponding frame number for starting position. As TrackingData + // stores motion from current to last frame; we are using the data after + // this frame for tracking. 
+ auto timestamp_pos = std::lower_bound(track_timestamps_.begin(), + track_timestamps_.end(), timestamp); + + if (timestamp_pos == track_timestamps_.end()) { + LOG(WARNING) << "Received start pos beyond current timestamp, " + << "Starting to track once frame arrives."; + *initial_pos_.add_box() = start_pos; + continue; + } + + // Start at previous frame. + const int init_frame = timestamp_pos - track_timestamps_.begin() + + track_timestamps_base_index_; + CHECK_GE(init_frame, 0); + + // Locate corresponding tracking data. + auto start_data = std::find_if( + tracking_data_cache_.begin(), tracking_data_cache_.end(), + [timestamp_pos](const std::pair& item) + -> bool { return item.first == timestamp_pos[0]; }); + + if (start_data == tracking_data_cache_.end()) { + LOG(ERROR) << "Box to fast forward outside tracking data cache. Ignoring." + << " To avoid this error consider increasing the cache size."; + continue; + } + + // Init state at request time. + MotionBoxState init_state; + MotionBoxStateFromTimedBox(TimedBox::FromProto(start_pos), &init_state); + + InitializeInliersOutliersInMotionBoxState(start_data->second, &init_state); + InitializePnpHomographyInMotionBoxState( + start_data->second, options_.tracker_options().track_step_options(), + &init_state); + + TrackStepOptions track_step_options = + options_.tracker_options().track_step_options(); + ChangeTrackingDegreesBasedOnStartPos(start_pos, &track_step_options); + MotionBox init_box(track_step_options); + init_box.ResetAtFrame(init_frame, init_state); + + int curr_frame = init_frame + 1; + MotionBoxMap single_map; + PathSegment init_path; + AddStateToPath(init_state, timestamp_pos[0].Value() / 1000, &init_path); + single_map.emplace(start_pos.id(), + MotionBoxPath(std::move(init_box), std::move(init_path), + start_pos.reacquisition())); + bool track_error = false; + + for (auto cache_pos = start_data + 1; + cache_pos != tracking_data_cache_.end(); ++cache_pos, ++curr_frame) { + std::vector failed_box; + const 
int64 curr_time_msec = cache_pos->first.Value() / 1000; + const int64 curr_duration = + (cache_pos[0].first.Value() - cache_pos[-1].first.Value()) / 1000; + StreamTrack(cache_pos->second, curr_frame, curr_time_msec, curr_duration, + true, // forward + &single_map, &failed_box); + if (!failed_box.empty()) { + LOG(WARNING) << "Unable to fast forward box at frame " << curr_frame; + track_error = true; + break; + } + } + + if (!track_error) { + // Fast forward successful. + if (box_map->find(start_pos.id()) != box_map->end()) { + DLOG(ERROR) << "Fast forward successful, but box with same id " + << "exists already."; + } else { + // Add to set of currently tracked boxes. + const MotionBoxPath& result = single_map.find(start_pos.id())->second; + box_map->emplace(start_pos.id(), result); + } + } + } +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/video/box_tracker_calculator.proto b/mediapipe/calculators/video/box_tracker_calculator.proto new file mode 100644 index 000000000..200108201 --- /dev/null +++ b/mediapipe/calculators/video/box_tracker_calculator.proto @@ -0,0 +1,55 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; +import "mediapipe/util/tracking/box_tracker.proto"; + +message BoxTrackerCalculatorOptions { + extend CalculatorOptions { + optional BoxTrackerCalculatorOptions ext = 268767860; + } + + optional BoxTrackerOptions tracker_options = 1; + + // Initial position to be tracked. Can also be supplied as side packet or + // as input stream. + optional TimedBoxProtoList initial_position = 2; + + // If set and VIZ stream is present, renders tracking data into the + // visualization. + optional bool visualize_tracking_data = 3 [default = false]; + + // If set and VIZ stream is present, renders the box state + // into the visualization. + optional bool visualize_state = 4 [default = false]; + + // If set and VIZ stream is present, renders the internal box state + // into the visualization. + optional bool visualize_internal_state = 5 [default = false]; + + // Size of the track data cache during streaming mode. This allows to buffer + // track_data's for fast forward tracking, i.e. any TimedBox received + // via input stream START_POS can be tracked towards the current track head + // (i.e. last received TrackingData). Measured in number of frames. + optional int32 streaming_track_data_cache_size = 6 [default = 0]; + + // Add a transition period of N frames to smooth the jump from original + // tracking to reset start pos with motion compensation. The transition will + // be a linear decay of original tracking result. 0 means no transition. + optional int32 start_pos_transition_frames = 7 [default = 0]; +} diff --git a/mediapipe/calculators/video/flow_packager_calculator.cc b/mediapipe/calculators/video/flow_packager_calculator.cc new file mode 100644 index 000000000..f871433eb --- /dev/null +++ b/mediapipe/calculators/video/flow_packager_calculator.cc @@ -0,0 +1,281 @@ +// Copyright 2019 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "mediapipe/calculators/video/flow_packager_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/flow_packager.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +using mediapipe::CameraMotion; +using mediapipe::FlowPackager; +using mediapipe::RegionFlowFeatureList; +using mediapipe::TrackingData; +using mediapipe::TrackingDataChunk; + +// A calculator that packages input CameraMotion and RegionFlowFeatureList +// into a TrackingData and optionally writes TrackingDataChunks to file. +// +// Input stream: +// FLOW: Input region flow (proto RegionFlowFeatureList). +// CAMERA: Input camera stream (proto CameraMotion, optional). +// +// Input side packets: +// CACHE_DIR: Optional caching directory tracking files are written to. +// +// Output streams. +// TRACKING: Output tracking data (proto TrackingData, per frame +// optional). +// TRACKING_CHUNK: Output tracking chunks (proto TrackingDataChunk, +// per chunk, optional), output at the first timestamp +// of each chunk. 
+// COMPLETE: Optional output packet sent on PreStream to +// to signal downstream calculators that all data has been +// processed and calculator is closed. Can be used to indicate +// that all data as been written to CACHE_DIR. +class FlowPackagerCalculator : public CalculatorBase { + public: + ~FlowPackagerCalculator() override = default; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + // Writes passed chunk to disk. + void WriteChunk(const TrackingDataChunk& chunk) const; + + // Initializes next chunk for tracking beginning from last frame of + // current chunk (Chunking is design with one frame overlap). + void PrepareCurrentForNextChunk(TrackingDataChunk* chunk); + + private: + FlowPackagerCalculatorOptions options_; + + // Caching options. + bool use_caching_ = false; + bool build_chunk_ = false; + std::string cache_dir_; + int chunk_idx_ = -1; + TrackingDataChunk tracking_chunk_; + + int frame_idx_ = 0; + + Timestamp prev_timestamp_; + std::unique_ptr flow_packager_; +}; + +REGISTER_CALCULATOR(FlowPackagerCalculator); + +::mediapipe::Status FlowPackagerCalculator::GetContract( + CalculatorContract* cc) { + if (!cc->Inputs().HasTag("FLOW")) { + return tool::StatusFail("No input flow was specified."); + } + + cc->Inputs().Tag("FLOW").Set(); + + if (cc->Inputs().HasTag("CAMERA")) { + cc->Inputs().Tag("CAMERA").Set(); + } + if (cc->Outputs().HasTag("TRACKING")) { + cc->Outputs().Tag("TRACKING").Set(); + } + if (cc->Outputs().HasTag("TRACKING_CHUNK")) { + cc->Outputs().Tag("TRACKING_CHUNK").Set(); + } + if (cc->Outputs().HasTag("COMPLETE")) { + cc->Outputs().Tag("COMPLETE").Set(); + } + + if (cc->InputSidePackets().HasTag("CACHE_DIR")) { + cc->InputSidePackets().Tag("CACHE_DIR").Set(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status 
FlowPackagerCalculator::Open(CalculatorContext* cc) { + options_ = cc->Options(); + + flow_packager_.reset(new FlowPackager(options_.flow_packager_options())); + + use_caching_ = cc->InputSidePackets().HasTag("CACHE_DIR"); + build_chunk_ = use_caching_ || cc->Outputs().HasTag("TRACKING_CHUNK"); + if (use_caching_) { + cache_dir_ = cc->InputSidePackets().Tag("CACHE_DIR").Get(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FlowPackagerCalculator::Process(CalculatorContext* cc) { + InputStream* flow_stream = &(cc->Inputs().Tag("FLOW")); + const RegionFlowFeatureList& flow = flow_stream->Get(); + + const Timestamp timestamp = flow_stream->Value().Timestamp(); + + const CameraMotion* camera_motion = nullptr; + if (cc->Inputs().HasTag("CAMERA")) { + InputStream* camera_stream = &(cc->Inputs().Tag("CAMERA")); + camera_motion = &camera_stream->Get(); + } + + std::unique_ptr tracking_data(new TrackingData()); + + flow_packager_->PackFlow(flow, camera_motion, tracking_data.get()); + + if (build_chunk_) { + if (chunk_idx_ < 0) { // Lazy init, determine first start. + chunk_idx_ = + timestamp.Value() / 1000 / options_.caching_chunk_size_msec(); + tracking_chunk_.set_first_chunk(true); + } + CHECK_GE(chunk_idx_, 0); + + TrackingDataChunk::Item* item = tracking_chunk_.add_item(); + item->set_frame_idx(frame_idx_); + item->set_timestamp_usec(timestamp.Value()); + if (frame_idx_ > 0) { + item->set_prev_timestamp_usec(prev_timestamp_.Value()); + } + if (cc->Outputs().HasTag("TRACKING")) { + // Need to copy as output is requested. 
+ *item->mutable_tracking_data() = *tracking_data; + } else { + item->mutable_tracking_data()->Swap(tracking_data.get()); + } + + const int next_chunk_msec = + options_.caching_chunk_size_msec() * (chunk_idx_ + 1); + + if (timestamp.Value() / 1000 >= next_chunk_msec) { + if (cc->Outputs().HasTag("TRACKING_CHUNK")) { + cc->Outputs() + .Tag("TRACKING_CHUNK") + .Add(new TrackingDataChunk(tracking_chunk_), + Timestamp(tracking_chunk_.item(0).timestamp_usec())); + } + if (use_caching_) { + WriteChunk(tracking_chunk_); + } + PrepareCurrentForNextChunk(&tracking_chunk_); + } + } + + if (cc->Outputs().HasTag("TRACKING")) { + cc->Outputs() + .Tag("TRACKING") + .Add(tracking_data.release(), flow_stream->Value().Timestamp()); + } + + prev_timestamp_ = timestamp; + ++frame_idx_; + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FlowPackagerCalculator::Close(CalculatorContext* cc) { + if (frame_idx_ > 0) { + tracking_chunk_.set_last_chunk(true); + if (cc->Outputs().HasTag("TRACKING_CHUNK")) { + cc->Outputs() + .Tag("TRACKING_CHUNK") + .Add(new TrackingDataChunk(tracking_chunk_), + Timestamp(tracking_chunk_.item(0).timestamp_usec())); + } + + if (use_caching_) { + WriteChunk(tracking_chunk_); + } + } + + if (cc->Outputs().HasTag("COMPLETE")) { + cc->Outputs().Tag("COMPLETE").Add(new bool(true), Timestamp::PreStream()); + } + + return ::mediapipe::OkStatus(); +} + +void FlowPackagerCalculator::WriteChunk(const TrackingDataChunk& chunk) const { + if (chunk.item_size() == 0) { + LOG(ERROR) << "Write chunk called with empty tracking data." + << "This can only occur if the spacing between frames " + << "is larger than the requested chunk size. Try increasing " + << "the chunk size"; + return; + } + + auto format_runtime = + absl::ParsedFormat<'d'>::New(options_.cache_file_format()); + + std::string chunk_file; + if (format_runtime) { + chunk_file = + cache_dir_ + "/" + absl::StrFormat(*format_runtime, chunk_idx_); + } else { + LOG(ERROR) << "chache_file_format wrong. 
fall back to chunk_%04d."; + chunk_file = cache_dir_ + "/" + absl::StrFormat("chunk_%04d", chunk_idx_); + } + + std::string data; + chunk.SerializeToString(&data); + + const char* temp_filename = tempnam(cache_dir_.c_str(), nullptr); + std::ofstream out_file(temp_filename); + if (!out_file) { + LOG(ERROR) << "Could not open " << temp_filename; + } else { + out_file.write(data.data(), data.size()); + } + + if (rename(temp_filename, chunk_file.c_str()) != 0) { + LOG(ERROR) << "Failed to rename to " << chunk_file; + } + + LOG(INFO) << "Wrote chunk : " << chunk_file; +} + +void FlowPackagerCalculator::PrepareCurrentForNextChunk( + TrackingDataChunk* chunk) { + CHECK(chunk); + if (chunk->item_size() == 0) { + LOG(ERROR) << "Called with empty chunk. Unexpected."; + return; + } + + chunk->set_first_chunk(false); + + // Buffer last item for next chunk. + TrackingDataChunk::Item last_item; + last_item.Swap(chunk->mutable_item(chunk->item_size() - 1)); + + chunk->Clear(); + chunk->add_item()->Swap(&last_item); + + ++chunk_idx_; +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/video/flow_packager_calculator.proto b/mediapipe/calculators/video/flow_packager_calculator.proto new file mode 100644 index 000000000..c1e91d8c5 --- /dev/null +++ b/mediapipe/calculators/video/flow_packager_calculator.proto @@ -0,0 +1,36 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; +import "mediapipe/util/tracking/flow_packager.proto"; + +message FlowPackagerCalculatorOptions { + extend CalculatorOptions { + optional FlowPackagerCalculatorOptions ext = 271236147; + } + + optional mediapipe.FlowPackagerOptions flow_packager_options = 1; + + // Chunk size for caching files that are written to the externally specified + // caching directory. Specified in msec. + // Note that each chunk always contains at its end the first frame of the + // next chunk (to enable forward tracking across chunk boundaries). + optional int32 caching_chunk_size_msec = 2 [default = 2500]; + + optional string cache_file_format = 3 [default = "chunk_%04d"]; +} diff --git a/mediapipe/calculators/video/motion_analysis_calculator.cc b/mediapipe/calculators/video/motion_analysis_calculator.cc new file mode 100644 index 000000000..6746bc3a9 --- /dev/null +++ b/mediapipe/calculators/video/motion_analysis_calculator.cc @@ -0,0 +1,988 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "mediapipe/calculators/video/motion_analysis_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/formats/image_frame_opencv.h" +#include "mediapipe/framework/formats/video_stream_header.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/util/tracking/camera_motion.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/frame_selection.pb.h" +#include "mediapipe/util/tracking/motion_analysis.h" +#include "mediapipe/util/tracking/motion_estimation.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +using mediapipe::AffineAdapter; +using mediapipe::CameraMotion; +using mediapipe::FrameSelectionResult; +using mediapipe::Homography; +using mediapipe::HomographyAdapter; +using mediapipe::LinearSimilarityModel; +using mediapipe::MixtureHomography; +using mediapipe::MixtureRowWeights; +using mediapipe::MotionAnalysis; +using mediapipe::ProjectViaFit; +using mediapipe::RegionFlowComputationOptions; +using mediapipe::RegionFlowFeatureList; +using mediapipe::SalientPointFrame; +using mediapipe::TranslationModel; + +const char kOptionsTag[] = "OPTIONS"; + +// A calculator that performs motion analysis on an incoming video stream. +// +// Input streams: (at least one of them is required). +// VIDEO: The input video stream (ImageFrame, sRGB, sRGBA or GRAY8). +// SELECTION: Optional input stream to perform analysis only on selected +// frames. If present needs to contain camera motion +// and features. 
+// +// Input side packets: +// CSV_FILE: Read motion models as homographies from CSV file. Expected +// to be defined in the frame domain (un-normalized). +// Should store 9 floats per row. +// Specify number of homographies per frames via option +// meta_models_per_frame. For values > 1, MixtureHomographies +// are created, for value == 1, a single Homography is used. +// DOWNSAMPLE: Optionally specify downsampling factor via input side packet +// overriding value in the graph settings. +// Output streams (all are optional). +// FLOW: Sparse feature tracks in form of proto RegionFlowFeatureList. +// CAMERA: Camera motion as proto CameraMotion describing the per frame- +// pair motion. Has VideoHeader from input video. +// SALIENCY: Foreground saliency (objects moving different from the +// background) as proto SalientPointFrame. +// VIZ: Visualization stream as ImageFrame, sRGB, visualizing +// features and saliency (set via +// analysis_options().visualization_options()) +// DENSE_FG: Dense foreground stream, describing per-pixel foreground- +// ness as confidence between 0 (background) and 255 +// (foreground). Output is ImageFrame (GRAY8). +// VIDEO_OUT: Optional output stream when SELECTION is used. Output is input +// VIDEO at the selected frames. Required VIDEO to be present. +// GRAY_VIDEO_OUT: Optional output stream for downsampled, grayscale video. +// Requires VIDEO to be present and SELECTION to not be used. +class MotionAnalysisCalculator : public CalculatorBase { + // TODO: Activate once leakr approval is ready. 
+ // typedef com::google::android::libraries::micro::proto::Data HomographyData; + + public: + ~MotionAnalysisCalculator() override = default; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + // Outputs results to Outputs() if MotionAnalysis buffered sufficient results. + // Otherwise no-op. Set flush to true to force output of all buffered data. + void OutputMotionAnalyzedFrames(bool flush, CalculatorContext* cc); + + // Lazy init function to be called on Process. + ::mediapipe::Status InitOnProcess(InputStream* video_stream, + InputStream* selection_stream); + + // Parses CSV file contents to homographies. + bool ParseModelCSV(const std::string& contents, + std::deque* homographies); + + // Turns list of 9-tuple floating values into set of homographies. + bool HomographiesFromValues(const std::vector& homog_values, + std::deque* homographies); + + // Appends CameraMotions and features from homographies. + // Set append_identity to true to add an identity transform to the beginning + // of the each list *in addition* to the motions derived from homographies. + void AppendCameraMotionsFromHomographies( + const std::deque& homographies, bool append_identity, + std::deque* camera_motions, + std::deque* features); + + // Helper function to subtract current metadata motion from features. Used + // for hybrid estimation case. + void SubtractMetaMotion(const CameraMotion& meta_motion, + RegionFlowFeatureList* features); + + // Inverse of above function to add back meta motion and replace + // feature location with originals after estimation. 
+ void AddMetaMotion(const CameraMotion& meta_motion, + const RegionFlowFeatureList& meta_features, + RegionFlowFeatureList* features, CameraMotion* motion); + + MotionAnalysisCalculatorOptions options_; + int frame_width_ = -1; + int frame_height_ = -1; + int frame_idx_ = 0; + + // Buffers incoming video frame packets (if visualization output is requested) + std::vector packet_buffer_; + + // Buffers incoming timestamps until MotionAnalysis is ready to output via + // above OutputMotionAnalyzedFrames. + std::vector timestamp_buffer_; + + // Input indicators for each stream. + bool selection_input_ = false; + bool video_input_ = false; + + // Output indicators for each stream. + bool region_flow_feature_output_ = false; + bool camera_motion_output_ = false; + bool saliency_output_ = false; + bool visualize_output_ = false; + bool dense_foreground_output_ = false; + bool video_output_ = false; + bool grayscale_output_ = false; + bool csv_file_input_ = false; + + // Inidicates if saliency should be computed. + bool with_saliency_ = false; + + // Set if hybrid meta analysis - see proto for details. + bool hybrid_meta_analysis_ = false; + + // Concatenated motions for each selected frame. Used in case + // hybrid estimation is requested to fallback to valid models. + std::deque selected_motions_; + + // Normalized homographies from CSV file or metadata. + std::deque meta_homographies_; + std::deque meta_motions_; + std::deque meta_features_; + + // Offset into above meta_motions_ and features_ when using + // hybrid meta analysis. + int hybrid_meta_offset_ = 0; + + std::unique_ptr motion_analysis_; + + std::unique_ptr row_weights_; +}; + +REGISTER_CALCULATOR(MotionAnalysisCalculator); + +::mediapipe::Status MotionAnalysisCalculator::GetContract( + CalculatorContract* cc) { + if (cc->Inputs().HasTag("VIDEO")) { + cc->Inputs().Tag("VIDEO").Set(); + } + + // Optional input stream from frame selection calculator. 
+ if (cc->Inputs().HasTag("SELECTION")) { + cc->Inputs().Tag("SELECTION").Set(); + } + + RET_CHECK(cc->Inputs().HasTag("VIDEO") || cc->Inputs().HasTag("SELECTION")) + << "Either VIDEO, SELECTION must be specified."; + + if (cc->Outputs().HasTag("FLOW")) { + cc->Outputs().Tag("FLOW").Set(); + } + + if (cc->Outputs().HasTag("CAMERA")) { + cc->Outputs().Tag("CAMERA").Set(); + } + + if (cc->Outputs().HasTag("SALIENCY")) { + cc->Outputs().Tag("SALIENCY").Set(); + } + + if (cc->Outputs().HasTag("VIZ")) { + cc->Outputs().Tag("VIZ").Set(); + } + + if (cc->Outputs().HasTag("DENSE_FG")) { + cc->Outputs().Tag("DENSE_FG").Set(); + } + + if (cc->Outputs().HasTag("VIDEO_OUT")) { + cc->Outputs().Tag("VIDEO_OUT").Set(); + } + + if (cc->Outputs().HasTag("GRAY_VIDEO_OUT")) { + // We only output grayscale video if we're actually performing full region- + // flow analysis on the video. + RET_CHECK(cc->Inputs().HasTag("VIDEO") && + !cc->Inputs().HasTag("SELECTION")); + cc->Outputs().Tag("GRAY_VIDEO_OUT").Set(); + } + + if (cc->InputSidePackets().HasTag("CSV_FILE")) { + cc->InputSidePackets().Tag("CSV_FILE").Set(); + } + if (cc->InputSidePackets().HasTag("DOWNSAMPLE")) { + cc->InputSidePackets().Tag("DOWNSAMPLE").Set(); + } + + if (cc->InputSidePackets().HasTag(kOptionsTag)) { + cc->InputSidePackets().Tag(kOptionsTag).Set(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status MotionAnalysisCalculator::Open(CalculatorContext* cc) { + options_ = + tool::RetrieveOptions(cc->Options(), + cc->InputSidePackets(), kOptionsTag); + + video_input_ = cc->Inputs().HasTag("VIDEO"); + selection_input_ = cc->Inputs().HasTag("SELECTION"); + region_flow_feature_output_ = cc->Outputs().HasTag("FLOW"); + camera_motion_output_ = cc->Outputs().HasTag("CAMERA"); + saliency_output_ = cc->Outputs().HasTag("SALIENCY"); + visualize_output_ = cc->Outputs().HasTag("VIZ"); + dense_foreground_output_ = cc->Outputs().HasTag("DENSE_FG"); + video_output_ = cc->Outputs().HasTag("VIDEO_OUT"); + 
grayscale_output_ = cc->Outputs().HasTag("GRAY_VIDEO_OUT"); + csv_file_input_ = cc->InputSidePackets().HasTag("CSV_FILE"); + hybrid_meta_analysis_ = options_.meta_analysis() == + MotionAnalysisCalculatorOptions::META_ANALYSIS_HYBRID; + + if (video_output_) { + RET_CHECK(selection_input_) << "VIDEO_OUT requires SELECTION input"; + } + + if (selection_input_) { + switch (options_.selection_analysis()) { + case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION: + RET_CHECK(!visualize_output_) + << "Visualization not supported for NO_ANALYSIS_USE_SELECTION"; + RET_CHECK(!dense_foreground_output_) + << "Dense foreground not supported for NO_ANALYSIS_USE_SELECTION"; + RET_CHECK(!saliency_output_) + << "Saliency output not supported for NO_ANALYSIS_USE_SELECTION"; + break; + + case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE: + case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED: + RET_CHECK(video_input_) << "Need video input for feature tracking."; + break; + + case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES: + // Nothing to add here. + break; + } + } + + if (visualize_output_ || dense_foreground_output_ || video_output_) { + RET_CHECK(video_input_) << "Video input required."; + } + + if (csv_file_input_) { + RET_CHECK(!selection_input_) + << "Can not use selection input with csv input."; + if (!hybrid_meta_analysis_) { + RET_CHECK(!saliency_output_ && !visualize_output_ && + !dense_foreground_output_ && !grayscale_output_) + << "CSV file and meta input only supports flow and camera motion " + << "output when using metadata only."; + } + } + + if (csv_file_input_) { + // Read from file and parse. 
+ const std::string filename = + cc->InputSidePackets().Tag("CSV_FILE").Get(); + + std::string file_contents; + std::ifstream input_file(filename, std::ios::in); + input_file.seekg(0, std::ios::end); + const int file_length = input_file.tellg(); + file_contents.resize(file_length); + input_file.seekg(0, std::ios::beg); + input_file.read(&file_contents[0], file_length); + input_file.close(); + + RET_CHECK(ParseModelCSV(file_contents, &meta_homographies_)) + << "Could not parse CSV file"; + } + + // Get video header from video or selection input if present. + const VideoHeader* video_header = nullptr; + if (video_input_ && !cc->Inputs().Tag("VIDEO").Header().IsEmpty()) { + video_header = &(cc->Inputs().Tag("VIDEO").Header().Get()); + } else if (selection_input_ && + !cc->Inputs().Tag("SELECTION").Header().IsEmpty()) { + video_header = &(cc->Inputs().Tag("SELECTION").Header().Get()); + } else { + LOG(WARNING) << "No input video header found. Downstream calculators " + "expecting video headers are likely to fail."; + } + + with_saliency_ = options_.analysis_options().compute_motion_saliency(); + // Force computation of saliency if requested as output. + if (cc->Outputs().HasTag("SALIENCY")) { + with_saliency_ = true; + if (!options_.analysis_options().compute_motion_saliency()) { + LOG(WARNING) << "Enable saliency computation. Set " + << "compute_motion_saliency to true to silence this " + << "warning."; + options_.mutable_analysis_options()->set_compute_motion_saliency(true); + } + } + + if (options_.bypass_mode()) { + cc->SetOffset(TimestampDiff(0)); + } + + if (cc->InputSidePackets().HasTag("DOWNSAMPLE")) { + options_.mutable_analysis_options() + ->mutable_flow_options() + ->set_downsample_factor( + cc->InputSidePackets().Tag("DOWNSAMPLE").Get()); + } + + // If no video header is provided, just return and initialize on the first + // Process() call. 
+ if (video_header == nullptr) { + return ::mediapipe::OkStatus(); + } + + ////////////// EARLY RETURN; ONLY HEADER OUTPUT SHOULD GO HERE /////////////// + + if (visualize_output_) { + cc->Outputs().Tag("VIZ").SetHeader(Adopt(new VideoHeader(*video_header))); + } + + if (video_output_) { + cc->Outputs() + .Tag("VIDEO_OUT") + .SetHeader(Adopt(new VideoHeader(*video_header))); + } + + if (cc->Outputs().HasTag("DENSE_FG")) { + std::unique_ptr foreground_header( + new VideoHeader(*video_header)); + foreground_header->format = ImageFormat::GRAY8; + cc->Outputs().Tag("DENSE_FG").SetHeader(Adopt(foreground_header.release())); + } + + if (cc->Outputs().HasTag("CAMERA")) { + cc->Outputs().Tag("CAMERA").SetHeader( + Adopt(new VideoHeader(*video_header))); + } + + if (cc->Outputs().HasTag("SALIENCY")) { + cc->Outputs() + .Tag("SALIENCY") + .SetHeader(Adopt(new VideoHeader(*video_header))); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status MotionAnalysisCalculator::Process(CalculatorContext* cc) { + if (options_.bypass_mode()) { + return ::mediapipe::OkStatus(); + } + + InputStream* video_stream = + video_input_ ? &(cc->Inputs().Tag("VIDEO")) : nullptr; + InputStream* selection_stream = + selection_input_ ? &(cc->Inputs().Tag("SELECTION")) : nullptr; + + // Checked on Open. + CHECK(video_stream || selection_stream); + + // Lazy init. 
+ if (frame_width_ < 0 || frame_height_ < 0) { + MP_RETURN_IF_ERROR(InitOnProcess(video_stream, selection_stream)); + } + + const Timestamp timestamp = cc->InputTimestamp(); + if ((csv_file_input_) && !hybrid_meta_analysis_) { + if (camera_motion_output_) { + RET_CHECK(!meta_motions_.empty()) << "Insufficient metadata."; + + CameraMotion output_motion = meta_motions_.front(); + meta_motions_.pop_front(); + output_motion.set_timestamp_usec(timestamp.Value()); + cc->Outputs().Tag("CAMERA").Add(new CameraMotion(output_motion), + timestamp); + } + + if (region_flow_feature_output_) { + RET_CHECK(!meta_features_.empty()) << "Insufficient frames in CSV file"; + RegionFlowFeatureList output_features = meta_features_.front(); + meta_features_.pop_front(); + + output_features.set_timestamp_usec(timestamp.Value()); + cc->Outputs().Tag("FLOW").Add(new RegionFlowFeatureList(output_features), + timestamp); + } + + ++frame_idx_; + return ::mediapipe::OkStatus(); + } + + if (motion_analysis_ == nullptr) { + // We do not need MotionAnalysis when using just metadata. + motion_analysis_.reset(new MotionAnalysis(options_.analysis_options(), + frame_width_, frame_height_)); + } + + std::unique_ptr frame_selection_result; + // Always use frame if selection is not activated. + bool use_frame = !selection_input_; + if (selection_input_) { + CHECK(selection_stream); + + // Fill in timestamps we process. + if (!selection_stream->Value().IsEmpty()) { + ASSIGN_OR_RETURN( + frame_selection_result, + selection_stream->Value().ConsumeOrCopy()); + use_frame = true; + + // Make sure both features and camera motion are present. + RET_CHECK(frame_selection_result->has_camera_motion() && + frame_selection_result->has_features()) + << "Frame selection input error at: " << timestamp + << " both camera motion and features need to be " + "present in FrameSelectionResult. 
" + << frame_selection_result->has_camera_motion() << " , " + << frame_selection_result->has_features(); + } + } + + if (selection_input_ && use_frame && + options_.selection_analysis() == + MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION) { + // Output concatenated results, nothing to compute here. + if (camera_motion_output_) { + cc->Outputs().Tag("CAMERA").Add( + frame_selection_result->release_camera_motion(), timestamp); + } + if (region_flow_feature_output_) { + cc->Outputs().Tag("FLOW").Add(frame_selection_result->release_features(), + timestamp); + } + + if (video_output_) { + cc->Outputs().Tag("VIDEO_OUT").AddPacket(video_stream->Value()); + } + + return ::mediapipe::OkStatus(); + } + + if (use_frame) { + if (!selection_input_) { + const cv::Mat input_view = + formats::MatView(&video_stream->Get()); + if (hybrid_meta_analysis_) { + // Seed with meta homography. + RET_CHECK(hybrid_meta_offset_ < meta_motions_.size()) + << "Not enough metadata received for hybrid meta analysis"; + Homography initial_transform = + meta_motions_[hybrid_meta_offset_].homography(); + std::function subtract_helper = std::bind( + &MotionAnalysisCalculator::SubtractMetaMotion, this, + meta_motions_[hybrid_meta_offset_], std::placeholders::_1); + + // Keep original features before modification around. 
+ motion_analysis_->AddFrameGeneric( + input_view, timestamp.Value(), initial_transform, nullptr, nullptr, + &subtract_helper, &meta_features_[hybrid_meta_offset_]); + ++hybrid_meta_offset_; + } else { + motion_analysis_->AddFrame(input_view, timestamp.Value()); + } + } else { + selected_motions_.push_back(frame_selection_result->camera_motion()); + switch (options_.selection_analysis()) { + case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION: + return ::mediapipe::UnknownErrorBuilder(MEDIAPIPE_LOC) + << "Should not reach this point!"; + + case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES: + motion_analysis_->AddFeatures(frame_selection_result->features()); + break; + + case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE: { + const cv::Mat input_view = + formats::MatView(&video_stream->Get()); + motion_analysis_->AddFrame(input_view, timestamp.Value()); + break; + } + + case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED: { + Homography homography; + CameraMotionToHomography(frame_selection_result->camera_motion(), + &homography); + const cv::Mat input_view = + formats::MatView(&video_stream->Get()); + motion_analysis_->AddFrameGeneric(input_view, timestamp.Value(), + homography, &homography); + break; + } + } + } + + timestamp_buffer_.push_back(timestamp); + ++frame_idx_; + + VLOG_EVERY_N(0, 100) << "Analyzed frame " << frame_idx_; + + // Buffer input frames only if visualization is requested. 
+ if (visualize_output_ || video_output_) { + packet_buffer_.push_back(video_stream->Value()); + } + + // If requested, output grayscale thumbnails + if (grayscale_output_) { + cv::Mat grayscale_mat = motion_analysis_->GetGrayscaleFrameFromResults(); + std::unique_ptr grayscale_image(new ImageFrame( + ImageFormat::GRAY8, grayscale_mat.cols, grayscale_mat.rows)); + cv::Mat image_frame_mat = formats::MatView(grayscale_image.get()); + grayscale_mat.copyTo(image_frame_mat); + + cc->Outputs() + .Tag("GRAY_VIDEO_OUT") + .Add(grayscale_image.release(), timestamp); + } + + // Output other results, if we have any yet. + OutputMotionAnalyzedFrames(false, cc); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status MotionAnalysisCalculator::Close(CalculatorContext* cc) { + // Guard against empty videos. + if (motion_analysis_) { + OutputMotionAnalyzedFrames(true, cc); + } + if (csv_file_input_) { + if (!meta_motions_.empty()) { + LOG(ERROR) << "More motions than frames. Unexpected! Remainder: " + << meta_motions_.size(); + } + } + return ::mediapipe::OkStatus(); +} + +void MotionAnalysisCalculator::OutputMotionAnalyzedFrames( + bool flush, CalculatorContext* cc) { + std::vector> features; + std::vector> camera_motions; + std::vector> saliency; + + const int buffer_size = timestamp_buffer_.size(); + const int num_results = motion_analysis_->GetResults( + flush, &features, &camera_motions, with_saliency_ ? &saliency : nullptr); + + CHECK_LE(num_results, buffer_size); + + if (num_results == 0) { + return; + } + + for (int k = 0; k < num_results; ++k) { + // Region flow features and camera motion for this frame. + auto& feature_list = features[k]; + auto& camera_motion = camera_motions[k]; + const Timestamp timestamp = timestamp_buffer_[k]; + + if (selection_input_ && options_.hybrid_selection_camera()) { + if (camera_motion->type() > selected_motions_.front().type()) { + // Composited type is more stable. 
+ camera_motion->Swap(&selected_motions_.front()); + } + selected_motions_.pop_front(); + } + + if (hybrid_meta_analysis_) { + AddMetaMotion(meta_motions_.front(), meta_features_.front(), + feature_list.get(), camera_motion.get()); + meta_motions_.pop_front(); + meta_features_.pop_front(); + } + + // Video frame for visualization. + std::unique_ptr visualization_frame; + cv::Mat visualization; + if (visualize_output_) { + // Initialize visualization frame with original frame. + visualization_frame.reset(new ImageFrame()); + visualization_frame->CopyFrom(packet_buffer_[k].Get(), 16); + visualization = formats::MatView(visualization_frame.get()); + + motion_analysis_->RenderResults( + *feature_list, *camera_motion, + with_saliency_ ? saliency[k].get() : nullptr, &visualization); + + cc->Outputs().Tag("VIZ").Add(visualization_frame.release(), timestamp); + } + + // Output dense foreground mask. + if (dense_foreground_output_) { + std::unique_ptr foreground_frame( + new ImageFrame(ImageFormat::GRAY8, frame_width_, frame_height_)); + cv::Mat foreground = formats::MatView(foreground_frame.get()); + motion_analysis_->ComputeDenseForeground(*feature_list, *camera_motion, + &foreground); + cc->Outputs().Tag("DENSE_FG").Add(foreground_frame.release(), timestamp); + } + + // Output flow features if requested. + if (region_flow_feature_output_) { + cc->Outputs().Tag("FLOW").Add(feature_list.release(), timestamp); + } + + // Output camera motion. + if (camera_motion_output_) { + cc->Outputs().Tag("CAMERA").Add(camera_motion.release(), timestamp); + } + + if (video_output_) { + cc->Outputs().Tag("VIDEO_OUT").AddPacket(packet_buffer_[k]); + } + + // Output saliency. 
+ if (saliency_output_) { + cc->Outputs().Tag("SALIENCY").Add(saliency[k].release(), timestamp); + } + } + + if (hybrid_meta_analysis_) { + hybrid_meta_offset_ -= num_results; + CHECK_GE(hybrid_meta_offset_, 0); + } + + timestamp_buffer_.erase(timestamp_buffer_.begin(), + timestamp_buffer_.begin() + num_results); + + if (visualize_output_ || video_output_) { + packet_buffer_.erase(packet_buffer_.begin(), + packet_buffer_.begin() + num_results); + } +} + +::mediapipe::Status MotionAnalysisCalculator::InitOnProcess( + InputStream* video_stream, InputStream* selection_stream) { + if (video_stream) { + frame_width_ = video_stream->Get().Width(); + frame_height_ = video_stream->Get().Height(); + + // Ensure image options are set correctly. + auto* region_options = + options_.mutable_analysis_options()->mutable_flow_options(); + + // Use two possible formats to account for different channel orders. + RegionFlowComputationOptions::ImageFormat image_format; + RegionFlowComputationOptions::ImageFormat image_format2; + switch (video_stream->Get().Format()) { + case ImageFormat::GRAY8: + image_format = image_format2 = + RegionFlowComputationOptions::FORMAT_GRAYSCALE; + break; + + case ImageFormat::SRGB: + image_format = RegionFlowComputationOptions::FORMAT_RGB; + image_format2 = RegionFlowComputationOptions::FORMAT_BGR; + break; + + case ImageFormat::SRGBA: + image_format = RegionFlowComputationOptions::FORMAT_RGBA; + image_format2 = RegionFlowComputationOptions::FORMAT_BGRA; + break; + + default: + RET_CHECK(false) << "Unsupported image format."; + } + if (region_options->image_format() != image_format && + region_options->image_format() != image_format2) { + LOG(WARNING) << "Requested image format in RegionFlowComputation " + << "does not match video stream format. Overriding."; + region_options->set_image_format(image_format); + } + + // Account for downsampling mode INPUT_SIZE. 
In this case we are handed
+    // already downsampled frames but the resulting CameraMotion should
+    // be computed on higher resolution as specified by the downsample scale.
+    if (region_options->downsample_mode() ==
+        RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) {
+      const float scale = region_options->downsample_factor();
+      frame_width_ = static_cast<int>(std::round(frame_width_ * scale));
+      frame_height_ = static_cast<int>(std::round(frame_height_ * scale));
+    }
+  } else if (selection_stream) {
+    const auto& camera_motion =
+        selection_stream->Get<FrameSelectionResult>().camera_motion();
+    frame_width_ = camera_motion.frame_width();
+    frame_height_ = camera_motion.frame_height();
+  } else {
+    LOG(FATAL) << "Either VIDEO or SELECTION stream need to be specified.";
+  }
+
+  // Filled by CSV file parsing.
+  if (!meta_homographies_.empty()) {
+    CHECK(csv_file_input_);
+    AppendCameraMotionsFromHomographies(meta_homographies_,
+                                        true,  // append identity.
+                                        &meta_motions_, &meta_features_);
+    meta_homographies_.clear();
+  }
+
+  // Filter weights before using for hybrid mode.
+  if (hybrid_meta_analysis_) {
+    auto* motion_options =
+        options_.mutable_analysis_options()->mutable_motion_options();
+    motion_options->set_filter_initialized_irls_weights(true);
+  }
+
+  return ::mediapipe::OkStatus();
+}
+
+bool MotionAnalysisCalculator::ParseModelCSV(
+    const std::string& contents, std::deque<Homography>* homographies) {
+  std::vector<std::string> values =
+      absl::StrSplit(contents, absl::ByAnyChar(",\n"));
+
+  // Trim off any empty lines.
+  while (values.back().empty()) {
+    values.pop_back();
+  }
+
+  // Convert to float.
+ std::vector homog_values; + homog_values.reserve(values.size()); + + for (const auto& value : values) { + double value_64f; + if (!absl::SimpleAtod(value, &value_64f)) { + LOG(ERROR) << "Not a double, expected!"; + return false; + } + + homog_values.push_back(value_64f); + } + + return HomographiesFromValues(homog_values, homographies); +} + +bool MotionAnalysisCalculator::HomographiesFromValues( + const std::vector& homog_values, + std::deque* homographies) { + CHECK(homographies); + + // Obvious constants are obvious :D + constexpr int kHomographyValues = 9; + if (homog_values.size() % kHomographyValues != 0) { + LOG(ERROR) << "Contents not a multiple of " << kHomographyValues; + return false; + } + + for (int k = 0; k < homog_values.size(); k += kHomographyValues) { + std::vector h_vals(kHomographyValues); + for (int l = 0; l < kHomographyValues; ++l) { + h_vals[l] = homog_values[k + l]; + } + + // Normalize last entry to 1. + if (h_vals[kHomographyValues - 1] == 0) { + LOG(ERROR) << "Degenerate homography, last entry is zero"; + return false; + } + + const double scale = 1.0f / h_vals[kHomographyValues - 1]; + for (int l = 0; l < kHomographyValues; ++l) { + h_vals[l] *= scale; + } + + Homography h = HomographyAdapter::FromDoublePointer(h_vals.data(), false); + homographies->push_back(h); + } + + if (homographies->size() % options_.meta_models_per_frame() != 0) { + LOG(ERROR) << "Total homographies not a multiple of specified models " + << "per frame."; + return false; + } + + return true; +} + +void MotionAnalysisCalculator::SubtractMetaMotion( + const CameraMotion& meta_motion, RegionFlowFeatureList* features) { + if (meta_motion.mixture_homography().model_size() > 0) { + CHECK(row_weights_ != nullptr); + RegionFlowFeatureListViaTransform(meta_motion.mixture_homography(), + features, -1.0f, + 1.0f, // subtract transformed. + true, // replace feature loc. 
+ row_weights_.get()); + } else { + RegionFlowFeatureListViaTransform(meta_motion.homography(), features, -1.0f, + 1.0f, // subtract transformed. + true); // replace feature loc. + } + + // Clamp transformed features to domain and handle outliers. + const float domain_diam = + hypot(features->frame_width(), features->frame_height()); + const float motion_mag = meta_motion.average_magnitude(); + // Same irls fraction as used by MODEL_MIXTURE_HOMOGRAPHY scaling in + // MotionEstimation. + const float irls_fraction = options_.analysis_options() + .motion_options() + .irls_mixture_fraction_scale() * + options_.analysis_options() + .motion_options() + .irls_motion_magnitude_fraction(); + float err_scale = std::max(1.0f, motion_mag * irls_fraction); + + const float max_err = + options_.meta_outlier_domain_ratio() * domain_diam * err_scale; + const float max_err_sq = max_err * max_err; + + for (auto& feature : *features->mutable_feature()) { + feature.set_x( + std::max(0.0f, std::min(features->frame_width() - 1.0f, feature.x()))); + feature.set_y( + std::max(0.0f, std::min(features->frame_height() - 1.0f, feature.y()))); + // Label anything with large residual motion an outlier. + if (FeatureFlow(feature).Norm2() > max_err_sq) { + feature.set_irls_weight(0.0f); + } + } +} + +void MotionAnalysisCalculator::AddMetaMotion( + const CameraMotion& meta_motion, const RegionFlowFeatureList& meta_features, + RegionFlowFeatureList* features, CameraMotion* motion) { + // Restore old feature location. + CHECK_EQ(meta_features.feature_size(), features->feature_size()); + for (int k = 0; k < meta_features.feature_size(); ++k) { + auto feature = features->mutable_feature(k); + const auto& meta_feature = meta_features.feature(k); + feature->set_x(meta_feature.x()); + feature->set_y(meta_feature.y()); + feature->set_dx(meta_feature.dx()); + feature->set_dy(meta_feature.dy()); + } + + // Composite camera motion. 
+ *motion = ComposeCameraMotion(*motion, meta_motion); + // Restore type from metadata, i.e. do not declare motions as invalid. + motion->set_type(meta_motion.type()); + motion->set_match_frame(-1); +} + +void MotionAnalysisCalculator::AppendCameraMotionsFromHomographies( + const std::deque& homographies, bool append_identity, + std::deque* camera_motions, + std::deque* features) { + CHECK(camera_motions); + CHECK(features); + + CameraMotion identity; + identity.set_frame_width(frame_width_); + identity.set_frame_height(frame_height_); + + *identity.mutable_translation() = TranslationModel(); + *identity.mutable_linear_similarity() = LinearSimilarityModel(); + *identity.mutable_homography() = Homography(); + identity.set_type(CameraMotion::VALID); + identity.set_match_frame(0); + + RegionFlowFeatureList empty_list; + empty_list.set_long_tracks(true); + empty_list.set_match_frame(-1); + empty_list.set_frame_width(frame_width_); + empty_list.set_frame_height(frame_height_); + + if (append_identity) { + camera_motions->push_back(identity); + features->push_back(empty_list); + } + + const int models_per_frame = options_.meta_models_per_frame(); + CHECK_GT(models_per_frame, 0) << "At least one model per frame is needed"; + CHECK_EQ(0, homographies.size() % models_per_frame); + const int num_frames = homographies.size() / models_per_frame; + + // Heuristic sigma, similar to what we use for rolling shutter removal. 
+  const float mixture_sigma = 1.0f / models_per_frame;
+
+  if (row_weights_ == nullptr) {
+    row_weights_.reset(new MixtureRowWeights(frame_height_,
+                                             frame_height_ / 10,  // 10% margin
+                                             mixture_sigma * frame_height_,
+                                             1.0f, models_per_frame));
+  }
+
+  for (int f = 0; f < num_frames; ++f) {
+    MixtureHomography mix_homog;
+    const int model_start = f * models_per_frame;
+
+    for (int k = 0; k < models_per_frame; ++k) {
+      const Homography& homog = homographies[model_start + k];
+      *mix_homog.add_model() = ModelInvert(homog);
+    }
+
+    CameraMotion c = identity;
+    c.set_match_frame(-1);
+
+    if (mix_homog.model_size() > 1) {
+      *c.mutable_mixture_homography() = mix_homog;
+      c.set_mixture_row_sigma(mixture_sigma);
+
+      for (int k = 0; k < models_per_frame; ++k) {
+        c.add_mixture_inlier_coverage(1.0f);
+      }
+      *c.add_mixture_homography_spectrum() = mix_homog;
+      c.set_rolling_shutter_motion_index(0);
+
+      *c.mutable_homography() = ProjectViaFit<Homography>(
+          mix_homog, frame_width_, frame_height_, row_weights_.get());
+    } else {
+      // Guaranteed to exist because we check that models_per_frame > 0 above.
+      *c.mutable_homography() = mix_homog.model(0);
+    }
+
+    // Project remaining motions down.
+    *c.mutable_linear_similarity() = ProjectViaFit<LinearSimilarityModel>(
+        c.homography(), frame_width_, frame_height_);
+    *c.mutable_translation() = ProjectViaFit<TranslationModel>(
+        c.homography(), frame_width_, frame_height_);
+
+    c.set_average_magnitude(
+        std::hypot(c.translation().dx(), c.translation().dy()));
+
+    camera_motions->push_back(c);
+    features->push_back(empty_list);
+  }
+}
+
+}  // namespace mediapipe
diff --git a/mediapipe/calculators/video/motion_analysis_calculator.proto b/mediapipe/calculators/video/motion_analysis_calculator.proto
new file mode 100644
index 000000000..a2af77d12
--- /dev/null
+++ b/mediapipe/calculators/video/motion_analysis_calculator.proto
@@ -0,0 +1,111 @@
+// Copyright 2019 The MediaPipe Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; +import "mediapipe/util/tracking/motion_analysis.proto"; + +// Next tag: 10 +message MotionAnalysisCalculatorOptions { + extend CalculatorOptions { + optional MotionAnalysisCalculatorOptions ext = 270698255; + } + + optional mediapipe.MotionAnalysisOptions analysis_options = 1; + + // Determines how optional input SELECTION (if present) is used to compute + // the final camera motion. + enum SelectionAnalysis { + // Recompute camera motion for selected frame neighbors. + ANALYSIS_RECOMPUTE = 1; + + // Use composited camera motion and region flow from SELECTION input. No + // tracking or re-computation is performed. + // Note that in this case only CAMERA, FLOW and VIDEO_OUT tags are + // supported as output. + NO_ANALYSIS_USE_SELECTION = 2; + + // Recompute camera motion for selected frame neighbors using + // features supplied by SELECTION input. No feature tracking is performed. + ANALYSIS_FROM_FEATURES = 3; + + // Recomputes camera motion for selected frame neighbors but seeds + // initial transform with camera motion from SELECTION input. 
+ ANALYSIS_WITH_SEED = 4; + } + + optional SelectionAnalysis selection_analysis = 4 + [default = ANALYSIS_WITH_SEED]; + + // If activated when SELECTION input is activated, will replace the computed + // camera motion (for any of the ANALYSIS_* case above) with the one supplied + // by the frame selection, in case the frame selection one is more stable. + // For example, if recomputed camera motion is unstable but the one from + // the selection result is stable, will use the stable result instead. + optional bool hybrid_selection_camera = 5 [default = false]; + + // Determines how optional input META is used to compute the final camera + // motion. + enum MetaAnalysis { + // Uses metadata supplied motions as is. + META_ANALYSIS_USE_META = 1; + + // Seeds visual tracking from metadata motions - estimates visual residual + // motion and combines with metadata. + META_ANALYSIS_HYBRID = 2; + } + + optional MetaAnalysis meta_analysis = 8 [default = META_ANALYSIS_USE_META]; + + // Determines number of homography models per frame stored in the CSV file + // or the homography metadata in META. + // For values > 1, MixtureHomographies are created. + optional int32 meta_models_per_frame = 6 [default = 1]; + + // Used for META_ANALYSIS_HYBRID. Rejects features which flow deviates + // domain_ratio * image diagonal size from the ground truth metadata motion. + optional float meta_outlier_domain_ratio = 9 [default = 0.0015]; + + // If true, the MotionAnalysisCalculator will skip all processing and emit no + // packets on any output. This is useful for quickly creating different + // versions of a MediaPipe graph without changing its structure, assuming that + // downstream calculators can handle missing input packets. + // TODO: Remove this hack. See b/36485206 for more details. 
+ optional bool bypass_mode = 7 [default = false]; +} + +// Taken from +// java/com/google/android/libraries/microvideo/proto/microvideo.proto to +// satisfy leakr requirements +// TODO: Remove and use above proto. +message HomographyData { + // For each frame, there are 12 homography matrices stored. Each matrix is + // 3x3 (9 elements). This field will contain 12 x 3 x 3 float values. The + // first row of the first homography matrix will be followed by the second row + // of the first homography matrix, followed by third row of first homography + // matrix, followed by the first row of the second homography matrix, etc. + repeated float motion_homography_data = 1 [packed = true]; + + // Vector containing histogram counts for individual patches in the frame. + repeated uint32 histogram_count_data = 2 [packed = true]; + + // The width of the frame at the time metadata was sampled. + optional int32 frame_width = 3; + + // The height of the frame at the time metadata was sampled. + optional int32 frame_height = 4; +} diff --git a/mediapipe/calculators/video/testdata/lenna.png b/mediapipe/calculators/video/testdata/lenna.png new file mode 100644 index 000000000..bd247e8a4 Binary files /dev/null and b/mediapipe/calculators/video/testdata/lenna.png differ diff --git a/mediapipe/calculators/video/testdata/parallel_tracker_graph.pbtxt b/mediapipe/calculators/video/testdata/parallel_tracker_graph.pbtxt new file mode 100644 index 000000000..f3fe5b424 --- /dev/null +++ b/mediapipe/calculators/video/testdata/parallel_tracker_graph.pbtxt @@ -0,0 +1,134 @@ +input_stream: "image_cpu_frames" +input_stream: "start_pos" +input_stream: "ra_track" + +num_threads: 4 + +node: { + calculator: "MotionAnalysisCalculator" + input_stream: "VIDEO:image_cpu_frames" + output_stream: "CAMERA:camera_motion" + output_stream: "FLOW:region_flow" + + options: { + [mediapipe.MotionAnalysisCalculatorOptions.ext]: { + analysis_options: { + analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE + + 
flow_options: { + # Maybe move down to 50 + fast_estimation_min_block_size: 100 + top_inlier_sets: 1 + frac_inlier_error_threshold: 3e-3 + downsample_mode: DOWNSAMPLE_NONE + verification_distance: 5.0 + verify_long_feature_acceleration: true + verify_long_feature_trigger_ratio: 0.1 + tracking_options: { + max_features: 500 + adaptive_extraction_levels: 2 + min_eig_val_settings: { + adaptive_lowest_quality_level: 2e-4 + } + klt_tracker_implementation: KLT_OPENCV + } + } + + motion_options: { + label_empty_frames_as_valid: false + } + } + } + } +} + +node: { + calculator: "FlowPackagerCalculator" + input_stream: "FLOW:region_flow" + input_stream: "CAMERA:camera_motion" + output_stream: "TRACKING:tracking_data" + + options: { + [mediapipe.FlowPackagerCalculatorOptions.ext]: { + flow_packager_options: { + binary_tracking_data_support: false + } + } + } +} + +node: { + calculator: "BoxTrackerCalculator" + input_stream: "TRACKING:tracking_data" + input_stream: "START_POS:start_pos" + output_stream: "BOXES:boxes" + input_side_packet: "OPTIONS:calculator_options" + + input_stream_handler: { + input_stream_handler: "SyncSetInputStreamHandler" + options: { + [mediapipe.SyncSetInputStreamHandlerOptions.ext]: { + sync_set: { + tag_index: "TRACKING" + } + sync_set: { + tag_index: "START_POS" + } + } + } + } + + options: { + [mediapipe.BoxTrackerCalculatorOptions.ext]: { + tracker_options: { + track_step_options: { + track_object_and_camera: true + tracking_degrees: TRACKING_DEGREE_OBJECT_PERSPECTIVE + object_similarity_min_contd_inliers: 6 + inlier_spring_force: 0.0 + static_motion_temporal_ratio: 3e-2 + } + } + visualize_tracking_data: false + streaming_track_data_cache_size: 100 + } + } +} + +node: { + calculator: "BoxTrackerCalculator" + input_stream: "TRACKING:tracking_data" + input_stream: "RA_TRACK:ra_track" + output_stream: "RA_BOXES:ra_boxes" + input_side_packet: "OPTIONS:calculator_options" + + input_stream_handler: { + input_stream_handler: "SyncSetInputStreamHandler" 
+ options: { + [mediapipe.SyncSetInputStreamHandlerOptions.ext]: { + sync_set: { + tag_index: "TRACKING" + } + sync_set: { + tag_index: "RA_TRACK" + } + } + } + } + + options: { + [mediapipe.BoxTrackerCalculatorOptions.ext]: { + tracker_options: { + track_step_options: { + track_object_and_camera: true + tracking_degrees: TRACKING_DEGREE_OBJECT_PERSPECTIVE + object_similarity_min_contd_inliers: 6 + inlier_spring_force: 0.0 + static_motion_temporal_ratio: 3e-2 + } + } + visualize_tracking_data: false + streaming_track_data_cache_size: 100 + } + } +} diff --git a/mediapipe/calculators/video/testdata/tracker_graph.pbtxt b/mediapipe/calculators/video/testdata/tracker_graph.pbtxt new file mode 100644 index 000000000..c86d50136 --- /dev/null +++ b/mediapipe/calculators/video/testdata/tracker_graph.pbtxt @@ -0,0 +1,122 @@ +input_stream: "image_cpu_frames" +input_stream: "start_pos" +input_stream: "cancel_object_id" +input_stream: "ra_track" +input_stream: "restart_pos" +input_stream: "track_time" + +num_threads: 4 + +node: { + calculator: "MotionAnalysisCalculator" + options: { + [mediapipe.MotionAnalysisCalculatorOptions.ext]: { + analysis_options: { + analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE + + flow_options: { + # Maybe move down to 50 + fast_estimation_min_block_size: 100 + top_inlier_sets: 1 + frac_inlier_error_threshold: 3e-3 + # For mobile application, downsample before input into graph + # and use DOWNSAMPLE_TO_INPUT_SIZE and specify + # downsampling_factor option or DOWNSAMPLE input_side_packet + downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE + verification_distance: 5.0 + verify_long_feature_acceleration: true + verify_long_feature_trigger_ratio: 0.1 + tracking_options: { + max_features: 500 + corner_extraction_method: EXTRACTION_FAST + adaptive_extraction_levels: 2 + min_eig_val_settings: { + adaptive_lowest_quality_level: 2e-4 + } + klt_tracker_implementation: KLT_OPENCV + } + } + } + } + } + + # Drops packets if calculator cannot keep up with the input rate. 
+ input_stream_handler: { + input_stream_handler: "FixedSizeInputStreamHandler" + } + + input_stream: "VIDEO:image_cpu_frames" + input_side_packet: "DOWNSAMPLE:analysis_downsample_factor" + input_side_packet: "OPTIONS:calculator_options" + output_stream: "CAMERA:camera_motion" + output_stream: "FLOW:region_flow" +} + +node: { + calculator: "FlowPackagerCalculator" + + input_stream: "FLOW:region_flow" + input_stream: "CAMERA:camera_motion" + output_stream: "TRACKING:tracking_data" + options: { + [mediapipe.FlowPackagerCalculatorOptions.ext]: { + flow_packager_options: { + binary_tracking_data_support: false + } + } + } +} + +node: { + calculator: "BoxTrackerCalculator" + + input_side_packet: "OPTIONS:calculator_options" + input_stream: "TRACKING:tracking_data" + input_stream: "TRACK_TIME:track_time" + input_stream: "START_POS:start_pos" + input_stream: "RESTART_POS:restart_pos" + input_stream: "CANCEL_OBJECT_ID:cancel_object_id" + input_stream: "RA_TRACK:ra_track" + output_stream: "BOXES:boxes" + output_stream: "RA_BOXES:ra_boxes" + + input_stream_handler: { + input_stream_handler: "SyncSetInputStreamHandler" + options: { + [mediapipe.SyncSetInputStreamHandlerOptions.ext]: { + sync_set: { + tag_index: "TRACKING" + tag_index: "TRACK_TIME" + } + sync_set: { + tag_index: "START_POS" + } + sync_set: { + tag_index: "RESTART_POS" + } + sync_set: { + tag_index: "CANCEL_OBJECT_ID" + } + sync_set: { + tag_index: "RA_TRACK" + } + } + } + } + + options: { + [mediapipe.BoxTrackerCalculatorOptions.ext]: { + tracker_options: { + track_step_options: { + track_object_and_camera: true + tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE + inlier_spring_force: 0.0 + static_motion_temporal_ratio: 3e-2 + object_similarity_min_contd_inliers: 10 + } + } + visualize_tracking_data: false + streaming_track_data_cache_size: 100 + } + } +} diff --git a/mediapipe/calculators/video/tracked_detection_manager_calculator.cc b/mediapipe/calculators/video/tracked_detection_manager_calculator.cc 
new file mode 100644 index 000000000..749647d17 --- /dev/null +++ b/mediapipe/calculators/video/tracked_detection_manager_calculator.cc @@ -0,0 +1,319 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "absl/container/node_hash_map.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/formats/location_data.pb.h" +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/util/tracking/box_tracker.h" +#include "mediapipe/util/tracking/tracked_detection.h" +#include "mediapipe/util/tracking/tracked_detection_manager.h" +#include "mediapipe/util/tracking/tracking.h" + +namespace mediapipe { +namespace { + +constexpr int kDetectionUpdateTimeOutMS = 5000; +constexpr char kDetectionsTag[] = "DETECTIONS"; +constexpr char kDetectionBoxesTag[] = "DETECTION_BOXES"; +constexpr char kDetectionListTag[] = "DETECTION_LIST"; +constexpr char kTrackingBoxesTag[] = "TRACKING_BOXES"; +constexpr char kCancelObjectIdTag[] = "CANCEL_OBJECT_ID"; + +// Move |src| to the back of |dst|. 
+void MoveIds(std::vector* dst, std::vector src) { + dst->insert(dst->end(), std::make_move_iterator(src.begin()), + std::make_move_iterator(src.end())); +} + +int64 GetInputTimestampMs(::mediapipe::CalculatorContext* cc) { + return cc->InputTimestamp().Microseconds() / 1000; // 1 ms = 1000 us. +} + +// Converts a Mediapipe Detection Proto to a TrackedDetection class. +std::unique_ptr GetTrackedDetectionFromDetection( + const Detection& detection, int64 timestamp) { + std::unique_ptr tracked_detection = + absl::make_unique(detection.detection_id(), timestamp); + const float top = detection.location_data().relative_bounding_box().ymin(); + const float bottom = + detection.location_data().relative_bounding_box().ymin() + + detection.location_data().relative_bounding_box().height(); + const float left = detection.location_data().relative_bounding_box().xmin(); + const float right = detection.location_data().relative_bounding_box().xmin() + + detection.location_data().relative_bounding_box().width(); + NormalizedRect bounding_box; + bounding_box.set_x_center((left + right) / 2.f); + bounding_box.set_y_center((top + bottom) / 2.f); + bounding_box.set_height(bottom - top); + bounding_box.set_width(right - left); + tracked_detection->set_bounding_box(bounding_box); + + for (int i = 0; i < detection.label_size(); ++i) { + tracked_detection->AddLabel(detection.label(i), detection.score(i)); + } + return tracked_detection; +} + +// Converts a TrackedDetection class to a Mediapipe Detection Proto. 
+Detection GetAxisAlignedDetectionFromTrackedDetection( + const TrackedDetection& tracked_detection) { + Detection detection; + LocationData* location_data = detection.mutable_location_data(); + + auto corners = tracked_detection.GetCorners(); + + float x_min = std::numeric_limits::max(); + float x_max = std::numeric_limits::min(); + float y_min = std::numeric_limits::max(); + float y_max = std::numeric_limits::min(); + for (int i = 0; i < 4; ++i) { + x_min = std::min(x_min, corners[i].x()); + x_max = std::max(x_max, corners[i].x()); + y_min = std::min(y_min, corners[i].y()); + y_max = std::max(y_max, corners[i].y()); + } + location_data->set_format(LocationData::RELATIVE_BOUNDING_BOX); + LocationData::RelativeBoundingBox* relative_bbox = + location_data->mutable_relative_bounding_box(); + relative_bbox->set_xmin(x_min); + relative_bbox->set_ymin(y_min); + relative_bbox->set_width(x_max - x_min); + relative_bbox->set_height(y_max - y_min); + + // Use previous id which is the id the object when it's first detected. + if (tracked_detection.previous_id() > 0) { + detection.set_detection_id(tracked_detection.previous_id()); + } else { + detection.set_detection_id(tracked_detection.unique_id()); + } + for (const auto& label_and_score : tracked_detection.label_to_score_map()) { + detection.add_label(label_and_score.first); + detection.add_score(label_and_score.second); + } + return detection; +} + +} // namespace + +// TrackedDetectionManagerCalculator accepts detections and tracking results at +// different frame rate for real time tracking of targets. +// Input: +// DETECTIONS: A vector of newly detected targets. +// TRACKING_BOXES: A TimedBoxProtoList which contains a list of tracked boxes +// from previous detections. +// +// Output: +// CANCEL_OBJECT_ID: Ids of targets that are missing/lost such that it should +// be removed from tracking. +// DETECTIONS: List of detections that are being tracked. 
+// DETECTION_BOXES: List of bounding boxes of detections that are being +// tracked. +// +// Usage example: +// node { +// calculator: "TrackedDetectionManagerCalculator" +// input_stream: "DETECTIONS:detections" +// input_stream: "TRACKING_BOXES:boxes" +// output_stream: "CANCEL_OBJECT_ID:cancel_object_id" +// output_stream: "DETECTIONS:output_detections" +// } +class TrackedDetectionManagerCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + // Adds new list of detections to |waiting_for_update_detections_|. + void AddDetectionList(const DetectionList& detection_list, + CalculatorContext* cc); + void AddDetections(const std::vector& detections, + CalculatorContext* cc); + + // Manages existing and new detections. + TrackedDetectionManager tracked_detection_manager_; + + // Set of detections that are not up to date yet. These detections will be + // added to the detection manager until they got updated from the box tracker. 
+ absl::node_hash_map> + waiting_for_update_detections_; +}; +REGISTER_CALCULATOR(TrackedDetectionManagerCalculator); + +::mediapipe::Status TrackedDetectionManagerCalculator::GetContract( + CalculatorContract* cc) { + if (cc->Inputs().HasTag(kDetectionsTag)) { + cc->Inputs().Tag(kDetectionsTag).Set>(); + } + if (cc->Inputs().HasTag(kDetectionListTag)) { + cc->Inputs().Tag(kDetectionListTag).Set(); + } + if (cc->Inputs().HasTag(kTrackingBoxesTag)) { + cc->Inputs().Tag(kTrackingBoxesTag).Set(); + } + + if (cc->Outputs().HasTag(kCancelObjectIdTag)) { + cc->Outputs().Tag(kCancelObjectIdTag).Set(); + } + if (cc->Outputs().HasTag(kDetectionsTag)) { + cc->Outputs().Tag(kDetectionsTag).Set>(); + } + if (cc->Outputs().HasTag(kDetectionBoxesTag)) { + cc->Outputs().Tag(kDetectionBoxesTag).Set>(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TrackedDetectionManagerCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().HasTag("TRACKING_BOXES")) { + if (!cc->Inputs().Tag("TRACKING_BOXES").IsEmpty()) { + const TimedBoxProtoList& tracked_boxes = + cc->Inputs().Tag("TRACKING_BOXES").Get(); + + // Collect all detections that are removed. + auto removed_detection_ids = absl::make_unique>(); + for (const TimedBoxProto& tracked_box : tracked_boxes.box()) { + NormalizedRect bounding_box; + bounding_box.set_x_center((tracked_box.left() + tracked_box.right()) / + 2.f); + bounding_box.set_y_center((tracked_box.bottom() + tracked_box.top()) / + 2.f); + bounding_box.set_height(tracked_box.bottom() - tracked_box.top()); + bounding_box.set_width(tracked_box.right() - tracked_box.left()); + bounding_box.set_rotation(tracked_box.rotation()); + // First check if this box updates a detection that's waiting for + // update from the tracker. 
+ auto waiting_for_update_detectoin_ptr = + waiting_for_update_detections_.find(tracked_box.id()); + if (waiting_for_update_detectoin_ptr != + waiting_for_update_detections_.end()) { + // Add the detection and remove duplicated detections. + auto removed_ids = tracked_detection_manager_.AddDetection( + std::move(waiting_for_update_detectoin_ptr->second)); + MoveIds(removed_detection_ids.get(), std::move(removed_ids)); + + waiting_for_update_detections_.erase( + waiting_for_update_detectoin_ptr); + } + auto removed_ids = tracked_detection_manager_.UpdateDetectionLocation( + tracked_box.id(), bounding_box, tracked_box.time_msec()); + MoveIds(removed_detection_ids.get(), std::move(removed_ids)); + } + // TODO: Should be handled automatically in detection manager. + auto removed_ids = tracked_detection_manager_.RemoveObsoleteDetections( + GetInputTimestampMs(cc) - kDetectionUpdateTimeOutMS); + MoveIds(removed_detection_ids.get(), std::move(removed_ids)); + + // TODO: Should be handled automatically in detection manager. + removed_ids = tracked_detection_manager_.RemoveOutOfViewDetections(); + MoveIds(removed_detection_ids.get(), std::move(removed_ids)); + + if (!removed_detection_ids->empty() && + cc->Outputs().HasTag(kCancelObjectIdTag)) { + auto timestamp = cc->InputTimestamp(); + for (int box_id : *removed_detection_ids) { + // The timestamp is incremented (by 1 us) because currently the box + // tracker calculator only accepts one cancel object ID for any given + // timestamp. + cc->Outputs() + .Tag(kCancelObjectIdTag) + .AddPacket(mediapipe::MakePacket(box_id).At(timestamp++)); + } + } + + // Output detections and corresponding bounding boxes. 
+ const auto& all_detections = + tracked_detection_manager_.GetAllTrackedDetections(); + auto output_detections = absl::make_unique>(); + auto output_boxes = absl::make_unique>(); + + for (const auto& detection_ptr : all_detections) { + const auto& detection = *detection_ptr.second; + // Only output detections that are synced. + if (detection.last_updated_timestamp() < + cc->InputTimestamp().Microseconds() / 1000) { + continue; + } + output_detections->emplace_back( + GetAxisAlignedDetectionFromTrackedDetection(detection)); + output_boxes->emplace_back(detection.bounding_box()); + } + if (cc->Outputs().HasTag(kDetectionsTag)) { + cc->Outputs() + .Tag(kDetectionsTag) + .Add(output_detections.release(), cc->InputTimestamp()); + } + + if (cc->Outputs().HasTag(kDetectionBoxesTag)) { + cc->Outputs() + .Tag(kDetectionBoxesTag) + .Add(output_boxes.release(), cc->InputTimestamp()); + } + } + } + + if (cc->Inputs().HasTag(kDetectionsTag) && + !cc->Inputs().Tag(kDetectionsTag).IsEmpty()) { + const auto detections = + cc->Inputs().Tag(kDetectionsTag).Get>(); + AddDetections(detections, cc); + } + + if (cc->Inputs().HasTag(kDetectionListTag) && + !cc->Inputs().Tag(kDetectionListTag).IsEmpty()) { + const auto detection_list = + cc->Inputs().Tag(kDetectionListTag).Get(); + AddDetectionList(detection_list, cc); + } + + return ::mediapipe::OkStatus(); +} + +void TrackedDetectionManagerCalculator::AddDetectionList( + const DetectionList& detection_list, CalculatorContext* cc) { + for (const auto& detection : detection_list.detection()) { + // Convert from microseconds to milliseconds. 
+ std::unique_ptr new_detection = + GetTrackedDetectionFromDetection( + detection, cc->InputTimestamp().Microseconds() / 1000); + + const int id = new_detection->unique_id(); + waiting_for_update_detections_[id] = std::move(new_detection); + } +} + +void TrackedDetectionManagerCalculator::AddDetections( + const std::vector& detections, CalculatorContext* cc) { + for (const auto& detection : detections) { + // Convert from microseconds to milliseconds. + std::unique_ptr new_detection = + GetTrackedDetectionFromDetection( + detection, cc->InputTimestamp().Microseconds() / 1000); + + const int id = new_detection->unique_id(); + waiting_for_update_detections_[id] = std::move(new_detection); + } +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/video/tracking_graph_test.cc b/mediapipe/calculators/video/tracking_graph_test.cc new file mode 100644 index 000000000..fc04ee6e8 --- /dev/null +++ b/mediapipe/calculators/video/tracking_graph_test.cc @@ -0,0 +1,709 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "mediapipe/calculators/video/box_tracker_calculator.pb.h" +#include "mediapipe/framework/calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/deps/file_path.h" +#include "mediapipe/framework/formats/image_frame.h" +#include "mediapipe/framework/packet.h" +#include "mediapipe/framework/port/advanced_proto_inc.h" +#include "mediapipe/framework/port/file_helpers.h" +#include "mediapipe/framework/port/gmock.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/opencv_highgui_inc.h" +#include "mediapipe/framework/port/proto_ns.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/framework/port/status_matchers.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" +#include "mediapipe/util/tracking/tracking.pb.h" + +#ifdef __APPLE__ +#include +#endif // defined(__APPLE__) + +namespace mediapipe { +namespace { +using ::testing::FloatNear; +using ::testing::Test; + +std::string GetTestDir() { +#ifdef __APPLE__ + char path[1024]; + CFURLRef bundle_url = CFBundleCopyBundleURL(CFBundleGetMainBundle()); + CFURLGetFileSystemRepresentation( + bundle_url, true, reinterpret_cast(path), sizeof(path)); + CFRelease(bundle_url); + return ::mediapipe::file::JoinPath(path, "testdata"); +#elif defined(__ANDROID__) + char path[1024]; + getcwd(path, sizeof(path)); + return ::mediapipe::file::JoinPath(path, + "mediapipe/calculators/video/testdata"); +#else + return ::mediapipe::file::JoinPath( + "./", + // This should match the path of the output files + // of the genrule() that generates test model files. 
+ "mediapipe/calculators/video/testdata");
+#endif // defined(__APPLE__)
+}
+
+bool LoadBinaryTestGraph(const std::string& graph_path,
+ CalculatorGraphConfig* config) {
+ std::ifstream ifs;
+ ifs.open(graph_path.c_str());
+ proto_ns::io::IstreamInputStream in_stream(&ifs);
+ bool success = config->ParseFromZeroCopyStream(&in_stream);
+ ifs.close();
+ if (!success) {
+ LOG(ERROR) << "could not parse test graph: " << graph_path;
+ }
+ return success;
+}
+
+class TrackingGraphTest : public Test {
+ protected:
+ TrackingGraphTest() {}
+
+ void SetUp() override {
+ test_dir_ = GetTestDir();
+ const auto graph_path = file::JoinPath(test_dir_, "tracker.binarypb");
+ ASSERT_TRUE(LoadBinaryTestGraph(graph_path, &config_));
+
+ original_image_ = cv::imread(file::JoinPath(test_dir_, "lenna.png"));
+ CreateInputFramesFromOriginalImage(kNumImages, kTranslationStep,
+ &input_frames_packets_);
+
+ const auto& first_input_img = input_frames_packets_[0].Get<ImageFrame>();
+ const int img_width = first_input_img.Width();
+ const int img_height = first_input_img.Height();
+ translation_step_x_ = kTranslationStep / static_cast<float>(img_width);
+ translation_step_y_ = kTranslationStep / static_cast<float>(img_height);
+
+ // Create new config and packet dump vector to store output.
+ mediapipe::CalculatorGraphConfig config_copy = config_;
+ mediapipe::tool::AddVectorSink("boxes", &config_copy, &output_packets_);
+ mediapipe::tool::AddVectorSink("ra_boxes", &config_copy,
+ &random_access_results_packets_);
+
+ // Initialize graph
+ MP_ASSERT_OK(graph_.Initialize(config_copy));
+
+ const auto parallel_graph_path =
+ file::JoinPath(test_dir_, "parallel_tracker.binarypb");
+ CalculatorGraphConfig parallel_config;
+ ASSERT_TRUE(LoadBinaryTestGraph(parallel_graph_path, &parallel_config));
+ mediapipe::tool::AddVectorSink("boxes", &parallel_config, &output_packets_);
+ mediapipe::tool::AddVectorSink("ra_boxes", &parallel_config,
+ &random_access_results_packets_);
+ MP_ASSERT_OK(parallel_graph_.Initialize(parallel_config));
+ }
+
+ void CreateInputFramesFromOriginalImage(
+ int num_images, int translation_step,
+ std::vector<Packet>* input_frames_packets);
+
+ void TearDown() override {
+ output_packets_.clear();
+ random_access_results_packets_.clear();
+ }
+
+ std::unique_ptr<TimedBoxProtoList> MakeBoxList(
+ const Timestamp& timestamp, const std::vector<bool>& is_quad_tracking,
+ const std::vector<bool>& is_pnp_tracking,
+ const std::vector<bool>& reacquisition) const;
+
+ void RunGraphWithSidePacketsAndInputs(
+ const std::map<std::string, Packet>& side_packets,
+ const mediapipe::Packet& start_pos_packet);
+
+ // Utility functions used to judge if a given quad or box is near to the
+ // groundtruth location at a given frame.
+ // Examine box.reacquisition() field equals to `reacquisition`.
+ // `frame` can be float number to account for inter-frame interpolation.
+ void ExpectBoxAtFrame(const TimedBoxProto& box, float frame,
+ bool reacquisition);
+
+ // Examine box.aspect_ratio() field equals to `aspect_ratio` if aspect_ratio
+ // is positive.
+ void ExpectQuadAtFrame(const TimedBoxProto& box, float frame,
+ float aspect_ratio, bool reacquisition);
+
+ // Utility function to judge if two quads are near to each other.
+ void ExpectQuadNear(const TimedBoxProto& box1, const TimedBoxProto& box2); + + std::unique_ptr CreateRandomAccessTrackingBoxList( + const std::vector& start_timestamps, + const std::vector& end_timestamps) const; + + CalculatorGraph graph_; + CalculatorGraph parallel_graph_; + CalculatorGraphConfig config_; + std::string test_dir_; + cv::Mat original_image_; + std::vector input_frames_packets_; + std::vector output_packets_; + std::vector random_access_results_packets_; + float translation_step_x_; // normalized translation step in x direction + float translation_step_y_; // normalized translation step in y direction + static constexpr float kInitialBoxHalfWidthNormalized = 0.25f; + static constexpr float kInitialBoxHalfHeightNormalized = 0.25f; + static constexpr float kImageAspectRatio = 1.0f; // for lenna.png + static constexpr float kInitialBoxLeft = + 0.5f - kInitialBoxHalfWidthNormalized; + static constexpr float kInitialBoxRight = + 0.5f + kInitialBoxHalfWidthNormalized; + static constexpr float kInitialBoxTop = + 0.5f - kInitialBoxHalfHeightNormalized; + static constexpr float kInitialBoxBottom = + 0.5f + kInitialBoxHalfHeightNormalized; + static constexpr int kFrameIntervalUs = 30000; + static constexpr int kNumImages = 8; + // Each image is shifted to the right and bottom by kTranslationStep + // pixels compared with the previous image. 
+ static constexpr int kTranslationStep = 10; + static constexpr float kEqualityTolerance = 3e-4f; +}; + +void TrackingGraphTest::ExpectBoxAtFrame(const TimedBoxProto& box, float frame, + bool reacquisition) { + EXPECT_EQ(box.reacquisition(), reacquisition); + EXPECT_TRUE(box.has_rotation()); + EXPECT_THAT(box.rotation(), FloatNear(0, kEqualityTolerance)); + EXPECT_THAT(box.left(), + FloatNear(kInitialBoxLeft - frame * translation_step_x_, + kEqualityTolerance)); + EXPECT_THAT(box.top(), FloatNear(kInitialBoxTop - frame * translation_step_y_, + kEqualityTolerance)); + EXPECT_THAT(box.bottom(), + FloatNear(kInitialBoxBottom - frame * translation_step_y_, + kEqualityTolerance)); + EXPECT_THAT(box.right(), + FloatNear(kInitialBoxRight - frame * translation_step_x_, + kEqualityTolerance)); +} + +void TrackingGraphTest::ExpectQuadAtFrame(const TimedBoxProto& box, float frame, + float aspect_ratio, + bool reacquisition) { + EXPECT_TRUE(box.has_quad()) << "quad must exist!"; + if (aspect_ratio > 0) { + EXPECT_TRUE(box.has_aspect_ratio()); + EXPECT_NEAR(box.aspect_ratio(), aspect_ratio, kEqualityTolerance); + } + + EXPECT_EQ(box.reacquisition(), reacquisition); + + const auto& quad = box.quad(); + EXPECT_EQ(8, quad.vertices_size()) + << "quad has only " << box.quad().vertices_size() << " vertices"; + EXPECT_THAT(quad.vertices(0), + FloatNear(kInitialBoxLeft - frame * translation_step_x_, + kEqualityTolerance)); + EXPECT_THAT(quad.vertices(1), + FloatNear(kInitialBoxTop - frame * translation_step_y_, + kEqualityTolerance)); + EXPECT_THAT(quad.vertices(3), + FloatNear(kInitialBoxBottom - frame * translation_step_y_, + kEqualityTolerance)); + EXPECT_THAT(quad.vertices(4), + FloatNear(kInitialBoxRight - frame * translation_step_x_, + kEqualityTolerance)); +} + +void TrackingGraphTest::ExpectQuadNear(const TimedBoxProto& box1, + const TimedBoxProto& box2) { + EXPECT_TRUE(box1.has_quad()); + EXPECT_TRUE(box2.has_quad()); + EXPECT_EQ(8, box1.quad().vertices_size()) + << "quad has 
only " << box1.quad().vertices_size() << " vertices"; + EXPECT_EQ(8, box2.quad().vertices_size()) + << "quad has only " << box2.quad().vertices_size() << " vertices"; + for (int j = 0; j < box1.quad().vertices_size(); ++j) { + EXPECT_NEAR(box1.quad().vertices(j), box2.quad().vertices(j), + kEqualityTolerance); + } +} + +std::unique_ptr TrackingGraphTest::MakeBoxList( + const Timestamp& timestamp, const std::vector& is_quad_tracking, + const std::vector& is_pnp_tracking, + const std::vector& reacquisition) const { + auto box_list = absl::make_unique(); + int box_id = 0; + for (int j = 0; j < is_quad_tracking.size(); ++j) { + TimedBoxProto* box = box_list->add_box(); + if (is_quad_tracking[j]) { + box->mutable_quad()->add_vertices(kInitialBoxLeft); + box->mutable_quad()->add_vertices(kInitialBoxTop); + box->mutable_quad()->add_vertices(kInitialBoxLeft); + box->mutable_quad()->add_vertices(kInitialBoxBottom); + box->mutable_quad()->add_vertices(kInitialBoxRight); + box->mutable_quad()->add_vertices(kInitialBoxBottom); + box->mutable_quad()->add_vertices(kInitialBoxRight); + box->mutable_quad()->add_vertices(kInitialBoxTop); + + if (is_pnp_tracking[j]) { + box->set_aspect_ratio(kImageAspectRatio); + } + } else { + box->set_left(kInitialBoxLeft); + box->set_right(kInitialBoxRight); + box->set_top(kInitialBoxTop); + box->set_bottom(kInitialBoxBottom); + } + box->set_id(box_id++); + box->set_time_msec(timestamp.Value() / 1000); + box->set_reacquisition(reacquisition[j]); + } + + return box_list; +} + +void TrackingGraphTest::CreateInputFramesFromOriginalImage( + int num_images, int translation_step, + std::vector* input_frames_packets) { + const int crop_width = original_image_.cols - num_images * translation_step; + const int crop_height = original_image_.rows - num_images * translation_step; + for (int i = 0; i < num_images; ++i) { + cv::Rect roi(i * translation_step, i * translation_step, crop_width, + crop_height); + cv::Mat cropped_img = cv::Mat(original_image_, 
roi); + auto cropped_image_frame = absl::make_unique( + ImageFormat::SRGB, crop_width, crop_height, cropped_img.step[0], + cropped_img.data, ImageFrame::PixelDataDeleter::kNone); + Timestamp curr_timestamp = Timestamp(i * kFrameIntervalUs); + Packet image_packet = + Adopt(cropped_image_frame.release()).At(curr_timestamp); + input_frames_packets->push_back(image_packet); + } +} + +void TrackingGraphTest::RunGraphWithSidePacketsAndInputs( + const std::map& side_packets, + const mediapipe::Packet& start_pos_packet) { + // Start running the graph + MP_EXPECT_OK(graph_.StartRun(side_packets)); + + MP_EXPECT_OK(graph_.AddPacketToInputStream("start_pos", start_pos_packet)); + + for (auto frame_packet : input_frames_packets_) { + MP_EXPECT_OK( + graph_.AddPacketToInputStream("image_cpu_frames", frame_packet)); + MP_EXPECT_OK(graph_.WaitUntilIdle()); + } + + MP_EXPECT_OK(graph_.CloseAllInputStreams()); + MP_EXPECT_OK(graph_.WaitUntilDone()); +} + +std::unique_ptr +TrackingGraphTest::CreateRandomAccessTrackingBoxList( + const std::vector& start_timestamps, + const std::vector& end_timestamps) const { + CHECK_EQ(start_timestamps.size(), end_timestamps.size()); + auto ra_boxes = absl::make_unique(); + for (int i = 0; i < start_timestamps.size(); ++i) { + auto start_box_list = + MakeBoxList(start_timestamps[i], std::vector{true}, + std::vector{true}, std::vector{false}); + auto end_box_list = + MakeBoxList(end_timestamps[i], std::vector{true}, + std::vector{true}, std::vector{false}); + *(ra_boxes->add_box()) = (*start_box_list).box(0); + *(ra_boxes->add_box()) = (*end_box_list).box(0); + } + return ra_boxes; +} + +TEST_F(TrackingGraphTest, BasicBoxTrackingSanityCheck) { + // Create input side packets. 
+ std::map side_packets; + side_packets.insert(std::make_pair("analysis_downsample_factor", + mediapipe::MakePacket(1.0f))); + side_packets.insert(std::make_pair( + "calculator_options", + mediapipe::MakePacket(CalculatorOptions()))); + + // Run the graph with input side packets, start_pos, and input image frames. + Timestamp start_box_time = input_frames_packets_[0].Timestamp(); + // is_quad_tracking is used to indicate whether to track quad for each + // individual box. + std::vector is_quad_tracking{false}; + // is_pnp_tracking is used to indicate whether to use perspective transform to + // track quad. + std::vector is_pnp_tracking{false}; + // is_reacquisition is used to indicate whether to enable reacquisition for + // the box. + std::vector is_reacquisition{false}; + auto start_box_list = MakeBoxList(start_box_time, is_quad_tracking, + is_pnp_tracking, is_reacquisition); + Packet start_pos_packet = Adopt(start_box_list.release()).At(start_box_time); + RunGraphWithSidePacketsAndInputs(side_packets, start_pos_packet); + + EXPECT_EQ(input_frames_packets_.size(), output_packets_.size()); + + for (int i = 0; i < output_packets_.size(); ++i) { + const TimedBoxProtoList& boxes = + output_packets_[i].Get(); + EXPECT_EQ(is_quad_tracking.size(), boxes.box_size()); + ExpectBoxAtFrame(boxes.box(0), i, false); + } +} + +TEST_F(TrackingGraphTest, BasicQuadTrackingSanityCheck) { + // Create input side packets. 
+ std::map side_packets; + side_packets.insert(std::make_pair("analysis_downsample_factor", + mediapipe::MakePacket(1.0f))); + CalculatorOptions calculator_options; + calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext) + ->mutable_tracker_options() + ->mutable_track_step_options() + ->set_tracking_degrees( + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE); + side_packets.insert(std::make_pair( + "calculator_options", + mediapipe::MakePacket(calculator_options))); + + Timestamp start_box_time = input_frames_packets_[0].Timestamp(); + // Box id 0 use quad tracking with 8DoF homography transform. + // Box id 1 use quad tracking with 6DoF perspective transform. + // Box id 2 use box tracking with 4DoF similarity transform. + std::vector is_quad_tracking{true, true, false}; + std::vector is_pnp_tracking{false, true, false}; + std::vector is_reacquisition{true, false, true}; + auto start_box_list = MakeBoxList(start_box_time, is_quad_tracking, + is_pnp_tracking, is_reacquisition); + Packet start_pos_packet = Adopt(start_box_list.release()).At(start_box_time); + RunGraphWithSidePacketsAndInputs(side_packets, start_pos_packet); + + EXPECT_EQ(input_frames_packets_.size(), output_packets_.size()); + for (int i = 0; i < output_packets_.size(); ++i) { + const TimedBoxProtoList& boxes = + output_packets_[i].Get(); + EXPECT_EQ(is_quad_tracking.size(), boxes.box_size()); + for (int j = 0; j < boxes.box_size(); ++j) { + const TimedBoxProto& box = boxes.box(j); + if (is_quad_tracking[box.id()]) { + ExpectQuadAtFrame(box, i, + is_pnp_tracking[box.id()] ? kImageAspectRatio : -1.0f, + is_reacquisition[box.id()]); + } else { + ExpectBoxAtFrame(box, i, is_reacquisition[box.id()]); + } + } + } +} + +TEST_F(TrackingGraphTest, TestRandomAccessTrackingResults) { + // Create input side packets. 
+ std::map side_packets; + side_packets.insert(std::make_pair("analysis_downsample_factor", + mediapipe::MakePacket(1.0f))); + CalculatorOptions calculator_options; + calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext) + ->mutable_tracker_options() + ->mutable_track_step_options() + ->set_tracking_degrees( + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE); + side_packets.insert(std::make_pair( + "calculator_options", + mediapipe::MakePacket(calculator_options))); + + ASSERT_GT(input_frames_packets_.size(), 2); // at least 3 frames + ASSERT_TRUE(input_frames_packets_[2].Timestamp() - + input_frames_packets_[1].Timestamp() > + TimestampDiff(1000)); + + constexpr int start_frame = 0; + Timestamp start_box_time = input_frames_packets_[start_frame].Timestamp(); + auto start_box_list = + MakeBoxList(start_box_time, std::vector{true}, + std::vector{true}, std::vector{false}); + constexpr int end_frame = 2; + Timestamp end_box_time = input_frames_packets_[end_frame].Timestamp(); + + // Also test reverse random access tracking. + // This offset of 1ms is simulating the case where the start query timestamp + // to be not any existing frame timestamp. In reality, it's highly encouraged + // to have the start query timestamp be aligned with frame timestamp. + constexpr int reverse_start_frame = 1; + Timestamp reverse_start_box_time = + input_frames_packets_[reverse_start_frame].Timestamp() + 1000; + + auto ra_boxes = CreateRandomAccessTrackingBoxList( + {start_box_time, reverse_start_box_time}, {end_box_time, start_box_time}); + + Packet ra_packet = Adopt(ra_boxes.release()).At(start_box_time); + Packet start_packet = Adopt(start_box_list.release()).At(start_box_time); + + // Start running the ordinary graph, verify random access produce same result + // as normal tracking. 
+ MP_EXPECT_OK(graph_.StartRun(side_packets));
+ MP_EXPECT_OK(graph_.AddPacketToInputStream("start_pos", start_packet));
+ for (auto frame_packet : input_frames_packets_) {
+ MP_EXPECT_OK(
+ graph_.AddPacketToInputStream("image_cpu_frames", frame_packet));
+ Packet track_time_packet = Adopt(new int(0)).At(frame_packet.Timestamp());
+ MP_EXPECT_OK(
+ graph_.AddPacketToInputStream("track_time", track_time_packet));
+ MP_EXPECT_OK(graph_.WaitUntilIdle());
+ }
+ MP_EXPECT_OK(graph_.AddPacketToInputStream("ra_track", ra_packet));
+ MP_EXPECT_OK(graph_.CloseAllInputStreams());
+ MP_EXPECT_OK(graph_.WaitUntilDone());
+
+ EXPECT_EQ(input_frames_packets_.size(), output_packets_.size());
+ const TimedBoxProtoList tracking_result =
+ output_packets_[end_frame].Get<TimedBoxProtoList>();
+ EXPECT_EQ(1, tracking_result.box_size());
+
+ // Should have 1 random access packet.
+ EXPECT_EQ(1, random_access_results_packets_.size());
+ const TimedBoxProtoList& ra_result =
+ random_access_results_packets_[0].Get<TimedBoxProtoList>();
+ // Two box tracking results. One for comparison with normal tracking. The
+ // other for reverse random access tracking.
+ EXPECT_EQ(2, ra_result.box_size());
+
+ // Check if random access tracking has same result with normal tracking.
+ ExpectQuadNear(tracking_result.box(0), ra_result.box(0));
+ ExpectQuadAtFrame(ra_result.box(0), end_frame - start_frame,
+ kImageAspectRatio, false);
+ ExpectQuadAtFrame(ra_result.box(1), start_frame - reverse_start_frame - 1,
+ kImageAspectRatio, false);
+
+ // Clear output and ra result packet vector before test parallel graph.
+ TearDown();
+
+ // Start running the parallel graph, verify random access produce same result
+ // as normal tracking.
+  MP_EXPECT_OK(parallel_graph_.StartRun(side_packets));
+  MP_EXPECT_OK(
+      parallel_graph_.AddPacketToInputStream("start_pos", start_packet));
+  for (auto frame_packet : input_frames_packets_) {
+    MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("image_cpu_frames",
+                                                        frame_packet));
+    MP_EXPECT_OK(parallel_graph_.WaitUntilIdle());
+  }
+  MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("ra_track", ra_packet));
+  MP_EXPECT_OK(parallel_graph_.CloseAllInputStreams());
+  MP_EXPECT_OK(parallel_graph_.WaitUntilDone());
+
+  EXPECT_EQ(input_frames_packets_.size(), output_packets_.size());
+  const TimedBoxProtoList parallel_tracking_result =
+      output_packets_[end_frame].Get();
+  EXPECT_EQ(1, parallel_tracking_result.box_size());
+
+  // Should have only 1 random access packet.
+  EXPECT_EQ(1, random_access_results_packets_.size());
+  const TimedBoxProtoList& parallel_ra_result =
+      random_access_results_packets_[0].Get();
+  EXPECT_EQ(2, parallel_ra_result.box_size());
+
+  // Check that random access tracking has the same result as normal tracking.
+  ExpectQuadNear(parallel_tracking_result.box(0), parallel_ra_result.box(0));
+  ExpectQuadAtFrame(parallel_ra_result.box(0), end_frame - start_frame,
+                    kImageAspectRatio, false);
+  ExpectQuadAtFrame(parallel_ra_result.box(1),
+                    start_frame - reverse_start_frame - 1, kImageAspectRatio,
+                    false);
+}
+
+// Tests what happens when random access request timestamps are
+// outside of cache.
+TEST_F(TrackingGraphTest, TestRandomAccessTrackingTimestamps) {
+  // Create input side packets.
+ std::map side_packets; + side_packets.insert(std::make_pair("analysis_downsample_factor", + mediapipe::MakePacket(1.0f))); + CalculatorOptions calculator_options; + calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext) + ->mutable_tracker_options() + ->mutable_track_step_options() + ->set_tracking_degrees( + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE); + // We intentionally don't cache all frames, to see what happens when + // random access tracking request time falls outside cache range. + calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext) + ->set_streaming_track_data_cache_size(input_frames_packets_.size() - 1); + side_packets.insert(std::make_pair( + "calculator_options", + mediapipe::MakePacket(calculator_options))); + + // Set up random access boxes + const int num_frames = input_frames_packets_.size(); + const int64 usec_in_sec = 1000000; + std::vector start_timestamps{ + input_frames_packets_[0].Timestamp() - usec_in_sec, // forward + input_frames_packets_[0].Timestamp(), // forward + input_frames_packets_[1].Timestamp(), // forward + input_frames_packets_[num_frames - 1].Timestamp() + usec_in_sec, // fwd + input_frames_packets_[0].Timestamp(), // backward + input_frames_packets_[num_frames - 1].Timestamp(), // backward + input_frames_packets_[num_frames - 1].Timestamp(), // backward + input_frames_packets_[num_frames - 1].Timestamp() + usec_in_sec // back + }; + std::vector end_timestamps{ + input_frames_packets_[num_frames - 1].Timestamp(), + input_frames_packets_[num_frames - 1].Timestamp(), + input_frames_packets_[num_frames - 1].Timestamp() + usec_in_sec, + input_frames_packets_[num_frames - 1].Timestamp() + 2 * usec_in_sec, + input_frames_packets_[0].Timestamp() - usec_in_sec, + input_frames_packets_[0].Timestamp(), + input_frames_packets_[0].Timestamp() - usec_in_sec, + input_frames_packets_[1].Timestamp()}; + auto ra_boxes = + CreateRandomAccessTrackingBoxList(start_timestamps, end_timestamps); + Packet 
ra_packet = + Adopt(ra_boxes.release()).At(input_frames_packets_[0].Timestamp()); + + // Run the graph and check if the outside-cache request have no results. + // Start running the parallel graph, verify random access produce same result + // as normal tracking. + MP_EXPECT_OK(parallel_graph_.StartRun(side_packets)); + for (auto frame_packet : input_frames_packets_) { + MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("image_cpu_frames", + frame_packet)); + MP_EXPECT_OK(parallel_graph_.WaitUntilIdle()); + } + MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("ra_track", ra_packet)); + MP_EXPECT_OK(parallel_graph_.CloseAllInputStreams()); + MP_EXPECT_OK(parallel_graph_.WaitUntilDone()); + + // should have 1 random access packet with 0 result boxes + EXPECT_EQ(1, random_access_results_packets_.size()); + const auto& ra_returned_boxes = + random_access_results_packets_[0].Get(); + const int num_returned_ra_boxes = ra_returned_boxes.box_size(); + EXPECT_EQ(0, num_returned_ra_boxes); +} + +TEST_F(TrackingGraphTest, TestTransitionFramesForReacquisition) { + // Create input side packets. + std::map side_packets; + side_packets.insert(std::make_pair("analysis_downsample_factor", + mediapipe::MakePacket(1.0f))); + CalculatorOptions calculator_options; + calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext) + ->mutable_tracker_options() + ->mutable_track_step_options() + ->set_tracking_degrees( + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE); + constexpr int kTransitionFrames = 3; + calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext) + ->set_start_pos_transition_frames(kTransitionFrames); + + side_packets.insert(std::make_pair( + "calculator_options", + mediapipe::MakePacket(calculator_options))); + + Timestamp start_box_time = input_frames_packets_[0].Timestamp(); + // Box id 0 use quad tracking with 8DoF homography transform. + // Box id 1 use quad tracking with 6DoF perspective transform. 
+ // Box id 2 use box tracking with 4DoF similarity transform. + std::vector is_quad_tracking{true, true, false}; + std::vector is_pnp_tracking{false, true, false}; + std::vector is_reacquisition{true, true, true}; + auto start_box_list = MakeBoxList(start_box_time, is_quad_tracking, + is_pnp_tracking, is_reacquisition); + Packet start_pos_packet = Adopt(start_box_list.release()).At(start_box_time); + + // Setting box pos restart from initial position (frame 0's position). + constexpr int kRestartFrame = 3; + Timestamp restart_box_time = input_frames_packets_[kRestartFrame].Timestamp(); + auto restart_box_list = MakeBoxList(restart_box_time, is_quad_tracking, + is_pnp_tracking, is_reacquisition); + Packet restart_pos_packet = + Adopt(restart_box_list.release()).At(restart_box_time); + MP_EXPECT_OK(graph_.StartRun(side_packets)); + MP_EXPECT_OK(graph_.AddPacketToInputStream("start_pos", start_pos_packet)); + + for (int j = 0; j < input_frames_packets_.size(); ++j) { + // Add TRACK_TIME stream queries in between 2 frames. 
+ if (j > 0) { + Timestamp track_time = Timestamp((j - 0.5f) * kFrameIntervalUs); + LOG(INFO) << track_time.Value(); + Packet track_time_packet = Adopt(new Timestamp).At(track_time); + MP_EXPECT_OK( + graph_.AddPacketToInputStream("track_time", track_time_packet)); + } + + MP_EXPECT_OK(graph_.AddPacketToInputStream("image_cpu_frames", + input_frames_packets_[j])); + Packet track_time_packet = + Adopt(new int(0)).At(input_frames_packets_[j].Timestamp()); + MP_EXPECT_OK( + graph_.AddPacketToInputStream("track_time", track_time_packet)); + MP_EXPECT_OK(graph_.WaitUntilIdle()); + + if (j == kRestartFrame) { + MP_EXPECT_OK( + graph_.AddPacketToInputStream("restart_pos", restart_pos_packet)); + } + } + + MP_EXPECT_OK(graph_.CloseAllInputStreams()); + MP_EXPECT_OK(graph_.WaitUntilDone()); + + EXPECT_EQ(input_frames_packets_.size() * 2 - 1, output_packets_.size()); + for (int i = 0; i < output_packets_.size(); ++i) { + const TimedBoxProtoList& boxes = + output_packets_[i].Get(); + EXPECT_EQ(is_quad_tracking.size(), boxes.box_size()); + float frame_id = i / 2.0f; + float expected_frame_id; + if (frame_id <= kRestartFrame) { + // before transition + expected_frame_id = frame_id; + } else { + float transition_frames = frame_id - kRestartFrame; + if (transition_frames <= kTransitionFrames) { + // transitioning. + expected_frame_id = + kRestartFrame - + transition_frames / kTransitionFrames * kRestartFrame + + transition_frames; + } else { + // after transition. + expected_frame_id = transition_frames; + } + } + + for (int j = 0; j < boxes.box_size(); ++j) { + const TimedBoxProto& box = boxes.box(j); + if (is_quad_tracking[box.id()]) { + ExpectQuadAtFrame(box, expected_frame_id, + is_pnp_tracking[box.id()] ? kImageAspectRatio : -1.0f, + is_reacquisition[box.id()]); + } else { + ExpectBoxAtFrame(box, expected_frame_id, is_reacquisition[box.id()]); + } + } + } +} + +// TODO: Add test for reacquisition. 
+ +} // namespace +} // namespace mediapipe diff --git a/mediapipe/docs/examples.md b/mediapipe/docs/examples.md index 846f4f7e7..0f3077a5d 100644 --- a/mediapipe/docs/examples.md +++ b/mediapipe/docs/examples.md @@ -39,6 +39,11 @@ pipeline. the same TFLite model in a CPU-based pipeline. This example highlights how graphs can be easily adapted to run on CPU v.s. GPU. +### Object Detection and Tracking with GPU + +[Object Detection and Tracking with GPU](./object_tracking_mobile_gpu.md) illustrates how to +use MediaPipe for object detection and tracking. + ### Face Detection with GPU [Face Detection with GPU](./face_detection_mobile_gpu.md) illustrates how to use diff --git a/mediapipe/docs/images/mobile/box_tracking_subgraph.png b/mediapipe/docs/images/mobile/box_tracking_subgraph.png new file mode 100644 index 000000000..fccfc65eb Binary files /dev/null and b/mediapipe/docs/images/mobile/box_tracking_subgraph.png differ diff --git a/mediapipe/docs/images/mobile/object_detection_gpu_subgraph.png b/mediapipe/docs/images/mobile/object_detection_gpu_subgraph.png new file mode 100644 index 000000000..8c4a24f95 Binary files /dev/null and b/mediapipe/docs/images/mobile/object_detection_gpu_subgraph.png differ diff --git a/mediapipe/docs/images/mobile/object_tracking_android_gpu.gif b/mediapipe/docs/images/mobile/object_tracking_android_gpu.gif new file mode 100644 index 000000000..cb8b4f6e3 Binary files /dev/null and b/mediapipe/docs/images/mobile/object_tracking_android_gpu.gif differ diff --git a/mediapipe/docs/images/mobile/object_tracking_android_gpu_small.gif b/mediapipe/docs/images/mobile/object_tracking_android_gpu_small.gif new file mode 100644 index 000000000..9fd53144f Binary files /dev/null and b/mediapipe/docs/images/mobile/object_tracking_android_gpu_small.gif differ diff --git a/mediapipe/docs/images/mobile/object_tracking_mobile_gpu.png b/mediapipe/docs/images/mobile/object_tracking_mobile_gpu.png new file mode 100644 index 000000000..de09514c1 Binary 
files /dev/null and b/mediapipe/docs/images/mobile/object_tracking_mobile_gpu.png differ diff --git a/mediapipe/docs/images/mobile/object_tracking_renderer_gpu_subgraph.png b/mediapipe/docs/images/mobile/object_tracking_renderer_gpu_subgraph.png new file mode 100644 index 000000000..b164643a6 Binary files /dev/null and b/mediapipe/docs/images/mobile/object_tracking_renderer_gpu_subgraph.png differ diff --git a/mediapipe/docs/images/mobile/object_tracking_subgraph.png b/mediapipe/docs/images/mobile/object_tracking_subgraph.png new file mode 100644 index 000000000..8b7aa2143 Binary files /dev/null and b/mediapipe/docs/images/mobile/object_tracking_subgraph.png differ diff --git a/mediapipe/docs/install.md b/mediapipe/docs/install.md index 97eb8ca0b..cdee42746 100644 --- a/mediapipe/docs/install.md +++ b/mediapipe/docs/install.md @@ -63,6 +63,7 @@ To build and run iOS apps: ```bash $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \ + libopencv-calib3d-dev libopencv-features2d-dev \ libopencv-imgproc-dev libopencv-video-dev ``` @@ -411,6 +412,7 @@ To build and run iOS apps: ```bash username@DESKTOP-TMVLBJ1:~/mediapipe$ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \ + libopencv-calib3d-dev libopencv-features2d-dev \ libopencv-imgproc-dev libopencv-video-dev ``` diff --git a/mediapipe/docs/object_tracking_mobile_gpu.md b/mediapipe/docs/object_tracking_mobile_gpu.md new file mode 100644 index 000000000..aacd1f288 --- /dev/null +++ b/mediapipe/docs/object_tracking_mobile_gpu.md @@ -0,0 +1,475 @@ +# Object Detection and Tracking + +This doc focuses on the +[below example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt) +that performs object detection and tracking. +Note that object detection is using TensorFlow Lite on GPU while tracking is using CPU. 
+
+For overall context on object detection and tracking, please read this
+[Google Developer Blog](https://mediapipe.page.link/objecttrackingblog).
+
+![object_tracking_android_gpu_gif](images/mobile/object_tracking_android_gpu.gif)
+
+## Android
+
+[Source](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu)
+
+To build and install the app:
+
+```bash
+bazel build -c opt --config=android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu
+```
+
+Once the app is built, install it on Android device with:
+
+```bash
+adb install bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/objecttrackinggpu.apk
+```
+
+## Graph
+The object detection and tracking [main graph](#main-graph) internally utilizes an
+[object detection subgraph](#object-detection-subgraph), an
+[object tracking subgraph](#object-tracking-subgraph) and a
+[renderer subgraph](#renderer-subgraph).
+
+
+The subgraphs show up in the main graph visualization as nodes colored in
+purple, and the subgraph itself can also be visualized just like a regular
+graph. For more information on how to visualize a graph that includes subgraphs,
+see the Visualizing Subgraphs section in the
+[visualizer documentation](./visualizer.md).
+
+### Main Graph
+
+
+![object_detection_mobile_gpu_graph](images/mobile/object_tracking_mobile_gpu.png)
+
+[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt)
+
+```bash
+# MediaPipe graph that performs object detection and tracking.
+# Used in the examples in
+# mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu
+
+# Images on GPU coming into and out of the graph.
+input_stream: "input_video"
+output_stream: "output_video"
+
+# Resamples the images by specific frame rate. 
This calculator is used to
+# control the frequency of subsequent calculators/subgraphs, e.g. less power
+# consumption for expensive processes.
+node {
+  calculator: "PacketResamplerCalculator"
+  input_stream: "DATA:input_video"
+  output_stream: "DATA:throttled_input_video"
+  node_options: {
+    [type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
+      frame_rate: 0.5
+    }
+  }
+}
+
+# Subgraph that detects objects (see object_detection_gpu.pbtxt).
+node {
+  calculator: "ObjectDetectionSubgraph"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "DETECTIONS:output_detections"
+}
+
+# Subgraph that tracks objects (see object_tracking.pbtxt).
+node {
+  calculator: "ObjectTrackingSubgraph"
+  input_stream: "VIDEO:input_video"
+  input_stream: "DETECTIONS:output_detections"
+  output_stream: "DETECTIONS:tracked_detections"
+}
+
+# Subgraph that renders annotations and overlays them on top of the input
+# images (see renderer_gpu.pbtxt).
+node {
+  calculator: "RendererSubgraph"
+  input_stream: "IMAGE:input_video"
+  input_stream: "DETECTIONS:tracked_detections"
+  output_stream: "IMAGE:output_video"
+}
+```
+
+### Object Detection Subgraph
+
+![object_detection_gpu_subgraph](images/mobile/object_detection_gpu_subgraph.png)
+
+[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/object_detection_gpu.pbtxt)
+
+```bash
+# MediaPipe object detection subgraph.
+
+type: "ObjectDetectionSubgraph"
+
+input_stream: "IMAGE:input_video"
+output_stream: "DETECTIONS:output_detections"
+
+# Transforms the input image on GPU to a 320x320 image. To scale the image, by
+# default it uses the STRETCH scale mode that maps the entire input image to the
+# entire transformed image. As a result, image aspect ratio may be changed and
+# objects in the image may be deformed (stretched or squeezed), but the object
+# detection model used in this graph is agnostic to that deformation. 
+node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 320 + output_height: 320 + } + } +} + +# Converts the transformed input image on GPU into an image tensor stored as a +# TfLiteTensor. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE_GPU:transformed_input_video" + output_stream: "TENSORS_GPU:image_tensor" +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS_GPU:image_tensor" + output_stream: "TENSORS_GPU:detection_tensors" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "mediapipe/models/ssdlite_object_detection.tflite" + } + } +} + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + node_options: { + [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { + num_layers: 6 + min_scale: 0.2 + max_scale: 0.95 + input_size_height: 320 + input_size_width: 320 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + strides: 16 + strides: 32 + strides: 64 + strides: 128 + strides: 256 + strides: 512 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.3333 + reduce_boxes_in_lowest_layer: true + } + } +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. 
+node { + calculator: "TfLiteTensorsToDetectionsCalculator" + input_stream: "TENSORS_GPU:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { + num_classes: 91 + num_boxes: 2034 + num_coords: 4 + ignore_classes: 0 + sigmoid_score: true + apply_exponential_on_box_size: true + x_scale: 10.0 + y_scale: 10.0 + h_scale: 5.0 + w_scale: 5.0 + min_score_thresh: 0.6 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "detections" + output_stream: "filtered_detections" + node_options: { + [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { + min_suppression_threshold: 0.4 + max_num_detections: 3 + overlap_type: INTERSECTION_OVER_UNION + return_empty_detections: true + } + } +} + +# Maps detection label IDs to the corresponding label text. The label map is +# provided in the label_map_path option. +node { + calculator: "DetectionLabelIdToTextCalculator" + input_stream: "filtered_detections" + output_stream: "output_detections" + node_options: { + [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { + label_map_path: "mediapipe/models/ssdlite_object_detection_labelmap.txt" + } + } +} +``` + +### Object Tracking Subgraph + +Object tracking subgraph uses a box tracking subgraph which is a generic +tracking library that can be used for other use cases. + +![object_tracking_subgraph](images/mobile/object_tracking_subgraph.png) + +[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/object_tracking.pbtxt) + +```bash +# MediaPipe object tracking subgraph. + +type: "ObjectTrackingSubgraph" + +input_stream: "VIDEO:input_video" +input_stream: "DETECTIONS:new_detections" +output_stream: "DETECTIONS:tracked_detections" + +# Assigns an unique id for each new detection. 
+node {
+  calculator: "DetectionUniqueIdCalculator"
+  input_stream: "DETECTIONS:new_detections"
+  output_stream: "DETECTIONS:detections_with_id"
+}
+
+# Converts detections to TimedBox protos which are used as initial location
+# for tracking.
+node {
+  calculator: "DetectionsToTimedBoxListCalculator"
+  input_stream: "DETECTIONS:detections_with_id"
+  output_stream: "BOXES:start_pos"
+}
+
+# Subgraph that tracks boxes (see box_tracking.pbtxt).
+node {
+  calculator: "BoxTrackingSubgraph"
+  input_stream: "VIDEO:input_video"
+  input_stream: "BOXES:start_pos"
+  input_stream: "CANCEL_ID:cancel_object_id"
+  output_stream: "BOXES:boxes"
+}
+
+# Manages newly detected objects and objects that are being tracked.
+# It associates duplicated detections and updates the locations of
+# detections from tracking.
+node: {
+  calculator: "TrackedDetectionManagerCalculator"
+  input_stream: "DETECTIONS:detections_with_id"
+  input_stream: "TRACKING_BOXES:boxes"
+  output_stream: "DETECTIONS:tracked_detections"
+  output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
+
+  input_stream_handler {
+    input_stream_handler: "SyncSetInputStreamHandler"
+    options {
+      [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+        sync_set {
+          tag_index: "TRACKING_BOXES"
+        }
+        sync_set {
+          tag_index: "DETECTIONS"
+        }
+      }
+    }
+  }
+}
+
+```
+
+### Box Tracking Subgraph
+
+![box_tracking_subgraph](images/mobile/box_tracking_subgraph.png)
+
+[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/box_tracking.pbtxt)
+
+```bash
+# MediaPipe box tracking subgraph. 
+ +type: "BoxTrackingSubgraph" + +input_stream: "VIDEO:input_video" +input_stream: "BOXES:start_pos" +input_stream: "CANCEL_ID:cancel_object_id" +output_stream: "BOXES:boxes" + +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:downscaled_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 240 + output_height: 320 + } + } +} + +# Converts GPU buffer to ImageFrame for processing tracking. +node: { + calculator: "GpuBufferToImageFrameCalculator" + input_stream: "downscaled_input_video" + output_stream: "downscaled_input_video_cpu" +} + +# Performs motion analysis on an incoming video stream. +node: { + calculator: "MotionAnalysisCalculator" + input_stream: "VIDEO:downscaled_input_video_cpu" + output_stream: "CAMERA:camera_motion" + output_stream: "FLOW:region_flow" + + node_options: { + [type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: { + analysis_options { + analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE + flow_options { + fast_estimation_min_block_size: 100 + top_inlier_sets: 1 + frac_inlier_error_threshold: 3e-3 + downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE + verification_distance: 5.0 + verify_long_feature_acceleration: true + verify_long_feature_trigger_ratio: 0.1 + tracking_options { + max_features: 500 + adaptive_extraction_levels: 2 + min_eig_val_settings { + adaptive_lowest_quality_level: 2e-4 + } + klt_tracker_implementation: KLT_OPENCV + } + } + } + } + } +} + +# Reads optical flow fields defined in +# mediapipe/framework/formats/motion/optical_flow_field.h, +# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized +# to 0-255. 
+node: { + calculator: "FlowPackagerCalculator" + input_stream: "FLOW:region_flow" + input_stream: "CAMERA:camera_motion" + output_stream: "TRACKING:tracking_data" + + node_options: { + [type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: { + flow_packager_options: { + binary_tracking_data_support: false + } + } + } +} + +# Tracks box positions over time. +node: { + calculator: "BoxTrackerCalculator" + input_stream: "TRACKING:tracking_data" + input_stream: "TRACK_TIME:input_video" + input_stream: "START_POS:start_pos" + input_stream: "CANCEL_OBJECT_ID:cancel_object_id" + input_stream_info: { + tag_index: "CANCEL_OBJECT_ID" + back_edge: true + } + output_stream: "BOXES:boxes" + + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler" + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "TRACKING" + tag_index: "TRACK_TIME" + } + sync_set { + tag_index: "START_POS" + } + sync_set { + tag_index: "CANCEL_OBJECT_ID" + } + } + } + } + + node_options: { + [type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: { + tracker_options: { + track_step_options { + track_object_and_camera: true + tracking_degrees: TRACKING_DEGREE_OBJECT_SCALE + inlier_spring_force: 0.0 + static_motion_temporal_ratio: 3e-2 + } + } + visualize_tracking_data: false + streaming_track_data_cache_size: 100 + } + } +} +``` + +### Renderer Subgraph + +![hand_renderer_gpu_subgraph.pbtxt](images/mobile/object_tracking_renderer_gpu_subgraph.png) + +[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/renderer_gpu.pbtxt) + +```bash +# MediaPipe object tracking rendering subgraph. + +type: "RendererSubgraph" + +input_stream: "IMAGE:input_image" +input_stream: "DETECTIONS:detections" +output_stream: "IMAGE:output_image" + +# Converts the detections to drawing primitives for annotation overlay. 
+node { + calculator: "DetectionsToRenderDataCalculator" + input_stream: "DETECTIONS:detections" + output_stream: "RENDER_DATA:detections_render_data" + node_options: { + [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { + thickness: 4.0 + color { r: 255 g: 0 b: 0 } + render_detection_id: true + } + } +} + +# Draws annotations and overlays them on top of the input images. +node { + calculator: "AnnotationOverlayCalculator" + input_stream: "INPUT_FRAME_GPU:input_image" + input_stream: "detections_render_data" + output_stream: "OUTPUT_FRAME_GPU:output_image" +} + +``` diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/AndroidManifest.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/AndroidManifest.xml new file mode 100644 index 000000000..2c3693a8f --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/AndroidManifest.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/BUILD new file mode 100644 index 000000000..be0fc0cf2 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/BUILD @@ -0,0 +1,81 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:private"]) + +cc_binary( + name = "libmediapipe_jni.so", + linkshared = 1, + linkstatic = 1, + deps = [ + "//mediapipe/graphs/tracking:mobile_calculators", + "//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni", + ], +) + +cc_library( + name = "mediapipe_jni_lib", + srcs = [":libmediapipe_jni.so"], + alwayslink = 1, +) + +# Maps the binary graph to an alias (e.g., the app name) for convenience so that the alias can be +# easily incorporated into the app via, for example, +# MainActivity.BINARY_GRAPH_NAME = "appname.binarypb". +genrule( + name = "binary_graph", + srcs = ["//mediapipe/graphs/tracking:mobile_gpu_binary_graph"], + outs = ["objecttrackinggpu.binarypb"], + cmd = "cp $< $@", +) + +android_library( + name = "mediapipe_lib", + srcs = glob(["*.java"]), + assets = [ + ":binary_graph", + "//mediapipe/models:ssdlite_object_detection.tflite", + "//mediapipe/models:ssdlite_object_detection_labelmap.txt", + ], + assets_dir = "", + manifest = "AndroidManifest.xml", + resource_files = glob(["res/**"]), + deps = [ + ":mediapipe_jni_lib", + "//mediapipe/java/com/google/mediapipe/components:android_camerax_helper", + "//mediapipe/java/com/google/mediapipe/components:android_components", + "//mediapipe/java/com/google/mediapipe/framework:android_framework", + "//mediapipe/java/com/google/mediapipe/glutil", + "//third_party:androidx_appcompat", + "//third_party:androidx_constraint_layout", + "//third_party:androidx_legacy_support_v4", + "//third_party:androidx_recyclerview", + "//third_party:opencv", + "@androidx_concurrent_futures//jar", + "@androidx_lifecycle//jar", + "@com_google_guava_android//jar", + ], +) + +android_binary( + name = "objecttrackinggpu", + manifest = "AndroidManifest.xml", + manifest_values = {"applicationId": "com.google.mediapipe.apps.objecttrackinggpu"}, + multidex = "native", + deps = [ + ":mediapipe_lib", + ], +) diff --git 
a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/MainActivity.java b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/MainActivity.java new file mode 100644 index 000000000..c08e7055e --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/MainActivity.java @@ -0,0 +1,167 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.google.mediapipe.apps.objecttrackinggpu; + +import android.graphics.SurfaceTexture; +import android.os.Bundle; +import androidx.appcompat.app.AppCompatActivity; +import android.util.Size; +import android.view.SurfaceHolder; +import android.view.SurfaceView; +import android.view.View; +import android.view.ViewGroup; +import com.google.mediapipe.components.CameraHelper; +import com.google.mediapipe.components.CameraXPreviewHelper; +import com.google.mediapipe.components.ExternalTextureConverter; +import com.google.mediapipe.components.FrameProcessor; +import com.google.mediapipe.components.PermissionHelper; +import com.google.mediapipe.framework.AndroidAssetUtil; +import com.google.mediapipe.glutil.EglManager; + +/** Main activity of MediaPipe example apps. 
*/ +public class MainActivity extends AppCompatActivity { + private static final String TAG = "MainActivity"; + + private static final String BINARY_GRAPH_NAME = "objecttrackinggpu.binarypb"; + private static final String INPUT_VIDEO_STREAM_NAME = "input_video"; + private static final String OUTPUT_VIDEO_STREAM_NAME = "output_video"; + private static final CameraHelper.CameraFacing CAMERA_FACING = CameraHelper.CameraFacing.BACK; + + // Flips the camera-preview frames vertically before sending them into FrameProcessor to be + // processed in a MediaPipe graph, and flips the processed frames back when they are displayed. + // This is needed because OpenGL represents images assuming the image origin is at the bottom-left + // corner, whereas MediaPipe in general assumes the image origin is at top-left. + private static final boolean FLIP_FRAMES_VERTICALLY = true; + + static { + // Load all native libraries needed by the app. + System.loadLibrary("mediapipe_jni"); + System.loadLibrary("opencv_java3"); + } + + // {@link SurfaceTexture} where the camera-preview frames can be accessed. + private SurfaceTexture previewFrameTexture; + // {@link SurfaceView} that displays the camera-preview frames processed by a MediaPipe graph. + private SurfaceView previewDisplayView; + + // Creates and manages an {@link EGLContext}. + private EglManager eglManager; + // Sends camera-preview frames into a MediaPipe graph for processing, and displays the processed + // frames onto a {@link Surface}. + private FrameProcessor processor; + // Converts the GL_TEXTURE_EXTERNAL_OES texture from Android camera into a regular texture to be + // consumed by {@link FrameProcessor} and the underlying MediaPipe graph. + private ExternalTextureConverter converter; + + // Handles camera access via the {@link CameraX} Jetpack support library. 
+ private CameraXPreviewHelper cameraHelper; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + + previewDisplayView = new SurfaceView(this); + setupPreviewDisplayView(); + + // Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g., + // binary graphs. + AndroidAssetUtil.initializeNativeAssetManager(this); + + eglManager = new EglManager(null); + processor = + new FrameProcessor( + this, + eglManager.getNativeContext(), + BINARY_GRAPH_NAME, + INPUT_VIDEO_STREAM_NAME, + OUTPUT_VIDEO_STREAM_NAME); + processor.getVideoSurfaceOutput().setFlipY(FLIP_FRAMES_VERTICALLY); + + PermissionHelper.checkAndRequestCameraPermissions(this); + } + + @Override + protected void onResume() { + super.onResume(); + converter = new ExternalTextureConverter(eglManager.getContext()); + converter.setFlipY(FLIP_FRAMES_VERTICALLY); + converter.setConsumer(processor); + if (PermissionHelper.cameraPermissionsGranted(this)) { + startCamera(); + } + } + + @Override + protected void onPause() { + super.onPause(); + converter.close(); + } + + @Override + public void onRequestPermissionsResult( + int requestCode, String[] permissions, int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults); + } + + private void setupPreviewDisplayView() { + previewDisplayView.setVisibility(View.GONE); + ViewGroup viewGroup = findViewById(R.id.preview_display_layout); + viewGroup.addView(previewDisplayView); + + previewDisplayView + .getHolder() + .addCallback( + new SurfaceHolder.Callback() { + @Override + public void surfaceCreated(SurfaceHolder holder) { + processor.getVideoSurfaceOutput().setSurface(holder.getSurface()); + } + + @Override + public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) { + // (Re-)Compute the ideal 
size of the camera-preview display (the area that the + // camera-preview frames get rendered onto, potentially with scaling and rotation) + // based on the size of the SurfaceView that contains the display. + Size viewSize = new Size(width, height); + Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize); + + // Connect the converter to the camera-preview frames as its input (via + // previewFrameTexture), and configure the output width and height as the computed + // display size. + converter.setSurfaceTextureAndAttachToGLContext( + previewFrameTexture, displaySize.getWidth(), displaySize.getHeight()); + } + + @Override + public void surfaceDestroyed(SurfaceHolder holder) { + processor.getVideoSurfaceOutput().setSurface(null); + } + }); + } + + private void startCamera() { + cameraHelper = new CameraXPreviewHelper(); + cameraHelper.setOnCameraStartedListener( + surfaceTexture -> { + previewFrameTexture = surfaceTexture; + // Make the display view visible to start showing the preview. This triggers the + // SurfaceHolder.Callback added to (the holder of) previewDisplayView. 
+ previewDisplayView.setVisibility(View.VISIBLE); + }); + cameraHelper.startCamera(this, CAMERA_FACING, /*surfaceTexture=*/ null); + } +} diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/layout/activity_main.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/layout/activity_main.xml new file mode 100644 index 000000000..c19d7e628 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/layout/activity_main.xml @@ -0,0 +1,20 @@ + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/colors.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/colors.xml new file mode 100644 index 000000000..69b22338c --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/colors.xml @@ -0,0 +1,6 @@ + + + #008577 + #00574B + #D81B60 + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/strings.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/strings.xml new file mode 100644 index 000000000..577828cb4 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/strings.xml @@ -0,0 +1,4 @@ + + Object Tracking GPU + Please grant camera permissions. 
+ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/styles.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/styles.xml new file mode 100644 index 000000000..5885930df --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/res/values/styles.xml @@ -0,0 +1,11 @@ + + + + + + diff --git a/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py b/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py index 93c62d54e..b2bb0fa55 100644 --- a/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py +++ b/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py @@ -93,15 +93,15 @@ FILEPATTERN = "kinetics_700_%s_25fps_rgb_flow" SPLITS = { "train": { "shards": 1000, - "examples": 541490 + "examples": 541279 }, "validate": { "shards": 100, - "examples": 34715 + "examples": 34688 }, "test": { "shards": 100, - "examples": 69321 + "examples": 69278 }, "custom": { "csv": None, # Add a CSV for your own data here. diff --git a/mediapipe/framework/calculator_graph_event_loop_test.cc b/mediapipe/framework/calculator_graph_event_loop_test.cc index 599d079a1..915b02e4a 100644 --- a/mediapipe/framework/calculator_graph_event_loop_test.cc +++ b/mediapipe/framework/calculator_graph_event_loop_test.cc @@ -541,5 +541,90 @@ TEST_F(CalculatorGraphEventLoopTest, WaitToAddPacketToInputStream) { ASSERT_EQ(kNumInputPackets, output_packets_.size()); } +// Captures log messages during testing. +class TextMessageLogSink : public LogSink { + public: + std::vector messages; + void Send(const LogEntry& entry) { + messages.push_back(std::string(entry.text_message())); + } +}; + +// Verifies that CalculatorGraph::UnthrottleSources does not run repeatedly +// in a "busy-loop" while the graph is throttled due to a graph-output stream. 
+TEST_F(CalculatorGraphEventLoopTest, UnthrottleSources) { + CalculatorGraphConfig graph_config; + ASSERT_TRUE(proto_ns::TextFormat::ParseFromString( + R"( + node { + calculator: "PassThroughCalculator" + input_stream: "input_numbers" + output_stream: "output_numbers" + } + input_stream: "input_numbers" + output_stream: "output_numbers" + num_threads: 2 + max_queue_size: 5 + )", + &graph_config)); + constexpr int kQueueSize = 5; + + // Initialize and start the mediapipe graph. + CalculatorGraph graph; + MP_ASSERT_OK(graph.Initialize(graph_config)); + graph.SetGraphInputStreamAddMode( + CalculatorGraph::GraphInputStreamAddMode::ADD_IF_NOT_FULL); + auto poller_status = graph.AddOutputStreamPoller("output_numbers"); + MP_ASSERT_OK(poller_status.status()); + mediapipe::OutputStreamPoller& poller = poller_status.ValueOrDie(); + poller.SetMaxQueueSize(kQueueSize); + MP_ASSERT_OK(graph.StartRun({})); + + // Lambda that adds a packet to the calculator graph. + auto add_packet = [&graph](std::string s, int i) { + return graph.AddPacketToInputStream(s, MakePacket(i).At(Timestamp(i))); + }; + + // Start capturing VLOG messages from the mediapipe::Scheduler. + TextMessageLogSink log_listener; + mediapipe::AddLogSink(&log_listener); + SetVLOGLevel("scheduler", 3); + + // Add just enough packets to fill the output stream queue. + std::vector out_packets; + for (int i = 0; i < kQueueSize; ++i) { + MP_EXPECT_OK(add_packet("input_numbers", i)); + MP_EXPECT_OK(graph.WaitUntilIdle()); + } + + // The graph is throttled due to the full output stream. + EXPECT_FALSE(add_packet("input_numbers", kQueueSize).ok()); + + // CalculatorGraph::UnthrottleSources should be called just one time. + absl::SleepFor(absl::Milliseconds(100)); + + // Read all packets from the output stream queue and close the graph. 
+ for (int i = 0; i < kQueueSize; ++i) { + Packet packet; + EXPECT_TRUE(poller.Next(&packet)); + out_packets.push_back(packet); + } + MP_EXPECT_OK(graph.CloseAllInputStreams()); + MP_EXPECT_OK(graph.WaitUntilDone()); + EXPECT_EQ(kQueueSize, out_packets.size()); + + // Stop capturing VLOG messages. + SetVLOGLevel("scheduler", 0); + mediapipe::RemoveLogSink(&log_listener); + + // Count and validate the calls to UnthrottleSources. + int loop_count = 0; + for (auto& message : log_listener.messages) { + loop_count += (message == "HandleIdle: unthrottling") ? 1 : 0; + } + EXPECT_GE(loop_count, 1); + EXPECT_LE(loop_count, 2); +} + } // namespace } // namespace mediapipe diff --git a/mediapipe/framework/formats/BUILD b/mediapipe/framework/formats/BUILD index f831bb3fd..c42621748 100644 --- a/mediapipe/framework/formats/BUILD +++ b/mediapipe/framework/formats/BUILD @@ -241,7 +241,9 @@ cc_test( proto_library( name = "rect_proto", srcs = ["rect.proto"], - visibility = ["//visibility:public"], + visibility = [ + "//visibility:public", + ], ) mediapipe_cc_proto_library( @@ -253,10 +255,21 @@ mediapipe_cc_proto_library( deps = [":rect_proto"], ) +java_lite_proto_library( + name = "rect_java_proto_lite", + strict_deps = 0, + visibility = [ + "//mediapipe:__subpackages__", + ], + deps = [":landmark_proto"], +) + proto_library( name = "landmark_proto", srcs = ["landmark.proto"], - visibility = ["//visibility:public"], + visibility = [ + "//visibility:public", + ], ) mediapipe_cc_proto_library( @@ -269,7 +282,9 @@ mediapipe_cc_proto_library( java_lite_proto_library( name = "landmark_java_proto_lite", strict_deps = 0, - visibility = ["//mediapipe:__subpackages__"], + visibility = [ + "//mediapipe:__subpackages__", + ], deps = [":landmark_proto"], ) diff --git a/mediapipe/framework/port/BUILD b/mediapipe/framework/port/BUILD index 18d46e1c3..3e0fa8630 100644 --- a/mediapipe/framework/port/BUILD +++ b/mediapipe/framework/port/BUILD @@ -176,6 +176,7 @@ cc_library( deps = [ 
"//mediapipe/framework:port", "@com_github_glog_glog//:glog", + "@com_google_absl//absl/time", ], ) diff --git a/mediapipe/framework/port/logging.h b/mediapipe/framework/port/logging.h index a603c3761..526c045ec 100644 --- a/mediapipe/framework/port/logging.h +++ b/mediapipe/framework/port/logging.h @@ -15,6 +15,50 @@ #ifndef MEDIAPIPE_PORT_LOGGING_H_ #define MEDIAPIPE_PORT_LOGGING_H_ +#include "absl/time/time.h" #include "glog/logging.h" +namespace mediapipe { +using LogSeverity = google::LogSeverity; +const auto SetVLOGLevel = google::SetVLOGLevel; +class LogEntry { + public: + LogEntry(LogSeverity severity, const struct ::tm* tm_time, + absl::string_view message) + : severity_(severity), + timestamp_(absl::FromTM(*tm_time, absl::LocalTimeZone())), + text_message_(message) {} + LogSeverity log_severity() const { return severity_; } + absl::Time timestamp() const { return timestamp_; } + absl::string_view text_message() const { return text_message_; } + + private: + LogSeverity severity_; + absl::Time timestamp_; + absl::string_view text_message_; +}; +class LogSink : public google::LogSink { + public: + virtual ~LogSink() = default; + virtual void Send(const LogEntry& entry) = 0; + virtual void WaitTillSent() {} + + private: + virtual void send(LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct ::tm* tm_time, const char* message, + size_t message_len) { + LogEntry log_entry(severity, tm_time, + absl::string_view(message, message_len)); + Send(log_entry); + } +}; +inline void AddLogSink(LogSink* destination) { + google::AddLogSink(destination); +} +inline void RemoveLogSink(LogSink* destination) { + google::RemoveLogSink(destination); +} +} // namespace mediapipe + #endif // MEDIAPIPE_PORT_LOGGING_H_ diff --git a/mediapipe/graphs/object_detection/object_detection_desktop_lstd_tflite_graph.pbtxt b/mediapipe/graphs/object_detection/object_detection_desktop_lstd_tflite_graph.pbtxt new file mode 100644 index 
000000000..1cbba49d0 --- /dev/null +++ b/mediapipe/graphs/object_detection/object_detection_desktop_lstd_tflite_graph.pbtxt @@ -0,0 +1,176 @@ +# MediaPipe graph that performs LSTM based object detection on desktop with +# TensorFlow Lite on CPU. + +# max_queue_size limits the number of packets enqueued on any input stream +# by throttling inputs to the graph. This makes the graph only process one +# frame per time. +max_queue_size: 1 + +# Decodes an input video file into images and a video header. +node { + calculator: "OpenCvVideoDecoderCalculator" + input_side_packet: "INPUT_FILE_PATH:input_video_path" + output_stream: "VIDEO:input_video" + output_stream: "VIDEO_PRESTREAM:input_video_header" +} + +# Transforms the input image on CPU to a 256x256 image. To scale the image, by +# default it uses the STRETCH scale mode that maps the entire input image to the +# entire transformed image. As a result, image aspect ratio may be changed and +# objects in the image may be deformed (stretched or squeezed), but the object +# detection model used in this graph is agnostic to that deformation. +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE:input_video" + output_stream: "IMAGE:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 256 + output_height: 256 + } + } +} + +# Converts the transformed input image on CPU into an image tensor as a +# TfLiteTensor. The zero_center option is set to true to normalize the +# pixel values to [-1.f, 1.f] as opposed to [0.f, 1.f]. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE:transformed_input_video" + output_stream: "TENSORS:image_tensor" + node_options: { + [type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] { + zero_center: true + use_quantized_tensors: true + } + } +} + +# Joins the input image_tensor and previous lstm state tensors into a single +# output vector of tensors. 
+node { + calculator: "ConcatenateTfLiteTensorVectorCalculator" + input_stream: "image_tensor" + input_stream: "prev_lstm_tensors" + output_stream: "concatenated_tensors" +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and state +# tensors, then outputs a vector of tensors representing, for instance, +# detection boxes/keypoints and scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS:concatenated_tensors" + output_stream: "TENSORS:inference_tensors" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "mediapipe/models/lstdlite_object_detection.tflite" + } + } +} + +# Splits up the output tensors into two streams [detection_boxes, +# detection_classes, detection_scores, num_detections], and [lstm_c, lstm_h]. +node { + calculator: "SplitTfLiteTensorVectorCalculator" + input_stream: "inference_tensors" + output_stream: "detection_tensors" + output_stream: "lstm_tensors" + options { + [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] { + ranges: { begin: 0 end: 4 } + ranges: { begin: 4 end: 6 } + } + } +} + +# Waits for the LSTM state output from the TfLite inferencing to feed back in as +# an input tensor for the next frame. +node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:transformed_input_video" + input_stream: "LOOP:lstm_tensors" + input_stream_info: { tag_index: "LOOP" back_edge: true } + output_stream: "PREV_LOOP:prev_lstm_tensors" +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the specification in the options, into a vector of detections. Each detection +# describes a detected object. 
+node { + calculator: "TfLiteTensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { + num_classes: 7 + num_boxes: 40 + num_coords: 4 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "detections" + output_stream: "filtered_detections" + node_options: { + [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { + min_suppression_threshold: 0.4 + min_score_threshold: 0.2 + max_num_detections: 5 + overlap_type: INTERSECTION_OVER_UNION + } + } +} + +# Maps detection label IDs to the corresponding label text. The label map is +# provided in the label_map_path option. +node { + calculator: "DetectionLabelIdToTextCalculator" + input_stream: "filtered_detections" + output_stream: "output_detections" + node_options: { + [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { + label_map_path: "mediapipe/models/lstdlite_object_detection_labelmap.txt" + } + } +} + +# Converts the detections to drawing primitives for annotation overlay. +node { + calculator: "DetectionsToRenderDataCalculator" + input_stream: "DETECTIONS:output_detections" + output_stream: "RENDER_DATA:render_data" + node_options: { + [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { + thickness: 2.0 + color { r: 255 g: 255 b: 255 } + } + } +} + +# Draws annotations and overlays them on top of the original image coming into +# the graph. +node { + calculator: "AnnotationOverlayCalculator" + input_stream: "INPUT_FRAME:input_video" + input_stream: "render_data" + output_stream: "OUTPUT_FRAME:output_video" +} + +# Encodes the annotated images into a video file, adopting properties specified +# in the input video header, e.g., video framerate. 
+node { + calculator: "OpenCvVideoEncoderCalculator" + input_stream: "VIDEO:output_video" + input_stream: "VIDEO_PRESTREAM:input_video_header" + input_side_packet: "OUTPUT_FILE_PATH:output_video_path" + node_options: { + [type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: { + codec: "avc1" + video_format: "mp4" + } + } +} diff --git a/mediapipe/graphs/tracking/BUILD b/mediapipe/graphs/tracking/BUILD new file mode 100644 index 000000000..0a8054a81 --- /dev/null +++ b/mediapipe/graphs/tracking/BUILD @@ -0,0 +1,39 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_binary_graph", +) + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "mobile_calculators", + deps = [ + "//mediapipe/calculators/core:packet_resampler_calculator", + "//mediapipe/graphs/tracking/subgraphs:object_detection_gpu", + "//mediapipe/graphs/tracking/subgraphs:object_tracking", + "//mediapipe/graphs/tracking/subgraphs:renderer_gpu", + ], +) + +mediapipe_binary_graph( + name = "mobile_gpu_binary_graph", + graph = "object_detection_tracking_mobile_gpu.pbtxt", + output_name = "mobile_gpu.binarypb", + deps = [":mobile_calculators"], +) diff --git a/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt b/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt new file mode 100644 index 000000000..5aebfc1f2 --- /dev/null +++ b/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt @@ -0,0 +1,46 @@ +# MediaPipe graph that performs object detection and tracking with TensorFlow +# Lite on GPU. +# Used in the examples in +# mediapipie/examples/android/src/java/com/mediapipe/apps/objecttrackinggpu + +# Images on GPU coming into and out of the graph. +input_stream: "input_video" +output_stream: "output_video" + +# Resamples the images by specific frame rate. This calculator is used to +# control the frequecy of subsequent calculators/subgraphs, e.g. less power +# consumption for expensive process. +node { + calculator: "PacketResamplerCalculator" + input_stream: "DATA:input_video" + output_stream: "DATA:throttled_input_video" + node_options: { + [type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] { + frame_rate: 0.5 + } + } +} + +# Subgraph that detections objects (see object_detection_gpu.pbtxt). 
+node { + calculator: "ObjectDetectionSubgraph" + input_stream: "IMAGE:throttled_input_video" + output_stream: "DETECTIONS:output_detections" +} + +# Subgraph that tracks objects (see object_tracking.pbtxt). +node { + calculator: "ObjectTrackingSubgraph" + input_stream: "VIDEO:input_video" + input_stream: "DETECTIONS:output_detections" + output_stream: "DETECTIONS:tracked_detections" +} + +# Subgraph that renders annotations and overlays them on top of the input +# images (see renderer_gpu.pbtxt). +node { + calculator: "RendererSubgraph" + input_stream: "IMAGE:input_video" + input_stream: "DETECTIONS:tracked_detections" + output_stream: "IMAGE:output_video" +} diff --git a/mediapipe/graphs/tracking/subgraphs/BUILD b/mediapipe/graphs/tracking/subgraphs/BUILD new file mode 100644 index 000000000..357463d9e --- /dev/null +++ b/mediapipe/graphs/tracking/subgraphs/BUILD @@ -0,0 +1,76 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_simple_subgraph", +) + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +mediapipe_simple_subgraph( + name = "box_tracking", + graph = "box_tracking.pbtxt", + register_as = "BoxTrackingSubgraph", + deps = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/video:box_tracker_calculator", + "//mediapipe/calculators/video:flow_packager_calculator", + "//mediapipe/calculators/video:motion_analysis_calculator", + "//mediapipe/framework/stream_handler:immediate_input_stream_handler", + "//mediapipe/framework/stream_handler:sync_set_input_stream_handler", + "//mediapipe/gpu:gpu_buffer_to_image_frame_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "object_tracking", + graph = "object_tracking.pbtxt", + register_as = "ObjectTrackingSubgraph", + deps = [ + "//mediapipe/calculators/util:detection_unique_id_calculator", + "//mediapipe/calculators/util:detections_to_timed_box_list_calculator", + "//mediapipe/calculators/video:tracked_detection_manager_calculator", + "//mediapipe/framework/stream_handler:sync_set_input_stream_handler", + "//mediapipe/graphs/tracking/subgraphs:box_tracking", + ], +) + +mediapipe_simple_subgraph( + name = "object_detection_gpu", + graph = "object_detection_gpu.pbtxt", + register_as = "ObjectDetectionSubgraph", + deps = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/tflite:tflite_converter_calculator", + "//mediapipe/calculators/tflite:tflite_inference_calculator", + "//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator", + "//mediapipe/calculators/util:detection_label_id_to_text_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "renderer_gpu", + graph = "renderer_gpu.pbtxt", + 
register_as = "RendererSubgraph", + deps = [ + "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:detections_to_render_data_calculator", + "//mediapipe/calculators/util:rect_to_render_data_calculator", + ], +) diff --git a/mediapipe/graphs/tracking/subgraphs/box_tracking.pbtxt b/mediapipe/graphs/tracking/subgraphs/box_tracking.pbtxt new file mode 100644 index 000000000..f4830fad9 --- /dev/null +++ b/mediapipe/graphs/tracking/subgraphs/box_tracking.pbtxt @@ -0,0 +1,126 @@ +# MediaPipe box tracking subgraph. + +type: "BoxTrackingSubgraph" + +input_stream: "VIDEO:input_video" +input_stream: "BOXES:start_pos" +input_stream: "CANCEL_ID:cancel_object_id" +output_stream: "BOXES:boxes" + +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:downscaled_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 240 + output_height: 320 + } + } +} + +# Converts GPU buffer to ImageFrame for processing tracking. +node: { + calculator: "GpuBufferToImageFrameCalculator" + input_stream: "downscaled_input_video" + output_stream: "downscaled_input_video_cpu" +} + +# Performs motion analysis on an incoming video stream. 
+node: { + calculator: "MotionAnalysisCalculator" + input_stream: "VIDEO:downscaled_input_video_cpu" + output_stream: "CAMERA:camera_motion" + output_stream: "FLOW:region_flow" + + node_options: { + [type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: { + analysis_options { + analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE + flow_options { + fast_estimation_min_block_size: 100 + top_inlier_sets: 1 + frac_inlier_error_threshold: 3e-3 + downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE + verification_distance: 5.0 + verify_long_feature_acceleration: true + verify_long_feature_trigger_ratio: 0.1 + tracking_options { + max_features: 500 + adaptive_extraction_levels: 2 + min_eig_val_settings { + adaptive_lowest_quality_level: 2e-4 + } + klt_tracker_implementation: KLT_OPENCV + } + } + } + } + } +} + +# Reads optical flow fields defined in +# mediapipe/framework/formats/motion/optical_flow_field.h, +# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized +# to 0-255. +node: { + calculator: "FlowPackagerCalculator" + input_stream: "FLOW:region_flow" + input_stream: "CAMERA:camera_motion" + output_stream: "TRACKING:tracking_data" + + node_options: { + [type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: { + flow_packager_options: { + binary_tracking_data_support: false + } + } + } +} + +# Tracks box positions over time. 
+node: { + calculator: "BoxTrackerCalculator" + input_stream: "TRACKING:tracking_data" + input_stream: "TRACK_TIME:input_video" + input_stream: "START_POS:start_pos" + input_stream: "CANCEL_OBJECT_ID:cancel_object_id" + input_stream_info: { + tag_index: "CANCEL_OBJECT_ID" + back_edge: true + } + output_stream: "BOXES:boxes" + + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler" + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "TRACKING" + tag_index: "TRACK_TIME" + } + sync_set { + tag_index: "START_POS" + } + sync_set { + tag_index: "CANCEL_OBJECT_ID" + } + } + } + } + + node_options: { + [type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: { + tracker_options: { + track_step_options { + track_object_and_camera: true + tracking_degrees: TRACKING_DEGREE_OBJECT_SCALE + inlier_spring_force: 0.0 + static_motion_temporal_ratio: 3e-2 + } + } + visualize_tracking_data: false + streaming_track_data_cache_size: 100 + } + } +} diff --git a/mediapipe/graphs/tracking/subgraphs/object_detection_gpu.pbtxt b/mediapipe/graphs/tracking/subgraphs/object_detection_gpu.pbtxt new file mode 100644 index 000000000..1b40f434d --- /dev/null +++ b/mediapipe/graphs/tracking/subgraphs/object_detection_gpu.pbtxt @@ -0,0 +1,128 @@ +# MediaPipe object detection subgraph. + +type: "ObjectDetectionSubgraph" + +input_stream: "IMAGE:input_video" +output_stream: "DETECTIONS:output_detections" + +# Transforms the input image on GPU to a 320x320 image. To scale the image, by +# default it uses the STRETCH scale mode that maps the entire input image to the +# entire transformed image. As a result, image aspect ratio may be changed and +# objects in the image may be deformed (stretched or squeezed), but the object +# detection model used in this graph is agnostic to that deformation. 
+node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 320 + output_height: 320 + } + } +} + +# Converts the transformed input image on GPU into an image tensor stored as a +# TfLiteTensor. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE_GPU:transformed_input_video" + output_stream: "TENSORS_GPU:image_tensor" +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS_GPU:image_tensor" + output_stream: "TENSORS_GPU:detection_tensors" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "mediapipe/models/ssdlite_object_detection.tflite" + } + } +} + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + node_options: { + [type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] { + num_layers: 6 + min_scale: 0.2 + max_scale: 0.95 + input_size_height: 320 + input_size_width: 320 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + strides: 16 + strides: 32 + strides: 64 + strides: 128 + strides: 256 + strides: 512 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.3333 + reduce_boxes_in_lowest_layer: true + } + } +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. 
+node { + calculator: "TfLiteTensorsToDetectionsCalculator" + input_stream: "TENSORS_GPU:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] { + num_classes: 91 + num_boxes: 2034 + num_coords: 4 + ignore_classes: 0 + sigmoid_score: true + apply_exponential_on_box_size: true + x_scale: 10.0 + y_scale: 10.0 + h_scale: 5.0 + w_scale: 5.0 + min_score_thresh: 0.6 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "detections" + output_stream: "filtered_detections" + node_options: { + [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { + min_suppression_threshold: 0.4 + max_num_detections: 3 + overlap_type: INTERSECTION_OVER_UNION + return_empty_detections: true + } + } +} + +# Maps detection label IDs to the corresponding label text. The label map is +# provided in the label_map_path option. +node { + calculator: "DetectionLabelIdToTextCalculator" + input_stream: "filtered_detections" + output_stream: "output_detections" + node_options: { + [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { + label_map_path: "mediapipe/models/ssdlite_object_detection_labelmap.txt" + } + } +} diff --git a/mediapipe/graphs/tracking/subgraphs/object_tracking.pbtxt b/mediapipe/graphs/tracking/subgraphs/object_tracking.pbtxt new file mode 100644 index 000000000..a7cf87fe1 --- /dev/null +++ b/mediapipe/graphs/tracking/subgraphs/object_tracking.pbtxt @@ -0,0 +1,56 @@ +# MediaPipe object tracking subgraph. + +type: "ObjectTrackingSubgraph" + +input_stream: "VIDEO:input_video" +input_stream: "DETECTIONS:new_detections" +output_stream: "DETECTIONS:tracked_detections" + +# Assigns an unique id for each new detection. 
+node { + calculator: "DetectionUniqueIdCalculator" + input_stream: "DETECTIONS:new_detections" + output_stream: "DETECTIONS:detections_with_id" +} + +# Converts detections to TimedBox protos which are used as initial location +# for tracking. +node { + calculator: "DetectionsToTimedBoxListCalculator" + input_stream: "DETECTIONS:detections_with_id" + output_stream: "BOXES:start_pos" +} + +# Subgraph that tracks boxes (see box_tracking.pbtxt). +node { + calculator: "BoxTrackingSubgraph" + input_stream: "VIDEO:input_video" + input_stream: "BOXES:start_pos" + input_stream: "CANCEL_ID:cancel_object_id" + output_stream: "BOXES:boxes" +} + +# Manages new detected objects and objects that are being tracked. +# It associates the duplicated detections and updates the locations of +# detections from tracking. +node: { + calculator: "TrackedDetectionManagerCalculator" + input_stream: "DETECTIONS:detections_with_id" + input_stream: "TRACKING_BOXES:boxes" + output_stream: "DETECTIONS:tracked_detections" + output_stream: "CANCEL_OBJECT_ID:cancel_object_id" + + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler" + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "TRACKING_BOXES" + } + sync_set { + tag_index: "DETECTIONS" + } + } + } + } +} diff --git a/mediapipe/graphs/tracking/subgraphs/renderer_gpu.pbtxt b/mediapipe/graphs/tracking/subgraphs/renderer_gpu.pbtxt new file mode 100644 index 000000000..9640ccabc --- /dev/null +++ b/mediapipe/graphs/tracking/subgraphs/renderer_gpu.pbtxt @@ -0,0 +1,29 @@ +# MediaPipe object tracking rendering subgraph. + +type: "RendererSubgraph" + +input_stream: "IMAGE:input_image" +input_stream: "DETECTIONS:detections" +output_stream: "IMAGE:output_image" + +# Converts the detections to drawing primitives for annotation overlay. 
+node { + calculator: "DetectionsToRenderDataCalculator" + input_stream: "DETECTIONS:detections" + output_stream: "RENDER_DATA:detections_render_data" + node_options: { + [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { + thickness: 4.0 + color { r: 255 g: 0 b: 0 } + render_detection_id: true + } + } +} + +# Draws annotations and overlays them on top of the input images. +node { + calculator: "AnnotationOverlayCalculator" + input_stream: "INPUT_FRAME_GPU:input_image" + input_stream: "detections_render_data" + output_stream: "OUTPUT_FRAME_GPU:output_image" +} diff --git a/mediapipe/java/com/google/mediapipe/framework/MediaPipeRunner.java b/mediapipe/java/com/google/mediapipe/framework/MediaPipeRunner.java index 39a527372..466c18915 100644 --- a/mediapipe/java/com/google/mediapipe/framework/MediaPipeRunner.java +++ b/mediapipe/java/com/google/mediapipe/framework/MediaPipeRunner.java @@ -50,4 +50,16 @@ public abstract class MediaPipeRunner extends Graph { * Stops the running graph and releases the resource. Call this in Activity onDestroy callback. */ public abstract void release(); + /** + * Stops the running graph and releases the resource. Call this in Activity onDestroy callback. + * + *

Like {@link #release()} but with a timeout. The default behavior is to call + * {@link #release()}. The implementation can override this method if it cares about timing out + * releasing resources. + * + * @param timeoutMillis the time it takes to force timeout from releasing mff context. + */ + public void release(long timeoutMillis) { + release(); + } } diff --git a/mediapipe/util/tracking/BUILD b/mediapipe/util/tracking/BUILD new file mode 100644 index 000000000..6eb4a16a4 --- /dev/null +++ b/mediapipe/util/tracking/BUILD @@ -0,0 +1,742 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +load("//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library") + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +proto_library( + name = "tone_models_proto", + srcs = ["tone_models.proto"], +) + +proto_library( + name = "tone_estimation_proto", + srcs = ["tone_estimation.proto"], + deps = [":tone_models_proto"], +) + +proto_library( + name = "region_flow_computation_proto", + srcs = ["region_flow_computation.proto"], + deps = [ + ":tone_estimation_proto", + ], +) + +proto_library( + name = "motion_saliency_proto", + srcs = ["motion_saliency.proto"], +) + +proto_library( + name = "motion_estimation_proto", + srcs = ["motion_estimation.proto"], +) + +proto_library( + name = "motion_analysis_proto", + srcs = ["motion_analysis.proto"], + deps = [ + ":motion_estimation_proto", + ":motion_saliency_proto", + ":region_flow_computation_proto", + ], +) + +proto_library( + name = "region_flow_proto", + srcs = ["region_flow.proto"], +) + +proto_library( + name = "motion_models_proto", + srcs = ["motion_models.proto"], +) + +proto_library( + name = "camera_motion_proto", + srcs = ["camera_motion.proto"], + deps = [":motion_models_proto"], +) + +proto_library( + name = "push_pull_filtering_proto", + srcs = ["push_pull_filtering.proto"], +) + +proto_library( + name = "frame_selection_solution_evaluator_proto", + srcs = ["frame_selection_solution_evaluator.proto"], +) + +proto_library( + name = "frame_selection_proto", + srcs = ["frame_selection.proto"], + deps = [ + ":camera_motion_proto", + ":frame_selection_solution_evaluator_proto", + ":region_flow_proto", + ], +) + +proto_library( + name = "flow_packager_proto", + srcs = ["flow_packager.proto"], + deps = [ + ":motion_models_proto", + ":region_flow_proto", + ], +) + +proto_library( + name = "tracking_proto", + srcs = ["tracking.proto"], + deps = [ + ":motion_models_proto", + ], +) + +proto_library( + name = "box_tracker_proto", + srcs = 
["box_tracker.proto"], + deps = [":tracking_proto"], +) + +mediapipe_cc_proto_library( + name = "tone_models_cc_proto", + srcs = ["tone_models.proto"], + visibility = ["//visibility:public"], + deps = [":tone_models_proto"], +) + +mediapipe_cc_proto_library( + name = "tone_estimation_cc_proto", + srcs = ["tone_estimation.proto"], + cc_deps = [":tone_models_cc_proto"], + visibility = ["//visibility:public"], + deps = [":tone_estimation_proto"], +) + +mediapipe_cc_proto_library( + name = "region_flow_computation_cc_proto", + srcs = ["region_flow_computation.proto"], + cc_deps = [ + ":tone_estimation_cc_proto", + ":tone_models_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":region_flow_computation_proto"], +) + +mediapipe_cc_proto_library( + name = "motion_saliency_cc_proto", + srcs = ["motion_saliency.proto"], + visibility = ["//visibility:public"], + deps = [":motion_saliency_proto"], +) + +mediapipe_cc_proto_library( + name = "motion_estimation_cc_proto", + srcs = ["motion_estimation.proto"], + visibility = ["//visibility:public"], + deps = [":motion_estimation_proto"], +) + +mediapipe_cc_proto_library( + name = "motion_analysis_cc_proto", + srcs = ["motion_analysis.proto"], + cc_deps = [ + ":motion_estimation_cc_proto", + ":motion_saliency_cc_proto", + ":region_flow_computation_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":motion_analysis_proto"], +) + +mediapipe_cc_proto_library( + name = "region_flow_cc_proto", + srcs = ["region_flow.proto"], + cc_deps = [":motion_models_cc_proto"], + visibility = ["//visibility:public"], + deps = [":region_flow_proto"], +) + +mediapipe_cc_proto_library( + name = "motion_models_cc_proto", + srcs = ["motion_models.proto"], + visibility = ["//visibility:public"], + deps = [":motion_models_proto"], +) + +mediapipe_cc_proto_library( + name = "camera_motion_cc_proto", + srcs = ["camera_motion.proto"], + cc_deps = [":motion_models_cc_proto"], + visibility = ["//visibility:public"], + deps = 
[":camera_motion_proto"], +) + +mediapipe_cc_proto_library( + name = "push_pull_filtering_cc_proto", + srcs = ["push_pull_filtering.proto"], + visibility = ["//visibility:public"], + deps = [":push_pull_filtering_proto"], +) + +mediapipe_cc_proto_library( + name = "frame_selection_solution_evaluator_cc_proto", + srcs = ["frame_selection_solution_evaluator.proto"], + visibility = ["//visibility:public"], + deps = [":frame_selection_solution_evaluator_proto"], +) + +mediapipe_cc_proto_library( + name = "frame_selection_cc_proto", + srcs = ["frame_selection.proto"], + cc_deps = [ + ":camera_motion_cc_proto", + ":frame_selection_solution_evaluator_cc_proto", + ":region_flow_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":frame_selection_proto"], +) + +mediapipe_cc_proto_library( + name = "flow_packager_cc_proto", + srcs = ["flow_packager.proto"], + cc_deps = [ + ":motion_models_cc_proto", + ":region_flow_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":flow_packager_proto"], +) + +mediapipe_cc_proto_library( + name = "tracking_cc_proto", + srcs = ["tracking.proto"], + cc_deps = [":motion_models_cc_proto"], + visibility = ["//visibility:public"], + deps = [":tracking_proto"], +) + +mediapipe_cc_proto_library( + name = "box_tracker_cc_proto", + srcs = ["box_tracker.proto"], + cc_deps = [":tracking_cc_proto"], + visibility = ["//visibility:public"], + deps = [":box_tracker_proto"], +) + +cc_library( + name = "motion_models", + srcs = ["motion_models.cc"], + hdrs = ["motion_models.h"], + deps = [ + ":camera_motion_cc_proto", + ":motion_models_cc_proto", + ":region_flow_cc_proto", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:singleton", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/strings:str_format", + "@eigen_archive//:eigen", + ], +) + +cc_library( + name = "motion_models_cv", + srcs = ["motion_models_cv.cc"], + hdrs = 
["motion_models_cv.h"], + deps = [ + ":camera_motion_cc_proto", + ":motion_models", + ":motion_models_cc_proto", + "//mediapipe/framework/port:opencv_core", + ], +) + +cc_library( + name = "region_flow", + srcs = ["region_flow.cc"], + hdrs = ["region_flow.h"], + deps = [ + ":measure_time", + ":motion_models", + ":parallel_invoker", + ":region_flow_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "camera_motion", + srcs = ["camera_motion.cc"], + hdrs = ["camera_motion.h"], + deps = [ + ":camera_motion_cc_proto", + ":motion_models", + ":region_flow", + ":region_flow_cc_proto", + "@com_google_absl//absl/strings:str_format", + ], +) + +PARALLEL_COPTS = ["-DPARALLEL_INVOKER_ACTIVE"] + select({ + "//mediapipe:apple": [], + "//mediapipe:android": [], + "//conditions:default": [], +}) + +PARALLEL_LINKOPTS = [] + +cc_library( + name = "measure_time", + srcs = ["measure_time.cc"], + hdrs = ["measure_time.h"], + deps = [ + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + ], +) + +cc_library( + name = "parallel_invoker", + srcs = ["parallel_invoker.cc"], + hdrs = ["parallel_invoker.h"], + copts = PARALLEL_COPTS, + linkopts = PARALLEL_LINKOPTS, + deps = [ + ":parallel_invoker_forbid_mixed_active", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:threadpool", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "parallel_invoker_forbid_mixed_active", + srcs = ["parallel_invoker_forbid_mixed.cc"], + copts = PARALLEL_COPTS, + alwayslink = 1, # Forces all symbols to be included. 
+) + +cc_library( + name = "image_util", + srcs = ["image_util.cc"], + hdrs = ["image_util.h"], + deps = [ + ":motion_models", + ":motion_models_cc_proto", + ":region_flow", + ":region_flow_cc_proto", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:vector", + ], +) + +cc_library( + name = "streaming_buffer", + srcs = ["streaming_buffer.cc"], + hdrs = ["streaming_buffer.h"], + deps = [ + "//mediapipe/framework/port:logging", + "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:any", + ], +) + +cc_library( + name = "motion_estimation", + srcs = ["motion_estimation.cc"], + hdrs = ["motion_estimation.h"], + copts = PARALLEL_COPTS, + linkopts = PARALLEL_LINKOPTS, + deps = [ + ":camera_motion", + ":camera_motion_cc_proto", + ":measure_time", + ":motion_estimation_cc_proto", + ":motion_models", + ":motion_models_cc_proto", + ":parallel_invoker", + ":region_flow", + ":region_flow_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/strings", + "@eigen_archive//:eigen", + ], +) + +cc_library( + name = "motion_saliency", + srcs = ["motion_saliency.cc"], + hdrs = ["motion_saliency.h"], + deps = [ + ":camera_motion", + ":measure_time", + ":motion_saliency_cc_proto", + ":region_flow", + ":region_flow_cc_proto", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:vector", + ], +) + +cc_library( + name = "push_pull_filtering", + hdrs = ["push_pull_filtering.h"], + deps = [ + ":image_util", + ":push_pull_filtering_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_highgui", + ], +) + +cc_library( + 
name = "tone_models", + srcs = ["tone_models.cc"], + hdrs = ["tone_models.h"], + deps = [ + ":tone_models_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_library( + name = "tone_estimation", + srcs = ["tone_estimation.cc"], + hdrs = ["tone_estimation.h"], + deps = [ + ":motion_models_cc_proto", + ":region_flow", + ":region_flow_cc_proto", + ":tone_estimation_cc_proto", + ":tone_models", + ":tone_models_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:vector", + ], +) + +cc_library( + name = "region_flow_computation", + srcs = ["region_flow_computation.cc"], + hdrs = ["region_flow_computation.h"], + copts = PARALLEL_COPTS, + linkopts = PARALLEL_LINKOPTS, + deps = [ + ":camera_motion_cc_proto", + ":image_util", + ":measure_time", + ":motion_estimation", + ":motion_estimation_cc_proto", + ":motion_models", + ":motion_models_cc_proto", + ":parallel_invoker", + ":region_flow", + ":region_flow_cc_proto", + ":region_flow_computation_cc_proto", + ":tone_estimation", + ":tone_estimation_cc_proto", + ":tone_models", + ":tone_models_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_features2d", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:opencv_video", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/container:node_hash_set", + "@com_google_absl//absl/memory", + "@eigen_archive//:eigen", + ], +) + +cc_library( + name = "region_flow_visualization", + srcs = ["region_flow_visualization.cc"], + hdrs = ["region_flow_visualization.h"], + deps = [ + 
":measure_time", + ":motion_models", + ":parallel_invoker", + ":region_flow", + ":region_flow_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "motion_analysis", + srcs = ["motion_analysis.cc"], + hdrs = ["motion_analysis.h"], + deps = [ + ":camera_motion", + ":camera_motion_cc_proto", + ":image_util", + ":measure_time", + ":motion_analysis_cc_proto", + ":motion_estimation", + ":motion_estimation_cc_proto", + ":motion_saliency", + ":motion_saliency_cc_proto", + ":push_pull_filtering", + ":region_flow", + ":region_flow_cc_proto", + ":region_flow_computation", + ":region_flow_computation_cc_proto", + ":region_flow_visualization", + ":streaming_buffer", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_library( + name = "flow_packager", + srcs = ["flow_packager.cc"], + hdrs = ["flow_packager.h"], + deps = [ + ":camera_motion", + ":camera_motion_cc_proto", + ":flow_packager_cc_proto", + ":motion_estimation", + ":motion_estimation_cc_proto", + ":motion_models", + ":motion_models_cc_proto", + ":region_flow", + ":region_flow_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "tracking", + srcs = ["tracking.cc"], + hdrs = ["tracking.h"], + copts = PARALLEL_COPTS, + linkopts = PARALLEL_LINKOPTS, + deps = [ + ":flow_packager_cc_proto", + ":measure_time", + ":motion_models", + ":motion_models_cc_proto", + 
":motion_models_cv", + ":parallel_invoker", + ":region_flow", + ":tracking_cc_proto", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_calib3d", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@eigen_archive//:eigen", + ], + alwayslink = 1, +) + +cc_library( + name = "box_tracker", + srcs = ["box_tracker.cc"], + hdrs = ["box_tracker.h"], + deps = [ + ":box_tracker_cc_proto", + ":flow_packager_cc_proto", + ":measure_time", + ":tracking", + ":tracking_cc_proto", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:threadpool", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + ], +) + +cc_library( + name = "tracking_visualization_utilities", + srcs = ["tracking_visualization_utilities.cc"], + hdrs = ["tracking_visualization_utilities.h"], + deps = [ + ":box_tracker", + ":box_tracker_cc_proto", + ":flow_packager_cc_proto", + ":tracking", + ":tracking_cc_proto", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_test( + name = "parallel_invoker_test", + srcs = ["parallel_invoker_test.cc"], + copts = PARALLEL_COPTS, + linkopts = PARALLEL_LINKOPTS, + deps = [ + ":parallel_invoker", + "//mediapipe/framework/port:gtest_main", + "@com_google_absl//absl/synchronization", + ], +) + +cc_test( + name = "image_util_test", + srcs = [ + "image_util_test.cc", + ], + deps = [ + ":image_util", + "//mediapipe/framework/port:gtest_main", + ], +) + +cc_test( + name = "motion_models_test", + srcs = ["motion_models_test.cc"], + deps = [ + ":motion_estimation", + ":motion_models", + 
"//mediapipe/framework/deps:message_matchers", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:parse_text_proto", + "//mediapipe/framework/port:vector", + ], +) + +cc_test( + name = "region_flow_computation_test", + srcs = ["region_flow_computation_test.cc"], + copts = PARALLEL_COPTS, + data = ["testdata/stabilize_test.png"], + linkopts = PARALLEL_LINKOPTS, + deps = [ + ":region_flow", + ":region_flow_cc_proto", + ":region_flow_computation", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/port:commandlineflags", + "//mediapipe/framework/port:file_helpers", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_highgui", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:status", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/time", + ], +) + +cc_test( + name = "box_tracker_test", + timeout = "short", + srcs = ["box_tracker_test.cc"], + data = glob(["testdata/box_tracker/*"]), + deps = [ + ":box_tracker", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/port:gtest_main", + ], +) + +cc_library( + name = "tracked_detection", + srcs = [ + "tracked_detection.cc", + ], + hdrs = [ + "tracked_detection.h", + ], + deps = [ + "//mediapipe/framework/formats:rect_cc_proto", + "//mediapipe/framework/port:vector", + "@com_google_absl//absl/container:node_hash_map", + ], +) + +cc_test( + name = "tracked_detection_test", + srcs = [ + "tracked_detection_test.cc", + ], + deps = [ + ":tracked_detection", + "//mediapipe/framework/port:gtest_main", + ], +) + +cc_library( + name = "tracked_detection_manager", + srcs = [ + "tracked_detection_manager.cc", + ], + hdrs = [ + "tracked_detection_manager.h", + ], + deps = [ + ":tracked_detection", + "//mediapipe/framework/formats:rect_cc_proto", + "@com_google_absl//absl/container:node_hash_map", + ], +) diff --git 
a/mediapipe/util/tracking/OWNERS b/mediapipe/util/tracking/OWNERS new file mode 100644 index 000000000..a5c705e31 --- /dev/null +++ b/mediapipe/util/tracking/OWNERS @@ -0,0 +1,2 @@ +jianingwei +yegenzhi diff --git a/mediapipe/util/tracking/box_tracker.cc b/mediapipe/util/tracking/box_tracker.cc new file mode 100644 index 000000000..2111eac4b --- /dev/null +++ b/mediapipe/util/tracking/box_tracker.cc @@ -0,0 +1,1062 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/box_tracker.h" + +#include + +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/tracking.pb.h" + +namespace mediapipe { + +// Time within which close checkpoints are removed. 
+static constexpr int kSnapMs = 1000; +static constexpr int kInitCheckpoint = -1; + +void MotionBoxStateQuadToVertices(const MotionBoxState::Quad& quad, + std::vector<Vector2_f>* vertices) { + CHECK_EQ(TimedBox::kNumQuadVertices * 2, quad.vertices_size()); + CHECK(vertices != nullptr); + vertices->clear(); + for (int i = 0; i < TimedBox::kNumQuadVertices; ++i) { + vertices->push_back( + Vector2_f(quad.vertices(i * 2), quad.vertices(i * 2 + 1))); + } +} + +void VerticesToMotionBoxStateQuad(const std::vector<Vector2_f>& vertices, + MotionBoxState::Quad* quad) { + CHECK_EQ(TimedBox::kNumQuadVertices, vertices.size()); + CHECK(quad != nullptr); + for (const Vector2_f& vertex : vertices) { + quad->add_vertices(vertex.x()); + quad->add_vertices(vertex.y()); + } +} + +void MotionBoxStateFromTimedBox(const TimedBox& box, MotionBoxState* state) { + CHECK(state); + state->set_pos_x(box.left); + state->set_pos_y(box.top); + state->set_width(box.right - box.left); + state->set_height(box.bottom - box.top); + state->set_rotation(box.rotation); + state->set_request_grouping(box.request_grouping); + if (box.quad_vertices.size() == TimedBox::kNumQuadVertices) { + VerticesToMotionBoxStateQuad(box.quad_vertices, state->mutable_quad()); + + if (box.aspect_ratio > 0.0f) { + state->set_aspect_ratio(box.aspect_ratio); + } + + // set pos_x and pos_y to be the top-left vertex x and y coordinates + // set width and height to be max - min of x and y. 
+ float min_x = std::numeric_limits<float>::max(); + float max_x = std::numeric_limits<float>::lowest(); + float min_y = std::numeric_limits<float>::max(); + float max_y = std::numeric_limits<float>::lowest(); + for (const auto& vertex : box.quad_vertices) { + min_x = std::min(min_x, vertex.x()); + max_x = std::max(max_x, vertex.x()); + min_y = std::min(min_y, vertex.y()); + max_y = std::max(max_y, vertex.y()); + } + state->set_pos_x(min_x); + state->set_pos_y(min_y); + state->set_width(max_x - min_x); + state->set_height(max_y - min_y); + } +} + +void TimedBoxFromMotionBoxState(const MotionBoxState& state, TimedBox* box) { + CHECK(box); + const float scale_dx = state.width() * (state.scale() - 1.0f) * 0.5f; + const float scale_dy = state.height() * (state.scale() - 1.0f) * 0.5f; + box->left = state.pos_x() - scale_dx; + box->top = state.pos_y() - scale_dy; + box->right = state.pos_x() + state.width() + scale_dx; + box->bottom = state.pos_y() + state.height() + scale_dy; + box->rotation = state.rotation(); + box->confidence = state.tracking_confidence(); + box->request_grouping = state.request_grouping(); + if (state.has_quad()) { + MotionBoxStateQuadToVertices(state.quad(), &(box->quad_vertices)); + + if (state.has_aspect_ratio()) { + box->aspect_ratio = state.aspect_ratio(); + } + } +} + +namespace { + +TimedBox BlendTimedBoxes(const TimedBox& lhs, const TimedBox& rhs, + int64 time_msec) { + CHECK_LT(lhs.time_msec, rhs.time_msec); + const double alpha = + (time_msec - lhs.time_msec) * 1.0 / (rhs.time_msec - lhs.time_msec); + return TimedBox::Blend(lhs, rhs, alpha); +} + +} // namespace. + +TimedBox TimedBox::Blend(const TimedBox& lhs, const TimedBox& rhs, double alpha, + double beta) { + // Due to large timestamps alpha beta should be in double. 
+ TimedBox result; + result.top = alpha * lhs.top + beta * rhs.top; + result.left = alpha * lhs.left + beta * rhs.left; + result.bottom = alpha * lhs.bottom + beta * rhs.bottom; + result.right = alpha * lhs.right + beta * rhs.right; + result.rotation = alpha * lhs.rotation + beta * rhs.rotation; + result.time_msec = std::round(alpha * lhs.time_msec + beta * rhs.time_msec); + result.confidence = alpha * lhs.confidence + beta * rhs.confidence; + if (lhs.quad_vertices.size() == kNumQuadVertices && + rhs.quad_vertices.size() == kNumQuadVertices) { + result.quad_vertices.clear(); + for (int i = 0; i < lhs.quad_vertices.size(); ++i) { + result.quad_vertices.push_back(alpha * lhs.quad_vertices[i] + + beta * rhs.quad_vertices[i]); + } + + // Since alpha and beta are not necessarily sum to 1, aspect ratio can not + // be derived with alpha and beta. Here we are simply averaging the + // aspect_ratio as the blended box aspect ratio. + if (lhs.aspect_ratio > 0 && rhs.aspect_ratio > 0) { + result.aspect_ratio = 0.5f * lhs.aspect_ratio + 0.5f * rhs.aspect_ratio; + } + } + return result; +} + +TimedBox TimedBox::Blend(const TimedBox& lhs, const TimedBox& rhs, + double alpha) { + return Blend(lhs, rhs, 1.0 - alpha, alpha); +} + +std::array<Vector2_f, 4> TimedBox::Corners(float width, float height) const { + if (quad_vertices.size() == kNumQuadVertices) { + std::array<Vector2_f, 4> corners{{ + Vector2_f(quad_vertices[0].x() * width, quad_vertices[0].y() * height), + Vector2_f(quad_vertices[1].x() * width, quad_vertices[1].y() * height), + Vector2_f(quad_vertices[2].x() * width, quad_vertices[2].y() * height), + Vector2_f(quad_vertices[3].x() * width, quad_vertices[3].y() * height), + }}; + return corners; + } else { + // Rotate 4 corners w.r.t. center. 
+ const Vector2_f center(0.5f * (left + right) * width, + 0.5f * (top + bottom) * height); + const std::array<Vector2_f, 4> corners{{ + Vector2_f(left * width, top * height), + Vector2_f(left * width, bottom * height), + Vector2_f(right * width, bottom * height), + Vector2_f(right * width, top * height), + }}; + + const float cos_a = std::cos(rotation); + const float sin_a = std::sin(rotation); + std::array<Vector2_f, 4> transformed_corners; + for (int k = 0; k < 4; ++k) { + // Scale and rotate w.r.t. center. + const Vector2_f rad = corners[k] - center; + const Vector2_f rot_rad(cos_a * rad.x() - sin_a * rad.y(), + sin_a * rad.x() + cos_a * rad.y()); + transformed_corners[k] = center + rot_rad; + } + return transformed_corners; + } +} + +TimedBox TimedBox::FromProto(const TimedBoxProto& proto) { + TimedBox box; + box.top = proto.top(); + box.left = proto.left(); + box.bottom = proto.bottom(); + box.right = proto.right(); + box.rotation = proto.rotation(); + box.time_msec = proto.time_msec(); + box.request_grouping = proto.request_grouping(); + if (proto.has_quad() && + proto.quad().vertices_size() == kNumQuadVertices * 2) { + MotionBoxStateQuadToVertices(proto.quad(), &(box.quad_vertices)); + + if (proto.has_aspect_ratio()) { + box.aspect_ratio = proto.aspect_ratio(); + } + } + return box; +} + +TimedBoxProto TimedBox::ToProto() const { + TimedBoxProto proto; + proto.set_top(top); + proto.set_left(left); + proto.set_bottom(bottom); + proto.set_right(right); + proto.set_rotation(rotation); + proto.set_time_msec(time_msec); + proto.set_confidence(confidence); + proto.set_request_grouping(request_grouping); + if (quad_vertices.size() == kNumQuadVertices) { + VerticesToMotionBoxStateQuad(quad_vertices, proto.mutable_quad()); + + if (aspect_ratio > 0.0f) { + proto.set_aspect_ratio(aspect_ratio); + } + } + return proto; +} + +BoxTracker::BoxTracker(const std::string& cache_dir, + const BoxTrackerOptions& options) + : options_(options), cache_dir_(cache_dir) { + tracking_workers_.reset(new 
ThreadPool(options_.num_tracking_workers())); + tracking_workers_->StartWorkers(); +} + +BoxTracker::BoxTracker( + const std::vector& tracking_data, bool copy_data, + const BoxTrackerOptions& options) + : BoxTracker("", options) { + AddTrackingDataChunks(tracking_data, copy_data); +} + +void BoxTracker::AddTrackingDataChunk(const TrackingDataChunk* chunk, + bool copy_data) { + CHECK_GT(chunk->item_size(), 0) << "Empty chunk."; + int64 chunk_time_msec = chunk->item(0).timestamp_usec() / 1000; + int chunk_idx = ChunkIdxFromTime(chunk_time_msec); + CHECK_GE(chunk_idx, tracking_data_.size()) << "Chunk is out of order."; + if (chunk_idx > tracking_data_.size()) { + LOG(INFO) << "Resize tracking_data_ to " << chunk_idx; + tracking_data_.resize(chunk_idx); + } + if (copy_data) { + tracking_data_buffer_.emplace_back(new TrackingDataChunk(*chunk)); + tracking_data_.push_back(tracking_data_buffer_.back().get()); + } else { + tracking_data_.emplace_back(chunk); + } +} + +void BoxTracker::AddTrackingDataChunks( + const std::vector& tracking_data, + bool copy_data) { + for (const auto item : tracking_data) { + AddTrackingDataChunk(item, copy_data); + } +} + +void BoxTracker::NewBoxTrack(const TimedBox& initial_pos, int id, + int64 min_msec, int64 max_msec) { + VLOG(1) << "New box track: " << id << " : " << initial_pos.ToString() + << " from " << min_msec << " to " << max_msec; + + // Mark initialization with checkpoint -1. + absl::MutexLock lock(&status_mutex_); + + if (canceling_) { + LOG(WARNING) << "Box Tracker is in cancel state. 
Refusing request."; + return; + } + ++track_status_[id][kInitCheckpoint].tracks_ongoing; + + auto operation = [this, initial_pos, id, min_msec, max_msec]() { + this->NewBoxTrackAsync(initial_pos, id, min_msec, max_msec); + }; + + tracking_workers_->Schedule(operation); +} + +std::pair BoxTracker::TrackInterval(int id) { + absl::MutexLock lock(&path_mutex_); + const Path& path = paths_[id]; + if (path.empty()) { + return std::make_pair(-1, -1); + } + + auto first_interval = path.begin()->second; + auto last_interval = path.rbegin()->second; + + return std::make_pair(first_interval.front().time_msec, + last_interval.back().time_msec); +} + +void BoxTracker::NewBoxTrackAsync(const TimedBox& initial_pos, int id, + int64 min_msec, int64 max_msec) { + VLOG(1) << "Async track for id: " << id << " from " << min_msec << " to " + << max_msec; + + // Determine start position and track forward and backward. + int chunk_idx = ChunkIdxFromTime(initial_pos.time_msec); + + VLOG(1) << "Starting at chunk " << chunk_idx; + + AugmentedChunkPtr tracking_chunk(ReadChunk(id, kInitCheckpoint, chunk_idx)); + + if (!tracking_chunk.first) { + absl::MutexLock lock(&status_mutex_); + --track_status_[id][kInitCheckpoint].tracks_ongoing; + LOG(ERROR) << "Could not read tracking chunk from file: " << chunk_idx + << " for start position: " << initial_pos.ToString(); + return; + } + + // Grab ownership here, to avoid any memory leaks due to early return. + std::unique_ptr chunk_owned; + if (tracking_chunk.second) { + chunk_owned.reset(tracking_chunk.first); + } + + const int start_frame = + ClosestFrameIndex(initial_pos.time_msec, *tracking_chunk.first); + + VLOG(1) << "Local start frame: " << start_frame; + + // Update starting position to coincide with a frame. 
+ TimedBox start_pos = initial_pos; + start_pos.time_msec = + tracking_chunk.first->item(start_frame).timestamp_usec() / 1000; + + VLOG(1) << "Request at " << initial_pos.time_msec << " revised to " + << start_pos.time_msec; + + const int checkpoint = start_pos.time_msec; + + // TODO: + // Compute min and max for tracking based on existing check points. + if (!WaitToScheduleId(id)) { + // Could not schedule, id already being canceled. + return; + } + + // If another checkpoint is close by, cancel that one. + VLOG(1) << "Removing close checkpoints"; + absl::MutexLock lock(&status_mutex_); + RemoveCloseCheckpoints(id, checkpoint); + + VLOG(1) << "Cancel existing tracks"; + CancelTracking(id, checkpoint); + + // Remove checkpoint results (to be replaced with current one). + ClearCheckpoint(id, checkpoint); + + MotionBoxState start_state; + MotionBoxStateFromTimedBox(start_pos, &start_state); + + VLOG(1) << "Adding initial result"; + AddBoxResult(start_pos, id, checkpoint, start_state); + + // Perform forward and backward tracking and add to current PathSegment. + // Track forward. + track_status_[id][checkpoint].tracks_ongoing += 2; + + VLOG(1) << "Starting tracking workers ... "; + + AugmentedChunkPtr forward_chunk = tracking_chunk; + AugmentedChunkPtr backward_chunk = tracking_chunk; + + if (tracking_chunk.second) { // We have ownership, need a copy here. + forward_chunk = std::make_pair(new TrackingDataChunk(*chunk_owned), true); + backward_chunk = std::make_pair(chunk_owned.release(), true); + } + + auto forward_operation = [this, forward_chunk, start_state, start_frame, + chunk_idx, id, checkpoint, min_msec, max_msec]() { + this->TrackingImpl(TrackingImplArgs(forward_chunk, start_state, start_frame, + chunk_idx, id, checkpoint, true, true, + min_msec, max_msec)); + }; + + tracking_workers_->Schedule(forward_operation); + + // Track backward. 
+ auto backward_operation = [this, backward_chunk, start_state, start_frame, + chunk_idx, id, checkpoint, min_msec, max_msec]() { + this->TrackingImpl(TrackingImplArgs(backward_chunk, start_state, + start_frame, chunk_idx, id, checkpoint, + false, true, min_msec, max_msec)); + }; + + tracking_workers_->Schedule(backward_operation); + + DoneSchedulingId(id); + + // Tell a waiting request that we are done scheduling. + status_condvar_.SignalAll(); + VLOG(1) << "Scheduling done for " << id; +} + +void BoxTracker::RemoveCloseCheckpoints(int id, int checkpoint) { + if (track_status_[id].empty()) { + return; + } + + auto pos = track_status_[id].lower_bound(checkpoint); + // Test current and previous location (if possible). + int num_turns = 1; + if (pos != track_status_[id].begin()) { + --pos; + ++num_turns; + } + + for (int k = 0; k < num_turns; ++k, ++pos) { + if (pos != track_status_[id].end()) { + const int check_pos = pos->first; + // Ignore marker init checkpoint from track_status_. + if (check_pos > kInitCheckpoint && + std::abs(check_pos - checkpoint) < kSnapMs) { + CancelTracking(id, check_pos); + ClearCheckpoint(id, check_pos); + } + } else { + break; + } + } +} + +bool BoxTracker::WaitToScheduleId(int id) { + VLOG(1) << "Wait to schedule id: " << id; + absl::MutexLock lock(&status_mutex_); + while (new_box_track_[id]) { + // Box tracking is currently ongoing for this id. + if (track_status_[id][kInitCheckpoint].canceled) { + // Canceled, remove myself from ongoing tracks. + --track_status_[id][kInitCheckpoint].tracks_ongoing; + status_condvar_.SignalAll(); + return false; + } + + // Only one request can be processing in the section till end of the + // function NewBoxTrackAsync at a time. + status_condvar_.Wait(&status_mutex_); + } + + // We got canceled already, don't proceed. 
+ if (track_status_[id][kInitCheckpoint].canceled) { + --track_status_[id][kInitCheckpoint].tracks_ongoing; + status_condvar_.SignalAll(); + return false; + } + + // Signal we are about to schedule new tracking. + new_box_track_[id] = true; + VLOG(1) << "Ready to schedule id: " << id; + return true; +} + +void BoxTracker::DoneSchedulingId(int id) { + new_box_track_[id] = false; + --track_status_[id][kInitCheckpoint].tracks_ongoing; +} + +void BoxTracker::CancelTracking(int id, int checkpoint) { + // Wait for ongoing requests to terminate. + while (track_status_[id][checkpoint].tracks_ongoing != 0) { + // Cancel all ongoing requests. + track_status_[id][checkpoint].canceled = true; + status_condvar_.Wait(&status_mutex_); + } + + track_status_[id][checkpoint].canceled = false; +} + +bool BoxTracker::GetTimedPosition(int id, int64 time_msec, TimedBox* result, + std::vector* states) { + CHECK(result); + + MotionBoxState* lhs_box_state = nullptr; + MotionBoxState* rhs_box_state = nullptr; + if (states) { + CHECK(options_.record_path_states()) + << "Requesting corresponding tracking states requires option " + << "record_path_states to be set"; + states->resize(1); + lhs_box_state = rhs_box_state = &states->at(0); + } + + VLOG(1) << "Obtaining result at " << time_msec; + + absl::MutexLock lock(&path_mutex_); + const Path& path = paths_[id]; + if (path.size() < 1) { + LOG(ERROR) << "Empty path!"; + return false; + } + + // Find corresponding checkpoint. + auto check_pos = path.lower_bound(time_msec); + if (check_pos == path.begin()) { + VLOG(1) << "To left"; + // We are to the left of the earliest checkpoint. + return TimedBoxAtTime(check_pos->second, time_msec, result, lhs_box_state); + } + if (check_pos == path.end()) { + VLOG(1) << "To right"; + --check_pos; + // We are to the right of the lastest checkpoint. 
+ return TimedBoxAtTime(check_pos->second, time_msec, result, rhs_box_state); + } + + VLOG(1) << "Blending ..."; + + // We are inbetween checkpoints, get result for each, then blend. + const PathSegment& rhs = check_pos->second; + const int check_rhs = check_pos->first; + --check_pos; + const PathSegment& lhs = check_pos->second; + const int check_lhs = check_pos->first; + + TimedBox lhs_box; + TimedBox rhs_box; + if (states) { + states->resize(2); + lhs_box_state = &states->at(0); + rhs_box_state = &states->at(1); + } + + if (!TimedBoxAtTime(lhs, time_msec, &lhs_box, lhs_box_state)) { + return false; + } + + if (!TimedBoxAtTime(rhs, time_msec, &rhs_box, rhs_box_state)) { + return false; + } + + VLOG(1) << "Blending: " << lhs_box.ToString() << " and " + << rhs_box.ToString(); + const double alpha = (time_msec - check_lhs) * 1.0 / (check_rhs - check_lhs); + *result = TimedBox::Blend(lhs_box, rhs_box, alpha); + + return true; +} + +bool BoxTracker::IsTrackingOngoingForId(int id) { + absl::MutexLock lock(&status_mutex_); + for (const auto& item : track_status_[id]) { + if (item.second.tracks_ongoing > 0) { + return true; + } + } + return false; +} + +bool BoxTracker::IsTrackingOngoing() { + absl::MutexLock lock(&status_mutex_); + return IsTrackingOngoingMutexHeld(); +} + +bool BoxTracker::IsTrackingOngoingMutexHeld() { + for (const auto& id : track_status_) { + for (const auto& item : id.second) { + if (item.second.tracks_ongoing > 0) { + return true; + } + } + } + return false; +} + +BoxTracker::AugmentedChunkPtr BoxTracker::ReadChunk(int id, int checkpoint, + int chunk_idx) { + VLOG(1) << __FUNCTION__ << " id=" << id << " chunk_idx=" << chunk_idx; + if (cache_dir_.empty() && !tracking_data_.empty()) { + if (chunk_idx < tracking_data_.size()) { + return std::make_pair(tracking_data_[chunk_idx], false); + } else { + LOG(ERROR) << "chunk_idx >= tracking_data_.size()"; + return std::make_pair(nullptr, false); + } + } else { + std::unique_ptr chunk_data( + 
ReadChunkFromCache(id, checkpoint, chunk_idx)); + return std::make_pair(chunk_data.release(), true); + } +} + +std::unique_ptr BoxTracker::ReadChunkFromCache( + int id, int checkpoint, int chunk_idx) { + VLOG(1) << __FUNCTION__ << " id=" << id << " chunk_idx=" << chunk_idx; + + auto format_runtime = + absl::ParsedFormat<'d'>::New(options_.cache_file_format()); + + std::string chunk_file; + if (format_runtime) { + chunk_file = cache_dir_ + "/" + absl::StrFormat(*format_runtime, chunk_idx); + } else { + LOG(ERROR) << "chache_file_format wrong. fall back to chunk_%04d."; + chunk_file = cache_dir_ + "/" + absl::StrFormat("chunk_%04d", chunk_idx); + } + + VLOG(1) << "Reading chunk from cache: " << chunk_file; + std::unique_ptr chunk_data(new TrackingDataChunk()); + + struct stat tmp; + if (stat(chunk_file.c_str(), &tmp)) { + if (!WaitForChunkFile(id, checkpoint, chunk_file)) { + return nullptr; + } + } + + VLOG(1) << "File exists, reading ..."; + + std::ifstream in(chunk_file, std::ios::in | std::ios::binary); + if (!in) { + LOG(ERROR) << "Could not read chunk file: " << chunk_file; + return nullptr; + } + + std::string data; + in.seekg(0, std::ios::end); + data.resize(in.tellg()); + in.seekg(0, std::ios::beg); + in.read(&data[0], data.size()); + in.close(); + + chunk_data->ParseFromString(data); + + VLOG(1) << "Read success"; + return chunk_data; +} + +bool BoxTracker::WaitForChunkFile(int id, int checkpoint, + const std::string& chunk_file) { + VLOG(1) << "Chunk no exists, waiting for file: " << chunk_file; + + const int timeout_msec = options_.read_chunk_timeout_msec(); + + // Exponential backoff sleep till file exists. + int wait_time_msec = 20; + + VLOG(1) << "In wait for chunk ...: " << chunk_file; + + // Maximum wait time. + const int kMaxWaitPeriod = 5000; + int total_wait_msec = 0; + bool file_exists = false; + + while (!file_exists && total_wait_msec < timeout_msec) { + // Check if we got canceled. 
+ { + absl::MutexLock lock(&status_mutex_); + if (track_status_[id][checkpoint].canceled) { + return false; + } + } + + usleep(wait_time_msec * 1000); + total_wait_msec += wait_time_msec; + + struct stat tmp; + file_exists = stat(chunk_file.c_str(), &tmp) == 0; + + if (file_exists) { + VLOG(1) << "Sucessfully waited on " << chunk_file << " for " + << total_wait_msec; + break; + } + if (wait_time_msec < kMaxWaitPeriod) { + wait_time_msec *= 1.5; + } + } + + return file_exists; +} + +int BoxTracker::ClosestFrameIndex(int64 msec, + const TrackingDataChunk& chunk) const { + CHECK_GT(chunk.item_size(), 0); + typedef TrackingDataChunk::Item Item; + Item item_to_find; + item_to_find.set_timestamp_usec(msec * 1000); + int pos = + std::lower_bound(chunk.item().begin(), chunk.item().end(), item_to_find, + [](const Item& lhs, const Item& rhs) -> bool { + return lhs.timestamp_usec() < rhs.timestamp_usec(); + }) - + chunk.item().begin(); + + // Skip end. + if (pos == chunk.item_size()) { + return pos - 1; + } else if (pos == 0) { + // Nothing smaller exists. + return 0; + } + + // Determine closest timestamp. + const int64 lhs_diff = msec - chunk.item(pos - 1).timestamp_usec() / 1000; + const int64 rhs_diff = chunk.item(pos).timestamp_usec() / 1000 - msec; + + if (std::min(lhs_diff, rhs_diff) >= 67) { + LOG(ERROR) << "No frame found within 67ms, probably using wrong chunk."; + } + + if (lhs_diff < rhs_diff) { + return pos - 1; + } else { + return pos; + } +} + +void BoxTracker::AddBoxResult(const TimedBox& box, int id, int checkpoint, + const MotionBoxState& state) { + absl::MutexLock lock(&path_mutex_); + PathSegment& segment = paths_[id][checkpoint]; + auto insert_pos = std::lower_bound(segment.begin(), segment.end(), box); + const bool store_state = options_.record_path_states(); + + // Don't overwrite an existing box. + if (insert_pos == segment.end() || insert_pos->time_msec != box.time_msec) { + segment.insert(insert_pos, + InternalTimedBox( + box, store_state ? 
new MotionBoxState(state) : nullptr)); + } +} + +void BoxTracker::ClearCheckpoint(int id, int checkpoint) { + absl::MutexLock lock(&path_mutex_); + PathSegment& segment = paths_[id][checkpoint]; + segment.clear(); +} + +void BoxTracker::TrackingImpl(const TrackingImplArgs& a) { + TrackStepOptions track_step_options = options_.track_step_options(); + ChangeTrackingDegreesBasedOnStartPos(a.start_state, &track_step_options); + MotionBox motion_box(track_step_options); + const int chunk_data_size = a.chunk_data->item_size(); + + CHECK_GE(a.start_frame, 0); + CHECK_LT(a.start_frame, chunk_data_size); + + VLOG(1) << " a.start_frame = " << a.start_frame << " @" + << a.chunk_data->item(a.start_frame).timestamp_usec() << " with " + << chunk_data_size << " items"; + motion_box.ResetAtFrame(a.start_frame, a.start_state); + + auto cleanup_func = [&a, this]() -> void { + if (a.first_call) { + // Signal we are done processing in this direction. + absl::MutexLock lock(&status_mutex_); + --track_status_[a.id][a.checkpoint].tracks_ongoing; + status_condvar_.SignalAll(); + } + }; + + if (a.forward) { + // TrackingData at frame f, contains tracking information from + // frame f to f - 1. Get information at frame f + 1 and invert: + // Tracking from f to f + 1. + for (int f = a.start_frame; f + 1 < chunk_data_size; ++f) { + // Note: we use / 1000 instead of * 1000 to avoid overflow. + if (a.chunk_data->item(f + 1).timestamp_usec() / 1000 > a.max_msec) { + VLOG(2) << "Reached maximum tracking timestamp @" << a.max_msec; + break; + } + VLOG(1) << "Track forward from " << f; + MotionVectorFrame mvf; + MotionVectorFrameFromTrackingData( + a.chunk_data->item(f + 1).tracking_data(), &mvf); + const int track_duration_ms = + TrackingDataDurationMs(a.chunk_data->item(f + 1)); + if (track_duration_ms > 0) { + mvf.duration_ms = track_duration_ms; + } + + // If this is the first frame in a chunk, there might be an unobserved + // chunk boundary at the first frame. 
+ if (f == 0 && a.chunk_data->item(0).tracking_data().frame_flags() & + TrackingData::FLAG_CHUNK_BOUNDARY) { + mvf.is_chunk_boundary = true; + } + + MotionVectorFrame mvf_inverted; + InvertMotionVectorFrame(mvf, &mvf_inverted); + + const bool forward_tracking = true; + if (!motion_box.TrackStep(f, mvf_inverted, forward_tracking)) { + VLOG(1) << "Failed forward at frame: " << f; + break; + } else { + // Test if current request is canceled. + { + absl::MutexLock lock(&status_mutex_); + if (track_status_[a.id][a.checkpoint].canceled) { + --track_status_[a.id][a.checkpoint].tracks_ongoing; + status_condvar_.SignalAll(); + return; + } + } + + TimedBox result; + const MotionBoxState& result_state = motion_box.StateAtFrame(f + 1); + TimedBoxFromMotionBoxState(result_state, &result); + result.time_msec = a.chunk_data->item(f + 1).timestamp_usec() / 1000; + AddBoxResult(result, a.id, a.checkpoint, result_state); + } + + if (f + 2 == chunk_data_size && !a.chunk_data->last_chunk()) { + // Last frame, successful track, continue; + AugmentedChunkPtr next_chunk( + ReadChunk(a.id, a.checkpoint, a.chunk_idx + 1)); + + if (next_chunk.first != nullptr) { + TrackingImplArgs next_args(next_chunk, motion_box.StateAtFrame(f + 1), + 0, a.chunk_idx + 1, a.id, a.checkpoint, + a.forward, false, a.min_msec, a.max_msec); + + TrackingImpl(next_args); + } else { + cleanup_func(); + LOG(ERROR) << "Can't read expected chunk file!"; + } + } + } + } else { + // Backward tracking. + // Don't attempt to track from the very first frame backwards. + const int first_frame = a.chunk_data->first_chunk() ? 
1 : 0; + + for (int f = a.start_frame; f >= first_frame; --f) { + if (a.chunk_data->item(f).timestamp_usec() / 1000 < a.min_msec) { + VLOG(2) << "Reached minimum tracking timestamp @" << a.min_msec; + break; + } + VLOG(1) << "Track backward from " << f; + MotionVectorFrame mvf; + MotionVectorFrameFromTrackingData(a.chunk_data->item(f).tracking_data(), + &mvf); + const int64 track_duration_ms = + TrackingDataDurationMs(a.chunk_data->item(f)); + if (track_duration_ms > 0) { + mvf.duration_ms = track_duration_ms; + } + const bool forward_tracking = false; + if (!motion_box.TrackStep(f, mvf, forward_tracking)) { + VLOG(1) << "Failed backward at frame: " << f; + break; + } else { + // Test if current request is canceled. + { + absl::MutexLock lock(&status_mutex_); + if (track_status_[a.id][a.checkpoint].canceled) { + --track_status_[a.id][a.checkpoint].tracks_ongoing; + status_condvar_.SignalAll(); + return; + } + } + + TimedBox result; + const MotionBoxState& result_state = motion_box.StateAtFrame(f - 1); + TimedBoxFromMotionBoxState(result_state, &result); + result.time_msec = a.chunk_data->item(f).prev_timestamp_usec() / 1000; + AddBoxResult(result, a.id, a.checkpoint, result_state); + } + + if (f == first_frame && !a.chunk_data->first_chunk()) { + VLOG(1) << "Read next chunk: " << f << "==" << first_frame << " in " + << a.chunk_idx; + // First frame, successful track, continue. + AugmentedChunkPtr prev_chunk( + ReadChunk(a.id, a.checkpoint, a.chunk_idx - 1)); + if (prev_chunk.first != nullptr) { + const int last_frame = prev_chunk.first->item_size() - 1; + TrackingImplArgs prev_args(prev_chunk, motion_box.StateAtFrame(f - 1), + last_frame, a.chunk_idx - 1, a.id, + a.checkpoint, a.forward, false, a.min_msec, + a.max_msec); + + TrackingImpl(prev_args); + } else { + cleanup_func(); + LOG(ERROR) << "Can't read expected chunk file! 
" << a.chunk_idx - 1 + << " while tracking @" + << a.chunk_data->item(f).timestamp_usec() / 1000 + << " with cutoff " << a.min_msec; + return; + } + } + } + } + + cleanup_func(); +} + +bool TimedBoxAtTime(const PathSegment& segment, int64 time_msec, TimedBox* box, + MotionBoxState* state) { + CHECK(box); + + if (segment.empty()) { + return false; + } + + TimedBox to_find; + to_find.time_msec = time_msec; + auto pos = std::lower_bound(segment.begin(), segment.end(), to_find); + if (pos != segment.end() && pos->time_msec == time_msec) { + *box = *pos; + if (state) { + *state = *pos->state; + } + return true; + } + + constexpr int kMaxDiff = 67; + + if (pos == segment.begin()) { + if (pos->time_msec - time_msec < kMaxDiff) { + *box = *pos; + if (state && pos->state) { + *state = *pos->state; + } + return true; + } else { + return false; + } + } + + if (pos == segment.end()) { + if (time_msec - pos[-1].time_msec < kMaxDiff) { + *box = pos[-1]; + if (state && pos[-1].state) { + *state = *pos[-1].state; + } + return true; + } else { + return false; + } + } + + // Interpolation necessary. + *box = BlendTimedBoxes(pos[-1], pos[0], time_msec); + if (state) { + // Grab closest state. 
+ if (std::abs(pos[-1].time_msec - time_msec) < + std::abs(pos[0].time_msec - time_msec)) { + if (pos[-1].state) { + *state = *pos[-1].state; + } + } else { + if (pos[0].state) { + *state = *pos[0].state; + } + } + } + return true; +} + +void BoxTracker::ResumeTracking() { + absl::MutexLock lock(&status_mutex_); + canceling_ = false; +} + +void BoxTracker::CancelAllOngoingTracks() { + // Get a list of items to be canceled (id, checkpoint) + absl::MutexLock lock(&status_mutex_); + canceling_ = true; + + std::vector> to_be_canceled; + for (auto& id : track_status_) { + for (auto& checkpoint : id.second) { + if (checkpoint.second.tracks_ongoing > 0) { + checkpoint.second.canceled = true; + to_be_canceled.push_back(std::make_pair(id.first, checkpoint.first)); + } + } + } + + // Wait for ongoing requests to terminate. + auto on_going_test = [&to_be_canceled, this]() -> bool { + status_mutex_.AssertHeld(); + for (const auto& item : to_be_canceled) { + if (track_status_[item.first][item.second].tracks_ongoing > 0) { + return true; + } + } + return false; + }; + + while (on_going_test()) { + status_condvar_.Wait(&status_mutex_); + } + + // Indicate we are done canceling. + for (const auto& item : to_be_canceled) { + track_status_[item.first][item.second].canceled = false; + } +} + +bool BoxTracker::WaitForAllOngoingTracks(int timeout_us) { + MEASURE_TIME << "Tracking time ..."; + absl::MutexLock lock(&status_mutex_); + + // Infinite wait for timeout <= 0. + absl::Duration timeout = timeout_us > 0 ? 
absl::Microseconds(timeout_us) + : absl::InfiniteDuration(); + + while (timeout > absl::ZeroDuration() && IsTrackingOngoingMutexHeld()) { + absl::Time start_wait = absl::Now(); + status_condvar_.WaitWithTimeout(&status_mutex_, timeout); + + absl::Duration elapsed = absl::Now() - start_wait; + timeout -= elapsed; + } + + return !IsTrackingOngoingMutexHeld(); +} + +bool BoxTracker::GetTrackingData(int id, int64 request_time_msec, + TrackingData* tracking_data, + int* tracking_data_msec) { + CHECK(tracking_data); + + int chunk_idx = ChunkIdxFromTime(request_time_msec); + + AugmentedChunkPtr tracking_chunk(ReadChunk(id, kInitCheckpoint, chunk_idx)); + if (!tracking_chunk.first) { + absl::MutexLock lock(&status_mutex_); + --track_status_[id][kInitCheckpoint].tracks_ongoing; + LOG(ERROR) << "Could not read tracking chunk from file."; + return false; + } + + std::unique_ptr owned_chunk; + if (tracking_chunk.second) { + owned_chunk.reset(tracking_chunk.first); + } + + const int closest_frame = + ClosestFrameIndex(request_time_msec, *tracking_chunk.first); + + *tracking_data = tracking_chunk.first->item(closest_frame).tracking_data(); + if (tracking_data_msec) { + *tracking_data_msec = + tracking_chunk.first->item(closest_frame).timestamp_usec() / 1000; + } + return true; +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/box_tracker.h b/mediapipe/util/tracking/box_tracker.h new file mode 100644 index 000000000..72022bf45 --- /dev/null +++ b/mediapipe/util/tracking/box_tracker.h @@ -0,0 +1,372 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_BOX_TRACKER_H_ +#define MEDIAPIPE_UTIL_TRACKING_BOX_TRACKER_H_ + +#include + +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "absl/synchronization/mutex.h" +#include "mediapipe/framework/port/threadpool.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" +#include "mediapipe/util/tracking/flow_packager.pb.h" +#include "mediapipe/util/tracking/tracking.h" +#include "mediapipe/util/tracking/tracking.pb.h" + +namespace mediapipe { + +// Describes a rectangle box at an instance of time. +struct TimedBox { + static const int kNumQuadVertices = 4; + // Dimensions are in normalized coordinates [0, 1]. + float top = 0; + float left = 0; + float bottom = 0; + float right = 0; + // Rotation of box w.r.t. center in radians. + float rotation = 0; + int64 time_msec = 0; + // Confidence of the tracked box in range [0, 1]. + float confidence = 0; + std::vector quad_vertices; + // Aspect ratio (width / height) for the tracked rectangle in physical space. + float aspect_ratio = -1.0; + // Whether we want this box to be potentially grouped with other boxes + // to track together. This is useful for tracking small boxes that lie + // on a plane. For example, when we detect a plane, + // track the plane, then all boxes within the plane can share the same + // homography transform. 
+ bool request_grouping = false; + + bool operator<(const TimedBox& rhs) const { + return time_msec < rhs.time_msec; + } + + // Returns (1.0 - alpha) * lhs + alpha * rhs; + static TimedBox Blend(const TimedBox& lhs, const TimedBox& rhs, double alpha); + + // Returns lhs * alpha + rhs * beta; + static TimedBox Blend(const TimedBox& lhs, const TimedBox& rhs, double alpha, + double beta); + + std::string ToString() const { + return absl::StrFormat( + "top: %.3f left: %.3f bottom: %.3f right: %.3f " + "rot: %.3f t: %d", + top, left, bottom, right, rotation, static_cast(time_msec)); + } + + // Returns corners of TimedBox in the requested domain. + std::array Corners(float width, float height) const; + + static TimedBox FromProto(const TimedBoxProto& proto); + TimedBoxProto ToProto() const; +}; + +// TimedBox augment with internal states. +struct InternalTimedBox : public TimedBox { + InternalTimedBox() = default; + // Convenience constructor. + InternalTimedBox(const TimedBox& box, const MotionBoxState* state_) + : TimedBox(box), state(state_) {} + + // Corresponding MotionBoxState a TimedBox. + std::shared_ptr state; +}; + +// Initializes beginning tracking state from a TimedBox. +void MotionBoxStateFromTimedBox(const TimedBox& box, MotionBoxState* state); + +// Retrieves box position and time from a tracking state. +void TimedBoxFromMotionBoxState(const MotionBoxState& state, TimedBox* box); + +// Downgrade tracking degrees of TrackStepOptions to +// TRACKING_DEGREE_OBJECT_ROTATION_SCALE if originally +// specified TRACKING_DEGREE_OBJECT_PERSPECTIVE, but start pos +// does not contain quad or contains invalid quad. 
+template +void ChangeTrackingDegreesBasedOnStartPos( + const T& start_pos, TrackStepOptions* track_step_options) { + if (track_step_options->tracking_degrees() == + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE && + (!start_pos.has_quad() || start_pos.quad().vertices_size() != 8)) { + track_step_options->set_tracking_degrees( + TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE); + VLOG(1) << "Originally specified TRACKING_DEGREE_OBJECT_PERSPECTIVE, but " + "changed to TRACKING_DEGREE_OBJECT_ROTATION_SCALE"; + } +} + +// Set of boxes for a particular checkpoint. +typedef std::deque PathSegment; + +// Stores the PathSegment for each checkpoint time. +typedef std::map Path; + +// Returns box at specified time for a specific path segment. +// Returns true on success, otherwise box is left untouched. +// Optionally returns closest known MotionBoxState if state is not null. +bool TimedBoxAtTime(const PathSegment& segment, int64 time_msec, TimedBox* box, + MotionBoxState* state = nullptr); + +// Tracks timed boxes from cached TrackingDataChunks created by +// FlowPackagerCalculator. For usage see accompanying test. +class BoxTracker { + public: + // Initializes a new BoxTracker to work on cached TrackingData from a chunk + // directory. + BoxTracker(const std::string& cache_dir, const BoxTrackerOptions& options); + + // Initializes a new BoxTracker to work on the passed TrackingDataChunks. + // If copy_data is true, BoxTracker will retain its own copy of the data; + // otherwise the passed pointer need to be valid for the lifetime of the + // BoxTracker. + BoxTracker(const std::vector& tracking_data, + bool copy_data, const BoxTrackerOptions& options); + + // Add single TrackingDataChunk. This chunk must be correctly aligned with + // existing chunks. If chunk starting timestamp is larger than next valid + // chunk timestamp, empty chunks will be added to fill the gap. 
If copy_data + // is true, BoxTracker will retain its own copy of the data; otherwise the + // passed pointers need to be valid for the lifetime of the BoxTracker. + void AddTrackingDataChunk(const TrackingDataChunk* chunk, bool copy_data); + // Add new TrackingDataChunks. This method allows to streamline BoxTracker + // usage. Instead of collecting all tracking chunks and then calling the + // constructor, it makes possible to pass only minimally required chunks to + // constructor, and then append new chunks when available. + // Caller is responsible to guarantee that all chunks that are necessary for + // tracking are at the disposal of BoxTracker before calling NewBoxTrack, i.e. + // min_msec and max_msec parameters must be in the range of chunks passed to + // BoxTracker. + // Chunks will be added in the order they are specified in tracking_data. Each + // chunk must be correctly aligned with existing chunks. If chunk starting + // timestamp is larger than next valid chunk timestamp, empty chunks will be + // added to fill the gap. If copy_data is true, BoxTracker will retain its own + // copy of the data; otherwise the passed pointers need to be valid for the + // lifetime of the BoxTracker. + void AddTrackingDataChunks( + const std::vector& tracking_data, + bool copy_data); + + // Starts a new track for the specified box until tracking terminates or + // until track length achieves max_length. + // Does not block caller, returns immediately. + // Note: Use positive integers for id, we reserve negative ones for debugging + // and visualization purposes. + void NewBoxTrack(const TimedBox& initial_pos, int id, int64 min_msec = 0, + int64 max_msec = std::numeric_limits::max()); + + // Returns interval for which the state of the specified box is known. + // (Returns -1, -1 if id is missing or no tracking has been done). + std::pair TrackInterval(int id); + + // Returns box position for requested box at specified time. + // Returns false if no such box exists. 
+ // Optional, if record_path_states is enabled in options outputs the + // corresponding states for the resulting TimedBox. Note, as TimedBoxes are + // generally interpolated from the underlying data and blended across + // checkpoints, the returned states are the closest (snapped) known tracking + // states for the left and right checkpoint path. Therefore the size of states + // is either two or one (if only one checkpoint exists). + bool GetTimedPosition(int id, int64 time_msec, TimedBox* result, + std::vector* states = nullptr); + + // Returns chunk index for specified time. + int ChunkIdxFromTime(int64 msec) const { + return msec / options_.caching_chunk_size_msec(); + } + + // Returns true if any tracking is ongoing for the specified id. + bool IsTrackingOngoingForId(int id) LOCKS_EXCLUDED(status_mutex_); + + // Returns true if any tracking is ongoing. + bool IsTrackingOngoing() LOCKS_EXCLUDED(status_mutex_); + + // Cancels all ongoing tracks. To avoid race conditions all NewBoxTrack's in + // flight will also be canceled. Future NewBoxTrack's will be canceled. + // NOTE: To resume execution, you have to call ResumeTracking() before + // issuing more NewBoxTrack calls. + void CancelAllOngoingTracks() LOCKS_EXCLUDED(status_mutex_); + void ResumeTracking() LOCKS_EXCLUDED(status_mutex_); + + // Waits for all ongoing tracks to complete. + // Optionally accepts a timeout in microseconds (== 0 for infinite wait). + // Returns true on success, false if timeout is reached. + // NOTE: If WaitForAllOngoingTracks timed out, CancelAllOngoingTracks() must + // be called before destructing the BoxTracker object or dangeling running + // threads might try to access invalid data. + bool WaitForAllOngoingTracks(int timeout_us = 0) + LOCKS_EXCLUDED(status_mutex_); + + // Debug function to obtain raw TrackingData closest to the specified + // timestamp. This call will read from disk on every invocation so it is + // expensive. 
+ // To not interfere with other tracking requests it is recommended that you + // use a unique id here. + // Returns true on success. + bool GetTrackingData(int id, int64 request_time_msec, + TrackingData* tracking_data, + int* tracking_data_msec = nullptr); + + private: + // Asynchronous implementation function for box tracking. Schedules forward + // and backward tracking. + void NewBoxTrackAsync(const TimedBox& initial_pos, int id, int64 min_msec, + int64 max_msec); + + typedef std::pair AugmentedChunkPtr; + // Attempts to read chunk at chunk_idx if it exists. Reads from cache + // directory or from in memory cache. + // Important: 2nd part of return value indicates if returned tracking data + // will be owned by the caller (if true). In that case caller is responsible + // for releasing the returned chunk. + AugmentedChunkPtr ReadChunk(int id, int checkpoint, int chunk_idx); + + // Attempts to read specified chunk from caching directory. Blocks and waits + // until chunk is available or internal time out is reached. + // Returns nullptr if data could not be read. + std::unique_ptr ReadChunkFromCache(int id, int checkpoint, + int chunk_idx); + + // Waits with timeout for chunkfile to become available. Returns true on + // success, false if waited till timeout or when canceled. + bool WaitForChunkFile(int id, int checkpoint, const std::string& chunk_file) + LOCKS_EXCLUDED(status_mutex_); + + // Determines closest index in passed TrackingDataChunk + int ClosestFrameIndex(int64 msec, const TrackingDataChunk& chunk) const; + + // Adds new TimedBox to specified checkpoint with state. + void AddBoxResult(const TimedBox& box, int id, int checkpoint, + const MotionBoxState& state); + + // Callback can only handle 5 args max. + // Set own_data to true for args to assume ownership. 
+  struct TrackingImplArgs {
+    TrackingImplArgs(AugmentedChunkPtr chunk_ptr,
+                     const MotionBoxState& start_state_, int start_frame_,
+                     int chunk_idx_, int id_, int checkpoint_, bool forward_,
+                     bool first_call_, int64 min_msec_, int64 max_msec_)
+        : start_state(start_state_),
+          start_frame(start_frame_),
+          chunk_idx(chunk_idx_),
+          id(id_),
+          checkpoint(checkpoint_),
+          forward(forward_),
+          first_call(first_call_),
+          min_msec(min_msec_),
+          max_msec(max_msec_) {
+      if (chunk_ptr.second) {
+        chunk_data_buffer.reset(chunk_ptr.first);
+      }
+
+      chunk_data = chunk_ptr.first;
+    }
+
+    TrackingImplArgs(const TrackingImplArgs&) = default;
+
+    // Storage for tracking data.
+    std::shared_ptr<const TrackingDataChunk> chunk_data_buffer;
+
+    // Pointer to the actual tracking data. Usually points to the buffer
+    // but can also point to external data for performance reasons.
+    const TrackingDataChunk* chunk_data;
+
+    MotionBoxState start_state;
+    int start_frame;
+    int chunk_idx;
+    int id;
+    int checkpoint;
+    bool forward = true;
+    bool first_call = true;
+    int64 min_msec;  // minimum timestamp to track to
+    int64 max_msec;  // maximum timestamp to track to
+  };
+
+  // Actual tracking algorithm.
+  void TrackingImpl(const TrackingImplArgs& args);
+
+  // Ids are scheduled exclusively, run this method to acquire lock.
+  // Returns false if id could not be scheduled (e.g. id got canceled during
+  // waiting).
+  bool WaitToScheduleId(int id) LOCKS_EXCLUDED(status_mutex_);
+
+  // Signals end of scheduling phase. Requires status mutex to be held.
+  void DoneSchedulingId(int id) EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);
+
+  // Removes all checkpoints within vicinity of new checkpoint.
+  void RemoveCloseCheckpoints(int id, int checkpoint)
+      EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);
+
+  // Removes specific checkpoint.
+  void ClearCheckpoint(int id, int checkpoint)
+      EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);
+
+  // Terminates tracking for specific id and checkpoint.
+  void CancelTracking(int id, int checkpoint)
+      EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);
+
+  // Implementation function for IsTrackingOngoing assuming mutex is already
+  // held.
+  bool IsTrackingOngoingMutexHeld() EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);
+
+  // Captures tracking status for each checkpoint
+  struct TrackStatus {
+    // Indicates that all current tracking processes should be canceled.
+    bool canceled = false;
+
+    // Number of tracking requests that are currently ongoing.
+    int tracks_ongoing = 0;
+  };
+
+ private:
+  // Stores computed tracking paths_ for all boxes.
+  std::unordered_map<int, Path> paths_ GUARDED_BY(path_mutex_);
+  absl::Mutex path_mutex_;
+
+  // For each id and each checkpoint stores current tracking status.
+  std::unordered_map<int, std::map<int, TrackStatus>> track_status_
+      GUARDED_BY(status_mutex_);
+
+  // Keeps track which ids are currently processing in NewBoxTrack.
+  std::unordered_map<int, int> new_box_track_ GUARDED_BY(status_mutex_);
+  absl::Mutex status_mutex_;
+
+  bool canceling_ GUARDED_BY(status_mutex_) = false;
+
+  // Use to signal changes to status_condvar_;
+  absl::CondVar status_condvar_ GUARDED_BY(status_mutex_);
+
+  BoxTrackerOptions options_;
+
+  // Caching directory for TrackingData stored on disk.
+  std::string cache_dir_;
+
+  // Pointers to tracking data stored in memory.
+  std::vector<const TrackingDataChunk*> tracking_data_;
+  // Buffer for tracking data in case we retain a deep copy.
+  std::vector<std::unique_ptr<TrackingDataChunk>> tracking_data_buffer_;
+
+  // Workers that run the tracking algorithm.
+  std::unique_ptr<ThreadPool> tracking_workers_;
+};
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_UTIL_TRACKING_BOX_TRACKER_H_
diff --git a/mediapipe/util/tracking/box_tracker.proto b/mediapipe/util/tracking/box_tracker.proto
new file mode 100644
index 000000000..1d778da1c
--- /dev/null
+++ b/mediapipe/util/tracking/box_tracker.proto
@@ -0,0 +1,87 @@
+// Copyright 2019 The MediaPipe Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/tracking.proto"; + +option java_package = "com.google.mediapipe.tracking"; +option java_outer_classname = "BoxTrackerProto"; + +message BoxTrackerOptions { + // Chunk size for caching files. Should be equal to those + // written by the FlowPackagerCalculator. + optional int32 caching_chunk_size_msec = 1 [default = 2500]; + + // Chunk file format. + optional string cache_file_format = 2 [default = "chunk_%04d"]; + + // Number of simultaneous tracking requests. + optional int32 num_tracking_workers = 3 [default = 8]; + + // Maximum waiting time for next chunk, till function times out. + optional int32 read_chunk_timeout_msec = 4 [default = 60000]; + + // If set, box tracker will record the state for each computed TimedBox + // across all paths. + optional bool record_path_states = 5 [default = false]; + + // Actual tracking options to be used for every step. + optional TrackStepOptions track_step_options = 6; +} + +// Next tag: 13 +// Proto equivalent of struct TimedBox. +message TimedBoxProto { + // Normalized coords - in [0, 1] + optional float top = 1; + optional float left = 2; + optional float bottom = 3; + optional float right = 4; + // Rotation of box w.r.t. center in radians. 
+ optional float rotation = 7; + optional MotionBoxState.Quad quad = 9; + optional int64 time_msec = 5 [default = 0]; + + // Unique per object id to disambiguate boxes. + optional int32 id = 6 [default = -1]; + + // Confidence of box tracked in the range [0, 1], with 0 being least + // confident, and 1 being most confident. A reasonable threshold is 0.5 + // to filter out unconfident boxes. + optional float confidence = 8; + + // Aspect ratio (width / height) for the tracked rectangle in physical space. + // If this field is provided, quad tracking will be performed using + // 6 degrees of freedom perspective transform between physical rectangle and + // frame quad. Otherwise, 8 degrees of freedom homography tracking between + // adjacent frames will be used. + optional float aspect_ratio = 10; + + // Whether or not to enable reacquisition functionality for this specific box. + optional bool reacquisition = 11 [default = false]; + + // Whether we want this box to be potentially grouped with other boxes + // to track together. This is useful for tracking small boxes that lie + // on a plane. For example, when we detect a plane, + // track the plane, then all boxes within the plane can share the same + // homography transform. + optional bool request_grouping = 12 [default = false]; +} + +message TimedBoxProtoList { + repeated TimedBoxProto box = 1; +} diff --git a/mediapipe/util/tracking/box_tracker_test.cc b/mediapipe/util/tracking/box_tracker_test.cc new file mode 100644 index 000000000..558daa91d --- /dev/null +++ b/mediapipe/util/tracking/box_tracker_test.cc @@ -0,0 +1,117 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mediapipe/util/tracking/box_tracker.h"
+
+#include "mediapipe/framework/deps/file_path.h"
+#include "mediapipe/framework/port/gtest.h"
+
+namespace mediapipe {
+namespace {
+
+constexpr double kWidth = 1280.0;
+constexpr double kHeight = 720.0;
+
+// Ground truth test; testing tracking accuracy and multi-thread load testing.
+TEST(BoxTrackerTest, MovingBoxTest) {
+  const std::string cache_dir =
+      file::JoinPath("./", "/mediapipe/util/tracking/testdata/box_tracker");
+  BoxTracker box_tracker(cache_dir, BoxTrackerOptions());
+
+  // Ground truth positions of the overlay (linear in between).
+  // @ 0: (50, 100)
+  // @ 3000: (50, 400)
+  // @ 6000: (500, 400)
+  // @ 9000: (1000, 50)
+  // @ 12000: (50, 100)
+  // @ 15000: (1000, 400)
+  //
+  // size of overlay: 220 x 252
+  std::vector<Vector2_d> positions{
+      {50.0 / kWidth, 100.0 / kHeight}, {50.0 / kWidth, 400.0 / kHeight},
+      {500.0 / kWidth, 400.0 / kHeight}, {1000.0 / kWidth, 50.0 / kHeight},
+      {50.0 / kWidth, 100.0 / kHeight}, {1000.0 / kWidth, 400.0 / kHeight},
+  };
+
+  const Vector2_d overlay_sz(220.0 / kWidth, 252.0 / kHeight);
+
+  TimedBox initial_pos;
+  initial_pos.left = positions[1].x();
+  initial_pos.top = positions[1].y();
+  initial_pos.right = initial_pos.left + overlay_sz.x();
+  initial_pos.bottom = initial_pos.top + overlay_sz.y();
+  initial_pos.time_msec = 3000;
+
+  // Test multithreading under load, ensure this does not crash or stall.
+  box_tracker.NewBoxTrack(initial_pos, 0);
+  // Cancel right after issuing.
+  box_tracker.CancelAllOngoingTracks();
+
+  // Should not be scheduled.
+ box_tracker.NewBoxTrack(initial_pos, 0); + EXPECT_FALSE(box_tracker.IsTrackingOngoing()); + box_tracker.ResumeTracking(); + + box_tracker.NewBoxTrack(initial_pos, 0); + // Two cancelations in a row should not block. + box_tracker.CancelAllOngoingTracks(); + box_tracker.CancelAllOngoingTracks(); + box_tracker.ResumeTracking(); + + // Start again for real this time. + box_tracker.NewBoxTrack(initial_pos, 0); + + // Wait to terminate. + box_tracker.WaitForAllOngoingTracks(); + + // Check that tracking did not abort. + EXPECT_EQ(0, box_tracker.TrackInterval(0).first); + EXPECT_GT(box_tracker.TrackInterval(0).second, 15000); + + auto boxes_equal = [](const TimedBox& lhs, const TimedBox& rhs) { + constexpr float kAccuracy = 0.015f; + return (lhs.time_msec == rhs.time_msec && + std::abs(lhs.top - rhs.top) < kAccuracy && + std::abs(lhs.left - rhs.left) < kAccuracy && + std::abs(lhs.right - rhs.right) < kAccuracy && + std::abs(lhs.bottom - rhs.bottom) < kAccuracy); + }; + + for (int k = 0; k < 15000; k += 33) { + TimedBox box; + EXPECT_TRUE(box_tracker.GetTimedPosition(0, k, &box)); + + // One groundtruth position every 3s, linear in between. + const int rect_pos = k / 3000; + const int dt = k - rect_pos * 3000; + const int next_rect_pos = dt == 0 ? rect_pos : (rect_pos + 1); + + // Blend to get initial position. 
+ const double alpha = dt / 3000.0; + const Vector2_d gt_pos = + (1.0 - alpha) * positions[rect_pos] + alpha * positions[next_rect_pos]; + + TimedBox gt_box; + gt_box.time_msec = k; + gt_box.top = gt_pos.y(); + gt_box.left = gt_pos.x(); + gt_box.right = gt_box.left + overlay_sz.x(); + gt_box.bottom = gt_box.top + overlay_sz.y(); + EXPECT_TRUE(boxes_equal(gt_box, box)); + } +} + +} // namespace + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/camera_motion.cc b/mediapipe/util/tracking/camera_motion.cc new file mode 100644 index 000000000..e753be712 --- /dev/null +++ b/mediapipe/util/tracking/camera_motion.cc @@ -0,0 +1,408 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/util/tracking/camera_motion.h" + +#include + +#include "absl/strings/str_format.h" +#include "mediapipe/util/tracking/region_flow.h" + +namespace mediapipe { + +void CameraMotionToTranslation(const CameraMotion& camera_motion, + TranslationModel* model) { + if (camera_motion.has_translation()) { + model->CopyFrom(camera_motion.translation()); + } else { + model->Clear(); + } +} + +void CameraMotionToLinearSimilarity(const CameraMotion& camera_motion, + LinearSimilarityModel* model) { + if (camera_motion.has_linear_similarity()) { + model->CopyFrom(camera_motion.linear_similarity()); + } else { + TranslationModel translation; + CameraMotionToTranslation(camera_motion, &translation); + model->CopyFrom(LinearSimilarityAdapter::Embed(translation)); + } +} + +void CameraMotionToAffine(const CameraMotion& camera_motion, + AffineModel* model) { + if (camera_motion.has_affine()) { + model->CopyFrom(camera_motion.affine()); + } else { + LinearSimilarityModel similarity; + CameraMotionToLinearSimilarity(camera_motion, &similarity); + model->CopyFrom(AffineAdapter::Embed(similarity)); + } +} + +void CameraMotionToHomography(const CameraMotion& camera_motion, + Homography* model) { + if (camera_motion.has_homography()) { + model->CopyFrom(camera_motion.homography()); + } else { + AffineModel affine; + CameraMotionToAffine(camera_motion, &affine); + model->CopyFrom(HomographyAdapter::Embed(affine)); + } +} + +void CameraMotionToMixtureHomography(const CameraMotion& camera_motion, + MixtureHomography* model) { + if (camera_motion.has_mixture_homography()) { + model->CopyFrom(camera_motion.mixture_homography()); + } else { + Homography homography; + CameraMotionToHomography(camera_motion, &homography); + model->CopyFrom(MixtureHomographyAdapter::Embed(homography, 1)); + } +} + +CameraMotion ComposeCameraMotion(const CameraMotion& lhs, + const CameraMotion& rhs) { + CHECK_EQ(lhs.frame_width(), rhs.frame_width()); + CHECK_EQ(lhs.frame_height(), 
rhs.frame_height()); + + CameraMotion result = rhs; + if (lhs.has_translation() || rhs.has_translation()) { + *result.mutable_translation() = + ModelCompose2(lhs.translation(), rhs.translation()); + } + + if (lhs.has_similarity() || rhs.has_similarity()) { + *result.mutable_similarity() = + ModelCompose2(lhs.similarity(), rhs.similarity()); + } + + if (lhs.has_linear_similarity() || rhs.has_linear_similarity()) { + *result.mutable_linear_similarity() = + ModelCompose2(lhs.linear_similarity(), rhs.linear_similarity()); + } + + if (lhs.has_affine() || rhs.has_affine()) { + *result.mutable_affine() = ModelCompose2(lhs.affine(), rhs.affine()); + } + + if (lhs.has_homography() || rhs.has_homography()) { + *result.mutable_homography() = + ModelCompose2(lhs.homography(), rhs.homography()); + } + + if (rhs.has_mixture_homography()) { + if (lhs.has_mixture_homography()) { + LOG(ERROR) << "Mixture homographies are not closed under composition, " + << "Only rhs mixtures composed with lhs homographies " + << "are supported."; + } else if (lhs.type() <= CameraMotion::UNSTABLE_SIM) { + // We only composit base model when stability is sufficient. + *result.mutable_mixture_homography() = + MixtureHomographyAdapter::ComposeLeft(rhs.mixture_homography(), + lhs.homography()); + } + } else if (lhs.has_mixture_homography()) { + LOG(ERROR) << "Only rhs mixtures supported."; + } + + // Select max unstable type. 
+ result.set_type(std::max(lhs.type(), rhs.type())); + result.set_average_magnitude(lhs.average_magnitude() + + rhs.average_magnitude()); + result.set_translation_variance( + std::max(lhs.translation_variance(), rhs.translation_variance())); + result.set_similarity_inlier_ratio( + std::min(lhs.similarity_inlier_ratio(), rhs.similarity_inlier_ratio())); + + result.set_similarity_strict_inlier_ratio( + std::min(lhs.similarity_strict_inlier_ratio(), + rhs.similarity_strict_inlier_ratio())); + + result.set_average_homography_error( + std::max(lhs.average_homography_error(), rhs.average_homography_error())); + result.set_homography_inlier_coverage(std::min( + lhs.homography_inlier_coverage(), rhs.homography_inlier_coverage())); + result.set_homography_strict_inlier_coverage( + std::min(lhs.homography_strict_inlier_coverage(), + rhs.homography_strict_inlier_coverage())); + + // TODO: Overlay stuff. + + result.set_flags(lhs.flags() | rhs.flags()); + result.set_timestamp_usec( + std::max(lhs.timestamp_usec(), rhs.timestamp_usec())); + result.set_match_frame(lhs.match_frame() + rhs.match_frame()); + + // TODO: Rest. 
+  return result;
+}
+
+CameraMotion InvertCameraMotion(const CameraMotion& motion) {
+  CameraMotion inverted = motion;
+  if (motion.has_translation()) {
+    *inverted.mutable_translation() = ModelInvert(motion.translation());
+  }
+
+  if (motion.has_similarity()) {
+    *inverted.mutable_similarity() = ModelInvert(motion.similarity());
+  }
+
+  if (motion.has_linear_similarity()) {
+    *inverted.mutable_linear_similarity() =
+        ModelInvert(motion.linear_similarity());
+  }
+
+  if (motion.has_affine()) {
+    *inverted.mutable_affine() = ModelInvert(motion.affine());
+  }
+
+  if (motion.has_homography()) {
+    *inverted.mutable_homography() = ModelInvert(motion.homography());
+  }
+
+  if (motion.has_mixture_homography()) {
+    LOG(ERROR) << "Mixture homographies are not closed under inversion.";
+  }
+
+  return inverted;
+}
+
+void SubtractCameraMotionFromFeatures(
+    const std::vector<CameraMotion>& camera_motions,
+    std::vector<RegionFlowFeatureList*>* feature_lists) {
+  CHECK(feature_lists != nullptr);
+  CHECK_GE(camera_motions.size(), feature_lists->size());
+  if (feature_lists->empty()) {
+    return;
+  }
+
+  bool use_mixtures = camera_motions[0].has_mixture_homography();
+
+  std::unique_ptr<MixtureRowWeights> row_weights;
+  if (use_mixtures) {
+    row_weights.reset(MixtureRowWeightsFromCameraMotion(
+        camera_motions[0], (*feature_lists)[0]->frame_height()));
+  }
+
+  for (int k = 0; k < feature_lists->size(); ++k) {
+    Homography background_model;
+    MixtureHomography background_model_mixture;
+    if (use_mixtures) {
+      CameraMotionToMixtureHomography(camera_motions[k],
+                                      &background_model_mixture);
+    } else {
+      CameraMotionToHomography(camera_motions[k], &background_model);
+    }
+
+    // Remove motion due to camera motion, leaving only foreground motion.
+    for (auto& feature : *(*feature_lists)[k]->mutable_feature()) {
+      const Vector2_f background_motion =
+          (!use_mixtures ?
HomographyAdapter::TransformPoint( + background_model, FeatureLocation(feature)) + : MixtureHomographyAdapter::TransformPoint( + background_model_mixture, *row_weights, + FeatureLocation(feature))) - + FeatureLocation(feature); + const Vector2_f object_motion = FeatureFlow(feature) - background_motion; + feature.set_dx(object_motion.x()); + feature.set_dy(object_motion.y()); + } + } +} + +float ForegroundMotion(const CameraMotion& camera_motion, + const RegionFlowFeatureList& feature_list) { + if (camera_motion.has_mixture_homography()) { + LOG(WARNING) << "Mixture homographies are present but function is only " + << "using homographies. Truncation error likely."; + } + + Homography background_motion; + CameraMotionToHomography(camera_motion, &background_motion); + + float foreground_motion = 0; + for (auto& feature : feature_list.feature()) { + const float error = (FeatureMatchLocation(feature) - + HomographyAdapter::TransformPoint( + background_motion, FeatureLocation(feature))) + .Norm(); + foreground_motion += error; + } + + if (feature_list.feature_size() > 0) { + foreground_motion *= 1.0f / feature_list.feature_size(); + } + + return foreground_motion; +} + +void InitCameraMotionFromFeatureList(const RegionFlowFeatureList& feature_list, + CameraMotion* camera_motion) { + camera_motion->set_blur_score(feature_list.blur_score()); + camera_motion->set_frac_long_features_rejected( + feature_list.frac_long_features_rejected()); + camera_motion->set_timestamp_usec(feature_list.timestamp_usec()); + camera_motion->set_match_frame(feature_list.match_frame()); + camera_motion->set_frame_width(feature_list.frame_width()); + camera_motion->set_frame_height(feature_list.frame_height()); +} + +std::string CameraMotionFlagToString(const CameraMotion& camera_motion) { + std::string text; + if (camera_motion.flags() & CameraMotion::FLAG_SHOT_BOUNDARY) { + text += "SHOT_BOUNDARY|"; + } + + if (camera_motion.flags() & CameraMotion::FLAG_BLURRY_FRAME) { + text += 
+        absl::StrFormat("BLURRY_FRAME %.2f|", camera_motion.bluriness());
+  }
+
+  if (camera_motion.flags() & CameraMotion::FLAG_MAJOR_OVERLAY) {
+    text += "MAJOR_OVERLAY|";
+  }
+
+  if (camera_motion.flags() & CameraMotion::FLAG_SHARP_FRAME) {
+    text += "SHARP_FRAME|";
+  }
+
+  if (camera_motion.flags() & CameraMotion::FLAG_SHOT_FADE) {
+    text += "SHOT_FADE|";
+  }
+
+  if (camera_motion.flags() & CameraMotion::FLAG_DUPLICATED) {
+    text += "DUPLICATED|";
+  }
+  return text;
+}
+
+std::string CameraMotionTypeToString(const CameraMotion& motion) {
+  switch (motion.type()) {
+    case CameraMotion::VALID:
+      return "VALID";
+    case CameraMotion::UNSTABLE_HOMOG:
+      return "UNSTABLE_HOMOG";
+    case CameraMotion::UNSTABLE_SIM:
+      return "UNSTABLE_SIM";
+    case CameraMotion::UNSTABLE:
+      return "UNSTABLE";
+    case CameraMotion::INVALID:
+      return "INVALID";
+  }
+
+  return "NEVER HAPPENS WITH CLANG";
+}
+
+float InlierCoverage(const CameraMotion& camera_motion,
+                     bool use_homography_coverage) {
+  const int num_block_coverages = camera_motion.mixture_inlier_coverage_size();
+  if (num_block_coverages == 0 || use_homography_coverage) {
+    if (camera_motion.has_homography()) {
+      return camera_motion.homography_inlier_coverage();
+    } else {
+      return 1.0f;
+    }
+  } else {
+    return std::accumulate(camera_motion.mixture_inlier_coverage().begin(),
+                           camera_motion.mixture_inlier_coverage().end(),
+                           0.0f) *
+           (1.0f / num_block_coverages);
+  }
+}
+
+template <class CameraMotionContainer>
+CameraMotion FirstCameraMotionForLooping(
+    const CameraMotionContainer& camera_motions) {
+  if (camera_motions.size() < 2) {
+    LOG(ERROR) << "Not enough camera motions for refinement.";
+    return CameraMotion();
+  }
+
+  CameraMotion loop_motion = camera_motions[0];
+
+  // Only update motions present in first camera motion.
+ bool has_translation = loop_motion.has_translation(); + bool has_similarity = loop_motion.has_linear_similarity(); + bool has_homography = loop_motion.has_homography(); + + TranslationModel translation; + LinearSimilarityModel similarity; + Homography homography; + + for (int i = 1; i < camera_motions.size(); ++i) { + const CameraMotion& motion = camera_motions[i]; + if (motion.has_mixture_homography()) { + // TODO: Implement + LOG(WARNING) << "This function does not validly apply mixtures; " + << "which are currently not closed under composition. "; + } + + switch (motion.type()) { + case CameraMotion::INVALID: + has_translation = false; + has_similarity = false; + has_homography = false; + break; + case CameraMotion::UNSTABLE: + has_similarity = false; + has_homography = false; + break; + case CameraMotion::UNSTABLE_SIM: + has_homography = false; + break; + case CameraMotion::VALID: + case CameraMotion::UNSTABLE_HOMOG: + break; + default: + LOG(FATAL) << "Unknown CameraMotion::type."; + } + + // Only accumulate motions which are valid for the entire chain, otherwise + // keep the pre-initialized motions. + has_translation = + has_translation && motion.has_translation() && + AccumulateInvertedModel(motion.translation(), &translation); + has_similarity = + has_similarity && motion.has_linear_similarity() && + AccumulateInvertedModel(motion.linear_similarity(), &similarity); + has_homography = has_homography && motion.has_homography() && + AccumulateInvertedModel(motion.homography(), &homography); + } + + if (has_translation) { + *loop_motion.mutable_translation() = translation; + } + if (has_similarity && IsInverseStable(similarity)) { + *loop_motion.mutable_linear_similarity() = similarity; + } + if (has_homography && IsInverseStable(homography)) { + *loop_motion.mutable_homography() = homography; + } + + VLOG(1) << "Looping camera motion refinement for " + << " translation:" << (has_translation ? "successful" : "failed") + << " similarity:" << (has_similarity ? 
"successful" : "failed") + << " homography:" << (has_homography ? "successful" : "failed"); + + return loop_motion; +} + +// Explicit instantiation. +template CameraMotion FirstCameraMotionForLooping>( + const std::vector&); +template CameraMotion FirstCameraMotionForLooping>( + const std::deque&); +} // namespace mediapipe diff --git a/mediapipe/util/tracking/camera_motion.h b/mediapipe/util/tracking/camera_motion.h new file mode 100644 index 000000000..98b28abbf --- /dev/null +++ b/mediapipe/util/tracking/camera_motion.h @@ -0,0 +1,336 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_CAMERA_MOTION_H_ +#define MEDIAPIPE_UTIL_TRACKING_CAMERA_MOTION_H_ + +#include + +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +// Helper functions to extract specific models from CameraMotion. +// Returned is always the requested model. In case, model is not present (i.e. +// has_ fails), the highest degree of freedom model +// (lower or equal to the requested model) that is present is embedded in the +// requested model. +// Presence of the model depends on wich models were requesteded to be +// estimated (via MotionEstimationOptions, to initialize requested models to +// identity, use ResetMotionModels above). 
For example, assume linear similarity +// was not requested to be estimated, but affine was requested. If +// CameraMotionToLinearSimilarity is called, has_linear_similarity would be +// false and the function fall back returning a translation model. +void CameraMotionToTranslation(const CameraMotion& camera_motion, + TranslationModel* model); +void CameraMotionToLinearSimilarity(const CameraMotion& camera_motion, + LinearSimilarityModel* model); +void CameraMotionToAffine(const CameraMotion& camera_motion, + AffineModel* model); +void CameraMotionToHomography(const CameraMotion& camera_motion, + Homography* homography); +void CameraMotionToMixtureHomography(const CameraMotion& camera_motion, + MixtureHomography* mixture); + +// TODO: Under development ... +// Returns camera motion lhs * rhs. Initial camera motion is set to rhs +// before composition. +CameraMotion ComposeCameraMotion(const CameraMotion& lhs, + const CameraMotion& rhs); + +// Inverts every motion model that is set in CameraMotion. +CameraMotion InvertCameraMotion(const CameraMotion& motion); + +// Templated wrapper for above calls. +template +Model CameraMotionToModel(const CameraMotion& camera_motion); + +// Returns model from passed CameraMotion specified by unstable_type +// (which must name a type != VALID, CHECK-ed) and embeds it in the specified +// Model. +template +Model UnstableCameraMotionToModel(const CameraMotion& camera_motion, + CameraMotion::Type unstable_type); + +// Projects passed model to lower degree of freedom model (embedded in original +// type), as specified type. In case type is valid, function is effectively +// identity function. 
+// Only implemented for the following models: +// - Translation +// - LinearSimilarity +// - AffineModel +template +Model ProjectToTypeModel(const Model& model, float frame_width, + float frame_height, CameraMotion::Type type); + +// Substract camera motion (specifically highest, degree of freedom model, +// that has been estimated reliably) from feature lists. Operates on vectors +// for improved performance. Size of camera_motions can be larger than +// feature_lists, in this case last camera motions are ignored. +void SubtractCameraMotionFromFeatures( + const std::vector& camera_motions, + std::vector* feature_lists); + +// Returns average motion magnitude after subtracting camera motion. +float ForegroundMotion(const CameraMotion& camera_motion, + const RegionFlowFeatureList& feature_list); + +// Initializes a CameraMotion with its corresponding fields from a +// RegionFlowFeatureList. +void InitCameraMotionFromFeatureList(const RegionFlowFeatureList& feature_list, + CameraMotion* camera_motion); + +// Converts Camera motion flag to std::string. +std::string CameraMotionFlagToString(const CameraMotion& motion); + +// Converts Camera motion type to std::string. Used instead of builtin proto +// function for mobile support. +std::string CameraMotionTypeToString(const CameraMotion& motion); + +// Returns inlier coverage either based on mixture (if present, in this case +// return mean of block coverages) or else homography. +// If neither is present, returns 0 to signal insufficient inliers. +// If use_homography_coverage is set, uses homography even when mixture is +// present. +float InlierCoverage(const CameraMotion& camera_motion, + bool use_homography_coverage); + +// Downsamples passed motion models temporally by specified downsample_scale, +// i.e. for models F_0, F_1, F_2, F_3, F_4 and downsample_scale of 2, models: +// F_0 * F_1, F_2 * F_3 and F_4 are returned. 
+// Optionally also performs downsampling of corresponding model_type returning +// the least unstable for each composition. +template +void DownsampleMotionModels( + const std::vector& models, + const std::vector* model_type, // optional. + int downsample_scale, std::vector* downsampled_models, + std::vector* downsampled_types); + +// Compatible subsampling method to above DownsampleMotionModels. +// Note, when downsampling for example: +// F_0, F_1, F_2, F_3, F_4 by factor 3 via above function, downsampled result +// will be F_0 * F_1 * F_2, F_3 * F_4 +// so we would need to pick entities at F_2 and F_4. +// Template class Container must be SequenceContainer, like +// std::vector, std::deque. +template +void SubsampleEntities(const Container& input, int downsample_scale, + Container* output); + +// For perfect looping, this function computes the motion in the first frame +// to be the inverse of the accumulated motion from frame 1 to N. +// If a particular motion type is not available or not invertible at any +// frame pair, the original motion for that type is retained. +// Does not work if mixtures are present. +template // STL container of CameraMotion's +CameraMotion FirstCameraMotionForLooping( + const CameraMotionContainer& container); + +// Template implementation functions. + +template +Model UnstableCameraMotionToModel(const CameraMotion& camera_motion, + CameraMotion::Type unstable_type) { + switch (unstable_type) { + case CameraMotion::INVALID: + return Model(); // Identity. 
+ + case CameraMotion::UNSTABLE: { + return ModelAdapter::Embed( + CameraMotionToModel(camera_motion)); + } + + case CameraMotion::UNSTABLE_SIM: { + return ModelAdapter::Embed( + CameraMotionToModel(camera_motion)); + } + + case CameraMotion::UNSTABLE_HOMOG: { + return ModelAdapter::Embed( + CameraMotionToModel(camera_motion)); + } + + case CameraMotion::VALID: + LOG(FATAL) << "Specify a type != VALID"; + return Model(); + } +} + +template <> +inline TranslationModel ProjectToTypeModel(const TranslationModel& model, + float frame_width, + float frame_height, + CameraMotion::Type type) { + switch (type) { + case CameraMotion::INVALID: + return TranslationModel(); // Identity. + default: + return model; + } +} + +template <> +inline LinearSimilarityModel ProjectToTypeModel( + const LinearSimilarityModel& model, float frame_width, float frame_height, + CameraMotion::Type type) { + switch (type) { + case CameraMotion::INVALID: + return LinearSimilarityModel(); // Identity. + + case CameraMotion::UNSTABLE: + return LinearSimilarityAdapter::Embed( + TranslationAdapter::ProjectFrom(model, frame_width, frame_height)); + + default: + return model; + } +} + +template +Model ProjectToTypeModel(const Model& model, float frame_width, + float frame_height, CameraMotion::Type type) { + switch (type) { + case CameraMotion::INVALID: + return Model(); // Identity. + + case CameraMotion::UNSTABLE: + return ModelAdapter::Embed( + TranslationAdapter::ProjectFrom(model, frame_width, frame_height)); + + case CameraMotion::UNSTABLE_SIM: + return ModelAdapter::Embed(LinearSimilarityAdapter::ProjectFrom( + model, frame_width, frame_height)); + + // case UNSTABLE_HOMOG does not occur except for mixtures. 
+
+    default:
+      return model;
+  }
+}
+
+template <>
+inline MixtureHomography ProjectToTypeModel(const MixtureHomography&, float,
+                                            float, CameraMotion::Type) {
+  LOG(FATAL) << "Projection not supported for mixtures.";
+  return MixtureHomography();
+}
+
+template <class Model>
+void DownsampleMotionModels(
+    const std::vector<Model>& models,
+    const std::vector<CameraMotion::Type>* model_type, int downsample_scale,
+    std::vector<Model>* downsampled_models,
+    std::vector<CameraMotion::Type>* downsampled_types) {
+  if (model_type) {
+    CHECK_EQ(models.size(), model_type->size());
+    CHECK(downsampled_models) << "Expecting output models.";
+  }
+
+  CHECK(downsampled_models);
+  downsampled_models->clear();
+  if (downsampled_types) {
+    downsampled_types->clear();
+  }
+
+  const int num_models = models.size();
+
+  for (int model_idx = 0; model_idx < num_models;
+       model_idx += downsample_scale) {
+    const int last_idx =
+        std::min(model_idx + downsample_scale, num_models) - 1;
+
+    CameraMotion::Type sampled_type = CameraMotion::VALID;
+    if (model_type) {
+      // Get least stable model within downsample window (max operation).
+      for (int i = model_idx; i <= last_idx; ++i) {
+        sampled_type = std::max(sampled_type, model_type->at(i));
+      }
+      downsampled_types->push_back(sampled_type);
+    }
+
+    // Concatenate models.
+    Model composed = models[last_idx];
+
+    for (int i = last_idx - 1; i >= model_idx; --i) {
+      composed = ModelCompose2(models[i], composed);
+    }
+
+    downsampled_models->push_back(composed);
+  }
+}
+
+template <class Container>
+void SubsampleEntities(const Container& input, int downsample_factor,
+                       Container* output) {
+  CHECK(output);
+  output->clear();
+
+  if (input.empty()) {
+    return;
+  }
+
+  for (int k = downsample_factor - 1; k < input.size();
+       k += downsample_factor) {
+    output->push_back(input[k]);
+  }
+
+  if (input.size() % downsample_factor != 0) {
+    // We need to add last constraint as termination.
+ output->push_back(input.back()); + } +} + +template <> +inline TranslationModel CameraMotionToModel(const CameraMotion& camera_motion) { + TranslationModel model; + CameraMotionToTranslation(camera_motion, &model); + return model; +} + +template <> +inline LinearSimilarityModel CameraMotionToModel( + const CameraMotion& camera_motion) { + LinearSimilarityModel model; + CameraMotionToLinearSimilarity(camera_motion, &model); + return model; +} + +template <> +inline AffineModel CameraMotionToModel(const CameraMotion& camera_motion) { + AffineModel model; + CameraMotionToAffine(camera_motion, &model); + return model; +} + +template <> +inline Homography CameraMotionToModel(const CameraMotion& camera_motion) { + Homography model; + CameraMotionToHomography(camera_motion, &model); + return model; +} + +template <> +inline MixtureHomography CameraMotionToModel( + const CameraMotion& camera_motion) { + MixtureHomography model; + CameraMotionToMixtureHomography(camera_motion, &model); + return model; +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_CAMERA_MOTION_H_ diff --git a/mediapipe/util/tracking/camera_motion.proto b/mediapipe/util/tracking/camera_motion.proto new file mode 100644 index 000000000..9e19f4ad8 --- /dev/null +++ b/mediapipe/util/tracking/camera_motion.proto @@ -0,0 +1,198 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Describes camera motion between two frames with various degree of freedom +// parametric motion models. +// In addition, stores features describing how reliable the estimated motion +// model is. +// Flags indicate several properties derived from the camera motion, e.g. if a +// frame is sharp, blurry or contains overlays. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/motion_models.proto"; + +// Next tag: 33 +message CameraMotion { + // Background motion expressed in various models. + // These are per-frame pair motions (from current to previous frame). + // Models are expressed in the un-normalized domain frame_width x frame_height + // that is passed to MotionEstimation (storred below). + optional TranslationModel translation = 1; + optional SimilarityModel similarity = 2; + optional LinearSimilarityModel linear_similarity = 3; + optional AffineModel affine = 4; + optional Homography homography = 5; + + optional MixtureHomography mixture_homography = 8; + + // Frame dimensions camera motion was computed over. + optional float frame_width = 31; + optional float frame_height = 32; + + // Mixture homographies computed w.r.t. exponentially increasing + // regularizers. Above mixture_homography member is selected from spectrum + // based on amount of rolling shutter present in the video. + repeated MixtureHomography mixture_homography_spectrum = 23; + + // Relative row sigma w.r.t. frame_height for mixture models. + optional float mixture_row_sigma = 10; + + // Average of all motion vector magnitudes (without accounting for any motion + // model), within 10th to 90th percentile (to remove outliers). + optional float average_magnitude = 24 [default = 0.0]; + + // Inlier-weighted variance of the translation model. + // Specified, w.r.t. unnormalized video domain that motion models + // are computed for. + optional float translation_variance = 25 [default = 0.0]; + + // Ratio of inliers w.r.t. regular and stricter thresholds. 
In [0, 1]. + optional float similarity_inlier_ratio = 29 [default = 0]; + optional float similarity_strict_inlier_ratio = 30 [default = 0]; + + // Average registration error of homography in pixels. + // Note: These two parameters default to zero in-case homographies have not + // been estimated. + optional float average_homography_error = 11; + + // Fraction, in [0,1], of homography inliers. + optional float homography_inlier_coverage = 12; + + // Same as above but with stricter threshold. + // (For details, see: MotionEstimationOptions::strict_coverage_scale). + // Coverage is designed to measure the amount of significant outliers, + // which can affect the validity of the estimated homography. + // However, it does not discount small outliers, which occur in case + // of small rolling shutter wobbles. For this a stricter version of coverage + // is needed, which is essential for computing the rolling_shutter_guess, + // i.e. the increase in coverage by using mixtures vs. homographies. + optional float homography_strict_inlier_coverage = 22; + + // Per-block inlier fraction for mixtures. + repeated float mixture_inlier_coverage = 13; + + // Set based on stability analysis indicating if frame is likely to originate + // from a rolling shutter camera. (-1 is used to indicate frame was not + // tested, e.g. due to mixture deemed unstable for analysis). + // Guess is a scaler indicating by how much the mixture models (suitable for + // rolling shutter distortions) increased inlier coverage compared to a single + // homography. For example a value, of 1.3 indicates, that the mixture models + // increased inlier coverage by 30%. + // If not -1, range is in [0, inf] (values slightly smaller than 1 are + // possible due to suppression of noisy feature tracks during estimation). 
+ optional float rolling_shutter_guess = 14; + + // Indicating if CameraMotion is deemed to originate from rolling + // shutter camera (index >= 0), and if so, denotes the index in the + // mixture_homography_spectrum, where higher indices correspond to heavier + // regularized motions. If motion is not deemed to originate from a rolling + // shutter camera, index is set to -1. + optional int32 rolling_shutter_motion_index = 16 [default = -1]; + + // List of overlay indices (cell locations in column major format) over domain + // of size overlay_domain x overlay_domain, where + // overlay_domain is set by MotionEstimation to + // MotionEstimationOptions::OverlayDetectionOptions::analysis_mask_size. + // Overlay analysis is performed over chunk of frames, as specified by + // MotionEstimationOptions::overlay_analysis_chunk_size, with the resulting + // overlay indices being assigned to each frame of the chunk. + // Consequently it suffices to store the result only for the first frame + // of every chunk. Subsequent frames store a single negative index relative + // to the first chunk frame indicating where to locate the overlay indicies. + // Specifically if for frame f, overlay_indices(0) == -2, overlay indices for + // corresponding chunk can be found at frame f - 2. + // For details about how overlay indices are used to flag a frame to contain + // an overlay, see MotionFilterOptions::OverlayOptions. + repeated int32 overlay_indices = 17; + optional int32 overlay_domain = 18 [default = 10]; + + // CameraMotion type indicates whether highest degree of freedom (DOF) + // model estimation was deemed stable, in which case CameraMotion::Type is set + // to VALID. + // If a model was deemed not stable (according to *StabilityBounds in + // MotionEstimationOptions), it is set to the lower dof type which was deemed + // stable. + enum Type { + VALID = 0; // All requested motion models estimated reliably. 
+ UNSTABLE_HOMOG = 1; // Fallback to homographies, mixture unreliable. + UNSTABLE_SIM = 2; // Fallback to similarity model, homography + // unreliable. + UNSTABLE = 3; // Fallback to translation model, similarity + // unreliable, legacy naming. + INVALID = 4; // Identity model, translation unreliable. + } + + optional Type type = 6 [default = VALID]; + + // If set, stores original type in case it was overriden (by filtering + // functions, etc.). + optional Type overridden_type = 15 [default = VALID]; + + // Set of optional *bit* flags set for various purposes. + enum Flags { + FLAG_SHOT_BOUNDARY = 1; // Set to indicate presence of a + // shot boundary. + FLAG_BLURRY_FRAME = 2; + FLAG_MAJOR_OVERLAY = 4; + FLAG_SHARP_FRAME = 8; // Set if frame is considered sharp + // in a neighborhood of frames. + FLAG_SINGULAR_ESTIMATION = 16; // Indicates that estimation resulted + // in singular optimization problem. + // Used internally by MotionEstimation. + // Indicates if shot boundary is part of a fade. If so, all frames of the + // fade will be labeled with the FLAG but only the begin and end of the fade + // will have the FLAG_SHOT_BOUNDARY set. + FLAG_SHOT_FADE = 32; + + FLAG_DUPLICATED = 64; // Set if frame is exact duplicate of + // previous frame. + FLAG_CENTER_FRAME = 128; // Indicates this frame is at the + // center of the sequence. Currently + // used to constrain stabilizing crop + // transform. + } + + optional int32 flags = 19 [default = 0]; + + // Same as in RegionFlowFeatureList (from region_flow.proto), measures blur + // as average cornerness over textured areas. As it depends on the image + // content, should only be used relative. + optional float blur_score = 20; + + // Quanitifies amount of blur. Specified as ratio w.r.t. sharpest matching + // frame, i.e. 1 indicates no blur, values > 1 amount of blur w.r.t. sharpest + // frame. + optional float bluriness = 21 [default = 0.0]; + + // Same as in RegionFlowFeatureList (from region_flow.proto). 
Stores fraction + // of long feature tracks that got rejected for this frame. + optional float frac_long_features_rejected = 26; + + // Same as in RegionFlowFeatureList (from region_flow.proto). + // Timestamp in micro seconds of the underlying frame. + optional int64 timestamp_usec = 27 [default = 0]; + + // Same as in RegionFlowFeatureList (from region_flow.proto). + // Denotes frame that motion was computed w.r.t. to, locally to the current + // frame. Values < 0 indicate backward tracking, while values > 0 indicate + // forward tracking. For example, match_frame = -1, indicates tracking is + // from current to previous frame. + optional int32 match_frame = 28 [default = 0]; + + // Deprecated fields. + extensions 9; +} diff --git a/mediapipe/util/tracking/flow_packager.cc b/mediapipe/util/tracking/flow_packager.cc new file mode 100644 index 000000000..3a570d85b --- /dev/null +++ b/mediapipe/util/tracking/flow_packager.cc @@ -0,0 +1,1023 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/util/tracking/flow_packager.h" + +#include + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/camera_motion.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/motion_estimation.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +FlowPackager::FlowPackager(const FlowPackagerOptions& options) + : options_(options) { + if (options_.binary_tracking_data_support()) { + CHECK_LE(options.domain_width(), 256); + CHECK_LE(options.domain_height(), 256); + } +} + +namespace { + +// Performs rounding of float vector position to int. +class FeatureIntegerPosition { + public: + // Scales a feature's location in x and y by scale_x and scale_y respectively. + // Limits feature position to the integer domain + // [0, width - 1] x [0, height - 1] + FeatureIntegerPosition(float scale_x, float scale_y, int width, int height) + : scale_x_(scale_x), scale_y_(scale_y), width_(width), height_(height) {} + + Vector2_i ToIntPosition(const RegionFlowFeature& feature) const { + return Vector2_i( + std::max(0, std::min(width_ - 1, feature.x() * scale_x_ + 0.5f)), + std::max(0, std::min(height_ - 1, feature.y() * scale_y_ + 0.5f))); + } + + private: + float scale_x_; + float scale_y_; + int width_; + int height_; +}; + +// Lexicographic compare (first in x, then in y) under scaled integer rounding +// as specified by FeatureIntegerPosition. 
+class IntegerColumnComparator {
+ public:
+  IntegerColumnComparator(float scale_x, float scale_y, int width, int height)
+      : integer_pos_(scale_x, scale_y, width, height) {}
+
+  bool operator()(const RegionFlowFeature& lhs,
+                  const RegionFlowFeature& rhs) const {
+    const Vector2_i vec_lhs = integer_pos_.ToIntPosition(lhs);
+    const Vector2_i vec_rhs = integer_pos_.ToIntPosition(rhs);
+    return (vec_lhs.x() < vec_rhs.x()) ||
+           (vec_lhs.x() == vec_rhs.x() && vec_lhs.y() < vec_rhs.y());
+  }
+
+ private:
+  const FeatureIntegerPosition integer_pos_;
+};
+
+template <class T>
+inline std::string EncodeToString(const T& value) {
+  std::string s(sizeof(T), 0);
+  memcpy(&s[0], &value, sizeof(T));
+  return s;
+}
+
+template <class T>
+inline std::string EncodeVectorToString(const std::vector<T>& vec) {
+  std::string s(vec.size() * sizeof(T), 0);
+  typename std::vector<T>::const_iterator iter;
+  char* ptr;
+  for (iter = vec.begin(), ptr = &s[0]; iter != vec.end();
+       ++iter, ptr += sizeof(T)) {
+    memcpy(ptr, &(*iter), sizeof(T));
+  }
+  return s;
+}
+
+template <class T>
+inline bool DecodeFromStringView(absl::string_view str, T* result) {
+  CHECK(result != nullptr);
+  if (sizeof(*result) != str.size()) {
+    return false;
+  }
+  memcpy(result, str.data(), sizeof(T));
+  return true;
+}
+
+template <class T>
+inline bool DecodeVectorFromStringView(absl::string_view str,
+                                       std::vector<T>* result) {
+  CHECK(result != nullptr);
+  if (str.size() % sizeof(T) != 0) return false;
+  result->clear();
+  result->reserve(str.size() / sizeof(T));
+  T value;
+  const char* begin = str.data();
+  const char* end = str.data() + str.size();
+  for (const char* ptr = begin; ptr != end; ptr += sizeof(T)) {
+    memcpy(&value, ptr, sizeof(T));
+    result->push_back(value);
+  }
+  return true;
+}
+}  // namespace.
+ +void FlowPackager::PackFlow(const RegionFlowFeatureList& feature_list, + const CameraMotion* camera_motion, + TrackingData* tracking_data) const { + CHECK(tracking_data); + CHECK_GT(feature_list.frame_width(), 0); + CHECK_GT(feature_list.frame_height(), 0); + + // Scale flow to output domain. + const float dim_x_scale = + options_.domain_width() * (1.0f / feature_list.frame_width()); + const float dim_y_scale = + options_.domain_height() * (1.0f / feature_list.frame_height()); + + const bool long_tracks = feature_list.long_tracks(); + + // Sort features lexicographically. + RegionFlowFeatureList sorted_feature_list(feature_list); + + SortRegionFlowFeatureList(dim_x_scale, dim_y_scale, &sorted_feature_list); + + tracking_data->set_domain_width(options_.domain_width()); + tracking_data->set_domain_height(options_.domain_height()); + tracking_data->set_frame_aspect(feature_list.frame_width() * 1.0f / + feature_list.frame_height()); + tracking_data->set_global_feature_count(feature_list.feature_size()); + int flags = 0; + + if (camera_motion == nullptr || + camera_motion->type() > CameraMotion::UNSTABLE_SIM) { + flags |= TrackingData::FLAG_BACKGROUND_UNSTABLE; + } else { + Homography transform; + CameraMotionToHomography(*camera_motion, &transform); + Homography normalization = HomographyAdapter::Embed( + AffineAdapter::FromArgs(0, 0, dim_x_scale, 0, 0, dim_y_scale)); + Homography inv_normalization = + HomographyAdapter::Embed(AffineAdapter::FromArgs( + 0, 0, 1.0f / dim_x_scale, 0, 0, 1.0f / dim_y_scale)); + *tracking_data->mutable_background_model() = + ModelCompose3(normalization, transform, inv_normalization); + } + + if (camera_motion != nullptr) { + tracking_data->set_average_motion_magnitude( + camera_motion->average_magnitude()); + } + + if (feature_list.is_duplicated()) { + flags |= TrackingData::FLAG_DUPLICATED; + } + tracking_data->set_frame_flags(flags); + + const int num_vectors = sorted_feature_list.feature_size(); + TrackingData::MotionData* data = 
tracking_data->mutable_motion_data(); + data->set_num_elements(num_vectors); + + // Initialize col starts with "unseen" marker. + std::vector col_start(options_.domain_width() + 1, -1); + + int last_col = -1; + int last_row = -1; + FeatureIntegerPosition integer_pos(dim_x_scale, dim_y_scale, + options_.domain_width(), + options_.domain_height()); + + // Store feature and corresponding motion (minus camera motion) in + // compressed sparse column format: + // https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29 + for (const auto& feature : sorted_feature_list.feature()) { + float flow_x = feature.dx() * dim_x_scale; + float flow_y = feature.dy() * dim_y_scale; + Vector2_i loc = integer_pos.ToIntPosition(feature); + + // Convert back to float for accurate background model computation. + Vector2_f loc_f = Vector2_f::Cast(loc); + + if (camera_motion) { + Vector2_f residual = HomographyAdapter::TransformPoint( + tracking_data->background_model(), loc_f) - + loc_f; + flow_x -= residual.x(); + flow_y -= residual.y(); + } + + data->add_vector_data(flow_x); + data->add_vector_data(flow_y); + data->add_row_indices(loc.y()); + + if (feature.has_binary_feature_descriptor()) { + data->add_feature_descriptors()->set_data( + feature.binary_feature_descriptor().data()); + } + + if (long_tracks) { + data->add_track_id(feature.track_id()); + } + + const int curr_col = loc.x(); + + if (curr_col != last_col) { + CHECK_LT(last_col, curr_col); + CHECK_EQ(-1, col_start[curr_col]); + col_start[curr_col] = data->row_indices_size() - 1; + last_col = curr_col; + } else { + CHECK_LE(last_row, loc.y()); + } + last_row = loc.y(); + } + + col_start[0] = 0; + col_start[options_.domain_width()] = num_vectors; + + // Fill unset values with previously set value. Propagate end value. 
+ for (int i = options_.domain_width() - 1; i > 0; --i) { + if (col_start[i] < 0) { + DCHECK_GE(col_start[i + 1], 0); + col_start[i] = col_start[i + 1]; + } + } + + for (const auto& col_idx : col_start) { + data->add_col_starts(col_idx); + } + + // Check monotonicity of the row indices. + for (int c = 0; c < options_.domain_width(); ++c) { + const int r_start = data->col_starts(c); + const int r_end = data->col_starts(c + 1); + for (int r = r_start; r < r_end - 1; ++r) { + CHECK_LE(data->row_indices(r), data->row_indices(r + 1)); + } + } + + CHECK_EQ(data->vector_data_size(), 2 * data->row_indices_size()); +} + +void FlowPackager::EncodeTrackingData(const TrackingData& tracking_data, + BinaryTrackingData* binary_data) const { + CHECK(options_.binary_tracking_data_support()); + CHECK(binary_data != nullptr); + + int32 frame_flags = 0; + const bool high_profile = options_.use_high_profile(); + if (high_profile) { + frame_flags |= TrackingData::FLAG_PROFILE_HIGH; + } else { + frame_flags |= TrackingData::FLAG_PROFILE_BASELINE; // No op. + } + + if (options_.high_fidelity_16bit_encode()) { + frame_flags |= TrackingData::FLAG_HIGH_FIDELITY_VECTORS; + } + + // Copy background flag. + frame_flags |= + tracking_data.frame_flags() & TrackingData::FLAG_BACKGROUND_UNSTABLE; + + const TrackingData::MotionData& motion_data = tracking_data.motion_data(); + int32 num_vectors = motion_data.num_elements(); + + // Compute maximum vector or delta vector value. + float max_vector_value = 0; + if (high_profile) { + for (int k = 2; k < 2 * num_vectors; ++k) { + max_vector_value = std::max( + max_vector_value, + fabs(motion_data.vector_data(k) - motion_data.vector_data(k - 2)) * + 1.02f); // Expand by 2% to account for + // rounding issues. 
+ } + } else { + for (const float vector_value : motion_data.vector_data()) { + max_vector_value = std::max(max_vector_value, fabs(vector_value)); + } + } + + const int32 domain_width = tracking_data.domain_width(); + const int32 domain_height = tracking_data.domain_height(); + CHECK_LT(domain_height, 256) << "Only heights below 256 are supported."; + const float frame_aspect = tracking_data.frame_aspect(); + + // Limit vector value from above (to 20% frame diameter) and below (small + // eps). + const float max_vector_threshold = hypot(domain_width, domain_height) * 0.2f; + // Warn if too much truncation. + if (max_vector_value > max_vector_threshold * 1.5f) { + LOG(WARNING) << "A lot of truncation will occur during encoding. " + << "Vector magnitudes are larger than 20% of the " + << "frame diameter."; + } + + max_vector_value = + std::min(max_vector_threshold, std::max(1e-4f, max_vector_value)); + + // Compute scales for 16bit and 8bit float -> int conversion. + // Use highest bit for sign. + const int kByteMax16 = (1 << 15) - 1; + const int kByteMax8 = (1 << 7) - 1; + + // Scale such that highest vector value is mapped to kByteMax + int scale_16 = std::ceil(kByteMax16 / max_vector_value); + int scale_8 = std::ceil(kByteMax8 / max_vector_value); + + const int32 scale = + options_.high_fidelity_16bit_encode() ? scale_16 : scale_8; + const float inv_scale = 1.0f / scale; + const int kByteMax = + options_.high_fidelity_16bit_encode() ? kByteMax16 : kByteMax8; + + // Compressed flow to be encoded in binary format. + std::vector flow_compressed_16; + std::vector flow_compressed_8; + + flow_compressed_16.reserve(num_vectors); + flow_compressed_8.reserve(num_vectors); + + std::vector row_idx; + row_idx.reserve(num_vectors); + + float average_error = 0; + std::vector col_starts(motion_data.col_starts().begin(), + motion_data.col_starts().end()); + + // Separate both implementations for easier readability. + // For details please refer to description in proto. 
+ // Low profile: + // * Encode vectors by scaling to integer format. + // * Keep sparse matrix format as is + // High profile: + // * Encode deltas between vectors scaling them to integers + // * Re-use encoded vectors if delta is small, use ADVANCE flag in row + // index. + // * Delta encode row indices to reduce magnitude. + // * If two row deltas are small (< 8), encode in one byte + if (!high_profile) { + // Traverse columns. + for (int c = 0; c < col_starts.size() - 1; ++c) { + const int r_start = col_starts[c]; + const int r_end = col_starts[c + 1]; + for (int r = r_start; r < r_end; ++r) { + const float flow_x_32f = motion_data.vector_data(2 * r); + const float flow_y_32f = motion_data.vector_data(2 * r + 1); + + const int flow_x = + std::max(-kByteMax, std::min(kByteMax, flow_x_32f * scale)); + const int flow_y = + std::max(-kByteMax, std::min(kByteMax, flow_y_32f * scale)); + average_error += 0.5f * (fabs(flow_x * inv_scale - flow_x_32f) + + fabs(flow_y * inv_scale - flow_y_32f)); + + if (options_.high_fidelity_16bit_encode()) { + flow_compressed_16.push_back(flow_x); + flow_compressed_16.push_back(flow_y); + } else { + flow_compressed_8.push_back(flow_x); + flow_compressed_8.push_back(flow_y); + } + + DCHECK_LT(motion_data.row_indices(r), 256); + row_idx.push_back(motion_data.row_indices(r)); + } + } + } else { + // Compress flow. + int prev_flow_x = 0; + int prev_flow_y = 0; + const float reuse_threshold = options_.high_profile_reuse_threshold(); + int compressible = 0; + + std::vector compressions_per_column(domain_width, 0); + const int kAdvanceFlag = FlowPackagerOptions::ADVANCE_FLAG; + const int kDoubleIndexEncode = FlowPackagerOptions::DOUBLE_INDEX_ENCODE; + const int kIndexMask = FlowPackagerOptions::INDEX_MASK; + + // Traverse columns. 
+ for (int c = 0; c < motion_data.col_starts().size() - 1; ++c) { + const int r_start = col_starts[c]; + const int r_end = col_starts[c + 1]; + for (int r = r_start; r < r_end; ++r) { + int flow_x = 0; + int flow_y = 0; + bool advance = true; + const float flow_x_32f = motion_data.vector_data(2 * r); + const float flow_y_32f = motion_data.vector_data(2 * r + 1); + + // Delta coding of vectors. + const float diff_x = flow_x_32f - prev_flow_x * inv_scale; + const float diff_y = flow_y_32f - prev_flow_y * inv_scale; + + // Determine if previous flow can be re-used. + if (fabs(diff_x) < reuse_threshold && fabs(diff_y) < reuse_threshold) { + advance = false; + } else { + flow_x = std::max(-kByteMax, std::min(kByteMax, diff_x * scale)); + + flow_y = std::max(-kByteMax, std::min(kByteMax, diff_y * scale)); + + prev_flow_x += flow_x; + prev_flow_y += flow_y; + } + + average_error += 0.5f * (fabs(prev_flow_x * inv_scale - flow_x_32f) + + fabs(prev_flow_y * inv_scale - flow_y_32f)); + + // Combine into one 32 or 16 bit value (clear sign bits for the + // right part before combining). + if (advance) { + if (options_.high_fidelity_16bit_encode()) { + flow_compressed_16.push_back(flow_x); + flow_compressed_16.push_back(flow_y); + } else { + flow_compressed_8.push_back(flow_x); + flow_compressed_8.push_back(flow_y); + } + } + + // Delta code row indices in high profile mode and use two top bits + // for status: + // 10: single row encode, use next vector data. + // (ADVANCE_FLAG) + // + // 11: double row encode: (3 bit + 3 bit = maximum of 7 + 7 row delta), + // use next vector data for each. + // (ADVANCE_FLAG | DOUBLE_INDEX_ENCODE) + // + // 00: single row encode + no advance (re-use previous vector data). + // (no flags set) + // + // 01: double row encode + no advance (re-use previous vector data for + // each). + // (DOUBLE_INDEX_ENCODE) + + // Delta compress. + int delta_row = motion_data.row_indices(r) - + (r == r_start ? 
0 : motion_data.row_indices(r - 1)); + CHECK_GE(delta_row, 0); + + bool combined = false; + if (r > r_start) { + int prev_row_idx = row_idx.back(); + if (!(prev_row_idx & kDoubleIndexEncode) && // Single encode. + (prev_row_idx & kAdvanceFlag) == advance) { // Same advance flag. + // Both compressible (each index fits in 3 bit). + if (delta_row < 8 && (prev_row_idx & kIndexMask) < 8) { + // Encode two deltas into 6 bit. + prev_row_idx = ((prev_row_idx & 0x07) << 3) | delta_row | + kDoubleIndexEncode | (advance ? kAdvanceFlag : 0); + + row_idx.back() = prev_row_idx; + // Record as one compression for this column. + ++compressions_per_column[c]; + ++compressible; + combined = true; + } + } + } + + if (!combined) { + while (delta_row > kIndexMask) { + // Special case of large displacement. Duplicate vector until sum of + // deltas reaches target delta). + row_idx.push_back(kIndexMask | (advance ? kAdvanceFlag : 0)); + delta_row -= kIndexMask; + advance = false; // Store same vector again, re-use previously + // encoded vector data. + + // Record as one addition for the column. + --compressions_per_column[c]; + ++num_vectors; + } + + row_idx.push_back(delta_row | (advance ? kAdvanceFlag : 0)); + } + } + } + + // Count number of advance flags encoded. + int encoded = 0; + for (int idx : row_idx) { + if (idx & kAdvanceFlag) { + encoded += (idx & kDoubleIndexEncode) ? 2 : 1; + } + } + + if (options_.high_fidelity_16bit_encode()) { + CHECK_EQ(2 * encoded, flow_compressed_16.size()); + } else { + CHECK_EQ(2 * encoded, flow_compressed_8.size()); + } + + // Adjust column start by compressions. + int curr_adjust = 0; + for (int k = 0; k < domain_width; ++k) { + curr_adjust -= compressions_per_column[k]; + col_starts[k + 1] += curr_adjust; + CHECK_LE(col_starts[k], col_starts[k + 1]); + } + + CHECK_EQ(row_idx.size(), col_starts.back()); + CHECK_EQ(num_vectors, row_idx.size() + compressible); + } + + // Delta compress col_starts. 
+ std::vector col_start_delta(domain_width + 1, 0); + col_start_delta[0] = col_starts[0]; + for (int k = 1; k < domain_width + 1; ++k) { + const int delta = col_starts[k] - col_starts[k - 1]; + CHECK_LT(delta, 256) << "Only up to 255 items per column supported."; + col_start_delta[k] = delta; + } + + VLOG(1) << "error: " << average_error / (num_vectors + 1) + << " additions: " << num_vectors - motion_data.num_elements(); + const Homography& background_model = tracking_data.background_model(); + + const float scale_x = 1.0f / tracking_data.domain_width(); + const float scale_y = 1.0f / tracking_data.domain_height(); + + Homography homog_scale = HomographyAdapter::Embed( + AffineAdapter::FromArgs(0, 0, scale_x, 0, 0, scale_y)); + + Homography inv_homog_scale = HomographyAdapter::Embed( + AffineAdapter::FromArgs(0, 0, 1.0f / scale_x, 0, 0, 1.0f / scale_y)); + + // Might be just the identity if not set. + const Homography background_model_scaled = + ModelCompose3(homog_scale, background_model, inv_homog_scale); + + std::string background_model_string = + absl::StrCat(EncodeToString(background_model.h_00()), + EncodeToString(background_model.h_01()), + EncodeToString(background_model.h_02()), + EncodeToString(background_model.h_10()), + EncodeToString(background_model.h_11()), + EncodeToString(background_model.h_12()), + EncodeToString(background_model.h_20()), + EncodeToString(background_model.h_21())); + + std::string* data = binary_data->mutable_data(); + data->clear(); + int32 vector_size = options_.high_fidelity_16bit_encode() + ? 
flow_compressed_16.size() + : flow_compressed_8.size(); + int32 row_idx_size = row_idx.size(); + + absl::StrAppend( + data, + absl::StrCat(EncodeToString(frame_flags), EncodeToString(domain_width), + EncodeToString(domain_height), EncodeToString(frame_aspect), + background_model_string, EncodeToString(scale), + EncodeToString(num_vectors), + EncodeVectorToString(col_start_delta), + EncodeToString(row_idx_size), EncodeVectorToString(row_idx), + EncodeToString(vector_size), + (options_.high_fidelity_16bit_encode() + ? EncodeVectorToString(flow_compressed_16) + : EncodeVectorToString(flow_compressed_8)))); + VLOG(1) << "Binary data size: " << data->size() << " for " << num_vectors + << " (" << vector_size << ")"; +} + +std::string PopSubstring(int len, absl::string_view* piece) { + std::string result = std::string(piece->substr(0, len)); + piece->remove_prefix(len); + return result; +} + +void FlowPackager::DecodeTrackingData(const BinaryTrackingData& container_data, + TrackingData* tracking_data) const { + CHECK(tracking_data != nullptr); + + absl::string_view data(container_data.data()); + int32 frame_flags = 0; + int32 domain_width = 0; + int32 domain_height = 0; + std::vector background_model; + int32 scale = 0; + int32 num_vectors = 0; + float frame_aspect = 0.0f; + + DecodeFromStringView(PopSubstring(4, &data), &frame_flags); + DecodeFromStringView(PopSubstring(4, &data), &domain_width); + DecodeFromStringView(PopSubstring(4, &data), &domain_height); + DecodeFromStringView(PopSubstring(4, &data), &frame_aspect); + + CHECK_LE(domain_width, 256); + CHECK_LE(domain_height, 256); + + DecodeVectorFromStringView( + PopSubstring(4 * HomographyAdapter::NumParameters(), &data), + &background_model); + DecodeFromStringView(PopSubstring(4, &data), &scale); + DecodeFromStringView(PopSubstring(4, &data), &num_vectors); + + tracking_data->set_frame_flags(frame_flags); + tracking_data->set_domain_width(domain_width); + tracking_data->set_domain_height(domain_height); + 
tracking_data->set_frame_aspect(frame_aspect); + *tracking_data->mutable_background_model() = + HomographyAdapter::FromFloatPointer(&background_model[0], false); + + TrackingData::MotionData* motion_data = tracking_data->mutable_motion_data(); + motion_data->set_num_elements(num_vectors); + + const bool high_profile = frame_flags & TrackingData::FLAG_PROFILE_HIGH; + const bool high_fidelity = + frame_flags & TrackingData::FLAG_HIGH_FIDELITY_VECTORS; + const float flow_denom = 1.0f / scale; + + std::vector col_starts_delta; + DecodeVectorFromStringView(PopSubstring(domain_width + 1, &data), + &col_starts_delta); + + // Delta decompress. + std::vector col_starts; + col_starts.reserve(domain_width + 1); + + int column = 0; + for (auto col : col_starts_delta) { + column += col; + col_starts.push_back(column); + } + + std::vector row_idx; + int32 row_idx_size; + DecodeFromStringView(PopSubstring(4, &data), &row_idx_size); + + // Should not have more row indices than vectors. (One for each in baseline + // profile, less in high profile). + CHECK_LE(row_idx_size, num_vectors); + DecodeVectorFromStringView(PopSubstring(row_idx_size, &data), &row_idx); + + // Records for each vector whether to advance pointer in the vector data array + // or re-use previously read data. + std::vector advance(num_vectors, true); + + if (high_profile) { + // Unpack row indices, populate advance. + const int kAdvanceFlag = FlowPackagerOptions::ADVANCE_FLAG; + const int kDoubleIndexEncode = FlowPackagerOptions::DOUBLE_INDEX_ENCODE; + const int kIndexMask = FlowPackagerOptions::INDEX_MASK; + + std::vector column_expansions(domain_width, 0); + std::vector row_idx_unpacked; + row_idx_unpacked.reserve(num_vectors); + advance.clear(); + + for (int c = 0; c < col_starts.size() - 1; ++c) { + const int r_start = col_starts[c]; + const int r_end = col_starts[c + 1]; + uint8 prev_row_idx = 0; + for (int r = r_start; r < r_end; ++r) { + // Use top bit as indicator to advance. 
+ advance.push_back(row_idx[r] & kAdvanceFlag); + + // Double encode? + if (row_idx[r] & kDoubleIndexEncode) { + // Indices are encoded as each 3 bit offset within kIndexMask. + prev_row_idx += (row_idx[r] >> 3) & 0x7; + row_idx_unpacked.push_back(prev_row_idx); + prev_row_idx += row_idx[r] & 0x7; + row_idx_unpacked.push_back(prev_row_idx); + + // Duplicate advance setting. + advance.push_back(advance.back()); + ++column_expansions[c]; + } else { + // Single encode. + prev_row_idx += row_idx[r] & kIndexMask; // Clear status. + row_idx_unpacked.push_back(prev_row_idx); + } + } + } + row_idx.swap(row_idx_unpacked); + CHECK_EQ(num_vectors, row_idx.size()); + + // Adjust column start by expansions. + int curr_adjust = 0; + for (int k = 0; k < domain_width; ++k) { + curr_adjust += column_expansions[k]; + col_starts[k + 1] += curr_adjust; + } + } + + CHECK_EQ(num_vectors, col_starts.back()); + + int vector_data_size; + DecodeFromStringView(PopSubstring(4, &data), &vector_data_size); + + int prev_flow_x = 0; + int prev_flow_y = 0; + if (high_fidelity) { + std::vector vector_data; + DecodeVectorFromStringView( + PopSubstring(sizeof(vector_data[0]) * vector_data_size, &data), + &vector_data); + int counter = 0; + for (int k = 0; k < num_vectors; ++k) { + if (advance[k]) { // Read new vector data. + int flow_x = vector_data[counter++]; + int flow_y = vector_data[counter++]; + + if (high_profile) { // Delta decode in high profile. + flow_x += prev_flow_x; + flow_y += prev_flow_y; + prev_flow_x = flow_x; + prev_flow_y = flow_y; + } + + motion_data->add_vector_data(flow_x * flow_denom); + motion_data->add_vector_data(flow_y * flow_denom); + } else { // Re-use previous vector data. 
+ motion_data->add_vector_data(prev_flow_x * flow_denom); + motion_data->add_vector_data(prev_flow_y * flow_denom); + } + } + CHECK_EQ(vector_data_size, counter); + } else { + std::vector vector_data; + DecodeVectorFromStringView( + PopSubstring(sizeof(vector_data[0]) * vector_data_size, &data), + &vector_data); + int counter = 0; + for (int k = 0; k < num_vectors; ++k) { + if (advance[k]) { // Read new vector data. + int flow_x = vector_data[counter++]; + int flow_y = vector_data[counter++]; + + if (high_profile) { // Delta decode in high profile. + flow_x += prev_flow_x; + flow_y += prev_flow_y; + prev_flow_x = flow_x; + prev_flow_y = flow_y; + } + + motion_data->add_vector_data(flow_x * flow_denom); + motion_data->add_vector_data(flow_y * flow_denom); + } else { // Re-use previous vector data. + motion_data->add_vector_data(prev_flow_x * flow_denom); + motion_data->add_vector_data(prev_flow_y * flow_denom); + } + } + CHECK_EQ(vector_data_size, counter); + } + + for (auto idx : row_idx) { + motion_data->add_row_indices(idx); + } + + for (auto column : col_starts) { + motion_data->add_col_starts(column); + } +} + +void FlowPackager::BinaryTrackingDataToContainer( + const BinaryTrackingData& binary_data, TrackingContainer* container) const { + CHECK(container != nullptr); + container->Clear(); + container->set_header("TRAK"); + container->set_version(1); + container->set_size(binary_data.data().size()); + *container->mutable_data() = binary_data.data(); +} + +void FlowPackager::BinaryTrackingDataFromContainer( + const TrackingContainer& container, BinaryTrackingData* binary_data) const { + CHECK_EQ("TRAK", container.header()); + CHECK_EQ(1, container.version()) << "Unsupported version."; + *binary_data->mutable_data() = container.data(); +} + +void FlowPackager::DecodeMetaData(const TrackingContainer& container_data, + MetaData* meta_data) const { + CHECK(meta_data != nullptr); + + CHECK_EQ("META", container_data.header()); + CHECK_EQ(1, container_data.version()) 
<< "Unsupported version."; + + absl::string_view data(container_data.data()); + + int32 num_frames; + DecodeFromStringView(PopSubstring(4, &data), &num_frames); + meta_data->set_num_frames(num_frames); + + for (int k = 0; k < num_frames; ++k) { + int32 msec; + int32 stream_offset; + + DecodeFromStringView(PopSubstring(4, &data), &msec); + DecodeFromStringView(PopSubstring(4, &data), &stream_offset); + + MetaData::TrackOffset* track_offset = meta_data->add_track_offsets(); + track_offset->set_msec(msec); + track_offset->set_stream_offset(stream_offset); + } +} + +void FlowPackager::FinalizeTrackingContainerFormat( + std::vector* timestamps, + TrackingContainerFormat* container_format) { + CHECK(container_format != nullptr); + + // Compute binary sizes of track_data. + const int num_frames = container_format->track_data_size(); + + std::vector msecs(num_frames, 0); + if (timestamps) { + CHECK_EQ(num_frames, timestamps->size()); + msecs = *timestamps; + } + std::vector sizes(num_frames, 0); + + for (int f = 0; f < num_frames; ++f) { + // Default size of container: 12 bytes + binary data size (see comment for + // TrackingContainer in flow_packager.proto). + sizes[f] = container_format->track_data(f).data().size() + 12; + } + + // Store relative offsets w.r.t. end of MetaData. + MetaData meta_data; + InitializeMetaData(num_frames, msecs, sizes, &meta_data); + + // Serialize metadata to binary. + TrackingContainer* meta = container_format->mutable_meta_data(); + meta->Clear(); + meta->set_header("META"); + + std::string* binary_metadata = meta->mutable_data(); + absl::StrAppend(binary_metadata, EncodeToString(meta_data.num_frames())); + for (auto& track_offset : *meta_data.mutable_track_offsets()) { + absl::StrAppend(binary_metadata, + absl::StrCat(EncodeToString(track_offset.msec()), + EncodeToString(track_offset.stream_offset()))); + } + + meta->set_size(binary_metadata->size()); + + // Add term header. 
+ TrackingContainer* term = container_format->mutable_term_data(); + term->set_header("TERM"); + term->set_size(0); +} + +void FlowPackager::FinalizeTrackingContainerProto( + std::vector* timestamps, TrackingContainerProto* proto) { + CHECK(proto != nullptr); + + // Compute binary sizes of track_data. + const int num_frames = proto->track_data_size(); + + std::vector msecs(num_frames, 0); + if (timestamps) { + CHECK_EQ(num_frames, timestamps->size()); + msecs = *timestamps; + } + + std::vector sizes(num_frames, 0); + + TrackingContainerProto temp_proto; + BinaryTrackingData* temp_track_data = temp_proto.add_track_data(); + for (int f = 0; f < num_frames; ++f) { + // Swap current track data in and out of temp_track_data to determine total + // encoding size with proto preamble. + proto->mutable_track_data(f)->Swap(temp_track_data); + sizes[f] = temp_proto.ByteSize(); + proto->mutable_track_data(f)->Swap(temp_track_data); + } + + proto->clear_meta_data(); + InitializeMetaData(num_frames, msecs, sizes, proto->mutable_meta_data()); +} + +void FlowPackager::InitializeMetaData(int num_frames, + const std::vector& msecs, + const std::vector& data_sizes, + MetaData* meta_data) const { + meta_data->set_num_frames(num_frames); + CHECK_EQ(num_frames, msecs.size()); + CHECK_EQ(num_frames, data_sizes.size()); + + int curr_offset = 0; + for (int f = 0; f < num_frames; ++f) { + MetaData::TrackOffset* track_offset = meta_data->add_track_offsets(); + track_offset->set_msec(msecs[f]); + track_offset->set_stream_offset(curr_offset); + curr_offset += data_sizes[f]; + } +} + +void FlowPackager::AddContainerToString(const TrackingContainer& container, + std::string* binary_data) { + CHECK(binary_data != nullptr); + std::string header_string(container.header()); + CHECK_EQ(4, header_string.size()); + + std::vector header{header_string[0], header_string[1], header_string[2], + header_string[3]}; + absl::StrAppend(binary_data, EncodeVectorToString(header), + 
EncodeToString(container.version()), + EncodeToString(container.size()), container.data()); +} + +std::string FlowPackager::SplitContainerFromString( + absl::string_view* binary_data, TrackingContainer* container) { + CHECK(binary_data != nullptr); + CHECK(container != nullptr); + CHECK_GE(binary_data->size(), 12) << "Data does not contain " + << "valid container"; + + container->set_header(PopSubstring(4, binary_data)); + + int version; + DecodeFromStringView(PopSubstring(4, binary_data), &version); + + int size; + DecodeFromStringView(PopSubstring(4, binary_data), &size); + + container->set_version(version); + container->set_size(size); + + if (size > 0) { + container->set_data(PopSubstring(size, binary_data)); + } + + return container->header(); +} + +void FlowPackager::TrackingContainerFormatToBinary( + const TrackingContainerFormat& container_format, std::string* binary) { + CHECK(binary != nullptr); + binary->clear(); + + AddContainerToString(container_format.meta_data(), binary); + for (const auto& track_data : container_format.track_data()) { + AddContainerToString(track_data, binary); + } + + AddContainerToString(container_format.term_data(), binary); +} + +void FlowPackager::TrackingContainerFormatFromBinary( + const std::string& binary, TrackingContainerFormat* container_format) { + CHECK(container_format != nullptr); + container_format->Clear(); + + absl::string_view data(binary); + + CHECK_EQ("META", SplitContainerFromString( + &data, container_format->mutable_meta_data())); + MetaData meta_data; + DecodeMetaData(container_format->meta_data(), &meta_data); + + for (int f = 0; f < meta_data.num_frames(); ++f) { + TrackingContainer* container = container_format->add_track_data(); + CHECK_EQ("TRAK", SplitContainerFromString(&data, container)); + } + + CHECK_EQ("TERM", SplitContainerFromString( + &data, container_format->mutable_term_data())); +} + +void FlowPackager::SortRegionFlowFeatureList( + float scale_x, float scale_y, RegionFlowFeatureList* 
feature_list) const { + CHECK(feature_list != nullptr); + // Sort features lexicographically. + std::sort(feature_list->mutable_feature()->begin(), + feature_list->mutable_feature()->end(), + IntegerColumnComparator(scale_x, scale_y, options_.domain_width(), + options_.domain_height())); +} + +bool FlowPackager::CompatibleForEncodeWithoutDuplication( + const TrackingData& tracking_data) const { + const TrackingData::MotionData& motion_data = tracking_data.motion_data(); + for (int c = 0; c < motion_data.col_starts_size() - 1; ++c) { + const int r_start = motion_data.col_starts(c); + const int r_end = motion_data.col_starts(c + 1); + for (int r = r_start; r < r_end; ++r) { + if (motion_data.row_indices(r) - + (r == r_start ? 0 : motion_data.row_indices(r - 1)) >= + 64) { + return false; + } + } + } + return true; +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/flow_packager.h b/mediapipe/util/tracking/flow_packager.h new file mode 100644 index 000000000..03b67facf --- /dev/null +++ b/mediapipe/util/tracking/flow_packager.h @@ -0,0 +1,168 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MEDIAPIPE_UTIL_TRACKING_FLOW_PACKAGER_H_ +#define MEDIAPIPE_UTIL_TRACKING_FLOW_PACKAGER_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/util/tracking/flow_packager.pb.h" +#include "mediapipe/util/tracking/motion_estimation.pb.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +// Usage (output): +// FlowPackager flow_packager((FlowPackagerOptions())); +// +// // Input: Feature lists and optional camera motion +// vector input_features; // Externally supplied. +// vector input_motions; +// +// const int num_frames = input_features.size(); +// +// // Can encode to either TrackingContainerFormat or use protos. +// TrackingContainerFormat container; +// TrackingContainerProto proto; +// +// for (int f = 0; f < num_frames; ++f) { +// // Obtain tracking data. +// TrackingData track_data; +// flow_packager.PackFlow(input_features[f], +// input_motions[f], +// &track_data); +// +// // Encode tracking data. +// BinaryTrackingData binary_data; +// flow_packager.EncodeTrackingData(track_data, +// &binary_data); +// +// // Add to either container format or proto. +// // Container: +// TrackingContainer* track_data_encoded = container.add_track_data(); +// flow_packager.BinaryTrackingDataToContainer(binary_data, +// track_data_encoded); +// // Proto: +// proto.add_track_data()->CopyFrom(binary_data); +// } +// +// // Write meta and term containers. +// flow_packager.FinalizeTrackingContainerFormat(&container); +// flow_packager.FinalizeTrackingProto(&proto); +// +// // Convert to binary std::string to stream out. 
+// std::string output; +// flow_packager.TrackingContainerFormatToBinary(container, &output); +// // OR: +// proto.SerializeToString(&output); + +// Usage (input): +// std::string input; +// FlowPackager flow_packager((FlowPackagerOptions())); +// TrackingContainerFormat container; +// flow_packager.TrackingContainerFormatFromBinary(input, &container); +// +// // Obtain Track data. +// vector tracking_data; // To be used by tracking. +// for (const auto& track_data_encoded : container.track_data()) { +// tracking_data.push_back(TrackingData()); +// flow_packager.DecodeTrackingData(track_data_encoded, +// &tracking_data.back()); +// } +// +// // Use tracking_data with Tracker. + +class CameraMotion; +class RegionFlowFeatureList; + +class FlowPackager { + public: + explicit FlowPackager(const FlowPackagerOptions& options); + FlowPackager(const FlowPackager&) = delete; + FlowPackager& operator=(const FlowPackager&) = delete; + + void PackFlow(const RegionFlowFeatureList& feature_list, + const CameraMotion* camera_motion, // optional. + TrackingData* tracking_data) const; + + // Converts TrackingData to condensed binary representation. + void EncodeTrackingData(const TrackingData& tracking_data, + BinaryTrackingData* binary_data) const; + + void DecodeTrackingData(const BinaryTrackingData& data, + TrackingData* tracking_data) const; + + void BinaryTrackingDataToContainer(const BinaryTrackingData& binary_data, + TrackingContainer* container) const; + + void BinaryTrackingDataFromContainer(const TrackingContainer& container, + BinaryTrackingData* binary_data) const; + + void DecodeMetaData(const TrackingContainer& data, MetaData* meta_ata) const; + + // Fills in meta (first container) and termination data (last container). + // Optionally, pass timestamps for each frame. + void FinalizeTrackingContainerFormat( + std::vector* timestamps, // optional, can be null. 
+ TrackingContainerFormat* container_fromat); + void FinalizeTrackingContainerProto( + std::vector* timestamps, // optional, can be null. + TrackingContainerProto* proto); + + // Fast encode to binary representation. + void TrackingContainerFormatToBinary( + const TrackingContainerFormat& container_format, std::string* binary); + + // Fast decode from binary representation. + void TrackingContainerFormatFromBinary( + const std::string& binary, TrackingContainerFormat* container_format); + + // Checks whether tracking data can be encoded in high profile mode without + // duplicating any features. This occurs if the horizonal distance between two + // features is less than 64. + bool CompatibleForEncodeWithoutDuplication( + const TrackingData& tracking_data) const; + + // Helper function for test. Sorts according to scaled, integer based + // lexicographical ordering. + // Declared public for access by test. + void SortRegionFlowFeatureList(float scale_x, float scale_y, + RegionFlowFeatureList* feature_list) const; + + // Removes binary encoded container from std::string and parses it to + // container. Returns header std::string of the parsed container. Useful for + // random seek. + std::string SplitContainerFromString(absl::string_view* binary_data, + TrackingContainer* container); + + private: + // Sets meta data for a set + void InitializeMetaData(int num_frames, const std::vector& msecs, + const std::vector& data_sizes, + MetaData* meta_data) const; + + // Serializes container to binary std::string and adds it to binary_data. 
+ void AddContainerToString(const TrackingContainer& container, + std::string* binary_data); + + private: + FlowPackagerOptions options_; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_FLOW_PACKAGER_H_ diff --git a/mediapipe/util/tracking/flow_packager.proto b/mediapipe/util/tracking/flow_packager.proto new file mode 100644 index 000000000..c4f4797a9 --- /dev/null +++ b/mediapipe/util/tracking/flow_packager.proto @@ -0,0 +1,319 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/motion_models.proto"; +import "mediapipe/util/tracking/region_flow.proto"; + +// Messages encapsulating compressed and uncompressed TrackingData. +// +// Uncompressed tracking data can be aggregated via an TrackingDataChunk +// (e.g. to be cached to file per chunk). The whole chunk persists in memory +// after reading. +// +// Compressed tracking data can be aggregated as binary encoded TrackingData +// messages into two container formats (with support for random seeking): +// 1) TrackingContainerProto: +// Encoding using proto buffer wire format, using default proto +// serialization and de-serialization to binary string. +// The container uses the MetaData message to store the stream offsets and +// milliseconds for each frame of TrackingData. 
TrackingData itself is custom +// encoded to binary using FlowPackager::EncodeTrackingData and the resulting +// binary blob wrapped in a BinaryTrackingData message. +// 2) TrackingContainerFormat: +// Encoding without any dependencies to protobuffers, for clients without +// proto buffer support. +// Encoding is based on encoding binary blobs of data wrapped into repeated +// containers. The layout of a container is described by the message +// TrackingContainer and serialized to binary data as described below +// (without using proto encoding). Therefore, message TrackingContainer is +// mostly for documentation purposes than for direct use. +// The format is described by the proto message TrackingContainerFormat (used +// internally by FlowPackager) however serialization and de-serialization +// to binary string is performed using custom methods supplied by +// FlowPackager (TrackingContainerFormatToBinary and +// TrackingContainerFormatFromBinary). +// The format stores the MetaData first as above, although using custom +// encoding. TrackingData is encoded to binary as above using +// FlowPackager::EncodeTrackingData and the resulting binary blob is storred +// within a TrackingContainer. + +// Next flag: 9 +message TrackingData { + enum FrameFlags { + FLAG_PROFILE_BASELINE = 0; + FLAG_PROFILE_HIGH = 1; + FLAG_HIGH_FIDELITY_VECTORS = 2; + FLAG_BACKGROUND_UNSTABLE = 4; // Background model could not be estimated. + FLAG_DUPLICATED = 8; // Frame is duplicated, i.e. identical to + // previous one. + // Indicates the beginning of a new chunk. In this case the track_id's + // are not compatible w.r.t. previous one. + FLAG_CHUNK_BOUNDARY = 16; + } + + optional int32 frame_flags = 1 [default = 0]; + + // Tracking data is resolution independent specified w.r.t. + // specified domain. + optional int32 domain_width = 2; + optional int32 domain_height = 3; + + // Aspect ratio (w/h) of the original frame tracking data was computed from. 
+ optional float frame_aspect = 6 [default = 1.0]; + + optional Homography background_model = 4; + + // Stores num_elements vectors of motion data. (x,y) position encoded via + // row_indices and col_starts, as compressed sparse column matrix storage + // format: + // (https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29), + // Vector data is stored as (dx, dy) position. Optionally we store the fitting + // error and track id for each feature. + message MotionData { + optional int32 num_elements = 1; + + // #num_elements pairs (flow_x, flow_y) densely packed. + repeated float vector_data = 2 [packed = true]; + + // Stores corresponding track index for each feature. Features belonging + // to the same track over time are assigned the same id. + // NOTE: Due to size, tracking ids are never stored as compressed binary + // tracking data. + repeated int32 track_id = 3 [packed = true]; + + // # num_elements row indices. + repeated int32 row_indices = 4 [packed = true]; + + // Start index in above array for each column (#domain_width + 1 entries). + repeated int32 col_starts = 5 [packed = true]; + + // Feature descriptors for num_elements feature points. + repeated BinaryFeatureDescriptor feature_descriptors = 6; + } + + optional MotionData motion_data = 5; + + // Total number of features in our analysis + optional uint32 global_feature_count = 7; + + // Average of all motion vector magnitudes (without accounting for any motion + // model), within 10th to 90th percentile (to remove outliers). + optional float average_motion_magnitude = 8; +} + +message TrackingDataChunk { + message Item { + optional TrackingData tracking_data = 1; + // Global frame index. + optional int32 frame_idx = 2; + // Corresponding timestamp. + optional int64 timestamp_usec = 3; + // Previous frame timestamp. + optional int64 prev_timestamp_usec = 4; + } + + repeated Item item = 1; + + // Set as marker for last chunk. 
+ optional bool last_chunk = 2 [default = false]; + + // Set as marker for first chunk. + optional bool first_chunk = 3 [default = false]; +} + +// TrackingData in compressed binary format. Obtainable via +// FlowPackager::EncodeTrackingData. Details of binary encode are below. +message BinaryTrackingData { // TrackingContainer::header = "TRAK" + optional bytes data = 1; +} + +// Detailed explanation of binary Tracking data encode (LITTLE ENDIAN encode!) +// TrackingData is stored in binary as a struct of the above fields and the +// compressed motion data in sparse column matrix storage format. +// (https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29) +// Specifically, TrackingData is encoded as: +// { frame_flags : 32 bit int (from member) +// domain_width : 32 bit int (from member) +// domain_height : 32 bit int (from member) +// frame_aspect : 32 bit float (from member) +// +// background_model : 6 * 32 bit float (dx, dy, a, b, c, d of AffineModel) +// scale : 32 bit float (scale vectors are multiplied with) +// num_vectors : 32 bit int (from member num_elements) +// +// col_start_delta : (domain_width + 1) * 8 bit uint (col starts delta +// encoded) +// row_idx_size : 32 bit int (size of row_idx array <= num_vectors) +// row_idx : row_idx_size * 8 bit uint +// vector_size : 32 bit int (size of vector_data) +// vector_data : vector_size * [8 bit | 16 bit] int +// (depending on FLAG_HIGH_FIDELITY_VECTORS) +// } +// +// >> Baseline encode << +// Scale is determined such that maximum vector value (maximum across x and y) +// is mapped to highest 8 bit or 16 bit SIGNED int +// (i.e. 7 or 15 bit resolution respectively). +// Vector values are multiplied by this scale (storring float in int with +// truncation) and (dx, dy) is packed as [dy | dx] into a 16bit or 32 bit word. +// Unpacking requires therefore dividing the vector values by scale. 
+//
+// Column starts are delta compressed, that is, col_start_delta[i] stores
+// col_starts(i) - col_starts(i - 1) from MotionData.
+//
+// Row indices are directly stored as 8 bit uints, that is row_idx_size ==
+// num_vectors in this case.
+//
+//
+// >> High profile encode <<
+// Scale is determined as above but for maximum vector deltas (maximum across x
+// and y of magnitude in difference between two adjacent vectors). Vector value
+// deltas are multiplied by this scale before encoding.
+//
+// Encoding is more complex compared to baseline. Instead of vector value, delta
+// vector values (difference in dx = ddx, difference in dy = ddy)
+// are multiplied by scale and stored packed as [ddy | ddx] into a 16bit or
+// 32bit word. Compression algorithm accounts for error accumulation, so
+// unpacking should first add deltas in integer domain (for x and y separately)
+// and then divide by scale to yield (an approximation of) the
+// original vector value.
+// Most importantly, not every vector value is stored, but only if the delta is
+// above the FlowPackagerOptions::high_profile_reuse_threshold, in which case we
+// advance to the next vector data. Otherwise the previous vector is used.
+//
+// The information whether to advance is stored for each vector in the
+// highest bit of the row index (FlowPackagerOptions::ADVANCE_FLAG). Row
+// indices are not stored as in the baseline profile directly, but as deltas
+// (reset at the beginning of every column). As deltas are small it is often
+// possible to store two deltas (if both are < 8) in a single byte. This is
+// indicated by the second highest flag in the row index
+// (FlowPackagerOptions::DOUBLE_INDEX_ENCODE). If set, row index stores
+// [row_delta_1 | row_delta_2] in the lower 6 bit. Note that the advance flag
+// applies uniformly to both deltas in this case.
+// Sidenote (edge case): Due to the use of the top 2 bits as flags,
+// at times we cannot store the full row delta in the lower 6 bits.
+// In this case the vector is duplicated (using the ADVANCE_FLAG)
+// until the delta sum of duplicated vectors reaches the original delta.
+// Consequently, the compressed vector field in high profile may contain a few
+// vectors more than the original.
+//
+// Column starts are delta compressed as in baseline, but account for double
+// index encodes. Therefore each column delta is reduced by the number of double
+// index encodes occurring for this column. This has to be replicated on the
+// decoding side; each delta needs to be increased by the number of double index
+// encodes encountered during encoding.
+
+// Stores offsets for random seek and time offsets for each frame of
+// TrackingData. Stream offsets are specified relative w.r.t. end of metadata
+// blob.
+// Offsets specify start of the corresponding binary encoded TrackingContainer
+// (for TrackingContainerFormat) or BinaryTrackingData proto (for
+// TrackingContainerProto).
+message MetaData {  // TrackingContainer::header = "META"
+  optional fixed32 num_frames = 2;
+
+  message TrackOffset {
+    optional fixed32 msec = 1;  // Time offset of the metadata in msec.
+    optional fixed32 stream_offset = 2;  // Offset of TrackingContainer or
+                                         // respectively BinaryTrackingData
+                                         // in stream.
+                                         // Specified w.r.t. end of the Metadata.
+  }
+
+  repeated TrackOffset track_offsets = 3;
+}
+
+// TrackingContainer is a self-describing container format to store arbitrary
+// chunks of binary data. Each container is typed via its 4 character header,
+// versioned via an int, and followed by the size of the binary data and the
+// actual data. Designed for clients without availability of protobuffer
+// support.
+// Note: This message is mainly used for documentation purposes and uses custom
+// encoding as specified by FlowPackager::TrackingContainerFormatToBinary.
+// Default binary size of a TrackingContainer (DO NOT CHANGE!):
+// header: 4 byte +
+// version: 4 byte +
+// size: 4 byte +
+// data #size
+// SUM: 12 + #size.
+message TrackingContainer {
+  optional string header = 1;  // 4 character header.
+  optional fixed32 version = 2 [default = 1];  // Version information.
+  optional fixed32 size = 3;  // Size of binary data held by container
+  optional bytes data = 4;  // Binary data encoded.
+
+  // DO NOT alter layout of TrackingContainer.
+  // Use version to extend or alter encoded binary data.
+}
+
+// Container format for clients without proto support (written via
+// FlowPackager::TrackingContainerFormatToBinary and read via
+// FlowPackager::TrackingContainerFormatFromBinary).
+// Proto here is intermediate format for documentation and internal use.
+// Stores multiple TrackingContainers of different types.
+// Meta data is stored first, to facilitate random seek (via stream offset
+// positions) to arbitrary binary TrackingData. Termination container signals end
+// of stream.
+message TrackingContainerFormat {
+  optional TrackingContainer meta_data = 1;  // Wraps binary meta data, via
+                                             // custom encode.
+  repeated TrackingContainer track_data = 2;  // Wraps BinaryTrackingData.
+
+  // Add new TrackingContainers above before end of stream indicator.
+  // Zero sized termination container with TrackingContainer::header = "TERM".
+  optional TrackingContainer term_data = 3;
+}
+
+// Simplified proto format of above TrackingContainerFormat. Instead of using
+// self-describing TrackingContainer's, we simply use the proto wire format for
+// encoding and decoding (proto format is typed and versioned via ids).
+message TrackingContainerProto {
+  optional MetaData meta_data = 1;
+  repeated BinaryTrackingData track_data = 2;
+}
+
+// Options controlling compression and encoding.
+message FlowPackagerOptions {
+  // Tracking data is resolution independent specified w.r.t.
+  // specified domain. Only values <= 256 are supported if binary tracking data
+  // is requested to be supported (see below).
+  optional int32 domain_width = 1 [default = 256];
+  optional int32 domain_height = 2 [default = 192];
+
+  // Needs to be set for calls to FlowPackager::EncodeTrackingData. If encoding
+  // is not required, can be set to false in which case a higher domain_width
+  // can be used.
+  optional bool binary_tracking_data_support = 6 [default = true];
+
+  optional bool use_high_profile = 3 [default = false];
+
+  // If set uses 16 bit encode for vector data, in BinaryTrackingData,
+  // otherwise only 8 bits are used.
+  optional bool high_fidelity_16bit_encode = 4 [default = true];
+
+  // In high profile encode, re-use previously encoded vector when absolute
+  // difference to current vector is below threshold.
+  optional float high_profile_reuse_threshold = 5 [default = 0.5];
+
+  // High profile encoding flags.
+  enum HighProfileEncoding {
+    ADVANCE_FLAG = 0x80;
+    DOUBLE_INDEX_ENCODE = 0x40;
+    INDEX_MASK = 0x3F;
+  }
+}
diff --git a/mediapipe/util/tracking/frame_selection.proto b/mediapipe/util/tracking/frame_selection.proto
new file mode 100644
index 000000000..35712c20d
--- /dev/null
+++ b/mediapipe/util/tracking/frame_selection.proto
@@ -0,0 +1,93 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/camera_motion.proto"; +import "mediapipe/util/tracking/frame_selection_solution_evaluator.proto"; +import "mediapipe/util/tracking/region_flow.proto"; + +// Stores selected timestamps and corresponding frame index. +message FrameSelectionTimestamp { + optional int64 timestamp = 1; // Timestamp of the selected frame. + + // Frame index of the selected frame in the initial video stream. If this + // timestamp was manufactured, this will be the index of the initial frame. + optional int32 frame_idx = 2; + + // If this timestamp was manufactured, the timestamp of the original frame. + optional int64 processed_from_timestamp = 3 [default = -1]; +} + +// Stores the result of the frame selection, with composited features. +// Next index: 6 +message FrameSelectionResult { + optional int64 timestamp = 1; // Timestamp of the selected frame. + + // Frame index of the selected frame in the initial video stream. If this + // timestamp was manufactured, this will be the index of the initial frame. + optional int32 frame_idx = 2; + + // CameraMotion from selected item to previous selected item. + optional CameraMotion camera_motion = 3; + + // Features from selected item to previous selected item. + optional RegionFlowFeatureList features = 4; + + // If this FrameSelectionResult was the result of processing a previous one, + // the timestamp of the original frame. + optional int64 processed_from_timestamp = 5 [default = -1]; +} + +// Next index: 7 +message FrameSelectionCriterion { + // Interval at which frames should be sampled; set to zero if sampling should + // not be enforced (i.e. selection is performed w.r.t. other criteria). + optional int32 sampling_rate = 1 [default = 0]; + + // Bandwidth used during dynamic programming. The larger the bandwidth the + // more accurate the result w.r.t. the specified sampling rate. 
Smaller + // bandwidth's bias the solution suboptimally to center around the mean + // frame numbers of the sampling rate. + // If in (0, 1), assumed to specify fraction of total number of input frames, + // otherwise must be an integer. + optional float bandwidth_frames = 2 [default = 50]; + + // Search radius for dynamic programming (how many frames you are allowed to + // search around the previous frame). + optional int32 search_radius_frames = 3 [default = 1]; + + // Allows one to specify custom solution selection criteria (i.e. different + // way to choose the best row of the computed cost matrix). + optional FrameSelectionSolutionEvaluatorType solution_evaluator = 4; + + // Outputs a fixed number of frames and automatically sets the appropriate + // sampling rate. Set to 0 by default (i.e. not enabled). + optional int32 max_output_frames = 5 [default = 0]; +} + +// Options for computing frame selection. +// TODO: Support multiple criteria if required. Currently uses only the +// first one. +message FrameSelectionOptions { + repeated FrameSelectionCriterion criterion = 1; + + // FrameSelection buffers incoming CameraMotions for specified chunk size + // and creates cost matrices upon reaching the limit. + // TODO: Implement if necessary (currently nothing is cleared upon + // reaching the limit). + optional int32 chunk_size = 2 [default = 100]; +} diff --git a/mediapipe/util/tracking/frame_selection_solution_evaluator.proto b/mediapipe/util/tracking/frame_selection_solution_evaluator.proto new file mode 100644 index 000000000..b2b92c995 --- /dev/null +++ b/mediapipe/util/tracking/frame_selection_solution_evaluator.proto @@ -0,0 +1,27 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +message FrameSelectionSolutionEvaluatorOptions { + extensions 20000 to max; +} + +message FrameSelectionSolutionEvaluatorType { + // Class of type FrameSelectionSolution that computes the best row. + optional string class_name = 1 [default = "FrameSelectionSolutionEvaluator"]; + optional FrameSelectionSolutionEvaluatorOptions options = 2; +} diff --git a/mediapipe/util/tracking/image_util.cc b/mediapipe/util/tracking/image_util.cc new file mode 100644 index 000000000..cafdef079 --- /dev/null +++ b/mediapipe/util/tracking/image_util.cc @@ -0,0 +1,116 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/util/tracking/image_util.h" + +#include +#include + +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/region_flow.h" + +namespace mediapipe { + +// Returns median of the L1 color distance between img_1 and img_2 +float FrameDifferenceMedian(const cv::Mat& img_1, const cv::Mat& img_2) { + CHECK(img_1.size() == img_2.size()); + CHECK_EQ(img_1.channels(), img_2.channels()); + + std::vector color_diffs; + color_diffs.reserve(img_1.cols * img_1.rows); + const int channels = img_1.channels(); + for (int j = 0; j < img_1.rows; ++j) { + const uint8* src_1 = img_1.ptr(j); + const uint8* src_2 = img_2.ptr(j); + const int end_i = channels * img_1.cols; + const float inverse = 1.0f / channels; + for (int i = 0; i < end_i;) { + float color_diff = 0.0f; + for (int k = 0; k < channels; ++k, ++i) { + color_diff += + std::abs(static_cast(src_1[i]) - static_cast(src_2[i])); + } + color_diffs.push_back(color_diff * inverse); + } + } + + auto median = color_diffs.begin() + color_diffs.size() / 2; + std::nth_element(color_diffs.begin(), median, color_diffs.end()); + return *median; +} + +void JetColoring(int steps, std::vector* color_map) { + CHECK(color_map != nullptr); + color_map->resize(steps); + for (int i = 0; i < steps; ++i) { + const float frac = 2.0f * (i * (1.0f / steps) - 0.5f); + if (frac < -0.8f) { + (*color_map)[i] = Vector3_f(0, 0, 0.6f + (frac + 1.0) * 2.0f) * 255.0f; + } else if (frac >= -0.8f && frac < -0.25f) { + (*color_map)[i] = Vector3_f(0, (frac + 0.8f) * 1.82f, 1.0f) * 255.0f; + } else if (frac >= -0.25f && frac < 0.25f) { + (*color_map)[i] = + Vector3_f((frac + 0.25f) * 2.0f, 1.0f, 1.0 + (frac + 0.25f) * -2.0f) * + 255.0f; + } else if (frac >= 0.25f && frac < 0.8f) { + (*color_map)[i] = + Vector3_f(1.0f, 1.0f + (frac - 0.25f) * -1.81f, 0.0f) * 255.0f; + } else if (frac >= 0.8f) { + (*color_map)[i] = + Vector3_f(1.0f + (frac - 0.8f) * -2.0f, 0.0f, 0.0f) * 
255.0f; + } else { + LOG(ERROR) << "Out of bound value. Should not occur."; + } + } +} + +void RenderSaliency(const SalientPointFrame& salient_points, + const cv::Scalar& line_color, int line_thickness, + bool render_bounding_box, cv::Mat* image) { + // Visualize salient points. + for (const auto& point : salient_points.point()) { + if (point.weight() > 0) { + SalientPoint copy = point; + ScaleSalientPoint(image->cols, image->rows, ©); + Vector2_f pt(copy.norm_point_x(), copy.norm_point_y()); + + cv::ellipse(*image, cv::Point(pt.x(), pt.y()), + cv::Size(copy.norm_major(), copy.norm_minor()), + copy.angle() / M_PI * 180.0, + 0, // start angle + 360, // end angle + line_color, line_thickness); + + if (render_bounding_box) { + std::vector ellipse_bounding_box; + BoundingBoxFromEllipse(pt, copy.norm_major(), copy.norm_minor(), + copy.angle(), &ellipse_bounding_box); + + std::vector corners; + corners.reserve(4); + for (const Vector2_f& corner : ellipse_bounding_box) { + corners.emplace_back(cv::Point(corner.x(), corner.y())); + } + + for (int k = 0; k < 4; ++k) { + cv::line(*image, corners[k], corners[(k + 1) % 4], line_color, + line_thickness, CV_AA); + } + } + } + } +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/image_util.h b/mediapipe/util/tracking/image_util.h new file mode 100644 index 000000000..ba58d343b --- /dev/null +++ b/mediapipe/util/tracking/image_util.h @@ -0,0 +1,192 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_IMAGE_UTIL_H_ +#define MEDIAPIPE_UTIL_TRACKING_IMAGE_UTIL_H_ + +#include + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +// Returns median of the L1 color distance between img_1 and img_2. +float FrameDifferenceMedian(const cv::Mat& img_1, const cv::Mat& img_2); + +// Matlab's jet color map (returned assuming RGB channel order in [0, 1] +// normalized intensity domain). For details: http://goo.gl/gmHKZ +// Returned as map with num_entries entries. +void JetColoring(int num_entries, std::vector* color_map); + +// Draws a saliency point frame to a single frame. +// Optionally renders axis aligned bounding box for each SalientPointFrame. +void RenderSaliency(const SalientPointFrame& salient_points, + const cv::Scalar& line_color, int line_thickness, + bool render_bounding_box, cv::Mat* image); + +// Templated CopyBorder methods for increased speed. In-place border copy +// for specified Mat of type T with channels. Passed matrix is assumed to be of +// full size, that is we copy the content at [border, cols - 2 * border] x +// [border, rows - 2 * border] to the full size. +template +void CopyMatBorder(cv::Mat* mat); + +// Same as above for copying border only in X or Y +template +void CopyMatBorderX(cv::Mat* mat); +template +void CopyMatBorderY(cv::Mat* mat); + +template +void CopyMatBorder(cv::Mat* mat) { + const int width = mat->cols - 2 * border; + const int height = mat->rows - 2 * border; + + // Maximum values we clamp at to avoid going out of bound small images. + const int max_w = width - 1; + const int max_h = height - 1; + + // Top rows. 
+ for (int r = 0; r < border; ++r) { + const T* src_ptr = + mat->ptr(border + std::min(r, max_h)) + border * channels; + T* dst_ptr = mat->ptr(border - 1 - r); + + // Top left elems. + for (int i = 0; i < border; ++i, dst_ptr += channels) { + for (int j = 0; j < channels; ++j) { + dst_ptr[j] = src_ptr[std::min(max_w, border - 1 - i) * channels + j]; + } + } + + // src and dst should point to same column from here. + DCHECK_EQ(0, (src_ptr - dst_ptr) * sizeof(T) % mat->step[0]); + + // Top row copy. + memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0])); + src_ptr += width * channels; // Points one behind end. + dst_ptr += width * channels; + + // Top right elems. + for (int i = 0; i < border; ++i, dst_ptr += channels) { + if (i <= max_w) { + src_ptr -= channels; + } + for (int j = 0; j < channels; ++j) { + dst_ptr[j] = src_ptr[j]; + } + } + } + + // Left and right border. + for (int r = 0; r < height; ++r) { + // Get pointers to left most and right most column within image. + T* left_ptr = mat->ptr(r + border) + border * channels; + T* right_ptr = left_ptr + (width - 1) * channels; + for (int i = 0; i < border; ++i) { + for (int j = 0; j < channels; ++j) { + left_ptr[-(i + 1) * channels + j] = + left_ptr[std::min(max_w, i) * channels + j]; + right_ptr[(i + 1) * channels + j] = + right_ptr[-std::min(max_w, i) * channels + j]; + } + } + } + + // Bottom rows. + for (int r = 0; r < border; ++r) { + const T* src_ptr = mat->ptr(border + height - 1 - std::min(r, max_h)) + + border * channels; + T* dst_ptr = mat->ptr(border + height + r); + + // First elems. + for (int i = 0; i < border; ++i, dst_ptr += channels) { + for (int j = 0; j < channels; ++j) { + dst_ptr[j] = src_ptr[(border - 1 - std::min(max_w, i)) * channels + j]; + } + } + + // src and dst should point to same column from here. 
+ DCHECK_EQ(0, (dst_ptr - src_ptr) * sizeof(T) % mat->step[0]); + memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0])); + src_ptr += width * channels; // Points one behind the end. + dst_ptr += width * channels; + + // Top right elems. + for (int i = 0; i < border; ++i, dst_ptr += channels) { + if (i <= max_w) { + src_ptr -= channels; + } + for (int j = 0; j < channels; ++j) { + dst_ptr[j] = src_ptr[j]; + } + } + } +} + +template +void CopyMatBorderX(cv::Mat* mat) { + const int width = mat->cols - 2 * border; + const int height = mat->rows - 2 * border; + + // Maximum values we clamp at to avoid going out of bound small images. + const int max_w = width - 1; + + // Left and right border. + for (int r = 0; r < height; ++r) { + T* left_ptr = mat->ptr(r + border) + border * channels; + T* right_ptr = left_ptr + (width - 1) * channels; + for (int i = 0; i < border; ++i) { + for (int j = 0; j < channels; ++j) { + left_ptr[-(i + 1) * channels + j] = + left_ptr[std::min(i, max_w) * channels + j]; + right_ptr[(i + 1) * channels + j] = + right_ptr[-std::min(max_w, i) * channels + j]; + } + } + } +} + +template +void CopyMatBorderY(cv::Mat* mat) { + const int width = mat->cols - 2 * border; + const int height = mat->rows - 2 * border; + + // Maximum values we clamp at to avoid going out of bound small images. + const int max_h = height - 1; + + // Top rows. + for (int r = 0; r < border; ++r) { + const T* src_ptr = + mat->ptr(border + std::min(max_h, r)) + border * channels; + T* dst_ptr = mat->ptr(border - 1 - r) + border * channels; + memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0])); + } + + // Bottom rows. 
+ for (int r = 0; r < border; ++r) { + const T* src_ptr = mat->ptr(border + height - 1 - std::min(max_h, r)) + + border * channels; + T* dst_ptr = mat->ptr(border + height + r) + border * channels; + memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0])); + } +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_IMAGE_UTIL_H_ diff --git a/mediapipe/util/tracking/image_util_test.cc b/mediapipe/util/tracking/image_util_test.cc new file mode 100644 index 000000000..a4c4e0cc8 --- /dev/null +++ b/mediapipe/util/tracking/image_util_test.cc @@ -0,0 +1,93 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/image_util.h" + +#include "mediapipe/framework/port/gtest.h" + +namespace mediapipe { + +template +void TestCopyBorder(cv::Mat* full_size) { + full_size->setTo(0); + + const int width = full_size->cols - 2 * border; + // Fill center with per pixel value [1, 2, 3]. + for (int y = border; y < full_size->rows - border; ++y) { + float* row_ptr = full_size->ptr(y); + for (int x = border; x < full_size->cols - border; ++x) { + int value = x - border; + row_ptr[3 * x] = value + 0.1f; + row_ptr[3 * x + 1] = value + 0.2f; + row_ptr[3 * x + 2] = value + 0.3f; + } + } + + CopyMatBorder(full_size); + + // Main part should not be modified. 
+ for (int y = border; y < full_size->rows - border; ++y) { + float* row_ptr = full_size->ptr(y); + for (int x = border; x < full_size->cols - border; ++x) { + int value = x - border; + EXPECT_EQ(row_ptr[3 * x], value + 0.1f); + EXPECT_EQ(row_ptr[3 * x + 1], value + 0.2f); + EXPECT_EQ(row_ptr[3 * x + 2], value + 0.3f); + } + } + + for (int y = 0; y < full_size->rows; ++y) { + const float* row_ptr = full_size->ptr(y); + for (int x = 0; x < full_size->cols; ++x) { + // Memory copy, expect exact floating point equality. + int value = x - border; + if (x < border) { + // Cap to valid area. + value = std::min(width - 1, (border - 1) - x); + } else if (x >= full_size->cols - border) { + // Last column minus distance from frame boundary, capped + // to valid area. + value = std::max(0, width - 1 - (x - full_size->cols + border)); + } + EXPECT_EQ(row_ptr[3 * x], value + 0.1f); + EXPECT_EQ(row_ptr[3 * x + 1], value + 0.2f); + EXPECT_EQ(row_ptr[3 * x + 2], value + 0.3f); + } + } +} + +TEST(PushPullFilteringTest, CopyBorder) { + cv::Mat full_size(100, 50, CV_32FC3); + TestCopyBorder<1>(&full_size); + TestCopyBorder<2>(&full_size); + TestCopyBorder<3>(&full_size); + TestCopyBorder<4>(&full_size); + TestCopyBorder<5>(&full_size); +} + +TEST(PushPullFilteringTest, CopyBorderSmallFrame) { + cv::Mat full_size(3, 3, CV_32FC3); + TestCopyBorder<1>(&full_size); + + full_size.create(5, 5, CV_32FC3); + TestCopyBorder<2>(&full_size); + + full_size.create(7, 7, CV_32FC3); + TestCopyBorder<3>(&full_size); + + full_size.create(9, 9, CV_32FC3); + TestCopyBorder<4>(&full_size); +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/measure_time.cc b/mediapipe/util/tracking/measure_time.cc new file mode 100644 index 000000000..e628dff0f --- /dev/null +++ b/mediapipe/util/tracking/measure_time.cc @@ -0,0 +1,22 @@ +// Copyright 2019 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/measure_time.h" + +#ifdef SET_FLAG_MEASURE_TIME +// If set to true, outputs time measurements to LOG(INFO). +bool flags_measure_time = true; +#else +bool flags_measure_time = false; +#endif diff --git a/mediapipe/util/tracking/measure_time.h b/mediapipe/util/tracking/measure_time.h new file mode 100644 index 000000000..0351f4652 --- /dev/null +++ b/mediapipe/util/tracking/measure_time.h @@ -0,0 +1,169 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Helper class and macro to take time measurements within current scope. +// Takes time measurement within current scope. Outputs to LOG(INFO) if +// flag --measure_time is set or if build flag SET_FLAG_MEASURE_TIME is +// defined (add --copt=-DSET_FLAG_MEASURE_TIME to your build command). 
+// Additionally you can limit time measurements to specific files, +// via the flag +// --measure_time_filter="" +// Example: +// { // Scope to be measured +// MEASURE_TIME << "Some additional logging and answers : " << 42; +// ... // instructions. +// } + +#ifndef MEDIAPIPE_UTIL_TRACKING_MEASURE_TIME_H_ +#define MEDIAPIPE_UTIL_TRACKING_MEASURE_TIME_H_ + +#include +#include + +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/clock.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" + +extern bool flags_measure_time; + +namespace mediapipe { +class MeasureTimeFilter; +} // namespace mediapipe + +#define MEASURE_TIME \ + MEASURE_TIME_PRE_IMPL(flags_measure_time, __LINE__, __FILE__) + +// Add level of indirection to trigger __LINE__ macro expansion. +#define MEASURE_TIME_PRE_IMPL(show_output, line, file) \ + MEASURE_TIME_IMPL(show_output, line, file) + +#define MEASURE_TIME_IMPL(show_output, line, file) \ + std::unique_ptr scoped_wall_timer_##line; \ + const bool activated##line = show_output; \ + if (activated##line) { \ + static mediapipe::ScopedWallTimer::Accumulator* \ + scoped_wall_timer_accum_##line = \ + new mediapipe::ScopedWallTimer::Accumulator; \ + scoped_wall_timer_##line.reset(new mediapipe::ScopedWallTimer( \ + file, line, show_output, scoped_wall_timer_accum_##line)); \ + } \ + if (activated##line) /* NOLINT */ \ + scoped_wall_timer_##line->stream() + +namespace mediapipe { + +class ScopedWallTimer { + public: + // Helper class for accumulating time across multiple calls to a scoped wall + // timer. Thread-safe (except on ANDROID, for which Mutex is mocked out). + class Accumulator { + public: + Accumulator() : accum_time_(0.0f), count_(0) {} + Accumulator(const Accumulator&) = delete; + Accumulator& operator=(const Accumulator&) = delete; + + // Accumulates passed_time into accumulators. 
Returns total time and number + // of calls made. + void Accumulate(double passed_time, double* accum_time, int* count) { + absl::MutexLock lock(&mutex_); + accum_time_ += passed_time; + ++count_; + *accum_time = accum_time_; + *count = count_; + } + + private: + double accum_time_; + int count_; + absl::Mutex mutex_; + }; + + // Creates a new ScopedWallTimer for current file and line. LogMessage is only + // initialized if show_output is set to true. + ScopedWallTimer(const char* file, int line, bool show_output, + Accumulator* accumulator) + : file_(file), + line_(line), + show_output_(show_output), + accumulator_(accumulator) { + if (show_output_) { + CHECK(accumulator_); + start_time_ = GetWallTime(); + } + } + ScopedWallTimer(const ScopedWallTimer&) = delete; + ScopedWallTimer& operator=(const ScopedWallTimer&) = delete; + + // Destructor measures time and outputs to stream. + ~ScopedWallTimer() { + if (show_output_) { + double passed_time = GetWallTime() - start_time_; + double accum_time = 0.0; + int count = 0; + accumulator_->Accumulate(passed_time, &accum_time, &count); + LOG(INFO) << stream_.str() << " TIMES: [Curr: " << passed_time * 1e-6 + << " ms, " + << "Avg: " << accum_time * 1e-6 / std::max(1, count) << " ms, " + << count << " calls]"; + } + } + + std::ostream& stream() { return stream_; } + + private: + const char* file_; + int line_; + bool show_output_; + // We need to buffer information passed via stream operator << + // While LogMessage is adequate for this, no good equivalent exists on + // Android, so we employ a portable ostringstream for buffering. 
+ std::ostringstream stream_; + int64 start_time_; + Accumulator* accumulator_; + + int64 GetWallTime() { return absl::GetCurrentTimeNanos(); } +}; + +class MeasureTimeFilter { + public: + static const MeasureTimeFilter* get() { + static MeasureTimeFilter instance; + return &instance; + } + MeasureTimeFilter(const MeasureTimeFilter&) = delete; + MeasureTimeFilter& operator=(const MeasureTimeFilter&) = delete; + + bool Matches(const std::string& item) const { + for (const std::string& match_item : match_items_) { + if (item.find(match_item) != std::string::npos) { + return true; + } + } + return false; + } + + private: + explicit MeasureTimeFilter() {} + explicit MeasureTimeFilter(const std::string& filter) { + match_items_ = absl::StrSplit(filter, absl::ByChar(',')); + } + std::vector match_items_; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_MEASURE_TIME_H_ diff --git a/mediapipe/util/tracking/motion_analysis.cc b/mediapipe/util/tracking/motion_analysis.cc new file mode 100644 index 000000000..ff4d3d531 --- /dev/null +++ b/mediapipe/util/tracking/motion_analysis.cc @@ -0,0 +1,850 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/util/tracking/motion_analysis.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/camera_motion.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/image_util.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/motion_saliency.pb.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" +#include "mediapipe/util/tracking/region_flow_computation.h" +#include "mediapipe/util/tracking/region_flow_computation.pb.h" +#include "mediapipe/util/tracking/region_flow_visualization.h" + +namespace mediapipe { + +MotionAnalysis::MotionAnalysis(const MotionAnalysisOptions& options, + int frame_width, int frame_height) + : options_(options), + frame_width_(frame_width), + frame_height_(frame_height) { + // Init options by policy. + InitPolicyOptions(); + // Merge back in any overriden options. + options_.MergeFrom(options); + + region_flow_computation_.reset(new RegionFlowComputation( + options_.flow_options(), frame_width_, frame_height_)); + motion_estimation_.reset(new MotionEstimation(options_.motion_options(), + frame_width_, frame_height_)); + + if (options_.compute_motion_saliency()) { + motion_saliency_.reset(new MotionSaliency(options_.saliency_options(), + frame_width_, frame_height_)); + + // Compute overlap needed between clips if filtering or smoothing of + // saliency is requested. + if (options_.select_saliency_inliers()) { + // 1.65 sigmas in each direction = 90% of variance is captured. 
+ overlap_size_ = std::max( + overlap_size_, options_.saliency_options().selection_frame_radius()); + } + + if (options_.filter_saliency()) { + overlap_size_ = std::max( + overlap_size_, + options_.saliency_options().filtering_sigma_time() * 1.65f); + } + } + + long_feature_stream_.reset(new LongFeatureStream); + + frame_num_ = 0; + + // Determine if feature descriptors need to be computed. + // Required for irls smoothing, overlay detection and mixture homographies. + const bool compute_mixtures = + options_.motion_options().mix_homography_estimation() != + MotionEstimationOptions::ESTIMATION_HOMOG_MIX_NONE; + + const bool use_spatial_bias = + options_.motion_options().estimation_policy() == + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS && + options_.motion_options().long_feature_bias_options().use_spatial_bias(); + + compute_feature_descriptors_ = + options_.post_irls_smoothing() || + options_.motion_options().overlay_detection() || compute_mixtures || + use_spatial_bias; + + if (compute_feature_descriptors_) { + CHECK_EQ(RegionFlowComputationOptions::FORMAT_RGB, + options_.flow_options().image_format()) + << "Feature descriptors only support RGB currently."; + prev_frame_.reset(new cv::Mat(frame_height_, frame_width_, CV_8UC3)); + } + + // Setup streaming buffer. By default we buffer features and motion. + // If saliency is computed, also buffer saliency and filtered/smoothed + // output_saliency. + std::vector data_config{ + TaggedPointerType("features"), + TaggedPointerType("motion")}; + std::vector data_config_saliency = data_config; + data_config_saliency.push_back( + TaggedPointerType("saliency")); + data_config_saliency.push_back( + TaggedPointerType("output_saliency")); + + // Store twice the overlap. + buffer_.reset(new StreamingBuffer( + options_.compute_motion_saliency() ? 
data_config_saliency : data_config, + 2 * overlap_size_)); +} + +void MotionAnalysis::InitPolicyOptions() { + auto* flow_options = options_.mutable_flow_options(); + auto* tracking_options = flow_options->mutable_tracking_options(); + auto* motion_options = options_.mutable_motion_options(); + auto* feature_bias_options = + motion_options->mutable_long_feature_bias_options(); + auto* translation_bounds = + motion_options->mutable_stable_translation_bounds(); + auto* similarity_bounds = motion_options->mutable_stable_similarity_bounds(); + auto* homography_bounds = motion_options->mutable_stable_homography_bounds(); + + switch (options_.analysis_policy()) { + case MotionAnalysisOptions::ANALYSIS_POLICY_LEGACY: + break; + + case MotionAnalysisOptions::ANALYSIS_POLICY_VIDEO: + // Long track settings. Temporally consistent. + options_.set_estimation_clip_size(64); + + tracking_options->set_internal_tracking_direction( + TrackingOptions::FORWARD); + tracking_options->set_tracking_policy( + TrackingOptions::POLICY_LONG_TRACKS); + + feature_bias_options->set_use_spatial_bias(false); + feature_bias_options->set_seed_priors_from_bias(true); + + similarity_bounds->set_min_inlier_fraction(0.15f); + homography_bounds->set_min_inlier_coverage(0.25f); + flow_options->set_verify_long_features(false); + + // Better features. + tracking_options->set_adaptive_features_levels(3); + tracking_options->set_use_cv_tracking_algorithm(true); + + // Speed. 
+ flow_options->set_downsample_mode( + RegionFlowComputationOptions::DOWNSAMPLE_BY_SCHEDULE); + + motion_options->set_estimation_policy( + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS); + motion_options->set_feature_density_normalization(true); + motion_options->set_overlay_detection(true); + motion_options->set_domain_limited_irls_scaling(true); + motion_options->set_irls_weights_preinitialized(true); + motion_options->mutable_irls_initialization()->set_activated(true); + motion_options->mutable_long_feature_initialization()->set_activated( + true); + + break; + + case MotionAnalysisOptions::ANALYSIS_POLICY_VIDEO_MOBILE: + // Long track settings. Temporally consistent. + options_.set_estimation_clip_size(32); + tracking_options->set_internal_tracking_direction( + TrackingOptions::FORWARD); + tracking_options->set_tracking_policy( + TrackingOptions::POLICY_LONG_TRACKS); + + motion_options->set_estimation_policy( + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS); + motion_options->set_feature_density_normalization(true); + motion_options->set_domain_limited_irls_scaling(true); + motion_options->mutable_irls_initialization()->set_activated(true); + motion_options->mutable_long_feature_initialization()->set_activated( + true); + + feature_bias_options->set_use_spatial_bias(false); + feature_bias_options->set_seed_priors_from_bias(true); + + similarity_bounds->set_inlier_threshold(2.0); + similarity_bounds->set_min_inlier_fraction(0.15f); + + // Speed for mobile. 
+ tracking_options->set_tracking_window_size(8); + tracking_options->set_tracking_iterations(8); + tracking_options->set_fractional_tracking_distance(0.1); + tracking_options->set_reuse_features_max_frame_distance(15); + tracking_options->set_max_features(500); + tracking_options->set_use_cv_tracking_algorithm(true); + + flow_options->set_downsample_mode( + RegionFlowComputationOptions::DOWNSAMPLE_TO_MIN_SIZE); + flow_options->set_round_downsample_factor(true); + flow_options->set_downsampling_size(256); + flow_options->set_pre_blur_sigma(0); + flow_options->set_top_inlier_sets(1); + flow_options->set_ransac_rounds_per_region(10); + flow_options->mutable_visual_consistency_options() + ->set_compute_consistency(false); + flow_options->set_verify_long_features(false); + + // Avoid resetting features if a large amount is flagged. + // In that case verify feature tracks to avoid pertubing the estimation. + flow_options->set_verify_long_feature_acceleration(true); + flow_options->set_verification_distance(5.0); + flow_options->set_verify_long_feature_trigger_ratio(0.1); + + motion_options->set_use_exact_homography_estimation(false); + motion_options->set_use_highest_accuracy_for_normal_equations(false); + + break; + + case MotionAnalysisOptions::ANALYSIS_POLICY_CAMERA_MOBILE: + // Long track settings. 
+ tracking_options->set_internal_tracking_direction( + TrackingOptions::FORWARD); + tracking_options->set_tracking_policy( + TrackingOptions::POLICY_LONG_TRACKS); + + motion_options->set_estimation_policy( + MotionEstimationOptions::TEMPORAL_IRLS_MASK); + motion_options->set_feature_density_normalization(true); + motion_options->set_domain_limited_irls_scaling(true); + motion_options->mutable_irls_initialization()->set_activated(true); + motion_options->mutable_irls_initialization()->set_rounds(50); + + feature_bias_options->set_use_spatial_bias(false); + feature_bias_options->set_seed_priors_from_bias(true); + + similarity_bounds->set_inlier_threshold(2.0); + similarity_bounds->set_min_inlier_fraction(0.15f); + + // Speed for mobile. + tracking_options->set_tracking_window_size(8); + tracking_options->set_tracking_iterations(8); + tracking_options->set_fractional_tracking_distance(0.1); + tracking_options->set_reuse_features_max_frame_distance(10); + tracking_options->set_reuse_features_min_survived_frac(0.6); + tracking_options->set_max_features(240); + tracking_options->set_adaptive_tracking_distance(true); + tracking_options->set_use_cv_tracking_algorithm(true); + + // Assumes downsampled input. + flow_options->set_pre_blur_sigma(0); + flow_options->set_top_inlier_sets(1); + flow_options->set_ransac_rounds_per_region(10); + flow_options->mutable_visual_consistency_options() + ->set_compute_consistency(false); + flow_options->set_verify_long_features(false); + + motion_options->set_use_exact_homography_estimation(false); + motion_options->set_use_highest_accuracy_for_normal_equations(false); + + // Low latency. + options_.set_estimation_clip_size(1); + break; + + case MotionAnalysisOptions::ANALYSIS_POLICY_HYPERLAPSE: + // Long track settings. Temporally consistent. 
+ options_.set_estimation_clip_size(64); + + tracking_options->set_internal_tracking_direction( + TrackingOptions::FORWARD); + tracking_options->set_tracking_policy( + TrackingOptions::POLICY_LONG_TRACKS); + + feature_bias_options->set_use_spatial_bias(false); + feature_bias_options->set_seed_priors_from_bias(true); + + // Relaxed stability bounds. + translation_bounds->set_max_motion_stdev(0.15); + translation_bounds->set_max_acceleration(60.0); + translation_bounds->set_frac_max_motion_magnitude(0.25); + + similarity_bounds->set_min_inlier_fraction(0.02f); + similarity_bounds->set_min_inliers(5); + similarity_bounds->set_lower_scale(0.5); + similarity_bounds->set_upper_scale(2.0); + similarity_bounds->set_limit_rotation(0.5); + + homography_bounds->set_lower_scale(0.5); + homography_bounds->set_upper_scale(2.0); + homography_bounds->set_limit_rotation(0.5); + homography_bounds->set_limit_perspective(0.004); + homography_bounds->set_min_inlier_coverage(0.1); + + // Relax constraining of homographies. + motion_options->set_lin_sim_inlier_threshold(0.02); + // Anticipate much higher registration errors. + motion_options->set_irls_motion_magnitude_fraction(0.2); + + flow_options->set_verify_long_features(false); + + // Better features. + tracking_options->set_adaptive_features_levels(3); + + // Speed. + tracking_options->set_use_cv_tracking_algorithm(true); + flow_options->set_downsample_mode( + RegionFlowComputationOptions::DOWNSAMPLE_BY_SCHEDULE); + + // Less quality features. + flow_options->set_absolute_inlier_error_threshold(4); + flow_options->set_relative_inlier_error_threshold(0.2); + + motion_options->set_estimation_policy( + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS); + motion_options->set_feature_density_normalization(true); + + // TODO: This needs to be activated and then tighten relaxed + // thresholds above again. 
+ // motion_options->set_domain_limited_irls_scaling(true); + + motion_options->set_irls_weights_preinitialized(true); + motion_options->mutable_irls_initialization()->set_activated(true); + + // More weight towards long feature tracks. + motion_options->mutable_long_feature_initialization()->set_activated( + true); + motion_options->mutable_long_feature_initialization() + ->set_min_length_percentile(0.975); + motion_options->mutable_long_feature_initialization() + ->set_upweight_multiplier(50); + + // 2nd pass, no need to do overlay again. + motion_options->set_overlay_detection(false); + + break; + } +} + +bool MotionAnalysis::AddFrame(const cv::Mat& frame, int64 timestamp_usec, + RegionFlowFeatureList* feature_list) { + return AddFrameWithSeed(frame, timestamp_usec, Homography(), feature_list); +} + +bool MotionAnalysis::AddFrameWithSeed(const cv::Mat& frame, + int64 timestamp_usec, + const Homography& initial_transform, + RegionFlowFeatureList* feature_list) { + return AddFrameGeneric(frame, timestamp_usec, initial_transform, + nullptr, // rejection_transform + nullptr, // external features + nullptr, // feature modification function + feature_list); +} + +bool MotionAnalysis::AddFrameGeneric( + const cv::Mat& frame, int64 timestamp_usec, + const Homography& initial_transform, const Homography* rejection_transform, + const RegionFlowFeatureList* external_features, + std::function* modify_features, + RegionFlowFeatureList* output_feature_list) { + // Don't check input sizes here, RegionFlowComputation does that based + // on its internal options. + CHECK(feature_computation_) << "Calls to AddFrame* can NOT be mixed " + << "with AddFeatures"; + + // Compute RegionFlow. + { + MEASURE_TIME << "CALL RegionFlowComputation::AddImage"; + if (!region_flow_computation_->AddImageWithSeed(frame, timestamp_usec, + initial_transform)) { + LOG(ERROR) << "Error while computing region flow."; + return false; + } + } + + // Buffer features. 
+ std::unique_ptr feature_list; + { + MEASURE_TIME << "CALL RegionFlowComputation::RetrieveRegionFlowFeatureList"; + const bool compute_feature_match_descriptors = + compute_feature_descriptors_ && frame_num_ > 0; + + // Determine which region flow should be computed in case multi frame + // tracking has been requested. + int max_track_index = 0; + if (options_.flow_options().tracking_options().tracking_policy() == + TrackingOptions::POLICY_MULTI_FRAME) { + max_track_index = std::min( + options_.flow_options().tracking_options().multi_frames_to_track() - + 1, + options_.track_index()); + } + + feature_list.reset( + region_flow_computation_->RetrieveMultiRegionFlowFeatureList( + std::min(std::max(0, frame_num_ - 1), max_track_index), + compute_feature_descriptors_, compute_feature_match_descriptors, + &frame, + compute_feature_match_descriptors ? prev_frame_.get() : nullptr)); + + if (feature_list == nullptr) { + LOG(ERROR) << "Error retrieving feature list."; + return false; + } + } + + if (external_features) { + constexpr int kTrackIdShift = 1 << 20; + for (const auto& external_feat : external_features->feature()) { + auto* new_feat = feature_list->add_feature(); + *new_feat = external_feat; + if (new_feat->track_id() >= 0) { + new_feat->set_track_id(new_feat->track_id() + kTrackIdShift); + } + } + } + + if (rejection_transform) { + RegionFlowFeatureList tmp_list; + tmp_list.mutable_feature()->Swap(feature_list->mutable_feature()); + + for (const auto& feature : tmp_list.feature()) { + const Vector2_f diff = + TransformPoint(*rejection_transform, FeatureLocation(feature)) - + FeatureMatchLocation(feature); + if (diff.Norm() < options_.rejection_transform_threshold()) { + *feature_list->add_feature() = feature; + } + } + } + + if (output_feature_list) { + *output_feature_list = *feature_list; + } + + if (modify_features) { + (*modify_features)(feature_list.get()); + } + + buffer_->EmplaceDatum("features", feature_list.release()); + + // Store frame for next call. 
+ if (compute_feature_descriptors_) { + frame.copyTo(*prev_frame_); + } + + ++frame_num_; + + return true; +} + +void MotionAnalysis::AddFeatures(const RegionFlowFeatureList& features) { + feature_computation_ = false; + buffer_->EmplaceDatum("features", new RegionFlowFeatureList(features)); + + ++frame_num_; +} + +void MotionAnalysis::EnqueueFeaturesAndMotions( + const RegionFlowFeatureList& features, const CameraMotion& motion) { + feature_computation_ = false; + CHECK(buffer_->HaveEqualSize({"motion", "features"})) + << "Can not be mixed with other Add* calls"; + buffer_->EmplaceDatum("features", new RegionFlowFeatureList(features)); + buffer_->EmplaceDatum("motion", new CameraMotion(motion)); +} + +cv::Mat MotionAnalysis::GetGrayscaleFrameFromResults() { + return region_flow_computation_->GetGrayscaleFrameFromResults(); +} + +int MotionAnalysis::GetResults( + bool flush, std::vector>* features, + std::vector>* camera_motion, + std::vector>* saliency) { + MEASURE_TIME << "GetResults"; + + const int num_features_lists = buffer_->BufferSize("features"); + const int num_new_feature_lists = num_features_lists - overlap_start_; + CHECK_GE(num_new_feature_lists, 0); + + if (!flush && num_new_feature_lists < options_.estimation_clip_size()) { + // Nothing to compute, return. + return 0; + } + + const bool compute_saliency = options_.compute_motion_saliency(); + CHECK_EQ(compute_saliency, saliency != nullptr) + << "Computing saliency requires saliency output and vice versa"; + + // Estimate motions for newly buffered RegionFlowFeatureLists, which also + // computes IRLS feature weights for foreground estimation, if needed + // (otherwise could be externally added). 
+ const int num_motions_to_compute = + buffer_->BufferSize("features") - buffer_->BufferSize("motion"); + + if (num_motions_to_compute > 0) { + std::vector camera_motions; + std::vector feature_lists; + for (int k = overlap_start_; k < num_features_lists; ++k) { + feature_lists.push_back( + buffer_->GetMutableDatum("features", k)); + } + + // TODO: Result should be vector of unique_ptr. + motion_estimation_->EstimateMotionsParallel( + options_.post_irls_smoothing(), &feature_lists, &camera_motions); + + // Add solution to buffer. + for (const auto& motion : camera_motions) { + buffer_->EmplaceDatum("motion", new CameraMotion(motion)); + } + } + + CHECK(buffer_->HaveEqualSize({"features", "motion"})); + + if (compute_saliency) { + ComputeSaliency(); + } + + return OutputResults(flush, features, camera_motion, saliency); +} + +int MotionAnalysis::OutputResults( + bool flush, std::vector>* features, + std::vector>* camera_motion, + std::vector>* saliency) { + const bool compute_saliency = options_.compute_motion_saliency(); + CHECK_EQ(compute_saliency, saliency != nullptr) + << "Computing saliency requires saliency output and vice versa"; + CHECK(buffer_->HaveEqualSize({"features", "motion"})); + + // Discard prev. overlap (already output, just used for filtering here). + buffer_->DiscardData(buffer_->AllTags(), prev_overlap_start_); + prev_overlap_start_ = 0; + + // Output only frames not part of the overlap. + const int num_output_frames = + std::max(0, buffer_->MaxBufferSize() - (flush ? 0 : overlap_size_)); + + if (features) { + features->reserve(num_output_frames); + } + if (camera_motion) { + camera_motion->reserve(num_output_frames); + } + if (saliency) { + saliency->reserve(num_output_frames); + } + + // Need to retain twice the overlap, for frames in that segment we need + // to create a copy. + const int new_overlap_start = + std::max(0, num_output_frames - (flush ? 0 : overlap_size_)); + + // Populate output. 
+ for (int k = 0; k < num_output_frames; ++k) { + std::unique_ptr out_features; + std::unique_ptr out_motion; + std::unique_ptr out_saliency; + + if (k >= new_overlap_start) { + // Create copy. + out_features.reset(new RegionFlowFeatureList( + *buffer_->GetDatum("features", k))); + out_motion.reset( + new CameraMotion(*buffer_->GetDatum("motion", k))); + } else { + // Release datum. + out_features = + buffer_->ReleaseDatum("features", k); + out_motion = buffer_->ReleaseDatum("motion", k); + } + + // output_saliency is temporary so we never need to buffer it. + if (compute_saliency) { + out_saliency = + buffer_->ReleaseDatum("output_saliency", k); + } + + if (options_.subtract_camera_motion_from_features()) { + std::vector feature_view{out_features.get()}; + SubtractCameraMotionFromFeatures({*out_motion}, &feature_view); + } + + if (features != nullptr) { + features->push_back(std::move(out_features)); + } + if (camera_motion != nullptr) { + camera_motion->push_back(std::move(out_motion)); + } + if (saliency != nullptr) { + saliency->push_back(std::move(out_saliency)); + } + } + + // Reset for next chunk. + prev_overlap_start_ = num_output_frames - new_overlap_start; + CHECK_GE(prev_overlap_start_, 0); + + CHECK(buffer_->TruncateBuffer(flush)); + + overlap_start_ = buffer_->MaxBufferSize(); + return num_output_frames; +} + +void MotionAnalysis::RenderResults(const RegionFlowFeatureList& feature_list, + const CameraMotion& motion, + const SalientPointFrame* saliency, + cv::Mat* rendered_results) { +#ifndef NO_RENDERING + CHECK(rendered_results != nullptr); + CHECK_EQ(frame_width_, rendered_results->cols); + CHECK_EQ(frame_height_, rendered_results->rows); + + const auto viz_options = options_.visualization_options(); + + // Visualize flow features if requested. 
+ if (viz_options.visualize_region_flow_features()) { + cv::Scalar inlier_color(0, 255, 0); + cv::Scalar outlier_color(255, 0, 0); + if (feature_list.long_tracks()) { + long_feature_stream_->AddFeatures(feature_list, + true, // Check connectivity. + true); // Purge non present ones. + + VisualizeLongFeatureStream( + *long_feature_stream_, inlier_color, outlier_color, + viz_options.min_long_feature_track(), + viz_options.max_long_feature_points(), 1.0f, 1.0f, rendered_results); + } else { + VisualizeRegionFlowFeatures(feature_list, inlier_color, outlier_color, + true, 1.0f, 1.0f, rendered_results); + } + } + + if (saliency != nullptr && viz_options.visualize_salient_points()) { + static const cv::Scalar kColor(255, 0, 0); + RenderSaliency(*saliency, kColor, viz_options.line_thickness(), false, + rendered_results); + } + + if (viz_options.visualize_blur_analysis_region()) { + VisualizeBlurAnalysisRegions(rendered_results); + } + + if (viz_options.visualize_stats()) { + // Output general stats. 
+ std::string hud_text = absl::StrFormat( + "H-cvg %.2f | H-err %4.2f | Avg.t %3.1f | dx: %+2.1f dy: %+2.1f " + "Feat# %4d | %s | ", + motion.homography_inlier_coverage(), motion.average_homography_error(), + motion.average_magnitude(), motion.translation().dx(), + motion.translation().dy(), feature_list.feature_size(), + CameraMotionTypeToString(motion)); + hud_text += CameraMotionFlagToString(motion); + + const float text_scale = frame_width_ * 5.e-4; + cv::putText(*rendered_results, hud_text, + cv::Point(0.02 * frame_width_, 0.975 * frame_height_), + cv::FONT_HERSHEY_SIMPLEX, text_scale, cv::Scalar::all(255), + text_scale * 3, CV_AA); + + cv::putText(*rendered_results, + absl::StrFormat("%6d", motion.timestamp_usec() / 1000), + cv::Point(0.9 * frame_width_, 0.05 * frame_height_), + cv::FONT_HERSHEY_SIMPLEX, text_scale, cv::Scalar::all(255), + text_scale * 3, CV_AA); + } +#else + LOG(FATAL) << "Code stripped out because of NO_RENDERING"; +#endif +} + +void MotionAnalysis::ComputeDenseForeground( + const RegionFlowFeatureList& feature_list, + const CameraMotion& camera_motion, cv::Mat* foreground_mask) { + const auto& foreground_options = options_.foreground_options(); + + if (foreground_push_pull_ == nullptr) { + foreground_push_pull_.reset( + new PushPullFilteringC1(cv::Size(frame_width_, frame_height_), + PushPullFilteringC1::BINOMIAL_5X5, false, + nullptr, // Gaussian filter weights only. + nullptr, // No mip map visualizer. + nullptr)); // No weight adjustment. + } + + // Determine foreground weights for each features. + std::vector foreground_weights; + ForegroundWeightsFromFeatures( + feature_list, foreground_options.foreground_threshold(), + foreground_options.foreground_gamma(), + foreground_options.threshold_coverage_scaling() ? &camera_motion + : nullptr, + &foreground_weights); + + // Setup push pull map (with border). Ensure constructor used the right type. 
+ CHECK(foreground_push_pull_->filter_type() == + PushPullFilteringC1::BINOMIAL_5X5 || + foreground_push_pull_->filter_type() == + PushPullFilteringC1::GAUSSIAN_5X5); + + cv::Mat foreground_map(frame_height_ + 4, frame_width_ + 4, CV_32FC2); + std::vector feature_locations; + std::vector> feature_irls; + + for (int feat_idx = 0; feat_idx < foreground_weights.size(); ++feat_idx) { + // Skip marked outliers. + if (foreground_weights[feat_idx] == 0) { + continue; + } + + feature_locations.push_back( + FeatureLocation(feature_list.feature(feat_idx))); + feature_irls.push_back(cv::Vec(foreground_weights[feat_idx])); + } + + // Run push pull. + foreground_push_pull_->PerformPushPull(feature_locations, feature_irls, 0.2, + cv::Point2i(0, 0), + 0, // Default read out level. + nullptr, // Uniform weights. + nullptr, // No bilateral term. + &foreground_map); + + // Convert to grayscale output. + foreground_mask->create(frame_height_, frame_width_, CV_8U); + for (int i = 0; i < frame_height_; ++i) { + const float* src_ptr = foreground_map.ptr(i); + uint8* dst_ptr = foreground_mask->ptr(i); + for (int j = 0; j < frame_width_; ++j) { + // Result is in first channel (second is confidence). + dst_ptr[j] = + std::max(0, std::min(255, static_cast(src_ptr[2 * j] * 255.0f))); + } + } +} + +void MotionAnalysis::VisualizeDenseForeground(const cv::Mat& foreground_mask, + cv::Mat* output) { + CHECK(output != nullptr); + CHECK(foreground_mask.size() == output->size()); + // Map foreground measure to color (green by default). + std::vector color_map; + if (options_.visualization_options().foreground_jet_coloring()) { + JetColoring(1000, &color_map); + } else { + color_map.resize(1000, Vector3_f(0, 255, 0)); + } + + auto clamp = [](int value) -> int { + return std::max(0, std::min(255, value)); + }; + + // Burn-in alpha compositing. 
+ const float alpha = 1.3f; + for (int i = 0; i < frame_height_; ++i) { + uint8* image_ptr = output->ptr(i); + const uint8* foreground_ptr = foreground_mask.ptr(i); + + for (int j = 0; j < frame_width_; ++j) { + const float norm_foreground = foreground_ptr[j] * (1.0 / 255.0f); + const float foreground = norm_foreground * alpha; + const float alpha_denom = 1.0f / (1.0f + foreground); + + Vector3_f color = + color_map[static_cast(norm_foreground * (color_map.size() - 1))]; + + image_ptr[3 * j] = + clamp((image_ptr[3 * j] + color[0] * foreground) * alpha_denom); + image_ptr[3 * j + 1] = + clamp((image_ptr[3 * j + 1] + color[1] * foreground) * alpha_denom); + image_ptr[3 * j + 2] = + clamp((image_ptr[3 * j + 2] + color[2] * foreground) * alpha_denom); + } + } +} + +void MotionAnalysis::VisualizeBlurAnalysisRegions(cv::Mat* input_view) { + CHECK(input_view != nullptr); + + cv::Mat intensity; + cv::cvtColor(*input_view, intensity, CV_RGB2GRAY); + cv::Mat corner_values; + cv::cornerMinEigenVal(intensity, corner_values, 3); + + cv::Mat mask; + region_flow_computation_->ComputeBlurMask(*input_view, &corner_values, &mask); + + cv::Mat mask_3c; + cv::cvtColor(mask, mask_3c, CV_GRAY2RGB); + cv::addWeighted(*input_view, 0.5, mask_3c, 0.5, -128, *input_view); +} + +void MotionAnalysis::ComputeSaliency() { + MEASURE_TIME << "Saliency computation."; + CHECK_EQ(overlap_start_, buffer_->BufferSize("saliency")); + + const int num_features_lists = buffer_->BufferSize("features"); + + // Compute saliency only for newly buffered RegionFlowFeatureLists. + for (int k = overlap_start_; k < num_features_lists; ++k) { + std::vector foreground_weights; + ForegroundWeightsFromFeatures( + *buffer_->GetDatum("features", k), + options_.foreground_options().foreground_threshold(), + options_.foreground_options().foreground_gamma(), + options_.foreground_options().threshold_coverage_scaling() + ? 
buffer_->GetDatum("motion", k) + : nullptr, + &foreground_weights); + + std::unique_ptr saliency(new SalientPointFrame()); + motion_saliency_->SaliencyFromFeatures( + *buffer_->GetDatum("features", k), + &foreground_weights, saliency.get()); + + buffer_->AddDatum("saliency", std::move(saliency)); + } + + CHECK(buffer_->HaveEqualSize({"features", "motion", "saliency"})); + + // Clear output saliency and copy from saliency. + buffer_->DiscardDatum("output_saliency", + buffer_->BufferSize("output_saliency")); + + for (int k = 0; k < buffer_->BufferSize("saliency"); ++k) { + std::unique_ptr copy(new SalientPointFrame()); + *copy = *buffer_->GetDatum("saliency", k); + buffer_->AddDatum("output_saliency", std::move(copy)); + } + + // Create view. + std::vector saliency_view = + buffer_->GetMutableDatumVector("output_saliency"); + + // saliency_frames are filtered after this point and ready for output. + if (options_.select_saliency_inliers()) { + motion_saliency_->SelectSaliencyInliers(&saliency_view, false); + } + + if (options_.filter_saliency()) { + motion_saliency_->FilterMotionSaliency(&saliency_view); + } +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/motion_analysis.h b/mediapipe/util/tracking/motion_analysis.h new file mode 100644 index 000000000..d2ed44d06 --- /dev/null +++ b/mediapipe/util/tracking/motion_analysis.h @@ -0,0 +1,229 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Module for performing motion analysis on a video stream including computing +// locally filtered (robust) feature tracking, camera motion estimation, and +// dense foreground saliency estimation. +// Module buffers frames internally (using an adaptive overlap to achieve +// temporal consistency): +// +// Usage example: +// +// MotionAnalysisOptions options(); +// // Should be always be a multiple 8 for optimal parallel performance +// options.set_estimation_clip_size(16); +// MotionAnalysis motion_analysis(options, 960, 540); + +// std::vector input_frames(N); +// // Define output vectors. +// std::vector> features; +// std::vector> camera_motion; +// std::vector> saliency; +// std::vector rendered_results); // Should to be initialized with +// // frame. +// +// for (int k = 0; k < N; ++k) { +// motion_analysis.AddFrame(input_frames[k], 0); +// // Outputs results, if new ones are available. +// // Output will be all of the same lengths (Length returned by function). +// if (motion_analysis.GetResults(k + 1 == N, // Flush, force output. +// &features, +// &camera_motion, +// &saliency) > 0) { // Optional. +// // Optionally render at i'th frame. +// motion_analysis.RenderResults(*features[i], +// *camera_motion[i], +// saliency[i].get(), +// &rendered_results[i]); +// +// // Output results... 
+// } +// } + +#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_ANALYSIS_H_ +#define MEDIAPIPE_UTIL_TRACKING_MOTION_ANALYSIS_H_ + +#include +#include +#include + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/motion_analysis.pb.h" +#include "mediapipe/util/tracking/motion_estimation.h" +#include "mediapipe/util/tracking/motion_estimation.pb.h" +#include "mediapipe/util/tracking/motion_saliency.h" +#include "mediapipe/util/tracking/push_pull_filtering.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" +#include "mediapipe/util/tracking/region_flow_computation.h" +#include "mediapipe/util/tracking/streaming_buffer.h" + +namespace mediapipe { + +typedef PushPullFiltering<1, FilterWeightMultiplierOne> PushPullFlowC1; + +class MotionAnalysis { + public: + MotionAnalysis(const MotionAnalysisOptions& options, int frame_width, + int frame_height); + ~MotionAnalysis() = default; + MotionAnalysis(const MotionAnalysis&) = delete; + MotionAnalysis& operator=(const MotionAnalysis&) = delete; + + // Call with every frame. Timestamp is optional (set to zero if not needed). + // Optionally outputs list of features extracted from this frame. + // Returns true on success. + bool AddFrame(const cv::Mat& frame, int64 timestamp_usec, + RegionFlowFeatureList* feature_list = nullptr); + + // Same as above, but uses specified initial transform to seed + // feature locations. + bool AddFrameWithSeed(const cv::Mat& frame, int64 timestamp_usec, + const Homography& initial_transform, + RegionFlowFeatureList* feature_list = nullptr); + + // Generic function to perform motion analysis on the passed frame, + // with initial_transform used as seed. 
+ // Optionally accepts external_feature to be added to the computed ones, + // and to reject all features that do not agree with the rejection transform + // within a specified threshold (rejection_transform_threshold in options). + // Also allows to modify feature locations before motion estimation by + // supplying appropiate callback - this is invoked *after* the rejection + // transform. + // Returns list of features extracted from this frame, *before* any + // modification is applied. To yield modified features, simply + // apply modify_features function to returned result. + bool AddFrameGeneric( + const cv::Mat& frame, int64 timestamp_usec, + const Homography& initial_transform, + const Homography* rejection_transform = nullptr, + const RegionFlowFeatureList* external_features = nullptr, + std::function* modify_features = nullptr, + RegionFlowFeatureList* feature_list = nullptr); + + // Instead of tracking passed frames, uses result directly as supplied by + // features. Can not be mixed with above AddFrame* calls. + void AddFeatures(const RegionFlowFeatureList& features); + + // Instead of tracking and computing camera motions, simply adds precomputed + // features and camera motions to the internal buffers. Can not be mixed + // with above Add* calls. + // This is useful for just computing saliency via GetResults. + void EnqueueFeaturesAndMotions(const RegionFlowFeatureList& features, + const CameraMotion& motion); + + // Returns motion results (features, camera motions and saliency, all + // optional). + // Call after every AddFrame for optimal performance. + // Returns number of available results. Note, this call with often return + // zero, and only return results (multiple in this case) when chunk boundaries + // are reached. The actual number returned depends on various smoothing + // settings for saliency and features. + // Set flush to true, to force output of all results (e.g. when the end of the + // video stream is reached). 
+ // Note: Passing a non-zero argument for saliency, requires + // MotionAnalysisOptions::compute_motion_saliency to be set and + // vice versa. (CHECKED) + int GetResults( + bool flush, // Forces output. + std::vector>* features = nullptr, + std::vector>* camera_motion = nullptr, + std::vector>* saliency = nullptr); + + // Exposes the grayscale image frame from the most recently created region + // flow tracking data. + cv::Mat GetGrayscaleFrameFromResults(); + + // Renders features and saliency to rendered_results based on + // VisualizationOptions onto pre-initialized rendered_results (in most cases + // you want to create a copy of the input frame). + // NOTE: + // If features are requested to be rendered, this function should be + // called serially with each frame, or wrong feature location might be + // rendered. + void RenderResults(const RegionFlowFeatureList& features, + const CameraMotion& camera_motion, + const SalientPointFrame* saliency, // Optional. + cv::Mat* rendered_results); + + // Determines dense foreground mask from features. + // Returns foreground mask as CV_8U image, indicating propability of + // foreground. + void ComputeDenseForeground(const RegionFlowFeatureList& feature_list, + const CameraMotion& camera_motion, + cv::Mat* foreground_mask); + + // Overlays foreground mask over output as green burn in, or with + // jet_coloring (based on options). + void VisualizeDenseForeground(const cv::Mat& foreground_mask, + cv::Mat* output); + + // Masks out regions from input that are not used for blur analysis. + void VisualizeBlurAnalysisRegions(cv::Mat* input); + + // Number of frames/features added so far. + int NumFrames() const { return frame_num_; } + + private: + void InitPolicyOptions(); + + // Compute saliency from buffered features and motions. + void ComputeSaliency(); + + // Outputs computed results from the streaming buffer to the optional + // output args. Also performs overlap handling. 
+ int OutputResults( + bool flush, // Forces output. + std::vector>* features = nullptr, + std::vector>* camera_motion = nullptr, + std::vector>* saliency = nullptr); + + MotionAnalysisOptions options_; + int frame_width_ = 0; + int frame_height_ = 0; + int frame_num_ = 0; + + // Internal objects for actual motion analysis. + std::unique_ptr region_flow_computation_; + std::unique_ptr motion_estimation_; + std::unique_ptr motion_saliency_; + std::unique_ptr foreground_push_pull_; + // Used for visualization if long feature tracks are present. + std::unique_ptr long_feature_stream_; + + std::unique_ptr buffer_; + + // Indicates where previous overlap in above buffers starts (earlier data is + // just to improve smoothing). + int prev_overlap_start_ = 0; + // Indicates where actual overlap starts (data after this has not been + // output). + int overlap_start_ = 0; + + // Buffers previous frame. + std::unique_ptr prev_frame_; + + bool compute_feature_descriptors_ = false; + + // Amount of overlap between clips. Determined from saliency smoothing + // and filtering options. + int overlap_size_ = 0; + + bool feature_computation_ = true; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_ANALYSIS_H_ diff --git a/mediapipe/util/tracking/motion_analysis.proto b/mediapipe/util/tracking/motion_analysis.proto new file mode 100644 index 000000000..97d6d1d7b --- /dev/null +++ b/mediapipe/util/tracking/motion_analysis.proto @@ -0,0 +1,154 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package mediapipe;
+
+import "mediapipe/util/tracking/motion_estimation.proto";
+import "mediapipe/util/tracking/motion_saliency.proto";
+import "mediapipe/util/tracking/region_flow_computation.proto";
+
+// Settings for MotionAnalysis. This class computes sparse, locally consistent
+// flow (referred to as region flow), camera motions, and foreground saliency
+// (i.e. likely foreground objects moving differently from the background).
+// Next tag: 16
+message MotionAnalysisOptions {
+ // Pre-configured policies for MotionAnalysis.
+ // For general use, it is recommended to select an appropriate policy
+ // instead of customizing flow and motion options by hand.
+ // Policies are being kept up to date with appropriate settings.
+ enum AnalysisPolicy {
+ // Default legacy options. Effectively a no-op.
+ ANALYSIS_POLICY_LEGACY = 0;
+
+ // Use for video.
+ ANALYSIS_POLICY_VIDEO = 1;
+
+ // Use for video on mobile.
+ ANALYSIS_POLICY_VIDEO_MOBILE = 2;
+
+ // Use if applied to camera stream on mobile, e.g.
+ // low latency and high throughput.
+ // ASSUMES DOWNSAMPLED INPUT, e.g. from GPU.
+ ANALYSIS_POLICY_CAMERA_MOBILE = 3;
+
+ // Use for sped up video / hyperlapse when adding frames with seeds
+ // and rejection transforms. Mostly ups temporal consistency weights
+ // and relaxes stability constraints.
+ // Only recommended to be used as second pass after initial MotionAnalysis
+ // and FrameSelection.
+ ANALYSIS_POLICY_HYPERLAPSE = 4;
+ }
+
+ optional AnalysisPolicy analysis_policy = 14
+ [default = ANALYSIS_POLICY_LEGACY];
+
+ // Options for the actual motion stabilization
+ // (in order of object usage).
+ optional RegionFlowComputationOptions flow_options = 1;
+ optional MotionEstimationOptions motion_options = 2;
+ optional MotionSaliencyOptions saliency_options = 3;
+
+ // Clip-size used for (parallelized) motion estimation.
+ optional int32 estimation_clip_size = 4 [default = 16];
+
+ // If set, camera motion is subtracted from features before output.
+ // Effectively outputs residual motion w.r.t. background.
+ optional bool subtract_camera_motion_from_features = 5 [default = false];
+
+ // If flow_options().tracking_options().tracking_policy() equals
+ // POLICY_MULTI_FRAME, this flag indicates which RegionFlowFeatureList to use.
+ // Specifically, for frame C, we use the motion from C to C - 1 - track_index.
+ optional int32 track_index = 6 [default = 0];
+
+ // If set, compute motion saliency (regions of moving foreground).
+ optional bool compute_motion_saliency = 7 [default = false];
+
+ // Selects saliency inliers (only saliency locations with sufficient
+ // spatial and temporal support are kept).
+ // Only applied when compute_motion_saliency is set.
+ optional bool select_saliency_inliers = 8 [default = true];
+
+ // Performs spatio-temporal filtering of extracted foreground saliency. If
+ // used with above selection of saliency inliers, filtering is performed
+ // *after* inlier selection.
+ // Only applied when compute_motion_saliency is set.
+ optional bool filter_saliency = 9 [default = true];
+
+ // If set, irls weights of motion estimation are spatio-temporally smoothed
+ // after model estimation.
+ optional bool post_irls_smoothing = 10 [default = false];
+
+ // If a rejection_transform is passed to AddFrameGeneric, features that
+ // do not agree with the transform within below threshold are removed.
+ optional float rejection_transform_threshold = 13 [default = 20.0];
+
+ // Adapts visualization for rendered_results when passed to GetResults.
+ message VisualizationOptions {
+ // Visualizes tracked region flow features, colored w.r.t. fitting error.
+ optional bool visualize_region_flow_features = 1 [default = true];
+
+ // Visualizes salient points. Only applicable if compute_motion_saliency is
+ // set to true.
+ optional bool visualize_salient_points = 2 [default = false];
+
+ // Line thickness of ellipse when rendering salient points.
+ optional int32 line_thickness = 5 [default = 4];
+
+ // Instead of green burn in uses jet coloring to indicate magnitude of
+ // foreground motion.
+ optional bool foreground_jet_coloring = 3 [default = false];
+
+ // If set, only keeps masks of pixels that are used for blur analysis, rest
+ // is set to zero.
+ optional bool visualize_blur_analysis_region = 4 [default = false];
+
+ optional bool visualize_stats = 6 [default = true];
+
+ // Only long feature tracks with specified minimum length are rendered.
+ // Set to zero to consider all tracks.
+ optional int32 min_long_feature_track = 7 [default = 0];
+
+ // Only the last N points of a long feature track are rendered. Set to zero
+ // to render all points.
+ optional int32 max_long_feature_points = 8 [default = 0];
+ }
+
+ optional VisualizationOptions visualization_options = 11;
+
+ // Describes how to compute foreground from features.
+ message ForegroundOptions {
+ // Indicates the *inverse* registration error (i.e. the irls weight)
+ // that is deemed a complete inlier.
+ // Weights in the interval [0, foreground_threshold] (corresponding to
+ // pixel errors in the interval [1 / foreground_threshold, inf])
+ // are mapped to 1 - [0, 1], i.e. foreground threshold is mapped to zero
+ // with weights below the threshold being assigned values > 0.
+ // Therefore, larger values will increase amount of detected foreground
+ // as well as noise.
+ optional float foreground_threshold = 1 [default = 0.5];
+
+ // By using foreground_gamma < 1.0 you can increase resolution of small
+ // foreground motion at the expense of the resolution of large foreground
+ // motions.
+ optional float foreground_gamma = 2 [default = 1.0];
+
+ // Threshold is scaled by coverage, i.e. for frames with large registration
+ // error less foreground is visualized.
+ optional bool threshold_coverage_scaling = 3 [default = true]; + } + + optional ForegroundOptions foreground_options = 12; +} diff --git a/mediapipe/util/tracking/motion_estimation.cc b/mediapipe/util/tracking/motion_estimation.cc new file mode 100644 index 000000000..311424dd8 --- /dev/null +++ b/mediapipe/util/tracking/motion_estimation.cc @@ -0,0 +1,6009 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/motion_estimation.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Eigen/Core" +#include "Eigen/Dense" +#include "Eigen/QR" +#include "Eigen/SVD" +#include "absl/container/node_hash_map.h" +#include "absl/strings/str_cat.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/util/tracking/camera_motion.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/parallel_invoker.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +constexpr float kIrlsEps = 1e-4f; +constexpr float kOutlierIRLSWeight = 1e-10f; +constexpr float kMaxCondition = 1e30f; + +constexpr float kPrecision = 0.1f; + +typedef RegionFlowFrame::RegionFlow RegionFlow; +typedef RegionFlowFeature Feature; + 
+namespace { + +void GenericFit( + const RegionFlowFeatureList& features, + const std::function& est_func, + CameraMotion* motion) { + MotionEstimationOptions options; + options.set_irls_rounds(1); + options.set_use_exact_homography_estimation(false); + options.set_use_highest_accuracy_for_normal_equations(false); + + MotionEstimation motion_est(options, features.frame_width(), + features.frame_height()); + + RegionFlowFeatureList local = features; + NormalizeRegionFlowFeatureList(&local); + est_func(&motion_est, &local, motion); +} + +} // namespace. + +TranslationModel FitTranslationModel(const RegionFlowFeatureList& features) { + CameraMotion motion; + GenericFit(features, &MotionEstimation::EstimateTranslationModel, &motion); + return motion.translation(); +} + +LinearSimilarityModel FitLinearSimilarityModel( + const RegionFlowFeatureList& features) { + CameraMotion motion; + GenericFit(features, &MotionEstimation::EstimateLinearSimilarityModel, + &motion); + return motion.linear_similarity(); +} + +AffineModel FitAffineModel(const RegionFlowFeatureList& features) { + CameraMotion motion; + GenericFit(features, &MotionEstimation::EstimateAffineModel, &motion); + return motion.affine(); +} + +Homography FitHomography(const RegionFlowFeatureList& features) { + CameraMotion motion; + GenericFit(features, &MotionEstimation::EstimateHomography, &motion); + return motion.homography(); +} + +MixtureHomography FitMixtureHomography(const RegionFlowFeatureList& features) { + CameraMotion motion; + GenericFit(features, &MotionEstimation::EstimateMixtureHomography, &motion); + return motion.mixture_homography(); +} + +// Records inlier state across frames. +// Specifically records spatial position and average magnitude of inliers +// over time (motion prior). +// New sample points can be weighted w.r.t. their agreement of spatial inlier +// locations and motion prior. +class InlierMask { + public: + // Initialize mask from options for specified frame domain. 
+ InlierMask(const MotionEstimationOptions::IrlsMaskOptions& options, + int feature_mask_size, int frame_width, int frame_height) + : options_(options), + frame_width_(frame_width), + frame_height_(frame_height) { + const int num_bins = feature_mask_size * feature_mask_size; + mask_.resize(num_bins); + update_mask_.resize(num_bins); + + const LinearSimilarityModel norm_model = + LinearSimilarityAdapter::NormalizationTransform(frame_width_, + frame_height_); + const Vector2_f domain = LinearSimilarityAdapter::TransformPoint( + norm_model, Vector2_f(frame_width_, frame_height_)); + denom_x_ = 1.0f / domain.x(); + denom_y_ = 1.0f / domain.y(); + base_score_ = options_.base_score(); + } + + // Resets mask to all inliers. + void InitMask() { + mask_.assign(mask_.size(), 1.0f); + translation_prior_ = 0; + } + + // Applies update mask to mask. + void UpdateMask() { update_mask_.swap(mask_); } + + void UpdateTranslation(Vector2_f translation) { + const float alpha = options_.translation_blend_alpha() * translation_prior_; + translation_ = translation_ * alpha + (1.0 - alpha) * translation; + translation_prior_ = std::min( + 1.0f, translation_prior_ + options_.translation_prior_increase()); + } + + // Initialize update mask from current mask, by decaying each element. + void InitUpdateMask() { + const float decay = options_.decay(); + for (int k = 0; k < mask_.size(); ++k) { + update_mask_[k] = mask_[k] * decay; + } + } + + // Returns inlier score for bin index. + // Can be > 1, as we take the best inlier score compared to other iterations, + // only relative values matter. + float GetInlierScore(int idx) const { return base_score_ + mask_[idx]; } + + // Increases inlier score at bin idx. 
+ void RecordInlier(int idx, float feature_weight) { + update_mask_[idx] = std::min( + 1.0f, update_mask_[idx] + feature_weight * options_.inlier_score()); + } + + // Multiplies passed motion prior with a weight within [0, 1] for each + // feature point describing how well feature's motion agrees with previously + // estimated translation. + void MotionPrior(const RegionFlowFeatureList& feature_list, + std::vector* motion_prior) { + CHECK(motion_prior != nullptr); + const int num_features = feature_list.feature_size(); + CHECK_EQ(num_features, motion_prior->size()); + + // Return, if prior is too low. + const float kMinTranslationPrior = 0.5f; + if (translation_prior_ < kMinTranslationPrior) { + motion_prior->assign(num_features, 1.0f); + return; + } + const float prev_magnitude = translation_.Norm(); + + CHECK_EQ(num_features, motion_prior->size()); + const float inv_prev_magnitude = + prev_magnitude < options_.min_translation_norm() + ? (1.0f / options_.min_translation_norm()) + : (1.0f / prev_magnitude); + for (int k = 0; k < num_features; ++k) { + const Vector2_f flow = FeatureFlow(feature_list.feature(k)); + const float weight = + base_score_ + std::max(0, 1.0f - (flow - translation_).Norm() * + inv_prev_magnitude); + (*motion_prior)[k] *= weight; + } + } + + private: + MotionEstimationOptions::IrlsMaskOptions options_; + int frame_width_; + int frame_height_; + float denom_x_; + float denom_y_; + float base_score_; + + Vector2_f translation_; + float translation_prior_ = 0; + + std::vector mask_; + std::vector update_mask_; +}; + +// Local storage for MotionEstimation within each thread to minimize +// allocations. 
+class MotionEstimationThreadStorage { + public: + MotionEstimationThreadStorage(const MotionEstimationOptions& options, + const MotionEstimation* motion_estimation, + int max_feature_guess = 0) { + const int coverage_grid_size = options.coverage_grid_size(); + grid_coverage_irls_mask_.resize(coverage_grid_size * coverage_grid_size); + const int max_features = max_feature_guess > 0 ? max_feature_guess : 4000; + + // Allocate bins to 150% of expected features. + const int features_per_bin = + max_features * 1.5f / grid_coverage_irls_mask_.size(); + + for (auto& mask : grid_coverage_irls_mask_) { + mask.reserve(features_per_bin); + } + + // Compute gaussian weights for grid coverage. + const float scaled_width = + 1.0f / motion_estimation->normalized_domain_.x() * coverage_grid_size; + const float scaled_height = + 1.0f / motion_estimation->normalized_domain_.y() * coverage_grid_size; + + const float inv_scaled_width = 1.0f / scaled_width; + const float inv_scaled_height = 1.0f / scaled_height; + + // Compute gaussian weights for grid cells. + RegionFlowFeatureList grid_cell_features; + for (int y = 0; y < coverage_grid_size; ++y) { + for (int x = 0; x < coverage_grid_size; ++x) { + RegionFlowFeature* feature = grid_cell_features.add_feature(); + feature->set_x((x + 0.5f) * inv_scaled_width); + feature->set_y((y + 0.5f) * inv_scaled_height); + } + } + + motion_estimation->GetHomographyIRLSCenterWeights(grid_cell_features, + &grid_cell_weights_); + } + + MotionEstimationThreadStorage(const MotionEstimationThreadStorage&) = delete; + MotionEstimationThreadStorage& operator=( + const MotionEstimationThreadStorage&) = delete; + + std::vector>* EmptyGridCoverageIrlsMask() { + for (auto& mask : grid_coverage_irls_mask_) { + mask.clear(); + } + return &grid_coverage_irls_mask_; + } + + const std::vector& GridCoverageInitializationWeights() const { + return grid_cell_weights_; + } + + // Creates copy of current thread storage, caller takes ownership. 
+ std::unique_ptr Copy() const { + std::unique_ptr copy( + new MotionEstimationThreadStorage); + copy->grid_coverage_irls_mask_ = grid_coverage_irls_mask_; + copy->grid_cell_weights_ = grid_cell_weights_; + return copy; + } + + private: + // Empty constructor for Copy. + MotionEstimationThreadStorage() {} + + std::vector> grid_coverage_irls_mask_; + std::vector grid_cell_weights_; +}; + +// Holds all the data for a clip (multiple frames) of single-frame tracks. +struct MotionEstimation::SingleTrackClipData { + // Features to be processed. Can be set to point to external data, or + // point to internal storage via InitializeFromInternalStorage. + // Stores one RegionFlowFeatureList pointer per frame. + std::vector* feature_lists = nullptr; + + // Camera motions to be output. Can be set to point to external data, or + // point to internal storage via InitializeFromInternalStorage. + // Stores one camera motion per frame. + std::vector* camera_motions = nullptr; + + // Difference in frames that features and motions are computed for. + int frame_diff = 1; + + // Prior weights for each frame. + std::vector prior_weights; + + // Optional inlier mask. Used across the whole clip. + InlierMask* inlier_mask = nullptr; + + // Weights to be passed to each stage of motion estimation. + std::vector> irls_weight_input; + + // Indicates if weights in above vectors are uniform (to avoid testing on + // each loop iteration). + std::vector uniform_weight_input; + + // Indicates if non-decaying full prior should be used + // (always bias towards initialization). + std::vector use_full_prior; + + // Specific weights for homography. + std::vector> homog_irls_weight_input; + + // Storage for earlier weights, in case estimated model is unstable. + std::vector>* irls_weight_backup = nullptr; + + // If above feature_lists and camera_motions are not a view on external + // data, storage holds underlying data. 
+ std::vector feature_storage; + std::vector feature_view; + std::vector motion_storage; + std::vector> irls_backup_storage; + + // Call after populating feature_storage and motion_storage with data, to + // initialize feature_lists and camera_motions. + void InitializeFromInternalStorage() { + feature_view.reserve(feature_storage.size()); + + for (auto& feature_list : feature_storage) { + feature_view.push_back(&feature_list); + } + + feature_lists = &feature_view; + camera_motions = &motion_storage; + } + + // Call after initializing feature_lists, to allocate storage for each + // feature's irls weight. If weight_backup is set, allocates storage + // to backup and reset irls weights. + void AllocateIRLSWeightStorage(bool weight_backup) { + CHECK(feature_lists != nullptr); + const int num_frames = feature_lists->size(); + if (weight_backup) { + irls_weight_backup = &irls_backup_storage; + } + + if (num_frames == 0) { + return; + } + + irls_weight_input.resize(num_frames); + uniform_weight_input.resize(num_frames, true); + use_full_prior.resize(num_frames, false); + homog_irls_weight_input.resize(num_frames); + + if (weight_backup) { + irls_weight_backup->resize(num_frames); + } + + for (int k = 0; k < num_frames; ++k) { + const int num_features = (*feature_lists)[k]->feature_size(); + if (num_features != 0) { + irls_weight_input[k].reserve(num_features); + homog_irls_weight_input[k].reserve(num_features); + } + } + } + + // Returns number of frames in this clip. + int num_frames() const { + DCHECK(feature_lists); + return feature_lists->size(); + } + + // Returns irls weight input depending on the passed motion type. + std::vector>& IrlsWeightInput(const MotionType& type) { + switch (type) { + case MODEL_HOMOGRAPHY: + return homog_irls_weight_input; + default: + return irls_weight_input; + } + } + + // Checks that SingleTrackClipData is properly initialized. 
+ void CheckInitialization() const { + CHECK(feature_lists != nullptr); + CHECK(camera_motions != nullptr); + CHECK_EQ(feature_lists->size(), camera_motions->size()); + if (feature_lists->empty()) { + return; + } + + CHECK_EQ(num_frames(), irls_weight_input.size()); + CHECK_EQ(num_frames(), homog_irls_weight_input.size()); + if (irls_weight_backup) { + CHECK_EQ(num_frames(), irls_weight_backup->size()); + } + + for (int k = 0; k < num_frames(); ++k) { + const int num_features = (*feature_lists)[k]->feature_size(); + CHECK_EQ(num_features, irls_weight_input[k].size()); + CHECK_EQ(num_features, homog_irls_weight_input[k].size()); + } + } + + // Prepares PriorFeatureWeights structure for usage. + void SetupPriorWeights(int irls_rounds) { + prior_weights.resize(num_frames(), PriorFeatureWeights(irls_rounds)); + for (int k = 0; k < num_frames(); ++k) { + prior_weights[k].use_full_prior = use_full_prior[k]; + } + } + + // Clears the specified flag from each camera motion. + void ClearFlagFromMotion(int flag) { + for (auto& camera_motion : *camera_motions) { + camera_motion.set_flags(camera_motion.flags() & ~flag); + } + } + + // Resets feature weights from backed up ones if type is <= + // max_unstable_type. 
+ void RestoreWeightsFromBackup(CameraMotion::Type max_unstable_type) { + if (irls_weight_backup == nullptr) { + return; + } + + const int num_frames = feature_lists->size(); + for (int k = 0; k < num_frames; ++k) { + if (camera_motions->at(k).type() <= max_unstable_type) { + SetRegionFlowFeatureIRLSWeights(irls_weight_backup->at(k), + feature_lists->at(k)); + } + } + } +}; + +MotionEstimation::MotionEstimation(const MotionEstimationOptions& options, + int frame_width, int frame_height) + : frame_width_(frame_width), frame_height_(frame_height) { + normalization_transform_ = LinearSimilarityAdapter::NormalizationTransform( + frame_width_, frame_height_); + + inv_normalization_transform_ = + LinearSimilarityAdapter::Invert(normalization_transform_); + + // Cap domain to express IRLS errors to 640x360 (format used + // to calibrate thresholds on dataset). + const int max_irls_width = frame_width_ > frame_height ? 640 : 360; + const int max_irls_height = frame_width_ > frame_height ? 360 : 640; + const int irls_width = std::min(max_irls_width, frame_width_); + const int irls_height = std::min(max_irls_height, frame_height_); + irls_transform_ = ModelInvert( + LinearSimilarityAdapter::NormalizationTransform(irls_width, irls_height)); + if (!options.domain_limited_irls_scaling()) { + // Fallback to inverse normalization transform, i.e. express errors + // in image domain. + irls_transform_ = inv_normalization_transform_; + } + + normalized_domain_ = TransformPoint(normalization_transform_, + Vector2_f(frame_width_, frame_height_)); + + InitializeWithOptions(options); +} + +MotionEstimation::~MotionEstimation() {} + +void MotionEstimation::InitializeWithOptions( + const MotionEstimationOptions& options) { + // Check options, specifically if fall-back models are set to be estimated. 
+ if (options.homography_estimation() != + MotionEstimationOptions::ESTIMATION_HOMOG_NONE && + options.linear_similarity_estimation() == + MotionEstimationOptions::ESTIMATION_LS_NONE) { + LOG(FATAL) << "Invalid MotionEstimationOptions. " + << "Homography estimation requires similarity to be estimated"; + } + + if (options.mix_homography_estimation() != + MotionEstimationOptions::ESTIMATION_HOMOG_MIX_NONE && + options.homography_estimation() == + MotionEstimationOptions::ESTIMATION_HOMOG_NONE) { + LOG(FATAL) << "Invalid MotionEstimationOptions. " + << "Mixture homography estimation requires homography to be " + << "estimated."; + } + + // Check for deprecated options. + CHECK_NE(options.estimate_similarity(), true) + << "Option estimate_similarity is deprecated, use static function " + << "EstimateSimilarityModelL2 instead."; + CHECK_NE(options.linear_similarity_estimation(), + MotionEstimationOptions::ESTIMATION_LS_L2_RANSAC) + << "Option ESTIMATION_LS_L2_RANSAC is deprecated, use " + << "ESTIMATION_LS_IRLS instead."; + CHECK_NE(options.linear_similarity_estimation(), + MotionEstimationOptions::ESTIMATION_LS_L1) + << "Option ESTIMATION_LS_L1 is deprecated, use static function " + << "EstimateLinearSimilarityL1 instead."; + + options_ = options; + + // (Re)-Initialize row_weights_ based on options. + if (options.mix_homography_estimation() != + MotionEstimationOptions::ESTIMATION_HOMOG_MIX_NONE) { + const float row_sigma = options.mixture_row_sigma() * frame_height_; + const float y_scale = frame_height_ / normalized_domain_.y(); + + if (row_weights_ == NULL || + row_weights_->NeedsInitialization(options.num_mixtures(), row_sigma, + y_scale)) { + row_weights_.reset(new MixtureRowWeights(frame_height_, + 0, // no margin. 
+ row_sigma, y_scale, + options.num_mixtures())); + } + } + + switch (options.estimation_policy()) { + case MotionEstimationOptions::INDEPENDENT_PARALLEL: + case MotionEstimationOptions::JOINTLY_FROM_TRACKS: + break; + + case MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS: { + const auto& bias_options = options.long_feature_bias_options(); + + // Using 3x3 filters, max distance is 2 bin diagonals plus 1% room + // incase maximum value is attained. + const float max_space_diff = + 2.0f * sqrt(2.0) * bias_options.grid_size() * 1.01f; + + InitGaussLUT(bias_options.spatial_sigma(), max_space_diff, + &feature_bias_lut_.spatial_lut, + &feature_bias_lut_.spatial_scale); + + const float max_color_diff = + std::sqrt(static_cast(3.0f)) * 255.0f; // 3 channels. + InitGaussLUT(bias_options.color_sigma(), max_color_diff, + &feature_bias_lut_.color_lut, + &feature_bias_lut_.color_scale); + + // Gaussian at 2.5 (normalized) < 0.05 + const float max_weight = bias_options.bias_stdev() * 2.5 * 1.01f; + InitGaussLUT(bias_options.bias_stdev(), max_weight, + &feature_bias_lut_.bias_weight_lut, + &feature_bias_lut_.bias_weight_scale); + + break; + } + + case MotionEstimationOptions::TEMPORAL_IRLS_MASK: + CHECK(options.irls_initialization().activated()) + << "To use dependent_initialization, irls_initialization has to " + << "be activated. 
"; + inlier_mask_.reset(new InlierMask(options.irls_mask_options(), + options.feature_mask_size(), + frame_width_, frame_height_)); + inlier_mask_->InitMask(); + break; + } +} + +void MotionEstimation::EstimateMotion(const RegionFlowFrame& region_flow_frame, + const int* intensity_frame, // null + const int* prev_intensity_frame, // null + CameraMotion* camera_motion) const { + CHECK(camera_motion); + + CHECK(intensity_frame == NULL) + << "Parameter intensity_frame is deprecated, must be NULL."; + CHECK(prev_intensity_frame == NULL) + << "Parameter prev_intensity_frame is deprecated, must be NULL."; + + RegionFlowFeatureList feature_list; + GetRegionFlowFeatureList(region_flow_frame, 0, &feature_list); + std::vector feature_lists(1, &feature_list); + std::vector camera_motions(1); + + EstimateMotionsParallel(false, &feature_lists, &camera_motions); + camera_motion->CopyFrom(camera_motions[0]); +} + +bool MotionEstimation::EstimateTranslationModel( + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) { + EstimateTranslationModelIRLS(options_.irls_rounds(), false, feature_list, + nullptr, camera_motion); + return true; +} + +bool MotionEstimation::EstimateLinearSimilarityModel( + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) { + return EstimateLinearSimilarityModelIRLS( + options_.irls_rounds(), false, feature_list, nullptr, camera_motion); +} + +bool MotionEstimation::EstimateAffineModel(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion) { + return EstimateAffineModelIRLS(options_.irls_rounds(), feature_list, + camera_motion); +} + +bool MotionEstimation::EstimateHomography(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion) { + return EstimateHomographyIRLS(options_.irls_rounds(), false, nullptr, nullptr, + feature_list, camera_motion); +} + +bool MotionEstimation::EstimateMixtureHomography( + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) { + return 
EstimateMixtureHomographyIRLS( + options_.irls_rounds(), true, options_.mixture_regularizer(), + 0, // spectrum index. + nullptr, nullptr, feature_list, camera_motion); +} + +float MotionEstimation::GetIRLSResidualScale(const float avg_motion_magnitude, + float distance_fraction) const { + const float translation_magnitude = + LinearSimilarityAdapter::TransformPoint( + normalization_transform_, Vector2_f(avg_motion_magnitude, 0)) + .x(); + + // Assume 1 pixel estimation error for tracked features at 480p video. + // This serves as absolute minimum of the estimation error, so we do not + // scale translation fractions below this threshold. + const float minimum_error = 1.25e-3f; + + // Only normalize if residual (w.r.t. translation) is larger than + // estimation error. + const float translation_threshold = translation_magnitude * distance_fraction; + // Translation is above minimum error. + if (translation_threshold > minimum_error) { + return minimum_error / translation_threshold; + } else { + return 1.0f; + } +} + +// Collects various options that alter how motion models are initialized or +// estimated. Construct from MotionEstimationOptions for default values. +struct MotionEstimation::EstimateModelOptions { + explicit EstimateModelOptions(const MotionEstimationOptions& options) + : mixture_regularizer(options.mixture_regularizer()), + estimate_linear_similarity( + options.linear_similarity_estimation() != + MotionEstimationOptions::ESTIMATION_LS_NONE) {} + + // Maps each motion type to a unique index, whereas different mixtures in a + // spectrum are treated as separate types. 
+ int IndexFromType(const MotionType& type) const { + if (type != MODEL_MIXTURE_HOMOGRAPHY) { + return static_cast(type); + } else { + return static_cast(type) + mixture_spectrum_index; + } + } + + float mixture_regularizer = 0; + float mixture_inlier_threshold_scale = 0; + int mixture_spectrum_index = 0; + bool check_model_stability = true; + bool estimate_linear_similarity = true; +}; + +// Invoker for parallel execution. Thread storage is optional. +class EstimateMotionIRLSInvoker { + public: + // Performs estimation of the requested type for irls_rounds IRLS iterations. + // Only performs estimation if current motion type is less or equal to + // max_unstable_type. + // Optionally accepts list of PriorFeatureWeights and thread_storage. + // After model computation irls_weight member for each RegionFlowFeature + // in the passed feature_lists is updated with the inverse fitting error. + // Stability features for requested MotionType are computed if + // compute_stability argument is set. + EstimateMotionIRLSInvoker( + const MotionEstimation::MotionType& type, int irls_rounds, + bool compute_stability, const CameraMotion::Type& max_unstable_type, + const MotionEstimation::EstimateModelOptions& model_options, + const MotionEstimation* motion_estimation, + const std::vector* + prior_weights, // optional. + const MotionEstimationThreadStorage* thread_storage, // optional. 
+ std::vector* feature_lists, + std::vector* camera_motions) + : motion_type_(type), + irls_rounds_(irls_rounds), + compute_stability_(compute_stability), + max_unstable_type_(max_unstable_type), + model_options_(model_options), + motion_estimation_(motion_estimation), + prior_weights_(prior_weights), + feature_lists_(feature_lists), + camera_motions_(camera_motions) { + if (thread_storage != nullptr) { + std::unique_ptr tmp_storage( + thread_storage->Copy()); + thread_storage_ = std::move(tmp_storage); + } + } + + EstimateMotionIRLSInvoker(const EstimateMotionIRLSInvoker& invoker) + : motion_type_(invoker.motion_type_), + irls_rounds_(invoker.irls_rounds_), + compute_stability_(invoker.compute_stability_), + max_unstable_type_(invoker.max_unstable_type_), + model_options_(invoker.model_options_), + motion_estimation_(invoker.motion_estimation_), + prior_weights_(invoker.prior_weights_), + feature_lists_(invoker.feature_lists_), + camera_motions_(invoker.camera_motions_) { + if (invoker.thread_storage_ != nullptr) { + std::unique_ptr tmp_storage( + invoker.thread_storage_->Copy()); + thread_storage_ = std::move(tmp_storage); + } + } + + void operator()(const BlockedRange& range) const { + for (int frame = range.begin(); frame != range.end(); ++frame) { + EstimateMotion(frame, (*feature_lists_)[frame], + &(*camera_motions_)[frame]); + } + } + + private: + inline void EstimateMotion(int frame, RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion) const { + if (camera_motion->type() > max_unstable_type_) { + // Don't estimate anything, immediate return. + return; + } + + if (camera_motion->flags() & CameraMotion::FLAG_SINGULAR_ESTIMATION) { + // If motion became singular in earlier iteration, skip. + return; + } + + const MotionEstimation::PriorFeatureWeights* prior_weight = + prior_weights_ && (*prior_weights_)[frame].HasPrior() + ? 
&(*prior_weights_)[frame] + : nullptr; + + switch (motion_type_) { + case MotionEstimation::MODEL_AVERAGE_MAGNITUDE: + motion_estimation_->EstimateAverageMotionMagnitude(*feature_list, + camera_motion); + break; + + case MotionEstimation::MODEL_TRANSLATION: + motion_estimation_->EstimateTranslationModelIRLS( + irls_rounds_, compute_stability_, feature_list, prior_weight, + camera_motion); + break; + + case MotionEstimation::MODEL_LINEAR_SIMILARITY: + motion_estimation_->EstimateLinearSimilarityModelIRLS( + irls_rounds_, compute_stability_, feature_list, prior_weight, + camera_motion); + break; + + case MotionEstimation::MODEL_AFFINE: + motion_estimation_->EstimateAffineModelIRLS(irls_rounds_, feature_list, + camera_motion); + break; + + case MotionEstimation::MODEL_HOMOGRAPHY: + motion_estimation_->EstimateHomographyIRLS( + irls_rounds_, compute_stability_, prior_weight, + thread_storage_.get(), feature_list, camera_motion); + break; + + case MotionEstimation::MODEL_MIXTURE_HOMOGRAPHY: + // If one estimation fails, clear the whole spectrum. 
+ if (!motion_estimation_->EstimateMixtureHomographyIRLS( + irls_rounds_, compute_stability_, + model_options_.mixture_regularizer, + model_options_.mixture_spectrum_index, prior_weight, + thread_storage_.get(), feature_list, camera_motion)) { + camera_motion->clear_mixture_homography_spectrum(); + } + break; + + case MotionEstimation::MODEL_NUM_VALUES: + LOG(FATAL) << "Function should not be called with this value"; + break; + } + } + + private: + const MotionEstimation::MotionType motion_type_; + int irls_rounds_; + bool compute_stability_; + CameraMotion::Type max_unstable_type_; + const MotionEstimation::EstimateModelOptions& model_options_; + const MotionEstimation* motion_estimation_; + const std::vector* prior_weights_; + std::vector* feature_lists_; + std::vector* camera_motions_; + + std::unique_ptr thread_storage_; +}; + +void MotionEstimation::EstimateMotionsParallelImpl( + bool irls_weights_preinitialized, + std::vector* feature_lists, + std::vector* camera_motions) const { + MEASURE_TIME << "Estimate motions: " << feature_lists->size(); + + CHECK(feature_lists != nullptr); + CHECK(camera_motions != nullptr); + + const int num_frames = feature_lists->size(); + CHECK_EQ(num_frames, camera_motions->size()); + + // Initialize camera_motions. + for (int f = 0; f < num_frames; ++f) { + CameraMotion& camera_motion = (*camera_motions)[f]; + const RegionFlowFeatureList& feature_list = *(*feature_lists)[f]; + + // Resets every model to INVALID. + ResetMotionModels(options_, &camera_motion); + InitCameraMotionFromFeatureList(feature_list, &camera_motion); + + // Assume motions to be VALID in case they contain features. + // INVALID (= empty frames) wont be considered during motion estimation. + if (feature_list.feature_size() != 0) { + camera_motion.set_type(CameraMotion::VALID); + } + + // Flag duplicated frames. 
+ if (feature_list.is_duplicated()) { + camera_motion.set_flags(camera_motion.flags() | + CameraMotion::FLAG_DUPLICATED); + } + } + + // Backup original IRLS weights if original weights are requested to be + // output. + std::vector> original_irls_weights(num_frames); + if (!options_.output_refined_irls_weights()) { + for (int f = 0; f < num_frames; ++f) { + const RegionFlowFeatureList& feature_list = *(*feature_lists)[f]; + GetRegionFlowFeatureIRLSWeights(feature_list, &original_irls_weights[f]); + } + } + + const bool use_joint_tracks = options_.estimation_policy() == + MotionEstimationOptions::JOINTLY_FROM_TRACKS; + + // Joint frame estimation. + const int num_motion_models = + use_joint_tracks ? options_.joint_track_estimation().num_motion_models() + : 1; + CHECK_GT(num_motion_models, 0); + + // Several single track clip datas, we seek to process. + std::vector clip_datas(num_motion_models); + SingleTrackClipData* main_clip_data = &clip_datas[0]; + + // First clip data is always view on external data. + main_clip_data->feature_lists = feature_lists; + main_clip_data->camera_motions = camera_motions; + main_clip_data->inlier_mask = inlier_mask_.get(); + main_clip_data->frame_diff = 1; + + main_clip_data->AllocateIRLSWeightStorage(true); + + LongFeatureInfo long_feature_info; + + if (irls_weights_preinitialized && + options_.filter_initialized_irls_weights()) { + MinFilterIrlsWeightByTrack(main_clip_data); + } + + // Determine importance for each track length. + std::vector track_length_importance; + if (options_.long_feature_initialization().activated()) { + for (const auto feature_list_ptr : *feature_lists) { + if (feature_list_ptr->long_tracks()) { + long_feature_info.AddFeatures(*feature_list_ptr); + } + } + + const float percentile = + options_.long_feature_initialization().min_length_percentile(); + const float min_length = long_feature_info.GlobalTrackLength(percentile); + + track_length_importance.resize(num_frames + 1, 1.0f); + // Gaussian weighting. 
+ const float denom = -0.5f / (2.0f * 2.0f); // 2 frame stdev. + for (int k = 0; k <= num_frames; ++k) { + float weight = 1.0f; + if (k < min_length) { + weight = std::exp((k - min_length) * (k - min_length) * denom); + } + track_length_importance[k] = weight; + } + } + + int max_features = 0; + for (int f = 0; f < num_frames; ++f) { + // Initialize IRLS input. + const RegionFlowFeatureList& feature_list = + *(*main_clip_data->feature_lists)[f]; + + max_features = std::max(feature_list.feature_size(), max_features); + + std::vector& irls_weight_input = + main_clip_data->irls_weight_input[f]; + + if (irls_weights_preinitialized) { + GetRegionFlowFeatureIRLSWeights(feature_list, &irls_weight_input); + } else { + // Set to one. + irls_weight_input.resize(feature_list.feature_size(), 1.0f); + } + + // Note: To create visualizations of the prior stored in irls_weight_input, + // add a call to + // SetRegionFlowFeatureIRLSWeights(irls_weight_input, &feature_list); + // anywhere below and adopt irls_rounds to zero in options. + // This will effectively output identity motions with irls weights set + // to the priors here. + bool uniform_weights = !irls_weights_preinitialized; + bool use_full_prior = false; + + if (options_.long_feature_initialization().activated()) { + if (!feature_list.long_tracks()) { + LOG(ERROR) << "Requesting long feature initialization but " + << "input is not computed with long features."; + } else { + LongFeatureInitialization(feature_list, long_feature_info, + track_length_importance, &irls_weight_input); + uniform_weights = false; + use_full_prior = true; + } + } + + if (options_.feature_density_normalization()) { + FeatureDensityNormalization(feature_list, &irls_weight_input); + uniform_weights = false; + use_full_prior = true; + } + + GetHomographyIRLSCenterWeights(feature_list, + &main_clip_data->homog_irls_weight_input[f]); + + if (!uniform_weights) { + // Multiply homography weights by non-uniform irls input weights. 
+ std::vector& homg_weights = + main_clip_data->homog_irls_weight_input[f]; + + const int num_features = feature_list.feature_size(); + for (int k = 0; k < num_features; ++k) { + homg_weights[k] *= irls_weight_input[k]; + } + } + + main_clip_data->uniform_weight_input[f] = uniform_weights; + main_clip_data->use_full_prior[f] = use_full_prior; + } + + if (options_.estimation_policy() == + MotionEstimationOptions::JOINTLY_FROM_TRACKS) { + for (int k = 1; k < num_motion_models; ++k) { + SingleTrackClipData* curr_clip_data = &clip_datas[k]; + // Camera motions are simply a copy of the initialized main data. + curr_clip_data->motion_storage = *camera_motions; + curr_clip_data->feature_storage.resize(num_frames); + curr_clip_data->InitializeFromInternalStorage(); + + // No backup storage for non-main data (feature's are not output). + curr_clip_data->AllocateIRLSWeightStorage(false); + const int stride = options_.joint_track_estimation().motion_stride(); + curr_clip_data->frame_diff = stride * k; + + // Populate feature and motion storage. + for (int f = 0; f < num_frames; ++f) { + const int prev_frame = f - stride * k; + if (prev_frame < 0) { + CopyToEmptyFeatureList((*feature_lists)[f], + &curr_clip_data->feature_storage[f]); + } else { + // Determine features present in both frames along the long feature + // tracks. + std::vector source_idx; + IntersectRegionFlowFeatureList(*(*feature_lists)[prev_frame], + &FeatureLocation, (*feature_lists)[f], + &curr_clip_data->feature_storage[f], + &source_idx); + + curr_clip_data->irls_weight_input[f].reserve(source_idx.size()); + curr_clip_data->homog_irls_weight_input[f].reserve(source_idx.size()); + + // Copy weights. 
+ for (int idx : source_idx) { + curr_clip_data->irls_weight_input[f].push_back( + main_clip_data->irls_weight_input[f][idx]); + curr_clip_data->homog_irls_weight_input[f].push_back( + main_clip_data->homog_irls_weight_input[f][idx]); + } + } + } + } + } + + for (auto& clip_data : clip_datas) { + clip_data.CheckInitialization(); + } + + for (auto& clip_data : clip_datas) { + // Estimate AverageMotion magnitudes. + ParallelFor(0, num_frames, 1, + EstimateMotionIRLSInvoker( + MODEL_AVERAGE_MAGNITUDE, + 1, // Does not use irls. + true, // Compute stability. + CameraMotion::VALID, DefaultModelOptions(), this, + nullptr, // No prior weights. + nullptr, // No thread storage. + clip_data.feature_lists, clip_data.camera_motions)); + } + + // Order of estimation for motion models: + // Translation -> Linear Similarity -> Affine -> Homography -> Mixture + // Homography. + + // Estimate translations, regardless of stability of similarity. + EstimateMotionModels(MODEL_TRANSLATION, + CameraMotion::UNSTABLE, // Any but invalid. + DefaultModelOptions(), + nullptr, // No thread storage. + &clip_datas); + + // Estimate linear similarity, but only if translation was deemed stable. + EstimateMotionModels(MODEL_LINEAR_SIMILARITY, CameraMotion::VALID, + DefaultModelOptions(), + nullptr, // No thread storage. + &clip_datas); + + if (options_.project_valid_motions_down()) { + ProjectMotionsDown(MODEL_LINEAR_SIMILARITY, camera_motions); + } + + // Estimate affine, but only if similarity was deemed stable. + EstimateMotionModels(MODEL_AFFINE, CameraMotion::VALID, DefaultModelOptions(), + nullptr, // No thread storage. + &clip_datas); + + // Thread storage below is only used for homography or mixtures. + MotionEstimationThreadStorage thread_storage(options_, this, max_features); + + // Estimate homographies, only if similarity was deemed stable. 
+ EstimateMotionModels(MODEL_HOMOGRAPHY, CameraMotion::VALID, + DefaultModelOptions(), &thread_storage, &clip_datas); + + if (options_.project_valid_motions_down()) { + // If homography is unstable, then whatever was deemed stable got + // embedded here and is now down projected again. + ProjectMotionsDown(MODEL_HOMOGRAPHY, camera_motions); + } + + // Estimate mixtures. We attempt estimation as long as at least + // translation was estimated stable. + // Estimate mixtures across a spectrum a different regularizers, from the + // weakest to the most regularized one. + const int num_mixture_levels = options_.mixture_regularizer_levels(); + CHECK_LE(num_mixture_levels, 10) << "Only up to 10 mixtures are supported."; + + // Initialize to weakest regularizer. + float regularizer = options_.mixture_regularizer(); + + // Initialize with maximum of weakest regularized mixture. + float inlier_threshold_scale = + std::pow(static_cast(options_.mixture_regularizer_base()), + static_cast(options_.mixture_regularizer_levels() - 1)); + + bool base_mixture_estimated = false; + for (int m = 0; m < num_mixture_levels; ++m) { + EstimateModelOptions options = DefaultModelOptions(); + options.mixture_regularizer = regularizer; + options.mixture_inlier_threshold_scale = inlier_threshold_scale; + options.mixture_spectrum_index = m; + // Only check stability for weakest regularized mixture. + options.check_model_stability = m == 0; + // Estimate weakest mixture even if similarity was deemed unstable, higher + // mixtures only if weakest mixture was deemed stable. + const bool estimate_result = EstimateMotionModels( + MODEL_MIXTURE_HOMOGRAPHY, + m == 0 ? 
CameraMotion::UNSTABLE : CameraMotion::VALID, options, + &thread_storage, &clip_datas); + + if (m == 0) { + base_mixture_estimated = estimate_result; + } + + regularizer *= options_.mixture_regularizer_base(); + inlier_threshold_scale /= options_.mixture_regularizer_base(); + + // Preserve IRLS weights from the very first mixture (all stability is + // computed w.r.t. it). + if (base_mixture_estimated && m > 0) { + for (auto& clip_data : clip_datas) { + clip_data.RestoreWeightsFromBackup(CameraMotion::VALID); + } + } + } + + // Check that mixture spectrum has sufficient entries. + for (const CameraMotion& motion : *camera_motions) { + if (motion.mixture_homography_spectrum_size() > 0) { + CHECK_EQ(motion.mixture_homography_spectrum_size(), + options_.mixture_regularizer_levels()); + } + } + + IRLSWeightFilter(feature_lists); + + if (!options_.output_refined_irls_weights()) { + for (int f = 0; f < num_frames; ++f) { + RegionFlowFeatureList& feature_list = *(*feature_lists)[f]; + SetRegionFlowFeatureIRLSWeights(original_irls_weights[f], &feature_list); + } + } + + // Lift model type from INVALID for empty frames to VALID if requested. + if (options_.label_empty_frames_as_valid()) { + for (int f = 0; f < num_frames; ++f) { + CameraMotion& camera_motion = (*camera_motions)[f]; + if ((*feature_lists)[f]->feature_size() == 0) { + camera_motion.set_type(CameraMotion::VALID); + } + } + } +} + +MotionEstimation::EstimateModelOptions MotionEstimation::DefaultModelOptions() + const { + return EstimateModelOptions(options_); +} + +// In the following member refers to member in SingleTrackClipData. +// For each estimation invocation, irls weights of features are set from +// member irls_weight_input. +// Motion models are estimated from member feature_list and stored in +// member camera_motions. 
+bool MotionEstimation::EstimateMotionModels( + const MotionType& type, const CameraMotion::Type& max_unstable_type, + const EstimateModelOptions& model_options, + const MotionEstimationThreadStorage* thread_storage, + std::vector* clip_datas) const { + CHECK(clip_datas != nullptr); + + const int num_datas = clip_datas->size(); + if (num_datas == 0) { + return false; + } + + for (const auto& clip_data : *clip_datas) { + clip_data.CheckInitialization(); + } + + // Perform estimation across all frames for several total_rounds with each + // round having irls_per_round iterations. + int irls_per_round = 1; + int total_rounds = 1; + + PolicyToIRLSRounds(IRLSRoundsFromSettings(type), &total_rounds, + &irls_per_round); + + const int total_irls_rounds = irls_per_round * total_rounds; + + // Skip if nothing to estimate. + if (total_irls_rounds == 0) { + return false; + } + + // Setup each clip data for this estimation round. + for (auto& clip_data : *clip_datas) { + clip_data.SetupPriorWeights(irls_per_round); + // Clear flag from earlier model estimation. + clip_data.ClearFlagFromMotion(CameraMotion::FLAG_SINGULAR_ESTIMATION); + } + + if (options_.estimation_policy() != + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS) { + // Irls initialization for each list. + for (auto& clip_data : *clip_datas) { + IrlsInitialization(type, max_unstable_type, + -1, // all frames. + model_options, &clip_data); + } + + // Parallel estimation across frames. + for (int r = 0; r < total_rounds; ++r) { + // Setup, default decaying irls alphas. + std::vector irls_alphas(irls_per_round); + for (int k = 0; k < irls_per_round; ++k) { + irls_alphas[k] = + IRLSPriorWeight(r * irls_per_round + k, total_irls_rounds); + } + + for (auto& clip_data : *clip_datas) { + // Setup prior's alphas. 
+ for (auto& prior_weight : clip_data.prior_weights) { + if (prior_weight.use_full_prior) { + prior_weight.alphas.assign(irls_per_round, 1.0f); + } else { + prior_weight.alphas = irls_alphas; + } + + // Last iteration, irls_alpha is always zero to return actual error. + if (r + 1 == total_rounds) { + prior_weight.alphas.back() = 0.0; + } + } + + const bool last_round = r + 1 == total_rounds; + ParallelFor(0, clip_data.num_frames(), 1, + EstimateMotionIRLSInvoker( + type, irls_per_round, + last_round, // Compute stability on last round. + max_unstable_type, model_options, this, + &clip_data.prior_weights, thread_storage, + clip_data.feature_lists, clip_data.camera_motions)); + } + + if (options_.estimation_policy() == + MotionEstimationOptions::JOINTLY_FROM_TRACKS) { + EnforceTrackConsistency(clip_datas); + } + } + + if (model_options.check_model_stability) { + for (auto& clip_data : *clip_datas) { + CheckModelStability(type, max_unstable_type, + clip_data.irls_weight_backup, + clip_data.feature_lists, clip_data.camera_motions); + } + } + } else { + // Estimation policy == TEMPORAL_LONG_FEATURE_BIAS. + for (auto& clip_data : *clip_datas) { + EstimateMotionIRLSInvoker motion_invoker( + type, irls_per_round, + true, // Compute stability on last round. + max_unstable_type, model_options, this, &clip_data.prior_weights, + thread_storage, clip_data.feature_lists, clip_data.camera_motions); + + for (int round = 0; round < total_rounds; ++round) { + // Traverse frames in order. + for (int k = 0; k < clip_data.num_frames(); ++k) { + if (clip_data.feature_lists->at(k)->feature_size() > 0) { + CHECK(clip_data.feature_lists->at(k)->long_tracks()) + << "Estimation policy TEMPORAL_LONG_FEATURE_BIAS requires " + << "tracking with long tracks."; + } + + // First round -> initialize weights. 
+ if (round == 0) { + IrlsInitialization(type, max_unstable_type, k, model_options, + &clip_data); + + BiasLongFeatures(clip_data.feature_lists->at(k), type, + model_options, &clip_data.prior_weights[k]); + } + + if (clip_data.camera_motions->at(k).type() <= max_unstable_type) { + CHECK(clip_data.prior_weights[k].use_full_prior); + clip_data.prior_weights[k].alphas.assign(irls_per_round, 1.0f); + clip_data.prior_weights[k].alphas.back() = 0.0; + } + + // Compute per-frame motion. + motion_invoker(BlockedRange(k, k + 1, 1)); + + if (model_options.check_model_stability) { + CheckSingleModelStability(type, max_unstable_type, + clip_data.irls_weight_backup + ? &clip_data.irls_weight_backup->at(k) + : nullptr, + clip_data.feature_lists->at(k), + &clip_data.camera_motions->at(k)); + } + + if (clip_data.camera_motions->at(k).type() == CameraMotion::VALID) { + const bool remove_terminated_tracks = + total_rounds == 1 || (round == 0 && k == 0); + UpdateLongFeatureBias(type, model_options, remove_terminated_tracks, + round > 0, // Update irls observation. + clip_data.feature_lists->at(k)); + } + } + + // Update feature weights and priors for the next round. + for (int k = 0; k < clip_data.num_frames(); ++k) { + auto& feats = *clip_data.feature_lists->at(k); + auto& priors = clip_data.prior_weights[k].priors; + const int type_idx = model_options.IndexFromType(type); + const auto& bias_map = long_feature_bias_maps_[type_idx]; + + for (int l = 0; l < feats.feature_size(); ++l) { + auto iter = bias_map.find(feats.feature(l).track_id()); + if (iter != bias_map.end()) { + const float bias = iter->second.bias; + float irls = 1.0f / (bias + kIrlsEps); + if (irls < 1.0f) { + irls *= irls; // Downweight outliers even more. 
+ } + feats.mutable_feature(l)->set_irls_weight(irls); + priors[l] = irls; + } + } + } + } + } + } + return true; +} + +namespace { + +class DoesFeatureAgreeWithSimilarity { + public: + DoesFeatureAgreeWithSimilarity(const LinearSimilarityModel& similarity, + float inlier_threshold) + : similarity_(similarity), + sq_inlier_threshold_(inlier_threshold * inlier_threshold) {} + bool operator()(const RegionFlowFeature& feature) const { + Vector2_f lin_pt = LinearSimilarityAdapter::TransformPoint( + similarity_, FeatureLocation(feature)); + return (lin_pt - FeatureMatchLocation(feature)).Norm2() < + sq_inlier_threshold_; + } + + private: + LinearSimilarityModel similarity_; + float sq_inlier_threshold_; +}; + +} // namespace. + +class IrlsInitializationInvoker { + public: + IrlsInitializationInvoker( + const MotionEstimation::MotionType& type, int max_unstable_type, + const MotionEstimation::EstimateModelOptions& model_options, + const MotionEstimation* motion_estimation, + MotionEstimation::SingleTrackClipData* clip_data) + : type_(type), + max_unstable_type_(max_unstable_type), + model_options_(model_options), + motion_estimation_(motion_estimation), + clip_data_(clip_data) {} + + void operator()(const BlockedRange& range) const { + for (int frame = range.begin(); frame != range.end(); ++frame) { + const CameraMotion& camera_motion = (*clip_data_->camera_motions)[frame]; + RegionFlowFeatureList& feature_list = + *(*clip_data_->feature_lists)[frame]; + + // Don't process motions that are already deemed unstable. + // Keep last resulting weights around. + if (camera_motion.type() > max_unstable_type_) { + continue; + } + + // Backup irls weights, reset of weights occurs in CheckModelStability + // if model was deemed unstable. + if (clip_data_->irls_weight_backup) { + GetRegionFlowFeatureIRLSWeights( + feature_list, &(*clip_data_->irls_weight_backup)[frame]); + } + + // Should translation or similarity irls weights be initialized? 
+ // In this case, prior weights will be enforced during estimation. + const bool irls_initialization = + motion_estimation_->options_.irls_initialization().activated(); + + // Should (mixture) homography irls weights be initialized? + // In this case, prior weights are not necessarily enforced, to allow + // estimation to deviate from filtered result. + const bool use_only_lin_sim_inliers_for_homography = + motion_estimation_->options_ + .use_only_lin_sim_inliers_for_homography(); + + // Only seed priors if at least one round of translation estimation was + // performed, i.e. feature weights contain results of that estimation. + bool seed_priors_from_bias = + motion_estimation_->options_.estimation_policy() == + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS && + motion_estimation_->options_.long_feature_bias_options() + .seed_priors_from_bias() && + type_ >= MotionEstimation::MODEL_LINEAR_SIMILARITY; + + if (seed_priors_from_bias && + type_ == MotionEstimation::MODEL_LINEAR_SIMILARITY) { + // Check that translation variance is not too large, otherwise + // biasing from prior distorts results (think of zooming, rotation, + // etc.). + std::vector variances; + for (const auto& camera_motion : *clip_data_->camera_motions) { + variances.push_back(camera_motion.translation_variance()); + } + auto percentile = variances.begin() + variances.size() * 0.8f; + std::nth_element(variances.begin(), percentile, variances.end()); + const float variance = + *percentile / hypot(motion_estimation_->frame_width_, + motion_estimation_->frame_height_); + constexpr float kMaxTranslationVariance = 5e-3; + seed_priors_from_bias = variance < kMaxTranslationVariance; + } + + if (seed_priors_from_bias && + type_ == MotionEstimation::MODEL_HOMOGRAPHY) { + // Check strict inlier ratio; if there are sufficient inliers + // don't bias homography to better fit; otherwise bias for more + // rigidity. 
+ std::vector inlier_ratio; + for (const auto& camera_motion : *clip_data_->camera_motions) { + inlier_ratio.push_back( + camera_motion.similarity_strict_inlier_ratio()); + } + auto percentile = inlier_ratio.begin() + inlier_ratio.size() * 0.5f; + std::nth_element(inlier_ratio.begin(), percentile, inlier_ratio.end()); + // TODO: Revisit decision boundary after limiting + // motion estimation to selection. + constexpr float kMaxRatio = 0.7; + seed_priors_from_bias = *percentile < kMaxRatio; + } + + // Seed priors from previous estimation if requested. + if (seed_priors_from_bias) { + GetRegionFlowFeatureIRLSWeights( + feature_list, &clip_data_->prior_weights[frame].priors); + } + + // Initialize irls weights to their default values. + SetRegionFlowFeatureIRLSWeights(clip_data_->IrlsWeightInput(type_)[frame], + &feature_list); + + // Update weights based on type and options. + // Initialization step. + switch (type_) { + case MotionEstimation::MODEL_TRANSLATION: + if (irls_initialization) { + TranslationModel best_model; + if (motion_estimation_->GetTranslationIrlsInitialization( + &feature_list, model_options_, + camera_motion.average_magnitude(), clip_data_->inlier_mask, + &best_model)) { + // Successful initialization, update inlier mask and signal to + // use full prior. + clip_data_->prior_weights[frame].use_full_prior = true; + if (clip_data_->inlier_mask) { + clip_data_->inlier_mask->UpdateTranslation( + Vector2_f(best_model.dx(), best_model.dy())); + clip_data_->inlier_mask->UpdateMask(); + } + } else { + // Initialization failed, reset to original weights. + SetRegionFlowFeatureIRLSWeights( + clip_data_->IrlsWeightInput(type_)[frame], &feature_list); + + if (clip_data_->inlier_mask) { + // Unstable translation: Pretty bad motion here, should reset + // prior. 
+ clip_data_->inlier_mask->InitMask(); + } + } + } + break; + + case MotionEstimation::MODEL_LINEAR_SIMILARITY: + if (irls_initialization) { + LinearSimilarityModel best_model; + if (motion_estimation_->GetSimilarityIrlsInitialization( + &feature_list, model_options_, + camera_motion.average_magnitude(), clip_data_->inlier_mask, + &best_model)) { + // Successful initialization, update inlier mask and signal to + // use full prior. + clip_data_->prior_weights[frame].use_full_prior = true; + + if (clip_data_->inlier_mask) { + clip_data_->inlier_mask->UpdateMask(); + } + } else { + // Initialization failed, reset to original weights. + SetRegionFlowFeatureIRLSWeights( + clip_data_->IrlsWeightInput(type_)[frame], &feature_list); + } + } + break; + + default: + break; + } + + // Filtering step. + switch (type_) { + case MotionEstimation::MODEL_HOMOGRAPHY: + if (use_only_lin_sim_inliers_for_homography && + camera_motion.type() <= CameraMotion::UNSTABLE_SIM) { + // Threshold is set w.r.t. normalized frame diameter. + LinearSimilarityModel normalized_similarity = + ModelCompose3(motion_estimation_->normalization_transform_, + camera_motion.linear_similarity(), + motion_estimation_->inv_normalization_transform_); + FilterRegionFlowFeatureList( + DoesFeatureAgreeWithSimilarity( + normalized_similarity, + motion_estimation_->options_.lin_sim_inlier_threshold()), + kOutlierIRLSWeight, &feature_list); + } + break; + + case MotionEstimation::MODEL_MIXTURE_HOMOGRAPHY: + if (use_only_lin_sim_inliers_for_homography && + camera_motion.type() <= CameraMotion::UNSTABLE_SIM) { + // Threshold is set w.r.t. normalized frame diameter. + LinearSimilarityModel normalized_similarity = + ModelCompose3(motion_estimation_->normalization_transform_, + camera_motion.linear_similarity(), + motion_estimation_->inv_normalization_transform_); + + // Linear similarity is a rigid motion model, only reject severe + // outliers (4 times motion magnitude, 1.5 times rejection + // threshold). 
+ const float irls_residual_scale = + motion_estimation_->GetIRLSResidualScale( + camera_motion.average_magnitude(), + motion_estimation_->options_.irls_mixture_fraction_scale() * + motion_estimation_->options_ + .irls_motion_magnitude_fraction()); + + const float inlier_threshold = + model_options_.mixture_inlier_threshold_scale * + motion_estimation_->options_.lin_sim_inlier_threshold() / + irls_residual_scale; + + FilterRegionFlowFeatureList( + DoesFeatureAgreeWithSimilarity(normalized_similarity, + inlier_threshold), + kOutlierIRLSWeight, &feature_list); + } + break; + + default: + break; + } + + const bool use_prior_weights = + !clip_data_->uniform_weight_input[frame] || irls_initialization; + + // Initialize priors from irls weights. + if (use_prior_weights) { + CHECK_LT(frame, clip_data_->prior_weights.size()); + + if (clip_data_->prior_weights[frame].priors.empty()) { + clip_data_->prior_weights[frame].priors.resize( + feature_list.feature_size(), 1.0f); + } + + if (seed_priors_from_bias) { + std::vector multiply; + GetRegionFlowFeatureIRLSWeights(feature_list, &multiply); + for (int l = 0; l < multiply.size(); ++l) { + clip_data_->prior_weights[frame].priors[l] *= multiply[l]; + } + } else { + GetRegionFlowFeatureIRLSWeights( + feature_list, &clip_data_->prior_weights[frame].priors); + } + } + } + } + + private: + MotionEstimation::MotionType type_; + int max_unstable_type_; + const MotionEstimation::EstimateModelOptions& model_options_; + const MotionEstimation* motion_estimation_; + MotionEstimation::SingleTrackClipData* clip_data_; +}; + +void MotionEstimation::LongFeatureInitialization( + const RegionFlowFeatureList& feature_list, + const LongFeatureInfo& feature_info, + const std::vector& track_length_importance, + std::vector* irls_weights) const { + CHECK(irls_weights); + const int num_features = feature_list.feature_size(); + if (num_features == 0) { + return; + } + + CHECK_EQ(num_features, irls_weights->size()); + + // Determine actual scale to be 
applied to each feature. + std::vector feature_scales(num_features); + constexpr float kTrackLengthImportance = 0.5f; + int num_upweighted = 0; // Count number of upweighted features. + for (int k = 0; k < num_features; ++k) { + const int track_len = feature_info.TrackLength(feature_list.feature(k)); + const float track_len_scale = track_length_importance[track_len]; + if (track_len_scale >= kTrackLengthImportance) { + ++num_upweighted; + } + + feature_scales[k] = track_len_scale; + } + + // Use full upweighting above kMinFraction of upweighted features. + constexpr float kMinFraction = 0.1f; + const float upweight_multiplier = + options_.long_feature_initialization().upweight_multiplier() * + std::min(1.0f, num_upweighted / (num_features * kMinFraction)); + + for (int k = 0; k < num_features; ++k) { + // Never downweight. + (*irls_weights)[k] *= + std::max(1.0f, feature_scales[k] * upweight_multiplier); + } +} + +void MotionEstimation::FeatureDensityNormalization( + const RegionFlowFeatureList& feature_list, + std::vector* irls_weights) const { + CHECK(irls_weights); + const int num_features = feature_list.feature_size(); + CHECK_EQ(num_features, irls_weights->size()); + + // Compute mask index for each feature. + std::vector bin_indices; + bin_indices.reserve(num_features); + + const int mask_size = options_.feature_mask_size(); + const int max_bins = mask_size * mask_size; + + std::vector bin_normalizer(max_bins, 0.0f); + + const Vector2_f domain = NormalizedDomain(); + const float scale_x = (mask_size - 1) / domain.x(); + const float scale_y = (mask_size - 1) / domain.y(); + + // Interpolate location into adjacent bins. 
+ for (const auto& feature : feature_list.feature()) { + const float grid_y = feature.y() * scale_y; + const float grid_x = feature.x() * scale_x; + + const int int_grid_x = grid_x; + const int int_grid_y = grid_y; + + const float dx = grid_x - int_grid_x; + const float dy = grid_y - int_grid_y; + const float dxdy = dx * dy; + const float dx_plus_dy = dx + dy; + + const int inc_x = dx != 0; + const int inc_y = dy != 0; + + int bin_idx = int_grid_y * mask_size + int_grid_x; + // (1 - dx)(1 - dy) = 1 - (dx + dy) + dx*dy + bin_normalizer[bin_idx] += 1 - dx_plus_dy + dxdy; + // dx * (1 - dy) = dx - dxdy + bin_normalizer[bin_idx + inc_x] += dx - dxdy; + + bin_idx += mask_size * inc_y; + // (1 - dx) * dy = dy - dxdy + bin_normalizer[bin_idx] += dy - dxdy; + bin_normalizer[bin_idx + inc_x] += dxdy; + } + + // Get normalization for each feature. + float avg_normalizer = 0.0f; + for (int k = 0; k < num_features; ++k) { + const RegionFlowFeature& feature = feature_list.feature(k); + const float grid_y = feature.y() * scale_y; + const float grid_x = feature.x() * scale_x; + + const int int_grid_x = grid_x; + const int int_grid_y = grid_y; + + const float dx = grid_x - int_grid_x; + const float dy = grid_y - int_grid_y; + const float dxdy = dx * dy; + const float dx_plus_dy = dx + dy; + + int inc_x = dx != 0; + int inc_y = dy != 0; + + float normalizer = 0; + int bin_idx = int_grid_y * mask_size + int_grid_x; + CHECK_LT(bin_idx, max_bins); + // See above. + normalizer += bin_normalizer[bin_idx] * (1 - dx_plus_dy + dxdy); + normalizer += bin_normalizer[bin_idx + inc_x] * (dx - dxdy); + + bin_idx += mask_size * inc_y; + CHECK_LT(bin_idx, max_bins); + normalizer += bin_normalizer[bin_idx] * (dy - dxdy); + normalizer += bin_normalizer[bin_idx + inc_x] * dxdy; + + const float inv_normalizer = + normalizer > 0 ? 
1.0f / std::sqrt(static_cast(normalizer)) : 0; + avg_normalizer += inv_normalizer; + + (*irls_weights)[k] *= inv_normalizer; + } + + const float scale = num_features / (avg_normalizer + 1e-6f); + + // Normalize such that average scale is 1.0f. + for (int k = 0; k < num_features; ++k) { + (*irls_weights)[k] *= scale; + } +} + +void MotionEstimation::IrlsInitialization( + const MotionType& type, int max_unstable_type, int frame, + const EstimateModelOptions& model_options, + SingleTrackClipData* clip_data) const { + if (options_.estimation_policy() == + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS) { + CHECK_NE(frame, -1) << "Only per frame processing for this policy " + << "supported."; + } + + IrlsInitializationInvoker invoker(type, max_unstable_type, model_options, + this, clip_data); + + if (frame == -1) { + // Function pointer to select between SerialFor and ParallelFor based on + // EstimationPolicy. + typedef void (*ForFunctionPtr)(size_t, size_t, size_t, + const IrlsInitializationInvoker&); + + ForFunctionPtr for_function = &ParallelFor; + + // Inlier mask only used for translation or linear similarity. + // In that case, initialization needs to proceed serially. + if (type == MODEL_TRANSLATION || type == MODEL_LINEAR_SIMILARITY) { + if (clip_data->inlier_mask != nullptr) { + for_function = &SerialFor; + } + } + + for_function(0, clip_data->num_frames(), 1, invoker); + } else { + CHECK_GE(frame, 0); + CHECK_LT(frame, clip_data->num_frames()); + invoker(BlockedRange(frame, frame + 1, 1)); + } +} + +// Helper class for parallel irls weight filtering. +// This filters per-frame pair, across multiple models, not across time. +class TrackFilterInvoker { + public: + explicit TrackFilterInvoker( + std::vector* clip_datas) + : clip_datas_(clip_datas) {} + + void operator()(const BlockedRange& range) const { + for (int f = range.begin(); f != range.end(); ++f) { + // Gather irls weights for each track. 
+      absl::node_hash_map<int, std::vector<float>> track_weights;
+      for (auto& clip_data : *clip_datas_) {
+        for (const auto& feature : (*clip_data.feature_lists)[f]->feature()) {
+          track_weights[feature.track_id()].push_back(feature.irls_weight());
+        }
+      }
+
+      // Min filter across weights, store in first element.
+      int match_sum = 0;
+      for (auto& entry : track_weights) {
+        match_sum += entry.second.size();
+        entry.second[0] =
+            *std::min_element(entry.second.begin(), entry.second.end());
+      }
+
+      // Apply.
+      for (auto& clip_data : *clip_datas_) {
+        for (auto& feature :
+             *(*clip_data.feature_lists)[f]->mutable_feature()) {
+          feature.set_irls_weight(track_weights[feature.track_id()][0]);
+        }
+      }
+    }
+  }
+
+ private:
+  std::vector<MotionEstimation::SingleTrackClipData>* clip_datas_;
+};
+
+void MotionEstimation::MinFilterIrlsWeightByTrack(
+    SingleTrackClipData* clip_data) const {
+  // Gather irls weights for each track.
+  std::unordered_map<int, std::vector<float>> track_weights;
+
+  const int num_frames = clip_data->feature_lists->size();
+  for (int f = 0; f < num_frames; ++f) {
+    for (const auto& feature : clip_data->feature_lists->at(f)->feature()) {
+      track_weights[feature.track_id()].push_back(feature.irls_weight());
+    }
+  }
+
+  // Robust min filter across weights, store in first element; here 20th
+  // percentile.
+  for (auto& entry : track_weights) {
+    if (entry.second.size() > 1) {
+      auto robust_min =
+          entry.second.begin() + std::ceil(entry.second.size() * 0.2f);
+
+      std::nth_element(entry.second.begin(), robust_min, entry.second.end());
+      entry.second[0] = *robust_min;
+    }
+  }
+
+  // Apply.
+  for (int f = 0; f < num_frames; ++f) {
+    for (auto& feature : *clip_data->feature_lists->at(f)->mutable_feature()) {
+      feature.set_irls_weight(track_weights[feature.track_id()][0]);
+    }
+  }
+}
+
+void MotionEstimation::EnforceTrackConsistency(
+    std::vector<SingleTrackClipData>* clip_datas) const {
+  CHECK(clip_datas != nullptr);
+  if (clip_datas->empty()) {
+    return;
+  }
+
+  // Traverse each frame, filter across clip_datas.
+ const int num_frames = (*clip_datas)[0].num_frames(); + + // Map track id to weights. + SerialFor(0, num_frames, 1, TrackFilterInvoker(clip_datas)); + + if (!options_.joint_track_estimation().temporal_smoothing()) { + return; + } + + // Temporal smoothing in time for each clip data. + for (auto& clip_data : *clip_datas) { + // Bilateral 1D filter across irls weights for each track. + // Gather irls weights across all tracks. + absl::node_hash_map> track_irls_weights; + + for (RegionFlowFeatureList* feature_list : *clip_data.feature_lists) { + for (const auto& feature : feature_list->feature()) { + track_irls_weights[feature.track_id()].push_back(feature.irls_weight()); + } + } + + // Filter weights for each track, store results in map below. + for (auto& iter : track_irls_weights) { + SmoothIRLSWeights(&iter.second); + } + + // Set filtered weight. + for (RegionFlowFeatureList* feature_list : *clip_data.feature_lists) { + for (auto& feature : *feature_list->mutable_feature()) { + feature.set_irls_weight(track_irls_weights[feature.track_id()].front()); + track_irls_weights[feature.track_id()].pop_front(); + } + } + } +} + +void MotionEstimation::BiasFromFeatures( + const RegionFlowFeatureList& feature_list, MotionType type, + const EstimateModelOptions& model_options, std::vector* bias) const { + CHECK(bias); + const int num_features = feature_list.feature_size(); + bias->resize(num_features); + + int feature_idx = 0; + const int type_idx = model_options.IndexFromType(type); + auto& bias_map = long_feature_bias_maps_[type_idx]; + constexpr float kMinBias = 0.1f; + + for (const auto& feature : feature_list.feature()) { + // Is feature present? + auto iter = bias_map.find(feature.track_id()); + if (iter != bias_map.end()) { + const float current_bias_bin = + iter->second.bias * feature_bias_lut_.bias_weight_scale; + + // Never bias 100% towards old value, + // allow for new values to propagate. + // Downweight outliers but do not upweight inliers. 
+ if (current_bias_bin >= feature_bias_lut_.bias_weight_lut.size()) { + (*bias)[feature_idx] = kMinBias; + } else { + (*bias)[feature_idx] = std::max( + kMinBias, feature_bias_lut_.bias_weight_lut[current_bias_bin]); + } + } else { + // TODO: This should be some kind of average of all the other + // feature's bias; such that new features can't overpower old ones + // (e.g. on tracking errors). + (*bias)[feature_idx] = 1.0f; + } + + ++feature_idx; + } +} + +void MotionEstimation::BiasLongFeatures( + RegionFlowFeatureList* feature_list, MotionType type, + const EstimateModelOptions& model_options, + PriorFeatureWeights* prior_weights) const { + CHECK(prior_weights); + CHECK(feature_list); + + // Don't bias duplicated frames -> should be identity transform. + if (feature_list->is_duplicated()) { + return; + } + + // TODO: Rename, it is not the bias (= error), but a weight in + // [0, 1] to condition features. + std::vector bias; + BiasFromFeatures(*feature_list, type, model_options, &bias); + + // Bias along long tracks. + if (!prior_weights->use_full_prior) { + LOG_IF(WARNING, + []() { + static int k = 0; + return k++ < 2; + }()) + << "Use full prior overridden to true, no initialization used. 
" + << "Atypical usage."; + prior_weights->use_full_prior = true; + } + + const int num_features = feature_list->feature_size(); + if (prior_weights->priors.empty() && num_features > 0) { + LOG(WARNING) << "BiasLongFeatures without using IrlsOutlierInitialization " + << "or LongFeatureInitialization."; + prior_weights->priors.resize(num_features, 1.0f); + } + + CHECK_EQ(num_features, prior_weights->priors.size()); + for (int k = 0; k < num_features; ++k) { + prior_weights->priors[k] *= bias[k]; + auto* feature = feature_list->mutable_feature(k); + feature->set_irls_weight(feature->irls_weight() * bias[k]); + } +} + +void MotionEstimation::ComputeSpatialBias( + MotionType type, const EstimateModelOptions& model_options, + RegionFlowFeatureList* feature_list, SpatialBiasMap* spatial_bias) const { + const auto& bias_options = options_.long_feature_bias_options(); + const int type_idx = model_options.IndexFromType(type); + const auto& bias_map = long_feature_bias_maps_[type_idx]; + + // Select all features that are not marked to be ignored (irls weight of + // zero). + RegionFlowFeatureView feature_view; + SelectFeaturesFromList( + [](const RegionFlowFeature& feature) -> bool { + return feature.irls_weight() != 0; + }, + feature_list, &feature_view); + + const int num_features = feature_view.size(); + + std::vector> feature_taps_3; + std::vector> feature_grids; + + // Create grid to seed newly observed features with an appropiate bias. + BuildFeatureGrid(NormalizedDomain().x(), NormalizedDomain().y(), + bias_options.grid_size(), {feature_view}, FeatureLocation, + &feature_taps_3, nullptr, nullptr, &feature_grids); + CHECK_EQ(1, feature_grids.size()); + const FeatureGrid& single_grid = feature_grids[0]; + + const float long_track_threshold = bias_options.long_track_threshold(); + + // Traverse bins. + for (int k = 0; k < single_grid.size(); ++k) { + for (auto feature_ptr : single_grid[k]) { // Traverse each bin. 
+ float total_weight = 0.0f; + float weighted_bias = 0.0f; + + // Counts all neighbors considered (including itself). + int total_neighbors = 0; + + // Counts approximately how many similar looking long feature tracks + // are neighbors. + float similar_long_tracks = 0; + for (int neighbor_bin : feature_taps_3[k]) { + for (auto neighbor_ptr : single_grid[neighbor_bin]) { + ++total_neighbors; + auto iter = bias_map.find(neighbor_ptr->track_id()); + float neighbor_bias = 0; + int num_observations = 0; + + if (iter != bias_map.end()) { + neighbor_bias = iter->second.bias; + num_observations = iter->second.total_observations; + } else { + // If new track use estimated irls weight. + neighbor_bias = 1.0f / neighbor_ptr->irls_weight(); + num_observations = 1; + } + + const float distance = + (FeatureLocation(*feature_ptr) - FeatureLocation(*neighbor_ptr)) + .Norm(); + const float spatial_weight = + feature_bias_lut_ + .spatial_lut[distance * feature_bias_lut_.spatial_scale]; + + const float color_distance = + RegionFlowFeatureDistance(feature_ptr->feature_descriptor(), + neighbor_ptr->feature_descriptor()); + + const float color_weight = + feature_bias_lut_ + .color_lut[color_distance * feature_bias_lut_.color_scale]; + + if (num_observations >= long_track_threshold) { + // Count similar looking tracks (weights are normalized such that + // identical looking track has color_weight of 1.0. + // Scale by length of track (limited to kMaxTrackScale). + constexpr float kMaxTrackScale = 3.0f; + similar_long_tracks += + color_weight * + std::min(kMaxTrackScale, + num_observations / long_track_threshold); + } + + const float weight = spatial_weight * color_weight; + + total_weight += weight; + weighted_bias += neighbor_bias * weight; + } + } + + DCHECK(spatial_bias->find(feature_ptr->track_id()) == + spatial_bias->end()); + + // Threshold such that few similar tracks do not count. + // Set to 0.25% of features. 
+ if (similar_long_tracks < 2.5e-3 * num_features) { + similar_long_tracks = 0; + } + + // Cutoff for features that do not have any matching neighbors. + // In that case fallback to feature's irls weight. + if (total_weight > total_neighbors * 1e-4f) { + // Note: Considered doing minimum of bias and irls weight + // but this leads to instable IRLS weights very similar + // to independent estimation. + const float norm_bias = weighted_bias / total_weight; + (*spatial_bias)[feature_ptr->track_id()] = + std::make_pair(norm_bias, similar_long_tracks); + } else { + (*spatial_bias)[feature_ptr->track_id()] = std::make_pair( + 1.0f / feature_ptr->irls_weight(), similar_long_tracks); + } + } + } +} + +void MotionEstimation::UpdateLongFeatureBias( + MotionType type, const EstimateModelOptions& model_options, + bool remove_terminated_tracks, bool update_irls_observation, + RegionFlowFeatureList* feature_list) const { + const int type_idx = model_options.IndexFromType(type); + auto& bias_map = long_feature_bias_maps_[type_idx]; + + constexpr int kMaxDuplicatedFrames = 2; + int& model_duplicate_frames = num_duplicate_frames_[type_idx]; + if (feature_list->is_duplicated()) { + ++model_duplicate_frames; + } else { + model_duplicate_frames = 0; + } + + // Do not update bias from duplicated frames. We consider any duplicated + // frames > kMaxDuplicatedFrames as a totally static camera, in which case the + // update is desired. + if (model_duplicate_frames > 0 && + model_duplicate_frames <= kMaxDuplicatedFrames) { + for (auto& feature : *feature_list->mutable_feature()) { + auto iter = bias_map.find(feature.track_id()); + if (iter != bias_map.end() && feature.irls_weight() > 0) { + // Restore bias from last non-duplicated observation. 
+ feature.set_irls_weight(1.0f / (iter->second.bias + kIrlsEps)); + } + } + return; + } + + const auto& bias_options = options_.long_feature_bias_options(); + const int num_irls_observations = bias_options.num_irls_observations(); + + CHECK_GT(num_irls_observations, 0) << "Specify value > 0"; + const float inv_num_irls_observations = 1.0f / num_irls_observations; + + SpatialBiasMap spatial_bias; + if (bias_options.use_spatial_bias()) { + ComputeSpatialBias(type, model_options, feature_list, &spatial_bias); + } else { + // Just populate bias with inverse irls error. + for (const auto& feature : feature_list->feature()) { + spatial_bias[feature.track_id()] = + std::make_pair(1.0f / feature.irls_weight(), 0); + } + } + + // Tracks current ids in this frame. + std::unordered_set curr_track_ids; + + // Scale applied to irls weight for linear interpolation between inlier and + // outlier bias. + CHECK_GT(bias_options.inlier_irls_weight(), 0); + const float irls_scale = 1.0f / bias_options.inlier_irls_weight(); + const float long_track_scale = + 1.0f / bias_options.long_track_confidence_fraction(); + + for (auto& feature : *feature_list->mutable_feature()) { + if (remove_terminated_tracks) { + curr_track_ids.insert(feature.track_id()); + } + + // Skip features that are marked as not to be processed. + if (feature.irls_weight() == 0) { + continue; + } + + auto iter = bias_map.find(feature.track_id()); + + // Is LongFeatureBias present? + if (iter != bias_map.end()) { + // Get minimum across last k observation. + constexpr size_t lastK = 3; + + const std::vector& irls_values = iter->second.irls_values; + const float last_min = *std::min_element( + irls_values.end() - std::min(irls_values.size(), lastK), + irls_values.end()); + + const float curr_irls_weight = feature.irls_weight(); + + // Clamp weights for ratio computation (count major outliers and inliers + // as regular ones). Set to range of errors between 0.5 pixels to 25 + // pixels. 
+ const float last_min_clamped = std::max(0.04f, std::min(last_min, 2.0f)); + + const float curr_irls_clamped = + std::max(0.04f, std::min(curr_irls_weight, 2.0f)); + + // Only checking for change from outlier to inlier here. + // The reverse case inlier -> outlier is addressed by bias + // blending below. Denominator is guaranteed to be > 0. + float irls_ratio = curr_irls_clamped / last_min_clamped; + + if (irls_ratio > bias_options.max_irls_change_ratio()) { + // Reset feature and start again. + bias_map[feature.track_id()] = + LongFeatureBias(spatial_bias[feature.track_id()].first); + continue; + } + + ++iter->second.total_observations; + + // Compute median. + std::vector irls_values_copy = iter->second.irls_values; + auto median = irls_values_copy.begin() + irls_values_copy.size() / 2; + std::nth_element(irls_values_copy.begin(), median, + irls_values_copy.end()); + + // By default shorter observations are given less prior, except if + // sufficient long tracks were used during seeding. + float prior_weight = + std::max(std::min(1.0f, spatial_bias[feature.track_id()].second * + long_track_scale), + (irls_values_copy.size() * inv_num_irls_observations)); + + const float alpha = std::min(1.0f, *median * irls_scale) * prior_weight; + const float bias = (alpha * bias_options.inlier_bias() + + (1.0f - alpha) * bias_options.outlier_bias()); + + // Bias is weighted by median IRLS weight (foreground / outliers) + // can override bias faster. Similar if only few IRLS values have + // been observed, update bias faster. + const float biased_weight = + bias * iter->second.bias + + (1.0f - bias) * (1.0f / feature.irls_weight()); + + // Update weight. + iter->second.bias = biased_weight; + + std::vector& irls_values_ref = iter->second.irls_values; + if (!update_irls_observation) { + irls_values_ref.push_back(feature.irls_weight()); + if (irls_values_ref.size() > num_irls_observations) { + irls_values_ref.erase(irls_values_ref.begin()); + } + } else { + // Update. 
+ // TODO: Sure about this? This is the error after + // estimation, but also biased toward previous solution. + irls_values_ref.back() = feature.irls_weight(); + } + + // Update feature's weight as well. + feature.set_irls_weight(1.0f / (biased_weight + kIrlsEps)); + } else { + CHECK(!update_irls_observation) << "Should never happen on >= 2nd round"; + + // Not present, reset to spatial bias. + const float biased_weight = spatial_bias[feature.track_id()].first; + bias_map[feature.track_id()] = LongFeatureBias(biased_weight); + feature.set_irls_weight(1.0f / (biased_weight + kIrlsEps)); + } + } + + // Remove terminated tracks. + if (remove_terminated_tracks) { + std::vector tracks_to_be_removed; + for (const auto& entry : bias_map) { + if (curr_track_ids.find(entry.first) == curr_track_ids.end()) { + tracks_to_be_removed.push_back(entry.first); + } + } + + for (auto id : tracks_to_be_removed) { + bias_map.erase(id); + } + } +} + +void MotionEstimation::SmoothIRLSWeights(std::deque* irls) const { + CHECK(irls != nullptr); + if (irls->empty()) { + return; + } + + const float sigma_space = 7.0f; + const float sigma_signal = 0.5f; + + // Account for 90% of the data. + const int radius = 1.65f * sigma_space + 0.5f; + const int diameter = 2 * radius + 1; + const int num_irls = irls->size(); + + // Calculate spatial weights; + std::vector weights(diameter); + const float space_coeff = -0.5f / (sigma_space * sigma_space); + for (int i = -radius; i <= radius; ++i) { + weights[i + radius] = std::exp(space_coeff * i * i); + } + + // Map weights to error. + std::vector error(num_irls + 2 * radius); + for (int k = 0; k < num_irls; ++k) { + error[radius + k] = 1.0f / ((*irls)[k] + 1e-6f); + } + + // Copy border (right hand side). + std::copy(error.rbegin() + radius, error.rbegin() + 2 * radius, + error.end() - radius); + // Left hand side. + std::copy(error.begin() + radius, error.begin() + 2 * radius, error.begin()); + + // Bilateral filter. 
+ const float signal_coeff = (-0.5f / (sigma_signal * sigma_signal)); + for (int i = 0; i < num_irls; ++i) { + const float curr_val = error[i + radius]; + float val_sum = 0; + float weight_sum = 0; + for (int k = 0; k < diameter; ++k) { + const float value = error[i + k]; + const float diff = value - curr_val; + const float weight = + weights[k] * + std::exp(static_cast(diff * diff * signal_coeff)); + weight_sum += weight; + val_sum += value * weight; + } + + // Result is val_sum / weight_sum, as irls is inverse of error, result is: + // weight_sum / val_sum. + if (val_sum != 0) { + (*irls)[i] = weight_sum / val_sum; + } + } +} + +int MotionEstimation::IRLSRoundsFromSettings(const MotionType& type) const { + const int irls_rounds = options_.irls_rounds(); + switch (type) { + case MODEL_AVERAGE_MAGNITUDE: + LOG(WARNING) << "Called with irls free motion type. Returning zero."; + return 0; + + case MODEL_TRANSLATION: + if (options_.estimate_translation_irls()) { + return irls_rounds; + } else { + return 1; // Always do at least L2 for translation. 
+ } + break; + + case MODEL_LINEAR_SIMILARITY: + switch (options_.linear_similarity_estimation()) { + case MotionEstimationOptions::ESTIMATION_LS_NONE: + return 0; + + case MotionEstimationOptions::ESTIMATION_LS_L2: + return 1; + + case MotionEstimationOptions::ESTIMATION_LS_IRLS: + return irls_rounds; + + case MotionEstimationOptions::ESTIMATION_LS_L2_RANSAC: + case MotionEstimationOptions::ESTIMATION_LS_L1: + LOG(FATAL) << "Deprecated options, use ESTIMATION_LS_IRLS instead."; + return -1; + } + break; + + case MODEL_AFFINE: + switch (options_.affine_estimation()) { + case MotionEstimationOptions::ESTIMATION_AFFINE_NONE: + return 0; + + case MotionEstimationOptions::ESTIMATION_AFFINE_L2: + return 1; + + case MotionEstimationOptions::ESTIMATION_AFFINE_IRLS: + return irls_rounds; + } + break; + + case MODEL_HOMOGRAPHY: + switch (options_.homography_estimation()) { + case MotionEstimationOptions::ESTIMATION_HOMOG_NONE: + return 0; + + case MotionEstimationOptions::ESTIMATION_HOMOG_L2: + return 1; + + case MotionEstimationOptions::ESTIMATION_HOMOG_IRLS: + return irls_rounds; + } + break; + + case MODEL_MIXTURE_HOMOGRAPHY: + switch (options_.mix_homography_estimation()) { + case MotionEstimationOptions::ESTIMATION_HOMOG_MIX_NONE: + return 0; + + case MotionEstimationOptions::ESTIMATION_HOMOG_MIX_L2: + return 1; + + case MotionEstimationOptions::ESTIMATION_HOMOG_MIX_IRLS: + return irls_rounds; + } + break; + + case MODEL_NUM_VALUES: + LOG(FATAL) << "Function should never be called with this value"; + break; + } + + LOG(FATAL) << "All branches above return, execution can not reach this point"; + return -1; +} + +void MotionEstimation::PolicyToIRLSRounds(int irls_rounds, int* total_rounds, + int* irls_per_round) const { + CHECK(total_rounds != nullptr); + CHECK(irls_per_round != nullptr); + + // Small optimization: irls_rounds == 0 -> total_rounds = 0 regardless of + // settings. 
+ if (irls_rounds == 0) { + *total_rounds = 0; + *irls_per_round = 0; + return; + } + + switch (options_.estimation_policy()) { + case MotionEstimationOptions::INDEPENDENT_PARALLEL: + case MotionEstimationOptions::TEMPORAL_IRLS_MASK: + *irls_per_round = irls_rounds; + *total_rounds = 1; + break; + + case MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS: + *total_rounds = options_.long_feature_bias_options().total_rounds(); + *irls_per_round = irls_rounds; + break; + + case MotionEstimationOptions::JOINTLY_FROM_TRACKS: + *irls_per_round = 1; + *total_rounds = irls_rounds; + break; + } +} + +void MotionEstimation::CheckModelStability( + const MotionType& type, const CameraMotion::Type& max_unstable_type, + const std::vector>* reset_irls_weights, + std::vector* feature_lists, + std::vector* camera_motions) const { + CHECK(feature_lists != nullptr); + CHECK(camera_motions != nullptr); + const int num_frames = feature_lists->size(); + if (reset_irls_weights) { + DCHECK_EQ(num_frames, reset_irls_weights->size()); + } + DCHECK_EQ(num_frames, camera_motions->size()); + + for (int f = 0; f < num_frames; ++f) { + CameraMotion& camera_motion = (*camera_motions)[f]; + RegionFlowFeatureList* feature_list = (*feature_lists)[f]; + + const std::vector* reset_irls_weight = + reset_irls_weights ? &reset_irls_weights->at(f) : nullptr; + CheckSingleModelStability(type, max_unstable_type, reset_irls_weight, + feature_list, &camera_motion); + } +} + +// Order of estimation is: +// Translation -> Linear Similarity -> Affine -> Homography -> Mixture +// Homography. 
+void MotionEstimation::CheckSingleModelStability( + const MotionType& type, const CameraMotion::Type& max_unstable_type, + const std::vector* reset_irls_weights, + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const { + if (camera_motion->type() > max_unstable_type) { + return; + } + + switch (type) { + case MODEL_AVERAGE_MAGNITUDE: + LOG(WARNING) << "Nothing to check for requested model type."; + return; + + case MODEL_TRANSLATION: + if (IsStableTranslation(camera_motion->translation(), + camera_motion->translation_variance(), + *feature_list)) { + // Translation can never be singular. + CHECK_EQ( + 0, camera_motion->flags() & CameraMotion::FLAG_SINGULAR_ESTIMATION); + } else { + // Invalid model. + if (reset_irls_weights) { + SetRegionFlowFeatureIRLSWeights(*reset_irls_weights, feature_list); + } + ResetMotionModels(options_, camera_motion); + } + break; + + case MODEL_LINEAR_SIMILARITY: { + const int num_inliers = + std::round(feature_list->feature_size() * + camera_motion->similarity_inlier_ratio()); + if (camera_motion->flags() & CameraMotion::FLAG_SINGULAR_ESTIMATION || + !IsStableSimilarity(camera_motion->linear_similarity(), *feature_list, + num_inliers)) { + if (reset_irls_weights) { + SetRegionFlowFeatureIRLSWeights(*reset_irls_weights, feature_list); + } + + ResetToTranslation(camera_motion->translation(), camera_motion); + } + break; + } + + case MODEL_AFFINE: + // Not implemented, nothing to check here. 
+ break; + + case MODEL_HOMOGRAPHY: + if (camera_motion->flags() & CameraMotion::FLAG_SINGULAR_ESTIMATION || + !IsStableHomography(camera_motion->homography(), + camera_motion->average_homography_error(), + camera_motion->homography_inlier_coverage())) { + if (reset_irls_weights) { + SetRegionFlowFeatureIRLSWeights(*reset_irls_weights, feature_list); + } + + ResetToSimilarity(camera_motion->linear_similarity(), camera_motion); + } + break; + + case MODEL_MIXTURE_HOMOGRAPHY: { + std::vector block_coverage( + camera_motion->mixture_inlier_coverage().begin(), + camera_motion->mixture_inlier_coverage().end()); + const float mix_min_inlier_coverage = + options_.stable_mixture_homography_bounds().min_inlier_coverage(); + + if (camera_motion->flags() & CameraMotion::FLAG_SINGULAR_ESTIMATION || + !IsStableMixtureHomography(camera_motion->mixture_homography(), + mix_min_inlier_coverage, block_coverage)) { + // Unstable homography mixture. + // Fall-back to previously estimated motion type. + // Function is only called for CameraMotion::Type <= UNSTABLE. + // In this case this means type is either UNSTABLE, UNSTABLE_SIM + // or VALID. + // (UNSTABLE_HOMOG flag is set by this function only during + // ResetToHomography below). + switch (camera_motion->type()) { + case CameraMotion::VALID: + // Homography deemed stable, fallback to it. 
+ MotionEstimation::ResetToHomography(camera_motion->homography(), + true, // flag_as_unstable_model + camera_motion); + break; + + case CameraMotion::UNSTABLE_SIM: + MotionEstimation::ResetToSimilarity( + camera_motion->linear_similarity(), camera_motion); + break; + + case CameraMotion::UNSTABLE: + MotionEstimation::ResetToTranslation(camera_motion->translation(), + camera_motion); + break; + + case CameraMotion::INVALID: + case CameraMotion::UNSTABLE_HOMOG: + LOG(FATAL) << "Unexpected CameraMotion::Type: " + << camera_motion->type(); + break; + } + + if (reset_irls_weights) { + SetRegionFlowFeatureIRLSWeights(*reset_irls_weights, feature_list); + } + + // Clear rolling shutter guess in case it was computed. + camera_motion->set_rolling_shutter_guess(-1); + camera_motion->clear_mixture_homography_spectrum(); + } else { + // Stable mixture homography can reset unstable type. + camera_motion->set_overridden_type(camera_motion->type()); + camera_motion->set_type(CameraMotion::VALID); + // Select weakest regularized mixture. + camera_motion->set_rolling_shutter_motion_index(0); + } + + break; + } + + case MODEL_NUM_VALUES: + LOG(FATAL) << "Function should not be called with this value"; + break; + } +} + +void MotionEstimation::ProjectMotionsDown( + const MotionType& type, std::vector* camera_motions) const { + CHECK(camera_motions != nullptr); + for (auto& camera_motion : *camera_motions) { + switch (type) { + case MODEL_AVERAGE_MAGNITUDE: + case MODEL_TRANSLATION: + case MODEL_MIXTURE_HOMOGRAPHY: + case MODEL_AFFINE: + LOG(WARNING) << "Nothing to project for requested model type"; + return; + + case MODEL_HOMOGRAPHY: + // Only project down if model was estimated, otherwise would propagate + // identity. 
+ if (camera_motion.has_homography() && + camera_motion.type() <= CameraMotion::UNSTABLE_HOMOG) { + LinearSimilarityModel lin_sim = + *camera_motion.mutable_linear_similarity() = + AffineAdapter::ProjectToLinearSimilarity( + HomographyAdapter::ProjectToAffine( + camera_motion.homography(), frame_width_, + frame_height_), + frame_width_, frame_height_); + } + + ABSL_FALLTHROUGH_INTENDED; + + case MODEL_LINEAR_SIMILARITY: + // Only project down if model was estimated. + if (camera_motion.has_linear_similarity() && + camera_motion.type() <= CameraMotion::UNSTABLE_SIM) { + *camera_motion.mutable_translation() = + LinearSimilarityAdapter::ProjectToTranslation( + camera_motion.linear_similarity(), frame_width_, + frame_height_); + } + break; + + case MODEL_NUM_VALUES: + LOG(FATAL) << "Function should not be called with this value"; + break; + } + } +} + +void MotionEstimation::IRLSWeightFilter( + std::vector* feature_lists) const { + CHECK(feature_lists != nullptr); + for (auto feature_ptr : *feature_lists) { + switch (options_.irls_weight_filter()) { + case MotionEstimationOptions::IRLS_FILTER_TEXTURE: + TextureFilteredRegionFlowFeatureIRLSWeights( + 0.5, // Below texturedness threshold of 0.5 + 1, // Set irls_weight to 1. + feature_ptr); + break; + + case MotionEstimationOptions::IRLS_FILTER_CORNER_RESPONSE: + CornerFilteredRegionFlowFeatureIRLSWeights( + 0.5, // Below texturedness threshold of 0.5 + 1, // Set irls_weight to 1. + feature_ptr); + break; + + case MotionEstimationOptions::IRLS_FILTER_NONE: + break; + } + } +} + +void MotionEstimation::EstimateMotionsParallel( + bool post_irls_weight_smoothing, + std::vector* feature_lists, + std::vector* camera_motions) const { + CHECK(camera_motions != nullptr); + camera_motions->clear(); + camera_motions->resize(feature_lists->size()); + + // Normalize features. 
+ for (std::vector::iterator feature_list = + feature_lists->begin(); + feature_list != feature_lists->end(); ++feature_list) { + TransformRegionFlowFeatureList(normalization_transform_, *feature_list); + } + + if (!options_.overlay_detection()) { + EstimateMotionsParallelImpl(options_.irls_weights_preinitialized(), + feature_lists, camera_motions); + } else { + DetermineOverlayIndices(options_.irls_weights_preinitialized(), + camera_motions, feature_lists); + + EstimateMotionsParallelImpl(true, feature_lists, camera_motions); + } + + if (!options_.deactivate_stable_motion_estimation()) { + CheckTranslationAcceleration(camera_motions); + } + + if (post_irls_weight_smoothing) { + PostIRLSSmoothing(*camera_motions, feature_lists); + } + + // Undo transform applied to features. + for (auto& feature_list_ptr : *feature_lists) { + TransformRegionFlowFeatureList(inv_normalization_transform_, + feature_list_ptr); + } + + DetermineShotBoundaries(*feature_lists, camera_motions); +} + +void MotionEstimation::DetermineShotBoundaries( + const std::vector& feature_lists, + std::vector* camera_motions) const { + CHECK(camera_motions != nullptr); + CHECK_EQ(feature_lists.size(), camera_motions->size()); + const auto& shot_options = options_.shot_boundary_options(); + + // Verify empty feature frames and invalid models via visual consistency. + const int num_motions = camera_motions->size(); + for (int k = 0; k < num_motions; ++k) { + auto& camera_motion = (*camera_motions)[k]; + if (camera_motion.type() == CameraMotion::INVALID || + feature_lists[k]->feature_size() == 0) { + // Potential shot boundary, verify. + if (feature_lists[k]->visual_consistency() >= 0) { + if (feature_lists[k]->visual_consistency() >= + shot_options.motion_consistency_threshold()) { + camera_motion.set_flags(camera_motion.flags() | + CameraMotion::FLAG_SHOT_BOUNDARY); + } + } else { + // No consistency present, label as shot boundary. 
+ camera_motion.set_flags(camera_motion.flags() | + CameraMotion::FLAG_SHOT_BOUNDARY); + } + } + } + + // Determine additional boundaries missed during motion estimation. + for (int k = 0; k < num_motions; ++k) { + auto& camera_motion = (*camera_motions)[k]; + if (feature_lists[k]->visual_consistency() >= + shot_options.appearance_consistency_threshold()) { + if (k + 1 == num_motions || // no next frame available. + feature_lists[k + 1]->visual_consistency() >= + shot_options.appearance_consistency_threshold()) { + // Only add boundaries if previous or next frame are not already labeled + // boundaries by above tests. + if (k > 0 && ((*camera_motions)[k - 1].flags() & + CameraMotion::FLAG_SHOT_BOUNDARY) != 0) { + continue; + } + if (k + 1 < num_motions && ((*camera_motions)[k + 1].flags() & + CameraMotion::FLAG_SHOT_BOUNDARY) != 0) { + continue; + } + + // Shot boundaries based on visual consistency measure. + camera_motion.set_flags(camera_motion.flags() | + CameraMotion::FLAG_SHOT_BOUNDARY); + } + } + } + + // LOG shot boundaries. + for (const auto& camera_motion : *camera_motions) { + if (camera_motion.flags() & CameraMotion::FLAG_SHOT_BOUNDARY) { + VLOG(1) << "Shot boundary at : " << camera_motion.timestamp_usec() * 1e-6f + << "s"; + } + } +} + +void MotionEstimation::ResetMotionModels(const MotionEstimationOptions& options, + CameraMotion* camera_motion) { + CHECK(camera_motion); + + // Clear models. + camera_motion->clear_translation(); + camera_motion->clear_similarity(); + camera_motion->clear_linear_similarity(); + camera_motion->clear_affine(); + camera_motion->clear_homography(); + camera_motion->clear_mixture_homography(); + camera_motion->clear_mixture_homography_spectrum(); + + // We need to set models explicitly for has_* tests to work. 
+ *camera_motion->mutable_translation() = TranslationModel(); + + if (options.estimate_similarity()) { + *camera_motion->mutable_similarity() = SimilarityModel(); + } + + if (options.linear_similarity_estimation() != + MotionEstimationOptions::ESTIMATION_LS_NONE) { + *camera_motion->mutable_linear_similarity() = LinearSimilarityModel(); + } + + if (options.affine_estimation() != + MotionEstimationOptions::ESTIMATION_AFFINE_NONE) { + *camera_motion->mutable_affine() = AffineModel(); + } + + if (options.homography_estimation() != + MotionEstimationOptions::ESTIMATION_HOMOG_NONE) { + *camera_motion->mutable_homography() = Homography(); + } + + if (options.mix_homography_estimation() != + MotionEstimationOptions::ESTIMATION_HOMOG_MIX_NONE) { + *camera_motion->mutable_mixture_homography() = + MixtureHomographyAdapter::IdentityModel(options.num_mixtures()); + camera_motion->set_mixture_row_sigma(options.mixture_row_sigma()); + } + + camera_motion->set_type(CameraMotion::INVALID); +} + +void MotionEstimation::ResetToIdentity(CameraMotion* camera_motion, + bool consider_valid) { + if (camera_motion->has_translation()) { + *camera_motion->mutable_translation() = TranslationModel(); + } + + if (camera_motion->has_similarity()) { + *camera_motion->mutable_similarity() = SimilarityModel(); + } + + if (camera_motion->has_linear_similarity()) { + *camera_motion->mutable_linear_similarity() = LinearSimilarityModel(); + } + + if (camera_motion->has_affine()) { + *camera_motion->mutable_affine() = AffineModel(); + } + + if (camera_motion->has_homography()) { + *camera_motion->mutable_homography() = Homography(); + } + + if (camera_motion->has_mixture_homography()) { + const int num_models = camera_motion->mixture_homography().model_size(); + for (int m = 0; m < num_models; ++m) { + *camera_motion->mutable_mixture_homography()->mutable_model(m) = + Homography(); + } + } + + if (consider_valid) { + camera_motion->set_type(CameraMotion::VALID); + } else { + 
camera_motion->set_type(CameraMotion::INVALID); + } +} + +void MotionEstimation::ResetToTranslation(const TranslationModel& model, + CameraMotion* camera_motion) { + const float dx = model.dx(); + const float dy = model.dy(); + + if (camera_motion->has_translation()) { + *camera_motion->mutable_translation() = model; + } + + if (camera_motion->has_similarity()) { + *camera_motion->mutable_similarity() = + SimilarityAdapter::FromArgs(dx, dy, 1, 0); + } + + if (camera_motion->has_linear_similarity()) { + *camera_motion->mutable_linear_similarity() = + LinearSimilarityAdapter::FromArgs(dx, dy, 1, 0); + } + + if (camera_motion->has_affine()) { + *camera_motion->mutable_affine() = TranslationAdapter::ToAffine(model); + } + + if (camera_motion->has_homography()) { + *camera_motion->mutable_homography() = + TranslationAdapter::ToHomography(model); + } + + if (camera_motion->has_mixture_homography()) { + const int num_models = camera_motion->mixture_homography().model_size(); + const Homography h = TranslationAdapter::ToHomography(model); + for (int m = 0; m < num_models; ++m) { + camera_motion->mutable_mixture_homography()->mutable_model(m)->CopyFrom( + h); + } + camera_motion->mutable_mixture_homography()->set_dof( + MixtureHomography::CONST_DOF); + } + + camera_motion->set_type(CameraMotion::UNSTABLE); +} + +void MotionEstimation::ResetToSimilarity(const LinearSimilarityModel& model, + CameraMotion* camera_motion) { + if (camera_motion->has_similarity()) { + *camera_motion->mutable_similarity() = + LinearSimilarityAdapter::ToSimilarity(model); + } + + if (camera_motion->has_linear_similarity()) { + *camera_motion->mutable_linear_similarity() = model; + } + + if (camera_motion->has_affine()) { + *camera_motion->mutable_affine() = LinearSimilarityAdapter::ToAffine(model); + } + + if (camera_motion->has_homography()) { + *camera_motion->mutable_homography() = + LinearSimilarityAdapter::ToHomography(model); + } + + if (camera_motion->has_mixture_homography()) { + const int 
num_models = camera_motion->mixture_homography().model_size(); + const Homography h = LinearSimilarityAdapter::ToHomography(model); + + for (int m = 0; m < num_models; ++m) { + camera_motion->mutable_mixture_homography()->mutable_model(m)->CopyFrom( + h); + } + camera_motion->mutable_mixture_homography()->set_dof( + MixtureHomography::CONST_DOF); + } + + camera_motion->set_type(CameraMotion::UNSTABLE_SIM); +} + +void MotionEstimation::ResetToHomography(const Homography& model, + bool flag_as_unstable_model, + CameraMotion* camera_motion) { + if (camera_motion->has_homography()) { + *camera_motion->mutable_homography() = model; + } + + if (camera_motion->has_mixture_homography()) { + const int num_models = camera_motion->mixture_homography().model_size(); + + for (int m = 0; m < num_models; ++m) { + camera_motion->mutable_mixture_homography()->mutable_model(m)->CopyFrom( + model); + } + camera_motion->mutable_mixture_homography()->set_dof( + MixtureHomography::CONST_DOF); + } + + if (flag_as_unstable_model) { + camera_motion->set_type(CameraMotion::UNSTABLE_HOMOG); + } +} + +void MotionEstimation::EstimateAverageMotionMagnitude( + const RegionFlowFeatureList& feature_list, + CameraMotion* camera_motion) const { + std::vector magnitudes; + magnitudes.reserve(feature_list.feature_size()); + for (const auto& feature : feature_list.feature()) { + magnitudes.push_back(std::hypot(feature.dy(), feature.dx())); + } + + std::sort(magnitudes.begin(), magnitudes.end()); + auto tenth = magnitudes.begin() + magnitudes.size() / 10; + auto ninetieth = magnitudes.begin() + magnitudes.size() * 9 / 10; + const int elems = ninetieth - tenth; + if (elems > 0) { + const float average_magnitude = + std::accumulate(tenth, ninetieth, 0.0f) * (1.0f / elems); + + // De-normalize translation. 
+ const float magnitude = + LinearSimilarityAdapter::TransformPoint(inv_normalization_transform_, + Vector2_f(average_magnitude, 0)) + .x(); + camera_motion->set_average_magnitude(magnitude); + } +} + +float MotionEstimation::IRLSPriorWeight(int iteration, int irls_rounds) const { + // Iteration zero -> mapped to one + // Iteration irls_rounds -> mapped to irls_prior_scale. + return 1.0f - (iteration * (1.0f / irls_rounds) * + (1.0f - options_.irls_prior_scale())); +} + +namespace { + +// Returns weighted translational model from feature_list. +Vector2_f EstimateTranslationModelFloat( + const RegionFlowFeatureList& feature_list) { + Vector2_f mean_motion(0, 0); + float weight_sum = 0; + for (const auto& feature : feature_list.feature()) { + mean_motion += FeatureFlow(feature) * feature.irls_weight(); + weight_sum += feature.irls_weight(); + } + + if (weight_sum > 0) { + mean_motion *= (1.0f / weight_sum); + } + return mean_motion; +} + +Vector2_f EstimateTranslationModelDouble( + const RegionFlowFeatureList& feature_list) { + Vector2_d mean_motion(0, 0); + double weight_sum = 0; + for (const auto& feature : feature_list.feature()) { + mean_motion += + Vector2_d::Cast(FeatureFlow(feature)) * feature.irls_weight(); + weight_sum += feature.irls_weight(); + } + + if (weight_sum > 0) { + mean_motion *= (1.0 / weight_sum); + } + + return Vector2_f::Cast(mean_motion); +} + +} // namespace. 
+
+void MotionEstimation::ComputeFeatureMask(
+    const RegionFlowFeatureList& feature_list, std::vector<int>* mask_indices,
+    std::vector<float>* bin_normalizer) const {
+  CHECK(mask_indices != nullptr);
+  CHECK(bin_normalizer != nullptr);
+
+  const int num_features = feature_list.feature_size();
+  mask_indices->clear();
+  mask_indices->reserve(num_features);
+
+  const int mask_size = options_.feature_mask_size();
+  const int max_bins = mask_size * mask_size;
+  bin_normalizer->clear();
+  bin_normalizer->resize(max_bins, 0.0f);
+
+  const Vector2_f domain = NormalizedDomain();
+  const float denom_x = 1.0f / domain.x();
+  const float denom_y = 1.0f / domain.y();
+
+  // Record index, but guard against out of bound error.
+  for (const auto& feature : feature_list.feature()) {
+    const int bin_idx = std::min<int>(
+        max_bins,
+        static_cast<int>(feature.y() * denom_y * mask_size) * mask_size +
+            feature.x() * denom_x * mask_size);
+
+    ++(*bin_normalizer)[bin_idx];
+    mask_indices->push_back(bin_idx);
+  }
+
+  for (float& bin_value : *bin_normalizer) {
+    bin_value = (bin_value == 0) ? 0 : sqrt(1.0 / bin_value);
+  }
+}
+
+bool MotionEstimation::GetTranslationIrlsInitialization(
+    RegionFlowFeatureList* feature_list,
+    const EstimateModelOptions& model_options, float avg_camera_motion,
+    InlierMask* inlier_mask, TranslationModel* best_model) const {
+  CHECK(best_model != nullptr);
+
+  const int num_features = feature_list->feature_size();
+  if (!num_features) {
+    return false;
+  }
+
+  // Bool indicator which features agree with model in each round.
+  // In case no RANSAC rounds are performed considered all features inliers.
+  std::vector<int> best_features(num_features, 1);
+  std::vector<int> curr_features(num_features);
+  float best_sum = 0;
+
+  unsigned int seed = 900913;  // = Google in leet :)
+  std::default_random_engine rand_gen(seed);
+  std::uniform_int_distribution<> distribution(0, num_features - 1);
+
+  auto& options = options_.irls_initialization();
+  const float irls_residual_scale = GetIRLSResidualScale(
+      avg_camera_motion, options_.irls_motion_magnitude_fraction());
+  const float cutoff = options.cutoff() / irls_residual_scale;
+  const float sq_cutoff = cutoff * cutoff;
+
+  // Either temporal bias or motion prior based on options.
+  std::vector<float> bias(num_features, 1.0f);
+
+  // Optionally, compute mask index for each feature.
+  std::vector<int> mask_indices;
+
+  if (options_.estimation_policy() ==
+      MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS) {
+    BiasFromFeatures(*feature_list, MODEL_TRANSLATION, model_options, &bias);
+  } else if (inlier_mask) {
+    std::vector<float> unused_bin_normalizer;
+    ComputeFeatureMask(*feature_list, &mask_indices, &unused_bin_normalizer);
+    inlier_mask->MotionPrior(*feature_list, &bias);
+  }
+
+  for (int rounds = 0; rounds < options.rounds(); ++rounds) {
+    float curr_sum = 0;
+    // Pick a random vector.
+    const int rand_idx = distribution(rand_gen);
+    const Vector2_f flow = FeatureFlow(feature_list->feature(rand_idx));
+
+    // curr_features gets set for every feature below; no need to reset.
+ for (int i = 0; i < num_features; ++i) { + const Feature& feature = feature_list->feature(i); + const Vector2_f diff = FeatureFlow(feature) - flow; + curr_features[i] = static_cast(diff.Norm2() < sq_cutoff); + if (curr_features[i]) { + float score = feature.irls_weight(); + if (inlier_mask) { + const int bin_idx = mask_indices[i]; + score *= bias[i] + inlier_mask->GetInlierScore(bin_idx); + } else { + score *= bias[i]; + } + curr_sum += score; + } + } + + if (curr_sum > best_sum) { + best_sum = curr_sum; + best_features.swap(curr_features); + best_model->set_dx(flow.x()); + best_model->set_dy(flow.y()); + } + } + + if (inlier_mask) { + inlier_mask->InitUpdateMask(); + } + + std::vector inlier_weights; + + // Score outliers low. + for (int i = 0; i < num_features; ++i) { + RegionFlowFeature* feature = feature_list->mutable_feature(i); + if (best_features[i] == 0 && feature->irls_weight() != 0) { + feature->set_irls_weight(kOutlierIRLSWeight); + } else { + inlier_weights.push_back(feature->irls_weight()); + if (inlier_mask) { + const int bin_idx = mask_indices[i]; + inlier_mask->RecordInlier(bin_idx, feature->irls_weight()); + } + } + } + + if (!inlier_weights.empty()) { + // Ensure that all selected inlier features have at least median weight. + auto median = inlier_weights.begin() + inlier_weights.size() * 0.5f; + std::nth_element(inlier_weights.begin(), median, inlier_weights.end()); + + for (int i = 0; i < num_features; ++i) { + RegionFlowFeature* feature = feature_list->mutable_feature(i); + if (best_features[i] != 0) { + feature->set_irls_weight(std::max(*median, feature->irls_weight())); + } + } + } + + // Compute translation variance as feature for stability evaluation. 
+ const float translation_variance = TranslationVariance( + *feature_list, Vector2_f(best_model->dx(), best_model->dy())); + + return IsStableTranslation(*best_model, translation_variance, *feature_list); +} + +void MotionEstimation::EstimateTranslationModelIRLS( + int irls_rounds, bool compute_stability, + RegionFlowFeatureList* flow_feature_list, + const PriorFeatureWeights* prior_weights, + CameraMotion* camera_motion) const { + if (prior_weights && !prior_weights->HasCorrectDimension( + irls_rounds, flow_feature_list->feature_size())) { + LOG(ERROR) << "Prior weights incorrectly initialized, ignoring."; + prior_weights = nullptr; + } + + // Simply average over features. + const bool irls_use_l0_norm = options_.irls_use_l0_norm(); + + const float irls_residual_scale = + GetIRLSResidualScale(camera_motion->average_magnitude(), + options_.irls_motion_magnitude_fraction()); + + const std::vector* irls_priors = nullptr; + const std::vector* irls_alphas = nullptr; + if (prior_weights && prior_weights->HasNonZeroAlpha()) { + irls_priors = &prior_weights->priors; + irls_alphas = &prior_weights->alphas; + } + + Vector2_f mean_motion; + for (int i = 0; i < irls_rounds; ++i) { + if (options_.use_highest_accuracy_for_normal_equations()) { + mean_motion = EstimateTranslationModelDouble(*flow_feature_list); + } else { + mean_motion = EstimateTranslationModelFloat(*flow_feature_list); + } + + const float alpha = irls_alphas != nullptr ? (*irls_alphas)[i] : 0.0f; + const float one_minus_alpha = 1.0f - alpha; + + // Update irls weights. + const auto feature_start = flow_feature_list->mutable_feature()->begin(); + for (auto feature = feature_start; + feature != flow_feature_list->mutable_feature()->end(); ++feature) { + if (feature->irls_weight() == 0.0f) { + continue; + } + + // Express difference in original domain. 
+ const Vector2_f diff = LinearSimilarityAdapter::TransformPoint( + irls_transform_, FeatureFlow(*feature) - mean_motion); + + const float numerator = + alpha == 0.0f ? 1.0f + : ((*irls_priors)[feature - feature_start] * alpha + + one_minus_alpha); + + if (irls_use_l0_norm) { + feature->set_irls_weight( + numerator / (diff.Norm() * irls_residual_scale + kIrlsEps)); + } else { + feature->set_irls_weight( + numerator / + (std::sqrt(static_cast(diff.Norm() * irls_residual_scale)) + + kIrlsEps)); + } + } + } + + // De-normalize translation. + Vector2_f translation = LinearSimilarityAdapter::TransformPoint( + inv_normalization_transform_, mean_motion); + + camera_motion->mutable_translation()->set_dx(translation.x()); + camera_motion->mutable_translation()->set_dy(translation.y()); + + if (compute_stability) { + camera_motion->set_translation_variance( + TranslationVariance(*flow_feature_list, translation)); + } +} + +float MotionEstimation::TranslationVariance( + const RegionFlowFeatureList& feature_list, + const Vector2_f& translation) const { + // Compute irls based variance. + float variance = 0; + double weight_sum = 0; + + for (const auto& feature : feature_list.feature()) { + weight_sum += feature.irls_weight(); + variance += (LinearSimilarityAdapter::TransformPoint( + inv_normalization_transform_, FeatureFlow(feature)) - + translation) + .Norm2() * + feature.irls_weight(); + } + + if (weight_sum > 0) { + return variance / weight_sum; + } else { + return 0.0f; + } +} + +namespace { + +// Solves for the linear similarity via normal equations, +// using only the positions specified by features from the feature list. +// Input matrix is expected to be a 4x4 matrix of type T, rhs and solution are +// both 4x1 vectors of type T. +// Template class T specifies the desired accuracy, use float or double. 
+template +LinearSimilarityModel LinearSimilarityL2SolveSystem( + const RegionFlowFeatureList& feature_list, Eigen::Matrix* matrix, + Eigen::Matrix* rhs, Eigen::Matrix* solution, + bool* success) { + CHECK(matrix != nullptr); + CHECK(rhs != nullptr); + CHECK(solution != nullptr); + + *matrix = Eigen::Matrix::Zero(); + *rhs = Eigen::Matrix::Zero(); + + // Matrix multiplications are hand-coded for speed improvements vs. + // opencv's cvGEMM calls. + for (const auto& feature : feature_list.feature()) { + const T x = feature.x(); + const T y = feature.y(); + const T w = feature.irls_weight(); + + // double J[2 * 4] = {1, 0, x, -y, + // 0, 1, y, x}; + // Compute J^t * J * w = {1, 0, x, -y + // 0, 1, y, x, + // x, y, xx+yy, 0, + // -y x, 0, xx+yy} * w; + + const T x_w = x * w; + const T y_w = y * w; + const T xx_yy_w = (x * x + y * y) * w; + + T* matrix_ptr = matrix->data(); + matrix_ptr[0] += w; + matrix_ptr[2] += x_w; + matrix_ptr[3] += -y_w; + + matrix_ptr += 4; + matrix_ptr[1] += w; + matrix_ptr[2] += y_w; + matrix_ptr[3] += x_w; + + matrix_ptr += 4; + matrix_ptr[0] += x_w; + matrix_ptr[1] += y_w; + matrix_ptr[2] += xx_yy_w; + + matrix_ptr += 4; + matrix_ptr[0] += -y_w; + matrix_ptr[1] += x_w; + matrix_ptr[3] += xx_yy_w; + + T* rhs_ptr = rhs->data(); + + // Using identity parametrization below. + const T m_x = feature.dx() * w; + const T m_y = feature.dy() * w; + + rhs_ptr[0] += m_x; + rhs_ptr[1] += m_y; + rhs_ptr[2] += x * m_x + y * m_y; + rhs_ptr[3] += -y * m_x + x * m_y; + } + + // Solution parameters p. + *solution = matrix->colPivHouseholderQr().solve(*rhs); + if (((*matrix) * (*solution)).isApprox(*rhs, kPrecision)) { + LinearSimilarityModel model; + model.set_dx((*solution)(0, 0)); + model.set_dy((*solution)(1, 0)); + model.set_a((*solution)(2, 0) + 1.0); // Identity parametrization. 
+ model.set_b((*solution)(3, 0)); + if (success) { + *success = true; + } + return model; + } + + if (success) { + *success = false; + } + return LinearSimilarityModel(); +} + +} // namespace. + +bool MotionEstimation::GetSimilarityIrlsInitialization( + RegionFlowFeatureList* feature_list, + const EstimateModelOptions& model_options, float avg_camera_motion, + InlierMask* inlier_mask, LinearSimilarityModel* best_model) const { + CHECK(best_model != nullptr); + + const int num_features = feature_list->feature_size(); + if (!num_features) { + return false; + } + + // matrix is symmetric. + Eigen::Matrix matrix = Eigen::Matrix::Zero(); + Eigen::Matrix solution = Eigen::Matrix::Zero(); + Eigen::Matrix rhs = Eigen::Matrix::Zero(); + + // Bool indicator which features agree with model in each round. + // In case no RANSAC rounds are performed considered all features inliers. + std::vector best_features(num_features, 1); + std::vector curr_features(num_features); + float best_sum = 0; + + unsigned int seed = 900913; // = Google in leet :) + std::default_random_engine rand_gen(seed); + std::uniform_int_distribution<> distribution(0, num_features - 1); + auto& options = options_.irls_initialization(); + + const float irls_residual_scale = GetIRLSResidualScale( + avg_camera_motion, options_.irls_motion_magnitude_fraction()); + const float cutoff = options.cutoff() / irls_residual_scale; + const float sq_cutoff = cutoff * cutoff; + + // Either temporal bias or motion prior based on options. + std::vector bias(num_features, 1.0f); + + // Compute mask index for each feature. 
+ std::vector mask_indices; + + if (options_.estimation_policy() == + MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS) { + BiasFromFeatures(*feature_list, MODEL_LINEAR_SIMILARITY, model_options, + &bias); + } else if (inlier_mask) { + std::vector unused_bin_normalizer; + ComputeFeatureMask(*feature_list, &mask_indices, &unused_bin_normalizer); + inlier_mask->MotionPrior(*feature_list, &bias); + } + + for (int rounds = 0; rounds < options.rounds(); ++rounds) { + // Pick two random vectors. + RegionFlowFeatureList to_test; + to_test.add_feature()->CopyFrom( + feature_list->feature(distribution(rand_gen))); + to_test.add_feature()->CopyFrom( + feature_list->feature(distribution(rand_gen))); + ResetRegionFlowFeatureIRLSWeights(1.0f, &to_test); + bool success = false; + LinearSimilarityModel similarity = LinearSimilarityL2SolveSystem( + to_test, &matrix, &rhs, &solution, &success); + if (!success) { + continue; + } + + float curr_sum = 0; + for (int i = 0; i < num_features; ++i) { + const Feature& feature = feature_list->feature(i); + const Vector2_f trans_location = LinearSimilarityAdapter::TransformPoint( + similarity, FeatureLocation(feature)); + const Vector2_f diff = FeatureMatchLocation(feature) - trans_location; + curr_features[i] = static_cast(diff.Norm2() < sq_cutoff); + if (curr_features[i]) { + float score = feature.irls_weight(); + if (inlier_mask) { + const int bin_idx = mask_indices[i]; + score *= (bias[i] + inlier_mask->GetInlierScore(bin_idx)); + } else { + score *= bias[i]; + } + curr_sum += score; + } + } + + if (curr_sum > best_sum) { + best_sum = curr_sum; + best_features.swap(curr_features); + best_model->Swap(&similarity); + } + } + + if (inlier_mask) { + inlier_mask->InitUpdateMask(); + } + + int num_inliers = 0; + + std::vector inlier_weights; + + // Score outliers low. 
+ for (int i = 0; i < num_features; ++i) { + RegionFlowFeature* feature = feature_list->mutable_feature(i); + if (best_features[i] == 0 && feature->irls_weight() != 0) { + feature->set_irls_weight(kOutlierIRLSWeight); + } else { + ++num_inliers; + inlier_weights.push_back(feature->irls_weight()); + if (inlier_mask) { + const int bin_idx = mask_indices[i]; + inlier_mask->RecordInlier(bin_idx, feature->irls_weight()); + } + } + } + + if (!inlier_weights.empty()) { + // Ensure that all selected inlier features have at least median weight. + auto median = inlier_weights.begin() + inlier_weights.size() * 0.5f; + std::nth_element(inlier_weights.begin(), median, inlier_weights.end()); + + for (int i = 0; i < num_features; ++i) { + RegionFlowFeature* feature = feature_list->mutable_feature(i); + if (best_features[i] != 0) { + feature->set_irls_weight(std::max(*median, feature->irls_weight())); + } + } + } + + // For stability purposes we don't need to be that strict here. + // Inflate number of actual inliers, as failing the initialization will most + // likely fail the actual estimation. That way it can recover. + num_inliers *= 2; + return IsStableSimilarity(*best_model, *feature_list, num_inliers); +} + +void MotionEstimation::ComputeSimilarityInliers( + const RegionFlowFeatureList& feature_list, int* num_inliers, + int* num_strict_inliers) const { + CHECK(num_inliers); + CHECK(num_strict_inliers); + + const auto& similarity_bounds = options_.stable_similarity_bounds(); + + // Compute IRLS weight threshold from inlier threshold expressed in pixel + // error. IRLS weight is normalized to 1 pixel error, so take reciprocal. 
+ float threshold = std::max(similarity_bounds.inlier_threshold(), + similarity_bounds.frac_inlier_threshold() * + hypot(frame_width_, frame_height_)); + CHECK_GT(threshold, 0); + + threshold = 1.0f / threshold; + float strict_threshold = similarity_bounds.strict_inlier_threshold(); + CHECK_GT(strict_threshold, 0); + strict_threshold = 1.0f / strict_threshold; + + if (!options_.irls_use_l0_norm()) { + threshold = std::sqrt(static_cast(threshold)); + } + + *num_inliers = 0; + *num_strict_inliers = 0; + for (const auto& feature : feature_list.feature()) { + if (feature.irls_weight() >= threshold) { + ++*num_inliers; + } + + if (feature.irls_weight() >= strict_threshold) { + ++*num_strict_inliers; + } + } +} + +bool MotionEstimation::EstimateLinearSimilarityModelIRLS( + int irls_rounds, bool compute_stability, + RegionFlowFeatureList* flow_feature_list, + const PriorFeatureWeights* prior_weights, + CameraMotion* camera_motion) const { + if (prior_weights && !prior_weights->HasCorrectDimension( + irls_rounds, flow_feature_list->feature_size())) { + LOG(ERROR) << "Prior weights incorrectly initialized, ignoring."; + prior_weights = nullptr; + } + + // Just declaring does not actually allocate memory + Eigen::Matrix matrix_f; + Eigen::Matrix solution_f; + Eigen::Matrix rhs_f; + Eigen::Matrix matrix_d; + Eigen::Matrix solution_d; + Eigen::Matrix rhs_d; + + if (options_.use_highest_accuracy_for_normal_equations()) { + matrix_d = Eigen::Matrix::Zero(); + solution_d = Eigen::Matrix::Zero(); + rhs_d = Eigen::Matrix::Zero(); + } else { + matrix_f = Eigen::Matrix::Zero(); + solution_f = Eigen::Matrix::Zero(); + rhs_f = Eigen::Matrix::Zero(); + } + + LinearSimilarityModel* solved_model = + camera_motion->mutable_linear_similarity(); + + const float irls_residual_scale = + GetIRLSResidualScale(camera_motion->average_magnitude(), + options_.irls_motion_magnitude_fraction()); + + const bool irls_use_l0_norm = options_.irls_use_l0_norm(); + + const std::vector* irls_priors = 
nullptr; + const std::vector* irls_alphas = nullptr; + if (prior_weights && prior_weights->HasNonZeroAlpha()) { + irls_priors = &prior_weights->priors; + irls_alphas = &prior_weights->alphas; + } + + for (int i = 0; i < irls_rounds; ++i) { + bool success; + if (options_.use_highest_accuracy_for_normal_equations()) { + *solved_model = LinearSimilarityL2SolveSystem( + *flow_feature_list, &matrix_d, &rhs_d, &solution_d, &success); + } else { + *solved_model = LinearSimilarityL2SolveSystem( + *flow_feature_list, &matrix_f, &rhs_f, &solution_f, &success); + } + + if (!success) { + VLOG(1) << "Linear similarity estimation failed."; + *camera_motion->mutable_linear_similarity() = LinearSimilarityModel(); + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + + const float alpha = irls_alphas != nullptr ? (*irls_alphas)[i] : 0.0f; + const float one_minus_alpha = 1.0f - alpha; + + const auto feature_start = flow_feature_list->mutable_feature()->begin(); + for (auto feature = feature_start; + feature != flow_feature_list->mutable_feature()->end(); ++feature) { + if (feature->irls_weight() == 0.0f) { + continue; + } + + const Vector2_f trans_location = LinearSimilarityAdapter::TransformPoint( + *solved_model, FeatureLocation(*feature)); + const Vector2_f matched_location = FeatureMatchLocation(*feature); + + // Express residual in frame coordinates. + const Vector2_f residual = LinearSimilarityAdapter::TransformPoint( + irls_transform_, trans_location - matched_location); + const float numerator = + alpha == 0.0f ? 1.0f + : ((*irls_priors)[feature - feature_start] * alpha + + one_minus_alpha); + + if (irls_use_l0_norm) { + feature->set_irls_weight( + numerator / (residual.Norm() * irls_residual_scale + kIrlsEps)); + } else { + feature->set_irls_weight( + numerator / (std::sqrt(static_cast(residual.Norm() * + irls_residual_scale)) + + kIrlsEps)); + } + } + } + + // Undo pre_transform. 
+ *solved_model = ModelCompose3(inv_normalization_transform_, *solved_model, + normalization_transform_); + + if (compute_stability) { + int num_inliers = 0; + int num_strict_inliers = 0; + + if (flow_feature_list->feature_size() > 0) { + ComputeSimilarityInliers(*flow_feature_list, &num_inliers, + &num_strict_inliers); + + const float inv_num_feat = 1.0f / flow_feature_list->feature_size(); + camera_motion->set_similarity_inlier_ratio(num_inliers * inv_num_feat); + camera_motion->set_similarity_strict_inlier_ratio(num_strict_inliers * + inv_num_feat); + } else { + camera_motion->set_similarity_inlier_ratio(1.0f); + camera_motion->set_similarity_strict_inlier_ratio(1.0f); + } + } + + return true; +} + +bool MotionEstimation::EstimateAffineModelIRLS( + int irls_rounds, RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion) const { + // Setup solution matrices in column major. + Eigen::Matrix matrix = Eigen::Matrix::Zero(); + Eigen::Matrix rhs = Eigen::Matrix::Zero(); + + AffineModel* solved_model = camera_motion->mutable_affine(); + + // Multiple rounds of weighting based L2 optimization. + for (int i = 0; i < irls_rounds; ++i) { + // Build Jacobians. + for (const auto& feature : feature_list->feature()) { + const double w = feature.irls_weight(); + const Vector2_f& pt_1 = FeatureLocation(feature); + const double x = pt_1.x() * w; + const double y = pt_1.y() * w; + + Eigen::Matrix jacobian = + Eigen::Matrix::Zero(); + jacobian(0, 0) = w; + jacobian(0, 2) = x; + jacobian(0, 3) = y; + jacobian(1, 1) = w; + jacobian(1, 4) = x; + jacobian(1, 5) = y; + + // Update A. + // Compute J^t * J, where J = jacobian. + matrix = jacobian.transpose() * jacobian + matrix; + + // Transform matched point. 
+ const Vector2_f& pt_2 = FeatureMatchLocation(feature); + Eigen::Matrix pt_2_mat(pt_2.x() * w, pt_2.y() * w); + + // Compute J^t * y_i; + rhs = jacobian.transpose() * pt_2_mat + rhs; + } + + // Solve A * p = b; + Eigen::Matrix p = Eigen::Matrix::Zero(); + p = matrix.colPivHouseholderQr().solve(rhs); + if (!(matrix * p).isApprox(rhs, kPrecision)) { + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + + // Set model. + solved_model->set_dx(p(0, 0)); + solved_model->set_dy(p(1, 0)); + solved_model->set_a(p(2, 0)); + solved_model->set_b(p(3, 0)); + solved_model->set_c(p(4, 0)); + solved_model->set_d(p(5, 0)); + + // Re-compute weights from errors. + for (auto& feature : *feature_list->mutable_feature()) { + if (feature.irls_weight() == 0.0f) { + continue; + } + + const Vector2_f trans_location = AffineAdapter::TransformPoint( + *solved_model, FeatureLocation(feature)); + const Vector2_f& matched_location = FeatureMatchLocation(feature); + + // Express residual in frame coordinates. + const Vector2_f residual = LinearSimilarityAdapter::TransformPoint( + irls_transform_, trans_location - matched_location); + + feature.set_irls_weight(sqrt(1.0 / (residual.Norm() + kIrlsEps))); + } + } + + // Express in original frame coordinate system. + *solved_model = ModelCompose3( + LinearSimilarityAdapter::ToAffine(inv_normalization_transform_), + *solved_model, + LinearSimilarityAdapter::ToAffine(normalization_transform_)); + return true; +} + +// Estimates homography via least squares (specifically QR decomposition). +// Specifically, for +// H = (a b t1) +// c d t2 +// w1 w2 1 +// H * (x, y, 1)^T = (a*x + b*y + t1 +// c*x + d*y + t2 +// w1*x + w2*y + 1 ) +// +// Therefore matching point (mx, my) is given by +// mx = (a*x + b*y + t1) / (w1*x + w2*y + 1) Eq. 1 +// my = (c*x + d*y + t2) / (w1*x + w2*y + 1) +// +// Multiply with denominator to get: +// a*x + b*y + t1 -w1*x*mx -w2*y*mx = mx Eq. 
2 +// c*x + d*y + t2 -w1*x*my -w2*y*my = my +// +// This results in the following system +// J = (x y 1 0 0 0 -x*mx -y*mx) * (a b t1 c d t2 w1 w2)^T = (mx +// 0 0 0 x y 1 -x*my -y*my my) +// +// Note, that in a linear system Eq. 1 and Eq. 2 are not equivalent, +// as the denominator varies w.r.t. the position of each feature. + +// Therefore, if the optional argument prev_solution is passed, +// each J will be scaled with the denominator w1*x + w2*y + 1. +// +// If perspective_regularizer := r != 0, an additional equation is introduced: +// (0 0 0 0 0 0 r r) * (a b t1 c d t2 w1 w2)^T = 0. +// +// Returns false if system could not be solved for. +template +bool HomographyL2QRSolve( + const RegionFlowFeatureList& feature_list, + const Homography* prev_solution, // optional. + float perspective_regularizer, + Eigen::Matrix* matrix, // tmp matrix + Eigen::Matrix* solution) { + CHECK(matrix); + CHECK(solution); + CHECK_EQ(8, matrix->cols()); + const int num_rows = + 2 * feature_list.feature_size() + (perspective_regularizer == 0 ? 0 : 1); + CHECK_EQ(num_rows, matrix->rows()); + CHECK_EQ(1, solution->cols()); + CHECK_EQ(8, solution->rows()); + + // Compute homography from features (H * location = prev_location). + *matrix = Eigen::Matrix::Zero(matrix->rows(), 8); + Eigen::Matrix rhs = + Eigen::Matrix::Zero(matrix->rows(), 1); + + if (RegionFlowFeatureIRLSSum(feature_list) > kMaxCondition) { + return false; + } + + // Create matrix and rhs (using h_33 = 1 constraint). + int feature_idx = 0; + for (auto feature = feature_list.feature().begin(); + feature != feature_list.feature().end(); ++feature, ++feature_idx) { + int feature_row = 2 * feature_idx; + + Vector2_f pt = FeatureLocation(*feature); + Vector2_f prev_pt = FeatureMatchLocation(*feature); + // Weight per feature. 
+ double scale = 1.0; + if (prev_solution) { + const double denom = + prev_solution->h_20() * pt.x() + prev_solution->h_21() * pt.y() + 1.0; + if (fabs(denom) > 1e-5) { + scale /= denom; + } else { + scale = 0; + } + } + + const float w = feature->irls_weight() * scale; + + // Scale feature with weight; + Vector2_f pt_w = pt * w; + + // Row 1 of above J: + (*matrix)(feature_row, 0) = pt_w.x(); + (*matrix)(feature_row, 1) = pt_w.y(); + (*matrix)(feature_row, 2) = w; + + // Entry 3 .. 5 equal zero. + (*matrix)(feature_row, 6) = -pt_w.x() * prev_pt.x(); + (*matrix)(feature_row, 7) = -pt_w.y() * prev_pt.x(); + rhs(feature_row, 0) = prev_pt.x() * w; + + // Row 2 of above J: + // Entry 0 .. 2 equal zero. + (*matrix)(feature_row + 1, 3) = pt_w.x(); + (*matrix)(feature_row + 1, 4) = pt_w.y(); + (*matrix)(feature_row + 1, 5) = w; + + (*matrix)(feature_row + 1, 6) = -pt_w.x() * prev_pt.y(); + (*matrix)(feature_row + 1, 7) = -pt_w.y() * prev_pt.y(); + rhs(feature_row + 1, 0) = prev_pt.y() * w; + } + + if (perspective_regularizer > 0) { + int last_row_idx = 2 * feature_list.feature_size(); + (*matrix)(last_row_idx, 6) = (*matrix)(last_row_idx, 7) = + perspective_regularizer; + } + + // TODO: Consider a faster function? + *solution = matrix->colPivHouseholderQr().solve(rhs); + return ((*matrix) * (*solution)).isApprox(rhs, kPrecision); +} + +// Same as function above, but solves for homography via normal equations, +// using only the positions specified by features from the feature list. +// Expects 8x8 matrix of type T and 8x1 rhs and solution vector of type T. +// Optional parameter is prev_solution, in which case each row is scaled by +// correct denominator (see derivation at function description +// HomographyL2QRSolve). +// Template class T specifies the desired accuracy, use float or double. +template +Homography HomographyL2NormalEquationSolve( + const RegionFlowFeatureList& feature_list, + const Homography* prev_solution, // optional. 
+ float perspective_regularizer, Eigen::Matrix* matrix, + Eigen::Matrix* rhs, Eigen::Matrix* solution, + bool* success) { + CHECK(matrix != nullptr); + CHECK(rhs != nullptr); + CHECK(solution != nullptr); + + *matrix = Eigen::Matrix::Zero(); + *rhs = Eigen::Matrix::Zero(); + + // Matrix multiplications are hand-coded for speed improvements vs. + // opencv's cvGEMM calls. + for (const auto& feature : feature_list.feature()) { + T scale = 1.0; + if (prev_solution) { + const T denom = prev_solution->h_20() * feature.x() + + prev_solution->h_21() * feature.y() + 1.0; + if (fabs(denom) > 1e-5) { + scale /= denom; + } else { + scale = 0; + } + } + const T w = feature.irls_weight() * scale; + const T x = feature.x(); + const T y = feature.y(); + const T xw = x * w; + const T yw = y * w; + const T xxw = x * x * w; + const T yyw = y * y * w; + const T xyw = x * y * w; + const T mx = feature.x() + feature.dx(); + const T my = feature.y() + feature.dy(); + + const T mxxyy = mx * mx + my * my; + // Jacobian + // double J[2 * 8] = {x, y, 1, 0, 0, 0, -x * m_x, -y * m_x, + // {0, 0, 0, x, y, 1, -x * m_y, -y * m_y} + // + // // Compute J^t * J * w = + // ( xx xy x 0 0 0 -xx*mx -xy*mx ) + // ( xy yy y 0 0 0 -xy*mx -yy*mx ) + // ( x y 1 0 0 0 -x*mx -y*mx ) + // ( 0 0 0 xx xy x -xx*my -xy*my ) + // ( 0 0 0 xy yy y -xy*my -yy*my ) + // ( 0 0 0 x y 1 -x*my -y*my ) + // ( -xx*mx -xy*mx -x*mx -xx*my -xy*my -x*my xx*mxxyy xy*mxxyy ) + // ( -xy*mx -yy*mx -y*mx -xy*my -yy*my -y*my xy*mxxyy yy*mxxyy ) * w + + // 1st row: xx xy x 0 0 0 -xx*mx -xy*mx + T* matrix_ptr = matrix->data(); + matrix_ptr[0] += xxw; + matrix_ptr[1] += xyw; + matrix_ptr[2] += xw; + matrix_ptr[6] += -xxw * mx; + matrix_ptr[7] += -xyw * mx; + + // 2nd row: xy yy y 0 0 0 -xy*mx -yy*mx + matrix_ptr += 8; + matrix_ptr[0] += xyw; + matrix_ptr[1] += yyw; + matrix_ptr[2] += yw; + matrix_ptr[6] += -xyw * mx; + matrix_ptr[7] += -yyw * mx; + + // 3rd row: x y 1 0 0 0 -x*mx -y*mx + matrix_ptr += 8; + matrix_ptr[0] += xw; + 
matrix_ptr[1] += yw; + matrix_ptr[2] += w; + matrix_ptr[6] += -xw * mx; + matrix_ptr[7] += -yw * mx; + + // 4th row: 0 0 0 xx xy x -xx*my -xy*my + matrix_ptr += 8; + matrix_ptr[3] += xxw; + matrix_ptr[4] += xyw; + matrix_ptr[5] += xw; + matrix_ptr[6] += -xxw * my; + matrix_ptr[7] += -xyw * my; + + // 5th row: 0 0 0 xy yy y -xy*my -yy*my + matrix_ptr += 8; + matrix_ptr[3] += xyw; + matrix_ptr[4] += yyw; + matrix_ptr[5] += yw; + matrix_ptr[6] += -xyw * my; + matrix_ptr[7] += -yyw * my; + + // 6th row: 0 0 0 x y 1 -x*my -y*my + matrix_ptr += 8; + matrix_ptr[3] += xw; + matrix_ptr[4] += yw; + matrix_ptr[5] += w; + matrix_ptr[6] += -xw * my; + matrix_ptr[7] += -yw * my; + + // 7th row: -xx*mx -xy*mx -x*mx -xx*my -xy*my -x*my xx*mxxyy xy*mxxyy + matrix_ptr += 8; + matrix_ptr[0] += -xxw * mx; + matrix_ptr[1] += -xyw * mx; + matrix_ptr[2] += -xw * mx; + matrix_ptr[3] += -xxw * my; + matrix_ptr[4] += -xyw * my; + matrix_ptr[5] += -xw * my; + matrix_ptr[6] += xxw * mxxyy; + matrix_ptr[7] += xyw * mxxyy; + + // 8th row: -xy*mx -yy*mx -y*mx -xy*my -yy*my -y*my xy*mxxyy yy*mxxyy + matrix_ptr += 8; + matrix_ptr[0] += -xyw * mx; + matrix_ptr[1] += -yyw * mx; + matrix_ptr[2] += -yw * mx; + matrix_ptr[3] += -xyw * my; + matrix_ptr[4] += -yyw * my; + matrix_ptr[5] += -yw * my; + matrix_ptr[6] += xyw * mxxyy; + matrix_ptr[7] += yyw * mxxyy; + + // Right hand side: + // b = ( x + // y ) + // Compute J^t * b * w = + // ( x*mx y*mx mx x*my y*my my -x*mxxyy -y*mxxyy ) * w + T* rhs_ptr = rhs->data(); + rhs_ptr[0] += xw * mx; + rhs_ptr[1] += yw * mx; + rhs_ptr[2] += mx * w; + rhs_ptr[3] += xw * my; + rhs_ptr[4] += yw * my; + rhs_ptr[5] += my * w; + rhs_ptr[6] += -xw * mxxyy; + rhs_ptr[7] += -yw * mxxyy; + } + + if (perspective_regularizer > 0) { + // Additional constraint: + // C[8] = {0, 0, 0, 0, 0, 0, r, r} + // Compute C^t * C = + // [ 0 ... 0 0 0 + // ... + // 0 ... 0 r^2 r^2 + // 0 ... 
0 r^2 r^2 ] + const T sq_r = perspective_regularizer * perspective_regularizer; + + T* matrix_ptr = matrix->row(6).data(); + matrix_ptr[6] += sq_r; + matrix_ptr[7] += sq_r; + matrix_ptr += 8; + matrix_ptr[6] += sq_r; + matrix_ptr[7] += sq_r; + // Nothing to add to RHS (zero). + } + + // Solution parameters p. + *solution = matrix->colPivHouseholderQr().solve(*rhs); + if (((*matrix) * (*solution)).isApprox(*rhs, kPrecision)) { + const T* ptr = solution->data(); + Homography model; + + model.set_h_00(ptr[0]); + model.set_h_01(ptr[1]); + model.set_h_02(ptr[2]); + model.set_h_10(ptr[3]); + model.set_h_11(ptr[4]); + model.set_h_12(ptr[5]); + model.set_h_20(ptr[6]); + model.set_h_21(ptr[7]); + + if (success) { + *success = true; + } + return model; + } + + if (success) { + *success = false; + } + return Homography(); +} + +namespace { + +float PatchDescriptorIRLSWeight(const RegionFlowFeature& feature) { + float weight = feature.irls_weight(); + + // Blend weight to combine irls weight with a feature's path standard + // deviation. + const float alpha = 0.7f; + // Inverse of maximum value of standard deviation for intensities in [0, 255]. + // Scaled such that only low textured regions are given small weight. + const float denom = 1.0f / 128.0f * 5.0f; + + const float feature_stdev_l1 = + PatchDescriptorColorStdevL1(feature.feature_descriptor()); + + if (feature_stdev_l1 >= 0.0f) { + weight *= alpha + (1.f - alpha) * std::min(1.f, feature_stdev_l1 * denom); + } + + return weight; +} + +// Extension of above function to evenly spaced row-mixture models. +bool MixtureHomographyL2DLTSolve( + const RegionFlowFeatureList& feature_list, int num_models, + const MixtureRowWeights& row_weights, float regularizer_lambda, + Eigen::MatrixXf* matrix, // least squares matrix + Eigen::MatrixXf* solution) { + CHECK(matrix); + CHECK(solution); + + // cv::solve can hang for really bad conditioned systems. 
+ const double feature_irls_sum = RegionFlowFeatureIRLSSum(feature_list); + if (feature_irls_sum > kMaxCondition) { + return false; + } + + const int num_dof = 8 * num_models; + const int num_constraints = num_dof - 8; + + CHECK_EQ(matrix->cols(), num_dof); + // 2 Rows (x,y) per feature. + CHECK_EQ(matrix->rows(), 2 * feature_list.feature_size() + num_constraints); + CHECK_EQ(solution->cols(), 1); + CHECK_EQ(solution->rows(), num_dof); + + // Compute homography from features. (H * location = prev_location) + *matrix = Eigen::MatrixXf::Zero(matrix->rows(), matrix->cols()); + Eigen::Matrix rhs = + Eigen::MatrixXf::Zero(matrix->rows(), 1); + + // Normalize feature sum to 1. + float irls_denom = 1.0 / (feature_irls_sum + 1e-6); + + // Create matrix for DLT. + int feature_idx = 0; + for (auto feature = feature_list.feature().begin(); + feature != feature_list.feature().end(); ++feature, ++feature_idx) { + float* mat_row_1 = matrix->row(2 * feature_idx).data(); + float* mat_row_2 = matrix->row(2 * feature_idx + 1).data(); + float* rhs_row_1 = rhs.row(2 * feature_idx).data(); + float* rhs_row_2 = rhs.row(2 * feature_idx + 1).data(); + + Vector2_f pt = FeatureLocation(*feature); + Vector2_f prev_pt = FeatureMatchLocation(*feature); + // Weight per feature. + const float f_w = PatchDescriptorIRLSWeight(*feature) * irls_denom; + + // Scale feature point by weight; + Vector2_f pt_w = pt * f_w; + const float* mix_weights = row_weights.RowWeightsClamped(feature->y()); + + for (int m = 0; m < num_models; ++m, mat_row_1 += 8, mat_row_2 += 8) { + const float w = mix_weights[m]; + // Entries 0 .. 2 are zero. + mat_row_1[3] = -pt_w.x() * w; + mat_row_1[4] = -pt_w.y() * w; + mat_row_1[5] = -f_w * w; + + mat_row_1[6] = pt_w.x() * prev_pt.y() * w; + mat_row_1[7] = pt_w.y() * prev_pt.y() * w; + + mat_row_2[0] = pt_w.x() * w; + mat_row_2[1] = pt_w.y() * w; + mat_row_2[2] = f_w * w; + + // Entries 3 .. 5 are zero. 
+ mat_row_2[6] = -pt_w.x() * prev_pt.x() * w; + mat_row_2[7] = -pt_w.y() * prev_pt.x() * w; + } + + // Weights sum to one (-> take out of loop). + rhs_row_1[0] = -prev_pt.y() * f_w; + rhs_row_2[0] = prev_pt.x() * f_w; + } + + // Add regularizer term. It is important to weight perspective larger + // to roughly obtain similar magnitudes across parameters. + const float param_weights[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 100.f, 100.f}; + + const int reg_row_start = 2 * feature_list.feature_size(); + for (int m = 0; m < num_models - 1; ++m) { + for (int p = 0; p < 8; ++p) { + const int curr_idx = m * 8 + p; + float* curr_row = matrix->row(reg_row_start + curr_idx).data(); + curr_row[curr_idx] = regularizer_lambda * param_weights[p]; + curr_row[curr_idx + 8] = -regularizer_lambda * param_weights[p]; + } + } + + // TODO: Consider a faster function? + *solution = matrix->colPivHouseholderQr().solve(rhs); + return ((*matrix) * (*solution)).isApprox(rhs, kPrecision); +} + +// Constraint mixture homography model. +// Only translation (2 DOF) varies across mixture of size num_models, with +// strictly affine and perspective part (4 + 2 = 6 DOF) being constant across +// the mixtures. +bool TransMixtureHomographyL2DLTSolve( + const RegionFlowFeatureList& feature_list, int num_models, + const MixtureRowWeights& row_weights, float regularizer_lambda, + Eigen::MatrixXf* matrix, // least squares matrix + Eigen::MatrixXf* solution) { + CHECK(matrix); + CHECK(solution); + + // cv::solve can hang for really bad conditioned systems. + const double feature_irls_sum = RegionFlowFeatureIRLSSum(feature_list); + if (feature_irls_sum > kMaxCondition) { + return false; + } + + const int num_dof = 6 + 2 * num_models; + const int num_constraints = 2 * (num_models - 1); + + CHECK_EQ(matrix->cols(), num_dof); + // 2 Rows (x,y) per feature. 
+ CHECK_EQ(matrix->rows(), 2 * feature_list.feature_size() + num_constraints); + CHECK_EQ(solution->cols(), 1); + CHECK_EQ(solution->rows(), num_dof); + + // Compute homography from features. (H * location = prev_location) + *matrix = Eigen::MatrixXf::Zero(matrix->rows(), matrix->cols()); + Eigen::Matrix rhs = + Eigen::MatrixXf::Zero(matrix->rows(), 1); + + // Create matrix for DLT. + int feature_idx = 0; + + // Normalize feature sum to 1. + float irls_denom = 1.0 / (feature_irls_sum + 1e-6); + + for (auto feature = feature_list.feature().begin(); + feature != feature_list.feature().end(); ++feature, ++feature_idx) { + float* mat_row_1 = matrix->row(2 * feature_idx).data(); + float* mat_row_2 = matrix->row(2 * feature_idx + 1).data(); + float* rhs_row_1 = rhs.row(2 * feature_idx).data(); + float* rhs_row_2 = rhs.row(2 * feature_idx + 1).data(); + + Vector2_f pt = FeatureLocation(*feature); + Vector2_f prev_pt = FeatureMatchLocation(*feature); + + // Weight per feature. + const float f_w = PatchDescriptorIRLSWeight(*feature) * irls_denom; + + // Scale feature point by weight. + Vector2_f pt_w = pt * f_w; + const float* mix_weights = row_weights.RowWeightsClamped(feature->y()); + + // Entries 0 .. 1 are zero. + mat_row_1[2] = -pt_w.x(); + mat_row_1[3] = -pt_w.y(); + + mat_row_1[4] = pt_w.x() * prev_pt.y(); + mat_row_1[5] = pt_w.y() * prev_pt.y(); + + mat_row_2[0] = pt_w.x(); + mat_row_2[1] = pt_w.y(); + + // Entries 2 .. 3 are zero. + mat_row_2[4] = -pt_w.x() * prev_pt.x(); + mat_row_2[5] = -pt_w.y() * prev_pt.x(); + + // Weights sum to one (-> take out of loop). 
+ rhs_row_1[0] = -prev_pt.y() * f_w; + rhs_row_2[0] = prev_pt.x() * f_w; + + for (int m = 0; m < num_models; ++m, mat_row_1 += 2, mat_row_2 += 2) { + const float w = mix_weights[m]; + mat_row_1[6] = 0; + mat_row_1[7] = -f_w * w; + + mat_row_2[6] = f_w * w; + mat_row_2[7] = 0; + } + } + + const int reg_row_start = 2 * feature_list.feature_size(); + int constraint_idx = 0; + for (int m = 0; m < num_models - 1; ++m) { + for (int p = 0; p < 2; ++p, ++constraint_idx) { + const int curr_idx = 6 + m * 2 + p; + float* curr_row = matrix->row(reg_row_start + constraint_idx).data(); + curr_row[curr_idx] = regularizer_lambda; + curr_row[curr_idx + 2] = -regularizer_lambda; + } + } + + // TODO: Consider a faster function + *solution = matrix->colPivHouseholderQr().solve(rhs); + return ((*matrix) * (*solution)).isApprox(rhs, kPrecision); +} + +// Constraint mixture homography model. +// Only translation and skew and rotation (2 + 2 = 4 DOF) vary across mixture +// of size num_models, with scale and perspective part (2 + 2 = 4 DOF) being +// constant across the mixtures. +bool SkewRotMixtureHomographyL2DLTSolve( + const RegionFlowFeatureList& feature_list, int num_models, + const MixtureRowWeights& row_weights, float regularizer_lambda, + Eigen::MatrixXf* matrix, // least squares matrix + Eigen::MatrixXf* solution) { + CHECK(matrix); + CHECK(solution); + + // cv::solve can hang for really bad conditioned systems. + const double feature_irls_sum = RegionFlowFeatureIRLSSum(feature_list); + if (feature_irls_sum > kMaxCondition) { + return false; + } + + const int num_dof = 4 + 4 * num_models; + const int num_constraints = 4 * (num_models - 1); + + CHECK_EQ(matrix->cols(), num_dof); + // 2 Rows (x,y) per feature. + CHECK_EQ(matrix->rows(), 2 * feature_list.feature_size() + num_constraints); + CHECK_EQ(solution->cols(), 1); + CHECK_EQ(solution->rows(), num_dof); + + // Compute homography from features. 
(H * location = prev_location) + *matrix = Eigen::MatrixXf::Zero(matrix->rows(), matrix->cols()); + Eigen::Matrix rhs = + Eigen::MatrixXf::Zero(matrix->rows(), 1); + + // Create matrix for DLT. + int feature_idx = 0; + + // Normalize feature sum to 1. + float irls_denom = 1.0 / (feature_irls_sum + 1e-6); + + for (auto feature = feature_list.feature().begin(); + feature != feature_list.feature().end(); ++feature, ++feature_idx) { + Vector2_f pt = FeatureLocation(*feature); + Vector2_f prev_pt = FeatureMatchLocation(*feature); + + // Weight per feature. + const float f_w = PatchDescriptorIRLSWeight(*feature) * irls_denom; + + // Scale feature point by weight. + Vector2_f pt_w = pt * f_w; + const float* mix_weights = row_weights.RowWeightsClamped(feature->y()); + + // Compare to MixtureHomographyDLTSolve. + // Mapping of parameters (from homography to mixture) is as follows: + // 0 1 2 3 4 5 6 7 + // --> 0 4 6 5 1 7 2 3 + + int feature_row = 2 * feature_idx; + // Entry 0 is zero. + // Skew is in mixture. + (*matrix)(feature_row, 1) = -pt_w.y(); + (*matrix)(feature_row, 2) = pt_w.x() * prev_pt.y(); + (*matrix)(feature_row, 3) = pt_w.y() * prev_pt.y(); + + (*matrix)(feature_row + 1, 0) = pt_w.x(); + // Entry 1 is zero. + (*matrix)(feature_row + 1, 2) = -pt_w.x() * prev_pt.x(); + (*matrix)(feature_row + 1, 3) = -pt_w.y() * prev_pt.x(); + + // Weights sum to one (-> take out of loop). + rhs(feature_row, 0) = -prev_pt.y() * f_w; + rhs(feature_row + 1, 0) = prev_pt.x() * f_w; + + // Is this right? + for (int m = 0; m < num_models; ++m) { + const float w = mix_weights[m]; + (*matrix)(feature_row, 4 + 4 * m) = 0.0f; + (*matrix)(feature_row, 5 + 4 * m) = -pt_w.x() * w; // Skew. + (*matrix)(feature_row, 6 + 4 * m) = 0.0f; + (*matrix)(feature_row, 7 + 4 * m) = -f_w * w; + + (*matrix)(feature_row + 1, 4 + 4 * m) = pt_w.y() * w; + (*matrix)(feature_row + 1, 5 + 4 * m) = 0.0f; // Translation. 
+ (*matrix)(feature_row + 1, 6 + 4 * m) = f_w * w; + (*matrix)(feature_row + 1, 7 + 4 * m) = 0.0f; + } + } + + const int reg_row_start = 2 * feature_list.feature_size(); + int constraint_idx = 0; + for (int m = 0; m < num_models - 1; ++m) { + for (int p = 0; p < 4; ++p, ++constraint_idx) { + const int curr_idx = 4 + m * 4 + p; + int curr_row = reg_row_start + constraint_idx; + (*matrix)(curr_row, curr_idx) = regularizer_lambda; + (*matrix)(curr_row, curr_idx + 4) = -regularizer_lambda; + } + } + + // TODO: Consider a faster function? + *solution = matrix->colPivHouseholderQr().solve(rhs); + return ((*matrix) * (*solution)).isApprox(rhs, kPrecision); +} + +} // namespace. + +// For plot example for IRLS_WEIGHT_PERIMITER_GAUSSIAN, see: goo.gl/fNzQc +// (plot assumes 3:2 ratio for width:height). +void MotionEstimation::GetHomographyIRLSCenterWeights( + const RegionFlowFeatureList& feature_list, + std::vector* weights) const { + CHECK(weights != nullptr); + + const int num_features = feature_list.feature_size(); + weights->clear(); + weights->reserve(num_features); + + // Early return for constant weight. 
+ if (options_.homography_irls_weight_initialization() == + MotionEstimationOptions::IRLS_WEIGHT_CONSTANT_ONE) { + weights->resize(num_features, 1.0f); + return; + } + + const float sigma_x = normalized_domain_.x() * 0.3f; + const float sigma_y = normalized_domain_.y() * 0.3f; + const float denom_x = 1.0f / (sigma_x * sigma_x); + const float denom_y = 1.0f / (sigma_y * sigma_y); + const Vector2_f center = normalized_domain_ * 0.5f; + + for (const auto& feature : feature_list.feature()) { + const float diff_x = feature.x() - center.x(); + const float diff_y = feature.y() - center.y(); + const float dist = diff_x * diff_x * denom_x + diff_y * diff_y * denom_y; + const float weight = std::exp(static_cast(-0.5f * dist)); + + switch (options_.homography_irls_weight_initialization()) { + case MotionEstimationOptions::IRLS_WEIGHT_CENTER_GAUSSIAN: + weights->push_back(weight); + break; + case MotionEstimationOptions::IRLS_WEIGHT_PERIMETER_GAUSSIAN: + weights->push_back(1.0f - weight * 0.5f); + break; + default: + LOG(INFO) << "Unsupported IRLS weighting."; + } + } +} + +bool MotionEstimation::IsStableTranslation( + const TranslationModel& translation, float translation_variance, + const RegionFlowFeatureList& features) const { + if (options_.deactivate_stable_motion_estimation()) { + return true; + } + + const bool sufficient_features = + features.feature_size() >= + options_.stable_translation_bounds().min_features(); + + if (!sufficient_features) { + VLOG(1) << "Translation unstable, insufficient features."; + return false; + } + + const float translation_magnitude = + std::hypot(translation.dx(), translation.dy()); + const float max_translation_magnitude = + options_.stable_translation_bounds().frac_max_motion_magnitude() * + hypot(frame_width_, frame_height_); + + const float stdev = std::sqrt(static_cast(translation_variance)) / + hypot(frame_width_, frame_height_); + + const float max_motion_stdev_threshold = + 
options_.stable_translation_bounds().max_motion_stdev_threshold(); + + // Only test for exceeding max translation magnitude if standard deviation + // of translation is not close to zero (in which case registration + // can be considered stable). + if (translation_magnitude >= max_translation_magnitude && + stdev >= max_motion_stdev_threshold) { + VLOG(1) << "Translation unstable, exceeds max translation: " + << translation_magnitude << " stdev: " << stdev; + return false; + } + + if (stdev >= options_.stable_translation_bounds().max_motion_stdev()) { + VLOG(1) << "Translation unstable, translation variance out of bound: " + << stdev; + return false; + } + + return true; +} + +void MotionEstimation::CheckTranslationAcceleration( + std::vector* camera_motions) const { + CHECK(camera_motions != nullptr); + std::vector magnitudes; + for (const auto& motion : *camera_motions) { + const float translation_magnitude = + LinearSimilarityAdapter::TransformPoint( + normalization_transform_, + Vector2_f(motion.translation().dx(), motion.translation().dy())) + .Norm(); + + magnitudes.push_back(translation_magnitude); + } + + // Determine median motion around each frame + // (really third lower percentile here). + const int median_radius = 6; + const int num_magnitudes = magnitudes.size(); + std::vector median_magnitudes; + const float kZeroMotion = 3e-4f; // 0.5 pixels @ 720p. + for (int k = 0; k < num_magnitudes; ++k) { + std::vector filter; + const auto mag_begin = + magnitudes.begin() + std::max(0, (k - median_radius)); + const auto mag_end = + magnitudes.begin() + std::min(num_magnitudes, k + median_radius + 1); + + for (auto mag = mag_begin; mag != mag_end; ++mag) { + // Ignore zero motion (empty or duplicate frames). + if (*mag > kZeroMotion) { + filter.push_back(*mag); + } + } + + const float kMinMotion = 1e-3f; // 1.5 pixels @ 720p. 
+ if (filter.empty()) { + median_magnitudes.push_back(kMinMotion); + } else { + auto median_iter = filter.begin() + filter.size() / 3; + std::nth_element(filter.begin(), median_iter, filter.end()); + median_magnitudes.push_back(std::max(kMinMotion, *median_iter)); + } + } + + const float max_acceleration = + options_.stable_translation_bounds().max_acceleration(); + for (int k = 0; k < magnitudes.size(); ++k) { + // Relative test, test for acceleration and de-acceleration (only + // in case motion is not zero, to ignore empty or duplicated frames). + if (magnitudes[k] > max_acceleration * median_magnitudes[k] || + (magnitudes[k] > kZeroMotion && + median_magnitudes[k] > max_acceleration * magnitudes[k])) { + MotionEstimation::ResetMotionModels(options_, &(*camera_motions)[k]); + } + } +} + +bool MotionEstimation::IsStableSimilarity( + const LinearSimilarityModel& model, + const RegionFlowFeatureList& feature_list, int num_inliers) const { + if (options_.deactivate_stable_motion_estimation()) { + // Only check if model is invertible. + return IsInverseStable(model); + } + + const auto& similarity_bounds = options_.stable_similarity_bounds(); + + if (similarity_bounds.only_stable_input() && feature_list.unstable()) { + VLOG(1) << "Feature list is unstable."; + return false; + } + + const float lower_scale = similarity_bounds.lower_scale(); + const float upper_scale = similarity_bounds.upper_scale(); + if (model.a() < lower_scale || model.a() > upper_scale) { + VLOG(1) << "Warning: Unstable similarity found. " + << "Scale is out of bound: " << model.a(); + return false; + } + + const float limit_rotation = similarity_bounds.limit_rotation(); + if (fabs(model.b()) > limit_rotation) { + VLOG(1) << "Warning: Unstable similarity found. 
" + << "Rotation is out of bound: " << model.b(); + return false; + } + + if (num_inliers < similarity_bounds.min_inliers()) { + VLOG(1) << "Unstable similarity, only " << num_inliers << " inliers chosen " + << "from " << feature_list.feature_size() << " features."; + return false; + } + + if (num_inliers < + similarity_bounds.min_inlier_fraction() * feature_list.feature_size()) { + VLOG(1) << "Unstable similarity, inlier fraction only " + << static_cast(num_inliers) / + (feature_list.feature_size() + 1.e-6f); + return false; + } + + return true; +} + +bool MotionEstimation::IsStableHomography(const Homography& model, + float average_homography_error, + float inlier_coverage) const { + if (options_.deactivate_stable_motion_estimation()) { + return IsInverseStable(model); + } + + const auto& homography_bounds = options_.stable_homography_bounds(); + + // Test if the inter-frame transform is stable, i.e. small enough + // that its faithful estimation is possible. + const float lower_scale = homography_bounds.lower_scale(); + const float upper_scale = homography_bounds.upper_scale(); + if (model.h_00() < lower_scale || model.h_00() > upper_scale || // Scale. + model.h_11() < lower_scale || model.h_11() > upper_scale) { + VLOG(1) << "Warning: Unstable homography found. " + << "Scale is out of bound: " << model.h_00() << " " << model.h_11(); + return false; + } + + const float limit_rotation = homography_bounds.limit_rotation(); + if (fabs(model.h_01()) > limit_rotation || // Rotation. + fabs(model.h_10()) > limit_rotation) { + VLOG(1) << "Warning: Unstable homography found. " + << "Rotation is out of bound: " << model.h_01() << " " + << model.h_10(); + return false; + } + + const float limit_perspective = homography_bounds.limit_perspective(); + if (fabs(model.h_20()) > limit_perspective || // Perspective. + fabs(model.h_21()) > limit_perspective) { + VLOG(1) << "Warning: Unstable homography found. 
" + << "Perspective is out of bound:" << model.h_20() << " " + << model.h_21(); + return false; + } + + const float min_inlier_coverage = homography_bounds.min_inlier_coverage(); + const float registration_threshold = + std::max(homography_bounds.registration_threshold(), + homography_bounds.frac_registration_threshold() * + hypot(frame_width_, frame_height_)); + + if (average_homography_error > registration_threshold && + inlier_coverage <= min_inlier_coverage) { + VLOG(1) << "Unstable homography found. " + << "Registration (actual, threshold): " << average_homography_error + << " " << registration_threshold + << " Inlier coverage (actual, threshold): " << inlier_coverage + << " " << min_inlier_coverage; + return false; + } + + return true; +} + +bool MotionEstimation::IsStableMixtureHomography( + const MixtureHomography& homography, float min_block_inlier_coverage, + const std::vector& block_inlier_coverage) const { + if (options_.deactivate_stable_motion_estimation()) { + return true; + } + + const int num_blocks = block_inlier_coverage.size(); + std::vector stable_block(num_blocks, false); + for (int k = 0; k < num_blocks; ++k) { + stable_block[k] = block_inlier_coverage[k] > min_block_inlier_coverage; + } + + int unstable_start = -1; + int empty_start = -1; + const int max_outlier_blocks = + options_.stable_mixture_homography_bounds().max_adjacent_outlier_blocks(); + const int max_empty_blocks = + options_.stable_mixture_homography_bounds().max_adjacent_empty_blocks(); + + for (int k = 0; k < num_blocks; ++k) { + const int offset = unstable_start == 0 ? 1 : 0; + // Test for outlier blocks. + if (stable_block[k]) { + if (unstable_start >= 0 && + k - unstable_start >= max_outlier_blocks - offset) { + return false; + } + unstable_start = -1; + } else { + if (unstable_start < 0) { + unstable_start = k; + } + } + + // Test for empty blocks. 
+ if (block_inlier_coverage[k] > 0) { + if (empty_start >= 0 && k - empty_start >= max_empty_blocks - offset) { + return false; + } + empty_start = -1; + } else { + if (empty_start < 0) { + empty_start = k; + } + } + } + + if (unstable_start >= 0 && + num_blocks - unstable_start >= max_outlier_blocks) { + return false; + } + + if (empty_start >= 0 && num_blocks - empty_start >= max_empty_blocks) { + return false; + } + + return true; +} + +float MotionEstimation::GridCoverage( + const RegionFlowFeatureList& feature_list, float min_inlier_score, + MotionEstimationThreadStorage* thread_storage) const { + CHECK(thread_storage != nullptr); + + // 10x10 grid for coverage estimation. + const int grid_size = options_.coverage_grid_size(); + const int mask_size = grid_size * grid_size; + + const float scaled_width = 1.0f / normalized_domain_.x() * grid_size; + const float scaled_height = 1.0f / normalized_domain_.y() * grid_size; + + const std::vector& grid_cell_weights = + thread_storage->GridCoverageInitializationWeights(); + CHECK_EQ(mask_size, grid_cell_weights.size()); + + const float max_inlier_score = 1.75f * min_inlier_score; + const float mid_inlier_score = 0.5 * (min_inlier_score + max_inlier_score); + + // Map min_inlier to 0.1 and max_inlier to 0.9 via logistic regression. + // f(x) = 1 / (1 + exp(-a(x - mid))) + // f(min) == 0.1 ==> a = ln(1 / 0.1 - 1) / (mid - min) + const float logistic_scale = 2.1972245 / // ln(1.0 / 0.1 - 1) + (mid_inlier_score - min_inlier_score); + + const int num_overlaps = 3; + + // Maximum coverage and number of features across shifted versions. 
+ std::vector max_coverage(mask_size, 0.0f); + std::vector max_features(mask_size, 0); + + for (int overlap_y = 0; overlap_y < num_overlaps; ++overlap_y) { + const float shift_y = + normalized_domain_.y() / grid_size * overlap_y / num_overlaps; + for (int overlap_x = 0; overlap_x < num_overlaps; ++overlap_x) { + const float shift_x = + normalized_domain_.x() / grid_size * overlap_x / num_overlaps; + std::vector>& irls_mask = + *thread_storage->EmptyGridCoverageIrlsMask(); + CHECK_EQ(mask_size, irls_mask.size()); + + // Bin features. + for (const auto& feature : feature_list.feature()) { + if (feature.irls_weight() > 0) { + const int x = + static_cast((feature.x() - shift_x) * scaled_width); + const int y = + static_cast((feature.y() - shift_y) * scaled_height); + // Ignore features that are out of bound in one shifted version. + if (x < 0 || y < 0 || x >= grid_size || y >= grid_size) { + continue; + } + + const int grid_bin = y * grid_size + x; + irls_mask[grid_bin].push_back(feature.irls_weight()); + } + } + + for (int k = 0; k < mask_size; ++k) { + if (irls_mask[k].size() < 2) { // At least two features present for + continue; // grid cell to be considered. 
+ } + + const int median_elem = irls_mask[k].size() / 2; + std::nth_element(irls_mask[k].begin(), + irls_mask[k].begin() + median_elem, + irls_mask[k].end()); + + const float irls_median = irls_mask[k][median_elem]; + const float inlier_score = + 1.0f / + (1.0f + std::exp(static_cast( + -logistic_scale * (irls_median - mid_inlier_score)))); + if (max_features[k] < irls_mask[k].size()) { + max_features[k] = irls_mask[k].size(); + max_coverage[k] = inlier_score; + } + } + } + } + + const float cell_weight_sum = + std::accumulate(grid_cell_weights.begin(), grid_cell_weights.end(), 0.0f); + CHECK_GT(cell_weight_sum, 0); + + return std::inner_product(max_coverage.begin(), max_coverage.end(), + grid_cell_weights.begin(), 0.0f) / + cell_weight_sum; +} + +void MotionEstimation::ComputeMixtureCoverage( + const RegionFlowFeatureList& feature_list, float min_inlier_score, + bool assume_rolling_shutter_camera, + MotionEstimationThreadStorage* thread_storage, + CameraMotion* camera_motion) const { + const int grid_size = row_weights_->NumModels(); + const int mask_size = grid_size * grid_size; + std::vector irls_mask(mask_size, 0.0f); + std::vector mask_counter(mask_size, 0.0f); + + const float scaled_width = 1.0f / normalized_domain_.x() * (grid_size - 1); + // Consider features slightly above 1 block distance away from center a block + // to vote for its coverage. + const float weight_threshold = row_weights_->WeightThreshold(1.25f); + + const float max_inlier_score = 1.75f * min_inlier_score; + const float mid_inlier_score = 0.5 * (min_inlier_score + max_inlier_score); + + // Map min_inlier to 0.1 and max_inlier to 0.9 via logistic regression. 
+ // f(x) = 1 / (1 + exp(-a(x - mid))) + // f(min) == 0.1 ==> a = ln(1 / 0.1 - 1) / (mid - min) + const float logistic_scale = 2.1972245 / // ln(1.0 / 0.1 - 1) + (mid_inlier_score - min_inlier_score); + + std::vector texturedness; + ComputeRegionFlowFeatureTexturedness(feature_list, true, &texturedness); + + int texture_idx = 0; + for (auto feature = feature_list.feature().begin(); + feature != feature_list.feature().end(); ++feature, ++texture_idx) { + float irls_weight = feature->irls_weight(); + if (irls_weight == 0) { + continue; + } + + // Account for feature texturedness -> low textured outliers do not cause + // visible artifacts. + if (assume_rolling_shutter_camera) { + // Skip low textured outliers. + if (texturedness[texture_idx] < 0.5 && irls_weight < min_inlier_score) { + continue; + } + + // Weight by texture. + irls_weight /= (texturedness[texture_idx] + 1.e-6f); + } + + // Interpolate into bins. + const float x = feature->x() * scaled_width; + const int bin_x = x; + const float dx = x - bin_x; + const int off_x = static_cast(dx != 0); + + const float* row_weights = row_weights_->RowWeights(feature->y()); + for (int k = 0, grid_bin = bin_x; k < row_weights_->NumModels(); + ++k, grid_bin += grid_size) { + if (row_weights[k] > weight_threshold) { + irls_mask[grid_bin] += irls_weight * row_weights[k] * (1.0f - dx); + mask_counter[grid_bin] += row_weights[k] * (1.0f - dx); + irls_mask[grid_bin + off_x] += irls_weight * row_weights[k] * dx; + mask_counter[grid_bin + off_x] += row_weights[k] * dx; + } + } + } + + std::vector coverage(grid_size, 0.0f); + // Record number of occupied cells per block. + std::vector occupancy(grid_size, 0); + + for (int k = 0, grid_bin = 0; k < grid_size; ++k) { + for (int l = 0; l < grid_size; ++l, ++grid_bin) { + // At least two features (at maximum distance from block center) + // should be present for this cell to be considered. 
+ if (mask_counter[grid_bin] < 2 * weight_threshold) { + continue; + } + + ++occupancy[k]; + + const float irls_average = irls_mask[grid_bin] / mask_counter[grid_bin]; + const float inlier_score = + 1.0f / + (1.0f + std::exp(static_cast( + -logistic_scale * (irls_average - mid_inlier_score)))); + + coverage[k] += inlier_score; + } + + // If block was occupied assign small eps coverage so it is not considered + // empty. + const float empty_block_eps = 1e-2; + if (occupancy[k] > 0 && coverage[k] == 0) { + coverage[k] = empty_block_eps; + } + } + + camera_motion->clear_mixture_inlier_coverage(); + + // For rolling shutter videos, grid cells without features are assumed to + // not cause visible distortions (no features -> lack of texture). + // Limit to 60% of number of cells, to avoid considering only one or two + // occupied cells stable. + for (int k = 0; k < grid_size; ++k) { + const float denom = + 1.0f / (assume_rolling_shutter_camera + ? std::max(grid_size * 0.6, occupancy[k]) + : grid_size); + camera_motion->add_mixture_inlier_coverage(coverage[k] * denom); + } +} + +bool MotionEstimation::EstimateHomographyIRLS( + int irls_rounds, bool compute_stability, + const PriorFeatureWeights* prior_weights, + MotionEstimationThreadStorage* thread_storage, + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const { + if (prior_weights && !prior_weights->HasCorrectDimension( + irls_rounds, feature_list->feature_size())) { + LOG(ERROR) << "Prior weights incorrectly initialized, ignoring."; + prior_weights = nullptr; + } + + std::unique_ptr local_storage; + if (thread_storage == nullptr) { + local_storage.reset(new MotionEstimationThreadStorage(options_, this)); + thread_storage = local_storage.get(); + } + + int num_nonzero_weights = + feature_list->feature_size() - + CountIgnoredRegionFlowFeatures(*feature_list, kOutlierIRLSWeight); + + // Use identity if not enough features found. 
+ const int min_features_for_solution = 9; + if (num_nonzero_weights < min_features_for_solution) { + VLOG(1) << "Homography estimation failed, less than " + << min_features_for_solution << " features usable for estimation."; + *camera_motion->mutable_homography() = Homography(); + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + + bool use_float = true; + // Just declaring does not use memory + Eigen::Matrix matrix_e; + Eigen::Matrix solution_e; + Eigen::Matrix rhs_e; + Eigen::Matrix matrix_d; + Eigen::Matrix solution_d; + Eigen::Matrix rhs_d; + Eigen::Matrix matrix_f; + Eigen::Matrix solution_f; + Eigen::Matrix rhs_f; + + if (options_.use_exact_homography_estimation()) { + const int num_rows = + 2 * feature_list->feature_size() + + (options_.homography_perspective_regularizer() == 0 ? 0 : 1); + matrix_e = Eigen::Matrix::Zero(num_rows, 8); + rhs_e = Eigen::Matrix::Zero(8, 1); + solution_e = Eigen::Matrix::Zero(8, 1); + } else { + if (options_.use_highest_accuracy_for_normal_equations()) { + matrix_d = Eigen::Matrix::Zero(8, 8); + rhs_d = Eigen::Matrix::Zero(); + solution_d = Eigen::Matrix::Zero(8, 1); + use_float = false; + } else { + matrix_f = Eigen::Matrix::Zero(8, 8); + rhs_f = Eigen::Matrix::Zero(); + solution_f = Eigen::Matrix::Zero(8, 1); + use_float = true; + } + } + + // Multiple rounds of weighting based L2 optimization. 
+ Homography norm_model; + const float irls_residual_scale = + GetIRLSResidualScale(camera_motion->average_magnitude(), + options_.irls_motion_magnitude_fraction()); + + const bool irls_use_l0_norm = options_.irls_use_l0_norm(); + + const std::vector* irls_priors = nullptr; + const std::vector* irls_alphas = nullptr; + if (prior_weights && prior_weights->HasNonZeroAlpha()) { + irls_priors = &prior_weights->priors; + irls_alphas = &prior_weights->alphas; + } + + Homography* prev_solution = nullptr; + if (options_.homography_exact_denominator_scaling()) { + prev_solution = &norm_model; + } + + for (int r = 0; r < irls_rounds; ++r) { + if (options_.use_exact_homography_estimation()) { + bool success = false; + + success = HomographyL2QRSolve( + *feature_list, prev_solution, + options_.homography_perspective_regularizer(), &matrix_e, + &solution_e); + if (!success) { + VLOG(1) << "Could not solve for homography."; + *camera_motion->mutable_homography() = Homography(); + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + norm_model = + HomographyAdapter::FromFloatPointer(solution_e.data(), false); + } else { + bool success = false; + if (options_.use_highest_accuracy_for_normal_equations()) { + CHECK(!use_float); + norm_model = HomographyL2NormalEquationSolve( + *feature_list, prev_solution, + options_.homography_perspective_regularizer(), &matrix_d, &rhs_d, + &solution_d, &success); + } else { + CHECK(use_float); + norm_model = HomographyL2NormalEquationSolve( + *feature_list, prev_solution, + options_.homography_perspective_regularizer(), &matrix_f, &rhs_f, + &solution_f, &success); + } + if (!success) { + VLOG(1) << "Could not solve for homography."; + *camera_motion->mutable_homography() = Homography(); + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + } + + const float alpha = irls_alphas != nullptr ? 
(*irls_alphas)[r] : 0.0f; + const float one_minus_alpha = 1.0f - alpha; + + // Compute weights from registration errors. + const auto feature_start = feature_list->mutable_feature()->begin(); + for (auto feature = feature_start; + feature != feature_list->mutable_feature()->end(); ++feature) { + // Ignored features marked as outliers. + if (feature->irls_weight() == 0.0f) { + continue; + } + + // Residual is expressed as geometric difference, that is + // for a point match (p<->q) with estimated homography p, + // geometric difference is defined as Hp x q. + Vector2_f lhs = HomographyAdapter::TransformPoint( + norm_model, FeatureLocation(*feature)); + // Map to original coordinate system to evaluate error. + lhs = LinearSimilarityAdapter::TransformPoint(irls_transform_, lhs); + const Vector3_f lhs3(lhs.x(), lhs.y(), 1); + const Vector2_f rhs = LinearSimilarityAdapter::TransformPoint( + irls_transform_, FeatureMatchLocation(*feature)); + + const Vector3_f rhs3(rhs.x(), rhs.y(), 1); + const Vector3_f cross = lhs3.CrossProd(rhs3); + // We only use the first 2 linearly independent rows. + const Vector2_f cross2(cross.x(), cross.y()); + + const float numerator = + alpha == 0.0f ? 1.0f + : ((*irls_priors)[feature - feature_start] * alpha + + one_minus_alpha); + + if (irls_use_l0_norm) { + feature->set_irls_weight( + numerator / (cross2.Norm() * irls_residual_scale + kIrlsEps)); + } else { + feature->set_irls_weight( + numerator / (std::sqrt(static_cast(cross2.Norm() * + irls_residual_scale)) + + kIrlsEps)); + } + } + } + + // Undo pre_transform. + Homography* model = camera_motion->mutable_homography(); + *model = ModelCompose3( + LinearSimilarityAdapter::ToHomography(inv_normalization_transform_), + norm_model, + LinearSimilarityAdapter::ToHomography(normalization_transform_)); + + if (compute_stability) { + // Score irls and save. + float average_homography_error = 0; + // Number of non-zero features. 
+ int nnz_features = 0; + const float kMinIrlsWeight = 1e-6f; + for (const auto& feature : feature_list->feature()) { + if (feature.irls_weight() > kMinIrlsWeight) { + if (options_.irls_use_l0_norm()) { + average_homography_error += 1.0f / feature.irls_weight(); + } else { + average_homography_error += + 1.0f / (feature.irls_weight() * feature.irls_weight()); + } + ++nnz_features; + } + } + + if (nnz_features > 0) { + average_homography_error *= 1.0f / nnz_features; + } + + camera_motion->set_average_homography_error(average_homography_error); + + // TODO: Use sqrt when use_l0_norm is false. + // Need to verify that does not break face_compositor before modifying. + float inlier_threshold = + options_.stable_homography_bounds().frac_inlier_threshold() * + hypot(frame_width_, frame_height_); + camera_motion->set_homography_inlier_coverage( + GridCoverage(*feature_list, 1.0 / inlier_threshold, thread_storage)); + camera_motion->set_homography_strict_inlier_coverage(GridCoverage( + *feature_list, options_.strict_coverage_scale() / inlier_threshold, + thread_storage)); + } + return true; +} + +bool MotionEstimation::MixtureHomographyFromFeature( + const TranslationModel& camera_translation, int irls_rounds, + float regularizer, const PriorFeatureWeights* prior_weights, + RegionFlowFeatureList* feature_list, + MixtureHomography* mix_homography) const { + if (prior_weights && !prior_weights->HasCorrectDimension( + irls_rounds, feature_list->feature_size())) { + LOG(ERROR) << "Prior weights incorrectly initialized, ignoring."; + prior_weights = nullptr; + } + + const int num_mixtures = options_.num_mixtures(); + + // Compute weights if necessary. + // Compute scale to index mixture weights from normalization. 
+ CHECK(row_weights_.get() != nullptr); + CHECK_EQ(row_weights_->YScale(), frame_height_ / normalized_domain_.y()); + CHECK_EQ(row_weights_->NumModels(), num_mixtures); + + const MotionEstimationOptions::MixtureModelMode mixture_mode = + options_.mixture_model_mode(); + int num_dof = 0; + int adjacency_constraints = 0; + switch (mixture_mode) { + case MotionEstimationOptions::FULL_MIXTURE: + num_dof = 8 * num_mixtures; + adjacency_constraints = 8 * (num_mixtures - 1); + break; + case MotionEstimationOptions::TRANSLATION_MIXTURE: + num_dof = 6 + 2 * num_mixtures; + adjacency_constraints = 2 * (num_mixtures - 1); + break; + case MotionEstimationOptions::SKEW_ROTATION_MIXTURE: + num_dof = 4 + 4 * num_mixtures; + adjacency_constraints = 4 * (num_mixtures - 1); + break; + default: + LOG(FATAL) << "Unknown MixtureModelMode specified."; + } + + Eigen::MatrixXf matrix( + 2 * feature_list->feature_size() + adjacency_constraints, num_dof); + Eigen::MatrixXf solution(num_dof, 1); + + // Multiple rounds of weighting based L2 optimization. + MixtureHomography norm_model; + + // Initialize with identity. + for (int k = 0; k < num_mixtures; ++k) { + norm_model.add_model(); + } + + const bool irls_use_l0_norm = options_.irls_use_l0_norm(); + + const std::vector* irls_priors = nullptr; + const std::vector* irls_alphas = nullptr; + if (prior_weights && prior_weights->HasNonZeroAlpha()) { + irls_priors = &prior_weights->priors; + irls_alphas = &prior_weights->alphas; + } + + for (int r = 0; r < irls_rounds; ++r) { + // Unpack solution to mixture homographies, if not full model. + std::vector solution_unpacked(8 * num_mixtures); + const float* solution_pointer = &solution_unpacked[0]; + + switch (mixture_mode) { + case MotionEstimationOptions::FULL_MIXTURE: + if (!MixtureHomographyL2DLTSolve(*feature_list, num_mixtures, + *row_weights_, regularizer, &matrix, + &solution)) { + return false; + } + // No need to unpack solution. 
+ solution_pointer = solution.data(); + break; + + case MotionEstimationOptions::TRANSLATION_MIXTURE: + if (!TransMixtureHomographyL2DLTSolve(*feature_list, num_mixtures, + *row_weights_, regularizer, + &matrix, &solution)) { + return false; + } + { + const float* sol_ptr = solution.data(); + for (int k = 0; k < num_mixtures; ++k) { + float* curr_ptr = &solution_unpacked[8 * k]; + curr_ptr[0] = sol_ptr[0]; + curr_ptr[1] = sol_ptr[1]; + curr_ptr[2] = sol_ptr[6 + 2 * k]; + curr_ptr[3] = sol_ptr[2]; + curr_ptr[4] = sol_ptr[3]; + curr_ptr[5] = sol_ptr[6 + 2 * k + 1]; + curr_ptr[6] = sol_ptr[4]; + curr_ptr[7] = sol_ptr[5]; + } + } + break; + + case MotionEstimationOptions::SKEW_ROTATION_MIXTURE: + if (!SkewRotMixtureHomographyL2DLTSolve(*feature_list, num_mixtures, + *row_weights_, regularizer, + &matrix, &solution)) { + return false; + } + { + const float* sol_ptr = solution.data(); + for (int k = 0; k < num_mixtures; ++k) { + float* curr_ptr = &solution_unpacked[8 * k]; + curr_ptr[0] = sol_ptr[0]; + curr_ptr[1] = sol_ptr[4 + 4 * k]; + curr_ptr[2] = sol_ptr[4 + 4 * k + 2]; + curr_ptr[3] = sol_ptr[4 + 4 * k + 1]; + curr_ptr[4] = sol_ptr[1]; + curr_ptr[5] = sol_ptr[4 + 4 * k + 3]; + curr_ptr[6] = sol_ptr[2]; + curr_ptr[7] = sol_ptr[3]; + } + } + break; + + default: + LOG(FATAL) << "Unknown MixtureModelMode specified."; + } + + norm_model = MixtureHomographyAdapter::FromFloatPointer( + solution_pointer, false, 0, num_mixtures); + + const float alpha = irls_alphas != nullptr ? (*irls_alphas)[r] : 0.0f; + const float one_minus_alpha = 1.0f - alpha; + + // Evaluate IRLS error. + const auto feature_start = feature_list->mutable_feature()->begin(); + for (auto feature = feature_start; + feature != feature_list->mutable_feature()->end(); ++feature) { + if (feature->irls_weight() == 0.0f) { + continue; + } + + // Residual is expressed in geometric difference, that is + // for a point match (p<->q) with estimated homography p, + // geometric difference is defined as Hp x q. 
+ Vector2_f lhs = MixtureHomographyAdapter::TransformPoint( + norm_model, row_weights_->RowWeightsClamped(feature->y()), + FeatureLocation(*feature)); + // Map to original coordinate system to evaluate error. + lhs = LinearSimilarityAdapter::TransformPoint(irls_transform_, lhs); + + const Vector3_f lhs3(lhs.x(), lhs.y(), 1); + const Vector2_f rhs = LinearSimilarityAdapter::TransformPoint( + irls_transform_, FeatureMatchLocation(*feature)); + + const Vector3_f rhs3(rhs.x(), rhs.y(), 1); + const Vector3_f cross = lhs3.CrossProd(rhs3); + + // We only use the first 2 linearly independent rows. + const Vector2_f cross2(cross.x(), cross.y()); + + const float numerator = + alpha == 0.0f ? 1.0f + : ((*irls_priors)[feature - feature_start] * alpha + + one_minus_alpha); + + if (irls_use_l0_norm) { + feature->set_irls_weight(numerator / (cross2.Norm() + kIrlsEps)); + } else { + feature->set_irls_weight( + numerator / + (std::sqrt(static_cast(cross2.Norm())) + kIrlsEps)); + } + } + } + + // Undo pre_transform. 
+ *mix_homography = MixtureHomographyAdapter::ComposeLeft( + MixtureHomographyAdapter::ComposeRight( + norm_model, + LinearSimilarityAdapter::ToHomography(normalization_transform_)), + LinearSimilarityAdapter::ToHomography(inv_normalization_transform_)); + + switch (mixture_mode) { + case MotionEstimationOptions::FULL_MIXTURE: + mix_homography->set_dof(MixtureHomography::ALL_DOF); + break; + case MotionEstimationOptions::TRANSLATION_MIXTURE: + mix_homography->set_dof(MixtureHomography::TRANSLATION_DOF); + break; + case MotionEstimationOptions::SKEW_ROTATION_MIXTURE: + mix_homography->set_dof(MixtureHomography::SKEW_ROTATION_DOF); + break; + default: + LOG(FATAL) << "Unknown MixtureModelMode specified."; + } + return true; +} + +bool MotionEstimation::EstimateMixtureHomographyIRLS( + int irls_rounds, bool compute_stability, float regularizer, + int spectrum_idx, const PriorFeatureWeights* prior_weights, + MotionEstimationThreadStorage* thread_storage, + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const { + std::unique_ptr local_storage; + if (thread_storage == NULL) { + local_storage.reset(new MotionEstimationThreadStorage(options_, this)); + thread_storage = local_storage.get(); + } + + // We bin features into 3 blocks (top, middle, bottom), requiring each to + // have sufficient features. This is a specialization of the same test for + // the homography case. The tested blocks here are not related to the + // mixture blocks in any manner. + const int min_features_for_solution = 9; + const int num_blocks = 3; + std::vector features_per_block(3, 0); + const float block_scale = num_blocks / normalized_domain_.y(); + + for (const auto& feature : feature_list->feature()) { + if (feature.irls_weight() > 0) { + ++features_per_block[feature.y() * block_scale]; + } + } + + // Require at least two blocks to have sufficient features. 
+ std::sort(features_per_block.begin(), features_per_block.end()); + if (features_per_block[1] < min_features_for_solution) { + VLOG(1) << "Mixture homography estimation not possible, less than " + << min_features_for_solution << " features present."; + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + + MixtureHomography mix_homography; + if (!MixtureHomographyFromFeature(camera_motion->translation(), irls_rounds, + regularizer, prior_weights, feature_list, + &mix_homography)) { + VLOG(1) << "Non-rigid homography estimated. " + << "CameraMotion flagged as unstable."; + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + + if (compute_stability) { + // Test if mixture is invertible for every scanline (test via grid, + // every 10 scanlines, also test one row out of frame domain). + const float test_grid_size = 10.0f / frame_height_ * normalized_domain_.y(); + bool invertible = true; + int counter = 0; + for (float y = -test_grid_size; y < normalized_domain_.y() + test_grid_size; + y += test_grid_size) { + ++counter; + const float* weights = row_weights_->RowWeightsClamped(y); + Homography test_homography = MixtureHomographyAdapter::ToBaseModel( + camera_motion->mixture_homography(), weights); + HomographyAdapter::InvertChecked(test_homography, &invertible); + if (!invertible) { + VLOG(1) << "Mixture is not invertible."; + camera_motion->set_flags(camera_motion->flags() | + CameraMotion::FLAG_SINGULAR_ESTIMATION); + return false; + } + } + } + + while (spectrum_idx >= camera_motion->mixture_homography_spectrum_size()) { + camera_motion->add_mixture_homography_spectrum(); + } + + camera_motion->mutable_mixture_homography_spectrum(spectrum_idx) + ->CopyFrom(mix_homography); + + float mixture_inlier_threshold = + options_.stable_mixture_homography_bounds().frac_inlier_threshold() * + hypot(frame_width_, frame_height_); + + // First computed 
mixture in the spectrum is stored in mixture homography + // member. Also compute coverage for it. + if (spectrum_idx == 0) { + camera_motion->mutable_mixture_homography()->CopyFrom( + camera_motion->mixture_homography_spectrum(0)); + if (compute_stability) { + ComputeMixtureCoverage(*feature_list, 1.0f / mixture_inlier_threshold, + true, thread_storage, camera_motion); + } + } + + // Cap rolling shutter analysis level to be valid level. + if (options_.mixture_rs_analysis_level() >= + options_.mixture_regularizer_levels()) { + LOG(WARNING) << "Resetting mixture_rs_analysis_level to " + << options_.mixture_regularizer_levels() - 1; + } + + const int rs_analysis_level = + std::min(options_.mixture_rs_analysis_level(), + options_.mixture_regularizer_levels() - 1); + + // We compute mixture coverage only for frames which can be safely assumed + // to be stable mixtures, comparing it to the homography coverage and + // recording the increase in coverage. + // Compute coverage assuming rigid camera. + // TODO: Use sqrt when use_l0_norm is false. Need to verify that + // does not break face_compositor before modifying. + if (compute_stability && spectrum_idx == rs_analysis_level) { + std::vector coverage_backup( + camera_motion->mixture_inlier_coverage().begin(), + camera_motion->mixture_inlier_coverage().end()); + + // Evaluate mixture coverage and compute rolling shutter guess. + ComputeMixtureCoverage(*feature_list, 1.0f / mixture_inlier_threshold, + false, thread_storage, camera_motion); + + std::vector mixture_inlier_coverage( + camera_motion->mixture_inlier_coverage().begin(), + camera_motion->mixture_inlier_coverage().end()); + + // Reset to original values. + if (!coverage_backup.empty()) { + camera_motion->clear_mixture_inlier_coverage(); + for (float item : coverage_backup) { + camera_motion->add_mixture_inlier_coverage(item); + } + } + + // Estimate rolling shutter score. 
+ // Use higher threshold on inlier coverage to only consider mixtures that + // are very reliable. + const MixtureHomography& rs_mixture = + camera_motion->mixture_homography_spectrum( + camera_motion->mixture_homography_spectrum_size() - 1); + const float rs_stability_threshold = + options_.stable_mixture_homography_bounds().min_inlier_coverage() * + 1.5f; + + if (IsStableMixtureHomography(rs_mixture, rs_stability_threshold, + mixture_inlier_coverage)) { + // Only use best matches (strict coverage) to determine by how much + // mixture models improve on homographies. + // TODO: Use sqrt when use_l0_norm is false. Need to verify that + // does not break face_compositor before modifying. + float homog_inlier_threshold = + options_.stable_homography_bounds().frac_inlier_threshold() * + hypot(frame_width_, frame_height_); + homog_inlier_threshold /= options_.strict_coverage_scale(); + + float mixture_coverage = GridCoverage( + *feature_list, 1.0f / homog_inlier_threshold, thread_storage); + + const float coverage_ratio = + mixture_coverage / + (camera_motion->homography_strict_inlier_coverage() + 0.01f); + + camera_motion->set_rolling_shutter_guess(coverage_ratio); + } else { + camera_motion->set_rolling_shutter_guess(-1.0f); + } + } + + camera_motion->set_mixture_row_sigma(options_.mixture_row_sigma()); + return true; +} + +void MotionEstimation::DetermineOverlayIndices( + bool irls_weights_preinitialized, std::vector* camera_motions, + std::vector* feature_lists) const { + CHECK(camera_motions != nullptr); + CHECK(feature_lists != nullptr); + // Two stage estimation: First translation only, followed by + // overlay analysis. + const int num_frames = feature_lists->size(); + CHECK_EQ(num_frames, camera_motions->size()); + + std::vector translation_motions(num_frames); + const int irls_per_round = options_.irls_rounds(); + + // Perform quick initialization of weights and backup original ones. 
+ if (!irls_weights_preinitialized) { + for (auto feature_list_ptr : *feature_lists) { + ResetRegionFlowFeatureIRLSWeights(1.0, feature_list_ptr); + } + } + + std::vector> original_irls_weights(num_frames); + for (int f = 0; f < num_frames; ++f) { + const RegionFlowFeatureList& feature_list = *(*feature_lists)[f]; + GetRegionFlowFeatureIRLSWeights(feature_list, &original_irls_weights[f]); + } + + ParallelFor(0, num_frames, 1, + EstimateMotionIRLSInvoker(MODEL_TRANSLATION, irls_per_round, + false, CameraMotion::VALID, + DefaultModelOptions(), this, + nullptr, // No prior weights. + nullptr, // No thread storage here. + feature_lists, &translation_motions)); + + // Restore weights. + for (int f = 0; f < num_frames; ++f) { + RegionFlowFeatureList& feature_list = *(*feature_lists)[f]; + SetRegionFlowFeatureIRLSWeights(original_irls_weights[f], &feature_list); + } + + const int chunk_size = options_.overlay_analysis_chunk_size(); + const int num_chunks = std::ceil(feature_lists->size() * (1.0f / chunk_size)); + + const int overlay_grid_size = + options_.overlay_detection_options().analysis_mask_size(); + for (int chunk = 0; chunk < num_chunks; ++chunk) { + std::vector translations; + std::vector chunk_features; + const int chunk_start = chunk * chunk_size; + const int chunk_end = std::min((chunk + 1) * chunk_size, num_frames); + for (int f = chunk_start; f < chunk_end; ++f) { + translations.push_back(translation_motions[f].translation()); + chunk_features.push_back((*feature_lists)[f]); + } + + std::vector overlay_indices; + OverlayAnalysis(translations, &chunk_features, &overlay_indices); + for (const auto& overlay_idx : overlay_indices) { + (*camera_motions)[chunk_start].add_overlay_indices(overlay_idx); + } + + // Negative marker to frame chunk_start. 
+ for (int f = chunk_start; f < chunk_end; ++f) { + if (f > chunk_start) { + (*camera_motions)[f].add_overlay_indices(chunk_start - f); + } + + (*camera_motions)[f].set_overlay_domain(overlay_grid_size); + } + } +} + +// Features are aggregated over a regular grid of the image domain with the +// purpose to determine if a grid bin is deemed part of an overlay or not. +// In particular, we distinguish between two types of overlay features, strict +// and loose ones, based on different thresholds regarding their motion. A grid +// bin is flagged as overlay if it contains a sufficient number of the strict +// features, in which case *all* overlay feature candidates (strict and loose +// ones) are flagged by setting their irls weight to zero. +float MotionEstimation::OverlayAnalysis( + const std::vector& translations, + std::vector* feature_lists, + std::vector* overlay_indices) const { + CHECK(feature_lists != nullptr); + CHECK(overlay_indices != nullptr); + CHECK_EQ(feature_lists->size(), translations.size()); + overlay_indices->clear(); + + const int grid_size = + options_.overlay_detection_options().analysis_mask_size(); + + const int mask_size = grid_size * grid_size; + const float scaled_width = 1.0f / normalized_domain_.x() * grid_size; + const float scaled_height = 1.0f / normalized_domain_.y() * grid_size; + + const float strict_zero_motion_threshold = + options_.overlay_detection_options().strict_near_zero_motion(); + const float strict_max_translation_ratio = + options_.overlay_detection_options().strict_max_translation_ratio(); + const float loose_zero_motion_threshold = + options_.overlay_detection_options().loose_near_zero_motion(); + + const float strict_min_texturedness = + options_.overlay_detection_options().strict_min_texturedness(); + + std::vector mask_counter(mask_size, 0); + std::vector overlay_counter(mask_size, 0); + std::vector> overlay_features(mask_size); + + for (int frame = 0; frame < feature_lists->size(); ++frame) { + const 
TranslationModel& translation = translations[frame]; + const float trans_magnitude = + std::hypot(translation.dx(), translation.dy()); + const float strict_trans_threshold = + strict_max_translation_ratio * trans_magnitude; + + RegionFlowFeatureList* feature_list = (*feature_lists)[frame]; + std::vector texturedness; + ComputeRegionFlowFeatureTexturedness(*feature_list, false, &texturedness); + + for (int feat_idx = 0, feat_size = feature_list->feature_size(); + feat_idx < feat_size; ++feat_idx) { + RegionFlowFeature* feature = feature_list->mutable_feature(feat_idx); + const int x = static_cast(feature->x() * scaled_width); + const int y = static_cast(feature->y() * scaled_height); + const int grid_bin = y * grid_size + x; + ++mask_counter[grid_bin]; + + // If translation is near zero, this test is impossible, so continue. + if (trans_magnitude < 1.0f) { // In pixels. + continue; + } + + const float feat_magnitude = + LinearSimilarityAdapter::TransformPoint( + irls_transform_, Vector2_f(feature->dx(), feature->dy())) + .Norm(); + if (feat_magnitude <= loose_zero_motion_threshold) { + overlay_features[grid_bin].push_back(feature); + if (feat_magnitude <= strict_trans_threshold && + feat_magnitude <= strict_zero_motion_threshold && + texturedness[feat_idx] >= strict_min_texturedness) { + ++overlay_counter[grid_bin]; + } + } + } + } + + // Determine potential outlier grids. + const float overlay_min_ratio = + options_.overlay_detection_options().overlay_min_ratio(); + + const float overlay_min_features = + options_.overlay_detection_options().overlay_min_features(); + + for (int i = 0; i < mask_size; ++i) { + // Ensure sufficient features were aggregated. + if (mask_counter[i] > overlay_min_features && + overlay_counter[i] > overlay_min_ratio * mask_counter[i]) { + // Consider all features in this bin outliers. 
+ for (auto& feature_ptr : overlay_features[i]) { + feature_ptr->set_irls_weight(0.0f); + } + overlay_indices->push_back(i); + } + } + + return overlay_indices->size() * (1.0f / mask_size); +} + +void MotionEstimation::PostIRLSSmoothing( + const std::vector& camera_motions, + std::vector* feature_lists) const { + CHECK(feature_lists != nullptr); + + std::vector> feature_grids; + std::vector> feature_taps_3; + std::vector> feature_taps_5; + + struct NonOverlayPredicate { + bool operator()(const RegionFlowFeature& feature) const { + return feature.irls_weight() != 0; + } + }; + + std::vector feature_views(feature_lists->size()); + for (int k = 0; k < feature_views.size(); ++k) { + SelectFeaturesFromList( + [](const RegionFlowFeature& feature) -> bool { + return feature.irls_weight() != 0; + }, + (*feature_lists)[k], &feature_views[k]); + } + + // In normalized domain. + BuildFeatureGrid(normalized_domain_.x(), normalized_domain_.y(), + options_.feature_grid_size(), // In normalized coords. + feature_views, FeatureLocation, &feature_taps_3, + &feature_taps_5, nullptr, &feature_grids); + + std::vector feature_frame_confidence(feature_lists->size(), 1.0f); + if (options_.frame_confidence_weighting()) { + float max_confidence = 0.0; + ; + for (int f = 0; f < feature_lists->size(); ++f) { + feature_frame_confidence[f] = + std::max(1e-3f, InlierCoverage(camera_motions[f], false)); + feature_frame_confidence[f] *= feature_frame_confidence[f]; + max_confidence = std::max(max_confidence, feature_frame_confidence[f]); + } + + const float cut_off_confidence = + options_.reset_confidence_threshold() * max_confidence; + for (int f = 0; f < feature_lists->size(); ++f) { + if (feature_frame_confidence[f] < cut_off_confidence) { + // If registration is bad, reset to identity and let adjacent + // frames do the fill in. 
+ for (auto& feature_ptr : feature_views[f]) {
+ feature_ptr->set_irls_weight(1.0f);
+ }
+ }
+ }
+ }
+
+ RunTemporalIRLSSmoothing(feature_grids, feature_taps_3, feature_taps_5,
+ feature_frame_confidence, &feature_views);
+}
+
+namespace {
+
+void ClearInternalIRLSStructure(RegionFlowFeatureView* feature_view) {
+ for (auto& feature_ptr : *feature_view) {
+ feature_ptr->clear_internal_irls();
+ }
+}
+
+} // namespace.
+
+namespace {
+
+// Note: Push / Pull averaging is performed as reciprocal (effectively we
+// average the per feature registration error and convert this back to the irls
+// weight using 1 / error).
+void TemporalIRLSPush(const FeatureGrid& curr_grid,
+ const FeatureGrid* prev_grid,
+ const std::vector>& feature_taps,
+ float space_scale, const std::vector& space_lut,
+ float feature_scale,
+ const std::vector& feature_lut,
+ float temporal_weight, float curr_frame_confidence,
+ float grid_scale, int grid_dim_x,
+ RegionFlowFeatureView* curr_view,
+ RegionFlowFeatureView* prev_view) {
+ CHECK(curr_view != nullptr);
+ // Spatial filtering of inverse irls weights and the temporally weighted
+ // pushed result from the next frame.
+ for (auto& feature : *curr_view) {
+ float weight_sum = feature->internal_irls().weight_sum() * temporal_weight;
+ float value_sum = feature->internal_irls().value_sum() * temporal_weight;
+
+ const int bin_x = feature->x() * grid_scale;
+ const int bin_y = feature->y() * grid_scale;
+ const int grid_loc = bin_y * grid_dim_x + bin_x;
+
+ for (const auto& bin : feature_taps[grid_loc]) {
+ for (const auto& test_feat : curr_grid[bin]) {
+ const float dist =
+ (FeatureLocation(*test_feat) - FeatureLocation(*feature)).Norm();
+ const float feature_dist = RegionFlowFeatureDistance(
+ feature->feature_descriptor(), test_feat->feature_descriptor());
+ const float weight =
+ space_lut[static_cast(dist * space_scale)] *
+ feature_lut[static_cast(feature_dist * feature_scale)] *
+ curr_frame_confidence;
+
+ weight_sum += weight;
+ value_sum += 1.0f / test_feat->irls_weight() * weight;
+ }
+ }
+
+ // Only zero if spatial AND feature sigma = 0.
+ DCHECK_GT(weight_sum, 0);
+ feature->mutable_internal_irls()->set_weight_sum(weight_sum);
+ feature->mutable_internal_irls()->set_value_sum(value_sum);
+ }
+
+ // Clear previous frame's internal irls.
+ if (prev_view) {
+ ClearInternalIRLSStructure(prev_view);
+ }
+
+ // Evaluate irls weight and push result to feature in the previous frame along
+ // the flow dimension (using spatial interpolation).
+ for (auto& feature : *curr_view) { + feature->set_irls_weight(1.0f / (feature->internal_irls().value_sum() / + feature->internal_irls().weight_sum())); + feature->clear_internal_irls(); + + if (prev_view == NULL) { + continue; + } + + const int bin_x = (feature->x() + feature->dx()) * grid_scale; + const int bin_y = (feature->y() + feature->dy()) * grid_scale; + const int grid_loc = bin_y * grid_dim_x + bin_x; + + for (const auto& bin : feature_taps[grid_loc]) { + for (auto& test_feat : (*prev_grid)[bin]) { + float dist = + (FeatureLocation(*test_feat) - FeatureMatchLocation(*feature)) + .Norm(); + const float feature_dist = + RegionFlowFeatureDistance(feature->feature_match_descriptor(), + test_feat->feature_descriptor()); + const float weight = + space_lut[static_cast(dist * space_scale)] * + feature_lut[static_cast(feature_dist * feature_scale)]; + TemporalIRLSSmoothing* temporal_irls = + test_feat->mutable_internal_irls(); + temporal_irls->set_value_sum(temporal_irls->value_sum() + + weight * 1.0f / feature->irls_weight()); + temporal_irls->set_weight_sum(temporal_irls->weight_sum() + weight); + } + } + } +} + +void TemporalIRLSPull(const FeatureGrid& curr_grid, + const FeatureGrid& prev_grid, + const std::vector>& feature_taps, + float space_scale, const std::vector& space_lut, + float feature_scale, + const std::vector& feature_lut, + float temporal_weight, float curr_frame_confidence, + float grid_scale, int grid_dim_x, + RegionFlowFeatureView* curr_view, + RegionFlowFeatureView* prev_view) { + // Pull irls weights of spatially neighboring features from previous frame. + // Neighborhood is displaced by flow. Pulled weights are weighted by + // temporal_weight. 
+ for (auto& feature : *curr_view) { + const int bin_x = (feature->x() + feature->dx()) * grid_scale; + const int bin_y = (feature->y() + feature->dy()) * grid_scale; + const int grid_loc = bin_y * grid_dim_x + bin_x; + + float weight_sum = 0; + float value_sum = 0; + for (const auto& bin : feature_taps[grid_loc]) { + for (const auto& test_feat : prev_grid[bin]) { + float dist = + (FeatureLocation(*test_feat) - FeatureMatchLocation(*feature)) + .Norm(); + const float feature_dist = + RegionFlowFeatureDistance(feature->feature_match_descriptor(), + test_feat->feature_descriptor()); + const float weight = + space_lut[static_cast(dist * space_scale)] * + feature_lut[static_cast(feature_dist * feature_scale)]; + weight_sum += weight; + value_sum += weight * 1.0f / test_feat->irls_weight(); + } + } + + TemporalIRLSSmoothing* temporal_irls = feature->mutable_internal_irls(); + temporal_irls->set_value_sum(value_sum * temporal_weight); + temporal_irls->set_weight_sum(weight_sum * temporal_weight); + } + + // Spatial filtering of neighboring inverse irls_weight and above + // pulled result from the previous frame. 
+ for (auto& feature : *curr_view) { + float weight_sum = feature->internal_irls().weight_sum(); + float value_sum = feature->internal_irls().value_sum(); + + const int bin_x = feature->x() * grid_scale; + const int bin_y = feature->y() * grid_scale; + const int grid_loc = bin_y * grid_dim_x + bin_x; + + for (const auto& bin : feature_taps[grid_loc]) { + for (const auto& test_feat : curr_grid[bin]) { + float dist = + (FeatureLocation(*test_feat) - FeatureLocation(*feature)).Norm(); + const float feature_dist = RegionFlowFeatureDistance( + feature->feature_descriptor(), test_feat->feature_descriptor()); + const float weight = + space_lut[static_cast(dist * space_scale)] * + feature_lut[static_cast(feature_dist * feature_scale)] * + curr_frame_confidence; + weight_sum += weight; + value_sum += 1.0f / test_feat->irls_weight() * weight; + } + } + + CHECK_GT(weight_sum, 0) << feature->irls_weight(); + feature->mutable_internal_irls()->set_weight_sum(weight_sum); + feature->mutable_internal_irls()->set_value_sum(value_sum); + } + + // Evaluate irls weight. + for (auto& feature : *curr_view) { + feature->set_irls_weight(1.0f / (feature->internal_irls().value_sum() / + feature->internal_irls().weight_sum())); + feature->clear_internal_irls(); + } +} + +} // namespace. + +void MotionEstimation::InitGaussLUT(float sigma, float max_range, + std::vector* lut, + float* scale) const { + CHECK(lut); + // Calculate number of bins if scale is non-zero, otherwise use one bin per + // integer in the domain [0, max_range]. + const int lut_bins = (scale != nullptr) ? (1 << 10) : std::ceil(max_range); + lut->resize(lut_bins); + + const float bin_size = max_range / lut_bins; + const float coeff = -0.5f / (sigma * sigma); + for (int i = 0; i < lut_bins; ++i) { + const float value = i * bin_size; + (*lut)[i] = std::exp(value * value * coeff); + } + + if (scale) { + *scale = 1.0f / bin_size; + } +} + +// Smooth IRLS weights across the volume. 
+void MotionEstimation::RunTemporalIRLSSmoothing( + const std::vector>& feature_grid, + const std::vector>& feature_taps_3, + const std::vector>& feature_taps_5, + const std::vector& frame_confidence, + std::vector* feature_views) const { + // Approximate goal for temporal window length with closest integer that + // achieves roughly homogeneous sized chunks. + const int temporal_length_goal = options_.temporal_irls_diameter(); + const int num_frames = feature_views->size(); + + if (num_frames == 0) { + return; + } + + // Clamp IRLS bounds before smoothing, otherwise outliers skew the result + // heavily. + for (auto& feature_view : *feature_views) { + ClampRegionFlowFeatureIRLSWeights(0.01, 100, &feature_view); + } + + const int num_chunks = std::min( + 1, std::ceil(static_cast(static_cast(num_frames) / + temporal_length_goal))); + const int temporal_length = std::ceil( + static_cast(static_cast(num_frames) / num_chunks)); + + const float grid_resolution = options_.feature_grid_size(); + const int grid_dim_x = + std::ceil(static_cast(normalized_domain_.x() / grid_resolution)); + const float grid_scale = 1.0f / grid_resolution; + + const float spatial_sigma = options_.spatial_sigma(); + + // Setup Gaussian LUT for smoothing in space, time and feature-space. + std::vector space_lut; + + // Using 3 tap smoothing, max distance is 2 bin diagonals, for 5 tap + // smoothing max distance in 3 bin diagonals. + // We use maximum of 3 * sqrt(2) * bin_radius plus 1% room incase maximum + // value is attained. + const float max_space_diff = sqrt(2.0) * 3.f * grid_resolution * 1.01f; + float space_scale; + + InitGaussLUT(spatial_sigma, max_space_diff, &space_lut, &space_scale); + + const float temporal_sigma = options_.temporal_sigma(); + std::vector temporal_lut; + InitGaussLUT(temporal_sigma, temporal_length, &temporal_lut, nullptr); + + const float feature_sigma = options_.feature_sigma(); + const float max_feature_diff = sqrt(3.0) * 255.0; // 3 channels. 
+
+  std::vector<float> feature_lut;
+  float feature_scale;
+
+  InitGaussLUT(feature_sigma, max_feature_diff, &feature_lut, &feature_scale);
+
+  // Smooth each chunk independently.
+  // Smoothing algorithm description:
+  // The volumetric smoothing operation is approximated by a push and pull
+  // phase similar in its nature to scattered data interpolation via push /
+  // pull albeit in time instead of scale space.
+  // (for classic push/pull see:
+  // www.vis.uni-stuttgart.de/~kraus/preprints/vmv06_strengert.pdf)
+  //
+  // Our push/pull algorithm pushes and pulls for each feature its irls weight
+  // with gaussian weights (based on proximity in space, time and feature
+  // space) to its neighbors in the previous or next frame. The result of a
+  // push or pull is aggregated in the features TemporalIRLSSmoothing
+  // structure.
+  //
+  // In general, we first push the weights through the whole volume (clip)
+  // towards the first frame performing spatial and temporal smoothing, then
+  // pull the resulting weights from the first frame through the whole clip
+  // using again spatial and temporal smoothing.
+  //
+  // Specifically, in the push phase a feature's irls weight is updated using
+  // a weighted average (gaussian weights) of its neighboring features and any
+  // pushed information from the next frame
+  // (via TemporalIRLSSmoothing structure).
+  // The updated weight is then pushed along the feature's flow to the
+  // previous frame and spread into the corresponding TemporalIRLSSmoothing
+  // fields of neighboring features in the previous frame.
+  // Similarly, the pull phase proceeds by updating a feature's weights using
+  // a weighted average of its neighboring features and any information from
+  // the previous frame, that is pulled along the feature's flow.
+  for (int chunk = 0; chunk < num_chunks; ++chunk) {
+    const int start_frame = chunk * temporal_length;
+    const int end_frame = std::min((chunk + 1) * temporal_length, num_frames);
+
+    ClearInternalIRLSStructure(&(*feature_views)[end_frame - 1]);
+
+    // Push pass.
+    for (int f = end_frame - 1; f >= start_frame; --f) {
+      RegionFlowFeatureView* curr_view = &(*feature_views)[f];
+      RegionFlowFeatureView* prev_view =
+          f > start_frame ? &(*feature_views)[f - 1] : nullptr;
+      const auto& curr_grid = feature_grid[f];
+      const auto* prev_grid = f > start_frame ? &feature_grid[f - 1] : nullptr;
+
+      // Evaluate temporal weight (pushed weights to this frame are weighted
+      // by temporal weight).
+      float temporal_weight = 0;
+      for (int e = 1; e < end_frame - f; ++e) {
+        temporal_weight += temporal_lut[e];
+      }
+      // Relative weighting to save multiplication of pushed information,
+      // i.e. weight 1.0 for current frame.
+      temporal_weight /= temporal_lut[0];
+
+      TemporalIRLSPush(
+          curr_grid, prev_grid,
+          options_.filter_5_taps() ? feature_taps_5 : feature_taps_3,
+          space_scale, space_lut, feature_scale, feature_lut, temporal_weight,
+          frame_confidence[f], grid_scale, grid_dim_x, curr_view, prev_view);
+    }
+
+    // Pull pass.
+    for (int f = start_frame + 1; f < end_frame; ++f) {
+      RegionFlowFeatureView* curr_view = &(*feature_views)[f];
+      RegionFlowFeatureView* prev_view = &(*feature_views)[f - 1];
+      const auto& curr_grid = feature_grid[f];
+      const auto& prev_grid = feature_grid[f - 1];
+
+      // Evaluate temporal weight.
+      float temporal_weight = 0;
+      for (int e = 1; e <= f - start_frame; ++e) {
+        temporal_weight += temporal_lut[e];
+      }
+      // Relative weighting to save multiplication of pushed information.
+      temporal_weight /= temporal_lut[0];
+
+      TemporalIRLSPull(
+          curr_grid, prev_grid,
+          options_.filter_5_taps() ?
feature_taps_5 : feature_taps_3, + space_scale, space_lut, feature_scale, feature_lut, temporal_weight, + frame_confidence[f], grid_scale, grid_dim_x, curr_view, prev_view); + } + } +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/motion_estimation.h b/mediapipe/util/tracking/motion_estimation.h new file mode 100644 index 000000000..4939a1917 --- /dev/null +++ b/mediapipe/util/tracking/motion_estimation.h @@ -0,0 +1,758 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Fits several linear motion models to the tracked features obtained +// from RegionFlowComputation. +// +// --- Multi-threaded usage (parallel motion estimation over frames) --- +// assume input: vector feature_lists +// // Can be obtained after getting RegionFlowFrame from RegionFlowComputation +// // and executing GetRegionFlowFeatureList (region_flow.h) +// MotionEstimation motion_estimation(MotionEstimationOptions(), +// frame_width, +// frame_height); +// vector camera_motions; +// motion_estimation.EstimateMotionsParallel(false, // no IRLS smoothing. +// &feature_lists, +// &camera_motions); +// // RegionFlowFeatureList can be discarded or passed to Cropper. +// +// +// --- DEPRECATED, per-frame usage --- +// assume input: RegionFlowFrame* flow_frame // from RegionFlowComputation. +// +// // Initialize with standard options. 
+// MotionEstimation motion_estimation(MotionEstimationOptions(), +// frame_width, +// frame_height); +// CameraMotion estimated_motion; +// motion_estimation.EstimateMotion(flow_frame, +// NULL, // deprecated param. +// NULL, // deprecated param. +// &estimation_motion); +// +// // If features are not needed anymore flow_frame can be be discarded now. +// +// // Pass motion models in estimated_motion onto MotionStabilization, +// // if stabilization is desired. + +#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_ESTIMATION_H_ +#define MEDIAPIPE_UTIL_TRACKING_MOTION_ESTIMATION_H_ + +#include +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/motion_estimation.pb.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/region_flow.h" + +namespace mediapipe { + +class Homography; +class LinearSimilarityModel; +class MixtureHomography; +class MixtureRowWeights; +class RegionFlowFeature; +class RegionFlowFeatureList; +class RegionFlowFrame; + +class EstimateMotionIRLSInvoker; +class InlierMask; +class IrlsInitializationInvoker; +// Thread local storage for pre-allocated memory. +class MotionEstimationThreadStorage; +class TrackFilterInvoker; + +class MotionEstimation { + public: + MotionEstimation(const MotionEstimationOptions& options, int frame_width, + int frame_height); + virtual ~MotionEstimation(); + MotionEstimation(const MotionEstimation&) = delete; + MotionEstimation& operator=(const MotionEstimation&) = delete; + + // Can be used to re-initialize options between EstimateMotion / + // EstimateMotionsParallel calls. + void InitializeWithOptions(const MotionEstimationOptions& options); + + // Estimates motion models from RegionFlowFeatureLists based on + // MotionEstimationOptions, in a multithreaded manner (frame parallel). 
+  // The computed IRLS weights used on the last iteration of the highest
+  // degree of freedom model are *written* to the irls_weight member for each
+  // RegionFlowFeature in RegionFlowFeatureList which can be a useful
+  // feature for later processing.
+  // In addition the returned irls weights can be smoothed spatially
+  // and temporally before they are output.
+  // Note: The actual vector feature_lists is not modified.
+  virtual void EstimateMotionsParallel(
+      bool post_irls_weight_smoothing,
+      std::vector<RegionFlowFeatureList*>* feature_lists,
+      std::vector<CameraMotion>* camera_motions) const;
+
+  // DEPRECATED function, estimating Camera motion from a single
+  // RegionFlowFrame.
+  virtual void EstimateMotion(const RegionFlowFrame& region_flow_frame,
+                              const int*,  // deprecated, must be NULL.
+                              const int*,  // deprecated, must be NULL.
+                              CameraMotion* camera_motion) const;
+
+  // Public facing API to directly estimate motion models (as opposed to
+  // a cascade of increasing degree of freedom motion models with appropriate
+  // stability analysis via above EstimateMotionsParallel).
+  // Use this if all you need is just a specific motion
+  // model describing/summarizing the motion of the RegionFlowFeatureList.
+  // Returns false if model estimation failed (in this case an identity model
+  // is set in camera_motion).
+  // NOTE: All direct estimation functions assume normalized feature input,
+  // i.e. transformed via NormalizeRegionFlowFeatureList.
+  //
+  // NOTE 2: For easy direct use see Fit* functions below class.
+ bool EstimateTranslationModel(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion); + + bool EstimateLinearSimilarityModel(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion); + + bool EstimateAffineModel(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion); + + bool EstimateHomography(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion); + + bool EstimateMixtureHomography(RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion); + + // Static function which sets motion models (requested in options) to identity + // models. + static void ResetMotionModels(const MotionEstimationOptions& options, + CameraMotion* camera_motion); + + // The following functions ResetTo* functions reset all models that are + // present in camera_motion (tested via has_*) to identity or + // the passed model (which is embedded in higher degree of freedom models + // if applicable). CameraMotion::Type is set in accordance to function name. + static void ResetToIdentity(CameraMotion* camera_motion, + bool consider_valid = false); // Set to true + // for type VALID + + // Resets every specified model to embedded translation model. + // CameraMotion type is set to UNSTABLE. + static void ResetToTranslation(const TranslationModel& translation, + CameraMotion* camera_motion); + + // Resets every specified model with more or equal DOF than a similarity + // to the passed model. + // Camera Motion type is set to UNSTABLE_SIM. + static void ResetToSimilarity(const LinearSimilarityModel& model, + CameraMotion* camera_motion); + + // Resets every specified model with more or equal DOF than a homography + // to the passed model. If flag_as_unstable_model is set, camera motion type + // is set to UNSTABLE_HOMOG. 
+ static void ResetToHomography(const Homography& model, + bool flag_as_unstable_model, + CameraMotion* camera_motion); + + private: + // Simple enum indicating with motion model should be estimated, mapped from + // MotionEstimationOptions. + enum MotionType { + MODEL_AVERAGE_MAGNITUDE = 0, + MODEL_TRANSLATION = 1, + MODEL_LINEAR_SIMILARITY = 2, + MODEL_AFFINE = 3, + MODEL_HOMOGRAPHY = 4, + MODEL_MIXTURE_HOMOGRAPHY = 5, + // ... internal enum values used for mixture spectrum (up to 10 mixtures are + // supported). Do not use directly. + + // Change value if new motions are added. + MODEL_NUM_VALUES = 16, + }; + + // Determines shot boundaries from estimated motion models and input + // feature_lists by setting the corresponding flag in CameraMotion. + // Make sure, this function is called after motion estimation. + void DetermineShotBoundaries( + const std::vector& feature_lists, + std::vector* camera_motions) const; + + // Implementation function to estimate CameraMotion's from + // RegionFlowFeatureLists. + void EstimateMotionsParallelImpl( + bool irls_weights_preinitialized, + std::vector* feature_lists, + std::vector* camera_motions) const; + + struct SingleTrackClipData; + struct EstimateModelOptions; + + EstimateModelOptions DefaultModelOptions() const; + + // Implementation function to estimate all motions for a specific type + // across multiple single tracks. Motions are only estimated for those + // CameraMotions with type less or equal to max_unstable_type. Flag + // irls_weights_preinitialized enables some optimizations in case it is + // set to false as features are not pre-initialized. + // Optionally pass thread_storage. + // Returns true if requested type was attempt to be estimated + // (based on options) , false otherwise. + bool EstimateMotionModels( + const MotionType& max_type, const CameraMotion::Type& max_unstable_type, + const EstimateModelOptions& options, + const MotionEstimationThreadStorage* thread_storage, // optional. 
+ std::vector* clip_datas) const; + + // Multiplies input irls_weights by an upweight multiplier for each feature + // that is part of a sufficiently large track (contribution of each track + // length is by track_length_multiplier, mapping each track length + // to an importance weight in [0, 1]). + void LongFeatureInitialization( + const RegionFlowFeatureList& feature_list, + const LongFeatureInfo& feature_info, + const std::vector& track_length_importance, + std::vector* irls_weights) const; + + // Multiplies input irls_weights by normalization factor that downweights + // features is areas of high density. + void FeatureDensityNormalization(const RegionFlowFeatureList& feature_list, + std::vector* irls_weights) const; + + // Initializes irls weights (if + // MotionEstimationOptions::irls_initialization::activated is true), + // based on a multitude of options (RANSAC based pre-fitting of motion + // models, homography initialization, etc.). + // Processes one frame or all (if frame = -1) within a clip_data, + void IrlsInitialization(const MotionType& type, int max_unstable_type, + int frame, // Specify -1 for all frames. + const EstimateModelOptions& options, + SingleTrackClipData* clip_data) const; + + // Estimation functions for models, called via options by EstimateMotion and + // EstimateMotionsParallel. + // NOTE: All direct estimation functions assume normalized feature input, + // i.e. transformed via + // TransformRegionFlowFeatureList(normalization_transform, feature_list); + // where normalization_transform = + // LinearSimilarityAdapter::NormalizationTransform(frame_width, + // frame_height); + // + // Direct estimation functions perform estimation via iterated reweighted + // least squares (IRLS). In this case specify number of iterations (10 is a + // good default), and optionally the PriorFeatureWeights for each iteration. + // The alphas specify, how much weight should be given to the + // prior weight that the feature has before optimization. 
An alpha of zero + // indicates, no prior weighting, whereas as an alpha of one corresponds to + // full prior weighting. The actual prior is stored in priors. + // Each iteration is reweighted by numerator / error, where error is the L2 + // fitting error after estimation and + // numerator = (1.0 - alpha) * 1.0 + alpha * prior + struct PriorFeatureWeights { + explicit PriorFeatureWeights(int num_iterations) + : alphas(num_iterations, 0.0f) {} + PriorFeatureWeights(int num_iterations, int num_features) + : alphas(num_iterations, 0.0f), priors(num_features, 1.0f) {} + + // Tests for correct dimensions of PriorFeatureWeights. + bool HasCorrectDimension(int num_iterations, int num_features) const { + return alphas.size() == num_iterations && priors.size() == num_features; + } + + // Returns true if at least one alpha is non-zero. + bool HasNonZeroAlpha() const { + return !alphas.empty() && + *std::max_element(alphas.begin(), alphas.end()) > 0; + } + + // Returns true, if a prior was specified. + bool HasPrior() const { return !priors.empty(); } + + std::vector alphas; // Alpha for each IRLS round. + std::vector priors; // Prior weight for each feature. + + // If set, above alpha are not adjusted with iterations, but always set to + // 1.0, given full weight to the prior. + bool use_full_prior = false; + }; + + // In addition, each estimation function can compute its corresponding + // stability features and store it in CameraMotion. These features are needed + // to test via the IsStable* functions further below. + + // Estimates 2 DOF translation model. + // Note: feature_list is assumed to be normalized/transformed by + // LinearSimilarity::NormalizationTransform N. Returned irls weights and + // linear similarity are expressed in original frame, i.e. for estimated model + // M, M' = N^(-1) M N is returned. 
+ void EstimateTranslationModelIRLS( + int irls_rounds, bool compute_stability, + RegionFlowFeatureList* feature_list, + const PriorFeatureWeights* prior_weights, // optional. + CameraMotion* camera_motion) const; + + // Estimates linear similarity from feature_list using irls_rounds iterative + // reweighted least squares iterations. For L2 estimation, use irls_round = 1. + // The irls_weight member of each RegionFlowFeature in feature_list will be + // set to the inverse residual w.r.t. estimated LinearSimilarityModel. + // Note: feature_list is assumed to be normalized/transformed by + // LinearSimilarity::NormalizationTransform N. Returned irls weights and + // linear similarity are expressed in original frame, i.e. for estimated model + // M, M' = N^(-1) M N is returned. + // Returns true if estimation was successful, otherwise returns false and sets + // the CameraMotion::type to INVALID. + bool EstimateLinearSimilarityModelIRLS( + int irls_rounds, bool compute_stability, + RegionFlowFeatureList* feature_list, + const PriorFeatureWeights* prior_weights, // optional. + CameraMotion* camera_motion) const; + + // Same as above for affine motion. + // Note: feature_list is assumed to be normalized/transformed by + // LinearSimilarity::NormalizationTransform N. Returned irls weights and + // affine model are expressed in original frame, i.e. for estimated model + // M, M' = N^(-1) M N is returned. + bool EstimateAffineModelIRLS(int irls_rounds, + RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion) const; + + // Same as above for homography. + // Note: feature_list is assumed to be normalized/transformed by + // LinearSimilarity::NormalizationTransform N. Returned irls weights and + // homography are expressed in original frame, i.e. for estimated model + // M, M' = N^(-1) M N is returned. + // Returns true if estimation was successful, otherwise returns false and sets + // the CameraMotion::type to INVALID. 
+ bool EstimateHomographyIRLS( + int irls_rounds, bool compute_stability, + const PriorFeatureWeights* prior_weights, // optional. + MotionEstimationThreadStorage* thread_storage, // optional. + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const; + + // Same as above for mixture homography. + // Note: feature_list is assumed to be normalized/transformed by + // LinearSimilarity::NormalizationTransform N. Returned irls weights and + // mixture homography are expressed in original frame, i.e. for estimated + // model M, M' = N^(-1) M N is returned. + // Mixture model estimation customized by MotionEstimationOptions. + // Returns true if estimation was successful, otherwise returns false and sets + // the CameraMotion::type to INVALID. + // Supports computation for mixture spectrum, i.e. mixtures with different + // regularizers. For default regularizer pass + // MotionEstimationOptions::mixture_regularizer. Estimated motion will be + // stored in CameraMotion::mixture_homography_spectrum(spectrum_idx). + bool EstimateMixtureHomographyIRLS( + int irls_rounds, bool compute_stability, float regularizer, + int spectrum_idx, // 0 by default. + const PriorFeatureWeights* prior_weights, // optional. + MotionEstimationThreadStorage* thread_storage, // optional. + RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const; + + // Returns weighted variance for mean translation from feature_list (assumed + // to be in normalized coordinates). Returned variance is in unnormalized + // domain. + float TranslationVariance(const RegionFlowFeatureList& feature_list, + const Vector2_f& translation) const; + + // Replace each features irls weight by robust min-filtered irls weight + // across each track. + void MinFilterIrlsWeightByTrack(SingleTrackClipData* clip_data) const; + + // Performs filtering of irls weight across several tracking clip datas, + // to yield consistent irls weights. 
+ void EnforceTrackConsistency( + std::vector* clip_datas) const; + + // Initializes or modifies prior_weights for passed feature_list by + // biasing toward previous (filtered) IRLS weight for that feature. + // This enables temporal coherence. + void BiasLongFeatures(RegionFlowFeatureList* feature_list, MotionType type, + const EstimateModelOptions& model_options, + PriorFeatureWeights* prior_weights) const; + + // Called by above function to determine the bias each feature is multiplied + // with. + void BiasFromFeatures(const RegionFlowFeatureList& feature_list, + MotionType type, + const EstimateModelOptions& model_options, + std::vector* bias) const; + + // Maps track index to tuple of spatial bias and number of similar + // looking long tracks. + typedef std::unordered_map> SpatialBiasMap; + void ComputeSpatialBias(MotionType type, + const EstimateModelOptions& model_options, + RegionFlowFeatureList* feature_list, + SpatialBiasMap* spatial_bias) const; + + // Updates features weights in feature_list by temporally consistent bias. + void UpdateLongFeatureBias(MotionType type, + const EstimateModelOptions& model_options, + bool remove_terminated_tracks, + bool update_irls_observation, + RegionFlowFeatureList* feature_list) const; + + // Bilateral filtering of irls weights across the passed list. + void SmoothIRLSWeights(std::deque* irls) const; + + // Helper function. Returns number of irls iterations for passed MotionType + // derived from current MotionEstimationOptions. Returns zero, if no + // estimation should be attempted. + int IRLSRoundsFromSettings(const MotionType& type) const; + + // Partitions irls_rounds into several rounds with each having irls_per_round + // interations each based on MotionEstimationOptions::EstimationPolicy. + // Post-condition: total_rounds * irls_per_rounds == irls_rounds. 
+ void PolicyToIRLSRounds(int irls_rounds, int* total_rounds, + int* irls_per_round) const; + + // Check for specified MotionType is estimated model is stable. If not, resets + // feature's irls weights to reset_irls_weights (optional) and resets motion + // model in camera_motion to lower degree of freedom model. In this case, + // CameraMotion::Type is flagged as UNSTABLE_* where * denotes the lower + // degree of freedom model. + // Model is only checked those CameraMotions with type less than or equal to + // max_unstable_type. + void CheckModelStability( + const MotionType& type, const CameraMotion::Type& max_unstable_type, + const std::vector>* reset_irls_weights, + std::vector* feature_lists, + std::vector* camera_motions) const; + + // Implementation function called by above function, to check for a single + // model. + void CheckSingleModelStability(const MotionType& type, + const CameraMotion::Type& max_unstable_type, + const std::vector* reset_irls_weights, + RegionFlowFeatureList* feature_list, + CameraMotion* camera_motion) const; + + // Projects motion model specified by type to lower degree of freedom models. + void ProjectMotionsDown(const MotionType& type, + std::vector* camera_motions) const; + + // Filters passed feature_lists based on + // MotionEstimationOptions::irls_weight_filter. + void IRLSWeightFilter( + std::vector* feature_lists) const; + + // Inlier scale based on average motion magnitude and the fraction + // of the magnitude that is still considered an inlier. + // In general a residual of 1 pixel is assigned an IRLS weight of 1, + // this function returns a residual scale, such that a residual + // of distance_fraction * translation_magnitude equals an IRLS weight of 1 + // if multiplied by returned scale. 
+ float GetIRLSResidualScale(const float avg_motion_magnitude, + float distance_fraction) const; + + const LinearSimilarityModel& InverseNormalizationTransform() const { + return inv_normalization_transform_; + } + + const LinearSimilarityModel& NormalizationTransform() const { + return normalization_transform_; + } + + // Returns domain normalized features fall in. + Vector2_f NormalizedDomain() const { return normalized_domain_; } + + // Returns index within the inlier mask for each feature point. + // Also returns for each bin normalizer to account for different number of + // features per bin during weighting. + void ComputeFeatureMask(const RegionFlowFeatureList& feature_list, + std::vector* mask_indices, + std::vector* bin_normalizer) const; + + // Runs multiple rounds of RANSAC, resetting outlier IRLS weight to + // a low score. + // Optionally can perform temporally consistent selection if inlier_mask is + // specified. + // Returns best model across all iterations in best_model and true if + // estimated model was deemed stable. + bool GetTranslationIrlsInitialization( + RegionFlowFeatureList* feature_list, + const EstimateModelOptions& model_options, float avg_camera_motion, + InlierMask* inlier_mask, // optional. + TranslationModel* best_model) const; + + // Same as above for linear similarities. + bool GetSimilarityIrlsInitialization( + RegionFlowFeatureList* feature_list, + const EstimateModelOptions& model_options, float avg_camera_motion, + InlierMask* inlier_mask, // optional. + LinearSimilarityModel* best_model) const; + + // Computes number of inliers and strict inliers (satisfying much stricter + // threshold) for a given feature list after model fitting. + void ComputeSimilarityInliers(const RegionFlowFeatureList& feature_list, + int* num_inliers, + int* num_strict_inliers) const; + + // Initializes irls weights based on setting + // MotionEstimationOptions::homography_irls_weight_initialization. 
+ void GetHomographyIRLSCenterWeights(const RegionFlowFeatureList& feature_list, + std::vector* center_weights) const; + + // Checks for unreasonable large accelerationas between frames as specified by + // MotionEstimationOptions::StableTranslationBounds. + void CheckTranslationAcceleration( + std::vector* camera_motions) const; + + // Functions below, test passed model is deemed stable according to + // several heuristics set by Stable[MODEL]Bounds in MotionEstimationOptions. + bool IsStableTranslation(const TranslationModel& normalized_translation, + float translation_variance, + const RegionFlowFeatureList& features) const; + + // Tests if passed similarity is stable. Pass number of inliers from + // ComputeSimilarityInliers. + bool IsStableSimilarity(const LinearSimilarityModel& model, + const RegionFlowFeatureList& features, + int num_inliers) const; + + bool IsStableHomography(const Homography& homography, + float average_homography_error, + float inlier_coverage) const; + + bool IsStableMixtureHomography( + const MixtureHomography& homography, float min_block_inlier_coverage, + const std::vector& block_inlier_coverage) const; + + // Computes fraction (in [0, 1]) of inliers w.r.t. frame area using a grid of + // occupancy cells. A feature is consider an inlier if its irls_weight is + // larger or equal to min_inlier_score. + float GridCoverage(const RegionFlowFeatureList& feature_list, + float min_inlier_score, + MotionEstimationThreadStorage* thread_storage) const; + + // Estimates per scanline-block coverage of mixture. If + // assume_rolling_shutter_camera is set, low textured features are allowed to + // have higher error as registration errors would not be as visible here. 
+ void ComputeMixtureCoverage(const RegionFlowFeatureList& feature_list, + float min_inlier_score, + bool assume_rolling_shutter_camera, + MotionEstimationThreadStorage* thread_storage, + CameraMotion* camera_motion) const; + + // Returns average motion magnitude as mean of the translation magnitude from + // the 10th to 90th percentile. + void EstimateAverageMotionMagnitude(const RegionFlowFeatureList& feature_list, + CameraMotion* camera_motion) const; + + // Returns per iteration weight of the feature's irls weight initialization. + float IRLSPriorWeight(int iteration, int irls_rounds) const; + + // Implementation function for above function. Estimates mixture homography + // from features and returns true if estimation was non-degenerate. + bool MixtureHomographyFromFeature( + const TranslationModel& translation, int irls_rounds, float regularizer, + const PriorFeatureWeights* prior_weights, // optional. + RegionFlowFeatureList* feature_list, + MixtureHomography* mix_homography) const; + + // Determines overlay indices (spatial bin locations that are likely to be + // affected by overlays) and stores them in corresponding member in + // CameraMotion. Features that fall within these bins will be assigned a + // weight of zero. + void DetermineOverlayIndices( + bool irls_weights_preinitialized, + std::vector* camera_motions, + std::vector* feature_lists) const; + + // Determine features likely to be part of a static overlay, by setting their + // irls weight to zero. + // Returns fraction of the image domain that is considered to be occupied by + // overlays and specific overlay cell indices in overlay_indices. + float OverlayAnalysis(const std::vector& translations, + std::vector* feature_lists, + std::vector* overlay_indices) const; + + // Smooths feature's irls_weights spatio-temporally. + void PostIRLSSmoothing( + const std::vector& camera_motions, + std::vector* feature_lists) const; + + // Initializes LUT for gaussian weighting. 
By default discretizes the domain + // [0, max_range] into 4K bins, returning scale to map from a value in the + // domain to the corresponding bin. If scale is nullptr max_range bins are + // created instead (in this case scale would be 1.0, i.e. value equals bin + // index). + void InitGaussLUT(float sigma, float max_range, std::vector* lut, + float* scale) const; + + // Performs fast volumetric smoothing / filtering of irls weights. Weights are + // expected to be already binned using BuildFeatureGrid. + void RunTemporalIRLSSmoothing( + const std::vector>& feature_grid, + const std::vector>& feature_taps_3, + const std::vector>& feature_taps_5, + const std::vector& frame_confidence, + std::vector* feature_views) const; + + private: + MotionEstimationOptions options_; + int frame_width_; + int frame_height_; + + LinearSimilarityModel normalization_transform_; + LinearSimilarityModel inv_normalization_transform_; + + // Transform from normalized features to irls domain. + LinearSimilarityModel irls_transform_; + + // Frame dimensions transformed by normalization transform. + Vector2_f normalized_domain_; + std::unique_ptr row_weights_; + + // For initialization biased towards previous frame. + std::unique_ptr inlier_mask_; + + // Stores current bias for each track and the last K irls observations. + struct LongFeatureBias { + explicit LongFeatureBias(float initial_weight) : bias(initial_weight) { + irls_values.push_back(1.0f / initial_weight); + } + + LongFeatureBias() : LongFeatureBias(1.0f) {} + + float bias = 1.0f; // Current bias, stores pixel error, + // i.e. 1 / IRLS. + std::vector irls_values; // Recently observed IRLS values; + // Ring buffer. + int total_observations = 1; + }; + + // Maps track id to LongFeatureBias. + typedef std::unordered_map LongFeatureBiasMap; + + // Bias map indexed by MotionType. + mutable std::vector long_feature_bias_maps_ = + std::vector(static_cast(MODEL_NUM_VALUES)); + + // Lookup tables and scale for FeatureBias computation. 
+ struct FeatureBiasLUT { + // For ComputeSpatialBias weighting. + std::vector spatial_lut; + float spatial_scale; + std::vector color_lut; + float color_scale; + + // For BiasFromFeature computation. + std::vector bias_weight_lut; + float bias_weight_scale; + }; + + FeatureBiasLUT feature_bias_lut_; + + // Counts the number of consecutive duplicate frames for each motion model. + mutable std::vector num_duplicate_frames_ = + std::vector(static_cast(MODEL_NUM_VALUES)); + + friend class EstimateMotionIRLSInvoker; + friend class IrlsInitializationInvoker; + friend class TrackFilterInvoker; + friend class MotionEstimationThreadStorage; +}; + +// Meta-data set in the header of filter streams to communicate information used +// during camera motion estimation. +struct CameraMotionStreamHeader { + CameraMotionStreamHeader() : frame_width(0), frame_height(0) {} + int32 frame_width; + int32 frame_height; +}; + +// Direct fitting functions. +TranslationModel FitTranslationModel(const RegionFlowFeatureList& features); + +LinearSimilarityModel FitLinearSimilarityModel( + const RegionFlowFeatureList& features); + +AffineModel FitAffineModel(const RegionFlowFeatureList& features); + +Homography FitHomography(const RegionFlowFeatureList& features); + +MixtureHomography FitMixtureHomography(const RegionFlowFeatureList& features); + +// Templated fitting functions. 
+template +Model FitModel(const RegionFlowFeatureList& features); + +template <> +inline TranslationModel FitModel(const RegionFlowFeatureList& features) { + return FitTranslationModel(features); +} + +template <> +inline LinearSimilarityModel FitModel(const RegionFlowFeatureList& features) { + return FitLinearSimilarityModel(features); +} + +template <> +inline AffineModel FitModel(const RegionFlowFeatureList& features) { + return FitAffineModel(features); +} + +template <> +inline Homography FitModel(const RegionFlowFeatureList& features) { + return FitHomography(features); +} + +template <> +inline MixtureHomography FitModel(const RegionFlowFeatureList& features) { + return FitMixtureHomography(features); +} + +// Generic projection function that projects models in an arbitrary direction +// (that is from lower to higher or vice versa) via fast model fits, without +// any error bound checking. +// MixtureRowWeights are only necessary for ToModel == MixtureHomography. +template +ToModel ProjectViaFit(const FromModel& model, int frame_width, int frame_height, + MixtureRowWeights* row_weights = nullptr, + int grid_dim = 10) { + // Build a grid of features. + const float dx = frame_width * 1.0f / grid_dim; + const float dy = frame_height * 1.0f / grid_dim; + + // Create region flow from grid. + RegionFlowFeatureList grid_features; + grid_features.set_frame_width(frame_width); + grid_features.set_frame_height(frame_height); + grid_features.set_match_frame(-1); + + for (int k = 0; k <= grid_dim; ++k) { + for (int l = 0; l <= grid_dim; ++l) { + auto* feat = grid_features.add_feature(); + feat->set_x(l * dx); + feat->set_y(k * dy); + } + } + + RegionFlowFeatureListViaTransform(model, &grid_features, 1.0f, + 0.0f, // Replace flow. + false, // Don't change feature loc. 
+ row_weights); + return FitModel(grid_features); +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_ESTIMATION_H_ diff --git a/mediapipe/util/tracking/motion_estimation.proto b/mediapipe/util/tracking/motion_estimation.proto new file mode 100644 index 000000000..79d60c64f --- /dev/null +++ b/mediapipe/util/tracking/motion_estimation.proto @@ -0,0 +1,670 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Settings for MotionEstimation. +syntax = "proto2"; + +package mediapipe; + +// Note: In general for Estimation modes, the prefix are used as follows: +// L2: minimize squared norm of error +// IRLS: iterative reweighted least square, L2 minimization using multiple +// iterations, downweighting outliers. +// Next tag: 69 +message MotionEstimationOptions { + // Specifies which camera models should be estimated, translation is always + // estimated. + optional bool estimate_translation_irls = 1 [default = true]; + + enum LinearSimilarityEstimation { + ESTIMATION_LS_NONE = 0; + ESTIMATION_LS_L2 = 1; // L2 estimation + ESTIMATION_LS_IRLS = 4; // good performance, robust to outliers. + + // DEPRECATED modes. + ESTIMATION_LS_L2_RANSAC = 2; // DEPRECATED, use IRLS instead. 
+ ESTIMATION_LS_L1 = 3; // DEPRECATED, use IRLS instead, or static + // function MotionEstimation:: + // EstimateLinearSimilarityL1 + } + + optional LinearSimilarityEstimation linear_similarity_estimation = 3 + [default = ESTIMATION_LS_IRLS]; + + enum AffineEstimation { + ESTIMATION_AFFINE_NONE = 0; + ESTIMATION_AFFINE_L2 = 1; + ESTIMATION_AFFINE_IRLS = 2; + } + + optional AffineEstimation affine_estimation = 30 + [default = ESTIMATION_AFFINE_NONE]; + + enum HomographyEstimation { + ESTIMATION_HOMOG_NONE = 0; + ESTIMATION_HOMOG_L2 = 1; + ESTIMATION_HOMOG_IRLS = 2; + } + + optional HomographyEstimation homography_estimation = 5 + [default = ESTIMATION_HOMOG_IRLS]; + + // By default, homography estimation minimizes an objective that is not + // strictly the L2 distance between matched points. If the flag is set, each + // row of the linear system is scaled with the exact denominator which results + // in an objective that minimizes the L2 distance. + optional bool homography_exact_denominator_scaling = 53 [default = false]; + + // Per default, we use exact solver for over-determined system using + // well-conditioned QR decomposition. For better speed, set value to false to + // use estimation via normal equations. + optional bool use_exact_homography_estimation = 54 [default = true]; + + // If set uses double instead of float when computing normal equations. + optional bool use_highest_accuracy_for_normal_equations = 55 [default = true]; + + // Regularizer for perspective part of the homography. If zero, no + // regularization is performed. Should be >= 0. + optional float homography_perspective_regularizer = 61 [default = 0]; + + // Note: Mixture models have high DOF are much more affected by outliers + // than models above. It is recommended that if IRLS estimation is NOT used, + // that mixture_regularizer is increased by a factor >=3. 
+  enum MixtureHomographyEstimation {
+    ESTIMATION_HOMOG_MIX_NONE = 0;
+    ESTIMATION_HOMOG_MIX_L2 = 1;
+    ESTIMATION_HOMOG_MIX_IRLS = 2;  // robust to outliers.
+  }
+
+  optional MixtureHomographyEstimation mix_homography_estimation = 12
+      [default = ESTIMATION_HOMOG_MIX_NONE];
+
+  // If row-wise mixture models are estimated, determines number of them.
+  // Note, changing number of mixtures, interpolation sigma and regularizer
+  // is very likely to impact the stability analysis for mixtures and rolling
+  // shutter scoring. At least MixtureHomographyBounds would need to be adjusted
+  // to the new values.
+  optional int32 num_mixtures = 13 [default = 10];
+
+  // If row-wise mixture models are estimated, determines how much each point is
+  // influenced by its neighboring mixtures. Specified as relative sigma
+  // (standard deviation) w.r.t. frame_height.
+  optional float mixture_row_sigma = 14 [default = 0.1];
+
+  // Mixture estimation uses L2 regularizer to assure that adjacent mixture
+  // models are similar.
+  optional float mixture_regularizer = 15 [default = 1e-4];
+
+  // Mixtures are estimated across a spectrum of exponentially increasing
+  // regularizers. In particular the regularizer at level L is given as
+  // mixture_regularizer * mixture_regularizer_base^L.
+  // A maximum of 10 levels are supported (checked!).
+  // Note: When changing the number of levels you probably want to adapt the
+  // MotionStabilizationOptions::rolling_shutter_increment value as well,
+  // as the number of levels directly controls the highest threshold for
+  // the rolling shutter index analysis.
+  optional float mixture_regularizer_levels = 42 [default = 3];
+  optional float mixture_regularizer_base = 43 [default = 2.2];
+  optional int32 mixture_rs_analysis_level = 44 [default = 2];
+
+  // IRLS rounds to down-weight outliers (default across all models).
+  // Note: IRLS in combination with full mixture models (as opposed to the
+  // default reduced ones) is somewhat expensive.
+ optional int32 irls_rounds = 17 [default = 10]; + + // If set to > 0 (always needs be less than 1.0), influence of supplied prior + // irls weights is linearlly decreased from the specified prior scale (weight + // 1.0) to prior_scale. Effectively, biases the solution to the + // supplied prior features. + // Note: Without irls_weights_preinitialized set to true, this option is + // effectively a no op. + // TODO: Retire this option. + optional float irls_prior_scale = 50 [default = 0.2]; + + // Determine how to normalize irls weights w.r.t. average motion magnitude. + // In general a residual of 1 pixel is assigned an IRLS weight of 1. + // However as larger motions in general are affected by a larger error, we + // normalize irls weights, such that a residual + // of distance of irls_motion_magnitude_fraction times + // equals an IRLS weight of 1. + // Must be larger than zero. + optional float irls_motion_magnitude_fraction = 31 [default = 0.08]; + + // Scale that is applied for mixture (where error is expected to be bigger). + optional float irls_mixture_fraction_scale = 68 [default = 1.5]; + + // By default, irls weight of all features are set uniformly to one before + // estimating EACH model, refining them in subsequent irls iterations. + // If flag below is set, input irls weights are used instead for each motion + // model. + optional bool irls_weights_preinitialized = 39 [default = false]; + + // If weights are pre-initialized optionally min filter weights along track + // ids when long tracks are used. This can be used to consistently label + // outliers in time before estimation. + optional bool filter_initialized_irls_weights = 67 [default = false]; + + // If activated, irls weight of outlier features are reset. Outliers are + // defined as those features, for which the best model fit after #rounds + // iterations of RANSAC did NOT yield an error lower than cutoff. + // Only applies to translation and similarity estimation. 
+  message IrlsOutlierInitialization {
+    optional bool activated = 1 [default = false];
+
+    optional int32 rounds = 2 [default = 100];
+    optional float cutoff = 3 [default = 0.003];
+  }
+  optional IrlsOutlierInitialization irls_initialization = 56;
+
+  // Normalizes feature's irls weights prior to estimation such that
+  // features in high density areas are downweighted. Multiplicative in case
+  // irls_weights_preinitialized is set to true.
+  optional bool feature_density_normalization = 62 [default = false];
+
+  // A regular grid of size feature_mask_size x feature_mask_size
+  // is used to normalize features w.r.t. their density.
+  optional int32 feature_mask_size = 63 [default = 10];
+
+  // In addition to above outlier and density initialization, long features
+  // that are present for a specified ratio of the analysis interval can be
+  // upweighted. This greatly improves temporal consistency.
+  message LongFeatureInitialization {
+    optional bool activated = 1 [default = false];
+
+    // Tracks with a length greater than or equal to the specified percentile
+    // are upweighted by the specified upweight_multiplier.
+    optional float min_length_percentile = 2 [default = 0.95];
+
+    // Features passing above test have their irls weight increased by the
+    // specified multiplier prior to estimation.
+    optional float upweight_multiplier = 3 [default = 5];
+  }
+
+  optional LongFeatureInitialization long_feature_initialization = 66;
+
+  // Irls initialization can be performed in a temporally dependent manner,
+  // (if estimation_policy() == TEMPORALLY_DEPENDENT), where
+  // the previous frame's motion estimation biases the IrlsInitialization of the
+  // currently processed frame. In particular the location and magnitude of
+  // inliers is used during the RANSAC selection stage, to favor those features
+  // that agree with the prior, represented as confidence mask of inliers
+  // (using same dimension as above feature_mask_size).
+  // After estimation, the prior is updated.
+ message IrlsMaskOptions { + // Amount prior is decayed after each iteration. + optional float decay = 2 [default = 0.7]; + + // Score that each inlier adds to the current prior. Specified w.r.t. total + // number of features, i.e. each feature increases a bins score by + // inlier_score. + optional float inlier_score = 3 [default = 0.4]; + + // Each inlier scores at least this value regardless of the inlier mask + // (additive). + optional float base_score = 4 [default = 0.2]; + + // Motions are scored relative to previous motion. Threshold denotes + // absolute minimum of denominator. + optional float min_translation_norm = 5 [default = 2e-3]; + + // Translation is updated in every step by blending it with the previous + // estimated translation. (alpha is within 0 to 1, where 0 indicates to use + // only measured translation, i.e. no blending). + optional float translation_blend_alpha = 6 [default = 0.7]; + + // Every time translation is updated, prior (in [0, 1]) is increased by the + // specified amount. + optional float translation_prior_increase = 7 [default = 0.2]; + + // Deprecated fields. + extensions 1; + } + + optional IrlsMaskOptions irls_mask_options = 57; + + // Describes how long feature tracks are leveraged for joint estimation across + // many frames. + message JointTrackEstimationOptions { + // For each frame-pair motion model, describing the motion between frame + // I and I - 1, estimate in addition several additional motion + // models along long feature tracks describing the motion between frame + // I and I - k * motion_stride (additional models are not output, + // but help to filter irls weights). + // Specifies total number of estimated motion models per frame-pair. Must be + // greater than zero. + optional int32 num_motion_models = 1 [default = 3]; + + // Spacing in frames for additional motion models. 
+ optional int32 motion_stride = 2 [default = 15]; + + // If set, performs temporal smoothing across frames of the obtained irls + // weights. + optional bool temporal_smoothing = 3 [default = false]; + + // TODO: Specify which filter is used during temporal smoothing. + // TODO: Limit joint estimation to static scenes + // (dont violate rigidity of alignment in case of wide + // baselines). Adopt stride and num_motion_models + // accordingly. + } + + optional JointTrackEstimationOptions joint_track_estimation = 59; + + // Options being used to bias IRLS features if estimation mode + // TEMPORAL_LONG_FEATURE_BIAS is being used. + // Next Tag: 15 + message LongFeatureBiasOptions { + // Estimation is performed multiple times, alternating between model + // estimation and smooth temporal feature biasing for the specified number + // of rounds. + optional int32 total_rounds = 13 [default = 1]; + + // Controls how fast the bias for a track gets updated, in case feature is + // an inlier. Use higher values for less decay of background motion over + // time. + optional float inlier_bias = 1 [default = 0.98]; + + // Same as above for outliers (or features with low prior), i.e those that + // got recently seeded. + optional float outlier_bias = 2 [default = 0.7]; + + // Number of elements after which we deem estimation to be stable. + // Used to control weight of bias if fewer than the specified number have + // been observed. Also used as maximum ring buffer size (only most recent + // number of observations are kept). Must be > 0. + optional int32 num_irls_observations = 3 [default = 10]; + + // Change in irls weight magnitude (from outlier to inlier) above which we + // reset the current bias. + optional float max_irls_change_ratio = 4 [default = 10.0]; + + // Irls weight above which we consider it to be an inlier for bias update + // purposes (see above inlier and outlier bias). By default, outliers are + // allowed to update their bias faster than inliers. Must be > 0. 
+    optional float inlier_irls_weight = 5 [default = 0.2];
+
+    // Standard deviation used during feature initialization. Current bias of a
+    // track is used to pre-weight features via gaussian weighting with
+    // specified standard deviation.
+    optional float bias_stdev = 12 [default = 1.0];
+
+    // When seeding new tracks (on the first frame), we bilaterally pool
+    // neighboring feature biases as seed. Details are controlled by options
+    // below. If false, the feature's estimation error is used instead
+    // (faster, but less spatially smooth).
+    // If activated it is advised to use a patch descriptor radius of at least
+    // 20 pixels.
+    optional bool use_spatial_bias = 6 [default = true];
+
+    // Newly observed tracks' biases are seeded by similar looking features
+    // in close spatial proximity. For efficiency a grid is used to determine
+    // proximity.
+    // Grid size in normalized coordinates w.r.t. frame domain.
+    optional float grid_size = 7 [default = 0.04];
+
+    // Sigma's for combining feature biases.
+    optional float spatial_sigma = 8 [default = 0.02];
+    optional float color_sigma = 9 [default = 20.0];
+
+    // Defines what we consider to be a long track. Features spawned around
+    // locations of similar looking long tracks are considered to have
+    // high prior, e.g. their initialization is given more weight.
+    optional int32 long_track_threshold = 10 [default = 30];
+
+    // Determines which fraction of long tracks is considered to be sufficient
+    // for highly confident bias seed.
+    optional float long_track_confidence_fraction = 11 [default = 0.25];
+
+    // If activated, uses the irls weights from the estimation of the lower
+    // degree of freedom model to seed the bias of the higher degree of freedom
+    // model. This improves rigidity of the computed motion.
+ optional bool seed_priors_from_bias = 14 [default = false]; + } + + optional LongFeatureBiasOptions long_feature_bias_options = 64; + + // Controls how multiple models via EstimateMotionsParallel are estimated. + enum EstimationPolicy { + INDEPENDENT_PARALLEL = 1; // Models are estimated independently across + // frames in parallel. + TEMPORAL_IRLS_MASK = 2; // Previous frame's estimation biases + // current one, controlled via above + // IrlsMaskOptions. + TEMPORAL_LONG_FEATURE_BIAS = 4; // Frame's estimation is biased along + // long features, controlled via above + // LongFeatureBiasOptions. + JOINTLY_FROM_TRACKS = 3; // Estimation is performed jointly over + // chunks of frames, exercising a consistent + // irls weight PER track. Expects as input + // RegionFlowFeatureList's with long_tracks. + // Controlled via above + // JointTrackEstimationOptions. + } + optional EstimationPolicy estimation_policy = 58 + [default = INDEPENDENT_PARALLEL]; + + optional int32 coverage_grid_size = 51 [default = 10]; + + // Degree of freedom of estimated homography mixtures. If desired, specific + // parts of the homography can be held constant across the mixture. + // For fast draft TRANSLATION_MIXTURE is recommended, for high quality + // SKEW_ROTATION_MIXTURE. + enum MixtureModelMode { + FULL_MIXTURE = 0; // 8 dof * num_mixtures + TRANSLATION_MIXTURE = 1; // 6 dof + 2 dof * num_mixtures + SKEW_ROTATION_MIXTURE = 2; // 4 dof + 4 dof * num_mixtures + } + + optional MixtureModelMode mixture_model_mode = 23 + [default = SKEW_ROTATION_MIXTURE]; + + // If specified, only features that agree with the estimated linear similarity + // will be used to estimate the homography. + // If set, linear_similarity_estimation can not be ESTIMATION_NONE! (checked) + optional bool use_only_lin_sim_inliers_for_homography = 6 [default = true]; + + // Max. deviation to be considered an inlier w.r.t. estimated similarity for + // above flag. This value is set w.r.t. normalized frame diameter. 
+ // TODO: Should take GetIRLSResidualScale into account. + optional float lin_sim_inlier_threshold = 20 [default = 0.003]; + + // If any parameter of the input flow or estimated translation exceeds these + // thresholds we deem the motion INVALID. + message TranslationBounds { + // Absolute minimum of features present. + optional int32 min_features = 1 [default = 3]; + + // Max magnitude of the translation expressed w.r.t. frame diameter + optional float frac_max_motion_magnitude = 2 [default = 0.15]; + + // Motion magnitude is only tested for if standard deviation of estimated + // translation exceeds threshold. + optional float max_motion_stdev_threshold = 4 [default = 0.01]; + + // Max standard deviation of the estimated translation (normalized to frame + // diameter). + optional float max_motion_stdev = 3 [default = 0.065]; + + // Maximum acceleration between frames. Specified relative to minimum + // velocity across two adjacent frames (absolute minimum of 0.001 is + // enforced, ~1 pix for 480p). + // If exceeded for one frame, the whole batch passed to + // EstimateMotionsParallel is labeled unstable. + optional float max_acceleration = 5 [default = 20.0]; + } + + optional TranslationBounds stable_translation_bounds = 32; + + // If any test/bound is violated, the motion is deemed UNSTABLE. + message SimilarityBounds { + // Input frame has to be labeled stable, i.e. enough features and coverage + // present. + optional bool only_stable_input = 1 [default = true]; + + // Minimum number of inlier features (absolute and as fraction of total + // number of features). + // TODO: Dataset run setting this to 0.15 + optional float min_inlier_fraction = 2 [default = 0.2]; + optional float min_inliers = 3 [default = 30]; + + // Bounds on valid similarities. We use larger values compared to + // homographies. Note: Bounds are necessary, to guarantee invertability + // of the resulting similarity. 
+ optional float lower_scale = 4 [default = 0.8]; + optional float upper_scale = 5 [default = 1.25]; // 1 / 0.8. + optional float limit_rotation = 6 [default = 0.25]; // 15 degrees. + + // Thresholds for a feature to be considered inlier w.r.t similarity + // transform, expressed in terms of pixel residual error. Max of absolute + // and fractional thresholds is used. + // Ratio of inliers that pass regular and strict thresholds are storred in + // CameraMotion. + // + // TODO: Just use lin_sim_inlier_threshold directly, however that + // recomputes the error, and requires regression testing. Using an extra + // fractional inlier threshold for now. + // + // Absolute in pixels. + optional float inlier_threshold = 7 [default = 4.0]; + // Scaled by frame diameter. + optional float frac_inlier_threshold = 8 [default = 0]; + + // TODO: Revisit after frame selection change. + // Absolute in pixels. + optional float strict_inlier_threshold = 9 [default = 0.5]; + } + + optional SimilarityBounds stable_similarity_bounds = 33; + + // If any parameter of the estimated homography exceeds these bounds, + // we deem it UNSTABLE_SIM and use estimated similarity instead. + message HomographyBounds { + optional float lower_scale = 1 [default = 0.8]; + optional float upper_scale = 2 [default = 1.25]; // 1 / 0.8. + optional float limit_rotation = 3 [default = 0.25]; // 15 degrees. + optional float limit_perspective = 4 [default = 0.0004]; + + // Inlier coverage is only tested for if average homography error exceeds + // registration_thresholds. Max of the following two thresholds is used. + // + // Absolute in pixels. + optional float registration_threshold = 5 [default = 0.1]; + // Scaled by frame diameter. + optional float frac_registration_threshold = 8 [default = 0]; + + // Minimum fraction of inlier features w.r.t. frame area. + optional float min_inlier_coverage = 6 [default = 0.3]; + + // Grid coverage inlier threshold. 
Pixel errors below this + // threshold are considered inliers. Defined w.r.t. frame diameter, approx. + // 1.5 for 16:9 SD video (480p), i.e. threshold is multiplied by frame + // diameter. + optional float frac_inlier_threshold = 7 [default = 2.0e-3]; + } + + optional HomographyBounds stable_homography_bounds = 11; + + // If any parameter of the estimated homography mixture exceeds these bounds, + // we deem it UNSTABLE_HOMOG and use the estimated homography instead. + message MixtureHomographyBounds { + // Minimum fraction of inlier features w.r.t. block area. + optional float min_inlier_coverage = 1 [default = 0.4]; + + // Each block is tested to be stable, regarding the outliers. + // A frame is labeled unstable, if more or equal than the specified adjacent + // blocks are labeled outliers. + optional int32 max_adjacent_outlier_blocks = 2 [default = 5]; + + // Maximum number of adjacent empty blocks (no inliers). + optional int32 max_adjacent_empty_blocks = 3 [default = 3]; + + // Grid coverage threshold inlier threshold. See identical parameter in + // HomographyBounds. + optional float frac_inlier_threshold = 7 [default = 2.5e-3]; + } + + optional MixtureHomographyBounds stable_mixture_homography_bounds = 34; + + // Scale for stricter coverage evaluation. Used for rolling shutter guess + // computation, by only using high quality inliers. Larger values reflect + // stricter coverage. + // Specifically, when computing coverage via GridCoverage call, + // frac_inlier_threshold is reduced (divided) by specified scale below. + optional float strict_coverage_scale = 41 [default = 1.333]; + + // By default frames with zero trackable features (e.g. at the beginning, + // empty frame or shot boundary) are set identity model but still labeled as + // valid. If set to false, these frames are flagged as invalid, which can be + // useful to locate shot boundaries, etc. 
+  optional bool label_empty_frames_as_valid = 22 [default = true];
+
+  // Setting for temporal smoothing of irls weights in optional post-processing
+  // step.
+  // In normalized coordinates w.r.t. frame domain.
+  optional float feature_grid_size = 24 [default = 0.05];
+  optional float spatial_sigma = 25 [default = 0.01];
+
+  // Frame diameter across which smoothing is performed.
+  optional int32 temporal_irls_diameter = 26 [default = 20];
+  optional float temporal_sigma = 27 [default = 5];  // in frames.
+
+  // Bilateral weight (for un-normalized color domain [0, .. 255]).
+  optional float feature_sigma = 28 [default = 30.0];
+
+  // If set to false 3 taps are used.
+  optional bool filter_5_taps = 29 [default = false];
+
+  // If set, during temporal smoothing, each frame is weighted by its
+  // confidence, defined as the square coverage (or square mean mixture
+  // coverage). Therefore, low confidence fits do not erroneously propagate
+  // over time. In addition, if the confidence is below the specified
+  // confidence_threshold (relative to the maximum coverage observed in the
+  // test interval), irls weights are reset to 1, i.e. biased to
+  // agree with the (unknown) background motion.
+  optional bool frame_confidence_weighting = 48 [default = true];
+  optional float reset_confidence_threshold = 49 [default = 0.4];
+
+  // Filters irls weights before smoothing them according to specified
+  // operation.
+  enum IRLSWeightFilter {
+    IRLS_FILTER_NONE = 0;
+    IRLS_FILTER_TEXTURE = 1;
+    IRLS_FILTER_CORNER_RESPONSE = 2;
+  }
+
+  // Calls TextureFilteredRegionFlowFeatureIRLSWeights on computed irls weights
+  // before smoothing them.
+  optional IRLSWeightFilter irls_weight_filter = 35
+      [default = IRLS_FILTER_NONE];
+
+  // Attempts to detect overlays, i.e. static elements burned-into the video
+  // that potentially corrupt motion estimation.
+  optional bool overlay_detection = 36 [default = false];
+  // Overlay detection is performed over specified number of frames.
+  optional int32 overlay_analysis_chunk_size = 37 [default = 8];
+
+  message OverlayDetectionOptions {
+    // Potential overlay features are aggregated over a mask with cells
+    // mask_size x mask_size as specified below.
+    optional int32 analysis_mask_size = 1 [default = 10];
+
+    // There are two types of candidates of overlay features, strict and loose
+    // ones. Strict candidates are used to determine if a grid bin should be
+    // considered an overlay cell. If the grid is an overlay cell, *all*
+    // candidates, i.e. strict and loose ones will be flagged as overlay
+    // features by setting their corresponding irls weight to zero.
+
+    // A feature is a strict overlay feature if its motion is less than
+    // near_zero_motion AND less than max_translation_ratio times the
+    // estimated translation magnitude at that frame AND its texturedness is
+    // sufficiently high.
+    optional float strict_near_zero_motion = 2 [default = 0.2];
+    optional float strict_max_translation_ratio = 3 [default = 0.2];
+    // Minimum texturedness of a feature to be considered an overlay.
+    // Motivation: Overlays are mostly text or graphics, i.e. have visually
+    // distinguished features.
+    optional float strict_min_texturedness = 5 [default = 0.1];
+
+    // A feature is a loose overlay feature if its motion is less than
+    // loose_near_zero_motion.
+    optional float loose_near_zero_motion = 4 [default = 1.0];
+
+    // Minimum fraction of strict overlay features within a cell to be
+    // considered an overlay cell.
+    optional float overlay_min_ratio = 6 [default = 0.3];
+
+    // Absolute minimum number of strict overlay features within a cell to be
+    // considered an overlay cell.
+    optional float overlay_min_features = 7 [default = 10];
+  }
+
+  optional OverlayDetectionOptions overlay_detection_options = 38;
+
+  // Shot boundaries are introduced in 3 different scenarios:
+  // a) Frame has zero tracked features w.r.t. previous frame
+  // b) Estimated motion is deemed invalid (CameraMotion::INVALID).
+ // c) Visual consistency is above threshold of two adjacent frames. + message ShotBoundaryOptions { + // After cases a & b are determined from features/camera motion, they + // are verified by ensuring visual consistency is above specified threshold, + // if visual consistency has been computed. Only if this is case will the + // frame be labeled as shot boundary. Motivation is, that there should + // always be some (even small) measurable increase in the frame difference + // at a shot boundary. + // Verification is only performed if visual_consistency has been evaluated + // (value >= 0). + optional float motion_consistency_threshold = 1 [default = 0.02]; + + // Threshold for case c). Sometimes, motion estimation will miss shot + // boundaries. We define shot boundaries for which the visual consistency is + // higher than the specified threshold for at least two adjacent frames. + optional float appearance_consistency_threshold = 2 [default = 0.075]; + } + + optional ShotBoundaryOptions shot_boundary_options = 60; + + // By default, irls weights of each feature are overwritten with refined irls + // weights of the last iteration for the highest degree of freedom model that + // was estimated stable. If set to false, original irls weights are retained. + // Note: If overlay detection is activated, features to be deemed overlays + // have their irls weight set to zero, regardless of this setting. + // Similarily, an IRLSWeightFilter is applied if requested, regardless + // of this setting. + optional bool output_refined_irls_weights = 40 [default = true]; + + // Weight initialization for homography estimation. This is to bias homography + // estimation either to foreground or background. + enum HomographyIrlsWeightInitialization { + IRLS_WEIGHT_CONSTANT_ONE = 1; // Constant, treat all features equally. + IRLS_WEIGHT_CENTER_GAUSSIAN = 2; // Weight features in the center higher. + // Tends to lock onto foreground. 
+ IRLS_WEIGHT_PERIMETER_GAUSSIAN = 3; // Weight features around the + // perimeter higher, tends to lock onto + // background and improves rigidity of + // the perspective degree of freedom. + } + + // IRLS weights for homography estimation are initialized based on the + // specified options. If, options irls_weights_preinitialized is set, + // weights are multiplied instead of reset. + optional HomographyIrlsWeightInitialization + homography_irls_weight_initialization = 45 + [default = IRLS_WEIGHT_PERIMETER_GAUSSIAN]; + + // If set to false use L1 norm irls weights instead of L0 norm irls weights. + optional bool irls_use_l0_norm = 46 [default = true]; + + // IRLS weights are determined in a limited domain (in particular helpful + // for stabilization analysis on HD videos). + // TODO: Make this the default. + optional bool domain_limited_irls_scaling = 65 [default = false]; + + // For comparison and debugging purposes. Simply estimates requested models + // without checking their stability via the stable_*_bounds parameters. + // However, invertibility is still checked to avoid invalid data being passed + // to later stages of the stabilizer. + optional bool deactivate_stable_motion_estimation = 47 [default = false]; + + // Projects higher order motions if estimated correctly down to lower order + // motions, therefore replacing the previously estimated motions. + optional bool project_valid_motions_down = 52 [default = false]; + + // DEPRECATED functionality. Use static functions as indicated instead. + // + // Non-linear similarity, use MotionEstimation::EstimateSimilarityModelL2. + optional bool estimate_similarity = 2 [deprecated = true]; + + // Deprecated fields. + extensions 7, 8, 16; +} diff --git a/mediapipe/util/tracking/motion_models.cc b/mediapipe/util/tracking/motion_models.cc new file mode 100644 index 000000000..eb6a8b314 --- /dev/null +++ b/mediapipe/util/tracking/motion_models.cc @@ -0,0 +1,924 @@ +// Copyright 2019 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/motion_models.h" + +#include + +#include +#include +#include + +#include "Eigen/Core" +#include "Eigen/Dense" +#include "absl/strings/str_format.h" + +// Set to true to use catmull rom mixture weights instead of Gaussian weights +// for homography mixture estimation. +bool flags_catmull_rom_mixture_weights = false; + +namespace mediapipe { + +std::string ModelAdapter::ToString( + const TranslationModel& model) { + return absl::StrFormat("%7f", model.dx()) + " " + + absl::StrFormat("%7f", model.dy()); +} + +AffineModel ModelAdapter::ToAffine( + const TranslationModel& model) { + return AffineAdapter::FromArgs(model.dx(), model.dy(), 1, 0, 0, 1); +} + +TranslationModel ModelAdapter::FromAffine( + const AffineModel& model) { + DCHECK_EQ(model.a(), 1); + DCHECK_EQ(model.b(), 0); + DCHECK_EQ(model.c(), 0); + DCHECK_EQ(model.d(), 1); + + return TranslationAdapter::FromArgs(model.dx(), model.dy()); +} + +Homography ModelAdapter::ToHomography( + const TranslationModel& model) { + return HomographyAdapter::FromAffine(ToAffine(model)); +} + +TranslationModel ModelAdapter::FromHomography( + const Homography& model) { + return TranslationAdapter::FromAffine(HomographyAdapter::ToAffine(model)); +} + +void ModelAdapter::GetJacobianAtPoint(const Vector2_f& pt, + float* jacobian) { + DCHECK(jacobian); + jacobian[0] = 1; + jacobian[1] = 0; + jacobian[2] = 0; + jacobian[3] = 1; +} + 
+TranslationModel ModelAdapter::NormalizationTransform( + float frame_width, float frame_height) { + // Independent of frame size. + return TranslationModel(); +} + +TranslationModel ModelAdapter::Maximum( + const TranslationModel& lhs, const TranslationModel& rhs) { + return FromArgs(std::max(lhs.dx(), rhs.dx()), std::max(lhs.dy(), rhs.dy())); +} + +TranslationModel ModelAdapter::ProjectFrom( + const LinearSimilarityModel& model, float frame_width, float frame_height) { + return LinearSimilarityAdapter::ProjectToTranslation(model, frame_width, + frame_height); +} + +TranslationModel ModelAdapter::ProjectFrom( + const AffineModel& model, float frame_width, float frame_height) { + return ProjectFrom(AffineAdapter::ProjectToLinearSimilarity( + model, frame_width, frame_height), + frame_width, frame_height); +} + +TranslationModel ModelAdapter::ProjectFrom( + const Homography& model, float frame_width, float frame_height) { + return ProjectFrom( + HomographyAdapter::ProjectToAffine(model, frame_width, frame_height), + frame_width, frame_height); +} + +// Similarity model. +SimilarityModel ModelAdapter::FromArgs(float dx, float dy, + float scale, + float rotation) { + SimilarityModel model; + model.set_dx(dx); + model.set_dy(dy); + model.set_scale(scale); + model.set_rotation(rotation); + return model; +} + +SimilarityModel ModelAdapter::FromFloatPointer( + const float* args, bool identity_parametrization) { + DCHECK(args); + SimilarityModel model; + model.set_dx(args[0]); + model.set_dy(args[1]); + model.set_scale((identity_parametrization ? 1.f : 0.f) + args[2]); + model.set_rotation(args[3]); + return model; +} + +SimilarityModel ModelAdapter::FromDoublePointer( + const double* args, bool identity_parametrization) { + DCHECK(args); + SimilarityModel model; + model.set_dx(args[0]); + model.set_dy(args[1]); + model.set_scale((identity_parametrization ? 
1.f : 0.f) + args[2]); + model.set_rotation(args[3]); + return model; +} + +Vector2_f ModelAdapter::TransformPoint( + const SimilarityModel& model, const Vector2_f& pt) { + // Setup rotation part. + const float c_r = std::cos(model.rotation()); + const float c_s = std::sin(model.rotation()); + const Vector2_f pt_rot(c_r * pt.x() - c_s * pt.y(), + c_s * pt.x() + c_r * pt.y()); + + return pt_rot * model.scale() + Vector2_f(model.dx(), model.dy()); +} + +SimilarityModel ModelAdapter::Invert( + const SimilarityModel& model) { + bool success = true; + const SimilarityModel result = InvertChecked(model, &success); + if (!success) { + LOG(ERROR) << "Model not invertible. Returning identity."; + return SimilarityModel(); + } else { + return result; + } +} + +SimilarityModel ModelAdapter::InvertChecked( + const SimilarityModel& model, bool* success) { + SimilarityModel inv_model; + inv_model.set_rotation(-model.rotation()); + + if (fabs(model.scale()) > kDetInvertibleEps) { + inv_model.set_scale(1.0 / model.scale()); + *success = true; + } else { + *success = false; + VLOG(1) << "Model is not invertible."; + return SimilarityModel(); + } + + // Rotate and scale [dx, dy] by inv_model. + const float c_r = std::cos(inv_model.rotation()); + const float c_s = std::sin(inv_model.rotation()); + const Vector2_f pt_rot(c_r * model.dx() - c_s * model.dy(), + c_s * model.dx() + c_r * model.dy()); + + const Vector2_f inv_trans = -pt_rot * inv_model.scale(); + + inv_model.set_dx(inv_trans.x()); + inv_model.set_dy(inv_trans.y()); + + return inv_model; +} + +SimilarityModel ModelAdapter::Compose( + const SimilarityModel& lhs, const SimilarityModel& rhs) { + SimilarityModel result; + result.set_scale(lhs.scale() * rhs.scale()); + result.set_rotation(lhs.rotation() + rhs.rotation()); + + // Apply lhs scale and rot to rhs translation. 
+ const float c_r = std::cos(lhs.rotation()); + const float c_s = std::sin(lhs.rotation()); + const Vector2_f trans_rot(c_r * rhs.dx() - c_s * rhs.dy(), + c_s * rhs.dx() + c_r * rhs.dy()); + + const Vector2_f trans_concat = + trans_rot * lhs.scale() + Vector2_f(lhs.dx(), lhs.dy()); + + result.set_dx(trans_concat.x()); + result.set_dy(trans_concat.y()); + return result; +} + +float ModelAdapter::GetParameter(const SimilarityModel& model, + int id) { + switch (id) { + case 0: + return model.dx(); + case 1: + return model.dy(); + case 2: + return model.scale(); + case 3: + return model.rotation(); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } + + return 0; +} + +std::string ModelAdapter::ToString( + const SimilarityModel& model) { + return absl::StrFormat("%7f", model.dx()) + " " + + absl::StrFormat("%7f", model.dy()) + " " + + absl::StrFormat("%7f", model.scale()) + " " + + absl::StrFormat("%7f", model.rotation()); +} + +SimilarityModel ModelAdapter::NormalizationTransform( + float frame_width, float frame_height) { + const float scale = std::hypot(frame_width, frame_height); + DCHECK_NE(scale, 0); + return SimilarityAdapter::FromArgs(0, 0, 1.0 / scale, 0); +} + +TranslationModel ModelAdapter::ProjectToTranslation( + const SimilarityModel& model, float frame_width, float frame_height) { + return LinearSimilarityAdapter::ProjectToTranslation( + LinearSimilarityAdapter::FromSimilarity(model), frame_width, + frame_height); +} + +std::string ModelAdapter::ToString( + const LinearSimilarityModel& model) { + return absl::StrFormat("%7f", model.dx()) + " " + + absl::StrFormat("%7f", model.dy()) + " " + + absl::StrFormat("%7f", model.a()) + " " + + absl::StrFormat("%7f", model.b()); +} + +AffineModel ModelAdapter::ToAffine( + const LinearSimilarityModel& model) { + return AffineAdapter::FromArgs(model.dx(), model.dy(), model.a(), -model.b(), + model.b(), model.a()); +} + +LinearSimilarityModel ModelAdapter::FromAffine( + const AffineModel& model) { + 
DCHECK_EQ(model.a(), model.d()); + DCHECK_EQ(model.b(), -model.c()); + + return LinearSimilarityAdapter::FromArgs(model.dx(), model.dy(), model.a(), + -model.b()); +} + +Homography ModelAdapter::ToHomography( + const LinearSimilarityModel& model) { + return HomographyAdapter::FromAffine(ToAffine(model)); +} + +LinearSimilarityModel ModelAdapter::FromHomography( + const Homography& model) { + return LinearSimilarityAdapter::FromAffine( + HomographyAdapter::ToAffine(model)); +} + +SimilarityModel ModelAdapter::ToSimilarity( + const LinearSimilarityModel& model) { + const float scale = std::hypot(model.a(), model.b()); + return SimilarityAdapter::FromArgs(model.dx(), model.dy(), scale, + std::atan2(model.b(), model.a())); +} + +LinearSimilarityModel ModelAdapter::FromSimilarity( + const SimilarityModel& model) { + return LinearSimilarityAdapter::FromArgs( + model.dx(), model.dy(), model.scale() * std::cos(model.rotation()), + model.scale() * std::sin(model.rotation())); +} + +LinearSimilarityModel ModelAdapter::ScaleParameters( + const LinearSimilarityModel& model_in, float scale) { + LinearSimilarityModel model = model_in; + model.set_a(model.a() * scale); + model.set_b(model.b() * scale); + model.set_dx(model.dx() * scale); + model.set_dy(model.dy() * scale); + return model; +} + +LinearSimilarityModel ModelAdapter::AddIdentity( + const LinearSimilarityModel& model_in) { + LinearSimilarityModel model = model_in; + model.set_a(model.a() + 1); + return model; +} + +void ModelAdapter::GetJacobianAtPoint( + const Vector2_f& pt, float* jacobian) { + DCHECK(jacobian); + // First row. + jacobian[0] = 1; + jacobian[1] = 0; + jacobian[2] = pt.x(); + jacobian[3] = -pt.y(); + // Second row. 
+ jacobian[4] = 0; + jacobian[5] = 1; + jacobian[6] = pt.y(); + jacobian[7] = pt.x(); +} + +LinearSimilarityModel +ModelAdapter::NormalizationTransform( + float frame_width, float frame_height) { + const float scale = std::hypot(frame_width, frame_height); + DCHECK_NE(scale, 0); + return LinearSimilarityAdapter::FromArgs(0, 0, 1.0 / scale, 0); +} + +LinearSimilarityModel ModelAdapter::Maximum( + const LinearSimilarityModel& lhs, const LinearSimilarityModel& rhs) { + return FromArgs(std::max(lhs.dx(), rhs.dx()), std::max(lhs.dy(), rhs.dy()), + std::max(lhs.a(), rhs.a()), std::max(lhs.b(), rhs.b())); +} + +TranslationModel ModelAdapter::ProjectToTranslation( + const LinearSimilarityModel& model, float frame_width, float frame_height) { + LinearSimilarityModel center_trans = LinearSimilarityAdapter::FromArgs( + frame_width * 0.5f, frame_height * 0.5f, 1, 0); + LinearSimilarityModel inv_center_trans = LinearSimilarityAdapter::FromArgs( + -frame_width * 0.5f, -frame_height * 0.5f, 1, 0); + + // Express model w.r.t. center. + LinearSimilarityModel center_model = + ModelCompose3(inv_center_trans, model, center_trans); + + // No need to shift back to top-left after decomposition, as translations + // are independent from coordinate origin. 
+ return TranslationAdapter::FromArgs(center_model.dx(), center_model.dy()); +} + +std::string ModelAdapter::ToString(const AffineModel& model) { + return absl::StrFormat("%7f", model.dx()) + " " + + absl::StrFormat("%7f", model.dy()) + " " + + absl::StrFormat("%7f", model.a()) + " " + + absl::StrFormat("%7f", model.b()) + " " + + absl::StrFormat("%7f", model.c()) + " " + + absl::StrFormat("%7f", model.d()); +} + +AffineModel ModelAdapter::NormalizationTransform( + float frame_width, float frame_height) { + const float scale = std::hypot(frame_width, frame_height); + DCHECK_NE(scale, 0); + return AffineAdapter::FromArgs(0, 0, 1.0f / scale, 0, 0, 1.0f / scale); +} + +Homography ModelAdapter::ToHomography(const AffineModel& model) { + float params[8] = {model.a(), model.b(), model.dx(), model.c(), + model.d(), model.dy(), 0, 0}; + return HomographyAdapter::FromFloatPointer(params, false); +} + +AffineModel ModelAdapter::FromHomography(const Homography& model) { + DCHECK_EQ(model.h_20(), 0); + DCHECK_EQ(model.h_21(), 0); + + float params[6] = {model.h_02(), model.h_12(), // dx, dy + model.h_00(), model.h_01(), // a, b + model.h_10(), model.h_11()}; // c, d + + return FromFloatPointer(params, false); +} + +AffineModel ModelAdapter::ScaleParameters( + const AffineModel& model_in, float scale) { + AffineModel model = model_in; + model.set_a(model.a() * scale); + model.set_b(model.b() * scale); + model.set_c(model.c() * scale); + model.set_d(model.d() * scale); + model.set_dx(model.dx() * scale); + model.set_dy(model.dy() * scale); + return model; +} + +AffineModel ModelAdapter::AddIdentity( + const AffineModel& model_in) { + AffineModel model = model_in; + model.set_a(model.a() + 1); + model.set_d(model.d() + 1); + return model; +} + +void ModelAdapter::GetJacobianAtPoint(const Vector2_f& pt, + float* jacobian) { + DCHECK(jacobian); + // First row. 
+ jacobian[0] = 1; + jacobian[1] = 0; + jacobian[2] = pt.x(); + jacobian[3] = pt.y(); + jacobian[4] = 0; + jacobian[5] = 0; + + // Second row. + jacobian[6] = 0; + jacobian[7] = 1; + jacobian[8] = 0; + jacobian[9] = 0; + jacobian[10] = pt.x(); + jacobian[11] = pt.y(); +} + +AffineModel ModelAdapter::Maximum(const AffineModel& lhs, + const AffineModel& rhs) { + return FromArgs(std::max(lhs.dx(), rhs.dx()), std::max(lhs.dy(), rhs.dy()), + std::max(lhs.a(), rhs.a()), std::max(lhs.b(), rhs.b()), + std::max(lhs.c(), rhs.c()), std::max(lhs.d(), rhs.d())); +} + +LinearSimilarityModel ModelAdapter::ProjectFrom( + const AffineModel& model, float frame_width, float frame_height) { + return AffineAdapter::ProjectToLinearSimilarity(model, frame_width, + frame_height); +} + +LinearSimilarityModel ModelAdapter::ProjectFrom( + const Homography& model, float frame_width, float frame_height) { + return ProjectFrom( + AffineAdapter::ProjectFrom(model, frame_width, frame_height), frame_width, + frame_height); +} + +LinearSimilarityModel ModelAdapter::ProjectToLinearSimilarity( + const AffineModel& model, float frame_width, float frame_height) { + AffineModel center_trans = AffineAdapter::FromArgs( + frame_width * 0.5f, frame_height * 0.5f, 1, 0, 0, 1); + AffineModel inv_center_trans = AffineAdapter::FromArgs( + -frame_width * 0.5f, -frame_height * 0.5f, 1, 0, 0, 1); + + // Express model w.r.t. center. + AffineModel center_model = + ModelCompose3(inv_center_trans, model, center_trans); + + // Determine average scale. + const float scale = std::sqrt(AffineAdapter::Determinant(center_model)); + + // Goal is approximate matrix: + // (a b (a' -b' + // c d) with b' a') + // + // := := + // v_1 v_2 + // After normalization by the scale, a' = cos(u) and b' = sin(u) + // therefore, the columns on the right hand side have norm 1 and are + // orthogonal. 
+ // + // ==> Orthonormalize v_1 and v_2 + + Vector2_f x_map(center_model.a(), center_model.c()); // == v_1 + Vector2_f y_map(center_model.b(), center_model.d()); // == v_2 + x_map.Normalize(); + y_map.Normalize(); + + // Two approaches here + // A) Perform Grahm Schmidt, i.e. QR decomp / polar decomposition, i.e.: + // y_map' = y_map - * x_map; + // ==> no error in x_map, error increasing with larger y + // B) Compute the middle vector between x_map and y_map and create + // orthogonal system from that + // (rotate by -45': [ 1 1 + // -1 1] * 1/sqrt(2) + // Error equally distributed between x and y. + // + // Comparing approach A and B: + // + // video 1 (gleicher4) video 2 (pool dance) + // Method B + // Max diff : dx: 1.6359973 4.600112 + // dy: 2.1076841 11.51579 + // a: 1.004498 1.01036 + // b: 0.0047194548 0.027450036 + // Method A + // Max diff : dx: 4.3549204 14.145205 + // dy: 2.4496114 7.7804289 + // a: 1.0136091 1.043335 + // b: 0.0024313219 0.0065769218 + + // Using method B: + Vector2_f middle = (x_map + y_map).Normalize(); + + Vector2_f a_b = Vector2_f(middle.x() + middle.y(), // see above matrix. + middle.y() - middle.x()) + .Normalize(); + AffineModel lin_approx = AffineAdapter::FromArgs( + center_model.dx(), center_model.dy(), scale * a_b.x(), -scale * a_b.y(), + scale * a_b.y(), scale * a_b.x()); + + // Map back to top-left origin. + return LinearSimilarityAdapter::FromAffine( + ModelCompose3(center_trans, lin_approx, inv_center_trans)); +} + +AffineModel ModelAdapter::ProjectFrom(const Homography& model, + float frame_width, + float frame_height) { + return HomographyAdapter::ProjectToAffine(model, frame_width, frame_height); +} + +Homography ModelAdapter::InvertChecked(const Homography& model, + bool* success) { + // Could do adjoint method and do it by hand. Use Eigen for now, not that + // crucial at this point. 
+ + Eigen::Matrix3d model_mat; + model_mat(0, 0) = model.h_00(); + model_mat(0, 1) = model.h_01(); + model_mat(0, 2) = model.h_02(); + model_mat(1, 0) = model.h_10(); + model_mat(1, 1) = model.h_11(); + model_mat(1, 2) = model.h_12(); + model_mat(2, 0) = model.h_20(); + model_mat(2, 1) = model.h_21(); + model_mat(2, 2) = 1.0f; + + if (model_mat.determinant() < kDetInvertibleEps) { + VLOG(1) << "Homography not invertible, det is zero."; + *success = false; + return Homography(); + } + + Eigen::Matrix3d inv_model_mat = model_mat.inverse(); + + if (inv_model_mat(2, 2) == 0) { + LOG(ERROR) << "Degenerate homography. See proto."; + *success = false; + return Homography(); + } + + *success = true; + Homography inv_model; + const float scale = 1.0f / inv_model_mat(2, 2); + inv_model.set_h_00(inv_model_mat(0, 0) * scale); + inv_model.set_h_01(inv_model_mat(0, 1) * scale); + inv_model.set_h_02(inv_model_mat(0, 2) * scale); + inv_model.set_h_10(inv_model_mat(1, 0) * scale); + inv_model.set_h_11(inv_model_mat(1, 1) * scale); + inv_model.set_h_12(inv_model_mat(1, 2) * scale); + inv_model.set_h_20(inv_model_mat(2, 0) * scale); + inv_model.set_h_21(inv_model_mat(2, 1) * scale); + + return inv_model; +} + +std::string ModelAdapter::ToString(const Homography& model) { + return absl::StrFormat("%7f", model.h_00()) + " " + + absl::StrFormat("%7f", model.h_01()) + " " + + absl::StrFormat("%7f", model.h_02()) + " " + + absl::StrFormat("%7f", model.h_10()) + " " + + absl::StrFormat("%7f", model.h_11()) + " " + + absl::StrFormat("%7f", model.h_12()) + " " + + absl::StrFormat("%7f", model.h_20()) + " " + + absl::StrFormat("%7f", model.h_21()); +} + +AffineModel ModelAdapter::ToAffine(const Homography& model) { + DCHECK_EQ(model.h_20(), 0); + DCHECK_EQ(model.h_21(), 0); + AffineModel affine_model; + affine_model.set_a(model.h_00()); + affine_model.set_b(model.h_01()); + affine_model.set_dx(model.h_02()); + affine_model.set_c(model.h_10()); + affine_model.set_d(model.h_11()); + 
affine_model.set_dy(model.h_12()); + return affine_model; +} + +Homography ModelAdapter::FromAffine(const AffineModel& model) { + return Embed(model); +} + +bool ModelAdapter::IsAffine(const Homography& model) { + return model.h_20() == 0 && model.h_21() == 0; +} + +void ModelAdapter::GetJacobianAtPoint(const Vector2_f& pt, + float* jacobian) { + DCHECK(jacobian); + // First row. + jacobian[0] = pt.x(); + jacobian[1] = pt.y(); + jacobian[2] = 1; + jacobian[3] = 0; + jacobian[4] = 0; + jacobian[5] = 0; + jacobian[6] = -pt.x() * pt.x(); + jacobian[7] = -pt.x() * pt.y(); + + // Second row. + jacobian[8] = 0; + jacobian[9] = 0; + jacobian[10] = 0; + jacobian[11] = pt.x(); + jacobian[12] = pt.y(); + jacobian[13] = 1; + jacobian[14] = -pt.x() * pt.y(); + jacobian[15] = -pt.y() * pt.y(); +} + +Homography ModelAdapter::NormalizationTransform( + float frame_width, float frame_height) { + const float scale = std::hypot(frame_width, frame_height); + DCHECK_NE(scale, 0); + return HomographyAdapter::FromArgs(1.0f / scale, 0, 0, 0, 1.0f / scale, 0, 0, + 0); +} + +AffineModel ModelAdapter::ProjectToAffine(const Homography& model, + float frame_width, + float frame_height) { + Homography center_trans; + center_trans.set_h_02(frame_width * 0.5f); + center_trans.set_h_12(frame_height * 0.5f); + + Homography inv_center_trans; + inv_center_trans.set_h_02(-frame_width * 0.5f); + inv_center_trans.set_h_12(-frame_height * 0.5f); + + // Express model w.r.t. center. + Homography center_model = + ModelCompose3(inv_center_trans, model, center_trans); + + // Zero out perspective. + center_model.set_h_20(0); + center_model.set_h_21(0); + + // Map back to top left and embed. + return ToAffine(ModelCompose3(center_trans, center_model, inv_center_trans)); +} + +namespace { + +// Returns true if non-empty intersection is present. 
In this case, start and +// end are clipped to rect, specifically after clipping the line +// [start, end] is inside the rectangle or incident to the boundary +// of the rectangle. If strict is set to true, [start, end] is strictly inside +// the rectangle, i.e. not incident to the boundary (minus start and end +// point which still can be incident). +// Implemented using Liang-Barsky algorithm. +// http://en.wikipedia.org/wiki/Liang%E2%80%93Barsky +inline bool ClipLine(const Vector2_f& rect, bool strict, Vector2_f* start, + Vector2_f* end) { + Vector2_f diff = *end - *start; + float p[4] = {-diff.x(), diff.x(), -diff.y(), diff.y()}; + + // Bounds are (x_min, y_min) = (0, 0) + // (x_max, y_max) = rect + float q[4] = {start->x() - 0, rect.x() - start->x(), start->y() - 0, + rect.y() - start->y()}; + + // Compute parametric intersection points. + float near = -1e10; + float far = 1e10; + for (int k = 0; k < 4; ++k) { + if (fabs(p[k]) < 1e-6f) { + // Line is parallel to one axis of rectangle. + if ((strict && q[k] <= 0.0f) || q[k] < 0.0f) { + // Line is outside rectangle. + return false; + } else { + // Possible intersection along other dimensions. + continue; + } + } else { + // Line is not parallel -> compute intersection. + float intersect = q[k] / p[k]; + // Sign of p determines if near or far parameter. + if (p[k] < 0.0f) { + near = std::max(near, intersect); + } else { + far = std::min(far, intersect); + } + } + } + + if (near > far) { + // Line is outside of rectangle. + return false; + } + + // Clip near and far to valid line segment interval [0, 1]. + far = std::min(far, 1.0f); + near = std::max(near, 0.0f); + + if (near <= far) { + // Non-empty intersection. Single points are considered valid intersection. + *end = *start + diff * far; + *start = *start + diff * near; + return true; + } else { + // Empty intersection. + return false; + } +} + +} // namespace. 
+ +template +float ModelMethods::NormalizedIntersectionArea(const Model& model_1, + const Model& model_2, + const Vector2_f& rect) { + const float rect_area = rect.x() * rect.y(); + if (rect_area <= 0) { + LOG(WARNING) << "Empty rectangle passed -> empty intersection."; + return 0.0f; + } + + std::vector> lines(4); + lines[0] = std::make_pair(Vector2_f(0, 0), Vector2_f(0, rect.y())); + lines[1] = + std::make_pair(Vector2_f(0, rect.y()), Vector2_f(rect.x(), rect.y())); + lines[2] = + std::make_pair(Vector2_f(rect.x(), rect.y()), Vector2_f(rect.x(), 0)); + lines[3] = std::make_pair(Vector2_f(rect.x(), 0), Vector2_f(0, 0)); + + float model_1_area = 0.0f; + float model_2_area = 0.0f; + for (int k = 0; k < 4; ++k) { + Vector2_f start_1 = Adapter::TransformPoint(model_1, lines[k].first); + Vector2_f end_1 = Adapter::TransformPoint(model_1, lines[k].second); + // Use trapezoidal rule for polygon area. + model_1_area += 0.5 * (end_1.y() + start_1.y()) * (end_1.x() - start_1.x()); + Vector2_f start_2 = Adapter::TransformPoint(model_2, lines[k].first); + Vector2_f end_2 = Adapter::TransformPoint(model_2, lines[k].second); + model_2_area += 0.5 * (end_2.y() + start_2.y()) * (end_2.x() - start_2.x()); + } + + const float average_area = 0.5f * (model_1_area + model_2_area); + if (average_area <= 0) { + LOG(WARNING) << "Degenerative models passed -> empty intersection."; + return 0.0f; + } + + // First, clip transformed rectangle against origin defined by model_1. + bool success = true; + Model diff = ModelDiffChecked(model_2, model_1, &success); + if (!success) { + LOG(WARNING) << "Model difference is singular -> empty intersection."; + return 0.0f; + } + + float area = 0.0f; + for (int k = 0; k < 4; ++k) { + Vector2_f start_1 = Adapter::TransformPoint(diff, lines[k].first); + Vector2_f end_1 = Adapter::TransformPoint(diff, lines[k].second); + if (ClipLine(rect, false, &start_1, &end_1)) { + // Non-empty intersection. + // Transform intersection back to world coordinate system. 
+ const Vector2_f start = Adapter::TransformPoint(model_1, start_1); + const Vector2_f end = Adapter::TransformPoint(model_1, end_1); + // Use trapezoidal rule for polygon area without explicit ordering of + // vertices. + area += 0.5 * (end.y() + start.y()) * (end.x() - start.x()); + } + } + + // Second, clip transformed rectangle against origin defined by model_2. + Model inv_diff = Adapter::InvertChecked(diff, &success); + if (!success) { + LOG(WARNING) << "Model difference is singular -> empty intersection."; + return 0.0f; + } + + for (int k = 0; k < 4; ++k) { + Vector2_f start_2 = Adapter::TransformPoint(inv_diff, lines[k].first); + Vector2_f end_2 = Adapter::TransformPoint(inv_diff, lines[k].second); + // Use strict comparison to address degenerate case of incident rectangles + // in which case intersection would be counted twice if non-strict + // comparison is employed. + if (ClipLine(rect, true, &start_2, &end_2)) { + // Transform start and end back to origin. + const Vector2_f start = Adapter::TransformPoint(model_2, start_2); + const Vector2_f end = Adapter::TransformPoint(model_2, end_2); + area += 0.5 * (end.y() + start.y()) * (end.x() - start.x()); + } + } + + // Normalize w.r.t. average rectangle area. + return area / average_area; +} + +MixtureRowWeights::MixtureRowWeights(int frame_height, int margin, float sigma, + float y_scale, int num_models) + : frame_height_(frame_height), + y_scale_(y_scale), + margin_(margin), + sigma_(sigma), + num_models_(num_models) { + mid_points_.resize(num_models_); + + // Compute mid_point (row idx) for each model. + if (flags_catmull_rom_mixture_weights) { + const float model_height = + static_cast(frame_height_) / (num_models - 1); + + // Use Catmull-rom spline. + // Compute weighting matrix. + weights_.resize(frame_height * num_models); + float spline_weights[4]; + + // No margin support for splines. + if (margin_ > 0) { + LOG(WARNING) << "No margin support when flag catmull_rom_mixture_weights " + << "is set. 
Margin is reset to zero, it is recommended " + << "that RowWeightsBoundChecked is used to prevent " + << "segfaults."; + margin_ = 0; + } + + for (int i = 0; i < frame_height; ++i) { + float* weight_ptr = &weights_[i * num_models]; + + float float_pos = static_cast(i) / model_height; + int int_pos = float_pos; + memset(weight_ptr, 0, sizeof(weight_ptr[0]) * num_models); + + float dy = float_pos - int_pos; + + // Weights sum to one, for all choices of dy. + // For definition see + // en.wikipedia.org/wiki/Cubic_Hermite_spline#Catmull.E2.80.93Rom_spline + spline_weights[0] = 0.5f * (dy * ((2.0f - dy) * dy - 1.0f)); + spline_weights[1] = 0.5f * (dy * dy * (3.0f * dy - 5.0f) + 2.0f); + spline_weights[2] = 0.5f * (dy * ((4.0f - 3.0f * dy) * dy + 1.0f)); + spline_weights[3] = 0.5f * (dy * dy * (dy - 1.0f)); + + weight_ptr[int_pos] += spline_weights[1]; + if (int_pos > 0) { + weight_ptr[int_pos - 1] += spline_weights[0]; + } else { + weight_ptr[int_pos] += spline_weights[0]; // Double knot. + } + + CHECK_LT(int_pos, num_models - 1); + weight_ptr[int_pos + 1] += spline_weights[2]; + if (int_pos + 1 < num_models - 1) { + weight_ptr[int_pos + 2] += spline_weights[3]; + } else { + weight_ptr[int_pos + 1] += spline_weights[3]; // Double knot. + } + } + } else { + // Gaussian weights. + const float model_height = static_cast(frame_height_) / num_models; + + for (int i = 0; i < num_models; ++i) { + mid_points_[i] = (i + 0.5f) * model_height; + } + + // Compute gaussian weights. + const int num_values = frame_height_ + 2 * margin_; + std::vector row_dist_weights(num_values); + const float common = -0.5f / (sigma * sigma); + for (int i = 0; i < num_values; ++i) { + row_dist_weights[i] = std::exp(common * i * i); + } + + // Compute weighting matrix. + weights_.resize(num_values * num_models); + for (int i = 0; i < num_values; ++i) { + float* weight_ptr = &weights_[i * num_models]; + float weight_sum = 0; + + // Gaussian weights via lookup. 
+ for (int j = 0; j < num_models; ++j) { + weight_ptr[j] = row_dist_weights[abs(i - margin_ - mid_points_[j])]; + weight_sum += weight_ptr[j]; + } + + // Normalize. + DCHECK_GT(weight_sum, 0); + const float inv_weight_sum = 1.0f / weight_sum; + for (int j = 0; j < num_models; ++j) { + weight_ptr[j] *= inv_weight_sum; + } + } + } +} + +float MixtureRowWeights::WeightThreshold(float frac_blocks) { + const float model_height = static_cast(frame_height_) / num_models_; + + const float y = model_height * frac_blocks + mid_points_[0]; + const float* row_weights = RowWeightsClamped(y / y_scale_); + return row_weights[0]; +} + +// Explicit instantiations of ModelMethods. +template class ModelMethods; +template class ModelMethods; +template class ModelMethods; +template class ModelMethods; +template class ModelMethods; + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/motion_models.h b/mediapipe/util/tracking/motion_models.h new file mode 100644 index 000000000..567831ad5 --- /dev/null +++ b/mediapipe/util/tracking/motion_models.h @@ -0,0 +1,1849 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_H_ +#define MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_H_ + +#include +#include +#include +#include + +#include "absl/container/node_hash_map.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/singleton.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/region_flow.pb.h" // NOLINT + +namespace mediapipe { + +// Abstract Camera Model, functionality that each model must support. +template +class ModelAdapter { + public: + // Initializes a model from vector to data. + // If identity_parametrization is set, it is assumed that for args = 0 -> + // Model = identity, else args = 0 -> zero transform. + static Model FromFloatPointer(const float* args, + bool identity_parametrization); + static Model FromDoublePointer(const double* args, + bool identity_parametrization); + + // Transforms points according to model * pt. + static Vector2_f TransformPoint(const Model& model, const Vector2_f& pt); + + // Returns model^(-1), outputs feasibility to variable success (can not be + // NULL). If model is not invertible, function should return identity model. + static Model InvertChecked(const Model& model, bool* success); + + // Returns model^(-1), returns identity model if inversion is not possible, + // and warns via LOG(ERROR). It is recommended that InvertChecked is used + // instead. + // Note: Default implementation, motion models only need to supply above + // function. + static Model Invert(const Model& model); + + // Concatenates two models. Returns lhs * rhs + static Model Compose(const Model& lhs, const Model& rhs); + + // Debugging function to create plots. Output parameters separated by space. + static std::string ToString(const Model& model); + + // Returns number of DOF. 
+ static constexpr int NumParameters(); + + // Access parameters in a model agnostic way. Order is the order of + // specification in the corresponding motion_models.proto file, i.e. + // id = proto_id - 1. + static float GetParameter(const Model& model, int id); + + // Sets parameter in a model agonstic way. Same interpretation of id as for + // GetParameter. + static void SetParameter(int id, float value, Model* model); + + // Returns normalization transform for specific frame size. Range after + // normalization is [0, 1]. + static Model NormalizationTransform(float frame_width, float frame_height); + + // Embeds a lower paramteric model into this model. + // Overload for specialization. + template + static Model Embed(const LowerModel& model); + + // Projects higher motion model H to lower one L such that for points x_i + // \sum_i || H * x - L * x|| == min + // For this we transform the models to the center of the specified frame + // domain at which degree's of freedom are usually independent. Details can be + // found in the individual implementation functions. + template + static Model ProjectFrom(const HigherModel& model, float frame_width, + float frame_height); +}; + +// Specialized implementations, with additional functionality if needed. 
+template <> +class ModelAdapter { + public: + static TranslationModel FromArgs(float dx, float dy); + static TranslationModel FromFloatPointer(const float* args, + bool identity_parametrization); + static TranslationModel FromDoublePointer(const double* args, + bool identity_parametrization); + + static Vector2_f TransformPoint(const TranslationModel& model, + const Vector2_f& pt); + static TranslationModel Invert(const TranslationModel& model); + static TranslationModel InvertChecked(const TranslationModel& model, + bool* success); + static TranslationModel Compose(const TranslationModel& lhs, + const TranslationModel& rhs); + static float GetParameter(const TranslationModel& model, int id); + static void SetParameter(int id, float value, TranslationModel* model); + + static std::string ToString(const TranslationModel& model); + + static constexpr int NumParameters() { return 2; } + + // Support to convert to and from affine. + static AffineModel ToAffine(const TranslationModel& model); + + // Fails with debug check, if affine model is not a translation. + static TranslationModel FromAffine(const AffineModel& model); + + static Homography ToHomography(const TranslationModel& model); + + // Fails with debug check if homography is not a translation. + static TranslationModel FromHomography(const Homography& model); + + // Evaluates Jacobian at specified point and parameters set to 0. + // Note: This is independent of whether identity parametrization was used + // or not via From*Pointer. + // Jacobian has to be of size 2 x NumParams(), and is returned in column + // major order. 
+ static void GetJacobianAtPoint(const Vector2_f& pt, float* jacobian); + + static TranslationModel NormalizationTransform(float frame_width, + float frame_height); + + static TranslationModel Embed(const TranslationModel& model) { return model; } + + static TranslationModel ProjectFrom(const TranslationModel& model, + float frame_width, float frame_height) { + return model; + } + + static TranslationModel ProjectFrom(const LinearSimilarityModel& model, + float frame_width, float frame_height); + + static TranslationModel ProjectFrom(const AffineModel& model, + float frame_width, float frame_height); + + static TranslationModel ProjectFrom(const Homography& model, + float frame_width, float frame_height); + + // Returns parameter wise maximum. + static TranslationModel Maximum(const TranslationModel& lhs, + const TranslationModel& rhs); + + // Return determinant of model. + static float Determinant(const TranslationModel& unused) { return 1; } +}; + +template <> +class ModelAdapter { + public: + static SimilarityModel FromArgs(float dx, float dy, float scale, + float rotation); + static SimilarityModel FromFloatPointer(const float* args, bool); + static SimilarityModel FromDoublePointer(const double* args, bool); + + static Vector2_f TransformPoint(const SimilarityModel& model, + const Vector2_f& pt); + static SimilarityModel Invert(const SimilarityModel& model); + static SimilarityModel InvertChecked(const SimilarityModel& model, + bool* success); + static SimilarityModel Compose(const SimilarityModel& lhs, + const SimilarityModel& rhs); + + static float GetParameter(const SimilarityModel& model, int id); + static void SetParameter(int id, float value, SimilarityModel* model); + + static std::string ToString(const SimilarityModel& model); + + static constexpr int NumParameters() { return 4; } + + static SimilarityModel NormalizationTransform(float frame_width, + float frame_height); + + static TranslationModel ProjectToTranslation(const SimilarityModel& model, + 
float frame_width, + float frame_height); +}; + +template <> +class ModelAdapter { + public: + static LinearSimilarityModel FromArgs(float dx, float dy, float a, float b); + static LinearSimilarityModel FromFloatPointer(const float* args, + bool identity_parametrization); + static LinearSimilarityModel FromDoublePointer(const double* args, + bool identity_parametrization); + + static Vector2_f TransformPoint(const LinearSimilarityModel& model, + const Vector2_f& pt); + static LinearSimilarityModel Invert(const LinearSimilarityModel& model); + static LinearSimilarityModel InvertChecked(const LinearSimilarityModel& model, + bool* success); + + static LinearSimilarityModel Compose(const LinearSimilarityModel& lhs, + const LinearSimilarityModel& rhs); + + static float GetParameter(const LinearSimilarityModel& model, int id); + static void SetParameter(int id, float value, LinearSimilarityModel* model); + + static std::string ToString(const LinearSimilarityModel& model); + + static constexpr int NumParameters() { return 4; } + + // Support to convert to and from affine. + static AffineModel ToAffine(const LinearSimilarityModel& model); + + static Homography ToHomography(const LinearSimilarityModel& model); + + // Fails with debug check, if homography is not a similarity. + static LinearSimilarityModel FromHomography(const Homography& model); + + // Fails with debug check, if affine model is not a similarity. + static LinearSimilarityModel FromAffine(const AffineModel& model); + + // Conversion from and to non-linear similarity. + static SimilarityModel ToSimilarity(const LinearSimilarityModel& model); + static LinearSimilarityModel FromSimilarity(const SimilarityModel& model); + + // Additional functionality: + + // Composes scaled identity transform with model (sI * model). + static LinearSimilarityModel ScaleParameters( + const LinearSimilarityModel& model, float scale); + + // Adds identity I to model (I + model). 
+ static LinearSimilarityModel AddIdentity(const LinearSimilarityModel& model); + + // Evaluates Jacobian at specified point and parameters set to 0. + // Note: This is independent of whether identity parametrization was used + // or not via From*Pointer. + // Jacobian has to be of size 2 x NumParams(), and is returned in column + // major order. + static void GetJacobianAtPoint(const Vector2_f& pt, float* jacobian); + + static LinearSimilarityModel NormalizationTransform(float frame_width, + float frame_height); + + static LinearSimilarityModel Embed(const TranslationModel& model) { + return FromArgs(model.dx(), model.dy(), 1, 0); + } + + static LinearSimilarityModel Embed(const LinearSimilarityModel& model) { + return model; + } + + static TranslationModel ProjectToTranslation( + const LinearSimilarityModel& model, float frame_width, + float frame_height); + + static LinearSimilarityModel ProjectFrom(const LinearSimilarityModel& model, + float frame_width, + float frame_height) { + return model; + } + + static LinearSimilarityModel ProjectFrom(const AffineModel& model, + float frame_width, + float frame_height); + + static LinearSimilarityModel ProjectFrom(const Homography& model, + float frame_width, + float frame_height); + // Returns parameter wise maximum. + static LinearSimilarityModel Maximum(const LinearSimilarityModel& lhs, + const LinearSimilarityModel& rhs); + + // Return determinant of model. 
+ static float Determinant(const LinearSimilarityModel& m) { + return m.a() * m.a() + m.b() * m.b(); + } +}; + +template <> +class ModelAdapter { + public: + static AffineModel FromArgs(float dx, float dy, float a, float b, float c, + float d); + static AffineModel FromFloatPointer(const float* args, + bool identity_parametrization); + static AffineModel FromDoublePointer(const double* args, + bool identity_parametrization); + + static Vector2_f TransformPoint(const AffineModel& model, + const Vector2_f& pt); + static AffineModel Invert(const AffineModel& model); + static AffineModel InvertChecked(const AffineModel& model, bool* success); + static AffineModel Compose(const AffineModel& lhs, const AffineModel& rhs); + + static float GetParameter(const AffineModel& model, int id); + static void SetParameter(int id, float value, AffineModel* model); + + static std::string ToString(const AffineModel& model); + + static constexpr int NumParameters() { return 6; } + + // Support to convert to and from affine. + static AffineModel ToAffine(const AffineModel& model) { return model; } + + static AffineModel FromAffine(const AffineModel& model) { return model; } + + static Homography ToHomography(const AffineModel& model); + + // Fails with debug check, if homography is not affine. + static AffineModel FromHomography(const Homography& model); + + // Additional functionality: + + // Composes scaled identity transform with model (sI * model). + static AffineModel ScaleParameters(const AffineModel& model, float scale); + + // Adds identity I to model (I + model). + static AffineModel AddIdentity(const AffineModel& model); + + // Evaluates Jacobian at specified point and parameters set to 0. + // Note: This is independent of whether identity parametrization was used + // or not via From*Pointer. + // Jacobian has to be of size 2 x NumParams(), and is returned in column + // major order. 
+ static void GetJacobianAtPoint(const Vector2_f& pt, float* jacobian); + + static AffineModel NormalizationTransform(float frame_width, + float frame_height); + + static AffineModel Embed(const TranslationModel& model) { + return FromArgs(model.dx(), model.dy(), 1, 0, 0, 1); + } + + static AffineModel Embed(const LinearSimilarityModel& model) { + return FromArgs(model.dx(), model.dy(), model.a(), -model.b(), model.b(), + model.a()); + } + + static AffineModel Embed(const AffineModel& model) { return model; } + + static AffineModel ProjectFrom(const AffineModel& model, float frame_width, + float frame_height) { + return model; + } + + static AffineModel ProjectFrom(const Homography& model, float frame_width, + float frame_height); + + static LinearSimilarityModel ProjectToLinearSimilarity( + const AffineModel& model, float frame_width, float frame_height); + + // Returns parameter wise maximum. + static AffineModel Maximum(const AffineModel& lhs, const AffineModel& rhs); + + static float Determinant(const AffineModel& m) { + return m.a() * m.d() - m.b() * m.c(); + } +}; + +template <> +class ModelAdapter { + public: + static Homography FromArgs(float h_00, float h_01, float h_02, float h_10, + float h_11, float h_12, float h_20, float h_21); + + static Homography FromFloatPointer(const float* args, + bool identity_parametrization); + static Homography FromDoublePointer(const double* args, + bool identity_parametrization); + + static Vector2_f TransformPoint(const Homography& model, const Vector2_f& pt); + + static Vector3_f TransformPoint3(const Homography& model, + const Vector3_f& pt); + + static Homography Invert(const Homography& model); + static Homography InvertChecked(const Homography& model, bool* success); + static Homography Compose(const Homography& lhs, const Homography& rhs); + + static float GetParameter(const Homography& model, int id); + static void SetParameter(int id, float value, Homography* model); + + static std::string ToString(const 
Homography& model); + + static constexpr int NumParameters() { return 8; } + + static bool IsAffine(const Homography& model); + + // Support to convert to and from affine. Debug check that model is actually + // an affine model. + static AffineModel ToAffine(const Homography& model); + static Homography FromAffine(const AffineModel& model); + + static Homography ToHomography(const Homography& model) { return model; } + static Homography FromHomography(const Homography& model) { return model; } + + // Additional functionality: + // Evaluates Jacobian at specified point and parameters set to 0. + // Note: This is independent of whether identity parametrization was used + // or not via From*Pointer. + // Jacobian has to be of size 2 x NumParams(), and is returned in column + // major order. + static void GetJacobianAtPoint(const Vector2_f& pt, float* jacobian); + + static Homography NormalizationTransform(float frame_width, + float frame_height); + + static Homography Embed(const Homography& model) { return model; } + + static Homography Embed(const AffineModel& model) { + return FromArgs(model.a(), model.b(), model.dx(), model.c(), model.d(), + model.dy(), 0, 0); + } + + static Homography Embed(const LinearSimilarityModel& model) { + return FromArgs(model.a(), -model.b(), model.dx(), model.b(), model.a(), + model.dy(), 0, 0); + } + + static Homography Embed(const TranslationModel& model) { + return FromArgs(1, 0, model.dx(), 0, 1, model.dy(), 0, 0); + } + + static float Determinant(const Homography& m) { + // Applying laplace formula to last column. + // h_00 h_01 h_02 + // h_10 h_11 h_12 + // h_20 h_21 1 <-- apply laplace. 
+ return m.h_20() * (m.h_01() * m.h_12() - m.h_11() * m.h_02()) + + -m.h_21() * (m.h_00() * m.h_12() - m.h_10() * m.h_02()) + + 1.0f * (m.h_00() * m.h_11() - m.h_10() * m.h_01()); + } + + static AffineModel ProjectToAffine(const Homography& model, float frame_width, + float frame_height); +}; + +// Common algorithms implemented using corresponding ModelAdapter. +// Implemented in cc file, explicitly instantiated below. +template +class ModelMethods { + typedef ModelAdapter Adapter; + + public: + // Returns _normalized_ intersection area of rectangle transformed by + // model_1 and model_2, respectively. + static float NormalizedIntersectionArea(const Model& model_1, + const Model& model_2, + const Vector2_f& rect); +}; + +// Traits to bind mixture and corresponding base model together. +template +struct MixtureTraits { + typedef BaseModel base_model; + typedef MixtureModel model; +}; + +typedef MixtureTraits + LinearSimilarityTraits; +typedef MixtureTraits AffineTraits; +typedef MixtureTraits HomographyTraits; + +template +class MixtureModelAdapterBase { + public: + typedef typename MixtureTraits::model MixtureModel; + typedef typename MixtureTraits::base_model BaseModel; + typedef ModelAdapter BaseModelAdapter; + + // Initializes a model from vector to data. All weights are set to one. + // If identity_parametrization is set, it is assumed that for args = 0 -> + // Model = identity, else args = 0 -> zero transform. + // Adjacent models are assumed to be separated by + // NumParameters() + skip elements. + static MixtureModel FromFloatPointer(const float* args, + bool identity_parametrization, int skip, + int num_models); + static MixtureModel FromDoublePointer(const double* args, + bool identity_parametrization, int skip, + int num_models); + + // Mixture models are not closed under composition and inversion. + // Instead, each point has to be transformed via above functions. 
+ // However, a mixture model can be composed with a BaseModel from either left + // or right, by component-wise composition. + // Returns mixture_model * base_model. + static MixtureModel ComposeRight(const MixtureModel& mixture_model, + const BaseModel& base_model); + + // Returns base_model * mixture_model. + static MixtureModel ComposeLeft(const MixtureModel& mixture_model, + const BaseModel& base_model); + + // Debugging function to create plots. Output parameters separated by delim. + static std::string ToString(const MixtureModel& model, std::string delim); + + // Returns total number of DOF (number of models * BaseModel DOF) + static int NumParameters(const MixtureModel& model) { + return model.model_size() * BaseModelAdapter::NumParameters(); + } + + // Access parameters in a model agnostic way. Order is the order of + // specification in the corresponding motion_models.proto file, i.e. + // id = proto_id - 1. + static float GetParameter(const MixtureModel& model, int model_id, + int param_id); + + static void SetParameter(int model_id, int param_id, float value, + MixtureModel* model); + + static MixtureModel IdentityModel(int num_mixtures); + + // Returns average model across mixture, i.e. mean of each parameter across + // the mixture. + static BaseModel MeanModel(const MixtureModel& model); + + // Fits a linear model to each parameter across mixture and returns mixture + // evaluated across line. 
+ static MixtureModel LinearModel(const MixtureModel& model); + + static MixtureModel Embed(const BaseModel& base_model, int num_mixtures); +}; + +class MixtureRowWeights; + +template +class MixtureModelAdapter : public MixtureModelAdapterBase { + public: + typedef typename MixtureModelAdapterBase::MixtureModel + MixtureModel; + typedef typename MixtureModelAdapterBase::BaseModel BaseModel; + typedef ModelAdapter BaseModelAdapter; + + // Returns convex combination of models from supplied mixture_model, + // specifically: + // \sum_i mixture_model.model(i) * weights[i] + // where: + // b) weights[i] need to be normalized to sum to one. + static BaseModel ToBaseModel(const MixtureModel& mixture_model, + const float* weights); + + // Transforms points according ToBaseModel(model, weights) * pt; + // Note: Weights need to sum to one (not checked). + static Vector2_f TransformPoint(const MixtureModel& model, + const float* weights, const Vector2_f& pt); + + static Vector2_f TransformPoint(const MixtureModel& model, + const MixtureRowWeights& weights, + const Vector2_f& pt); + + // Transforms / solves for points according to + // ToBaseModel(model, weights)^(-1) * pt + // Fails with CHECK if model is not invertible. + // Note: Weights need to sum to one (not checked). + static Vector2_f SolveForPoint(const MixtureModel& model, + const float* weights, const Vector2_f& pt); + + // Same as above, indicating if model is invertible in parameter + // success. If model is not invertible, passed parameter + // pt is returned unchanged. + static Vector2_f SolveForPointChecked(const MixtureModel& model, + const float* weights, + const Vector2_f& pt, bool* success); +}; + +// Re-implemented for speed benefits. 
+template <> +class MixtureModelAdapter + : public MixtureModelAdapterBase { + public: + inline static Homography ToBaseModel(const MixtureHomography& model, + const float* weights); + inline static Vector2_f TransformPoint(const MixtureHomography& model, + const float* weights, + const Vector2_f& pt); + // Overload. OK as only input format for weights changed. + inline static Vector2_f TransformPoint(const MixtureHomography& model, + const MixtureRowWeights& weights, + const Vector2_f& pt); + + inline static Vector2_f SolveForPoint(const MixtureHomography& model, + const float* weights, + const Vector2_f& pt); + + inline static Vector2_f SolveForPointChecked(const MixtureModel& model, + const float* weights, + const Vector2_f& pt, + bool* success); +}; + +// Compositing for multiple models in order of argument list. +template +Model ModelCompose2(const Model& a, const Model& b) { + typedef ModelAdapter Adapter; + return Adapter::Compose(a, b); +} + +template +Model ModelCompose3(const Model& a, const Model& b, const Model& c) { + typedef ModelAdapter Adapter; + return Adapter::Compose(a, Adapter::Compose(b, c)); +} + +template +Model ModelCompose4(const Model& a, const Model& b, const Model& c, + const Model& d) { + typedef ModelAdapter Adapter; + return Adapter::Compose(a, Adapter::Compose(b, Adapter::Compose(c, d))); +} + +template +Model ModelInvert(const Model& model) { + typedef ModelAdapter Adapter; + return Adapter::Invert(model); +} + +// Returns model according to b^(-1) * a +template +Model ModelDiff(const Model& a, const Model& b) { + typedef ModelAdapter Adapter; + return Adapter::Compose(Adapter::Invert(b), a); +} + +template +Model ModelDiffChecked(const Model& a, const Model& b, bool* success) { + typedef ModelAdapter Adapter; + Model b_inv = Adapter::InvertChecked(b, success); + return Adapter::Compose(b_inv, a); +} + +template +Vector2_f TransformPoint(const Model& m, const Vector2_f& v) { + return ModelAdapter::TransformPoint(m, v); +} + +// 
Epsilon threshold for determinant. Below this threshold we consider +// the linear model to be non-invertible. +const float kDetInvertibleEps = 1e-10; + +// Threshold for stability. Used to determine if a particular motion model +// is invertible AND likely to be stable after inversion (imposes higher +// threshold on determinant than just for invertibility). +const float kDetStableEps = 1e-2; + +template +bool IsInverseStable(const Model& model) { + return ModelAdapter::Determinant(model) > kDetStableEps; +} + +// Accumulates camera motions in accum: +// If motions for frames 1..N are: F1, F2, .. FN, where Fk is the motion that +// maps frame k to k-1 (backwards motion), then cumulative motion mapping frame +// N to 0 is: +// C = F1 * F2 * ... * FN. +// +// This function computes it recursively: C(k) = C(k-1) * Fk. +template +void AccumulateModel(const Model& model, Model* accum) { + *accum = ModelCompose2(*accum, model); +} + +// Accumulates inverse camera motions in accum_inverted: +// We want to compute the inverse motion that maps frame 0 to frame N: +// C^-1 = FN^-1 * .... * F2^-1 * F1^-1. +// (inverse of C defined above for AccumulateModel). +// +// This function computes it recursively: C(k)^-1 = Fk^-1 * C(k-1)^-1. +// +// Return value indicates accumulation was successful (it might fail if model +// is not invertible), otherwise accum_inverted is left unchanged. +template +bool AccumulateInvertedModel(const Model& model, Model* accum_inverted) { + bool success = true; + const Model inv_model = ModelAdapter::InvertChecked(model, &success); + if (success) { + *accum_inverted = ModelCompose2(inv_model, *accum_inverted); + } + return success; +} + +// Returns true if |predicted * ground_truth^(-1)| < bounds (element wise). +// Use UniformModel to initialize bounds. 
+template +bool ModelDiffWithinBounds(const Model& ground_truth, const Model& predicted, + const Model& bounds) { + Model diff = ModelAdapter::Compose( + predicted, ModelAdapter::Invert(ground_truth)); + Model identity; + for (int p = 0; p < ModelAdapter::NumParameters(); ++p) { + const float bound = ModelAdapter::GetParameter(bounds, p); + const float diff_p = fabs(ModelAdapter::GetParameter(diff, p) - + ModelAdapter::GetParameter(identity, p)); + + if (diff_p > bound) { + LOG(WARNING) << "Param diff " << p << " out of bounds: " << diff_p + << " > " << bound << " bound"; + return false; + } + } + return true; +} + +// Returns true if model is identity within floating point accuracy. +template +bool IsModelIdentity(const Model& model) { + Model identity; + for (int p = 0; p < ModelAdapter::NumParameters(); ++p) { + const float diff_p = fabs(ModelAdapter::GetParameter(model, p) - + ModelAdapter::GetParameter(identity, p)); + + if (diff_p > 1e-6f) { + return false; + } + } + return true; +} + +// Expresses input model M w.r.t. new domain given by LinearSimilarityTransform +// (scale 0 0 +// 0 scale 0) := S -> S M S^(-1) +template +Model CoordinateTransform(const Model& model, float scale) { + return CoordinateTransform( + model, ModelAdapter::FromArgs(0, 0, scale, 0)); +} + +// For model M and similarity S returns +// S * M * S^(-1). +template +Model CoordinateTransform(const Model& model, + const LinearSimilarityModel& similarity) { + return ModelCompose3(ModelAdapter::Embed(similarity), model, + ModelAdapter::Embed(ModelInvert(similarity))); +} + +// Returns a model with all parameters set to value. +template +Model UniformModelParameters(const float value) { + std::array::NumParameters()> params; + params.fill(value); + return ModelAdapter::FromFloatPointer(params.data(), false); +} + +// Returns a blended model: a * (1 - weight_b) + b * weight_b. +// Assumes 0 <= weight_b <= 1. 
+// Note that blending the homographies is a non-linear operation if the +// intention is to obtain a transform that blends the points transformed by +// a and b. However, this is a linear approximation, which ignores the +// perspective division, and simply blends the coefficients. +template +Model BlendModels(const Model& a, const Model& b, float weight_b) { + Model blended; + DCHECK_GE(weight_b, 0); + DCHECK_LE(weight_b, 1); + const float weight_a = 1 - weight_b; + for (int p = 0; p < ModelAdapter::NumParameters(); ++p) { + const float pa = ModelAdapter::GetParameter(a, p); + const float pb = ModelAdapter::GetParameter(b, p); + ModelAdapter::SetParameter(p, pa * weight_a + pb * weight_b, + &blended); + } + return blended; +} + +template +std::string ModelToString(const Model& model) { + return ModelAdapter::ToString(model); +} + +// Typedef's. +typedef ModelAdapter TranslationAdapter; +typedef ModelAdapter SimilarityAdapter; +typedef ModelAdapter LinearSimilarityAdapter; +typedef ModelAdapter AffineAdapter; +typedef ModelAdapter HomographyAdapter; + +typedef ModelMethods TranslationMethods; +typedef ModelMethods SimilarityMethods; +typedef ModelMethods LinearSimilarityMethods; +typedef ModelMethods AffineMethods; +typedef ModelMethods HomographyMethods; + +typedef MixtureModelAdapter + MixtureLinearSimilarityAdapter; +typedef MixtureModelAdapter MixtureAffineAdapter; +typedef MixtureModelAdapter MixtureHomographyAdapter; + +// Stores pre-computed normalized mixture weights. +// Weights are computed for each scanline, +// based on gaussian weighting of y-location to mid-points for each model +// (specified in scanlines!). +// We use even spacing between mid-points by default. +// By supplying a y_scale != 1.f, normalized coordinates can be used as input. +// Possible unnormalized y values for RowWeigts are +// [-margin, frame_height + margin). 
+class MixtureRowWeights { + public: + MixtureRowWeights(int frame_height, int margin, float sigma, float y_scale, + int num_models); + int NumModels() const { return num_models_; } + float YScale() const { return y_scale_; } + float Sigma() const { return sigma_; } + + // Test if MixtureRowWeights should be re-initialized (call constructor + // again), based on changed options. + bool NeedsInitialization(int num_models, float sigma, float y_scale) const { + return (num_models != num_models_ || fabs(sigma - sigma_) > 1e-6f || + fabs(y_scale - y_scale_) > 1e-6f); + } + + const float* RowWeights(float y) const { + int bin_y = y * y_scale_ + 0.5; + DCHECK_LT(bin_y, frame_height_ + margin_); + DCHECK_GE(bin_y, -margin_); + return &weights_[(bin_y + margin_) * num_models_]; + } + + // Same as above but clamps parameter y to be within interval + // (-margin, frame_height + margin). + const float* RowWeightsClamped(float y) const { + int bin_y = y * y_scale_ + 0.5; + bin_y = std::max(-margin_, std::min(frame_height_ - 1 + margin_, bin_y)); + return &weights_[(bin_y + margin_) * num_models_]; + } + + // Returns weight threshold for fractional block distance, e.g. + // parameter 1.5f returns row weight at 1.5f * block_height from block center. + float WeightThreshold(float frac_blocks); + + private: + int frame_height_; + float y_scale_; + int margin_; + float sigma_; + int num_models_; + + std::vector mid_points_; + std::vector weights_; +}; + +// Returns pointer (caller takes ownership) of initialized MixtureRowWeights. +inline MixtureRowWeights* MixtureRowWeightsFromCameraMotion( + const CameraMotion& camera_motion, int frame_height) { + return new MixtureRowWeights(frame_height, + 0, // no margin. + camera_motion.mixture_row_sigma(), 1.0, + camera_motion.mixture_homography().model_size()); +} + +// Performs element wise smoothing of input models with per parameter sigma's +// in time (and optionally bilateral). 
Parameters of optional +// model_sigma that are NOT 0 are interpreted as bilateral smoothing sigma. +// Use UniformModelParameters to set all value of sigma_time to the same sigma. +template +void SmoothModels(const Model& sigma_time_model, const Model* model_sigma, + std::vector* models) { + CHECK(models); + + const int num_models = models->size(); + + std::vector> smoothed_model_data(num_models); + + for (int param = 0; param < ModelAdapter::NumParameters(); ++param) { + const float sigma_time = + ModelAdapter::GetParameter(sigma_time_model, param); + + if (sigma_time == 0) { + // Don't perform any smoothing, just copy. + for (int i = 0; i < num_models; ++i) { + smoothed_model_data[i].push_back( + ModelAdapter::GetParameter((*models)[i], param)); + } + continue; + } + + // Create lookup table for frame weights. + const int frame_radius = + std::min(num_models - 1, std::ceil(sigma_time * 1.5f)); + const int frame_diameter = 2 * frame_radius + 1; + + // Create lookup table for weights. + std::vector frame_weights(frame_diameter); + const float frame_coeff = -0.5f / (sigma_time * sigma_time); + + int frame_idx = 0; + for (int i = -frame_radius; i <= frame_radius; ++i, ++frame_idx) { + frame_weights[frame_idx] = std::exp(frame_coeff * i * i); + } + + // Create local copy with border. + std::vector param_path(num_models + 2 * frame_radius); + + const float param_sigma = + model_sigma != nullptr + ? ModelAdapter::GetParameter(*model_sigma, param) + : 0; + const float param_sigma_denom = + param_sigma != 0 ? (-0.5f / (param_sigma * param_sigma)) : 0; + + for (int model_idx = 0; model_idx < num_models; ++model_idx) { + param_path[model_idx + frame_radius] = + ModelAdapter::GetParameter((*models)[model_idx], param); + } + + // Copy right. + std::copy(param_path.rbegin() + frame_radius, + param_path.rbegin() + 2 * frame_radius, + param_path.end() - frame_radius); + + // Copy left. 
+ std::copy(param_path.begin() + frame_radius, + param_path.begin() + 2 * frame_radius, + param_path.rend() - frame_radius); + + // Apply filter. + for (int i = 0; i < num_models; ++i) { + float value_sum = 0; + float weight_sum = 0; + const float curr_value = param_path[i + frame_radius]; + + for (int k = 0; k < frame_diameter; ++k) { + const float value = param_path[i + k]; + float weight = frame_weights[k]; + if (param_sigma != 0) { + // Bilateral filtering. + const float value_diff = curr_value - value; + weight *= std::exp(value_diff * value_diff * param_sigma_denom); + } + weight_sum += weight; + value_sum += value * weight; + } + + // Weight_sum is always > 0, as sigma is > 0. + smoothed_model_data[i].push_back(value_sum / weight_sum); + } + } + + for (int i = 0; i < num_models; ++i) { + (*models)[i].CopyFrom(ModelAdapter::FromFloatPointer( + &smoothed_model_data[i][0], false)); + } +} + +// Inline implementations. + +// Translation model. +inline TranslationModel ModelAdapter::FromArgs(float dx, + float dy) { + TranslationModel model; + model.set_dx(dx); + model.set_dy(dy); + return model; +} + +inline TranslationModel ModelAdapter::FromFloatPointer( + const float* args, bool) { + DCHECK(args); + TranslationModel model; + model.set_dx(args[0]); + model.set_dy(args[1]); + return model; +} + +inline TranslationModel ModelAdapter::FromDoublePointer( + const double* args, bool) { + DCHECK(args); + TranslationModel model; + model.set_dx(args[0]); + model.set_dy(args[1]); + return model; +} + +inline Vector2_f ModelAdapter::TransformPoint( + const TranslationModel& model, const Vector2_f& pt) { + return Vector2_f(pt.x() + model.dx(), pt.y() + model.dy()); +} + +inline TranslationModel ModelAdapter::Invert( + const TranslationModel& model) { + bool success = true; + TranslationModel result = InvertChecked(model, &success); + if (!success) { + LOG(ERROR) << "Model not invertible. 
Returning identity."; + return TranslationModel(); + } + + return result; +} + +inline TranslationModel ModelAdapter::InvertChecked( + const TranslationModel& model, bool* success) { + TranslationModel inv_model; + inv_model.set_dx(-model.dx()); + inv_model.set_dy(-model.dy()); + *success = true; + return inv_model; +} + +inline TranslationModel ModelAdapter::Compose( + const TranslationModel& lhs, const TranslationModel& rhs) { + TranslationModel result; + result.set_dx(lhs.dx() + rhs.dx()); + result.set_dy(lhs.dy() + rhs.dy()); + return result; +} + +inline float ModelAdapter::GetParameter( + const TranslationModel& model, int id) { + switch (id) { + case 0: + return model.dx(); + case 1: + return model.dy(); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } + return 0; +} + +inline void ModelAdapter::SetParameter( + int id, float value, TranslationModel* model) { + switch (id) { + case 0: + return model->set_dx(value); + case 1: + return model->set_dy(value); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } +} + +// Linear Similarity model. +inline LinearSimilarityModel ModelAdapter::FromArgs( + float dx, float dy, float a, float b) { + LinearSimilarityModel model; + model.set_dx(dx); + model.set_dy(dy); + model.set_a(a); + model.set_b(b); + return model; +} + +inline LinearSimilarityModel +ModelAdapter::FromFloatPointer( + const float* args, bool identity_parametrization) { + DCHECK(args); + LinearSimilarityModel model; + const float id_shift = identity_parametrization ? 1.f : 0.f; + model.set_dx(args[0]); + model.set_dy(args[1]); + model.set_a(id_shift + args[2]); + model.set_b(args[3]); + return model; +} + +inline LinearSimilarityModel +ModelAdapter::FromDoublePointer( + const double* args, bool identity_parametrization) { + DCHECK(args); + LinearSimilarityModel model; + const float id_shift = identity_parametrization ? 
1.f : 0.f; + model.set_dx(args[0]); + model.set_dy(args[1]); + model.set_a(id_shift + args[2]); + model.set_b(args[3]); + return model; +} + +inline Vector2_f ModelAdapter::TransformPoint( + const LinearSimilarityModel& model, const Vector2_f& pt) { + return Vector2_f(model.a() * pt.x() - model.b() * pt.y() + model.dx(), + model.b() * pt.x() + model.a() * pt.y() + model.dy()); +} + +inline LinearSimilarityModel ModelAdapter::Invert( + const LinearSimilarityModel& model) { + bool success = true; + LinearSimilarityModel result = InvertChecked(model, &success); + if (!success) { + LOG(ERROR) << "Model not invertible. Returning identity."; + return LinearSimilarityModel(); + } else { + return result; + } +} + +inline LinearSimilarityModel ModelAdapter::InvertChecked( + const LinearSimilarityModel& model, bool* success) { + LinearSimilarityModel inv_model; + + const float det = model.a() * model.a() + model.b() * model.b(); + if (fabs(det) < kDetInvertibleEps) { + *success = false; + VLOG(1) << "Model is not invertible, det is zero."; + return LinearSimilarityModel(); + } + + *success = true; + const float inv_det = 1.0 / det; + + inv_model.set_a(model.a() * inv_det); + inv_model.set_b(-model.b() * inv_det); + + // Inverse translation is -A^(-1) * [dx dy]. 
+ inv_model.set_dx(-(inv_model.a() * model.dx() - inv_model.b() * model.dy())); + inv_model.set_dy(-(inv_model.b() * model.dx() + inv_model.a() * model.dy())); + + return inv_model; +} + +inline LinearSimilarityModel ModelAdapter::Compose( + const LinearSimilarityModel& lhs, const LinearSimilarityModel& rhs) { + LinearSimilarityModel result; + result.set_a(lhs.a() * rhs.a() - lhs.b() * rhs.b()); + result.set_b(lhs.a() * rhs.b() + lhs.b() * rhs.a()); + + result.set_dx(lhs.a() * rhs.dx() - lhs.b() * rhs.dy() + lhs.dx()); + result.set_dy(lhs.b() * rhs.dx() + lhs.a() * rhs.dy() + lhs.dy()); + return result; +} + +inline float ModelAdapter::GetParameter( + const LinearSimilarityModel& model, int id) { + switch (id) { + case 0: + return model.dx(); + case 1: + return model.dy(); + case 2: + return model.a(); + case 3: + return model.b(); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } + + return 0; +} + +inline void ModelAdapter::SetParameter( + int id, float value, LinearSimilarityModel* model) { + switch (id) { + case 0: + return model->set_dx(value); + case 1: + return model->set_dy(value); + case 2: + return model->set_a(value); + case 3: + return model->set_b(value); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } +} + +// Affine model. +inline AffineModel ModelAdapter::FromArgs(float dx, float dy, + float a, float b, + float c, float d) { + AffineModel model; + model.set_dx(dx); + model.set_dy(dy); + model.set_a(a); + model.set_b(b); + model.set_c(c); + model.set_d(d); + return model; +} + +inline AffineModel ModelAdapter::FromFloatPointer( + const float* args, bool identity_parametrization) { + DCHECK(args); + AffineModel model; + const float id_shift = identity_parametrization ? 
1.f : 0.f; + model.set_dx(args[0]); + model.set_dy(args[1]); + model.set_a(id_shift + args[2]); + model.set_b(args[3]); + model.set_c(args[4]); + model.set_d(id_shift + args[5]); + return model; +} + +inline AffineModel ModelAdapter::FromDoublePointer( + const double* args, bool identity_parametrization) { + DCHECK(args); + AffineModel model; + const float id_shift = identity_parametrization ? 1.f : 0.f; + model.set_dx(args[0]); + model.set_dy(args[1]); + model.set_a(id_shift + args[2]); + model.set_b(args[3]); + model.set_c(args[4]); + model.set_d(id_shift + args[5]); + return model; +} + +inline Vector2_f ModelAdapter::TransformPoint( + const AffineModel& model, const Vector2_f& pt) { + return Vector2_f(model.a() * pt.x() + model.b() * pt.y() + model.dx(), + model.c() * pt.x() + model.d() * pt.y() + model.dy()); +} + +// Use of Invert is discouraged, always use InvertChecked. +inline AffineModel ModelAdapter::Invert(const AffineModel& model) { + bool success = true; + AffineModel result = InvertChecked(model, &success); + if (!success) { + LOG(ERROR) << "Model not invertible. Returning identity."; + return AffineModel(); + } else { + return result; + } +} + +inline AffineModel ModelAdapter::InvertChecked( + const AffineModel& model, bool* success) { + AffineModel inv_model; + const float det = model.a() * model.d() - model.b() * model.c(); + if (fabs(det) < kDetInvertibleEps) { + *success = false; + VLOG(1) << "Model is not invertible, det is zero."; + return AffineModel(); + } + + *success = true; + const float inv_det = 1.0 / det; + + inv_model.set_a(model.d() * inv_det); + inv_model.set_d(model.a() * inv_det); + inv_model.set_c(-model.c() * inv_det); + inv_model.set_b(-model.b() * inv_det); + + // Inverse translation is -A^(-1) * [dx dy]. 
+ inv_model.set_dx(-(inv_model.a() * model.dx() + inv_model.b() * model.dy())); + inv_model.set_dy(-(inv_model.c() * model.dx() + inv_model.d() * model.dy())); + + return inv_model; +} + +inline AffineModel ModelAdapter::Compose(const AffineModel& lhs, + const AffineModel& rhs) { + AffineModel result; + result.set_a(lhs.a() * rhs.a() + lhs.b() * rhs.c()); + result.set_b(lhs.a() * rhs.b() + lhs.b() * rhs.d()); + result.set_c(lhs.c() * rhs.a() + lhs.d() * rhs.c()); + result.set_d(lhs.c() * rhs.b() + lhs.d() * rhs.d()); + + result.set_dx(lhs.a() * rhs.dx() + lhs.b() * rhs.dy() + lhs.dx()); + result.set_dy(lhs.c() * rhs.dx() + lhs.d() * rhs.dy() + lhs.dy()); + return result; +} + +inline float ModelAdapter::GetParameter(const AffineModel& model, + int id) { + switch (id) { + case 0: + return model.dx(); + case 1: + return model.dy(); + case 2: + return model.a(); + case 3: + return model.b(); + case 4: + return model.c(); + case 5: + return model.d(); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } + + return 0; +} + +inline void ModelAdapter::SetParameter(int id, float value, + AffineModel* model) { + switch (id) { + case 0: + return model->set_dx(value); + case 1: + return model->set_dy(value); + case 2: + return model->set_a(value); + case 3: + return model->set_b(value); + case 4: + return model->set_c(value); + case 5: + return model->set_d(value); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } +} + +// Homography model. 
+inline Homography ModelAdapter::FromArgs(float h_00, float h_01, + float h_02, float h_10, + float h_11, float h_12, + float h_20, float h_21) { + Homography model; + model.set_h_00(h_00); + model.set_h_01(h_01); + model.set_h_02(h_02); + model.set_h_10(h_10); + model.set_h_11(h_11); + model.set_h_12(h_12); + model.set_h_20(h_20); + model.set_h_21(h_21); + return model; +} + +inline Homography ModelAdapter::FromFloatPointer( + const float* args, bool identity_parametrization) { + DCHECK(args); + Homography model; + const float id_shift = identity_parametrization ? 1.f : 0.f; + model.set_h_00(id_shift + args[0]); + model.set_h_01(args[1]); + model.set_h_02(args[2]); + model.set_h_10(args[3]); + model.set_h_11(id_shift + args[4]); + model.set_h_12(args[5]); + model.set_h_20(args[6]); + model.set_h_21(args[7]); + return model; +} + +inline Homography ModelAdapter::FromDoublePointer( + const double* args, bool identity_parametrization) { + DCHECK(args); + Homography model; + const float id_shift = identity_parametrization ? 1.f : 0.f; + model.set_h_00(id_shift + args[0]); + model.set_h_01(args[1]); + model.set_h_02(args[2]); + model.set_h_10(args[3]); + model.set_h_11(id_shift + args[4]); + model.set_h_12(args[5]); + model.set_h_20(args[6]); + model.set_h_21(args[7]); + return model; +} + +inline Vector2_f ModelAdapter::TransformPoint( + const Homography& model, const Vector2_f& pt) { + const float x = model.h_00() * pt.x() + model.h_01() * pt.y() + model.h_02(); + const float y = model.h_10() * pt.x() + model.h_11() * pt.y() + model.h_12(); + float z = model.h_20() * pt.x() + model.h_21() * pt.y() + 1.0f; + + if (z != 1.f) { + // Enforce z can not assume very small values. + constexpr float eps = 1e-12f; + if (fabs(z) < eps) { + LOG(ERROR) << "Point mapped to infinity. " + << "Degenerate homography. See proto."; + z = z >= 0 ? 
eps : -eps; + } + return Vector2_f(x / z, y / z); + } else { + return Vector2_f(x, y); + } +} + +inline Vector3_f ModelAdapter::TransformPoint3( + const Homography& model, const Vector3_f& pt) { + return Vector3_f( + model.h_00() * pt.x() + model.h_01() * pt.y() + model.h_02() * pt.z(), + model.h_10() * pt.x() + model.h_11() * pt.y() + model.h_12() * pt.z(), + model.h_20() * pt.x() + model.h_21() * pt.y() + pt.z()); +} + +inline Homography ModelAdapter::Invert(const Homography& model) { + bool success = true; + Homography result = InvertChecked(model, &success); + if (!success) { + LOG(ERROR) << "Model not invertible. Returning identity."; + return Homography(); + } else { + return result; + } +} + +inline Homography ModelAdapter::Compose(const Homography& lhs, + const Homography& rhs) { + Homography result; + const float z = + lhs.h_20() * rhs.h_02() + lhs.h_21() * rhs.h_12() + 1.0f * 1.0f; + CHECK_NE(z, 0) << "Degenerate homography. See proto."; + const float inv_z = 1.0 / z; + + result.set_h_00((lhs.h_00() * rhs.h_00() + lhs.h_01() * rhs.h_10() + + lhs.h_02() * rhs.h_20()) * + inv_z); + result.set_h_01((lhs.h_00() * rhs.h_01() + lhs.h_01() * rhs.h_11() + + lhs.h_02() * rhs.h_21()) * + inv_z); + result.set_h_02( + (lhs.h_00() * rhs.h_02() + lhs.h_01() * rhs.h_12() + lhs.h_02() * 1.0f) * + inv_z); + + result.set_h_10((lhs.h_10() * rhs.h_00() + lhs.h_11() * rhs.h_10() + + lhs.h_12() * rhs.h_20()) * + inv_z); + result.set_h_11((lhs.h_10() * rhs.h_01() + lhs.h_11() * rhs.h_11() + + lhs.h_12() * rhs.h_21()) * + inv_z); + result.set_h_12( + (lhs.h_10() * rhs.h_02() + lhs.h_11() * rhs.h_12() + lhs.h_12() * 1.0f) * + inv_z); + + result.set_h_20( + (lhs.h_20() * rhs.h_00() + lhs.h_21() * rhs.h_10() + 1.0f * rhs.h_20()) * + inv_z); + result.set_h_21( + (lhs.h_20() * rhs.h_01() + lhs.h_21() * rhs.h_11() + 1.f * rhs.h_21()) * + inv_z); + return result; +} + +inline float ModelAdapter::GetParameter(const Homography& model, + int id) { + switch (id) { + case 0: + return 
model.h_00(); + case 1: + return model.h_01(); + case 2: + return model.h_02(); + case 3: + return model.h_10(); + case 4: + return model.h_11(); + case 5: + return model.h_12(); + case 6: + return model.h_20(); + case 7: + return model.h_21(); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } + + return 0; +} + +inline void ModelAdapter::SetParameter(int id, float value, + Homography* model) { + switch (id) { + case 0: + return model->set_h_00(value); + case 1: + return model->set_h_01(value); + case 2: + return model->set_h_02(value); + case 3: + return model->set_h_10(value); + case 4: + return model->set_h_11(value); + case 5: + return model->set_h_12(value); + case 6: + return model->set_h_20(value); + case 7: + return model->set_h_21(value); + default: + LOG(FATAL) << "Parameter id is out of bounds"; + } +} + +// MixtureModelAdapterBase implementation. +template +typename MixtureTraits::model +MixtureModelAdapterBase::FromFloatPointer( + const float* args, bool identity_parametrization, int skip, + int num_models) { + MixtureModel model; + const float* arg_ptr = args; + for (int i = 0; i < num_models; + ++i, arg_ptr += BaseModelAdapter::NumParameters() + skip) { + BaseModel base = + BaseModelAdapter::FromFloatPointer(arg_ptr, identity_parametrization); + model.add_model()->CopyFrom(base); + } + + return model; +} + +template +typename MixtureTraits::model +MixtureModelAdapterBase::FromDoublePointer( + const double* args, bool identity_parametrization, int skip, + int num_models) { + MixtureModel model; + const double* arg_ptr = args; + for (int i = 0; i < num_models; + ++i, arg_ptr += BaseModelAdapter::NumParameters() + skip) { + BaseModel base = + BaseModelAdapter::FromDoublePointer(arg_ptr, identity_parametrization); + model.add_model()->CopyFrom(base); + } + + return model; +} + +template +typename MixtureTraits::model +MixtureModelAdapterBase::ComposeRight( + const MixtureModel& mixture_model, const BaseModel& base_model) { + const int 
num_models = mixture_model.model_size(); + MixtureModel result; + for (int m = 0; m < num_models; ++m) { + result.add_model()->CopyFrom( + BaseModelAdapter::Compose(mixture_model.model(m), base_model)); + } + return result; +} + +template +typename MixtureTraits::model +MixtureModelAdapterBase::ComposeLeft( + const MixtureModel& mixture_model, const BaseModel& base_model) { + const int num_models = mixture_model.model_size(); + MixtureModel result; + for (int m = 0; m < num_models; ++m) { + result.add_model()->CopyFrom( + BaseModelAdapter::Compose(base_model, mixture_model.model(m))); + } + return result; +} + +template +std::string MixtureModelAdapterBase::ToString( + const MixtureModel& model, std::string delim) { + std::string result = ""; + for (int m = 0, size = model.model_size(); m < size; ++m) { + result += + (m == 0 ? "" : delim) + BaseModelAdapter::ToString(model.model(m)); + } + return result; +} + +template +float MixtureModelAdapterBase::GetParameter( + const MixtureModel& model, int model_id, int param_id) { + return BaseModelAdapter::GetParameter(model.model(model_id), param_id); +} + +template +void MixtureModelAdapterBase::SetParameter(int model_id, + int param_id, + float value, + MixtureModel* model) { + BaseModelAdapter::SetParameter(param_id, value, + model->mutable_model(model_id)); +} + +template +typename MixtureTraits::model +MixtureModelAdapterBase::IdentityModel(int num_mixtures) { + MixtureModel model; + for (int i = 0; i < num_mixtures; ++i) { + model.add_model(); + } + return model; +} + +template +typename MixtureTraits::base_model +MixtureModelAdapterBase::MeanModel( + const MixtureModel& mixture_model) { + const int num_models = mixture_model.model_size(); + if (num_models == 0) { + return BaseModel(); + } + + float params[BaseModelAdapter::NumParameters()]; + memset(params, 0, sizeof(params[0]) * BaseModelAdapter::NumParameters()); + + // Average of models. 
+ const float denom = 1.0f / num_models; + for (int k = 0; k < BaseModelAdapter::NumParameters(); ++k) { + for (int m = 0; m < num_models; ++m) { + params[k] += BaseModelAdapter::GetParameter(mixture_model.model(m), k); + } + params[k] *= denom; + } + + return BaseModelAdapter::FromFloatPointer(params, false); +} + +template +typename MixtureTraits::model +MixtureModelAdapterBase::LinearModel( + const MixtureModel& mixture_model) { + // For each parameter: Fit line param_idx -> param value. + const int num_models = mixture_model.model_size(); + if (num_models <= 1) { + return mixture_model; + } + + std::vector result(num_models * BaseModelAdapter::NumParameters()); + const double inv_models = 1.0f / num_models; + for (int p = 0; p < BaseModelAdapter::NumParameters(); ++p) { + // Calculate sum, sq_sum and inner product. + double sum_x = 0.0; + double sum_y = 0.0; + double sum_xx = 0.0; + double sum_yy = 0.0; + double sum_xy = 0.0; + for (int m = 0; m < num_models; ++m) { + const float x = m * inv_models; + sum_x += x; + sum_xx += x * x; + const double y = GetParameter(mixture_model, m, p); + sum_y += y; + sum_yy += y * y; + sum_xy += x * y; + } + + const double denom = sum_xx - inv_models * sum_x * sum_x; + CHECK_NE(denom, 0); // As num_models > 1. + const double a = (sum_xy - inv_models * sum_x * sum_y) * denom; + const double b = inv_models * (sum_y - a * sum_x); + + for (int m = 0; m < num_models; ++m) { + const float x = m * inv_models; + result[m * BaseModelAdapter::NumParameters() + p] = a * x + b; + } + } + MixtureModel result_model = + FromFloatPointer(&result[0], false, 0, num_models); + + return result_model; +} + +template +typename MixtureTraits::model MixtureModelAdapterBase::Embed( + const BaseModel& base_model, int num_mixtures) { + MixtureModel model; + for (int i = 0; i < num_mixtures; ++i) { + model.add_model()->CopyFrom(base_model); + } + return model; +} + +// MixtureModelAdapter implementation. 
+template +typename MixtureModelAdapterBase::BaseModel +MixtureModelAdapter::ToBaseModel( + const MixtureModel& mixture_model, const float* weights) { + const int num_models = mixture_model.model_size(); + + float params[BaseModelAdapter::NumParameters()]; + memset(params, 0, sizeof(params[0]) * BaseModelAdapter::NumParameters()); + + // Weighted combination of mixture models. + for (int m = 0; m < num_models; ++m) { + for (int k = 0; k < BaseModelAdapter::NumParameters(); ++k) { + params[k] += BaseModelAdapter::GetParameter(mixture_model.model(m), k) * + weights[m]; + } + } + + return BaseModelAdapter::FromFloatPointer(params, false); +} + +template +Vector2_f MixtureModelAdapter::TransformPoint( + const MixtureModel& model, const float* weights, const Vector2_f& pt) { + const int num_models = model.model_size(); + const Vector3_f pt3(pt.x(), pt.y(), 1.0f); + Vector3_f result(0, 0, 0); + for (int i = 0; i < num_models; ++i) { + result += + BaseModelAdapter::TransformPoint3(model.model(i), pt3 * weights[i]); + } + + DCHECK_NE(result.z(), 0) << "Degenerate mapping."; + return Vector2_f(result.x() / result.z(), result.y() / result.z()); +} + +template +Vector2_f MixtureModelAdapter::TransformPoint( + const MixtureModel& model, const MixtureRowWeights& weights, + const Vector2_f& pt) { + return TransformPoint(model, weights.RowWeightsClamped(pt.y()), pt); +} + +template +Vector2_f MixtureModelAdapter::SolveForPoint( + const MixtureModel& model, const float* weights, const Vector2_f& pt) { + BaseModel base_model = ToBaseModel(model, weights); + return BaseModelAdapter::TransformPoint(BaseModelAdapter::Invert(base_model), + pt); +} + +template +Vector2_f MixtureModelAdapter::SolveForPointChecked( + const MixtureModel& model, const float* weights, const Vector2_f& pt, + bool* success) { + BaseModel base_model = ToBaseModel(model, weights); + BaseModel inv_base_model = + BaseModelAdapter::InvertChecked(base_model, success); + return 
BaseModelAdapter::TransformPoint(inv_base_model, pt); +} + +// MixtureModelAdapter implementation. +inline Homography MixtureModelAdapter::ToBaseModel( + const MixtureHomography& mixture_model, const float* weights) { + const int num_models = mixture_model.model_size(); + + float params[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const Homography& const_homog = mixture_model.model(0); + + // Weighted combination of mixture models. + switch (mixture_model.dof()) { + case MixtureHomography::ALL_DOF: + for (int m = 0; m < num_models; ++m) { + params[0] += mixture_model.model(m).h_00() * weights[m]; + params[1] += mixture_model.model(m).h_01() * weights[m]; + params[2] += mixture_model.model(m).h_02() * weights[m]; + params[3] += mixture_model.model(m).h_10() * weights[m]; + params[4] += mixture_model.model(m).h_11() * weights[m]; + params[5] += mixture_model.model(m).h_12() * weights[m]; + params[6] += mixture_model.model(m).h_20() * weights[m]; + params[7] += mixture_model.model(m).h_21() * weights[m]; + } + break; + case MixtureHomography::TRANSLATION_DOF: + params[0] = const_homog.h_00(); + params[1] = const_homog.h_01(); + params[3] = const_homog.h_10(); + params[4] = const_homog.h_11(); + params[6] = const_homog.h_20(); + params[7] = const_homog.h_21(); + + for (int m = 0; m < num_models; ++m) { + params[2] += mixture_model.model(m).h_02() * weights[m]; + params[5] += mixture_model.model(m).h_12() * weights[m]; + } + break; + case MixtureHomography::SKEW_ROTATION_DOF: + params[0] = const_homog.h_00(); + params[4] = const_homog.h_11(); + params[6] = const_homog.h_20(); + params[7] = const_homog.h_21(); + for (int m = 0; m < num_models; ++m) { + params[1] += mixture_model.model(m).h_01() * weights[m]; + params[2] += mixture_model.model(m).h_02() * weights[m]; + params[3] += mixture_model.model(m).h_10() * weights[m]; + params[5] += mixture_model.model(m).h_12() * weights[m]; + } + break; + case MixtureHomography::CONST_DOF: + return const_homog; + default: + 
LOG(FATAL) << "Unknown type."; + } + + return HomographyAdapter::FromFloatPointer(params, false); +} + +inline Vector2_f MixtureModelAdapter::TransformPoint( + const MixtureHomography& model, const float* weights, const Vector2_f& pt) { + const int num_models = model.model_size(); + const Homography& const_homog = model.model(0); + Vector3_f result(0, 0, 0); + const Vector3_f pt3(pt.x(), pt.y(), 1.0f); + float x; + float y; + switch (model.dof()) { + case MixtureHomography::ALL_DOF: + for (int i = 0; i < num_models; ++i) { + result += HomographyAdapter::TransformPoint3(model.model(i), + pt3 * weights[i]); + } + break; + case MixtureHomography::TRANSLATION_DOF: + x = const_homog.h_00() * pt.x() + const_homog.h_01() * pt.y(); + y = const_homog.h_10() * pt.x() + const_homog.h_11() * pt.y(); + for (int i = 0; i < num_models; ++i) { + x += model.model(i).h_02() * weights[i]; + y += model.model(i).h_12() * weights[i]; + } + result = Vector3_f( + x, y, + const_homog.h_20() * pt.x() + const_homog.h_21() * pt.y() + 1.0f); + break; + case MixtureHomography::SKEW_ROTATION_DOF: + x = const_homog.h_00() * pt.x(); + y = const_homog.h_11() * pt.y(); + for (int i = 0; i < num_models; ++i) { + x += (model.model(i).h_01() * pt.y() + model.model(i).h_02()) * + weights[i]; + y += (model.model(i).h_10() * pt.x() + model.model(i).h_12()) * + weights[i]; + } + result = Vector3_f( + x, y, + const_homog.h_20() * pt.x() + const_homog.h_21() * pt.y() + 1.0f); + break; + case MixtureHomography::CONST_DOF: + return HomographyAdapter::TransformPoint(model.model(0), pt); + default: + LOG(FATAL) << "Unknown type."; + } + + DCHECK_NE(result.z(), 0) << "Degenerate mapping."; + return Vector2_f(result.x() / result.z(), result.y() / result.z()); +} + +inline Vector2_f MixtureModelAdapter::TransformPoint( + const MixtureHomography& model, const MixtureRowWeights& weights, + const Vector2_f& pt) { + return TransformPoint(model, weights.RowWeightsClamped(pt.y()), pt); +} + +inline Vector2_f 
MixtureModelAdapter::SolveForPoint( + const MixtureHomography& model, const float* weights, const Vector2_f& pt) { + Homography base_model = ToBaseModel(model, weights); + return HomographyAdapter::TransformPoint( + HomographyAdapter::Invert(base_model), pt); +} + +inline Vector2_f MixtureModelAdapter::SolveForPointChecked( + const MixtureHomography& model, const float* weights, const Vector2_f& pt, + bool* success) { + Homography base_model = ToBaseModel(model, weights); + Homography inv_base_model = + HomographyAdapter::InvertChecked(base_model, success); + return HomographyAdapter::TransformPoint(inv_base_model, pt); +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_H_ diff --git a/mediapipe/util/tracking/motion_models.proto b/mediapipe/util/tracking/motion_models.proto new file mode 100644 index 000000000..889cc425a --- /dev/null +++ b/mediapipe/util/tracking/motion_models.proto @@ -0,0 +1,122 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Motion models, that are supported and estimated by MotionStabilization. +// Note, that transformations represent the motion of the +// feature points from the previous frame to the current one. +// Note that the actual camera movement, is the inverse of this +// transformation. +// Check carefully, which movement (feature or camera) is required. +// Note that for each model the default values always specify an identity +// transform. 
Follow this rule when adding new models. + +syntax = "proto2"; + +package mediapipe; + +// Here x represents a 2D point in the image plane, in the following +// coordinate system: +// *----> x +// | +// | +// v y + +// Simple translational model: +// I * x + [dx; dy] with I being 2x2 identity transform. +message TranslationModel { + optional float dx = 1 [default = 0]; + optional float dy = 2 [default = 0]; +} + +// Non-linear similarity model (w.r.t. to its parametrization). +// c_r := cos(rotation); +// s_r := sin(rotation); +// Transformation applied to x: +// [scale 0; * [c_r -s_r; * x + [dx; +// 0 scale] s_r c_r] dy] +message SimilarityModel { + optional float dx = 1 [default = 0]; + optional float dy = 2 [default = 0]; + optional float scale = 3 [default = 1]; + optional float rotation = 4 [default = 0]; // angle in [-pi, pi]. +} + +// Linear similarity model: +// [a -b; * x + [dx; +// b a] dy] +message LinearSimilarityModel { + optional float dx = 1 [default = 0]; + optional float dy = 2 [default = 0]; + optional float a = 3 [default = 1]; + optional float b = 4 [default = 0]; +} + +// Affine according to +// ( [a b * x + [dx; +// ( c d] dy] +message AffineModel { + optional float dx = 1 [default = 0]; + optional float dy = 2 [default = 0]; + optional float a = 3 [default = 1]; + optional float b = 4 [default = 0]; + optional float c = 5 [default = 0]; + optional float d = 6 [default = 1]; +} + +// Homography according to +// [h_00 h_01 h_02; +// h_10 h_11 h_12; +// h_20 h_21 1]; +// Note: The parametrization with h_22 = 1 does not always hold, e.g. +// if the origin (0, 0, 1) gets mapped to the line at infinity +// (0, 0, 1). However for video we expect small perspective +// changes between frames and this parametrization improves +// robustness greatly as it removes an additional DOF. +// Therefore, all methods in motion_stabilization should not be +// used for general wide-baseline matching of frames. 
+message Homography { + optional float h_00 = 1 [default = 1]; + optional float h_01 = 2 [default = 0]; + optional float h_02 = 3 [default = 0]; + optional float h_10 = 4 [default = 0]; + optional float h_11 = 5 [default = 1]; + optional float h_12 = 6 [default = 0]; + optional float h_20 = 7 [default = 0]; + optional float h_21 = 8 [default = 0]; +} + +// Mixture models with higher degrees of freedom, according to +// \sum_i model(i) * weight(i), where weights are passed during transform and +// are expected to sum to one. +message MixtureLinearSimilarity { + repeated LinearSimilarityModel model = 1; +} + +message MixtureAffine { + repeated AffineModel model = 1; +} + +message MixtureHomography { + repeated Homography model = 1; + // Specifies which degree of freedom vary across mixture. + // Can be used to implement several transformation functions quicker. + enum VariableDOF { + ALL_DOF = 0; // All dof are variable. + TRANSLATION_DOF = 1; // Only translation (h_02, h_12) varies. + SKEW_ROTATION_DOF = 2; // Translation (h_02, h_12), and skew-rotation + // (h_01, h_10) vary. + CONST_DOF = 3; // Mixture is constant. + } + optional VariableDOF dof = 2 [default = ALL_DOF]; +} diff --git a/mediapipe/util/tracking/motion_models_cv.cc b/mediapipe/util/tracking/motion_models_cv.cc new file mode 100644 index 000000000..b9b428adb --- /dev/null +++ b/mediapipe/util/tracking/motion_models_cv.cc @@ -0,0 +1,57 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/motion_models_cv.h" + +namespace mediapipe { + +void ModelCvConvert::ToCvMat(const TranslationModel& model, + cv::Mat* matrix) { + ModelCvConvert::ToCvMat( + ModelAdapter::ToAffine(model), matrix); +} + +void ModelCvConvert::ToCvMat( + const LinearSimilarityModel& model, cv::Mat* matrix) { + ModelCvConvert::ToCvMat( + ModelAdapter::ToAffine(model), matrix); +} + +void ModelCvConvert::ToCvMat(const AffineModel& model, + cv::Mat* matrix) { + matrix->create(2, 3, CV_32FC1); + matrix->at(0, 0) = model.a(); + matrix->at(0, 1) = model.b(); + matrix->at(0, 2) = model.dx(); + matrix->at(1, 0) = model.c(); + matrix->at(1, 1) = model.d(); + matrix->at(1, 2) = model.dy(); +} + +void ModelCvConvert::ToCvMat(const Homography& model, + cv::Mat* matrix) { + CHECK(matrix != nullptr); + matrix->create(3, 3, CV_32FC1); + matrix->at(0, 0) = model.h_00(); + matrix->at(0, 1) = model.h_01(); + matrix->at(0, 2) = model.h_02(); + matrix->at(1, 0) = model.h_10(); + matrix->at(1, 1) = model.h_11(); + matrix->at(1, 2) = model.h_12(); + matrix->at(2, 0) = model.h_20(); + matrix->at(2, 1) = model.h_21(); + matrix->at(2, 2) = 1.0; +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/motion_models_cv.h b/mediapipe/util/tracking/motion_models_cv.h new file mode 100644 index 000000000..3f8da051c --- /dev/null +++ b/mediapipe/util/tracking/motion_models_cv.h @@ -0,0 +1,63 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_CV_H_ +#define MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_CV_H_ + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/motion_models.pb.h" // NOLINT + +namespace mediapipe { + +template +class ModelCvConvert {}; + +// Specialized implementations, with additional functionality if needed. +template <> +class ModelCvConvert { + public: + // Returns 2x3 floating point cv::Mat with model parameters. + static void ToCvMat(const TranslationModel& model, cv::Mat* matrix); +}; + +template <> +class ModelCvConvert { + public: + // Returns 2x3 floating point cv::Mat with model parameters. + static void ToCvMat(const LinearSimilarityModel& model, cv::Mat* matrix); +}; + +template <> +class ModelCvConvert { + public: + // Returns 2x3 floating point cv::Mat with model parameters. + static void ToCvMat(const AffineModel& model, cv::Mat* matrix); +}; + +template <> +class ModelCvConvert { + public: + // Returns 3x3 floating point cv::Mat with model parameters. 
+ static void ToCvMat(const Homography& model, cv::Mat* matrix); +}; + +template +void ModelToCvMat(const Model& model, cv::Mat* matrix) { + ModelCvConvert::ToCvMat(model, matrix); +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_CV_H_ diff --git a/mediapipe/util/tracking/motion_models_test.cc b/mediapipe/util/tracking/motion_models_test.cc new file mode 100644 index 000000000..194aedcb6 --- /dev/null +++ b/mediapipe/util/tracking/motion_models_test.cc @@ -0,0 +1,335 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/motion_models.h" + +#include "mediapipe/framework/deps/message_matchers.h" +#include "mediapipe/framework/port/gmock.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/parse_text_proto.h" +#include "mediapipe/framework/port/proto_ns.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/motion_estimation.h" + +namespace mediapipe { +namespace { + +static const float kArrayFloat[] = {1, 2, 3, 4, 5, 6, 7, 8}; +static const double kArrayDouble[] = {1, 2, 3, 4, 5, 6, 7, 8}; + +class MotionModelsTest : public ::testing::Test {}; + +// Test from/to array and parameter indexing functions. 
+// Parses the two text protos into Model instances and checks that the
+// array-based constructors and GetParameter agree with them.
+template <class Model>
+void CheckFromArrayAndGetParameter(const char* model_zero_string,
+                                   const char* model_identity_string) {
+  Model model_zero;
+  Model model_identity;
+
+  ASSERT_TRUE(
+      proto_ns::TextFormat::ParseFromString(model_zero_string, &model_zero));
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_identity_string,
+                                                    &model_identity));
+
+  typedef ModelAdapter<Model> Adapter;
+
+  EXPECT_THAT(Adapter::FromFloatPointer(kArrayFloat, false),
+              mediapipe::EqualsProto(model_zero));
+
+  EXPECT_THAT(Adapter::FromFloatPointer(kArrayFloat, true),
+              mediapipe::EqualsProto(model_identity));
+
+  EXPECT_THAT(Adapter::FromDoublePointer(kArrayDouble, false),
+              mediapipe::EqualsProto(model_zero));
+
+  EXPECT_THAT(Adapter::FromDoublePointer(kArrayDouble, true),
+              mediapipe::EqualsProto(model_identity));
+
+  ASSERT_LE(Adapter::NumParameters(), 8);
+  for (int i = 0; i < Adapter::NumParameters(); ++i) {
+    EXPECT_EQ(kArrayFloat[i], Adapter::GetParameter(model_zero, i));
+  }
+}
+
+TEST_F(MotionModelsTest, FromArrayAndGetParameter) {
+  CheckFromArrayAndGetParameter<TranslationModel>("dx: 1 dy: 2", "dx: 1 dy: 2");
+
+  CheckFromArrayAndGetParameter<SimilarityModel>(
+      "dx: 1 dy: 2 scale: 3 rotation: 4", "dx: 1 dy: 2 scale: 4 rotation: 4");
+
+  CheckFromArrayAndGetParameter<LinearSimilarityModel>("dx: 1 dy: 2 a: 3 b: 4",
+                                                       "dx: 1 dy: 2 a: 4 b: 4");
+
+  CheckFromArrayAndGetParameter<AffineModel>("dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6",
+                                             "dx: 1 dy: 2 a: 4 b: 4 c: 5 d: 7");
+
+  CheckFromArrayAndGetParameter<Homography>(
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: 4 h_11: 5 h_12: 6 "
+      "h_20: 7 h_21: 8 ",
+      "h_00: 2 h_01: 2 h_02: 3 "
+      "h_10: 4 h_11: 6 h_12: 6 "
+      "h_20: 7 h_21: 8 ");
+}
+
+// Test point transformations.
+// Parses model_string into a Model, transforms (x_in, y_in) and checks the
+// result against (x_out, y_out) within 1e-5.
+template <class Model>
+void CheckTransformPoint(const char* model_string, float x_in, float y_in,
+                         float x_out, float y_out) {
+  Model model;
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string, &model));
+
+  const Vector2_f output =
+      ModelAdapter<Model>::TransformPoint(model, Vector2_f(x_in, y_in));
+
+  EXPECT_NEAR(x_out, output.x(), 1e-5);
+  EXPECT_NEAR(y_out, output.y(), 1e-5);
+}
+
+TEST_F(MotionModelsTest, TransformPoint) {
+  CheckTransformPoint<TranslationModel>("dx: 0 dy: 0", 1, 1, 1, 1);
+  CheckTransformPoint<TranslationModel>("dx: 1 dy: -1", 1, 1, 2, 0);
+
+  CheckTransformPoint<SimilarityModel>(
+      "dx: 0 dy: 0 scale: 1 rotation: 1.57079633", 1, 2, -2, 1);
+  CheckTransformPoint<SimilarityModel>(
+      "dx: 1 dy: -1 scale: 1 rotation: 1.57079633", 1, 2, -1, 0);
+  CheckTransformPoint<SimilarityModel>(
+      "dx: 1 dy: -1 scale: 2 rotation: 1.57079633", 1, 2, -3, 1);
+
+  CheckTransformPoint<LinearSimilarityModel>("dx: 0 dy: 0 a: 1 b: -0.5", 1, 2,
+                                             2, 1.5);
+  CheckTransformPoint<LinearSimilarityModel>("dx: 0.5 dy: -0.5 a: 1 b: 0.5", 1,
+                                             2, 0.5, 2);
+  CheckTransformPoint<LinearSimilarityModel>("dx: 0.5 dy: -0.5 a: 0.5 b: 0.5",
+                                             1, 2, 0, 1);
+
+  CheckTransformPoint<AffineModel>("dx: 0 dy: 0 a: 1 b: 0.5 c: -0.5 d: 1", 1, 2,
+                                   2, 1.5);
+  CheckTransformPoint<AffineModel>("dx: 0.5 dy: -0.5 a: 2 b: -0.5 c: 0.5 d: 1",
+                                   1, 2, 1.5, 2);
+  CheckTransformPoint<AffineModel>("dx: 1 dy: -1 a: 2 b: -2 c: 1 d: -1", 1, 2,
+                                   -1, -2);
+
+  // Transformations by Homography are followed by division by the 3rd element.
+  // Test division by value != 1.
+  CheckTransformPoint<Homography>(
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: 4 h_11: 3 h_12: 6 "
+      "h_20: 7 h_21: 8 ",
+      1, 2, 8.0 / 24.0, 16.0 / 24.0);
+  // Test division by 1.
+  CheckTransformPoint<Homography>(
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: 4 h_11: 3 h_12: 6 "
+      "h_20: 2 h_21: -1 ",
+      1, 2, 8.0, 16.0);
+}
+
+// Test model inversions.
+// Inverts model_string's model and compares each parameter against the
+// expected inverse within 1e-5.
+template <class Model>
+void CheckInvert(const char* model_string, const char* inv_model_string) {
+  Model model;
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string, &model));
+
+  Model inv_model_expected;
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(inv_model_string,
+                                                    &inv_model_expected));
+
+  typedef ModelAdapter<Model> Adapter;
+  const Model inv_model_actual = Adapter::Invert(model);
+
+  for (int i = 0; i < Adapter::NumParameters(); ++i) {
+    EXPECT_NEAR(Adapter::GetParameter(inv_model_expected, i),
+                Adapter::GetParameter(inv_model_actual, i), 1e-5)
+        << "Parameter index: " << i << " of total " << Adapter::NumParameters();
+  }
+}
+
+TEST_F(MotionModelsTest, Invert) {
+  CheckInvert<TranslationModel>("dx: 1 dy: -2", "dx: -1 dy: 2");
+
+  CheckInvert<SimilarityModel>("dx: 0 dy: 0 scale: 1 rotation: 1.57079633",
+                               "dx: 0 dy: 0 scale: 1 rotation: -1.57079633");
+  CheckInvert<SimilarityModel>("dx: 1 dy: -2 scale: 1 rotation: 1.57079633",
+                               "dx: 2 dy: 1 scale: 1 rotation: -1.57079633");
+  CheckInvert<SimilarityModel>("dx: 1 dy: -2 scale: 0.5 rotation: 1.57079633",
+                               "dx: 4 dy: 2 scale: 2 rotation: -1.57079633");
+
+  CheckInvert<LinearSimilarityModel>("dx: 1 dy: 2 a: 3 b: 4 ",
+                                     "dx: -0.44 dy: -0.08 a: 0.12 b: -0.16 ");
+
+  // Test division by value != 1.
+  CheckInvert<Homography>(
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: -3 h_11: -2 h_12: -1 "
+      "h_20: 8 h_21: -1 ",
+      "h_00: -0.75 h_01: -1.25 h_02: 1 "
+      "h_10: -1.25 h_11: -5.75 h_12: -2 "
+      "h_20: 4.75 h_21: 4.25 ");
+  // Test division by 1.
+  CheckInvert<Homography>(
+      "h_00: -0.75 h_01: -1.25 h_02: 1 "
+      "h_10: -1.25 h_11: -5.75 h_12: -2 "
+      "h_20: 4.75 h_21: 4.25 ",
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: -3 h_11: -2 h_12: -1 "
+      "h_20: 8 h_21: -1 ");
+}
+
+// Test model compositions.
+// Composes the two parsed models and compares each parameter of the result
+// against the expected composition within 1e-5.
+template <class Model>
+void CheckCompose(const char* model_string1, const char* model_string2,
+                  const char* composed_string) {
+  Model model1;
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string1, &model1));
+
+  Model model2;
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string2, &model2));
+
+  Model composed_expected;
+  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(composed_string,
+                                                    &composed_expected));
+
+  typedef ModelAdapter<Model> Adapter;
+  const Model composed_actual = Adapter::Compose(model1, model2);
+
+  for (int i = 0; i < Adapter::NumParameters(); ++i) {
+    EXPECT_NEAR(Adapter::GetParameter(composed_expected, i),
+                Adapter::GetParameter(composed_actual, i), 1e-5)
+        << "Parameter index: " << i << " of total " << Adapter::NumParameters();
+  }
+}
+
+TEST_F(MotionModelsTest, Compose) {
+  CheckCompose<TranslationModel>("dx: 1 dy: -2", "dx: -3 dy: 4",
+                                 "dx: -2 dy: 2");
+
+  CheckCompose<SimilarityModel>(
+      "dx: 1 dy: 2 scale: 0.5 rotation: 1.57079633 ",
+      "dx: -2 dy: -1 scale: 2 rotation: -1.57079633 ",
+      "dx: 1.5 dy: 1 scale: 1 rotation: 0 ");
+
+  CheckCompose<LinearSimilarityModel>("dx: 1 dy: 2 a: 0.5 b: 0.5 ",
+                                      "dx: -2 dy: -1 a: 2 b: -0.5 ",
+                                      "dx: 0.5 dy: 0.5 a: 1.25 b: 0.75 ");
+
+  // Test division by value != 1.
+  CheckCompose<Homography>(
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: 4 h_11: 5 h_12: 6 "
+      "h_20: 1 h_21: -1 ",
+      "h_00: -3 h_01: -2 h_02: -1 "
+      "h_10: -4 h_11: -5 h_12: -2 "
+      "h_20: 7 h_21: 8 ",
+      "h_00: 5 h_01: 6 h_02: -1 "
+      "h_10: 5 h_11: 7.5 h_12: -4 "
+      "h_20: 4 h_21: 5.5 ");
+  // Test division by 1.
+  CheckCompose<Homography>(
+      "h_00: 1 h_01: 2 h_02: 3 "
+      "h_10: 4 h_11: 5 h_12: 6 "
+      "h_20: 2 h_21: -1 ",
+      "h_00: -3 h_01: -2 h_02: -1 "
+      "h_10: -4 h_11: -5 h_12: -2 "
+      "h_20: 7 h_21: 8 ",
+      "h_00: 10 h_01: 12 h_02: -2 "
+      "h_10: 10 h_11: 15 h_12: -8 "
+      "h_20: 5 h_21: 9 ");
+}
+
+// Test conversions between models and their affine representations, and
+// vice-versa.
+template +void CheckToFromAffine(const char* model_string, const char* affine_string) { + Model model; + ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string, &model)); + + AffineModel affine; + ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(affine_string, &affine)); + + typedef ModelAdapter Adapter; + EXPECT_THAT(Adapter::ToAffine(model), mediapipe::EqualsProto(affine)); + EXPECT_THAT(Adapter::FromAffine(affine), mediapipe::EqualsProto(model)); +} + +TEST_F(MotionModelsTest, ToFromAffine) { + CheckToFromAffine("dx: 1 dy: 2", + "dx: 1 dy: 2 a: 1 b: 0 c: 0 d: 1"); + + CheckToFromAffine("dx: 1 dy: 2 a: 3 b: -4", + "dx: 1 dy: 2 a: 3 b: 4 c: -4 d: 3"); + + CheckToFromAffine("dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6", + "dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6"); + + CheckToFromAffine( + "h_00: 3 h_01: 4 h_02: 1 " + "h_10: 5 h_11: 6 h_12: 2 " + "h_20: 0 h_21: 0 ", + "dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6"); + Homography homography; + ASSERT_TRUE( + proto_ns::TextFormat::ParseFromString("h_00: 3 h_01: 4 h_02: 1 " + "h_10: 5 h_11: 6 h_12: 2 " + "h_20: 0 h_21: 0 ", + &homography)); + + EXPECT_TRUE(HomographyAdapter::IsAffine(homography)); + homography.set_h_20(7); + homography.set_h_21(8); + EXPECT_FALSE(HomographyAdapter::IsAffine(homography)); +} + +TEST_F(MotionModelsTest, ProjectModels) { + // Express models w.r.t. center for easy testing. + LinearSimilarityModel center_trans = + LinearSimilarityAdapter::FromArgs(50, 50, 1, 0); + LinearSimilarityModel inv_center_trans = + LinearSimilarityAdapter::FromArgs(-50, -50, 1, 0); + + // 20 x 10 translation with scaling of factor 2 and rotation. 
+ LinearSimilarityModel lin_sim = + LinearSimilarityAdapter::FromArgs(20, 10, 2 * cos(0.2), 2 * sin(0.2)); + + LinearSimilarityModel lin_sim_center = + ModelCompose3(center_trans, lin_sim, inv_center_trans); + + TranslationModel translation = + TranslationAdapter::ProjectFrom(lin_sim_center, 100, 100); + EXPECT_NEAR(translation.dx(), 20, 1e-3); + EXPECT_NEAR(translation.dy(), 10, 1e-3); + + translation = ProjectViaFit(lin_sim_center, 100, 100); + EXPECT_NEAR(translation.dx(), 20, 1e-3); + EXPECT_NEAR(translation.dy(), 10, 1e-3); + + Homography homog = HomographyAdapter::FromArgs( + 1, 0, 10, 0, 1, 20, 5e-3, 1e-3); // Perspective transform: yaw + pitch. + + Homography homog_center = + ModelCompose3(HomographyAdapter::Embed(center_trans), homog, + HomographyAdapter::Embed(inv_center_trans)); + // Rendering: + // https://www.wolframalpha.com/input/?i=ListPlot%5B%7B+%7B7,-7%7D,+%7B108,16%7D,+%7B104,96%7D,+%7B12.5,+125%7D,+%7B7,-7%7D%5D + + translation = TranslationAdapter::ProjectFrom(homog_center, 100, 100); + EXPECT_NEAR(translation.dx(), 10, 1e-3); + EXPECT_NEAR(translation.dy(), 20, 1e-3); + + // TODO: Investigate how ProjectViaFit can yield similar result. +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/util/tracking/motion_saliency.cc b/mediapipe/util/tracking/motion_saliency.cc new file mode 100644 index 000000000..5adafca4c --- /dev/null +++ b/mediapipe/util/tracking/motion_saliency.cc @@ -0,0 +1,700 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/motion_saliency.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/util/tracking/camera_motion.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +MotionSaliency::MotionSaliency(const MotionSaliencyOptions& options, + int frame_width, int frame_height) + : options_(options), + frame_width_(frame_width), + frame_height_(frame_height) {} + +MotionSaliency::~MotionSaliency() {} + +void MotionSaliency::SaliencyFromFeatures( + const RegionFlowFeatureList& feature_list, + std::vector* irls_weights, // optional. + SalientPointFrame* salient_frame) { + CHECK(salient_frame); + CHECK_EQ(frame_width_, feature_list.frame_width()); + CHECK_EQ(frame_height_, feature_list.frame_height()); + + if (irls_weights) { + CHECK_EQ(feature_list.feature_size(), irls_weights->size()); + } + + if (feature_list.feature_size() < 1) { + return; + } + + float max_irls_weight = 0; + if (irls_weights) { + max_irls_weight = + *std::max_element(irls_weights->begin(), irls_weights->end()); + } else { + struct FeatureIRLSComparator { + bool operator()(const RegionFlowFeature& lhs, + const RegionFlowFeature& rhs) const { + return lhs.irls_weight() < rhs.irls_weight(); + } + }; + max_irls_weight = + std::max_element(feature_list.feature().begin(), + feature_list.feature().end(), FeatureIRLSComparator()) + ->irls_weight(); + } + + // Max weight is too small for meaningful mode finding, terminate. + if (max_irls_weight < 1e-2f) { + return; + } + + // Discard small weights that just slow clustering down. 
+ const float irls_cutoff = max_irls_weight * 1e-2f; + + int feat_idx = 0; + + // Create SalientLocation's from input feature_list. + std::vector features; + for (const auto& src_feature : feature_list.feature()) { + const float weight = + irls_weights ? (*irls_weights)[feat_idx] : src_feature.irls_weight(); + ++feat_idx; + + // Discard all features with small measure or zero weight from mode finding. + if (weight < irls_cutoff) { + continue; + } + + features.push_back(SalientLocation(FeatureLocation(src_feature), weight)); + } + + DetermineSalientFrame(features, salient_frame); +} + +void MotionSaliency::SaliencyFromPoints(const std::vector* points, + const std::vector* weights, + SalientPointFrame* salient_frame) { + // TODO: Handle vectors of size zero. + CHECK(salient_frame); + CHECK_EQ(points->size(), weights->size()); + + float max_weight = *std::max_element(weights->begin(), weights->end()); + + // Max weight is too small for meaningful mode finding, terminate. + if (max_weight < 1e-2f) { + return; + } + + // Discard small weights that just slow clustering down. + const float weight_cutoff = max_weight * 1e-2f; + + // Create SalientLocation's from input points. + std::vector features; + for (int point_idx = 0; point_idx < points->size(); ++point_idx) { + const float weight = (*weights)[point_idx]; + // Discard all features with small measure or zero weight from mode finding. + if (weight < weight_cutoff) { + continue; + } + + features.push_back(SalientLocation((*points)[point_idx], weight)); + } + + DetermineSalientFrame(features, salient_frame); +} + +// We only keep those salient points that have neighbors along the temporal +// dimension. +void MotionSaliency::SelectSaliencyInliers( + std::vector* motion_saliency, + bool rescale_to_median_saliency_weight) { + float scale = 1.0; + + if (rescale_to_median_saliency_weight) { + // Compute median saliency weight across all frames, to rescale saliency. 
+ std::vector saliency_weights; + for (int i = 0; i < motion_saliency->size(); ++i) { + for (const auto& salient_point : (*motion_saliency)[i]->point()) { + saliency_weights.push_back(salient_point.weight()); + } + } + + // Nothing to filter in the frame chunk. + if (saliency_weights.empty()) { + return; + } + + auto median_iter = saliency_weights.begin() + saliency_weights.size() / 2; + std::nth_element(saliency_weights.begin(), median_iter, + saliency_weights.end()); + + const float median_weight = *median_iter; + if (median_weight > 0) { + scale = options_.saliency_weight() / median_weight; + } + } + + SaliencyPointList inlier_saliency(motion_saliency->size()); + const float sq_support_distance = options_.selection_support_distance() * + options_.selection_support_distance(); + + // Test each point salient point for inlierness. + for (int i = 0; i < motion_saliency->size(); ++i) { + for (const auto& salient_point : (*motion_saliency)[i]->point()) { + int support = 0; + Vector2_f salient_location(salient_point.norm_point_x(), + salient_point.norm_point_y()); + + // Find supporting points (saliency points close enough to current one) + // in adjacent frames. Linear Complexity. + for (int j = std::max(0, i - options_.selection_frame_radius()), + end_j = std::min(i + options_.selection_frame_radius(), + motion_saliency->size() - 1); + + j <= end_j; ++j) { + if (i == j) { + continue; + } + + for (const auto& compare_point : (*motion_saliency)[j]->point()) { + Vector2_f compare_location(compare_point.norm_point_x(), + compare_point.norm_point_y()); + + if ((salient_location - compare_location).Norm2() <= + sq_support_distance) { + ++support; + } + } + } // end neighbor frames iteration. + + if (support >= options_.selection_minimum_support()) { + SalientPoint* scaled_point = inlier_saliency[i].add_point(); + scaled_point->CopyFrom(salient_point); + scaled_point->set_weight(scaled_point->weight() * scale); + } + } // end point traversal. + } // end frame traversal. 
+
+  for (int k = 0; k < motion_saliency->size(); ++k) {
+    (*motion_saliency)[k]->Swap(&inlier_saliency[k]);
+  }
+}
+
+void MotionSaliency::FilterMotionSaliency(
+    std::vector<SalientPointFrame*>* saliency_point_list) {
+  CHECK(saliency_point_list != nullptr);
+  const float sigma_time = options_.filtering_sigma_time();
+  const float sigma_space = options_.filtering_sigma_space();
+
+  const int time_radius = ceil(sigma_time * 1.5);
+  const int time_diameter = 2 * time_radius + 1;
+
+  // Create lookup table for weights.
+  std::vector<float> time_weights(time_diameter);
+  const float time_coeff = -0.5f / (sigma_time * sigma_time);
+
+  for (int i = -time_radius, time_idx = 0; i <= time_radius; ++i, ++time_idx) {
+    time_weights[time_idx] = std::exp(time_coeff * i * i);
+  }
+
+  // Ignore points further than 1.65 sigmas away (includes 90% of distribution).
+  const float space_cutoff = 1.65 * sigma_space;
+  const float space_exp_scale = -0.5f / (sigma_space * sigma_space);
+
+  // Copy saliency points, padded by time_radius frames on each side so the
+  // temporal filter below has full support at the chunk boundaries.
+  const int num_frames = saliency_point_list->size();
+  std::vector<SalientPointFrame> points(num_frames + 2 * time_radius);
+
+  for (int k = 0; k < saliency_point_list->size(); ++k) {
+    points[time_radius + k].CopyFrom(*(*saliency_point_list)[k]);
+  }
+
+  // Copy border (mirror frames into the padded regions).
+  std::copy(points.rbegin() + time_radius, points.rbegin() + 2 * time_radius,
+            points.end() - time_radius);
+  std::copy(points.begin() + time_radius, points.begin() + 2 * time_radius,
+            points.rend() - time_radius);
+
+  // Apply filter.
+  for (int i = time_radius; i < num_frames + time_radius; ++i) {
+    const int frame_idx = i - time_radius;
+    for (auto& sample_point :
+         *(*saliency_point_list)[frame_idx]->mutable_point()) {
+      Vector2_f point_sum(0, 0);
+      // Sum for left, bottom, right, top tuple.
+      Vector4_f bound_sum;
+      Vector3_f ellipse_sum(0, 0, 0);  // Captures major, minor and angle.
+      float weight_sum = 0;
+      float filter_sum = 0;
+
+      const float sample_angle = sample_point.angle();
+      for (int k = i - time_radius, time_idx = 0; k <= i + time_radius;
+           ++k, ++time_idx) {
+        for (const auto& test_point : points[k].point()) {
+          const float diff = std::hypot(
+              test_point.norm_point_y() - sample_point.norm_point_y(),
+              test_point.norm_point_x() - sample_point.norm_point_x());
+          if (diff > space_cutoff) {
+            continue;
+          }
+
+          // Bilateral-style weight: temporal Gaussian x point weight x
+          // spatial Gaussian on the normalized distance.
+          const float weight = time_weights[time_idx] * test_point.weight() *
+                               std::exp(diff * diff * space_exp_scale);
+
+          filter_sum += weight;
+          point_sum +=
+              Vector2_f(test_point.norm_point_x(), test_point.norm_point_y()) *
+              weight;
+          // NOTE(review): bound_sum is accumulated without explicit init;
+          // assumes Vector4_f default-constructs to zero — confirm.
+          bound_sum += Vector4_f(test_point.left(), test_point.bottom(),
+                                 test_point.right(), test_point.top()) *
+                       weight;
+          weight_sum += test_point.weight() * weight;
+
+          // Ensure test_point and sample are less than pi / 2 apart.
+          float test_angle = test_point.angle();
+          if (fabs(test_angle - sample_angle) > M_PI / 2) {
+            if (sample_angle > M_PI / 2) {
+              test_angle += M_PI;
+            } else {
+              test_angle -= M_PI;
+            }
+          }
+
+          ellipse_sum += Vector3_f(test_point.norm_major(),
+                                   test_point.norm_minor(), test_angle) *
+                         weight;
+        }
+      }
+
+      // Normalize all accumulated sums by the total filter weight.
+      if (filter_sum > 0) {
+        const float inv_filter_sum = 1.0f / filter_sum;
+        point_sum *= inv_filter_sum;
+        bound_sum *= inv_filter_sum;
+        weight_sum *= inv_filter_sum;
+        ellipse_sum *= inv_filter_sum;
+      }
+
+      sample_point.set_norm_point_x(point_sum.x());
+      sample_point.set_norm_point_y(point_sum.y());
+      sample_point.set_left(bound_sum.x());
+      sample_point.set_bottom(bound_sum.y());
+      sample_point.set_right(bound_sum.z());
+      sample_point.set_top(bound_sum.w());
+
+      sample_point.set_weight(weight_sum);
+      sample_point.set_norm_major(ellipse_sum.x());
+      sample_point.set_norm_minor(ellipse_sum.y());
+      sample_point.set_angle(ellipse_sum.z());
+
+      // Wrap the filtered angle back into [0, M_PI].
+      if (sample_point.angle() > M_PI) {
+        sample_point.set_angle(sample_point.angle() - M_PI);
+      }
+      if (sample_point.angle() < 0) {
+        sample_point.set_angle(sample_point.angle() + M_PI);
+      }
+    }
+  }
+}
+
+void MotionSaliency::CollapseMotionSaliency(
+    const SaliencyPointList& input_saliency, const Vector4_f& bounds,
+    SaliencyPointList* output_saliency) {
+  CHECK(output_saliency);
+  output_saliency->clear();
+  output_saliency->resize(input_saliency.size());
+
+  for (int f = 0; f < input_saliency.size(); ++f) {  // traverse frames.
+    Vector2_f mean_saliency(0, 0);
+    float weight_sum = 0;
+    for (const auto& salient_point : input_saliency[f].point()) {
+      mean_saliency +=
+          Vector2_f(salient_point.norm_point_x(), salient_point.norm_point_y());
+      weight_sum += 1;
+    }
+
+    // Emit a single unweighted average point per frame (if any points exist).
+    if (weight_sum > 0) {
+      SalientPoint* collapsed = (*output_saliency)[f].add_point();
+      collapsed->set_norm_point_x(mean_saliency.x() / weight_sum);
+      collapsed->set_norm_point_y(mean_saliency.y() / weight_sum);
+      collapsed->set_left(bounds.x());
+      collapsed->set_bottom(bounds.y());
+      collapsed->set_right(bounds.z());
+      collapsed->set_top(bounds.w());
+      collapsed->set_weight(1.0f);
+    }
+  }
+}
+
+namespace {
+
+// Describes feature mode for a feature in a RegionFlowFeatureList stored
+// at index feature_idx.
+struct FeatureMode {
+  Vector2_f location;
+  float irls_weight;
+  int feature_idx;
+  int mode_bin;
+};
+
+// Determines mode for each feature in feature_view.
+// Returns modes as list of pointers in mode_ptrs. Actual modes are stored
+// binned into a grid of equal size as the passed FeatureGrid
+// (bins of size grid_resolution x grid_resolution).
+void DetermineFeatureModes( + const FeatureFrame& features, + float grid_resolution, const Vector2_i& grid_dims, float band_width, + const FeatureGrid& feature_grid, + const std::vector>& feature_taps, + const std::vector& space_lut, float space_scale, + std::vector>* mode_grid, + std::vector* mode_ptrs) { + CHECK(mode_grid); + CHECK(mode_ptrs); + const int num_features = features.size(); + mode_ptrs->reserve(num_features); + + const float grid_scale = 1.0f / grid_resolution; + int feature_idx = 0; + const int kMaxIter = 100; + // Set convergence radius to 0.1% of bandwidth. + const float sq_conv_radius = band_width * band_width * 1e-6f; + for (const auto& feature_ptr : features) { + Vector2_f center = feature_ptr->pt; + int iter = 0; + for (; iter < kMaxIter; ++iter) { + const int bin_x = center.x() * grid_scale; + const int bin_y = center.y() * grid_scale; + const int grid_loc = bin_y * grid_dims.x() + bin_x; + + float sum_weight = 0; + Vector2_f new_center; + for (const auto& bin : feature_taps[grid_loc]) { + for (const auto& test_feat_ptr : feature_grid[bin]) { + const float dist = (test_feat_ptr->pt - center).Norm(); + const float weight = space_lut[static_cast(dist * space_scale)] * + test_feat_ptr->weight; + sum_weight += weight; + new_center += weight * test_feat_ptr->pt; + } + } + + if (sum_weight > 0) { + new_center *= (1.0f / sum_weight); + if ((center - new_center).Norm2() < sq_conv_radius) { + center = new_center; + break; + } else { + center = new_center; + } + } else { + LOG(WARNING) << "No features found in band_width radius, " + << "should not happen. 
"; + break; + } + } + + const int mode_bin_x = center.x() * grid_scale; + const int mode_bin_y = center.y() * grid_scale; + const int mode_grid_loc = mode_bin_y * grid_dims.x() + mode_bin_x; + FeatureMode mode{center, feature_ptr->weight, feature_idx, mode_grid_loc}; + + (*mode_grid)[mode_grid_loc].push_back(mode); + FeatureMode* added_mode = &(*mode_grid)[mode_grid_loc].back(); + (*mode_ptrs).push_back(added_mode); + ++feature_idx; + } +} + +} // namespace. + +void MotionSaliency::SalientModeFinding(std::vector* locations, + std::vector* modes) { + CHECK(modes); + CHECK(locations); + if (locations->empty()) { + return; + } + + // Scale band_width to image domain. + const float band_width = + hypot(frame_width_, frame_height_) * options_.mode_band_width(); + + // Select all salient locations with non-zero weight. + FeatureFrame salient_features; + salient_features.reserve(locations->size()); + for (auto& loc : *locations) { + if (loc.weight > 1e-6) { + salient_features.push_back(&loc); + } + } + + const int num_features = salient_features.size(); + if (num_features == 0) { + return; + } + + // Build feature grid according to bandwith. + std::vector> feature_grids; + std::vector> feature_taps; + + // Guarantee at least 1.5 sigmas in each direction are captured with + // tap 3 filtering (86 % of the data). + const float grid_resolution = 1.5f * band_width; + Vector2_i grid_dims; + BuildFeatureGrid( + frame_width_, frame_height_, grid_resolution, {salient_features}, + [](const SalientLocation& l) -> Vector2_f { return l.pt; }, &feature_taps, + nullptr, &grid_dims, &feature_grids); + + // Just one frame input, expect one grid as output. + CHECK_EQ(1, feature_grids.size()); + const auto& feature_grid = feature_grids[0]; + + // Setup Gaussian LUT for smoothing in space, using 2^10 discretization bins. + const int lut_bins = 1 << 10; + std::vector space_lut(lut_bins); + + // Using 3 tap smoothing, max distance is 2 bin diagonals. 
+ // We use maximum of 2 * sqrt(2) * bin_radius plus 1% room in case maximum + // value is attained. + const float max_space_diff = sqrt(2.0) * 2.f * grid_resolution * 1.01f; + + const float space_bin_size = max_space_diff / lut_bins; + const float space_scale = 1.0f / space_bin_size; + const float space_coeff = -0.5f / (band_width * band_width); + for (int i = 0; i < lut_bins; ++i) { + const float value = i * space_bin_size; + space_lut[i] = std::exp(value * value * space_coeff); + } + + // Store modes for each grid bin (to be averaged later). + std::vector> mode_grid(grid_dims.x() * grid_dims.y()); + std::vector mode_ptrs; + + DetermineFeatureModes(salient_features, grid_resolution, grid_dims, + band_width, feature_grid, feature_taps, space_lut, + space_scale, &mode_grid, &mode_ptrs); + + // Read out modes, ordered by decreasing weight. + struct FeatureModeComparator { + bool operator()(const FeatureMode* mode_lhs, + const FeatureMode* mode_rhs) const { + return mode_lhs->irls_weight > mode_rhs->irls_weight; + } + }; + + // Sort pointers, to keep order immutable during flagging operations. + std::sort(mode_ptrs.begin(), mode_ptrs.end(), FeatureModeComparator()); + + for (int m = 0; m < mode_ptrs.size(); ++m) { + // We mark a mode as processed by assigning -1 to its index. + if (mode_ptrs[m]->feature_idx < 0) { + continue; + } + + FeatureMode* mode = mode_ptrs[m]; + + // Average modes within band_width based on irls_weight * spatial weight. 
+    double sum_weight = mode->irls_weight;
+    double mode_x = sum_weight * mode->location.x();
+    double mode_y = sum_weight * mode->location.y();
+
+    // Accumulate first and second order moments of the supporting feature
+    // locations; used below to derive the covariance of each mode.
+    const Vector2_f& feat_loc = salient_features[mode->feature_idx]->pt;
+    double feat_x = sum_weight * feat_loc.x();
+    double feat_y = sum_weight * feat_loc.y();
+    double feat_xx = sum_weight * feat_loc.x() * feat_loc.x();
+    double feat_xy = sum_weight * feat_loc.x() * feat_loc.y();
+    double feat_yy = sum_weight * feat_loc.y() * feat_loc.y();
+
+    mode->feature_idx = -1;  // Flag as processed, does not change order
+                             // of traversal.
+
+    for (const auto& bin : feature_taps[mode->mode_bin]) {
+      for (auto& test_mode : mode_grid[bin]) {
+        if (test_mode.feature_idx >= 0) {
+          const float dist = (test_mode.location - mode->location).Norm();
+          if (dist <= band_width) {
+            const Vector2_f test_loc =
+                salient_features[test_mode.feature_idx]->pt;
+            const float weight =
+                space_lut[static_cast<int>(dist * space_scale)] *
+                test_mode.irls_weight;
+
+            sum_weight += weight;
+            mode_x += weight * test_mode.location.x();
+            mode_y += weight * test_mode.location.y();
+
+            const float test_loc_x_w = weight * test_loc.x();
+            const float test_loc_y_w = weight * test_loc.y();
+            feat_x += test_loc_x_w;
+            feat_y += test_loc_y_w;
+
+            feat_xx += test_loc_x_w * test_loc.x();
+            feat_xy += test_loc_x_w * test_loc.y();
+            feat_yy += test_loc_y_w * test_loc.y();
+
+            // Flag as processed, does not change order of traversal.
+            test_mode.feature_idx = -1;
+          }
+        }
+      }
+    }
+
+    if (sum_weight >= options_.min_irls_mode_weight()) {
+      double inv_sum_weight = 1.0f / sum_weight;
+      mode_x *= inv_sum_weight;
+      mode_y *= inv_sum_weight;
+      feat_x *= inv_sum_weight;
+      feat_y *= inv_sum_weight;
+      feat_xx *= inv_sum_weight;
+      feat_xy *= inv_sum_weight;
+      feat_yy *= inv_sum_weight;
+
+      // Covariance matrix.
+ const float a = feat_xx - 2.0 * feat_x * mode_x + mode_x * mode_x; + const float bc = + feat_xy - feat_x * mode_y - feat_y * mode_x + mode_x * mode_y; + const float d = feat_yy - 2.0 * feat_y * mode_y + mode_y * mode_y; + + Vector2_f axis_magnitude; + float angle; + if (!EllipseFromCovariance(a, bc, d, &axis_magnitude, &angle)) { + angle = 0; + axis_magnitude = Vector2_f(1, 1); + } else { + if (angle < 0) { + angle += M_PI; + } + CHECK_GE(angle, 0); + CHECK_LE(angle, M_PI + 1e-3); + } + + SalientMode irls_mode; + irls_mode.location = Vector2_f(mode_x, mode_y); + irls_mode.assignment_weight = sum_weight; + irls_mode.axis_magnitude = axis_magnitude; + irls_mode.angle = angle; + modes->push_back(irls_mode); + } + } + + // Sort modes by descreasing weight. + struct ModeWeightCompare { + bool operator()(const SalientMode& lhs, const SalientMode& rhs) const { + return lhs.assignment_weight > rhs.assignment_weight; + } + }; + + std::sort(modes->begin(), modes->end(), ModeWeightCompare()); +} + +// Determines the salient frame for a list of SalientLocations by performing +// mode finding and scales each point based on frame size. +void MotionSaliency::DetermineSalientFrame( + std::vector locations, SalientPointFrame* salient_frame) { + CHECK(salient_frame); + + std::vector modes; + { + MEASURE_TIME << "Mode finding"; + SalientModeFinding(&locations, &modes); + } + + const float denom_x = 1.0f / frame_width_; + const float denom_y = 1.0f / frame_height_; + + // Convert to salient points. 
+ for (int mode_idx = 0, + mode_sz = std::min(modes.size(), options_.num_top_irls_modes()); + mode_idx < mode_sz; ++mode_idx) { + SalientPoint* pt = salient_frame->add_point(); + pt->set_norm_point_x(modes[mode_idx].location.x()); + pt->set_norm_point_y(modes[mode_idx].location.y()); + pt->set_left(options_.bound_left()); + pt->set_bottom(options_.bound_bottom()); + pt->set_right(options_.bound_right()); + pt->set_top(options_.bound_top()); + + pt->set_norm_major(modes[mode_idx].axis_magnitude.x()); + pt->set_norm_minor(modes[mode_idx].axis_magnitude.y()); + pt->set_angle(modes[mode_idx].angle); + pt->set_weight(modes[mode_idx].assignment_weight * + options_.saliency_weight()); + + ScaleSalientPoint(denom_x, denom_y, pt); + } +} + +void ForegroundWeightsFromFeatures(const RegionFlowFeatureList& feature_list, + float foreground_threshold, + float foreground_gamma, + const CameraMotion* camera_motion, + std::vector* weights) { + CHECK(weights != nullptr); + weights->clear(); + + constexpr float kEpsilon = 1e-4f; + + CHECK_GT(foreground_threshold, 0.0f); + if (camera_motion) { + foreground_threshold *= + std::max(kEpsilon, InlierCoverage(*camera_motion, false)); + } + + const float weight_denom = 1.0f / foreground_threshold; + + // Map weights to foreground measure and determine minimum irls weight. + for (const auto& feature : feature_list.feature()) { + // Skip marked outliers. + if (feature.irls_weight() == 0) { + weights->push_back(0.0f); + continue; + } + + // Maps an irls_weight of magnitude weight_denom (from above) to zero, + // with values below weight_denom assigned linearly mapped (zero is mapped + // ot 1). Avoid mapping to zero as it used to mark outliers. 
+ const float foreground_measure = + std::max(0.0f, 1.0f - feature.irls_weight() * weight_denom); + + if (std::abs(foreground_gamma - 1.0f) < 1e-3f) { + weights->push_back(std::max(kEpsilon, foreground_measure)); + } else { + weights->push_back( + std::max(kEpsilon, std::pow(foreground_measure, foreground_gamma))); + } + } + CHECK_EQ(feature_list.feature_size(), weights->size()); +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/motion_saliency.h b/mediapipe/util/tracking/motion_saliency.h new file mode 100644 index 000000000..1dcd8e857 --- /dev/null +++ b/mediapipe/util/tracking/motion_saliency.h @@ -0,0 +1,150 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Computes MotionSaliency points that can be used for stabilization and +// retargeting. 
+ +#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_SALIENCY_H_ +#define MEDIAPIPE_UTIL_TRACKING_MOTION_SALIENCY_H_ + +#include +#include + +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/motion_saliency.pb.h" +#include "mediapipe/util/tracking/region_flow.h" + +namespace mediapipe { +class RegionFlowFeatureList; +class RegionFlowFrame; +class SalientPointFrame; +} // namespace mediapipe + +namespace mediapipe { + +class MotionSaliency { + public: + MotionSaliency(const MotionSaliencyOptions& options, int frame_width, + int frame_height); + ~MotionSaliency(); + + // Finds modes in the RegionFlowFeatureList (clusters for high IRLS weight, + // per default features agreeing with the background motion). + // Optionally, per feature irls weights can be supplied instead of using the + // features weight to adapt modes that will be found, e.g. see + // ForegroundWeightsFromFeatures below. + void SaliencyFromFeatures(const RegionFlowFeatureList& feature_list, + std::vector* irls_weights, // optional. + SalientPointFrame* salient_frame); + + // Finds saliency points (modes) from a list of points and their respective + // weights, outputs a SalientPointFrame. + void SaliencyFromPoints(const std::vector* points, + const std::vector* weights, + SalientPointFrame* salient_frame); + + // Selects saliency inliers, by searching for close-by salient points + // (within fractional MotionSaliencyOptions::filtering_support_distance) + // across adjacent frames (considered are + // #MotionSaliencyOptions::filtering_frame_radius before and after the + // current frame). + // If at least #MotionSaliencyOptions::filtering_minimum_support + // supporting points are found the tested salient point is kept, otherwise + // discarded. + // If desired performs rescaling, such that the median salient point weight + // equals MotionSaliencyOptions::saliency_weight(). 
+ void SelectSaliencyInliers(std::vector* motion_saliency, + bool rescale_to_median_saliency_weight); + + // Averages all salient points (unweighted average) per frame. The resulting + // mean salient point is assigned weight one, and the specified normalized + // bounds (as tuple left, bottom, right, top). + void CollapseMotionSaliency(const SaliencyPointList& input_saliency, + const Vector4_f& bounds, + SaliencyPointList* output_saliency); + + // Smooths saliency in space and time. + void FilterMotionSaliency( + std::vector* saliency_point_list); + + // Aggregates location in image domain and salient weight. + struct SalientLocation { + SalientLocation() {} + SalientLocation(const Vector2_f& _pt, float _weight) + : pt(_pt), weight(_weight) {} + Vector2_f pt; + float weight = 0; + }; + + private: + // Locates modes in a set of SalientLocation's. + // (using mean shift with bilateral weights, i.e. weight * spatial + // gaussian weighting). + // Only modes with for which the sum of total saliency weight is + // above min_irls_mode_sum are returned. + // Returns modes in the image domain as 2D points, sum of their + // assignment weights and spatial extend along major and minor axis. + // Modes are sorted w.r.t. their assignment irls weights (from highest to + // lowest). + struct SalientMode { + Vector2_f location; + // Total sum of irls weights assigned to this mode. + float assignment_weight = 0; + // Magnitude of major and minor axis storred in x and y, respectively. + Vector2_f axis_magnitude; + // Angle in radians w.r.t. x-axis. + float angle = 0; + }; + + // Note: input vector locations is not mutated by function. + void SalientModeFinding(std::vector* locations, + std::vector* modes); + + // Determines the salient frame for a list of SalientLocations by performing + // mode finding and scaling each point based on frame size. 
+ void DetermineSalientFrame(std::vector locations, + SalientPointFrame* salient_frame); + + MotionSaliencyOptions options_; + int frame_width_; + int frame_height_; +}; + +// Returns foregroundness weights in [0, 1] for each feature, by mapping irls +// weight to foreground score in [0, 1]. +// In particular, the foreground threshold indicates the *inverse* registration +// error (i.e. the irls weight) that is deemed a complete inlier. +// Weights in the interval [0, foreground_threshold] (corresponding to +// pixel errors in the interval [1 / foreground_threshold, inf]) +// are mapped to 1 - [0, 1], i.e. foreground threshold is mapped to zero +// with weights below the threshold being assigned values > 0. +// Therefore, larger values will increase amount of detected foreground +// as well as noise. +// In addition, foreground_gamma's < 1 can be used to increase the resolution +// of small foreground motions (irls weight close to the foreground_threshold) +// at the expense of larger foreground motions (irls weight close to zero). +// If optional parameter camera_motion is specified, the passed foreground +// threshold is scaled by the InlierCoverage of the camera_motion +// (which is in 0, 1). That is for unstable frames with small coverage, +// the threshold is tighter and fewer features are considered foreground. +void ForegroundWeightsFromFeatures( + const RegionFlowFeatureList& feature_list, + float foreground_threshold, // 0.5 is a good default value. + float foreground_gamma, // use 1.0 for default + const CameraMotion* camera_motion, // optional, can be nullptr. + std::vector* weights); + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_SALIENCY_H_ diff --git a/mediapipe/util/tracking/motion_saliency.proto b/mediapipe/util/tracking/motion_saliency.proto new file mode 100644 index 000000000..b7702dae0 --- /dev/null +++ b/mediapipe/util/tracking/motion_saliency.proto @@ -0,0 +1,71 @@ +// Copyright 2019 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +// Next tag: 17 +message MotionSaliencyOptions { + // Standard normalized bounds and weights used to initialize salient points. + // See region_flow.proto for details. + optional float bound_left = 1 [default = 0.3]; + optional float bound_bottom = 2 [default = 0.3]; + optional float bound_right = 15 [default = 0.3]; + optional float bound_top = 16 [default = 0.3]; + + optional float saliency_weight = 3 [default = 20]; + + // If set, scales saliency_weight by flow magnitude. + optional bool scale_weight_by_flow_magnitude = 8 [default = false]; + + // Minimum number of features within a region to be considered salient. + // Only applicable for functions accepting RegionFlowFrames. + optional int32 min_features = 4 [default = 5]; + + // If set, only considers regions flagged as forground. + optional bool use_only_foreground_regions = 9 [default = false]; + + // Specifies roughly number of foreground features mapped to one mode, + // for mode to be considered salient. + optional float min_irls_mode_weight = 10 [default = 10]; + + // Only returns the top N irls modes. + optional int32 num_top_irls_modes = 11 [default = 3]; + + // Mode finding is performed with a fraction radius of 10% of frame + // diameter by default. 
+ optional float mode_band_width = 12 [default = 0.10]; + + // We filter salient points along the temporal dimension only, keeping those + // that have sufficient support (in form of neighboring salient points). For + // every salient point in frame n, all points in frames + // [n - filtering_frame_radius, n + filtering_frame_radius] are tested, + // whether they support the current test point. + optional int32 selection_frame_radius = 5 [default = 5]; + + // Fractional distance to be considered a supporting salient point for a test + // point. + optional float selection_support_distance = 6 [default = 0.2]; + + // Minimum number of supporting salient points that need to be present in + // order for a point to be considered an inlier. + optional int32 selection_minimum_support = 7 [default = 4]; + + // Sigma in space (normalized domain). + optional float filtering_sigma_space = 13 [default = 0.05]; + + // Sigma in time (in frames). + optional float filtering_sigma_time = 14 [default = 5]; +} diff --git a/mediapipe/util/tracking/parallel_invoker.cc b/mediapipe/util/tracking/parallel_invoker.cc new file mode 100644 index 000000000..39228ba97 --- /dev/null +++ b/mediapipe/util/tracking/parallel_invoker.cc @@ -0,0 +1,36 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/parallel_invoker.h" + +// Choose between ThreadPool, OpenMP and serial execution. 
// Note: only one parallel invoker mode (flags_parallel_invoker_mode) can be
+// active at a time.
+ +// Execute a for loop in parallel from 0 to N via: +// ParallelFor(0, // start_index +// num_frames, // end_index, exclusive +// 1 // number of elements processed per iteration +// [](const BlockedRange& range) { +// // Process per-thread sub-range +// for (int i = range.begin(); i < range.end(); ++i) { +// // Process i'th item. +// } +// } + +// Specific implementation to copy a vector of images in parallel. +// class CopyInvoker { +// public: +// CopyInvoker(const vector& inputs, +// vector* outputs) +// : inputs_(inputs), outputs_(outputs) { +// } +// CopyInvoker(const CopyInvoker& rhs) +// : inputs_(rhs.inputs_), outputs_(rhs.outputs) { +// } +// void operator()(const BlockedRange& range) { +// for (int frame = range.begin(); frame < range.end(); ++frame) { +// inputs_[frame].copyTo(*(*outputs_)[frame]); +// } +// } +// private: +// const vector& inputs_; +// vector* outputs_; +// } + +// vector inputs; +// vector outputs; +// ParallelFor(0, num_frames, 1, CopyInvoker(inputs, &outputs)); +// +// OR (with lambdas): +// ParallelFor(0, num_frames, 1, +// [&inputs, &outputs](const BlockedRange& range) { +// for (int frame = range.begin(); frame < range.end(); ++frame) { +// inputs[frame].copyTo(*(outputs)[frame]); +// } +// } + +#ifndef MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_ +#define MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_ + +#include + +#include + +#include "absl/synchronization/mutex.h" +#include "mediapipe/framework/port/logging.h" + +#ifdef PARALLEL_INVOKER_ACTIVE +#include "mediapipe/framework/port/threadpool.h" + +#ifdef __APPLE__ +#include +#include +#endif + +#endif // PARALLEL_INVOKER_ACTIVE + +// Specifies parallelization implementation to use. 
+enum PARALLEL_INVOKER_MODE { + PARALLEL_INVOKER_NONE = 0, // Uses single threaded execution + PARALLEL_INVOKER_THREAD_POOL = 1, // Uses //thread/threadpool + PARALLEL_INVOKER_OPENMP = 2, // Uses OpenMP (requires compiler support) + PARALLEL_INVOKER_GCD = 3, // Uses GCD (Apple) + PARALLEL_INVOKER_MAX_VALUE = 4, // Increase when adding more modes +}; + +extern int flags_parallel_invoker_mode; +extern int flags_parallel_invoker_max_threads; + +// Note flag: Parallel processing only activated if +// PARALLEL_INVOKER_ACTIVE is defined. + +namespace mediapipe { + +// Partitions the range [begin, end) into equal blocks of size grain_size each +// (except last one, might be less than grain_size). +class BlockedRange { + public: + BlockedRange(int begin, int end, int grain_size) + : begin_(begin), end_(end), grain_size_(grain_size) {} + + int begin() const { return begin_; } + int end() const { return end_; } + int grain_size() const { return grain_size_; } + + private: + int begin_; + int end_; + int grain_size_; +}; + +// Partitions the range row_range x col_range into equal +// blocks of size row_range.grain_size() x col_range.grain_size() each +// (except last column and row might be of size less than grain_size in one +// or both of their dimensions). +class BlockedRange2D { + public: + BlockedRange2D(const BlockedRange& rows, const BlockedRange& cols) + : rows_(rows), cols_(cols) {} + + const BlockedRange& rows() const { return rows_; } + const BlockedRange& cols() const { return cols_; } + + private: + BlockedRange rows_; + BlockedRange cols_; +}; + +#ifdef PARALLEL_INVOKER_ACTIVE + +// Singleton ThreadPool for parallel invoker. +ThreadPool* ParallelInvokerThreadPool(); + +#ifdef __APPLE__ +// Enable to allow GCD as an option beside ThreadPool. 
+#define USE_PARALLEL_INVOKER_GCD 1 +#define CHECK_GCD_PARALLEL_WORK_COUNT DEBUG + +template +class ParallelInvokerGCDContext { + public: + ParallelInvokerGCDContext(const Invoker& invoker, const BlockedRange& rows) + : local_invoker_(invoker), rows_(rows) { +#if CHECK_GCD_PARALLEL_WORK_COUNT + count_ = 0; +#endif + } + + const Invoker& invoker() { +#if CHECK_GCD_PARALLEL_WORK_COUNT + // Implicitly tracking the # of launched tasks at invoker retrieval. + atomic_fetch_add(&count_, 1); +#endif + return local_invoker_; + } + const BlockedRange& rows() const { return rows_; } +#if CHECK_GCD_PARALLEL_WORK_COUNT + const int count() { return atomic_load(&count_); } +#endif + + private: + Invoker local_invoker_; + const BlockedRange& rows_; +#if CHECK_GCD_PARALLEL_WORK_COUNT + _Atomic(int32_t) count_; +#endif +}; + +template +class ParallelInvokerGCDContext2D : public ParallelInvokerGCDContext { + public: + ParallelInvokerGCDContext2D(const Invoker& invoker, const BlockedRange& rows, + const BlockedRange& cols) + : ParallelInvokerGCDContext(invoker, rows), cols_(cols) {} + + const BlockedRange& cols() const { return cols_; } + + private: + BlockedRange cols_; +}; + +template +static void ParallelForGCDTask(void* context, size_t index) { + ParallelInvokerGCDContext* invoker_context = + static_cast*>(context); + const BlockedRange& all_tasks = invoker_context->rows(); + int start = all_tasks.begin() + index * all_tasks.grain_size(); + int end = std::min(all_tasks.end(), start + all_tasks.grain_size()); + BlockedRange this_task(start, end, all_tasks.grain_size()); + + const Invoker& invoker = invoker_context->invoker(); + invoker(this_task); +} + +template +static void ParallelForGCDTask2D(void* context, size_t index) { + ParallelInvokerGCDContext2D* invoker_context = + static_cast*>(context); + // Partitioning across rows. 
+ const BlockedRange& all_tasks = invoker_context->rows(); + int start = all_tasks.begin() + index * all_tasks.grain_size(); + int end = std::min(all_tasks.end(), start + all_tasks.grain_size()); + BlockedRange this_task(start, end, all_tasks.grain_size()); + + const Invoker& invoker = invoker_context->invoker(); + invoker(BlockedRange2D(this_task, invoker_context->cols())); +} +#endif // __APPLE__ + +#endif // PARALLEL_INVOKER_ACTIVE +// Simple wrapper for compatibility with below ParallelFor function. +template +void SerialFor(size_t start, size_t end, size_t grain_size, + const Invoker& invoker) { + invoker(BlockedRange(start, end, 1)); +} + +inline void CheckAndSetInvokerOptions() { +#if defined(PARALLEL_INVOKER_ACTIVE) +#if defined(__ANDROID__) + // If unsupported option is selected, force usage of OpenMP if detected, and + // ThreadPool otherwise. + if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE && + flags_parallel_invoker_mode != PARALLEL_INVOKER_THREAD_POOL && + flags_parallel_invoker_mode != PARALLEL_INVOKER_OPENMP) { +#if defined(_OPENMP) + LOG(WARNING) << "Unsupported invoker mode selected on Android. " + << "OpenMP linkage detected, so falling back to OpenMP"; + flags_parallel_invoker_mode = PARALLEL_INVOKER_OPENMP; +#else // _OPENMP + // Fallback mode for active parallel invoker without OpenMP is ThreadPool. + LOG(WARNING) << "Unsupported invoker mode selected on Android. " + << "Falling back to ThreadPool"; + flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL; +#endif // _OPENMP + } +#endif // __ANDROID__ + +#if defined(__APPLE__) || defined(__EMSCRIPTEN__) + // Force usage of ThreadPool if unsupported option is selected. + // (OpenMP is not supported on iOS, due to missing clang support). 
+ if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE && +#if defined(USE_PARALLEL_INVOKER_GCD) + flags_parallel_invoker_mode != PARALLEL_INVOKER_GCD && +#endif // USE_PARALLEL_INVOKER_GCD + flags_parallel_invoker_mode != PARALLEL_INVOKER_THREAD_POOL) { + LOG(WARNING) << "Unsupported invoker mode selected on iOS. " + << "Falling back to ThreadPool mode"; + flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL; + } +#endif // __APPLE__ || __EMSCRIPTEN__ + +#if !defined(__APPLE__) && !defined(__EMSCRIPTEN__) && !defined(__ANDROID__) + flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL; +#endif // !__APPLE__ && !__EMSCRIPTEN__ && !__ANDROID__ + + // If OpenMP is requested, make sure we can actually use it, and fall back + // to ThreadPool if not. + if (flags_parallel_invoker_mode == PARALLEL_INVOKER_OPENMP) { +#if !defined(_OPENMP) + LOG(ERROR) << "OpenMP invoker mode selected but not compiling with OpenMP " + << "enabled. Falling back to ThreadPool"; + flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL; +#endif // _OPENMP + } + +#else // PARALLEL_INVOKER_ACTIVE + if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE) { + LOG(ERROR) << "Parallel execution requested but PARALLEL_INVOKER_ACTIVE " + << "compile flag is not set. Falling back to single threaded " + << "execution."; + flags_parallel_invoker_mode = PARALLEL_INVOKER_NONE; + } +#endif // PARALLEL_INVOKER_ACTIVE + + CHECK_LT(flags_parallel_invoker_mode, PARALLEL_INVOKER_MAX_VALUE) + << "Invalid invoker mode specified."; + CHECK_GE(flags_parallel_invoker_mode, 0) << "Invalid invoker mode specified."; +} + +// Performs parallel iteration from [start to end), scheduling grain_size +// iterations per thread. For each iteration +// invoker(BlockedRange(thread_local_start, thread_local_end)) +// is called. Each thread is given its local copy of invoker, i.e. +// invoker needs to have copy constructor defined. 
+template +void ParallelFor(size_t start, size_t end, size_t grain_size, + const Invoker& invoker) { +#ifdef PARALLEL_INVOKER_ACTIVE + CheckAndSetInvokerOptions(); + switch (flags_parallel_invoker_mode) { +#if defined(__APPLE__) + case PARALLEL_INVOKER_GCD: { + int iterations_remain = (end - start + grain_size - 1) / grain_size; + CHECK_GT(iterations_remain, 0); + if (iterations_remain == 1) { + // Execute invoker serially. + invoker(BlockedRange(start, std::min(end, start + grain_size), 1)); + } else { + BlockedRange all_tasks(start, end, grain_size); + ParallelInvokerGCDContext context(invoker, all_tasks); + dispatch_queue_t concurrent_queue = + dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + dispatch_apply_f(iterations_remain, concurrent_queue, &context, + ParallelForGCDTask); +#if CHECK_GCD_PARALLEL_WORK_COUNT + CHECK_EQ(iterations_remain, context.count()); +#endif + } + break; + } +#endif // __APPLE__ + + case PARALLEL_INVOKER_THREAD_POOL: { + int iterations_remain = (end - start + grain_size - 1) / grain_size; + CHECK_GT(iterations_remain, 0); + if (iterations_remain == 1) { + // Execute invoker serially. + invoker(BlockedRange(start, std::min(end, start + grain_size), 1)); + break; + } + + struct { + absl::Mutex mutex; + absl::CondVar completed; + int iterations_remain GUARDED_BY(mutex); + } loop; + { + absl::MutexLock lock(&loop.mutex); + loop.iterations_remain = iterations_remain; + } + + for (int x = start; x < end; x += grain_size) { + auto loop_func = [x, end, grain_size, &loop, invoker]() { + // Execute invoker. + invoker(BlockedRange(x, std::min(end, x + grain_size), 1)); + + // Decrement counter. + absl::MutexLock lock(&loop.mutex); + --loop.iterations_remain; + if (loop.iterations_remain == 0) { + loop.completed.SignalAll(); + } + }; + + // Attempt to run in parallel, if busy run serial to avoid deadlocking. 
+ // This can happen during nested invocation of ParallelFor, as if the + // loop iteration itself is calling ParallelFor we might deadlock if + // we can not guarantee for the iteration to be scheduled. + ParallelInvokerThreadPool()->Schedule(loop_func); + } + + // Wait on termination of all iterations. + loop.mutex.Lock(); + while (loop.iterations_remain > 0) { + loop.completed.Wait(&loop.mutex); + } + loop.mutex.Unlock(); + break; + } + + case PARALLEL_INVOKER_OPENMP: { + // Use thread-local copy of invoker. + Invoker local_invoker(invoker); +#pragma omp parallel for firstprivate(local_invoker) \ + num_threads(flags_parallel_invoker_max_threads) + for (int x = start; x < end; ++x) { + local_invoker(BlockedRange(x, x + 1, 1)); + } + break; + } + + case PARALLEL_INVOKER_NONE: { + SerialFor(start, end, grain_size, invoker); + break; + } + + case PARALLEL_INVOKER_MAX_VALUE: { + LOG(FATAL) << "Impossible."; + break; + } + } +#else + SerialFor(start, end, grain_size, invoker); +#endif // PARALLEL_INVOKER_ACTIVE +} + +// Simple wrapper for compatibility with below ParallelFor2D function. +template +void SerialFor2D(size_t start_row, size_t end_row, size_t start_col, + size_t end_col, size_t grain_size, const Invoker& invoker) { + invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1), + BlockedRange(start_col, end_col, 1))); +} + +// Same as above ParallelFor for 2D iteration. +template +void ParallelFor2D(size_t start_row, size_t end_row, size_t start_col, + size_t end_col, size_t grain_size, const Invoker& invoker) { +#ifdef PARALLEL_INVOKER_ACTIVE + CheckAndSetInvokerOptions(); + switch (flags_parallel_invoker_mode) { +#if defined(__APPLE__) + case PARALLEL_INVOKER_GCD: { + const int iterations_remain = + (end_row - start_row + grain_size - 1) / grain_size; + CHECK_GT(iterations_remain, 0); + if (iterations_remain == 1) { + // Execute invoker serially. 
+ invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1), + BlockedRange(start_col, end_col, 1))); + } else { + BlockedRange all_tasks(start_row, end_row, grain_size); + ParallelInvokerGCDContext2D context( + invoker, all_tasks, BlockedRange(start_col, end_col, grain_size)); + dispatch_queue_t concurrent_queue = + dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + dispatch_apply_f(iterations_remain, concurrent_queue, &context, + ParallelForGCDTask2D); +#if CHECK_GCD_PARALLEL_WORK_COUNT + CHECK_EQ(iterations_remain, context.count()); +#endif + } + break; + } +#endif // __APPLE__ + + case PARALLEL_INVOKER_THREAD_POOL: { + int iterations_remain = end_row - start_row; // Guarded by loop_mutex + CHECK_GT(iterations_remain, 0); + if (iterations_remain == 1) { + // Execute invoker serially. + invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1), + BlockedRange(start_col, end_col, 1))); + break; + } + + absl::Mutex loop_mutex; + absl::CondVar loop_completed; + + for (int y = start_row; y < end_row; ++y) { + auto loop_func = [y, start_col, end_col, &loop_mutex, &loop_completed, + &iterations_remain, invoker]() { + // Execute invoker. + invoker(BlockedRange2D(BlockedRange(y, y + 1, 1), + BlockedRange(start_col, end_col, 1))); + + // Decrement counter. + absl::MutexLock lock(&loop_mutex); + --iterations_remain; + if (iterations_remain == 0) { + loop_completed.Signal(); + } + }; + + // Attempt to run in parallel, if busy run serial to avoid deadlocking. + ParallelInvokerThreadPool()->Schedule(loop_func); + } + + // Wait on termination of all iterations. + loop_mutex.Lock(); + while (iterations_remain > 0) { + loop_completed.Wait(&loop_mutex); + } + loop_mutex.Unlock(); + break; + } + + case PARALLEL_INVOKER_OPENMP: { + // Use thread-local copy of invoker. 
+ Invoker local_invoker(invoker); +#pragma omp parallel for firstprivate(local_invoker) \ + num_threads(flags_parallel_invoker_max_threads) + for (int y = start_row; y < end_row; ++y) { + local_invoker(BlockedRange2D(BlockedRange(y, y + 1, 1), + BlockedRange(start_col, end_col, 1))); + } + break; + } + + case PARALLEL_INVOKER_NONE: { + SerialFor2D(start_row, end_row, start_col, end_col, grain_size, invoker); + break; + } + + case PARALLEL_INVOKER_MAX_VALUE: { + LOG(FATAL) << "Impossible."; + break; + } + } +#else + SerialFor2D(start_row, end_row, start_col, end_col, grain_size, invoker); +#endif // PARALLEL_INVOKER_ACTIVE +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_ diff --git a/mediapipe/util/tracking/parallel_invoker_forbid_mixed.cc b/mediapipe/util/tracking/parallel_invoker_forbid_mixed.cc new file mode 100644 index 000000000..383ad53a3 --- /dev/null +++ b/mediapipe/util/tracking/parallel_invoker_forbid_mixed.cc @@ -0,0 +1,28 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Guard to ensure clients do not link against both, +// single and parallel version. 
+#ifdef PARALLEL_INVOKER_ACTIVE +int LinkageAgainstBothSingleAndParallelStabilizationVersionsDetected() { + return 0; +} + +#endif // PARALLEL_INVOKER_ACTIVE + +#ifdef PARALLEL_INVOKER_INACTIVE +int LinkageAgainstBothSingleAndParallelStabilizationVersionsDetected() { + return 1; +} +#endif // PARALLEL_INVOKER_INACTIVE diff --git a/mediapipe/util/tracking/parallel_invoker_test.cc b/mediapipe/util/tracking/parallel_invoker_test.cc new file mode 100644 index 000000000..8f1c86cb9 --- /dev/null +++ b/mediapipe/util/tracking/parallel_invoker_test.cc @@ -0,0 +1,60 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/parallel_invoker.h" + +#include +#include + +#include "absl/synchronization/mutex.h" +#include "mediapipe/framework/port/gtest.h" + +namespace mediapipe { +namespace { + +void RunParallelTest() { + absl::Mutex numbers_mutex; + std::vector numbers; + const int kArraySize = 5000; + + // Fill number array in parallel. 
+ ParallelFor(0, kArraySize, 1, + [&numbers_mutex, &numbers](const BlockedRange& b) { + for (int k = b.begin(); k != b.end(); ++k) { + absl::MutexLock lock(&numbers_mutex); + numbers.push_back(k); + } + }); + + std::vector expected(kArraySize); + std::iota(expected.begin(), expected.end(), 0); + EXPECT_TRUE( + std::is_permutation(expected.begin(), expected.end(), numbers.begin())); +} + +TEST(ParallelInvokerTest, PhotosTest) { + flags_parallel_invoker_mode = PARALLEL_INVOKER_OPENMP; + + RunParallelTest(); +} + +TEST(ParallelInvokerTest, ThreadPoolTest) { + flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL; + + // Needs to be run in opt mode to pass. + RunParallelTest(); +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/util/tracking/push_pull_filtering.h b/mediapipe/util/tracking/push_pull_filtering.h new file mode 100644 index 000000000..f9b2c6c3c --- /dev/null +++ b/mediapipe/util/tracking/push_pull_filtering.h @@ -0,0 +1,1296 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Push pull filtering parametrized by number of channels. +// Performs sparse vector data interpolation across a specified domain. +// Optionally interpolation can be guided to be discontinuous across image +// boundaries and customized with various multipliers as described below. 
+// +// Note: As the file contains templated implementations it is recommended to be +// included in cc files instead of headers to speed up compilation. + +#ifndef MEDIAPIPE_UTIL_TRACKING_PUSH_PULL_FILTERING_H_ +#define MEDIAPIPE_UTIL_TRACKING_PUSH_PULL_FILTERING_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/image_util.h" +#include "mediapipe/util/tracking/push_pull_filtering.pb.h" + +namespace mediapipe { + +const float kBilateralEps = 1e-6f; + +// Push Pull algorithm can be decorated with mip-map visualizers, +// per-level weight adjusters and per-filter element weight multipliers. +// Implemented by default as no-ops below. +// +// Mip map's are made of C + 1 channel matrices, where data is stored at +// channels [0 .. C - 1], and push pull weights at index C. Commong flag +// pull_down_sampling for classes below is true for first pull stage +// (fine -> coarse) and false for second push stage (coarse -> fine). + +// Is called with the interpolated data at every level of the hierarchy. Enables +// to adjust weights or perform other kinds of modification *globally* for each +// mip-map level. +class PushPullWeightAdjuster { + public: + virtual ~PushPullWeightAdjuster() {} + + // In case of bilateral weighting, input_frame (resized to corresponding mip + // map level) is passed as well, otherwise parameter is NULL. + virtual void AdjustWeights(int mip_map_level, bool pull_down_sampling, + cv::Mat* input_frame, // Bilateral case. + cv::Mat* data_with_weights) = 0; +}; + +// Allows mip map to be visualized after first stage (pull_down_sampling == +// true) and second stage (push up sampling, i.e. pull_down_sampling == false). +// Note: For visualizers, data values in mip map are pre-multiplied by +// confidence weights in channel C if corresponding flag is_premultiplied +// is set to true). 
In this case normalization (division by confidence) +// has to be performed before visualization. +// Passed mip maps are borderless, i.e. views into the actual mip map with +// border being removed. +class PushPullMipMapVisualizer { + public: + virtual ~PushPullMipMapVisualizer() {} + + virtual void Visualize(const std::vector& mip_map, + bool pull_down_sampling, + const std::vector& is_premultiplied) = 0; +}; + +// FilterWeightMultiplier is a template argument to PushPullFiltering to +// adjust the filter weight at every up- and downsampling stage. +// Specifically every point (x, y) with data pointer anchor_ptr into +// the current mip maplevel (C + 1 channels, first C +// contain data, index C contains push-pull importance weight) is filtered in a +// neighborhood with several neighboring samples (pointed to by filter_ptr). +// In case of bilateral filtering img_ptr is pointing to 3 channel image pixel +// of the anchor. +// Default version of multiplier must be constructable with no arguments, and +// the following two interfaces have to be implemented: +// +// // Signals change in level, can be used for mutable initialization functions. +// void SetLevel(int mip_map_level, bool pull_down_sampling); +// +// // Function is called once for every neighbor (filter_ptr) of a pixel +// // (anchor_ptr). Location (x,y) of the pixel pointed to by anchor pointer is +// // also passed if needed for more complex operations. +// float WeightMultiplier(const float* anchor_ptr, // Points to anchor. +// const float* filter_ptr, // Offset element. +// const uint_8t* img_ptr, // NULL if not bilateral. +// int x, +// int y) const; +// +// Here is an example (used by default). + +// Default no-op. 
+class FilterWeightMultiplierOne { + public: + void SetLevel(int mip_map_level, bool pull_down_sampling) {} + + float GetWeight(const float* anchor_ptr, const float* filter_ptr, + const uint8* img_ptr, int x, int y) const { + return 1.0f; + } +}; + +class PushPullFilteringTest; + +// Templated by number of channels and FilterWeightMultiplier. +template +class PushPullFiltering { + public: + enum FilterType { + BINOMIAL_3X3 = 0, + BINOMIAL_5X5 = 1, + GAUSSIAN_3X3 = 2, // sigma = 1. + GAUSSIAN_5X5 = 3, // sigma = 1.6. + }; + + // Initializes push pull filter for specified domain size. + // Optionally, weight_multiplier, mip-map visualizer and + // weight adjuster can be passed as argument. + PushPullFiltering(const cv::Size& domain_size, FilterType filter_type, + bool use_bilateral, + FilterWeightMultiplier* weight_multiplier, // Optional. + PushPullMipMapVisualizer* mip_map_visualizer, // Optional. + PushPullWeightAdjuster* weight_adjuster); // Optional. + PushPullFiltering() = delete; + PushPullFiltering(const PushPullFiltering&) = delete; + PushPullFiltering& operator=(const PushPullFiltering&) = delete; + + // Returns number of pyramid levels allocated for given domain size. + int PyramidLevels() const { return downsample_pyramid_.size(); } + + // Returns domain size of n-th pyramid level (including border depending on + // filter_type). + cv::Size NthPyramidDomain(int level) { + CHECK_LT(level, PyramidLevels()); + return downsample_pyramid_[level].size(); + } + + void SetOptions(const PushPullOptions& options); + + // Push-Pull filter for C + 1 channel float displacement image + // (expected to be of size domain_size plus 1 (if filter == *_3x3) or + // 2 (if filter == *_5x5) pixel border around it, use + // BorderFromFilterType function for lookup). + // First C dimensions contain interpolated data, last dimension contains + // associated importance weight. 
+ // Places data_values on integer location data_locations + origin with + // uniform weight (== push_pull_weight) and employs iterative weighted + // down and up-sampling. If optional data_weight is specified uses per datum + // feature weight instead (weights are expected to be within [0, 1]). + // If input_frame is specified, spatial filter type is combined with + // intensity based filtering yielding bilateral weighting. + // Results are returned in parameter results. + // Filter is performed in 2 stages: + // i) PullDownSampling: Densifies the data by successive downsampling stages, + // averaging confidence and values across the domain from sparse data + // locations to unset values. + // ii) PushUpSampling: Pushes densified data back through the pyramid by + // successive upsampling stages, over-writing unset values with filled in + // data from downsampled version. + void PerformPushPull(const std::vector& data_locations, + const std::vector>& data_values, + float push_pull_weight, cv::Point2i origin, + int readout_level, // Default: 0. + const std::vector* data_weights, // Optional. + const cv::Mat* input_frame, // Optional. + cv::Mat* results); + + // This is the same as PerformPushPull above except that it + // assumes that the data (the mip_map at level 0) is given as a cv::Mat. + // The Mat should be C + 1 channels in total. + // The first C channels (channels 0 to C-1) of mip_map_level_0 should contain + // data_values * data_weights (or push_pull_weight) at appropriate locations, + // offset by the border. + // The corresponding locations in channel C are set to the data_weights. + // Locations without data should be set to 0 in all channels. + void PerformPushPullMat(const cv::Mat& mip_map_level_0, + int readout_level, // Default: 0. + const cv::Mat* input_frame, // Optional. 
+ cv::Mat* results); + + static constexpr int BorderFromFilterType(FilterType filter_type); + + FilterType filter_type() const { return filter_type_; } + + private: + // Implementation functions for PushPullFiltering. + void SetupFilters(); + + void SetupBilateralLUT(); + + // If allocate_base_level is set, allocates a frame for level zero of + // size domain_size + 2 * border, otherwise only levels 1 to end are + // allocated. + void AllocatePyramid(const cv::Size& domain_size, int border, int type, + bool allocate_base_level, std::vector* pyramid); + + // Downsampling operation for input_frame along pre-allocated pyramid. + void InitializeImagePyramid(const cv::Mat& input_frame, + std::vector* pyramid); + + void PerformPushPullImpl(const int readout_level, const cv::Mat* input_frame, + std::vector* mip_map_ptr); + + void PullDownSampling(int num_filter_elems, const float* filter_weights, + std::vector* mip_map_ptr); + + void PushUpSampling(int num_filter_elems, const float* filter_weights, + int readout_level, std::vector* mip_map_ptr); + + // Convenience function selecting appropiate border size based on filter_type. + template + void CopyNecessaryBorder(cv::Mat* mat); + + // Implementation function for fast upsampling. See comments before definition + // for details. + void GetUpsampleTaps3(const float* filter_weights, + const std::vector* space_offsets, // optional. + int inc_x, int inc_y, std::vector tap_weights[4], + std::vector tap_offsets[4], + std::vector tap_space_offsets[4]); + + void GetUpsampleTaps5(const float* filter_weights, + const std::vector* space_offsets, // optional. 
+ int inc_x, int inc_y, std::vector tap_weights[4], + std::vector tap_offsets[4], + std::vector tap_space_offsets[4]); + + inline int ColorDiffL1(const uint8* lhs_ptr, const uint8* rhs_ptr) { + return abs(static_cast(lhs_ptr[0]) - static_cast(rhs_ptr[0])) + + abs(static_cast(lhs_ptr[1]) - static_cast(rhs_ptr[1])) + + abs(static_cast(lhs_ptr[2]) - static_cast(rhs_ptr[2])); + } + + const cv::Size domain_size_; + FilterType filter_type_; + int border_; + + std::array binomial5_weights_; + std::array binomial3_weights_; + std::array gaussian5_weights_; + std::array gaussian3_weights_; + + // Pyramids used by PushPull implementation. + std::vector downsample_pyramid_; + std::vector input_frame_pyramid_; + + // Pre-computed spacial offets based on selected filter for each level of the + // pyramid. + std::vector> pyramid_space_offsets_; + + bool use_bilateral_ = false; + + FilterWeightMultiplier* weight_multiplier_; + // Used to create default multiplier is none was passed. + std::unique_ptr default_weight_multiplier_; + + PushPullMipMapVisualizer* mip_map_visualizer_; + PushPullWeightAdjuster* weight_adjuster_; + PushPullOptions options_; + + std::vector bilateral_lut_; + + friend class PushPullFilteringTest; +}; + +// Typedef's for explicit instantiation, add more if needed. +typedef PushPullFiltering<1> PushPullFilteringC1; +typedef PushPullFiltering<2> PushPullFilteringC2; +typedef PushPullFiltering<3> PushPullFilteringC3; +typedef PushPullFiltering<4> PushPullFilteringC4; + +// For compatible forward declarations add this to your header: +// template class PushPullFiltering; +// class FilterWeightMultiplierOne; +// typedef PushPullFiltering<1, FilterWeightMultiplierOne> PushPullFlowC1; + +// Several concrete helper classes below. +/////////////////////////////////////////////////////////////////////////////// +// Template implementation functions below. 
+// +template +PushPullFiltering::PushPullFiltering( + const cv::Size& domain_size, FilterType filter_type, bool use_bilateral, + FilterWeightMultiplier* weight_multiplier, + PushPullMipMapVisualizer* mip_map_visualizer, + PushPullWeightAdjuster* weight_adjuster) + : domain_size_(domain_size), + filter_type_(filter_type), + use_bilateral_(use_bilateral), + weight_multiplier_(weight_multiplier), + mip_map_visualizer_(mip_map_visualizer), + weight_adjuster_(weight_adjuster) { + border_ = BorderFromFilterType(filter_type); + if (border_ < 0) { + LOG(FATAL) << "Unknown filter requested."; + } + + SetupFilters(); + AllocatePyramid(domain_size_, border_, CV_32FC(C + 1), true, + &downsample_pyramid_); + + if (use_bilateral_) { + SetupBilateralLUT(); + AllocatePyramid(domain_size_, border_, CV_8UC3, true, + &input_frame_pyramid_); + + // Setup space offsets. + pyramid_space_offsets_.resize(input_frame_pyramid_.size()); + for (int l = 0, num_levels = input_frame_pyramid_.size(); l < num_levels; + ++l) { + std::vector& space_offsets = pyramid_space_offsets_[l]; + const cv::Mat& pyramid_frame = input_frame_pyramid_[l]; + for (int row = -border_; row <= border_; ++row) { + for (int col = -border_; col <= border_; ++col) { + space_offsets.push_back(row * pyramid_frame.step[0] + + col * pyramid_frame.elemSize()); + } + } + } + } + + // Create default weight multiplier if none was passed. + if (weight_multiplier_ == NULL) { + default_weight_multiplier_.reset(new FilterWeightMultiplier()); + weight_multiplier_ = default_weight_multiplier_.get(); + } +} + +template +constexpr int +PushPullFiltering::BorderFromFilterType( + FilterType filter_type) { + // -1 indicates error (unknown filter_type). + return (filter_type == BINOMIAL_3X3 || filter_type == GAUSSIAN_3X3) + ? 1 + : ((filter_type == BINOMIAL_5X5 || filter_type == GAUSSIAN_5X5) + ? 
2 + : -1); +} + +template +void PushPullFiltering::SetOptions( + const PushPullOptions& options) { + options_ = options; + SetupBilateralLUT(); +} + +template +void PushPullFiltering::SetupFilters() { + // Setup binomial weights. + std::array bin5_weights{{1, 4, 6, 4, 1, 4, 16, 24, 16, + 4, 6, 24, 36, 24, 6, 4, 16, 24, + 16, 4, 1, 4, 6, 4, 1}}; + + std::array bin3_weights{{1, 2, 1, 2, 4, 2, 1, 2, 1}}; + + // Normalize maximum of binomial filters to 1. + const float bin5_scale = + (1.f / std::accumulate(bin5_weights.begin(), bin5_weights.end(), 0.0f)); + + for (int i = 0; i < 25; ++i) { + binomial5_weights_[i] = bin5_weights[i] * bin5_scale; + } + + const float bin3_scale = + (1.f / std::accumulate(bin3_weights.begin(), bin3_weights.end(), 0.0f)); + + for (int i = 0; i < 9; ++i) { + binomial3_weights_[i] = bin3_weights[i] * bin3_scale; + } + + // Setup gaussian weights. + const float space_coeff_5 = -0.5f / (1.6f * 1.6f); + for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 5; ++i) { + const float radius = (j - 2) * (j - 2) + (i - 2) * (i - 2); + gaussian5_weights_[j * 5 + i] = + std::exp(static_cast(radius * space_coeff_5)); + } + } + + const float space_coeff_3 = -0.5f / (1.0f * 1.0f); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 3; ++i) { + const float radius = (j - 1) * (j - 1) + (i - 1) * (i - 1); + gaussian3_weights_[j * 3 + i] = + std::exp(static_cast(radius * space_coeff_3)); + } + } + + // Normalize maximum of gaussian weights to 1 (center value has largest + // magnitude). 
+ const float gauss5_scale = + 1.0f / std::accumulate(gaussian5_weights_.begin(), + gaussian5_weights_.end(), 0.0f); + const float gauss3_scale = + 1.0f / std::accumulate(gaussian3_weights_.begin(), + gaussian3_weights_.end(), 0.0f); + + for (int i = 0; i < 25; ++i) { + gaussian5_weights_[i] *= gauss5_scale; + } + + for (int i = 0; i < 9; ++i) { + gaussian3_weights_[i] *= gauss3_scale; + } +} + +template +void PushPullFiltering::SetupBilateralLUT() { + // We use L1 color distance here, maximum is 3 (channels) * 256 (max + // intensity). + const int max_bins = 3 * 256; + bilateral_lut_.resize(max_bins, 0.0f); + + const float sigma_color = options_.bilateral_sigma(); + const float gauss_color_coeff = -0.5f / (sigma_color * sigma_color); + + // Normalized such that first bin equals one. + // Avoid non-zero weights for large intensity differences. + for (int i = 0; i < max_bins; ++i) { + bilateral_lut_[i] = std::max( + kBilateralEps, + std::exp(static_cast(i * i * gauss_color_coeff))); + } +} + +template +void PushPullFiltering::AllocatePyramid( + const cv::Size& domain_size, int border, int type, bool allocate_base_level, + std::vector* pyramid) { + CHECK(pyramid != nullptr); + pyramid->clear(); + pyramid->reserve(16); // Do not anticipate videos with dimensions + // larger than 2^16. 
+ + int width = domain_size.width; + int height = domain_size.height; + + if (allocate_base_level) { + pyramid->push_back(cv::Mat(height + 2 * border, width + 2 * border, type)); + } + + while (width > 1 && height > 1) { + width = (width + 1) / 2; + height = (height + 1) / 2; + pyramid->push_back(cv::Mat(height + 2 * border, width + 2 * border, type)); + } +} + +template +void PushPullFiltering::InitializeImagePyramid( + const cv::Mat& input_frame, std::vector* pyramid) { + CHECK(pyramid != nullptr); + CHECK_GT(pyramid->size(), 0); + + cv::Mat base_level((*pyramid)[0], + cv::Range(border_, (*pyramid)[0].rows - border_), + cv::Range(border_, (*pyramid)[0].cols - border_)); + CHECK_EQ(base_level.rows, input_frame.rows); + CHECK_EQ(base_level.cols, input_frame.cols); + CHECK_EQ(base_level.type(), input_frame.type()); + + input_frame.copyTo(base_level); + CopyNecessaryBorder(&(*pyramid)[0]); + + for (int l = 0; l < pyramid->size() - 1; ++l) { + cv::Mat source((*pyramid)[l], + cv::Range(border_, (*pyramid)[l].rows - border_), + cv::Range(border_, (*pyramid)[l].cols - border_)); + cv::Mat destination((*pyramid)[l + 1], + cv::Range(border_, (*pyramid)[l + 1].rows - border_), + cv::Range(border_, (*pyramid)[l + 1].cols - border_)); + cv::pyrDown(source, destination, destination.size()); + CopyNecessaryBorder(&(*pyramid)[l + 1]); + } +} + +template +template +void PushPullFiltering::CopyNecessaryBorder( + cv::Mat* mat) { + switch (filter_type_) { + case BINOMIAL_3X3: + case GAUSSIAN_3X3: + CopyMatBorder(mat); + break; + case BINOMIAL_5X5: + case GAUSSIAN_5X5: + CopyMatBorder(mat); + break; + default: + LOG(FATAL) << "Unknown filter"; + } +} + +// In case of upsampling there are 4 possible anchor / image configurations +// that can occur. First the general layout of upsampling: x corresponds to +// positions with defined values, 0 to the space in-between. 
+// x 0 x 0 x 0 x 0 +// 0 0 0 0 0 0 0 0 +// x 0 x 0 x 0 x 0 +// 0 0 0 0 0 0 0 0 +// x 0 x 0 x 0 x 0 + +// The 4 cases for 3x3 filter are: +// Case 0: +// Filter incident with x: 1x1 filter +// Case 1: +// Filter incident to 0 at x0x: 1x2 filter +// Case 2: +// Filter incident to 0 at x : 2x1 filter +// 0 +// x +// Case 3: +// Filter incident to center 0 at x 0 x : 2x2 filter +// 0 0 0 +// x 0 x +// When traversing an to be upsampled image, for even rows we have to alternate +// between cases 0 and 1, for odd rows we alternate between 2 and 3. +// Computes the weights and tap offsets for above described cases, using +// sample increment in x direction of inc_x (e.g. color channels) and +// sample increment in y of inc_y (e.g. bytes in row). +// Optionally also selects space_offsets for bilateral filtering for each +// upsampling case, if space_offsets is specified. Reasoning here is, that +// tap offsets are defined in the image domain of the low-res frame one level +// above the currently processed one, while space_offsets are used to compute +// the corresponding joint bilateral weights in the high-res frame. +template +void PushPullFiltering::GetUpsampleTaps3( + const float* filter_weights, + const std::vector* space_offsets, // optional. 
+ int inc_x, int inc_y, std::vector tap_weights[4], + std::vector tap_offsets[4], std::vector tap_space_offsets[4]) { + // Taps for filter 0 1 2 + // 3 4 5 + // 6 7 8 + // Case 0: + tap_weights[0].push_back(filter_weights[4]); + tap_offsets[0].push_back(0); + if (space_offsets) { + tap_space_offsets[0].push_back((*space_offsets)[4]); + } + + // Case 1: + tap_weights[1].push_back(filter_weights[3]); + tap_offsets[1].push_back(0); + tap_weights[1].push_back(filter_weights[5]); + tap_offsets[1].push_back(inc_x); + if (space_offsets) { + tap_space_offsets[1].push_back((*space_offsets)[3]); + tap_space_offsets[1].push_back((*space_offsets)[5]); + } + + // Case 2: + tap_weights[2].push_back(filter_weights[1]); + tap_offsets[2].push_back(0); + tap_weights[2].push_back(filter_weights[7]); + tap_offsets[2].push_back(inc_y); + if (space_offsets) { + tap_space_offsets[2].push_back((*space_offsets)[1]); + tap_space_offsets[2].push_back((*space_offsets)[7]); + } + + // Case 3: + tap_weights[3].push_back(filter_weights[0]); + tap_offsets[3].push_back(0); + tap_weights[3].push_back(filter_weights[2]); + tap_offsets[3].push_back(inc_x); + tap_weights[3].push_back(filter_weights[6]); + tap_offsets[3].push_back(inc_y); + tap_weights[3].push_back(filter_weights[8]); + tap_offsets[3].push_back(inc_y + inc_x); + if (space_offsets) { + tap_space_offsets[3].push_back((*space_offsets)[0]); + tap_space_offsets[3].push_back((*space_offsets)[2]); + tap_space_offsets[3].push_back((*space_offsets)[6]); + tap_space_offsets[3].push_back((*space_offsets)[8]); + } +} + +// For 5x5 filter. 
+// Case 0: +// Filter incident to center x at x 0 x 0 x: 3x3 filter +// 0 0 0 0 0 +// x 0 x 0 x +// 0 0 0 0 0 +// x 0 x 0 x +// Case 1: +// Filter incident to center 0 at x 0 x: 3x2 filter +// 0 0 0 +// x 0 x +// 0 0 0 +// x 0 x +// Case 2: +// Filter incident to center 0 at x 0 x 0 x: 2x3 filter +// 0 0 0 0 0 +// x 0 x 0 x +// Case 3: +// Filter incident to center 0 at x 0 x : 2x2 filter +// 0 0 0 +// x 0 x +// Same as above for 5x5 filter. +template +void PushPullFiltering::GetUpsampleTaps5( + const float* filter_weights, const std::vector* space_offsets, + int inc_x, int inc_y, std::vector tap_weights[4], + std::vector tap_offsets[4], std::vector tap_space_offsets[4]) { + // Taps for filter 0 1 2 3 4 + // 5 6 7 8 9 + // 10 11 12 13 14 + // 15 16 17 18 19 + // 20 21 22 23 24 + // Case 0: + tap_weights[0].push_back(filter_weights[0]); + tap_weights[0].push_back(filter_weights[2]); + tap_weights[0].push_back(filter_weights[4]); + tap_weights[0].push_back(filter_weights[10]); + tap_weights[0].push_back(filter_weights[12]); + tap_weights[0].push_back(filter_weights[14]); + tap_weights[0].push_back(filter_weights[20]); + tap_weights[0].push_back(filter_weights[22]); + tap_weights[0].push_back(filter_weights[24]); + tap_offsets[0].push_back(-inc_y - inc_x); + tap_offsets[0].push_back(-inc_y); + tap_offsets[0].push_back(-inc_y + inc_x); + tap_offsets[0].push_back(-inc_x); + tap_offsets[0].push_back(0); + tap_offsets[0].push_back(inc_x); + tap_offsets[0].push_back(inc_y - inc_x); + tap_offsets[0].push_back(inc_y); + tap_offsets[0].push_back(inc_y + inc_x); + + if (space_offsets) { + tap_space_offsets[0].push_back((*space_offsets)[0]); + tap_space_offsets[0].push_back((*space_offsets)[2]); + tap_space_offsets[0].push_back((*space_offsets)[4]); + tap_space_offsets[0].push_back((*space_offsets)[10]); + tap_space_offsets[0].push_back((*space_offsets)[12]); + tap_space_offsets[0].push_back((*space_offsets)[14]); + tap_space_offsets[0].push_back((*space_offsets)[20]); + 
tap_space_offsets[0].push_back((*space_offsets)[22]); + tap_space_offsets[0].push_back((*space_offsets)[24]); + } + + // Case 1: + tap_weights[1].push_back(filter_weights[1]); + tap_weights[1].push_back(filter_weights[3]); + tap_weights[1].push_back(filter_weights[11]); + tap_weights[1].push_back(filter_weights[13]); + tap_weights[1].push_back(filter_weights[21]); + tap_weights[1].push_back(filter_weights[23]); + tap_offsets[1].push_back(-inc_y); + tap_offsets[1].push_back(-inc_y + inc_x); + tap_offsets[1].push_back(0); + tap_offsets[1].push_back(inc_x); + tap_offsets[1].push_back(inc_y); + tap_offsets[1].push_back(inc_y + inc_x); + + if (space_offsets) { + tap_space_offsets[1].push_back((*space_offsets)[1]); + tap_space_offsets[1].push_back((*space_offsets)[3]); + tap_space_offsets[1].push_back((*space_offsets)[11]); + tap_space_offsets[1].push_back((*space_offsets)[13]); + tap_space_offsets[1].push_back((*space_offsets)[21]); + tap_space_offsets[1].push_back((*space_offsets)[23]); + } + + // Repeating for readability. 
+ // Taps for filter 0 1 2 3 4 + // 5 6 7 8 9 + // 10 11 12 13 14 + // 15 16 17 18 19 + // 20 21 22 23 24 + // Case 2: + tap_weights[2].push_back(filter_weights[5]); + tap_weights[2].push_back(filter_weights[7]); + tap_weights[2].push_back(filter_weights[9]); + tap_weights[2].push_back(filter_weights[15]); + tap_weights[2].push_back(filter_weights[17]); + tap_weights[2].push_back(filter_weights[19]); + tap_offsets[2].push_back(-inc_x); + tap_offsets[2].push_back(0); + tap_offsets[2].push_back(inc_x); + tap_offsets[2].push_back(inc_y - inc_x); + tap_offsets[2].push_back(inc_y); + tap_offsets[2].push_back(inc_y + inc_x); + + if (space_offsets) { + tap_space_offsets[2].push_back((*space_offsets)[5]); + tap_space_offsets[2].push_back((*space_offsets)[7]); + tap_space_offsets[2].push_back((*space_offsets)[9]); + tap_space_offsets[2].push_back((*space_offsets)[15]); + tap_space_offsets[2].push_back((*space_offsets)[17]); + tap_space_offsets[2].push_back((*space_offsets)[19]); + } + + // Case 3: + tap_weights[3].push_back(filter_weights[6]); + tap_weights[3].push_back(filter_weights[8]); + tap_weights[3].push_back(filter_weights[16]); + tap_weights[3].push_back(filter_weights[18]); + tap_offsets[3].push_back(0); + tap_offsets[3].push_back(inc_x); + tap_offsets[3].push_back(inc_y); + tap_offsets[3].push_back(inc_y + inc_x); + + if (space_offsets) { + tap_space_offsets[3].push_back((*space_offsets)[6]); + tap_space_offsets[3].push_back((*space_offsets)[8]); + tap_space_offsets[3].push_back((*space_offsets)[16]); + tap_space_offsets[3].push_back((*space_offsets)[18]); + } +} + +// Scattered data interpolation via PushPull algorithm. +// Interpolation is performed over a size of domain_size, where +// features are placed into the corresponding bin at location +// feature.[x()|y()] + origin. +// The output displacement is assumed be a 3 channel float image of +// size == domain_sz + 2 to account for a one pixel border. 
+template +void PushPullFiltering::PerformPushPull( + const std::vector& data_locations, + const std::vector>& data_values, float push_pull_weight, + cv::Point2i origin, int readout_level, + const std::vector* data_weights, const cv::Mat* input_frame, + cv::Mat* results) { + CHECK_EQ(data_locations.size(), data_values.size()); + CHECK(results != nullptr); + + if (data_weights) { + CHECK_EQ(data_weights->size(), data_locations.size()); + } + + origin.x += border_; + origin.y += border_; + + // Create mip-map view from downsample pyramid. + std::vector mip_map(PyramidLevels()); + + for (int i = 0; i < mip_map.size(); ++i) { + mip_map[i] = &downsample_pyramid_[i]; + } + + CHECK_GE(readout_level, 0); + CHECK_LT(readout_level, PyramidLevels()); + + // CHECK if passed results matrix is compatible w.r.t. type and domain. + CHECK_EQ(downsample_pyramid_[readout_level].cols, results->cols); + CHECK_EQ(downsample_pyramid_[readout_level].rows, results->rows); + CHECK_EQ(downsample_pyramid_[readout_level].type(), results->type()); + + // Use caller-allocated results Mat. + mip_map[readout_level] = results; + + // Place data_values into their final positions in mip map @ level 0. + mip_map[0]->setTo(0); + for (int idx = 0; idx < data_locations.size(); ++idx) { + const Vector2_f& location = data_locations[idx]; + const cv::Vec& value = data_values[idx]; + + float* ptr = mip_map[0]->ptr(static_cast(location.y() + 0.5f) + + origin.y) + + (C + 1) * (static_cast(location.x() + 0.5f) + origin.x); + + const float data_weight = + data_weights ? (*data_weights)[idx] : push_pull_weight; + + // Pre-multiply with data_weight. + for (int c = 0; c < C; ++c) { + ptr[c] = value[c] * data_weight; + } + + // A weight of 1 would assume zero noise in the displacements. Smaller + // values lead to a smoother interpolation that approximates the + // initial values. 
+ ptr[C] = data_weight; + } + + PerformPushPullImpl(readout_level, input_frame, &mip_map); +} + +// This is the same as PerformPushPull above except that it +// assumes that the data (the mip_map at level 0) is given as a cv::Mat. +template +void PushPullFiltering::PerformPushPullMat( + const cv::Mat& mip_map_level_0, + int readout_level, // Default: 0. + const cv::Mat* input_frame, // Optional. + cv::Mat* results) { + CHECK(results != nullptr); + + // Create mip-map view (concat displacements with downsample_pyramid). + std::vector mip_map(PyramidLevels()); + + for (int i = 0; i < mip_map.size(); ++i) { + mip_map[i] = &downsample_pyramid_[i]; + } + + CHECK_GE(readout_level, 0); + CHECK_LT(readout_level, PyramidLevels()); + + // CHECK if passed mip_map at level[0] is compatible w.r.t. type and domain. + CHECK_EQ(mip_map_level_0.cols, results->cols); + CHECK_EQ(mip_map_level_0.rows, results->rows); + CHECK_EQ(mip_map_level_0.type(), results->type()); + + // CHECK if passed results matrix is compatible w.r.t. type and domain. + CHECK_EQ(downsample_pyramid_[readout_level].cols, results->cols); + CHECK_EQ(downsample_pyramid_[readout_level].rows, results->rows); + CHECK_EQ(downsample_pyramid_[readout_level].type(), results->type()); + + // Use caller-allocated results Mat. + mip_map[readout_level] = results; + + // Place data_values into their final positions in mip map at level 0. + mip_map_level_0.copyTo(*mip_map[0]); + + PerformPushPullImpl(readout_level, input_frame, &mip_map); +} + +// Perform sparse data interpolation. +// Generate filter weights and then perform pull-down sampling +// followed by push-up sampling. +// Assumes that the mip_map has already been allocated and +// the data inserted at level 0. +// Results are placed in (*mip_map_ptr)[readout_level]. 
+template +void PushPullFiltering::PerformPushPullImpl( + const int readout_level, const cv::Mat* input_frame, + std::vector* mip_map_ptr) { + const float* filter_weights; + int num_filter_elems; + switch (filter_type_) { + case BINOMIAL_3X3: + num_filter_elems = 9; + filter_weights = binomial3_weights_.data(); + break; + case BINOMIAL_5X5: + num_filter_elems = 25; + filter_weights = binomial5_weights_.data(); + break; + case GAUSSIAN_3X3: + num_filter_elems = 9; + filter_weights = gaussian3_weights_.data(); + break; + case GAUSSIAN_5X5: + num_filter_elems = 25; + filter_weights = gaussian5_weights_.data(); + break; + default: + LOG(FATAL) << "Unknown filter requested."; + } + + const std::vector& mip_map = *mip_map_ptr; + + // Borderless views into mip maps. + std::vector mip_map_views(mip_map.size()); + std::vector mip_map_view_ptrs(mip_map.size()); + for (int l = 0; l < mip_map.size(); ++l) { + mip_map_views[l] = + cv::Mat(*mip_map[l], cv::Range(border_, mip_map[l]->rows - border_), + cv::Range(border_, mip_map[l]->cols - border_)); + + mip_map_view_ptrs[l] = &mip_map_views[l]; + } + + if (use_bilateral_) { + CHECK(input_frame != nullptr); + InitializeImagePyramid(*input_frame, &input_frame_pyramid_); + } + + PullDownSampling(num_filter_elems, filter_weights, mip_map_ptr); + + if (mip_map_visualizer_) { + std::vector is_premultiplied(mip_map_view_ptrs.size(), true); + mip_map_visualizer_->Visualize(mip_map_view_ptrs, true, is_premultiplied); + } + + PushUpSampling(num_filter_elems, filter_weights, readout_level, mip_map_ptr); + + if (mip_map_visualizer_) { + std::vector is_premultiplied(mip_map_view_ptrs.size(), true); + is_premultiplied[readout_level] = false; + mip_map_visualizer_->Visualize(mip_map_view_ptrs, false, is_premultiplied); + } +} + +template +inline const T* PtrOffset(const T* ptr, int offset) { + return reinterpret_cast(reinterpret_cast(ptr) + + offset); +} + +inline void GetFilterOffsets(const cv::Mat& mat, int border, int channels, + 
std::vector* filter_offsets) { + for (int i = -border; i <= border; ++i) { + for (int j = -border; j <= border; ++j) { + filter_offsets->push_back(i * mat.step[0] + sizeof(float) * channels * j); + } + } +} + +// Perform filter operation at locations in zero_pos. +template +inline void FillInZeros(const std::vector& zero_pos, + int num_filter_elems, const float* filter_weights, + int border, cv::Mat* mat) { + std::vector filter_offsets; + GetFilterOffsets(*mat, border, C + 1, &filter_offsets); + for (float* zero_ptr : zero_pos) { + float weight_sum = 0; + float val_sum[C]; + memset(val_sum, 0, C * sizeof(val_sum[0])); + + for (int k = 0; k < num_filter_elems; ++k) { + const float* cur_ptr = PtrOffset(zero_ptr, filter_offsets[k]); + const float w = filter_weights[k] * cur_ptr[C]; + + for (int c = 0; c < C; ++c) { + val_sum[c] += cur_ptr[c] * w; + } + + weight_sum += w; + } + + if (weight_sum > 0) { + const float inv_weight_sum = 1.f / weight_sum; + for (int c = 0; c < C; ++c) { + val_sum[c] *= inv_weight_sum; + zero_ptr[c] = val_sum[c]; + } + } + } +} + +template +void PushPullFiltering::PullDownSampling( + int num_filter_elems, const float* filter_weights, + std::vector* mip_map_ptr) { + const std::vector& mip_map = *mip_map_ptr; + + // Filter pyramid via push-pull. + // We always filter from [border, border] to + // [width - 1 - border, height - 1 - border]. + + for (int l = 1; l < mip_map.size(); ++l) { + CopyNecessaryBorder(mip_map[l - 1]); + mip_map[l]->setTo(0); + + // Local copy for faster access. + const int border = border_; + const int channels = C + 1; + + // Signal level to weight_multiplier. + weight_multiplier_->SetLevel(l - 1, true); + + std::vector filter_offsets; + GetFilterOffsets(*mip_map[l - 1], border, channels, &filter_offsets); + + const std::vector* space_offsets = + use_bilateral_ ? 
&pyramid_space_offsets_[l - 1] : NULL; + + const int height = mip_map[l]->rows - 2 * border; + const int width = mip_map[l]->cols - 2 * border; + + // Downweight bilateral influence as level progress as due to iterative + // downsampling image becomes less and less reliable. + const float bilateral_scale = + std::pow(options_.pull_bilateral_scale(), l - 1); + + // Filter odd pixels (downsample). + for (int i = 0; i < height; ++i) { + float* dst_ptr = mip_map[l]->ptr(i + border) + border * channels; + const float* src_ptr = + mip_map[l - 1]->ptr(2 * i + border) + border * channels; + const uint8* img_ptr = + use_bilateral_ ? (input_frame_pyramid_[l - 1].template ptr( + 2 * i + border) + + border * 3) + : NULL; + + for (int j = 0; j < width; ++j, dst_ptr += channels, + src_ptr += 2 * channels, img_ptr += 2 * 3) { + float weight_sum = 0; + float val_sum[C]; + memset(val_sum, 0, C * sizeof(val_sum[0])); + + const int i2 = i * 2; + const int j2 = j * 2; + if (use_bilateral_) { + for (int k = 0; k < num_filter_elems; ++k) { + const float* cur_ptr = PtrOffset(src_ptr, filter_offsets[k]); + + // If neighbor is not important, skip further evaluation. + if (cur_ptr[C] < kBilateralEps * kBilateralEps) { + continue; + } + + const uint8* match_ptr = PtrOffset(img_ptr, (*space_offsets)[k]); + + float bilateral_w = bilateral_lut_[ColorDiffL1(img_ptr, match_ptr) * + bilateral_scale]; + + const float multiplier = weight_multiplier_->GetWeight( + src_ptr, cur_ptr, img_ptr, j2, i2); + + const float w = filter_weights[k] * bilateral_w * multiplier; + + // cur_ptr is already pre-multiplied with importance + // weight cur_ptr[C]. 
+ for (int c = 0; c < C; ++c) { + val_sum[c] += cur_ptr[c] * w; + } + weight_sum += w * cur_ptr[C]; + } + } else { + for (int k = 0; k < num_filter_elems; ++k) { + const float* cur_ptr = PtrOffset(src_ptr, filter_offsets[k]); + const float multiplier = + weight_multiplier_->GetWeight(src_ptr, cur_ptr, NULL, j2, i2); + const float w = filter_weights[k] * multiplier; + + // cur_ptr is already pre-multiplied with importance + // weight cur_ptr[C]. + for (int c = 0; c < C; ++c) { + val_sum[c] += cur_ptr[c] * w; + } + + weight_sum += w * cur_ptr[C]; + } + } + + DCHECK_GE(weight_sum, 0); + + if (weight_sum >= kBilateralEps * kBilateralEps) { + const float inv_weight_sum = 1.f / weight_sum; + for (int c = 0; c < C; ++c) { + dst_ptr[c] = val_sum[c] * inv_weight_sum; + } + } else { + for (int c = 0; c <= C; ++c) { + dst_ptr[c] = 0; + } + } + + const float prop_scale = options_.pull_propagation_scale(); + weight_sum *= prop_scale; + dst_ptr[C] = std::min(1.0f, weight_sum); + } + } + + if (weight_adjuster_) { + CopyNecessaryBorder(mip_map[l]); + cv::Mat mip_map_view(*mip_map[l], + cv::Range(border_, mip_map[l]->rows - border_), + cv::Range(border_, mip_map[l]->cols - border_)); + cv::Mat image_view; + if (use_bilateral_) { + image_view = + cv::Mat(input_frame_pyramid_[l], + cv::Range(border_, input_frame_pyramid_[l].rows - border_), + cv::Range(border_, input_frame_pyramid_[l].cols - border_)); + } + weight_adjuster_->AdjustWeights( + l, true, use_bilateral_ ? &image_view : NULL, &mip_map_view); + } + + // Pre-multiply weight for next level. + for (int i = 0; i < height; ++i) { + float* data_ptr = mip_map[l]->ptr(i + border_) + border * channels; + for (int j = 0; j < width; ++j, data_ptr += channels) { + for (int c = 0; c < C; ++c) { + data_ptr[c] *= data_ptr[C]; + } + } + } + } // end level processing. 
+} + +template +void PushPullFiltering::PushUpSampling( + int num_filter_elems, const float* filter_weights, int readout_level, + std::vector* mip_map_ptr) { + const std::vector& mip_map = *mip_map_ptr; + + for (int l = mip_map.size() - 2; l >= readout_level; --l) { + CopyNecessaryBorder(mip_map[l + 1]); + + // Signal mip map level to weight_multiplier. + weight_multiplier_->SetLevel(l, false); + + // Instead of upsampling we use 4 special tap filters. See comment at above + // function. + std::vector tap_weights[4]; + std::vector tap_offsets[4]; + std::vector tap_space_offsets[4]; + const int channels = C + 1; + + switch (filter_type_) { + case BINOMIAL_3X3: + case GAUSSIAN_3X3: + GetUpsampleTaps3(filter_weights, + use_bilateral_ ? &pyramid_space_offsets_[l] : NULL, + channels * sizeof(float), mip_map[l + 1]->step[0], + tap_weights, tap_offsets, tap_space_offsets); + break; + case BINOMIAL_5X5: + case GAUSSIAN_5X5: + GetUpsampleTaps5(filter_weights, + use_bilateral_ ? &pyramid_space_offsets_[l] : NULL, + channels * sizeof(float), mip_map[l + 1]->step[0], + tap_weights, tap_offsets, tap_space_offsets); + break; + default: + LOG(FATAL) << "Filter unknown"; + } + + // Local copy for faster access. + const int border = border_; + const int height = mip_map[l]->rows - 2 * border; + const int width = mip_map[l]->cols - 2 * border; + + const float bilateral_scale = + std::pow(options_.push_bilateral_scale(), l + 1); + + // Apply filter. + // List of zero positions that need to be smoothed. + std::vector zero_pos; + + for (int i = 0; i < height; ++i) { + float* dst_ptr = mip_map[l]->ptr(i + border) + border * channels; + const float* src_ptr = + mip_map[l + 1]->ptr(i / 2 + border) + border * channels; + const uint8* img_ptr = + use_bilateral_ + ? (input_frame_pyramid_[l].template ptr(i + border) + + border * 3) + : NULL; + + // Select tap offset. + const int tap_kind_row = 2 * (i % 2); // odd row, case 2 & 3. 
+ + for (int j = 0; j < width; + // Increase src_ptr only for even rows (i.e. previous one was odd). + src_ptr += channels * (j % 2), + ++j, dst_ptr += channels, img_ptr += 3) { + if (dst_ptr[C] >= 1) { // Skip if already saturated. + continue; + } + + const int tap_kind = tap_kind_row + j % 2; + const std::vector& tap_weight = tap_weights[tap_kind]; + const std::vector& tap_offset = tap_offsets[tap_kind]; + const int tap_size = tap_weight.size(); + + float weight_sum = 0; + float val_sum[C]; + memset(val_sum, 0, C * sizeof(val_sum[0])); + + if (use_bilateral_) { + const std::vector& tap_space_offset = + tap_space_offsets[tap_kind]; + for (int k = 0; k < tap_size; ++k) { + const float* cur_ptr = PtrOffset(src_ptr, tap_offset[k]); + + // If neighbor is not important, skip further evaluation. + if (cur_ptr[C] < kBilateralEps * kBilateralEps) { + continue; + } + + const uint8* match_ptr = PtrOffset(img_ptr, tap_space_offset[k]); + float bilateral_w = bilateral_lut_[ColorDiffL1(img_ptr, match_ptr) * + bilateral_scale]; + + const float multiplier = + weight_multiplier_->GetWeight(src_ptr, cur_ptr, img_ptr, j, i); + + const float w = tap_weight[k] * bilateral_w * multiplier; + + // Values in above mip map level are pre-multiplied by + // importance weight cur_ptr[C]. + for (int c = 0; c < C; ++c) { + val_sum[c] += cur_ptr[c] * w; + } + weight_sum += w * cur_ptr[C]; + } + } else { + for (int k = 0; k < tap_size; ++k) { + const float* cur_ptr = PtrOffset(src_ptr, tap_offset[k]); + const float multiplier = + weight_multiplier_->GetWeight(src_ptr, cur_ptr, NULL, j, i); + + const float w = tap_weight[k] * multiplier; + + // Values in above mip map level are pre-multiplied by weight + // cur_ptr[C]. 
+ for (int c = 0; c < C; ++c) { + val_sum[c] += cur_ptr[c] * w; + } + + weight_sum += w * cur_ptr[C]; + } + } + + if (weight_sum >= kBilateralEps * kBilateralEps) { + const float inv_weight_sum = 1.f / weight_sum; + for (int c = 0; c < C; ++c) { + val_sum[c] *= inv_weight_sum; + } + } else { + weight_sum = 0; + for (int c = 0; c < C; ++c) { + val_sum[c] = 0; + } + + zero_pos.push_back(dst_ptr); + } + + const float prop_scale = options_.push_propagation_scale(); + weight_sum *= prop_scale; + + // Maximum influence of pushed result on current pixel. + const float alpha_inv = std::min(1.0f - dst_ptr[C], weight_sum); + const float denom = + 1.0f / (dst_ptr[C] + alpha_inv + kBilateralEps * kBilateralEps); + + // Blend (dst_ptr is premultiplied with weight dst_ptr[C], + // val_sum is normalized). + for (int c = 0; c < C; ++c) { + dst_ptr[c] = (dst_ptr[c] + val_sum[c] * alpha_inv) * denom; + } + + // Increase current confidence by above sample. + dst_ptr[C] = + std::min(1.0f, dst_ptr[C] + std::min(weight_sum, alpha_inv)); + } + } + + if (weight_adjuster_) { + CopyNecessaryBorder(mip_map[l]); + cv::Mat mip_map_view(*mip_map[l], + cv::Range(border_, mip_map[l]->rows - border_), + cv::Range(border_, mip_map[l]->cols - border_)); + cv::Mat image_view; + if (use_bilateral_) { + image_view = + cv::Mat(input_frame_pyramid_[l], + cv::Range(border, input_frame_pyramid_[l].rows - border), + cv::Range(border, input_frame_pyramid_[l].cols - border)); + } + weight_adjuster_->AdjustWeights( + l, false, use_bilateral_ ? &image_view : NULL, &mip_map_view); + } + + // Pre-multiply with weight for next level if haven't reached base level + // yet. (Base level is not pre-multiplied so result can be used directly). 
+ if (l != readout_level) { + for (int i = 0; i < height; ++i) { + float* data_ptr = + mip_map[l]->ptr(i + border_) + border * channels; + for (int j = 0; j < width; ++j, data_ptr += channels) { + for (int c = 0; c < C; ++c) { + data_ptr[c] *= data_ptr[C]; + } + } + } + } else { + CopyNecessaryBorder(mip_map[l]); + FillInZeros(zero_pos, num_filter_elems, filter_weights, border_, + mip_map[l]); + } + } // end mip map levels. +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_PUSH_PULL_FILTERING_H_ diff --git a/mediapipe/util/tracking/push_pull_filtering.proto b/mediapipe/util/tracking/push_pull_filtering.proto new file mode 100644 index 000000000..fa27e872f --- /dev/null +++ b/mediapipe/util/tracking/push_pull_filtering.proto @@ -0,0 +1,39 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +message PushPullOptions { + // Sigma for color difference. + optional float bilateral_sigma = 1 [default = 20.0]; + + // Determines how fast confident values can propagate. Filters are normalized, + // such that confidence dissipates quickly instead of propagating. + // To ensure confidence propagates the importance weight is scaled by the + // scalars specified below. Larger values yield quicker propagation. 
+  optional float pull_propagation_scale = 3 [default = 8.0];
+  optional float push_propagation_scale = 4 [default = 8.0];
+
+  // Above bilateral sigma is scaled at each level by the specified scale
+  // (for push and pull phase). This is because iterative downsampling of the
+  // guidance image introduces errors, making bilateral weighting increasingly
+  // erroneous.
+  optional float pull_bilateral_scale = 5 [default = 0.7];
+  optional float push_bilateral_scale = 6 [default = 0.9];
+
+  // Deprecated fields.
+  extensions 2;
+}
diff --git a/mediapipe/util/tracking/region_flow.cc b/mediapipe/util/tracking/region_flow.cc
new file mode 100644
index 000000000..2d91cdb67
--- /dev/null
+++ b/mediapipe/util/tracking/region_flow.cc
@@ -0,0 +1,799 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mediapipe/util/tracking/region_flow.h"
+
+#include <cmath>
+
+#include <algorithm>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "absl/strings/str_cat.h"
+#include "mediapipe/framework/port/integral_types.h"
+#include "mediapipe/util/tracking/measure_time.h"
+#include "mediapipe/util/tracking/parallel_invoker.h"
+
+namespace mediapipe {
+
+namespace {
+
+bool IsPointWithinBounds(const Vector2_f& pt, float bounds, int frame_width,
+                         int frame_height) {
+  // Ensure stability under float -> rounding operations. 
+ if (pt.x() - 0.5f >= bounds && pt.x() + 0.5f <= frame_width - 1 - bounds && + pt.y() - 0.5f >= bounds && pt.y() + 0.5f <= frame_height - 1 - bounds) { + return true; + } else { + return false; + } +} + +} // namespace. + +void GetRegionFlowFeatureList(const RegionFlowFrame& region_flow_frame, + int distance_from_border, + RegionFlowFeatureList* flow_feature_list) { + CHECK(flow_feature_list); + flow_feature_list->clear_feature(); + const int frame_width = region_flow_frame.frame_width(); + const int frame_height = region_flow_frame.frame_height(); + flow_feature_list->set_frame_width(frame_width); + flow_feature_list->set_frame_height(frame_height); + flow_feature_list->set_unstable(region_flow_frame.unstable_frame()); + flow_feature_list->set_distance_from_border(distance_from_border); + flow_feature_list->set_blur_score(region_flow_frame.blur_score()); + + for (const auto& region_flow : region_flow_frame.region_flow()) { + for (const auto& feature : region_flow.feature()) { + if (distance_from_border > 0) { + if (!IsPointWithinBounds(FeatureLocation(feature), distance_from_border, + frame_width, frame_height) || + !IsPointWithinBounds(FeatureMatchLocation(feature), + distance_from_border, frame_width, + frame_height)) { + continue; + } + } + + flow_feature_list->add_feature()->CopyFrom(feature); + } + } +} + +float RegionFlowFeatureDistance(const PatchDescriptor& patch_desc_1, + const PatchDescriptor& patch_desc_2) { + DCHECK_EQ(patch_desc_1.data_size(), patch_desc_2.data_size()); + DCHECK_GE(patch_desc_1.data_size(), 3); + + constexpr int kNumMeans = 3; + float sq_distance_sum = 0; + for (int n = 0; n < kNumMeans; ++n) { + const float distance = patch_desc_1.data(n) - patch_desc_2.data(n); + sq_distance_sum += distance * distance; + } + + return std::sqrt(sq_distance_sum); +} + +void ResetRegionFlowFeatureIRLSWeights( + float value, RegionFlowFeatureList* flow_feature_list) { + for (auto& feature : *flow_feature_list->mutable_feature()) { + 
feature.set_irls_weight(value); + } +} + +double RegionFlowFeatureIRLSSum(const RegionFlowFeatureList& feature_list) { + double sum = 0.0; + for (const auto& feature : feature_list.feature()) { + sum += feature.irls_weight(); + } + return sum; +} + +void ClampRegionFlowFeatureIRLSWeights(float lower, float upper, + RegionFlowFeatureView* feature_view) { + for (auto& feature_ptr : *feature_view) { + if (feature_ptr->irls_weight() < lower) { + feature_ptr->set_irls_weight(lower); + } else if (feature_ptr->irls_weight() > upper) { + feature_ptr->set_irls_weight(upper); + } + } +} + +void ComputeRegionFlowFeatureTexturedness( + const RegionFlowFeatureList& flow_feature_list, bool use_15percent_as_max, + std::vector* texturedness) { + CHECK(texturedness != nullptr); + *texturedness = std::vector(flow_feature_list.feature_size(), 1.0f); + + int texture_idx = 0; + for (auto feature = flow_feature_list.feature().begin(); + feature != flow_feature_list.feature().end(); ++feature, ++texture_idx) { + const float feature_stdev_l1 = + PatchDescriptorColorStdevL1(feature->feature_descriptor()); + + if (feature_stdev_l1 < 0.0f) { + LOG_IF(WARNING, + []() { + static int k = 0; + return k++ < 2; + }()) + << "Feature descriptor does not contain variance information. Was " + << "ComputeRegionFlowFeatureDescriptors called?"; + continue; + } + + // feature_stdev_l1 is within [0, 3 * 128 = 384] + float alpha = feature_stdev_l1 / 384.f; + + // In [0, 1], 0 = low texture, 1 = high texture. 
Scale such that around + // 15% of per channel maximum stdev is considered totally textured + // (1 / 0.15 * 3 ~ 18); + if (use_15percent_as_max) { + alpha = std::min(1.0f, alpha * 18.f); + } + + (*texturedness)[texture_idx] = alpha; + } +} + +void TextureFilteredRegionFlowFeatureIRLSWeights( + float low_texture_threshold, float low_texture_outlier_clamp, + RegionFlowFeatureList* flow_feature_list) { + std::vector texturedness; + ComputeRegionFlowFeatureTexturedness(*flow_feature_list, true, &texturedness); + + int texture_idx = 0; + for (auto feature = flow_feature_list->mutable_feature()->begin(); + feature != flow_feature_list->mutable_feature()->end(); + ++feature, ++texture_idx) { + if (feature->irls_weight() == 0.0f) { + continue; + } + + if (texturedness[texture_idx] < low_texture_threshold && + feature->irls_weight() < low_texture_outlier_clamp) { + feature->set_irls_weight(low_texture_outlier_clamp); + } else { + feature->set_irls_weight(feature->irls_weight() / + (texturedness[texture_idx] + 1.e-6f)); + } + } +} + +void CornerFilteredRegionFlowFeatureIRLSWeights( + float low_corner_threshold, float low_corner_outlier_clamp, + RegionFlowFeatureList* flow_feature_list) { + for (auto feature = flow_feature_list->mutable_feature()->begin(); + feature != flow_feature_list->mutable_feature()->end(); ++feature) { + if (feature->irls_weight() == 0.0f) { + continue; + } + + const float corner_response = feature->corner_response(); + + if (corner_response < low_corner_threshold && + feature->irls_weight() < low_corner_threshold) { + feature->set_irls_weight(low_corner_outlier_clamp); + } else { + feature->set_irls_weight(feature->irls_weight() / + (corner_response + 1.e-6f)); + } + } +} + +void GetRegionFlowFeatureIRLSWeights( + const RegionFlowFeatureList& flow_feature_list, + std::vector* irls_weights) { + CHECK(irls_weights != nullptr); + irls_weights->clear(); + irls_weights->reserve(flow_feature_list.feature_size()); + for (auto feature = 
flow_feature_list.feature().begin(); + feature != flow_feature_list.feature().end(); ++feature) { + irls_weights->push_back(feature->irls_weight()); + } +} + +void SetRegionFlowFeatureIRLSWeights(const std::vector& irls_weights, + RegionFlowFeatureList* flow_feature_list) { + CHECK(flow_feature_list != nullptr); + CHECK_EQ(irls_weights.size(), flow_feature_list->feature_size()); + int idx = 0; + for (auto feature = flow_feature_list->mutable_feature()->begin(); + feature != flow_feature_list->mutable_feature()->end(); + ++feature, ++idx) { + feature->set_irls_weight(irls_weights[idx]); + } +} + +int CountIgnoredRegionFlowFeatures( + const RegionFlowFeatureList& flow_feature_list, float threshold) { + int count = 0; + for (auto feature = flow_feature_list.feature().begin(); + feature != flow_feature_list.feature().end(); ++feature) { + if (feature->irls_weight() <= threshold) { + ++count; + } + } + return count; +} + +namespace { + +struct RegionFlowLocator { + bool operator()(const RegionFlow& lhs, const RegionFlow& rhs) const { + return lhs.region_id() < rhs.region_id(); + } +}; + +} // namespace. 
+ +const RegionFlow* GetRegionFlowById(int region_id, + const RegionFlowFrame& flow_frame) { + RegionFlow region_flow; + region_flow.set_region_id(region_id); + + const auto& region_pos = std::lower_bound(flow_frame.region_flow().begin(), + flow_frame.region_flow().end(), + region_flow, RegionFlowLocator()); + + if (region_pos == flow_frame.region_flow().end() || + region_pos->region_id() != region_id) { + return NULL; + } else { + return &(*region_pos); + } +} + +RegionFlow* GetMutableRegionFlowById(int region_id, + RegionFlowFrame* flow_frame) { + RegionFlow region_flow; + region_flow.set_region_id(region_id); + + auto region_pos = std::lower_bound(flow_frame->mutable_region_flow()->begin(), + flow_frame->mutable_region_flow()->end(), + region_flow, RegionFlowLocator()); + + if (region_pos == flow_frame->mutable_region_flow()->end() || + region_pos->region_id() != region_id) { + return NULL; + } else { + return &(*region_pos); + } +} + +void SortRegionFlowById(RegionFlowFrame* flow_frame) { + std::sort(flow_frame->mutable_region_flow()->begin(), + flow_frame->mutable_region_flow()->end(), RegionFlowLocator()); +} + +void InvertRegionFlow(const RegionFlowFrame& region_flow_frame, + RegionFlowFrame* inverted_flow_frame) { + CHECK(inverted_flow_frame); + inverted_flow_frame->CopyFrom(region_flow_frame); + for (auto& region_flow : *inverted_flow_frame->mutable_region_flow()) { + region_flow.set_centroid_x(region_flow.centroid_x() + region_flow.flow_x()); + region_flow.set_centroid_y(region_flow.centroid_y() + region_flow.flow_y()); + region_flow.set_flow_x(-region_flow.flow_x()); + region_flow.set_flow_y(-region_flow.flow_y()); + + for (auto& feature : *region_flow.mutable_feature()) { + feature.set_x(feature.x() + feature.dx()); + feature.set_y(feature.y() + feature.dy()); + feature.set_dx(-feature.dx()); + feature.set_dy(-feature.dy()); + } + } +} + +void InvertRegionFlowFeatureList(const RegionFlowFeatureList& feature_list, + RegionFlowFeatureList* 
inverted_feature_list) { + CHECK(inverted_feature_list); + *inverted_feature_list = feature_list; + for (auto& feature : *inverted_feature_list->mutable_feature()) { + InvertRegionFlowFeature(&feature); + } +} + +void InvertRegionFlowFeature(RegionFlowFeature* feature) { + Vector2_f pt_match = FeatureMatchLocation(*feature); + feature->set_x(pt_match.x()); + feature->set_y(pt_match.y()); + Vector2_f flow = FeatureFlow(*feature); + feature->set_dx(-flow.x()); + feature->set_dy(-flow.y()); +} + +void LimitFeaturesToBounds(int frame_width, int frame_height, float bounds, + RegionFlowFeatureList* feature_list) { + RegionFlowFeatureList limited; + for (const auto& feature : feature_list->feature()) { + if (!IsPointWithinBounds(FeatureLocation(feature), bounds, frame_width, + frame_height)) { + continue; + } else { + limited.add_feature()->CopyFrom(feature); + } + } + feature_list->Swap(&limited); +} + +void NormalizeRegionFlowFeatureList(RegionFlowFeatureList* feature_list) { + const LinearSimilarityModel norm_model = + LinearSimilarityAdapter::NormalizationTransform( + feature_list->frame_width(), feature_list->frame_height()); + TransformRegionFlowFeatureList(norm_model, feature_list); +} + +void DeNormalizeRegionFlowFeatureList(RegionFlowFeatureList* feature_list) { + const LinearSimilarityModel norm_model = + LinearSimilarityAdapter::NormalizationTransform( + feature_list->frame_width(), feature_list->frame_height()); + TransformRegionFlowFeatureList(ModelInvert(norm_model), feature_list); +} + +void ScaleSalientPoint(float scale_x, float scale_y, SalientPoint* sp) { + sp->set_norm_point_x(sp->norm_point_x() * scale_x); + sp->set_norm_point_y(sp->norm_point_y() * scale_y); + + const float cos_angle = std::cos(sp->angle()); + const float sin_angle = std::sin(sp->angle()); + Vector2_f major_axis = Vector2_f(cos_angle, sin_angle) * sp->norm_major(); + Vector2_f minor_axis = Vector2_f(-sin_angle, cos_angle) * sp->norm_minor(); + + major_axis[0] *= scale_x; + 
major_axis[1] *= scale_y; + minor_axis[0] *= scale_x; + minor_axis[1] *= scale_y; + + sp->set_norm_major(major_axis.Norm()); + sp->set_norm_minor(minor_axis.Norm()); + sp->set_angle(std::atan2(major_axis.y(), major_axis.x())); + if (sp->angle() < 0) { + sp->set_angle(sp->angle() + M_PI); + } +} + +void ScaleSaliencyList(float scale, bool normalize_to_scale, + SaliencyPointList* saliency_list) { + CHECK(saliency_list != nullptr); + for (auto& point_frame : *saliency_list) { + ScaleSalientPointFrame(scale, normalize_to_scale, &point_frame); + } +} + +void ScaleSalientPointFrame(float scale, bool normalize_to_scale, + SalientPointFrame* saliency) { + CHECK(saliency != nullptr); + float saliency_scale = scale; + if (normalize_to_scale) { + float weight_sum = 0.0f; + for (const auto& salient_point : saliency->point()) { + weight_sum += salient_point.weight(); + } + + if (weight_sum > 1e-6f) { + saliency_scale /= weight_sum; + } + } + + for (auto& salient_point : *saliency->mutable_point()) { + salient_point.set_weight(salient_point.weight() * saliency_scale); + } +} + +void ResetSaliencyBounds(float left, float bottom, float right, float top, + SaliencyPointList* saliency_list) { + CHECK(saliency_list != nullptr); + for (auto& point_frame : *saliency_list) { + for (auto& salient_point : *point_frame.mutable_point()) { + salient_point.set_left(left); + salient_point.set_bottom(bottom); + salient_point.set_right(right); + salient_point.set_top(top); + } + } +} + +bool EllipseFromCovariance(float a, float bc, float d, + Vector2_f* axis_magnitude, float* angle) { + CHECK(axis_magnitude != nullptr); + CHECK(angle != nullptr); + + // Get trace and determinant + const float trace = a + d; + const float det = a * d - bc * bc; + + // If area is very small (small axis in at least one direction) + // axis are unreliable -> return false. + if (det < 4) { // Measured in sq. pixels. 
+ *axis_magnitude = Vector2_f(1, 1); + *angle = 0; + return false; + } + + const float discriminant = std::max(0.f, trace * trace * 0.25f - det); + const float sqrt_discrm = std::sqrt(discriminant); + + // Get eigenvalues. + float eig_1 = trace * 0.5f + sqrt_discrm; + float eig_2 = trace * 0.5f - sqrt_discrm; + + // Compute eigenvectors. + Vector2_f vec_1, vec_2; + + if (fabs(bc) < 1e-6) { + // Right-most case, we already have diagonal matrix. + vec_1.Set(1, 0); + vec_2.Set(0, 1); + } else { + vec_1.Set(eig_1 - d, bc); + vec_2.Set(eig_2 - d, bc); + + // Normalize. Norm is always > 0, as bc is > 0 via above if. + vec_1 /= vec_1.Norm(); + vec_2 /= vec_2.Norm(); + } + + // Select positive eigenvector. + if (eig_1 < 0) { + eig_1 *= -1.f; + } + + if (eig_2 < 0) { + eig_2 *= -1.f; + } + + // Major first. + if (eig_1 < eig_2) { + using std::swap; + swap(vec_1, vec_2); + swap(eig_1, eig_2); + } + + *axis_magnitude = Vector2_f(std::sqrt(eig_1), std::sqrt(eig_2)); + *angle = std::atan2(vec_1.y(), vec_1.x()); + + return eig_2 >= 1.5f; // Measurement in pixels. +} + +void BoundingBoxFromEllipse(const Vector2_f& center, float norm_major_axis, + float norm_minor_axis, float angle, + std::vector* bounding_box) { + CHECK(bounding_box != nullptr); + float dim_x; + float dim_y; + if (angle < M_PI * 0.25 || angle > M_PI * 0.75) { + dim_x = norm_major_axis; + dim_y = norm_minor_axis; + } else { + dim_y = norm_major_axis; + dim_x = norm_minor_axis; + } + + // Construct bounding for for axes aligned ellipse. + *bounding_box = { + Vector2_f(-dim_x, -dim_y), + Vector2_f(-dim_x, dim_y), + Vector2_f(dim_x, dim_y), + Vector2_f(dim_x, -dim_y), + }; + + for (Vector2_f& corner : *bounding_box) { + corner += center; + } +} + +void CopyToEmptyFeatureList(RegionFlowFeatureList* src, + RegionFlowFeatureList* dst) { + CHECK(src != nullptr); + CHECK(dst != nullptr); + + // Swap out features for empty list. 
+ RegionFlowFeatureList empty_list; + empty_list.mutable_feature()->Swap(src->mutable_feature()); + + // Copy. + dst->CopyFrom(*src); + + // Swap back. + src->mutable_feature()->Swap(empty_list.mutable_feature()); + + // src_features should be empty as in the beginning. + CHECK_EQ(0, empty_list.feature_size()); +} + +void IntersectRegionFlowFeatureList( + const RegionFlowFeatureList& to, + std::function to_location_eval, + RegionFlowFeatureList* from, RegionFlowFeatureList* result, + std::vector* source_indices) { + CHECK(from != nullptr); + CHECK(result != nullptr); + CHECK(from->long_tracks()) << "Intersection only works for long features"; + CHECK(to.long_tracks()) << "Intersection only works for long features"; + + // Hash features in to, based on track_id. + std::unordered_map track_map; + for (const auto& feature : to.feature()) { + track_map[feature.track_id()] = &feature; + } + + // Initialize result. + CopyToEmptyFeatureList(from, result); + const int num_from_features = from->feature_size(); + result->mutable_feature()->Reserve(num_from_features); + + int feature_idx = 0; + for (const auto& feature : from->feature()) { + auto find_location = track_map.find(feature.track_id()); + if (find_location != track_map.end()) { + const Vector2_f diff = + to_location_eval(*find_location->second) - FeatureLocation(feature); + + RegionFlowFeature* new_feature = result->add_feature(); + new_feature->CopyFrom(feature); + new_feature->set_dx(diff.x()); + new_feature->set_dy(diff.y()); + if (source_indices != nullptr) { + source_indices->push_back(feature_idx); + } + } + + ++feature_idx; + } +} + +void LongFeatureStream::AddFeatures(const RegionFlowFeatureList& feature_list, + bool check_connectivity, + bool purge_non_present_features) { + if (!feature_list.long_tracks()) { + LOG(ERROR) << "Feature stream should be used only used with long feature " + << "tracks. 
Ensure POLICY_LONG_FEATURE was used for " + << "RegionFlowComputation."; + return; + } + + if (feature_list.match_frame() == 0) { + // Skip first frame. + return; + } + + if (std::abs(feature_list.match_frame()) != 1) { + LOG(ERROR) << "Only matching frames one frame from current one are " + << "supported"; + return; + } + + // Record id of each track that is present in the current feature_list. + std::unordered_set present_tracks; + for (auto feature : feature_list.feature()) { // Copy feature. + if (feature.track_id() < 0) { + LOG_IF(WARNING, []() { + static int k = 0; + return k++ < 2; + }()) << "Feature does not have a valid track id assigned. Ignoring."; + continue; + } + present_tracks.insert(feature.track_id()); + if (check_connectivity) { + // A new feature should never have been erased before. + CHECK(old_ids_.find(feature.track_id()) == old_ids_.end()) + << "Feature : " << feature.track_id() << "was already removed."; + } + + // Invert the features to be foward or backward according to forward_ flag. + if ((!forward_ && feature_list.match_frame() > 0) || + (forward_ && feature_list.match_frame() < 0)) { + InvertRegionFlowFeature(&feature); + } + + auto find_pos = tracks_.find(feature.track_id()); + if (find_pos != tracks_.end()) { + // Track is present, add to it. 
+ if (check_connectivity) { + CHECK_LT((FeatureLocation(find_pos->second.back()) - + FeatureMatchLocation(feature)) + .Norm2(), + 1e-4); + } + find_pos->second.push_back(feature); + } else { + tracks_[feature.track_id()] = std::vector(1, feature); + } + } + + if (purge_non_present_features) { + std::vector to_be_removed; + for (const auto& track : tracks_) { + if (present_tracks.find(track.first) == present_tracks.end()) { + to_be_removed.push_back(track.first); + if (check_connectivity) { + old_ids_.insert(track.first); + } + } + } + for (int id : to_be_removed) { + tracks_.erase(id); + } + } +} + +void LongFeatureStream::FlattenTrack( + const std::vector& features, + std::vector* result, std::vector* irls_weight, + std::vector* flow) const { + CHECK(result != nullptr); + if (features.empty()) { + return; + } + + if (irls_weight) { + irls_weight->clear(); + } + + if (flow) { + flow->clear(); + } + + if (!forward_) { + // Backward tracking, add first match. + result->push_back(FeatureMatchLocation(features[0])); + } + + for (const auto& feature : features) { + result->push_back(FeatureLocation(feature)); + if (flow) { + flow->push_back(FeatureFlow(feature)); + } + if (irls_weight) { + irls_weight->push_back(feature.irls_weight()); + } + } + + if (forward_) { + // Forward tracking, add last match. + result->push_back(FeatureMatchLocation(features.back())); + } + + // Replicate last irls weight. 
+ if (irls_weight) { + irls_weight->push_back(irls_weight->back()); + } +} + +const std::vector* LongFeatureStream::TrackById( + int id) const { + auto track_pos = tracks_.find(id); + if (track_pos == tracks_.end()) { + return nullptr; + } else { + return &track_pos->second; + } +} + +std::vector LongFeatureStream::FlattenedTrackById(int id) const { + const auto* track = TrackById(id); + if (track != nullptr) { + std::vector points; + FlattenTrack(*track, &points, nullptr, nullptr); + return points; + } else { + return std::vector(); + } +} + +void LongFeatureInfo::AddFeatures(const RegionFlowFeatureList& feature_list) { + if (!feature_list.long_tracks()) { + LOG(ERROR) << "Passed feature list was not computed with long tracks. "; + return; + } + + for (const auto& feature : feature_list.feature()) { + AddFeature(feature); + } + + IncrementFrame(); +} + +void LongFeatureInfo::AddFeature(const RegionFlowFeature& feature) { + if (feature.irls_weight() == 0) { // Skip outliers. + return; + } + const int track_id = feature.track_id(); + if (track_id < 0) { // Skip unassigned ids. + return; + } + auto insert_pos = track_info_.find(track_id); + if (insert_pos == track_info_.end()) { + track_info_[track_id].start = num_frames_; + track_info_[track_id].length = 1; + } else { + ++insert_pos->second.length; + } +} + +void LongFeatureInfo::TrackLengths(const RegionFlowFeatureList& feature_list, + std::vector* track_lengths) const { + CHECK(track_lengths); + const int feature_size = feature_list.feature_size(); + track_lengths->resize(feature_size); + for (int k = 0; k < feature_size; ++k) { + (*track_lengths)[k] = TrackLength(feature_list.feature(k)); + } +} + +int LongFeatureInfo::TrackLength(const RegionFlowFeature& feature) const { + const auto insert_pos = track_info_.find(feature.track_id()); + return insert_pos != track_info_.end() ? 
insert_pos->second.length : 0; +} + +int LongFeatureInfo::TrackStart(const RegionFlowFeature& feature) const { + const auto insert_pos = track_info_.find(feature.track_id()); + return insert_pos != track_info_.end() ? insert_pos->second.start : -1; +} + +void LongFeatureInfo::Reset() { + track_info_.clear(); + num_frames_ = 0; +} + +int LongFeatureInfo::GlobalTrackLength(float percentile) const { + std::vector track_lengths; + track_lengths.reserve(track_info_.size()); + + for (const auto& pair : track_info_) { + track_lengths.push_back(pair.second.length); + } + + if (track_lengths.empty()) { + return 0; + } + + auto percentile_item = + track_lengths.begin() + percentile * track_lengths.size(); + + std::nth_element(track_lengths.begin(), percentile_item, track_lengths.end()); + + return *percentile_item; +} + +void GridTaps(int dim_x, int dim_y, int tap_radius, + std::vector>* taps) { + CHECK(taps); + const int grid_size = dim_x * dim_y; + const int diam = 2 * tap_radius + 1; + taps->resize(grid_size); + + for (int i = 0; i < dim_y; ++i) { + for (int j = 0; j < dim_x; ++j) { + std::vector& grid_bin = (*taps)[i * dim_x + j]; + grid_bin.clear(); + grid_bin.reserve(diam * diam); + for (int k = std::max(0, i - tap_radius), + end_k = std::min(dim_y - 1, i + tap_radius); + k <= end_k; ++k) { + for (int l = std::max(0, j - tap_radius), + end_l = std::min(dim_x - 1, j + tap_radius); + l <= end_l; ++l) { + grid_bin.push_back(k * dim_x + l); + } + } + } + } +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/region_flow.h b/mediapipe/util/tracking/region_flow.h new file mode 100644 index 000000000..2f9b34227 --- /dev/null +++ b/mediapipe/util/tracking/region_flow.h @@ -0,0 +1,636 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Small helper function for RegionFlow. +#ifndef MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_H_ +#define MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +namespace mediapipe { + +typedef RegionFlowFrame::RegionFlow RegionFlow; +typedef std::vector RegionFlowFeatureView; + +inline RegionFlowFeature FeatureFromFloats(float x, float y, float dx, + float dy) { + RegionFlowFeature feat; + feat.set_x(x); + feat.set_y(y); + feat.set_dx(dx); + feat.set_dy(dy); + return feat; +} + +inline RegionFlowFeature FeatureFromVec2f(const Vector2_f& loc, + const Vector2_f& flow) { + RegionFlowFeature feat; + feat.set_x(loc.x()); + feat.set_y(loc.y()); + feat.set_dx(flow.x()); + feat.set_dy(flow.y()); + return feat; +} + +inline Vector2_f FeatureFlow(const RegionFlowFeature& feature) { + return Vector2_f(feature.dx(), feature.dy()); +} + +inline Vector2_f FeatureLocation(const RegionFlowFeature& feature) { + return Vector2_f(feature.x(), feature.y()); +} + +inline Vector2_f FeatureMatchLocation(const RegionFlowFeature& feature) { + return FeatureLocation(feature) + FeatureFlow(feature); +} + +inline Vector2_i FeatureIntLocation(const RegionFlowFeature& feature) { + return Vector2_i::Cast(FeatureLocation(feature) + Vector2_f(0.5f, 0.5f)); +} + +inline Vector2_i FeatureMatchIntLocation(const RegionFlowFeature& 
feature) { + return Vector2_i::Cast(FeatureMatchLocation(feature) + Vector2_f(0.5, 0.5f)); +} + +// Returns L1 norm of color standard deviation of feature descriptor, +// -1 if descriptor information is not present +// (e.g. if ComputeRegionFlowFeatureDescriptors was not called previously). +// Specifically returns stdev_red + stdev_blue + stdev_green. +inline float PatchDescriptorColorStdevL1(const PatchDescriptor& descriptor) { + constexpr int kRedIdx = 3; + constexpr int kGreenIdx = 6; + constexpr int kBlueIdx = 8; + DCHECK_GE(descriptor.data(kRedIdx), 0); + DCHECK_GE(descriptor.data(kGreenIdx), 0); + DCHECK_GE(descriptor.data(kBlueIdx), 0); + + if (descriptor.data_size() > kBlueIdx) { + return std::sqrt(descriptor.data(kRedIdx)) + + std::sqrt(descriptor.data(kGreenIdx)) + + std::sqrt(descriptor.data(kBlueIdx)); + } else { + return -1.0f; + } +} + +// Extracts features from region flow. Set distance_from_border > 0 to ensure +// feature and matched location are at least the specified distance away +// from the frame rectangle (test is not executed if distance_from_border <= 0), +// so that feature descriptors can be computed (see function below). +void GetRegionFlowFeatureList(const RegionFlowFrame& flow_frame, + int distance_from_border, + RegionFlowFeatureList* flow_feature_list); + +// Returns L2 norm of difference of mean color (first 3 dimensions of +// feature descriptors). +float RegionFlowFeatureDistance(const PatchDescriptor& patch_desc_1, + const PatchDescriptor& patch_desc_2); + +// Resets IRLS weight of each RegionFlowFeature to value. +void ResetRegionFlowFeatureIRLSWeights( + float value, RegionFlowFeatureList* flow_feature_list); + +// Returns sum of feature's irls weights. +double RegionFlowFeatureIRLSSum(const RegionFlowFeatureList& feature_list); + +// Computes per region flow feature texturedness score. Score is within [0, 1], +// where 0 means low texture and 1 high texture. 
Requires for each feature +// descriptor to be computed (via ComputeRegionFlowFeatureDescriptors). If +// missing, LOG(WARNING) is issued and value defaults to 1. +// If use_15percent_as_max is set, score is scaled and threshold back to [0, 1] +// such that 1 is assumed at 15% of maximum PER channel variance. +void ComputeRegionFlowFeatureTexturedness( + const RegionFlowFeatureList& region_flow_feature_list, + bool use_15percent_as_max, std::vector* texturedness); + +// IRLS weights are multiplied by inverse texturedness (expressed as variance +// of feature descriptor), effectively upweighting outliers if in low textured +// areas. Features with texturedness below low_texture_threshold can be +// optionally clamped to low_texture_outlier_clamp (set to -1 for no clamping). +void TextureFilteredRegionFlowFeatureIRLSWeights( + float low_texture_threshold, float low_texture_outlier_clamp, + RegionFlowFeatureList* flow_feature_list); + +// Same as above but normalizes w.r.t. corner response. +void CornerFilteredRegionFlowFeatureIRLSWeights( + float low_corner_threshold, float low_corner_outlier_clamp, + RegionFlowFeatureList* flow_feature_list); + +// Simple setter and getter methods for irls weights. +void GetRegionFlowFeatureIRLSWeights( + const RegionFlowFeatureList& flow_feature_list, + std::vector* irls_weights); + +void SetRegionFlowFeatureIRLSWeights(const std::vector& irls_weights, + RegionFlowFeatureList* flow_feature_list); + +// Counts number of region flow features with an irls weight of less than or +// equal to threshold. +int CountIgnoredRegionFlowFeatures( + const RegionFlowFeatureList& flow_feature_list, float threshold); + +// Locates region with id region_id in RegionFlowFrame via binary search. +// Returns NULL if no region with specied region_id is present. +const RegionFlow* GetRegionFlowById(int region_id, + const RegionFlowFrame& flow_frame); + +// Same as above for mutable RegionFlow. 
+RegionFlow* GetMutableRegionFlowById(int region_id, + RegionFlowFrame* flow_frame); + +void SortRegionFlowById(RegionFlowFrame* flow_frame); + +// Switches each feature with its correspondence, i.e. +// (x, y) <-> (x + dx, x + dy), (dx, dy) -> (-dx, -dy). Same holds for centroid +// and mean flow. Note: Member fundamental_matrix is invalid after inversion, +// FeaturePointLists are not affected from inversion. +void InvertRegionFlow(const RegionFlowFrame& flow_frame, + RegionFlowFrame* inverted_flow_frame); + +// Same as above for feature lists. +void InvertRegionFlowFeatureList(const RegionFlowFeatureList& feature_list, + RegionFlowFeatureList* inverted_feature_list); + +// Inverts a single feature. +void InvertRegionFlowFeature(RegionFlowFeature* feature); + +// Removes features that are out bounds of the domain: +// [bounds, frame_width - bounds] x [bounds, frame_height - bounds]. +void LimitFeaturesToBounds(int frame_width, int frame_height, float bounds, + RegionFlowFeatureList* feature_list); + +// List of saliency points for each frame. +typedef std::deque SaliencyPointList; + +// Normalizes region flow by frame diameter, i.e. uniform downscale such that +// feature list fits within [0, 1]. +void NormalizeRegionFlowFeatureList(RegionFlowFeatureList* feature_list); +// Inverse of above operation. +void DeNormalizeRegionFlowFeatureList(RegionFlowFeatureList* feature_list); + +// Templated implementations. +// Applies model to each feature and displacement vector. 
+template +void TransformRegionFlowFeatureList(const Model& model, + RegionFlowFeatureList* flow_feature_list) { + for (auto& feature : *flow_feature_list->mutable_feature()) { + Vector2_f pt = + ModelAdapter::TransformPoint(model, FeatureLocation(feature)); + Vector2_f match = ModelAdapter::TransformPoint( + model, FeatureMatchLocation(feature)); + feature.set_x(pt.x()); + feature.set_y(pt.y()); + feature.set_dx(match.x() - pt.x()); + feature.set_dy(match.y() - pt.y()); + } +} + +// Similar to above but applies transformation to each feature to derive +// matching location, according to the formula: +// (dx, dy) <-- a * (transformed location - location) + b * (dx, dy) +// e.g. with b = 0, and a = 1, (dx, dy) is replaced with distance +// between transformed location and original location. +// If set_match == true, the original feature location is replaced +// with its matching location. +template +void RegionFlowFeatureListViaTransform( + const Model& model, RegionFlowFeatureList* flow_feature_list, float a, + float b, bool set_match, const MixtureRowWeights* row_weights = nullptr) { + for (auto& feature : *flow_feature_list->mutable_feature()) { + Vector2_f match = + ModelAdapter::TransformPoint(model, FeatureLocation(feature)); + feature.set_dx(b * feature.dx() + a * (match.x() - feature.x())); + feature.set_dy(b * feature.dy() + a * (match.y() - feature.y())); + if (set_match) { + feature.set_x(match.x()); + feature.set_y(match.y()); + } + } +} + +template <> +inline void RegionFlowFeatureListViaTransform( + const MixtureHomography& mix, RegionFlowFeatureList* flow_feature_list, + float a, float b, bool set_match, const MixtureRowWeights* row_weights) { + CHECK(row_weights) << "Row weights required for mixtures."; + + for (auto& feature : *flow_feature_list->mutable_feature()) { + const float* weights = row_weights->RowWeights(feature.y()); + Vector2_f match = MixtureHomographyAdapter::TransformPoint( + mix, weights, FeatureLocation(feature)); + 
feature.set_dx(b * feature.dx() + a * (match.x() - feature.x())); + feature.set_dy(b * feature.dy() + a * (match.y() - feature.y())); + if (set_match) { + feature.set_x(match.x()); + feature.set_y(match.y()); + } + } +} + +// Helper implementation function for functions below. +// Returns pair of where predicate result is +// the boolean result of applying the predicate to the feature. +template +std::pair GetFilteredWeightImpl(const Predicate& predicate, + float reset_value, + const RegionFlowFeature& feature) { + if (feature.irls_weight() == 0.0f) { + return std::make_pair(0.0f, false); // Zero is false by default. + } else if (!predicate(feature)) { + return std::make_pair(reset_value, false); + } else { + return std::make_pair(feature.irls_weight(), true); + } +} + +// If predicate evaluates to false, corresponding irls weight is set to zero. +// Returns number of features with non-zero irls weight. +// Interface for predicate: bool operator()(const RegionFlowFeature&) const +// Example: Predicate compares registration error of a feature under some linear +// model and returns true if error is below some threshold. Consequently all +// features having registration error above the threshold are set to weight zero +// effectively ignoring those features during estimation. +template +int FilterRegionFlowFeatureList(const Predicate& predicate, float reset_value, + RegionFlowFeatureList* flow_feature_list) { + CHECK(flow_feature_list != nullptr); + int num_passing_features = 0; + for (auto& feature : *flow_feature_list->mutable_feature()) { + std::pair filter_result = + GetFilteredWeightImpl(predicate, reset_value, feature); + feature.set_irls_weight(filter_result.first); + if (filter_result.second) { + ++num_passing_features; + } + } + + return num_passing_features; +} + +// Same function as above, but instead of setting the corresponding weights, +// returns resulting weights in a float vector. 
+template +int FilterRegionFlowFeatureWeights(const Predicate& predicate, + float reset_value, + const RegionFlowFeatureList& feature_list, + std::vector* result_weights) { + CHECK(result_weights != nullptr); + result_weights->clear(); + + int num_passing_features = 0; + for (auto feature : feature_list.feature()) { + std::pair filter_result = + GetFilteredWeightImpl(predicate, reset_value, feature); + result_weights->push_back(filter_result.first); + if (filter_result.second) { + ++num_passing_features; + } + } + + return num_passing_features; +} + +// Select features from the passed list for which the predicate is true. +// Returned view contains pointers to mutable features. +template +void SelectFeaturesFromList(const Predicate& predicate, + RegionFlowFeatureList* feature_list, + RegionFlowFeatureView* feature_view) { + CHECK(feature_list != nullptr); + CHECK(feature_view != nullptr); + for (auto& feature : *feature_list->mutable_feature()) { + if (predicate(feature)) { + feature_view->push_back(&feature); + } + } +} + +inline void SelectAllFeaturesFromList(RegionFlowFeatureList* feature_list, + RegionFlowFeatureView* feature_view) { + CHECK(feature_list != nullptr); + CHECK(feature_view != nullptr); + for (auto& feature : *feature_list->mutable_feature()) { + feature_view->push_back(&feature); + } +} + +// Sorts region flow feature views, w.r.t. predicate. Predicate must define: +// bool operator()(const RegionFlowFeature* lhs, +// const RegionFlowFeature* rhs) const; +template +void SortRegionFlowFeatureView(const Predicate& predicate, + RegionFlowFeatureView* feature_view) { + CHECK(feature_view != nullptr); + std::sort(feature_view->begin(), feature_view->end(), predicate); +} + +// Clamps IRLS weight of each RegionFlowFeature to lie within [lower, upper]. +void ClampRegionFlowFeatureIRLSWeights( + float lower, float upper, RegionFlowFeatureView* flow_feature_list); + +// Makes a copy of src to dest without copying any features, i.e. 
dest will +// have the same values as src for all field except the actual features. +// Implemented by temporarily swapping features in and out, therefore source has +// to be mutable but will not be modified. +void CopyToEmptyFeatureList(RegionFlowFeatureList* src, + RegionFlowFeatureList* dst); + +// Intersects passed RegionFlowFeatureLists based on track_id, returning new +// RegionFlowFeatureList indicating new motion from features location in list +// 'from' to feature's location in list 'to' (specified by to_location +// function, e.g. pass FeatureLocation or FeatureMatchLocation here). +// Requires RegionFlowFeatureList's computed with long_tracks. +// Output result is initialized to contain all fields from input from (same +// holds for intersected features, minus their match location). +// For performance reasons, from is passed at mutable pointer to use above +// CopyToEmptyFeatureList, but is not modified on output. +// Optionally outputs source index for each feature in result into feature +// array of from. +void IntersectRegionFlowFeatureList( + const RegionFlowFeatureList& to, + std::function to_location_eval, + RegionFlowFeatureList* from, RegionFlowFeatureList* result, + std::vector* source_indices = nullptr); + +// Streaming representation for long feature tracks. Ingests +// RegionFlowFeatureList for passed frames and maps them to their corresponding +// track id. +// Usage example: +// LongFeatureStream stream; +// for (int f = 0; f < frames; ++f) { +// RegionFlowFeatureList feature_list = ... // from input. +// stream.AddFeatures(feature_list, true, true); +// +// // Traverse tracks starting at the current frame f (going backwards in +// // time). +// for (const auto& track : stream) { +// track.first // Holds id. +// track.second // Holds vector, most recent one are at +// // the end. +// Convert track to a list of points. 
+// vector poly_line; +// stream.FlattenTrack(track.second, &poly_line, nullptr, nullptr); +// +// // Returned track where poly_line[0] is the beginning (oldest) and +// // poly_line[poly_line.size() - 1] the end of the track (most recent +// // point). +// +// for (const Vector2_f point : poly_line) { +// // ... do something ... +// } +// } +class LongFeatureStream { + private: + // Buffers features according to their track id. Most recent region flow + // features are added last. + typedef std::unordered_map> TrackBuffer; + + public: + // Default constructor for LongFeatureStream. The default long feature stream + // is backward. + LongFeatureStream() = default; + + // Constructor for LongFeatureStream. The param forward indicates if the long + // feature stream is forward or backward. + explicit LongFeatureStream(bool forward) : forward_(forward) {} + + // Adds new features for the current frame. Region flow must be computed + // w.r.t to previous or next frame (i.e. inter-frame distance = 1, CHECKED). + // If check_connectivity is specified, CHECKS if a feature's match location + // equals it last known location in the buffer. + // Optionally removes features from the buffer that are not present + // in the current list. + void AddFeatures(const RegionFlowFeatureList& feature_list, + bool check_connectivity, bool purge_non_present_features); + + // Traversal example: + // LongFeatureStream stream; + // for (auto track : stream) { + // track.first // Holds id. + // track.second // Holds vector. + // // Note: These are always backward flow features + // // even if you added forward ones. Ordered in time, + // // oldest features come first. + // vector poly_line; + // stream.FlattenTrack(track.second, &poly_line, nullptr, nullptr); + // } + typename TrackBuffer::const_iterator begin() const { return tracks_.begin(); } + typename TrackBuffer::const_iterator end() const { return tracks_.end(); } + + // Extract track as poly-line (vector of positions). 
+ // Specifically, tracks[0] is the beginning (oldest) and + // tracks[tracks.size() - 1] the end of the track (most recent point). + // Optionally, returns irls weight for each point pair along the track, + // i.e weight at position N, specifies weight of track between points + // N and N + 1. For convenience, weight at last position is replicated, i.e. + // length of tracks and irls_weight is identical. + // Optionally, returns the flow vector associated with each point on the + // track with a direction as requested by the flow direction of the + // constructor. Note: For N points, N - 1 flow vectors are returned. + void FlattenTrack(const std::vector& features, + std::vector* tracks, + std::vector* irls_weight, // optional. + std::vector* flow) const; // optional. + + // Random access. Returns nullptr if not found. + const std::vector* TrackById(int id) const; + + // Convenience function calling TrackById and FlattenTrack. Returns empty + // vector if track id is not present. + std::vector FlattenedTrackById(int id) const; + + private: + // Long Feature tracks indexed by id. + TrackBuffer tracks_; + + // Stores old ids that have been removed. Used during check_connectivity. + std::unordered_set old_ids_; + + // A flag indicating if the long feature stream is forward or backward. + bool forward_ = false; +}; + +// Helper class for testing which features are present, computing overall track +// length and other statistics. +// Usage: +// LongFeatureInfo lfi; +// std::vector feature_lists = FROM_OUTSIDE +// for (const auto& feature_list : feature_lists) { +// lfi.AddFeatures(feature_list); +// // Get track length for each feature, so far. +// std::vector track_length; +// lfi.TrackLenghts(feature_list, &track_length); +// } +class LongFeatureInfo { + public: + // Adds features to current info state. + void AddFeatures(const RegionFlowFeatureList& feature_list); + + // Adds a single feature. 
If used instead of above function, requires + // IncrementFrame to be called manually. + void AddFeature(const RegionFlowFeature& feature); + + // Returns track length for each passed feature. + // Note: If feature is not yet present, zero is returned as length. + void TrackLengths(const RegionFlowFeatureList& feature_list, + std::vector* track_lengths) const; + + // Same as above for an individual feature. + int TrackLength(const RegionFlowFeature& feature) const; + + // Returns starting frame for a feature. + int TrackStart(const RegionFlowFeature& feature) const; + + int NumFrames() const { return num_frames_; } + + void Reset(); + + // Returns track length at passed percentile across all tracks added so far. + int GlobalTrackLength(float percentile) const; + + void IncrementFrame() { ++num_frames_; } + + private: + struct TrackInfo { + int length = 0; + int start = 0; + }; + + // Maps track id to above info struct. + std::unordered_map track_info_; + + int num_frames_ = 0; +}; + +// Scales a salient point in x and y by specified scales. For example, use to +// map a salient point to a specific frame width and height. +void ScaleSalientPoint(float scale_x, float scale_y, + SalientPoint* salient_point); + +// Scales salient points in saliency by factor scale. If normalize_to_scale +// is true, the weights of salient points per frame sum up to scale. +void ScaleSalientPointFrame(float scale, bool normalize_to_scale, + SalientPointFrame* saliency); + +// Convenience function for SaliencyPointList's invoking above function on each +// frame. +void ScaleSaliencyList(float scale, bool normalize_to_scale, + SaliencyPointList* saliency_list); + +// Resets the normalized bounds of salient points in saliency list to +// specified bounds. 
+void ResetSaliencyBounds(float left, float bottom, float right, float top, + SaliencyPointList* saliency_list); + +// Returns major and minor axis from covariance matrix (scaled by one sigma) in +// each direction) and angle of major axis (in radians, counter clockwise). +// Returns false if degenerate (ellipses with less than 2 pixels in diameter +// in either direction). +// Assuming 2x2 Covariance matrix of the form [a bc +// bc d] +// for variances specified in pixels^2. +bool EllipseFromCovariance(float a, float bc, float d, + Vector2_f* axis_magnitude, float* angle); + +// Calculate the bounding box from an axis aligned ellipse defined by the major +// and minor axis. Returns 4 corners of a minimal bounding box. +void BoundingBoxFromEllipse(const Vector2_f& center, float norm_major_axis, + float norm_minor_axis, float angle, + std::vector* bounding_box); + +// Helper function used by BuildFeature grid to determine the sample taps +// for a domain of size dim_x x dim_y with a specified tap_radius. +void GridTaps(int dim_x, int dim_y, int tap_radius, + std::vector>* taps); + +// Generic function to bin features (of generic type Feature, specified for a +// set of frames) over the domain frame_width x frame_height into equally +// sized square bins of size grid_resolution x grid_resolution. +// Function outputs for each bin the indicies of neighboring bins according +// 3x3 or 5x5 neighborhood centered at that bin. Also output number of bins +// created along each dimension as 2 dim vector (x, y). +// FeatureEvaluator should implement: +// Vector2_f operator()(const Feature& f) const { +// // Returns spatial location for feature f; +// } +template +using FeatureFrame = std::vector; + +template +using FeatureGrid = std::vector>; + +template +void BuildFeatureGrid( + float frame_width, float frame_height, float grid_resolution, + const std::vector>& feature_views, + const FeatureEvaluator& evaluator, + std::vector>* feature_taps_3, // Optional. 
+ std::vector>* feature_taps_5, // Optional. + Vector2_i* num_grid_bins, // Optional. + std::vector>* feature_grids) { + CHECK(feature_grids); + CHECK_GT(grid_resolution, 0.0f); + + const int num_frames = feature_views.size(); + const int grid_dim_x = std::ceil(frame_width / grid_resolution); + const int grid_dim_y = std::ceil(frame_height / grid_resolution); + const int grid_size = grid_dim_x * grid_dim_y; + const float grid_scale = 1.0f / grid_resolution; + + // Pre-compute neighbor grids. + feature_grids->clear(); + feature_grids->resize(num_frames); + for (int f = 0; f < num_frames; ++f) { + // Populate. + auto& curr_grid = (*feature_grids)[f]; + curr_grid.resize(grid_size); + const FeatureFrame& curr_view = feature_views[f]; + for (int i = 0, size = curr_view.size(); i < size; ++i) { + Feature* feature = curr_view[i]; + Vector2_f feature_loc = evaluator(*feature); + const int x = feature_loc.x() * grid_scale; + const int y = feature_loc.y() * grid_scale; + DCHECK_LT(y, grid_dim_y); + DCHECK_LT(x, grid_dim_x); + const int grid_loc = y * grid_dim_x + x; + curr_grid[grid_loc].push_back(feature); + } + } + + if (feature_taps_3 != NULL) { + GridTaps(grid_dim_x, grid_dim_y, 1, feature_taps_3); + } + if (feature_taps_5 != NULL) { + GridTaps(grid_dim_x, grid_dim_y, 2, feature_taps_5); + } + + if (num_grid_bins) { + *num_grid_bins = Vector2_i(grid_dim_x, grid_dim_y); + } +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_H_ diff --git a/mediapipe/util/tracking/region_flow.proto b/mediapipe/util/tracking/region_flow.proto new file mode 100644 index 000000000..351396771 --- /dev/null +++ b/mediapipe/util/tracking/region_flow.proto @@ -0,0 +1,295 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

option java_package = "com.google.mediapipe.tracking";
option java_multiple_files = true;

// Captures additional information about a RegionFlowFeature's
// surrounding patch.
// Using MotionEstimation::RetrieveRegionFlowFeatureList or
// ComputeRegionFlowFeatureDescriptors the patch descriptor has the following
// layout:
// (9 dimensional: 3 mean intensities, 3x3 covariance matrix, (only store upper
// half (6 elems) in column major order, i.e. indices for data in patch
// descriptor refer to:
// mean: 0 1 2,  covariance: 3 4 5
//                             6 7
//                               8
message PatchDescriptor {
  repeated float data = 1;  // The actual feature descriptor.
}

// Binary feature descriptor for a particular feature.
// For example: orb
// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.370.4395&rep=rep1&type=pdf
message BinaryFeatureDescriptor {
  optional bytes data = 1;
}

// Internal datastructure used during temporal IRLS smoothing.
message TemporalIRLSSmoothing {
  optional float weight_sum = 1 [default = 0];
  optional float value_sum = 2 [default = 0];
}

// Tracked feature at location (x,y) with flow (dx, dy) and patch based
// error (sum of absolute value of intensity difference).
// Next tag: 19
message RegionFlowFeature {
  optional float x = 1 [default = 0];
  optional float y = 2 [default = 0];
  optional float dx = 3 [default = 0];
  optional float dy = 4 [default = 0];

  // Features that belong to the same feature track are assigned a unique id
  // and are identified via it.
  // Note, this id is only unique within the lifetime of a RegionFlowComputation
  // object. That is, if distribution or parallelization using multiple
  // instances was used, the ids are only unique within that instance context.
  optional int32 track_id = 13 [default = -1];  // no id.

  // Tracking error as patch intensity residual (SSD).
  optional float tracking_error = 5 [default = 0];

  // Inverse of registration error (in pixels), after parametric motion model
  // fitting. Values are in [0, 1e6].
  // Low values correspond to outliers, high values to inliers.
  // Set by MotionEstimation::EstimateMotions*
  optional float irls_weight = 6 [default = 1.0];

  // Corner response (computed as minimum eigenvalue of
  // block filtered 2nd moment matrix).
  optional float corner_response = 11 [default = 0.0];

  // Patch feature descriptors. *For internal use only*. External clients should
  // not rely on their contents.
  optional PatchDescriptor feature_descriptor = 7;
  optional PatchDescriptor feature_match_descriptor = 8;

  // Internal datastructure used temporally during temporal IRLS smoothing.
  optional TemporalIRLSSmoothing internal_irls = 10;

  // Optional label for debugging purposes.
  optional string label = 14;

  // Flags indicating specific statuses.
  enum Flags {
    FLAG_BROKEN_TRACK = 1;  // Used for long feature tracks if track id
                            // was reset.
  }

  optional int32 flags = 15;

  // Unique feature id per RegionFlowComputation object.
  optional int32 feature_id = 16;

  // octave (pyramid layer) from which the keypoint has been extracted
  optional int32 octave = 17 [default = 0];

  // Feature descriptor for the current feature.
  optional BinaryFeatureDescriptor binary_feature_descriptor = 18;

  // Deprecated fields.
  extensions 9, 12;
}

// RegionFlowFrame is an optical flow representation where each region has a
// consistent optical flow (adheres to local translational model).
// Regions are arranged in a regular grid according to BlockDescriptor.
// Next tag: 11.
message RegionFlowFrame {
  // Next tag: 8
  message RegionFlow {
    required int32 region_id = 1;

    // Mean anchor point (centroid) of flow vector and mean flow.
    optional float centroid_x = 2 [default = 0];
    optional float centroid_y = 3 [default = 0];
    optional float flow_x = 4 [default = 0];
    optional float flow_y = 5 [default = 0];

    repeated RegionFlowFeature feature = 7;

    // Deprecated fields.
    extensions 6;
  }

  // Sorted by id for quick lookup.
  repeated RegionFlow region_flow = 1;

  // Total number of features in all RegionFlow's.
  optional int32 num_total_features = 2 [default = 0];

  // If set, indicates that the frame's region flow is unstable.
  // (not enough features or coverage too low).
  optional bool unstable_frame = 4 [default = false];

  // Blur score of the current frame is defined as the n-th percentile
  // of the cornerness of the input frame evaluated over regions of high
  // cornerness. For details see BlurScoreOptions in
  // region_flow_computation.proto.
  // The actual value is pretty meaningless, but relative to the blur score
  // of other frames one can detect blurry frames, e.g. by a 'significant'
  // local maxima in a sequence of blur_scores.
  optional float blur_score = 7;

  optional int32 frame_width = 8;
  optional int32 frame_height = 9;

  // Region flow is estimated using a grid of equal sized bins as regions.
  // BlockDescriptor specifies size of bins/blocks.
  message BlockDescriptor {
    optional int32 block_width = 1;
    optional int32 block_height = 2;
    optional int32 num_blocks_x = 3 [default = 0];
    optional int32 num_blocks_y = 4 [default = 0];
  }
  optional BlockDescriptor block_descriptor = 10;

  // Deprecated fields.
  extensions 3, 5, 6;
}

// Encapsulates a list of features with associated flow.
+// Can be extracted from RegionFlow via GetRegionFlowFeatureList +// declared in region_flow.h. This is the essential (additional) information +// required by Cropper using wobble_suppression with displacements. +// Next tag: 13 +message RegionFlowFeatureList { + repeated RegionFlowFeature feature = 1; + optional int32 frame_width = 2; + optional int32 frame_height = 3; + + // Set from corresponding RegionFlowFrame field. + optional bool unstable = 4 [default = false]; + + // Records the minimum distance from the image border for each feature and + // matching feature (if enforced > 0). + optional int32 distance_from_border = 5 [default = 0]; + + // Set from corresponding RegionFlowFrame field. + optional float blur_score = 6; + + // If set, indicates, that features represent long tracks, i.e. each feature + // has a valid track_id() >= 0. + optional bool long_tracks = 7 [default = false]; + + // If long_tracks, stores number of long feature tracks that got rejected in + // this frame, as their patches were deemed inconsistent with the track's very + // first extracted patch. + optional float frac_long_features_rejected = 8 [default = 0]; + + // Measures visual consistency between adjacent frames. In particular, stores + // the absolute *change* in visual difference between two adjancent frame + // pairs, i.e. the modulus of the 2nd derivative of the frame appearance. + // Normalized w.r.t. number of channels and total pixels of the underlying + // frame. + // In particular for sudden changes (e.g. shot boundaries) this value will + // be significantly non-zero (> 0.05). + // Negative value per default indicates no consistency has been computed. + optional float visual_consistency = 9 [default = -1]; + + // Timestamp in micro seconds of the underlying frame, that is the frame + // for which the source features (not matching features) were computed. + optional int64 timestamp_usec = 10 [default = 0]; + + // Denotes the frame that flow was computed w.r.t. 
to, locally to the current + // frame. For example, if current frame is N, N + match_frame is the matching + // frame that flow was computed to. + // Values < 0 indicate backward tracking, while values > 0 indicate forward + // tracking. By default, for empty feature lists, matching frame is the + // same as current frame, i.e. match_frame = 0. + optional int32 match_frame = 11 [default = 0]; + + // Set, if frame is estimated to be an exact duplicate of the previous frame. + optional bool is_duplicated = 12 [default = false]; +} + +// Salient point location (normalized w.r.t. frame_width and frame_height, i.e. +// specified in the domain [0, 1] x [0, 1]). + +// For TYPE_INCLUDE: +// During retargeting and stabilization salient points introduce constraints +// that will try to keep the normalized location in the rectangle +// frame_size - normalized bounds. +// For this soft constraints are used, therefore the weight specifies +// how "important" the salient point is (higher is better). +// In particular for each point p the retargeter introduces two pairs of +// constraints of the form: +// x - slack < width - right +// and x + slack > 0 + left, with slack > 0 +// where the weight specifies the importance of the slack. +// +// For TYPE_EXCLUDE_*: +// Similar to above, but constraints are introduced to keep +// the point to the left of the left bound OR the right of the right bound. +// In particular: +// x - slack < left OR +// x + slack >= right +// Similar to above, the weight specifies the importance of the slack. +// +// Note: Choosing a too high weight can lead to +// jerkiness as the stabilization essentially starts tracking the salient point. +message SalientPoint { + // Normalized location of the point (within domain [0, 1] x [0, 1]. 
+ optional float norm_point_x = 1 [default = 0.0]; + optional float norm_point_y = 2 [default = 0.0]; + + enum SalientPointType { + TYPE_INCLUDE = 1; + TYPE_EXCLUDE_LEFT = 2; + TYPE_EXCLUDE_RIGHT = 3; + } + + // Salient point type. By default we try to frame the salient point within + // the bounding box specified by left, bottom, right, top. Alternatively, one + // can choose to exclude the point. For details, see discussion above. + optional SalientPointType type = 11 [default = TYPE_INCLUDE]; + + // Bounds are specified in normalized coordinates [0, 1], FROM the specified + // border. Opposing bounds (e.g. left and right) may not add to values + // larger than 1. + // Default bounds center salient point within centering third of the frame. + optional float left = 3 [default = 0.3]; + optional float bottom = 4 [default = 0.3]; + optional float right = 9 [default = 0.3]; + optional float top = 10 [default = 0.3]; + + optional float weight = 5 [default = 15]; + + // In addition salient point can represent a region of interest (defined as + // ellipse of size norm_major x norm_minor (normalized to [0, 1] domain) + // which orientation is given by angle (in radians in [0, pi]). + // Due to aspect ratio change of the normalized domain, it is recommended that + // transformations to other domains are done via the ScaleSalientPoint + // function. + optional float norm_major = 6; + optional float norm_minor = 7; + + // Angle of major axis with x-axis (counter-clock wise, in radians). + optional float angle = 8; + + extensions 20000 to max; +} + +// Aggregates SalientPoint's for a frame. 
+message SalientPointFrame { + repeated SalientPoint point = 1; + + extensions 20000 to max; +} diff --git a/mediapipe/util/tracking/region_flow_computation.cc b/mediapipe/util/tracking/region_flow_computation.cc new file mode 100644 index 000000000..331246220 --- /dev/null +++ b/mediapipe/util/tracking/region_flow_computation.cc @@ -0,0 +1,3560 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/region_flow_computation.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Eigen/Core" +#include "absl/container/node_hash_set.h" +#include "absl/memory/memory.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_features2d_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/opencv_video_inc.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/camera_motion.pb.h" +#include "mediapipe/util/tracking/image_util.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/motion_estimation.h" +#include "mediapipe/util/tracking/motion_estimation.pb.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/parallel_invoker.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/tone_estimation.h" +#include 
"mediapipe/util/tracking/tone_estimation.pb.h" +#include "mediapipe/util/tracking/tone_models.h" +#include "mediapipe/util/tracking/tone_models.pb.h" + +using std::max; +using std::min; + +namespace mediapipe { + +typedef RegionFlowFrame::RegionFlow RegionFlow; +typedef RegionFlowFeature Feature; +constexpr float kZeroMotion = 0.25f; // Quarter pixel average motion. + +// Helper struct used by RegionFlowComputation and MotionEstimation. +// Feature position, flow and error. Unique id per track, set to -1 if no such +// id can be assigned. +struct TrackedFeature { + TrackedFeature(const Vector2_f& point_, const Vector2_f& flow_, + float tracking_error_, float corner_response_, int octave_, + int track_id_ = -1, float verify_dist_ = 0) + : point(point_), + flow(flow_), + tracking_error(tracking_error_), + corner_response(corner_response_), + octave(octave_), + irls_weight(1.0f), + num_bins(1), + track_id(track_id_), + verify_dist(verify_dist_) {} + + Vector2_f point; + Vector2_f flow; + float tracking_error = 0; + float corner_response = 0; + int octave = 0; + float irls_weight = 1.0f; + int num_bins = 1; // Total number of bins feature is binned into. + int track_id = -1; // Unique id, assigned to each feature belonging to the + // same track. Negative values indicates no id. + float verify_dist = 0; + + // Flags as defined by RegionFlowFeature. + int flags = 0; + + // Descriptors of this feature (single row). + cv::Mat descriptors; + + // Original neighborhood of the feature. Refers to the patch that the feature + // was extracted the very first time. + // Using shared_ptr instead of unique_ptr to make TrackingData copyable. + // We don't use cv::Mat directly (which is reference counted) as + // neighborhoods are only used for long feature verification (which is + // optional). + std::shared_ptr orig_neighborhood; + + void Invert() { + point += flow; + flow = -flow; + } +}; + +// Inverts features (swaps location and matches). In-place operation OK, i.e. 
+// inverted_list == &list acceptable and faster. +void InvertFeatureList(const TrackedFeatureList& list, + TrackedFeatureList* inverted_list) { + if (inverted_list != &list) { + *inverted_list = list; + } + + for (auto& feature : *inverted_list) { + feature.Invert(); + } +} + +// Allocates pyramid images of sufficient size (suggested OpenCV settings, +// independent of number of pyramid levels). +void AllocatePyramid(int frame_width, int frame_height, cv::Mat* pyramid) { + const int pyramid_width = frame_width + 8; + const int pyramid_height = frame_height / 2 + 1; + pyramid->create(pyramid_height, pyramid_width, CV_8UC1); +} + +namespace { + +// lab_window is used as scratch space only, to avoid allocations. +void GetPatchDescriptorAtPoint(const cv::Mat& rgb_frame, const Vector2_i& pt, + const int radius, cv::Mat* lab_window, + PatchDescriptor* descriptor) { + CHECK(descriptor); + descriptor->clear_data(); + + // Reserve enough data for mean and upper triangular part of + // covariance matrix. + descriptor->mutable_data()->Reserve(3 + 6); + + // Extract a window of the RGB frame for Lab color conversion. We know that at + // this point the window doesn't overlap with the frame boundary. The + // windowing operation just generates a reference and doesn't copy the values. + const int diameter = 2 * radius + 1; + const cv::Mat rgb_window = + rgb_frame(cv::Rect(pt.x() - radius, pt.y() - radius, diameter, diameter)); + + // Compute channel sums and means. + int sum[3] = {0, 0, 0}; + for (int y = 0; y < diameter; ++y) { + const uint8* data = rgb_window.ptr(y); + for (int x = 0; x < diameter; ++x, data += 3) { + for (int c = 0; c < 3; ++c) { + sum[c] += data[c]; + } + } + } + const float scale = 1.f / (diameter * diameter); + for (int c = 0; c < 3; ++c) { + descriptor->add_data(sum[c] * scale); // Mean value. + } + + const float denom = 1.0f / (diameter * diameter); + + // Compute the channel dot products, after centering around the respective + // channel means. 
Only computing upper triangular part. + int product[3][3]; + for (int c = 0; c < 3; ++c) { + for (int d = c; d < 3; ++d) { + // We want to compute + // sum_{x,y}[(data[c] - mean[c]) * (data[d] - mean[d])], + // which simplifies to + // sum_{x,y}[data[c] * data[d]] - sum[c] * sum[d] / N + // using N = diameter * diameter and sum[c] = N * mean[c]. + product[c][d] = -sum[c] * sum[d] * denom; + for (int y = 0; y < diameter; ++y) { + const uint8* data = rgb_window.ptr(y); + for (int x = 0; x < diameter; ++x, data += 3) { + product[c][d] += static_cast(data[c]) * data[d]; + } + } + } + } + + // Finally, add the descriptors only storring upper triangular part. + for (int c = 0; c < 3; ++c) { + for (int d = c; d < 3; ++d) { + descriptor->add_data(product[c][d] * scale); + } + } +} + +class PatchDescriptorInvoker { + public: + PatchDescriptorInvoker(const cv::Mat& rgb_frame, + const cv::Mat* prev_rgb_frame, int radius, + RegionFlowFeatureList* features) + : rgb_frame_(rgb_frame), + prev_rgb_frame_(prev_rgb_frame), + radius_(radius), + features_(features) {} + + void operator()(const BlockedRange& range) const { + cv::Mat lab_window; // To avoid repeated allocations below. 
+ for (int feature_idx = range.begin(); feature_idx != range.end(); + ++feature_idx) { + RegionFlowFeature* feature = features_->mutable_feature(feature_idx); + Vector2_i pt(FeatureIntLocation(*feature)); + DCHECK_GE(pt.x(), radius_); + DCHECK_GE(pt.y(), radius_); + DCHECK_LT(pt.x(), rgb_frame_.cols - radius_); + DCHECK_LT(pt.y(), rgb_frame_.rows - radius_); + GetPatchDescriptorAtPoint(rgb_frame_, pt, radius_, &lab_window, + feature->mutable_feature_descriptor()); + + if (prev_rgb_frame_) { + Vector2_i pt_match(FeatureMatchIntLocation(*feature)); + DCHECK_GE(pt_match.x(), radius_); + DCHECK_GE(pt_match.y(), radius_); + DCHECK_LT(pt_match.x(), rgb_frame_.cols - radius_); + DCHECK_LT(pt_match.y(), rgb_frame_.rows - radius_); + GetPatchDescriptorAtPoint(*prev_rgb_frame_, pt_match, radius_, + &lab_window, + feature->mutable_feature_match_descriptor()); + } + } + } + + private: + const cv::Mat& rgb_frame_; + const cv::Mat* prev_rgb_frame_; + int radius_; + RegionFlowFeatureList* features_; +}; + +} // namespace. + +// Computes patch descriptor in color domain (LAB), see region_flow.proto for +// specifics. +// If optional parameter prev_rgb_frame is set, also computes corresponding +// feature_match_descriptor. +// IMPORTANT: Ensure that patch_descriptor_rad <= distance_from_border in +// GetRegionFlowFeatureList. Checked by function. 
+void ComputeRegionFlowFeatureDescriptors( + const cv::Mat& rgb_frame, const cv::Mat* prev_rgb_frame, + int patch_descriptor_radius, RegionFlowFeatureList* flow_feature_list) { + const int rows = rgb_frame.rows; + const int cols = rgb_frame.cols; + CHECK_EQ(rgb_frame.depth(), CV_8U); + CHECK_EQ(rgb_frame.channels(), 3); + + if (prev_rgb_frame) { + CHECK_EQ(prev_rgb_frame->depth(), CV_8U); + CHECK_EQ(prev_rgb_frame->channels(), 3); + CHECK_EQ(prev_rgb_frame->rows, rows); + CHECK_EQ(prev_rgb_frame->cols, cols); + } + + CHECK_LE(patch_descriptor_radius, flow_feature_list->distance_from_border()); + + ParallelFor( + 0, flow_feature_list->feature_size(), 1, + PatchDescriptorInvoker(rgb_frame, prev_rgb_frame, patch_descriptor_radius, + flow_feature_list)); +} + +// Stores 2D location's of feature points and their corresponding descriptors, +// where the i'th row in descriptors corresponds to the i'th entry in +// key_points. +struct RegionFlowComputation::ORBFeatureDescriptors { + cv::Mat descriptors; + std::vector key_points; + bool computed; + + ORBFeatureDescriptors() { Reset(); } + + void Reset() { + key_points.clear(); + computed = false; + } +}; + +struct RegionFlowComputation::FrameTrackingData { + cv::Mat frame; + + // Pyramid used for tracking. Just contains the a single image if old + // c-interface is used. + std::vector pyramid; + cv::Mat blur_data; + cv::Mat tiny_image; // Used if visual consistency verification is performed. + cv::Mat mask; // Features need to be extracted only where mask value > 0. + float mean_intensity = 0; // Mean intensity of the frame. + + // Pyramid used during feature extraction at multiple levels. + std::vector extraction_pyramid; + + // Records number of pyramid levels stored by member pyramid. If zero, pyramid + // has not been computed yet. + int pyramid_levels = 0; + + // Features extracted in this frame or tracked from a source frame. + std::vector features; + + // FrameTrackingData that resulting features were tracked from. 
+ FrameTrackingData* source = nullptr; + + // Indicates for each feature, corresponding source feature index (index into + // above features vector for source FrameTrackingData). + std::vector feature_source_map; + + // If set, indicates that member features was pre-initialized. + bool features_initialized = false; + + // Time (in frames) when the last feature extraction was carried out for the + // features used in this frame. Time increases by 1 every time we reuse + // tracked features as the features for this frame. + int last_feature_extraction_time = -1; + + // Number of extracted and tracked features in the original extraction frame + // (referred to by last_feature_extraction_time) and the current frame. + // That is stores the number inliers according to function + // GetFeatureTrackInliers. + int num_original_extracted_and_tracked = -1; // Negative for not set. + int num_extracted_and_tracked = -1; // Negative for not set. + + // 1:1 mapping w.r.t. features. + std::vector corner_responses; + + // 1:1 mapping w.r.t. features. Records octave each feature belongs to. + std::vector octaves; + + // 1:1 mapping w.r.t. features. Records track id each feature belongs to, + // use -1 if no such track id exists. Only used for long feature tracks. + std::vector track_ids; + + // 1:1 mapping w.r.t. features. Stores the original patch neighborhood when + // the feature was extracted for the first time, that is for features with + // assigned track_id (>= 0) the data refers to a neighborhood in an earlier + // frame or else the neighborhood in the current frame. + // Stores a CV_8U grayscale square patch with length + // TrackingOptions::tracking_window_size(). + std::shared_ptr> neighborhoods; + + // Absolute frame number of this FrameTrackingData. + int frame_num = 0; + + // Timestamp of the underlying frame. + int64 timestamp_usec = 0; + + // Difference of this FrameTrackingData's tiny_image w.r.t. previous one, + // i.e. one frame earlier. 
+ float tiny_image_diff = 0.0f; + + // Initial transform for matching features. Optional. Always stores the + // transform from current to previous frame. + std::shared_ptr initial_transform; + + ORBFeatureDescriptors orb; + + bool use_cv_tracking = false; + + FrameTrackingData(int width, int height, int extraction_levels, + bool _use_cv_tracking) + : use_cv_tracking(_use_cv_tracking) { + // Extraction pyramid. + extraction_pyramid.clear(); + for (int i = 0, iwidth = width, iheight = height; i < extraction_levels; + ++i) { + extraction_pyramid.push_back(cv::Mat(iheight, iwidth, CV_8UC1)); + iwidth = (iwidth + 1) / 2; + iheight = (iheight + 1) / 2; + } + CHECK_GE(extraction_levels, 1); + // Frame is the same as first extraction level. + frame = extraction_pyramid[0]; + + if (!use_cv_tracking) { + // Tracking pyramid for old c-interface. + pyramid.resize(1); + AllocatePyramid(width, height, &pyramid[0]); + } + } + + void BuildPyramid(int levels, int window_size, bool with_derivative) { + if (use_cv_tracking) { +#if CV_MAJOR_VERSION == 3 + // No-op if not called for opencv 3.0 (c interface computes + // pyramids in place). + // OpenCV changed how window size gets specified from our radius setting + // < 2.2 to diameter in 2.2+. + cv::buildOpticalFlowPyramid( + frame, pyramid, cv::Size(2 * window_size + 1, 2 * window_size + 1), + levels, with_derivative); + // Store max level for above pyramid. 
+ pyramid_levels = levels; +#endif + } + } + + void Reset(int frame_num_, int64 timestamp_) { + frame_num = frame_num_; + timestamp_usec = timestamp_; + pyramid_levels = 0; + ResetFeatures(); + neighborhoods.reset(); + orb.Reset(); + } + + void ResetFeatures() { + features.clear(); + corner_responses.clear(); + octaves.clear(); + track_ids.clear(); + feature_source_map.clear(); + if (neighborhoods != nullptr) { + neighborhoods->clear(); + } + source = nullptr; + features_initialized = false; + last_feature_extraction_time = 0; + num_original_extracted_and_tracked = -1; + num_extracted_and_tracked = -1; + } + + void PreAllocateFeatures(int num_features) { + features.reserve(num_features); + octaves.reserve(num_features); + corner_responses.reserve(num_features); + track_ids.reserve(num_features); + } + + // Adds new feature and with required information. + void AddFeature(const cv::Point2f& location, float corner_response, + int octave, + int track_id, // optional, -1 for none. + const cv::Mat* neighborhood) { // optional. + features.push_back(location); + corner_responses.push_back(corner_response); + octaves.push_back(octave); + track_ids.push_back(track_id); + if (neighborhoods != nullptr) { + if (neighborhood) { + neighborhoods->push_back(*neighborhood); + } else { + neighborhoods->push_back(cv::Mat()); + } + } + } + + void RemoveFeature(int pos) { + DCHECK_LT(pos, features.size()); + features.erase(features.begin() + pos); + feature_source_map.erase(feature_source_map.begin() + pos); + corner_responses.erase(corner_responses.begin() + pos); + octaves.erase(octaves.begin() + pos); + track_ids.erase(track_ids.begin() + pos); + if (neighborhoods) { + neighborhoods->erase(neighborhoods->begin() + pos); + } + } + + // Stores grayscale square patch with length patch_size extracted at center in + // image frame and stores result in patch. 
+ void ExtractPatch(const cv::Point2f& center, int patch_size, cv::Mat* patch) { + CHECK(patch != nullptr); + patch->create(patch_size, patch_size, CV_8UC1); + cv::getRectSubPix(frame, cv::Size(patch_size, patch_size), center, *patch); + } +}; + +// Data to be used across AddImage calls for long feature tracking. +struct RegionFlowComputation::LongTrackData { + LongTrackData() = default; + + // Returns next id and records its start frame. + int CreateNextTrackId(int start_frame, float motion_mag) { + track_info[next_track_id] = TrackInfo(start_frame, motion_mag); + const int result = next_track_id; + + // Advance. + ++next_track_id; + if (next_track_id < 0) { + LOG(ERROR) << "Exhausted maximum possible ids. RegionFlowComputation " + << "instance lifetime is likely to be too long. Consider " + << "chunking the input."; + next_track_id = 0; + } + + return result; + } + + // Returns last id that was created or -1 if an id was never created. + int LastTrackId() const { return next_track_id - 1; } + + // Returns -1 if id is not present. + int StartFrameForId(int id) const { + auto id_iter = track_info.find(id); + if (id_iter == track_info.end()) { + return -1; + } else { + return id_iter->second.start_frame; + } + } + + // Clears buffered information for all features that are not present + // anymore, i.e. not within the specified hash_set. + void RemoveAbsentFeatureEntries( + const absl::node_hash_set& present_features) { + auto entry = track_info.begin(); + while (entry != track_info.end()) { + if (present_features.find(entry->first) == present_features.end()) { + // Not present anymore, remove! 
+ auto to_erase = entry; + ++entry; + track_info.erase(to_erase); + } else { + ++entry; + } + } + } + + float MotionMagForId(int id) const { + auto id_iter = track_info.find(id); + DCHECK(id_iter != track_info.end()); + return id_iter->second.motion_mag; + } + + void UpdateMotion(int id, float motion_mag) { + auto id_iter = track_info.find(id); + DCHECK(id_iter != track_info.end()); + if (id_iter->second.motion_mag >= 0) { + id_iter->second.motion_mag = + id_iter->second.motion_mag * 0.5f + 0.5f * motion_mag; + } + } + + // Next id to be assigned to a new track. + int next_track_id = 0; + + // Holds the previous result to seed the next frame. + TrackedFeatureList prev_result; + + // Records for each track id some additional information. + struct TrackInfo { + TrackInfo() {} + TrackInfo(int _start_frame, float _motion_mag) + : start_frame(_start_frame), motion_mag(_motion_mag) {} + int start_frame = 0; // Start frame of track. + float motion_mag = 0; // Smoothed average motion. -1 for unknown. + }; + + std::unordered_map track_info; +}; + +template +std::unique_ptr MakeUnique(T* ptr) { + return std::unique_ptr(ptr); +} + +RegionFlowComputation::RegionFlowComputation( + const RegionFlowComputationOptions& options, int frame_width, + int frame_height) + : options_(options), + frame_width_(frame_width), + frame_height_(frame_height) { + switch (options_.gain_correct_mode()) { + case RegionFlowComputationOptions::GAIN_CORRECT_DEFAULT_USER: + // Do nothing, simply use supplied bounds. 
+ break; + + case RegionFlowComputationOptions::GAIN_CORRECT_VIDEO: { + auto* gain_bias = options_.mutable_gain_bias_bounds(); + gain_bias->Clear(); + gain_bias->set_lower_gain(0.8f); + gain_bias->set_upper_gain(1.2f); + gain_bias->set_lower_bias(-0.2f); + gain_bias->set_upper_bias(0.2f); + gain_bias->set_min_inlier_weight(0.2f); + gain_bias->set_min_inlier_fraction(0.6f); + break; + } + + case RegionFlowComputationOptions::GAIN_CORRECT_HDR: { + auto* gain_bias = options_.mutable_gain_bias_bounds(); + gain_bias->Clear(); + gain_bias->set_lower_gain(0.8f); + gain_bias->set_lower_gain(0.33f); + gain_bias->set_upper_gain(3.0f); + gain_bias->set_lower_bias(-0.5f); + gain_bias->set_upper_bias(0.5f); + gain_bias->set_min_inlier_weight(0.15f); + gain_bias->set_min_inlier_fraction(0.6f); + break; + } + + case RegionFlowComputationOptions::GAIN_CORRECT_PHOTO_BURST: { + auto* gain_bias = options_.mutable_gain_bias_bounds(); + gain_bias->Clear(); + gain_bias->set_min_inlier_fraction(0.6f); + gain_bias->set_min_inlier_weight(0.1f); + gain_bias->set_lower_gain(0.4f); + gain_bias->set_upper_gain(2.5f); + gain_bias->set_lower_bias(-0.6f); + gain_bias->set_upper_bias(0.6f); + break; + } + } + + CHECK_NE(options.tracking_options().output_flow_direction(), + TrackingOptions::CONSECUTIVELY) + << "Output direction must be either set to FORWARD or BACKWARD."; + use_downsampling_ = options_.downsample_mode() != + RegionFlowComputationOptions::DOWNSAMPLE_NONE; + downsample_scale_ = 1; + original_width_ = frame_width_; + original_height_ = frame_height_; + + switch (options_.downsample_mode()) { + case RegionFlowComputationOptions::DOWNSAMPLE_NONE: + break; + case RegionFlowComputationOptions::DOWNSAMPLE_TO_MAX_SIZE: { + const float max_size = std::max(frame_width_, frame_height_); + if (max_size > 1.03f * options_.downsampling_size()) { + downsample_scale_ = max_size / options_.downsampling_size(); + if (options_.round_downsample_factor()) { + downsample_scale_ = 
std::round(downsample_scale_); + } + } + break; + } + case RegionFlowComputationOptions::DOWNSAMPLE_TO_MIN_SIZE: { + const float min_size = std::min(frame_width_, frame_height_); + if (min_size > 1.03f * options_.downsampling_size()) { + downsample_scale_ = min_size / options_.downsampling_size(); + if (options_.round_downsample_factor()) { + downsample_scale_ = std::round(downsample_scale_); + } + } + break; + } + case RegionFlowComputationOptions::DOWNSAMPLE_BY_FACTOR: + case RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE: { + CHECK_GE(options_.downsample_factor(), 1); + downsample_scale_ = options_.downsample_factor(); + break; + } + case RegionFlowComputationOptions::DOWNSAMPLE_BY_SCHEDULE: { + const int frame_area = frame_width_ * frame_height_; + if (frame_area <= (16 * 1.03 / 9 * 360 * 360)) { + downsample_scale_ = + options_.downsample_schedule().downsample_factor_360p(); + } else if (frame_area <= (16 * 1.03 / 9 * 480 * 480)) { + downsample_scale_ = + options_.downsample_schedule().downsample_factor_480p(); + } else if (frame_area <= (16 * 1.03 / 9 * 720 * 720)) { + downsample_scale_ = + options_.downsample_schedule().downsample_factor_720p(); + } else { + downsample_scale_ = + options_.downsample_schedule().downsample_factor_1080p(); + } + break; + } + } + + frame_width_ = std::round(frame_width_ / downsample_scale_); + frame_height_ = std::round(frame_height_ / downsample_scale_); + + if (use_downsampling_ && + options_.downsample_mode() != + RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) { + // Make downscaled size even. + frame_width_ += frame_width_ % 2; + frame_height_ += frame_height_ % 2; + + LOG(INFO) << "Using a downsampling scale of " << downsample_scale_; + } + + // Make sure value is equal to local variable, in case someone uses that on + // accident below. + frame_width = frame_width_; + frame_height = frame_height_; + + // Allocate temporary frames. 
+ switch (options_.image_format()) { + case RegionFlowComputationOptions::FORMAT_RGB: + case RegionFlowComputationOptions::FORMAT_BGR: + curr_color_image_.reset( + new cv::Mat(frame_height_, frame_width_, CV_8UC3)); + break; + + case RegionFlowComputationOptions::FORMAT_RGBA: + case RegionFlowComputationOptions::FORMAT_BGRA: + curr_color_image_.reset( + new cv::Mat(frame_height_, frame_width_, CV_8UC4)); + break; + + case RegionFlowComputationOptions::FORMAT_GRAYSCALE: + // Do nothing. + break; + } + + if (options_.compute_blur_score()) { + corner_values_.reset(new cv::Mat(frame_height_, frame_width_, CV_32F)); + corner_filtered_.reset(new cv::Mat(frame_height_, frame_width_, CV_32F)); + corner_mask_.reset(new cv::Mat(frame_height_, frame_width_, CV_8U)); + } + + max_long_track_length_ = 1; + switch (options_.tracking_options().tracking_policy()) { + case TrackingOptions::POLICY_SINGLE_FRAME: + if (options_.tracking_options().multi_frames_to_track() > 1) { + LOG(ERROR) << "TrackingOptions::multi_frames_to_track is > 1, " + << "but tracking_policy is set to POLICY_SINGLE_FRAME. " + << "Consider using POLICY_MULTI_FRAME instead."; + } + + frames_to_track_ = 1; + break; + case TrackingOptions::POLICY_MULTI_FRAME: + CHECK_GT(options_.tracking_options().multi_frames_to_track(), 0); + frames_to_track_ = options_.tracking_options().multi_frames_to_track(); + break; + case TrackingOptions::POLICY_LONG_TRACKS: + if (options_.tracking_options().multi_frames_to_track() > 1) { + LOG(ERROR) << "TrackingOptions::multi_frames_to_track is > 1, " + << "but tracking_policy is set to POLICY_LONG_TRACKS. " + << "Use TrackingOptions::long_tracks_max_frames to set " + << "length of long feature tracks."; + } + + if (options_.tracking_options().internal_tracking_direction() != + TrackingOptions::FORWARD) { + LOG(ERROR) << "Long tracks are only supported if tracking direction " + << "is set to FORWARD. Adjusting direction to FORWARD. 
" + << "This does not affect the expected " + << "output_flow_direction"; + options_.mutable_tracking_options()->set_internal_tracking_direction( + TrackingOptions::FORWARD); + } + + frames_to_track_ = 1; + max_long_track_length_ = + options_.tracking_options().long_tracks_max_frames(); + long_track_data_.reset(new LongTrackData()); + break; + } + + CHECK(!options_.gain_correction() || !IsVerifyLongFeatures()) + << "Gain correction mode with verification of long features is not " + << "supported."; + + // Tracking algorithm dependent on cv support and flag. + use_cv_tracking_ = options_.tracking_options().use_cv_tracking_algorithm(); +#if CV_MAJOR_VERSION != 3 + if (use_cv_tracking_) { + LOG(WARNING) << "Compiled without OpenCV 3.0 but cv_tracking_algorithm " + << "was requested. Falling back to older algorithm"; + use_cv_tracking_ = false; + } +#endif + + if (options_.gain_correction()) { + gain_image_.reset(new cv::Mat(frame_height_, frame_width_, CV_8UC1)); + if (!use_cv_tracking_) { + gain_pyramid_.reset(new cv::Mat()); + AllocatePyramid(frame_width_, frame_height_, gain_pyramid_.get()); + } + } + + // Determine number of levels at which to extract features. If lowest image + // size for extraction is given, it overrides extraction levels. 
+ extraction_levels_ = options_.tracking_options().adaptive_extraction_levels(); + const int lowest_extraction_size = + options_.tracking_options().adaptive_extraction_levels_lowest_size(); + if (lowest_extraction_size > 0) { + const float frame_size = max(frame_width_, frame_height_); + extraction_levels_ = + 1 + std::ceil(std::log2(frame_size / lowest_extraction_size) - 0.01); + } + extraction_levels_ = max(1, extraction_levels_); + VLOG(1) << "Feature extraction will be done over " << extraction_levels_ + << " levels, starting at size (width, height): (" << frame_width_ + << ", " << frame_height_ << ")"; + + feature_tmp_image_1_.reset(new cv::Mat(frame_height_, frame_width_, CV_32F)); + feature_tmp_image_2_.reset(new cv::Mat(frame_height_, frame_width_, CV_32F)); + + // Allocate feature point arrays. + max_features_ = options_.tracking_options().max_features(); + + // Compute number of pyramid levels. + float track_distance = + hypot(frame_width_, frame_height_) * + options_.tracking_options().fractional_tracking_distance(); + pyramid_levels_ = PyramidLevelsFromTrackDistance(track_distance); + VLOG(1) << "Using pyramid levels: " << pyramid_levels_; + + // Compute settings for block based flow. + const float block_size = options_.fast_estimation_block_size(); + CHECK_GT(block_size, 0) << "Need positive block size"; + + block_width_ = block_size < 1 ? block_size * original_width_ : block_size; + block_height_ = block_size < 1 ? block_size * original_height_ : block_size; + // Ensure block_[width|height] is not zero. + block_width_ = max(1, block_width_); + block_height_ = max(1, block_height_); + + // Compute block pyramid levels. 
+ double min_block_dim = max(block_width_, block_height_); + + // Choose last_level such that + // min_block_dim * 0.5^(last_level - 1) = min_block_size + double last_level = + (log(static_cast(options_.fast_estimation_min_block_size())) - + log(min_block_dim)) / + log(0.5) + + 1; + block_levels_ = max(2.0, floor(last_level)); + + Reset(); +} + +RegionFlowComputation::~RegionFlowComputation() {} + +bool RegionFlowComputation::AddImage(const cv::Mat& source, + int64 timestamp_usec) { + return AddImageAndTrack(source, cv::Mat(), timestamp_usec, Homography()); +} + +bool RegionFlowComputation::AddImageWithSeed( + const cv::Mat& source, int64 timestamp_usec, + const Homography& initial_transform) { + return AddImageAndTrack(source, cv::Mat(), timestamp_usec, initial_transform); +} + +bool RegionFlowComputation::AddImageWithMask(const cv::Mat& source, + const cv::Mat& source_mask, + int64 timestamp_usec) { + return AddImageAndTrack(source, source_mask, timestamp_usec, Homography()); +} + +RegionFlowFeatureList* RegionFlowComputation::RetrieveRegionFlowFeatureList( + bool compute_feature_descriptor, bool compute_match_descriptor, + const cv::Mat* curr_color_image, const cv::Mat* prev_color_image) { + return (RetrieveRegionFlowFeatureListImpl( + 0, compute_feature_descriptor, compute_match_descriptor, + curr_color_image ? curr_color_image : nullptr, + prev_color_image ? 
prev_color_image : nullptr) + .release()); +} + +RegionFlowFrame* RegionFlowComputation::RetrieveRegionFlow() { + return RetrieveMultiRegionFlow(0); +} + +std::unique_ptr +RegionFlowComputation::RetrieveRegionFlowFeatureListImpl( + int track_index, bool compute_feature_descriptor, + bool compute_match_descriptor, const cv::Mat* curr_color_image, + const cv::Mat* prev_color_image) { + CHECK_GT(region_flow_results_.size(), track_index); + CHECK(region_flow_results_[track_index].get()); + + std::unique_ptr feature_list( + std::move(region_flow_results_[track_index])); + + if (compute_feature_descriptor) { + CHECK(curr_color_image != nullptr); + CHECK_EQ(3, curr_color_image->channels()); + if (compute_match_descriptor) { + CHECK(prev_color_image != nullptr); + CHECK_EQ(3, prev_color_image->channels()); + } + + ComputeRegionFlowFeatureDescriptors( + *curr_color_image, + compute_match_descriptor ? prev_color_image : nullptr, + options_.patch_descriptor_radius(), feature_list.get()); + } else { + CHECK(!compute_match_descriptor) << "Set compute_feature_descriptor also " + << "if setting compute_match_descriptor"; + } + + return feature_list; +} + +RegionFlowFrame* RegionFlowComputation::RetrieveMultiRegionFlow(int frame) { + std::unique_ptr feature_list( + RetrieveRegionFlowFeatureListImpl(frame, + false, // No descriptors. + false, // No match descriptors. + nullptr, nullptr)); + + std::unique_ptr flow_frame(new RegionFlowFrame()); + + RegionFlowFeatureListToRegionFlow(*feature_list, flow_frame.get()); + return flow_frame.release(); +} + +RegionFlowFeatureList* +RegionFlowComputation::RetrieveMultiRegionFlowFeatureList( + int track_index, bool compute_feature_descriptor, + bool compute_match_descriptor, const cv::Mat* curr_color_image, + const cv::Mat* prev_color_image) { + return (RetrieveRegionFlowFeatureListImpl( + track_index, compute_feature_descriptor, compute_match_descriptor, + curr_color_image ? curr_color_image : nullptr, + prev_color_image ? 
prev_color_image : nullptr) + .release()); +} + +bool RegionFlowComputation::InitFrame(const cv::Mat& source, + const cv::Mat& source_mask, + FrameTrackingData* data) { + // Destination frame, CV_8U grayscale of dimension frame_width_ x + // frame_height_. + cv::Mat& dest_frame = data->frame; + cv::Mat& dest_mask = data->mask; + + // Do we need to downsample image? + const cv::Mat* source_ptr = &source; + if (use_downsampling_ && + options_.downsample_mode() != + RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) { + // Area based method best for downsampling. + // For color images to temporary buffer. + cv::Mat& resized = source.channels() == 1 ? dest_frame : *curr_color_image_; + cv::resize(source, resized, resized.size(), 0, 0, CV_INTER_AREA); + source_ptr = &resized; + // Resize feature extraction mask if needed. + if (!source_mask.empty()) { + dest_mask.create(resized.rows, resized.cols, CV_8UC1); + cv::resize(source_mask, dest_mask, dest_mask.size(), 0, 0, CV_INTER_NN); + } + } else if (!source_mask.empty()) { + source_mask.copyTo(dest_mask); + } + + // Stores as tiny frame before color conversion if requested. + const auto& visual_options = options_.visual_consistency_options(); + if (visual_options.compute_consistency()) { + // Allocate tiny image. + const int type = source_ptr->type(); + const int dimension = visual_options.tiny_image_dimension(); + data->tiny_image.create(dimension, dimension, type); + cv::resize(*source_ptr, data->tiny_image, data->tiny_image.size(), 0, 0, + CV_INTER_AREA); + } + + if (source_ptr->channels() == 1 && + options_.image_format() != + RegionFlowComputationOptions::FORMAT_GRAYSCALE) { + options_.set_image_format(RegionFlowComputationOptions::FORMAT_GRAYSCALE); + LOG(WARNING) << "#channels = 1, but image_format was not set to " + "FORMAT_GRAYSCALE. Assuming GRAYSCALE input."; + } + + // Convert image to grayscale. 
+ switch (options_.image_format()) { + case RegionFlowComputationOptions::FORMAT_RGB: + if (3 != source_ptr->channels()) { + LOG(ERROR) << "Expecting 3 channel input for RGB."; + return false; + } + cv::cvtColor(*source_ptr, dest_frame, CV_RGB2GRAY); + break; + + case RegionFlowComputationOptions::FORMAT_BGR: + if (3 != source_ptr->channels()) { + LOG(ERROR) << "Expecting 3 channel input for BGR."; + return false; + } + cv::cvtColor(*source_ptr, dest_frame, CV_BGR2GRAY); + break; + + case RegionFlowComputationOptions::FORMAT_RGBA: + if (4 != source_ptr->channels()) { + LOG(ERROR) << "Expecting 4 channel input for RGBA."; + return false; + } + cv::cvtColor(*source_ptr, dest_frame, CV_RGBA2GRAY); + break; + + case RegionFlowComputationOptions::FORMAT_BGRA: + if (4 != source_ptr->channels()) { + LOG(ERROR) << "Expecting 4 channel input for BGRA."; + return false; + } + cv::cvtColor(*source_ptr, dest_frame, CV_BGRA2GRAY); + break; + + case RegionFlowComputationOptions::FORMAT_GRAYSCALE: + if (1 != source_ptr->channels()) { + LOG(ERROR) << "Expecting 1 channel input for GRAYSCALE."; + return false; + } + CHECK_EQ(1, source_ptr->channels()); + if (source_ptr != &dest_frame) { + source_ptr->copyTo(dest_frame); + } + break; + } + + // Do histogram equalization. + if (options_.histogram_equalization()) { + cv::equalizeHist(dest_frame, dest_frame); + } + + // Compute mean for gain correction. + if (options_.gain_correction()) { + data->mean_intensity = cv::mean(dest_frame)[0]; + } + + // Consistency checks; not input governed. 
+ CHECK_EQ(dest_frame.cols, frame_width_); + CHECK_EQ(dest_frame.rows, frame_height_); + + data->BuildPyramid(pyramid_levels_, + options_.tracking_options().tracking_window_size(), + options_.compute_derivative_in_pyramid()); + + return true; +} + +bool RegionFlowComputation::AddImageAndTrack( + const cv::Mat& source, const cv::Mat& source_mask, int64 timestamp_usec, + const Homography& initial_transform) { + VLOG(1) << "Processing frame " << frame_num_ << " at " << timestamp_usec; + MEASURE_TIME << "AddImageAndTrack"; + + if (options_.downsample_mode() == + RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) { + if (frame_width_ != source.cols || frame_height_ != source.rows) { + LOG(ERROR) << "Source input dimensions incompatible with " + << "DOWNSAMPLE_TO_INPUT_SIZE. frame_width_: " << frame_width_ + << ", source.cols: " << source.cols + << ", frame_height_: " << frame_height_ + << ", source.rows: " << source.rows; + return false; + } + + if (!source_mask.empty()) { + if (frame_width_ != source_mask.cols || + frame_height_ != source_mask.rows) { + LOG(ERROR) << "Input mask dimensions incompatible with " + << "DOWNSAMPLE_TO_INPUT_SIZE"; + return false; + } + } + } else { + if (original_width_ != source.cols || original_height_ != source.rows) { + LOG(ERROR) << "Source input dimensions differ from those specified " + << "in the constructor"; + return false; + } + if (!source_mask.empty()) { + if (original_width_ != source_mask.cols || + original_height_ != source_mask.rows) { + LOG(ERROR) << "Input mask dimensions incompatible with those " + << "specified in the constructor"; + return false; + } + } + } + + // Create data queue element for current frame. If queue is full, reuse the + // data field from front in a circular buffer, otherwise add a new one. 
+ if (data_queue_.size() > frames_to_track_) { + data_queue_.push_back(std::move(data_queue_.front())); + data_queue_.pop_front(); + } else { + data_queue_.push_back(MakeUnique(new FrameTrackingData( + frame_width_, frame_height_, extraction_levels_, use_cv_tracking_))); + } + + FrameTrackingData* curr_data = data_queue_.back().get(); + curr_data->Reset(frame_num_, timestamp_usec); + + if (!IsModelIdentity(initial_transform)) { + CHECK_EQ(1, frames_to_track_) << "Initial transform is not supported " + << "for multi frame tracking"; + Homography transform = initial_transform; + if (downsample_scale_ != 1) { + const float scale = 1.0f / downsample_scale_; + transform = CoordinateTransform(initial_transform, scale); + } + curr_data->initial_transform.reset(new Homography(transform)); + } + + if (!InitFrame(source, source_mask, curr_data)) { + LOG(ERROR) << "Could not init frame."; + return false; + } + + // Precompute blur score from original (not pre-blurred) frame. + cv::Mat& curr_frame = curr_data->frame; + curr_blur_score_ = + options_.compute_blur_score() ? ComputeBlurScore(curr_frame) : -1; + + if (options_.pre_blur_sigma() > 0) { + cv::GaussianBlur(curr_frame, curr_frame, cv::Size(0, 0), + options_.pre_blur_sigma(), options_.pre_blur_sigma()); + } + + // By default, create empty region flows for as many frames as we want to + // track. + region_flow_results_.clear(); + for (int i = 0; i < frames_to_track_; ++i) { + region_flow_results_.push_back(MakeUnique(new RegionFlowFeatureList())); + InitializeRegionFlowFeatureList(region_flow_results_.back().get()); + } + + // Do we have enough frames to start tracking? + const bool synthetic_tracks = + options_.use_synthetic_zero_motion_tracks_all_frames() || + (frame_num_ == 0 && + options_.use_synthetic_zero_motion_tracks_first_frame()); + + int curr_frames_to_track = frames_to_track_; + // Update frames-to-track to match actual frames that we can track. 
+ if (!synthetic_tracks) { + curr_frames_to_track = min(frame_num_, frames_to_track_); + } + + // Compute region flows for all frames being tracked. + const auto internal_flow_direction = + options_.tracking_options().internal_tracking_direction(); + const bool invert_flow = internal_flow_direction != + options_.tracking_options().output_flow_direction(); + + switch (internal_flow_direction) { + case TrackingOptions::FORWARD: + if (long_track_data_ != nullptr && curr_frames_to_track > 0) { + // Long feature tracking. + TrackedFeatureList curr_result; + ComputeRegionFlow(-1, 0, synthetic_tracks, invert_flow, + &long_track_data_->prev_result, &curr_result, + region_flow_results_[0].get()); + long_track_data_->prev_result.swap(curr_result); + } else { + // Track from the closest frame last, so that the last set of features + // updated in FrameTrackingData are from the closest one. + for (int i = curr_frames_to_track; i >= 1; --i) { + ComputeRegionFlow(-i, 0, synthetic_tracks, invert_flow, nullptr, + nullptr, region_flow_results_[i - 1].get()); + } + } + break; + case TrackingOptions::BACKWARD: + for (int i = 1; i <= curr_frames_to_track; ++i) { + if (!synthetic_tracks && i > 1) { + InitializeFeatureLocationsFromPreviousResult(-i + 1, -i); + } + ComputeRegionFlow(0, -i, synthetic_tracks, invert_flow, nullptr, + nullptr, region_flow_results_[i - 1].get()); + } + break; + case TrackingOptions::CONSECUTIVELY: + const bool invert_flow_forward = + TrackingOptions::FORWARD != + options_.tracking_options().output_flow_direction(); + const bool invert_flow_backward = !invert_flow_forward; + for (int i = curr_frames_to_track; i >= 1; --i) { + // Compute forward flow. + ComputeRegionFlow(-i, 0, synthetic_tracks, invert_flow_forward, nullptr, + nullptr, region_flow_results_[i - 1].get()); + if (region_flow_results_[i - 1]->unstable()) { + // If forward flow is unstable, compute backward flow. 
+ ComputeRegionFlow(0, -i, synthetic_tracks, invert_flow_backward, + nullptr, nullptr, + region_flow_results_[i - 1].get()); + } + } + break; + } + + if (frames_to_track_ == 1) { + const int num_features = region_flow_results_.front()->feature_size(); + if (frame_num_ == 0) { + curr_num_features_avg_ = num_features; + } else { + // Low pass filter number of current features. + constexpr float kAlpha = 0.3f; + curr_num_features_avg_ = + (1.0f - kAlpha) * curr_num_features_avg_ + kAlpha * num_features; + } + } + + ++frame_num_; + + return true; +} + +cv::Mat RegionFlowComputation::GetGrayscaleFrameFromResults() { + CHECK_GT(data_queue_.size(), 0) << "Empty queue, was AddImage* called?"; + FrameTrackingData* curr_data = data_queue_.back().get(); + CHECK(curr_data); + return curr_data->frame; +} + +void RegionFlowComputation::GetFeatureTrackInliers( + bool skip_estimation, TrackedFeatureList* features, + TrackedFeatureView* inliers) const { + CHECK(features != nullptr); + CHECK(inliers != nullptr); + inliers->clear(); + if (skip_estimation) { + inliers->reserve(features->size()); + for (auto& feature : *features) { + inliers->push_back(&feature); + } + } else { + ComputeBlockBasedFlow(features, inliers); + } +} + +float RegionFlowComputation::ComputeVisualConsistency( + FrameTrackingData* previous, FrameTrackingData* current) const { + CHECK_EQ(previous->frame_num + 1, current->frame_num); + const int total = previous->tiny_image.total(); + CHECK_GT(total, 0) << "Tiny image dimension set to zero."; + current->tiny_image_diff = + FrameDifferenceMedian(previous->tiny_image, current->tiny_image) * + (1.0f / total); + + return fabs(previous->tiny_image_diff - current->tiny_image_diff); +} + +void RegionFlowComputation::ComputeRegionFlow( + int from, int to, bool synthetic_tracks, bool invert_flow, + const TrackedFeatureList* prev_result, TrackedFeatureList* curr_result, + RegionFlowFeatureList* feature_list) { + MEASURE_TIME << "Compute RegionFlow."; + // feature_tracks 
should be in the outer scope since the inliers form a view + // on them (store pointers to features stored in feature_tracks). + TrackedFeatureList feature_tracks; + TrackedFeatureView feature_inliers; + + FrameTrackingData* data1 = nullptr; + FrameTrackingData* data2 = nullptr; + float frac_long_features_rejected = 0; + float visual_consistency = 0; + + if (synthetic_tracks) { + const float step = + options_.tracking_options().synthetic_zero_motion_grid_step(); + ZeroMotionGridTracks(original_width_, original_height_, step, step, + &feature_tracks); + GetFeatureTrackInliers(true /* skip_estimation */, &feature_tracks, + &feature_inliers); + } else { + const int index1 = data_queue_.size() + from - 1; + const int index2 = data_queue_.size() + to - 1; + CHECK_GE(index1, 0); + CHECK_LT(index1, data_queue_.size()); + CHECK_GE(index2, 0); + CHECK_LT(index2, data_queue_.size()); + data1 = data_queue_[index1].get(); + data2 = data_queue_[index2].get(); + + std::unique_ptr initial_transform; + if (index1 + 1 == index2) { + // Forward track, check if initial transform present. + if (data2->initial_transform != nullptr) { + initial_transform = absl::make_unique( + ModelInvert(*data2->initial_transform)); + } + } else if (index1 - 1 == index2) { + // Backward track, check if initial transform present. 
+ if (data1->initial_transform != nullptr) { + initial_transform.reset(new Homography(*data1->initial_transform)); + } + } + + if (std::abs(from - to) == 1 && + options_.visual_consistency_options().compute_consistency()) { + FrameTrackingData* later_data = data1; + FrameTrackingData* earlier_data = data2; + if (from < to) { + std::swap(later_data, earlier_data); + } + + visual_consistency = ComputeVisualConsistency(earlier_data, later_data); + } + + bool track_features = true; + bool force_feature_extraction_next_frame = false; + if (options_.tracking_options().wide_baseline_matching()) { + CHECK(initial_transform == nullptr) + << "Can't use wide baseline matching and initial transform as the " + << "same time."; + + WideBaselineMatchFeatures(data1, data2, &feature_tracks); + track_features = + options_.tracking_options().refine_wide_baseline_matches(); + if (track_features) { + initial_transform = absl::make_unique( + HomographyAdapter::Embed(AffineModelFromFeatures(&feature_tracks))); + feature_tracks.clear(); + } else { + GetFeatureTrackInliers(options_.no_estimation_mode(), &feature_tracks, + &feature_inliers); + } + } + + if (track_features) { + ExtractFeatures(prev_result, data1); + + if (initial_transform != nullptr) { + InitializeFeatureLocationsFromTransform(from, to, *initial_transform); + } + + // Compute tracks with gain correction if requested. + bool gain_correction = options_.gain_correction(); + const float triggering_ratio = + options_.gain_correction_triggering_ratio(); + if (options_.gain_correction() && triggering_ratio > 0) { + // Only compute gain if change in intensity across frames + // is sufficiently large. 
+ const float intensity_ratio = + std::max(data1->mean_intensity, data2->mean_intensity) / + (std::min(data1->mean_intensity, data2->mean_intensity) + 1e-6f); + gain_correction = intensity_ratio > triggering_ratio; + } + + const bool gain_hypotheses = + options_.gain_correction_multiple_hypotheses(); + + // Trigger feature extraction on next frame every time we perform + // gain correction. + force_feature_extraction_next_frame = gain_correction; + + // Backup FrameTrackingData if needed for reset when using multiple + // hypothesis. + std::unique_ptr wo_gain_data2; + if (gain_correction && gain_hypotheses) { + wo_gain_data2.reset(new FrameTrackingData(*data2)); + } + + TrackFeatures(data1, data2, &gain_correction, + &frac_long_features_rejected, &feature_tracks); + GetFeatureTrackInliers(options_.no_estimation_mode(), &feature_tracks, + &feature_inliers); + + // Second pass: If gain correction was successful and multiple hypotheses, + // are requested run again without it. + if (gain_correction && gain_hypotheses) { + // Re-run without gain correction. + TrackedFeatureList wo_gain_tracks; + TrackedFeatureView wo_gain_inliers; + + gain_correction = false; + TrackFeatures(data1, wo_gain_data2.get(), &gain_correction, nullptr, + &wo_gain_tracks); + GetFeatureTrackInliers(options_.no_estimation_mode(), &wo_gain_tracks, + &wo_gain_inliers); + + // Only use gain correction if it is better than tracking + // without gain correction, i.e gain correction should result in more + // inliers, at least by the specified fractional improvement. + const float improvement_weight = + 1.0f + options_.gain_correction_inlier_improvement_frac(); + const int gain_count = feature_inliers.size(); + const int wo_gain_count = wo_gain_inliers.size(); + if (gain_count < wo_gain_count * improvement_weight) { + // Reject gain result, insufficient improvement. Use result without + // gain correction instead. 
+ feature_tracks.swap(wo_gain_tracks); + feature_inliers.swap(wo_gain_inliers); + std::swap(*data2, *wo_gain_data2); + VLOG(1) << "Rejecting gain correction. Number of inliers with " + << "gain: " << gain_count + << ", without gain: " << wo_gain_count; + force_feature_extraction_next_frame = false; + } + } + } // end if track features. + + if (data1->num_original_extracted_and_tracked < 0) { + // Record initial track. + data1->num_original_extracted_and_tracked = feature_inliers.size(); + } + + if (force_feature_extraction_next_frame) { + data2->num_extracted_and_tracked = 0; + } else { + data2->num_extracted_and_tracked = feature_inliers.size(); + } + + // Forward initial number of tracked features. + data2->num_original_extracted_and_tracked = + data1->num_original_extracted_and_tracked; + } + + // Convert tracks to region flow. + if (invert_flow) { + InvertFeatureList(feature_tracks, &feature_tracks); + } + + const float flow_magnitude = TrackedFeatureViewToRegionFlowFeatureList( + feature_inliers, curr_result, feature_list); + + // Assign unique ids to the features. + for (auto& feature : *feature_list->mutable_feature()) { + feature.set_feature_id(++feature_count_); + } + + if (from != to) { + // Record average flow magnitude (normalized w.r.t. tracking distance in + // frames). + flow_magnitudes_.push_back(flow_magnitude / std::abs(from - to)); + const int kMaxMagnitudeRecords = 10; + // Limit to only most recent observations. + while (flow_magnitudes_.size() > kMaxMagnitudeRecords) { + flow_magnitudes_.pop_front(); + } + + // Adaptively size pyramid based on previous observations. + // 130% of previous maximum. + if (options_.tracking_options().adaptive_tracking_distance() && + flow_magnitudes_.size() > 2) { + pyramid_levels_ = PyramidLevelsFromTrackDistance( + *std::max_element(flow_magnitudes_.begin(), flow_magnitudes_.end()) * + 1.3f); + } + } + + // Check if sufficient features found, set corresponding flags. 
+ if (!HasSufficientFeatures(*feature_list)) { + feature_list->set_unstable(true); + // If region flow is unstable, then the tracked features in the "to" frame + // should not be relied upon for reuse. + if (data2 != nullptr) { + data2->ResetFeatures(); + } + } + + // Store additional information in feature_list. + feature_list->set_frac_long_features_rejected(frac_long_features_rejected); + feature_list->set_visual_consistency(visual_consistency); + if (invert_flow) { + if (data2 != nullptr) { + feature_list->set_timestamp_usec(data2->timestamp_usec); + } + } else { + if (data1 != nullptr) { + feature_list->set_timestamp_usec(data1->timestamp_usec); + } + } + + feature_list->set_match_frame((to - from) * (invert_flow ? -1 : 1)); +} + +// Resets computation by setting frame_num_ == 0. No need to clear other data +// structures, since previous frames are used only once frame_num_ > 0. +void RegionFlowComputation::Reset() { + frame_num_ = 0; + data_queue_.clear(); + flow_magnitudes_.clear(); +} + +namespace { + +struct FloatPointerComparator { + bool operator()(const float* lhs, const float* rhs) const { + return *lhs > *rhs; + } +}; + +// Invoker for ParallelFor. Needs to be copyable. +// Extracts features from a 2nd moment gradient response image (eig_image) +// by grid-based thresholding (removing feature responses below +// local_quality_level * maximum cell value or lowest_quality_level) and +// non-maxima suppression via dilation. Results are output in corner_pointers +// and partially sorted (limited to max_cell_features, highest first). 
+class GridFeatureLocator { + public: + GridFeatureLocator(int frame_width, int frame_height, int block_width, + int block_height, int bins_per_row, + float local_quality_level, float lowest_quality_level, + int max_cell_features, + std::vector>* corner_pointers, + cv::Mat* eig_image, cv::Mat* tmp_image) + : frame_width_(frame_width), + frame_height_(frame_height), + block_width_(block_width), + block_height_(block_height), + bins_per_row_(bins_per_row), + local_quality_level_(local_quality_level), + lowest_quality_level_(lowest_quality_level), + max_cell_features_(max_cell_features), + corner_pointers_(corner_pointers), + eig_image_(eig_image), + tmp_image_(tmp_image) {} + + void operator()(const BlockedRange2D& range) const { + // Iterate over views. + for (int bin_y = range.rows().begin(); bin_y != range.rows().end(); + ++bin_y) { + for (int bin_x = range.cols().begin(); bin_x != range.cols().end(); + ++bin_x) { + const int view_x = bin_x * block_width_; + const int view_y = bin_y * block_height_; + const int view_end_x = min(frame_width_, (bin_x + 1) * block_width_); + const int view_end_y = min(frame_height_, (bin_y + 1) * block_height_); + + // Guarantee at least one pixel in each dimension. + if (view_x >= view_end_x || view_y >= view_end_y) { + continue; + } + + cv::Mat eig_view(*eig_image_, cv::Range(view_y, view_end_y), + cv::Range(view_x, view_end_x)); + cv::Mat tmp_view(*tmp_image_, cv::Range(view_y, view_end_y), + cv::Range(view_x, view_end_x)); + + // Ignore features below quality level. + double maximum = 0; + cv::minMaxLoc(eig_view, nullptr, &maximum, nullptr, nullptr); + double lowest_quality = std::max(maximum * local_quality_level_, + lowest_quality_level_); + + // Copy borders that do not get dilated below. 
+ cv::Rect borders[4] = { + cv::Rect(0, 0, eig_view.cols, 1), // top + cv::Rect(0, 0, 1, eig_view.rows), // left + cv::Rect(0, eig_view.rows - 1, eig_view.cols, 1), // bottom + cv::Rect(eig_view.cols - 1, 0, 1, eig_view.rows)}; + + for (int k = 0; k < 4; ++k) { + cv::Mat dst_view(tmp_view, borders[k]); + cv::Mat(eig_view, borders[k]).copyTo(dst_view); + } + + // Non-maxima suppression. + if (tmp_view.rows > 2 && tmp_view.cols > 2) { + // Remove border from dilate, as cv::dilate reads out of + // bound values otherwise. + cv::Mat dilate_src = + cv::Mat(eig_view, cv::Range(1, eig_view.rows - 1), + cv::Range(1, eig_view.cols - 1)); + + cv::Mat dilate_dst = + cv::Mat(tmp_view, cv::Range(1, tmp_view.rows - 1), + cv::Range(1, tmp_view.cols - 1)); + cv::Mat kernel(3, 3, CV_32F); + kernel.setTo(1.0); + cv::dilate(dilate_src, dilate_dst, kernel); + } + + const int grid_pos = bin_y * bins_per_row_ + bin_x; + std::vector& grid_cell = (*corner_pointers_)[grid_pos]; + + // Iterate over view in image domain as we store feature location + // pointers w.r.t. original frame. + for (int i = view_y; i < view_end_y; ++i) { + const float* tmp_ptr = tmp_image_->ptr(i); + const float* eig_ptr = eig_image_->ptr(i); + for (int j = view_x; j < view_end_x; ++j) { + const float max_supp_value = tmp_ptr[j]; + if (max_supp_value > lowest_quality && + max_supp_value == eig_ptr[j]) { + // This is a local maxima -> store in list. 
+ grid_cell.push_back(eig_ptr + j); + } + } + } + + const int level_max_elems = + min(max_cell_features_, grid_cell.size()); + std::partial_sort(grid_cell.begin(), + grid_cell.begin() + level_max_elems, grid_cell.end(), + FloatPointerComparator()); + } + } + } + + private: + int frame_width_; + int frame_height_; + int block_width_; + int block_height_; + int bins_per_row_; + float local_quality_level_; + float lowest_quality_level_; + int max_cell_features_; + std::vector>* corner_pointers_; + cv::Mat* eig_image_; + cv::Mat* tmp_image_; +}; + +// Sets (2 * N + 1) x (2 * N + 1) neighborhood of the passed mask to K +// or adds K to the existing mask if add is set to true. +template +inline void SetMaskNeighborhood(int mask_x, int mask_y, cv::Mat* mask) { + DCHECK_EQ(mask->type(), CV_8U); + const int mask_start_x = max(0, mask_x - N); + const int mask_end_x = min(mask->cols - 1, mask_x + N); + const int mask_dx = mask_end_x - mask_start_x + 1; + const int mask_start_y = max(0, mask_y - N); + const int mask_end_y = min(mask->rows - 1, mask_y + N); + DCHECK_LE(mask_start_x, mask_end_x); + DCHECK_LE(mask_start_y, mask_end_y); + + if (!add) { + for (int i = mask_start_y; i <= mask_end_y; ++i) { + uint8* mask_ptr = mask->ptr(i) + mask_start_x; + memset(mask_ptr, K, mask_dx * sizeof(*mask_ptr)); + } + } else { + for (int i = mask_start_y; i <= mask_end_y; ++i) { + uint8* mask_ptr = mask->ptr(i); + for (int j = mask_start_x; j <= mask_end_x; ++j) { + mask_ptr[j] = (mask_ptr[j] & 0x7F) + K; // Limit to 128. + } + } + } +} + +} // namespace. 
+ +void RegionFlowComputation::AdaptiveGoodFeaturesToTrack( + const std::vector& extraction_pyramid, int max_features, + float mask_scale, cv::Mat* mask, FrameTrackingData* data) { + CHECK(data != nullptr); + CHECK(feature_tmp_image_1_.get() != nullptr); + CHECK(feature_tmp_image_2_.get() != nullptr); + + cv::Mat* eig_image = feature_tmp_image_1_.get(); + cv::Mat* tmp_image = feature_tmp_image_2_.get(); + + const auto& tracking_options = options_.tracking_options(); + + // Setup grid information. + const float block_size = tracking_options.adaptive_features_block_size(); + CHECK_GT(block_size, 0) << "Need positive block size"; + + int block_width = block_size < 1 ? block_size * frame_width_ : block_size; + int block_height = block_size < 1 ? block_size * frame_height_ : block_size; + + // Ensure valid block width and height regardless of settings. + block_width = max(1, block_width); + block_height = max(1, block_height); + + bool use_harris = tracking_options.corner_extraction_method() == + TrackingOptions::EXTRACTION_HARRIS; + + const int adaptive_levels = + options_.tracking_options().adaptive_features_levels(); + + // For Harris negative values are possible, so lowest_quality level + // is being ignored. + const float lowest_quality_level = + use_harris ? -100.0f + : tracking_options.min_eig_val_settings() + .adaptive_lowest_quality_level(); + + const float local_quality_level = + use_harris + ? tracking_options.harris_settings().feature_quality_level() + : tracking_options.min_eig_val_settings().feature_quality_level(); + + bool use_fast = tracking_options.corner_extraction_method() == + TrackingOptions::EXTRACTION_FAST; + cv::Ptr fast_detector; + if (use_fast) { + fast_detector = cv::FastFeatureDetector::create( + tracking_options.fast_settings().threshold()); + } + + // Extract features at multiple scales and adaptive block sizes. 
+ for (int e = 0, step = 1; e < extraction_pyramid.size(); ++e) { + if (data->features.size() >= max_features) { + break; + } + + const cv::Mat& image = extraction_pyramid[e]; + const int rows = image.rows; + const int cols = image.cols; + + // Compute corner response. + constexpr int kBlockSize = 3; + constexpr double kHarrisK = 0.04; // Harris magical constant as + // set by OpenCV. + std::vector fast_keypoints; + if (e == 0) { + MEASURE_TIME << "Corner extraction"; + CHECK_EQ(rows, frame_height_); + CHECK_EQ(cols, frame_width_); + + if (use_fast) { + fast_detector->detect(image, fast_keypoints); + } else if (use_harris) { + cv::cornerHarris(image, *eig_image, kBlockSize, kBlockSize, kHarrisK); + } else { + cv::cornerMinEigenVal(image, *eig_image, kBlockSize); + } + } else { + // Compute corner response on a down-scaled image and upsample. + step *= 2; + CHECK_EQ(rows, (extraction_pyramid[e - 1].rows + 1) / 2); + CHECK_EQ(cols, (extraction_pyramid[e - 1].cols + 1) / 2); + + if (use_fast) { + fast_detector->detect(image, fast_keypoints); + + for (int j = 0; j < fast_keypoints.size(); ++j) { + fast_keypoints[j].pt.x *= step; + fast_keypoints[j].pt.y *= step; + } + } else { + // Use tmp_image to compute eigen-values on resized images. + cv::Mat eig_view(*tmp_image, cv::Range(0, rows), cv::Range(0, cols)); + + if (use_harris) { + cv::cornerHarris(image, eig_view, kBlockSize, kBlockSize, kHarrisK); + } else { + cv::cornerMinEigenVal(image, eig_view, kBlockSize); + } + + // Upsample (without interpolation) eig_view to match frame size. + eig_image->setTo(0); + for (int r = 0, r_up = 0; r < rows && r_up < frame_height_; + ++r, r_up += step) { + const float* ptr = eig_view.ptr(r); + float* up_ptr = eig_image->ptr(r_up); + for (int c = 0, c_up = 0; c < cols && c_up < frame_width_; + ++c, c_up += step) { + up_ptr[c_up] = ptr[c]; + } + } + } // end if use_fast + } // end if e. + + if (use_fast) { + // TODO: Perform grid based feature detection. 
+ std::sort(fast_keypoints.begin(), fast_keypoints.end(), + [](const cv::KeyPoint& a, const cv::KeyPoint& b) { + return b.response < a.response; + }); + + for (int j = 0; j < fast_keypoints.size(); ++j) { + const int corner_y = fast_keypoints[j].pt.y; + const int corner_x = fast_keypoints[j].pt.x; + const int mask_x = corner_x * mask_scale; + const int mask_y = corner_y * mask_scale; + + // Test if neighboring element is already set. + if (mask->at(mask_y, mask_x) >= 1) { + continue; + } + + SetMaskNeighborhood<2, 1, false>(mask_x, mask_y, mask); + + // `e` stands for which pyramid layer this feature is extracted from. + data->AddFeature(cv::Point2f(corner_x, corner_y), + min(1.0f, fast_keypoints[j].response), e, + -1, // No track id assigned yet. Only assign ids + // to successfully tracked features. + nullptr); + } + } else { + // Iterate over adaptive pyramid levels. + int level_width = block_width; + int level_height = block_height; + for (int level = 0; level < adaptive_levels; ++level) { + // Perform grid based threshold and non-maxima suppression. + const int bins_per_column = + std::ceil(static_cast(frame_height_) / level_height); + const int bins_per_row = + std::ceil(static_cast(frame_width_) / level_width); + const int num_bins = bins_per_row * bins_per_column; + const int level_max_features = max_features - data->features.size(); + if (level_max_features < 0) { + break; + } + + std::vector> corner_pointers(num_bins); + for (int k = 0; k < num_bins; ++k) { + corner_pointers[k].reserve(level_max_features); + } + + GridFeatureLocator locator( + frame_width_, frame_height_, level_width, level_height, + bins_per_row, local_quality_level, lowest_quality_level, + level_max_features, &corner_pointers, eig_image, tmp_image); + + ParallelFor2D(0, bins_per_column, 0, bins_per_row, 1, locator); + + // Round robin across bins, add one feature per bin, until + // max_features is hit. 
+ bool more_features_available = true; + + // Index of next to be processed corner in corner_points[k] array. + std::vector corner_index(num_bins, 0); + while (more_features_available && + data->features.size() < max_features) { + more_features_available = false; + + // Add one feature per bin (stratified sampling). + for (int k = 0; k < num_bins; ++k) { + if (corner_index[k] >= corner_pointers[k].size()) { + continue; + } + + const float* corner_ptr = corner_pointers[k][corner_index[k]]; + // Advance. + ++corner_index[k]; + if (corner_index[k] < corner_pointers[k].size() - 1) { + more_features_available = true; + } + + // Map corner pointer to x and y location. + const int offset = reinterpret_cast(corner_ptr) - + eig_image->ptr(0); + + const int corner_y = offset / eig_image->step[0]; + const int corner_x = + (offset - eig_image->step[0] * corner_y) / sizeof(*corner_ptr); + + // Ensure corner is at least 2 pixel away from boundary. + if (corner_x < 2 || corner_x > frame_width_ - 2 || corner_y < 2 || + corner_y > frame_height_ - 2) { + continue; + } + + const int mask_x = corner_x * mask_scale; + const int mask_y = corner_y * mask_scale; + + // Test if neighboring element is already set. + if (mask->at(mask_y, mask_x) >= 1) { + continue; + } + + SetMaskNeighborhood<2, 1, false>(mask_x, mask_y, mask); + + // `e` stands for which pyramid layer the feature is extracted from. + data->AddFeature( + cv::Point2f(corner_x, corner_y), + min(1.0f, *corner_ptr * options_.corner_response_scale()), e, + -1, // No track id assigned yet. Only assign ids + // to successfully tracked features. + nullptr); + } // end bins. + } // end while. + + if (level + 1 < adaptive_levels) { + level_width = (level_width + 1) / 2; + level_height = (level_height + 1) / 2; + } + } // end adaptive level. + } // end use_fast + } // end extraction level. + + // If adaptive_levels or extraction_levels > 1, for 2nd or larger level, we + // can potentially add corners above the max_features threshold. 
In this case + // truncation performs well without having to resort to sorting the features, + // as 2nd or larger level add features of lower corner response that pass the + // locality test (as the neighborhood size decreases). + if (data->features.size() > max_features) { + data->features.resize(max_features); + data->corner_responses.resize(max_features); + data->octaves.resize(max_features); + data->track_ids.resize(max_features); + } +} + +AffineModel RegionFlowComputation::AffineModelFromFeatures( + TrackedFeatureList* features) const { + CHECK(features != nullptr); + + // Downscaled domain as output. + MotionEstimation motion_estimation(MotionEstimationOptions(), frame_width_, + frame_height_); + + RegionFlowFrame region_flow; + region_flow.set_frame_width(original_width_); + region_flow.set_frame_height(original_height_); + + TrackedFeatureView feature_view; + ComputeBlockBasedFlow(features, &feature_view); + + RegionFlowFeatureList feature_list; + TrackedFeatureViewToRegionFlowFeatureList(feature_view, nullptr, + &feature_list); + + return FitAffineModel(feature_list); +} + +void RegionFlowComputation::ZeroMotionGridFeatures( + int frame_width, int frame_height, float frac_grid_step_x, + float frac_grid_step_y, RegionFlowFeatureList* result) { + CHECK(result != nullptr); + result->Clear(); + + TrackedFeatureList features; + const int border_dist = ZeroMotionGridTracks( + frame_width, frame_height, frac_grid_step_x, frac_grid_step_y, &features); + + result->set_frame_width(frame_width); + result->set_frame_height(frame_height); + result->set_distance_from_border(border_dist); + + for (const auto& feature : features) { + RegionFlowFeature* new_feature = result->add_feature(); + new_feature->set_x(feature.point.x()); + new_feature->set_y(feature.point.y()); + new_feature->set_dx(feature.flow.x()); + new_feature->set_dy(feature.flow.y()); + } +} + +void RegionFlowComputation::DenseZeroMotionSamples( + int frame_width, int frame_height, float frac_diameter, 
float frac_steps_x, + float frac_steps_y, RegionFlowFeatureList* result) { + CHECK(result != nullptr); + + // Ensure patch fits into frame. + const int radius = + max(1, min(min(frame_width / 2 - 1, frame_height / 2 - 1), + hypot(frame_width, frame_height) * frac_diameter) / + 2); + result->Clear(); + result->set_frame_width(frame_width); + result->set_frame_height(frame_height); + result->set_distance_from_border(radius); + + const int start = radius; + const int end_y = frame_height - radius; + const int end_x = frame_width - radius; + + const int steps_x = max(1, frame_width * frac_steps_x); + const int steps_y = max(1, frame_height * frac_steps_y); + for (int y = start; y < end_y; y += steps_y) { + for (int x = start; x < end_x; x += steps_x) { + RegionFlowFeature* new_feature = result->add_feature(); + new_feature->set_x(x); + new_feature->set_y(y); + new_feature->set_dx(0); + new_feature->set_dy(0); + } + } +} + +namespace { + +bool PointOutOfBound(const Vector2_f& point, int frame_width, + int frame_height) { + if (point.x() < 0 || point.y() < 0 || point.x() > frame_width - 1 || + point.y() > frame_height - 1) { + return true; + } + return false; +} + +} // namespace. + +int RegionFlowComputation::ZeroMotionGridTracks(int frame_width, + int frame_height, + float frac_grid_step_x, + float frac_grid_step_y, + TrackedFeatureList* results) { + CHECK(results); + auto& tracked_features = *results; + tracked_features.clear(); + + const int grid_step_x = + max(1, static_cast(frac_grid_step_x * frame_width)); + const int grid_step_y = + max(1, static_cast(frac_grid_step_y * frame_height)); + + const int num_features_x = (frame_width - 1) / grid_step_x; + const int num_features_y = (frame_height - 1) / grid_step_y; + const int max_features = num_features_x * num_features_y; + + // Only track features in one frame for synthetic features. 
+ tracked_features.reserve(max_features); + const int border_dist_x = grid_step_x / 2; + const int border_dist_y = grid_step_y / 2; + for (int i = 0, y = border_dist_y; i < num_features_y; + ++i, y += grid_step_y) { + for (int j = 0, x = border_dist_x; j < num_features_x; + ++j, x += grid_step_x) { + TrackedFeature tracked_feature(Vector2_f(x, y), Vector2_f(0, 0), 0.0f, + 0.0f, + -1); // No track id assigned. + tracked_features.push_back(tracked_feature); + } + } + + return min(border_dist_x, border_dist_y); +} + +bool RegionFlowComputation::GainCorrectFrame(const cv::Mat& reference_frame, + const cv::Mat& input_frame, + float reference_mean, + float input_mean, + cv::Mat* calibrated_frame) const { + CHECK(calibrated_frame); + CHECK_EQ(reference_frame.rows, input_frame.rows); + CHECK_EQ(reference_frame.cols, input_frame.cols); + + // Do not attempt gain correction for tiny images. + if (std::min(reference_frame.rows, reference_frame.cols) < 10) { + VLOG(1) << "Tiny image, aborting gain correction."; + return false; + } + + GainBiasModel gain_bias; + if (options_.fast_gain_correction()) { + const int kMinMean = 5; + if (input_mean < kMinMean) { + return false; // Badly exposed. + } + const float gain = reference_mean / input_mean; + if (gain < options_.gain_bias_bounds().lower_gain() || + gain > options_.gain_bias_bounds().upper_gain()) { + return false; // Unstable: Out of bound. + } + + gain_bias.set_gain_c1(gain); + } + + constexpr float kMaxFastGain = 1.12f; + if (!options_.fast_gain_correction() || gain_bias.gain_c1() > kMaxFastGain) { + // Estimate tone change w.r.t. reference_frame. 
+ RegionFlowFeatureList zero_features; + DenseZeroMotionSamples( + frame_width_, frame_height_, options_.frac_gain_feature_size(), + options_.frac_gain_step(), options_.frac_gain_step(), &zero_features); + + ClipMask<1> reference_mask; + ClipMask<1> input_mask; + ToneEstimation::ComputeClipMask(ClipMaskOptions(), reference_frame, + &reference_mask); + + ToneEstimation::ComputeClipMask(ClipMaskOptions(), input_frame, + &input_mask); + + ColorToneMatches tone_matches; + ToneMatchOptions tone_match_options; + tone_match_options.set_patch_radius(zero_features.distance_from_border() - + 1); + + // Nothing to extract. + if (tone_match_options.patch_radius() < 1) { + VLOG(1) << "Patch radius is < 1, aborting gain correction."; + return false; + } + + ToneEstimation::ComputeToneMatches<1>( + tone_match_options, zero_features, input_frame, reference_frame, + input_mask, reference_mask, &tone_matches); + + // Only attempt estimation if not too much frame area is clipped. + if (tone_matches[0].size() <= 0.5 * zero_features.feature_size()) { + VLOG(1) << "Too much frame area is clipped for gain correction."; + return false; + } + + ToneEstimation::EstimateGainBiasModel(5, // number of irls iterations. + &tone_matches, &gain_bias); + + if (!ToneEstimation::IsStableGainBiasModel( + options_.gain_bias_bounds(), gain_bias, tone_matches, nullptr)) { + VLOG(1) << "Unstable gain-bias model."; + return false; + } + } + + GainBiasModelMethods::MapImageIndependent<1>(gain_bias, + false, // log_domain. + true, // normalized_model. + input_frame, calibrated_frame); + return true; +} + +void RegionFlowComputation::WideBaselineMatchFeatures( + FrameTrackingData* from_data_ptr, FrameTrackingData* to_data_ptr, + TrackedFeatureList* results) { +#if (defined(__ANDROID__) || defined(__APPLE__) || defined(__EMSCRIPTEN__)) && \ + !defined(CV_WRAPPER_3X) + LOG(FATAL) << "Supported on only with OpenCV 3.0. 
"
+             << "Use bazel build flag : --define CV_WRAPPER=3X";
+#else   // (defined(__ANDROID__) || defined(__APPLE__) ||
+        // defined(__EMSCRIPTEN__)) && !defined(CV_WRAPPER_3X)
+  results->clear();
+
+  const auto& frame1 = from_data_ptr->frame;
+  auto& data1 = from_data_ptr->orb;
+
+  const auto& frame2 = to_data_ptr->frame;
+  auto& data2 = to_data_ptr->orb;
+
+  cv::Ptr<cv::ORB> orb = cv::ORB::create(max_features_);
+
+  // Compute ORB features in frame1.
+  if (!data1.computed) {
+    orb->detect(frame1, data1.key_points);
+    orb->compute(frame1, data1.key_points, data1.descriptors);
+    data1.computed = true;
+  }
+
+  const int num_features = data1.key_points.size();
+  if (num_features == 0) {
+    // No features found, probably black or extremely blurry frame. Return empty
+    // results.
+    VLOG(1) << "Couldn't extract any features. Frame probably empty.";
+    return;
+  }
+
+  // Compute ORB features in frame2.
+  if (!data2.computed) {
+    orb->detect(frame2, data2.key_points);
+    orb->compute(frame2, data2.key_points, data2.descriptors);
+    data2.computed = true;
+  }
+
+  // Match feature descriptors.
+  cv::BFMatcher matcher(cv::NORM_HAMMING);
+  std::vector<std::vector<cv::DMatch>> matches;
+  matcher.knnMatch(data2.descriptors,  // Query.
+                   data1.descriptors,  // Train.
+                   matches,
+                   2);  // 2 closest matches per descriptor.
+
+  results->reserve(matches.size());
+
+  // Get successfully matched features.
+  for (const auto& match : matches) {
+    if (match.size() > 1 &&
+        match[0].distance < options_.tracking_options().ratio_test_threshold() *
+                                match[1].distance) {
+      // Passed ratio test.
+      const cv::Point2f& feature_location =
+          data1.key_points[match[0].trainIdx].pt;
+      const cv::Point2f& match_location =
+          data2.key_points[match[0].queryIdx].pt;
+
+      const Vector2_f feature_point(feature_location.x, feature_location.y);
+      const Vector2_f flow =
+          Vector2_f(match_location.x, match_location.y) - feature_point;
+
+      TrackedFeature tracked_feature(feature_point * downsample_scale_,
+                                     flow * downsample_scale_,
+                                     match[0].distance, 0.0f,
+                                     -1);  // no track id assigned.
+
+      if (PointOutOfBound(tracked_feature.point, original_width_,
+                          original_height_)) {
+        continue;
+      }
+
+      VLOG(2) << "Flow: " << tracked_feature.flow << " @ "
+              << tracked_feature.point;
+      results->push_back(tracked_feature);
+    }
+  }
+#endif  // (defined(__ANDROID__) || defined(__APPLE__) ||
+        // defined(__EMSCRIPTEN__)) && !defined(CV_WRAPPER_3X)
+}
+
+void RegionFlowComputation::RemoveAbsentFeatures(
+    const TrackedFeatureList& prev_result, FrameTrackingData* data) {
+  CHECK(long_track_data_ != nullptr);
+
+  // Build hash set of track ids.
+  absl::node_hash_set<int> track_ids;
+  for (const auto& feature : prev_result) {
+    DCHECK_NE(feature.track_id, -1);
+    track_ids.insert(feature.track_id);
+  }
+
+  long_track_data_->RemoveAbsentFeatureEntries(track_ids);
+
+  // Remove indices (backwards to not destroy index positions).
+  for (int k = data->track_ids.size() - 1; k >= 0; --k) {
+    if (track_ids.find(data->track_ids[k]) == track_ids.end()) {
+      data->RemoveFeature(k);
+    }
+  }
+}
+
+void RegionFlowComputation::RemoveFeaturesOutsideMask(FrameTrackingData* data) {
+  if (data->mask.empty()) {
+    return;
+  }
+
+  // Remove indices (backwards to not destroy index positions).
+ for (int k = data->features.size() - 1; k >= 0; --k) { + const int x = static_cast(data->features[k].x + 0.5); + const int y = static_cast(data->features[k].y + 0.5); + if (data->mask.at(y, x) == 0) { + data->RemoveFeature(k); + } + } +} + +void RegionFlowComputation::ExtractFeatures( + const TrackedFeatureList* prev_result, FrameTrackingData* data) { + MEASURE_TIME << "ExtractFeatures"; + if (!options_.tracking_options().adaptive_good_features_to_track()) { + LOG(FATAL) << "Deprecated! Activate adaptive_good_features_to_track " + << "in TrackingOptions"; + } + + // Check if features can simply be re-used. + if (!data->features.empty()) { + if (prev_result) { + // Long feature tracking case, remove features, + // whose ids are not present anymore (these are mainly outliers + // that got removed during DetermineRegionFlowInliers call). + RemoveAbsentFeatures(*prev_result, data); + } + + if (data->last_feature_extraction_time == 0) { + // Features already extracted from this frame. + CHECK_EQ(data->corner_responses.size(), data->features.size()); + CHECK_EQ(data->octaves.size(), data->features.size()); + VLOG(1) << "Features already present (extracted from this frame)"; + return; + } + + // Remove features that lie outside feature extraction mask. + RemoveFeaturesOutsideMask(data); + + CHECK_EQ(data->corner_responses.size(), data->features.size()); + CHECK_EQ(data->octaves.size(), data->features.size()); + + float feature_fraction = 0; + if (data->num_original_extracted_and_tracked > 0) { + feature_fraction = data->num_extracted_and_tracked * 1.0f / + data->num_original_extracted_and_tracked; + } + + // If features were not tracked from too far away, reuse them unless the + // number of tracked features is below a threshold percentage of the + // original source features. 
+ const int max_frame_distance = + options_.tracking_options().reuse_features_max_frame_distance(); + const float min_survived_frac = + options_.tracking_options().reuse_features_min_survived_frac(); + + if (data->last_feature_extraction_time <= max_frame_distance && + feature_fraction > min_survived_frac) { + VLOG(1) << "Features already present, (tracked " + << data->last_feature_extraction_time << " times)"; + return; + } + } + // If execution reaches this point, new features will be extracted. + + // Scale feature_distance by sqrt(scale). Sqrt is purely a heuristic choice. + float min_feature_distance = + options_.tracking_options().min_feature_distance(); + if (min_feature_distance < 1) { + min_feature_distance *= hypot(frame_width_, frame_height_); + } + if (options_.tracking_options().distance_downscale_sqrt()) { + min_feature_distance = + std::round(min_feature_distance / std::sqrt(downsample_scale_)); + } + + // Result mask that ensures we don't place features too closely. + const float mask_dim = max(1.0f, min_feature_distance * 0.5f); + const float mask_scale = 1.0f / mask_dim; + cv::Mat mask = cv::Mat::zeros(std::ceil(frame_height_ * mask_scale), + std::ceil(frame_width_ * mask_scale), CV_8U); + + // Initialize mask from frame's feature extraction mask, by downsampling and + // negating the latter mask. + if (!data->mask.empty()) { + cv::resize(data->mask, mask, mask.size(), 0, 0, CV_INTER_NN); + for (int y = 0; y < mask.rows; ++y) { + uint8* mask_ptr = mask.ptr(y); + for (int x = 0; x < mask.cols; ++x) { + mask_ptr[x] = mask_ptr[x] == 0 ? 1 : 0; + } + } + } + + data->ResetFeatures(); + const int features_to_allocate = + prev_result ? 
prev_result->size() * 1.2f : max_features_ / 2; + data->PreAllocateFeatures(features_to_allocate); + + if (IsVerifyLongFeatures()) { + if (data->neighborhoods == nullptr) { + data->neighborhoods.reset(new std::vector()); + } + data->neighborhoods->reserve(features_to_allocate); + } + + CHECK_EQ(data->extraction_pyramid.size(), extraction_levels_); + for (int i = 1; i < extraction_levels_; ++i) { + // Need factor 2 as OpenCV stores image + gradient pairs when + // "with_derivative" is set to true. + const int layer_stored_in_pyramid = + options_.compute_derivative_in_pyramid() ? 2 * i : i; + const bool index_within_limit = + (layer_stored_in_pyramid < data->pyramid.size()); + if (index_within_limit && options_.compute_derivative_in_pyramid() && + i <= data->pyramid_levels) { + // Just re-use from already computed pyramid. + data->extraction_pyramid[i] = data->pyramid[layer_stored_in_pyramid]; + } else { + cv::pyrDown(data->extraction_pyramid[i - 1], data->extraction_pyramid[i], + data->extraction_pyramid[i].size()); + } + } + + if (prev_result) { + // Seed feature mask and results with tracking ids. + CHECK(long_track_data_ != nullptr); + const int max_track_length = + options_.tracking_options().long_tracks_max_frames(); + // Drop a feature with a propability X, such that all qualifying + // features are dropped with a 95% probability within the interval + // [.8, 1.2] * long_tracks_max_frames. + const int lower_max_track_length = max(1.0f, 0.8f * max_track_length); + const int upper_max_track_length = 1.2f * max_track_length; + + // Ensure interval is positive. + const int interval_length = + upper_max_track_length - lower_max_track_length + 1; + // Drop probability: p == > survival: 1 - p + // (1 - p) ^ interval_length >= 5% // only 5% survival chance across + // // all frames + // ==> p <= 1 - (0.05 ^ (1.0 / interval_length) + // Ensure positive probability. 
+ const float drop_permil = + max(1.0, (1.0 - pow(0.05, 1.0 / interval_length))); + + unsigned int seed = 900913; // = Google in leet :) + std::default_random_engine rand_gen(seed); + std::uniform_real_distribution distribution(0.0f, 1.0f); + + // Mask out locations. + // For FORWARD output flow, we need to add flow to obtain the match + // position, for BACKWARD output flow, flow is inverted, so that feature + // locations already point to locations in the current frame. + CHECK_EQ(options_.tracking_options().internal_tracking_direction(), + TrackingOptions::FORWARD); + float match_sign = options_.tracking_options().output_flow_direction() == + TrackingOptions::FORWARD + ? 1.0f + : 0.0f; + const float inv_downsample_scale_ = 1.0f / downsample_scale_; + + for (const auto& feature : *prev_result) { + // Need to convert to downsampled domain. + const Vector2_f pos = + (feature.point + feature.flow * match_sign) * inv_downsample_scale_; + + const int track_id = feature.track_id; + if (track_id < 0) { + // TODO: Use LOG_FIRST_N here. + LOG_IF(WARNING, + []() { + static int k = 0; + return k++ < 2; + }()) + << "Expecting an assigned track id, " + << "skipping feature."; + continue; + } + + // Skip features for which the track would get too long. + const int start_frame = long_track_data_->StartFrameForId(track_id); + if (start_frame < 0) { + LOG(ERROR) << "Id is not present, skipping feature."; + continue; + } + + if (data->frame_num - start_frame >= lower_max_track_length && + distribution(rand_gen) <= drop_permil) { + continue; + } + + const int mask_x = pos.x() * mask_scale; + const int mask_y = pos.y() * mask_scale; + + // Skip if already occupied by too many features. This allows paths + // to "join", without having to explicitly represent this. + // Value of 2 improves number of connected features. 
+      constexpr int kMaxFeaturesPerBin = 1;
+      if (mask.at<uint8>(mask_y, mask_x) >= kMaxFeaturesPerBin) {
+        continue;
+      }
+
+      SetMaskNeighborhood<2, 1, false>(mask_x, mask_y, &mask);
+
+      // Copy results to features.
+      const cv::Mat* neighborhood = (options_.verify_long_features() &&
+                                     feature.orig_neighborhood != nullptr)
+                                        ? feature.orig_neighborhood.get()
+                                        : nullptr;
+      data->AddFeature(cv::Point2f(pos.x(), pos.y()), feature.corner_response,
+                       feature.octave, feature.track_id, neighborhood);
+    }
+  }
+
+  // Extracts additional features in regions excluding the mask and adds them to
+  // data.
+  AdaptiveGoodFeaturesToTrack(data->extraction_pyramid, max_features_,
+                              mask_scale, &mask, data);
+
+  const int num_features = data->features.size();
+  CHECK_EQ(num_features, data->octaves.size());
+  CHECK_EQ(num_features, data->corner_responses.size());
+  CHECK_EQ(num_features, data->track_ids.size());
+}
+
+// Selects features based on lambda evaluator: bool (int index)
+// Performs inplace moves and final resize operation.
+template <class Eval>
+int RegionFlowComputation::InplaceFeatureSelection(
+    FrameTrackingData* data, std::vector<std::vector<int>*> int_vecs,
+    std::vector<std::vector<float>*> float_vecs, const Eval& eval) {
+  int num_selected_features = 0;
+  const int num_features = data->features.size();
+  DCHECK_EQ(num_features, data->corner_responses.size());
+  DCHECK_EQ(num_features, data->octaves.size());
+  DCHECK_EQ(num_features, data->track_ids.size());
+  DCHECK_EQ(num_features, data->feature_source_map.size());
+  if (data->neighborhoods != nullptr) {
+    DCHECK_EQ(num_features, data->neighborhoods->size());
+  }
+
+  for (const auto vec_ptr : int_vecs) {
+    DCHECK_EQ(num_features, vec_ptr->size());
+  }
+  for (const auto vec_ptr : float_vecs) {
+    DCHECK_EQ(num_features, vec_ptr->size());
+  }
+
+  for (int i = 0; i < num_features; ++i) {
+    DCHECK_LE(num_selected_features, i);
+    if (eval(i)) {
+      data->features[num_selected_features] = data->features[i];
+      data->corner_responses[num_selected_features] = data->corner_responses[i];
+      data->octaves[num_selected_features] = data->octaves[i];
+      data->track_ids[num_selected_features] = data->track_ids[i];
+      data->feature_source_map[num_selected_features] =
+          data->feature_source_map[i];
+      if (data->neighborhoods != nullptr) {
+        (*data->neighborhoods)[num_selected_features] =
+            (*data->neighborhoods)[i];
+      }
+
+      for (auto* vec_ptr : int_vecs) {
+        (*vec_ptr)[num_selected_features] = (*vec_ptr)[i];
+      }
+      for (auto* vec_ptr : float_vecs) {
+        (*vec_ptr)[num_selected_features] = (*vec_ptr)[i];
+      }
+      ++num_selected_features;
+    }
+  }
+
+  // Trim to number of selected features.
+ data->features.resize(num_selected_features); + data->corner_responses.resize(num_selected_features); + data->octaves.resize(num_selected_features); + data->track_ids.resize(num_selected_features); + data->feature_source_map.resize(num_selected_features); + if (data->neighborhoods != nullptr) { + data->neighborhoods->resize(num_selected_features); + } + + for (const auto vec_ptr : int_vecs) { + vec_ptr->resize(num_selected_features); + } + for (const auto vec_ptr : float_vecs) { + vec_ptr->resize(num_selected_features); + } + + return num_selected_features; +} + +void RegionFlowComputation::TrackFeatures(FrameTrackingData* from_data_ptr, + FrameTrackingData* to_data_ptr, + bool* gain_correction_ptr, + float* frac_long_features_rejected, + TrackedFeatureList* results_ptr) { + MEASURE_TIME << "TrackFeatures"; + + FrameTrackingData& data1 = *from_data_ptr; + const cv::Mat& frame1 = data1.frame; + const std::vector& features1 = data1.features; + const std::vector& corner_responses1 = data1.corner_responses; + const std::vector& octaves1 = data1.octaves; + // New features will be assigned new track ids. + std::vector& track_ids1 = data1.track_ids; + + FrameTrackingData& data2 = *to_data_ptr; + cv::Mat& frame2 = data2.frame; + std::vector& features2 = data2.features; + std::vector& corner_responses2 = data2.corner_responses; + std::vector& octaves2 = data2.octaves; + std::vector& track_ids2 = data2.track_ids; + + bool& gain_correction = *gain_correction_ptr; + TrackedFeatureList& results = *results_ptr; + std::vector& feature_source_map = data2.feature_source_map; + + // Start frame for new features. Minimum of from and to. + const int min_frame = + std::min(from_data_ptr->frame_num, to_data_ptr->frame_num); + + feature_source_map.clear(); + results.clear(); + + const int num_features = features1.size(); + if (num_features == 0) { + // No features found, probably black or extremely blurry frame. + // Return empty results. 
+ VLOG(1) << "Couldn't find any features to track. Frame probably empty."; + return; + } + + int tracking_flags = 0; + // Check if features in the destination are already initialized. If so, use + // their location as initial guess. + if (!data2.features_initialized) { + data2.ResetFeatures(); + features2.resize(num_features); + corner_responses2.resize(num_features); + octaves2.resize(num_features); + data2.source = from_data_ptr; + } else { + CHECK_EQ(data2.source, from_data_ptr); + CHECK_EQ(num_features, features2.size()); + tracking_flags |= cv::OPTFLOW_USE_INITIAL_FLOW; + } + + const int track_win_size = options_.tracking_options().tracking_window_size(); + CHECK_GT(track_win_size, 1) << "Needs to be at least 2 pixels in each " + << "direction"; + + // Proceed with gain correction only if it succeeds, and set flag accordingly. + bool frame1_gain_reference = true; + if (gain_correction) { + cv::Mat reference_frame = frame1; + cv::Mat input_frame = frame2; + float reference_mean = data1.mean_intensity; + float input_mean = data2.mean_intensity; + // Use brighter frame as reference, if requested. + if (options_.gain_correction_bright_reference() && + data1.mean_intensity < data2.mean_intensity) { + std::swap(input_frame, reference_frame); + std::swap(input_mean, reference_mean); + frame1_gain_reference = false; + } + + // Gain correct and update output with success result. + gain_correction = + GainCorrectFrame(reference_frame, input_frame, reference_mean, + input_mean, gain_image_.get()); + } + +#if CV_MAJOR_VERSION == 3 + // OpenCV changed how window size gets specified from our radius setting + // < 2.2 to diameter in 2.2+. 
+ const cv::Size cv_window_size(track_win_size * 2 + 1, track_win_size * 2 + 1); + + cv::TermCriteria cv_criteria( + cv::TermCriteria::COUNT + cv::TermCriteria::EPS, + options_.tracking_options().tracking_iterations(), 0.02f); + + cv::_InputArray input_frame1(data1.pyramid); + cv::_InputArray input_frame2(data2.pyramid); +#endif + + // Using old c-interface for OpenCV's 2.2 tracker. + CvTermCriteria criteria; + criteria.type = CV_TERMCRIT_EPS | CV_TERMCRIT_ITER; + criteria.max_iter = options_.tracking_options().tracking_iterations(); + criteria.epsilon = 0.02f; + + feature_track_error_.resize(num_features); + feature_status_.resize(num_features); + if (use_cv_tracking_) { +#if CV_MAJOR_VERSION == 3 + if (gain_correction) { + if (!frame1_gain_reference) { + input_frame1 = cv::_InputArray(*gain_image_); + } else { + input_frame2 = cv::_InputArray(*gain_image_); + } + } + + if (options_.tracking_options().klt_tracker_implementation() == + TrackingOptions::KLT_OPENCV) { + cv::calcOpticalFlowPyrLK(input_frame1, input_frame2, features1, features2, + feature_status_, feature_track_error_, + cv_window_size, pyramid_levels_, cv_criteria, + tracking_flags); + } else { + LOG(ERROR) << "Tracking method unspecified."; + return; + } +#endif + } else { + LOG(ERROR) << "only cv tracking is supported."; + return; + } + + // Inherit corner response and octaves from extracted features. + corner_responses2 = corner_responses1; + octaves2 = octaves1; + + // Remember mapping from destination to source index; + // Fill feature_source_map with 0 ... num_features - 1. + feature_source_map.resize(num_features); + std::iota(feature_source_map.begin(), feature_source_map.end(), 0); + + // Init track ids. + track_ids2 = std::vector(num_features, -1); // Unassigned by default. + + // Select features that were successfully tracked from data1 to data2. 
+ int num_valid_features = InplaceFeatureSelection( + to_data_ptr, {&feature_source_map}, {&feature_track_error_}, + [this](int i) -> bool { return feature_status_[i] == 1; }); + + // Init neighborhoods if needed. + if (IsVerifyLongFeatures()) { + // data1 should be initialized at this point. + CHECK(data1.neighborhoods != nullptr); + if (data2.neighborhoods == nullptr) { + data2.neighborhoods.reset(new std::vector()); + data2.neighborhoods->resize(num_valid_features); + } + } + + // Remember last id threshold before assigning new ones. + const int prev_id_threshold = + long_track_data_ != nullptr ? long_track_data_->LastTrackId() : 0; + + std::vector ids_to_verify; + std::vector motions_to_verify; + if (long_track_data_) { + // Compute motion for each feature and average magnitude. + std::vector motion_mag(num_valid_features, 0); + float avg_motion_mag = 0; + + for (int i = 0; i < num_valid_features; ++i) { + const int match_idx = feature_source_map[i]; + const cv::Point2f diff = + (features2[i] - features1[match_idx]) * downsample_scale_; + const float norm = std::abs(diff.x) + std::abs(diff.y); + motion_mag[i] = norm; + avg_motion_mag += norm; + } + + if (num_valid_features > 0) { + avg_motion_mag /= num_valid_features; + } + + bool is_duplicated = num_valid_features > 0 && avg_motion_mag < kZeroMotion; + + const float max_acc = options_.max_long_feature_acceleration(); + + // Minimum motion for stable ratio test. + constexpr float kMinMotion = 1.0f; + + // Initialize all track_ids of data2. + int num_restarted_tracks = 0; + for (int i = 0; i < num_valid_features; ++i) { + const int match_idx = feature_source_map[i]; + if (track_ids1[match_idx] < 0) { + const float motion_mag_arg = is_duplicated ? -1 : motion_mag[i]; + track_ids1[match_idx] = + long_track_data_->CreateNextTrackId(min_frame, motion_mag_arg); + track_ids2[i] = track_ids1[match_idx]; + } else if (!is_duplicated) { + // Check for change in acceleration for non duplicated frame. 
+ const float prev_motion_mag = + long_track_data_->MotionMagForId(track_ids1[match_idx]); + + // Test for acceleration or deacceleration, but only if previous + // motion was known (from non duplicated frame). + if (prev_motion_mag >= 0 && + (motion_mag[i] > max_acc * std::max(kMinMotion, prev_motion_mag) || + prev_motion_mag > max_acc * std::max(kMinMotion, motion_mag[i]))) { + // Start new track or flag for testing. + if (options_.verify_long_feature_acceleration()) { + // Keep track id and update motion. If feature is removed and + // therefore motion was updated incorrectly, motion wont be used + // again. + track_ids2[i] = track_ids1[match_idx]; + long_track_data_->UpdateMotion(track_ids1[match_idx], + motion_mag[i]); + ids_to_verify.push_back(i); + motions_to_verify.push_back(motion_mag[i]); + } else { + ++num_restarted_tracks; + track_ids2[i] = + long_track_data_->CreateNextTrackId(min_frame, motion_mag[i]); + } + } else { + long_track_data_->UpdateMotion(track_ids1[match_idx], motion_mag[i]); + track_ids2[i] = track_ids1[match_idx]; + } + } else { + // Duplicated frame with existing track, re-use id without updating + // motion. + track_ids2[i] = track_ids1[match_idx]; + } + + if (IsVerifyLongFeatures()) { + cv::Mat& mat1 = (*data1.neighborhoods)[match_idx]; + cv::Mat& mat2 = (*data2.neighborhoods)[i]; + if (mat1.empty()) { + data1.ExtractPatch(features1[match_idx], track_win_size, &mat1); + } + data2.ExtractPatch(features2[i], track_win_size, &mat2); + } + } + VLOG(1) << "Restarted tracks: " << num_restarted_tracks; + } // end long track data. + + if (!ids_to_verify.empty() && + ids_to_verify.size() < + options_.verify_long_feature_trigger_ratio() * num_valid_features) { + // Reset feature ids, instead of triggering verification. 
+    VLOG(1) << "Canceling feature verification, resetting tracks: "
+            << ids_to_verify.size() << " of " << num_valid_features;
+    for (int k = 0; k < ids_to_verify.size(); ++k) {
+      const int id = ids_to_verify[k];
+      track_ids2[id] =
+          long_track_data_->CreateNextTrackId(min_frame, motions_to_verify[k]);
+    }
+    ids_to_verify.clear();
+    motions_to_verify.clear();
+  }
+
+  // Distance between source location x and tracked-back f^(-1)(y) location
+  // starting at the tracked location y = f(x): x - f^(-1)(f(x)).
+  // Close to zero for tracks that could be verified.
+  std::vector<float> verify_distance(num_valid_features, 0);
+
+  // Compile list of indices we need to verify via backtracking.
+  std::vector<int> feat_ids_to_verify;
+  if (options_.verify_features()) {
+    feat_ids_to_verify.resize(num_valid_features);
+    std::iota(feat_ids_to_verify.begin(), feat_ids_to_verify.end(), 0);
+  } else if (options_.verify_long_feature_acceleration()) {
+    feat_ids_to_verify = ids_to_verify;
+  }
+
+  VLOG(1) << "Verifying: " << feat_ids_to_verify.size() << " out of "
+          << num_valid_features;
+  if (!feat_ids_to_verify.empty()) {
+    const int num_to_verify = feat_ids_to_verify.size();
+    std::vector<cv::Point2f> verify_features;
+    std::vector<cv::Point2f> verify_features_tracked;
+    verify_features.reserve(num_to_verify);
+    verify_features_tracked.reserve(num_to_verify);
+    for (int idx : feat_ids_to_verify) {
+      const int match_idx = feature_source_map[idx];
+      verify_features.push_back(features2[idx]);
+      verify_features_tracked.push_back(features1[match_idx]);
+    }
+
+    tracking_flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
+
+    // Unused track error.
+ std::vector verify_track_error(num_to_verify); + feature_status_.resize(num_to_verify); + + if (use_cv_tracking_) { +#if CV_MAJOR_VERSION == 3 + cv::calcOpticalFlowPyrLK(input_frame2, input_frame1, verify_features, + verify_features_tracked, feature_status_, + verify_track_error, cv_window_size, + pyramid_levels_, cv_criteria, tracking_flags); +#endif + } else { + LOG(ERROR) << "only cv tracking is supported."; + return; + } + + // Check feature destinations, that when tracked back to from data1 to + // data2, don't differ more than a threshold from their original location + // in data1. + std::vector verify_result(num_valid_features, 1); // 1 for accept. + int num_accepted = 0; + for (int k = 0; k < num_to_verify; ++k) { + const int idx = feat_ids_to_verify[k]; + const int match_idx = feature_source_map[idx]; + const cv::Point2f diff = + features1[match_idx] - verify_features_tracked[k]; + const float dist = std::sqrt(diff.dot(diff)); + verify_distance[idx] = dist; + verify_result[idx] = + dist < options_.verification_distance() && feature_status_[k] == 1; + num_accepted += verify_result[idx]; + } + VLOG(1) << "Accepted number of verified features " << num_accepted; + + num_valid_features = InplaceFeatureSelection( + to_data_ptr, {&feature_source_map}, + {&feature_track_error_, &verify_distance}, + [this, verify_result](int i) -> bool { return verify_result[i]; }); + } + + if (frac_long_features_rejected != nullptr) { + *frac_long_features_rejected = 0; + } + + // Verify long features if requested. + if (IsVerifyLongFeatures() && num_valid_features > 0) { + // track_win_size > 0, checked above. + const float denom = 1.0f / (track_win_size * track_win_size * 255.0); + int new_tracks = 0; + for (int track_id : data2.track_ids) { + if (track_id > prev_id_threshold) { + ++new_tracks; + } + } + + // Select features that don't differ from the original extracted path by + // more than a threshold. If that is the case, propagate original patch from + // data1 to data2. 
Used for subsequent verifications. + int num_selected_features = InplaceFeatureSelection( + to_data_ptr, {&feature_source_map}, + {&feature_track_error_, &verify_distance}, + [this, denom, &data1, &data2, &feature_source_map](int i) -> bool { + cv::Mat& mat1 = (*data1.neighborhoods)[feature_source_map[i]]; + cv::Mat& mat2 = (*data2.neighborhoods)[i]; + const float norm = cv::norm(mat1, mat2, cv::NORM_L1) * denom; + if (norm < options_.long_feature_verification_threshold()) { + // Store original patch. + mat2 = mat1; + return true; + } else { + return false; + } + }); + + if (frac_long_features_rejected) { + // There needs to be a reasonable number of features to test this + // criteria. + constexpr int kMinPrevValidFeatures = 10; + if (num_valid_features > kMinPrevValidFeatures) { + *frac_long_features_rejected = + (1.0f - num_selected_features * (1.0f / num_valid_features)); + // Note: One could add number of new tracks here, i.e. + // += new_tracks / num_valid_features. + // Concern is that is very shaky video where a lot of tracks get created + // this criteria overpowers the current one. + } + } + num_valid_features = num_selected_features; + } + + // How many tracking steps were used to generate these features. + data2.last_feature_extraction_time = 1 + data1.last_feature_extraction_time; + // Features are resized, so they cannot be considered initialized. + data2.features_initialized = false; + + // Copy verified features to results. 
+ results.reserve(num_valid_features); + for (int i = 0; i < num_valid_features; ++i) { + const int match_idx = feature_source_map[i]; + const Vector2_f point1 = + Vector2_f(features1[match_idx].x, features1[match_idx].y) * + downsample_scale_; + const Vector2_f point2 = + Vector2_f(features2[i].x, features2[i].y) * downsample_scale_; + + if (PointOutOfBound(point1, original_width_, original_height_) || + PointOutOfBound(point2, original_width_, original_height_)) { + continue; + } + + const Vector2_f flow = point2 - point1; + results.emplace_back(point1, flow, feature_track_error_[i], + corner_responses2[i], octaves2[i], track_ids2[i], + verify_distance[i]); + + if (long_track_data_ != nullptr && track_ids1[match_idx] != track_ids2[i]) { + results.back().flags |= RegionFlowFeature::FLAG_BROKEN_TRACK; + } + + if (IsVerifyLongFeatures()) { + const cv::Mat& orig_patch = (*data1.neighborhoods)[match_idx]; + results.back().orig_neighborhood.reset(new cv::Mat(orig_patch)); + } + + if (data1.orb.computed) { + results.back().descriptors = data1.orb.descriptors.row(match_idx); + } + } +} + +void RegionFlowComputation::AppendUniqueFeaturesSorted( + const TrackedFeatureView& to_be_added, TrackedFeatureView* features) const { + for (auto feature_ptr : to_be_added) { + auto insert_pos = + std::lower_bound(features->begin(), features->end(), feature_ptr); + if (insert_pos == features->end() || *insert_pos != feature_ptr) { + // Not present yet. + features->insert(insert_pos, feature_ptr); + feature_ptr->irls_weight = 1; + } + + // Present, increase score. 
+ ++feature_ptr->irls_weight; + } +} + +void RegionFlowComputation::InitializeFeatureLocationsFromTransform( + int from, int to, const Homography& transform) { + const int index1 = data_queue_.size() + from - 1; + const int index2 = data_queue_.size() + to - 1; + FrameTrackingData& data1 = *data_queue_[index1]; + FrameTrackingData* data2 = data_queue_[index2].get(); + + data2->features = data1.features; + + for (cv::Point2f& feature : data2->features) { + const Vector2_f trans_pt = + TransformPoint(transform, Vector2_f(feature.x, feature.y)); + feature = cv::Point2f(trans_pt.x(), trans_pt.y()); + } + + data2->source = &data1; + data2->features_initialized = true; +} + +void RegionFlowComputation::InitializeFeatureLocationsFromPreviousResult( + int from, int to) { + CHECK_NE(from, to) << "Cannot initialize FrameTrackingData from itself."; + + const int index1 = data_queue_.size() + from - 1; + const int index2 = data_queue_.size() + to - 1; + CHECK_GE(index1, 0); + CHECK_LT(index1, data_queue_.size()); + CHECK_GE(index2, 0); + CHECK_LT(index2, data_queue_.size()); + const FrameTrackingData& data1 = *data_queue_[index1]; + FrameTrackingData* data2 = data_queue_[index2].get(); + CHECK(data1.source != nullptr); + + if (!data1.features_initialized) { + data2->features = data1.source->features; + for (int k = 0; k < data1.feature_source_map.size(); ++k) { + data2->features[data1.feature_source_map[k]] = data1.features[k]; + } + } else { + data2->features = data1.features; + CHECK_EQ(data1.features.size(), data1.source->features.size()); + } + data2->source = data1.source; + data2->features_initialized = true; +} + +namespace { + +void ComputeMeanForRegionFlow(RegionFlowFrame::RegionFlow* region_flow) { + // Compute mean for this region. 
+ Vector2_f centroid(0, 0); + Vector2_f mean_flow(0, 0); + + for (const auto& feature : region_flow->feature()) { + centroid += Vector2_f(feature.x(), feature.y()); + mean_flow += Vector2_f(feature.dx(), feature.dy()); + } + + const float denom = 1.0f / region_flow->feature_size(); + centroid *= denom; + mean_flow *= denom; + + region_flow->set_centroid_x(centroid.x()); + region_flow->set_centroid_y(centroid.y()); + region_flow->set_flow_x(mean_flow.x()); + region_flow->set_flow_y(mean_flow.y()); +} + +} // namespace. + +void RegionFlowComputation::ComputeBlockBasedFlow( + TrackedFeatureList* feature_list, + TrackedFeatureView* inlier_features) const { + MEASURE_TIME << "Block based flow"; + TrackedFeatureView inlier_view; + inlier_view.reserve(feature_list->size()); + + const float frame_diam = hypot(original_width_, original_height_); + const float max_magnitude_threshold = + frame_diam * options_.max_magnitude_threshold_ratio(); + float sq_max_magnitude_threshold = + max_magnitude_threshold * max_magnitude_threshold; + + // Refine threshold, for magnitudes over N times the mean motion, if + // requested. + if (!feature_list->empty() && options_.median_magnitude_bounds() > 0) { + std::vector motion_magnitudes; + motion_magnitudes.reserve(feature_list->size()); + for (auto feature : *feature_list) { + motion_magnitudes.push_back(feature.flow.Norm2()); + } + auto median_iter = motion_magnitudes.begin() + motion_magnitudes.size() / 2; + std::nth_element(motion_magnitudes.begin(), median_iter, + motion_magnitudes.end()); + const float median = *median_iter; + // Only apply if non-zero motion is present (at least one pixel). + if (median > 1.0) { + const float outlier_threshold = median * + options_.median_magnitude_bounds() * + options_.median_magnitude_bounds(); + // Refine threshold. + sq_max_magnitude_threshold = + min(sq_max_magnitude_threshold, outlier_threshold); + } + } + + for (auto& feature : *feature_list) { + // Skip features with exceeding motions. 
+ if (feature.flow.Norm2() < sq_max_magnitude_threshold) { + inlier_view.push_back(&feature); + inlier_view.back()->num_bins = 0; + } + } + + const int num_overlaps = options_.fast_estimation_overlap_grids(); + const int num_grids = block_levels_ * num_overlaps * num_overlaps; + + // Put all features into region bins. + std::vector grid_feature_views(num_grids); + + int grid_idx = 0; + int block_width = block_width_; + int block_height = block_height_; + + for (int level = 0; level < block_levels_; ++level) { + const float inv_block_width = 1.0f / block_width; + const float inv_block_height = 1.0f / block_height; + + for (int overlap_y = 0; overlap_y < num_overlaps; ++overlap_y) { + // | | | | <- unshifted + // | | | | | <- shifted + // ^ + // block_height * overlap_y / num_overlaps := t + // We need to add block_height - t to each value, so that at t + // we get bin 1. + // Special case is overlap_y == 0 -> shift = 0. + const int grid_shift_y = + overlap_y == 0 + ? 0 + : (block_height - block_height * overlap_y / num_overlaps); + + for (int overlap_x = 0; overlap_x < num_overlaps; + ++overlap_x, ++grid_idx) { + const int grid_shift_x = + overlap_x == 0 + ? 0 + : (block_width - block_width * overlap_x / num_overlaps); + + const int bins_per_row = + std::ceil((original_width_ + grid_shift_x) * inv_block_width); + const int bins_per_column = + std::ceil((original_height_ + grid_shift_y) * inv_block_height); + TrackedFeatureMap& feature_view = grid_feature_views[grid_idx]; + feature_view.resize(bins_per_row * bins_per_column); + + for (auto feature_ptr : inlier_view) { + const int x = feature_ptr->point.x() + 0.5f + grid_shift_x; + const int y = feature_ptr->point.y() + 0.5f + grid_shift_y; + const int block_x = x * inv_block_width; + const int block_y = y * inv_block_height; + + int block_id = block_y * bins_per_row + block_x; + feature_view[block_id].push_back(feature_ptr); + } + } + } + + // We use smallest block width and height later on. 
+ if (level + 1 < block_levels_) { + block_width = (block_width + 1) / 2; + block_height = (block_height + 1) / 2; + } + } + + for (int k = 0; k < num_grids; ++k) { + TrackedFeatureMap& region_features = grid_feature_views[k]; + const int min_inliers = GetMinNumFeatureInliers(region_features); + for (TrackedFeatureView& feature_view : region_features) { + if (feature_view.size() >= min_inliers) { + for (auto feature_ptr : feature_view) { + ++feature_ptr->num_bins; + } + } else { + feature_view.clear(); + } + } + } + + if (num_grids == 1) { + TrackedFeatureView all_inliers; + DetermineRegionFlowInliers(grid_feature_views[0], &all_inliers); + AppendUniqueFeaturesSorted(all_inliers, inlier_features); + } else { + // Get all features across all grids without duplicates. + std::vector grid_inliers(num_grids); + ParallelFor( + 0, num_grids, 1, + [&grid_feature_views, &grid_inliers, this](const BlockedRange& range) { + for (int k = range.begin(); k < range.end(); ++k) { + grid_inliers[k].reserve(grid_feature_views[k].size()); + DetermineRegionFlowInliers(grid_feature_views[k], &grid_inliers[k]); + } + }); + + for (int grid = 0; grid < num_grids; ++grid) { + AppendUniqueFeaturesSorted(grid_inliers[grid], inlier_features); + } + } +} + +void RegionFlowComputation::DetermineRegionFlowInliers( + const TrackedFeatureMap& region_feature_map, + TrackedFeatureView* inliers) const { + CHECK(inliers); + inliers->clear(); + + // Run RANSAC on each region. + const int max_iterations = options_.ransac_rounds_per_region(); + float absolute_err_threshold = + max(options_.absolute_inlier_error_threshold(), + options_.frac_inlier_error_threshold() * + hypot(original_width_, original_height_)); + absolute_err_threshold *= absolute_err_threshold; + + // Save ptr to each inlier feature. 
+  TrackedFeatureView inlier_set;
+  TrackedFeatureView best_inlier_set;
+  unsigned int seed = 900913;  // = Google in leet :)
+  std::default_random_engine rand_gen(seed);
+
+  const int min_features = GetMinNumFeatureInliers(region_feature_map);
+
+  for (const TrackedFeatureView& region_features : region_feature_map) {
+    if (region_features.empty()) {
+      continue;
+    }
+    // Select top N inlier sets.
+    int loop_count = options_.top_inlier_sets();
+
+    // Get local copy if necessary and sort.
+    const TrackedFeatureView* all_features = nullptr;
+    TrackedFeatureView all_features_storage;
+
+    if (loop_count > 1) {
+      all_features_storage = region_features;
+      all_features = &all_features_storage;
+      std::sort(all_features_storage.begin(), all_features_storage.end());
+    } else {
+      all_features = &region_features;
+    }
+
+    const int num_features = all_features->size();
+
+    // Extract inlier sets as long as more than 20% of original features remain.
+    int last_inlier_set_size = 0;
+
+    while (all_features->size() >= max(min_features, num_features / 5) &&
+           loop_count-- > 0) {
+      best_inlier_set.clear();
+      std::uniform_int_distribution<> distribution(0, all_features->size() - 1);
+
+      for (int i = 0; i < max_iterations; ++i) {
+        // Pick a random vector.
+        const int rand_idx = distribution(rand_gen);
+        Vector2_f vec = (*all_features)[rand_idx]->flow;
+
+        float relative_err_threshold =
+            options_.relative_inlier_error_threshold() * vec.Norm();
+        relative_err_threshold *= relative_err_threshold;
+        const float err_threshold =
+            std::max(relative_err_threshold, absolute_err_threshold);
+
+        // Determine inlier vectors.
+ inlier_set.clear(); + + for (auto feature_ptr : *all_features) { + if ((feature_ptr->flow - vec).Norm2() < err_threshold) { + inlier_set.push_back(feature_ptr); + } + } + + if (inlier_set.size() >= best_inlier_set.size()) { + best_inlier_set.swap(inlier_set); + } + } + + if (best_inlier_set.size() >= + max(options_.min_feature_inliers(), last_inlier_set_size / 2)) { + last_inlier_set_size = best_inlier_set.size(); + inliers->insert(inliers->end(), best_inlier_set.begin(), + best_inlier_set.end()); + + if (loop_count > 0) { + // Remove inliers from all feature set. + TrackedFeatureView remaining_features; + std::set_difference(all_features_storage.begin(), + all_features_storage.end(), + best_inlier_set.begin(), best_inlier_set.end(), + back_inserter(remaining_features)); + all_features_storage.swap(remaining_features); + } + } else { + break; + } + } + } // end traverse region bins. +} + +int RegionFlowComputation::GetMinNumFeatureInliers( + const TrackedFeatureMap& region_feature_map) const { + // Determine number of features. + int total_features = 0; + for (const TrackedFeatureView& region_features : region_feature_map) { + total_features += region_features.size(); + } + + CHECK(!region_feature_map.empty()) + << "Empty grid passed. 
Check input dimensions"; + + const float threshold = + std::max(options_.min_feature_inliers(), + options_.relative_min_feature_inliers() * total_features / + region_feature_map.size()); + + return threshold; +} + +void RegionFlowComputation::RegionFlowFeatureListToRegionFlow( + const RegionFlowFeatureList& feature_list, RegionFlowFrame* frame) const { + CHECK(frame != nullptr); + + frame->set_num_total_features(feature_list.feature_size()); + frame->set_unstable_frame(feature_list.unstable()); + if (feature_list.has_blur_score()) { + frame->set_blur_score(feature_list.blur_score()); + } + frame->set_frame_width(feature_list.frame_width()); + frame->set_frame_height(feature_list.frame_height()); + + RegionFlowFrame::BlockDescriptor* block_descriptor = + frame->mutable_block_descriptor(); + + // Compute minimum block size. + int min_block_width = block_width_; + int min_block_height = block_height_; + + for (int level = 0; level < block_levels_; ++level) { + if (level + 1 < block_levels_) { + min_block_width = (min_block_width + 1) / 2; + min_block_height = (min_block_height + 1) / 2; + } + } + block_descriptor->set_block_width(min_block_width); + block_descriptor->set_block_height(min_block_height); + const int bins_per_row = + std::ceil(original_width_ * (1.0f / min_block_width)); + const int bins_per_col = + std::ceil(original_height_ * (1.0f / min_block_height)); + block_descriptor->set_num_blocks_x(bins_per_row); + block_descriptor->set_num_blocks_y(bins_per_col); + + const int num_regions = bins_per_row * bins_per_col; + frame->mutable_region_flow()->Reserve(num_regions); + for (int k = 0; k < num_regions; ++k) { + frame->add_region_flow()->set_region_id(k); + } + + // Add feature according smallest block width and height to regions. + for (auto feature : feature_list.feature()) { + const int x = static_cast(feature.x()); + const int y = static_cast(feature.y()); + // Guard, in case equation is wrong. 
+    int region_id = min(
+        num_regions, y / min_block_height * bins_per_row + x / min_block_width);
+    *frame->mutable_region_flow(region_id)->add_feature() = feature;
+  }
+
+  for (auto& region_flow : *frame->mutable_region_flow()) {
+    ComputeMeanForRegionFlow(&region_flow);
+  }
+}
+
+void RegionFlowComputation::InitializeRegionFlowFeatureList(
+    RegionFlowFeatureList* region_flow_feature_list) const {
+  region_flow_feature_list->set_frame_width(original_width_);
+  region_flow_feature_list->set_frame_height(original_height_);
+  if (curr_blur_score_ >= 0.0f) {
+    region_flow_feature_list->set_blur_score(curr_blur_score_);
+  }
+
+  region_flow_feature_list->set_distance_from_border(std::max(
+      options_.patch_descriptor_radius(), options_.distance_from_border()));
+  region_flow_feature_list->set_long_tracks(long_track_data_ != nullptr);
+}
+
+namespace {
+
+bool IsPointWithinBounds(const Vector2_f& pt, int bounds, int frame_width,
+                         int frame_height) {
+  // Ensure stability under float -> rounding operations.
+  if (pt.x() - 0.5f >= bounds && pt.x() + 0.5f <= frame_width - 1 - bounds &&
+      pt.y() - 0.5f >= bounds && pt.y() + 0.5f <= frame_height - 1 - bounds) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace.
+ +float RegionFlowComputation::TrackedFeatureViewToRegionFlowFeatureList( + const TrackedFeatureView& region_feature_view, + TrackedFeatureList* flattened_feature_list, + RegionFlowFeatureList* region_flow_feature_list) const { + const int border = region_flow_feature_list->distance_from_border(); + + region_flow_feature_list->mutable_feature()->Reserve( + region_feature_view.size()); + + float sq_flow_sum = 0; + + for (auto feature_ptr : region_feature_view) { + const Vector2_f& location = feature_ptr->point; + const Vector2_f& match_location = feature_ptr->point + feature_ptr->flow; + + if (border > 0) { + if (!IsPointWithinBounds(location, border, original_width_, + original_height_) || + !IsPointWithinBounds(match_location, border, original_width_, + original_height_)) { + continue; + } + } + + const Vector2_f& flow = feature_ptr->flow; + sq_flow_sum += flow.Norm2(); + + RegionFlowFeature* feature = region_flow_feature_list->add_feature(); + feature->set_x(location.x()); + feature->set_y(location.y()); + feature->set_dx(flow.x()); + feature->set_dy(flow.y()); + feature->set_tracking_error(feature_ptr->tracking_error); + feature->set_corner_response(feature_ptr->corner_response); + + if (long_track_data_ != nullptr) { + feature->set_track_id(feature_ptr->track_id); + } + feature->set_flags(feature_ptr->flags); + + switch (options_.irls_initialization()) { + case RegionFlowComputationOptions::INIT_UNIFORM: + feature->set_irls_weight(1.0f); + break; + + case RegionFlowComputationOptions::INIT_CONSISTENCY: + feature->set_irls_weight(2.0f * feature_ptr->irls_weight / + feature_ptr->num_bins); + break; + } + + // Remember original TrackedFeature if requested. 
+ if (flattened_feature_list) { + flattened_feature_list->push_back(*feature_ptr); + } + + if (feature_ptr->descriptors.cols != 0) { + feature->mutable_binary_feature_descriptor()->set_data( + static_cast(feature_ptr->descriptors.data), + feature_ptr->descriptors.cols); + } + } + + const int num_features = region_flow_feature_list->feature_size(); + float avg_motion = 0; + if (num_features > 0) { + avg_motion = std::sqrt(sq_flow_sum / num_features); + if (avg_motion < kZeroMotion) { + region_flow_feature_list->set_is_duplicated(true); + } + } + + return avg_motion; +} + +// Check if enough features as specified by +// MotionStabilizationOptions::min_feature_requirement are present, and if +// features cover a sufficient area. +bool RegionFlowComputation::HasSufficientFeatures( + const RegionFlowFeatureList& feature_list) { + const int area_size = options_.min_feature_cover_grid(); + const float scaled_width = static_cast(area_size) / original_width_; + const float scaled_height = static_cast(area_size) / original_height_; + std::vector area_mask(area_size * area_size); + + for (const auto& feature : feature_list.feature()) { + int x = feature.x() * scaled_width; + int y = feature.y() * scaled_height; + area_mask[y * area_size + x] = 1; + } + + int covered_bins = std::accumulate(area_mask.begin(), area_mask.end(), 0); + float area_covered = + static_cast(covered_bins) / (area_size * area_size); + + const int num_features = feature_list.feature_size(); + bool has_sufficient_features = + num_features >= options_.min_feature_requirement() && + area_covered > options_.min_feature_cover(); + + if (has_sufficient_features) { + VLOG(1) << "Sufficient features: " << num_features; + } else { + VLOG(1) << "!! Insufficient features: " << num_features + << " required: " << options_.min_feature_requirement() + << " cover: " << area_covered + << " required: " << options_.min_feature_cover(); + } + + VLOG(1) << (has_sufficient_features ? 
"Has sufficient " : "Insufficient ") + << " features: " << num_features; + + return has_sufficient_features; +} + +int RegionFlowComputation::PyramidLevelsFromTrackDistance( + float track_distance) { + // The number of pyramid levels l is chosen such that + // 2^l * tracking_window_size / 2 >= track_distance. + int pyramid_levels = + std::ceil(std::log2(std::max(track_distance, 1.0f) * 2.f / + options_.tracking_options().tracking_window_size())); + // The maximum pyramid level that we are targeting. The minimum size in the + // pyramid is intended to be 2x2. + const int max_pyramid_levels = + max(1, std::log2(min(frame_height_, frame_width_)) - 1); + pyramid_levels = min(max_pyramid_levels, max(pyramid_levels, 2)); + return pyramid_levels; +} + +void RegionFlowComputation::ComputeBlurMask(const cv::Mat& input, + cv::Mat* min_eig_vals, + cv::Mat* mask) { + MEASURE_TIME << "Computing blur score"; + const auto& blur_options = options_.blur_score_options(); + cv::cornerMinEigenVal(input, *corner_values_, 3); + + // Create over-exposure mask to mask out corners in high exposed areas. + // Reason is, that motion blur does not affect lights in the same manner as + // diffuse surfaces due to the limited dynamic range of the camera. + // Blurring a light tends not to lower the corner score but simply changes the + // shape of the light with the corner score being unaffected. + cv::compare(input, 245, *corner_mask_, cv::CMP_GE); + + // Dilate reads out of bound values. + if (corner_mask_->rows > 5 && corner_mask_->cols > 5) { + cv::Mat dilate_domain(*corner_mask_, cv::Range(2, corner_mask_->rows - 2), + cv::Range(2, corner_mask_->cols - 2)); + cv::Mat kernel(5, 5, CV_8U); + kernel.setTo(1.0); + cv::dilate(dilate_domain, dilate_domain, kernel); + } + corner_values_->setTo(cv::Scalar(0), *corner_mask_); + + // Box filter corner score to diffuse and suppress noise. 
+ cv::boxFilter( + *corner_values_, *corner_filtered_, CV_32F, + cv::Size(blur_options.box_filter_diam(), blur_options.box_filter_diam())); + + // Determine maximum cornerness in robust manner over bins. + // Specular reflections or colored lights might not be filtered out by + // the above operation. + const int max_blocks = 8; + const int block_width = + std::ceil(static_cast(corner_filtered_->cols) / max_blocks); + const int block_height = + std::ceil(static_cast(corner_filtered_->rows) / max_blocks); + std::vector block_maximums; + for (int block_y = 0; block_y < max_blocks; ++block_y) { + if (block_y * block_height >= corner_filtered_->rows) { + continue; + } + + cv::Range y_range(block_y * block_height, min((block_y + 1) * block_height, + corner_filtered_->rows)); + + for (int block_x = 0; block_x < max_blocks; ++block_x) { + if (block_x * block_width >= corner_filtered_->cols) { + continue; + } + + cv::Range x_range(block_x * block_width, min((block_x + 1) * block_width, + corner_filtered_->cols)); + cv::Mat block(*corner_filtered_, y_range, x_range); + double min_val, max_val; + cv::minMaxLoc(block, &min_val, &max_val); + block_maximums.push_back(max_val); + } + } + + auto block_median = + block_maximums.begin() + static_cast(block_maximums.size() * 0.75); + std::nth_element(block_maximums.begin(), block_median, block_maximums.end()); + const float max_val = *block_median; + + const float thresh = + max(blur_options.absolute_cornerness_threshold(), + blur_options.relative_cornerness_threshold() * max_val); + + cv::compare(*corner_filtered_, thresh, *corner_mask_, cv::CMP_GE); +} + +float RegionFlowComputation::ComputeBlurScore(const cv::Mat& input) { + // TODO: Already computed during feature extraction, re-use! + cv::cornerMinEigenVal(input, *corner_values_, 3); + ComputeBlurMask(input, corner_values_.get(), corner_mask_.get()); + + // Compute median corner score over masked area. 
+ std::vector corner_score; + corner_score.reserve(frame_width_ * frame_height_); + for (int i = 0; i < corner_mask_->rows; ++i) { + const uint8_t* mask_ptr = corner_mask_->ptr(i); + const float* corner_ptr = corner_values_->ptr(i); + for (int j = 0; j < corner_mask_->cols; ++j) { + if (mask_ptr[j]) { + corner_score.push_back(corner_ptr[j]); + } + } + } + + const auto& blur_options = options_.blur_score_options(); + std::vector::iterator median_iter = + corner_score.begin() + + static_cast(corner_score.size() * blur_options.median_percentile()); + + float blur_score = 1e10f; + if (median_iter != corner_score.end()) { + std::nth_element(corner_score.begin(), median_iter, corner_score.end()); + if (*median_iter > 1e-10f) { + blur_score = 1.0f / *median_iter; + } + } + + return blur_score; +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/region_flow_computation.h b/mediapipe/util/tracking/region_flow_computation.h new file mode 100644 index 000000000..1d9f1159a --- /dev/null +++ b/mediapipe/util/tracking/region_flow_computation.h @@ -0,0 +1,463 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Computes the RegionFlow for a set of frames. +// Specifically, extracts Harris-like features from each frame, tracks these +// between frames and regularizes the tracked features locally (outlier +// rejection) by leveraging fast per-frame segmentation. 
+// Optionally, features can be assigned to either foreground or background based +// on the computation of the fundamental matrix for a pair of frames, +// +// Basic usage: +// RegionFlowComputation flow_computation(RegionFlowComputationOptions(), +// frame_width, +// frame_height); +// +// std::vector input_images; // Supplied by caller. +// for (int i = 0; i < num_frames; ++i) { +// flow_computation.AddImage(input_images[i]); +// +// // Result is owned by this caller. +// std::unique_ptr result( +// flow_computation.RetrieveRegionFlow()); +// +// // OR +// std::unique_ptr result( +// flow_computation.RetrieveRegionFlowFeatureList( +// true, // Compute feature descriptor. +// false, // no match descriptor for this example. +// &input_images[i], +// nullptr); +// +// // Do your custom processing or pass on to MotionEstimation. +// +// } + +#ifndef MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_COMPUTATION_H_ +#define MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_COMPUTATION_H_ + +#include +#include +#include +#include + +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" +#include "mediapipe/util/tracking/region_flow_computation.pb.h" + +namespace mediapipe { +class RegionFlowFeatureList; +class RegionFlowFrame; +} // namespace mediapipe + +namespace mediapipe { + +struct TrackedFeature; +typedef std::vector TrackedFeatureList; +class MotionAnalysis; + +class RegionFlowComputation { + public: + RegionFlowComputation(const RegionFlowComputationOptions& options, + int frame_width, int frame_height); + virtual ~RegionFlowComputation(); + RegionFlowComputation(const RegionFlowComputation&) = delete; + RegionFlowComputation& operator=(const RegionFlowComputation&) = delete; + + // Performs motion analysis on source w.r.t. to source passed in previous + // call. 
Therefore, first call will compute empty flow. If + // RegionFlowComputationOptions::frame_to_track := ftt > 0, motion analysis + // performed w.r.t. the previous ftt source passed via AddImage. + // Motion analysis uses grid-based regions to enforce locally consistent flow. + // Source is expected to be 3-channel RGB 8bit image (24bit in total), OR + // 1-channel Grayscale 8bit image, compatible with + // RegionFlowComputationOptions::ImageFormat. + // Pass the frame's timestamp to have it stored in the result or zero if not + // needed. + // Returns true on success, false otherwise. + virtual bool AddImage(const cv::Mat& source, int64 timestamp_usec); + + // Same as above, but seed initial feature position in the matching frame + // with initial_transform. + virtual bool AddImageWithSeed(const cv::Mat& source, int64 timestamp_usec, + const Homography& initial_transform); + + // Same as AddImage but also accepts an optional source_mask (pass empty + // cv::Mat to get the same behavior as AddImage). If non-empty, features are + // only extracted in regions where the mask value is > 0. Mask should be 8-bit + // grayscale of the same size as source, unless empty. + virtual bool AddImageWithMask(const cv::Mat& source, + const cv::Mat& source_mask, + int64 timestamp_usec); + + // Call after AddImage* to retrieve last downscaled, grayscale image. + cv::Mat GetGrayscaleFrameFromResults(); + + // Returns result as RegionFlowFrame. Result is owned by caller. + // Will return NULL if called twice without AddImage* call. + virtual RegionFlowFrame* RetrieveRegionFlow(); + + // Returns result as RegionFlowFeatureList. Result is owned by caller. + // Will return NULL if called twice without AddImage* call. + // Computes optionally feature descriptors (if compute_feature_descriptor + // is set, in that case curr_color_image must not be NULL) + // and additionally matching descriptor (if compute_match_descriptor is set, + // in this case prev_color_image must not be NULL). 
+ // Passed images should be in sync with those passed to AddImage, i.e. + // source in AddImage and parameter curr_color_image should refer to the same + // image. + virtual RegionFlowFeatureList* RetrieveRegionFlowFeatureList( + bool compute_feature_descriptor, bool compute_match_descriptor, + const cv::Mat* curr_color_image, // optional. + const cv::Mat* prev_color_image); // optional. + + // Same as above, but returns specific tracked result from current frame C + // to C - track_index - 1. + virtual RegionFlowFeatureList* RetrieveMultiRegionFlowFeatureList( + int track_index, bool compute_feature_descriptor, + bool compute_match_descriptor, + const cv::Mat* curr_color_image, // optional. + const cv::Mat* prev_color_image); // optional. + + // Returns result of a specific RegionFlowFrame in case + // RegionFlowComputationOptions::frames_to_track() > 1. Result is owned by + // caller. + virtual RegionFlowFrame* RetrieveMultiRegionFlow(int frame); + + // Resets computation to ignore any previously added frames. Next frame passed + // via AddImageXXX() routines will be treated as the first frame in the + // sequence. + virtual void Reset(); + + // Creates synthetic tracks with feature points in a grid with zero motion + // w.r.t. prev frame. Points are located at the center of each grid. Step size + // is fractional w.r.t. image size. + static void ZeroMotionGridFeatures(int frame_width, int frame_height, + float frac_grid_step_x, + float frac_grid_step_y, + RegionFlowFeatureList* result); + + // Returns densly sampled motions zero motion features. + // Features are centered in a box of size frac_diameter that is shifted by + // frac_steps_x * frame_width and frac_steps_y * frame_height. 
+ static void DenseZeroMotionSamples(int frame_width, int frame_height, + float frac_diameter, float frac_steps_x, + float frac_steps_y, + RegionFlowFeatureList* result); + + private: + typedef std::vector> + RegionFlowFeatureListVector; + + typedef std::vector TrackedFeatureView; + + // Indexed via grid bin, each bin contains list of its corresponding features. + typedef std::vector TrackedFeatureMap; + + struct FrameTrackingData; + struct LongTrackData; + struct ORBFeatureDescriptors; + + // Implementation function to retrieve the i-th RegionFlowFeatureList + // (specified track_index). Specifically, i-th feature list, denotes the flow + // from the current frame N to the previous frame N - 1 - track_index. + // Casts arguments to cv. + virtual std::unique_ptr + RetrieveRegionFlowFeatureListImpl(int track_index, + bool compute_feature_descriptor, + bool compute_match_descriptor, + const cv::Mat* curr_color_image, + const cv::Mat* prev_color_image); + + // Initializes the FrameTrackingData's members from source and source_mask. + // Returns true on success. + bool InitFrame(const cv::Mat& source, const cv::Mat& source_mask, + FrameTrackingData* data); + + // Adds image to the current buffer and starts tracking. + bool AddImageAndTrack(const cv::Mat& source, const cv::Mat& source_mask, + int64 timestamp_usec, + const Homography& initial_transform); + + // Computes *change* in visual difference between adjacent frames. Normalized + // w.r.t. number of channels and number of pixels. For this to be meaningful + // it is expected that passed FrameTrackingData's are exactly one frame apart + // (CHECKED). + float ComputeVisualConsistency(FrameTrackingData* previous, + FrameTrackingData* current) const; + + // Computes flow regularized based on regions and other options, from frame + // index "from" to index "to", specified relative to current frame, i.e. index + // of current frame = 0, prev frame = -1, next frame = 1, etc. 
Set invert_flow + // to true if the flow should be inverted after tracking. + // Optionally, can input the previous result to link features via ids, + // effectively creating long feature tracks. In this case you usually want to + // request the current result (same as returned in feature_list) in form of + // a TrackedFeatureList. + void ComputeRegionFlow(int from, int to, bool synthetic_tracks, + bool invert_flow, + const TrackedFeatureList* prev_result, // optional. + TrackedFeatureList* curr_result, // optional. + RegionFlowFeatureList* feature_list); + + // Gain corrects input frame w.r.t. reference frame. Returns true iff gain + // correction succeeds. If false, calibrated_frame is left untouched. + bool GainCorrectFrame(const cv::Mat& reference_frame, + const cv::Mat& input_frame, float reference_mean, + float input_mean, cv::Mat* calibrated_frame) const; + + // Feature extraction method. + // Expects as input an image pyramid of gray scale image (each subsequent + // level should be downsampled by a factor of 2 (always rounding up), CHECKED + // against). + // For each level extracts corner features across a grid by considering all + // locations that have a corner response above + // options_.feature_quality_level() * maximum within the grid bin. + // Features with high corner response are output first (but corner response is + // not necessarily monotonic). Feature locations are binned into mask + // (via downscaling by mask_scale), using a 5x5 patch, to discard features + // that are too close to each other. + // Features and corner responses are added to the corresponding vectors in + // data, i.e. passed data is not cleared and expected to be initialized. + virtual void AdaptiveGoodFeaturesToTrack( + const std::vector& extraction_pyramid, int max_features, + float mask_scale, cv::Mat* mask, FrameTrackingData* data); + + // Uses prev_result to remove all features that are not present in data. + // Uses track_ids, i.e.
only works with long feature processing. + void RemoveAbsentFeatures(const TrackedFeatureList& prev_result, + FrameTrackingData* data); + + // Remove features in data that lie outside the feature extraction mask for + // that frame. + void RemoveFeaturesOutsideMask(FrameTrackingData* data); + + // Extracts features for tracking from frame corresponding to data. + // Optionally, may reuse tracked features if available, based on options. + // Optionally, if new features are extracted, can use feature_list to mask out + // feature locations that should not be extracted again. + void ExtractFeatures(const TrackedFeatureList* prev_result, + FrameTrackingData* data); + + // Performs inplace feature selection, by evaluating the range + // [0, data->features.size()] via an Evaluator implementing + // [](int) -> bool. Only feature indices for which eval returns true are kept + // (using in place moves) the remainder is discarded. Applies moves operation + // to FrameTrackingData's feature, track_idx, feature_source_map and + // neighborhoods. Also applies moves to any vector and vector + // that can be optionally supplied. + // Note: All vectors are assumed to of the same size (checked in debug + // mode). + template + int InplaceFeatureSelection(FrameTrackingData* data, + std::vector*> int_vecs, + std::vector*> float_vecs, + const Evaluator& eval); + + // Tracks features between two frames (from -> to). Operates on internal data + // structure FrameTrackingData which stores all frame information relavant for + // tracking. + // + // If gain_correct is true, tracking is carried out between the "from" and the + // gain-corrected "to" image. It is also an output variable indicating whether + // gain correction succeeded or failed. + // + // Updates internal data structure, so any computation can be reused in + // successive calls to feature extraction or tracking. 
+ void TrackFeatures(FrameTrackingData* from_data_ptr, + FrameTrackingData* to_data_ptr, bool* gain_correct, + float* frac_long_features_rejected, + TrackedFeatureList* results); + + // Wide-baseline version of above function, using feature descriptor matching + // instead of tracking. + void WideBaselineMatchFeatures(FrameTrackingData* from_data_ptr, + FrameTrackingData* to_data_ptr, + TrackedFeatureList* results); + + // Fits affine model to TrackedFeatureList via direct call of + // MotionEstimation::EstimateAffineModelIRLS. + AffineModel AffineModelFromFeatures(TrackedFeatureList* features) const; + + // Creates synthetic tracks with feature points in a grid with zero motion + // w.r.t. prev frame. Points are located at the center of each grid. Step size + // is fractional w.r.t. image size. + // Returns minimum distance from border across all features. + static int ZeroMotionGridTracks(int frame_width, int frame_height, + float frac_grid_step_x, + float frac_grid_step_y, + TrackedFeatureList* results); + + // Computes region flow using a rectangular grid of square regions. + void ComputeBlockBasedFlow(TrackedFeatureList* feature_list, + TrackedFeatureView* inlier_features) const; + + // Initializes feature locations for FrameTrackingData at index to, + // from resulting tracks in from. + void InitializeFeatureLocationsFromPreviousResult(int from, int to); + + // Initializes feature locations in "to" from initial transform by applying + // it to every feature of "from". + void InitializeFeatureLocationsFromTransform(int from, int to, + const Homography& transform); + + // Enforces a translational model within each region, only retaining inliers + // that are output to inliers. + void DetermineRegionFlowInliers(const TrackedFeatureMap& region_feature_map, + TrackedFeatureView* inliers) const; + + // Determines number of minimum inliers based on absolute and relative + // thresholds. 
+ int GetMinNumFeatureInliers( + const TrackedFeatureMap& region_feature_map) const; + + // Internal conversion function from a feature list to corresponding frame. + void RegionFlowFeatureListToRegionFlow( + const RegionFlowFeatureList& feature_list, RegionFlowFrame* frame) const; + + // Initializes all members except actual features in a RegionFlowFeatureList. + void InitializeRegionFlowFeatureList( + RegionFlowFeatureList* region_flow_feature_list) const; + + // Converts TrackedFeatureView to RegionFlowFeatureList, flattening over + // all bins. Returns average motion magnitude. + // Optionally TrackedFeature's corresponding to each feature output in + // region_flow_feature_list can be recorded via flattened_feature_list. + float TrackedFeatureViewToRegionFlowFeatureList( + const TrackedFeatureView& region_feature_view, + TrackedFeatureList* flattened_feature_list, + RegionFlowFeatureList* region_flow_feature_list) const; + + // Determines if sufficient (spatially distributed) features are available. + bool HasSufficientFeatures(const RegionFlowFeatureList& feature_list); + + // Returns number of required pyramid levels to track the specified distance. + int PyramidLevelsFromTrackDistance(float track_distance); + + // Returns blur score (inverse of average corner measure) for input image. + // The higher the value the blurrier the frame. + float ComputeBlurScore(const cv::Mat& image); + + // Computes binary mask of pixels, for which the corner score (passed in + // min_eig_vals) can be used as a measure to quantify the amount of blur. + // For pixels not part of the mask the corner score is not a reliable measure + // to quantify blur. For example, discards over-exposed regions and regions + // that do not have sufficient cornerness. + // Note: Modifies the corner values! + void ComputeBlurMask(const cv::Mat& input, cv::Mat* min_eig_vals, + cv::Mat* mask); + + // Appends features in a sorted manner (by pointer location) while discarding + // duplicates.
+ void AppendUniqueFeaturesSorted(const TrackedFeatureView& to_be_added, + TrackedFeatureView* features) const; + + void GetFeatureTrackInliers(bool skip_estimation, + TrackedFeatureList* features, + TrackedFeatureView* inliers) const; + + bool IsVerifyLongFeatures() const { + return long_track_data_ != nullptr && options_.verify_long_features(); + } + + int DownsampleWidth() const { return frame_width_; } + int DownsampleHeight() const { return frame_height_; } + + // Returns 1.0 / scale that is being applied to the features for downscaling. + float DownsampleScale() const { return downsample_scale_; } + + private: + RegionFlowComputationOptions options_; + + // Frame width and height after downsampling. + int frame_width_; + int frame_height_; + + // Number of frames w.r.t each frame is tracked. + int frames_to_track_; + // Maximum length of long feature tracks in frames. + int max_long_track_length_; + + // Original frame width and height. + int original_width_; + int original_height_; + + // Scale and state of downsampling. + float downsample_scale_; + bool use_downsampling_; + + int pyramid_levels_; + int extraction_levels_; + + int frame_num_ = 0; + int max_features_ = 0; + float curr_blur_score_ = 0; + // Moving average of number of features across recently computed tracks. + float curr_num_features_avg_ = 0; + + // Count used to generate unique feature ids. + int feature_count_ = 0; + + // List of RegionFlow frames of size options_.frames_to_track. + RegionFlowFeatureListVector region_flow_results_; + + // Gain adapted version. + std::unique_ptr gain_image_; + std::unique_ptr gain_pyramid_; + + // Temporary buffers. + std::unique_ptr corner_values_; + std::unique_ptr corner_filtered_; + std::unique_ptr corner_mask_; + + std::unique_ptr curr_color_image_; + + // Temporary images for feature extraction. + std::unique_ptr feature_tmp_image_1_; + std::unique_ptr feature_tmp_image_2_; + + std::vector feature_status_; // Indicates if point could be + // tracked. 
+ std::vector feature_track_error_; // Patch-based error. + + // Circular queue to buffer tracking data. + std::deque> data_queue_; + + // Global settings for block based flow. + int block_width_; + int block_height_; + int block_levels_; + + // Stores average flow magnitudes for recently processed frames. + std::deque flow_magnitudes_; + + // Records data for long feature tracks. + std::unique_ptr long_track_data_; + + bool use_cv_tracking_ = false; + + // Counter used for controlling how ofter do we run descriptor extraction. + // Count from 0 to options_.extract_descriptor_every_n_frame() - 1. + // Extract descriptors only when counter == 0. + int cnt_extract_descriptors_ = 0; + + friend class MotionAnalysis; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_COMPUTATION_H_ diff --git a/mediapipe/util/tracking/region_flow_computation.proto b/mediapipe/util/tracking/region_flow_computation.proto new file mode 100644 index 000000000..800e430a6 --- /dev/null +++ b/mediapipe/util/tracking/region_flow_computation.proto @@ -0,0 +1,576 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Options for tracking by pyramidical Lucas-Kanade. Local outlier rejection +// is performed by enforcing translation models over regions (grid-based +// or obtained from segmentation). 
+ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/tone_estimation.proto"; + +// Next tag: 33 +message TrackingOptions { + // Describes direction of flow during feature tracking and for the output + // region flow. + enum FlowDirection { + FORWARD = 1; // Tracks are forward, from frame N-k -> frame N (k > 0). + BACKWARD = 2; // Tracks are backward, from frame N -> frame N-k + // (k > 0). + CONSECUTIVELY = 3; // Try forward and backward tracking consecutively. + // Forward tracking is done first. If the flow is + // stable, i.e. there are sufficient tracked features, + // the region flow computation ends. Otherwise, backward + // tracking is performed. + } + + // Flow direction used internally during tracking features. Forward tracking + // allows reusing tracked features instead of explicitly tracking them in + // every frame, and can therefore be faster. See the reuse_features_XXX + // options below. However, if not reusing features, then it is best to match + // the direction for both internal tracking and output flow, for peformance + // reasons. + optional FlowDirection internal_tracking_direction = 19 [default = BACKWARD]; + + // Direction of flow vectors that are computed and output by calls to retrieve + // region flow, tracked features, etc. Note when this is BACKWARD, then the + // returned flow for frame N contains features tracked *from* frame N to a + // previous frame N-k. When this is FORWARD, the flow for frame N contains + // the flow from features in a previous frame N-k, tracked *to* frame N. + // Note that the output flow direction can only be set to FORWARD or BACKWARD. + optional FlowDirection output_flow_direction = 20 [default = BACKWARD]; + + // Specifies how a feature is tracked w.r.t. previous or next frames + // (dependent on the FlowDirection options above). + // Per default, each frame is tracked w.r.t. a single neighboring frame + // (TRACK_SINGLE_FRAME). 
If associations across multiple frames are desired, + // TRACK_MULTI_FRAME creates multiple results for the current frame, by + // tracking features w.r.t. multiple neighbors. Number of neighbors is + // specified by multi_frames_to_track. + // If long feature tracks are desired (i.e. a track across a frame pair + // that is identified to belong to an earlier known feature), use + // TRACK_ACROSS_FRAMES. Maximum track length can be specified by + // long_tracks_max_frames. + enum TrackingPolicy { + POLICY_SINGLE_FRAME = 1; // Tracks w.r.t. previous or next frame. + POLICY_MULTI_FRAME = 2; // Tracks w.r.t. multiple frames. + POLICY_LONG_TRACKS = 3; // Create long feature tracks. + // Requires internal_tracking_direction to be + // FORWARD. Checked against. + } + + optional TrackingPolicy tracking_policy = 25 [default = POLICY_SINGLE_FRAME]; + + // Number of frame-pairs used for POLICY_MULTI_FRAME, ignored for other + // policies. + // Value of 1 means we are tracking features in the current frame, w.r.t. + // the previous one. Value of 2 denotes tracking of features in current + // w.r.t the previous one and the one before the previous one, etc. + optional int32 multi_frames_to_track = 1 [default = 1]; + + // Maximum length of long feature tracks for POLICY_LONG_TRACKS in frames. + // Note: This maximum is not hard enforced, to avoid that many long + // tracks are dropped at the same time. Instead if a feature reaches + // long_tracks_max_frames * 0.8, it will get dropped with a probability of X, + // where X is calculated, such that 95% of all qualifying features are + // dropped within the interval [.8, 1.2] * long_tracks_max_frames. + optional int32 long_tracks_max_frames = 26 [default = 300]; + + // Hard limit of maximum number of features. Control density of features, with + // min_feature_distance option. This limit is to guarantee that the + // run-time of RegionFlowComputation does not spiral out of control. 
+ optional int32 max_features = 2 [default = 2000]; + + // Specifies the extraction method for features. + enum CornerExtractionMethod { + EXTRACTION_HARRIS = 1; // Using Harris' approximation of + // EXTRACTION_MIN_EIG_VAL. + EXTRACTION_MIN_EIG_VAL = 2; // Exact smallest eigenvalue computation. + EXTRACTION_FAST = 3; // Extract using FAST feature detector. + } + + optional CornerExtractionMethod corner_extraction_method = 27 + [default = EXTRACTION_MIN_EIG_VAL]; + + // Settings for above corner extraction methods. + message MinEigValExtractionSettings { + // Quality level of features (features with + // min_eig_value < quality_level * max_eig_value are rejected). + // Here [min|max]_eig_value denote the minimum and maximum eigen value of + // the auto-correlation matrix of the patch centered at a feature point. The + // ratio of eigenvalues denotes the "cornerness", lower means more + // pronounced corners. + // (see http://en.wikipedia.org/wiki/Harris-Affine for details.) + optional float feature_quality_level = 1 [default = 0.01]; + + // Features below this quality level are always discarded, even if their + // score is above feature_quality_level() * local maximum within that grid + // cell. This prevents us from including very poor features. + optional float adaptive_lowest_quality_level = 2 [default = 8e-5]; + } + + optional MinEigValExtractionSettings min_eig_val_settings = 28; + + message HarrisExtractionSettings { + // Same as in MinEigValExtractionSettings. + optional float feature_quality_level = 1 [default = 2.5e-4]; + + // Note, due to Harris response being negative for some pixels, + // no lowest quality level is enforced. + } + + optional HarrisExtractionSettings harris_settings = 29; + + message FastExtractionSettings { + // threshold on difference between intensity of the central pixel and pixels + // of a circle around this pixel. Empirically, the larger the threshold, the + // fewer the keypoints will be detected. 
+ // Default value set as the same with OpenCV. + optional int32 threshold = 1 [default = 10]; + } + + optional FastExtractionSettings fast_settings = 31; + + optional int32 tracking_window_size = 4 [default = 10]; + + optional int32 tracking_iterations = 5 [default = 10]; + + // Fractional tracking distance w.r.t. to frame diameter d. The number of + // pyramid levels l is chosen such that + // 2^l * tracking_window_size / 2 >= fractional_tracking_distance * d. + // Therefore, theoretically it is guaranteed that objects moving less than + // fractional_tracking_distance * d can be tracked. + optional float fractional_tracking_distance = 6 [default = 0.15]; + + // If set, modifies tracking distance to be 130% of maximum average + // tracking distances of previous frames. + optional bool adaptive_tracking_distance = 24 [default = false]; + + // Minimum feature distance in pixels. Close features are suppressed. If value + // < 1, the distance is computed as a fraction of the frame diameter. + optional float min_feature_distance = 7 [default = 7]; + + // By default, when downscaling by factor x, the minimum feature distance + // is downscaled by a factor of sqrt(x). If set false, no scaling is + // performed. + optional bool distance_downscale_sqrt = 21 [default = true]; + + // Uses grid based extraction of features. Quality level is local within a + // grid cell and results are combined over all cells and multiple scales and + // grid offsets. + // Default option, setting it to false is deprecated and will fail. + optional bool adaptive_good_features_to_track = 8 [default = true]; + + // Size of each grid cell. Values < 1 are interpreted to be relative to + // frame_width_ x frame_height_. + optional float adaptive_features_block_size = 9 [default = .26]; + + // Scales / levels employed for feature extraction. Grid cell size is scaled + // by 0.5 for each level. 
+ optional int32 adaptive_features_levels = 10 [default = 1]; + + // If > 1, feature extraction is carried out at multiple scales by downscaling + // the image repeatedly, extracting features (eigenvalue images) and upscaling + // them. + optional int32 adaptive_extraction_levels = 22 [default = 1]; + + // Alternate way of specifying extraction levels: number of levels is + // automatically computed by downsampling the image until its maximum + // dimension (width or height) reaches this value. Overrides + // adaptive_extraction_levels if > 0. + optional int32 adaptive_extraction_levels_lowest_size = 23 [default = 0]; + + // Grid step-size in fraction of width or height used for creating synthetic + // zero motion tracks with feature points lying on a grid. Can be set based on + // desired number of total features as 1/sqrt(num_features), + // e.g. .04 ~= 1/sqrt(600). + optional float synthetic_zero_motion_grid_step = 13 [default = .04]; + + // If set, uses ORB features with brute force matching and ratio test + // to track frames across larger perspective changes than possible with + // default KLT features. + optional bool wide_baseline_matching = 14 [default = false]; + + // Only brute force matches with + // best_match_distance < ratio_test_threshold * second_best_match_distance + // are retained. + optional float ratio_test_threshold = 15 [default = 0.8]; + + // Refines wide baseline matches by estimating affine transform to + // wide-baseline matches which is used to seed initial positions for KLT + // matches. + optional bool refine_wide_baseline_matches = 16 [default = false]; + + // When tracking features, features tracked from frame A to frame B may be + // reused as the features for frame B when tracking from it (instead of + // extracting features). The max_frame_distance flag limits the distance + // between A and B for the features to be reused. Setting it to 0 => no + // re-use. 
+ optional int32 reuse_features_max_frame_distance = 17 [default = 0]; + + // In conjunction with above, the features are reused in frame B only if they + // are at-least this fraction of the original features in frame A. Otherwise + // they are reset and extracted from scratch. + optional float reuse_features_min_survived_frac = 18 [default = 0.7]; + + // If set uses newer OpenCV tracking algorithm. + // Recommended to be set for all new projects. + optional bool use_cv_tracking_algorithm = 30 [default = true]; + + enum KltTrackerImplementation { + UNSPECIFIED = 0; + KLT_OPENCV = 1; // Use OpenCV's implementation of KLT tracker. + } + + // Implementation choice of KLT tracker. + optional KltTrackerImplementation klt_tracker_implementation = 32 + [default = KLT_OPENCV]; + + // Deprecated fields. + extensions 3, 11, 12; +} + +// Next tag: 67 +message RegionFlowComputationOptions { + optional TrackingOptions tracking_options = 1; + + // Features are binned into grids of different resolutions (see + // fast_estimation_block_size below) and retained if they survive a localized + // translation based RANSAC algorithm and at the survivors are at least of + // size min_feature_inliers. Must be at least 3! + optional int32 min_feature_inliers = 2 [default = 3]; + + // Relative number of inlier features w.r.t. average number of features + // per grid bin. Maximum of both thresholds is used as actual threshold. + optional float relative_min_feature_inliers = 46 [default = 0.2]; + + // Pre-blur before computing features to reduce noise. Set to zero for no + // blurring. + optional float pre_blur_sigma = 33 [default = 0.8]; + + // Number of ransac rounds to estimate per region flow vector. This could be + // adaptive, but the required number of rounds is so low, that estimating + // the bound is more costly than just running it for a fixed number of times. 
+ optional int32 ransac_rounds_per_region = 3 [default = 15]; + + // Error thresholds for a feature to be considered as an inlier in + // pixel-distance. The max of all three thresholds below is used as the actual + // threshold. + // Absolute in pixels. + optional float absolute_inlier_error_threshold = 4 [default = 2]; + // Scaled w.r.t. frame diameter. + optional float frac_inlier_error_threshold = 52 [default = 0]; + // Scaled w.r.t model estimated during each RANSAC round. + optional float relative_inlier_error_threshold = 44 [default = 0.1]; + // Returns for each grid only the top N inlier sets. + optional int32 top_inlier_sets = 45 [default = 2]; + + // For debugging purposes, uses all tracked features regardless of the above + // setting. + optional bool no_estimation_mode = 40 [default = false]; + + // Block size in pixels. If fractional block_size is used (0 < size < 1), + // it is interpreted as fraction of the image dimensions. + // We use 4 blocks in each dimension by standard. + optional float fast_estimation_block_size = 6 [default = .25]; + + // Minimum block size in pixels (larger dimension) to perform fast estimation + // on. Pyramid levels are allocated such that + // block_size * 0.5^(level - 1) = min_block_size. + // At least two levels are used. + optional int32 fast_estimation_min_block_size = 25 [default = 100]; + + // We use overlapping versions of the grid, next parameters specifies how + // many in each dimensions (total is therefore, the value squared!). + optional int32 fast_estimation_overlap_grids = 22 [default = 3]; + + // Flow features with motion above this thresholds (w.r.t. frame diameter) + // are rejected. + optional float max_magnitude_threshold_ratio = 23 [default = 0.2]; + + // Flow features that have a motion that is larger than + // median_magnitude_bounds times the median magnitude are discarded. + // If set to zero, test is not enforced. 
+ optional float median_magnitude_bounds = 51 [default = 0.0]; + + // Determines how irls weights for computed features are initialized. + // In general, more stable features are given higher weight. + enum IrlsInitialization { + INIT_UNIFORM = 1; // All weights equal 1 + + // Feature's irls weight is initialized to a value in [0, 2] + // indicating how consistent the feature's motion is w.r.t. neighboring + // features (high values = very consistent). Determined by counting how + // often a feature is part of the inlier set for a particular bin. + INIT_CONSISTENCY = 2; + } + + // If this option is activated, feature's irls weight is initialized to the + // inverse of its computed flow. + optional IrlsInitialization irls_initialization = 49 + [default = INIT_CONSISTENCY]; + + // We support down-sampling of an incoming frame before running the + // resolution dependent part of the region flow computation (feature + // extraction and tracking if desired). + // Note that in all downsampling modes except for DOWNSAMPLE_TO_INPUT_SIZE, + // for uneven dimensions after downsampling, we always round up to + // the nearest even dimension, i.e. 350p with a downsample_factor of 2.0 + // would expect an input of size 176p. + enum DownsampleMode { + // No downsampling. + DOWNSAMPLE_NONE = 1; + // Downsizes the input frame such that frame_size == downsampling_size, + // where frame_size := max(width, height). + DOWNSAMPLE_TO_MAX_SIZE = 2; + // Downsizes frame by pre-defined factor, downsample_factor below. + DOWNSAMPLE_BY_FACTOR = 3; + // Downsampling based on downsampling schedule, see DownsampleSchedule below + // for details. + DOWNSAMPLE_BY_SCHEDULE = 4; + // Downsizes the input frame such that frame_size == downsampling_size, + // where frame_size := min(width, height). + DOWNSAMPLE_TO_MIN_SIZE = 5; + // Input frame is assumed to be already downsampled by the factor specified + // by downsample_factor below. 
For example if the original frame is 720p, + // and downsample_factor is set to 2.0, then we expect as input 360p. + DOWNSAMPLE_TO_INPUT_SIZE = 6; + } + + optional DownsampleMode downsample_mode = 11 [default = DOWNSAMPLE_NONE]; + + // Specify the size of either dimension here, the frame will be + // downsampled to fit downsampling_size. + optional int32 downsampling_size = 12 [default = 256]; + optional float downsample_factor = 18 [default = 2.0]; + + // If set, we will force the computed downsampling factor to be the nearest + // integer, resulting in faster downsampling. This will have no effect for + // DOWNSAMPLE_TO_INPUT_SIZE, DOWNSAMPLE_BY_FACTOR, and DOWNSAMPLE_BY_SCHEDULE, + // which should have exact values defined. + optional bool round_downsample_factor = 62 [default = false]; + + // Downsampling schedule. Frame sizes up to which a particular downsampling + // factor is applied. Factor chosen by comparing actual frame area against + // standard area (standard_width * standard_height), where standard_width = + // 16/9 X standard_height. + message DownSampleSchedule { + optional float downsample_factor_360p = 1 [default = 1]; // For <= 360p. + optional float downsample_factor_480p = 2 [default = 1]; // For <= 480p. + optional float downsample_factor_720p = 3 [default = 2]; // For <= 720p. + optional float downsample_factor_1080p = 4 [default = 2]; // >= 720p. + } + // Used if downsample_mode is DOWNSAMPLE_BY_SCHEDULE. + optional DownSampleSchedule downsample_schedule = 19; + + // Minimum number of good features that we require to be present. + // Without good features, the estimated motion models will do more harm than + // good, so it is better to use simply the identity transform for this frame, + // and set the flag unstable_models to true in RegionFlow. + optional int32 min_feature_requirement = 13 [default = 20]; + + // We also require features to cover a minimum percentage area of the frame. 
+ // We use downsampling and plot each feature by a 1 in a grid, this is + // equivalent to plotting each feature by a rectangle in the original frame. + optional float min_feature_cover = 14 [default = 0.15]; + + // Grid size for above min feature cover. + optional int32 min_feature_cover_grid = 20 [default = 8]; + + // Computes blur score for each frame. Score is proportional to amount of + // blur present in a frame, i.e. higher scores reflect more blurred frames. + // Note that the score is dependent on the gradient distribution of the image + // content, i.e. the score itself is rather meaningless but needs to be + // compared to scores of neighboring frames. + optional bool compute_blur_score = 17 [default = false]; + + message BlurScoreOptions { + // Blur score is only computed over image regions of high cornerness + // (as blur in any direction will always alter these regions). First, the + // corner image (smallest eigenvalue of 2nd moment matrix) is box filtered, + // and then thresholded. + optional int32 box_filter_diam = 1 [default = 3]; + + // Specifies relative (w.r.t. maximum) and absolute cornerness threshold + // for threshold operation. + optional float relative_cornerness_threshold = 2 [default = 3e-2]; + optional float absolute_cornerness_threshold = 3 [default = 1e-4]; + + // Blur score is defined as 1.0 / <median cornerness>, where + // <median cornerness> is the n-th percentile of the cornerness evaluated + // over the image regions of high cornerness as specified above. + optional float median_percentile = 5 [default = 0.85]; + } + + optional BlurScoreOptions blur_score_options = 31; + + // Determines how/if visual consistency is computed. If activated, + // computes the absolute *change* in visual difference between two adjacent + // frame pairs, i.e. the modulus of the 2nd derivative of the frame + // appearance. Stores result in RegionFlowFeatureList::visual_consistency. + message VisualConsistencyOptions { + // Computation of visual consistency is only performed if activated.
+    optional bool compute_consistency = 1 [default = true];
+
+    // Incoming color or gray scale image is scaled to a tiny square image of
+    // the specified dimension. Used to compare adjacent images via SSD.
+    optional int32 tiny_image_dimension = 2 [default = 20];
+  }
+
+  optional VisualConsistencyOptions visual_consistency_options = 55;
+
+  // Radius of patch descriptor computed during RetrieveRegionFlowFeatureList
+  // call.
+  optional int32 patch_descriptor_radius = 21 [default = 3];
+  // Minimum distance from image border. Must be greater or equal to
+  // patch_descriptor_radius.
+  optional int32 distance_from_border = 50 [default = 3];
+
+  // Corner response is scaled by scalar below and normalized to lie within
+  // [0, 1], where 0 is low corner score and 1 high corner score.
+  optional float corner_response_scale = 26 [default = 1500];
+
+  // Verifies reliability of features, by back-tracking operation from matched
+  // location. If returned location is within verification_distance feature is
+  // accepted otherwise discarded.
+  optional bool verify_features = 27 [default = false];
+  optional float verification_distance = 28 [default = 0.5];
+
+  // If set, consistency of long features is verified (in case tracking_policy
+  // is set to POLICY_LONG_FEATURES) by extracting a patch
+  // around the feature during the very first observation and comparing the
+  // matching patch along the long feature trajectory via SSD. If the
+  // difference is above the long_feature_verification_threshold the feature is
+  // removed.
+  optional bool verify_long_features = 53 [default = true];
+
+  // Maximum average per pixel error (in L1 norm) in the normalized intensity
+  // domain for matching patches to be considered to be consistent.
+  optional float long_feature_verification_threshold = 54 [default = 0.04];
+
+  // Long features are expected to have limited acceleration over time.
+ // If acceleration exceeds specified value based on the setting in + // verify_long_feature_acceleration either: + // a) verify_long_feature_acceleration = false + // A new track is started instead of continuing the old one. + // The track itself is not removed in this case. + // + // b) verify_long_feature_acceleration = true + // The track is flagged for verification, by back-tracking operation from + // matched location. If track fails verification test it is + // discarded. This only triggers if at least + // verify_long_feature_trigger_ratio of features have been flagged, + // otherwise option a is used. + optional float max_long_feature_acceleration = 56 [default = 5.0]; + optional bool verify_long_feature_acceleration = 63 [default = false]; + optional float verify_long_feature_trigger_ratio = 64 [default = 0.0]; + + // If true, histogram equalization is performed to the input image sequence + // before registration. + optional bool histogram_equalization = 57 [default = false]; + + // If true, synthetic region flows with zero motion are used for all (or just + // the first) frame. + optional bool use_synthetic_zero_motion_tracks_all_frames = 34 + [default = false]; + optional bool use_synthetic_zero_motion_tracks_first_frame = 35 + [default = false]; + + // Optional gain correction before tracking features. Improves robustness when + // lighting is changing. + optional bool gain_correction = 36 [default = false]; + + // If set performs gain correction by simply equalizing mean intensity + // between frames, instead of using ToneEstimation. + optional bool fast_gain_correction = 61 [default = false]; + + // If the multiple hypothesis flag is set, features are tracked using both + // with and without gain correction, and the hypothesis with more inliers + // is selected. 
+ optional bool gain_correction_multiple_hypotheses = 47 [default = true]; + + // This flag, when used together with the multiple hypotheses flag, specifies + // that gain correction should increase the number of inliers by at least this + // fraction for it to be used instead of default tracking. + optional float gain_correction_inlier_improvement_frac = 48 [default = 0.1]; + + // If set, always uses the brighter frame as reference. This is the + // preferred direction of correction, to avoid overexposed regions from + // being corrected which leads to spurious matches. + optional bool gain_correction_bright_reference = 59 [default = false]; + + // Only performs gain correction if number of tracked features falls under + // specified ratio (w.r.t. previous frame). + // Set to zero, to always perform gain correction if requested. + optional float gain_correction_triggering_ratio = 60 [default = 0.0]; + + // Gain correction is based on a grid of zero motion features, independent of + // the underlying motion. Fractional parameter specifies resolution of the + // grid w.r.t. frame size. + optional float frac_gain_feature_size = 37 [default = 0.3]; + optional float frac_gain_step = 38 [default = 0.1]; + + enum GainCorrectMode { + GAIN_CORRECT_DEFAULT_USER = 1; // Uses default or user supplied bounds, + // i.e. gain_bias_bounds is left untouched. + GAIN_CORRECT_VIDEO = 2; // Uses defaults for video (most strict). + GAIN_CORRECT_HDR = 3; // Uses most relaxed settings to track + // across HDR frames, taken at different + // exposures. + GAIN_CORRECT_PHOTO_BURST = 4; // More relaxed than video but stricter + // than HDR; use for photo burst where + // exposure between frames can change. + } + + optional GainCorrectMode gain_correct_mode = 41 + [default = GAIN_CORRECT_DEFAULT_USER]; + + // Bounds for the estimated model. If not set externally, will be set + // based on GainCorrectMode. 
+ optional ToneEstimationOptions.GainBiasBounds gain_bias_bounds = 39; + + // Supported image formats. All images are converted to grayscale + // before processing. These image formats only concern AddImage. + // IMPORTANT: All the Retrieve* methods expect RGB when the descriptors + // are computed. + enum ImageFormat { + FORMAT_GRAYSCALE = 1; + FORMAT_RGB = 2; + FORMAT_RGBA = 3; + FORMAT_BGR = 4; + FORMAT_BGRA = 5; + } + + // Image format of the input. + optional ImageFormat image_format = 58 [default = FORMAT_RGB]; + + enum DescriptorExtractorType { + ORB = + 0; // http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.370.4395&rep=rep1&type=pdf + } + + // The descriptor extractor type used. + optional DescriptorExtractorType descriptor_extractor_type = 65 + [default = ORB]; + + // Whether to compute derivatives when building the pyramid. When set to + // true, it's building a Laplacian pyramid. When set to false, it's building + // a Gaussian pyramid. + optional bool compute_derivative_in_pyramid = 66 [default = true]; + + // Deprecated fields. + extensions 5, 7, 8, 9, 10, 15, 16, 24, 29, 30, 32, 42, 43; +} diff --git a/mediapipe/util/tracking/region_flow_computation_test.cc b/mediapipe/util/tracking/region_flow_computation_test.cc new file mode 100644 index 000000000..256cfefbd --- /dev/null +++ b/mediapipe/util/tracking/region_flow_computation_test.cc @@ -0,0 +1,304 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/region_flow_computation.h" + +#include + +#include +#include +#include +#include + +#include "absl/time/clock.h" +#include "mediapipe/framework/deps/file_path.h" +#include "mediapipe/framework/port/commandlineflags.h" +#include "mediapipe/framework/port/file_helpers.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_highgui_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" + +// To ensure that the selected thresholds are robust, it is recommend +// to run this test mutiple times with time seed, if changes are made. +DEFINE_bool(time_seed, false, "Activate to test thresholds"); + +namespace mediapipe { +namespace { + +using RandomEngine = std::mt19937_64; + +struct FlowDirectionParam { + TrackingOptions::FlowDirection internal_direction; + TrackingOptions::FlowDirection output_direction; +}; + +class RegionFlowComputationTest + : public ::testing::TestWithParam { + protected: + virtual void SetUp() { + // Setup tracking direction options based on testing parameter. + const auto& param = GetParam(); + auto* tracking_options = base_options_.mutable_tracking_options(); + tracking_options->set_internal_tracking_direction(param.internal_direction); + tracking_options->set_output_flow_direction(param.output_direction); + + // Load bee image. 
+ data_dir_ = file::JoinPath("./", "/mediapipe/util/tracking/testdata/"); + + std::string png_data; + MEDIAPIPE_CHECK_OK( + file::GetContents(data_dir_ + "stabilize_test.png", &png_data)); + std::vector buffer(png_data.begin(), png_data.end()); + original_frame_ = cv::imdecode(cv::Mat(buffer), 1); + ASSERT_FALSE(original_frame_.empty()); + ASSERT_EQ(original_frame_.type(), CV_8UC3); + } + + // Creates a movie in the specified format by displacing original_frame_ to + // random positions. + void MakeMovie(int frames, RegionFlowComputationOptions::ImageFormat format, + std::vector* movie, + std::vector* positions); + + // Outputs allocated resized input frame. + void GetResizedFrame(int width, int height, cv::Mat* result) const; + + // Runs frame pair test using RGB, RGBA or grayscale input. + void RunFramePairTest(RegionFlowComputationOptions::ImageFormat format); + + protected: + RegionFlowComputationOptions base_options_; + + private: + std::string data_dir_; + cv::Mat original_frame_; +}; + +std::vector FlowDirectionCombinations() { + return {{TrackingOptions::FORWARD, TrackingOptions::FORWARD}, + {TrackingOptions::FORWARD, TrackingOptions::BACKWARD}, + {TrackingOptions::BACKWARD, TrackingOptions::FORWARD}, + {TrackingOptions::BACKWARD, TrackingOptions::BACKWARD}}; +} + +INSTANTIATE_TEST_SUITE_P(FlowDirection, RegionFlowComputationTest, + ::testing::ValuesIn(FlowDirectionCombinations())); + +void RegionFlowComputationTest::MakeMovie( + int num_frames, RegionFlowComputationOptions::ImageFormat format, + std::vector* movie, std::vector* positions) { + CHECK(positions != nullptr); + CHECK(movie != nullptr); + + const int border = 40; + int frame_width = original_frame_.cols - 2 * border; + int frame_height = original_frame_.rows - 2 * border; + ASSERT_GT(frame_width, 0); + ASSERT_GT(frame_height, 0); + + // First generate random positions. + int seed = 900913; // google. 
+ if (FLAGS_time_seed) { + seed = ToUnixMillis(absl::Now()) % (1 << 16); + LOG(INFO) << "Using time seed: " << seed; + } + + RandomEngine random(seed); + std::uniform_int_distribution<> uniform_dist(-10, 10); + positions->resize(num_frames); + (*positions)[0] = Vector2_f(border, border); + for (int f = 1; f < num_frames; ++f) { + Vector2_f pos = (*positions)[f - 1] + + Vector2_f(uniform_dist(random), uniform_dist(random)); + + // Clamp to valid positions. + pos.x(std::max(0, pos.x())); + pos.y(std::max(0, pos.y())); + pos.x(std::min(2 * border, pos.x())); + pos.y(std::min(2 * border, pos.y())); + + (*positions)[f] = pos; + } + + // Create movie by copying. + movie->resize(num_frames); + cv::Mat original_frame = original_frame_; + + auto convert = [&](int channel_format, int conversion_code) { + original_frame.create(original_frame_.rows, original_frame_.cols, + channel_format); + cv::cvtColor(original_frame_, original_frame, conversion_code); + }; + + switch (format) { + case RegionFlowComputationOptions::FORMAT_RGB: + break; + + case RegionFlowComputationOptions::FORMAT_BGR: + convert(CV_8UC3, CV_RGB2BGR); + break; + + case RegionFlowComputationOptions::FORMAT_GRAYSCALE: + convert(CV_8UC1, CV_RGB2GRAY); + break; + + case RegionFlowComputationOptions::FORMAT_RGBA: + convert(CV_8UC4, CV_RGB2RGBA); + break; + + case RegionFlowComputationOptions::FORMAT_BGRA: + convert(CV_8UC4, CV_RGB2BGRA); + break; + } + for (int f = 0; f < num_frames; ++f) { + (*movie)[f].create(frame_height, frame_width, original_frame.type()); + const auto& pos = (*positions)[f]; + cv::Mat tmp_view(original_frame, cv::Range(pos.y(), pos.y() + frame_height), + cv::Range(pos.x(), pos.x() + frame_width)); + tmp_view.copyTo((*movie)[f]); + } +} + +void RegionFlowComputationTest::GetResizedFrame(int width, int height, + cv::Mat* result) const { + CHECK(result != nullptr); + cv::resize(original_frame_, *result, cv::Size(width, height)); +} + +void RegionFlowComputationTest::RunFramePairTest( + 
RegionFlowComputationOptions::ImageFormat format) { + std::vector movie; + std::vector positions; + const int num_frames = 10; + MakeMovie(num_frames, format, &movie, &positions); + + const int frame_width = movie[0].cols; + const int frame_height = movie[0].rows; + + base_options_.set_image_format(format); + + RegionFlowComputation flow_computation(base_options_, frame_width, + frame_height); + + for (int i = 0; i < num_frames; ++i) { + flow_computation.AddImage(movie[i], 0); + + if (i > 0) { + float inliers = 0; + std::unique_ptr region_flow_frame( + flow_computation.RetrieveRegionFlow()); + // Get flow vector based on actual motion applied to frames. Direction + // based on output flow direction. + Vector2_f flow_vector; + switch (base_options_.tracking_options().output_flow_direction()) { + case TrackingOptions::BACKWARD: + flow_vector = positions[i] - positions[i - 1]; + break; + case TrackingOptions::FORWARD: + flow_vector = positions[i - 1] - positions[i]; + break; + case TrackingOptions::CONSECUTIVELY: + FAIL(); + break; + } + + // We expect all flow vectors to be similar to flow_vector. + for (const auto& region_flow : region_flow_frame->region_flow()) { + for (const auto& feature : region_flow.feature()) { + // Half a pixel error is very reasonable. + if (fabs(flow_vector.x() - FeatureFlow(feature).x()) < 0.5f && + fabs(flow_vector.y() - FeatureFlow(feature).y()) < 0.5f) { + ++inliers; + } + } + } + // 95% of all features should be tracked reliably. + EXPECT_GE(inliers / region_flow_frame->num_total_features(), 0.95f); + } + } +} + +TEST_P(RegionFlowComputationTest, FramePairTest) { + // The output flow direction should be either forward or backward. + EXPECT_NE(base_options_.tracking_options().output_flow_direction(), + TrackingOptions::CONSECUTIVELY); + // Test on grayscale input. + RunFramePairTest(RegionFlowComputationOptions::FORMAT_GRAYSCALE); + // Test on RGB input. 
+ RunFramePairTest(RegionFlowComputationOptions::FORMAT_RGB); + // Test on BGR input. + RunFramePairTest(RegionFlowComputationOptions::FORMAT_BGR); + // Test on RGBA input. + RunFramePairTest(RegionFlowComputationOptions::FORMAT_RGBA); + // Test on BGRA input. + RunFramePairTest(RegionFlowComputationOptions::FORMAT_BGRA); +} + +TEST_P(RegionFlowComputationTest, ResolutionTests) { + // Test all kinds of resolutions (disregard resulting flow). + // Square test, synthetic tracks. + for (int dim = 1; dim <= 50; ++dim) { + RegionFlowComputationOptions options = base_options_; + // Force synthetic features, as images are too small to track anything. + options.set_use_synthetic_zero_motion_tracks_all_frames(true); + RegionFlowComputation flow_computation(options, dim, dim); + + cv::Mat input_frame; + GetResizedFrame(dim, dim, &input_frame); + // Add frame several times. + for (int i = 0; i < 5; ++i) { + flow_computation.AddImage(input_frame, 0); + + // Don't care about the result here, simply test for segfaults. + delete flow_computation.RetrieveRegionFlow(); + } + } + + // Larger frames with tracking. + for (int dim = 50; dim <= 100; ++dim) { + RegionFlowComputation flow_computation(base_options_, dim, dim); + + cv::Mat input_frame; + GetResizedFrame(dim, dim, &input_frame); + for (int i = 0; i < 5; ++i) { + flow_computation.AddImage(input_frame, 0); + delete flow_computation.RetrieveRegionFlow(); + } + } + + // Different aspect ratios, first frame synthetic only. 
+ for (int y = 1; y <= 50; y += 3) { + for (int x = 1; x <= 100; x += 7) { + RegionFlowComputationOptions options = base_options_; + options.set_use_synthetic_zero_motion_tracks_first_frame(true); + RegionFlowComputation flow_computation(options, x, y); + + cv::Mat input_frame; + GetResizedFrame(x, y, &input_frame); + for (int i = 0; i < 5; ++i) { + flow_computation.AddImage(input_frame, 0); + delete flow_computation.RetrieveRegionFlow(); + } + } + } +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/util/tracking/region_flow_visualization.cc b/mediapipe/util/tracking/region_flow_visualization.cc new file mode 100644 index 000000000..8d36b92a5 --- /dev/null +++ b/mediapipe/util/tracking/region_flow_visualization.cc @@ -0,0 +1,196 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/region_flow_visualization.h" + +#include + +#include +#include + +#include "absl/strings/str_cat.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/parallel_invoker.h" +#include "mediapipe/util/tracking/region_flow.h" + +namespace mediapipe { + +void VisualizeRegionFlowImpl(const RegionFlowFrame& region_flow_frame, + cv::Mat* output) { + // Just draw a green line for each feature. 
+ for (const auto& region_flow : region_flow_frame.region_flow()) { + cv::Scalar color(0, 255, 0); + + for (const auto& feature : region_flow.feature()) { + cv::Point p1(static_cast(feature.x() + 0.5f), + static_cast(feature.y() + 0.5f)); + + cv::Point p2(static_cast(feature.x() + feature.dx() + 0.5f), + static_cast(feature.y() + feature.dy() + 0.5f)); + + cv::line(*output, p1, p2, color); + } + } +} + +void VisualizeRegionFlow(const RegionFlowFrame& region_flow_frame, + cv::Mat* output) { + CHECK(output); + VisualizeRegionFlowImpl(region_flow_frame, output); +} + +void VisualizeRegionFlowFeaturesImpl(const RegionFlowFeatureList& feature_list, + const cv::Scalar& color, + const cv::Scalar& outlier, + bool irls_visualization, float scale_x, + float scale_y, cv::Mat* output) { + const float kLineScaleOneRows = 1080.0; + const int line_size = + std::max(std::min(output->rows / kLineScaleOneRows, 4), 1); + + const float text_scale = std::max(output->rows, output->cols) * 3e-4; + for (const auto& feature : feature_list.feature()) { + Vector2_i v1 = FeatureIntLocation(feature); + Vector2_i v2 = FeatureMatchIntLocation(feature); + cv::Point p1(v1.x() * scale_x, v1.y() * scale_y); + cv::Point p2(v2.x() * scale_x, v2.y() * scale_y); + + float alpha = 0; + if (irls_visualization) { + if (feature.irls_weight() < 0) { + continue; + } + + if (feature.irls_weight() == 0.0f) { + // Ignored feature, draw as circle. + cv::circle(*output, p1, 4.0 * line_size, outlier, 2 * line_size); + continue; + } + + // Scale irls_weight, such that error of one pixel equals alpha of 0.5 + // (evenly mix inlier and outlier color). + alpha = std::min(1.0f, feature.irls_weight() * 0.5f); + + } else { + const float feature_stdev_l1 = + PatchDescriptorColorStdevL1(feature.feature_descriptor()); + if (feature_stdev_l1 >= 0.0f) { + // Normalize w.r.t. 
maximum per channel standard deviation (128.0) + // and scale (scale could be user-defined and was chosen based on some + // test videos such that only very low textured features are colored + // with outlier color). + alpha = std::min(1.0f, feature_stdev_l1 / 128.f * 6.f); + } else { + alpha = 0.5; // Dummy value indicating, descriptor is not present. + } + } + +#ifdef __APPLE__ + cv::Scalar color_scaled(color.mul(alpha) + outlier.mul(1.0f - alpha)); +#else + cv::Scalar color_scaled(color * alpha + outlier * (1.0f - alpha)); +#endif + cv::line(*output, p1, p2, color_scaled, line_size, CV_AA); + cv::circle(*output, p1, 2.0 * line_size, color_scaled, line_size); + + if (feature.has_label()) { + cv::putText(*output, absl::StrCat(" ", feature.label()), p1, + cv::FONT_HERSHEY_SIMPLEX, text_scale, color_scaled, + 3.0 * text_scale, CV_AA); + } + } +} + +void VisualizeRegionFlowFeatures(const RegionFlowFeatureList& feature_list, + const cv::Scalar& color, + const cv::Scalar& outlier, + bool irls_visualization, float scale_x, + float scale_y, cv::Mat* output) { + CHECK(output); + VisualizeRegionFlowFeaturesImpl(feature_list, color, outlier, + irls_visualization, scale_x, scale_y, output); +} + +void VisualizeLongFeatureStreamImpl(const LongFeatureStream& stream, + const cv::Scalar& color, + const cv::Scalar& outlier, + int min_track_length, + int max_points_per_track, float scale_x, + float scale_y, cv::Mat* output) { + const float text_scale = std::max(output->cols, output->rows) * 3e-4; + for (const auto& track : stream) { + std::vector pts; + std::vector irls_weights; + stream.FlattenTrack(track.second, &pts, &irls_weights, nullptr); + + if (min_track_length > 0 && pts.size() < min_track_length) { + continue; + } + CHECK_GT(pts.size(), 1); // Should have at least two points per track. + + // Tracks are ordered with oldest point first, most recent one last. + const int start_k = + max_points_per_track > 0 + ? 
std::max(0, pts.size() - max_points_per_track) + : 0; + // Draw points in last to first order. + for (int k = start_k; k < pts.size(); ++k) { + cv::Point p1(pts[k].x() * scale_x, pts[k].y() * scale_y); + + if (irls_weights[k] == 0.0f) { + // Ignored feature, draw as circle. + cv::circle(*output, p1, 4.0, outlier, 2); + continue; + } + + const float alpha = std::min(1.0f, irls_weights[k] * 0.5f); +#ifdef __APPLE__ + cv::Scalar color_scaled(color.mul(alpha) + outlier.mul(1.0f - alpha)); +#else + cv::Scalar color_scaled(color * alpha + outlier * (1.0f - alpha)); +#endif + + if (k < pts.size() - 1) { + // Draw line connecting points. + cv::Point p2(pts[k + 1].x() * scale_x, pts[k + 1].y() * scale_y); + cv::line(*output, p1, p2, color_scaled, 1.0, CV_AA); + } + if (k + 1 == pts.size()) { // Last iteration. + cv::circle(*output, p1, 2.0, color_scaled, 1.0); + + const RegionFlowFeature& latest_feature = track.second.back(); + if (latest_feature.has_label()) { + cv::putText(*output, absl::StrCat(" ", latest_feature.label()), p1, + cv::FONT_HERSHEY_SIMPLEX, text_scale, color_scaled, + 3 * text_scale, CV_AA); + } + } + } + } +} + +void VisualizeLongFeatureStream(const LongFeatureStream& stream, + const cv::Scalar& color, + const cv::Scalar& outlier, int min_track_length, + int max_points_per_track, float scale_x, + float scale_y, cv::Mat* output) { + CHECK(output); + VisualizeLongFeatureStreamImpl(stream, color, outlier, min_track_length, + + max_points_per_track, scale_x, scale_y, + output); +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/region_flow_visualization.h b/mediapipe/util/tracking/region_flow_visualization.h new file mode 100644 index 000000000..b28ce9ca8 --- /dev/null +++ b/mediapipe/util/tracking/region_flow_visualization.h @@ -0,0 +1,78 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Small helper function for RegionFlow Visualization. +#ifndef MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_VISUALIZATION_H_ +#define MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_VISUALIZATION_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/region_flow.pb.h" // NOLINT + +namespace mediapipe { + +typedef RegionFlowFrame::RegionFlow RegionFlow; +typedef std::vector RegionFlowFeatureView; + +// Visualizes each tracked feature by a line connecting original and tracked +// points. Uses red for foreground features and blue for background. +// Output is expected to be a VALID 3 channel 8 bit image, clipping of lines +// is performed if points are out of bound. Image should be allocated with the +// same size that was used to compute flow_frame. +void VisualizeRegionFlow(const RegionFlowFrame& region_flow_frame, + cv::Mat* output); + +// Visualizes tracked features by lines connecting original and tracked +// points. Image is expected to be a VALID 3 channel 8 bit image, +// clipping of lines is performed if points are out of bound. +// For color, use for example cv::Scalar(0, 255, 0). If outlier_color != color, +// each feature is colored by linearly blending the two colors w.r.t. 
its +// irls_weight (if irls_visualization is set), otherwise feature descriptor +// patch variance is used for visualization (color for high variance, +// outlier_color for low variance). +// Parameters scale_x and scale_y allow for scaling features and matches +// uniformly before plotting to output. +// TODO: Retire irls_visualization. +void VisualizeRegionFlowFeatures(const RegionFlowFeatureList& feature_list, + const cv::Scalar& color, + const cv::Scalar& outlier, + bool irls_visualization, + float scale_x, // use 1.0 for no scaling + float scale_y, // ditto. + cv::Mat* output); + +// Similar to above, visualizes tracks as lines over time. +// Tracks with length smaller than min_track_length are not visualized. Track +// visualization is limited to max_points_per_track. +class LongFeatureStream; + +void VisualizeLongFeatureStream(const LongFeatureStream& stream, + const cv::Scalar& color, + const cv::Scalar& outlier, int min_track_length, + int max_points_per_track, float scale_x, + float scale_y, cv::Mat* output); + +} // namespace mediapipe +#endif // MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_VISUALIZATION_H_ diff --git a/mediapipe/util/tracking/streaming_buffer.cc b/mediapipe/util/tracking/streaming_buffer.cc new file mode 100644 index 000000000..e089c367f --- /dev/null +++ b/mediapipe/util/tracking/streaming_buffer.cc @@ -0,0 +1,149 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "mediapipe/util/tracking/streaming_buffer.h"
+
+#include "absl/strings/str_cat.h"
+
+namespace mediapipe {
+
+StreamingBuffer::StreamingBuffer(
+    const std::vector<TaggedType>& data_configuration, int overlap)
+    : overlap_(overlap) {
+  CHECK_GE(overlap, 0);
+  for (auto& item : data_configuration) {
+    CHECK(data_config_.find(item.first) == data_config_.end())
+        << "Tag " << item.first << " already exists";
+    data_config_[item.first] = item.second;
+    // Init deque.
+    data_[item.first].clear();
+  }
+}
+
+bool StreamingBuffer::HasTag(const std::string& tag) const {
+  return data_config_.find(tag) != data_config_.end();
+}
+
+bool StreamingBuffer::HasTags(const std::vector<std::string>& tags) const {
+  for (const auto& tag : tags) {
+    if (!HasTag(tag)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+int StreamingBuffer::BufferSize(const std::string& tag) const {
+  CHECK(HasTag(tag));
+  return data_.find(tag)->second.size();
+}
+
+int StreamingBuffer::MaxBufferSize() const {
+  int max_buffer = 0;
+  for (const auto& elem : data_) {
+    max_buffer = std::max(max_buffer, BufferSize(elem.first));
+  }
+  return max_buffer;
+}
+
+bool StreamingBuffer::HaveEqualSize(
+    const std::vector<std::string>& tags) const {
+  if (tags.size() < 2) {
+    return true;
+  }
+  int first_size = BufferSize(tags[0]);
+  for (int k = 1; k < tags.size(); ++k) {
+    // BUGFIX: compare every tag's size (previously re-checked tags[1] only,
+    // so mismatches in tags[2..] were never detected).
+    if (BufferSize(tags[k]) != first_size) {
+      return false;
+    }
+  }
+  return true;
+}
+
+std::vector<std::string> StreamingBuffer::AllTags() const {
+  std::vector<std::string> all_tags;
+  for (auto& item : data_config_) {
+    all_tags.push_back(item.first);
+  }
+  return all_tags;
+}
+
+bool StreamingBuffer::TruncateBuffer(bool flush) {
+  // Only truncate if sufficient elements have been buffered.
+  const int elems_to_clear =
+      std::max(0, MaxBufferSize() - (flush ? 0 : overlap_));
+
+  if (elems_to_clear == 0) {
+    return true;
+  }
+
+  bool is_consistent = true;
+  for (auto& item : data_) {
+    auto& buffer = item.second;
+    const int buffer_elems_to_clear =
+        std::min<int>(elems_to_clear, buffer.size());
+    if (buffer_elems_to_clear < elems_to_clear) {
+      LOG(WARNING) << "For tag " << item.first << " got "
+                   << elems_to_clear - buffer_elems_to_clear
+                   << " fewer elements than buffer can hold.";
+      is_consistent = false;
+    }
+    buffer.erase(buffer.begin(), buffer.begin() + buffer_elems_to_clear);
+  }
+
+  first_frame_index_ += elems_to_clear;
+
+  const int remaining_elems = flush ? 0 : overlap_;
+  // Iterate by const reference; copying would duplicate each buffered deque.
+  for (const auto& item : data_) {
+    const auto& buffer = item.second;
+    if (buffer.size() != remaining_elems) {
+      LOG(WARNING) << "After truncation, for tag " << item.first << " got "
+                   << buffer.size() << " elements, "
+                   << "expected " << remaining_elems;
+      is_consistent = false;
+    }
+  }
+
+  return is_consistent;
+}
+
+void StreamingBuffer::DiscardDatum(const std::string& tag, int num_frames) {
+  CHECK(HasTag(tag));
+  auto& queue = data_[tag];
+  if (queue.empty()) {
+    return;
+  }
+  queue.erase(queue.begin(),
+              queue.begin() + std::min<int>(queue.size(), num_frames));
+}
+
+void StreamingBuffer::DiscardDatumFromEnd(const std::string& tag,
+                                          int num_frames) {
+  CHECK(HasTag(tag));
+  auto& queue = data_[tag];
+  if (queue.empty()) {
+    return;
+  }
+  queue.erase(queue.end() - std::min<int>(queue.size(), num_frames),
+              queue.end());
+}
+
+void StreamingBuffer::DiscardData(const std::vector<std::string>& tags,
+                                  int num_frames) {
+  for (const std::string& tag : tags) {
+    DiscardDatum(tag, num_frames);
+  }
+}
+
+}  // namespace mediapipe
diff --git a/mediapipe/util/tracking/streaming_buffer.h b/mediapipe/util/tracking/streaming_buffer.h
new file mode 100644
index 000000000..c2be024f4
--- /dev/null
+++ b/mediapipe/util/tracking/streaming_buffer.h
@@ -0,0 +1,519 @@
+// Copyright 2019 The MediaPipe Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_STREAMING_BUFFER_H_ +#define MEDIAPIPE_UTIL_TRACKING_STREAMING_BUFFER_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/node_hash_map.h" +#include "absl/types/any.h" +#include "mediapipe/framework/port/logging.h" + +namespace mediapipe { + +// Streaming buffer to store arbitrary data over a chunk of frames with overlap +// between chunks. +// Useful to compute solutions that require as input buffered inputs I_ij, of +// type T_j for all frames i of a chunk. Output solutions S_ik of type T_k +// for each frame i (T_k is not associated with input types T_i in any way) +// can than be stored in the buffer as well and made available to the next +// chunk. +// After solution S_ik has been computed, buffered results (I_ij, S_ik) can be +// output and buffer is truncated to discard all elements minus the overlap. +// Remaining elements (I_ij, S_ik) form the basis for the next chunk. +// Again, arbitrary input data I_ij is buffered until buffer is full. +// However, previous solutions S_ik from the overlap still remain in the buffer +// and can be used to compute new solutions in a temporally constrained manner. + +// Detailed usage example: +// Setup streaming buffer, that will buffer cv::Mat's and AffineModel's over a +// set of 100 frames. 
Some algorithm will then compute for each input pair the +// foreground saliency in the form of a SaliencyPointList proto. +// +// // Prepare streaming buffer with 2 inputs and one solution, i.e. 3 different +// // data types. +// vector data_config = { +// TaggedPointerType("frame"), +// TaggedPointerType("motion"), +// TaggedPointerType("saliency") }; +// +// // Create new buffer with overlap of 10 frames between chunks. +// StreamingBuffer streaming_buffer(data_config, 10); +// +// // Buffer inputs. +// for (int k = 0; k < 100; ++k) { +// std::unique_ptr input_frame = ... +// std::unique_ptr affine_model = ... +// streaming_buffer.AddDatum("frame", std::move(input_frame)); +// streaming_buffer.AddDatum("motion", std::move(affine_model); +// // OR: +// streaming_buffer.AddData({"frame", "motion"}, +// std::move(input_frame), +// std::move(affine_model)); +// // Note: WrapUnique from util::gtl is imported into this namespace when +// // including this file. +// +// // Get maximum buffer size. +// int buffer_size = streaming_buffer.MaxBufferSize("frame"); +// +// // Reached chunk boundary? +// if (buffer_size == 100) { +// // Check that we buffered one frame for each motion. +// CHECK(streaming_buffer.HaveEqualSize({"frame", "motion"})); +// +// // Compute saliency. +// for (int k = 0; k < 100; ++k) { +// cv::Mat& frame = streaming_buffer.GetDatum("frame", k); +// AffineModel* model = streaming_buffer.GetMutableDatum( +// "motion", k); +// +// // OR: +// std::tuple data = +// streaming_buffer.GetMutableData({"frame", "motion"}, k); +// +// // Resulting saliency. +// std::unique_ptr saliency(new SaliencyPointList); +// +// // Some Computation using frame and model from above ... +// +// // Store computed saliency. +// streaming_buffer.AddDatum("saliency", saliency.release()); +// } +// +// // Output elements, transfers ownership. 
+// streaming_buffer.OutputDatum( +// false /*is_flush*/, "frame", [](int frame_idx, +// std::unique_ptr frame) { +// }); +// +// // OR: +// streaming_buffer.OutputData(/*is_flush*/, {"frame", "motion", "saliency"}, +// [](int frame_idx, +// std::unique_ptr frame, +// std::unique_ptr model, +// std::unique_ptr saliency) { +// }); +// +// // Truncate buffers for next chunk. +// streaming_buffer.TruncateBuffer(false /* is_flush */); +// +// // End chunk boundary processing. +// } + +// Stores pair (tag, TypeId of type). +typedef std::pair TaggedType; + +// Returns TaggedType for type T* tagged with passed std::string. +template +TaggedType TaggedPointerType(const std::string& tag); + +// Helper function to create unique_ptr's until std::make_unique is allowed. +template +std::unique_ptr MakeUnique(T* ptr); + +template +inline size_t TypeId() { + static_assert(sizeof(char*) <= sizeof(size_t), + "ptr size too large for size_t"); + static char dummy_var; + return reinterpret_cast(&dummy_var); +} + +// Note: If any of the function below are called with a tag not registered by +// the constructor, the function will fail with CHECK. +// Also, if any of the functions below is called with an existing tag but +// incompatible type, the function will fail with CHECK. +class StreamingBuffer { + public: + // Constructs a new buffer with passed mappings (TAG_NAME, DATA_TYPE) of + // maximum size buffer_size. + // Data_configuration must have unique tag for each type. + StreamingBuffer(const std::vector& data_configuration, + int overlap); + + // Call will transfer ownership to StreamingBuffer. + // Returns true if datum was successfully stored, false otherwise. + template + void AddDatum(const std::string& tag, std::unique_ptr pointer); + + // Same as above but forwards pointer T* to a unique_ptr. + // Transfers ownership. + template + void EmplaceDatum(const std::string& tag, T* pointer); + + // Creates a deep copy and stores it in the StreamingBuffer. 
+ template + void AddDatumCopy(const std::string& tag, const T& datum); + + // Convenience function to store multiple pointers at once. + // Returns true if all data was successfully stored, false if at least one + // failed. + template + void AddData(const std::vector& tags, + std::unique_ptr... pointers); + + // Convenience function to add a whole vector of objects to the buffer. + // For each datum a copy will be created. + template + void AddDatumVector(const std::string& tag, const std::vector& datum_vec); + + // Retrieves datum with specified tag and frame index. Returns nullptr if + // datum does not exist. + template + const T* GetDatum(const std::string& tag, int frame_index) const; + + // Gets a reference on the datum. + template + T& GetDatumRef(const std::string& tag, int frame_index) const; + + // Same as above for mutable pointer. + template + T* GetMutableDatum(const std::string& tag, int frame_index) const; + + // Access all elements for a tag as vector, recommended usage: + // for (auto ptr : buffer.GetDatumVector("some_tag")) { ... } + template + std::vector GetDatumVector(const std::string& tag) const; + template + std::vector GetMutableDatumVector(const std::string& tag) const; + + // Gets a vector of references. + template + std::vector> GetReferenceVector( + const std::string& tag) const; + + template + std::vector> GetConstReferenceVector( + const std::string& tag) const; + + // Returns number of buffered inputs for specified tag. + int BufferSize(const std::string& tag) const; + + // Returns maximum over all tags. + int MaxBufferSize() const; + + // Returns true if the buffers for all passed tags have equal size. + // Call with HaveEqualSize(AllTags()) to check if all buffers have equal size. + bool HaveEqualSize(const std::vector& tags) const; + + // Check if all items buffered for the specified tag are initialized (i.e. not + // equal to nullptr). + template + bool IsInitialized(const std::string& tag) const; + + // Returns all tags. 
+ std::vector AllTags() const; + + // Output function to be called after output values have been computed from + // input values and stored in the buffer as well. + // Use to transfer ownership for buffered content out of the streaming buffer, + // by iteratively calling an output functor for each frame. + // + // Specifically, functor is repeatedly called with: + // void operator()(int frame_index, std::unique_ptr pointer) + // + // If flush is set, functor will be called with all frames in the half-open + // interval [0, MaxBufferSize()), otherwise interval is limited to + // [0, MaxBufferSize() - overlap). + // Note that the interval is independent of the tag, i.e. for consistency + // the interval is constant for all tags (if items are not buffered or + // exists a nullptr is passed instead). + // Ownership is transferred to caller via unique_ptr. + // Element stored in buffer is reset to nullptr. + // Note: Does not truncate the actual buffer, use TruncateBuffer after + // outputting all desired data. + template + void OutputDatum(bool flush, const std::string& tag, const Functor& functor); + + // Same as above for multiple elements. + // Functor is repeatedly called with: + // void operator()(int frame_index, std::unique_ptr... pointers) + template + void OutputData(bool flush, const std::vector& tags, + const Functor& functor); + + // Releases and returns the input at the specified tag and frame_index. + // Returns nullptr if datum does not exist. + // Element stored in buffer is reset to nullptr. + // It is recommended that above OutputFunctions are used instead. + template + std::unique_ptr ReleaseDatum(const std::string& tag, int frame_index); + + // Truncates the buffer by discarding all its elements within the interval + // [0, MaxBufferSize() - overlap] if flush is set to false, or + // [0, MaxBufferSize()] otherwise.
+ // Should be called after chunk boundary has been reached, computation of + // outputs from inputs is done and data has been Output via + // Output[Datum|Data]. + // Returns true if each element within the truncated interval exists, + // and all buffers have the same number of remaining elements + // (#overlap if flush is false, zero otherwise). + bool TruncateBuffer(bool flush); + + // Discards first num_frames of data for specified tag. + // This is different from ReleaseDatum, as it will shrink the buffer for + // the specified tag. + void DiscardDatum(const std::string& tag, int num_frames); + + // Same as above, but removes num_frames items from the end of the buffer. + void DiscardDatumFromEnd(const std::string& tag, int num_frames); + + // Same as above for a list of tags. + void DiscardData(const std::vector& tags, int num_frames); + + // Returns true if tag exists. + bool HasTag(const std::string& tag) const; + // Returns true if all passed tags exist. + bool HasTags(const std::vector& tags) const; + + // Returns frame index of the first item in the buffer. This is useful to + // determine how many frames in total went through the buffer after several + // *successful* calls of Truncate(). + int FirstFrameIndex() const { return first_frame_index_; } + + template + using PointerType = std::shared_ptr>; + template + PointerType CreatePointer(T* t); + + private: + // Implementation function that acquires ownership in case a failure occurs + // while adding a single argument pointer. + template + void AddDataImpl(const std::vector& tags, + std::unique_ptr pointer, + std::unique_ptr... pointers); + // Terminates recursive template expansion for AddDataImpl. Will never be + // called. + void AddDataImpl(const std::vector& tags) { + CHECK(tags.empty()); + } + + private: + int overlap_ = 0; + int first_frame_index_ = 0; + absl::node_hash_map> data_; + + // Stores tag, TypeId of corresponding type.
+ absl::node_hash_map data_config_; +}; + +//// Implementation details. +template +TaggedType TaggedPointerType(const std::string& tag) { + return std::make_pair(tag, TypeId>()); +} + +template +StreamingBuffer::PointerType StreamingBuffer::CreatePointer(T* t) { + return PointerType(new std::unique_ptr(t)); +} + +template +void StreamingBuffer::AddDatum(const std::string& tag, + std::unique_ptr pointer) { + CHECK(HasTag(tag)); + CHECK_EQ(data_config_[tag], TypeId>()); + auto& buffer = data_[tag]; + absl::any packet(PointerType(CreatePointer(pointer.release()))); + buffer.push_back(packet); +} + +template +void StreamingBuffer::EmplaceDatum(const std::string& tag, T* pointer) { + std::unique_ptr forwarded(pointer); + AddDatum(tag, std::move(forwarded)); +} + +template +void StreamingBuffer::AddDatumCopy(const std::string& tag, const T& datum) { + AddDatum(tag, std::unique_ptr(new T(datum))); +} + +template +void StreamingBuffer::AddData(const std::vector& tags, + std::unique_ptr... pointers) { + CHECK_EQ(tags.size(), sizeof...(pointers)) + << "Number of tags and data pointers is inconsistent"; + return AddDataImpl(tags, std::move(pointers)...); +} + +template +void StreamingBuffer::AddDatumVector(const std::string& tag, + const std::vector& datum_vec) { + for (const auto& datum : datum_vec) { + AddDatumCopy(tag, datum); + } +} + +template +void StreamingBuffer::AddDataImpl(const std::vector& tags, + std::unique_ptr pointer, + std::unique_ptr... 
pointers) { + AddDatum(tags[0], std::move(pointer)); + if (sizeof...(pointers) > 0) { + return AddDataImpl(std::vector(tags.begin() + 1, tags.end()), + std::move(pointers)...); + } +} + +template +std::unique_ptr MakeUnique(T* t) { + return std::unique_ptr(t); +} + +template +const T* StreamingBuffer::GetDatum(const std::string& tag, + int frame_index) const { + return GetMutableDatum(tag, frame_index); +} + +template +T& StreamingBuffer::GetDatumRef(const std::string& tag, int frame_index) const { + return *GetMutableDatum(tag, frame_index); +} + +template +T* StreamingBuffer::GetMutableDatum(const std::string& tag, + int frame_index) const { + CHECK_GE(frame_index, 0); + CHECK(HasTag(tag)); + auto& buffer = data_.find(tag)->second; + if (frame_index > buffer.size()) { + return nullptr; + } else { + const absl::any& packet = buffer[frame_index]; + if (absl::any_cast>(&packet) == nullptr) { + LOG(ERROR) << "Stored item is not of requested type. " + << "Check data configuration."; + return nullptr; + } + + // Unpack and return. 
+ const PointerType& pointer = + *absl::any_cast>(&packet); + return pointer->get(); + } +} + +template +std::vector StreamingBuffer::GetDatumVector( + const std::string& tag) const { + std::vector result(GetMutableDatumVector(tag)); + return std::vector(result.begin(), result.end()); +} + +template +std::vector> StreamingBuffer::GetReferenceVector( + const std::string& tag) const { + std::vector ptrs(GetMutableDatumVector(tag)); + std::vector> refs; + refs.reserve(ptrs.size()); + for (T* ptr : ptrs) { + refs.push_back(std::ref(*ptr)); + } + return refs; +} + +template +std::vector> +StreamingBuffer::GetConstReferenceVector(const std::string& tag) const { + std::vector ptrs(GetDatumVector(tag)); + std::vector> refs; + refs.reserve(ptrs.size()); + for (const T* ptr : ptrs) { + refs.push_back(std::cref(*ptr)); + } + return refs; +} + +template +bool StreamingBuffer::IsInitialized(const std::string& tag) const { + CHECK(HasTag(tag)); + const auto& buffer = data_.find(tag)->second; + int idx = 0; + for (const auto& item : buffer) { + const PointerType* pointer = absl::any_cast>(&item); + CHECK(pointer != nullptr); + if (*pointer == nullptr) { + LOG(ERROR) << "Data for " << tag << " at frame " << idx + << " is not initialized."; + return false; + } + } + return true; +} + +template +std::vector StreamingBuffer::GetMutableDatumVector( + const std::string& tag) const { + CHECK(HasTag(tag)); + auto& buffer = data_.find(tag)->second; + std::vector result; + for (const auto& packet : buffer) { + if (absl::any_cast>(&packet) == nullptr) { + LOG(ERROR) << "Stored item is not of requested type. " + << "Check data configuration."; + result.push_back(nullptr); + } else { + result.push_back( + absl::any_cast>(&packet)->get()->get()); + } + } + return result; +} + +template +void StreamingBuffer::OutputDatum(bool flush, const std::string& tag, + const Functor& functor) { + CHECK(HasTag(tag)); + const int end_frame = MaxBufferSize() - (flush ? 
0 : overlap_); + for (int k = 0; k < end_frame; ++k) { + functor(k, ReleaseDatum(tag, k)); + } +} + +template +std::unique_ptr StreamingBuffer::ReleaseDatum(const std::string& tag, + int frame_index) { + CHECK(HasTag(tag)); + CHECK_GE(frame_index, 0); + + auto& buffer = data_.find(tag)->second; + if (frame_index >= buffer.size()) { + return nullptr; + } else { + const absl::any& packet = buffer[frame_index]; + if (absl::any_cast>(&packet) == nullptr) { + LOG(ERROR) << "Stored item is not of requested type. " + << "Check data configuration."; + return nullptr; + } + + // Unpack and return. + const PointerType& pointer = + *absl::any_cast>(&packet); + return std::move(*pointer); + } +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_STREAMING_BUFFER_H_ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0000 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0000 new file mode 100644 index 000000000..3da0a3784 Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0000 differ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0001 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0001 new file mode 100644 index 000000000..dddde7033 Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0001 differ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0002 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0002 new file mode 100644 index 000000000..30fe6cfd5 Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0002 differ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0003 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0003 new file mode 100644 index 000000000..5e481e56d Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0003 differ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0004 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0004 new file mode 100644 index 
000000000..b6d55e3d8 Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0004 differ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0005 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0005 new file mode 100644 index 000000000..3a4609500 Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0005 differ diff --git a/mediapipe/util/tracking/testdata/box_tracker/chunk_0006 b/mediapipe/util/tracking/testdata/box_tracker/chunk_0006 new file mode 100644 index 000000000..3992696d5 Binary files /dev/null and b/mediapipe/util/tracking/testdata/box_tracker/chunk_0006 differ diff --git a/mediapipe/util/tracking/testdata/stabilize_test.png b/mediapipe/util/tracking/testdata/stabilize_test.png new file mode 100644 index 000000000..239a3ff9e Binary files /dev/null and b/mediapipe/util/tracking/testdata/stabilize_test.png differ diff --git a/mediapipe/util/tracking/tone_estimation.cc b/mediapipe/util/tracking/tone_estimation.cc new file mode 100644 index 000000000..fd6e8edfe --- /dev/null +++ b/mediapipe/util/tracking/tone_estimation.cc @@ -0,0 +1,377 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/util/tracking/tone_estimation.h" + +#include + +#include +#include +#include +#include + +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/tone_models.pb.h" + +namespace mediapipe { + +ToneEstimation::ToneEstimation(const ToneEstimationOptions& options, + int frame_width, int frame_height) + : options_(options), + frame_width_(frame_width), + frame_height_(frame_height), + original_width_(frame_width), + original_height_(frame_height) { + switch (options_.downsample_mode()) { + case ToneEstimationOptions::DOWNSAMPLE_NONE: + break; + case ToneEstimationOptions::DOWNSAMPLE_TO_MAX_SIZE: { + const float max_size = std::max(frame_width_, frame_height_); + if (max_size > 1.03f * options_.downsampling_size()) { + downsample_scale_ = max_size / options_.downsampling_size(); + frame_height_ /= downsample_scale_; + frame_width_ /= downsample_scale_; + use_downsampling_ = true; + } + break; + } + case ToneEstimationOptions::DOWNSAMPLE_TO_MIN_SIZE: { + const float min_size = std::min(frame_width_, frame_height_); + if (min_size > 1.03f * options_.downsampling_size()) { + downsample_scale_ = min_size / options_.downsampling_size(); + frame_height_ /= downsample_scale_; + frame_width_ /= downsample_scale_; + use_downsampling_ = true; + } + + break; + } + case ToneEstimationOptions::DOWNSAMPLE_BY_FACTOR: { + CHECK_GE(options_.downsample_factor(), 1); + frame_width_ /= options_.downsample_factor(); + frame_height_ /= options_.downsample_factor(); + downsample_scale_ = options_.downsample_factor(); + use_downsampling_ = true; + break; + } + } + + if (use_downsampling_) { + resized_input_.reset(new cv::Mat(frame_height_, frame_width_, CV_8UC3)); + prev_resized_input_.reset( + new cv::Mat(frame_height_, frame_width_, CV_8UC3)); + } +} + +ToneEstimation::~ToneEstimation() {} + +void ToneEstimation::EstimateToneChange( + const RegionFlowFeatureList& feature_list_input, + const cv::Mat& curr_frame_input, const cv::Mat* 
prev_frame_input, + ToneChange* tone_change, cv::Mat* debug_output) { + CHECK_EQ(original_height_, curr_frame_input.rows); + CHECK_EQ(original_width_, curr_frame_input.cols); + CHECK(tone_change != nullptr); + + const cv::Mat& curr_frame = + use_downsampling_ ? *resized_input_ : curr_frame_input; + const cv::Mat* prev_frame = (use_downsampling_ && prev_frame_input) + ? prev_resized_input_.get() + : prev_frame_input; + + RegionFlowFeatureList scaled_feature_list; + const RegionFlowFeatureList& feature_list = + use_downsampling_ ? scaled_feature_list : feature_list_input; + + if (use_downsampling_) { + cv::resize(curr_frame_input, *resized_input_, resized_input_->size()); + if (prev_frame_input) { + cv::resize(*prev_frame_input, *prev_resized_input_, + prev_resized_input_->size()); + } + LinearSimilarityModel scale_transform; + scale_transform.set_a(1.0f / downsample_scale_); + scaled_feature_list = feature_list_input; + TransformRegionFlowFeatureList(scale_transform, &scaled_feature_list); + } + + CHECK_EQ(frame_height_, curr_frame.rows); + CHECK_EQ(frame_width_, curr_frame.cols); + + ClipMask<3> curr_clip; + ComputeClipMask<3>(options_.clip_mask_options(), curr_frame, &curr_clip); + + // Compute tone statistics. + tone_change->set_frac_clipped(cv::sum(curr_clip.mask)[0] / + (frame_height_ * frame_width_)); + + IntensityPercentiles(curr_frame, curr_clip.mask, + options_.tone_match_options().log_domain(), tone_change); + + ColorToneMatches color_tone_matches; + // TODO: Buffer clip mask. 
+ if (prev_frame) { + ClipMask<3> prev_clip; + ComputeClipMask<3>(options_.clip_mask_options(), *prev_frame, &prev_clip); + ComputeToneMatches<3>(options_.tone_match_options(), feature_list, + curr_frame, *prev_frame, curr_clip, prev_clip, + &color_tone_matches, debug_output); + + EstimateGainBiasModel(options_.irls_iterations(), &color_tone_matches, + tone_change->mutable_gain_bias()); + + if (!IsStableGainBiasModel(options_.stable_gain_bias_bounds(), + tone_change->gain_bias(), color_tone_matches, + tone_change->mutable_stability_stats())) { + VLOG(1) << "Warning: Estimated gain-bias is unstable."; + // Reset to identity. + tone_change->mutable_gain_bias()->CopyFrom(GainBiasModel()); + tone_change->set_type(ToneChange::INVALID); + } + + // TODO: EstimateMixtureGainBiasModel(); + } +} + +void ToneEstimation::IntensityPercentiles(const cv::Mat& frame, + const cv::Mat& clip_mask, + bool log_domain, + ToneChange* tone_change) const { + cv::Mat intensity(frame.rows, frame.cols, CV_8UC1); + cv::cvtColor(frame, intensity, CV_RGB2GRAY); + + std::vector histogram(256, 0.0f); + + for (int i = 0; i < intensity.rows; ++i) { + const uint8* intensity_ptr = intensity.ptr(i); + const uint8* clip_ptr = clip_mask.ptr(i); + + for (int j = 0; j < intensity.cols; ++j) { + if (!clip_ptr[j]) { + ++histogram[intensity_ptr[j]]; + } + } + } + + // Construct cumulative histogram. + std::partial_sum(histogram.begin(), histogram.end(), histogram.begin()); + + // Normalize histogram. + const float histogram_sum = histogram.back(); + if (histogram_sum == 0) { + // Frame is of solid color. Use default values. 
+ return; + } + + const float denom = 1.0f / histogram_sum; + for (auto& entry : histogram) { + entry *= denom; + } + + std::vector percentiles; + percentiles.push_back(options_.stats_low_percentile()); + percentiles.push_back(options_.stats_low_mid_percentile()); + percentiles.push_back(options_.stats_mid_percentile()); + percentiles.push_back(options_.stats_high_mid_percentile()); + percentiles.push_back(options_.stats_high_percentile()); + + std::vector percentile_values(percentiles.size()); + + const float log_denom = 1.0f / LogDomainLUT().MaxLogDomainValue(); + for (int k = 0; k < percentile_values.size(); ++k) { + const int percentile_bin = + std::lower_bound(histogram.begin(), histogram.end(), percentiles[k]) - + histogram.begin(); + percentile_values[k] = percentile_bin; + if (log_domain) { + percentile_values[k] = + LogDomainLUT().Map(percentile_values[k]) * log_denom; + } else { + percentile_values[k] *= (1.0f / 255.0f); + } + } + + tone_change->set_low_percentile(percentile_values[0]); + tone_change->set_low_mid_percentile(percentile_values[1]); + tone_change->set_mid_percentile(percentile_values[2]); + tone_change->set_high_mid_percentile(percentile_values[3]); + tone_change->set_high_percentile(percentile_values[4]); +} + +void ToneEstimation::EstimateGainBiasModel(int irls_iterations, + ColorToneMatches* color_tone_matches, + GainBiasModel* gain_bias_model) { + CHECK(color_tone_matches != nullptr); + CHECK(gain_bias_model != nullptr); + + // Effectively estimate each model independently. + float solution_ptr[6] = {1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f}; + + const int num_channels = color_tone_matches->size(); + CHECK_GT(num_channels, 0); + CHECK_LE(num_channels, 3); + + // TODO: One IRLS weight per color match. + for (int c = 0; c < num_channels; ++c) { + std::deque& patch_tone_matches = (*color_tone_matches)[c]; + // Reset irls weight. 
+ int num_matches = 0; + for (auto& patch_tone_match : patch_tone_matches) { + patch_tone_match.set_irls_weight(1.0); + num_matches += patch_tone_match.tone_match_size(); + } + + // Do not attempt solution if no matches have been found. + if (num_matches < 3) { + continue; + } + + cv::Mat model_mat(num_matches, 2, CV_32F); + cv::Mat rhs(num_matches, 1, CV_32F); + cv::Mat solution(2, 1, CV_32F); + + for (int iteration = 0; iteration < irls_iterations; ++iteration) { + // Setup matrix. + int row = 0; + for (const auto& patch_tone_match : patch_tone_matches) { + const float irls_weight = patch_tone_match.irls_weight(); + for (const auto& tone_match : patch_tone_match.tone_match()) { + float* row_ptr = model_mat.ptr(row); + float* rhs_ptr = rhs.ptr(row); + row_ptr[0] = tone_match.curr_val() * irls_weight; + row_ptr[1] = irls_weight; + rhs_ptr[0] = tone_match.prev_val() * irls_weight; + ++row; + } + } + + // Solve. + if (!cv::solve(model_mat, rhs, solution, cv::DECOMP_QR)) { + // Fallback to identity. + solution_ptr[2 * c] = 1; + solution_ptr[2 * c + 1] = 0; + break; // Break to next color channel. + } + + float a = solution.at(0, 0); + float b = solution.at(1, 0); + + // Copy to solution. + solution_ptr[2 * c] = a; + solution_ptr[2 * c + 1] = b; + + // Evaluate error. + for (auto& patch_tone_match : patch_tone_matches) { + const int num_tone_matches = patch_tone_match.tone_match_size(); + + if (num_tone_matches == 0) { + continue; + } + + float summed_error = 0.0f; + for (const auto& tone_match : patch_tone_match.tone_match()) { + // Express tone registration error in 0 .. 100. + const float error = + 100.0f * (tone_match.curr_val() * a + b - tone_match.prev_val()); + summed_error += error * error; + } + + // Compute RMSE. + const float patch_error = + std::sqrt(static_cast(summed_error / num_tone_matches)); + // TODO: L1 instead of L0?
+ patch_tone_match.set_irls_weight(1.0f / (patch_error + 1e-6f)); + } + } + } + + gain_bias_model->CopyFrom( + GainBiasModelAdapter::FromPointer(solution_ptr, false)); + + // Test invertability, reset if failed. + const float det = gain_bias_model->gain_c1() * gain_bias_model->gain_c2() * + gain_bias_model->gain_c3(); + if (fabs(det) < 1e-6f) { + LOG(WARNING) << "Estimated gain bias model is not invertible. " + << "Falling back to identity model."; + gain_bias_model->CopyFrom(GainBiasModel()); + } +} + +bool ToneEstimation::IsStableGainBiasModel( + const ToneEstimationOptions::GainBiasBounds& bounds, + const GainBiasModel& model, const ColorToneMatches& color_tone_matches, + ToneChange::StabilityStats* stats) { + if (stats != nullptr) { + stats->Clear(); + } + + // Test each channel for stability. + if (model.gain_c1() < bounds.lower_gain() || + model.gain_c1() > bounds.upper_gain() || + model.bias_c1() < bounds.lower_bias() || + model.bias_c1() > bounds.upper_bias()) { + return false; + } + + if (model.gain_c2() < bounds.lower_gain() || + model.gain_c2() > bounds.upper_gain() || + model.bias_c2() < bounds.lower_bias() || + model.bias_c2() > bounds.upper_bias()) { + return false; + } + + if (model.gain_c3() < bounds.lower_gain() || + model.gain_c3() > bounds.upper_gain() || + model.bias_c3() < bounds.lower_bias() || + model.bias_c3() > bounds.upper_bias()) { + return false; + } + + // Test each channel independently. + int total_inliers = 0; + int total_tone_matches = 0; + double total_inlier_weight = 0.0; + for (const auto& patch_tone_matches : color_tone_matches) { + int num_inliers = 0; + for (const auto& patch_tone_match : patch_tone_matches) { + if (patch_tone_match.irls_weight() > bounds.min_inlier_weight()) { + ++num_inliers; + // Clamp the weight to a registration error of 1 intensity value + // difference (out of 255). Since weight are inversely proportional to + // registration errors in the range 0..100, this corresponds to a max + // weight of 2.55. 
+ total_inlier_weight += std::min(2.55f, patch_tone_match.irls_weight()); + } + } + + if (num_inliers < + bounds.min_inlier_fraction() * patch_tone_matches.size()) { + return false; + } + + total_inliers += num_inliers; + total_tone_matches += patch_tone_matches.size(); + } + + if (stats != nullptr && total_tone_matches > 0) { + stats->set_num_inliers(total_inliers); + stats->set_inlier_fraction(total_inliers * 1.0f / total_tone_matches); + stats->set_inlier_weight(total_inlier_weight); + } + + return true; +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/tone_estimation.h b/mediapipe/util/tracking/tone_estimation.h new file mode 100644 index 000000000..0fa049e2b --- /dev/null +++ b/mediapipe/util/tracking/tone_estimation.h @@ -0,0 +1,409 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Fits tone models to intensity matches +// (gathered from order statistics of matching patches supplied by +// RegionFlowFeatureList's). 
+ +#ifndef MEDIAPIPE_UTIL_TRACKING_TONE_ESTIMATION_H_ +#define MEDIAPIPE_UTIL_TRACKING_TONE_ESTIMATION_H_ + +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/region_flow.h" +#include "mediapipe/util/tracking/region_flow.pb.h" +#include "mediapipe/util/tracking/tone_estimation.pb.h" +#include "mediapipe/util/tracking/tone_models.h" + +namespace mediapipe { + +class GainBiasModel; +class RegionFlowFeatureList; + +typedef std::deque PatchToneMatches; + +// Each vector element presents its own channel. +typedef std::vector ColorToneMatches; + +// Clip mask for C channels. +template +struct ClipMask { + ClipMask() { + min_exposure_threshold.resize(C); + max_exposure_threshold.resize(C); + } + + cv::Mat mask; + std::vector min_exposure_threshold; + std::vector max_exposure_threshold; +}; + +class ToneEstimation { + public: + ToneEstimation(const ToneEstimationOptions& options, int frame_width, + int frame_height); + virtual ~ToneEstimation(); + ToneEstimation(const ToneEstimation&) = delete; + ToneEstimation& operator=(const ToneEstimation&) = delete; + + // Estimates ToneChange models from matching feature points. + // Input RegionFlowFeatureList supplies (x, y) matches, where x is a feature + // point in curr_frame, and y the matching feature point in prev_frame. The + // estimated ToneChange describes the change from curr_frame to + // prev_frame in the tone domain using a variety of linear models as specified + // by ToneEstimationOptions. + // If debug_output is not nullptr, contains visualization of inlier patches + // and clip masks for debugging: current and previous frames are rendered + // side-by-side (left:current, right:previous). 
+ // TODO: Parallel estimation across frames. + void EstimateToneChange(const RegionFlowFeatureList& feature_list, + const cv::Mat& curr_frame, const cv::Mat* prev_frame, + ToneChange* tone_change, + cv::Mat* debug_output = nullptr); + + // Returns mask of clipped pixels for input frame + // (pixels that are clipped due to limited dynamic range are set to 1), + // as well as per channel value denoting the min and max exposure threshold. + template + static void ComputeClipMask(const ClipMaskOptions& options, + const cv::Mat& frame, ClipMask* clip_mask); + + // Returns color tone matches of size C from passed feature list. Tone matches + // are obtained from order statistics of sampled patches at feature locations. + // Matches are obtained for each color channel, with over and under-exposed + // patches being discarded. + // If debug_output is not NULL, contains visualization of inlier patches and + // clip masks for debugging. + template + static void ComputeToneMatches(const ToneMatchOptions& tone_match_options, + const RegionFlowFeatureList& feature_list, + const cv::Mat& curr_frame, + const cv::Mat& prev_frame, + const ClipMask& curr_clip_mask, + const ClipMask& prev_clip_mask, + ColorToneMatches* color_tone_matches, + cv::Mat* debug_output = nullptr); + + // Can be called with color tone matches for 1 - 3 channels, will simply set + // missing channels in GainBiasModel to identity. + // Returns the estimated gain and bias model for tone change, + // that maps colors in current frame to those in the prev frame + // using color_tone_matches as corresponding samples. Modifies + // the irls_weights of samples in color_tone_matches based on their + // fit to the estimated gain_bias_model. 
+ static void EstimateGainBiasModel(int irls_iterations, + ColorToneMatches* color_tone_matches, + GainBiasModel* gain_bias_model); + + // Tests if the estimated gain-bias model is stable based on: + // (1) whether the model parameters are within gain and bias bounds, and + // (2) there are enough inliers in the tone matches. + // Some statistics related to stability analysis are returned in stats (if + // not NULL). + static bool IsStableGainBiasModel( + const ToneEstimationOptions::GainBiasBounds& gain_bias_bounds, + const GainBiasModel& model, const ColorToneMatches& tone_matches, + ToneChange::StabilityStats* stats = nullptr); + + private: + // Computes normalized intensity percentiles. + void IntensityPercentiles(const cv::Mat& frame, const cv::Mat& clip_mask, + bool log_domain, ToneChange* tone_change) const; + + private: + ToneEstimationOptions options_; + int frame_width_; + int frame_height_; + + int original_width_; + int original_height_; + + float downsample_scale_ = 1.0f; + bool use_downsampling_ = false; + + std::unique_ptr resized_input_; + std::unique_ptr prev_resized_input_; +}; + +// Template implementation functions. +template +void ToneEstimation::ComputeClipMask(const ClipMaskOptions& options, + const cv::Mat& frame, + ClipMask* clip_mask) { + CHECK(clip_mask != nullptr); + CHECK_EQ(frame.channels(), C); + + // Over / Underexposure handling. + // Masks pixels affected by clipping. 
+ clip_mask->mask.create(frame.rows, frame.cols, CV_8U); + + const float min_exposure_thresh = options.min_exposure() * 255.0f; + const float max_exposure_thresh = options.max_exposure() * 255.0f; + const int max_clipped_channels = options.max_clipped_channels(); + + std::vector planes; + cv::split(frame, planes); + CHECK_EQ(C, planes.size()); + float min_exposure[C]; + float max_exposure[C]; + for (int c = 0; c < C; ++c) { + min_exposure[c] = min_exposure_thresh; + max_exposure[c] = max_exposure_thresh; + } + + for (int c = 0; c < C; ++c) { + clip_mask->min_exposure_threshold[c] = min_exposure[c]; + clip_mask->max_exposure_threshold[c] = max_exposure[c]; + } + + for (int i = 0; i < frame.rows; ++i) { + const uint8* img_ptr = frame.ptr(i); + uint8* clip_ptr = clip_mask->mask.template ptr(i); + + for (int j = 0; j < frame.cols; ++j) { + const int idx = C * j; + int clipped_channels = 0; // Count clipped channels. + + for (int p = 0; p < C; ++p) { + clipped_channels += + static_cast(img_ptr[idx + p] < min_exposure[p] || + img_ptr[idx + p] > max_exposure[p]); + } + + if (clipped_channels > max_clipped_channels) { + clip_ptr[j] = 1; + } else { + clip_ptr[j] = 0; + } + } + } + + // Dilate to address blooming. + const int dilate_diam = options.clip_mask_diameter(); + const int dilate_rad = ceil(dilate_diam * 0.5); + // Remove border from dilate, as cv::dilate reads out of + // bound values otherwise. 
+ if (clip_mask->mask.rows > 2 * dilate_rad && + clip_mask->mask.cols > 2 * dilate_rad) { + cv::Mat dilate_domain = + cv::Mat(clip_mask->mask, + cv::Range(dilate_rad, clip_mask->mask.rows - dilate_rad), + cv::Range(dilate_rad, clip_mask->mask.cols - dilate_rad)); + cv::Mat kernel(options.clip_mask_diameter(), options.clip_mask_diameter(), + CV_8U); + kernel.setTo(1.0); + cv::dilate(dilate_domain, dilate_domain, kernel); + } +} + +template +void ToneEstimation::ComputeToneMatches( + const ToneMatchOptions& options, const RegionFlowFeatureList& feature_list, + const cv::Mat& curr_frame, const cv::Mat& prev_frame, + const ClipMask& curr_clip_mask, // Optional. + const ClipMask& prev_clip_mask, // Optional. + ColorToneMatches* color_tone_matches, cv::Mat* debug_output) { + CHECK(color_tone_matches != nullptr); + CHECK_EQ(curr_frame.channels(), C); + CHECK_EQ(prev_frame.channels(), C); + + color_tone_matches->clear(); + color_tone_matches->resize(C); + + const int patch_diam = 2 * options.patch_radius() + 1; + const int patch_area = patch_diam * patch_diam; + const float patch_denom = 1.0f / (patch_area); + const float log_denom = 1.0f / LogDomainLUT().MaxLogDomainValue(); + + int num_matches = 0; + std::vector curr_intensities[C]; + std::vector prev_intensities[C]; + for (int c = 0; c < C; ++c) { + curr_intensities[c].resize(256, 0); + prev_intensities[c].resize(256, 0); + } + + // Debugging output. Horizontally concatenate current and previous frames. 
+ cv::Mat curr_debug; + cv::Mat prev_debug; + if (debug_output != nullptr) { + const int rows = std::max(curr_frame.rows, prev_frame.rows); + const int cols = curr_frame.cols + prev_frame.cols; + const cv::Rect curr_rect(cv::Point(0, 0), curr_frame.size()); + const cv::Rect prev_rect(cv::Point(curr_frame.cols, 0), prev_frame.size()); + debug_output->create(rows, cols, CV_8UC3); + debug_output->setTo(cv::Scalar(255, 0, 0)); + curr_debug = (*debug_output)(curr_rect); + prev_debug = (*debug_output)(prev_rect); + curr_frame.copyTo(curr_debug, curr_clip_mask.mask ^ cv::Scalar(1)); + prev_frame.copyTo(prev_debug, prev_clip_mask.mask ^ cv::Scalar(1)); + } + + const int patch_radius = options.patch_radius(); + const int frame_width = curr_frame.cols; + const int frame_height = curr_frame.rows; + for (const auto& feature : feature_list.feature()) { + // Extract matching masks at this feature. + Vector2_i curr_loc = FeatureIntLocation(feature); + Vector2_i prev_loc = FeatureMatchIntLocation(feature); + + // Start bound inclusive, end bound exclusive. + Vector2_i curr_start = Vector2_i(std::max(0, curr_loc.x() - patch_radius), + std::max(0, curr_loc.y() - patch_radius)); + + Vector2_i curr_end = + Vector2_i(std::min(frame_width, curr_loc.x() + patch_radius + 1), + std::min(frame_height, curr_loc.y() + patch_radius + 1)); + + Vector2_i prev_start = Vector2_i(std::max(0, prev_loc.x() - patch_radius), + std::max(0, prev_loc.y() - patch_radius)); + + Vector2_i prev_end = + Vector2_i(std::min(frame_width, prev_loc.x() + patch_radius + 1), + std::min(frame_height, prev_loc.y() + patch_radius + 1)); + + Vector2_i curr_size = curr_end - curr_start; + Vector2_i prev_size = prev_end - prev_start; + + if (prev_size != curr_size || + (curr_size.x() * curr_size.y()) != patch_area) { + continue; // Ignore border patches. + } + + // TODO: Instead of summing masks, perform box filter over mask + // and just grab element at feature location. 
+ cv::Mat curr_patch_mask(curr_clip_mask.mask, + cv::Range(curr_start.y(), curr_end.y()), + cv::Range(curr_start.x(), curr_end.x())); + cv::Mat prev_patch_mask(prev_clip_mask.mask, + cv::Range(prev_start.y(), prev_end.y()), + cv::Range(prev_start.x(), prev_end.x())); + + // Skip patch if too many clipped pixels. + if (cv::sum(curr_patch_mask)[0] * patch_denom > + options.max_frac_clipped() || + cv::sum(prev_patch_mask)[0] * patch_denom > + options.max_frac_clipped()) { + continue; + } + + // Extract matching patches at this feature. + cv::Mat curr_patch(curr_frame, cv::Range(curr_start.y(), curr_end.y()), + cv::Range(curr_start.x(), curr_end.x())); + cv::Mat prev_patch(prev_frame, cv::Range(prev_start.y(), prev_end.y()), + cv::Range(prev_start.x(), prev_end.x())); + + // Reset histograms to zero. + for (int c = 0; c < C; ++c) { + std::fill(curr_intensities[c].begin(), curr_intensities[c].end(), 0); + std::fill(prev_intensities[c].begin(), prev_intensities[c].end(), 0); + } + + // Build histogram (to sidestep sorting). + // Note: We explicitly add over and under-exposed pixels to the + // histogram, as we are going to sample the histograms at specific + // percentiles modeling the shift in intensity between frames. + // (If the average intensity increases by N, histogram shifts by N + // bins to the right). However, matches that are over or underexposed + // are discarded afterwards. + for (int i = 0; i < patch_diam; ++i) { + const uint8* prev_ptr = prev_patch.ptr(i); + const uint8* curr_ptr = curr_patch.ptr(i); + for (int j = 0; j < patch_diam; ++j) { + const int j_c = C * j; + for (int c = 0; c < C; ++c) { + ++curr_intensities[c][curr_ptr[j_c + c]]; + ++prev_intensities[c][prev_ptr[j_c + c]]; + } + } + } + + // Sample at percentiles. + for (int c = 0; c < C; ++c) { + // Accumulate. 
+ for (int k = 1; k < 256; ++k) { + curr_intensities[c][k] += curr_intensities[c][k - 1]; + prev_intensities[c][k] += prev_intensities[c][k - 1]; + } + + float percentile = options.min_match_percentile(); + float percentile_step = + (options.max_match_percentile() - options.min_match_percentile()) / + options.match_percentile_steps(); + + PatchToneMatch patch_tone_match; + for (int k = 0; k < options.match_percentile_steps(); + ++k, percentile += percentile_step) { + const auto& curr_iter = std::lower_bound(curr_intensities[c].begin(), + curr_intensities[c].end(), + percentile * patch_area); + + const auto& prev_iter = std::lower_bound(prev_intensities[c].begin(), + prev_intensities[c].end(), + percentile * patch_area); + + const int curr_int = curr_iter - curr_intensities[c].begin(); + const int prev_int = prev_iter - prev_intensities[c].begin(); + + // If either clipped, discard match. + if (curr_int < curr_clip_mask.min_exposure_threshold[c] || + curr_int > curr_clip_mask.max_exposure_threshold[c] || + prev_int < prev_clip_mask.min_exposure_threshold[c] || + prev_int > prev_clip_mask.max_exposure_threshold[c]) { + continue; + } + + ToneMatch* tone_match = patch_tone_match.add_tone_match(); + if (options.log_domain()) { + tone_match->set_curr_val(LogDomainLUT().Map(curr_int) * log_denom); + tone_match->set_prev_val(LogDomainLUT().Map(prev_int) * log_denom); + } else { + tone_match->set_curr_val(curr_int * (1.0f / 255.0f)); + tone_match->set_prev_val(prev_int * (1.0f / 255.0f)); + } + } + + (*color_tone_matches)[c].push_back(patch_tone_match); + + // Debug output. 
+ if (debug_output != nullptr) { + cv::rectangle(curr_debug, cv::Point(curr_start.x(), curr_start.y()), + cv::Point(curr_end.x(), curr_end.y()), + cv::Scalar(0, 0, 255)); + cv::rectangle(prev_debug, cv::Point(prev_start.x(), prev_start.y()), + cv::Point(prev_end.x(), prev_end.y()), + cv::Scalar(0, 0, 255)); + } + } + + ++num_matches; + } + + VLOG(1) << "Extracted fraction: " + << static_cast(num_matches) / + std::max(1, feature_list.feature_size()); +} + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_TONE_ESTIMATION_H_ diff --git a/mediapipe/util/tracking/tone_estimation.proto b/mediapipe/util/tracking/tone_estimation.proto new file mode 100644 index 000000000..711765a08 --- /dev/null +++ b/mediapipe/util/tracking/tone_estimation.proto @@ -0,0 +1,196 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/tone_models.proto"; + +// Capture tone change between two frames and per-frame tone statistics. +// The estimated tone change describes the transformation of color intensities +// from the current to the previous frame. +// Next tag: 16 +message ToneChange { + optional GainBiasModel gain_bias = 1; + optional AffineToneModel affine = 2; + + optional MixtureGainBiasModel mixture_gain_bias = 3; + optional MixtureAffineToneModel mixture_affine = 4; + + // TODO: Implement. 
+ optional float mixture_domain_sigma = 5; + + // Tone statistics, describing statistics of intensities of the current frame + // (independent from the previous frame). + + // Fraction of clipped pixels in [0, 1]. + // A pixel is considered clipped if more than + // ToneEstimationOptions::max_clipped_channels are over- + // or under exposed. + optional float frac_clipped = 6 [default = 0.0]; + + // Intensity percentiles of the current frame. Normalized to [0, 1]. + // We sort the intensities in the image (RGB to intensity conversion) and + // select the intensities that fall at the ranks specified by the + // stats_[low|mid|high]_percentile options below as the + /* + // Minimum fractional number of inlier gain features for gain correction + // deemed to be successful. + optional float min_frac_gain_inliers = 39 [default = 0.8]; + + // On average accept up to 10% intensity registration error. + optional float min_gain_inlier_weight = 40 [default = 0.1]; + */ + // [low|mid|high]_percentile's. + optional float low_percentile = 8; + optional float low_mid_percentile = 9; + optional float mid_percentile = 10; + optional float high_mid_percentile = 11; + optional float high_percentile = 12; + + // If set, all models are estimated in log domain, specifically + // intensity I is transformed via log(1.0 + I) := I' + // Consequently after apply the models, intensity needs to be transformed + // back to visible range via exp(I') - 1.0. + optional bool log_domain = 13 [default = false]; + + // ToneChange type indicates whether highest degree of freedom (DOF) + // model estimation was deemed stable, in which case ToneChange::Type is set + // to VALID. + // If a model was deemed not stable (according to *StabilityBounds in + // ToneEstimationOptions), it is set to the lower dof type which was deemed + // stable. + enum Type { + VALID = 0; + INVALID = 10; // Identity model, gain bias unrealiable. 
+ } + + optional Type type = 14 [default = VALID]; + + // Stats based on stability analysis. + message StabilityStats { + // Number of tone matches that were iniliers (used for tone estimation). + optional int32 num_inliers = 1; + // Fraction of tone matches that were inliers. + optional float inlier_fraction = 2; + // Total IRLS weight summed over all inliers. + optional double inlier_weight = 3; + } + optional StabilityStats stability_stats = 15; +} + +message ToneMatchOptions { + // ToneChange's are fit to ToneMatches extracted from matching patches, using + // order statistics of their corresponding intensities. Matches are defined by + // having the same percentile of ordered intensities. If any member of the + // ToneMatch is below under or above over-exposed the match is discarded + // (based on parameters min and max_exposure above). + // Matches are extracted from min_match_percentile to max_match_percentile in + // #match_percentile_steps equidistant steps. + optional float min_match_percentile = 1 [default = 0.01]; + optional float max_match_percentile = 2 [default = 0.99]; + optional int32 match_percentile_steps = 3 [default = 10]; + + // Patch radius from which order statistics are collected. + optional int32 patch_radius = 4 [default = 18]; + + // Only matches with not too many pixels over- or underexposed are used. + optional float max_frac_clipped = 5 [default = 0.4]; + + // If set matches will be collected in the log domain. + optional bool log_domain = 8 [default = false]; +} + +message ClipMaskOptions { + // Over/Under exposure setting. Pixels that are clipped due to limited + // dynamic range are masked out from analysis. Values specified w.r.t. + // [0, 1] range. + optional float min_exposure = 1 [default = 0.02]; + optional float max_exposure = 2 [default = 0.98]; + + // A pixel can have clipped color values in atmost max_clipped_channels before + // it will be labeled as clipped. 
+ optional int32 max_clipped_channels = 4 [default = 1]; + + // Over-exposure tends to show blooming (neighboring pixels are affected by + // over-exposure as well). For robustness mask of clipped pixels is dilated + // with structuring element of diameter clip_mask_diam. + optional int32 clip_mask_diameter = 5 [default = 5]; +} + +// Next tag: 13 +message ToneEstimationOptions { + optional ToneMatchOptions tone_match_options = 1; + optional ClipMaskOptions clip_mask_options = 2; + + // Percentiles for tone statistics. + optional float stats_low_percentile = 3 [default = 0.05]; + optional float stats_low_mid_percentile = 4 [default = 0.2]; + optional float stats_mid_percentile = 5 [default = 0.5]; + optional float stats_high_mid_percentile = 6 [default = 0.8]; + optional float stats_high_percentile = 7 [default = 0.95]; + + optional int32 irls_iterations = 8 [default = 10]; + + message GainBiasBounds { + optional float min_inlier_fraction = 1 [default = 0.75]; + + // Accept 2% intensity difference as valid inlier. + optional float min_inlier_weight = 2 [default = 0.5]; + + optional float lower_gain = 3 [default = 0.75]; + optional float upper_gain = 4 [default = 1.334]; + + optional float lower_bias = 5 [default = -0.2]; + optional float upper_bias = 6 [default = 0.2]; + } + + optional GainBiasBounds stable_gain_bias_bounds = 9; + + // We support down-sampling of an incoming frame before running the + // resolution dependent part of the tone estimation. + // tracking if desired). + enum DownsampleMode { + DOWNSAMPLE_NONE = 1; // no downsampling. + DOWNSAMPLE_TO_MAX_SIZE = 2; // downsizes frame such that frame_size == + // downsampling_size. + // frame_size := max(width, height). + DOWNSAMPLE_BY_FACTOR = 3; // downsizes frame by pre-defined factor. + DOWNSAMPLE_TO_MIN_SIZE = 4; // downsizes frame such that frame_size == + // downsampling_size. + // frame_size := min(width, height). 
+ } + + optional DownsampleMode downsample_mode = 10 [default = DOWNSAMPLE_NONE]; + + // Specify the size of either dimension here, the frame will be + // downsampled to fit downsampling_size. + optional int32 downsampling_size = 11 [default = 256]; + optional float downsample_factor = 12 [default = 2.0]; +} + +message ToneMatch { + // Intensity in current frame. + optional float curr_val = 1; + // Matching intensity in previous frame. + optional float prev_val = 2; +} + +message PatchToneMatch { + // Several intensity matches computed from equal percentiles of matching patch + // pairs. No number or particular ordering is assumed. + repeated ToneMatch tone_match = 1; + optional float irls_weight = 2 [default = 1.0]; +} diff --git a/mediapipe/util/tracking/tone_models.cc b/mediapipe/util/tracking/tone_models.cc new file mode 100644 index 000000000..ab98bfbaf --- /dev/null +++ b/mediapipe/util/tracking/tone_models.cc @@ -0,0 +1,144 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/tone_models.h" + +#include + +#include "absl/strings/str_format.h" + +namespace mediapipe { + +LogDomainLUTImpl::LogDomainLUTImpl() { + log_lut_.resize(256); + const float alpha = 0.05f; + + // Mapping: log(1 + x) = y \in [0, log(256.0)]. 
+ for (int k = 0; k < 256; ++k) { + log_lut_[k] = std::log(1.0f + alpha * k); + } + + exp_lut_.resize(kExpBins); + max_log_value_ = std::log(1.0f + alpha * 255.0f) * 1.001; + inv_max_log_value_ = 1.0f / max_log_value_; + + const float denom = (1.0f / (kExpBins - 2)); + // Inverse operation: exp(y) - 1.0 = x \in [0, 255]. + for (int k = 0; k < kExpBins; ++k) { + exp_lut_[k] = + (std::exp(k * denom * max_log_value_) - 1.0f) * (1.0f / alpha); + } +} + +template +void ToneModelMethods::MapImage(const Model& model, + bool log_domain, + bool normalized_model, + const cv::Mat& input, + cv::Mat* output) { + CHECK(output != nullptr); + + const int out_channels = output->channels(); + CHECK_EQ(input.channels(), 3); + CHECK_LE(out_channels, 3); + CHECK_EQ(input.rows, output->rows); + CHECK_EQ(input.cols, output->cols); + + float norm_scale = + normalized_model + ? (log_domain ? (1.0f / LogDomainLUT().MaxLogDomainValue()) + : (1.0f / 255.0f)) + : 1.0f; + + const float inv_norm_scale = 1.0f / norm_scale; + + for (int i = 0; i < input.rows; ++i) { + const uint8* input_ptr = input.ptr(i); + uint8* output_ptr = output->ptr(i); + for (int j = 0; j < input.cols; + ++j, input_ptr += 3, output_ptr += out_channels) { + Vector3_f color_vec(input_ptr[0], input_ptr[1], input_ptr[2]); + Vector3_f mapped; + if (log_domain) { + mapped = Adapter::TransformColor( + model, LogDomainLUT().MapVec(color_vec) * norm_scale); + mapped = LogDomainLUT().UnMapVec(mapped * inv_norm_scale); + } else { + mapped = Adapter::TransformColor(model, color_vec * norm_scale) * + inv_norm_scale; + } + // Clamp to visible range and output. 
+ const Vector3_i result = RoundAndClampColor(mapped); + for (int c = 0; c < out_channels; ++c) { + output_ptr[c] = result[c]; + } + } + } +} + +GainBiasModel ToneModelAdapter::AddIdentity( + const GainBiasModel& model) { + GainBiasModel result = model; + result.set_gain_c1(result.gain_c1() + 1.0f); + result.set_gain_c2(result.gain_c2() + 1.0f); + result.set_gain_c3(result.gain_c3() + 1.0f); + return result; +} + +GainBiasModel ToneModelAdapter::ScaleParameters( + const GainBiasModel& model, float scale) { + GainBiasModel result = model; + result.set_gain_c1(result.gain_c1() * scale); + result.set_gain_c2(result.gain_c2() * scale); + result.set_gain_c3(result.gain_c3() * scale); + result.set_bias_c1(result.bias_c1() * scale); + result.set_bias_c2(result.bias_c2() * scale); + result.set_bias_c3(result.bias_c3() * scale); + return result; +} + +std::string ToneModelAdapter::ToString( + const GainBiasModel& model) { + return absl::StrFormat("%f %f | %f %f | %f %f", model.gain_c1(), + model.bias_c1(), model.gain_c2(), model.bias_c2(), + model.gain_c3(), model.bias_c3()); +} + +AffineToneModel ToneModelAdapter::AddIdentity( + const AffineToneModel& model) { + AffineToneModel result = model; + result.set_g_00(result.g_00() + 1.0f); + result.set_g_11(result.g_11() + 1.0f); + result.set_g_22(result.g_22() + 1.0f); + return result; +} + +AffineToneModel ToneModelAdapter::ScaleParameters( + const AffineToneModel& model, float scale) { + float elems[kNumParameters]; + ToPointer(model, elems); + for (int k = 0; k < kNumParameters; ++k) { + elems[k] *= scale; + } + return FromPointer(elems, false); +} + +// Explicit instantiations. 
+template class MixtureToneAdapter; +template class MixtureToneAdapter; + +template class ToneModelMethods; +template class ToneModelMethods; + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/tone_models.h b/mediapipe/util/tracking/tone_models.h new file mode 100644 index 000000000..266257e1f --- /dev/null +++ b/mediapipe/util/tracking/tone_models.h @@ -0,0 +1,628 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Implements various tone-models describing tonal change of color intensities +// across frame pairs. + +#ifndef MEDIAPIPE_UTIL_TRACKING_TONE_MODELS_H_ +#define MEDIAPIPE_UTIL_TRACKING_TONE_MODELS_H_ + +#include +#include +#include +#include + +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/tone_models.pb.h" + +namespace mediapipe { +// Abstract Adapter for tone models. +template +class ToneModelAdapter { + public: + // Initialized Model from pointer to arguments. If identitiy_parametrization + // is set, args = {0,0, ... 0} correspond to identity model. + template + static Model FromPointer(const T* args, bool identity_parametrization); + + // Outputs parameters to pointer args. 
+ template + static void ToPointer(const GainBiasModel& model, T* args); + + // Transforms color (RGB space) according to model. + static Vector3_f TransformColor(const Model& model, const Vector3_f& color); + + // Inverts color mode, if not invertible success is set to zero. + static Model InvertChecked(const Model& model, bool* success); + static Model Compose(const Model& lhs, const Model& rhs); + + static int NumParameters(); + static float GetParameter(const Model& model, int idx); + + static float Determinant(const Model& model); +}; + +// LogDomain LUT: +// Converts from intensity domain to log-domain via Map, inverse mapping via +// UnMap. +// Use as Singleton only, via inline LogDomainLUT() +class LogDomainLUTImpl { + public: + LogDomainLUTImpl(const LogDomainLUTImpl&) = delete; + LogDomainLUTImpl& operator=(const LogDomainLUTImpl&) = delete; + + // Maps intensity between [0, 255] to log-domain by applying log(1 + x). + // Result will be within [0, log(256)]. Truncation occurs for non-integer + // inputs. Discritization error is at most 0.5 intensity levels. + inline float Map(float value) const { + return log_lut_[std::max(0, std::min(255, value + 0.5f))]; + } + + inline Vector3_f MapVec(Vector3_f vec) const { + return Vector3_f(Map(vec.x()), Map(vec.y()), Map(vec.z())); + } + + // Unmaps intensity from log-domain in [0, log(256)] = [0, 2.4] to regular + // domain, via exp(x) - 1.0. Discritization error of at most 0.01 intensity + // levels might occur. + inline float UnMap(float value) const { + const int bin = value * inv_max_log_value_ * (kExpBins - 2); + return exp_lut_[std::max(0, std::min(bin, kExpBins - 1))]; + } + + inline Vector3_f UnMapVec(Vector3_f vec) const { + return Vector3_f(UnMap(vec.x()), UnMap(vec.y()), UnMap(vec.z())); + } + + float MaxLogDomainValue() const { return max_log_value_; } // ~ log(256). 
+ + private: + friend const LogDomainLUTImpl& LogDomainLUT(); + + LogDomainLUTImpl(); + std::vector log_lut_; + std::vector exp_lut_; + float max_log_value_; + float inv_max_log_value_; + enum { + kExpBins = 2560 + }; // Allocated such that exp(2.4) - + // exp(2.4 * (1.0 - 1.0 / kExpBins)) < 0.01. +}; + +inline const LogDomainLUTImpl& LogDomainLUT() { + static auto p = new LogDomainLUTImpl(); + return *p; +} + +inline Vector3_i RoundAndClampColor(const Vector3_f& vec) { + return Vector3_i( + std::max(0, std::min(255, static_cast(vec.x() + 0.5f))), + std::max(0, std::min(255, static_cast(vec.y() + 0.5f))), + std::max(0, std::min(255, static_cast(vec.z() + 0.5f)))); +} + +template <> +class ToneModelAdapter { + public: + template + static GainBiasModel FromPointer(const T* args, + bool identity_parametrization); + template + static void ToPointer(const GainBiasModel& model, T* args); + + inline static Vector3_f TransformColor(const GainBiasModel& model, + const Vector3_f& color); + + inline static GainBiasModel InvertChecked(const GainBiasModel& model, + bool* success); + + inline static GainBiasModel Compose(const GainBiasModel& lhs, + const GainBiasModel& rhs); + + enum { kNumParameters = 6 }; + static int NumParameters() { return kNumParameters; } + inline static float GetParameter(const GainBiasModel& model, int idx); + + static float Determinant(const GainBiasModel& model); + + static GainBiasModel AddIdentity(const GainBiasModel& model); + static GainBiasModel ScaleParameters(const GainBiasModel& model, float scale); + + static std::string ToString(const GainBiasModel& model); +}; + +template <> +class ToneModelAdapter { + public: + template + static AffineToneModel FromPointer(const T* args, + bool identity_parametrization); + template + static void ToPointer(const AffineToneModel& model, T* args) { + return ToPointerPad(model, false, args); + } + + // If pad_square is set the 12 DOF 3x4 model is embedded in a 4x4 model + // (last row = [ 0 0 0 1]) and output 
to args (row major ordering), + // otherwise the original 12 DOF 3x4 model is written to args (row major). + template + static void ToPointerPad(const AffineToneModel& model, bool pad_square, + T* args); + + inline static Vector3_f TransformColor(const AffineToneModel& model, + const Vector3_f& color); + + inline static AffineToneModel InvertChecked(const AffineToneModel& model, + bool* success); + + inline static AffineToneModel Compose(const AffineToneModel& lhs, + const AffineToneModel& rhs); + + enum { kNumParameters = 12 }; + static int NumParameters() { return kNumParameters; } + inline static float GetParameter(const AffineToneModel& model, int idx); + + // Used during stabilization: Adds identity model (model + I) + // and scales each parameter by scale. + static AffineToneModel AddIdentity(const AffineToneModel& model); + static AffineToneModel ScaleParameters(const AffineToneModel& model, + float scale); +}; + +typedef ToneModelAdapter GainBiasModelAdapter; +typedef ToneModelAdapter AffineToneModelAdapter; + +// Mixture Models. 
+template +struct MixtureToneTraits { + typedef BaseModel BaseModelType; + typedef MixtureModel ModelType; +}; + +typedef MixtureToneTraits + GainBiasModelTraits; +typedef MixtureToneTraits + AffineToneModelTraits; + +template +class MixtureToneAdapter { + typedef typename Traits::ModelType MixtureModel; + typedef typename Traits::BaseModelType BaseModel; + typedef ToneModelAdapter BaseModelAdapter; + enum { kBaseNumParameters = BaseModelAdapter::kNumParameters }; + + public: + template + static MixtureModel FromPointer(const T* args, bool identity_parametrization, + int skip, int num_models); + + template + static void ToPointer(const MixtureModel& model, T* args); + + static int NumParameters(const MixtureModel& model) { + return model.model_size() * kBaseNumParameters; + } + + inline static float GetParameter(const MixtureModel& model, int model_id, + int param_id); + + inline static MixtureModel IdentityModel(int num_mixtures); + + inline static BaseModel ToBaseModel(const MixtureModel& mixture_model, + const float* weights); + + inline static Vector3_f TransformColor(const MixtureModel& model, + const float* weights, + const Vector3_f& pt); + + // Mixtures are not invertible via closed form, but invertible for a specific + // set of weights by computing the underlying BaseModel via ToBaseModel. + // Resulting BaseModel is applied to pt, if BaseModel is not invertible + // success is set to false. + inline static Vector3_f SolveForColorChecked(const MixtureModel& model, + const float* weights, + const Vector3_f& pt, + bool* success); +}; + +typedef MixtureToneAdapter MixtureGainBiasModelAdapter; +typedef MixtureToneAdapter MixtureAffineToneModelAdapter; + +template +GainBiasModel ToneModelAdapter::FromPointer(const T* args, + bool identity) { + DCHECK(args); + GainBiasModel model; + const float id_shift = identity ? 
1.0f : 0.0f; + model.set_gain_c1(args[0] + id_shift); + model.set_bias_c1(args[1]); + model.set_gain_c2(args[2] + id_shift); + model.set_bias_c2(args[3]); + model.set_gain_c3(args[4] + id_shift); + model.set_bias_c3(args[5]); + return model; +} + +template +void ToneModelAdapter::ToPointer(const GainBiasModel& model, + T* args) { + args[0] = model.gain_c1(); + args[1] = model.bias_c1(); + args[2] = model.gain_c2(); + args[3] = model.bias_c2(); + args[4] = model.gain_c3(); + args[5] = model.bias_c3(); +} + +inline Vector3_f ToneModelAdapter::TransformColor( + const GainBiasModel& model, const Vector3_f& color) { + return Vector3_f(model.gain_c1() * color.x() + model.bias_c1(), + model.gain_c2() * color.y() + model.bias_c2(), + model.gain_c3() * color.z() + model.bias_c3()); +} + +inline float ToneModelAdapter::Determinant( + const GainBiasModel& model) { + return model.gain_c1() * model.gain_c2() * model.gain_c3(); +} + +inline GainBiasModel ToneModelAdapter::InvertChecked( + const GainBiasModel& model, bool* success) { + // (g_1 0 0 b_1 == (1/g_1 0 0 -b_1/g_1 + // 0 g_2 0 b_2 0 1/g_2 0 -b_2/g_2 + // 0 0 g_3 b_3 0 0 1/g_3 -b_3/g_3 + // 0 0 0 1)^(-1) 0 0 0 1 + + // Compute determinant. 
+ const float det = GainBiasModelAdapter::Determinant(model); + if (fabs(det) < 1e-10f) { + *success = false; + LOG(ERROR) << "Model not invertible."; + return GainBiasModel(); + } + + *success = true; + GainBiasModel result; + const float inv_gain_c1 = 1.0f / model.gain_c1(); + const float inv_gain_c2 = 1.0f / model.gain_c2(); + const float inv_gain_c3 = 1.0f / model.gain_c3(); + result.set_gain_c1(inv_gain_c1); + result.set_bias_c1(inv_gain_c1 * -model.bias_c1()); + result.set_gain_c2(inv_gain_c2); + result.set_bias_c2(inv_gain_c2 * -model.bias_c2()); + result.set_gain_c3(inv_gain_c3); + result.set_bias_c3(inv_gain_c3 * -model.bias_c3()); + return result; +} + +inline GainBiasModel ToneModelAdapter::Compose( + const GainBiasModel& lhs, const GainBiasModel& rhs) { + GainBiasModel result; + result.set_gain_c1(lhs.gain_c1() * rhs.gain_c1()); + result.set_bias_c1(lhs.gain_c1() * rhs.bias_c1() + lhs.bias_c1()); + result.set_gain_c2(lhs.gain_c2() * rhs.gain_c2()); + result.set_bias_c2(lhs.gain_c2() * rhs.bias_c2() + lhs.bias_c2()); + result.set_gain_c3(lhs.gain_c3() * rhs.gain_c3()); + result.set_bias_c3(lhs.gain_c3() * rhs.bias_c3() + lhs.bias_c3()); + return result; +} + +inline float ToneModelAdapter::GetParameter( + const GainBiasModel& model, int idx) { + switch (idx) { + case 0: + return model.gain_c1(); + case 1: + return model.bias_c1(); + case 2: + return model.gain_c2(); + case 3: + return model.bias_c2(); + case 4: + return model.gain_c3(); + case 5: + return model.bias_c3(); + default: + LOG(FATAL) << "Unknown parameter requested."; + } + + return 0.0f; +} +template +AffineToneModel ToneModelAdapter::FromPointer(const T* args, + bool identity) { + DCHECK(args); + AffineToneModel model; + const float id_shift = identity ? 
1.0f : 0.0f; + model.set_g_00(args[0] + id_shift); + model.set_g_01(args[1]); + model.set_g_02(args[2]); + model.set_g_03(args[3]); + + model.set_g_10(args[4]); + model.set_g_11(args[5] + id_shift); + model.set_g_12(args[6]); + model.set_g_13(args[7]); + + model.set_g_20(args[8]); + model.set_g_21(args[9]); + model.set_g_22(args[10] + id_shift); + model.set_g_23(args[11]); + return model; +} + +template +void ToneModelAdapter::ToPointerPad( + const AffineToneModel& model, bool pad_square, T* args) { + DCHECK(args); + args[0] = model.g_00(); + args[1] = model.g_01(); + args[2] = model.g_02(); + args[3] = model.g_03(); + + args[4] = model.g_10(); + args[5] = model.g_11(); + args[6] = model.g_12(); + args[7] = model.g_13(); + + args[8] = model.g_20(); + args[9] = model.g_21(); + args[10] = model.g_22(); + args[11] = model.g_23(); + + if (pad_square) { + args[12] = 0; + args[13] = 0; + args[14] = 0; + args[15] = 1; + } +} + +inline Vector3_f ToneModelAdapter::TransformColor( + const AffineToneModel& model, const Vector3_f& color) { + return Vector3_f(model.g_00() * color.x() + model.g_01() * color.y() + + model.g_02() * color.z() + model.g_03(), + model.g_10() * color.x() + model.g_11() * color.y() + + model.g_12() * color.z() + model.g_13(), + model.g_20() * color.x() + model.g_21() * color.y() + + model.g_22() * color.z() + model.g_23()); +} + +inline AffineToneModel ToneModelAdapter::InvertChecked( + const AffineToneModel& model, bool* success) { + double data[16]; + double inv_data[16]; + ToPointerPad(model, true, data); + + cv::Mat model_mat(4, 4, CV_64F, data); + cv::Mat inv_model_mat(4, 4, CV_64F, inv_data); + + if (cv::invert(model_mat, inv_model_mat) < 1e-10) { + LOG(ERROR) << "AffineToneModel not invertible, det is zero."; + *success = false; + return AffineToneModel(); + } + + *success = true; + return FromPointer(inv_data, false); +} + +inline AffineToneModel ToneModelAdapter::Compose( + const AffineToneModel& lhs, const AffineToneModel& rhs) { + 
AffineToneModel result; + double lhs_data[16]; + double rhs_data[16]; + double result_data[16]; + ToPointerPad(lhs, true, lhs_data); + ToPointerPad(rhs, true, rhs_data); + + cv::Mat lhs_mat(4, 4, CV_64F, lhs_data); + cv::Mat rhs_mat(4, 4, CV_64F, rhs_data); + cv::Mat result_mat(4, 4, CV_64F, result_data); + + cv::gemm(lhs_mat, rhs_mat, 1.0, cv::Mat(), 0, result_mat); + return FromPointer(result_data, false); +} + +inline float ToneModelAdapter::GetParameter( + const AffineToneModel& model, int idx) { + switch (idx) { + case 0: + return model.g_00(); + case 1: + return model.g_01(); + case 2: + return model.g_02(); + case 3: + return model.g_03(); + case 4: + return model.g_10(); + case 5: + return model.g_11(); + case 6: + return model.g_12(); + case 7: + return model.g_13(); + case 8: + return model.g_20(); + case 9: + return model.g_21(); + case 10: + return model.g_22(); + case 11: + return model.g_23(); + default: + LOG(FATAL) << "Unknown parameter requested."; + } + + return 0.0f; +} + +template +template +typename Traits::ModelType MixtureToneAdapter::FromPointer( + const T* args, bool identity_parametrization, int skip, int num_models) { + MixtureModel model; + const T* arg_ptr = args; + for (int i = 0; i < num_models; ++i, arg_ptr += kBaseNumParameters + skip) { + BaseModel base = + BaseModelAdapter::FromPointer(arg_ptr, identity_parametrization); + model.add_model()->CopyFrom(base); + } + + return model; +} + +template +template +void MixtureToneAdapter::ToPointer(const MixtureModel& model, T* args) { + const T* arg_ptr = args; + const int num_models = model.model_size(); + for (int i = 0; i < num_models; ++i, arg_ptr += kBaseNumParameters) { + BaseModel base = BaseModelAdapter::ToPointer(model.model(i), arg_ptr); + } +} + +template +inline float MixtureToneAdapter::GetParameter(const MixtureModel& model, + int model_id, + int param_id) { + return BaseModelAdapter::GetParameter(model.model(model_id), param_id); +} + +template +inline typename 
Traits::ModelType MixtureToneAdapter::IdentityModel( + int num_mixtures) { + MixtureModel model; + for (int i = 0; i < num_mixtures; ++i) { + model.add_model(); + } + return model; +} + +template +inline typename MixtureToneAdapter::BaseModel +MixtureToneAdapter::ToBaseModel(const MixtureModel& mixture_model, + const float* weights) { + const int num_models = mixture_model.model_size(); + + float params[kBaseNumParameters]; + memset(params, 0, sizeof(params[0]) * kBaseNumParameters); + + // Weighted combination of mixture models. + for (int m = 0; m < num_models; ++m) { + for (int k = 0; k < kBaseNumParameters; ++k) { + params[k] += BaseModelAdapter::GetParameter(mixture_model.model(m), k) * + weights[m]; + } + } + + return BaseModelAdapter::FromPointer(params, false); +} + +template +inline Vector3_f MixtureToneAdapter::TransformColor( + const MixtureModel& model, const float* weights, const Vector3_f& pt) { + const int num_models = model.model_size(); + Vector3_f result(0, 0, 0); + for (int i = 0; i < num_models; ++i) { + result += BaseModelAdapter::TransformColor(model.model(i), pt) * weights[i]; + } + + return result; +} + +template +inline Vector3_f MixtureToneAdapter::SolveForColorChecked( + const MixtureModel& model, const float* weights, const Vector3_f& pt, + bool* success) { + BaseModel base_model = ToBaseModel(model, weights); + BaseModel inv_base_model = + BaseModelAdapter::InvertChecked(base_model, success); + return BaseModelAdapter::TransformColor(inv_base_model, pt); +} + +template +class ToneModelMethods { + public: + // Maps image input to output by applying model to each pixel's intensity. + // Set log_domain to true, if model has been estimated in the log-domain. + // Set normalized_model to true, if model is normalized (expected input + // intensity range in [0, 1]). + // Number of output channels (N) should be <= number of input channels (M), + // in which case the only first N input channels are copied to the first N + // output channels. 
+ static void MapImage(const Model& model, bool log_domain, + bool normalized_model, const cv::Mat& input, + cv::Mat* output); + + // Fast mapping version of above function via LUT if color model is + // independent across specified C channels (that is model is a diagonal + // matrix). + // Otherwise incorrect results will be obtained. + template + static void MapImageIndependent(const Model& model, bool log_domain, + bool normalized_model, const cv::Mat& input, + cv::Mat* output); +}; + +typedef ToneModelMethods + GainBiasModelMethods; + +typedef ToneModelMethods + AffineToneModelMethods; + +template +template +void ToneModelMethods::MapImageIndependent( + const Model& model, bool log_domain, bool normalized_model, + const cv::Mat& input, cv::Mat* output) { + CHECK(output != nullptr); + CHECK_EQ(input.channels(), C); + CHECK_EQ(output->channels(), C); + + // Input LUT which will be mapped to the output LUT by the tone change model. + // Needs 3 channels to represent input RGB colors, but since they are assumed + // independent, we can assign the same value to each channel, which will be + // transformed by the respective channel's transform. + cv::Mat lut_input(1, 256, CV_8UC3); + uint8* lut_ptr = lut_input.ptr(0); + for (int k = 0; k < 256; ++k, lut_ptr += 3) { + for (int c = 0; c < 3; ++c) { + lut_ptr[c] = k; + } + } + + // Output LUT. Only needs C channels (as many in the input/output). + cv::Mat lut(1, 256, CV_8UC(C)); + MapImage(model, log_domain, normalized_model, lut_input, &lut); + + cv::LUT(input, lut, *output); +} + +// TODO: Implement for mixtures. 
+// typedef ToneModelMethods
+// MixtureGainBiasModelMethods;
+// typedef ToneModelMethods
+// MixtureAffineToneModelMethods;
+
+} // namespace mediapipe
+
+#endif // MEDIAPIPE_UTIL_TRACKING_TONE_MODELS_H_
diff --git a/mediapipe/util/tracking/tone_models.proto b/mediapipe/util/tracking/tone_models.proto
new file mode 100644
index 000000000..d53ad2172
--- /dev/null
+++ b/mediapipe/util/tracking/tone_models.proto
@@ -0,0 +1,64 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Models to describe color transforms between adjacent frames.
+// Color transforms are always specified for 3 channels (RGB).
+ +syntax = "proto2"; + +package mediapipe; + +// Transforms a 3D color vector x = (c1, c2, c3) according to +// [ gain_c1 0 0 bias_c1 * [ c1 +// 0 gain_c2 0 bias_c2 c2 +// 0 0 gain_c3 bias_c3 ] c3 +// 1 ] +message GainBiasModel { + optional float gain_c1 = 1 [default = 1.0]; + optional float bias_c1 = 2 [default = 0.0]; + optional float gain_c2 = 3 [default = 1.0]; + optional float bias_c2 = 4 [default = 0.0]; + optional float gain_c3 = 5 [default = 1.0]; + optional float bias_c3 = 6 [default = 0.0]; +} + +message MixtureGainBiasModel { + repeated GainBiasModel model = 1; +} + +// Transforms a 3D color vector x = (c1, c2, c3) according to +// [ g_00 g_01 g_02 g_03 * [ c1 +// g_10 g_11 g_12 g_13 c2 +// g_20 g_21 g_22 g_23 ] c3 +// 1 ] +message AffineToneModel { + optional float g_00 = 1 [default = 1.0]; + optional float g_01 = 2 [default = 0.0]; + optional float g_02 = 3 [default = 0.0]; + optional float g_03 = 4 [default = 0.0]; + + optional float g_10 = 5 [default = 0.0]; + optional float g_11 = 6 [default = 1.0]; + optional float g_12 = 7 [default = 0.0]; + optional float g_13 = 8 [default = 0.0]; + + optional float g_20 = 9 [default = 0.0]; + optional float g_21 = 10 [default = 0.0]; + optional float g_22 = 11 [default = 1.0]; + optional float g_23 = 12 [default = 0.0]; +} + +message MixtureAffineToneModel { + repeated AffineToneModel model = 1; +} diff --git a/mediapipe/util/tracking/tracked_detection.cc b/mediapipe/util/tracking/tracked_detection.cc new file mode 100644 index 000000000..3127a9e44 --- /dev/null +++ b/mediapipe/util/tracking/tracked_detection.cc @@ -0,0 +1,144 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/tracked_detection.h" + +#include "mediapipe/framework/formats/rect.pb.h" + +namespace mediapipe { + +namespace { + +// Struct for carrying boundary information. +struct NormalizedRectBounds { + float left, right, top, bottom; +}; + +// Computes the area of a NormalizedRect. +float BoxArea(const NormalizedRect& box) { return box.width() * box.height(); } + +// Computes the bounds of a NormalizedRect. +void GetNormalizedRectBounds(const NormalizedRect& box, + NormalizedRectBounds* bounds) { + bounds->left = box.x_center() - box.width() / 2.f; + bounds->right = box.x_center() + box.width() / 2.f; + bounds->top = box.y_center() - box.height() / 2.f; + bounds->bottom = box.y_center() + box.height() / 2.f; +} + +// Computes the overlapping area of two boxes, ignoring rotation. +float OverlapArea(const NormalizedRect& box1, const NormalizedRect& box2) { + NormalizedRectBounds bounds1, bounds2; + GetNormalizedRectBounds(box1, &bounds1); + GetNormalizedRectBounds(box2, &bounds2); + const float x_overlap = + std::max(0.f, std::min(bounds1.right, bounds2.right) - + std::max(bounds1.left, bounds2.left)); + const float y_overlap = + std::max(0.f, std::min(bounds1.bottom, bounds2.bottom) - + std::max(bounds1.top, bounds2.top)); + return x_overlap * y_overlap; +} + +std::array ComputeCorners(const NormalizedRect& normalized_box, + const Vector2_f& center, + int image_width, int image_height) { + NormalizedRectBounds bounds; + GetNormalizedRectBounds(normalized_box, &bounds); + // Rotate 4 corner w.r.t. center. 
+ std::array corners{{
+ Vector2_f(bounds.left * image_width, bounds.top * image_height),
+ Vector2_f(bounds.left * image_width, bounds.bottom * image_height),
+ Vector2_f(bounds.right * image_width, bounds.bottom * image_height),
+ Vector2_f(bounds.right * image_width, bounds.top * image_height),
+ }};
+ if (std::abs(normalized_box.rotation()) <= 1e-5) {
+ return corners;
+ }
+
+ const float cos_a = std::cos(normalized_box.rotation());
+ const float sin_a = std::sin(normalized_box.rotation());
+ for (int k = 0; k < 4; ++k) {
+ // Scale and rotate w.r.t. center.
+ const Vector2_f rad = corners[k] - center;
+ const Vector2_f rot_rad(cos_a * rad.x() - sin_a * rad.y(),
+ sin_a * rad.x() + cos_a * rad.y());
+ corners[k] = center + rot_rad;
+ }
+ return corners;
+}
+
+} // namespace
+
+void TrackedDetection::AddLabel(const std::string& label, float score) {
+ auto label_ptr = label_to_score_map_.find(label);
+ if (label_ptr == label_to_score_map_.end()) {
+ label_to_score_map_[label] = score;
+ } else {
+ label_ptr->second = label_ptr->second > score ? label_ptr->second : score;
+ }
+}
+
+bool TrackedDetection::IsSameAs(const TrackedDetection& other) const {
+ const auto box0 = bounding_box_;
+ const auto box1 = other.bounding_box_;
+ const double box0_area = BoxArea(box0);
+ const double box1_area = BoxArea(box1);
+ const double overlap_area = OverlapArea(box0, box1);
+
+ // For cases where a small object is in front of a big object.
+ // TODO: This is a hard threshold. Making the threshold smaller
+ // (e.g. 2.0) will cause issues when two detections of the same object are
+ // vertical to each other. For example, if we first get a detection (e.g. a
+ // long water bottle) vertically and then change the camera to horizontal
+ // quickly, then it will get another detection which will have a diamond shape
+ // that is much larger than the previous rectangle one.
+ const double kMaxAreaRatio = 3.0; + if (box0_area / box1_area > kMaxAreaRatio || + box1_area / box0_area > kMaxAreaRatio) + return false; + + const double kMaxOverlapRatio = 0.5; + if (overlap_area / box0_area > kMaxOverlapRatio || + overlap_area / box1_area > kMaxOverlapRatio) { + return true; + } + + return false; +} + +void TrackedDetection::MergeLabelScore(const TrackedDetection& other) { + for (const auto& label_score : other.label_to_score_map()) { + const auto label_score_ptr = label_to_score_map_.find(label_score.first); + if (label_score_ptr == label_to_score_map_.end()) { + AddLabel(label_score.first, label_score.second); + } else { + // TODO: Consider other strategy of merging scores, e.g. mean. + label_score_ptr->second = + std::max(label_score_ptr->second, label_score.second); + } + } +} + +std::array TrackedDetection::GetCorners( + float image_width, float image_height) const { + NormalizedRectBounds bounds; + GetNormalizedRectBounds(bounding_box_, &bounds); + Vector2_f center((bounds.right + bounds.left) / 2.0f * image_width, + (bounds.bottom + bounds.top) / 2.0f * image_height); + + return ComputeCorners(bounding_box_, center, image_width, image_height); +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/tracked_detection.h b/mediapipe/util/tracking/tracked_detection.h new file mode 100644 index 000000000..d4d0feee4 --- /dev/null +++ b/mediapipe/util/tracking/tracked_detection.h @@ -0,0 +1,127 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_UTIL_TRACKING_TRACKED_DETECTION_H_
+#define MEDIAPIPE_UTIL_TRACKING_TRACKED_DETECTION_H_
+
+#include
+
+#include "absl/container/node_hash_map.h"
+#include "mediapipe/framework/formats/rect.pb.h"
+#include "mediapipe/framework/port/vector.h"
+
+namespace mediapipe {
+
+// Class for keeping records of detected objects.
+// Each detection should be assigned an id to uniquely identify it.
+class TrackedDetection {
+ public:
+ TrackedDetection() = delete;
+ // Creates a detection that has a bounding box occupying the entire image.
+ TrackedDetection(int unique_id, int64 timestamp)
+ : unique_id_(unique_id),
+ initial_timestamp_(timestamp),
+ last_updated_timestamp_(timestamp),
+ previous_id_(-1) {}
+
+ // Creates a detection with the specified bounding box normalized by image
+ // dimensions.
+ TrackedDetection(int unique_id, int64 timestamp,
+ const ::mediapipe::NormalizedRect& bounding_box)
+ : unique_id_(unique_id),
+ initial_timestamp_(timestamp),
+ last_updated_timestamp_(timestamp),
+ previous_id_(-1) {
+ bounding_box_.CopyFrom(bounding_box);
+ set_bounds();
+ }
+
+ // Adds a (label, score) pair. If the label already exists then keep the
+ // higher score of both entries.
+ void AddLabel(const std::string& label, float score);
+
+ // Checks if two detections are in fact the same object.
+ // TODO: This is just the first step. It will be expanded to a family
+ // of functionalities that do reconciliation.
+ bool IsSameAs(const TrackedDetection& other) const;
+
+ // Merges labels and scores from another TrackedDetection.
+ void MergeLabelScore(const TrackedDetection& other); + + int64 initial_timestamp() const { return initial_timestamp_; } + int64 last_updated_timestamp() const { return last_updated_timestamp_; } + void set_last_updated_timestamp(int64 timestamp) { + last_updated_timestamp_ = timestamp; + } + + int unique_id() const { return unique_id_; } + + int previous_id() const { return previous_id_; } + void set_previous_id(int id) { previous_id_ = id; } + + const ::mediapipe::NormalizedRect& bounding_box() const { + return bounding_box_; + } + float left() const { return left_; } + float right() const { return right_; } + float top() const { return top_; } + float bottom() const { return bottom_; } + + void set_bounding_box(const ::mediapipe::NormalizedRect& bounding_box) { + bounding_box_ = bounding_box; + set_bounds(); + } + + // Gets corners of the bounding box of the object in image coordinates. + std::array GetCorners(float width, float height) const; + + // Gets corners of the bounding box of the object in normalized coordinates. + std::array GetCorners() const { return GetCorners(1.0f, 1.0f); } + + const absl::node_hash_map& label_to_score_map() const { + return label_to_score_map_; + } + + private: + void set_bounds() { + left_ = bounding_box_.x_center() - bounding_box_.width() / 2.f; + right_ = bounding_box_.x_center() + bounding_box_.width() / 2.f; + top_ = bounding_box_.y_center() - bounding_box_.height() / 2.f; + bottom_ = bounding_box_.y_center() + bounding_box_.height() / 2.f; + } + + int unique_id_; + + ::mediapipe::NormalizedRect bounding_box_; + float left_ = 0.f; + float right_ = 0.f; + float top_ = 0.f; + float bottom_ = 0.f; + + // The timestamp in milliseconds when this object is initialized. + int64 initial_timestamp_; + // The latest timestamp when this object is updated. + int64 last_updated_timestamp_; + + // The id of the previous detection that this detection is associated with. 
+ int previous_id_; + + // Each detection could have multiple labels, each with a corresponding + // detection score. + absl::node_hash_map label_to_score_map_; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_TRACKED_DETECTION_H_ diff --git a/mediapipe/util/tracking/tracked_detection_manager.cc b/mediapipe/util/tracking/tracked_detection_manager.cc new file mode 100644 index 000000000..4a822ea6b --- /dev/null +++ b/mediapipe/util/tracking/tracked_detection_manager.cc @@ -0,0 +1,191 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/util/tracking/tracked_detection_manager.h" + +#include + +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/util/tracking/tracked_detection.h" + +namespace { + +using ::mediapipe::TrackedDetection; + +// Checks if a point is out of view. +// x and y should both be in [0, 1] to be considered in view. +bool IsPointOutOfView(float x, float y) { + return x < 0.0f || x > 1.0f || y < 0.0f || y > 1.0f; +} + +// Checks if all corners of a bounding box of an object are out of view. 
+bool AreCornersOutOfView(const TrackedDetection& object) {
+ std::array corners = object.GetCorners();
+ for (int k = 0; k < 4; ++k) {
+ if (!IsPointOutOfView(corners[k].x(), corners[k].y())) {
+ return false;
+ }
+ }
+ return true;
+}
+} // namespace
+
+namespace mediapipe {
+
+std::vector TrackedDetectionManager::AddDetection(
+ std::unique_ptr detection) {
+ std::vector ids_to_remove;
+
+ int64 latest_duplicate_timestamp = 0;
+ // TODO: All detections should be fast-forwarded to the current
+ // timestamp before being added to the detection manager, i.e. only check
+ // that they are the same if the timestamps are the same.
+ for (auto& existing_detection_ptr : detections_) {
+ const auto& existing_detection = *existing_detection_ptr.second;
+ if (detection->IsSameAs(existing_detection)) {
+ // Merge previous labels into the new detection, because new detections
+ // usually have a better bounding box than the one from tracking.
+ // TODO: This might cause unstable change of the bounding box.
+ // TODO: Add a filter for the box using new detection as
+ // observation.
+ detection->MergeLabelScore(existing_detection);
+ // Pick the duplicated detection that has the latest initial timestamp as
+ // the previous detection of the new one.
+ if (existing_detection.initial_timestamp() > latest_duplicate_timestamp) {
+ latest_duplicate_timestamp = existing_detection.initial_timestamp();
+ if (existing_detection.previous_id() == -1) {
+ detection->set_previous_id(existing_detection.unique_id());
+ } else {
+ detection->set_previous_id(existing_detection.previous_id());
+ }
+ }
+ ids_to_remove.push_back(existing_detection_ptr.first);
+ }
+ }
+ // Erase old detections.
+ for (auto id : ids_to_remove) {
+ detections_.erase(id);
+ }
+ const int id = detection->unique_id();
+ detections_[id] = std::move(detection);
+ return ids_to_remove;
+}
+
+std::vector TrackedDetectionManager::UpdateDetectionLocation(
+ int id, const NormalizedRect& bounding_box, int64 timestamp) {
+ // TODO: Remove all boxes that are not updating.
+ auto detection_ptr = detections_.find(id);
+ if (detection_ptr == detections_.end()) {
+ return std::vector();
+ }
+ auto& detection = *detection_ptr->second;
+ detection.set_bounding_box(bounding_box);
+ detection.set_last_updated_timestamp(timestamp);
+
+ // It's required to do this here in addition to in AddDetection because during
+ // fast motion, two or more detections of the same object could coexist since
+ // the locations could be quite different before they are propagated to the
+ // same timestamp.
+ return RemoveDuplicatedDetections(detection.unique_id());
+}
+
+std::vector TrackedDetectionManager::RemoveObsoleteDetections(
+ int64 timestamp) {
+ std::vector ids_to_remove;
+ for (auto& existing_detection : detections_) {
+ if (existing_detection.second->last_updated_timestamp() < timestamp) {
+ ids_to_remove.push_back(existing_detection.first);
+ }
+ }
+ for (auto idx : ids_to_remove) {
+ detections_.erase(idx);
+ }
+ return ids_to_remove;
+}
+
+std::vector TrackedDetectionManager::RemoveOutOfViewDetections() {
+ std::vector ids_to_remove;
+ for (auto& existing_detection : detections_) {
+ if (AreCornersOutOfView(*existing_detection.second)) {
+ ids_to_remove.push_back(existing_detection.first);
+ }
+ }
+ for (auto idx : ids_to_remove) {
+ detections_.erase(idx);
+ }
+ return ids_to_remove;
+}
+
+std::vector TrackedDetectionManager::RemoveDuplicatedDetections(int id) {
+ std::vector ids_to_remove;
+ auto detection_ptr = detections_.find(id);
+ if (detection_ptr == detections_.end()) {
+ return ids_to_remove;
+ }
+ auto& detection = *detection_ptr->second;
+
+ // For duplicated detections, we keep the
one that's added most recently.
+ auto latest_detection = detection_ptr->second.get();
+ // For setting up the |previous_id| of the latest detection. For now, if there
+ // are multiple duplicated detections at the same timestamp, we will use the
+ // one that has the second latest initial timestamp.
+ const TrackedDetection* previous_detection = nullptr;
+ for (auto& existing_detection : detections_) {
+ auto other = *(existing_detection.second);
+ if (detection.unique_id() != other.unique_id()) {
+ // Only check if they are updated at the same timestamp. Comparing
+ // locations of detections at different timestamps is not correct.
+ if (detection.last_updated_timestamp() ==
+ other.last_updated_timestamp()) {
+ if (detection.IsSameAs(other)) {
+ const TrackedDetection* detection_to_remove = nullptr;
+ if (latest_detection->initial_timestamp() >=
+ other.initial_timestamp()) {
+ // Removes the earlier one.
+ ids_to_remove.push_back(other.unique_id());
+ detection_to_remove = existing_detection.second.get();
+ latest_detection->MergeLabelScore(other);
+ } else {
+ ids_to_remove.push_back(latest_detection->unique_id());
+ detection_to_remove = latest_detection;
+ existing_detection.second->MergeLabelScore(*latest_detection);
+ latest_detection = existing_detection.second.get();
+ }
+ if (!previous_detection ||
+ previous_detection->initial_timestamp() <
+ detection_to_remove->initial_timestamp()) {
+ previous_detection = detection_to_remove;
+ }
+ }
+ }
+ }
+ }
+
+ // If the latest detection is not the one passed into this function, it might
+ // already have its previous detection. In that case, we don't override it.
+ if (latest_detection->previous_id() == -1 && previous_detection) { + if (previous_detection->previous_id() == -1) { + latest_detection->set_previous_id(previous_detection->unique_id()); + } else { + latest_detection->set_previous_id(previous_detection->previous_id()); + } + } + + for (auto idx : ids_to_remove) { + detections_.erase(idx); + } + return ids_to_remove; +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/tracked_detection_manager.h b/mediapipe/util/tracking/tracked_detection_manager.h new file mode 100644 index 000000000..d42d7c17c --- /dev/null +++ b/mediapipe/util/tracking/tracked_detection_manager.h @@ -0,0 +1,82 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_DETECTION_MANAGER_H_ +#define MEDIAPIPE_UTIL_TRACKING_DETECTION_MANAGER_H_ + +#include "absl/container/node_hash_map.h" +#include "mediapipe/framework/formats/rect.pb.h" +#include "mediapipe/util/tracking/tracked_detection.h" + +namespace mediapipe { + +// Class for managing detections. +// The detections are from either server side or a mobile detector and are +// tracked using either 2D or 3D tracker. The TrackedDetectionManager is used to +// identify duplicated detections or obsolete detections to keep a set of +// active detections. 
+class TrackedDetectionManager {
+ public:
+ TrackedDetectionManager() = default;
+ ~TrackedDetectionManager() = default;
+
+ // Adds a new detection at a given timestamp. Returns the IDs of the
+ // detections that are removed due to duplication.
+ std::vector AddDetection(std::unique_ptr detection);
+
+ // Updates the location of a detection identified by its id.
+ // Returns an empty vector if the requested id doesn't have a corresponding
+ // detection. Otherwise returns the IDs of the detections that are removed
+ // due to duplication.
+ std::vector UpdateDetectionLocation(
+ int id, const ::mediapipe::NormalizedRect& bounding_box, int64 timestamp);
+
+ // Removes detections that are not updated after |timestamp|. Returns the IDs
+ // of the detections that are removed.
+ std::vector RemoveObsoleteDetections(int64 timestamp);
+
+ // TODO: Do we really need this? Pursuit tracker doesn't do well
+ // in loop closure. Boxes out of view are usually not attached to objects
+ // any more when they are in view again. Returns the IDs of the
+ // detections that are removed.
+ std::vector RemoveOutOfViewDetections();
+
+ int GetNumDetections() const { return detections_.size(); }
+
+ // Gets a TrackedDetection by its unique id.
+ const TrackedDetection* GetTrackedDetection(int id) const {
+ auto detection_ptr = detections_.find(id);
+ if (detection_ptr == detections_.end()) {
+ return nullptr;
+ }
+ return detection_ptr->second.get();
+ }
+
+ const absl::node_hash_map>&
+ GetAllTrackedDetections() const {
+ return detections_;
+ }
+
+ private:
+ // Finds all detections that are duplicated with the one of |id| and removes
+ // all detections except the one that is added most recently. Returns the IDs
+ // of the detections that are removed.
+  std::vector<int> RemoveDuplicatedDetections(int id);
+
+  absl::node_hash_map<int, std::unique_ptr<TrackedDetection>> detections_;
+};
+
+} // namespace mediapipe
+
+#endif // MEDIAPIPE_UTIL_TRACKING_DETECTION_MANAGER_H_
diff --git a/mediapipe/util/tracking/tracked_detection_test.cc b/mediapipe/util/tracking/tracked_detection_test.cc
new file mode 100644
index 000000000..60b9df1b1
--- /dev/null
+++ b/mediapipe/util/tracking/tracked_detection_test.cc
@@ -0,0 +1,109 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mediapipe/util/tracking/tracked_detection.h" + +#include "mediapipe/framework/port/gtest.h" + +namespace mediapipe { + +const float kErrorMargin = 1e-4f; + +TEST(TrackedDetectionTest, ConstructorWithoutBox) { + TrackedDetection detection(0, 0); + EXPECT_EQ(detection.unique_id(), 0); + EXPECT_EQ(detection.previous_id(), -1); + EXPECT_NEAR(detection.left(), 0.f, kErrorMargin); + EXPECT_NEAR(detection.top(), 0.f, kErrorMargin); + EXPECT_NEAR(detection.right(), 0.f, kErrorMargin); + EXPECT_NEAR(detection.bottom(), 0.f, kErrorMargin); +} + +TEST(TrackedDetectionTest, ConstructorWithAnOutOfBoundBox) { + NormalizedRect box; + box.set_x_center(-0.15); + box.set_y_center(-0.15); + box.set_height(0.1); + box.set_width(0.1); + TrackedDetection detection(0, 0, box); + EXPECT_EQ(detection.unique_id(), 0); + EXPECT_EQ(detection.previous_id(), -1); + EXPECT_NEAR(detection.left(), -0.2f, kErrorMargin); + EXPECT_NEAR(detection.top(), -0.2f, kErrorMargin); + EXPECT_NEAR(detection.right(), -0.1f, kErrorMargin); + EXPECT_NEAR(detection.bottom(), -0.1f, kErrorMargin); + EXPECT_NEAR(detection.bounding_box().width(), 0.1f, kErrorMargin); + EXPECT_NEAR(detection.bounding_box().height(), 0.1f, kErrorMargin); +} + +TEST(TrackedDetectionTest, ConstructorWithAnOutOfBoundBox2) { + NormalizedRect box; + box.set_x_center(0.5); + box.set_y_center(0.5); + box.set_height(1.2); + box.set_width(1.2); + TrackedDetection detection(0, 0, box); + EXPECT_EQ(detection.unique_id(), 0); + EXPECT_EQ(detection.previous_id(), -1); + EXPECT_NEAR(detection.left(), -0.1f, kErrorMargin); + EXPECT_NEAR(detection.top(), -0.1f, kErrorMargin); + EXPECT_NEAR(detection.right(), 1.1f, kErrorMargin); + EXPECT_NEAR(detection.bottom(), 1.1f, kErrorMargin); + EXPECT_NEAR(detection.bounding_box().width(), 1.2f, kErrorMargin); + EXPECT_NEAR(detection.bounding_box().height(), 1.2f, kErrorMargin); +} + +TEST(TrackedDetectionTest, CheckSameDetections) { + NormalizedRect box0; + box0.set_x_center(0.5); + 
box0.set_y_center(0.5);
+  box0.set_height(0.5);
+  box0.set_width(0.5);
+  NormalizedRect box1;
+  box1.set_x_center(0.55);
+  box1.set_y_center(0.5);
+  box1.set_height(0.5);
+  box1.set_width(0.5);
+  NormalizedRect box2;
+  box2.set_x_center(0.75);
+  box2.set_y_center(0.55);
+  box2.set_height(0.5);
+  box2.set_width(0.5);
+  TrackedDetection detection0(0, 0, box0);
+  TrackedDetection detection1(1, 0, box1);
+  TrackedDetection detection2(2, 0, box2);
+  EXPECT_TRUE(detection0.IsSameAs(detection1));
+  EXPECT_FALSE(detection0.IsSameAs(detection2));
+}
+
+TEST(TrackedDetectionTest, MergeLabels) {
+  TrackedDetection detection0(0, 0);
+  detection0.AddLabel("1", 1.f);
+  detection0.AddLabel("2", 2.f);
+  detection0.AddLabel("3", 3.f);
+
+  TrackedDetection detection1(1, 0);
+  detection1.AddLabel("1", 0.f);
+  detection1.AddLabel("2", 3.f);
+  detection1.AddLabel("4", 4.f);
+
+  detection0.MergeLabelScore(detection1);
+  const auto& labels = detection0.label_to_score_map();
+  EXPECT_EQ(labels.find("1")->second, 1.f);
+  EXPECT_EQ(labels.find("2")->second, 3.f);
+  EXPECT_EQ(labels.find("3")->second, 3.f);
+  EXPECT_EQ(labels.find("4")->second, 4.f);
+}
+
+} // namespace mediapipe
diff --git a/mediapipe/util/tracking/tracking.cc b/mediapipe/util/tracking/tracking.cc
new file mode 100644
index 000000000..3d4ea39e8
--- /dev/null
+++ b/mediapipe/util/tracking/tracking.cc
@@ -0,0 +1,3386 @@
+// Copyright 2019 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mediapipe/util/tracking/tracking.h" + +#include +#include +#include +#include +#include +#include + +#include "Eigen/Core" +#include "Eigen/Dense" +#include "Eigen/SVD" +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_calib3d_inc.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/util/tracking/flow_packager.pb.h" +#include "mediapipe/util/tracking/measure_time.h" +#include "mediapipe/util/tracking/motion_models.h" + +namespace mediapipe { + +bool MotionBox::print_motion_box_warnings_ = true; + +namespace { + +static constexpr int kNormalizationGridSize = 10; +constexpr float kShortScale = 16383.0f; +constexpr float kInvShortScale = 1.0f / kShortScale; + +// Motion vectors with weights larger than kMinInlierWeight are classified as +// inliers. +constexpr float kMinInlierWeight = 0.5f; +// Motion vectors with weights smaller han kMaxOutlierWeight are classified as +// outliers. +constexpr float kMaxOutlierWeight = 0.1f; + +// Lexicographic (first x, then y) comparator for MotionVector::pos. 
+struct MotionVectorComparator { + bool operator()(const MotionVector& lhs, const MotionVector& rhs) const { + return (lhs.pos.x() < rhs.pos.x()) || + (lhs.pos.x() == rhs.pos.x() && lhs.pos.y() < rhs.pos.y()); + } +}; + +void StoreInternalState(const std::vector& vectors, + const std::vector& inlier_weights, + float aspect_ratio, MotionBoxInternalState* internal) { + const int num_vectors = vectors.size(); + CHECK_EQ(num_vectors, inlier_weights.size()); + + float scale_x = 1.0f; + float scale_y = 1.0f; + ScaleFromAspect(aspect_ratio, true, &scale_x, &scale_y); + + internal->Clear(); + for (int k = 0; k < num_vectors; ++k) { + internal->add_pos_x(vectors[k]->pos.x() * scale_x); + internal->add_pos_y(vectors[k]->pos.y() * scale_y); + internal->add_dx(vectors[k]->object.x() * scale_x); + internal->add_dy(vectors[k]->object.y() * scale_y); + internal->add_camera_dx(vectors[k]->background.x() * scale_x); + internal->add_camera_dy(vectors[k]->background.y() * scale_y); + internal->add_track_id(vectors[k]->track_id); + internal->add_inlier_score(inlier_weights[k]); + } +} + +// protolite compatible MotionBoxState_TrackStatus_Name +std::string TrackStatusToString(MotionBoxState::TrackStatus status) { + switch (status) { + case MotionBoxState::BOX_UNTRACKED: + return "BOX_UNTRACKED"; + case MotionBoxState::BOX_EMPTY: + return "BOX_EMPTY"; + case MotionBoxState::BOX_NO_FEATURES: + return "BOX_NO_FEATURES"; + case MotionBoxState::BOX_TRACKED: + return "BOX_TRACKED"; + case MotionBoxState::BOX_DUPLICATED: + return "BOX_DUPLICATED"; + case MotionBoxState::BOX_TRACKED_OUT_OF_BOUND: + return "BOX_TRACKED_OUT_OF_BOUND"; + } + LOG(FATAL) << "Should not happen."; + return "UNKNOWN"; +} + +void ClearInlierState(MotionBoxState* state) { + state->clear_inlier_ids(); + state->clear_inlier_length(); + state->clear_inlier_id_match_pos(); + state->clear_outlier_ids(); + state->clear_outlier_id_match_pos(); +} + +// Returns orthogonal error system from motion_vec scaled by irls_scale. 
+std::pair ComputeIrlsErrorSystem( + const Vector2_f& irls_scale, const Vector2_f& motion_vec) { + Vector2_f irls_vec = motion_vec.Normalize(); + Vector2_f irls_vec_ortho = irls_vec.Ortho(); + return std::make_pair(irls_vec * irls_scale.x(), + irls_vec_ortho * irls_scale.y()); +} + +// Returns error for a given difference vector and error system. +float ErrorDiff(const Vector2_f& diff, + const std::pair& error_system) { + // Error system is an orthogonal system of originally unit vectors that were + // pre-multiplied by the corresponding irls scale. + // One can think of this function here as L2 norm *after* scaling the whole + // vector space w.r.t. the error system. + // + // In particular, we will project the vector diff onto this system and then + // scale the magnitude along each direction with the corresponding irls scale. + // As scalar multiplication is commutative with the dot product of vectors + // pre-multiplication of the scale with the error system is sufficient. + return Vector2_f(diff.DotProd(error_system.first), + diff.DotProd(error_system.second)) + .Norm(); +} + +// Returns true if point is within the inlier extent of the passed state (within +// small bound of 5% of frame diameter). +bool PointWithinInlierExtent(const Vector2_f pt, const MotionBoxState& state) { + // No extent known, assume to be inside. + if (state.prior_weight() == 0) { + return true; + } + + const float width_radius = state.inlier_width() * 0.55f; + const float height_radius = state.inlier_height() * 0.55f; + const float left = state.inlier_center_x() - width_radius; + const float right = state.inlier_center_x() + width_radius; + const float top = state.inlier_center_y() - height_radius; + const float bottom = state.inlier_center_y() + height_radius; + + return pt.x() >= left && pt.x() <= right && pt.y() >= top && pt.y() <= bottom; +} + +// Taken from MotionEstimation::LinearSimilarityL2SolveSystem. 
+bool LinearSimilarityL2Solve( + const std::vector& motion_vectors, + const std::vector& weights, LinearSimilarityModel* model) { + CHECK(model); + if (motion_vectors.size() < 4) { + LOG(ERROR) << "Requiring at least 4 input vectors for sufficient solve."; + return false; + } + + cv::Mat matrix(4, 4, CV_32F); + cv::Mat solution(4, 1, CV_32F); + cv::Mat rhs(4, 1, CV_32F); + + matrix.setTo(0); + rhs.setTo(0); + + CHECK_EQ(motion_vectors.size(), weights.size()); + for (int k = 0; k < motion_vectors.size(); ++k) { + const float x = motion_vectors[k]->pos.x(); + const float y = motion_vectors[k]->pos.y(); + const float w = weights[k]; + + // double J[2 * 4] = {1, 0, x, -y, + // 0, 1, y, x}; + // Compute J^t * J * w = {1, 0, x, -y + // 0, 1, y, x, + // x, y, xx+yy, 0, + // -y, x, 0, xx+yy} * w; + + const float x_w = x * w; + const float y_w = y * w; + const float xx_yy_w = (x * x + y * y) * w; + float* matrix_ptr = matrix.ptr(0); + matrix_ptr[0] += w; + matrix_ptr[2] += x_w; + matrix_ptr[3] += -y_w; + + matrix_ptr += 4; + matrix_ptr[1] += w; + matrix_ptr[2] += y_w; + matrix_ptr[3] += x_w; + + matrix_ptr += 4; + matrix_ptr[0] += x_w; + matrix_ptr[1] += y_w; + matrix_ptr[2] += xx_yy_w; + + matrix_ptr += 4; + matrix_ptr[0] += -y_w; + matrix_ptr[1] += x_w; + matrix_ptr[3] += xx_yy_w; + + float* rhs_ptr = rhs.ptr(0); + + const float m_x = motion_vectors[k]->object.x() * w; + const float m_y = motion_vectors[k]->object.y() * w; + + rhs_ptr[0] += m_x; + rhs_ptr[1] += m_y; + rhs_ptr[2] += x * m_x + y * m_y; + rhs_ptr[3] += -y * m_x + x * m_y; + } + + // Solution parameters p. + if (cv::solve(matrix, rhs, solution)) { + const float* ptr = solution.ptr(0); + model->set_dx(ptr[0]); + model->set_dy(ptr[1]); + model->set_a(ptr[2] + 1.0); // Identity parametrization. 
+ model->set_b(ptr[3]); + return true; + } else { + return false; + } +} + +// Taken from MotionEstimation::HomographyL2NormalEquationSolve +bool HomographyL2Solve(const std::vector& motion_vectors, + const std::vector& weights, Homography* model) { + CHECK(model); + + cv::Mat matrix(8, 8, CV_32F); + cv::Mat solution(8, 1, CV_32F); + cv::Mat rhs(8, 1, CV_32F); + + matrix.setTo(0); + rhs.setTo(0); + + // Matrix multiplications are hand-coded for speed improvements vs. + // opencv's cvGEMM calls. + CHECK_EQ(motion_vectors.size(), weights.size()); + for (int k = 0; k < motion_vectors.size(); ++k) { + const float x = motion_vectors[k]->pos.x(); + const float y = motion_vectors[k]->pos.y(); + const float w = weights[k]; + + const float xw = x * w; + const float yw = y * w; + const float xxw = x * x * w; + const float yyw = y * y * w; + const float xyw = x * y * w; + const float mx = x + motion_vectors[k]->object.x(); + const float my = y + motion_vectors[k]->object.y(); + + const float mxxyy = mx * mx + my * my; + // Jacobian + // double J[2 * 8] = {x, y, 1, 0, 0, 0, -x * m_x, -y * m_x, + // {0, 0, 0, x, y, 1, -x * m_y, -y * m_y} + // + // // Compute J^t * J * w = + // ( xx xy x 0 0 0 -xx*mx -xy*mx ) + // ( xy yy y 0 0 0 -xy*mx -yy*mx ) + // ( x y 1 0 0 0 -x*mx -y*mx ) + // ( 0 0 0 xx xy x -xx*my -xy*my ) + // ( 0 0 0 xy yy y -xy*my -yy*my ) + // ( 0 0 0 x y 1 -x*my -y*my ) + // ( -xx*mx -xy*mx -x*mx -xx*my -xy*my -x*my xx*mxxyy xy*mxxyy ) + // ( -xy*mx -yy*mx -y*mx -xy*my -yy*my -y*my xy*mxxyy yy*mxxyy ) * w + + // 1st row: xx xy x 0 0 0 -xx*mx -xy*mx + float* matrix_ptr = matrix.ptr(0); + matrix_ptr[0] += xxw; + matrix_ptr[1] += xyw; + matrix_ptr[2] += xw; + matrix_ptr[6] += -xxw * mx; + matrix_ptr[7] += -xyw * mx; + + // 2nd row: xy yy y 0 0 0 -xy*mx -yy*mx + matrix_ptr += 8; + matrix_ptr[0] += xyw; + matrix_ptr[1] += yyw; + matrix_ptr[2] += yw; + matrix_ptr[6] += -xyw * mx; + matrix_ptr[7] += -yyw * mx; + + // 3rd row: x y 1 0 0 0 -x*mx -y*mx + matrix_ptr += 8; + 
matrix_ptr[0] += xw; + matrix_ptr[1] += yw; + matrix_ptr[2] += w; + matrix_ptr[6] += -xw * mx; + matrix_ptr[7] += -yw * mx; + + // 4th row: 0 0 0 xx xy x -xx*my -xy*my + matrix_ptr += 8; + matrix_ptr[3] += xxw; + matrix_ptr[4] += xyw; + matrix_ptr[5] += xw; + matrix_ptr[6] += -xxw * my; + matrix_ptr[7] += -xyw * my; + + // 5th row: 0 0 0 xy yy y -xy*my -yy*my + matrix_ptr += 8; + matrix_ptr[3] += xyw; + matrix_ptr[4] += yyw; + matrix_ptr[5] += yw; + matrix_ptr[6] += -xyw * my; + matrix_ptr[7] += -yyw * my; + + // 6th row: 0 0 0 x y 1 -x*my -y*my + matrix_ptr += 8; + matrix_ptr[3] += xw; + matrix_ptr[4] += yw; + matrix_ptr[5] += w; + matrix_ptr[6] += -xw * my; + matrix_ptr[7] += -yw * my; + + // 7th row: -xx*mx -xy*mx -x*mx -xx*my -xy*my -x*my xx*mxxyy xy*mxxyy + matrix_ptr += 8; + matrix_ptr[0] += -xxw * mx; + matrix_ptr[1] += -xyw * mx; + matrix_ptr[2] += -xw * mx; + matrix_ptr[3] += -xxw * my; + matrix_ptr[4] += -xyw * my; + matrix_ptr[5] += -xw * my; + matrix_ptr[6] += xxw * mxxyy; + matrix_ptr[7] += xyw * mxxyy; + + // 8th row: -xy*mx -yy*mx -y*mx -xy*my -yy*my -y*my xy*mxxyy yy*mxxyy + matrix_ptr += 8; + matrix_ptr[0] += -xyw * mx; + matrix_ptr[1] += -yyw * mx; + matrix_ptr[2] += -yw * mx; + matrix_ptr[3] += -xyw * my; + matrix_ptr[4] += -yyw * my; + matrix_ptr[5] += -yw * my; + matrix_ptr[6] += xyw * mxxyy; + matrix_ptr[7] += yyw * mxxyy; + + // Right hand side: + // b = ( x + // y ) + // Compute J^t * b * w = + // ( x*mx y*mx mx x*my y*my my -x*mxxyy -y*mxxyy ) * w + + float* rhs_ptr = rhs.ptr(0); + rhs_ptr[0] += xw * mx; + rhs_ptr[1] += yw * mx; + rhs_ptr[2] += mx * w; + rhs_ptr[3] += xw * my; + rhs_ptr[4] += yw * my; + rhs_ptr[5] += my * w; + rhs_ptr[6] += -xw * mxxyy; + rhs_ptr[7] += -yw * mxxyy; + } + + // Solution parameters p. 
+ if (cv::solve(matrix, rhs, solution)) { + const float* ptr = solution.ptr(0); + *model = HomographyAdapter::FromFloatPointer(ptr, false); + return true; + } + + return false; +} + +void TransformQuadInMotionBoxState(const MotionBoxState& curr_pos, + const Homography& homography, + MotionBoxState* next_pos) { + CHECK(next_pos != nullptr); + if (!curr_pos.has_pos_x() || !curr_pos.has_pos_y() || !curr_pos.has_width() || + !curr_pos.has_height()) { + LOG(ERROR) << "Previous box does not exist, cannot transform!"; + return; + } + const int kQuadVerticesSize = 8; + const MotionBoxState::Quad* curr_quad_ptr = nullptr; + auto quad = absl::make_unique(); + if (curr_pos.has_quad() && + curr_pos.quad().vertices_size() == kQuadVerticesSize) { + curr_quad_ptr = &curr_pos.quad(); + } else { + std::array corners = + GetCornersOfRotatedRect(curr_pos, Vector2_f(1.0f, 1.0f)); + for (const auto& vertex : corners) { + quad->add_vertices(vertex.x()); + quad->add_vertices(vertex.y()); + } + curr_quad_ptr = quad.get(); + } + + MotionBoxState::Quad* next_pos_quad = next_pos->mutable_quad(); + bool next_pos_quad_existed = true; + if (next_pos_quad->vertices_size() != kQuadVerticesSize) { + next_pos_quad_existed = false; + next_pos_quad->clear_vertices(); + } + for (int i = 0; i < kQuadVerticesSize / 2; ++i) { + const auto& curr_pos_quad_vertex = Vector2_f( + curr_quad_ptr->vertices(i * 2), curr_quad_ptr->vertices(i * 2 + 1)); + const auto& next_pos_quad_vertex_diff = + HomographyAdapter::TransformPoint(homography, curr_pos_quad_vertex) - + curr_pos_quad_vertex; + if (next_pos_quad_existed) { + next_pos_quad->set_vertices(i * 2, next_pos_quad->vertices(i * 2) + + next_pos_quad_vertex_diff.x()); + next_pos_quad->set_vertices( + i * 2 + 1, + next_pos_quad->vertices(i * 2 + 1) + next_pos_quad_vertex_diff.y()); + } else { + next_pos_quad->add_vertices(curr_pos_quad_vertex.x() + + next_pos_quad_vertex_diff.x()); + next_pos_quad->add_vertices(curr_pos_quad_vertex.y() + + 
next_pos_quad_vertex_diff.y()); + } + } +} + +void UpdateStatePositionAndSizeFromStateQuad(MotionBoxState* box_state) { + Vector2_f top_left, bottom_right; + MotionBoxBoundingBox(*box_state, &top_left, &bottom_right); + box_state->set_width(bottom_right.x() - top_left.x()); + box_state->set_height(bottom_right.y() - top_left.y()); + box_state->set_pos_x(top_left.x()); + box_state->set_pos_y(top_left.y()); +} + +void ApplyCameraTrackingDegrees(const MotionBoxState& prev_state, + const Homography& background_model, + const TrackStepOptions& options, + const Vector2_f& domain, + MotionBoxState* next_state) { + // Determine center translation. + Vector2_f center(MotionBoxCenter(prev_state)); + const Vector2_f background_motion = + HomographyAdapter::TransformPoint(background_model, center) - center; + + if (options.tracking_degrees() == + TrackStepOptions::TRACKING_DEGREE_TRANSLATION || + !options.track_object_and_camera()) { + SetMotionBoxPosition(MotionBoxPosition(*next_state) + background_motion, + next_state); + return; + } + + // Transform corners and fit similarity. + // Overall idea: + // We got corners, x0, x1, x2, x3 of the rect in the previous location + // transform by background_model H. + // Assuming H = [A | t], their target location in the next frame + // is: + // xi' = A * xi + t for i = 0..3 + // We want to express the location of ci' w.r.t. to the translated center c, + // to decouple H from the translation of the center. + // In particular, we are looking for the translation of the center: + // c* = c + t* and points + // xi* = xi + t* + // Express location of xi' w.r.t. 
c: + // xi' = A(xi* - c*) + c* + // Axi + t = A(xi - c) + c + t* + // Axi + t = Axi - Ac + c + t* + // t* = Ac - c + t + std::array corners = MotionBoxCorners(prev_state); + std::vector corner_vecs(4); + std::vector corner_vec_ptrs(4); + + for (int k = 0; k < 4; ++k) { + MotionVector v; + v.pos = corners[k]; + v.object = HomographyAdapter::TransformPoint(background_model, corners[k]) - + corners[k]; + corner_vecs[k] = v; + corner_vec_ptrs[k] = &corner_vecs[k]; + } + + LinearSimilarityModel linear_similarity; + LinearSimilarityL2Solve(corner_vec_ptrs, std::vector(4, 1.0f), + &linear_similarity); + + SimilarityModel similarity = + LinearSimilarityAdapter::ToSimilarity(linear_similarity); + + // See above derivation, motion of the center is t* = Ac + t - c; + // Could also make the point that background_model instead of + // linear_similarity is more accurate here due to the fitting operation above. + SetMotionBoxPosition(MotionBoxPosition(*next_state) + + TransformPoint(linear_similarity, center) - center, + next_state); + + switch (options.tracking_degrees()) { + case TrackStepOptions::TRACKING_DEGREE_TRANSLATION: + break; + case TrackStepOptions::TRACKING_DEGREE_CAMERA_SCALE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE: + next_state->set_scale(next_state->scale() * similarity.scale()); + break; + case TrackStepOptions::TRACKING_DEGREE_CAMERA_ROTATION: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION: + next_state->set_rotation(next_state->rotation() + similarity.rotation()); + break; + case TrackStepOptions::TRACKING_DEGREE_CAMERA_ROTATION_SCALE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE: + next_state->set_scale(next_state->scale() * similarity.scale()); + next_state->set_rotation(next_state->rotation() + similarity.rotation()); + break; + case TrackStepOptions::TRACKING_DEGREE_CAMERA_PERSPECTIVE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE: + TransformQuadInMotionBoxState(prev_state, background_model, 
next_state); + if (prev_state.has_pnp_homography()) { + *(next_state->mutable_pnp_homography()) = HomographyAdapter::Compose( + prev_state.pnp_homography(), background_model); + UpdateStatePositionAndSizeFromStateQuad(next_state); + } + break; + } +} + +void ApplyObjectMotion(const MotionBoxState& curr_pos, + const Vector2_f& object_translation, + const LinearSimilarityModel& object_similarity, + const Homography& object_homography, + const TrackStepOptions& options, + MotionBoxState* next_pos) { + switch (options.tracking_degrees()) { + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE: { + Vector2_f center(MotionBoxCenter(curr_pos)); + // See ApplyCameraTrackingDegrees for derivation. + SetMotionBoxPosition(MotionBoxPosition(*next_pos) + + TransformPoint(object_similarity, center) - + center, + next_pos); + SimilarityModel similarity = + LinearSimilarityAdapter::ToSimilarity(object_similarity); + if (options.tracking_degrees() != + TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION) { + next_pos->set_scale(next_pos->scale() * similarity.scale()); + } + if (options.tracking_degrees() != + TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE) { + next_pos->set_rotation(next_pos->rotation() + similarity.rotation()); + } + break; + } + + case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE: { + Vector2_f center(MotionBoxCenter(curr_pos)); + SetMotionBoxPosition( + MotionBoxPosition(*next_pos) + + HomographyAdapter::TransformPoint(object_homography, center) - + center, + next_pos); + TransformQuadInMotionBoxState(curr_pos, object_homography, next_pos); + break; + } + default: + // Use translation per default. 
+ SetMotionBoxPosition(MotionBoxPosition(*next_pos) + object_translation, + next_pos); + } +} + +bool IsBoxValid(const MotionBoxState& state) { + const float kMaxBoxHeight = + 10000.0f; // as relative to normalized [0, 1] space + const float kMaxBoxWidth = + 10000.0f; // as relative to normalized [0, 1] space + if (state.width() > kMaxBoxWidth) { + LOG(ERROR) << "box width " << state.width() << " too big"; + return false; + } + if (state.height() > kMaxBoxHeight) { + LOG(ERROR) << "box height " << state.height() << " too big"; + return false; + } + + return true; +} + +Homography PnpHomographyFromRotationAndTranslation(const cv::Mat& rvec, + const cv::Mat& tvec) { + cv::Mat homography_matrix(3, 3, CV_64F); + cv::Rodrigues(rvec, homography_matrix); + + for (int c = 0; c < 3; ++c) { + homography_matrix.at(c, 2) = tvec.at(c); + } + + // check non zero + homography_matrix /= homography_matrix.at(2, 2); + + return HomographyAdapter::FromDoublePointer(homography_matrix.ptr(0), + false); +} + +// Translate CameraIntrinsics proto to cv format. +void ConvertCameraIntrinsicsToCvMat( + const TrackStepOptions::CameraIntrinsics& camera_intrinsics, + cv::Mat* camera_mat, cv::Mat* dist_coef) { + *camera_mat = cv::Mat::eye(3, 3, CV_64F); + *dist_coef = cv::Mat::zeros(1, 5, CV_64FC1); + camera_mat->at(0, 0) = camera_intrinsics.fx(); + camera_mat->at(1, 1) = camera_intrinsics.fy(); + camera_mat->at(0, 2) = camera_intrinsics.cx(); + camera_mat->at(1, 2) = camera_intrinsics.cy(); + dist_coef->at(0) = camera_intrinsics.k0(); + dist_coef->at(1) = camera_intrinsics.k1(); + dist_coef->at(4) = camera_intrinsics.k2(); +} + +} // namespace. + +void ScaleFromAspect(float aspect, bool invert, float* scale_x, + float* scale_y) { + *scale_x = aspect >= 1.0f ? 1.0f : aspect; + *scale_y = aspect >= 1.0f ? 
1.0f / aspect : 1.0f; + if (invert) { + *scale_x = 1.0f / *scale_x; + *scale_y = 1.0f / *scale_y; + } +} + +std::array MotionBoxCorners(const MotionBoxState& state, + const Vector2_f& scaling) { + std::array transformed_corners; + if (state.has_quad() && state.quad().vertices_size() == 8) { + for (int k = 0; k < 4; ++k) { + transformed_corners[k] = Vector2_f(state.quad().vertices(2 * k), + state.quad().vertices(2 * k + 1)) + .MulComponents(scaling); + } + } else { + transformed_corners = GetCornersOfRotatedRect(state, scaling); + } + + return transformed_corners; +} + +bool MotionBoxLines(const MotionBoxState& state, const Vector2_f& scaling, + std::array* box_lines) { + CHECK(box_lines); + std::array corners = MotionBoxCorners(state, scaling); + std::array lines; + for (int k = 0; k < 4; ++k) { + const Vector2_f diff = corners[(k + 1) % 4] - corners[k]; + const Vector2_f normal = diff.Ortho().Normalize(); + box_lines->at(k).Set(normal.x(), normal.y(), -normal.DotProd(corners[k])); + // Double check that second point is on the computed line. + if (box_lines->at(k).DotProd(Vector3_f(corners[(k + 1) % 4].x(), + corners[(k + 1) % 4].y(), 1.0f)) >= + 0.02f) { + LOG(ERROR) << "box is abnormal. Line equations don't satisfy constraint"; + return false; + } + } + return true; +} + +void MotionBoxBoundingBox(const MotionBoxState& state, Vector2_f* top_left, + Vector2_f* bottom_right) { + CHECK(top_left); + CHECK(bottom_right); + + std::array corners = MotionBoxCorners(state); + + // Determine min and max across dimension. 
+ *top_left = Vector2_f(std::numeric_limits::max(), + std::numeric_limits::max()); + + *bottom_right = Vector2_f(std::numeric_limits::min(), + std::numeric_limits::min()); + + for (const Vector2_f& c : corners) { + top_left->x(std::min(top_left->x(), c.x())); + top_left->y(std::min(top_left->y(), c.y())); + bottom_right->x(std::max(bottom_right->x(), c.x())); + bottom_right->y(std::max(bottom_right->y(), c.y())); + } +} + +void MotionBoxInlierLocations(const MotionBoxState& state, + std::vector* inlier_pos) { + CHECK(inlier_pos); + inlier_pos->clear(); + for (int k = 0; k < state.inlier_id_match_pos_size(); k += 2) { + inlier_pos->push_back( + Vector2_f(state.inlier_id_match_pos(k) * kInvShortScale, + state.inlier_id_match_pos(k + 1) * kInvShortScale)); + } +} + +void MotionBoxOutlierLocations(const MotionBoxState& state, + std::vector* outlier_pos) { + CHECK(outlier_pos); + outlier_pos->clear(); + for (int k = 0; k < state.outlier_id_match_pos_size(); k += 2) { + outlier_pos->push_back( + Vector2_f(state.outlier_id_match_pos(k) * kInvShortScale, + state.outlier_id_match_pos(k + 1) * kInvShortScale)); + } +} + +std::array GetCornersOfRotatedRect(const MotionBoxState& state, + const Vector2_f& scaling) { + std::array transformed_corners; + // Scale and rotate 4 corner w.r.t. center. + const Vector2_f center = MotionBoxCenter(state).MulComponents(scaling); + const Vector2_f top_left = MotionBoxPosition(state).MulComponents(scaling); + const std::array corners{{ + top_left, + top_left + Vector2_f(0, state.height() * scaling.y()), + top_left + + Vector2_f(state.width() * scaling.x(), state.height() * scaling.y()), + top_left + Vector2_f(state.width() * scaling.x(), 0), + }}; + + const float cos_a = std::cos(state.rotation()); + const float sin_a = std::sin(state.rotation()); + for (int k = 0; k < 4; ++k) { + // Scale and rotate w.r.t. center. 
+ const Vector2_f rad = corners[k] - center; + const Vector2_f rot_rad(cos_a * rad.x() - sin_a * rad.y(), + sin_a * rad.x() + cos_a * rad.y()); + const Vector2_f transformed_corner = center + rot_rad * state.scale(); + transformed_corners[k] = transformed_corner; + } + + return transformed_corners; +} + +void InitializeQuadInMotionBoxState(MotionBoxState* state) { + CHECK(state != nullptr); + // Every quad has 4 vertices. Each vertex has x and y 2 coordinates. So + // a total of 8 floating point values. + if (state->quad().vertices_size() != 8) { + MotionBoxState::Quad* quad_ptr = state->mutable_quad(); + quad_ptr->clear_vertices(); + std::array corners = + GetCornersOfRotatedRect(*state, Vector2_f(1.0f, 1.0f)); + for (const auto& vertex : corners) { + quad_ptr->add_vertices(vertex.x()); + quad_ptr->add_vertices(vertex.y()); + } + } +} + +void InitializeInliersOutliersInMotionBoxState(const TrackingData& tracking, + MotionBoxState* state) { + MotionVectorFrame mvf; // Holds motion from current to previous frame. + MotionVectorFrameFromTrackingData(tracking, &mvf); + + std::array box_lines; + if (!MotionBoxLines(*state, Vector2_f(1.0f, 1.0f), &box_lines)) { + LOG(ERROR) << "Error in computing MotionBoxLines."; + return; + } + + // scale for motion vectors. 
+ float scale_x, scale_y; + ScaleFromAspect(mvf.aspect_ratio, true, &scale_x, &scale_y); + + state->clear_inlier_ids(); + state->clear_inlier_length(); + state->clear_outlier_ids(); + + float inlier_center_x = 0.0f; + float inlier_center_y = 0.0f; + int cnt_inlier = 0; + + float min_x = std::numeric_limits::max(); + float max_x = -std::numeric_limits::max(); + float min_y = std::numeric_limits::max(); + float max_y = -std::numeric_limits::max(); + + for (const auto& motion_vec : mvf.motion_vectors) { + const float pos_x = motion_vec.pos.x() * scale_x; + const float pos_y = motion_vec.pos.y() * scale_y; + + bool insider = true; + + for (const Vector3_f& line : box_lines) { + if (line.DotProd(Vector3_f(pos_x, pos_y, 1.0f)) > 0.0f) { + insider = false; + break; + } + } + + if (insider) { + ++cnt_inlier; + inlier_center_x += pos_x; + inlier_center_y += pos_y; + + min_x = std::min(min_x, pos_x); + max_x = std::max(max_x, pos_x); + min_y = std::min(min_y, pos_y); + max_y = std::max(max_y, pos_y); + + state->add_inlier_ids(motion_vec.track_id); + state->add_inlier_length(1.0f); + } else { + state->add_outlier_ids(motion_vec.track_id); + } + } + + if (cnt_inlier) { + state->set_prior_weight(1.0f); + state->set_inlier_center_x(inlier_center_x / cnt_inlier); + state->set_inlier_center_y(inlier_center_y / cnt_inlier); + state->set_inlier_width(max_x - min_x); + state->set_inlier_height(max_y - min_y); + } +} + +void InitializePnpHomographyInMotionBoxState( + const TrackingData& tracking, const TrackStepOptions& track_step_options, + MotionBoxState* state) { + // Only happen when `quad` and `aspect_ratio` are all specified. 
+ if (!state->has_quad()) { + VLOG(1) << "Skip pnp tracking since box does not contain quad info."; + return; + } + + const int kQuadCornersSize = 4; + CHECK_EQ(state->quad().vertices_size(), kQuadCornersSize * 2); + float scale_x, scale_y; + ScaleFromAspect(tracking.frame_aspect(), false, &scale_x, &scale_y); + std::vector corners_2d(kQuadCornersSize); + if (track_step_options.has_camera_intrinsics()) { + const auto& camera = track_step_options.camera_intrinsics(); + for (int c = 0; c < kQuadCornersSize; ++c) { + corners_2d[c].x = state->quad().vertices(c * 2) * camera.w(); + corners_2d[c].y = state->quad().vertices(c * 2 + 1) * camera.h(); + } + + cv::Mat camera_mat, dist_coef; + ConvertCameraIntrinsicsToCvMat(camera, &camera_mat, &dist_coef); + cv::undistortPoints(corners_2d, corners_2d, camera_mat, dist_coef); + } else { + const float center_x = scale_x * 0.5f; + const float center_y = scale_y * 0.5f; + for (int c = 0; c < kQuadCornersSize; ++c) { + corners_2d[c].x = state->quad().vertices(c * 2) * scale_x - center_x; + corners_2d[c].y = state->quad().vertices(c * 2 + 1) * scale_y - center_y; + } + } + + if (!state->has_aspect_ratio()) { + if (!track_step_options.forced_pnp_tracking()) { + VLOG(1) << "Skip pnp tracking since aspect ratio is unknown and " + "estimation of it is not forced."; + return; + } + const float u2_u0 = corners_2d[2].x - corners_2d[0].x; + const float v2_v0 = corners_2d[2].y - corners_2d[0].y; + const float u3_u1 = corners_2d[3].x - corners_2d[1].x; + const float v3_v1 = corners_2d[3].y - corners_2d[1].y; + + constexpr float kEpsilon = 1e-6f; + const float denominator = u2_u0 * v3_v1 - v2_v0 * u3_u1; + if (std::abs(denominator) < kEpsilon) { + LOG(WARNING) << "Zero denominator. 
Failed calculating aspect ratio.";
+      return;
+    }
+
+    float s[kQuadCornersSize];
+    s[0] = ((corners_2d[2].x - corners_2d[3].x) * v3_v1 -
+            (corners_2d[2].y - corners_2d[3].y) * u3_u1) *
+           2.0f / denominator;
+    s[1] = -(u2_u0 * (corners_2d[2].y - corners_2d[3].y) -
+             v2_v0 * (corners_2d[2].x - corners_2d[3].x)) *
+           2.0f / denominator;
+    s[2] = 2.0f - s[0];
+    s[3] = 2.0f - s[1];
+
+    std::vector<Vector3_f> corners(kQuadCornersSize);
+    for (int i = 0; i < kQuadCornersSize; ++i) {
+      if (s[i] <= 0) {
+        LOG(WARNING) << "Negative scale. Failed calculating aspect ratio.";
+        return;
+      }
+      corners[i] =
+          Vector3_f(corners_2d[i].x * s[i], corners_2d[i].y * s[i], s[i]);
+    }
+
+    const Vector3_f width_edge = corners[2] - corners[1];
+    const Vector3_f height_edge = corners[0] - corners[1];
+    const float height_norm = height_edge.Norm();
+    const float width_norm = width_edge.Norm();
+    if (height_norm < kEpsilon || width_norm < kEpsilon) {
+      LOG(WARNING)
+          << "abnormal 3d quadrangle. Failed calculating aspect ratio.";
+      return;
+    }
+
+    constexpr float kMaxCosAngle = 0.258819;  // which is cos(75 deg)
+    if (width_edge.DotProd(height_edge) / height_norm / width_norm >
+        kMaxCosAngle) {
+      LOG(WARNING)
+          << "abnormal 3d quadrangle. 
Failed calculating aspect ratio.";
+      return;
+    }
+
+    state->set_aspect_ratio(width_norm / height_norm);
+  }
+
+  CHECK_GT(state->aspect_ratio(), 0.0f);
+
+  const float half_width = state->aspect_ratio();
+  const float half_height = 1.0f;
+  const std::vector<cv::Point3f> corners_3d{
+      cv::Point3f(-half_width, -half_height, 0.0f),
+      cv::Point3f(-half_width, half_height, 0.0f),
+      cv::Point3f(half_width, half_height, 0.0f),
+      cv::Point3f(half_width, -half_height, 0.0f),
+  };
+
+  std::vector<MotionVector> motion_vectors(kQuadCornersSize);
+  std::vector<const MotionVector*> motion_vector_pointers(kQuadCornersSize);
+
+  for (int c = 0; c < kQuadCornersSize; ++c) {
+    motion_vectors[c].pos = Vector2_f(corners_3d[c].x, corners_3d[c].y);
+    motion_vectors[c].object =
+        Vector2_f(corners_2d[c].x, corners_2d[c].y) - motion_vectors[c].pos;
+
+    motion_vector_pointers[c] = &motion_vectors[c];
+  }
+
+  const std::vector<float> weights(kQuadCornersSize, 1.0f);
+  HomographyL2Solve(motion_vector_pointers, weights,
+                    state->mutable_pnp_homography());
+}
+
+// Scales velocity and all other velocity dependent fields according to
+void ScaleStateTemporally(float temporal_scale, MotionBoxState* state) { + state->set_dx(state->dx() * temporal_scale); + state->set_dy(state->dy() * temporal_scale); + state->set_kinetic_energy(state->kinetic_energy() * temporal_scale); +} + +void ScaleStateAspect(float aspect, bool invert, MotionBoxState* state) { + float scale_x = 1.0f; + float scale_y = 1.0f; + ScaleFromAspect(aspect, invert, &scale_x, &scale_y); + + if (state->has_quad() && state->quad().vertices_size() == 8) { + for (int i = 0; i < 4; ++i) { + float curr_val = state->quad().vertices(i * 2); + state->mutable_quad()->set_vertices(i * 2, curr_val * scale_x); + curr_val = state->quad().vertices(i * 2 + 1); + state->mutable_quad()->set_vertices(i * 2 + 1, curr_val * scale_y); + } + } + + state->set_pos_x(state->pos_x() * scale_x); + state->set_pos_y(state->pos_y() * scale_y); + state->set_width(state->width() * scale_x); + state->set_height(state->height() * scale_y); + state->set_dx(state->dx() * scale_x); + state->set_dy(state->dy() * scale_y); + state->set_inlier_center_x(state->inlier_center_x() * scale_x); + state->set_inlier_center_y(state->inlier_center_y() * scale_y); + state->set_inlier_width(state->inlier_width() * scale_x); + state->set_inlier_height(state->inlier_height() * scale_y); +} + +MotionVector MotionVector::FromInternalState( + const MotionBoxInternalState& internal, int index) { + CHECK_LT(index, internal.pos_x_size()); + MotionVector v; + v.pos = Vector2_f(internal.pos_x(index), internal.pos_y(index)); + v.object = Vector2_f(internal.dx(index), internal.dy(index)); + v.background = + Vector2_f(internal.camera_dx(index), internal.camera_dy(index)); + v.track_id = internal.track_id(index); + return v; +} + +void MotionBox::ResetAtFrame(int frame, const MotionBoxState& state) { + states_.clear(); + queue_start_ = frame; + + states_.push_back(state); + states_.back().set_track_status(MotionBoxState::BOX_TRACKED); + // Init inlier dimensions from state if not set. 
+ if (states_.back().inlier_width() == 0 || + states_.back().inlier_height() == 0) { + states_.back().set_inlier_width(state.width()); + states_.back().set_inlier_height(state.height()); + } + + initial_state_ = state; +} + +bool MotionBox::TrackStep(int from_frame, + const MotionVectorFrame& motion_vectors, + bool forward) { + if (!TrackableFromFrame(from_frame)) { + LOG(WARNING) << "Tracking requested for initial position that is not " + << "trackable."; + return false; + } + const int queue_pos = from_frame - queue_start_; + + MotionBoxState new_state; + if (motion_vectors.is_duplicated) { + // Do not track or update the state, just copy. + new_state = states_[queue_pos]; + new_state.set_track_status(MotionBoxState::BOX_DUPLICATED); + } else { + // Compile history and perform tracking. + std::vector history; + constexpr int kHistorySize = 10; + if (forward) { + for (int k = queue_pos - 1; k >= std::max(0, queue_pos - kHistorySize); + --k) { + history.push_back(&states_[k]); + } + } else { + for (int k = queue_pos + 1; + k <= std::min(states_.size() - 1, queue_pos + kHistorySize); + ++k) { + history.push_back(&states_[k]); + } + } + + TrackStepImpl(from_frame, states_[queue_pos], motion_vectors, history, + &new_state); + } + + if (new_state.track_status() < MotionBoxState::BOX_TRACKED) { + new_state.set_tracking_confidence(0.0f); + } + if (!new_state.has_tracking_confidence()) { + // In this case, track status should be >= MotionBoxState::BOX_TRACKED + new_state.set_tracking_confidence(1.0f); + } + + VLOG(1) << "Track status from frame " << from_frame << ": " + << TrackStatusToString(new_state.track_status()) + << ". 
Has quad: " << new_state.has_quad(); + + constexpr float kFailureDisparity = 0.8f; + if (new_state.track_status() >= MotionBoxState::BOX_TRACKED) { + if (forward) { + const int new_pos = queue_pos + 1; + if (new_pos < states_.size()) { + states_[new_pos] = new_state; + } else { + states_.push_back(new_state); + } + + // Check for successive tracking failures of in bound boxes. + if (new_pos >= options_.max_track_failures()) { + int num_track_errors = 0; + // Cancel at the N + 1 tracking failure. + for (int f = new_pos - options_.max_track_failures(); f <= new_pos; + ++f) { + if (states_[f].track_status() != + MotionBoxState::BOX_TRACKED_OUT_OF_BOUND) { + num_track_errors += (fabs(states_[f].motion_disparity()) * + states_[f].prior_weight() > + kFailureDisparity); + } + } + + if (num_track_errors >= options_.max_track_failures()) { + LOG_IF(INFO, print_motion_box_warnings_) + << "Tracking failed during max track failure " + << "verification."; + states_[new_pos].set_track_status(MotionBoxState::BOX_UNTRACKED); + return false; + } + } + } else { + // Backward tracking. + int new_pos = queue_pos - 1; + if (new_pos >= 0) { + states_[new_pos] = new_state; + } else { + states_.push_front(new_state); + --queue_start_; + new_pos = 0; + } + + // Check for successive tracking failures. + if (new_pos + options_.max_track_failures() + 1 < states_.size()) { + int num_track_errors = 0; + // Cancel at the N + 1 tracking failure. 
+ for (int f = new_pos; f <= new_pos + options_.max_track_failures(); + ++f) { + if (states_[f].track_status() != + MotionBoxState::BOX_TRACKED_OUT_OF_BOUND) { + num_track_errors += (fabs(states_[f].motion_disparity()) * + states_[f].prior_weight() > + kFailureDisparity); + } + } + + if (num_track_errors >= options_.max_track_failures()) { + LOG_IF(INFO, print_motion_box_warnings_) + << "Tracking failed during max track failure " + << "verification."; + states_[new_pos].set_track_status(MotionBoxState::BOX_UNTRACKED); + return false; + } + } + } + + // Signal track success. + return true; + } else { + LOG_IF(WARNING, print_motion_box_warnings_) + << "Tracking error at " << from_frame + << " status : " << TrackStatusToString(new_state.track_status()); + return false; + } +} + +namespace { + +Vector2_f SpatialPriorPosition(const Vector2_f& location, + const MotionBoxState& state) { + const int grid_size = state.spatial_prior_grid_size(); + return Vector2_f( + Clamp((location.x() - state.pos_x()) / state.width(), 0, 1) * + (grid_size - 1), + Clamp((location.y() - state.pos_y()) / state.height(), 0, 1) * + (grid_size - 1)); +} + +// Creates spatial prior for current set of inlier vectors and blends +// it with previous prior (based on blend_prior). If interpolate is set to true, +// uses more accurate interpolation into bins, instead of nearest neighbor +// interpolation. If use_next_position is set to true, the position in +// the next/previous frame is used instead of the current one. 
+void ComputeSpatialPrior(bool interpolate, bool use_next_position,
+                         float blend_prior, MotionBoxState* update_pos) {
+  const int grid_size = update_pos->spatial_prior_grid_size();
+
+  std::vector<float> old_prior(update_pos->spatial_prior().begin(),
+                               update_pos->spatial_prior().end());
+  std::vector<float> old_confidence(update_pos->spatial_confidence().begin(),
+                                    update_pos->spatial_confidence().end());
+
+  CHECK_EQ(old_confidence.size(), old_prior.size());
+  CHECK(old_confidence.empty() ||
+        grid_size * grid_size == old_confidence.size())
+      << "Empty or priors of constant size expected";
+
+  update_pos->clear_spatial_prior();
+  update_pos->clear_spatial_confidence();
+  auto* spatial_prior = update_pos->mutable_spatial_prior();
+  auto* spatial_confidence = update_pos->mutable_spatial_confidence();
+  spatial_prior->Reserve(grid_size * grid_size);
+  spatial_confidence->Reserve(grid_size * grid_size);
+
+  for (int k = 0; k < grid_size * grid_size; ++k) {
+    spatial_prior->AddAlreadyReserved(0);
+    spatial_confidence->AddAlreadyReserved(0);
+  }
+
+  // Aggregate inlier weights (0 = outlier, 1 = total inlier) across grid.
+  const MotionBoxInternalState& internal = update_pos->internal();
+  const int num_elems = internal.pos_x_size();
+
+  for (int k = 0; k < num_elems; ++k) {
+    MotionVector vec = MotionVector::FromInternalState(internal, k);
+    const Vector2_f pos =
+        use_next_position ? vec.MatchLocation() : vec.Location();
+    float weight = internal.inlier_score(k);
+
+    const Vector2_f grid_pos = SpatialPriorPosition(pos, *update_pos);
+
+    if (use_next_position) {
+      // Check for out of bound and skip. 
+      if (grid_pos.x() < 0 || grid_pos.y() < 0 ||
+          grid_pos.x() > update_pos->spatial_prior_grid_size() - 1 ||
+          grid_pos.y() > update_pos->spatial_prior_grid_size() - 1) {
+        continue;
+      }
+    }
+
+    if (interpolate) {
+      const int int_x = static_cast<int>(grid_pos.x());
+      const int int_y = static_cast<int>(grid_pos.y());
+
+      CHECK_GE(grid_pos.x(), 0) << pos.x() << ", " << update_pos->pos_x();
+      CHECK_GE(grid_pos.y(), 0);
+      CHECK_LE(grid_pos.x(), grid_size - 1);
+      CHECK_LE(grid_pos.y(), grid_size - 1);
+
+      const float dx = grid_pos.x() - int_x;
+      const float dy = grid_pos.y() - int_y;
+      const float dx_1 = 1.0f - dx;
+      const float dy_1 = 1.0f - dy;
+      const int stride = static_cast<int>(dx != 0);
+
+      int grid_pos = int_y * grid_size + int_x;
+
+      // Bilinear interpolation. Total sum of weights across all 4 additions
+      // (for prior and confidence each), is one.
+      *spatial_prior->Mutable(grid_pos) += dx_1 * dy_1 * weight;
+      *spatial_confidence->Mutable(grid_pos) += dx_1 * dy_1;
+
+      *spatial_prior->Mutable(grid_pos + stride) += dx * dy_1 * weight;
+      *spatial_confidence->Mutable(grid_pos + stride) += dx * dy_1;
+
+      grid_pos += (dy != 0) * grid_size;
+      *spatial_prior->Mutable(grid_pos) += dx_1 * dy * weight;
+      *spatial_confidence->Mutable(grid_pos) += dx_1 * dy;
+
+      *spatial_prior->Mutable(grid_pos + stride) += dx * dy * weight;
+      *spatial_confidence->Mutable(grid_pos + stride) += dx * dy;
+    } else {
+      // Nearest neighbor.
+      const int grid_bin = static_cast<int>(grid_pos.y() + 0.5f) * grid_size +
+                           static_cast<int>(grid_pos.x() + 0.5f);
+      *spatial_prior->Mutable(grid_bin) += weight;
+      *spatial_confidence->Mutable(grid_bin) += 1;
+    }
+  }
+
+  // Normalize, i.e. max truncation.
+  float total_prior_difference = 0;
+  float weight_sum = 0;
+  for (int k = 0; k < grid_size * grid_size; ++k) {
+    // Convert aggregated inlier weights to grid cell prior.
+    // Here we consider a grid cell to be an inlier, if at least two
+    // 2 inliers within that cell where found. 
+ *spatial_prior->Mutable(k) = std::min(1.0f, spatial_prior->Get(k) * 0.5f); + *spatial_confidence->Mutable(k) = + std::min(1.0f, spatial_confidence->Get(k) * 0.5f); + + if (!old_prior.empty()) { + // Truncated error, consider a difference of 0.2 within normal + // update range. + const float difference = std::max( + 0.f, fabs(update_pos->spatial_prior(k) - old_prior[k]) - 0.2f); + // Weight error by confidence. + total_prior_difference += difference * update_pos->spatial_confidence(k); + weight_sum += update_pos->spatial_confidence(k); + + // Blend confidence with previous confidence. + const float curr_confidence = + update_pos->spatial_confidence(k) * (1.0f - blend_prior); + const float prev_confidence = old_confidence[k] * blend_prior; + + float summed_confidence = curr_confidence + prev_confidence; + const float denom = + summed_confidence > 0 ? 1.0f / summed_confidence : 1.0f; + + // Update prior and confidence as weighted linear combination between + // current and previous prior. + *spatial_prior->Mutable(k) = + (update_pos->spatial_prior(k) * curr_confidence + + old_prior[k] * prev_confidence) * + denom; + + *spatial_confidence->Mutable(k) = + (update_pos->spatial_confidence(k) * curr_confidence + + prev_confidence * prev_confidence) * + denom; + } + } + + update_pos->set_prior_diff(std::sqrt( + total_prior_difference * (weight_sum > 0 ? 1.0f / weight_sum : 1.0f))); +} + +} // namespace. + +void MotionBox::GetStartPosition(const MotionBoxState& curr_pos, + float aspect_ratio, float* expand_mag, + Vector2_f* top_left, + Vector2_f* bottom_right) const { + CHECK(top_left); + CHECK(bottom_right); + CHECK(expand_mag); + + MotionBoxBoundingBox(curr_pos, top_left, bottom_right); + + if (curr_pos.has_pnp_homography()) { + *expand_mag = 0.0f; + } else { + // Expaned box by the specified minimum expansion_size. For fast moving + // objects, we ensure that magnitude is twice the box velocity, but not more + // than 1/4 of the box diameter. 
+    *expand_mag = std::max(options_.expansion_size(),
+                           std::min(MotionBoxSize(curr_pos).Norm() * 0.25f,
+                                    MotionBoxVelocity(curr_pos).Norm() * 2.0f));
+  }
+
+  // Expansion magnitude is not non-uniformly scaled w.r.t. aspect ratio
+  // to ensure inclusion test in GetVectorsAndWeights can assume uniform
+  // explansion magnitude.
+  const Vector2_f expand = Vector2_f(*expand_mag, *expand_mag);
+  *top_left -= expand;
+  *bottom_right += expand;
+}
+
+void MotionBox::GetSpatialGaussWeights(const MotionBoxState& box_state,
+                                       const Vector2_f& inv_box_domain,
+                                       float* spatial_gauss_x,
+                                       float* spatial_gauss_y) const {
+  CHECK(spatial_gauss_x);
+  CHECK(spatial_gauss_y);
+
+  // Space sigma depends on how much the tracked object fills the rectangle.
+  // We get this information from the inlier extent of the previous
+  // estimation.
+  // Motivation: Choose sigma s such that the inlier domain equals 90% coverage.
+  // i.e. using z-score one sided of 95% = 1.65
+  // s * 1.65 = domain
+  // ==> s = domain / 1.65f
+  const float space_sigma_x = std::max(
+      options_.spatial_sigma(), box_state.inlier_width() * inv_box_domain.x() *
+                                    0.5f * box_state.prior_weight() / 1.65f);
+  const float space_sigma_y =
+      std::max(options_.spatial_sigma(), box_state.inlier_height() *
+                                             inv_box_domain.y() * 0.5f *
+                                             box_state.prior_weight() / 1.65f);
+
+  *spatial_gauss_x = -0.5f / (space_sigma_x * space_sigma_x);
+  *spatial_gauss_y = -0.5f / (space_sigma_y * space_sigma_y);
+}
+
+// Computes for each vector its 2D grid position for a grid spanning
+// the domain top_left to bottom_right.
+// Note: Passed vectors must lie within the domain or function will return false
+// for error.
+template <int kGridSize>
+bool ComputeGridPositions(const Vector2_f& top_left,
+                          const Vector2_f& bottom_right,
+                          const std::vector<const MotionVector*>& vectors,
+                          std::vector<Vector2_f>* grid_positions) {
+  CHECK(grid_positions);
+
+  // Slightly larger domain to avoid boundary issues. 
+  const Vector2_f inv_grid_domain(
+      (1.0f - 1e-3f) / (bottom_right.x() - top_left.x()),
+      (1.0f - 1e-3f) / (bottom_right.y() - top_left.y()));
+
+  grid_positions->clear();
+  grid_positions->reserve(vectors.size());
+  for (const MotionVector* vec : vectors) {
+    // Get grid position. Note grid is never rotated, but we only use it for
+    // density estimation.
+    const Vector2_f grid_pos =
+        (vec->pos - top_left).MulComponents(inv_grid_domain) * (kGridSize - 1);
+    if (grid_pos.x() < 0 || grid_pos.y() < 0 || grid_pos.x() > kGridSize ||
+        grid_pos.y() > kGridSize) {
+      return false;
+    }
+
+    grid_positions->push_back(grid_pos);
+  }
+
+  return true;
+}
+
+template <int kGridSize>
+void AddToGrid(const Vector2_f& grid_pos, std::vector<float>* grid) {
+  const float grid_x = grid_pos.x();
+  const float grid_y = grid_pos.y();
+
+  const int int_grid_x = grid_x;
+  const int int_grid_y = grid_y;
+
+  const float dx = grid_x - int_grid_x;
+  const float dy = grid_y - int_grid_y;
+  const float dxdy = dx * dy;
+  const float dx_plus_dy = dx + dy;
+
+  const int inc_x = dx != 0;
+  const int inc_y = dy != 0;
+
+  int bin_idx = int_grid_y * kGridSize + int_grid_x;
+  // (1 - dx)(1 - dy) = 1 - (dx + dy) + dx*dy
+  (*grid)[bin_idx] += 1 - dx_plus_dy + dxdy;
+  // dx * (1 - dy) = dx - dxdy
+  (*grid)[bin_idx + inc_x] += dx - dxdy;
+
+  bin_idx += kGridSize * inc_y;
+  // (1 - dx) * dy = dy - dxdy
+  (*grid)[bin_idx] += dy - dxdy;
+  (*grid)[bin_idx + inc_x] += dxdy;
+}
+
+template <int kGridSize>
+float SampleFromGrid(const Vector2_f& grid_pos,
+                     const std::vector<float>& grid) {
+  const float grid_x = grid_pos.x();
+  const float grid_y = grid_pos.y();
+
+  const int int_grid_x = grid_x;
+  const int int_grid_y = grid_y;
+
+  const float dx = grid_x - int_grid_x;
+  const float dy = grid_y - int_grid_y;
+  const float dxdy = dx * dy;
+  const float dx_plus_dy = dx + dy;
+  int inc_x = dx != 0;
+  int inc_y = dy != 0;
+
+  float normalizer = 0;
+  int bin_idx = int_grid_y * kGridSize + int_grid_x;
+
+  // See above. 
+ normalizer += grid[bin_idx] * (1 - dx_plus_dy + dxdy); + normalizer += grid[bin_idx + inc_x] * (dx - dxdy); + + bin_idx += kGridSize * inc_y; + normalizer += grid[bin_idx] * (dy - dxdy); + normalizer += grid[bin_idx + inc_x] * dxdy; + + const float inv_normalizer = normalizer > 0 ? 1.0f / normalizer : 0; + // Density should always decrease weight; never increase. + return std::min(1.0f, inv_normalizer); +} + +MotionBox::DistanceWeightsComputer::DistanceWeightsComputer( + const MotionBoxState& initial_state, const MotionBoxState& current_state, + const TrackStepOptions& options) { + tracking_degrees_ = options.tracking_degrees(); + const Vector2_f box_domain(current_state.width() * current_state.scale(), + current_state.height() * current_state.scale()); + CHECK_GT(box_domain.x(), 0.0f); + CHECK_GT(box_domain.y(), 0.0f); + inv_box_domain_ = Vector2_f(1.0f / box_domain.x(), 1.0f / box_domain.y()); + + // Space sigma depends on how much the tracked object fills the rectangle. + // We get this information from the inlier extent of the previous + // estimation. + // Motivation: Choose sigma s such that the inlier domain equals 90% + // coverage. + // i.e. 
using z-score one sided of 95% = 1.65 + // s * 1.65 = domain + // ==> s = domain / 1.65f + const float space_sigma_x = + std::max(options.spatial_sigma(), + current_state.inlier_width() * inv_box_domain_.x() * 0.5f * + current_state.prior_weight() / 1.65f); + + const float space_sigma_y = + std::max(options.spatial_sigma(), + current_state.inlier_height() * inv_box_domain_.y() * 0.5f * + current_state.prior_weight() / 1.65f); + + spatial_gauss_x_ = -0.5f / (space_sigma_x * space_sigma_x); + spatial_gauss_y_ = -0.5f / (space_sigma_y * space_sigma_y); + + if (tracking_degrees_ == TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION || + tracking_degrees_ == + TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE) { + cos_neg_a_ = std::cos(-current_state.rotation()); + sin_neg_a_ = std::sin(-current_state.rotation()); + if (std::abs(current_state.rotation()) > 0.01f) { + is_large_rotation_ = true; + } + } + + // Compute box center as blend between geometric center and inlier center. + constexpr float kMaxBoxCenterBlendWeight = 0.5f; + box_center_ = + Lerp(MotionBoxCenter(current_state), InlierCenter(current_state), + std::min(kMaxBoxCenterBlendWeight, current_state.prior_weight())); + if (tracking_degrees_ == + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE) { + CHECK(initial_state.has_quad()); + CHECK(current_state.has_quad()); + homography_ = + ComputeHomographyFromQuad(current_state.quad(), initial_state.quad()); + box_center_transformed_ = + HomographyAdapter::TransformPoint(homography_, box_center_); + } +} + +float MotionBox::DistanceWeightsComputer::ComputeDistanceWeight( + const MotionVector& test_vector) { + // Distance weighting. 
+  Vector2_f diff_center;
+  if (tracking_degrees_ ==
+      TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE) {
+    Vector2_f test_vector_transformed =
+        HomographyAdapter::TransformPoint(homography_, test_vector.pos);
+    diff_center = test_vector_transformed - box_center_transformed_;
+  } else {
+    diff_center = test_vector.pos - box_center_;
+    if (is_large_rotation_) {
+      // Rotate difference vector to normalized domain.
+      diff_center.Set(
+          cos_neg_a_ * diff_center.x() - sin_neg_a_ * diff_center.y(),
+          sin_neg_a_ * diff_center.x() + cos_neg_a_ * diff_center.y());
+    }
+  }
+
+  const Vector2_f diff = diff_center.MulComponents(inv_box_domain_);
+  // Regular gaussian with variance in each direction, assuming directions
+  // are indpendent.
+  float weight = std::exp(diff.x() * diff.x() * spatial_gauss_x_ +
+                          diff.y() * diff.y() * spatial_gauss_y_);
+  return weight;
+}
+
+Homography MotionBox::DistanceWeightsComputer::ComputeHomographyFromQuad(
+    const MotionBoxState::Quad& src_quad,
+    const MotionBoxState::Quad& dst_quad) {
+  std::vector<float> src_quad_vec(8);
+  std::vector<float> dst_quad_vec(8);
+  for (int i = 0; i < 8; ++i) {
+    src_quad_vec[i] = src_quad.vertices(i);
+    dst_quad_vec[i] = dst_quad.vertices(i);
+  }
+  // Construct the matrix
+  Eigen::Matrix<float, 8, 8> A = Eigen::Matrix<float, 8, 8>::Zero();
+  for (int i = 0; i < 4; ++i) {
+    const int r0 = 2 * i;
+    const int r1 = 2 * i + 1;
+    A(r0, 0) = src_quad_vec[r0];
+    A(r0, 1) = src_quad_vec[r1];
+    A(r0, 2) = 1.f;
+    A(r0, 6) = -src_quad_vec[r0] * dst_quad_vec[r0];
+    A(r0, 7) = -src_quad_vec[r1] * dst_quad_vec[r0];
+    A(r1, 3) = src_quad_vec[r0];
+    A(r1, 4) = src_quad_vec[r1];
+    A(r1, 5) = 1.f;
+    A(r1, 6) = -src_quad_vec[r0] * dst_quad_vec[r1];
+    A(r1, 7) = -src_quad_vec[r1] * dst_quad_vec[r1];
+  }
+
+  // Map arrays to Eigen vectors without memcpy
+  std::vector<float> homography_vector(8);
+  Eigen::Map<Eigen::Matrix<float, 8, 1> > x(homography_vector.data());
+  Eigen::Map<Eigen::Matrix<float, 8, 1> > b(dst_quad_vec.data());
+
+  x = A.fullPivLu().solve(b);
+  Homography homography;
+ 
homography.set_h_00(homography_vector[0]); + homography.set_h_01(homography_vector[1]); + homography.set_h_02(homography_vector[2]); + homography.set_h_10(homography_vector[3]); + homography.set_h_11(homography_vector[4]); + homography.set_h_12(homography_vector[5]); + homography.set_h_20(homography_vector[6]); + homography.set_h_21(homography_vector[7]); + return homography; +} + +bool MotionBox::GetVectorsAndWeights( + const std::vector& motion_vectors, int start_idx, int end_idx, + const Vector2_f& top_left, const Vector2_f& bottom_right, + const MotionBoxState& box_state, bool valid_background_model, + bool is_chunk_boundary, float temporal_scale, float expand_mag, + const std::vector& history, + std::vector* vectors, std::vector* weights, + int* number_of_good_prior, int* number_of_cont_inliers) const { + CHECK(weights); + CHECK(vectors); + CHECK(number_of_good_prior); + CHECK(number_of_cont_inliers); + + const int num_max_vectors = end_idx - start_idx; + weights->clear(); + vectors->clear(); + weights->reserve(num_max_vectors); + vectors->reserve(num_max_vectors); + + const Vector2_f box_domain(box_state.width() * box_state.scale(), + box_state.height() * box_state.scale()); + + CHECK_GT(box_domain.x(), 0.0f); + CHECK_GT(box_domain.y(), 0.0f); + const Vector2_f inv_box_domain(1.0f / box_domain.x(), 1.0f / box_domain.y()); + + // The four lines of the rotated and scaled box. + std::array box_lines; + if (!MotionBoxLines(box_state, Vector2_f(1.0f, 1.0f), &box_lines)) { + LOG(ERROR) << "Error in computing MotionBoxLines. Return 0 good inits and " + "continued inliers"; + return false; + } + + // Get list of previous tracking inliers and outliers. + // Ids are used for non-chunk boundaries (faster matching), locations + // for chunk boundaries. 
+ std::unordered_map inlier_ids; + std::unordered_set outlier_ids; + std::vector inlier_locations; + std::vector outlier_locations; + + if (!is_chunk_boundary) { + MotionBoxInliers(box_state, &inlier_ids); + MotionBoxOutliers(box_state, &outlier_ids); + + // Never map ids in history across a chunk boundary. + for (const auto* state_ptr : history) { + MotionBoxOutliers(*state_ptr, &outlier_ids); + } + // Why don't we build inlier map from a history of inliers? + // It is unlikely we skip a feature as an inlier, it is either + // consistently part of the motion model or it is not. + } else { + MotionBoxInlierLocations(box_state, &inlier_locations); + MotionBoxOutlierLocations(box_state, &outlier_locations); + } + + // Indicator for each vector, if inlier or outlier from prev. estimation. + std::vector is_inlier; + std::vector is_outlier; + is_inlier.reserve(num_max_vectors); + is_outlier.reserve(num_max_vectors); + int num_cont_inliers = 0; + + // Approx. 2 pix at SD resolution. + constexpr float kSqProximity = 2e-3 * 2e-3; + + for (int k = start_idx; k < end_idx; ++k) { + // x is within bound due to sorting. + const MotionVector& test_vector = motion_vectors[k]; + + if (test_vector.pos.y() < top_left.y() || + test_vector.pos.y() > bottom_right.y()) { + continue; + } + + if (std::abs(box_state.rotation()) > 0.01f || + options_.tracking_degrees() == + TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE) { + // Test also if vector is within transformed convex area. + bool accepted = true; + for (const Vector3_f& line : box_lines) { + if (line.DotProd(Vector3_f(test_vector.pos.x(), test_vector.pos.y(), + 1.0f)) > expand_mag) { + // Outside, reject. 
+ accepted = false; + break; + } + } + if (!accepted) { + continue; + } + } + + vectors->push_back(&motion_vectors[k]); + + auto is_close_to_test_vector = [test_vector](const Vector2_f v) -> bool { + return (v - test_vector.pos).Norm2() < kSqProximity; + }; + + const bool is_inlier_flag = + inlier_ids.find(test_vector.track_id) != inlier_ids.end() || + std::find_if(inlier_locations.begin(), inlier_locations.end(), + is_close_to_test_vector) != inlier_locations.end(); + num_cont_inliers += is_inlier_flag; + + const bool is_outlier_flag = + outlier_ids.find(test_vector.track_id) != outlier_ids.end() || + std::find_if(outlier_locations.begin(), outlier_locations.end(), + is_close_to_test_vector) != outlier_locations.end(); + + is_inlier.push_back(is_inlier_flag); + is_outlier.push_back(is_outlier_flag); + } + + CHECK_EQ(vectors->size(), is_inlier.size()); + CHECK_EQ(vectors->size(), is_outlier.size()); + + const float prev_motion_mag = MotionBoxVelocity(box_state).Norm(); + + // Try to lock on object again, if disparity is high. + constexpr float kMinPriorMotionWeight = 0.2f; + const float prior_motion_weight = + std::max(kMinPriorMotionWeight, std::abs(box_state.motion_disparity())) * + box_state.prior_weight(); + + const float motion_sigma = + std::max(options_.min_motion_sigma(), + prev_motion_mag * options_.relative_motion_sigma()); + const float motion_gaussian_scale = -0.5f / (motion_sigma * motion_sigma); + + // Maps current kinetic energy to [0, 1] quantifying static (0) vs. moving (1) + // object. + // Map normalized thresholds to current frame period. + const float low_kinetic_energy = + options_.low_kinetic_energy() * temporal_scale; + const float high_kinetic_energy = + options_.high_kinetic_energy() * temporal_scale; + const float kinetic_identity = LinearRamp( + box_state.kinetic_energy(), low_kinetic_energy, high_kinetic_energy); + int num_good_inits = 0; + + // Map number of continued inliers to score in [0, 1]. 
+ const float cont_inlier_score = LinearRamp(num_cont_inliers, 10, 30); + + VLOG(1) << "GetVectorsAndWeights, found cont. inliers: " << num_cont_inliers + << " score: " << cont_inlier_score; + + DistanceWeightsComputer distance_weights_computer(initial_state_, box_state, + options_); + for (int k = 0; k < vectors->size(); ++k) { + const MotionVector& test_vector = *(*vectors)[k]; + + float weight = distance_weights_computer.ComputeDistanceWeight(test_vector); + + if (valid_background_model) { + const float motion_diff = + fabs(prev_motion_mag - test_vector.object.Norm()); + const float motion_weight = + std::exp(motion_gaussian_scale * motion_diff * motion_diff); + + // Blend with spatial weight, that is the higher the disparity + // (i.e. we lost tracking, the more inclined we are to lock + // onto vectors of similar motion magnitude regardless of their + // position). + // Note: One might feel inclined to always bias towards the previous + // motion, by multiplying weight with motion_weight. This however fails + // when tracking objects that start at rest and start moving. + weight = Lerp(weight, motion_weight, prior_motion_weight); + } + + // There are two kinds of vectors we are trying to balance here: + // - inliers from previous estimation + // - similar vectors + // + // Current strategy: + // - For static objects: Boost inliers a lot, discount outliers a lot, + // do not care about similar vectors. + // - For moving objects: Boost inliers proportional to number of continued + // inliers, discount outliers a lot, boost similar vectors and + // actively downweight dis-similar objects. + // + // Motivation: Inliers are usually not very stable, so if not sufficient + // have been continued prefer velocity over inliers for moving objects. + + // NOTE: Regarding additive vs. multiplicative weights. We need to + // multiply the weight here. 
Adding the weight messes + // with the gaussian spatial weighting which in turn makes it hard to lock + // onto moving objects in the first place (as center is assumed to be + // placed over moving objects, this helps distinguish initial foreground + // and background). + + // Up-weighting of inlier vectors and vectors of similar motion. + float upweight = 1.0f; + if (is_inlier[k]) { + // Previous track, boost weight significantly. + // + // NOTE: Regarding the amount of up-weighting: Long features are not + // very stable on moving objects. Therefore only upweight strongly for + // static objects. + constexpr float kWeakMultiplier = 5.0f; + constexpr float kStrongMultiplier = 20.0f; + + // Map 0 -> 1 and values >= 0.5 -> 0, because long features are not + // very stable on moving objects. Therefore only upweight strongly for + // static objects. + const float kinetic_alpha = + std::max(0.0f, 1.0f - 2.0f * kinetic_identity); + + // Choose strong multiplier only when kinetic_alpha OR inlier score + // support it. + const float multiplier = Lerp(kWeakMultiplier, kStrongMultiplier, + std::max(cont_inlier_score, kinetic_alpha)); + upweight *= multiplier; + } + + // Scale weight boost for moving objects by prior, i.e. modulate + // strength of scale w.r.t. confidence. + const float kin_scale = Lerp(1, 10, box_state.prior_weight()); + // 80% moving object weighted by prior. This weighting is biasing towards + // moving object when the prior is low. + if (kinetic_identity >= 0.8f * box_state.prior_weight() && + test_vector.object.Norm() > high_kinetic_energy && !is_outlier[k]) { + // If we track a moving object, long tracks are less likely to be stable + // due to appearance variations. In that case boost similar vectors. + upweight *= 5.f * kin_scale; + } + + float downweight = 1.0f; + // Down-weighting of outlier vectors and vectors of different motion. + if (!is_inlier[k]) { + // Outlier. 
+ if (is_outlier[k]) { + // Note: Outlier ids might overlap with inliers as outliers are built + // from a history of frames. + // *Always favor inliers over outliers*! Important to keep!! + downweight *= 20.0f; + } + + // Vectors of different motion, for 100% moving object downweight + // vectors with small motion. + if (kinetic_identity >= 1.0f * box_state.prior_weight() && + test_vector.object.Norm() < low_kinetic_energy) { + downweight *= 2.f * kin_scale; + } + } + + // Cap any kind of up or down weighting so that no vector overwhelms + // all others. + const float kMaxWeight = 100.f; + upweight = std::min(kMaxWeight, upweight); + downweight = std::min(kMaxWeight, downweight); + weight *= upweight / downweight; + + num_good_inits += (weight >= 0.1f); + weights->push_back(weight); + } + + const int num_vectors = vectors->size(); + CHECK_EQ(num_vectors, weights->size()); + + const float weight_sum = + std::accumulate(weights->begin(), weights->end(), 0.0f); + + // Normalize weights. + if (weight_sum > 0) { + const float inv_weight_sum = 1.0f / weight_sum; + for (auto& weight : *weights) { + weight *= inv_weight_sum; + } + } + + *number_of_good_prior = num_good_inits; + *number_of_cont_inliers = num_cont_inliers; + + return true; +} + +void MotionBox::TranslationIrlsInitialization( + const std::vector& vectors, + const Vector2_f& irls_scale, std::vector* weights) const { + const int num_features = vectors.size(); + + const auto& irls_options = options_.irls_initialization(); + if (!irls_options.activated() || !num_features) { + return; + } + + // Bool indicator which features agree with model in each round. + // In case no RANSAC rounds are performed considered all features inliers. 
+ std::vector best_features(num_features, 1); + std::vector curr_features(num_features); + float best_sum = 0; + + unsigned int seed = 900913; + std::default_random_engine rand_gen(seed); + std::uniform_int_distribution<> distribution(0, num_features - 1); + + const float cutoff = irls_options.cutoff(); + const float sq_cutoff = cutoff * cutoff; + + for (int rounds = 0; rounds < irls_options.rounds(); ++rounds) { + float curr_sum = 0; + // Pick a random vector. + const int rand_idx = distribution(rand_gen); + const Vector2_f flow = vectors[rand_idx]->object; + const auto error_system = ComputeIrlsErrorSystem(irls_scale, flow); + + // curr_features gets set for every feature below; no need to reset. + for (int i = 0; i < num_features; ++i) { + const Vector2_f diff = vectors[i]->object - flow; + const float error = ErrorDiff(diff, error_system); + curr_features[i] = static_cast(error < sq_cutoff); + if (curr_features[i]) { + curr_sum += (*weights)[i]; + } + } + + if (curr_sum > best_sum) { + best_sum = curr_sum; + std::swap(best_features, curr_features); + } + } + + std::vector inlier_weights; + inlier_weights.reserve(num_features); + + // Score outliers low. + int num_inliers = 0; + for (int i = 0; i < num_features; ++i) { + if (best_features[i] == 0) { + (*weights)[i] = 1e-10f; + } else { + ++num_inliers; + inlier_weights.push_back((*weights)[i]); + } + } + + if (!inlier_weights.empty()) { + // Ensure that all selected inlier features have at least median weight. 
+ auto median = inlier_weights.begin() + inlier_weights.size() * 0.5f; + std::nth_element(inlier_weights.begin(), median, inlier_weights.end()); + + for (int i = 0; i < num_features; ++i) { + if (best_features[i] != 0) { + (*weights)[i] = std::max(*median, (*weights)[i]); + } + } + } +} + +void MotionBox::EstimateObjectMotion( + const std::vector& motion_vectors, + const std::vector& prior_weights, int num_continued_inliers, + const Vector2_f& irls_scale, std::vector* weights, + Vector2_f* object_translation, LinearSimilarityModel* object_similarity, + Homography* object_homography) const { + CHECK(object_translation); + CHECK(object_similarity); + CHECK(object_homography); + + const int num_vectors = motion_vectors.size(); + CHECK_EQ(num_vectors, prior_weights.size()); + CHECK_EQ(num_vectors, weights->size()); + + // Create backup of weights if needed. + std::vector similarity_weights; + + switch (options_.tracking_degrees()) { + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE: + similarity_weights = *weights; + break; + + default: + // Nothing to do here. + break; + } + + EstimateTranslation(motion_vectors, prior_weights, irls_scale, weights, + object_translation); + + TranslationModel translation_model = TranslationAdapter::FromArgs( + object_translation->x(), object_translation->y()); + + // For any additional degrees of freedom, require a good set of inliers. 
+ if (num_continued_inliers < options_.object_similarity_min_contd_inliers()) { + VLOG_IF(2, options_.tracking_degrees() != + TrackStepOptions::TRACKING_DEGREE_TRANSLATION) + << "Falling back to translation!!!"; + VLOG(1) << "num_continued_inliers: " << num_continued_inliers << " < " + << options_.object_similarity_min_contd_inliers() + << ", fall back to translation"; + *object_similarity = LinearSimilarityAdapter::Embed(translation_model); + *object_homography = HomographyAdapter::Embed(translation_model); + return; + } + + switch (options_.tracking_degrees()) { + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE: + case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE: { + if (EstimateSimilarity(motion_vectors, prior_weights, irls_scale, + &similarity_weights, object_similarity)) { + if (!ObjectMotionValidator::IsValidSimilarity( + *object_similarity, options_.box_similarity_max_scale(), + options_.box_similarity_max_rotation())) { + LOG(WARNING) << "Unstable similarity model - falling back to " + << "translation."; + *object_similarity = + LinearSimilarityAdapter::Embed(translation_model); + } else { + // Good estimation, use weights as output. 
+ weights->swap(similarity_weights); + } + } else { + *object_similarity = LinearSimilarityAdapter::Embed(translation_model); + } + + break; + } + case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE: + if (EstimateHomography(motion_vectors, prior_weights, irls_scale, + &similarity_weights, object_homography)) { + if (!ObjectMotionValidator::IsValidHomography( + *object_homography, options_.quad_homography_max_scale(), + options_.quad_homography_max_rotation())) { + LOG(WARNING) << "Unstable homography model - falling back to " + << "translation."; + *object_homography = HomographyAdapter::Embed(translation_model); + } else { + weights->swap(similarity_weights); + } + } else { + *object_homography = HomographyAdapter::Embed(translation_model); + } + VLOG(1) << "Got homography: " + << HomographyAdapter::ToString(*object_homography); + break; + default: + // Plenty of CAMERA_ cases are not handled in this function. + break; + } +} + +void MotionBox::EstimateTranslation( + const std::vector& motion_vectors, + const std::vector& prior_weights, const Vector2_f& irls_scale, + std::vector* weights, Vector2_f* translation) const { + CHECK(weights); + CHECK(translation); + + const int iterations = options_.irls_iterations(); + + // NOTE: Floating point accuracy is totally sufficient here. + // We tried changing to double now 3 times and it just does + // not matter. Do not do it again. 
- Past self + Vector2_f object_translation; + const int num_vectors = motion_vectors.size(); + const float kEpsilon = 1e-8f; + + VLOG(1) << "Estimating translation for " << num_vectors << " vectors"; + + for (int i = 0; i < iterations; ++i) { + float flow_sum = 0; + object_translation = Vector2_f(0.0, 0.0); + for (int k = 0; k < num_vectors; ++k) { + const MotionVector& motion_vector = *motion_vectors[k]; + const Vector2_f flow = motion_vector.object; + object_translation += flow * (*weights)[k]; + flow_sum += (*weights)[k]; + } + + if (flow_sum > 0) { + object_translation *= (1.0 / flow_sum); + + const auto error_system = + ComputeIrlsErrorSystem(irls_scale, object_translation); + + // Update irls weights. + for (int k = 0; k < num_vectors; ++k) { + const MotionVector& motion_vector = *motion_vectors[k]; + Vector2_f diff(motion_vector.object - object_translation); + const float error = ErrorDiff(diff, error_system); + // In last iteration compute weights without any prior bias. + const float numerator = (i + 1 == iterations) ? 1.0f : prior_weights[k]; + (*weights)[k] = numerator / (error + kEpsilon); + } + } + } + *translation = object_translation; + + VLOG(1) << "Got translation: " << *translation; +} + +bool MotionBox::EstimateSimilarity( + const std::vector& motion_vectors, + const std::vector& prior_weights, const Vector2_f& irls_scale, + std::vector* weights, LinearSimilarityModel* lin_sim) const { + CHECK(weights); + CHECK(lin_sim); + + const int iterations = options_.irls_iterations(); + LinearSimilarityModel object_similarity; + const int num_vectors = motion_vectors.size(); + const float kEpsilon = 1e-8f; + + VLOG(1) << "Estimating similarity for " << num_vectors << " vectors"; + for (int i = 0; i < iterations; ++i) { + if (LinearSimilarityL2Solve(motion_vectors, *weights, &object_similarity)) { + // Update irls weights. 
+ for (int k = 0; k < num_vectors; ++k) { + const MotionVector& motion_vector = *motion_vectors[k]; + const Vector2_f model_vec = + TransformPoint(object_similarity, motion_vector.pos) - + motion_vector.pos; + const auto error_system = ComputeIrlsErrorSystem(irls_scale, model_vec); + + Vector2_f diff(motion_vector.object - model_vec); + const float error = ErrorDiff(diff, error_system); + // In last iteration compute weights without any prior bias. + const float numerator = (i + 1 == iterations) ? 1.0f : prior_weights[k]; + (*weights)[k] = numerator / (error + kEpsilon); + } + } else { + return false; + } + } + *lin_sim = object_similarity; + + VLOG(1) << "Got similarity: " + << LinearSimilarityAdapter::ToString(object_similarity); + return true; +} + +bool MotionBox::EstimateHomography( + const std::vector& motion_vectors, + const std::vector& prior_weights, const Vector2_f& irls_scale, + std::vector* weights, Homography* object_homography) const { + CHECK(weights); + + const int iterations = options_.irls_iterations(); + Homography homography; + const int num_vectors = motion_vectors.size(); + const float kEpsilon = 1e-8f; + + VLOG(1) << "Estimating homography for " << num_vectors << " vectors"; + for (int i = 0; i < iterations; ++i) { + if (HomographyL2Solve(motion_vectors, *weights, &homography)) { + // Update irls weights. + for (int k = 0; k < num_vectors; ++k) { + const MotionVector& motion_vector = *motion_vectors[k]; + const Vector2_f model_vec = + TransformPoint(homography, motion_vector.pos) - motion_vector.pos; + const auto error_system = ComputeIrlsErrorSystem(irls_scale, model_vec); + + Vector2_f diff(motion_vector.object - model_vec); + const float error = ErrorDiff(diff, error_system); + // In last iteration compute weights without any prior bias. + const float numerator = (i + 1 == iterations) ? 
1.0f : prior_weights[k]; + (*weights)[k] = numerator / (error + kEpsilon); + } + } else { + return false; + } + } + *object_homography = homography; + + return true; +} + +bool MotionBox::EstimatePnpHomography( + const MotionBoxState& curr_pos, + const std::vector& motion_vectors, + const std::vector& weights, float domain_x, float domain_y, + Homography* pnp_homography) const { + constexpr int kMinVectors = 4; + if (motion_vectors.size() < kMinVectors) return false; + + Homography inv_h = HomographyAdapter::Invert(curr_pos.pnp_homography()); + + std::vector vectors_3d; + vectors_3d.reserve(motion_vectors.size()); + std::vector vectors_2d; + vectors_2d.reserve(motion_vectors.size()); + if (options_.has_camera_intrinsics()) { + const auto& camera = options_.camera_intrinsics(); + cv::Mat camera_mat, dist_coef; + ConvertCameraIntrinsicsToCvMat(camera, &camera_mat, &dist_coef); + float scale = std::max(camera.w(), camera.h()); + + std::vector mv_p; + mv_p.reserve(motion_vectors.size()); + std::vector mv_q; + mv_q.reserve(motion_vectors.size()); + for (int j = 0; j < motion_vectors.size(); ++j) { + if (weights[j] < kMaxOutlierWeight) continue; + mv_p.emplace_back(motion_vectors[j]->pos.x() * scale, + motion_vectors[j]->pos.y() * scale); + + Vector2_f q = motion_vectors[j]->pos + motion_vectors[j]->object + + motion_vectors[j]->background; + mv_q.emplace_back(q.x() * scale, q.y() * scale); + } + + if (mv_p.size() < kMinVectors) return false; + + cv::undistortPoints(mv_p, mv_p, camera_mat, dist_coef); + cv::undistortPoints(mv_q, mv_q, camera_mat, dist_coef); + + vectors_3d.resize(mv_p.size()); + vectors_2d.resize(mv_q.size()); + for (int j = 0; j < mv_p.size(); ++j) { + Vector2_f p = TransformPoint(inv_h, Vector2_f(mv_p[j].x, mv_p[j].y)); + vectors_3d[j] = cv::Point3f(p.x(), p.y(), 0.0f); + vectors_2d[j] = cv::Point2f(mv_q[j].x, mv_q[j].y); + } + } else { + Vector2_f center(domain_x * 0.5f, domain_y * 0.5f); + for (int j = 0; j < motion_vectors.size(); ++j) { + if 
(weights[j] < kMaxOutlierWeight) continue; + Vector2_f p = TransformPoint(inv_h, motion_vectors[j]->pos - center); + vectors_3d.emplace_back(p.x(), p.y(), 0.0f); + + Vector2_f q = motion_vectors[j]->pos + motion_vectors[j]->object + + motion_vectors[j]->background - center; + vectors_2d.emplace_back(q.x(), q.y()); + } + + if (vectors_3d.size() < kMinVectors) return false; + } + + // TODO: use previous rvec and tvec to initilize the solver. + cv::Mat rvec, tvec; + cv::solvePnP(vectors_3d, vectors_2d, + cv::Mat::eye(3, 3, CV_64F) /* camera_mat */, + cv::Mat::zeros(1, 5, CV_64FC1) /* dist_coef */, rvec, tvec); + *pnp_homography = PnpHomographyFromRotationAndTranslation(rvec, tvec); + + return true; +} + +void MotionBox::ApplyObjectMotionPerspectively(const MotionBoxState& curr_pos, + const Homography& pnp_homography, + float domain_x, float domain_y, + MotionBoxState* next_pos) const { + const float half_width = curr_pos.aspect_ratio(); + const float half_height = 1.0f; + + constexpr int kQuadCornersSize = 4; + + // Omitting the 3rd dimension because they are all zeros. 
+ const std::vector corners_3d{ + Vector2_f(-half_width, -half_height), + Vector2_f(-half_width, half_height), + Vector2_f(half_width, half_height), + Vector2_f(half_width, -half_height), + }; + + std::vector corners_2d(kQuadCornersSize); + for (int c = 0; c < kQuadCornersSize; ++c) { + corners_2d[c] = + HomographyAdapter::TransformPoint(pnp_homography, corners_3d[c]); + } + + if (options_.has_camera_intrinsics()) { + std::vector cv_points(4); + for (int c = 0; c < kQuadCornersSize; ++c) { + cv_points[c] = cv::Point3f(corners_2d[c].x(), corners_2d[c].y(), 1.0); + } + + const auto& camera = options_.camera_intrinsics(); + cv::Mat camera_mat, dist_coef; + ConvertCameraIntrinsicsToCvMat(camera, &camera_mat, &dist_coef); + cv::Mat dummy_zeros = cv::Mat::zeros(1, 3, CV_64FC1); + std::vector cv_points_distorted; + cv::projectPoints(cv_points, dummy_zeros /* rvec */, dummy_zeros /* tvec */, + camera_mat, dist_coef, cv_points_distorted); + const float scale = 1.0f / std::max(camera.w(), camera.h()); + for (int c = 0; c < kQuadCornersSize; ++c) { + next_pos->mutable_quad()->set_vertices(c * 2, + cv_points_distorted[c].x * scale); + next_pos->mutable_quad()->set_vertices(c * 2 + 1, + cv_points_distorted[c].y * scale); + } + } else { + const float center_x = domain_x * 0.5f; + const float center_y = domain_y * 0.5f; + for (int c = 0; c < kQuadCornersSize; ++c) { + next_pos->mutable_quad()->set_vertices(c * 2, + corners_2d[c].x() + center_x); + next_pos->mutable_quad()->set_vertices(c * 2 + 1, + corners_2d[c].y() + center_y); + } + } + + *next_pos->mutable_pnp_homography() = pnp_homography; + UpdateStatePositionAndSizeFromStateQuad(next_pos); +} + +float MotionBox::ComputeMotionDisparity( + const MotionBoxState& curr_pos, const Vector2_f& irls_scale, + float continued_inliers, int num_inliers, + const Vector2_f& object_translation) const { + // Motion disparity does not take into account change of direction, + // only use parallel irls scale. 
+ const float curr_velocity = MotionBoxVelocity(curr_pos).Norm(); + const float sign = object_translation.Norm() < curr_velocity ? -1.0f : 1.0f; + const float motion_diff = fabs(object_translation.Norm() - curr_velocity); + + // Score difference. + const float measured_motion_disparity = LinearRamp( + motion_diff * irls_scale.x(), options_.motion_disparity_low_level(), + options_.motion_disparity_high_level()); + + // Cap disparity measurement by inlier ratio, to account for objects + // suddenly stopping/accelerating. In this case measured disparity might be + // high, whereas inliers continue to be tracked. + const float max_disparity = 1.0f - continued_inliers; + + const float capped_disparity = + std::min(max_disparity, measured_motion_disparity); + + // Take into account large disparity in previous frames. Score by prior of + // previous motion. + const float motion_disparity = + std::max(curr_pos.motion_disparity() * options_.disparity_decay(), + capped_disparity) * + curr_pos.prior_weight(); + + // Map number of inliers to score in [0, 1], assuming a lot of inliers + // indicate lock onto object. + const float inlier_score = LinearRamp(num_inliers, 20, 40); + + // Decaying motion disparity faster if number of inliers indicate lock onto + // tracking objects has occurred. 
+ return std::min(1.0f - inlier_score, motion_disparity) * sign; +} + +void MotionBox::ScoreAndRecordInliers( + const MotionBoxState& curr_pos, + const std::vector& vectors, + const std::vector& grid_positions, + const std::vector& pre_estimation_weights, + const std::vector& post_estimation_weights, + float background_discrimination, MotionBoxState* next_pos, + std::vector* inlier_weights, std::vector* inlier_density, + int* continued_inliers, int* swapped_inliers, float* motion_inliers_out, + float* kinetic_average_out) const { + CHECK(inlier_weights); + CHECK(inlier_density); + CHECK(continued_inliers); + CHECK(swapped_inliers); + CHECK(motion_inliers_out); + CHECK(kinetic_average_out); + + std::unordered_map prev_inliers; + MotionBoxInliers(curr_pos, &prev_inliers); + + std::unordered_set prev_outliers; + MotionBoxOutliers(curr_pos, &prev_outliers); + + ClearInlierState(next_pos); + + // Continued inlier fraction denotes amount of spatial occlusion. Very low + // values indicate that we are in very difficult tracking territory. + *continued_inliers = 0; + *swapped_inliers = 0; + float kinetic_average = 0; + float kinetic_average_sum = 0; + float motion_inliers = 0; + const int num_vectors = vectors.size(); + inlier_weights->resize(num_vectors); + inlier_density->resize(num_vectors); + + // Inliers normalization grid. + std::vector grid_count(kNormalizationGridSize * + kNormalizationGridSize); + const float prev_object_motion = MotionBoxVelocity(curr_pos).Norm(); + // Count number of similar moving inliers as previous object motion. + const float similar_motion_threshold = + std::max(2e-3f, prev_object_motion * 0.3f); + + // If background discrimination is low, inliers are ambiguous: Hard to + // distinguish from earlier outliers. In this case do not record inliers + // outside our current tracking extent, as everything will look like an + // inlier. + // + // TODO: Compute 2nd moment for inliers and describe as ellipse, + // improve shape here then. 
+ bool inlier_ambiguity = background_discrimination < 0.5f; + int rejected = 0; + int num_inliers = 0; + for (int k = 0; k < num_vectors; ++k) { + (*inlier_weights)[k] = + LinearRamp(post_estimation_weights[k], options_.inlier_low_weight(), + options_.inlier_high_weight()); + const int track_id = vectors[k]->track_id; + + bool is_prev_outlier = prev_outliers.find(track_id) != prev_outliers.end(); + + const Vector2_f match_loc = vectors[k]->MatchLocation(); + if ((*inlier_weights)[k] > kMinInlierWeight) { // Inlier. + if (is_prev_outlier) { + ++(*swapped_inliers); + } + + if (inlier_ambiguity && + !PointWithinInlierExtent(vectors[k]->Location(), curr_pos)) { + ++rejected; + continue; + } + + ++num_inliers; + + AddToGrid(grid_positions[k], &grid_count); + + if (track_id >= 0) { + next_pos->add_inlier_ids(track_id); + next_pos->add_inlier_id_match_pos(match_loc.x() * kShortScale); + next_pos->add_inlier_id_match_pos(match_loc.y() * kShortScale); + auto pos = prev_inliers.find(track_id); + if (pos != prev_inliers.end()) { + // Count length of observation. + next_pos->add_inlier_length(pos->second + 1.0f); + ++(*continued_inliers); + } else { + next_pos->add_inlier_length(1); + } + } + + // Note: This should be weighted by the pre estimation weights, simply + // adding a 1 for each inlier leads to lower irls averages. + kinetic_average += vectors[k]->object.Norm() * pre_estimation_weights[k]; + kinetic_average_sum += pre_estimation_weights[k]; + + // Count number of inliers that agree with previous kinetic energy + // estimate. + if (std::abs(vectors[k]->object.Norm() - prev_object_motion) * + curr_pos.prior_weight() < + similar_motion_threshold) { + motion_inliers += pre_estimation_weights[k]; + } + } else if ((*inlier_weights)[k] < kMaxOutlierWeight) { // Outlier. 
+ next_pos->add_outlier_ids(track_id); + next_pos->add_outlier_id_match_pos(match_loc.x() * kShortScale); + next_pos->add_outlier_id_match_pos(match_loc.y() * kShortScale); + } + } + + // Read out density of inliers. + for (int k = 0; k < num_vectors; ++k) { + if ((*inlier_weights)[k] > kMinInlierWeight) { + (*inlier_density)[k] = 2 * SampleFromGrid( + grid_positions[k], grid_count); + } else { + (*inlier_density)[k] = 0; + } + } + + if (kinetic_average_sum > 0) { + kinetic_average *= 1.0f / kinetic_average_sum; + } + + VLOG(1) << "num inliers: " << num_inliers << " rejected: " << rejected; + + *kinetic_average_out = kinetic_average; + *motion_inliers_out = motion_inliers; +} + +void MotionBox::ComputeInlierCenterAndExtent( + const std::vector& motion_vectors, + const std::vector& weights, const std::vector& density, + const MotionBoxState& box_state, float* min_inlier_sum, Vector2_f* center, + Vector2_f* extent) const { + CHECK(min_inlier_sum); + CHECK(center); + CHECK(extent); + + float weight_sum = 0; + float inlier_sum = 0; + const int num_vectors = motion_vectors.size(); + CHECK_EQ(num_vectors, weights.size()); + CHECK_EQ(num_vectors, density.size()); + + Vector2_f first_moment(0.0f, 0.0f); + Vector2_f second_moment(0.0f, 0.0f); + + Vector2_f top_left; + Vector2_f bottom_right; + MotionBoxBoundingBox(box_state, &top_left, &bottom_right); + + for (int k = 0; k < num_vectors; ++k) { + const MotionVector& motion_vector = *motion_vectors[k]; + const Vector2_f match = motion_vector.MatchLocation(); + float space_multiplier = 1.0f; + // Decrease contribution of out of bound inliers. + // Note: If all inliers are out of bound this down weighting will have no + // effect. It is designed to prevent skewing the inlier center towards + // similar moving inliers outside the tracked box. 
+ if (match.x() < top_left.x() || match.x() > bottom_right.x() || + match.y() < top_left.y() || match.y() > bottom_right.y()) { + space_multiplier = 0.25f; + } + const float w = weights[k] * density[k] * space_multiplier; + if (w > 0) { + first_moment += w * match; + second_moment += + w * Vector2_f(match.x() * match.x(), match.y() * match.y()); + weight_sum += w; + inlier_sum += weights[k]; + } + } + + // Update center if sufficient inliers present. + if (inlier_sum > *min_inlier_sum) { + const float inv_weight_sum = 1.0f / weight_sum; + first_moment *= inv_weight_sum; + second_moment *= inv_weight_sum; + + *center = first_moment; + *extent = second_moment - Vector2_f(first_moment.x() * first_moment.x(), + first_moment.y() * first_moment.y()); + + // 1.645 sigmas in each direction = 90% of the data captured. + *extent = + Vector2_f(std::sqrt(extent->x()) * 3.29, std::sqrt(extent->y()) * 3.29); + } else { + // Gravitate back to box center with inlier center. + *center = Lerp(MotionBoxCenter(box_state), InlierCenter(box_state), 0.5f); + } + + // Record number of inliers present. + *min_inlier_sum = weight_sum; +} + +float MotionBox::ScaleEstimate( + const std::vector& motion_vectors, + const std::vector& weights, float min_sum) const { + const int num_vectors = motion_vectors.size(); + CHECK_EQ(num_vectors, weights.size()); + + float scale_sum = 0; + + // First moments. + Vector2_d sum_coords(0, 0); + Vector2_d match_sum_coords(0, 0); + + // Second moments. 
+ Vector2_d sum_sq_coords(0, 0); + Vector2_d match_sum_sq_coords(0, 0); + + for (int k = 0; k < num_vectors; ++k) { + const MotionVector& motion_vector = *motion_vectors[k]; + + const Vector2_d pos(motion_vector.pos.x(), motion_vector.pos.y()); + double weight = weights[k]; + sum_coords += weight * pos; + sum_sq_coords += weight * Vector2_d(pos.x() * pos.x(), pos.y() * pos.y()); + + const Vector2_d match = + Vector2_d::Cast(motion_vector.MatchLocation()); + match_sum_coords += weight * match; + match_sum_sq_coords += + weight * Vector2_d(match.x() * match.x(), match.y() * match.y()); + scale_sum += weights[k]; + } + + if (scale_sum > min_sum) { + const double denom = 1.0f / scale_sum; + sum_coords *= denom; + match_sum_coords *= denom; + sum_sq_coords *= denom; + match_sum_sq_coords *= denom; + + const float curr_scale = + sqrt(sum_sq_coords.x() - sum_coords.x() * sum_coords.x() + + sum_sq_coords.y() - sum_coords.y() * sum_coords.y()); + const float next_scale = sqrt( + match_sum_sq_coords.x() - match_sum_coords.x() * match_sum_coords.x() + + match_sum_sq_coords.y() - match_sum_coords.y() * match_sum_coords.y()); + return next_scale / curr_scale; + } + + return 1.0f; +} + +void MotionBox::ApplySpringForce(const Vector2_f& center_of_interest, + const float rel_threshold, + const float spring_force, + MotionBoxState* box_state) const { + // Apply spring force towards center of interest. + const Vector2_f center = MotionBoxCenter(*box_state); + const float center_diff_x = center_of_interest.x() - center.x(); + const float center_diff_y = center_of_interest.y() - center.y(); + + const float diff_x = fabs(center_diff_x) - box_state->width() * rel_threshold; + const float diff_y = + fabs(center_diff_y) - box_state->height() * rel_threshold; + + if (diff_x > 0) { + const float correction_mag = diff_x * spring_force; + const float correction = + center_diff_x < 0 ? 
-correction_mag : correction_mag; + box_state->set_pos_x(box_state->pos_x() + correction); + } + + if (diff_y > 0) { + const float correction_mag = diff_y * spring_force; + const float correction = + center_diff_y < 0 ? -correction_mag : correction_mag; + box_state->set_pos_y(box_state->pos_y() + correction); + } +} + +void MotionBox::TrackStepImpl(int from_frame, const MotionBoxState& curr_pos, + const MotionVectorFrame& motion_frame, + const std::vector& history, + MotionBoxState* next_pos) const { + // Create new curr pos with velocity scaled to current duration. + constexpr float kDefaultPeriodMs = 1000.0f / kTrackingDefaultFps; + + // Scale to be applied to velocity related fields in MotionBoxState + // to transform state from standard frame period to current one. + float temporal_scale = (motion_frame.duration_ms == 0) + ? 1.0f + : motion_frame.duration_ms / kDefaultPeriodMs; + + MotionBoxState curr_pos_normalized = curr_pos; + ScaleStateTemporally(temporal_scale, &curr_pos_normalized); + ScaleStateAspect(motion_frame.aspect_ratio, false, &curr_pos_normalized); + + TrackStepImplDeNormalized(from_frame, curr_pos_normalized, motion_frame, + history, next_pos); + + // Scale back velocity and aspect to normalized domains. + ScaleStateTemporally(1.0f / temporal_scale, next_pos); + ScaleStateAspect(motion_frame.aspect_ratio, true, next_pos); + + // Test if out of bound, only for moving objects. + const float static_motion = + options_.static_motion_temporal_ratio() * temporal_scale; + if (MotionBoxVelocity(*next_pos).Norm() > static_motion) { + // Test if close to boundary and keep moving towards it. 
+ constexpr float kRatio = 0.3; + if ((next_pos->pos_x() < -next_pos->width() * kRatio && + next_pos->dx() < -static_motion / 2) || + (next_pos->pos_y() < -next_pos->height() * kRatio && + next_pos->dy() < -static_motion / 2) || + (next_pos->pos_x() > 1.0f - next_pos->width() * (1.0f - kRatio) && + next_pos->dx() > static_motion / 2) || + (next_pos->pos_y() > 1.0f - next_pos->height() * (1.0f - kRatio) && + next_pos->dy() > static_motion / 2)) { + VLOG(1) << "Tracked box went out of bound."; + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + return; + } + } +} + +float MotionBox::ComputeTrackingConfidence( + const MotionBoxState& motion_box_state) const { + const float inlier_num_lower_bound = 10.0f; + const float inlier_num_upper_bound = 30.0f; + const float confidence = + LinearRamp(motion_box_state.inlier_ids_size(), inlier_num_lower_bound, + inlier_num_upper_bound); + return confidence; +} + +// General tracking algorithm overview: +// Tracking algorithm consists of 6 main stages. +// 1.) Selecting features from passed MotionVectorFrame based on incidence with +// rectangle defined by curr_pos +// 2.) Assigning each vector a prior weight. Vector are mainly scored by a box +// centered gaussian, giving more weights to vectors within the center of +// the box. If current state is deemed unreliable, vectors with similar +// velocity to previously estimated velocity are favored. If current state +// indicates tracking of a moving object, high velocity vectors are favored. +// 3.) Estimating a translational model via IRLS enforcing the prior of step 2 +// in every iteration. +// 4.) Score how much the estimate model deviates from the previous motion +// (termed motion disparity) and how discriminative the motion is +// from the background motion (termed motion discrimination) +// 5.) Computing the inlier center (position of vectors used for the motion +// model in the next frame) and center of high velocity vectors. 
Apply a +// spring force towards each center based on the motion discrimination. +// 6.) Update velocity and kinetic energy by blending current measurment with +// previous one. +void MotionBox::TrackStepImplDeNormalized( + int from_frame, const MotionBoxState& curr_pos, + const MotionVectorFrame& motion_frame, + const std::vector& history, + MotionBoxState* next_pos) const { + CHECK(next_pos); + + constexpr float kDefaultPeriodMs = 1000.0f / kTrackingDefaultFps; + float temporal_scale = (motion_frame.duration_ms == 0) + ? 1.0f + : motion_frame.duration_ms / kDefaultPeriodMs; + + // Initialize to current position. + *next_pos = curr_pos; + + if (!IsBoxValid(curr_pos)) { + LOG(ERROR) << "curr_pos is not a valid box. Stop tracking!"; + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + return; + } + + Vector2_f top_left, bottom_right; + float expand_mag = 0.0f; + GetStartPosition(curr_pos, motion_frame.aspect_ratio, &expand_mag, &top_left, + &bottom_right); + + const float aspect_ratio = motion_frame.aspect_ratio; + float domain_x = 1.0f; + float domain_y = 1.0f; + ScaleFromAspect(aspect_ratio, false, &domain_x, &domain_y); + + // Binary search for start and end index (lexicographic search, i.e. + // x indices are guaranteed to be within bounds, but y coordinates could be + // outside and need to be checked against the domain of the box via + // GetVectorsAndWeights below). 
+ MotionVector search_start; + MotionVector search_end; + search_start.pos = top_left; + search_end.pos = bottom_right; + + int start_idx = std::lower_bound(motion_frame.motion_vectors.begin(), + motion_frame.motion_vectors.end(), + search_start, MotionVectorComparator()) - + motion_frame.motion_vectors.begin(); + + int end_idx = std::lower_bound(motion_frame.motion_vectors.begin(), + motion_frame.motion_vectors.end(), search_end, + MotionVectorComparator()) - + motion_frame.motion_vectors.begin(); + + const float static_motion = + options_.static_motion_temporal_ratio() * temporal_scale; + if (start_idx >= end_idx || top_left.x() >= domain_x - expand_mag || + top_left.y() >= domain_y - expand_mag || bottom_right.x() <= expand_mag || + bottom_right.y() <= expand_mag) { + // Empty box, no features found. It can happen if box is outside + // field of view, or there is no features in the box. + // Move box by background model if it has static motion, else move return + // tracking error. + if (MotionBoxVelocity(curr_pos).Norm() > static_motion || + (!motion_frame.valid_background_model && from_frame != queue_start_)) { + next_pos->set_track_status(MotionBoxState::BOX_NO_FEATURES); + } else { + // Static object, move by background model. + next_pos->set_track_status(MotionBoxState::BOX_TRACKED_OUT_OF_BOUND); + ApplyCameraTrackingDegrees(curr_pos, motion_frame.background_model, + options_, Vector2_f(domain_x, domain_y), + next_pos); + + // The further the quad is away from the FOV (range 0 to 1), the larger + // scale change will be applied to the quad by homography transform. To + // some certain extent, the position of vertices will be flipped from + // positive to negative, or vice versa. Here we reject all the quads with + // abnormal shape by convexity of the quad.. 
+ if (next_pos->has_quad() && + (ObjectMotionValidator::IsQuadOutOfFov( + next_pos->quad(), Vector2_f(domain_x, domain_y)) || + !ObjectMotionValidator::IsValidQuad(next_pos->quad()))) { + LOG(ERROR) << "Quad is out of fov or not convex. Cancel tracking."; + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + return; + } + } + return; + } + + float start_x = Clamp(top_left.x(), 0.0f, domain_x); + float start_y = Clamp(top_left.y(), 0.0f, domain_y); + float end_x = Clamp(bottom_right.x(), 0.0f, domain_x); + float end_y = Clamp(bottom_right.y(), 0.0f, domain_y); + + const Vector2_f curr_pos_size = MotionBoxSize(curr_pos); + constexpr float kMinSize = 1e-3f; // 1 pix for 1080p. + if (start_x >= end_x || start_y >= end_y || curr_pos_size.x() < kMinSize || + curr_pos_size.y() < kMinSize) { + next_pos->set_track_status(MotionBoxState::BOX_EMPTY); + return; + } + + top_left = Vector2_f(start_x, start_y); + bottom_right = Vector2_f(end_x, end_y); + + // Get indices of features within box, corresponding priors and position + // in feature grid. + std::vector vectors; + std::vector prior_weights; + bool valid_background_model = motion_frame.valid_background_model; + + int num_good_inits; + int num_cont_inliers; + const bool get_vec_weights_status = GetVectorsAndWeights( + motion_frame.motion_vectors, start_idx, end_idx, top_left, bottom_right, + curr_pos, valid_background_model, motion_frame.is_chunk_boundary, + temporal_scale, expand_mag, history, &vectors, &prior_weights, + &num_good_inits, &num_cont_inliers); + if (!get_vec_weights_status) { + LOG(ERROR) << "error in GetVectorsAndWeights. Terminate tracking."; + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + return; + } + + // `num_good_inits` comes from motion vector weights, but pnp solver currently + // does not depend on weights. So for pnp tracking mode, we don't fall back to + // camera motion tracking based on num_good_inits. 
+ if (!curr_pos.has_pnp_homography() && + (num_good_inits < 3 && + MotionBoxVelocity(curr_pos).Norm() <= static_motion)) { + // Static object, move by background model. + next_pos->set_track_status(MotionBoxState::BOX_TRACKED); + ApplyCameraTrackingDegrees(curr_pos, motion_frame.background_model, + options_, Vector2_f(domain_x, domain_y), + next_pos); + VLOG(1) << "No good inits; applying camera motion for static object"; + + if (next_pos->has_quad() && + !ObjectMotionValidator::IsValidQuad(next_pos->quad())) { + LOG(ERROR) << "Quad is not convex. Cancel tracking."; + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + return; + } + return; + } + + VLOG(1) << "Good inits: " << num_good_inits; + + const int num_vectors = vectors.size(); + CHECK_EQ(num_vectors, prior_weights.size()); + + Vector2_f object_translation; + + // Compute a rough estimate of the current motion. + for (int k = 0; k < num_vectors; ++k) { + object_translation += vectors[k]->Motion() * prior_weights[k]; + } + + Vector2_f prev_object_motion = MotionBoxVelocity(curr_pos); + + // Estimate expected motion magnitude. In case of low prior, skew more + // towards rough estimate instead of previous motion. + const float motion_mag_estimate = + std::max(object_translation.Norm(), + prev_object_motion.Norm() * curr_pos.prior_weight()); + + // For motivation about this: See MotionEstimation::GetIRLSResidualScale. + // Assume 1 pixel estimation error for tracked features at 360p video. + // This serves as absolute minimum of the estimation error, so we do not + // scale translation fractions below this threshold. + constexpr float kMinError = 1.25e-3f; + + // We use a combination of absolute and relative error. If a predefined + // fraction of the motion exceeds the minimum error, we scale the error + // such that the relative error equals the min error. + // We use different thresholds parallel and perpendicular to the estimation + // direction. 
+  // Motivation is that we allow for more error perpendicular to an estimation
+  // (angular difference) than in its direction (magnitude error).
+
+  // Scale in parallel, orthogonal direction.
+  Vector2_f irls_scale(1.0f, 1.0f);
+
+  const Vector2_f kMotionPercentage(0.1f, 0.25f);
+  const Vector2_f motion_mag_scaled = motion_mag_estimate * kMotionPercentage;
+
+  if (motion_mag_scaled.x() > kMinError) {
+    irls_scale.x(kMinError / motion_mag_scaled.x());
+  }
+  if (motion_mag_scaled.y() > kMinError) {
+    irls_scale.y(kMinError / motion_mag_scaled.y());
+  }
+
+  // Irls init for translation.
+  // TODO: Adjust to object tracking degrees.
+  TranslationIrlsInitialization(vectors, irls_scale, &prior_weights);
+
+  LinearSimilarityModel object_similarity;
+  Homography object_homography;
+  Homography pnp_homography;
+
+  std::vector weights = prior_weights;
+  if (num_good_inits > 0) {
+    MEASURE_TIME << "Estimate object motion.";
+    EstimateObjectMotion(vectors, prior_weights, num_cont_inliers, irls_scale,
+                         &weights, &object_translation, &object_similarity,
+                         &object_homography);
+  } else {
+    // There is no hope to estimate a model in a stable manner here.
+    object_translation = prev_object_motion;
+    VLOG(1) << "No good inits, reusing prev. motion instead of estimation";
+  }
+
+  // Multiplier to quantify how discriminative object motion is
+  // (larger motions are more discriminative).
+  // Note: Independent from temporal scale.
+  float background_discrimination = curr_pos.background_discrimination();
+  if (valid_background_model) {
+    background_discrimination =
+        LinearRamp(object_translation.Norm(),
+                   options_.background_discrimination_low_level(),
+                   options_.background_discrimination_high_level());
+  }
+
+  // Score weights from motion estimation to determine set of inliers.
+  std::vector inlier_weights;
+  std::vector inlier_density;
+
+  // Compute grid positions for each vector to determine density of inliers.
+  std::vector grid_positions;
+  ComputeGridPositions(top_left, bottom_right, vectors,
+                       &grid_positions);
+
+  // Continued inlier fraction denotes amount of spatial occlusion. Very low
+  // values indicate that we are in very difficult tracking territory.
+  int continued_inliers = 0;
+  int swapped_inliers = 0;
+  float kinetic_average = 0;
+  float motion_inliers = 0;
+  ScoreAndRecordInliers(curr_pos, vectors, grid_positions, prior_weights,
+                        weights, background_discrimination, next_pos,
+                        &inlier_weights, &inlier_density, &continued_inliers,
+                        &swapped_inliers, &motion_inliers, &kinetic_average);
+
+  const int num_prev_inliers = curr_pos.inlier_ids_size();
+  const int num_inliers = next_pos->inlier_ids_size();
+  // Must be in [0, 1].
+  const float continued_inlier_fraction =
+      num_prev_inliers == 0 ? 1.0f
+                            : continued_inliers * 1.0f / num_prev_inliers;
+  // Within [0, M], where M is maximum number of features. Values of > 1
+  // indicate a significant number of inliers were outliers in previous frame.
+  const float swapped_inlier_fraction =
+      num_prev_inliers == 0 ? 0.0f : swapped_inliers * 1.0f / num_prev_inliers;
+
+  if (curr_pos.has_pnp_homography()) {
+    MEASURE_TIME << "Estimate pnp homography.";
+
+    // Use IRLS homography `inlier_weights` to determine inliers / outliers.
+    // The rationale is: solving homography transform is 20X faster than solving
+    // perspective transform (0.05ms vs 1ms). So, we use 5 iterations of
+    // reweighted homography to filter out outliers first. And only use inliers
+    // to solve for perspective.
+    if (!EstimatePnpHomography(curr_pos, vectors, inlier_weights, domain_x,
+                               domain_y, &pnp_homography)) {
+      // Here, we can either cancel tracking or apply homography or even
+      // translation as our best guess. But since some specific use cases of
+      // pnp tracking (for example Augmented Images) prefer high precision
+      // over high recall, we choose to cancel tracking once and for all.
+      VLOG(1) << "Not enough motion vectors to solve pnp. 
Cancel tracking.";
+      next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
+      return;
+    }
+  }
+
+  // Compute disparity.
+  if (num_good_inits > 0) {
+    next_pos->set_motion_disparity(ComputeMotionDisparity(
+        curr_pos, irls_scale, continued_inliers, num_inliers,
+        valid_background_model ? object_translation : prev_object_motion));
+  } else {
+    // No good features, signal error.
+    next_pos->set_motion_disparity(1.0);
+  }
+
+  VLOG(1) << "Motion inliers: " << motion_inliers
+          << ", continued inliers: " << continued_inliers
+          << ", continued ratio: " << continued_inlier_fraction
+          << ", swapped fraction: " << swapped_inlier_fraction
+          << ", motion disparity: " << next_pos->motion_disparity();
+
+  if (options_.cancel_tracking_with_occlusion_options().activated() &&
+      curr_pos.track_status() != MotionBoxState::BOX_DUPLICATED &&
+      continued_inlier_fraction <
+          options_.cancel_tracking_with_occlusion_options()
+              .min_motion_continuity()) {
+    next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
+    LOG(INFO) << "Occlusion detected. continued_inlier_fraction: "
+              << continued_inlier_fraction << " too low. Stop tracking";
+    return;
+  }
+
+  // Force reset of state when inlier continuity is severely violated,
+  // disparity maxes out or a significant number of inliers were outliers in
+  // the previous frame.
+  if (std::max(continued_inlier_fraction, motion_inliers) < 0.15f ||
+      std::abs(next_pos->motion_disparity()) >= 1.0f ||
+      swapped_inlier_fraction >= 2.5) {
+    VLOG(1) << "Track error, state reset.";
+    // Bad tracking error occurred.
+    // Current set of inliers is not reliable.
+    ClearInlierState(next_pos);
+    next_pos->set_motion_disparity(1.0f);
+    inlier_weights.assign(inlier_weights.size(), 0);
+
+    // Reuse previous motion and discrimination.
+ object_translation = prev_object_motion; + background_discrimination = curr_pos.background_discrimination(); + } + + next_pos->set_inlier_sum( + std::accumulate(inlier_weights.begin(), inlier_weights.end(), 0.0f)); + if (history.empty()) { + // Assign full confidence on first frame, otherwise all other stats + // are zero and there is no way to compute. + next_pos->set_tracking_confidence(1.0f); + LOG(INFO) << "no history. confidence : 1.0"; + } else { + next_pos->set_tracking_confidence(ComputeTrackingConfidence(*next_pos)); + VLOG(1) << "confidence: " << next_pos->tracking_confidence(); + } + next_pos->set_background_discrimination(background_discrimination); + + // Slowly decay current kinetic energy. Blend with current measurement based + // on disparity (high = use previous value, low = use current one). + next_pos->set_kinetic_energy(std::max( + options_.kinetic_energy_decay() * curr_pos.kinetic_energy(), + kinetic_average * (1.0f - std::abs(next_pos->motion_disparity())))); + + float inlier_max = curr_pos.inlier_sum(); + int num_tracked_frames_in_history = 0; + for (auto entry : history) { + inlier_max = std::max(inlier_max, entry->inlier_sum()); + if (entry->track_status() == MotionBoxState::BOX_TRACKED) { + num_tracked_frames_in_history++; + } + } + + const float inlier_ratio = + inlier_max > 0 ? (next_pos->inlier_sum() / (inlier_max + 1e-3f)) : 0.0f; + + next_pos->set_inlier_ratio(inlier_ratio); + + const bool is_perfect_fit = inlier_ratio > 0.85f && inlier_ratio < 1.15f; + + // num_tracked_frames_in_history has to be greater than 1, since the first + // frame is marked as BOX_TRACKED in ResetAtFrame. 
+ if (options_.cancel_tracking_with_occlusion_options().activated() && + curr_pos.track_status() != MotionBoxState::BOX_DUPLICATED && + num_tracked_frames_in_history > 1 && + inlier_ratio < options_.cancel_tracking_with_occlusion_options() + .min_inlier_ratio()) { + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + LOG(INFO) << "inlier_ratio: " << inlier_ratio + << " too small. Stop tracking. inlier_max: " << inlier_max + << ". length in history: " << history.size(); + return; + } + + // Blend measured object motion based on motion disparity, i.e. the more the + // measured and previous motion agree, the less the smoothing. This is to + // propagate the box in the direction of the previous object motion + // in case tracking has been lost. + // Allow new measurements to propagate slowly. + if (valid_background_model && !is_perfect_fit) { + // Always move some fraction in the direction of the measured object + // even if deemed in disagreement with previous motion. + const float kMinimumBlend = 0.2f; + object_translation = Lerp( + object_translation, prev_object_motion, + std::min(1.0f - kMinimumBlend, std::abs(next_pos->motion_disparity()))); + } + + if (curr_pos.has_pnp_homography()) { + ApplyObjectMotionPerspectively(curr_pos, pnp_homography, domain_x, domain_y, + next_pos); + } else { + ApplyObjectMotion(curr_pos, object_translation, object_similarity, + object_homography, options_, next_pos); + + ApplyCameraTrackingDegrees(curr_pos, motion_frame.background_model, + options_, Vector2_f(domain_x, domain_y), + next_pos); + } + + if (next_pos->has_quad() && + !ObjectMotionValidator::IsValidQuad(next_pos->quad())) { + LOG(ERROR) << "Quad is not convex. Cancel tracking."; + next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED); + return; + } + + // Storing pre-comp weights. + const std::vector& internal_weights = + options_.use_post_estimation_weights_for_state() ? 
inlier_weights + : prior_weights; + + StoreInternalState(vectors, internal_weights, aspect_ratio, + next_pos->mutable_internal()); + + // Compute center of inliers in next frame and change in scale for inliers. + Vector2_f inlier_center; + Vector2_f inlier_extent; + float min_inlier_weight = 2.0f; // Only update inlier_center if more + // inliers than specified are found. + ComputeInlierCenterAndExtent(vectors, inlier_weights, inlier_density, + *next_pos, &min_inlier_weight, &inlier_center, + &inlier_extent); + + // Determine difference to previous estimate. + Vector2_f prev_inlier_center(InlierCenter(curr_pos)); + const float rel_inlier_center_diff = + (inlier_center - prev_inlier_center).Norm() / + MotionBoxSize(curr_pos).Norm(); + + // Smooth with previous location, based on relative inlier difference. + // A difference of 1.0 is mapped to a weight of 1.0 (total outlier). + // Blend weight is maxed out at .6f to always allow measurements to propagate + // over time (assuming high motion discrimination). + const float center_blend = + std::min(Lerp(0.95f, 0.6f, background_discrimination), + rel_inlier_center_diff) * + curr_pos.prior_weight(); + inlier_center = Lerp(inlier_center, prev_inlier_center, center_blend); + + next_pos->set_inlier_center_x(inlier_center.x()); + next_pos->set_inlier_center_y(inlier_center.y()); + + // Update extent only when sufficient inliers are present. + // TODO: This is too hacky, evaluate. + if (min_inlier_weight > 30) { + Vector2_f prev_inlier_extent(curr_pos.inlier_width(), + curr_pos.inlier_height()); + // Blend with previous extent based on prior and discrimination. 
+ inlier_extent = Lerp( + inlier_extent, prev_inlier_extent, + curr_pos.prior_weight() * Lerp(1.0, 0.85, background_discrimination)); + next_pos->set_inlier_width(inlier_extent.x()); + next_pos->set_inlier_height(inlier_extent.y()); + } + + VLOG(1) << "Inlier extent " << next_pos->inlier_width() << " , " + << next_pos->inlier_height(); + + // Spring force applied to the inlier_center is modulated by the background + // discrimination. Motivation: Low background discrimination leads to inlier + // center more biased towards previous result due to update weight being + // tampered down. Always apply a minimum force. + // TODO: During challenging (low inlier) situations this can + // save the lock onto objects. Cook up a condition to set min spring force to + // 0.25 or so. + constexpr float kMinSpringForceFraction = 0.0; + ApplySpringForce(inlier_center, options_.inlier_center_relative_distance(), + std::min(1.0f, options_.inlier_spring_force() * + std::max(kMinSpringForceFraction, + background_discrimination)), + next_pos); + + if (options_.compute_spatial_prior()) { + // Blend based on object multiplier using high prior weight for low + // multipliers. + // Magic update numbers, prior is not important for tracking, only for + // visualization purposes. + const float prior_weight = Lerp(0.98, 0.85, background_discrimination); + ComputeSpatialPrior(true, true, prior_weight, next_pos); + } + + // Update velocity. + float velocity_update_weight = + is_perfect_fit + ? 0.0f + : (options_.velocity_update_weight() * curr_pos.prior_weight()); + // Computed object motion is completely random when background model is + // invalid. Use previous motion in this case. + if (!valid_background_model) { + velocity_update_weight = 1.0f; + } + + next_pos->set_dx( + Lerp(object_translation.x(), curr_pos.dx(), velocity_update_weight)); + next_pos->set_dy( + Lerp(object_translation.y(), curr_pos.dy(), velocity_update_weight)); + + // Update prior. 
+ if (valid_background_model) { + next_pos->set_prior_weight(std::min( + 1.0f, curr_pos.prior_weight() + options_.prior_weight_increase())); + } else { + next_pos->set_prior_weight(std::max( + 0.0f, curr_pos.prior_weight() - options_.prior_weight_increase())); + } + + next_pos->set_track_status(MotionBoxState::BOX_TRACKED); +} + +void MotionVectorFrameFromTrackingData(const TrackingData& tracking_data, + MotionVectorFrame* motion_vector_frame) { + CHECK(motion_vector_frame != nullptr); + + const auto& motion_data = tracking_data.motion_data(); + float aspect_ratio = tracking_data.frame_aspect(); + if (aspect_ratio < 0.1 || aspect_ratio > 10.0f) { + LOG(ERROR) << "Aspect ratio : " << aspect_ratio << " is out of bounds. " + << "Resetting to 1.0."; + aspect_ratio = 1.0f; + } + + float scale_x, scale_y; + // Normalize longest dimension to 1 under aspect ratio preserving scaling. + ScaleFromAspect(aspect_ratio, false, &scale_x, &scale_y); + + scale_x /= tracking_data.domain_width(); + scale_y /= tracking_data.domain_height(); + + const bool use_background_model = + !(tracking_data.frame_flags() & TrackingData::FLAG_BACKGROUND_UNSTABLE); + + Homography homog_scale = HomographyAdapter::Embed( + AffineAdapter::FromArgs(0, 0, scale_x, 0, 0, scale_y)); + + Homography inv_homog_scale = HomographyAdapter::Embed( + AffineAdapter::FromArgs(0, 0, 1.0f / scale_x, 0, 0, 1.0f / scale_y)); + + // Might be just the identity if not set. 
+ const Homography background_model = tracking_data.background_model(); + const Homography background_model_scaled = + ModelCompose3(homog_scale, background_model, inv_homog_scale); + + motion_vector_frame->background_model.CopyFrom(background_model_scaled); + motion_vector_frame->valid_background_model = use_background_model; + motion_vector_frame->is_duplicated = + tracking_data.frame_flags() & TrackingData::FLAG_DUPLICATED; + motion_vector_frame->is_chunk_boundary = + tracking_data.frame_flags() & TrackingData::FLAG_CHUNK_BOUNDARY; + motion_vector_frame->aspect_ratio = tracking_data.frame_aspect(); + motion_vector_frame->motion_vectors.reserve(motion_data.row_indices_size()); + + motion_vector_frame->motion_vectors.clear(); + const bool long_tracks = motion_data.track_id_size() > 0; + + for (int c = 0; c < motion_data.col_starts_size() - 1; ++c) { + const float x = c; + const float scaled_x = x * scale_x; + + for (int r = motion_data.col_starts(c), + r_end = motion_data.col_starts(c + 1); + r < r_end; ++r) { + MotionVector motion_vector; + + const float y = motion_data.row_indices(r); + const float scaled_y = y * scale_y; + + const float dx = motion_data.vector_data(2 * r); + const float dy = motion_data.vector_data(2 * r + 1); + + if (use_background_model) { + Vector2_f loc(x, y); + Vector2_f background_motion = + HomographyAdapter::TransformPoint(background_model, loc) - loc; + motion_vector.background = Vector2_f(background_motion.x() * scale_x, + background_motion.y() * scale_y); + } + + motion_vector.pos = Vector2_f(scaled_x, scaled_y); + motion_vector.object = Vector2_f(dx * scale_x, dy * scale_y); + + if (long_tracks) { + motion_vector.track_id = motion_data.track_id(r); + } + motion_vector_frame->motion_vectors.push_back(motion_vector); + } + } +} + +void FeatureAndDescriptorFromTrackingData( + const TrackingData& tracking_data, std::vector* features, + std::vector* descriptors) { + const auto& motion_data = tracking_data.motion_data(); + float 
aspect_ratio = tracking_data.frame_aspect(); + if (aspect_ratio < 0.1 || aspect_ratio > 10.0f) { + LOG(ERROR) << "Aspect ratio : " << aspect_ratio << " is out of bounds. " + << "Resetting to 1.0."; + aspect_ratio = 1.0f; + } + + if (motion_data.feature_descriptors_size() == 0) { + LOG(WARNING) << "Feature descriptors not exist"; + return; + } + + float scale_x, scale_y; + // Normalize longest dimension to 1 under aspect ratio preserving scaling. + ScaleFromAspect(aspect_ratio, false, &scale_x, &scale_y); + scale_x /= tracking_data.domain_width(); + scale_y /= tracking_data.domain_height(); + + features->clear(); + descriptors->clear(); + + for (int c = 0; c < motion_data.col_starts_size() - 1; ++c) { + const float x = c; + const float scaled_x = x * scale_x; + + for (int r = motion_data.col_starts(c), + r_end = motion_data.col_starts(c + 1); + r < r_end; ++r) { + const std::string& descriptor = motion_data.feature_descriptors(r).data(); + + if (absl::c_all_of(descriptor, [](char c) { return c == 0; })) { + continue; + } + + const float y = motion_data.row_indices(r); + const float scaled_y = y * scale_y; + + features->emplace_back(scaled_x, scaled_y); + descriptors->emplace_back(descriptor); + } + } +} + +void InvertMotionVectorFrame(const MotionVectorFrame& input, + MotionVectorFrame* output) { + CHECK(output != nullptr); + + output->background_model.CopyFrom(ModelInvert(input.background_model)); + output->valid_background_model = input.valid_background_model; + output->is_duplicated = input.is_duplicated; + output->is_chunk_boundary = input.is_chunk_boundary; + output->duration_ms = input.duration_ms; + output->aspect_ratio = input.aspect_ratio; + output->motion_vectors.clear(); + output->motion_vectors.reserve(input.motion_vectors.size()); + + const float aspect_ratio = input.aspect_ratio; + float domain_x = 1.0f; + float domain_y = 1.0f; + ScaleFromAspect(aspect_ratio, false, &domain_x, &domain_y); + + // Explicit copy. 
+ for (auto motion_vec : input.motion_vectors) { + motion_vec.background *= -1.0f; + motion_vec.object *= -1.0f; + + motion_vec.pos -= motion_vec.background + motion_vec.object; + + // Inverted vector might be out of bound. + if (motion_vec.pos.x() < 0.0f || motion_vec.pos.x() > domain_x || + motion_vec.pos.y() < 0.0f || motion_vec.pos.y() > domain_y) { + continue; + } + + // Approximately 40 - 60% of all inserts happen to be at the end. + if (output->motion_vectors.empty() || + MotionVectorComparator()(output->motion_vectors.back(), motion_vec)) { + output->motion_vectors.push_back(motion_vec); + } else { + output->motion_vectors.insert( + std::lower_bound(output->motion_vectors.begin(), + output->motion_vectors.end(), motion_vec, + MotionVectorComparator()), + motion_vec); + } + } +} + +float TrackingDataDurationMs(const TrackingDataChunk::Item& item) { + return (item.timestamp_usec() - item.prev_timestamp_usec()) * 1e-3f; +} + +void GetFeatureIndicesWithinBox(const std::vector& features, + const MotionBoxState& box_state, + const Vector2_f& box_scaling, + float max_enlarge_size, int min_num_features, + std::vector* inlier_indices) { + CHECK(inlier_indices); + inlier_indices->clear(); + + if (features.empty()) return; + std::array box_lines; + if (!MotionBoxLines(box_state, box_scaling, &box_lines)) { + LOG(ERROR) << "Error in computing MotionBoxLines."; + return; + } + + // If the box size isn't big enough to cover sufficient features to + // reacquire the box, the following code will try to iteratively enlarge the + // box size by half of 'max_enlarge_size' to include more features, but + // maximimum twice. 
+ float distance_threshold = 0.0f; + int inliers_count = 0; + std::vector chosen(features.size(), false); + std::vector signed_distance(features.size()); + + for (int j = 0; j < features.size(); ++j) { + float max_dist = std::numeric_limits::lowest(); + for (const Vector3_f& line : box_lines) { + float dist = + line.DotProd(Vector3_f(features[j].x(), features[j].y(), 1.0f)); + if (dist > max_enlarge_size) { + max_dist = dist; + break; + } + max_dist = std::max(dist, max_dist); + } + + signed_distance[j] = max_dist; + if (signed_distance[j] < distance_threshold) { + ++inliers_count; + chosen[j] = true; + inlier_indices->push_back(j); + } + } + + const float box_enlarge_step = max_enlarge_size * 0.5f; + while (inliers_count < min_num_features) { + distance_threshold += box_enlarge_step; + if (distance_threshold > max_enlarge_size) break; + for (int j = 0; j < features.size(); ++j) { + if (chosen[j]) continue; + if (signed_distance[j] < distance_threshold) { + ++inliers_count; + chosen[j] = true; + inlier_indices->push_back(j); + } + } + } +} + +} // namespace mediapipe. diff --git a/mediapipe/util/tracking/tracking.h b/mediapipe/util/tracking/tracking.h new file mode 100644 index 000000000..782eae47f --- /dev/null +++ b/mediapipe/util/tracking/tracking.h @@ -0,0 +1,643 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MEDIAPIPE_UTIL_TRACKING_TRACKING_H_ +#define MEDIAPIPE_UTIL_TRACKING_TRACKING_H_ + +// Performs tracking via rectangular regions (MotionBoxes) from pre-initialized +// positions, using metadata from tracked features (TrackingData converted to +// MotionVectorFrames), forward and backward in time. + +#include +#include +#include +#include +#include + +#include "mediapipe/framework/port/vector.h" +#include "mediapipe/util/tracking/flow_packager.pb.h" +#include "mediapipe/util/tracking/motion_models.h" +#include "mediapipe/util/tracking/motion_models.pb.h" +#include "mediapipe/util/tracking/tracking.pb.h" + +namespace mediapipe { + +// Useful helper functions. +// +// Clamps values to be within interval [left, right]. +inline float Clamp(float value, float left, float right) { + return value < left ? left : (value > right ? right : value); +} + +// Standard linear interpolation function. +template +T Lerp(T a, T b, float alpha) { + return static_cast(a * (1.0f - alpha) + b * alpha); +} + +// Approximates sigmoid function with a linear ramp, mapping +// x <= lhs to 0, x >= rhs to 1 (for lhs < rhs) linear in between interval +// [lhs, rhs]. If lhs > rhs, roles are reversed. 
+inline float LinearRamp(float value, float lhs, float rhs) { + return Clamp((value - lhs) / (rhs - lhs), 0, 1); +} + +inline Vector2_f MotionBoxPosition(const MotionBoxState& state) { + return Vector2_f(state.pos_x(), state.pos_y()); +} + +inline void SetMotionBoxPosition(const Vector2_f& pos, MotionBoxState* state) { + state->set_pos_x(pos.x()); + state->set_pos_y(pos.y()); +} + +// TODO: this needs to be changed for quad +inline Vector2_f MotionBoxSize(const MotionBoxState& state) { + return Vector2_f(state.width(), state.height()); +} + +inline void SetMotionBoxSize(const Vector2_f& size, MotionBoxState* state) { + state->set_width(size.x()); + state->set_height(size.y()); +} + +inline Vector2_f MotionBoxCenter(const MotionBoxState& state) { + return MotionBoxPosition(state) + 0.5f * MotionBoxSize(state); +} + +inline Vector2_f InlierCenter(const MotionBoxState& state) { + return Vector2_f(state.inlier_center_x(), state.inlier_center_y()); +} + +inline Vector2_f MotionBoxVelocity(const MotionBoxState& state) { + return Vector2_f(state.dx(), state.dy()); +} + +inline void SetMotionBoxVelocity(const Vector2_f& velo, MotionBoxState* state) { + state->set_dx(velo.x()); + state->set_dy(velo.y()); +} + +// Derive normalization factor from image aspect ratio so that the scale for the +// longer edge is 1. scale will be reversed if `invert` is true. +void ScaleFromAspect(float aspect, bool invert, float* scale_x, float* scale_y); + +// Returns 4 corners of the MotionBox as top_left, bottom_left, bottom_right +// and top_right. Applies 2D scaling prior to rotation, which is necessary to +// preserve orthogonality of the rotation if the scaling is not isotropic. +std::array MotionBoxCorners( + const MotionBoxState& state, + const Vector2_f& scaling = Vector2_f(1.0f, 1.0f)); + +// Computes corresponding line equations for MotionBoxCorners. +// Output line equations on 4 sides. 
+// Returns true if box is normal, false if we encounter abnormal box which +// leads to numerical problems. +// Applies 2D scaling prior to rotation, which is necessary to +// preserve orthogonality of the rotation if the scaling is not isotropic. +bool MotionBoxLines(const MotionBoxState& state, const Vector2_f& scaling, + std::array* box_lines); + +// Returns top-left and bottom right corner of the bounding box +// of the MotionBoxState. +void MotionBoxBoundingBox(const MotionBoxState& state, Vector2_f* top_left, + Vector2_f* bottom_right); + +// Adds all inliers from state to the inlier map (as id, score) tuple. +// If id already exist, score is updated to be the maximum of current and +// existing score. +inline void MotionBoxInliers(const MotionBoxState& state, + std::unordered_map* inliers) { + CHECK(inliers); + const int num_inliers = state.inlier_ids_size(); + DCHECK_EQ(num_inliers, state.inlier_length_size()); + + for (int k = 0; k < num_inliers; ++k) { + (*inliers)[state.inlier_ids(k)] = + std::max((*inliers)[state.inlier_ids(k)], state.inlier_length(k)); + } +} + +// Adds all outliers from state to the outlier map. +inline void MotionBoxOutliers(const MotionBoxState& state, + std::unordered_set* outliers) { + for (int id : state.outlier_ids()) { + outliers->insert(id); + } +} + +// Returns inlier locations from state (normalized in [0, 1] domain). +void MotionBoxInlierLocations(const MotionBoxState& state, + std::vector* inlier_pos); +// Same for outlier positions. +void MotionBoxOutlierLocations(const MotionBoxState& state, + std::vector* outlier_pos); + +// Get corners of rotated rectangle. Note that the quad component in +// MotionBoxState is not used in this function. Only the rotated rectangle +// is used. 
+// Inputs: +// -- state: the MotionBoxState where we extract the rotated rectangle +// -- scaling: additional scaling we apply on x and y axis +// Output: +// corners in counter-clockwise order +std::array GetCornersOfRotatedRect(const MotionBoxState& state, + const Vector2_f& scaling); + +// Use the position, width, and height in MotionBoxState to initialize +// the quad. Only use it when you want to get homography for tracking. +void InitializeQuadInMotionBoxState(MotionBoxState* state); + +// Initializes inliers and outliers related fields in MotionBoxState from +// TrackingData. The box or quad position will be read from `state` so they need +// to be set beforehand. +void InitializeInliersOutliersInMotionBoxState(const TrackingData& tracking, + MotionBoxState* state); + +// Initializes pnp_homography field in MotionBoxState using perspective +// transform between a physical rectangle with specified aspect ratio and a +// screen quad. +void InitializePnpHomographyInMotionBoxState( + const TrackingData& tracking, const TrackStepOptions& track_step_options, + MotionBoxState* state); + +// Represents the motion of a feature at pos between frames, differentiating +// object from background motion (supplied via a MotionVectorFrame). +struct MotionVector { + MotionVector() : pos(0, 0), background(0, 0), object(0, 0) {} + + MotionVector(const Vector2_f& pos_, const Vector2_f& background_, + const Vector2_f& object_) + : pos(pos_), background(background_), object(object_) {} + + Vector2_f Location() const { return pos; } + Vector2_f MatchLocation() const { return pos + background + object; } + Vector2_f Motion() const { return background + object; } + + // Position of the feature in normalized domain [0, 1]. + Vector2_f pos; + // Motion due to background (i.e. camera motion). + Vector2_f background; + // Motion due to foreground (i.e. object motion in addition to background). + // If feature belong to background, object motion is nearly zero. 
+ Vector2_f object; + + int track_id = -1; + + // Returns the MotionVector stored in the internal state at index. + static MotionVector FromInternalState(const MotionBoxInternalState& internal, + int index); +}; + +constexpr float kTrackingDefaultFps = 30.0; + +// Holds motion vectors and background model for each frame. +// Note: Specified in the aspect preserving domain under uniform scaling, +// longest dimension normalized to 1, i.e. if aspect_ratio >= 1, width is +// normalized to 1 otherwise height is normalized to 1. +struct MotionVectorFrame { + std::vector motion_vectors; + + Homography background_model; + bool valid_background_model = true; + bool is_duplicated = false; // Set if frame is duplicated w.r.t. + // previous one. + bool is_chunk_boundary = false; // Set if this is the first frame in a chunk. + + float duration_ms = 1000.0f / kTrackingDefaultFps; + + // Aspect ratio (w/h) of the original frame. + float aspect_ratio = 1.0f; +}; + +// Transforms TrackingData to MotionVectorFrame, ready to be used by tracking +// algorithm (so the MotionVectorFrame data is denormalized). +void MotionVectorFrameFromTrackingData(const TrackingData& tracking_data, + MotionVectorFrame* motion_vector_frame); + +// Transform TrackingData to feature positions and descriptors, ready to be used +// by detection (re-acquisition) algorithm (so the "features" is denomalized). +// Descriptors with all 0s will be discarded. +void FeatureAndDescriptorFromTrackingData( + const TrackingData& tracking_data, std::vector* features, + std::vector* descriptors); + +// Inverts MotionVectorFrame (by default defined as motion from current to +// previous frame) to hold motion from previous to current frame. +void InvertMotionVectorFrame(const MotionVectorFrame& input, + MotionVectorFrame* output); + +// Returns duration in ms for this chunk item. +float TrackingDataDurationMs(const TrackingDataChunk::Item& item); + +// Returns feature indices that are within the given box. 
If the box size isn't +// big enough to cover sufficient features (i.e., min_num_features), this will +// iteratively enlarge the box size (up to max_enlarge_size) to include more +// features. The argument box_scaling is used in MotionBoxLines() to get +// properly scaled box corners. Note: box_scaling and max_enlarge_size need to +// be in normalized image space. +// TODO: Add unit test. +void GetFeatureIndicesWithinBox(const std::vector& features, + const MotionBoxState& box_state, + const Vector2_f& box_scaling, + float max_enlarge_size, int min_num_features, + std::vector* inlier_indices); + +// Represents a moving box over time. Initial position is supplied via +// ResetAtFrame, and subsequent positions for previous and next frames are +// determined via tracking by TrackStep method. +// Example usage: +// // Assuming metadata is available: vector mvf; +// MotionBoxState box_state; +// // Set to center 20%. +// box_state.set_pos_x(0.4); +// box_state.set_pos_y(0.4); +// box_state.set_width(0.2); +// box_state.set_height(0.2); +// +// // Initialize first position at frame 5. +// MotionBox motion_box(TrackStepOptions()); +// motion_box.ResetAtFrame(4, box_state); +// // Track 4 frames backward and forward in time. +// for (int i = 0; i < 4; ++i) { +// // Tracking steps need to be called contiguously, as otherwise no +// // prior location for the track is present and TrackStep will fail. +// // Backward. +// motion_box.TrackStep(4 - i, mvf[4 -i], false, nullptr); +// // Get position -> consume for display, etc. +// motion_box.StateAtFrame(4 - i); +// +// // Forward. +// motion_box.TrackStep(4 + i, mvf[4 -i], true, nullptr); +// // Get position -> consume. +// motion_box.StateAtFrame(4 + i); +// } +class MotionBox { + public: + explicit MotionBox(const TrackStepOptions& track_step_options) + : options_(track_step_options) {} + + MotionBox() = default; + + // Sets and overwrites MotionBoxState at specified frame. Use to supply + // initial position. 
+ void ResetAtFrame(int frame, const MotionBoxState& state); + + // Tracks MotionBox from state at from_frame either forward or backward in + // time, based on the passed MotionVectorFrame. (MotionVectorFrame has to + // correspond to requested tracking direction, this is not checked against). + // Returns true if tracking was successful. + // Note: It is assumed that from_frame already has a valid location, either + // via ResetAtFrame or previous successful execution of TrackStep. That is + // TrackStep needs to be called contiguously from a initialized position + // via ResetFrame. Otherwise no prior location for the track is present (at + // from_frame) and TrackStep will fail (return false). + bool TrackStep(int from_frame, const MotionVectorFrame& motion_vectors, + bool forward); + + MotionBoxState StateAtFrame(int frame) const { + if (frame < queue_start_ || + frame >= queue_start_ + static_cast(states_.size())) { + LOG(ERROR) << "Requesting state at unknown frame " << frame + << ". 
Returning UNTRACKED."; + MotionBoxState invalid; + invalid.set_track_status(MotionBoxState::BOX_UNTRACKED); + return invalid; + } else { + MotionBoxState result = states_[frame - queue_start_]; + if (!options_.return_internal_state()) { + result.clear_internal(); + } + return result; + } + } + + MotionBoxState* MutableStateAtFrame(int frame) { + if (frame < queue_start_ || frame >= queue_start_ + states_.size()) { + return NULL; + } else { + return &states_[frame - queue_start_]; + } + } + + bool TrackableFromFrame(int frame) const { + return StateAtFrame(frame).track_status() >= MotionBoxState::BOX_TRACKED; + } + + void set_start_track(int frame) { start_track_ = frame; } + int start_track() const { return start_track_; } + void set_end_track(int frame) { end_track_ = frame; } + int end_track() const { return end_track_; } + + void TrimFront(const int cache_size) { + int trim_count = states_.size() - cache_size; + if (trim_count > 0) { + queue_start_ += trim_count; + while (trim_count-- > 0) { + states_.pop_front(); + } + } + } + + void TrimBack(const int cache_size) { + int trim_count = states_.size() - cache_size; + if (trim_count > 0) { + while (trim_count-- > 0) { + states_.pop_back(); + } + } + } + + // If this variable is set to true, then TrackStep would print warning + // messages when tracking is failed. + // Default value is true and is set in tracking.cc. + static bool print_motion_box_warnings_; + + private: + // Determines next position from curr_pos based on tracking data in + // motion_vectors. Also receives history of the last N positions. + void TrackStepImplDeNormalized( + int frome_frame, const MotionBoxState& curr_pos, + const MotionVectorFrame& motion_vectors, + const std::vector& history, + MotionBoxState* next_pos) const; + + // Pre-normalization wrapper for above function. De-normalizes domain + // to aspect preserving domain and velocity to current frame period. 
+ void TrackStepImpl(int from_frame, const MotionBoxState& curr_pos, + const MotionVectorFrame& motion_frame, + const std::vector& history, + MotionBoxState* next_pos) const; + + // Implementation functions for above TrackStepImpl. + // Returns bounding box for start position and the expansion magnitude + // (normalized) that was applied. + void GetStartPosition(const MotionBoxState& curr_pos, float aspect_ratio, + float* expand_mag, Vector2_f* top_left, + Vector2_f* bottom_right) const; + + // Outputs spatial sigma in x and y for spatial weighting. + // Pass current box_state and inverse box domain size. + void GetSpatialGaussWeights(const MotionBoxState& box_state, + const Vector2_f& inv_box_domain, + float* spatial_gauss_x, + float* spatial_gauss_y) const; + + // Outputs subset of motion_vectors that are within the specified domain + // (top_left to bottom_right). Only searches over the range specified via + // start and end idx. + // Each vector is weighted based on gaussian proximity, similar motion, + // track continuity, etc. which forms the prior weight of each feature. + // Features are binned into a grid of fixed dimension for density analysis. + // Also output number of vectors with good prior weights (> 0.1), and number + // of continued inliers. + // Returns true on success, false on failure. When it returns false, the + // output values are not reliable. + bool GetVectorsAndWeights( + const std::vector& motion_vectors, int start_idx, + int end_idx, const Vector2_f& top_left, const Vector2_f& bottom_right, + const MotionBoxState& box_state, bool valid_background_model, + bool is_chunk_boundary, + float temporal_scale, // Scale for velocity from standard frame period. + float expand_mag, const std::vector& history, + std::vector* vectors, + std::vector* prior_weights, int* number_of_good_prior, + int* number_of_cont_inliers) const; + + // Initializes weights by performing multiple ransac rounds from vectors. 
+ // Error is scaled by irls scale along parallel and orthogonal direction. + void TranslationIrlsInitialization( + const std::vector& vectors, + const Vector2_f& irls_scale, std::vector* weights) const; + + // Wrapper function, estimating object motion w.r.t. various degrees + // of freedom. + void EstimateObjectMotion( + const std::vector& motion_vectors, + const std::vector& prior_weights, int num_continued_inliers, + const Vector2_f& irls_scale, std::vector* weights, + Vector2_f* object_translation, LinearSimilarityModel* object_similarity, + Homography* object_homography) const; + + // Perform IRLS estimation of the passed motion_vector's object motion. + // Each vector is weighted by original_weight / estimation_error, where + // estimation_error is refined in each estimation round. + // Outputs final translation and resulting irls weights (1.0 / + // estimation_error, i.e. with prior bias). + void EstimateTranslation( + const std::vector& motion_vectors, + const std::vector& orig_weights, const Vector2_f& irls_scale, + std::vector* weights, Vector2_f* translation) const; + + // Same as above for similarity. Returns false on failure (numerical + // instability is most common case here). + bool EstimateSimilarity( + const std::vector& motion_vectors, + const std::vector& orig_weights, const Vector2_f& irls_scale, + std::vector* weights, LinearSimilarityModel* lin_sim) const; + + // Same as above for homograph. + bool EstimateHomography( + const std::vector& motion_vectors, + const std::vector& prior_weights, const Vector2_f& irls_scale, + std::vector* weights, Homography* object_homography) const; + + // Perform 6DoF perspective transform based homography estimation using + // motion_vector's object + background motion. + // weights are used to determine whether a vector is inlier or outliers. + // The perspective solver will exclude those vectors with weights smaller than + // kMaxOutlierWeight (0.1). 
+ bool EstimatePnpHomography( + const MotionBoxState& curr_pos, + const std::vector& motion_vectors, + const std::vector& weights, float domain_x, float domain_y, + Homography* pnp_homography) const; + + // Apply pre-computed perspective transform based homography to the next pos. + void ApplyObjectMotionPerspectively(const MotionBoxState& curr_pos, + const Homography& pnp_homography, + float domain_x, float domain_y, + MotionBoxState* next_pos) const; + + // Scores every vector after translation estimation into inliers and outliers + // (based on post_estimation_weights, inlierness is a measure in [0, 1]), + // and records result in next_pos as well as returning inlierness per vector + // in inlier_weights. + // Also computes the following statistics: + // - inlier_density: Local density for each inlier, i.e. measure of how many + // other inliers are close to that point. In [0, 1]. + // - continued_inliers: Number of inliers that continue to be present already + // in curr_pos (same track id) + // - swapped_inliers: Number of inliers that are outliers in curr_pos (same + // track id). + // - motion_inliers: Number of inliers of similar motion as previous state. + // This measure is complementary to above continued inliers + // in case object is moving significantly, in which case + // tracks tend to be short lived. + // - kinetic_average: Average object norm of all inliers weighted by + // pre_estimation_weights. 
+ void ScoreAndRecordInliers(const MotionBoxState& curr_pos, + const std::vector& vectors, + const std::vector& grid_positions, + const std::vector& pre_estimation_weights, + const std::vector& post_estimation_weights, + float background_discrimination, + MotionBoxState* next_pos, + std::vector* inlier_weights, + std::vector* inlier_density, + int* continued_inliers, int* swapped_inliers, + float* motion_inliers, + float* kinetic_average) const; + + // Computes motion disparity (in [0, 1]), that is how well does the current + // object motion agree with the previous object motion. + // 0 indicates perfect match, 1 indicates signicant difference. + float ComputeMotionDisparity(const MotionBoxState& curr_pos, + const Vector2_f& irls_scale, + float continued_inliers, int num_inliers, + const Vector2_f& object_motion) const; + + // Computes inlier center and extent (vector positions weighted by + // weights and density). + // Sets center inlier center, if inlier_weight is above min_inlier_sum, + // else to the Motion box center. + void ComputeInlierCenterAndExtent( + const std::vector& motion_vectors, + const std::vector& weights, const std::vector& density, + const MotionBoxState& state, float* min_inlier_sum, Vector2_f* center, + Vector2_f* extent) const; + + float ScaleEstimate(const std::vector& motion_vectors, + const std::vector& weights, float min_sum) const; + + // Applies spring force from box_state's position to center_of_interest, if + // difference is above rel_threshold. Correcting force equals difference + // above threshold times the spring_force coefficient. + void ApplySpringForce(const Vector2_f& center_of_interest, + const float rel_threshold, const float spring_force, + MotionBoxState* box_state) const; + + // Compute the tracking confidence and return the value. + // The confidence is a float value in [0, 1], with 0 being least confident, + // and 1 being most confident. 
+ float ComputeTrackingConfidence(const MotionBoxState& motion_box_state) const; + + private: + class ObjectMotionValidator { + public: + static bool IsValidSimilarity( + const LinearSimilarityModel& linear_similarity_model, float max_scale, + float max_rotation) { + SimilarityModel similarity_model = + LinearSimilarityAdapter::ToSimilarity(linear_similarity_model); + + if (similarity_model.scale() < 1.0f / max_scale || + similarity_model.scale() > max_scale || + std::abs(similarity_model.rotation()) > max_rotation) { + return false; + } + return true; + } + + static bool IsValidHomography(const Homography& homography, float max_scale, + float max_rotation) { + // Filter out abnormal homography. Otherwise the determinant of + // projected affine matrix will be negative. + if (!IsInverseStable(homography)) { + LOG(WARNING) << "Homography matrix is not stable."; + return false; + } + + LinearSimilarityModel similarity_model = + LinearSimilarityAdapter::ProjectFrom(homography, 1.0f, 1.0f); + return IsValidSimilarity(similarity_model, max_scale, max_rotation); + } + + // Check if it is a convex quad. + static bool IsValidQuad(const MotionBoxState::Quad& quad) { + const int kQuadVerticesSize = 8; + CHECK_EQ(quad.vertices_size(), kQuadVerticesSize); + for (int a = 0; a < kQuadVerticesSize; a += 2) { + int b = (a + 2) % kQuadVerticesSize; + int c = (a - 2 + kQuadVerticesSize) % kQuadVerticesSize; + Vector2_f ab(quad.vertices(b) - quad.vertices(a), + quad.vertices(b + 1) - quad.vertices(a + 1)); + Vector2_f ac(quad.vertices(c) - quad.vertices(a), + quad.vertices(c + 1) - quad.vertices(a + 1)); + + // Since quad's vertices is defined in counter-clockwise manner, we only + // accept negative cross product. + if (ab.CrossProd(ac) >= 0) { + return false; + } + } + + return true; + } + + // Check if all the 4 corners of the quad are out of FOV. 
+ static bool IsQuadOutOfFov(const MotionBoxState::Quad& quad, + const Vector2_f& fov) { + const int kQuadVerticesSize = 8; + CHECK_EQ(quad.vertices_size(), kQuadVerticesSize); + bool too_far = true; + for (int j = 0; j < kQuadVerticesSize; j += 2) { + if (quad.vertices(j) < fov.x() && quad.vertices(j) > 0.0f && + quad.vertices(j + 1) < fov.y() && quad.vertices(j + 1) > 0.0f) { + too_far = false; + break; + } + } + return too_far; + } + }; + + class DistanceWeightsComputer { + public: + DistanceWeightsComputer(const MotionBoxState& initial_state, + const MotionBoxState& current_state, + const TrackStepOptions& options); + + // Compute distance weight based on input motion vector position. + float ComputeDistanceWeight(const MotionVector& test_vector); + + private: + Homography ComputeHomographyFromQuad(const MotionBoxState::Quad& src_quad, + const MotionBoxState::Quad& dst_quad); + + float cos_neg_a_; + float sin_neg_a_; + float spatial_gauss_x_; + float spatial_gauss_y_; + Vector2_f inv_box_domain_; + Vector2_f box_center_; + Vector2_f box_center_transformed_; + bool is_large_rotation_ = false; + Homography homography_; // homography from current box to initial box + TrackStepOptions::TrackingDegrees tracking_degrees_; + }; + + TrackStepOptions options_; + std::deque states_; + int queue_start_; + + int start_track_; + int end_track_; + + MotionBoxState initial_state_; +}; + +} // namespace mediapipe. + +#endif // MEDIAPIPE_UTIL_TRACKING_TRACKING_H_ diff --git a/mediapipe/util/tracking/tracking.proto b/mediapipe/util/tracking/tracking.proto new file mode 100644 index 000000000..ff003801f --- /dev/null +++ b/mediapipe/util/tracking/tracking.proto @@ -0,0 +1,415 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/util/tracking/motion_models.proto"; + +// Next tag: 38 +message MotionBoxState { + // Position (top-left corner) and fixed size of the current MotionBox, + // specified w.r.t. normalized domain (in [0, 1] along both dimensions). + optional float pos_x = 1; + optional float pos_y = 2; + optional float width = 3; + optional float height = 4; + + // Optional degrees of freedom; scale and rotation w.r.t. center of the box, + // i.e. [pos_x, pos_y] + 0.5 * [width, height]. + // To activate see TrackStepOptions::TrackingDegrees. + optional float scale = 5 [default = 1.0]; + optional float rotation = 30 [default = 0.0]; // in radians. + + message Quad { + // Vertex 0 is according to x_0 = vertices(0), y_0 = vertices(1) + // Vertex 1 is according to x_1 = vertices(2), y_1 = vertices(3) + // Vertex 2 is according to x_2 = vertices(4), y_2 = vertices(5) + // Vertex 3 is according to x_3 = vertices(6), y_3 = vertices(7) + // Order of vertices should be aligned in counter-clockwise manner + // 0---------3 + // | | + // | | + // 1---------2 + repeated float vertices = 1; + } + + // This field is only used when we try to track under + // TRACKING_DEGREE_OBJECT_PERSPECTIVE. + optional Quad quad = 34; + + // Aspect ratio (width / height) for the tracked rectangle in physical space. + optional float aspect_ratio = 35; + + // Whether we want this box to be potentially grouped with other boxes + // to track together. This is useful for tracking small boxes that lie + // on a plane. 
For example, when we detect a plane, + // track the plane, then all boxes within the plane can share the same + // homography transform. + optional bool request_grouping = 37 [default = false]; + + // For quad tracking using pnp solver, + // Whether we use perspective-n-points to track quad between frames. + // That mode requires: + // 1. The quad which is being tracked is an rectangle in the physical world. + // 2. The `asepct_ratio` field has to be set in MotionBoxState. + optional Homography pnp_homography = 36; + + // Object velocity in x and y, specified as normalized spatial unit per + // standard frame period (here calibrated w.r.t. kTrackingDefaultFps = 30 + // FPS), that is 33.3 ms. Object velocity refers to velocity after + // subtracting camera motion. + // If current frame period is 66.67 ms (i.e. 15 fps); actual velocity is + // obtained by multipling with a factor of 2. Similar for 60 fps factor + // is 0.5f. + // Standard frame period is chosen for legacy reasons to keep TrackStepOptions + // defaults. + optional float dx = 7; + optional float dy = 8; + + // Weighted average of object velocity magnitude of inlier points (expressed + // in normalized spatial units per standard frame period). + optional float kinetic_energy = 17; + + // Specifies how valid the prior was in the last step. + optional float prior_weight = 9; + + // Tracking status indicating result of tracking: + // UNTRACKED: Box can not be tracked + // (either out of bound or too many tracking failures). + // EMPTY: Box has size of <= 0 along at least on of its dimensions + // (collapsed). + // NO_FEATURES: No features found within the box, tracking is not possible. + // TRACKED: Successful tracking. + // DUPLICATED: Successful tracked, but duplicated from previous result as + // frame was duplicated. + // BOX_TRACKED_OUT_OF_BOUND: Successful tracked, out of bound from screen + // area. Will advance by camera motion. Only used for static objects. 
+ enum TrackStatus { + BOX_UNTRACKED = 0; + BOX_EMPTY = 1; + BOX_NO_FEATURES = 2; + BOX_TRACKED = 3; + BOX_DUPLICATED = 4; + BOX_TRACKED_OUT_OF_BOUND = 5; + } + + optional TrackStatus track_status = 10 [default = BOX_UNTRACKED]; + + // Spatial prior (presence of inliers, i.e. where is the object located within + // the box that is currently being tracked) as a pair of + // a) prior (in [0, 1]) and + // b) confidence (number of features converted to score within + // [0, 1]). + // Prior is defined over a grid of size spatial_prior_grid_size x + // spatial_prior_grid_size. + optional int32 spatial_prior_grid_size = 11 [default = 10]; + repeated float spatial_prior = 12 [packed = true]; + repeated float spatial_confidence = 13 [packed = true]; + + // Difference score between previous prior and current prior (in [0, 1]). + // Currently not used. + optional float prior_diff = 14; + + // Score determining how much predicted motion disagrees with measured motion. + // If measured motion deviates strongly from predicted motion, disparity is + // +/-1, if motion agrees with predicted motion, disparity is 0. + // Sign indicates measured motion is accelerating (> 0) + // or de-accelerating (< 0) w.r.t. predicted motion. + optional float motion_disparity = 15; + + // Score determining how discriminative estimated motion model is. + // In [0, 1] where 0 no discrimination w.r.t. background and 1 + // high discrimination. + optional float background_discrimination = 16; + + // Center of mass for inliers after tracking (center of feature that were used + // for motion estimation) + optional float inlier_center_x = 18; + optional float inlier_center_y = 19; + + // Approximate number of inliers (each features scores a zero [outlier] + // or one [inlier]). + optional float inlier_sum = 24; + + // Ratio of above inlier_sum to average inlier_sum across last states. + optional float inlier_ratio = 25; + + // Extent (width and height of inliers). 
+ optional float inlier_width = 22; + optional float inlier_height = 23; + + // Set of current inlier tracking ids. + repeated uint32 inlier_ids = 26 [packed = true]; + // Corresponding x,y coordinates for each inlier. + repeated uint32 inlier_id_match_pos = 31 [packed = true]; + // Corresponding inlier score (currently: length of inlier observed). + repeated uint32 inlier_length = 27 [packed = true]; + + // Set of outlier ids. + repeated uint32 outlier_ids = 28 [packed = true]; + // Corresponding x,y coordinates for each outlier. + repeated uint32 outlier_id_match_pos = 32 [packed = true]; + + // Confidence of box tracked in the range [0, 1], with 0 being least + // confident, and 1 being most confident. A reasonable threshold is 0.5 + // to filter out unconfident boxes. + optional float tracking_confidence = 33; + + // Additional internal state. + optional MotionBoxInternalState internal = 29; + + reserved 20, 21; +} + +// Captures additional internal state info about the tracking. +message MotionBoxInternalState { + // Stores all motion vectors that were used for tracking + // as packed arrays, capturing position, object motion, camera motion, + // tracking id and corresponding inlier weight. + repeated float pos_x = 1 [packed = true]; + repeated float pos_y = 2 [packed = true]; + repeated float dx = 3 [packed = true]; + repeated float dy = 4 [packed = true]; + repeated float camera_dx = 5 [packed = true]; + repeated float camera_dy = 6 [packed = true]; + repeated int32 track_id = 7 [packed = true]; + + // Within [0, 1]. 0 = outlier; 1 = inlier. + repeated float inlier_score = 8 [packed = true]; +} + +// Next tag: 42 +message TrackStepOptions { + // Degrees of freedom being used for tracking. By default tracker only uses + // translation. Additionally scale and rotation from the camera motion + // and / or object motion can be taken into account. 
+ enum TrackingDegrees { + TRACKING_DEGREE_TRANSLATION = 0; + + // Additional tracking degrees according to camera motion. + TRACKING_DEGREE_CAMERA_SCALE = 1; + TRACKING_DEGREE_CAMERA_ROTATION = 2; + TRACKING_DEGREE_CAMERA_ROTATION_SCALE = 3; + // TODO: Implement! + TRACKING_DEGREE_CAMERA_PERSPECTIVE = 4; + + // Tracking degrees modeling object motion. Note that additional + // object degrees of freedom are only applied when estimation is deemed + // stable, in particular sufficient inliers are present. + // By default, does NOT apply camera motion. If that is desired set + // the flag: track_object_and_camera to true. + TRACKING_DEGREE_OBJECT_SCALE = 5; + TRACKING_DEGREE_OBJECT_ROTATION = 6; + TRACKING_DEGREE_OBJECT_ROTATION_SCALE = 7; + TRACKING_DEGREE_OBJECT_PERSPECTIVE = 8; + } + + optional TrackingDegrees tracking_degrees = 28 + [default = TRACKING_DEGREE_TRANSLATION]; + + // If set and one of the TRACKING_DEGREE_OBJECT degrees are set also applies + // camera motion in addition to the object motion. + optional bool track_object_and_camera = 32 [default = false]; + + // Number of iterations to iteratively estimate model and re-estimate + // influence of each vector. + optional int32 irls_iterations = 1 [default = 5]; + + // Gaussian spatial prior sigma relative to box size. + // For motivation, see this plot: http://goo.gl/BCfcy. + optional float spatial_sigma = 2 [default = 0.15]; + + // Gaussian velocity prior sigma. It is computed as the maximum of the + // absolute minimum sigma (in normalized domain) and the relative sigma + // w.r.t. previous motion. + optional float min_motion_sigma = 3 [default = 0.002]; + optional float relative_motion_sigma = 4 [default = 0.3]; + + // Settings for motion disparity. Difference between previous and current + // motion magnitude is scored linearly, from motion_disparity_low_level to + // motion_disparity_high_level (mapped to score of 0 and 1 respectively). 
+ // Motivation is to ensure acceleration between frames are within reasonable + // bounds. + // Represents a maximum acceleration of around 4 - 5 pixels per frame in 360p + // video to be unpenalized, with accelerations of around >= 10 pixels being + // considered inconsitent with prediction. + optional float motion_disparity_low_level = 6 [default = 8e-3]; + optional float motion_disparity_high_level = 7 [default = 1.6e-2]; + + // Motion disparity decays across frames. Disparity of previous frame decays + // over time. If disparity in current frame is not higher, i.e. the larger + // of the current and decayed disparity is taken. + // Motivation is, that if acceleration was unreasonable high (and we likely + // lost tracking) we enter a stage of trying to regain tracking by looking for + // vectors that agree with the previous prediction. + optional float disparity_decay = 8 [default = 0.8]; + + // Object motion is given as linear combination of previous and measured + // motion depending on the motion_disparity (a high disparity is giving high + // weight to the previous motion). + // We enforce at least a minimum of the below motion_prior_weight regardless + // of the motion disparity. + optional float motion_prior_weight = 9 [default = 0.2]; + + // Settings for motion discrimination. + // + // Current motion magnitude is scored linearly, + // from background_discrimination_low_level to + // background_discrimination_high_level (mapped to score of 0 and 1 + // respectively). + // Motivation is that high object motions are easy to discriminate from the + // background, whereas small object motions are virtually indistinguishable. + // Represents a range of 2 - 4 pixels for 360p video. + optional float background_discrimination_low_level = 10 [default = 4e-3]; + optional float background_discrimination_high_level = 11 [default = 8e-3]; + + // Spring force settings. 
If difference between predicted center of the box in + // the next frame and the predicted center of the inliers deviates by more + // than inlier_center_relative_distance times the box [width|height] + // a spring force is applied to the box. The amount of force is spring_force + // times the difference. + optional float inlier_center_relative_distance = 12 [default = 0.1]; + optional float inlier_spring_force = 13 [default = 0.3]; + + // Same as above, but for the center of large motion magnitudes. + optional float kinetic_center_relative_distance = 14 [default = 0.4]; + optional float kinetic_spring_force = 15 [default = 0.5]; + + // Spring force towards large motions is only applied when kinetic energy is + // above the specified threshold. + optional float kinetic_spring_force_min_kinetic_energy = 21 [default = 3e-3]; + + // Bias of old velocity during update step. + optional float velocity_update_weight = 16 [default = 0.7]; + + // Maximum number of frames considered to be tracking failures -> + // If over threshold, box is considered untrackable. + optional int32 max_track_failures = 17 [default = 10]; + + // Domain used for tracking is always larger than the current box. + // If current motion is not negligible, box is expanded in the direction the + // motion, otherwise expanded in all directions by the amount specified below + // (w.r.t. normalized domain). + optional float expansion_size = 18 [default = 0.05]; + + // Features are scored based on the magnitude of their irls weights, mapped to + // [0, 1] using the following range. The range represents roughly 3 - 1.5 + // pixels error for 360p video. + optional float inlier_low_weight = 19 [default = 250]; + optional float inlier_high_weight = 20 [default = 500]; + + // Kinetic energy decays over time by the specified rate. + optional float kinetic_energy_decay = 22 [default = 0.98]; + + // Amount by which prior is increased/decreased in case of valid/invalid + // measurements. 
+ optional float prior_weight_increase = 23 [default = 0.2]; + + // We map the amount of present kinetic energy linearly to the domain [0, 1] + // describing if an object is static (0) or moving (1). + optional float low_kinetic_energy = 24 [default = 1e-3]; // ~0.4 pix + optional float high_kinetic_energy = 25 [default = 4e-3]; // ~3 pix + + // Outputs internal state to MotionBoxState. + optional bool return_internal_state = 26 [default = false]; + + // Specifies which weights are stored in the internal state. By default + // post-estimation weights are stored, otherwise pre-estimation weights + // are stored. + optional bool use_post_estimation_weights_for_state = 29 [default = true]; + + // Computes spatial grid of inliers and stores it in the MotionBoxState. + optional bool compute_spatial_prior = 27 [default = false]; + + // Irls initialization by performing several rounds of RANSAC to preselect + // features for motion estimation scoring outliers low and inliers to be at + // least of median inlier weight. + message IrlsInitialization { + optional bool activated = 1 [default = false]; + + // Rounds of RANSAC. + optional int32 rounds = 2 [default = 50]; + + // Normalized cutoff threshold for a vector to be considered an inlier. + optional float cutoff = 3 [default = 0.005]; + } + optional IrlsInitialization irls_initialization = 30; + + // Ratio between static motion and temporal scale. This is actually + // the threshold on speed, under which we consider static (non-moving object). + optional float static_motion_temporal_ratio = 33 [default = 3e-3]; + + // Different control parameters to terminate tracking when + // occlusion occurs. 
+ message CancelTrackingWithOcclusionOptions { + optional bool activated = 1 [default = false]; + optional float min_motion_continuity = 2 [default = 0.4]; + optional float min_inlier_ratio = 3 [default = 0.1]; + } + optional CancelTrackingWithOcclusionOptions + cancel_tracking_with_occlusion_options = 34; + + // If number of continued inliers is less than this number, then the object + // motion model will fall back to translation model. + // Set this min_continued_inliers threshold to a low number to make sure + // they follow local object rotation and scale, but it may result in un-robust + // rotation and scale estimation if the threshold is too low. Recommend that + // you don't set a number < 4. + optional int32 object_similarity_min_contd_inliers = 35 [default = 30]; + + // Maximum acceptable scale component of object similarity transform. + // Minimum scale is computed as 1.0 / max_scale. + // Exclusive for tracking a box with similarity. + optional float box_similarity_max_scale = 36 [default = 1.05]; + + // Maximum acceptable object similarity rotation in radians. + optional float box_similarity_max_rotation = 37 [default = 0.2]; + + // Homography transform will first be projected to similarity, and the scale + // component of the similarity transform should be within the range of + // [1.0 / max_scale, max_scale]. + optional float quad_homography_max_scale = 38 [default = 1.2]; + + // The rotation component of the projected similarity should be smaller than + // this maximum rotation threshold. + optional float quad_homography_max_rotation = 39 [default = 0.3]; + + // Pre-calibrated camera intrinsics parameters, including focal length, center + // point, distortion coefficients (only 3 radial factors) and image width / + // height. The image formation model is described here: + // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html + // Only used for quad tracking mode. Leave it empty if unknown. 
+ message CameraIntrinsics { + optional float fx = 1; + optional float fy = 2; + optional float cx = 3; + optional float cy = 4; + optional float k0 = 5; + optional float k1 = 6; + optional float k2 = 7; + optional int32 w = 8; + optional int32 h = 9; + } + optional CameraIntrinsics camera_intrinsics = 40; + + // Specifically for quad tracking (aka TRACKING_DEGREE_OBJECT_PERSPECTIVE + // mode), if aspect_ratio field is set in start pos, pnp tracking will be + // deployed. If aspect_ratio is unknown (not set), but forced_pnp_tracking is + // true, we will first estimate the aspect ratio for the 3D quadrangle, then + // perform pnp tracking. If aspect_ratio is unknown and pnp tracking is not + // forced, general homography tracking will be deployed. + optional bool forced_pnp_tracking = 41 [default = false]; +} diff --git a/mediapipe/util/tracking/tracking_visualization_utilities.cc b/mediapipe/util/tracking/tracking_visualization_utilities.cc new file mode 100644 index 000000000..c3ec91563 --- /dev/null +++ b/mediapipe/util/tracking/tracking_visualization_utilities.cc @@ -0,0 +1,224 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/util/tracking/tracking_visualization_utilities.h" + +#include "absl/strings/str_format.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/util/tracking/box_tracker.h" +#include "mediapipe/util/tracking/tracking.h" + +namespace mediapipe { + +void RenderState(const MotionBoxState& box_state, bool print_stats, + cv::Mat* frame) { +#ifndef NO_RENDERING + CHECK(frame != nullptr); + + const int frame_width = frame->cols; + const int frame_height = frame->rows; + + const Vector2_f top_left(box_state.pos_x() * frame_width, + box_state.pos_y() * frame_height); + + cv::rectangle(*frame, + cv::Point(box_state.inlier_center_x() * frame_width - 2, + box_state.inlier_center_y() * frame_height - 2), + cv::Point(box_state.inlier_center_x() * frame_width + 2, + box_state.inlier_center_y() * frame_height + 2), + cv::Scalar(255, 255, 0, 255), 2); + + cv::rectangle( + *frame, + cv::Point( + (box_state.inlier_center_x() - box_state.inlier_width() * 0.5) * + frame_width, + (box_state.inlier_center_y() - box_state.inlier_height() * 0.5) * + frame_height), + cv::Point( + (box_state.inlier_center_x() + box_state.inlier_width() * 0.5) * + frame_width, + (box_state.inlier_center_y() + box_state.inlier_height() * 0.5) * + frame_height), + cv::Scalar(0, 0, 255, 255), 1); + + std::vector inlier_locs; + std::vector outlier_locs; + MotionBoxInlierLocations(box_state, &inlier_locs); + MotionBoxOutlierLocations(box_state, &outlier_locs); + float scale_x = 1.0f; + float scale_y = 1.0f; + ScaleFromAspect(frame_width * 1.0f / frame_height, true, &scale_x, &scale_y); + + for (const Vector2_f& loc : inlier_locs) { + cv::circle(*frame, + cv::Point(loc.x() * scale_x * frame_width, + loc.y() * scale_y * frame_height), + 4.0, cv::Scalar(0, 255, 0, 128), 1); + } + for (const Vector2_f& loc : outlier_locs) { + cv::circle(*frame, + cv::Point(loc.x() * scale_x * frame_width, + loc.y() * scale_y * frame_height), + 4.0, cv::Scalar(255, 0, 0, 128), 1); + } + + 
if (!print_stats) { + return; + } + + std::vector stats; + stats.push_back( + absl::StrFormat("Motion: %.4f, %.4f", box_state.dx(), box_state.dy())); + stats.push_back( + absl::StrFormat("KinEnergy: %.4f", box_state.kinetic_energy())); + stats.push_back( + absl::StrFormat("Disparity: %.2f", box_state.motion_disparity())); + stats.push_back(absl::StrFormat("Discrimination: %.2f", + box_state.background_discrimination())); + stats.push_back( + absl::StrFormat("InlierRatio: %2.2f", box_state.inlier_ratio())); + stats.push_back( + absl::StrFormat("InlierNum: %3d", box_state.inlier_ids_size())); + + stats.push_back(absl::StrFormat("Prior: %.2f", box_state.prior_weight())); + stats.push_back(absl::StrFormat("TrackingConfidence: %.2f", + box_state.tracking_confidence())); + + for (int k = 0; k < stats.size(); ++k) { + // Display some stats below the box. + cv::putText(*frame, stats[k], + cv::Point(top_left.x(), top_left.y() + (k + 1) * 12), + cv::FONT_HERSHEY_PLAIN, 1.0, cv::Scalar(0, 0, 0, 255)); + + cv::putText(*frame, stats[k], + cv::Point(top_left.x() + 1, top_left.y() + (k + 1) * 12 + 1), + cv::FONT_HERSHEY_PLAIN, 1.0, cv::Scalar(255, 255, 255, 255)); + } + + // Visualize locking state via heuristically chosen thresholds. Locking + // state is purely for visualization/illustration and has no further + // meaning. 
+ std::string lock_text; + cv::Scalar lock_color; + if (box_state.motion_disparity() > 0.8) { + lock_text = "Lock lost"; + lock_color = cv::Scalar(255, 0, 0, 255); + } else if (box_state.motion_disparity() > 0.4) { + lock_text = "Aquiring lock"; + lock_color = cv::Scalar(255, 255, 0, 255); + } else if (box_state.motion_disparity() > 0.1) { + lock_text = "Locked"; + lock_color = cv::Scalar(0, 255, 0, 255); + } + + cv::putText(*frame, lock_text, cv::Point(top_left.x() + 1, top_left.y() - 4), + cv::FONT_HERSHEY_PLAIN, 0.8, cv::Scalar(255, 255, 255, 255)); + + cv::putText(*frame, lock_text, cv::Point(top_left.x(), top_left.y() - 5), + cv::FONT_HERSHEY_PLAIN, 0.8, lock_color); +#else + LOG(FATAL) << "Code stripped out because of NO_RENDERING"; +#endif +} + +void RenderInternalState(const MotionBoxInternalState& internal, + cv::Mat* frame) { +#ifndef NO_RENDERING + CHECK(frame != nullptr); + + const int num_vectors = internal.pos_x_size(); + + // Determine max inlier score for visualization. + float max_score = 0; + for (int k = 0; k < num_vectors; ++k) { + max_score = std::max(max_score, internal.inlier_score(k)); + } + + float alpha_scale = 1.0f; + if (max_score > 0) { + alpha_scale = 1.0f / max_score; + } + + const int frame_width = frame->cols; + const int frame_height = frame->rows; + for (int k = 0; k < num_vectors; ++k) { + const MotionVector v = MotionVector::FromInternalState(internal, k); + cv::Point p1(v.pos.x() * frame_width, v.pos.y() * frame_height); + Vector2_f match = v.pos + v.object; + + cv::Point p2(match.x() * frame_width, match.y() * frame_height); + const float alpha = internal.inlier_score(k) * alpha_scale; + + cv::Scalar color_scaled(0 * alpha + (1.0f - alpha) * 255, + 255 * alpha + (1.0f - alpha) * 0, + 0 * alpha + (1.0f - alpha) * 0); + + cv::line(*frame, p1, p2, color_scaled, 1, CV_AA); + cv::circle(*frame, p1, 2.0, color_scaled, 1); + } +#else + LOG(FATAL) << "Code stripped out because of NO_RENDERING"; +#endif +} + +void 
RenderTrackingData(const TrackingData& data, cv::Mat* mat, + bool antialiasing) { +#ifndef NO_RENDERING + CHECK(mat != nullptr); + + MotionVectorFrame mvf; + MotionVectorFrameFromTrackingData(data, &mvf); + const float aspect_ratio = mvf.aspect_ratio; + float scale_x, scale_y; + ScaleFromAspect(aspect_ratio, true, &scale_x, &scale_y); + scale_x *= mat->cols; + scale_y *= mat->rows; + + for (const auto& motion_vector : mvf.motion_vectors) { + Vector2_f pos = motion_vector.Location(); + Vector2_f match = motion_vector.MatchLocation(); + + cv::Point p1(pos.x() * scale_x, pos.y() * scale_y); + cv::Point p2(match.x() * scale_x, match.y() * scale_y); + + // iOS cannot display a width 1 antialiased line, so we provide + // an option for 8-connected drawing instead. + cv::line(*mat, p1, p2, cv::Scalar(0, 255, 0, 255), 1, + antialiasing ? CV_AA : 8); + } +#else + LOG(FATAL) << "Code stripped out because of NO_RENDERING"; +#endif +} + +void RenderBox(const TimedBoxProto& box_proto, cv::Mat* mat) { +#ifndef NO_RENDERING + CHECK(mat != nullptr); + + TimedBox box = TimedBox::FromProto(box_proto); + std::array corners = box.Corners(mat->cols, mat->rows); + + for (int k = 0; k < 4; ++k) { + cv::line(*mat, cv::Point2f(corners[k].x(), corners[k].y()), + cv::Point2f(corners[(k + 1) % 4].x(), corners[(k + 1) % 4].y()), + cv::Scalar(255, 0, 0, 255), // Red. + 4); + } +#else + LOG(FATAL) << "Code stripped out because of NO_RENDERING"; +#endif +} + +} // namespace mediapipe diff --git a/mediapipe/util/tracking/tracking_visualization_utilities.h b/mediapipe/util/tracking/tracking_visualization_utilities.h new file mode 100644 index 000000000..ede86a033 --- /dev/null +++ b/mediapipe/util/tracking/tracking_visualization_utilities.h @@ -0,0 +1,43 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +#ifndef MEDIAPIPE_UTIL_TRACKING_TRACKING_VISUALIZATION_UTILITIES_H_ +#define MEDIAPIPE_UTIL_TRACKING_TRACKING_VISUALIZATION_UTILITIES_H_ + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" +#include "mediapipe/util/tracking/flow_packager.pb.h" +#include "mediapipe/util/tracking/tracking.pb.h" + +namespace mediapipe { + +// Visualizes state to frame. Also overlays statistics if print_stats is set. +void RenderState(const MotionBoxState& box_state, bool print_stats, + cv::Mat* frame); + +// Visualizes internal tracking state to frame. +void RenderInternalState(const MotionBoxInternalState& internal, + cv::Mat* frame); + +// Visualizes tracking data (specifically, the motion_vectors) to frame. +// Optional third parameter can be used to disable antialiasing. +void RenderTrackingData(const TrackingData& data, cv::Mat* mat, + bool antialiasing = false); + +// Visualizes TimedBoxProto onto image. 
+void RenderBox(const TimedBoxProto& box_proto, cv::Mat* mat); + +} // namespace mediapipe + +#endif // MEDIAPIPE_UTIL_TRACKING_TRACKING_VISUALIZATION_UTILITIES_H_ diff --git a/third_party/ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff b/third_party/ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff new file mode 100644 index 000000000..5b2661516 --- /dev/null +++ b/third_party/ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff @@ -0,0 +1,14 @@ +diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl +index d90e5a3..4420565 100644 +--- a/bazel/ceres.bzl ++++ b/bazel/ceres.bzl +@@ -202,6 +203,6 @@ def ceres_library(name, + ], + visibility = ["//visibility:public"], + deps = [ +- "@com_github_eigen_eigen//:eigen", +- "@com_github_google_glog//:glog", ++ "@eigen_archive//:eigen", ++ "@com_github_glog_glog//:glog", + ], + ) \ No newline at end of file diff --git a/third_party/opencv_linux.BUILD b/third_party/opencv_linux.BUILD index ef2aeb49c..4d2eb254e 100644 --- a/third_party/opencv_linux.BUILD +++ b/third_party/opencv_linux.BUILD @@ -7,6 +7,7 @@ exports_files(["LICENSE"]) # The following build rule assumes that OpenCV is installed by # 'apt-get install libopencv-core-dev libopencv-highgui-dev \' +# ' libopencv-calib3d-dev libopencv-features2d-dev \' # ' libopencv-imgproc-dev libopencv-video-dev' on Debian/Ubuntu. # If you install OpenCV separately, please modify the build rule accordingly. 
cc_library( diff --git a/third_party/org_tensorflow_e3a7bdbebb99352351a19e2e403136166aa52934.diff b/third_party/org_tensorflow_e3a7bdbebb99352351a19e2e403136166aa52934.diff new file mode 100644 index 000000000..4f8a10f77 --- /dev/null +++ b/third_party/org_tensorflow_e3a7bdbebb99352351a19e2e403136166aa52934.diff @@ -0,0 +1,19 @@ +diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl +index a0546b5f83..c7a9f39ded 100755 +--- a/tensorflow/workspace.bzl ++++ b/tensorflow/workspace.bzl +@@ -172,9 +172,9 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): + name = "eigen_archive", + build_file = clean_dep("//third_party:eigen.BUILD"), + patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), +- sha256 = "6d8ed482addd14892d7b0bd98fec2c02f18fdab97775bda68c3f2a99ffb190fb", +- strip_prefix = "eigen-eigen-66be6c76fc01", ++ sha256 = "add24720f99ab4f3222f4c8a887f2609554cf9187d4f7d24a777a151a0ee2548", ++ strip_prefix = "eigen-git-mirror-4898dcdb06f1b1b0441b8e15119764793f8997e2", + urls = [ +- "https://storage.googleapis.com/mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/66be6c76fc01.tar.gz", +- "https://bitbucket.org/eigen/eigen/get/66be6c76fc01.tar.gz", ++ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/eigenteam/eigen-git-mirror/archive/4898dcdb06f1b1b0441b8e15119764793f8997e2.tar.gz", ++ "https://github.com/eigenteam/eigen-git-mirror/archive/4898dcdb06f1b1b0441b8e15119764793f8997e2.tar.gz", + ], + ) \ No newline at end of file