Project import generated by Copybara.

GitOrigin-RevId: d91373b4d4d10abef49cab410caa6aadf0875049
This commit is contained in:
MediaPipe Team 2019-12-06 15:33:11 -08:00 committed by mgyong
parent 137867d088
commit d16cc3be5b
137 changed files with 41548 additions and 31 deletions

View File

@ -19,6 +19,9 @@ build --incompatible_depset_is_not_iterable=false
# Sets the default Apple platform to macOS.
build --apple_platform_type=macos
# Allow debugging with XCODE
build --apple_generate_dsym
# Android configs.
build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain

View File

@ -35,6 +35,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libopencv-highgui-dev \
libopencv-imgproc-dev \
libopencv-video-dev \
libopencv-calib3d-dev \
libopencv-features2d-dev \
software-properties-common && \
add-apt-repository -y ppa:openjdk-r/ppa && \
apt-get update && apt-get install -y openjdk-8-jdk && \

View File

@ -14,12 +14,13 @@
* [Face Detection](mediapipe/docs/face_detection_mobile_gpu.md)
* [Hair Segmentation](mediapipe/docs/hair_segmentation_mobile_gpu.md)
* [Object Detection](mediapipe/docs/object_detection_mobile_gpu.md)
* [Object Detection and Tracking](mediapipe/docs/object_tracking_mobile_gpu.md)
![hand_tracking](mediapipe/docs/images/mobile/hand_tracking_3d_android_gpu_small.gif)
![multi-hand_tracking](mediapipe/docs/images/mobile/multi_hand_tracking_android_gpu_small.gif)
![face_detection](mediapipe/docs/images/mobile/face_detection_android_gpu_small.gif)
![hair_segmentation](mediapipe/docs/images/mobile/hair_segmentation_android_gpu_small.gif)
![object_detection](mediapipe/docs/images/mobile/object_detection_android_gpu_small.gif)
![object_tracking](mediapipe/docs/images/mobile/object_tracking_android_gpu_small.gif)
## Installation
Follow these [instructions](mediapipe/docs/install.md).

View File

@ -116,6 +116,13 @@ http_archive(
"https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT,
"https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT,
],
# Patch https://github.com/tensorflow/tensorflow/commit/e3a7bdbebb99352351a19e2e403136166aa52934
patches = [
"@//third_party:org_tensorflow_e3a7bdbebb99352351a19e2e403136166aa52934.diff"
],
patch_args = [
"-p1",
],
strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT,
sha256 = _TENSORFLOW_SHA256,
)
@ -123,8 +130,22 @@ http_archive(
load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace")
tf_workspace(tf_repo_name = "org_tensorflow")
http_archive(
name = "ceres_solver",
url = "https://github.com/ceres-solver/ceres-solver/archive/1.14.0.zip",
patches = [
"@//third_party:ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff"
],
patch_args = [
"-p1",
],
strip_prefix = "ceres-solver-1.14.0",
sha256 = "5ba6d0db4e784621fda44a50c58bb23b0892684692f0c623e2063f9c19f192f1"
)
# Please run
# $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \
# libopencv-calib3d-dev libopencv-features2d-dev \
# libopencv-imgproc-dev libopencv-video-dev
new_local_repository(
name = "linux_opencv",
@ -292,3 +313,4 @@ http_archive(
strip_prefix = "google-toolbox-for-mac-2.2.1",
build_file = "@//third_party:google_toolbox_for_mac.BUILD",
)

View File

@ -9,6 +9,7 @@
"mediapipe/examples/ios/facedetectiongpu/BUILD",
"mediapipe/examples/ios/handdetectiongpu/BUILD",
"mediapipe/examples/ios/handtrackinggpu/BUILD",
"mediapipe/examples/ios/multihandtrackinggpu/BUILD",
"mediapipe/examples/ios/objectdetectioncpu/BUILD",
"mediapipe/examples/ios/objectdetectiongpu/BUILD"
],
@ -18,6 +19,7 @@
"//mediapipe/examples/ios/facedetectiongpu:FaceDetectionGpuApp",
"//mediapipe/examples/ios/handdetectiongpu:HandDetectionGpuApp",
"//mediapipe/examples/ios/handtrackinggpu:HandTrackingGpuApp",
"//mediapipe/examples/ios/multihandtrackinggpu:MultiHandTrackingGpuApp",
"//mediapipe/examples/ios/objectdetectioncpu:ObjectDetectionCpuApp",
"//mediapipe/examples/ios/objectdetectiongpu:ObjectDetectionGpuApp",
"//mediapipe/objc:mediapipe_framework_ios"
@ -84,6 +86,8 @@
"mediapipe/examples/ios/handdetectiongpu/Base.lproj",
"mediapipe/examples/ios/handtrackinggpu",
"mediapipe/examples/ios/handtrackinggpu/Base.lproj",
"mediapipe/examples/ios/multihandtrackinggpu",
"mediapipe/examples/ios/multihandtrackinggpu/Base.lproj",
"mediapipe/examples/ios/objectdetectioncpu",
"mediapipe/examples/ios/objectdetectioncpu/Base.lproj",
"mediapipe/examples/ios/objectdetectiongpu",

View File

@ -16,6 +16,7 @@
"mediapipe/examples/ios/facedetectiongpu",
"mediapipe/examples/ios/handdetectiongpu",
"mediapipe/examples/ios/handtrackinggpu",
"mediapipe/examples/ios/multihandtrackinggpu",
"mediapipe/examples/ios/objectdetectioncpu",
"mediapipe/examples/ios/objectdetectiongpu"
],

View File

@ -165,6 +165,7 @@ cc_library(
deps = [
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:status",
],
alwayslink = 1,
)

View File

@ -13,11 +13,12 @@
// limitations under the License.
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/canonical_errors.h"
#include "mediapipe/framework/port/logging.h"
namespace mediapipe {
// Attach the header from one stream to another stream.
// Attach the header from a stream or side input to another stream.
//
// The header stream (tag HEADER) must not have any packets in it.
//
@ -25,17 +26,53 @@ namespace mediapipe {
// calculator to not need a header or to accept a separate stream with
// a header, that would be more future proof.
//
// Example usage 1:
// node {
// calculator: "AddHeaderCalculator"
// input_stream: "DATA:audio"
// input_stream: "HEADER:audio_header"
// output_stream: "audio_with_header"
// }
//
// Example usage 2:
// node {
// calculator: "AddHeaderCalculator"
// input_stream: "DATA:audio"
// input_side_packet: "HEADER:audio_header"
// output_stream: "audio_with_header"
// }
//
class AddHeaderCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc) {
cc->Inputs().Tag("HEADER").SetNone();
bool has_side_input = false;
bool has_header_stream = false;
if (cc->InputSidePackets().HasTag("HEADER")) {
cc->InputSidePackets().Tag("HEADER").SetAny();
has_side_input = true;
}
if (cc->Inputs().HasTag("HEADER")) {
cc->Inputs().Tag("HEADER").SetNone();
has_header_stream = true;
}
if (has_side_input == has_header_stream) {
return mediapipe::InvalidArgumentError(
"Header must be provided via exactly one of side input and input "
"stream");
}
cc->Inputs().Tag("DATA").SetAny();
cc->Outputs().Index(0).SetSameAs(&cc->Inputs().Tag("DATA"));
return ::mediapipe::OkStatus();
}
::mediapipe::Status Open(CalculatorContext* cc) override {
const Packet& header = cc->Inputs().Tag("HEADER").Header();
Packet header;
if (cc->InputSidePackets().HasTag("HEADER")) {
header = cc->InputSidePackets().Tag("HEADER");
}
if (cc->Inputs().HasTag("HEADER")) {
header = cc->Inputs().Tag("HEADER").Header();
}
if (!header.IsEmpty()) {
cc->Outputs().Index(0).SetHeader(header);
}

View File

@ -14,8 +14,10 @@
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/calculator_runner.h"
#include "mediapipe/framework/port/canonical_errors.h"
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/port/status_matchers.h"
#include "mediapipe/framework/timestamp.h"
#include "mediapipe/framework/tool/validate_type.h"
@ -24,7 +26,7 @@ namespace mediapipe {
class AddHeaderCalculatorTest : public ::testing::Test {};
TEST_F(AddHeaderCalculatorTest, Works) {
TEST_F(AddHeaderCalculatorTest, HeaderStream) {
CalculatorGraphConfig::Node node;
node.set_calculator("AddHeaderCalculator");
node.add_input_stream("HEADER:header_stream");
@ -96,4 +98,62 @@ TEST_F(AddHeaderCalculatorTest, NoPacketsOnHeaderStream) {
ASSERT_FALSE(runner.Run().ok());
}
// Verifies that a header supplied via the HEADER input side packet is copied
// to the output stream header and that data packets pass through untouched.
TEST_F(AddHeaderCalculatorTest, InputSidePacket) {
  CalculatorGraphConfig::Node node;
  node.set_calculator("AddHeaderCalculator");
  node.add_input_stream("DATA:data_stream");
  node.add_output_stream("merged_stream");
  node.add_input_side_packet("HEADER:header");

  CalculatorRunner runner(node);

  // Provide the header through the side packet and feed five data packets.
  runner.MutableSidePackets()->Tag("HEADER") =
      Adopt(new std::string("my_header"));
  constexpr int kNumPackets = 5;
  for (int idx = 0; idx < kNumPackets; ++idx) {
    runner.MutableInputs()->Tag("DATA").packets.push_back(
        Adopt(new int(idx)).At(Timestamp(idx * 1000)));
  }

  // Run calculator.
  MP_ASSERT_OK(runner.Run());
  ASSERT_EQ(1, runner.Outputs().NumEntries());

  // The side-packet header must appear as the output stream header.
  EXPECT_EQ(std::string("my_header"),
            runner.Outputs().Index(0).header.Get<std::string>());

  // Every data packet passes through with value and timestamp preserved.
  const std::vector<Packet>& output_packets =
      runner.Outputs().Index(0).packets;
  ASSERT_EQ(kNumPackets, output_packets.size());
  for (int idx = 0; idx < kNumPackets; ++idx) {
    EXPECT_EQ(idx, output_packets[idx].Get<int>());
    EXPECT_EQ(Timestamp(idx * 1000), output_packets[idx].Timestamp());
  }
}
// Verifies the calculator rejects a graph that provides the header both as a
// side packet and as an input stream.
TEST_F(AddHeaderCalculatorTest, UsingBothSideInputAndStream) {
  CalculatorGraphConfig::Node node;
  node.set_calculator("AddHeaderCalculator");
  node.add_input_stream("HEADER:header_stream");
  node.add_input_stream("DATA:data_stream");
  node.add_output_stream("merged_stream");
  node.add_input_side_packet("HEADER:header");

  CalculatorRunner runner(node);

  // Set both headers and add 5 packets.
  // BUG FIX: the original set the side packet twice and never set the stream
  // header; the second assignment was dead code. Set one of each, matching
  // the comment and the test's name.
  runner.MutableSidePackets()->Tag("HEADER") =
      Adopt(new std::string("my_header"));
  runner.MutableInputs()->Tag("HEADER").header =
      Adopt(new std::string("my_header"));
  for (int i = 0; i < 5; ++i) {
    Packet packet = Adopt(new int(i)).At(Timestamp(i * 1000));
    runner.MutableInputs()->Tag("DATA").packets.push_back(packet);
  }

  // Run should fail because header can only be provided one way.
  EXPECT_EQ(runner.Run().code(), ::mediapipe::InvalidArgumentError("").code());
}
} // namespace mediapipe

View File

@ -330,22 +330,27 @@ void PacketResamplerCalculator::UpdateNextOutputTimestampWithJitter() {
return ::mediapipe::OkStatus();
}
LOG_IF(WARNING, frame_time_usec_ <
(cc->InputTimestamp() - last_packet_.Timestamp()).Value())
<< "Adding jitter is meaningless when upsampling.";
const int64 curr_diff =
(next_output_timestamp_ - cc->InputTimestamp()).Value();
const int64 last_diff =
(next_output_timestamp_ - last_packet_.Timestamp()).Value();
if (curr_diff * last_diff > 0) {
return ::mediapipe::OkStatus();
if (frame_time_usec_ <
(cc->InputTimestamp() - last_packet_.Timestamp()).Value()) {
LOG_FIRST_N(WARNING, 2)
<< "Adding jitter is not very useful when upsampling.";
}
while (true) {
const int64 last_diff =
(next_output_timestamp_ - last_packet_.Timestamp()).Value();
RET_CHECK_GT(last_diff, 0.0);
const int64 curr_diff =
(next_output_timestamp_ - cc->InputTimestamp()).Value();
if (curr_diff > 0.0) {
break;
}
OutputWithinLimits(cc, (std::abs(curr_diff) > last_diff
? last_packet_
: cc->Inputs().Get(input_data_id_).Value())
.At(next_output_timestamp_));
UpdateNextOutputTimestampWithJitter();
}
OutputWithinLimits(cc, (std::abs(curr_diff) > std::abs(last_diff)
? last_packet_
: cc->Inputs().Get(input_data_id_).Value())
.At(next_output_timestamp_));
UpdateNextOutputTimestampWithJitter();
return ::mediapipe::OkStatus();
}

View File

@ -501,8 +501,11 @@ void ImageCroppingCalculator::GetOutputDimensions(CalculatorContext* cc,
row_max = std::max(row_max, transformed_points_[i * 2 + 1]);
}
*dst_width = std::round((col_max - col_min) * src_width);
*dst_height = std::round((row_max - row_min) * src_height);
int width = static_cast<int>(std::round((col_max - col_min) * src_width));
int height = static_cast<int>(std::round((row_max - row_min) * src_height));
// Minimum output dimension 1x1 prevents creation of textures with 0x0.
*dst_width = std::max(1, width);
*dst_height = std::max(1, height);
}
} // namespace mediapipe

View File

@ -983,3 +983,31 @@ cc_test(
"//mediapipe/framework/port:parse_text_proto",
],
)
cc_library(
name = "detections_to_timed_box_list_calculator",
srcs = ["detections_to_timed_box_list_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/formats:detection_cc_proto",
"//mediapipe/framework/formats:location_data_cc_proto",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/util/tracking:box_tracker",
],
alwayslink = 1,
)
cc_library(
name = "detection_unique_id_calculator",
srcs = ["detection_unique_id_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/formats:detection_cc_proto",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
],
alwayslink = 1,
)

View File

@ -0,0 +1,110 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/detection.pb.h"
#include "mediapipe/framework/port/status.h"
namespace mediapipe {
namespace {

constexpr char kDetectionsTag[] = "DETECTIONS";
constexpr char kDetectionListTag[] = "DETECTION_LIST";

// Each detection processed by DetectionUniqueIdCalculator will be assigned a
// unique id that starts from 1. If a detection already has an id other than
// 0, the id will be overwritten.
// NOTE(review): this counter is a mutable global with no synchronization;
// multiple calculator instances running Process() concurrently could hand out
// duplicate ids — confirm single-threaded use or switch to std::atomic.
static int64 detection_id = 0;

// Returns the next id. BUG FIX: the return type is now int64 to match the
// counter; the previous `int` return silently truncated once the count
// exceeded INT_MAX.
inline int64 GetNextDetectionId() { return ++detection_id; }

}  // namespace
// Assigns a unique id to every detection flowing through.
// Note that the calculator consumes (takes ownership of) the input vector of
// Detection or DetectionList, so the input stream cannot be connected to any
// other calculator.
//
// Example config:
// node {
//   calculator: "DetectionUniqueIdCalculator"
//   input_stream: "DETECTIONS:detections"
//   output_stream: "DETECTIONS:output_detections"
// }
class DetectionUniqueIdCalculator : public CalculatorBase {
 public:
  static ::mediapipe::Status GetContract(CalculatorContract* cc) {
    const bool has_list = cc->Inputs().HasTag(kDetectionListTag);
    const bool has_vector = cc->Inputs().HasTag(kDetectionsTag);
    RET_CHECK(has_list || has_vector)
        << "None of the input streams are provided.";
    // Each provided input stream requires the matching output stream and
    // carries the same payload type through unchanged.
    if (has_list) {
      RET_CHECK(cc->Outputs().HasTag(kDetectionListTag));
      cc->Inputs().Tag(kDetectionListTag).Set<DetectionList>();
      cc->Outputs().Tag(kDetectionListTag).Set<DetectionList>();
    }
    if (has_vector) {
      RET_CHECK(cc->Outputs().HasTag(kDetectionsTag));
      cc->Inputs().Tag(kDetectionsTag).Set<std::vector<Detection>>();
      cc->Outputs().Tag(kDetectionsTag).Set<std::vector<Detection>>();
    }
    return ::mediapipe::OkStatus();
  }

  ::mediapipe::Status Open(CalculatorContext* cc) override {
    // Output timestamp bound tracks the input exactly (no added latency).
    cc->SetOffset(::mediapipe::TimestampDiff(0));
    return ::mediapipe::OkStatus();
  }

  ::mediapipe::Status Process(CalculatorContext* cc) override;
};
REGISTER_CALCULATOR(DetectionUniqueIdCalculator);
// Consumes each non-empty input packet, overwrites the detection_id of every
// detection it carries with a fresh monotonically increasing id, and re-emits
// the modified payload on the matching output tag at the input timestamp.
::mediapipe::Status DetectionUniqueIdCalculator::Process(
    CalculatorContext* cc) {
  if (cc->Inputs().HasTag(kDetectionListTag) &&
      !cc->Inputs().Tag(kDetectionListTag).IsEmpty()) {
    // Consume() transfers ownership of the packet payload; this only
    // succeeds when no other calculator holds a reference to the packet.
    auto result =
        cc->Inputs().Tag(kDetectionListTag).Value().Consume<DetectionList>();
    if (result.ok()) {
      auto detection_list = std::move(result).ValueOrDie();
      for (Detection& detection : *detection_list->mutable_detection()) {
        detection.set_detection_id(GetNextDetectionId());
      }
      cc->Outputs()
          .Tag(kDetectionListTag)
          .Add(detection_list.release(), cc->InputTimestamp());
    }
    // NOTE(review): if Consume() fails (packet shared with another consumer)
    // the input is dropped silently with no output — confirm intentional.
  }
  if (cc->Inputs().HasTag(kDetectionsTag) &&
      !cc->Inputs().Tag(kDetectionsTag).IsEmpty()) {
    // Same consume-mutate-reemit sequence for the flat vector form.
    auto result = cc->Inputs()
                      .Tag(kDetectionsTag)
                      .Value()
                      .Consume<std::vector<Detection>>();
    if (result.ok()) {
      auto detections = std::move(result).ValueOrDie();
      for (Detection& detection : *detections) {
        detection.set_detection_id(GetNextDetectionId());
      }
      cc->Outputs()
          .Tag(kDetectionsTag)
          .Add(detections.release(), cc->InputTimestamp());
    }
  }
  return ::mediapipe::OkStatus();
}
} // namespace mediapipe

View File

@ -39,6 +39,8 @@ constexpr char kKeypointLabel[] = "KEYPOINT";
// The ratio of detection label font height to the height of detection bounding
// box.
constexpr double kLabelToBoundingBoxRatio = 0.1;
// Preserve 2 decimal digits.
constexpr float kNumScoreDecimalDigitsMultipler = 100;
} // namespace
@ -235,18 +237,26 @@ void DetectionsToRenderDataCalculator::AddLabels(
std::string label_str = detection.label().empty()
? absl::StrCat(detection.label_id(i))
: detection.label(i);
const float rounded_score =
std::round(detection.score(i) * kNumScoreDecimalDigitsMultipler) /
kNumScoreDecimalDigitsMultipler;
std::string label_and_score =
absl::StrCat(label_str, options.text_delimiter(), detection.score(i),
absl::StrCat(label_str, options.text_delimiter(), rounded_score,
options.text_delimiter());
label_and_scores.push_back(label_and_score);
}
std::vector<std::string> labels;
if (options.render_detection_id()) {
const std::string detection_id_str =
absl::StrCat("Id: ", detection.detection_id());
labels.push_back(detection_id_str);
}
if (options.one_label_per_line()) {
labels.swap(label_and_scores);
labels.insert(labels.end(), label_and_scores.begin(),
label_and_scores.end());
} else {
labels.push_back(absl::StrJoin(label_and_scores, ""));
}
// Add the render annotations for "label(_id),score".
for (int i = 0; i < labels.size(); ++i) {
auto label = labels.at(i);

View File

@ -53,4 +53,7 @@ message DetectionsToRenderDataCalculatorOptions {
// instances of this calculator are present in the graph, this value
// should be unique among them.
optional string scene_class = 7 [default = "DETECTION"];
// If true, renders the detection id in the first line before the labels.
optional bool render_detection_id = 8 [default = false];
}

View File

@ -0,0 +1,110 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/detection.pb.h"
#include "mediapipe/framework/formats/location_data.pb.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/tracking/box_tracker.h"
namespace mediapipe {
namespace {

// Input stream tags: a flat vector of Detections and/or a DetectionList.
constexpr char kDetectionsTag[] = "DETECTIONS";
constexpr char kDetectionListTag[] = "DETECTION_LIST";
// Output stream tag carrying the converted TimedBoxProtoList.
constexpr char kBoxesTag[] = "BOXES";

}  // namespace
// Converts Detection protos into a TimedBoxProtoList so detections can be
// handed to the box tracker.
//
// Please note that only Location Data formats of RELATIVE_BOUNDING_BOX are
// supported.
//
// Example config:
// node {
//   calculator: "DetectionsToTimedBoxListCalculator"
//   input_stream: "DETECTIONS:detections"
//   output_stream: "BOXES:boxes"
// }
class DetectionsToTimedBoxListCalculator : public CalculatorBase {
 public:
  static ::mediapipe::Status GetContract(CalculatorContract* cc) {
    const bool has_list = cc->Inputs().HasTag(kDetectionListTag);
    const bool has_vector = cc->Inputs().HasTag(kDetectionsTag);
    RET_CHECK(has_list || has_vector)
        << "None of the input streams are provided.";
    if (has_list) {
      cc->Inputs().Tag(kDetectionListTag).Set<DetectionList>();
    }
    if (has_vector) {
      cc->Inputs().Tag(kDetectionsTag).Set<std::vector<Detection>>();
    }
    cc->Outputs().Tag(kBoxesTag).Set<TimedBoxProtoList>();
    return ::mediapipe::OkStatus();
  }

  ::mediapipe::Status Open(CalculatorContext* cc) override {
    // No added latency: the output timestamp bound tracks the input.
    cc->SetOffset(TimestampDiff(0));
    return ::mediapipe::OkStatus();
  }

  ::mediapipe::Status Process(CalculatorContext* cc) override;

 private:
  // Fills *box from one detection; see definition for details.
  void ConvertDetectionToTimedBox(const Detection& detection,
                                  TimedBoxProto* box, CalculatorContext* cc);
};
REGISTER_CALCULATOR(DetectionsToTimedBoxListCalculator);
// Collects every detection from whichever input streams are connected,
// converts each into a timed box, and emits one TimedBoxProtoList per frame.
::mediapipe::Status DetectionsToTimedBoxListCalculator::Process(
    CalculatorContext* cc) {
  auto boxes = absl::make_unique<TimedBoxProtoList>();

  if (cc->Inputs().HasTag(kDetectionListTag)) {
    const auto& detection_list =
        cc->Inputs().Tag(kDetectionListTag).Get<DetectionList>();
    for (const auto& detection : detection_list.detection()) {
      ConvertDetectionToTimedBox(detection, boxes->add_box(), cc);
    }
  }
  if (cc->Inputs().HasTag(kDetectionsTag)) {
    const auto& detections =
        cc->Inputs().Tag(kDetectionsTag).Get<std::vector<Detection>>();
    for (const auto& detection : detections) {
      ConvertDetectionToTimedBox(detection, boxes->add_box(), cc);
    }
  }

  cc->Outputs().Tag(kBoxesTag).Add(boxes.release(), cc->InputTimestamp());
  return ::mediapipe::OkStatus();
}
// Copies the detection's relative bounding box into *box. The box id comes
// from the detection and the time from the current input packet.
// NOTE: assumes location_data holds a RELATIVE_BOUNDING_BOX (see class doc).
void DetectionsToTimedBoxListCalculator::ConvertDetectionToTimedBox(
    const Detection& detection, TimedBoxProto* box, CalculatorContext* cc) {
  const auto& rect = detection.location_data().relative_bounding_box();
  box->set_left(rect.xmin());
  box->set_top(rect.ymin());
  box->set_right(rect.xmin() + rect.width());
  box->set_bottom(rect.ymin() + rect.height());
  box->set_id(detection.detection_id());
  // InputTimestamp() is in microseconds; the box stores milliseconds.
  box->set_time_msec(cc->InputTimestamp().Microseconds() / 1000);
}
} // namespace mediapipe

View File

@ -37,6 +37,67 @@ proto_library(
deps = ["//mediapipe/framework:calculator_proto"],
)
proto_library(
name = "motion_analysis_calculator_proto",
srcs = ["motion_analysis_calculator.proto"],
deps = [
"//mediapipe/framework:calculator_proto",
"//mediapipe/util/tracking:motion_analysis_proto",
],
)
proto_library(
name = "flow_packager_calculator_proto",
srcs = ["flow_packager_calculator.proto"],
deps = [
"//mediapipe/framework:calculator_proto",
"//mediapipe/util/tracking:flow_packager_proto",
],
)
proto_library(
name = "box_tracker_calculator_proto",
srcs = ["box_tracker_calculator.proto"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_proto",
"//mediapipe/util/tracking:box_tracker_proto",
],
)
mediapipe_cc_proto_library(
name = "motion_analysis_calculator_cc_proto",
srcs = ["motion_analysis_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
"//mediapipe/util/tracking:motion_analysis_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":motion_analysis_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "flow_packager_calculator_cc_proto",
srcs = ["flow_packager_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
"//mediapipe/util/tracking:flow_packager_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":flow_packager_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "box_tracker_calculator_cc_proto",
srcs = ["box_tracker_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
"//mediapipe/util/tracking:box_tracker_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":box_tracker_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "flow_to_image_calculator_cc_proto",
srcs = ["flow_to_image_calculator.proto"],
@ -131,6 +192,94 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "motion_analysis_calculator",
srcs = ["motion_analysis_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":motion_analysis_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/formats:image_frame",
"//mediapipe/framework/formats:image_frame_opencv",
"//mediapipe/framework/formats:video_stream_header",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/util/tracking:camera_motion",
"//mediapipe/util/tracking:camera_motion_cc_proto",
"//mediapipe/util/tracking:frame_selection_cc_proto",
"//mediapipe/util/tracking:motion_analysis",
"//mediapipe/util/tracking:motion_estimation",
"//mediapipe/util/tracking:motion_models",
"//mediapipe/util/tracking:region_flow_cc_proto",
"@com_google_absl//absl/strings",
],
alwayslink = 1,
)
cc_library(
name = "flow_packager_calculator",
srcs = ["flow_packager_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":flow_packager_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/util/tracking:camera_motion_cc_proto",
"//mediapipe/util/tracking:flow_packager",
"//mediapipe/util/tracking:region_flow_cc_proto",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
],
alwayslink = 1,
)
cc_library(
name = "box_tracker_calculator",
srcs = ["box_tracker_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":box_tracker_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/formats:image_frame",
"//mediapipe/framework/formats:image_frame_opencv",
"//mediapipe/framework/formats:video_stream_header", # fixdeps: keep -- required for exobazel build.
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:parse_text_proto",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/framework/tool:options_util",
"//mediapipe/util/tracking",
"//mediapipe/util/tracking:box_tracker",
"//mediapipe/util/tracking:tracking_visualization_utilities",
"@com_google_absl//absl/strings",
],
alwayslink = 1,
)
cc_library(
name = "tracked_detection_manager_calculator",
srcs = ["tracked_detection_manager_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/formats:detection_cc_proto",
"//mediapipe/framework/formats:location_data_cc_proto",
"//mediapipe/framework/formats:rect_cc_proto",
"//mediapipe/framework/port:status",
"//mediapipe/util/tracking",
"//mediapipe/util/tracking:box_tracker",
"//mediapipe/util/tracking:tracked_detection",
"//mediapipe/util/tracking:tracked_detection_manager",
"//mediapipe/util/tracking:tracking_visualization_utilities",
"@com_google_absl//absl/container:node_hash_map",
],
alwayslink = 1,
)
filegroup(
name = "test_videos",
srcs = [
@ -201,3 +350,64 @@ cc_test(
"//mediapipe/framework/port:parse_text_proto",
],
)
MEDIAPIPE_DEPS = [
"//mediapipe/calculators/video:box_tracker_calculator",
"//mediapipe/calculators/video:flow_packager_calculator",
"//mediapipe/calculators/video:motion_analysis_calculator",
"//mediapipe/framework/stream_handler:fixed_size_input_stream_handler",
"//mediapipe/framework/stream_handler:sync_set_input_stream_handler",
]
mediapipe_binary_graph(
name = "parallel_tracker_binarypb",
graph = "testdata/parallel_tracker_graph.pbtxt",
output_name = "testdata/parallel_tracker.binarypb",
visibility = ["//visibility:public"],
deps = MEDIAPIPE_DEPS,
)
mediapipe_binary_graph(
name = "tracker_binarypb",
graph = "testdata/tracker_graph.pbtxt",
output_name = "testdata/tracker.binarypb",
visibility = ["//visibility:public"],
deps = MEDIAPIPE_DEPS,
)
cc_test(
name = "tracking_graph_test",
size = "small",
srcs = ["tracking_graph_test.cc"],
copts = ["-DPARALLEL_INVOKER_ACTIVE"] + select({
"//mediapipe:apple": [],
"//mediapipe:android": [],
"//conditions:default": [],
}),
data = [
":testdata/lenna.png",
":testdata/parallel_tracker.binarypb",
":testdata/tracker.binarypb",
],
deps = [
":box_tracker_calculator",
":box_tracker_calculator_cc_proto",
":flow_packager_calculator",
":motion_analysis_calculator",
"//mediapipe/framework:calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework:packet",
"//mediapipe/framework/deps:file_path",
"//mediapipe/framework/formats:image_frame",
"//mediapipe/framework/port:advanced_proto",
"//mediapipe/framework/port:core_proto",
"//mediapipe/framework/port:file_helpers",
"//mediapipe/framework/port:gtest_main",
"//mediapipe/framework/port:opencv_highgui",
"//mediapipe/framework/port:status",
"//mediapipe/framework/stream_handler:fixed_size_input_stream_handler",
"//mediapipe/framework/stream_handler:sync_set_input_stream_handler",
"//mediapipe/util/tracking:box_tracker_cc_proto",
"//mediapipe/util/tracking:tracking_cc_proto",
],
)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,55 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
import "mediapipe/util/tracking/box_tracker.proto";
// Options for BoxTrackerCalculator, registered as a CalculatorOptions
// extension.
message BoxTrackerCalculatorOptions {
  extend CalculatorOptions {
    optional BoxTrackerCalculatorOptions ext = 268767860;
  }

  // Underlying tracker configuration (see box_tracker.proto).
  optional BoxTrackerOptions tracker_options = 1;

  // Initial position to be tracked. Can also be supplied as side packet or
  // as input stream.
  optional TimedBoxProtoList initial_position = 2;

  // If set and VIZ stream is present, renders tracking data into the
  // visualization.
  optional bool visualize_tracking_data = 3 [default = false];

  // If set and VIZ stream is present, renders the box state
  // into the visualization.
  optional bool visualize_state = 4 [default = false];

  // If set and VIZ stream is present, renders the internal box state
  // into the visualization.
  optional bool visualize_internal_state = 5 [default = false];

  // Size of the track data cache during streaming mode. This allows to buffer
  // track_data's for fast forward tracking, i.e. any TimedBox received
  // via input stream START_POS can be tracked towards the current track head
  // (i.e. last received TrackingData). Measured in number of frames.
  // 0 disables the cache.
  optional int32 streaming_track_data_cache_size = 6 [default = 0];

  // Add a transition period of N frames to smooth the jump from original
  // tracking to reset start pos with motion compensation. The transition will
  // be a linear decay of original tracking result. 0 means no transition.
  optional int32 start_pos_transition_frames = 7 [default = 0];
}

View File

@ -0,0 +1,281 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>

#include <cstdlib>
#include <fstream>
#include <memory>

#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "mediapipe/calculators/video/flow_packager_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/flow_packager.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
using mediapipe::CameraMotion;
using mediapipe::FlowPackager;
using mediapipe::RegionFlowFeatureList;
using mediapipe::TrackingData;
using mediapipe::TrackingDataChunk;
// A calculator that packages input CameraMotion and RegionFlowFeatureList
// into a TrackingData and optionally writes TrackingDataChunks to file.
//
// Input stream:
// FLOW: Input region flow (proto RegionFlowFeatureList).
// CAMERA: Input camera stream (proto CameraMotion, optional).
//
// Input side packets:
// CACHE_DIR: Optional caching directory tracking files are written to.
//
// Output streams.
// TRACKING: Output tracking data (proto TrackingData, per frame
// optional).
// TRACKING_CHUNK: Output tracking chunks (proto TrackingDataChunk,
// per chunk, optional), output at the first timestamp
// of each chunk.
// COMPLETE: Optional output packet sent on PreStream to
// to signal downstream calculators that all data has been
// processed and calculator is closed. Can be used to indicate
// that all data as been written to CACHE_DIR.
// Packages per-frame tracking flow and optional camera motion into
// TrackingData, optionally aggregating frames into TrackingDataChunks that
// are emitted per chunk and/or written to a cache directory.
class FlowPackagerCalculator : public CalculatorBase {
 public:
  ~FlowPackagerCalculator() override = default;

  static ::mediapipe::Status GetContract(CalculatorContract* cc);

  ::mediapipe::Status Open(CalculatorContext* cc) override;
  ::mediapipe::Status Process(CalculatorContext* cc) override;
  ::mediapipe::Status Close(CalculatorContext* cc) override;

  // Writes passed chunk to disk.
  void WriteChunk(const TrackingDataChunk& chunk) const;

  // Initializes next chunk for tracking beginning from last frame of
  // current chunk (chunking is designed with one frame overlap).
  void PrepareCurrentForNextChunk(TrackingDataChunk* chunk);

 private:
  FlowPackagerCalculatorOptions options_;

  // Caching options.
  bool use_caching_ = false;   // True iff the CACHE_DIR side packet is present.
  bool build_chunk_ = false;   // True iff caching or TRACKING_CHUNK is used.
  std::string cache_dir_;      // Directory chunk files are written to.

  int chunk_idx_ = -1;         // Current chunk index; -1 until lazily set
                               // from the first frame's timestamp in Process.
  TrackingDataChunk tracking_chunk_;  // Chunk currently being assembled.

  int frame_idx_ = 0;          // Number of frames processed so far.
  Timestamp prev_timestamp_;   // Timestamp of the previously processed frame.

  std::unique_ptr<FlowPackager> flow_packager_;
};

REGISTER_CALCULATOR(FlowPackagerCalculator);
// Declares stream and side-packet types. FLOW input is mandatory; all other
// streams and the CACHE_DIR side packet are optional.
::mediapipe::Status FlowPackagerCalculator::GetContract(
    CalculatorContract* cc) {
  auto& inputs = cc->Inputs();
  auto& outputs = cc->Outputs();
  auto& side_packets = cc->InputSidePackets();

  if (!inputs.HasTag("FLOW")) {
    return tool::StatusFail("No input flow was specified.");
  }
  inputs.Tag("FLOW").Set<RegionFlowFeatureList>();

  if (inputs.HasTag("CAMERA")) {
    inputs.Tag("CAMERA").Set<CameraMotion>();
  }
  if (outputs.HasTag("TRACKING")) {
    outputs.Tag("TRACKING").Set<TrackingData>();
  }
  if (outputs.HasTag("TRACKING_CHUNK")) {
    outputs.Tag("TRACKING_CHUNK").Set<TrackingDataChunk>();
  }
  if (outputs.HasTag("COMPLETE")) {
    outputs.Tag("COMPLETE").Set<bool>();
  }
  if (side_packets.HasTag("CACHE_DIR")) {
    side_packets.Tag("CACHE_DIR").Set<std::string>();
  }
  return ::mediapipe::OkStatus();
}
// Reads options, constructs the FlowPackager and resolves caching settings.
::mediapipe::Status FlowPackagerCalculator::Open(CalculatorContext* cc) {
  options_ = cc->Options<FlowPackagerCalculatorOptions>();
  flow_packager_.reset(new FlowPackager(options_.flow_packager_options()));

  const bool has_cache_dir = cc->InputSidePackets().HasTag("CACHE_DIR");
  use_caching_ = has_cache_dir;
  if (has_cache_dir) {
    cache_dir_ = cc->InputSidePackets().Tag("CACHE_DIR").Get<std::string>();
  }
  // Chunks are only assembled when they are cached or explicitly requested.
  build_chunk_ = has_cache_dir || cc->Outputs().HasTag("TRACKING_CHUNK");
  return ::mediapipe::OkStatus();
}
// Packs the incoming flow (and optional camera motion) of the current frame
// into TrackingData, appends it to the chunk under construction when chunking
// is enabled, and emits finished chunks and per-frame tracking data.
::mediapipe::Status FlowPackagerCalculator::Process(CalculatorContext* cc) {
  InputStream* flow_stream = &(cc->Inputs().Tag("FLOW"));
  const RegionFlowFeatureList& flow = flow_stream->Get<RegionFlowFeatureList>();
  const Timestamp timestamp = flow_stream->Value().Timestamp();
  // Camera motion is optional; PackFlow is called with nullptr if absent.
  const CameraMotion* camera_motion = nullptr;
  if (cc->Inputs().HasTag("CAMERA")) {
    InputStream* camera_stream = &(cc->Inputs().Tag("CAMERA"));
    camera_motion = &camera_stream->Get<CameraMotion>();
  }
  std::unique_ptr<TrackingData> tracking_data(new TrackingData());
  flow_packager_->PackFlow(flow, camera_motion, tracking_data.get());
  if (build_chunk_) {
    if (chunk_idx_ < 0) {  // Lazy init, determine first start.
      chunk_idx_ =
          timestamp.Value() / 1000 / options_.caching_chunk_size_msec();
      tracking_chunk_.set_first_chunk(true);
    }
    CHECK_GE(chunk_idx_, 0);
    // Append this frame's tracking data as the next item of the chunk.
    TrackingDataChunk::Item* item = tracking_chunk_.add_item();
    item->set_frame_idx(frame_idx_);
    item->set_timestamp_usec(timestamp.Value());
    if (frame_idx_ > 0) {
      item->set_prev_timestamp_usec(prev_timestamp_.Value());
    }
    if (cc->Outputs().HasTag("TRACKING")) {
      // Need to copy as output is requested.
      *item->mutable_tracking_data() = *tracking_data;
    } else {
      // No per-frame output requested; move instead of copying.
      item->mutable_tracking_data()->Swap(tracking_data.get());
    }
    // Close out the chunk once the frame timestamp reaches the next chunk
    // boundary (boundaries are multiples of caching_chunk_size_msec).
    const int next_chunk_msec =
        options_.caching_chunk_size_msec() * (chunk_idx_ + 1);
    if (timestamp.Value() / 1000 >= next_chunk_msec) {
      if (cc->Outputs().HasTag("TRACKING_CHUNK")) {
        // Chunks are emitted at the timestamp of their first frame.
        cc->Outputs()
            .Tag("TRACKING_CHUNK")
            .Add(new TrackingDataChunk(tracking_chunk_),
                 Timestamp(tracking_chunk_.item(0).timestamp_usec()));
      }
      if (use_caching_) {
        WriteChunk(tracking_chunk_);
      }
      PrepareCurrentForNextChunk(&tracking_chunk_);
    }
  }
  if (cc->Outputs().HasTag("TRACKING")) {
    cc->Outputs()
        .Tag("TRACKING")
        .Add(tracking_data.release(), flow_stream->Value().Timestamp());
  }
  prev_timestamp_ = timestamp;
  ++frame_idx_;
  return ::mediapipe::OkStatus();
}
// Flushes the final (possibly partial) chunk and signals completion on the
// optional COMPLETE stream.
::mediapipe::Status FlowPackagerCalculator::Close(CalculatorContext* cc) {
  const bool processed_any_frames = frame_idx_ > 0;
  if (processed_any_frames) {
    tracking_chunk_.set_last_chunk(true);
    if (cc->Outputs().HasTag("TRACKING_CHUNK")) {
      const Timestamp chunk_start(tracking_chunk_.item(0).timestamp_usec());
      cc->Outputs()
          .Tag("TRACKING_CHUNK")
          .Add(new TrackingDataChunk(tracking_chunk_), chunk_start);
    }
    if (use_caching_) {
      WriteChunk(tracking_chunk_);
    }
  }
  if (cc->Outputs().HasTag("COMPLETE")) {
    cc->Outputs().Tag("COMPLETE").Add(new bool(true), Timestamp::PreStream());
  }
  return ::mediapipe::OkStatus();
}
void FlowPackagerCalculator::WriteChunk(const TrackingDataChunk& chunk) const {
if (chunk.item_size() == 0) {
LOG(ERROR) << "Write chunk called with empty tracking data."
<< "This can only occur if the spacing between frames "
<< "is larger than the requested chunk size. Try increasing "
<< "the chunk size";
return;
}
auto format_runtime =
absl::ParsedFormat<'d'>::New(options_.cache_file_format());
std::string chunk_file;
if (format_runtime) {
chunk_file =
cache_dir_ + "/" + absl::StrFormat(*format_runtime, chunk_idx_);
} else {
LOG(ERROR) << "chache_file_format wrong. fall back to chunk_%04d.";
chunk_file = cache_dir_ + "/" + absl::StrFormat("chunk_%04d", chunk_idx_);
}
std::string data;
chunk.SerializeToString(&data);
const char* temp_filename = tempnam(cache_dir_.c_str(), nullptr);
std::ofstream out_file(temp_filename);
if (!out_file) {
LOG(ERROR) << "Could not open " << temp_filename;
} else {
out_file.write(data.data(), data.size());
}
if (rename(temp_filename, chunk_file.c_str()) != 0) {
LOG(ERROR) << "Failed to rename to " << chunk_file;
}
LOG(INFO) << "Wrote chunk : " << chunk_file;
}
// Resets |chunk| for the next chunk while carrying its last frame over as the
// new first item (chunks overlap by one frame by design), then advances the
// chunk index.
void FlowPackagerCalculator::PrepareCurrentForNextChunk(
    TrackingDataChunk* chunk) {
  CHECK(chunk);
  const int num_items = chunk->item_size();
  if (num_items == 0) {
    LOG(ERROR) << "Called with empty chunk. Unexpected.";
    return;
  }
  chunk->set_first_chunk(false);
  // Move the final item aside, wipe the chunk, then re-insert the item as the
  // seed of the next chunk.
  TrackingDataChunk::Item carry_over;
  carry_over.Swap(chunk->mutable_item(num_items - 1));
  chunk->Clear();
  chunk->add_item()->Swap(&carry_over);
  ++chunk_idx_;
}
} // namespace mediapipe

View File

@ -0,0 +1,36 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
import "mediapipe/util/tracking/flow_packager.proto";
// Options for FlowPackagerCalculator, which packages region flow and camera
// motion into TrackingData and optionally caches chunks to disk.
message FlowPackagerCalculatorOptions {
  extend CalculatorOptions {
    optional FlowPackagerCalculatorOptions ext = 271236147;
  }

  // Options forwarded to the underlying FlowPackager.
  optional mediapipe.FlowPackagerOptions flow_packager_options = 1;

  // Chunk size for caching files that are written to the externally specified
  // caching directory. Specified in msec.
  // Note that each chunk always contains at its end the first frame of the
  // next chunk (to enable forward tracking across chunk boundaries).
  optional int32 caching_chunk_size_msec = 2 [default = 2500];

  // printf-style format used to derive the cache filename from the chunk
  // index; must contain exactly one %d placeholder.
  optional string cache_file_format = 3 [default = "chunk_%04d"];
}

View File

@ -0,0 +1,988 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <fstream>
#include <memory>
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "mediapipe/calculators/video/motion_analysis_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/image_frame.h"
#include "mediapipe/framework/formats/image_frame_opencv.h"
#include "mediapipe/framework/formats/video_stream_header.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/tracking/camera_motion.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/frame_selection.pb.h"
#include "mediapipe/util/tracking/motion_analysis.h"
#include "mediapipe/util/tracking/motion_estimation.h"
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
using mediapipe::AffineAdapter;
using mediapipe::CameraMotion;
using mediapipe::FrameSelectionResult;
using mediapipe::Homography;
using mediapipe::HomographyAdapter;
using mediapipe::LinearSimilarityModel;
using mediapipe::MixtureHomography;
using mediapipe::MixtureRowWeights;
using mediapipe::MotionAnalysis;
using mediapipe::ProjectViaFit;
using mediapipe::RegionFlowComputationOptions;
using mediapipe::RegionFlowFeatureList;
using mediapipe::SalientPointFrame;
using mediapipe::TranslationModel;
const char kOptionsTag[] = "OPTIONS";
// A calculator that performs motion analysis on an incoming video stream.
//
// Input streams: (at least one of them is required).
// VIDEO: The input video stream (ImageFrame, sRGB, sRGBA or GRAY8).
// SELECTION: Optional input stream to perform analysis only on selected
// frames. If present needs to contain camera motion
// and features.
//
// Input side packets:
// CSV_FILE: Read motion models as homographies from CSV file. Expected
// to be defined in the frame domain (un-normalized).
// Should store 9 floats per row.
// Specify number of homographies per frames via option
// meta_models_per_frame. For values > 1, MixtureHomographies
// are created, for value == 1, a single Homography is used.
// DOWNSAMPLE: Optionally specify downsampling factor via input side packet
// overriding value in the graph settings.
// Output streams (all are optional).
// FLOW: Sparse feature tracks in form of proto RegionFlowFeatureList.
// CAMERA: Camera motion as proto CameraMotion describing the per frame-
// pair motion. Has VideoHeader from input video.
// SALIENCY: Foreground saliency (objects moving different from the
// background) as proto SalientPointFrame.
// VIZ: Visualization stream as ImageFrame, sRGB, visualizing
// features and saliency (set via
// analysis_options().visualization_options())
// DENSE_FG: Dense foreground stream, describing per-pixel foreground-
// ness as confidence between 0 (background) and 255
// (foreground). Output is ImageFrame (GRAY8).
// VIDEO_OUT: Optional output stream when SELECTION is used. Output is input
// VIDEO at the selected frames. Required VIDEO to be present.
// GRAY_VIDEO_OUT: Optional output stream for downsampled, grayscale video.
// Requires VIDEO to be present and SELECTION to not be used.
// Performs motion analysis (feature tracking, camera motion estimation and
// optional saliency) on a video or pre-selected frame stream; see the stream
// documentation above for inputs and outputs.
class MotionAnalysisCalculator : public CalculatorBase {
  // TODO: Activate once leakr approval is ready.
  // typedef com::google::android::libraries::micro::proto::Data HomographyData;
 public:
  ~MotionAnalysisCalculator() override = default;

  static ::mediapipe::Status GetContract(CalculatorContract* cc);

  ::mediapipe::Status Open(CalculatorContext* cc) override;
  ::mediapipe::Status Process(CalculatorContext* cc) override;
  ::mediapipe::Status Close(CalculatorContext* cc) override;

 private:
  // Outputs results to Outputs() if MotionAnalysis buffered sufficient results.
  // Otherwise no-op. Set flush to true to force output of all buffered data.
  void OutputMotionAnalyzedFrames(bool flush, CalculatorContext* cc);

  // Lazy init function to be called on Process.
  ::mediapipe::Status InitOnProcess(InputStream* video_stream,
                                    InputStream* selection_stream);

  // Parses CSV file contents to homographies.
  bool ParseModelCSV(const std::string& contents,
                     std::deque<Homography>* homographies);

  // Turns list of 9-tuple floating values into set of homographies.
  bool HomographiesFromValues(const std::vector<float>& homog_values,
                              std::deque<Homography>* homographies);

  // Appends CameraMotions and features from homographies.
  // Set append_identity to true to add an identity transform to the beginning
  // of each list *in addition* to the motions derived from homographies.
  void AppendCameraMotionsFromHomographies(
      const std::deque<Homography>& homographies, bool append_identity,
      std::deque<CameraMotion>* camera_motions,
      std::deque<RegionFlowFeatureList>* features);

  // Helper function to subtract current metadata motion from features. Used
  // for hybrid estimation case.
  void SubtractMetaMotion(const CameraMotion& meta_motion,
                          RegionFlowFeatureList* features);

  // Inverse of above function to add back meta motion and replace
  // feature location with originals after estimation.
  void AddMetaMotion(const CameraMotion& meta_motion,
                     const RegionFlowFeatureList& meta_features,
                     RegionFlowFeatureList* features, CameraMotion* motion);

  MotionAnalysisCalculatorOptions options_;

  // Frame dimensions; -1 until initialized (from header or first packet).
  int frame_width_ = -1;
  int frame_height_ = -1;
  int frame_idx_ = 0;

  // Buffers incoming video frame packets (if visualization or VIDEO_OUT
  // output is requested).
  std::vector<Packet> packet_buffer_;

  // Buffers incoming timestamps until MotionAnalysis is ready to output via
  // above OutputMotionAnalyzedFrames.
  std::vector<Timestamp> timestamp_buffer_;

  // Input indicators for each stream.
  bool selection_input_ = false;
  bool video_input_ = false;

  // Output indicators for each stream.
  bool region_flow_feature_output_ = false;
  bool camera_motion_output_ = false;
  bool saliency_output_ = false;
  bool visualize_output_ = false;
  bool dense_foreground_output_ = false;
  bool video_output_ = false;
  bool grayscale_output_ = false;
  bool csv_file_input_ = false;

  // Indicates if saliency should be computed.
  bool with_saliency_ = false;

  // Set if hybrid meta analysis - see proto for details.
  bool hybrid_meta_analysis_ = false;

  // Concatenated motions for each selected frame. Used in case
  // hybrid estimation is requested to fallback to valid models.
  std::deque<CameraMotion> selected_motions_;

  // Normalized homographies from CSV file or metadata.
  std::deque<Homography> meta_homographies_;
  std::deque<CameraMotion> meta_motions_;
  std::deque<RegionFlowFeatureList> meta_features_;

  // Offset into above meta_motions_ and features_ when using
  // hybrid meta analysis.
  int hybrid_meta_offset_ = 0;

  std::unique_ptr<MotionAnalysis> motion_analysis_;
  std::unique_ptr<MixtureRowWeights> row_weights_;
};

REGISTER_CALCULATOR(MotionAnalysisCalculator);
// Declares stream and side-packet types. At least one of VIDEO / SELECTION
// must be connected; all outputs and side packets are optional.
::mediapipe::Status MotionAnalysisCalculator::GetContract(
    CalculatorContract* cc) {
  auto& inputs = cc->Inputs();
  auto& outputs = cc->Outputs();
  auto& side_packets = cc->InputSidePackets();

  if (inputs.HasTag("VIDEO")) {
    inputs.Tag("VIDEO").Set<ImageFrame>();
  }
  // Optional input stream from frame selection calculator.
  if (inputs.HasTag("SELECTION")) {
    inputs.Tag("SELECTION").Set<FrameSelectionResult>();
  }
  RET_CHECK(inputs.HasTag("VIDEO") || inputs.HasTag("SELECTION"))
      << "Either VIDEO, SELECTION must be specified.";

  if (outputs.HasTag("FLOW")) {
    outputs.Tag("FLOW").Set<RegionFlowFeatureList>();
  }
  if (outputs.HasTag("CAMERA")) {
    outputs.Tag("CAMERA").Set<CameraMotion>();
  }
  if (outputs.HasTag("SALIENCY")) {
    outputs.Tag("SALIENCY").Set<SalientPointFrame>();
  }
  if (outputs.HasTag("VIZ")) {
    outputs.Tag("VIZ").Set<ImageFrame>();
  }
  if (outputs.HasTag("DENSE_FG")) {
    outputs.Tag("DENSE_FG").Set<ImageFrame>();
  }
  if (outputs.HasTag("VIDEO_OUT")) {
    outputs.Tag("VIDEO_OUT").Set<ImageFrame>();
  }
  if (outputs.HasTag("GRAY_VIDEO_OUT")) {
    // We only output grayscale video if we're actually performing full region-
    // flow analysis on the video.
    RET_CHECK(inputs.HasTag("VIDEO") && !inputs.HasTag("SELECTION"));
    outputs.Tag("GRAY_VIDEO_OUT").Set<ImageFrame>();
  }

  if (side_packets.HasTag("CSV_FILE")) {
    side_packets.Tag("CSV_FILE").Set<std::string>();
  }
  if (side_packets.HasTag("DOWNSAMPLE")) {
    side_packets.Tag("DOWNSAMPLE").Set<float>();
  }
  if (side_packets.HasTag(kOptionsTag)) {
    side_packets.Tag(kOptionsTag).Set<CalculatorOptions>();
  }
  return ::mediapipe::OkStatus();
}
// Reads options, validates the requested stream combination, loads optional
// CSV metadata, and forwards the video header to all output streams.
::mediapipe::Status MotionAnalysisCalculator::Open(CalculatorContext* cc) {
  options_ =
      tool::RetrieveOptions(cc->Options<MotionAnalysisCalculatorOptions>(),
                            cc->InputSidePackets(), kOptionsTag);
  // Record which streams / side packets are connected.
  video_input_ = cc->Inputs().HasTag("VIDEO");
  selection_input_ = cc->Inputs().HasTag("SELECTION");
  region_flow_feature_output_ = cc->Outputs().HasTag("FLOW");
  camera_motion_output_ = cc->Outputs().HasTag("CAMERA");
  saliency_output_ = cc->Outputs().HasTag("SALIENCY");
  visualize_output_ = cc->Outputs().HasTag("VIZ");
  dense_foreground_output_ = cc->Outputs().HasTag("DENSE_FG");
  video_output_ = cc->Outputs().HasTag("VIDEO_OUT");
  grayscale_output_ = cc->Outputs().HasTag("GRAY_VIDEO_OUT");
  csv_file_input_ = cc->InputSidePackets().HasTag("CSV_FILE");
  hybrid_meta_analysis_ = options_.meta_analysis() ==
                          MotionAnalysisCalculatorOptions::META_ANALYSIS_HYBRID;
  if (video_output_) {
    RET_CHECK(selection_input_) << "VIDEO_OUT requires SELECTION input";
  }
  // Validate requested outputs against the selection analysis mode.
  if (selection_input_) {
    switch (options_.selection_analysis()) {
      case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION:
        RET_CHECK(!visualize_output_)
            << "Visualization not supported for NO_ANALYSIS_USE_SELECTION";
        RET_CHECK(!dense_foreground_output_)
            << "Dense foreground not supported for NO_ANALYSIS_USE_SELECTION";
        RET_CHECK(!saliency_output_)
            << "Saliency output not supported for NO_ANALYSIS_USE_SELECTION";
        break;
      case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE:
      case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED:
        RET_CHECK(video_input_) << "Need video input for feature tracking.";
        break;
      case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES:
        // Nothing to add here.
        break;
    }
  }
  if (visualize_output_ || dense_foreground_output_ || video_output_) {
    RET_CHECK(video_input_) << "Video input required.";
  }
  if (csv_file_input_) {
    RET_CHECK(!selection_input_)
        << "Can not use selection input with csv input.";
    if (!hybrid_meta_analysis_) {
      RET_CHECK(!saliency_output_ && !visualize_output_ &&
                !dense_foreground_output_ && !grayscale_output_)
          << "CSV file and meta input only supports flow and camera motion "
          << "output when using metadata only.";
    }
  }
  if (csv_file_input_) {
    // Read from file and parse.
    const std::string filename =
        cc->InputSidePackets().Tag("CSV_FILE").Get<std::string>();
    std::ifstream input_file(filename, std::ios::in);
    // Fail with a clear status instead of reading from a bad stream: on a
    // missing file tellg() returns -1 and the resize below would throw.
    RET_CHECK(input_file.is_open()) << "Could not open CSV file: " << filename;
    input_file.seekg(0, std::ios::end);
    const std::streamoff file_length = input_file.tellg();
    RET_CHECK(file_length >= 0) << "Could not read CSV file: " << filename;
    std::string file_contents;
    file_contents.resize(file_length);
    input_file.seekg(0, std::ios::beg);
    input_file.read(&file_contents[0], file_length);
    input_file.close();
    RET_CHECK(ParseModelCSV(file_contents, &meta_homographies_))
        << "Could not parse CSV file";
  }
  // Get video header from video or selection input if present.
  const VideoHeader* video_header = nullptr;
  if (video_input_ && !cc->Inputs().Tag("VIDEO").Header().IsEmpty()) {
    video_header = &(cc->Inputs().Tag("VIDEO").Header().Get<VideoHeader>());
  } else if (selection_input_ &&
             !cc->Inputs().Tag("SELECTION").Header().IsEmpty()) {
    video_header = &(cc->Inputs().Tag("SELECTION").Header().Get<VideoHeader>());
  } else {
    LOG(WARNING) << "No input video header found. Downstream calculators "
                    "expecting video headers are likely to fail.";
  }
  with_saliency_ = options_.analysis_options().compute_motion_saliency();
  // Force computation of saliency if requested as output.
  if (cc->Outputs().HasTag("SALIENCY")) {
    with_saliency_ = true;
    if (!options_.analysis_options().compute_motion_saliency()) {
      LOG(WARNING) << "Enable saliency computation. Set "
                   << "compute_motion_saliency to true to silence this "
                   << "warning.";
      options_.mutable_analysis_options()->set_compute_motion_saliency(true);
    }
  }
  if (options_.bypass_mode()) {
    cc->SetOffset(TimestampDiff(0));
  }
  // Input side packet overrides the downsample factor from the graph config.
  if (cc->InputSidePackets().HasTag("DOWNSAMPLE")) {
    options_.mutable_analysis_options()
        ->mutable_flow_options()
        ->set_downsample_factor(
            cc->InputSidePackets().Tag("DOWNSAMPLE").Get<float>());
  }
  // If no video header is provided, just return and initialize on the first
  // Process() call.
  if (video_header == nullptr) {
    return ::mediapipe::OkStatus();
  }
  ////////////// EARLY RETURN; ONLY HEADER OUTPUT SHOULD GO HERE ///////////////
  if (visualize_output_) {
    cc->Outputs().Tag("VIZ").SetHeader(Adopt(new VideoHeader(*video_header)));
  }
  if (video_output_) {
    cc->Outputs()
        .Tag("VIDEO_OUT")
        .SetHeader(Adopt(new VideoHeader(*video_header)));
  }
  if (cc->Outputs().HasTag("DENSE_FG")) {
    // Foreground masks are single channel.
    std::unique_ptr<VideoHeader> foreground_header(
        new VideoHeader(*video_header));
    foreground_header->format = ImageFormat::GRAY8;
    cc->Outputs().Tag("DENSE_FG").SetHeader(Adopt(foreground_header.release()));
  }
  if (cc->Outputs().HasTag("CAMERA")) {
    cc->Outputs().Tag("CAMERA").SetHeader(
        Adopt(new VideoHeader(*video_header)));
  }
  if (cc->Outputs().HasTag("SALIENCY")) {
    cc->Outputs()
        .Tag("SALIENCY")
        .SetHeader(Adopt(new VideoHeader(*video_header)));
  }
  return ::mediapipe::OkStatus();
}
// Per-frame entry point: either echoes precomputed metadata (CSV-only mode),
// passes through selection results (NO_ANALYSIS_USE_SELECTION), or feeds the
// frame/features into MotionAnalysis and outputs any completed results.
::mediapipe::Status MotionAnalysisCalculator::Process(CalculatorContext* cc) {
  if (options_.bypass_mode()) {
    return ::mediapipe::OkStatus();
  }
  InputStream* video_stream =
      video_input_ ? &(cc->Inputs().Tag("VIDEO")) : nullptr;
  InputStream* selection_stream =
      selection_input_ ? &(cc->Inputs().Tag("SELECTION")) : nullptr;
  // Checked on Open.
  CHECK(video_stream || selection_stream);
  // Lazy init.
  if (frame_width_ < 0 || frame_height_ < 0) {
    MP_RETURN_IF_ERROR(InitOnProcess(video_stream, selection_stream));
  }
  const Timestamp timestamp = cc->InputTimestamp();
  // Metadata-only mode: pop the precomputed motion/features parsed from the
  // CSV file and emit them directly; no analysis is performed.
  if ((csv_file_input_) && !hybrid_meta_analysis_) {
    if (camera_motion_output_) {
      RET_CHECK(!meta_motions_.empty()) << "Insufficient metadata.";
      CameraMotion output_motion = meta_motions_.front();
      meta_motions_.pop_front();
      output_motion.set_timestamp_usec(timestamp.Value());
      cc->Outputs().Tag("CAMERA").Add(new CameraMotion(output_motion),
                                      timestamp);
    }
    if (region_flow_feature_output_) {
      RET_CHECK(!meta_features_.empty()) << "Insufficient frames in CSV file";
      RegionFlowFeatureList output_features = meta_features_.front();
      meta_features_.pop_front();
      output_features.set_timestamp_usec(timestamp.Value());
      cc->Outputs().Tag("FLOW").Add(new RegionFlowFeatureList(output_features),
                                    timestamp);
    }
    ++frame_idx_;
    return ::mediapipe::OkStatus();
  }
  if (motion_analysis_ == nullptr) {
    // We do not need MotionAnalysis when using just metadata.
    motion_analysis_.reset(new MotionAnalysis(options_.analysis_options(),
                                              frame_width_, frame_height_));
  }
  std::unique_ptr<FrameSelectionResult> frame_selection_result;
  // Always use frame if selection is not activated.
  bool use_frame = !selection_input_;
  if (selection_input_) {
    CHECK(selection_stream);
    // Fill in timestamps we process.
    if (!selection_stream->Value().IsEmpty()) {
      ASSIGN_OR_RETURN(
          frame_selection_result,
          selection_stream->Value().ConsumeOrCopy<FrameSelectionResult>());
      use_frame = true;
      // Make sure both features and camera motion are present.
      RET_CHECK(frame_selection_result->has_camera_motion() &&
                frame_selection_result->has_features())
          << "Frame selection input error at: " << timestamp
          << " both camera motion and features need to be "
             "present in FrameSelectionResult. "
          << frame_selection_result->has_camera_motion() << " , "
          << frame_selection_result->has_features();
    }
  }
  // Pass-through mode: forward the selection results without re-analysis.
  if (selection_input_ && use_frame &&
      options_.selection_analysis() ==
          MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION) {
    // Output concatenated results, nothing to compute here.
    if (camera_motion_output_) {
      cc->Outputs().Tag("CAMERA").Add(
          frame_selection_result->release_camera_motion(), timestamp);
    }
    if (region_flow_feature_output_) {
      cc->Outputs().Tag("FLOW").Add(frame_selection_result->release_features(),
                                    timestamp);
    }
    if (video_output_) {
      cc->Outputs().Tag("VIDEO_OUT").AddPacket(video_stream->Value());
    }
    return ::mediapipe::OkStatus();
  }
  if (use_frame) {
    if (!selection_input_) {
      const cv::Mat input_view =
          formats::MatView(&video_stream->Get<ImageFrame>());
      if (hybrid_meta_analysis_) {
        // Seed with meta homography.
        RET_CHECK(hybrid_meta_offset_ < meta_motions_.size())
            << "Not enough metadata received for hybrid meta analysis";
        Homography initial_transform =
            meta_motions_[hybrid_meta_offset_].homography();
        // Helper removes the metadata motion from tracked features before
        // estimation; AddMetaMotion restores it on output.
        std::function<void(RegionFlowFeatureList*)> subtract_helper = std::bind(
            &MotionAnalysisCalculator::SubtractMetaMotion, this,
            meta_motions_[hybrid_meta_offset_], std::placeholders::_1);
        // Keep original features before modification around.
        motion_analysis_->AddFrameGeneric(
            input_view, timestamp.Value(), initial_transform, nullptr, nullptr,
            &subtract_helper, &meta_features_[hybrid_meta_offset_]);
        ++hybrid_meta_offset_;
      } else {
        motion_analysis_->AddFrame(input_view, timestamp.Value());
      }
    } else {
      selected_motions_.push_back(frame_selection_result->camera_motion());
      switch (options_.selection_analysis()) {
        case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION:
          // Handled by the early return above.
          return ::mediapipe::UnknownErrorBuilder(MEDIAPIPE_LOC)
                 << "Should not reach this point!";
        case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES:
          motion_analysis_->AddFeatures(frame_selection_result->features());
          break;
        case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE: {
          const cv::Mat input_view =
              formats::MatView(&video_stream->Get<ImageFrame>());
          motion_analysis_->AddFrame(input_view, timestamp.Value());
          break;
        }
        case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED: {
          Homography homography;
          CameraMotionToHomography(frame_selection_result->camera_motion(),
                                   &homography);
          const cv::Mat input_view =
              formats::MatView(&video_stream->Get<ImageFrame>());
          motion_analysis_->AddFrameGeneric(input_view, timestamp.Value(),
                                            homography, &homography);
          break;
        }
      }
    }
    timestamp_buffer_.push_back(timestamp);
    ++frame_idx_;
    VLOG_EVERY_N(0, 100) << "Analyzed frame " << frame_idx_;
    // Buffer input frames only if visualization is requested.
    if (visualize_output_ || video_output_) {
      packet_buffer_.push_back(video_stream->Value());
    }
    // If requested, output grayscale thumbnails
    if (grayscale_output_) {
      cv::Mat grayscale_mat = motion_analysis_->GetGrayscaleFrameFromResults();
      std::unique_ptr<ImageFrame> grayscale_image(new ImageFrame(
          ImageFormat::GRAY8, grayscale_mat.cols, grayscale_mat.rows));
      cv::Mat image_frame_mat = formats::MatView(grayscale_image.get());
      grayscale_mat.copyTo(image_frame_mat);
      cc->Outputs()
          .Tag("GRAY_VIDEO_OUT")
          .Add(grayscale_image.release(), timestamp);
    }
    // Output other results, if we have any yet.
    OutputMotionAnalyzedFrames(false, cc);
  }
  return ::mediapipe::OkStatus();
}
// Flushes all remaining buffered results and sanity-checks that the CSV
// metadata was fully consumed.
::mediapipe::Status MotionAnalysisCalculator::Close(CalculatorContext* cc) {
  // Guard against empty videos (analysis never initialized).
  if (motion_analysis_ != nullptr) {
    OutputMotionAnalyzedFrames(true, cc);
  }
  if (csv_file_input_ && !meta_motions_.empty()) {
    LOG(ERROR) << "More motions than frames. Unexpected! Remainder: "
               << meta_motions_.size();
  }
  return ::mediapipe::OkStatus();
}
// Drains completed per-frame results (features, camera motion, saliency)
// from MotionAnalysis and forwards them to the requested output streams,
// then trims the timestamp/packet buffers accordingly. Set flush to true to
// force out all remaining buffered frames (used on Close).
void MotionAnalysisCalculator::OutputMotionAnalyzedFrames(
    bool flush, CalculatorContext* cc) {
  std::vector<std::unique_ptr<RegionFlowFeatureList>> features;
  std::vector<std::unique_ptr<CameraMotion>> camera_motions;
  std::vector<std::unique_ptr<SalientPointFrame>> saliency;
  const int buffer_size = timestamp_buffer_.size();
  const int num_results = motion_analysis_->GetResults(
      flush, &features, &camera_motions, with_saliency_ ? &saliency : nullptr);
  CHECK_LE(num_results, buffer_size);
  if (num_results == 0) {
    return;
  }
  for (int k = 0; k < num_results; ++k) {
    // Region flow features and camera motion for this frame.
    auto& feature_list = features[k];
    auto& camera_motion = camera_motions[k];
    const Timestamp timestamp = timestamp_buffer_[k];
    // Prefer the more stable of recomputed vs. selection-provided motion.
    if (selection_input_ && options_.hybrid_selection_camera()) {
      if (camera_motion->type() > selected_motions_.front().type()) {
        // Composited type is more stable.
        camera_motion->Swap(&selected_motions_.front());
      }
      selected_motions_.pop_front();
    }
    if (hybrid_meta_analysis_) {
      // Add back the metadata motion that was subtracted before estimation.
      AddMetaMotion(meta_motions_.front(), meta_features_.front(),
                    feature_list.get(), camera_motion.get());
      meta_motions_.pop_front();
      meta_features_.pop_front();
    }
    // Video frame for visualization.
    std::unique_ptr<ImageFrame> visualization_frame;
    cv::Mat visualization;
    if (visualize_output_) {
      // Initialize visualization frame with original frame.
      visualization_frame.reset(new ImageFrame());
      visualization_frame->CopyFrom(packet_buffer_[k].Get<ImageFrame>(), 16);
      visualization = formats::MatView(visualization_frame.get());
      motion_analysis_->RenderResults(
          *feature_list, *camera_motion,
          with_saliency_ ? saliency[k].get() : nullptr, &visualization);
      cc->Outputs().Tag("VIZ").Add(visualization_frame.release(), timestamp);
    }
    // Output dense foreground mask.
    if (dense_foreground_output_) {
      std::unique_ptr<ImageFrame> foreground_frame(
          new ImageFrame(ImageFormat::GRAY8, frame_width_, frame_height_));
      cv::Mat foreground = formats::MatView(foreground_frame.get());
      motion_analysis_->ComputeDenseForeground(*feature_list, *camera_motion,
                                               &foreground);
      cc->Outputs().Tag("DENSE_FG").Add(foreground_frame.release(), timestamp);
    }
    // Output flow features if requested.
    if (region_flow_feature_output_) {
      cc->Outputs().Tag("FLOW").Add(feature_list.release(), timestamp);
    }
    // Output camera motion.
    if (camera_motion_output_) {
      cc->Outputs().Tag("CAMERA").Add(camera_motion.release(), timestamp);
    }
    if (video_output_) {
      cc->Outputs().Tag("VIDEO_OUT").AddPacket(packet_buffer_[k]);
    }
    // Output saliency.
    if (saliency_output_) {
      cc->Outputs().Tag("SALIENCY").Add(saliency[k].release(), timestamp);
    }
  }
  if (hybrid_meta_analysis_) {
    // Consumed results shift the pending-metadata offset back.
    hybrid_meta_offset_ -= num_results;
    CHECK_GE(hybrid_meta_offset_, 0);
  }
  // Drop the consumed entries from the buffers.
  timestamp_buffer_.erase(timestamp_buffer_.begin(),
                          timestamp_buffer_.begin() + num_results);
  if (visualize_output_ || video_output_) {
    packet_buffer_.erase(packet_buffer_.begin(),
                         packet_buffer_.begin() + num_results);
  }
}
// Performs lazy initialization on the first Process() call.
//
// Exactly one of |video_stream| or |selection_stream| must be non-null:
//  - With VIDEO input, the analysis frame dimensions are taken from the first
//    ImageFrame, and the RegionFlowComputation image format is overridden to
//    match the actual channel layout of the stream if the configured format
//    disagrees with it.
//  - With SELECTION input, dimensions are taken from the CameraMotion stored
//    in the incoming FrameSelectionResult.
// Additionally converts any homographies parsed from CSV metadata into
// CameraMotions, and enables IRLS weight filtering for hybrid meta analysis.
::mediapipe::Status MotionAnalysisCalculator::InitOnProcess(
    InputStream* video_stream, InputStream* selection_stream) {
  if (video_stream) {
    frame_width_ = video_stream->Get<ImageFrame>().Width();
    frame_height_ = video_stream->Get<ImageFrame>().Height();

    // Ensure image options are set correctly.
    auto* region_options =
        options_.mutable_analysis_options()->mutable_flow_options();

    // Use two possible formats to account for different channel orders.
    RegionFlowComputationOptions::ImageFormat image_format;
    RegionFlowComputationOptions::ImageFormat image_format2;
    switch (video_stream->Get<ImageFrame>().Format()) {
      case ImageFormat::GRAY8:
        image_format = image_format2 =
            RegionFlowComputationOptions::FORMAT_GRAYSCALE;
        break;
      case ImageFormat::SRGB:
        image_format = RegionFlowComputationOptions::FORMAT_RGB;
        image_format2 = RegionFlowComputationOptions::FORMAT_BGR;
        break;
      case ImageFormat::SRGBA:
        image_format = RegionFlowComputationOptions::FORMAT_RGBA;
        image_format2 = RegionFlowComputationOptions::FORMAT_BGRA;
        break;
      default:
        RET_CHECK(false) << "Unsupported image format.";
    }
    if (region_options->image_format() != image_format &&
        region_options->image_format() != image_format2) {
      LOG(WARNING) << "Requested image format in RegionFlowComputation "
                   << "does not match video stream format. Overriding.";
      region_options->set_image_format(image_format);
    }

    // Account for downsampling mode INPUT_SIZE. In this case we are handed
    // already downsampled frames but the resulting CameraMotion should
    // be computed on higher resolution as specified by the downsample scale.
    if (region_options->downsample_mode() ==
        RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) {
      const float scale = region_options->downsample_factor();
      frame_width_ = static_cast<int>(std::round(frame_width_ * scale));
      frame_height_ = static_cast<int>(std::round(frame_height_ * scale));
    }
  } else if (selection_stream) {
    const auto& camera_motion =
        selection_stream->Get<FrameSelectionResult>().camera_motion();
    frame_width_ = camera_motion.frame_width();
    frame_height_ = camera_motion.frame_height();
  } else {
    LOG(FATAL) << "Either VIDEO or SELECTION stream need to be specified.";
  }

  // Filled by CSV file parsing.
  if (!meta_homographies_.empty()) {
    CHECK(csv_file_input_);
    AppendCameraMotionsFromHomographies(meta_homographies_,
                                        true,  // append identity.
                                        &meta_motions_, &meta_features_);
    meta_homographies_.clear();
  }

  // Filter weights before using for hybrid mode.
  if (hybrid_meta_analysis_) {
    auto* motion_options =
        options_.mutable_analysis_options()->mutable_motion_options();
    motion_options->set_filter_initialized_irls_weights(true);
  }

  return ::mediapipe::OkStatus();
}
bool MotionAnalysisCalculator::ParseModelCSV(
const std::string& contents, std::deque<Homography>* homographies) {
std::vector<absl::string_view> values =
absl::StrSplit(contents, absl::ByAnyChar(",\n"));
// Trim off any empty lines.
while (values.back().empty()) {
values.pop_back();
}
// Convert to float.
std::vector<float> homog_values;
homog_values.reserve(values.size());
for (const auto& value : values) {
double value_64f;
if (!absl::SimpleAtod(value, &value_64f)) {
LOG(ERROR) << "Not a double, expected!";
return false;
}
homog_values.push_back(value_64f);
}
return HomographiesFromValues(homog_values, homographies);
}
bool MotionAnalysisCalculator::HomographiesFromValues(
const std::vector<float>& homog_values,
std::deque<Homography>* homographies) {
CHECK(homographies);
// Obvious constants are obvious :D
constexpr int kHomographyValues = 9;
if (homog_values.size() % kHomographyValues != 0) {
LOG(ERROR) << "Contents not a multiple of " << kHomographyValues;
return false;
}
for (int k = 0; k < homog_values.size(); k += kHomographyValues) {
std::vector<double> h_vals(kHomographyValues);
for (int l = 0; l < kHomographyValues; ++l) {
h_vals[l] = homog_values[k + l];
}
// Normalize last entry to 1.
if (h_vals[kHomographyValues - 1] == 0) {
LOG(ERROR) << "Degenerate homography, last entry is zero";
return false;
}
const double scale = 1.0f / h_vals[kHomographyValues - 1];
for (int l = 0; l < kHomographyValues; ++l) {
h_vals[l] *= scale;
}
Homography h = HomographyAdapter::FromDoublePointer(h_vals.data(), false);
homographies->push_back(h);
}
if (homographies->size() % options_.meta_models_per_frame() != 0) {
LOG(ERROR) << "Total homographies not a multiple of specified models "
<< "per frame.";
return false;
}
return true;
}
// Subtracts the metadata-supplied camera motion |meta_motion| from the flow
// of every feature in |features| (replacing feature locations with their
// transformed positions), then clamps the transformed features back to the
// frame domain and zeroes the IRLS weight of features whose residual flow is
// too large to be explained by the metadata motion (outliers).
void MotionAnalysisCalculator::SubtractMetaMotion(
    const CameraMotion& meta_motion, RegionFlowFeatureList* features) {
  // Prefer the mixture homography (per-row models, e.g. rolling shutter)
  // when present; it requires the precomputed row weights.
  if (meta_motion.mixture_homography().model_size() > 0) {
    CHECK(row_weights_ != nullptr);
    RegionFlowFeatureListViaTransform(meta_motion.mixture_homography(),
                                      features, -1.0f,
                                      1.0f,   // subtract transformed.
                                      true,   // replace feature loc.
                                      row_weights_.get());
  } else {
    RegionFlowFeatureListViaTransform(meta_motion.homography(), features, -1.0f,
                                      1.0f,   // subtract transformed.
                                      true);  // replace feature loc.
  }

  // Clamp transformed features to domain and handle outliers.
  const float domain_diam =
      hypot(features->frame_width(), features->frame_height());
  const float motion_mag = meta_motion.average_magnitude();

  // Same irls fraction as used by MODEL_MIXTURE_HOMOGRAPHY scaling in
  // MotionEstimation.
  const float irls_fraction = options_.analysis_options()
                                  .motion_options()
                                  .irls_mixture_fraction_scale() *
                              options_.analysis_options()
                                  .motion_options()
                                  .irls_motion_magnitude_fraction();
  // Allow larger residuals when the metadata motion itself is large.
  float err_scale = std::max(1.0f, motion_mag * irls_fraction);
  const float max_err =
      options_.meta_outlier_domain_ratio() * domain_diam * err_scale;
  const float max_err_sq = max_err * max_err;

  for (auto& feature : *features->mutable_feature()) {
    feature.set_x(
        std::max(0.0f, std::min(features->frame_width() - 1.0f, feature.x())));
    feature.set_y(
        std::max(0.0f, std::min(features->frame_height() - 1.0f, feature.y())));

    // Label anything with large residual motion an outlier.
    if (FeatureFlow(feature).Norm2() > max_err_sq) {
      feature.set_irls_weight(0.0f);
    }
  }
}
// Restores the original feature geometry from |meta_features| into
// |features| and composes the metadata motion back onto |motion| after
// residual estimation.
void MotionAnalysisCalculator::AddMetaMotion(
    const CameraMotion& meta_motion, const RegionFlowFeatureList& meta_features,
    RegionFlowFeatureList* features, CameraMotion* motion) {
  // Restore old feature location.
  CHECK_EQ(meta_features.feature_size(), features->feature_size());
  const int num_features = meta_features.feature_size();
  for (int idx = 0; idx < num_features; ++idx) {
    const auto& source = meta_features.feature(idx);
    auto* target = features->mutable_feature(idx);
    target->set_x(source.x());
    target->set_y(source.y());
    target->set_dx(source.dx());
    target->set_dy(source.dy());
  }

  // Composite camera motion.
  *motion = ComposeCameraMotion(*motion, meta_motion);

  // Restore type from metadata, i.e. do not declare motions as invalid.
  motion->set_type(meta_motion.type());
  motion->set_match_frame(-1);
}
// Converts metadata homographies into CameraMotions and placeholder feature
// lists, appending one entry per frame to |camera_motions| and |features|.
//
// Consecutive groups of options_.meta_models_per_frame() homographies form
// one frame; each model is inverted (ModelInvert) before being stored.
// Groups with more than one model become a MixtureHomography. If
// |append_identity| is set, an identity motion (with an empty feature list)
// is prepended for the first frame.
void MotionAnalysisCalculator::AppendCameraMotionsFromHomographies(
    const std::deque<Homography>& homographies, bool append_identity,
    std::deque<CameraMotion>* camera_motions,
    std::deque<RegionFlowFeatureList>* features) {
  CHECK(camera_motions);
  CHECK(features);

  // Template identity motion at the current frame dimensions.
  CameraMotion identity;
  identity.set_frame_width(frame_width_);
  identity.set_frame_height(frame_height_);
  *identity.mutable_translation() = TranslationModel();
  *identity.mutable_linear_similarity() = LinearSimilarityModel();
  *identity.mutable_homography() = Homography();
  identity.set_type(CameraMotion::VALID);
  identity.set_match_frame(0);

  // Placeholder feature list; no visual features were actually tracked.
  RegionFlowFeatureList empty_list;
  empty_list.set_long_tracks(true);
  empty_list.set_match_frame(-1);
  empty_list.set_frame_width(frame_width_);
  empty_list.set_frame_height(frame_height_);

  if (append_identity) {
    camera_motions->push_back(identity);
    features->push_back(empty_list);
  }

  const int models_per_frame = options_.meta_models_per_frame();
  CHECK_GT(models_per_frame, 0) << "At least one model per frame is needed";
  CHECK_EQ(0, homographies.size() % models_per_frame);
  const int num_frames = homographies.size() / models_per_frame;

  // Heuristic sigma, similar to what we use for rolling shutter removal.
  const float mixture_sigma = 1.0f / models_per_frame;
  if (row_weights_ == nullptr) {
    row_weights_.reset(new MixtureRowWeights(frame_height_,
                                             frame_height_ / 10,  // 10% margin
                                             mixture_sigma * frame_height_,
                                             1.0f, models_per_frame));
  }

  for (int f = 0; f < num_frames; ++f) {
    // Gather and invert this frame's models.
    MixtureHomography mix_homog;
    const int model_start = f * models_per_frame;
    for (int k = 0; k < models_per_frame; ++k) {
      const Homography& homog = homographies[model_start + k];
      *mix_homog.add_model() = ModelInvert(homog);
    }

    CameraMotion c = identity;
    c.set_match_frame(-1);

    if (mix_homog.model_size() > 1) {
      *c.mutable_mixture_homography() = mix_homog;
      c.set_mixture_row_sigma(mixture_sigma);
      for (int k = 0; k < models_per_frame; ++k) {
        c.add_mixture_inlier_coverage(1.0f);
      }
      *c.add_mixture_homography_spectrum() = mix_homog;
      c.set_rolling_shutter_motion_index(0);
      // Project the mixture down to a single homography.
      *c.mutable_homography() = ProjectViaFit<Homography>(
          mix_homog, frame_width_, frame_height_, row_weights_.get());
    } else {
      // Guaranteed to exist because we checked models_per_frame > 0 above.
      *c.mutable_homography() = mix_homog.model(0);
    }

    // Project remaining motions down.
    *c.mutable_linear_similarity() = ProjectViaFit<LinearSimilarityModel>(
        c.homography(), frame_width_, frame_height_);
    *c.mutable_translation() = ProjectViaFit<TranslationModel>(
        c.homography(), frame_width_, frame_height_);
    c.set_average_magnitude(
        std::hypot(c.translation().dx(), c.translation().dy()));

    camera_motions->push_back(c);
    features->push_back(empty_list);
  }
}
} // namespace mediapipe

View File

@ -0,0 +1,111 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
import "mediapipe/util/tracking/motion_analysis.proto";
// Options for MotionAnalysisCalculator.
// Next tag: 10
message MotionAnalysisCalculatorOptions {
  extend CalculatorOptions {
    optional MotionAnalysisCalculatorOptions ext = 270698255;
  }

  // Options forwarded to the underlying motion analysis pipeline.
  optional mediapipe.MotionAnalysisOptions analysis_options = 1;

  // Determines how optional input SELECTION (if present) is used to compute
  // the final camera motion.
  enum SelectionAnalysis {
    // Recompute camera motion for selected frame neighbors.
    ANALYSIS_RECOMPUTE = 1;

    // Use composited camera motion and region flow from SELECTION input. No
    // tracking or re-computation is performed.
    // Note that in this case only CAMERA, FLOW and VIDEO_OUT tags are
    // supported as output.
    NO_ANALYSIS_USE_SELECTION = 2;

    // Recompute camera motion for selected frame neighbors using
    // features supplied by SELECTION input. No feature tracking is performed.
    ANALYSIS_FROM_FEATURES = 3;

    // Recomputes camera motion for selected frame neighbors but seeds
    // initial transform with camera motion from SELECTION input.
    ANALYSIS_WITH_SEED = 4;
  }
  optional SelectionAnalysis selection_analysis = 4
      [default = ANALYSIS_WITH_SEED];

  // If activated when SELECTION input is activated, will replace the computed
  // camera motion (for any of the ANALYSIS_* case above) with the one supplied
  // by the frame selection, in case the frame selection one is more stable.
  // For example, if recomputed camera motion is unstable but the one from
  // the selection result is stable, will use the stable result instead.
  optional bool hybrid_selection_camera = 5 [default = false];

  // Determines how optional input META is used to compute the final camera
  // motion.
  enum MetaAnalysis {
    // Uses metadata supplied motions as is.
    META_ANALYSIS_USE_META = 1;

    // Seeds visual tracking from metadata motions - estimates visual residual
    // motion and combines with metadata.
    META_ANALYSIS_HYBRID = 2;
  }
  optional MetaAnalysis meta_analysis = 8 [default = META_ANALYSIS_USE_META];

  // Determines number of homography models per frame stored in the CSV file
  // or the homography metadata in META.
  // For values > 1, MixtureHomographies are created.
  optional int32 meta_models_per_frame = 6 [default = 1];

  // Used for META_ANALYSIS_HYBRID. Rejects features whose flow deviates by
  // more than domain_ratio * image diagonal size from the ground truth
  // metadata motion.
  optional float meta_outlier_domain_ratio = 9 [default = 0.0015];

  // If true, the MotionAnalysisCalculator will skip all processing and emit no
  // packets on any output. This is useful for quickly creating different
  // versions of a MediaPipe graph without changing its structure, assuming that
  // downstream calculators can handle missing input packets.
  // TODO: Remove this hack. See b/36485206 for more details.
  optional bool bypass_mode = 7 [default = false];
}
// Taken from
// java/com/google/android/libraries/microvideo/proto/microvideo.proto to
// satisfy leakr requirements
// TODO: Remove and use above proto.
// Per-frame motion metadata (homographies and patch histograms) recorded at
// capture time.
message HomographyData {
  // For each frame, there are 12 homography matrices stored. Each matrix is
  // 3x3 (9 elements), so this field contains 12 x 3 x 3 float values,
  // serialized row-major: the first row of the first homography matrix is
  // followed by the second row of the first homography matrix, followed by
  // the third row of the first homography matrix, followed by the first row
  // of the second homography matrix, etc.
  repeated float motion_homography_data = 1 [packed = true];

  // Vector containing histogram counts for individual patches in the frame.
  repeated uint32 histogram_count_data = 2 [packed = true];

  // The width of the frame at the time metadata was sampled.
  optional int32 frame_width = 3;

  // The height of the frame at the time metadata was sampled.
  optional int32 frame_height = 4;
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 247 KiB

View File

@ -0,0 +1,134 @@
# Desktop object-tracking graph: runs motion analysis on raw frames, packages
# camera motion and region flow into tracking data, and feeds two
# BoxTrackerCalculator instances -- one driven by START_POS box requests and
# one by random-access RA_TRACK queries.
input_stream: "image_cpu_frames"
input_stream: "start_pos"
input_stream: "ra_track"
num_threads: 4

# Computes per-frame camera motion and region flow features from the video.
node: {
  calculator: "MotionAnalysisCalculator"
  input_stream: "VIDEO:image_cpu_frames"
  output_stream: "CAMERA:camera_motion"
  output_stream: "FLOW:region_flow"
  options: {
    [mediapipe.MotionAnalysisCalculatorOptions.ext]: {
      analysis_options: {
        analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
        flow_options: {
          # Minimum block size for fast estimation; could potentially be
          # lowered to 50.
          fast_estimation_min_block_size: 100
          top_inlier_sets: 1
          frac_inlier_error_threshold: 3e-3
          downsample_mode: DOWNSAMPLE_NONE
          verification_distance: 5.0
          verify_long_feature_acceleration: true
          verify_long_feature_trigger_ratio: 0.1
          tracking_options: {
            max_features: 500
            adaptive_extraction_levels: 2
            min_eig_val_settings: {
              adaptive_lowest_quality_level: 2e-4
            }
            klt_tracker_implementation: KLT_OPENCV
          }
        }
        motion_options: {
          label_empty_frames_as_valid: false
        }
      }
    }
  }
}

# Condenses region flow and camera motion into compact TrackingData.
node: {
  calculator: "FlowPackagerCalculator"
  input_stream: "FLOW:region_flow"
  input_stream: "CAMERA:camera_motion"
  output_stream: "TRACKING:tracking_data"
  options: {
    [mediapipe.FlowPackagerCalculatorOptions.ext]: {
      flow_packager_options: {
        binary_tracking_data_support: false
      }
    }
  }
}

# Tracks boxes requested via START_POS. TRACKING and START_POS are placed in
# separate sync sets so tracking data and box requests are delivered
# independently.
node: {
  calculator: "BoxTrackerCalculator"
  input_stream: "TRACKING:tracking_data"
  input_stream: "START_POS:start_pos"
  output_stream: "BOXES:boxes"
  input_side_packet: "OPTIONS:calculator_options"
  input_stream_handler: {
    input_stream_handler: "SyncSetInputStreamHandler"
    options: {
      [mediapipe.SyncSetInputStreamHandlerOptions.ext]: {
        sync_set: {
          tag_index: "TRACKING"
        }
        sync_set: {
          tag_index: "START_POS"
        }
      }
    }
  }
  options: {
    [mediapipe.BoxTrackerCalculatorOptions.ext]: {
      tracker_options: {
        track_step_options: {
          track_object_and_camera: true
          tracking_degrees: TRACKING_DEGREE_OBJECT_PERSPECTIVE
          object_similarity_min_contd_inliers: 6
          inlier_spring_force: 0.0
          static_motion_temporal_ratio: 3e-2
        }
      }
      visualize_tracking_data: false
      streaming_track_data_cache_size: 100
    }
  }
}

# Second tracker instance serving random-access track queries (RA_TRACK).
node: {
  calculator: "BoxTrackerCalculator"
  input_stream: "TRACKING:tracking_data"
  input_stream: "RA_TRACK:ra_track"
  output_stream: "RA_BOXES:ra_boxes"
  input_side_packet: "OPTIONS:calculator_options"
  input_stream_handler: {
    input_stream_handler: "SyncSetInputStreamHandler"
    options: {
      [mediapipe.SyncSetInputStreamHandlerOptions.ext]: {
        sync_set: {
          tag_index: "TRACKING"
        }
        sync_set: {
          tag_index: "RA_TRACK"
        }
      }
    }
  }
  options: {
    [mediapipe.BoxTrackerCalculatorOptions.ext]: {
      tracker_options: {
        track_step_options: {
          track_object_and_camera: true
          tracking_degrees: TRACKING_DEGREE_OBJECT_PERSPECTIVE
          object_similarity_min_contd_inliers: 6
          inlier_spring_force: 0.0
          static_motion_temporal_ratio: 3e-2
        }
      }
      visualize_tracking_data: false
      streaming_track_data_cache_size: 100
    }
  }
}
View File

@ -0,0 +1,122 @@
# Mobile object-tracking graph: performs motion analysis on (pre-downsampled)
# frames, packages tracking data, and runs a single BoxTrackerCalculator that
# handles start/restart/cancel box requests and random-access track queries.
input_stream: "image_cpu_frames"
input_stream: "start_pos"
input_stream: "cancel_object_id"
input_stream: "ra_track"
input_stream: "restart_pos"
input_stream: "track_time"
num_threads: 4

# Computes per-frame camera motion and region flow features from the video.
node: {
  calculator: "MotionAnalysisCalculator"
  options: {
    [mediapipe.MotionAnalysisCalculatorOptions.ext]: {
      analysis_options: {
        analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
        flow_options: {
          # Minimum block size for fast estimation; could potentially be
          # lowered to 50.
          fast_estimation_min_block_size: 100
          top_inlier_sets: 1
          frac_inlier_error_threshold: 3e-3
          # For mobile application, downsample before input into graph
          # and use DOWNSAMPLE_TO_INPUT_SIZE and specify
          # downsampling_factor option or DOWNSAMPLE input_side_packet
          downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
          verification_distance: 5.0
          verify_long_feature_acceleration: true
          verify_long_feature_trigger_ratio: 0.1
          tracking_options: {
            max_features: 500
            corner_extraction_method: EXTRACTION_FAST
            adaptive_extraction_levels: 2
            min_eig_val_settings: {
              adaptive_lowest_quality_level: 2e-4
            }
            klt_tracker_implementation: KLT_OPENCV
          }
        }
      }
    }
  }
  # Drops packets if calculator cannot keep up with the input rate.
  input_stream_handler: {
    input_stream_handler: "FixedSizeInputStreamHandler"
  }
  input_stream: "VIDEO:image_cpu_frames"
  input_side_packet: "DOWNSAMPLE:analysis_downsample_factor"
  input_side_packet: "OPTIONS:calculator_options"
  output_stream: "CAMERA:camera_motion"
  output_stream: "FLOW:region_flow"
}

# Condenses region flow and camera motion into compact TrackingData.
node: {
  calculator: "FlowPackagerCalculator"
  input_stream: "FLOW:region_flow"
  input_stream: "CAMERA:camera_motion"
  output_stream: "TRACKING:tracking_data"
  options: {
    [mediapipe.FlowPackagerCalculatorOptions.ext]: {
      flow_packager_options: {
        binary_tracking_data_support: false
      }
    }
  }
}

# Tracks boxes over time. Each sync set below is delivered independently so
# tracking data and the various user request streams do not block each other.
node: {
  calculator: "BoxTrackerCalculator"
  input_side_packet: "OPTIONS:calculator_options"
  input_stream: "TRACKING:tracking_data"
  input_stream: "TRACK_TIME:track_time"
  input_stream: "START_POS:start_pos"
  input_stream: "RESTART_POS:restart_pos"
  input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
  input_stream: "RA_TRACK:ra_track"
  output_stream: "BOXES:boxes"
  output_stream: "RA_BOXES:ra_boxes"
  input_stream_handler: {
    input_stream_handler: "SyncSetInputStreamHandler"
    options: {
      [mediapipe.SyncSetInputStreamHandlerOptions.ext]: {
        sync_set: {
          tag_index: "TRACKING"
          tag_index: "TRACK_TIME"
        }
        sync_set: {
          tag_index: "START_POS"
        }
        sync_set: {
          tag_index: "RESTART_POS"
        }
        sync_set: {
          tag_index: "CANCEL_OBJECT_ID"
        }
        sync_set: {
          tag_index: "RA_TRACK"
        }
      }
    }
  }
  options: {
    [mediapipe.BoxTrackerCalculatorOptions.ext]: {
      tracker_options: {
        track_step_options: {
          track_object_and_camera: true
          tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE
          inlier_spring_force: 0.0
          static_motion_temporal_ratio: 3e-2
          object_similarity_min_contd_inliers: 10
        }
      }
      visualize_tracking_data: false
      streaming_track_data_cache_size: 100
    }
  }
}

View File

@ -0,0 +1,319 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "absl/container/node_hash_map.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/detection.pb.h"
#include "mediapipe/framework/formats/location_data.pb.h"
#include "mediapipe/framework/formats/rect.pb.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/tracking/box_tracker.h"
#include "mediapipe/util/tracking/tracked_detection.h"
#include "mediapipe/util/tracking/tracked_detection_manager.h"
#include "mediapipe/util/tracking/tracking.h"
namespace mediapipe {
namespace {
constexpr int kDetectionUpdateTimeOutMS = 5000;
constexpr char kDetectionsTag[] = "DETECTIONS";
constexpr char kDetectionBoxesTag[] = "DETECTION_BOXES";
constexpr char kDetectionListTag[] = "DETECTION_LIST";
constexpr char kTrackingBoxesTag[] = "TRACKING_BOXES";
constexpr char kCancelObjectIdTag[] = "CANCEL_OBJECT_ID";
// Appends the contents of |src| (taken by value) to the end of |dst|.
void MoveIds(std::vector<int>* dst, std::vector<int> src) {
  dst->reserve(dst->size() + src.size());
  for (int id : src) {
    dst->push_back(id);
  }
}
// Returns the current input packet's timestamp converted from microseconds
// to milliseconds (truncating integer division).
int64 GetInputTimestampMs(::mediapipe::CalculatorContext* cc) {
  return cc->InputTimestamp().Microseconds() / 1000;  // 1 ms = 1000 us.
}
// Converts a Mediapipe Detection Proto to a TrackedDetection class.
std::unique_ptr<TrackedDetection> GetTrackedDetectionFromDetection(
const Detection& detection, int64 timestamp) {
std::unique_ptr<TrackedDetection> tracked_detection =
absl::make_unique<TrackedDetection>(detection.detection_id(), timestamp);
const float top = detection.location_data().relative_bounding_box().ymin();
const float bottom =
detection.location_data().relative_bounding_box().ymin() +
detection.location_data().relative_bounding_box().height();
const float left = detection.location_data().relative_bounding_box().xmin();
const float right = detection.location_data().relative_bounding_box().xmin() +
detection.location_data().relative_bounding_box().width();
NormalizedRect bounding_box;
bounding_box.set_x_center((left + right) / 2.f);
bounding_box.set_y_center((top + bottom) / 2.f);
bounding_box.set_height(bottom - top);
bounding_box.set_width(right - left);
tracked_detection->set_bounding_box(bounding_box);
for (int i = 0; i < detection.label_size(); ++i) {
tracked_detection->AddLabel(detection.label(i), detection.score(i));
}
return tracked_detection;
}
// Converts a TrackedDetection class to a Mediapipe Detection Proto whose
// location is the axis-aligned bounding box of the (possibly rotated)
// tracked quad, carrying over the detection id and all label/score pairs.
Detection GetAxisAlignedDetectionFromTrackedDetection(
    const TrackedDetection& tracked_detection) {
  Detection detection;
  LocationData* location_data = detection.mutable_location_data();

  auto corners = tracked_detection.GetCorners();

  // Compute the axis-aligned hull of the four corners. The running maxima
  // must be seeded with lowest(): numeric_limits<float>::min() is the
  // smallest *positive* float, which would yield a wrong maximum whenever
  // all coordinates are negative.
  float x_min = std::numeric_limits<float>::max();
  float x_max = std::numeric_limits<float>::lowest();
  float y_min = std::numeric_limits<float>::max();
  float y_max = std::numeric_limits<float>::lowest();
  for (int i = 0; i < 4; ++i) {
    x_min = std::min(x_min, corners[i].x());
    x_max = std::max(x_max, corners[i].x());
    y_min = std::min(y_min, corners[i].y());
    y_max = std::max(y_max, corners[i].y());
  }
  location_data->set_format(LocationData::RELATIVE_BOUNDING_BOX);
  LocationData::RelativeBoundingBox* relative_bbox =
      location_data->mutable_relative_bounding_box();
  relative_bbox->set_xmin(x_min);
  relative_bbox->set_ymin(y_min);
  relative_bbox->set_width(x_max - x_min);
  relative_bbox->set_height(y_max - y_min);

  // Use previous id which is the id the object when it's first detected.
  if (tracked_detection.previous_id() > 0) {
    detection.set_detection_id(tracked_detection.previous_id());
  } else {
    detection.set_detection_id(tracked_detection.unique_id());
  }

  for (const auto& label_and_score : tracked_detection.label_to_score_map()) {
    detection.add_label(label_and_score.first);
    detection.add_score(label_and_score.second);
  }
  return detection;
}
} // namespace
// TrackedDetectionManagerCalculator accepts detections and tracking results at
// different frame rate for real time tracking of targets.
// Input:
// DETECTIONS: A vector<Detection> of newly detected targets.
// TRACKING_BOXES: A TimedBoxProtoList which contains a list of tracked boxes
// from previous detections.
//
// Output:
// CANCEL_OBJECT_ID: Ids of targets that are missing/lost such that it should
// be removed from tracking.
// DETECTIONS: List of detections that are being tracked.
// DETECTION_BOXES: List of bounding boxes of detections that are being
// tracked.
//
// Usage example:
// node {
// calculator: "TrackedDetectionManagerCalculator"
// input_stream: "DETECTIONS:detections"
// input_stream: "TRACKING_BOXES:boxes"
// output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
// output_stream: "DETECTIONS:output_detections"
// }
// Calculator implementation; stream semantics are described in the usage
// comment above.
class TrackedDetectionManagerCalculator : public CalculatorBase {
 public:
  static ::mediapipe::Status GetContract(CalculatorContract* cc);

  ::mediapipe::Status Process(CalculatorContext* cc) override;

 private:
  // Adds new list of detections to |waiting_for_update_detections_|.
  void AddDetectionList(const DetectionList& detection_list,
                        CalculatorContext* cc);
  // Same as above, for a plain vector of detections.
  void AddDetections(const std::vector<Detection>& detections,
                     CalculatorContext* cc);

  // Manages existing and new detections.
  TrackedDetectionManager tracked_detection_manager_;

  // Set of detections that are not up to date yet, keyed by detection unique
  // id. These detections will be added to the detection manager until they
  // got updated from the box tracker.
  absl::node_hash_map<int, std::unique_ptr<TrackedDetection>>
      waiting_for_update_detections_;
};
REGISTER_CALCULATOR(TrackedDetectionManagerCalculator);
// Declares the packet types for every optional input and output stream.
::mediapipe::Status TrackedDetectionManagerCalculator::GetContract(
    CalculatorContract* cc) {
  auto& inputs = cc->Inputs();
  auto& outputs = cc->Outputs();

  if (inputs.HasTag(kDetectionsTag)) {
    inputs.Tag(kDetectionsTag).Set<std::vector<Detection>>();
  }
  if (inputs.HasTag(kDetectionListTag)) {
    inputs.Tag(kDetectionListTag).Set<DetectionList>();
  }
  if (inputs.HasTag(kTrackingBoxesTag)) {
    inputs.Tag(kTrackingBoxesTag).Set<TimedBoxProtoList>();
  }

  if (outputs.HasTag(kCancelObjectIdTag)) {
    outputs.Tag(kCancelObjectIdTag).Set<int>();
  }
  if (outputs.HasTag(kDetectionsTag)) {
    outputs.Tag(kDetectionsTag).Set<std::vector<Detection>>();
  }
  if (outputs.HasTag(kDetectionBoxesTag)) {
    outputs.Tag(kDetectionBoxesTag).Set<std::vector<NormalizedRect>>();
  }
  return ::mediapipe::OkStatus();
}
// Consumes tracker updates and new detections for the current timestamp.
//
// For TRACKING_BOXES input: promotes pending detections whose id matches a
// tracked box into the manager, updates box locations, prunes obsolete and
// out-of-view detections (emitting their ids on CANCEL_OBJECT_ID), and
// outputs the currently-synced detections and their bounding boxes.
// For DETECTIONS / DETECTION_LIST input: queues new detections until their
// first tracker update arrives.
::mediapipe::Status TrackedDetectionManagerCalculator::Process(
    CalculatorContext* cc) {
  // Note: use the declared tag constant (was a raw "TRACKING_BOXES" string)
  // for consistency with GetContract and the other tag checks below.
  if (cc->Inputs().HasTag(kTrackingBoxesTag) &&
      !cc->Inputs().Tag(kTrackingBoxesTag).IsEmpty()) {
    const TimedBoxProtoList& tracked_boxes =
        cc->Inputs().Tag(kTrackingBoxesTag).Get<TimedBoxProtoList>();

    // Collect all detections that are removed.
    auto removed_detection_ids = absl::make_unique<std::vector<int>>();
    for (const TimedBoxProto& tracked_box : tracked_boxes.box()) {
      NormalizedRect bounding_box;
      bounding_box.set_x_center((tracked_box.left() + tracked_box.right()) /
                                2.f);
      bounding_box.set_y_center((tracked_box.bottom() + tracked_box.top()) /
                                2.f);
      bounding_box.set_height(tracked_box.bottom() - tracked_box.top());
      bounding_box.set_width(tracked_box.right() - tracked_box.left());
      bounding_box.set_rotation(tracked_box.rotation());
      // First check if this box updates a detection that's waiting for
      // update from the tracker.
      auto waiting_for_update_detection_iter =
          waiting_for_update_detections_.find(tracked_box.id());
      if (waiting_for_update_detection_iter !=
          waiting_for_update_detections_.end()) {
        // Add the detection and remove duplicated detections.
        auto removed_ids = tracked_detection_manager_.AddDetection(
            std::move(waiting_for_update_detection_iter->second));
        MoveIds(removed_detection_ids.get(), std::move(removed_ids));
        waiting_for_update_detections_.erase(
            waiting_for_update_detection_iter);
      }
      auto removed_ids = tracked_detection_manager_.UpdateDetectionLocation(
          tracked_box.id(), bounding_box, tracked_box.time_msec());
      MoveIds(removed_detection_ids.get(), std::move(removed_ids));
    }

    // TODO: Should be handled automatically in detection manager.
    auto removed_ids = tracked_detection_manager_.RemoveObsoleteDetections(
        GetInputTimestampMs(cc) - kDetectionUpdateTimeOutMS);
    MoveIds(removed_detection_ids.get(), std::move(removed_ids));

    // TODO: Should be handled automatically in detection manager.
    removed_ids = tracked_detection_manager_.RemoveOutOfViewDetections();
    MoveIds(removed_detection_ids.get(), std::move(removed_ids));

    if (!removed_detection_ids->empty() &&
        cc->Outputs().HasTag(kCancelObjectIdTag)) {
      auto timestamp = cc->InputTimestamp();
      for (int box_id : *removed_detection_ids) {
        // The timestamp is incremented (by 1 us) because currently the box
        // tracker calculator only accepts one cancel object ID for any given
        // timestamp.
        cc->Outputs()
            .Tag(kCancelObjectIdTag)
            .AddPacket(mediapipe::MakePacket<int>(box_id).At(timestamp++));
      }
    }

    // Output detections and corresponding bounding boxes.
    const auto& all_detections =
        tracked_detection_manager_.GetAllTrackedDetections();
    auto output_detections = absl::make_unique<std::vector<Detection>>();
    auto output_boxes = absl::make_unique<std::vector<NormalizedRect>>();
    for (const auto& detection_ptr : all_detections) {
      const auto& detection = *detection_ptr.second;
      // Only output detections that are synced.
      if (detection.last_updated_timestamp() <
          cc->InputTimestamp().Microseconds() / 1000) {
        continue;
      }
      output_detections->emplace_back(
          GetAxisAlignedDetectionFromTrackedDetection(detection));
      output_boxes->emplace_back(detection.bounding_box());
    }
    if (cc->Outputs().HasTag(kDetectionsTag)) {
      cc->Outputs()
          .Tag(kDetectionsTag)
          .Add(output_detections.release(), cc->InputTimestamp());
    }
    if (cc->Outputs().HasTag(kDetectionBoxesTag)) {
      cc->Outputs()
          .Tag(kDetectionBoxesTag)
          .Add(output_boxes.release(), cc->InputTimestamp());
    }
  }

  if (cc->Inputs().HasTag(kDetectionsTag) &&
      !cc->Inputs().Tag(kDetectionsTag).IsEmpty()) {
    // Bind by const reference; Get() returns a reference, so copying the
    // whole vector here was unnecessary.
    const auto& detections =
        cc->Inputs().Tag(kDetectionsTag).Get<std::vector<Detection>>();
    AddDetections(detections, cc);
  }
  if (cc->Inputs().HasTag(kDetectionListTag) &&
      !cc->Inputs().Tag(kDetectionListTag).IsEmpty()) {
    const auto& detection_list =
        cc->Inputs().Tag(kDetectionListTag).Get<DetectionList>();
    AddDetectionList(detection_list, cc);
  }
  return ::mediapipe::OkStatus();
}
void TrackedDetectionManagerCalculator::AddDetectionList(
const DetectionList& detection_list, CalculatorContext* cc) {
for (const auto& detection : detection_list.detection()) {
// Convert from microseconds to milliseconds.
std::unique_ptr<TrackedDetection> new_detection =
GetTrackedDetectionFromDetection(
detection, cc->InputTimestamp().Microseconds() / 1000);
const int id = new_detection->unique_id();
waiting_for_update_detections_[id] = std::move(new_detection);
}
}
void TrackedDetectionManagerCalculator::AddDetections(
const std::vector<Detection>& detections, CalculatorContext* cc) {
for (const auto& detection : detections) {
// Convert from microseconds to milliseconds.
std::unique_ptr<TrackedDetection> new_detection =
GetTrackedDetectionFromDetection(
detection, cc->InputTimestamp().Microseconds() / 1000);
const int id = new_detection->unique_id();
waiting_for_update_detections_[id] = std::move(new_detection);
}
}
} // namespace mediapipe

View File

@ -0,0 +1,709 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "mediapipe/calculators/video/box_tracker_calculator.pb.h"
#include "mediapipe/framework/calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/deps/file_path.h"
#include "mediapipe/framework/formats/image_frame.h"
#include "mediapipe/framework/packet.h"
#include "mediapipe/framework/port/advanced_proto_inc.h"
#include "mediapipe/framework/port/file_helpers.h"
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/opencv_highgui_inc.h"
#include "mediapipe/framework/port/proto_ns.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/port/status_matchers.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
#include "mediapipe/util/tracking/tracking.pb.h"
#ifdef __APPLE__
#include <CoreFoundation/CoreFoundation.h>
#endif // defined(__APPLE__)
namespace mediapipe {
namespace {
using ::testing::FloatNear;
using ::testing::Test;
// Returns the directory that holds the test data files (binary graphs and
// test images). The location differs per platform.
std::string GetTestDir() {
#ifdef __APPLE__
  // On Apple platforms the test data is packaged inside the main bundle.
  char path[1024];
  CFURLRef bundle_url = CFBundleCopyBundleURL(CFBundleGetMainBundle());
  CFURLGetFileSystemRepresentation(
      bundle_url, true, reinterpret_cast<UInt8*>(path), sizeof(path));
  CFRelease(bundle_url);
  return ::mediapipe::file::JoinPath(path, "testdata");
#elif defined(__ANDROID__)
  // On Android the test data is deployed relative to the current working
  // directory of the test process.
  char path[1024];
  getcwd(path, sizeof(path));
  return ::mediapipe::file::JoinPath(path,
                                     "mediapipe/calculators/video/testdata");
#else
  return ::mediapipe::file::JoinPath(
      "./",
      // This should match the path of the output files
      // of the genrule() that generates test model files.
      "mediapipe/calculators/video/testdata");
#endif  // defined(__APPLE__)
}
bool LoadBinaryTestGraph(const std::string& graph_path,
CalculatorGraphConfig* config) {
std::ifstream ifs;
ifs.open(graph_path.c_str());
proto_ns::io::IstreamInputStream in_stream(&ifs);
bool success = config->ParseFromZeroCopyStream(&in_stream);
ifs.close();
if (!success) {
LOG(ERROR) << "could not parse test graph: " << graph_path;
}
return success;
}
// Fixture that runs the box-tracking graphs ("tracker.binarypb" and
// "parallel_tracker.binarypb") on a synthetic sequence produced by cropping
// shifted windows out of one source image, so the apparent motion of the
// tracked box is known exactly and can be asserted per frame.
class TrackingGraphTest : public Test {
 protected:
  TrackingGraphTest() {}
  void SetUp() override {
    test_dir_ = GetTestDir();
    const auto graph_path = file::JoinPath(test_dir_, "tracker.binarypb");
    ASSERT_TRUE(LoadBinaryTestGraph(graph_path, &config_));
    original_image_ = cv::imread(file::JoinPath(test_dir_, "lenna.png"));
    CreateInputFramesFromOriginalImage(kNumImages, kTranslationStep,
                                       &input_frames_packets_);
    const auto& first_input_img = input_frames_packets_[0].Get<ImageFrame>();
    const int img_width = first_input_img.Width();
    const int img_height = first_input_img.Height();
    // Express the per-frame pixel shift in normalized image coordinates,
    // which is the coordinate system TimedBoxProto uses.
    translation_step_x_ = kTranslationStep / static_cast<float>(img_width);
    translation_step_y_ = kTranslationStep / static_cast<float>(img_height);
    // Create a new config and packet dump vectors to store output.
    mediapipe::CalculatorGraphConfig config_copy = config_;
    mediapipe::tool::AddVectorSink("boxes", &config_copy, &output_packets_);
    mediapipe::tool::AddVectorSink("ra_boxes", &config_copy,
                                   &random_access_results_packets_);
    // Initialize graph
    MP_ASSERT_OK(graph_.Initialize(config_copy));
    // The parallel-processing variant of the graph shares the same output
    // sink vectors as the ordinary graph.
    const auto parallel_graph_path =
        file::JoinPath(test_dir_, "parallel_tracker.binarypb");
    CalculatorGraphConfig parallel_config;
    ASSERT_TRUE(LoadBinaryTestGraph(parallel_graph_path, &parallel_config));
    mediapipe::tool::AddVectorSink("boxes", &parallel_config, &output_packets_);
    mediapipe::tool::AddVectorSink("ra_boxes", &parallel_config,
                                   &random_access_results_packets_);
    MP_ASSERT_OK(parallel_graph_.Initialize(parallel_config));
  }
  // Builds `num_images` input frame packets by cropping windows of the
  // original image, each shifted by `translation_step` pixels.
  void CreateInputFramesFromOriginalImage(
      int num_images, int translation_step,
      std::vector<Packet>* input_frames_packets);
  void TearDown() override {
    output_packets_.clear();
    random_access_results_packets_.clear();
  }
  // Builds a TimedBoxProtoList with one box per entry of the vectors; ids are
  // assigned sequentially from 0 and time is taken from `timestamp`.
  std::unique_ptr<TimedBoxProtoList> MakeBoxList(
      const Timestamp& timestamp, const std::vector<bool>& is_quad_tracking,
      const std::vector<bool>& is_pnp_tracking,
      const std::vector<bool>& reacquisition) const;
  void RunGraphWithSidePacketsAndInputs(
      const std::map<std::string, mediapipe::Packet>& side_packets,
      const mediapipe::Packet& start_pos_packet);
  // Utility functions used to judge if a given quad or box is near to the
  // groundtruth location at a given frame.
  // Examines that box.reacquisition() equals `reacquisition`.
  // `frame` can be a float number to account for inter-frame interpolation.
  void ExpectBoxAtFrame(const TimedBoxProto& box, float frame,
                        bool reacquisition);
  // Examines that box.aspect_ratio() equals `aspect_ratio` if aspect_ratio
  // is positive.
  void ExpectQuadAtFrame(const TimedBoxProto& box, float frame,
                         float aspect_ratio, bool reacquisition);
  // Utility function to judge if two quads are near to each other.
  void ExpectQuadNear(const TimedBoxProto& box1, const TimedBoxProto& box2);
  // Builds a random-access tracking request: one (start, end) box pair per
  // entry of the timestamp vectors.
  std::unique_ptr<TimedBoxProtoList> CreateRandomAccessTrackingBoxList(
      const std::vector<Timestamp>& start_timestamps,
      const std::vector<Timestamp>& end_timestamps) const;
  CalculatorGraph graph_;
  CalculatorGraph parallel_graph_;
  CalculatorGraphConfig config_;
  std::string test_dir_;
  cv::Mat original_image_;
  std::vector<Packet> input_frames_packets_;
  std::vector<mediapipe::Packet> output_packets_;
  std::vector<mediapipe::Packet> random_access_results_packets_;
  float translation_step_x_;  // normalized translation step in x direction
  float translation_step_y_;  // normalized translation step in y direction
  // Initial tracked box: centered, covering half the image in each dimension.
  static constexpr float kInitialBoxHalfWidthNormalized = 0.25f;
  static constexpr float kInitialBoxHalfHeightNormalized = 0.25f;
  static constexpr float kImageAspectRatio = 1.0f;  // for lenna.png
  static constexpr float kInitialBoxLeft =
      0.5f - kInitialBoxHalfWidthNormalized;
  static constexpr float kInitialBoxRight =
      0.5f + kInitialBoxHalfWidthNormalized;
  static constexpr float kInitialBoxTop =
      0.5f - kInitialBoxHalfHeightNormalized;
  static constexpr float kInitialBoxBottom =
      0.5f + kInitialBoxHalfHeightNormalized;
  // Interval between consecutive input frames, in microseconds.
  static constexpr int kFrameIntervalUs = 30000;
  static constexpr int kNumImages = 8;
  // Each image is shifted to the right and bottom by kTranslationStep
  // pixels compared with the previous image.
  static constexpr int kTranslationStep = 10;
  static constexpr float kEqualityTolerance = 3e-4f;
};
// Verifies that an axis-aligned tracked box sits where the synthetic camera
// motion predicts it at (possibly fractional) frame index `frame`.
void TrackingGraphTest::ExpectBoxAtFrame(const TimedBoxProto& box, float frame,
                                         bool reacquisition) {
  // The crop window slides right/down, so the box appears to move left/up by
  // one normalized translation step per frame.
  const float shift_x = frame * translation_step_x_;
  const float shift_y = frame * translation_step_y_;
  EXPECT_EQ(box.reacquisition(), reacquisition);
  EXPECT_TRUE(box.has_rotation());
  EXPECT_THAT(box.rotation(), FloatNear(0, kEqualityTolerance));
  EXPECT_THAT(box.left(),
              FloatNear(kInitialBoxLeft - shift_x, kEqualityTolerance));
  EXPECT_THAT(box.top(),
              FloatNear(kInitialBoxTop - shift_y, kEqualityTolerance));
  EXPECT_THAT(box.bottom(),
              FloatNear(kInitialBoxBottom - shift_y, kEqualityTolerance));
  EXPECT_THAT(box.right(),
              FloatNear(kInitialBoxRight - shift_x, kEqualityTolerance));
}
// Verifies that a tracked quad sits where the synthetic camera motion
// predicts it at (possibly fractional) frame index `frame`, and that its
// aspect ratio matches `aspect_ratio` when the latter is positive.
void TrackingGraphTest::ExpectQuadAtFrame(const TimedBoxProto& box, float frame,
                                          float aspect_ratio,
                                          bool reacquisition) {
  EXPECT_TRUE(box.has_quad()) << "quad must exist!";
  if (aspect_ratio > 0) {
    EXPECT_TRUE(box.has_aspect_ratio());
    EXPECT_NEAR(box.aspect_ratio(), aspect_ratio, kEqualityTolerance);
  }
  EXPECT_EQ(box.reacquisition(), reacquisition);
  const auto& quad = box.quad();
  EXPECT_EQ(8, quad.vertices_size())
      << "quad has only " << box.quad().vertices_size() << " vertices";
  const float shift_x = frame * translation_step_x_;
  const float shift_y = frame * translation_step_y_;
  // Vertices are interleaved (x, y) pairs (see MakeBoxList); spot-check the
  // top-left corner, the bottom-left y, and the bottom-right x.
  EXPECT_THAT(quad.vertices(0),
              FloatNear(kInitialBoxLeft - shift_x, kEqualityTolerance));
  EXPECT_THAT(quad.vertices(1),
              FloatNear(kInitialBoxTop - shift_y, kEqualityTolerance));
  EXPECT_THAT(quad.vertices(3),
              FloatNear(kInitialBoxBottom - shift_y, kEqualityTolerance));
  EXPECT_THAT(quad.vertices(4),
              FloatNear(kInitialBoxRight - shift_x, kEqualityTolerance));
}
// Verifies that two quads have the full 8 vertices each and agree pairwise
// within kEqualityTolerance.
void TrackingGraphTest::ExpectQuadNear(const TimedBoxProto& box1,
                                       const TimedBoxProto& box2) {
  EXPECT_TRUE(box1.has_quad());
  EXPECT_TRUE(box2.has_quad());
  EXPECT_EQ(8, box1.quad().vertices_size())
      << "quad has only " << box1.quad().vertices_size() << " vertices";
  EXPECT_EQ(8, box2.quad().vertices_size())
      << "quad has only " << box2.quad().vertices_size() << " vertices";
  const int num_vertices = box1.quad().vertices_size();
  for (int i = 0; i < num_vertices; ++i) {
    EXPECT_NEAR(box1.quad().vertices(i), box2.quad().vertices(i),
                kEqualityTolerance);
  }
}
// Builds the start/restart box list handed to the tracker. One box is
// emitted per entry of the flag vectors; ids are assigned sequentially
// from 0 and all boxes share the same initial geometry.
std::unique_ptr<TimedBoxProtoList> TrackingGraphTest::MakeBoxList(
    const Timestamp& timestamp, const std::vector<bool>& is_quad_tracking,
    const std::vector<bool>& is_pnp_tracking,
    const std::vector<bool>& reacquisition) const {
  auto box_list = absl::make_unique<TimedBoxProtoList>();
  for (int i = 0; i < is_quad_tracking.size(); ++i) {
    TimedBoxProto* box = box_list->add_box();
    if (!is_quad_tracking[i]) {
      // Axis-aligned box: only the four edges are needed.
      box->set_left(kInitialBoxLeft);
      box->set_right(kInitialBoxRight);
      box->set_top(kInitialBoxTop);
      box->set_bottom(kInitialBoxBottom);
    } else {
      // Quad: vertices as interleaved (x, y) pairs, starting at the top-left
      // corner and going through bottom-left, bottom-right, top-right.
      auto* quad = box->mutable_quad();
      quad->add_vertices(kInitialBoxLeft);
      quad->add_vertices(kInitialBoxTop);
      quad->add_vertices(kInitialBoxLeft);
      quad->add_vertices(kInitialBoxBottom);
      quad->add_vertices(kInitialBoxRight);
      quad->add_vertices(kInitialBoxBottom);
      quad->add_vertices(kInitialBoxRight);
      quad->add_vertices(kInitialBoxTop);
      // Providing an aspect ratio selects perspective (PnP) quad tracking.
      if (is_pnp_tracking[i]) {
        box->set_aspect_ratio(kImageAspectRatio);
      }
    }
    box->set_id(i);
    // Timestamp.Value() is in microseconds; the proto wants milliseconds.
    box->set_time_msec(timestamp.Value() / 1000);
    box->set_reacquisition(reacquisition[i]);
  }
  return box_list;
}
void TrackingGraphTest::CreateInputFramesFromOriginalImage(
int num_images, int translation_step,
std::vector<Packet>* input_frames_packets) {
const int crop_width = original_image_.cols - num_images * translation_step;
const int crop_height = original_image_.rows - num_images * translation_step;
for (int i = 0; i < num_images; ++i) {
cv::Rect roi(i * translation_step, i * translation_step, crop_width,
crop_height);
cv::Mat cropped_img = cv::Mat(original_image_, roi);
auto cropped_image_frame = absl::make_unique<ImageFrame>(
ImageFormat::SRGB, crop_width, crop_height, cropped_img.step[0],
cropped_img.data, ImageFrame::PixelDataDeleter::kNone);
Timestamp curr_timestamp = Timestamp(i * kFrameIntervalUs);
Packet image_packet =
Adopt(cropped_image_frame.release()).At(curr_timestamp);
input_frames_packets->push_back(image_packet);
}
}
// Runs graph_ to completion: starts it with `side_packets`, seeds the
// tracker via the "start_pos" stream with `start_pos_packet`, then feeds
// every cached input frame, waiting for the graph to go idle after each
// frame so outputs are produced deterministically before the next input.
void TrackingGraphTest::RunGraphWithSidePacketsAndInputs(
    const std::map<std::string, mediapipe::Packet>& side_packets,
    const mediapipe::Packet& start_pos_packet) {
  // Start running the graph.
  MP_EXPECT_OK(graph_.StartRun(side_packets));
  MP_EXPECT_OK(graph_.AddPacketToInputStream("start_pos", start_pos_packet));
  // Iterate by const reference to avoid copying each Packet per iteration.
  for (const auto& frame_packet : input_frames_packets_) {
    MP_EXPECT_OK(
        graph_.AddPacketToInputStream("image_cpu_frames", frame_packet));
    MP_EXPECT_OK(graph_.WaitUntilIdle());
  }
  MP_EXPECT_OK(graph_.CloseAllInputStreams());
  MP_EXPECT_OK(graph_.WaitUntilDone());
}
// Builds a random-access tracking request list: for each i, a consecutive
// (start, end) pair of boxes at start_timestamps[i] / end_timestamps[i],
// each configured for perspective quad tracking without reacquisition.
std::unique_ptr<TimedBoxProtoList>
TrackingGraphTest::CreateRandomAccessTrackingBoxList(
    const std::vector<Timestamp>& start_timestamps,
    const std::vector<Timestamp>& end_timestamps) const {
  CHECK_EQ(start_timestamps.size(), end_timestamps.size());
  auto ra_boxes = absl::make_unique<TimedBoxProtoList>();
  const std::vector<bool> kQuad{true};
  const std::vector<bool> kPnp{true};
  const std::vector<bool> kNoReacquisition{false};
  for (int i = 0; i < start_timestamps.size(); ++i) {
    *ra_boxes->add_box() =
        MakeBoxList(start_timestamps[i], kQuad, kPnp, kNoReacquisition)
            ->box(0);
    *ra_boxes->add_box() =
        MakeBoxList(end_timestamps[i], kQuad, kPnp, kNoReacquisition)->box(0);
  }
  return ra_boxes;
}
// Tracks a single axis-aligned box (4DoF similarity tracking) across the
// synthetic sequence and checks that every output frame reports the box at
// the position predicted by the known camera motion.
TEST_F(TrackingGraphTest, BasicBoxTrackingSanityCheck) {
  // Create input side packets.
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets.insert(std::make_pair("analysis_downsample_factor",
                                     mediapipe::MakePacket<float>(1.0f)));
  side_packets.insert(std::make_pair(
      "calculator_options",
      mediapipe::MakePacket<CalculatorOptions>(CalculatorOptions())));
  // Run the graph with input side packets, start_pos, and input image frames.
  Timestamp start_box_time = input_frames_packets_[0].Timestamp();
  // is_quad_tracking is used to indicate whether to track quad for each
  // individual box.
  std::vector<bool> is_quad_tracking{false};
  // is_pnp_tracking is used to indicate whether to use perspective transform
  // to track quad.
  std::vector<bool> is_pnp_tracking{false};
  // is_reacquisition is used to indicate whether to enable reacquisition for
  // the box.
  std::vector<bool> is_reacquisition{false};
  auto start_box_list = MakeBoxList(start_box_time, is_quad_tracking,
                                    is_pnp_tracking, is_reacquisition);
  Packet start_pos_packet = Adopt(start_box_list.release()).At(start_box_time);
  RunGraphWithSidePacketsAndInputs(side_packets, start_pos_packet);
  // Expect one output box list per input frame; at frame i the box should
  // have moved by i translation steps.
  EXPECT_EQ(input_frames_packets_.size(), output_packets_.size());
  for (int i = 0; i < output_packets_.size(); ++i) {
    const TimedBoxProtoList& boxes =
        output_packets_[i].Get<TimedBoxProtoList>();
    EXPECT_EQ(is_quad_tracking.size(), boxes.box_size());
    ExpectBoxAtFrame(boxes.box(0), i, false);
  }
}
// Tracks three boxes with different degrees of freedom simultaneously
// (8DoF homography quad, 6DoF perspective quad, 4DoF box) and checks each
// against the known synthetic motion per frame.
TEST_F(TrackingGraphTest, BasicQuadTrackingSanityCheck) {
  // Create input side packets.
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets.insert(std::make_pair("analysis_downsample_factor",
                                     mediapipe::MakePacket<float>(1.0f)));
  CalculatorOptions calculator_options;
  calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext)
      ->mutable_tracker_options()
      ->mutable_track_step_options()
      ->set_tracking_degrees(
          TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE);
  side_packets.insert(std::make_pair(
      "calculator_options",
      mediapipe::MakePacket<CalculatorOptions>(calculator_options)));
  Timestamp start_box_time = input_frames_packets_[0].Timestamp();
  // Box id 0 uses quad tracking with 8DoF homography transform.
  // Box id 1 uses quad tracking with 6DoF perspective transform.
  // Box id 2 uses box tracking with 4DoF similarity transform.
  std::vector<bool> is_quad_tracking{true, true, false};
  std::vector<bool> is_pnp_tracking{false, true, false};
  std::vector<bool> is_reacquisition{true, false, true};
  auto start_box_list = MakeBoxList(start_box_time, is_quad_tracking,
                                    is_pnp_tracking, is_reacquisition);
  Packet start_pos_packet = Adopt(start_box_list.release()).At(start_box_time);
  RunGraphWithSidePacketsAndInputs(side_packets, start_pos_packet);
  EXPECT_EQ(input_frames_packets_.size(), output_packets_.size());
  for (int i = 0; i < output_packets_.size(); ++i) {
    const TimedBoxProtoList& boxes =
        output_packets_[i].Get<TimedBoxProtoList>();
    EXPECT_EQ(is_quad_tracking.size(), boxes.box_size());
    for (int j = 0; j < boxes.box_size(); ++j) {
      const TimedBoxProto& box = boxes.box(j);
      // Boxes may arrive in any order; dispatch on the box id, not j.
      if (is_quad_tracking[box.id()]) {
        ExpectQuadAtFrame(box, i,
                          is_pnp_tracking[box.id()] ? kImageAspectRatio : -1.0f,
                          is_reacquisition[box.id()]);
      } else {
        ExpectBoxAtFrame(box, i, is_reacquisition[box.id()]);
      }
    }
  }
}
// Verifies random-access (out-of-order) tracking: a forward request from
// frame 0 to frame 2 must match the normal streaming tracking result, and a
// reverse request must land back at the start position. Both the ordinary
// and the parallel graph variants are exercised.
TEST_F(TrackingGraphTest, TestRandomAccessTrackingResults) {
  // Create input side packets.
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets.insert(std::make_pair("analysis_downsample_factor",
                                     mediapipe::MakePacket<float>(1.0f)));
  CalculatorOptions calculator_options;
  calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext)
      ->mutable_tracker_options()
      ->mutable_track_step_options()
      ->set_tracking_degrees(
          TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE);
  side_packets.insert(std::make_pair(
      "calculator_options",
      mediapipe::MakePacket<CalculatorOptions>(calculator_options)));
  ASSERT_GT(input_frames_packets_.size(), 2);  // at least 3 frames
  ASSERT_TRUE(input_frames_packets_[2].Timestamp() -
                  input_frames_packets_[1].Timestamp() >
              TimestampDiff(1000));
  constexpr int start_frame = 0;
  Timestamp start_box_time = input_frames_packets_[start_frame].Timestamp();
  auto start_box_list =
      MakeBoxList(start_box_time, std::vector<bool>{true},
                  std::vector<bool>{true}, std::vector<bool>{false});
  constexpr int end_frame = 2;
  Timestamp end_box_time = input_frames_packets_[end_frame].Timestamp();
  // Also test reverse random access tracking.
  // This offset of 1ms simulates the case where the start query timestamp
  // is not any existing frame timestamp. In reality, it's highly encouraged
  // to have the start query timestamp be aligned with frame timestamp.
  constexpr int reverse_start_frame = 1;
  Timestamp reverse_start_box_time =
      input_frames_packets_[reverse_start_frame].Timestamp() + 1000;
  auto ra_boxes = CreateRandomAccessTrackingBoxList(
      {start_box_time, reverse_start_box_time}, {end_box_time, start_box_time});
  Packet ra_packet = Adopt(ra_boxes.release()).At(start_box_time);
  Packet start_packet = Adopt(start_box_list.release()).At(start_box_time);
  // Start running the ordinary graph, verify random access produces the same
  // result as normal tracking.
  MP_EXPECT_OK(graph_.StartRun(side_packets));
  MP_EXPECT_OK(graph_.AddPacketToInputStream("start_pos", start_packet));
  for (auto frame_packet : input_frames_packets_) {
    MP_EXPECT_OK(
        graph_.AddPacketToInputStream("image_cpu_frames", frame_packet));
    Packet track_time_packet = Adopt(new int(0)).At(frame_packet.Timestamp());
    MP_EXPECT_OK(
        graph_.AddPacketToInputStream("track_time", track_time_packet));
    MP_EXPECT_OK(graph_.WaitUntilIdle());
  }
  MP_EXPECT_OK(graph_.AddPacketToInputStream("ra_track", ra_packet));
  MP_EXPECT_OK(graph_.CloseAllInputStreams());
  MP_EXPECT_OK(graph_.WaitUntilDone());
  EXPECT_EQ(input_frames_packets_.size(), output_packets_.size());
  const TimedBoxProtoList tracking_result =
      output_packets_[end_frame].Get<TimedBoxProtoList>();
  EXPECT_EQ(1, tracking_result.box_size());
  // Should have 1 random access packet.
  EXPECT_EQ(1, random_access_results_packets_.size());
  const TimedBoxProtoList& ra_result =
      random_access_results_packets_[0].Get<TimedBoxProtoList>();
  // Two box tracking results. One for comparison with normal tracking. The
  // other for reverse random access tracking.
  EXPECT_EQ(2, ra_result.box_size());
  // Check that random access tracking has the same result as normal tracking.
  ExpectQuadNear(tracking_result.box(0), ra_result.box(0));
  ExpectQuadAtFrame(ra_result.box(0), end_frame - start_frame,
                    kImageAspectRatio, false);
  ExpectQuadAtFrame(ra_result.box(1), start_frame - reverse_start_frame - 1,
                    kImageAspectRatio, false);
  // Clear output and ra result packet vector before testing the parallel
  // graph.
  TearDown();
  // Start running the parallel graph, verify random access produces the same
  // result as normal tracking.
  MP_EXPECT_OK(parallel_graph_.StartRun(side_packets));
  MP_EXPECT_OK(
      parallel_graph_.AddPacketToInputStream("start_pos", start_packet));
  for (auto frame_packet : input_frames_packets_) {
    MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("image_cpu_frames",
                                                        frame_packet));
    MP_EXPECT_OK(parallel_graph_.WaitUntilIdle());
  }
  MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("ra_track", ra_packet));
  MP_EXPECT_OK(parallel_graph_.CloseAllInputStreams());
  MP_EXPECT_OK(parallel_graph_.WaitUntilDone());
  EXPECT_EQ(input_frames_packets_.size(), output_packets_.size());
  const TimedBoxProtoList parallel_tracking_result =
      output_packets_[end_frame].Get<TimedBoxProtoList>();
  EXPECT_EQ(1, parallel_tracking_result.box_size());
  // Should have only 1 random access packet.
  EXPECT_EQ(1, random_access_results_packets_.size());
  const TimedBoxProtoList& parallel_ra_result =
      random_access_results_packets_[0].Get<TimedBoxProtoList>();
  EXPECT_EQ(2, parallel_ra_result.box_size());
  // Check that random access tracking has the same result as normal tracking.
  ExpectQuadNear(parallel_tracking_result.box(0), parallel_ra_result.box(0));
  ExpectQuadAtFrame(parallel_ra_result.box(0), end_frame - start_frame,
                    kImageAspectRatio, false);
  ExpectQuadAtFrame(parallel_ra_result.box(1),
                    start_frame - reverse_start_frame - 1, kImageAspectRatio,
                    false);
}
// Tests what happens when random access request timestamps are
// outside of cache: with a cache one frame smaller than the sequence, every
// request below touches an uncached timestamp and must yield no result.
TEST_F(TrackingGraphTest, TestRandomAccessTrackingTimestamps) {
  // Create input side packets.
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets.insert(std::make_pair("analysis_downsample_factor",
                                     mediapipe::MakePacket<float>(1.0f)));
  CalculatorOptions calculator_options;
  calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext)
      ->mutable_tracker_options()
      ->mutable_track_step_options()
      ->set_tracking_degrees(
          TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE);
  // We intentionally don't cache all frames, to see what happens when
  // random access tracking request time falls outside cache range.
  calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext)
      ->set_streaming_track_data_cache_size(input_frames_packets_.size() - 1);
  side_packets.insert(std::make_pair(
      "calculator_options",
      mediapipe::MakePacket<CalculatorOptions>(calculator_options)));
  // Set up random access boxes. Each (start, end) pair below has at least
  // one endpoint a full second before/after the cached frame range.
  const int num_frames = input_frames_packets_.size();
  const int64 usec_in_sec = 1000000;
  std::vector<Timestamp> start_timestamps{
      input_frames_packets_[0].Timestamp() - usec_in_sec,               // forward
      input_frames_packets_[0].Timestamp(),                             // forward
      input_frames_packets_[1].Timestamp(),                             // forward
      input_frames_packets_[num_frames - 1].Timestamp() + usec_in_sec,  // fwd
      input_frames_packets_[0].Timestamp(),                             // backward
      input_frames_packets_[num_frames - 1].Timestamp(),                // backward
      input_frames_packets_[num_frames - 1].Timestamp(),                // backward
      input_frames_packets_[num_frames - 1].Timestamp() + usec_in_sec   // back
  };
  std::vector<Timestamp> end_timestamps{
      input_frames_packets_[num_frames - 1].Timestamp(),
      input_frames_packets_[num_frames - 1].Timestamp(),
      input_frames_packets_[num_frames - 1].Timestamp() + usec_in_sec,
      input_frames_packets_[num_frames - 1].Timestamp() + 2 * usec_in_sec,
      input_frames_packets_[0].Timestamp() - usec_in_sec,
      input_frames_packets_[0].Timestamp(),
      input_frames_packets_[0].Timestamp() - usec_in_sec,
      input_frames_packets_[1].Timestamp()};
  auto ra_boxes =
      CreateRandomAccessTrackingBoxList(start_timestamps, end_timestamps);
  Packet ra_packet =
      Adopt(ra_boxes.release()).At(input_frames_packets_[0].Timestamp());
  // Run the parallel graph and check that the outside-cache requests have no
  // results.
  MP_EXPECT_OK(parallel_graph_.StartRun(side_packets));
  for (auto frame_packet : input_frames_packets_) {
    MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("image_cpu_frames",
                                                        frame_packet));
    MP_EXPECT_OK(parallel_graph_.WaitUntilIdle());
  }
  MP_EXPECT_OK(parallel_graph_.AddPacketToInputStream("ra_track", ra_packet));
  MP_EXPECT_OK(parallel_graph_.CloseAllInputStreams());
  MP_EXPECT_OK(parallel_graph_.WaitUntilDone());
  // Should have 1 random access packet with 0 result boxes.
  EXPECT_EQ(1, random_access_results_packets_.size());
  const auto& ra_returned_boxes =
      random_access_results_packets_[0].Get<TimedBoxProtoList>();
  const int num_returned_ra_boxes = ra_returned_boxes.box_size();
  EXPECT_EQ(0, num_returned_ra_boxes);
}
// Verifies the smooth transition behavior when a box position is restarted
// via "restart_pos": over kTransitionFrames frames the reported position
// should blend linearly from the restarted (frame-0) position back onto the
// normally tracked trajectory. Also exercises mid-frame TRACK_TIME queries
// (interpolated outputs between frames).
TEST_F(TrackingGraphTest, TestTransitionFramesForReacquisition) {
  // Create input side packets.
  std::map<std::string, mediapipe::Packet> side_packets;
  side_packets.insert(std::make_pair("analysis_downsample_factor",
                                     mediapipe::MakePacket<float>(1.0f)));
  CalculatorOptions calculator_options;
  calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext)
      ->mutable_tracker_options()
      ->mutable_track_step_options()
      ->set_tracking_degrees(
          TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE);
  constexpr int kTransitionFrames = 3;
  calculator_options.MutableExtension(BoxTrackerCalculatorOptions::ext)
      ->set_start_pos_transition_frames(kTransitionFrames);
  side_packets.insert(std::make_pair(
      "calculator_options",
      mediapipe::MakePacket<CalculatorOptions>(calculator_options)));
  Timestamp start_box_time = input_frames_packets_[0].Timestamp();
  // Box id 0 uses quad tracking with 8DoF homography transform.
  // Box id 1 uses quad tracking with 6DoF perspective transform.
  // Box id 2 uses box tracking with 4DoF similarity transform.
  std::vector<bool> is_quad_tracking{true, true, false};
  std::vector<bool> is_pnp_tracking{false, true, false};
  std::vector<bool> is_reacquisition{true, true, true};
  auto start_box_list = MakeBoxList(start_box_time, is_quad_tracking,
                                    is_pnp_tracking, is_reacquisition);
  Packet start_pos_packet = Adopt(start_box_list.release()).At(start_box_time);
  // Setting box pos restart from initial position (frame 0's position).
  constexpr int kRestartFrame = 3;
  Timestamp restart_box_time = input_frames_packets_[kRestartFrame].Timestamp();
  auto restart_box_list = MakeBoxList(restart_box_time, is_quad_tracking,
                                      is_pnp_tracking, is_reacquisition);
  Packet restart_pos_packet =
      Adopt(restart_box_list.release()).At(restart_box_time);
  MP_EXPECT_OK(graph_.StartRun(side_packets));
  MP_EXPECT_OK(graph_.AddPacketToInputStream("start_pos", start_pos_packet));
  for (int j = 0; j < input_frames_packets_.size(); ++j) {
    // Add TRACK_TIME stream queries in between 2 frames.
    if (j > 0) {
      Timestamp track_time = Timestamp((j - 0.5f) * kFrameIntervalUs);
      LOG(INFO) << track_time.Value();
      Packet track_time_packet = Adopt(new Timestamp).At(track_time);
      MP_EXPECT_OK(
          graph_.AddPacketToInputStream("track_time", track_time_packet));
    }
    MP_EXPECT_OK(graph_.AddPacketToInputStream("image_cpu_frames",
                                               input_frames_packets_[j]));
    Packet track_time_packet =
        Adopt(new int(0)).At(input_frames_packets_[j].Timestamp());
    MP_EXPECT_OK(
        graph_.AddPacketToInputStream("track_time", track_time_packet));
    MP_EXPECT_OK(graph_.WaitUntilIdle());
    if (j == kRestartFrame) {
      MP_EXPECT_OK(
          graph_.AddPacketToInputStream("restart_pos", restart_pos_packet));
    }
  }
  MP_EXPECT_OK(graph_.CloseAllInputStreams());
  MP_EXPECT_OK(graph_.WaitUntilDone());
  // One output per frame plus one per mid-frame query.
  EXPECT_EQ(input_frames_packets_.size() * 2 - 1, output_packets_.size());
  for (int i = 0; i < output_packets_.size(); ++i) {
    const TimedBoxProtoList& boxes =
        output_packets_[i].Get<TimedBoxProtoList>();
    EXPECT_EQ(is_quad_tracking.size(), boxes.box_size());
    // Outputs alternate between whole and half frames.
    float frame_id = i / 2.0f;
    float expected_frame_id;
    if (frame_id <= kRestartFrame) {
      // before transition
      expected_frame_id = frame_id;
    } else {
      float transition_frames = frame_id - kRestartFrame;
      if (transition_frames <= kTransitionFrames) {
        // transitioning: linear blend from the restart position (frame 0)
        // back onto the tracked trajectory.
        expected_frame_id =
            kRestartFrame -
            transition_frames / kTransitionFrames * kRestartFrame +
            transition_frames;
      } else {
        // after transition.
        expected_frame_id = transition_frames;
      }
    }
    for (int j = 0; j < boxes.box_size(); ++j) {
      const TimedBoxProto& box = boxes.box(j);
      if (is_quad_tracking[box.id()]) {
        ExpectQuadAtFrame(box, expected_frame_id,
                          is_pnp_tracking[box.id()] ? kImageAspectRatio : -1.0f,
                          is_reacquisition[box.id()]);
      } else {
        ExpectBoxAtFrame(box, expected_frame_id, is_reacquisition[box.id()]);
      }
    }
  }
}
// TODO: Add test for reacquisition.
} // namespace
} // namespace mediapipe

View File

@ -39,6 +39,11 @@ pipeline.
the same TFLite model in a CPU-based pipeline. This example highlights how
graphs can be easily adapted to run on CPU v.s. GPU.
### Object Detection and Tracking with GPU
[Object Detection and Tracking with GPU](./object_tracking_mobile_gpu.md) illustrates how to
use MediaPipe for object detection and tracking.
### Face Detection with GPU
[Face Detection with GPU](./face_detection_mobile_gpu.md) illustrates how to use

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.6 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

View File

@ -63,6 +63,7 @@ To build and run iOS apps:
```bash
$ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \
libopencv-calib3d-dev libopencv-features2d-dev \
libopencv-imgproc-dev libopencv-video-dev
```
@ -411,6 +412,7 @@ To build and run iOS apps:
```bash
username@DESKTOP-TMVLBJ1:~/mediapipe$ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \
libopencv-calib3d-dev libopencv-features2d-dev \
libopencv-imgproc-dev libopencv-video-dev
```

View File

@ -0,0 +1,475 @@
# Object Detection and Tracking
This doc focuses on the
[below example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt)
that performs object detection and tracking.
Note that object detection is using TensorFlow Lite on GPU while tracking is using CPU.
For overall context on object detection and tracking, please read this
[Google Developer Blog](https://mediapipe.page.link/objecttrackingblog).
![object_tracking_android_gpu_gif](images/mobile/object_tracking_android_gpu.gif)
## Android
[Source](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu)
To build and install the app:
```bash
bazel build -c opt --config=android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu
```
Once the app is built, install it on Android device with:
```bash
adb install bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/objecttrackinggpu.apk
```
## Graph
The object detection and tracking [main graph](#main-graph) internally utilizes a
[object detection subgraph](#object-detection-subgraph), a
[object tracking subgraph](#object-tracking-subgraph) and a
[renderer subgraph](#renderer-subgraph).
The subgraphs show up in the main graph visualization as nodes colored in
purple, and the subgraph itself can also be visualized just like a regular
graph. For more information on how to visualize a graph that includes subgraphs,
see the Visualizing Subgraphs section in the
[visualizer documentation](./visualizer.md).
### Main Graph
![object_detection_mobile_gpu_graph](images/mobile/object_tracking_mobile_gpu.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt)
```bash
# MediaPipe graph that performs object detection and tracking.
# Used in the examples in
# mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu
# Images on GPU coming into and out of the graph.
input_stream: "input_video"
output_stream: "output_video"
# Resamples the images by specific frame rate. This calculator is used to
# control the frequency of subsequent calculators/subgraphs, e.g. less power
# consumption for expensive process.
node {
calculator: "PacketResamplerCalculator"
input_stream: "DATA:input_video"
output_stream: "DATA:throttled_input_video"
node_options: {
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
frame_rate: 0.5
}
}
}
# Subgraph that detects objects (see object_detection_gpu.pbtxt).
node {
calculator: "ObjectDetectionSubgraph"
input_stream: "IMAGE:throttled_input_video"
output_stream: "DETECTIONS:output_detections"
}
# Subgraph that tracks objects (see object_tracking.pbtxt).
node {
calculator: "ObjectTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "DETECTIONS:output_detections"
output_stream: "DETECTIONS:tracked_detections"
}
# Subgraph that renders annotations and overlays them on top of the input
# images (see renderer_gpu.pbtxt).
node {
calculator: "RendererSubgraph"
input_stream: "IMAGE:input_video"
input_stream: "DETECTIONS:tracked_detections"
output_stream: "IMAGE:output_video"
}
```
### Object Detection Subgraph
![object_detection_gpu_subgraph](images/mobile/object_detection_gpu_subgraph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/object_detection_gpu.pbtxt)
```bash
# MediaPipe object detection subgraph.
type: "ObjectDetectionSubgraph"
input_stream: "IMAGE:input_video"
output_stream: "DETECTIONS:output_detections"
# Transforms the input image on GPU to a 320x320 image. To scale the image, by
# default it uses the STRETCH scale mode that maps the entire input image to the
# entire transformed image. As a result, image aspect ratio may be changed and
# objects in the image may be deformed (stretched or squeezed), but the object
# detection model used in this graph is agnostic to that deformation.
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:transformed_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 320
output_height: 320
}
}
}
# Converts the transformed input image on GPU into an image tensor stored as a
# TfLiteTensor.
node {
calculator: "TfLiteConverterCalculator"
input_stream: "IMAGE_GPU:transformed_input_video"
output_stream: "TENSORS_GPU:image_tensor"
}
# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
# vector of tensors representing, for instance, detection boxes/keypoints and
# scores.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS_GPU:image_tensor"
output_stream: "TENSORS_GPU:detection_tensors"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "mediapipe/models/ssdlite_object_detection.tflite"
}
}
}
# Generates a single side packet containing a vector of SSD anchors based on
# the specification in the options.
node {
calculator: "SsdAnchorsCalculator"
output_side_packet: "anchors"
node_options: {
[type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
input_size_height: 320
input_size_width: 320
anchor_offset_x: 0.5
anchor_offset_y: 0.5
strides: 16
strides: 32
strides: 64
strides: 128
strides: 256
strides: 512
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
reduce_boxes_in_lowest_layer: true
}
}
}
# Decodes the detection tensors generated by the TensorFlow Lite model, based on
# the SSD anchors and the specification in the options, into a vector of
# detections. Each detection describes a detected object.
node {
calculator: "TfLiteTensorsToDetectionsCalculator"
input_stream: "TENSORS_GPU:detection_tensors"
input_side_packet: "ANCHORS:anchors"
output_stream: "DETECTIONS:detections"
node_options: {
[type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] {
num_classes: 91
num_boxes: 2034
num_coords: 4
ignore_classes: 0
sigmoid_score: true
apply_exponential_on_box_size: true
x_scale: 10.0
y_scale: 10.0
h_scale: 5.0
w_scale: 5.0
min_score_thresh: 0.6
}
}
}
# Performs non-max suppression to remove excessive detections.
node {
calculator: "NonMaxSuppressionCalculator"
input_stream: "detections"
output_stream: "filtered_detections"
node_options: {
[type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] {
min_suppression_threshold: 0.4
max_num_detections: 3
overlap_type: INTERSECTION_OVER_UNION
return_empty_detections: true
}
}
}
# Maps detection label IDs to the corresponding label text. The label map is
# provided in the label_map_path option.
node {
calculator: "DetectionLabelIdToTextCalculator"
input_stream: "filtered_detections"
output_stream: "output_detections"
node_options: {
[type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] {
label_map_path: "mediapipe/models/ssdlite_object_detection_labelmap.txt"
}
}
}
```
### Object Tracking Subgraph
Object tracking subgraph uses a box tracking subgraph which is a generic
tracking library that can be used for other use cases.
![object_tracking_subgraph](images/mobile/object_tracking_subgraph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/object_tracking.pbtxt)
```bash
# MediaPipe object tracking subgraph.
type: "ObjectTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "DETECTIONS:new_detections"
output_stream: "DETECTIONS:tracked_detections"
# Assigns a unique id for each new detection.
node {
calculator: "DetectionUniqueIdCalculator"
input_stream: "DETECTIONS:new_detections"
output_stream: "DETECTIONS:detections_with_id"
}
# Converts detections to TimedBox protos which are used as initial location
# for tracking.
node {
calculator: "DetectionsToTimedBoxListCalculator"
input_stream: "DETECTIONS:detections_with_id"
output_stream: "BOXES:start_pos"
}
# Subgraph that tracks boxes (see box_tracking.pbtxt).
node {
calculator: "BoxTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "BOXES:start_pos"
input_stream: "CANCEL_ID:cancel_object_id"
output_stream: "BOXES:boxes"
}
# Manages newly detected objects and objects that are being tracked.
# It associates the duplicated detections and updates the locations of
# detections from tracking.
node: {
calculator: "TrackedDetectionManagerCalculator"
input_stream: "DETECTIONS:detections_with_id"
input_stream: "TRACKING_BOXES:boxes"
output_stream: "DETECTIONS:tracked_detections"
output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "TRACKING_BOXES"
}
sync_set {
tag_index: "DETECTIONS"
}
}
}
}
}
```
### Box Tracking Subgraph
![box_tracking_subgraph](images/mobile/box_tracking_subgraph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/box_tracking.pbtxt)
```bash
# MediaPipe box tracking subgraph.
type: "BoxTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "BOXES:start_pos"
input_stream: "CANCEL_ID:cancel_object_id"
output_stream: "BOXES:boxes"
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:downscaled_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 240
output_height: 320
}
}
}
# Converts GPU buffer to ImageFrame for processing tracking.
node: {
calculator: "GpuBufferToImageFrameCalculator"
input_stream: "downscaled_input_video"
output_stream: "downscaled_input_video_cpu"
}
# Performs motion analysis on an incoming video stream.
node: {
calculator: "MotionAnalysisCalculator"
input_stream: "VIDEO:downscaled_input_video_cpu"
output_stream: "CAMERA:camera_motion"
output_stream: "FLOW:region_flow"
node_options: {
[type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: {
analysis_options {
analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
flow_options {
fast_estimation_min_block_size: 100
top_inlier_sets: 1
frac_inlier_error_threshold: 3e-3
downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
verification_distance: 5.0
verify_long_feature_acceleration: true
verify_long_feature_trigger_ratio: 0.1
tracking_options {
max_features: 500
adaptive_extraction_levels: 2
min_eig_val_settings {
adaptive_lowest_quality_level: 2e-4
}
klt_tracker_implementation: KLT_OPENCV
}
}
}
}
}
}
# Reads optical flow fields defined in
# mediapipe/framework/formats/motion/optical_flow_field.h,
# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized
# to 0-255.
node: {
calculator: "FlowPackagerCalculator"
input_stream: "FLOW:region_flow"
input_stream: "CAMERA:camera_motion"
output_stream: "TRACKING:tracking_data"
node_options: {
[type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
flow_packager_options: {
binary_tracking_data_support: false
}
}
}
}
# Tracks box positions over time.
node: {
calculator: "BoxTrackerCalculator"
input_stream: "TRACKING:tracking_data"
input_stream: "TRACK_TIME:input_video"
input_stream: "START_POS:start_pos"
input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
input_stream_info: {
tag_index: "CANCEL_OBJECT_ID"
back_edge: true
}
output_stream: "BOXES:boxes"
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "TRACKING"
tag_index: "TRACK_TIME"
}
sync_set {
tag_index: "START_POS"
}
sync_set {
tag_index: "CANCEL_OBJECT_ID"
}
}
}
}
node_options: {
[type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
tracker_options: {
track_step_options {
track_object_and_camera: true
tracking_degrees: TRACKING_DEGREE_OBJECT_SCALE
inlier_spring_force: 0.0
static_motion_temporal_ratio: 3e-2
}
}
visualize_tracking_data: false
streaming_track_data_cache_size: 100
}
}
}
```
### Renderer Subgraph
![hand_renderer_gpu_subgraph.pbtxt](images/mobile/object_tracking_renderer_gpu_subgraph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/renderer_gpu.pbtxt)
```bash
# MediaPipe object tracking rendering subgraph.
type: "RendererSubgraph"
input_stream: "IMAGE:input_image"
input_stream: "DETECTIONS:detections"
output_stream: "IMAGE:output_image"
# Converts the detections to drawing primitives for annotation overlay.
node {
calculator: "DetectionsToRenderDataCalculator"
input_stream: "DETECTIONS:detections"
output_stream: "RENDER_DATA:detections_render_data"
node_options: {
[type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] {
thickness: 4.0
color { r: 255 g: 0 b: 0 }
render_detection_id: true
}
}
}
# Draws annotations and overlays them on top of the input images.
node {
calculator: "AnnotationOverlayCalculator"
input_stream: "INPUT_FRAME_GPU:input_image"
input_stream: "detections_render_data"
output_stream: "OUTPUT_FRAME_GPU:output_image"
}
```

View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.google.mediapipe.apps.objecttrackinggpu">
<uses-sdk
android:minSdkVersion="21"
android:targetSdkVersion="27" />
<!-- For using the camera -->
<uses-permission android:name="android.permission.CAMERA" />
<uses-feature android:name="android.hardware.camera" />
<uses-feature android:name="android.hardware.camera.autofocus" />
<!-- For MediaPipe -->
<uses-feature android:glEsVersion="0x00020000" android:required="true" />
<application
android:allowBackup="true"
android:label="@string/app_name"
android:supportsRtl="true"
android:theme="@style/AppTheme">
<activity
android:name=".MainActivity"
android:exported="true"
android:screenOrientation="portrait">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View File

@ -0,0 +1,81 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:private"])
# JNI shared library bundling the MediaPipe framework and all calculators
# needed by the tracking graph; loaded from Java via
# System.loadLibrary("mediapipe_jni").
cc_binary(
    name = "libmediapipe_jni.so",
    linkshared = 1,
    linkstatic = 1,
    deps = [
        "//mediapipe/graphs/tracking:mobile_calculators",
        "//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni",
    ],
)

# Wraps the prebuilt .so so it can appear in android_library deps.
cc_library(
    name = "mediapipe_jni_lib",
    srcs = [":libmediapipe_jni.so"],
    alwayslink = 1,
)

# Maps the binary graph to an alias (e.g., the app name) for convenience so that the alias can be
# easily incorporated into the app via, for example,
# MainActivity.BINARY_GRAPH_NAME = "appname.binarypb".
genrule(
    name = "binary_graph",
    srcs = ["//mediapipe/graphs/tracking:mobile_gpu_binary_graph"],
    outs = ["objecttrackinggpu.binarypb"],
    cmd = "cp $< $@",
)

# App library: Java sources plus the graph binary and TFLite model/labelmap
# packaged as Android assets (assets_dir = "" keeps them at the asset root,
# matching the paths the graph's calculators expect).
android_library(
    name = "mediapipe_lib",
    srcs = glob(["*.java"]),
    assets = [
        ":binary_graph",
        "//mediapipe/models:ssdlite_object_detection.tflite",
        "//mediapipe/models:ssdlite_object_detection_labelmap.txt",
    ],
    assets_dir = "",
    manifest = "AndroidManifest.xml",
    resource_files = glob(["res/**"]),
    deps = [
        ":mediapipe_jni_lib",
        "//mediapipe/java/com/google/mediapipe/components:android_camerax_helper",
        "//mediapipe/java/com/google/mediapipe/components:android_components",
        "//mediapipe/java/com/google/mediapipe/framework:android_framework",
        "//mediapipe/java/com/google/mediapipe/glutil",
        "//third_party:androidx_appcompat",
        "//third_party:androidx_constraint_layout",
        "//third_party:androidx_legacy_support_v4",
        "//third_party:androidx_recyclerview",
        "//third_party:opencv",
        "@androidx_concurrent_futures//jar",
        "@androidx_lifecycle//jar",
        "@com_google_guava_android//jar",
    ],
)

# Final installable APK target.
android_binary(
    name = "objecttrackinggpu",
    manifest = "AndroidManifest.xml",
    manifest_values = {"applicationId": "com.google.mediapipe.apps.objecttrackinggpu"},
    multidex = "native",
    deps = [
        ":mediapipe_lib",
    ],
)

View File

@ -0,0 +1,167 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.mediapipe.apps.objecttrackinggpu;

import android.graphics.SurfaceTexture;
import android.os.Bundle;
import androidx.appcompat.app.AppCompatActivity;
import android.util.Size;
import android.view.SurfaceHolder;
import android.view.SurfaceView;
import android.view.View;
import android.view.ViewGroup;
import com.google.mediapipe.components.CameraHelper;
import com.google.mediapipe.components.CameraXPreviewHelper;
import com.google.mediapipe.components.ExternalTextureConverter;
import com.google.mediapipe.components.FrameProcessor;
import com.google.mediapipe.components.PermissionHelper;
import com.google.mediapipe.framework.AndroidAssetUtil;
import com.google.mediapipe.glutil.EglManager;

/**
 * Main activity of MediaPipe example apps.
 *
 * <p>Feeds back-camera preview frames into the {@code objecttrackinggpu.binarypb} MediaPipe graph
 * and renders the graph's annotated output frames onto a {@link SurfaceView}.
 */
public class MainActivity extends AppCompatActivity {
  private static final String TAG = "MainActivity";

  // Asset name of the compiled MediaPipe graph, and its input/output stream names.
  private static final String BINARY_GRAPH_NAME = "objecttrackinggpu.binarypb";
  private static final String INPUT_VIDEO_STREAM_NAME = "input_video";
  private static final String OUTPUT_VIDEO_STREAM_NAME = "output_video";
  private static final CameraHelper.CameraFacing CAMERA_FACING = CameraHelper.CameraFacing.BACK;

  // Flips the camera-preview frames vertically before sending them into FrameProcessor to be
  // processed in a MediaPipe graph, and flips the processed frames back when they are displayed.
  // This is needed because OpenGL represents images assuming the image origin is at the bottom-left
  // corner, whereas MediaPipe in general assumes the image origin is at top-left.
  private static final boolean FLIP_FRAMES_VERTICALLY = true;

  static {
    // Load all native libraries needed by the app.
    System.loadLibrary("mediapipe_jni");
    System.loadLibrary("opencv_java3");
  }

  // {@link SurfaceTexture} where the camera-preview frames can be accessed.
  private SurfaceTexture previewFrameTexture;
  // {@link SurfaceView} that displays the camera-preview frames processed by a MediaPipe graph.
  private SurfaceView previewDisplayView;
  // Creates and manages an {@link EGLContext}.
  private EglManager eglManager;
  // Sends camera-preview frames into a MediaPipe graph for processing, and displays the processed
  // frames onto a {@link Surface}.
  private FrameProcessor processor;
  // Converts the GL_TEXTURE_EXTERNAL_OES texture from Android camera into a regular texture to be
  // consumed by {@link FrameProcessor} and the underlying MediaPipe graph.
  private ExternalTextureConverter converter;
  // Handles camera access via the {@link CameraX} Jetpack support library.
  // NOTE(review): assigned only in startCamera(); assumed non-null by the time surfaceChanged()
  // fires, i.e. camera permission was granted before the preview surface appeared — confirm.
  private CameraXPreviewHelper cameraHelper;

  // Builds the preview display, wires up the MediaPipe graph processor, and requests camera
  // permission (the camera itself is started from onResume once permission is granted).
  @Override
  protected void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_main);

    previewDisplayView = new SurfaceView(this);
    setupPreviewDisplayView();

    // Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g.,
    // binary graphs.
    AndroidAssetUtil.initializeNativeAssetManager(this);

    eglManager = new EglManager(null);
    processor =
        new FrameProcessor(
            this,
            eglManager.getNativeContext(),
            BINARY_GRAPH_NAME,
            INPUT_VIDEO_STREAM_NAME,
            OUTPUT_VIDEO_STREAM_NAME);
    processor.getVideoSurfaceOutput().setFlipY(FLIP_FRAMES_VERTICALLY);

    PermissionHelper.checkAndRequestCameraPermissions(this);
  }

  // Recreates the texture converter and (re)starts the camera each time the activity is resumed.
  @Override
  protected void onResume() {
    super.onResume();
    converter = new ExternalTextureConverter(eglManager.getContext());
    converter.setFlipY(FLIP_FRAMES_VERTICALLY);
    converter.setConsumer(processor);
    if (PermissionHelper.cameraPermissionsGranted(this)) {
      startCamera();
    }
  }

  // Releases the converter's GL resources; a fresh converter is created in onResume.
  @Override
  protected void onPause() {
    super.onPause();
    converter.close();
  }

  // Forwards the permission dialog result to PermissionHelper; onResume then starts the camera.
  @Override
  public void onRequestPermissionsResult(
      int requestCode, String[] permissions, int[] grantResults) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults);
    PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults);
  }

  // Adds the (initially hidden) preview SurfaceView to the layout and installs surface callbacks
  // that connect/disconnect the MediaPipe graph's video output to the surface.
  private void setupPreviewDisplayView() {
    previewDisplayView.setVisibility(View.GONE);
    ViewGroup viewGroup = findViewById(R.id.preview_display_layout);
    viewGroup.addView(previewDisplayView);

    previewDisplayView
        .getHolder()
        .addCallback(
            new SurfaceHolder.Callback() {
              @Override
              public void surfaceCreated(SurfaceHolder holder) {
                processor.getVideoSurfaceOutput().setSurface(holder.getSurface());
              }

              @Override
              public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
                // (Re-)Compute the ideal size of the camera-preview display (the area that the
                // camera-preview frames get rendered onto, potentially with scaling and rotation)
                // based on the size of the SurfaceView that contains the display.
                Size viewSize = new Size(width, height);
                Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize);

                // Connect the converter to the camera-preview frames as its input (via
                // previewFrameTexture), and configure the output width and height as the computed
                // display size.
                converter.setSurfaceTextureAndAttachToGLContext(
                    previewFrameTexture, displaySize.getWidth(), displaySize.getHeight());
              }

              @Override
              public void surfaceDestroyed(SurfaceHolder holder) {
                processor.getVideoSurfaceOutput().setSurface(null);
              }
            });
  }

  // Opens the back camera via CameraX and reveals the preview display once frames are available.
  private void startCamera() {
    cameraHelper = new CameraXPreviewHelper();
    cameraHelper.setOnCameraStartedListener(
        surfaceTexture -> {
          previewFrameTexture = surfaceTexture;
          // Make the display view visible to start showing the preview. This triggers the
          // SurfaceHolder.Callback added to (the holder of) previewDisplayView.
          previewDisplayView.setVisibility(View.VISIBLE);
        });
    cameraHelper.startCamera(this, CAMERA_FACING, /*surfaceTexture=*/ null);
  }
}

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="utf-8"?>
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:layout_width="match_parent"
android:layout_height="match_parent">
<FrameLayout
android:id="@+id/preview_display_layout"
android:layout_width="fill_parent"
android:layout_height="fill_parent"
android:layout_weight="1">
<TextView
android:id="@+id/no_camera_access_view"
android:layout_height="fill_parent"
android:layout_width="fill_parent"
android:gravity="center"
android:text="@string/no_camera_access" />
</FrameLayout>
</androidx.constraintlayout.widget.ConstraintLayout>

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="colorPrimary">#008577</color>
<color name="colorPrimaryDark">#00574B</color>
<color name="colorAccent">#D81B60</color>
</resources>

View File

@ -0,0 +1,4 @@
<resources>
<string name="app_name" translatable="false">Object Tracking GPU</string>
<string name="no_camera_access" translatable="false">Please grant camera permissions.</string>
</resources>

View File

@ -0,0 +1,11 @@
<resources>
<!-- Base application theme. -->
<style name="AppTheme" parent="Theme.AppCompat.Light.DarkActionBar">
<!-- Customize your theme here. -->
<item name="colorPrimary">@color/colorPrimary</item>
<item name="colorPrimaryDark">@color/colorPrimaryDark</item>
<item name="colorAccent">@color/colorAccent</item>
</style>
</resources>

View File

@ -93,15 +93,15 @@ FILEPATTERN = "kinetics_700_%s_25fps_rgb_flow"
SPLITS = {
"train": {
"shards": 1000,
"examples": 541490
"examples": 541279
},
"validate": {
"shards": 100,
"examples": 34715
"examples": 34688
},
"test": {
"shards": 100,
"examples": 69321
"examples": 69278
},
"custom": {
"csv": None, # Add a CSV for your own data here.

View File

@ -541,5 +541,90 @@ TEST_F(CalculatorGraphEventLoopTest, WaitToAddPacketToInputStream) {
ASSERT_EQ(kNumInputPackets, output_packets_.size());
}
// Captures log messages during testing.
class TextMessageLogSink : public LogSink {
public:
std::vector<std::string> messages;
void Send(const LogEntry& entry) {
messages.push_back(std::string(entry.text_message()));
}
};
// Verifies that CalculatorGraph::UnthrottleSources does not run repeatedly
// in a "busy-loop" while the graph is throttled due to a graph-output stream.
TEST_F(CalculatorGraphEventLoopTest, UnthrottleSources) {
  // A trivial pass-through graph; max_queue_size bounds the input stream so
  // the graph can become throttled by its full output-stream queue.
  CalculatorGraphConfig graph_config;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(
      R"(
        node {
          calculator: "PassThroughCalculator"
          input_stream: "input_numbers"
          output_stream: "output_numbers"
        }
        input_stream: "input_numbers"
        output_stream: "output_numbers"
        num_threads: 2
        max_queue_size: 5
      )",
      &graph_config));
  // Must match max_queue_size above: the poller queue fills after exactly
  // this many packets.
  constexpr int kQueueSize = 5;

  // Initialize and start the mediapipe graph.
  CalculatorGraph graph;
  MP_ASSERT_OK(graph.Initialize(graph_config));
  graph.SetGraphInputStreamAddMode(
      CalculatorGraph::GraphInputStreamAddMode::ADD_IF_NOT_FULL);
  auto poller_status = graph.AddOutputStreamPoller("output_numbers");
  MP_ASSERT_OK(poller_status.status());
  mediapipe::OutputStreamPoller& poller = poller_status.ValueOrDie();
  poller.SetMaxQueueSize(kQueueSize);
  MP_ASSERT_OK(graph.StartRun({}));

  // Lambda that adds a packet to the calculator graph.
  auto add_packet = [&graph](std::string s, int i) {
    return graph.AddPacketToInputStream(s, MakePacket<int>(i).At(Timestamp(i)));
  };

  // Start capturing VLOG messages from the mediapipe::Scheduler.
  TextMessageLogSink log_listener;
  mediapipe::AddLogSink(&log_listener);
  SetVLOGLevel("scheduler", 3);

  // Add just enough packets to fill the output stream queue.
  std::vector<Packet> out_packets;
  for (int i = 0; i < kQueueSize; ++i) {
    MP_EXPECT_OK(add_packet("input_numbers", i));
    MP_EXPECT_OK(graph.WaitUntilIdle());
  }

  // The graph is throttled due to the full output stream.
  EXPECT_FALSE(add_packet("input_numbers", kQueueSize).ok());
  // CalculatorGraph::UnthrottleSources should be called just one time.
  // Sleep long enough that a busy-loop, if present, would emit many more
  // "HandleIdle: unthrottling" messages than the one or two expected.
  absl::SleepFor(absl::Milliseconds(100));

  // Read all packets from the output stream queue and close the graph.
  for (int i = 0; i < kQueueSize; ++i) {
    Packet packet;
    EXPECT_TRUE(poller.Next(&packet));
    out_packets.push_back(packet);
  }
  MP_EXPECT_OK(graph.CloseAllInputStreams());
  MP_EXPECT_OK(graph.WaitUntilDone());
  EXPECT_EQ(kQueueSize, out_packets.size());

  // Stop capturing VLOG messages.
  SetVLOGLevel("scheduler", 0);
  mediapipe::RemoveLogSink(&log_listener);

  // Count and validate the calls to UnthrottleSources.
  int loop_count = 0;
  for (auto& message : log_listener.messages) {
    loop_count += (message == "HandleIdle: unthrottling") ? 1 : 0;
  }
  // Exactly one unthrottle is expected; two is tolerated for benign
  // scheduling races, anything more indicates a busy-loop.
  EXPECT_GE(loop_count, 1);
  EXPECT_LE(loop_count, 2);
}
} // namespace
} // namespace mediapipe

View File

@ -241,7 +241,9 @@ cc_test(
proto_library(
name = "rect_proto",
srcs = ["rect.proto"],
visibility = ["//visibility:public"],
visibility = [
"//visibility:public",
],
)
mediapipe_cc_proto_library(
@ -253,10 +255,21 @@ mediapipe_cc_proto_library(
deps = [":rect_proto"],
)
# Java lite protos for rect.proto.
java_lite_proto_library(
    name = "rect_java_proto_lite",
    strict_deps = 0,
    visibility = [
        "//mediapipe:__subpackages__",
    ],
    # Fixed: previously depended on ":landmark_proto", which made this target
    # generate the Landmark Java protos instead of the Rect protos.
    deps = [":rect_proto"],
)
proto_library(
name = "landmark_proto",
srcs = ["landmark.proto"],
visibility = ["//visibility:public"],
visibility = [
"//visibility:public",
],
)
mediapipe_cc_proto_library(
@ -269,7 +282,9 @@ mediapipe_cc_proto_library(
java_lite_proto_library(
name = "landmark_java_proto_lite",
strict_deps = 0,
visibility = ["//mediapipe:__subpackages__"],
visibility = [
"//mediapipe:__subpackages__",
],
deps = [":landmark_proto"],
)

View File

@ -176,6 +176,7 @@ cc_library(
deps = [
"//mediapipe/framework:port",
"@com_github_glog_glog//:glog",
"@com_google_absl//absl/time",
],
)

View File

@ -15,6 +15,50 @@
#ifndef MEDIAPIPE_PORT_LOGGING_H_
#define MEDIAPIPE_PORT_LOGGING_H_
#include "absl/time/time.h"
#include "glog/logging.h"
namespace mediapipe {
using LogSeverity = google::LogSeverity;
const auto SetVLOGLevel = google::SetVLOGLevel;
// One captured log record: severity, timestamp, and message text, as
// delivered to a mediapipe::LogSink.
//
// NOTE: text_message() is an absl::string_view into a buffer owned by the
// logging library; it is only valid for the duration of LogSink::Send().
// Copy it (e.g. into a std::string) if it must outlive the callback.
class LogEntry {
 public:
  // Builds an entry from the raw glog callback arguments; `tm_time` is
  // interpreted in the local time zone.
  LogEntry(LogSeverity severity, const struct ::tm* tm_time,
           absl::string_view message)
      : severity_(severity),
        timestamp_(absl::FromTM(*tm_time, absl::LocalTimeZone())),
        text_message_(message) {}
  // Severity of the message (e.g. INFO, WARNING, ERROR).
  LogSeverity log_severity() const { return severity_; }
  // Time the message was logged, derived from the glog tm in local time.
  absl::Time timestamp() const { return timestamp_; }
  // Message text; non-owning — see class comment for lifetime.
  absl::string_view text_message() const { return text_message_; }

 private:
  LogSeverity severity_;
  absl::Time timestamp_;
  absl::string_view text_message_;  // Non-owning view; see class comment.
};
class LogSink : public google::LogSink {
public:
virtual ~LogSink() = default;
virtual void Send(const LogEntry& entry) = 0;
virtual void WaitTillSent() {}
private:
virtual void send(LogSeverity severity, const char* full_filename,
const char* base_filename, int line,
const struct ::tm* tm_time, const char* message,
size_t message_len) {
LogEntry log_entry(severity, tm_time,
absl::string_view(message, message_len));
Send(log_entry);
}
};
// Registers `destination` to receive all subsequently logged messages.
// The sink must stay alive until RemoveLogSink() is called on it.
inline void AddLogSink(LogSink* destination) {
  google::AddLogSink(destination);
}
// Unregisters a sink previously registered with AddLogSink().
inline void RemoveLogSink(LogSink* destination) {
  google::RemoveLogSink(destination);
}
} // namespace mediapipe
#endif // MEDIAPIPE_PORT_LOGGING_H_

View File

@ -0,0 +1,176 @@
# MediaPipe graph that performs LSTM based object detection on desktop with
# TensorFlow Lite on CPU.
# max_queue_size limits the number of packets enqueued on any input stream
# by throttling inputs to the graph. This makes the graph process only one
# frame at a time.
max_queue_size: 1
# Decodes an input video file into images and a video header.
node {
calculator: "OpenCvVideoDecoderCalculator"
input_side_packet: "INPUT_FILE_PATH:input_video_path"
output_stream: "VIDEO:input_video"
output_stream: "VIDEO_PRESTREAM:input_video_header"
}
# Transforms the input image on CPU to a 256x256 image. To scale the image, by
# default it uses the STRETCH scale mode that maps the entire input image to the
# entire transformed image. As a result, image aspect ratio may be changed and
# objects in the image may be deformed (stretched or squeezed), but the object
# detection model used in this graph is agnostic to that deformation.
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE:input_video"
output_stream: "IMAGE:transformed_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 256
output_height: 256
}
}
}
# Converts the transformed input image on CPU into an image tensor as a
# TfLiteTensor. The zero_center option is set to true to normalize the
# pixel values to [-1.f, 1.f] as opposed to [0.f, 1.f].
node {
calculator: "TfLiteConverterCalculator"
input_stream: "IMAGE:transformed_input_video"
output_stream: "TENSORS:image_tensor"
node_options: {
[type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] {
zero_center: true
use_quantized_tensors: true
}
}
}
# Joins the input image_tensor and previous lstm state tensors into a single
# output vector of tensors.
node {
calculator: "ConcatenateTfLiteTensorVectorCalculator"
input_stream: "image_tensor"
input_stream: "prev_lstm_tensors"
output_stream: "concatenated_tensors"
}
# Runs a TensorFlow Lite model on CPU that takes an image tensor and state
# tensors, then outputs a vector of tensors representing, for instance,
# detection boxes/keypoints and scores.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS:concatenated_tensors"
output_stream: "TENSORS:inference_tensors"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "mediapipe/models/lstdlite_object_detection.tflite"
}
}
}
# Splits up the output tensors into two streams [detection_boxes,
# detection_classes, detection_scores, num_detections], and [lstm_c, lstm_h].
node {
calculator: "SplitTfLiteTensorVectorCalculator"
input_stream: "inference_tensors"
output_stream: "detection_tensors"
output_stream: "lstm_tensors"
options {
[type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] {
ranges: { begin: 0 end: 4 }
ranges: { begin: 4 end: 6 }
}
}
}
# Waits for the LSTM state output from the TfLite inferencing to feed back in as
# an input tensor for the next frame.
node {
calculator: "PreviousLoopbackCalculator"
input_stream: "MAIN:transformed_input_video"
input_stream: "LOOP:lstm_tensors"
input_stream_info: { tag_index: "LOOP" back_edge: true }
output_stream: "PREV_LOOP:prev_lstm_tensors"
}
# Decodes the detection tensors generated by the TensorFlow Lite model, based on
# the specification in the options, into a vector of detections. Each detection
# describes a detected object.
node {
calculator: "TfLiteTensorsToDetectionsCalculator"
input_stream: "TENSORS:detection_tensors"
output_stream: "DETECTIONS:detections"
node_options: {
[type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] {
num_classes: 7
num_boxes: 40
num_coords: 4
}
}
}
# Performs non-max suppression to remove excessive detections.
node {
calculator: "NonMaxSuppressionCalculator"
input_stream: "detections"
output_stream: "filtered_detections"
node_options: {
[type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] {
min_suppression_threshold: 0.4
min_score_threshold: 0.2
max_num_detections: 5
overlap_type: INTERSECTION_OVER_UNION
}
}
}
# Maps detection label IDs to the corresponding label text. The label map is
# provided in the label_map_path option.
node {
calculator: "DetectionLabelIdToTextCalculator"
input_stream: "filtered_detections"
output_stream: "output_detections"
node_options: {
[type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] {
label_map_path: "mediapipe/models/lstdlite_object_detection_labelmap.txt"
}
}
}
# Converts the detections to drawing primitives for annotation overlay.
node {
calculator: "DetectionsToRenderDataCalculator"
input_stream: "DETECTIONS:output_detections"
output_stream: "RENDER_DATA:render_data"
node_options: {
[type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] {
thickness: 2.0
color { r: 255 g: 255 b: 255 }
}
}
}
# Draws annotations and overlays them on top of the original image coming into
# the graph.
node {
calculator: "AnnotationOverlayCalculator"
input_stream: "INPUT_FRAME:input_video"
input_stream: "render_data"
output_stream: "OUTPUT_FRAME:output_video"
}
# Encodes the annotated images into a video file, adopting properties specified
# in the input video header, e.g., video framerate.
node {
calculator: "OpenCvVideoEncoderCalculator"
input_stream: "VIDEO:output_video"
input_stream: "VIDEO_PRESTREAM:input_video_header"
input_side_packet: "OUTPUT_FILE_PATH:output_video_path"
node_options: {
[type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: {
codec: "avc1"
video_format: "mp4"
}
}
}

View File

@ -0,0 +1,39 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load(
"//mediapipe/framework/tool:mediapipe_graph.bzl",
"mediapipe_binary_graph",
)
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
cc_library(
name = "mobile_calculators",
deps = [
"//mediapipe/calculators/core:packet_resampler_calculator",
"//mediapipe/graphs/tracking/subgraphs:object_detection_gpu",
"//mediapipe/graphs/tracking/subgraphs:object_tracking",
"//mediapipe/graphs/tracking/subgraphs:renderer_gpu",
],
)
mediapipe_binary_graph(
name = "mobile_gpu_binary_graph",
graph = "object_detection_tracking_mobile_gpu.pbtxt",
output_name = "mobile_gpu.binarypb",
deps = [":mobile_calculators"],
)

View File

@ -0,0 +1,46 @@
# MediaPipe graph that performs object detection and tracking with TensorFlow
# Lite on GPU.
# Used in the examples in
# mediapipe/examples/android/src/java/com/mediapipe/apps/objecttrackinggpu
# Images on GPU coming into and out of the graph.
input_stream: "input_video"
output_stream: "output_video"
# Resamples the images to a specific frame rate. This calculator is used to
# control the frequency of subsequent calculators/subgraphs, e.g. to reduce
# the power consumption of expensive downstream processing.
node {
calculator: "PacketResamplerCalculator"
input_stream: "DATA:input_video"
output_stream: "DATA:throttled_input_video"
node_options: {
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
frame_rate: 0.5
}
}
}
# Subgraph that detects objects (see object_detection_gpu.pbtxt).
node {
calculator: "ObjectDetectionSubgraph"
input_stream: "IMAGE:throttled_input_video"
output_stream: "DETECTIONS:output_detections"
}
# Subgraph that tracks objects (see object_tracking.pbtxt).
node {
calculator: "ObjectTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "DETECTIONS:output_detections"
output_stream: "DETECTIONS:tracked_detections"
}
# Subgraph that renders annotations and overlays them on top of the input
# images (see renderer_gpu.pbtxt).
node {
calculator: "RendererSubgraph"
input_stream: "IMAGE:input_video"
input_stream: "DETECTIONS:tracked_detections"
output_stream: "IMAGE:output_video"
}

View File

@ -0,0 +1,76 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load(
"//mediapipe/framework/tool:mediapipe_graph.bzl",
"mediapipe_simple_subgraph",
)
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
mediapipe_simple_subgraph(
name = "box_tracking",
graph = "box_tracking.pbtxt",
register_as = "BoxTrackingSubgraph",
deps = [
"//mediapipe/calculators/image:image_transformation_calculator",
"//mediapipe/calculators/video:box_tracker_calculator",
"//mediapipe/calculators/video:flow_packager_calculator",
"//mediapipe/calculators/video:motion_analysis_calculator",
"//mediapipe/framework/stream_handler:immediate_input_stream_handler",
"//mediapipe/framework/stream_handler:sync_set_input_stream_handler",
"//mediapipe/gpu:gpu_buffer_to_image_frame_calculator",
],
)
mediapipe_simple_subgraph(
name = "object_tracking",
graph = "object_tracking.pbtxt",
register_as = "ObjectTrackingSubgraph",
deps = [
"//mediapipe/calculators/util:detection_unique_id_calculator",
"//mediapipe/calculators/util:detections_to_timed_box_list_calculator",
"//mediapipe/calculators/video:tracked_detection_manager_calculator",
"//mediapipe/framework/stream_handler:sync_set_input_stream_handler",
"//mediapipe/graphs/tracking/subgraphs:box_tracking",
],
)
mediapipe_simple_subgraph(
name = "object_detection_gpu",
graph = "object_detection_gpu.pbtxt",
register_as = "ObjectDetectionSubgraph",
deps = [
"//mediapipe/calculators/image:image_transformation_calculator",
"//mediapipe/calculators/tflite:ssd_anchors_calculator",
"//mediapipe/calculators/tflite:tflite_converter_calculator",
"//mediapipe/calculators/tflite:tflite_inference_calculator",
"//mediapipe/calculators/tflite:tflite_tensors_to_detections_calculator",
"//mediapipe/calculators/util:detection_label_id_to_text_calculator",
"//mediapipe/calculators/util:non_max_suppression_calculator",
],
)
mediapipe_simple_subgraph(
name = "renderer_gpu",
graph = "renderer_gpu.pbtxt",
register_as = "RendererSubgraph",
deps = [
"//mediapipe/calculators/util:annotation_overlay_calculator",
"//mediapipe/calculators/util:detections_to_render_data_calculator",
"//mediapipe/calculators/util:rect_to_render_data_calculator",
],
)

View File

@ -0,0 +1,126 @@
# MediaPipe box tracking subgraph.
type: "BoxTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "BOXES:start_pos"
input_stream: "CANCEL_ID:cancel_object_id"
output_stream: "BOXES:boxes"
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:downscaled_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 240
output_height: 320
}
}
}
# Converts GPU buffer to ImageFrame for processing tracking.
node: {
calculator: "GpuBufferToImageFrameCalculator"
input_stream: "downscaled_input_video"
output_stream: "downscaled_input_video_cpu"
}
# Performs motion analysis on an incoming video stream.
node: {
calculator: "MotionAnalysisCalculator"
input_stream: "VIDEO:downscaled_input_video_cpu"
output_stream: "CAMERA:camera_motion"
output_stream: "FLOW:region_flow"
node_options: {
[type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: {
analysis_options {
analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
flow_options {
fast_estimation_min_block_size: 100
top_inlier_sets: 1
frac_inlier_error_threshold: 3e-3
downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
verification_distance: 5.0
verify_long_feature_acceleration: true
verify_long_feature_trigger_ratio: 0.1
tracking_options {
max_features: 500
adaptive_extraction_levels: 2
min_eig_val_settings {
adaptive_lowest_quality_level: 2e-4
}
klt_tracker_implementation: KLT_OPENCV
}
}
}
}
}
}
# Reads optical flow fields defined in
# mediapipe/framework/formats/motion/optical_flow_field.h,
# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized
# to 0-255.
node: {
calculator: "FlowPackagerCalculator"
input_stream: "FLOW:region_flow"
input_stream: "CAMERA:camera_motion"
output_stream: "TRACKING:tracking_data"
node_options: {
[type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
flow_packager_options: {
binary_tracking_data_support: false
}
}
}
}
# Tracks box positions over time.
node: {
calculator: "BoxTrackerCalculator"
input_stream: "TRACKING:tracking_data"
input_stream: "TRACK_TIME:input_video"
input_stream: "START_POS:start_pos"
input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
input_stream_info: {
tag_index: "CANCEL_OBJECT_ID"
back_edge: true
}
output_stream: "BOXES:boxes"
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "TRACKING"
tag_index: "TRACK_TIME"
}
sync_set {
tag_index: "START_POS"
}
sync_set {
tag_index: "CANCEL_OBJECT_ID"
}
}
}
}
node_options: {
[type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
tracker_options: {
track_step_options {
track_object_and_camera: true
tracking_degrees: TRACKING_DEGREE_OBJECT_SCALE
inlier_spring_force: 0.0
static_motion_temporal_ratio: 3e-2
}
}
visualize_tracking_data: false
streaming_track_data_cache_size: 100
}
}
}

View File

@ -0,0 +1,128 @@
# MediaPipe object detection subgraph.
type: "ObjectDetectionSubgraph"
input_stream: "IMAGE:input_video"
output_stream: "DETECTIONS:output_detections"
# Transforms the input image on GPU to a 320x320 image. To scale the image, by
# default it uses the STRETCH scale mode that maps the entire input image to the
# entire transformed image. As a result, image aspect ratio may be changed and
# objects in the image may be deformed (stretched or squeezed), but the object
# detection model used in this graph is agnostic to that deformation.
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:transformed_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 320
output_height: 320
}
}
}
# Converts the transformed input image on GPU into an image tensor stored as a
# TfLiteTensor.
node {
calculator: "TfLiteConverterCalculator"
input_stream: "IMAGE_GPU:transformed_input_video"
output_stream: "TENSORS_GPU:image_tensor"
}
# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
# vector of tensors representing, for instance, detection boxes/keypoints and
# scores.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS_GPU:image_tensor"
output_stream: "TENSORS_GPU:detection_tensors"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "mediapipe/models/ssdlite_object_detection.tflite"
}
}
}
# Generates a single side packet containing a vector of SSD anchors based on
# the specification in the options.
node {
calculator: "SsdAnchorsCalculator"
output_side_packet: "anchors"
node_options: {
[type.googleapis.com/mediapipe.SsdAnchorsCalculatorOptions] {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
input_size_height: 320
input_size_width: 320
anchor_offset_x: 0.5
anchor_offset_y: 0.5
strides: 16
strides: 32
strides: 64
strides: 128
strides: 256
strides: 512
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
reduce_boxes_in_lowest_layer: true
}
}
}
# Decodes the detection tensors generated by the TensorFlow Lite model, based on
# the SSD anchors and the specification in the options, into a vector of
# detections. Each detection describes a detected object.
node {
calculator: "TfLiteTensorsToDetectionsCalculator"
input_stream: "TENSORS_GPU:detection_tensors"
input_side_packet: "ANCHORS:anchors"
output_stream: "DETECTIONS:detections"
node_options: {
[type.googleapis.com/mediapipe.TfLiteTensorsToDetectionsCalculatorOptions] {
num_classes: 91
num_boxes: 2034
num_coords: 4
ignore_classes: 0
sigmoid_score: true
apply_exponential_on_box_size: true
x_scale: 10.0
y_scale: 10.0
h_scale: 5.0
w_scale: 5.0
min_score_thresh: 0.6
}
}
}
# Performs non-max suppression to remove excessive detections.
node {
calculator: "NonMaxSuppressionCalculator"
input_stream: "detections"
output_stream: "filtered_detections"
node_options: {
[type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] {
min_suppression_threshold: 0.4
max_num_detections: 3
overlap_type: INTERSECTION_OVER_UNION
return_empty_detections: true
}
}
}
# Maps detection label IDs to the corresponding label text. The label map is
# provided in the label_map_path option.
node {
calculator: "DetectionLabelIdToTextCalculator"
input_stream: "filtered_detections"
output_stream: "output_detections"
node_options: {
[type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] {
label_map_path: "mediapipe/models/ssdlite_object_detection_labelmap.txt"
}
}
}

View File

@ -0,0 +1,56 @@
# MediaPipe object tracking subgraph.
type: "ObjectTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "DETECTIONS:new_detections"
output_stream: "DETECTIONS:tracked_detections"
# Assigns a unique id to each new detection.
node {
calculator: "DetectionUniqueIdCalculator"
input_stream: "DETECTIONS:new_detections"
output_stream: "DETECTIONS:detections_with_id"
}
# Converts detections to TimedBox protos which are used as initial location
# for tracking.
node {
calculator: "DetectionsToTimedBoxListCalculator"
input_stream: "DETECTIONS:detections_with_id"
output_stream: "BOXES:start_pos"
}
# Subgraph that tracks boxes (see box_tracking.pbtxt).
node {
calculator: "BoxTrackingSubgraph"
input_stream: "VIDEO:input_video"
input_stream: "BOXES:start_pos"
input_stream: "CANCEL_ID:cancel_object_id"
output_stream: "BOXES:boxes"
}
# Manages newly detected objects and objects that are already being tracked.
# It associates duplicated detections and updates the locations of
# detections from tracking.
node: {
calculator: "TrackedDetectionManagerCalculator"
input_stream: "DETECTIONS:detections_with_id"
input_stream: "TRACKING_BOXES:boxes"
output_stream: "DETECTIONS:tracked_detections"
output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "TRACKING_BOXES"
}
sync_set {
tag_index: "DETECTIONS"
}
}
}
}
}

View File

@ -0,0 +1,29 @@
# MediaPipe object tracking rendering subgraph.
type: "RendererSubgraph"
input_stream: "IMAGE:input_image"
input_stream: "DETECTIONS:detections"
output_stream: "IMAGE:output_image"
# Converts the detections to drawing primitives for annotation overlay.
node {
calculator: "DetectionsToRenderDataCalculator"
input_stream: "DETECTIONS:detections"
output_stream: "RENDER_DATA:detections_render_data"
node_options: {
[type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] {
thickness: 4.0
color { r: 255 g: 0 b: 0 }
render_detection_id: true
}
}
}
# Draws annotations and overlays them on top of the input images.
node {
calculator: "AnnotationOverlayCalculator"
input_stream: "INPUT_FRAME_GPU:input_image"
input_stream: "detections_render_data"
output_stream: "OUTPUT_FRAME_GPU:output_image"
}

View File

@ -50,4 +50,16 @@ public abstract class MediaPipeRunner extends Graph {
* Stops the running graph and releases the resource. Call this in Activity onDestroy callback.
*/
public abstract void release();
/**
 * Stops the running graph and releases its resources, giving up after a
 * timeout. Call this in the Activity onDestroy callback.
 *
 * <p>Like {@link #release()} but bounded by a timeout. The default
 * implementation simply delegates to {@link #release()}; subclasses can
 * override this method if they need to enforce the timeout while tearing
 * down their context.
 *
 * @param timeoutMillis maximum time, in milliseconds, to wait for resources
 *     to be released before giving up.
 */
public void release(long timeoutMillis) {
release();
}
}

View File

@ -0,0 +1,742 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
load("//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library")
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
proto_library(
name = "tone_models_proto",
srcs = ["tone_models.proto"],
)
proto_library(
name = "tone_estimation_proto",
srcs = ["tone_estimation.proto"],
deps = [":tone_models_proto"],
)
proto_library(
name = "region_flow_computation_proto",
srcs = ["region_flow_computation.proto"],
deps = [
":tone_estimation_proto",
],
)
proto_library(
name = "motion_saliency_proto",
srcs = ["motion_saliency.proto"],
)
proto_library(
name = "motion_estimation_proto",
srcs = ["motion_estimation.proto"],
)
proto_library(
name = "motion_analysis_proto",
srcs = ["motion_analysis.proto"],
deps = [
":motion_estimation_proto",
":motion_saliency_proto",
":region_flow_computation_proto",
],
)
proto_library(
name = "region_flow_proto",
srcs = ["region_flow.proto"],
)
proto_library(
name = "motion_models_proto",
srcs = ["motion_models.proto"],
)
proto_library(
name = "camera_motion_proto",
srcs = ["camera_motion.proto"],
deps = [":motion_models_proto"],
)
proto_library(
name = "push_pull_filtering_proto",
srcs = ["push_pull_filtering.proto"],
)
proto_library(
name = "frame_selection_solution_evaluator_proto",
srcs = ["frame_selection_solution_evaluator.proto"],
)
proto_library(
name = "frame_selection_proto",
srcs = ["frame_selection.proto"],
deps = [
":camera_motion_proto",
":frame_selection_solution_evaluator_proto",
":region_flow_proto",
],
)
proto_library(
name = "flow_packager_proto",
srcs = ["flow_packager.proto"],
deps = [
":motion_models_proto",
":region_flow_proto",
],
)
proto_library(
name = "tracking_proto",
srcs = ["tracking.proto"],
deps = [
":motion_models_proto",
],
)
proto_library(
name = "box_tracker_proto",
srcs = ["box_tracker.proto"],
deps = [":tracking_proto"],
)
mediapipe_cc_proto_library(
name = "tone_models_cc_proto",
srcs = ["tone_models.proto"],
visibility = ["//visibility:public"],
deps = [":tone_models_proto"],
)
mediapipe_cc_proto_library(
name = "tone_estimation_cc_proto",
srcs = ["tone_estimation.proto"],
cc_deps = [":tone_models_cc_proto"],
visibility = ["//visibility:public"],
deps = [":tone_estimation_proto"],
)
mediapipe_cc_proto_library(
name = "region_flow_computation_cc_proto",
srcs = ["region_flow_computation.proto"],
cc_deps = [
":tone_estimation_cc_proto",
":tone_models_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":region_flow_computation_proto"],
)
mediapipe_cc_proto_library(
name = "motion_saliency_cc_proto",
srcs = ["motion_saliency.proto"],
visibility = ["//visibility:public"],
deps = [":motion_saliency_proto"],
)
mediapipe_cc_proto_library(
name = "motion_estimation_cc_proto",
srcs = ["motion_estimation.proto"],
visibility = ["//visibility:public"],
deps = [":motion_estimation_proto"],
)
mediapipe_cc_proto_library(
name = "motion_analysis_cc_proto",
srcs = ["motion_analysis.proto"],
cc_deps = [
":motion_estimation_cc_proto",
":motion_saliency_cc_proto",
":region_flow_computation_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":motion_analysis_proto"],
)
mediapipe_cc_proto_library(
name = "region_flow_cc_proto",
srcs = ["region_flow.proto"],
cc_deps = [":motion_models_cc_proto"],
visibility = ["//visibility:public"],
deps = [":region_flow_proto"],
)
mediapipe_cc_proto_library(
name = "motion_models_cc_proto",
srcs = ["motion_models.proto"],
visibility = ["//visibility:public"],
deps = [":motion_models_proto"],
)
mediapipe_cc_proto_library(
name = "camera_motion_cc_proto",
srcs = ["camera_motion.proto"],
cc_deps = [":motion_models_cc_proto"],
visibility = ["//visibility:public"],
deps = [":camera_motion_proto"],
)
mediapipe_cc_proto_library(
name = "push_pull_filtering_cc_proto",
srcs = ["push_pull_filtering.proto"],
visibility = ["//visibility:public"],
deps = [":push_pull_filtering_proto"],
)
mediapipe_cc_proto_library(
name = "frame_selection_solution_evaluator_cc_proto",
srcs = ["frame_selection_solution_evaluator.proto"],
visibility = ["//visibility:public"],
deps = [":frame_selection_solution_evaluator_proto"],
)
mediapipe_cc_proto_library(
name = "frame_selection_cc_proto",
srcs = ["frame_selection.proto"],
cc_deps = [
":camera_motion_cc_proto",
":frame_selection_solution_evaluator_cc_proto",
":region_flow_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":frame_selection_proto"],
)
mediapipe_cc_proto_library(
name = "flow_packager_cc_proto",
srcs = ["flow_packager.proto"],
cc_deps = [
":motion_models_cc_proto",
":region_flow_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":flow_packager_proto"],
)
mediapipe_cc_proto_library(
name = "tracking_cc_proto",
srcs = ["tracking.proto"],
cc_deps = [":motion_models_cc_proto"],
visibility = ["//visibility:public"],
deps = [":tracking_proto"],
)
mediapipe_cc_proto_library(
name = "box_tracker_cc_proto",
srcs = ["box_tracker.proto"],
cc_deps = [":tracking_cc_proto"],
visibility = ["//visibility:public"],
deps = [":box_tracker_proto"],
)
cc_library(
name = "motion_models",
srcs = ["motion_models.cc"],
hdrs = ["motion_models.h"],
deps = [
":camera_motion_cc_proto",
":motion_models_cc_proto",
":region_flow_cc_proto",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:singleton",
"//mediapipe/framework/port:vector",
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/strings:str_format",
"@eigen_archive//:eigen",
],
)
cc_library(
name = "motion_models_cv",
srcs = ["motion_models_cv.cc"],
hdrs = ["motion_models_cv.h"],
deps = [
":camera_motion_cc_proto",
":motion_models",
":motion_models_cc_proto",
"//mediapipe/framework/port:opencv_core",
],
)
cc_library(
name = "region_flow",
srcs = ["region_flow.cc"],
hdrs = ["region_flow.h"],
deps = [
":measure_time",
":motion_models",
":parallel_invoker",
":region_flow_cc_proto",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:vector",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "camera_motion",
srcs = ["camera_motion.cc"],
hdrs = ["camera_motion.h"],
deps = [
":camera_motion_cc_proto",
":motion_models",
":region_flow",
":region_flow_cc_proto",
"@com_google_absl//absl/strings:str_format",
],
)
PARALLEL_COPTS = ["-DPARALLEL_INVOKER_ACTIVE"] + select({
"//mediapipe:apple": [],
"//mediapipe:android": [],
"//conditions:default": [],
})
PARALLEL_LINKOPTS = []
cc_library(
name = "measure_time",
srcs = ["measure_time.cc"],
hdrs = ["measure_time.h"],
deps = [
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/synchronization",
"@com_google_absl//absl/time",
],
)
cc_library(
name = "parallel_invoker",
srcs = ["parallel_invoker.cc"],
hdrs = ["parallel_invoker.h"],
copts = PARALLEL_COPTS,
linkopts = PARALLEL_LINKOPTS,
deps = [
":parallel_invoker_forbid_mixed_active",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:threadpool",
"@com_google_absl//absl/synchronization",
],
)
cc_library(
name = "parallel_invoker_forbid_mixed_active",
srcs = ["parallel_invoker_forbid_mixed.cc"],
copts = PARALLEL_COPTS,
alwayslink = 1, # Forces all symbols to be included.
)
cc_library(
name = "image_util",
srcs = ["image_util.cc"],
hdrs = ["image_util.h"],
deps = [
":motion_models",
":motion_models_cc_proto",
":region_flow",
":region_flow_cc_proto",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:opencv_imgproc",
"//mediapipe/framework/port:vector",
],
)
cc_library(
name = "streaming_buffer",
srcs = ["streaming_buffer.cc"],
hdrs = ["streaming_buffer.h"],
deps = [
"//mediapipe/framework/port:logging",
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:any",
],
)
cc_library(
name = "motion_estimation",
srcs = ["motion_estimation.cc"],
hdrs = ["motion_estimation.h"],
copts = PARALLEL_COPTS,
linkopts = PARALLEL_LINKOPTS,
deps = [
":camera_motion",
":camera_motion_cc_proto",
":measure_time",
":motion_estimation_cc_proto",
":motion_models",
":motion_models_cc_proto",
":parallel_invoker",
":region_flow",
":region_flow_cc_proto",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:vector",
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/strings",
"@eigen_archive//:eigen",
],
)
cc_library(
name = "motion_saliency",
srcs = ["motion_saliency.cc"],
hdrs = ["motion_saliency.h"],
deps = [
":camera_motion",
":measure_time",
":motion_saliency_cc_proto",
":region_flow",
":region_flow_cc_proto",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:vector",
],
)
cc_library(
name = "push_pull_filtering",
hdrs = ["push_pull_filtering.h"],
deps = [
":image_util",
":push_pull_filtering_cc_proto",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:opencv_highgui",
],
)
cc_library(
name = "tone_models",
srcs = ["tone_models.cc"],
hdrs = ["tone_models.h"],
deps = [
":tone_models_cc_proto",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:vector",
"@com_google_absl//absl/strings:str_format",
],
)
cc_library(
name = "tone_estimation",
srcs = ["tone_estimation.cc"],
hdrs = ["tone_estimation.h"],
deps = [
":motion_models_cc_proto",
":region_flow",
":region_flow_cc_proto",
":tone_estimation_cc_proto",
":tone_models",
":tone_models_cc_proto",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:opencv_imgproc",
"//mediapipe/framework/port:vector",
],
)
# Region flow computation; built with the parallel-invoker copts/linkopts and
# OpenCV features2d/video for feature tracking.
cc_library(
    name = "region_flow_computation",
    srcs = ["region_flow_computation.cc"],
    hdrs = ["region_flow_computation.h"],
    copts = PARALLEL_COPTS,
    linkopts = PARALLEL_LINKOPTS,
    deps = [
        ":camera_motion_cc_proto",
        ":image_util",
        ":measure_time",
        ":motion_estimation",
        ":motion_estimation_cc_proto",
        ":motion_models",
        ":motion_models_cc_proto",
        ":parallel_invoker",
        ":region_flow",
        ":region_flow_cc_proto",
        ":region_flow_computation_cc_proto",
        ":tone_estimation",
        ":tone_estimation_cc_proto",
        ":tone_models",
        ":tone_models_cc_proto",
        "//mediapipe/framework/port:integral_types",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:opencv_core",
        "//mediapipe/framework/port:opencv_features2d",
        "//mediapipe/framework/port:opencv_imgproc",
        "//mediapipe/framework/port:opencv_video",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/container:node_hash_set",
        "@com_google_absl//absl/memory",
        "@eigen_archive//:eigen",
    ],
)

# Debug visualization of region flow (draws via OpenCV imgproc).
cc_library(
    name = "region_flow_visualization",
    srcs = ["region_flow_visualization.cc"],
    hdrs = ["region_flow_visualization.h"],
    deps = [
        ":measure_time",
        ":motion_models",
        ":parallel_invoker",
        ":region_flow",
        ":region_flow_cc_proto",
        "//mediapipe/framework/port:integral_types",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:opencv_core",
        "//mediapipe/framework/port:opencv_imgproc",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/strings",
    ],
)

# Top-level motion analysis; aggregates flow computation, motion estimation,
# saliency and visualization.
cc_library(
    name = "motion_analysis",
    srcs = ["motion_analysis.cc"],
    hdrs = ["motion_analysis.h"],
    deps = [
        ":camera_motion",
        ":camera_motion_cc_proto",
        ":image_util",
        ":measure_time",
        ":motion_analysis_cc_proto",
        ":motion_estimation",
        ":motion_estimation_cc_proto",
        ":motion_saliency",
        ":motion_saliency_cc_proto",
        ":push_pull_filtering",
        ":region_flow",
        ":region_flow_cc_proto",
        ":region_flow_computation",
        ":region_flow_computation_cc_proto",
        ":region_flow_visualization",
        ":streaming_buffer",
        "//mediapipe/framework/port:integral_types",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:opencv_core",
        "//mediapipe/framework/port:opencv_imgproc",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/strings:str_format",
    ],
)
# Packages camera motion + region flow into compact TrackingData chunks.
cc_library(
    name = "flow_packager",
    srcs = ["flow_packager.cc"],
    hdrs = ["flow_packager.h"],
    deps = [
        ":camera_motion",
        ":camera_motion_cc_proto",
        ":flow_packager_cc_proto",
        ":motion_estimation",
        ":motion_estimation_cc_proto",
        ":motion_models",
        ":motion_models_cc_proto",
        ":region_flow",
        ":region_flow_cc_proto",
        "//mediapipe/framework/port:integral_types",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/strings",
    ],
)

# Core tracking algorithms; needs OpenCV calib3d and the parallel-invoker
# build flags. alwayslink keeps registered symbols even if not referenced.
cc_library(
    name = "tracking",
    srcs = ["tracking.cc"],
    hdrs = ["tracking.h"],
    copts = PARALLEL_COPTS,
    linkopts = PARALLEL_LINKOPTS,
    deps = [
        ":flow_packager_cc_proto",
        ":measure_time",
        ":motion_models",
        ":motion_models_cc_proto",
        ":motion_models_cv",
        ":parallel_invoker",
        ":region_flow",
        ":tracking_cc_proto",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:opencv_calib3d",
        "//mediapipe/framework/port:opencv_core",
        "//mediapipe/framework/port:opencv_imgproc",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/algorithm:container",
        "@com_google_absl//absl/memory",
        "@eigen_archive//:eigen",
    ],
    alwayslink = 1,
)

# Multi-threaded box tracking on cached TrackingData chunks
# (uses the framework threadpool and absl synchronization).
cc_library(
    name = "box_tracker",
    srcs = ["box_tracker.cc"],
    hdrs = ["box_tracker.h"],
    deps = [
        ":box_tracker_cc_proto",
        ":flow_packager_cc_proto",
        ":measure_time",
        ":tracking",
        ":tracking_cc_proto",
        "//mediapipe/framework/port:integral_types",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:threadpool",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/synchronization",
        "@com_google_absl//absl/time",
    ],
)

# Rendering helpers for tracked boxes / tracking state (OpenCV drawing).
cc_library(
    name = "tracking_visualization_utilities",
    srcs = ["tracking_visualization_utilities.cc"],
    hdrs = ["tracking_visualization_utilities.h"],
    deps = [
        ":box_tracker",
        ":box_tracker_cc_proto",
        ":flow_packager_cc_proto",
        ":tracking",
        ":tracking_cc_proto",
        "//mediapipe/framework/port:opencv_core",
        "//mediapipe/framework/port:opencv_imgproc",
        "@com_google_absl//absl/strings:str_format",
    ],
)
# Unit test for the parallel invoker; built with the same parallel flags
# as the library under test.
cc_test(
    name = "parallel_invoker_test",
    srcs = ["parallel_invoker_test.cc"],
    copts = PARALLEL_COPTS,
    linkopts = PARALLEL_LINKOPTS,
    deps = [
        ":parallel_invoker",
        "//mediapipe/framework/port:gtest_main",
        "@com_google_absl//absl/synchronization",
    ],
)

# Unit test for image_util.
cc_test(
    name = "image_util_test",
    srcs = [
        "image_util_test.cc",
    ],
    deps = [
        ":image_util",
        "//mediapipe/framework/port:gtest_main",
    ],
)

# Unit test for motion models; compares protos via message_matchers.
cc_test(
    name = "motion_models_test",
    srcs = ["motion_models_test.cc"],
    deps = [
        ":motion_estimation",
        ":motion_models",
        "//mediapipe/framework/deps:message_matchers",
        "//mediapipe/framework/port:gtest_main",
        "//mediapipe/framework/port:parse_text_proto",
        "//mediapipe/framework/port:vector",
    ],
)

# Region flow computation test; reads the bundled stabilize_test.png fixture.
cc_test(
    name = "region_flow_computation_test",
    srcs = ["region_flow_computation_test.cc"],
    copts = PARALLEL_COPTS,
    data = ["testdata/stabilize_test.png"],
    linkopts = PARALLEL_LINKOPTS,
    deps = [
        ":region_flow",
        ":region_flow_cc_proto",
        ":region_flow_computation",
        "//mediapipe/framework/deps:file_path",
        "//mediapipe/framework/port:commandlineflags",
        "//mediapipe/framework/port:file_helpers",
        "//mediapipe/framework/port:gtest_main",
        "//mediapipe/framework/port:logging",
        "//mediapipe/framework/port:opencv_core",
        "//mediapipe/framework/port:opencv_highgui",
        "//mediapipe/framework/port:opencv_imgproc",
        "//mediapipe/framework/port:status",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/time",
    ],
)

# Box tracker ground-truth test; reads the testdata/box_tracker chunk cache.
cc_test(
    name = "box_tracker_test",
    timeout = "short",
    srcs = ["box_tracker_test.cc"],
    data = glob(["testdata/box_tracker/*"]),
    deps = [
        ":box_tracker",
        "//mediapipe/framework/deps:file_path",
        "//mediapipe/framework/port:gtest_main",
    ],
)
# A single tracked detection (normalized rect + metadata).
cc_library(
    name = "tracked_detection",
    srcs = [
        "tracked_detection.cc",
    ],
    hdrs = [
        "tracked_detection.h",
    ],
    deps = [
        "//mediapipe/framework/formats:rect_cc_proto",
        "//mediapipe/framework/port:vector",
        "@com_google_absl//absl/container:node_hash_map",
    ],
)

# Unit test for tracked_detection.
cc_test(
    name = "tracked_detection_test",
    srcs = [
        "tracked_detection_test.cc",
    ],
    deps = [
        ":tracked_detection",
        "//mediapipe/framework/port:gtest_main",
    ],
)

# Manages the set of currently tracked detections.
cc_library(
    name = "tracked_detection_manager",
    srcs = [
        "tracked_detection_manager.cc",
    ],
    hdrs = [
        "tracked_detection_manager.h",
    ],
    deps = [
        ":tracked_detection",
        "//mediapipe/framework/formats:rect_cc_proto",
        "@com_google_absl//absl/container:node_hash_map",
    ],
)

View File

@ -0,0 +1,2 @@
jianingwei
yegenzhi

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,372 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_UTIL_TRACKING_BOX_TRACKER_H_
#define MEDIAPIPE_UTIL_TRACKING_BOX_TRACKER_H_
#include <inttypes.h>

#include <array>
#include <deque>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "mediapipe/framework/port/threadpool.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
#include "mediapipe/util/tracking/flow_packager.pb.h"
#include "mediapipe/util/tracking/tracking.h"
#include "mediapipe/util/tracking/tracking.pb.h"
namespace mediapipe {
// Describes a rectangle box at an instance of time.
struct TimedBox {
  static const int kNumQuadVertices = 4;

  // Dimensions are in normalized coordinates [0, 1].
  float top = 0;
  float left = 0;
  float bottom = 0;
  float right = 0;

  // Rotation of box w.r.t. center in radians.
  float rotation = 0;

  // Timestamp of this box in milliseconds.
  int64 time_msec = 0;

  // Confidence of the tracked box in range [0, 1].
  float confidence = 0;

  // Vertices of the tracked quad; presumably kNumQuadVertices entries when
  // set — confirm with callers.
  std::vector<Vector2_f> quad_vertices;

  // Aspect ratio (width / height) for the tracked rectangle in physical space.
  float aspect_ratio = -1.0;

  // Whether we want this box to be potentially grouped with other boxes
  // to track together. This is useful for tracking small boxes that lie
  // on a plane. For example, when we detect a plane,
  // track the plane, then all boxes within the plane can share the same
  // homography transform.
  bool request_grouping = false;

  // Orders boxes chronologically by timestamp.
  bool operator<(const TimedBox& rhs) const {
    return time_msec < rhs.time_msec;
  }

  // Returns (1.0 - alpha) * lhs + alpha * rhs;
  static TimedBox Blend(const TimedBox& lhs, const TimedBox& rhs, double alpha);

  // Returns lhs * alpha + rhs * beta;
  static TimedBox Blend(const TimedBox& lhs, const TimedBox& rhs, double alpha,
                        double beta);

  // Human-readable summary of coordinates, rotation and timestamp.
  std::string ToString() const {
    return absl::StrFormat(
        "top: %.3f left: %.3f bottom: %.3f right: %.3f "
        "rot: %.3f t: %d",
        top, left, bottom, right, rotation, static_cast<int64_t>(time_msec));
  }

  // Returns corners of TimedBox in the requested domain.
  std::array<Vector2_f, 4> Corners(float width, float height) const;

  // Conversion to/from the TimedBoxProto wire format.
  static TimedBox FromProto(const TimedBoxProto& proto);
  TimedBoxProto ToProto() const;
};
// TimedBox augmented with internal tracking state.
struct InternalTimedBox : public TimedBox {
  InternalTimedBox() = default;

  // Convenience constructor.
  InternalTimedBox(const TimedBox& box, const MotionBoxState* state_)
      : TimedBox(box), state(state_) {}

  // Corresponding MotionBoxState for this TimedBox (shared, immutable).
  std::shared_ptr<const MotionBoxState> state;
};
// Initializes beginning tracking state from a TimedBox.
void MotionBoxStateFromTimedBox(const TimedBox& box, MotionBoxState* state);
// Retrieves box position and time from a tracking state.
void TimedBoxFromMotionBoxState(const MotionBoxState& state, TimedBox* box);
// Downgrade tracking degrees of TrackStepOptions to
// TRACKING_DEGREE_OBJECT_ROTATION_SCALE if originally
// specified TRACKING_DEGREE_OBJECT_PERSPECTIVE, but start pos
// does not contain quad or contains invalid quad.
template <typename T>
void ChangeTrackingDegreesBasedOnStartPos(
    const T& start_pos, TrackStepOptions* track_step_options) {
  const bool wants_perspective =
      track_step_options->tracking_degrees() ==
      TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE;
  // A usable quad must be present and carry exactly 8 coordinate values
  // (4 vertices x 2 coordinates).
  const bool has_valid_quad =
      start_pos.has_quad() && start_pos.quad().vertices_size() == 8;
  if (wants_perspective && !has_valid_quad) {
    track_step_options->set_tracking_degrees(
        TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE);
    VLOG(1) << "Originally specified TRACKING_DEGREE_OBJECT_PERSPECTIVE, but "
               "changed to TRACKING_DEGREE_OBJECT_ROTATION_SCALE";
  }
}
// Set of boxes for a particular checkpoint.
typedef std::deque<InternalTimedBox> PathSegment;
// Stores the PathSegment for each checkpoint time.
typedef std::map<int, PathSegment> Path;
// Returns box at specified time for a specific path segment.
// Returns true on success, otherwise box is left untouched.
// Optionally returns closest known MotionBoxState if state is not null.
bool TimedBoxAtTime(const PathSegment& segment, int64 time_msec, TimedBox* box,
MotionBoxState* state = nullptr);
// Tracks timed boxes from cached TrackingDataChunks created by
// FlowPackagerCalculator. For usage see accompanying test.
//
// Thread-safety: public methods synchronize internal state via path_mutex_
// and status_mutex_ (see GUARDED_BY annotations below); tracking itself runs
// asynchronously on tracking_workers_.
class BoxTracker {
 public:
  // Initializes a new BoxTracker to work on cached TrackingData from a chunk
  // directory.
  BoxTracker(const std::string& cache_dir, const BoxTrackerOptions& options);

  // Initializes a new BoxTracker to work on the passed TrackingDataChunks.
  // If copy_data is true, BoxTracker will retain its own copy of the data;
  // otherwise the passed pointer need to be valid for the lifetime of the
  // BoxTracker.
  BoxTracker(const std::vector<const TrackingDataChunk*>& tracking_data,
             bool copy_data, const BoxTrackerOptions& options);

  // Add single TrackingDataChunk. This chunk must be correctly aligned with
  // existing chunks. If chunk starting timestamp is larger than next valid
  // chunk timestamp, empty chunks will be added to fill the gap. If copy_data
  // is true, BoxTracker will retain its own copy of the data; otherwise the
  // passed pointers need to be valid for the lifetime of the BoxTracker.
  void AddTrackingDataChunk(const TrackingDataChunk* chunk, bool copy_data);

  // Add new TrackingDataChunks. This method allows to streamline BoxTracker
  // usage. Instead of collecting all tracking chunks and then calling the
  // constructor, it makes possible to pass only minimally required chunks to
  // constructor, and then append new chunks when available.
  // Caller is responsible to guarantee that all chunks that are necessary for
  // tracking are at the disposal of BoxTracker before calling NewBoxTrack, i.e.
  // min_msec and max_msec parameters must be in the range of chunks passed to
  // BoxTracker.
  // Chunks will be added in the order they are specified in tracking_data. Each
  // chunk must be correctly aligned with existing chunks. If chunk starting
  // timestamp is larger than next valid chunk timestamp, empty chunks will be
  // added to fill the gap. If copy_data is true, BoxTracker will retain its own
  // copy of the data; otherwise the passed pointers need to be valid for the
  // lifetime of the BoxTracker.
  void AddTrackingDataChunks(
      const std::vector<const TrackingDataChunk*>& tracking_data,
      bool copy_data);

  // Starts a new track for the specified box until tracking terminates or
  // until track length achieves max_length.
  // Does not block caller, returns immediately.
  // Note: Use positive integers for id, we reserve negative ones for debugging
  // and visualization purposes.
  void NewBoxTrack(const TimedBox& initial_pos, int id, int64 min_msec = 0,
                   int64 max_msec = std::numeric_limits<int64>::max());

  // Returns interval for which the state of the specified box is known.
  // (Returns -1, -1 if id is missing or no tracking has been done).
  std::pair<int64, int64> TrackInterval(int id);

  // Returns box position for requested box at specified time.
  // Returns false if no such box exists.
  // Optional, if record_path_states is enabled in options outputs the
  // corresponding states for the resulting TimedBox. Note, as TimedBoxes are
  // generally interpolated from the underlying data and blended across
  // checkpoints, the returned states are the closest (snapped) known tracking
  // states for the left and right checkpoint path. Therefore the size of states
  // is either two or one (if only one checkpoint exists).
  bool GetTimedPosition(int id, int64 time_msec, TimedBox* result,
                        std::vector<MotionBoxState>* states = nullptr);

  // Returns chunk index for specified time.
  int ChunkIdxFromTime(int64 msec) const {
    return msec / options_.caching_chunk_size_msec();
  }

  // Returns true if any tracking is ongoing for the specified id.
  bool IsTrackingOngoingForId(int id) LOCKS_EXCLUDED(status_mutex_);

  // Returns true if any tracking is ongoing.
  bool IsTrackingOngoing() LOCKS_EXCLUDED(status_mutex_);

  // Cancels all ongoing tracks. To avoid race conditions all NewBoxTrack's in
  // flight will also be canceled. Future NewBoxTrack's will be canceled.
  // NOTE: To resume execution, you have to call ResumeTracking() before
  // issuing more NewBoxTrack calls.
  void CancelAllOngoingTracks() LOCKS_EXCLUDED(status_mutex_);
  void ResumeTracking() LOCKS_EXCLUDED(status_mutex_);

  // Waits for all ongoing tracks to complete.
  // Optionally accepts a timeout in microseconds (== 0 for infinite wait).
  // Returns true on success, false if timeout is reached.
  // NOTE: If WaitForAllOngoingTracks timed out, CancelAllOngoingTracks() must
  // be called before destructing the BoxTracker object or dangeling running
  // threads might try to access invalid data.
  bool WaitForAllOngoingTracks(int timeout_us = 0)
      LOCKS_EXCLUDED(status_mutex_);

  // Debug function to obtain raw TrackingData closest to the specified
  // timestamp. This call will read from disk on every invocation so it is
  // expensive.
  // To not interfere with other tracking requests it is recommended that you
  // use a unique id here.
  // Returns true on success.
  bool GetTrackingData(int id, int64 request_time_msec,
                       TrackingData* tracking_data,
                       int* tracking_data_msec = nullptr);

 private:
  // Asynchronous implementation function for box tracking. Schedules forward
  // and backward tracking.
  void NewBoxTrackAsync(const TimedBox& initial_pos, int id, int64 min_msec,
                        int64 max_msec);

  // Chunk pointer paired with an ownership flag (true == caller owns).
  typedef std::pair<const TrackingDataChunk*, bool> AugmentedChunkPtr;

  // Attempts to read chunk at chunk_idx if it exists. Reads from cache
  // directory or from in memory cache.
  // Important: 2nd part of return value indicates if returned tracking data
  // will be owned by the caller (if true). In that case caller is responsible
  // for releasing the returned chunk.
  AugmentedChunkPtr ReadChunk(int id, int checkpoint, int chunk_idx);

  // Attempts to read specified chunk from caching directory. Blocks and waits
  // until chunk is available or internal time out is reached.
  // Returns nullptr if data could not be read.
  std::unique_ptr<TrackingDataChunk> ReadChunkFromCache(int id, int checkpoint,
                                                        int chunk_idx);

  // Waits with timeout for chunkfile to become available. Returns true on
  // success, false if waited till timeout or when canceled.
  bool WaitForChunkFile(int id, int checkpoint, const std::string& chunk_file)
      LOCKS_EXCLUDED(status_mutex_);

  // Determines closest index in passed TrackingDataChunk
  int ClosestFrameIndex(int64 msec, const TrackingDataChunk& chunk) const;

  // Adds new TimedBox to specified checkpoint with state.
  void AddBoxResult(const TimedBox& box, int id, int checkpoint,
                    const MotionBoxState& state);

  // Callback can only handle 5 args max.
  // Set own_data to true for args to assume ownership.
  struct TrackingImplArgs {
    TrackingImplArgs(AugmentedChunkPtr chunk_ptr,
                     const MotionBoxState& start_state_, int start_frame_,
                     int chunk_idx_, int id_, int checkpoint_, bool forward_,
                     bool first_call_, int64 min_msec_, int64 max_msec_)
        : start_state(start_state_),
          start_frame(start_frame_),
          chunk_idx(chunk_idx_),
          id(id_),
          checkpoint(checkpoint_),
          forward(forward_),
          first_call(first_call_),
          min_msec(min_msec_),
          max_msec(max_msec_) {
      // Take ownership only when the chunk was flagged as caller-owned.
      if (chunk_ptr.second) {
        chunk_data_buffer.reset(chunk_ptr.first);
      }
      chunk_data = chunk_ptr.first;
    }

    TrackingImplArgs(const TrackingImplArgs&) = default;

    // Storage for tracking data.
    std::shared_ptr<const TrackingDataChunk> chunk_data_buffer;

    // Pointer to the actual tracking data. Usually points to the buffer
    // but can also point to external data for performance reasons.
    const TrackingDataChunk* chunk_data;
    MotionBoxState start_state;
    int start_frame;
    int chunk_idx;
    int id;
    int checkpoint;
    bool forward = true;
    bool first_call = true;
    int64 min_msec;  // minimum timestamp to track to
    int64 max_msec;  // maximum timestamp to track to
  };

  // Actual tracking algorithm.
  void TrackingImpl(const TrackingImplArgs& args);

  // Ids are scheduled exclusively, run this method to acquire lock.
  // Returns false if id could not be scheduled (e.g. id got canceled during
  // waiting).
  bool WaitToScheduleId(int id) LOCKS_EXCLUDED(status_mutex_);

  // Signals end of scheduling phase. Requires status mutex to be held.
  void DoneSchedulingId(int id) EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);

  // Removes all checkpoints within vicinity of new checkpoint.
  void RemoveCloseCheckpoints(int id, int checkpoint)
      EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);

  // Removes specific checkpoint.
  void ClearCheckpoint(int id, int checkpoint)
      EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);

  // Terminates tracking for specific id and checkpoint.
  void CancelTracking(int id, int checkpoint)
      EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);

  // Implementation function for IsTrackingOngoing assuming mutex is already
  // held.
  bool IsTrackingOngoingMutexHeld() EXCLUSIVE_LOCKS_REQUIRED(status_mutex_);

  // Captures tracking status for each checkpoint
  struct TrackStatus {
    // Indicates that all current tracking processes should be canceled.
    bool canceled = false;

    // Number of tracking requests than are currently ongoing.
    int tracks_ongoing = 0;
  };

 private:
  // Stores computed tracking paths_ for all boxes.
  std::unordered_map<int, Path> paths_ GUARDED_BY(path_mutex_);
  absl::Mutex path_mutex_;

  // For each id and each checkpoint stores current tracking status.
  std::unordered_map<int, std::map<int, TrackStatus>> track_status_
      GUARDED_BY(status_mutex_);

  // Keeps track which ids are currently processing in NewBoxTrack.
  std::unordered_map<int, bool> new_box_track_ GUARDED_BY(status_mutex_);
  absl::Mutex status_mutex_;
  bool canceling_ GUARDED_BY(status_mutex_) = false;

  // Use to signal changes to status_condvar_;
  absl::CondVar status_condvar_ GUARDED_BY(status_mutex_);

  BoxTrackerOptions options_;

  // Caching directory for TrackingData stored on disk.
  std::string cache_dir_;

  // Pointers to tracking data stored in memory.
  std::vector<const TrackingDataChunk*> tracking_data_;

  // Buffer for tracking data in case we retain a deep copy.
  std::vector<std::unique_ptr<TrackingDataChunk>> tracking_data_buffer_;

  // Workers that run the tracking algorithm.
  std::unique_ptr<ThreadPool> tracking_workers_;
};
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_BOX_TRACKER_H_

View File

@ -0,0 +1,87 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/tracking.proto";
option java_package = "com.google.mediapipe.tracking";
option java_outer_classname = "BoxTrackerProto";
// Configuration for BoxTracker (see box_tracker.h).
message BoxTrackerOptions {
  // Chunk size for caching files. Should be equal to those
  // written by the FlowPackagerCalculator.
  optional int32 caching_chunk_size_msec = 1 [default = 2500];

  // Chunk file format.
  optional string cache_file_format = 2 [default = "chunk_%04d"];

  // Number of simultaneous tracking requests.
  optional int32 num_tracking_workers = 3 [default = 8];

  // Maximum waiting time for next chunk, till function times out.
  optional int32 read_chunk_timeout_msec = 4 [default = 60000];

  // If set, box tracker will record the state for each computed TimedBox
  // across all paths.
  optional bool record_path_states = 5 [default = false];

  // Actual tracking options to be used for every step.
  optional TrackStepOptions track_step_options = 6;
}
// Next tag: 13
// Proto equivalent of struct TimedBox.
message TimedBoxProto {
  // Normalized coords - in [0, 1]
  optional float top = 1;
  optional float left = 2;
  optional float bottom = 3;
  optional float right = 4;

  // Rotation of box w.r.t. center in radians.
  optional float rotation = 7;

  // Optional quad representation of the box region.
  optional MotionBoxState.Quad quad = 9;

  // Timestamp of the box in milliseconds.
  optional int64 time_msec = 5 [default = 0];

  // Unique per object id to disambiguate boxes.
  optional int32 id = 6 [default = -1];

  // Confidence of box tracked in the range [0, 1], with 0 being least
  // confident, and 1 being most confident. A reasonable threshold is 0.5
  // to filter out unconfident boxes.
  optional float confidence = 8;

  // Aspect ratio (width / height) for the tracked rectangle in physical space.
  // If this field is provided, quad tracking will be performed using
  // 6 degrees of freedom perspective transform between physical rectangle and
  // frame quad. Otherwise, 8 degrees of freedom homography tracking between
  // adjacent frames will be used.
  optional float aspect_ratio = 10;

  // Whether or not to enable reacquisition functionality for this specific box.
  optional bool reacquisition = 11 [default = false];

  // Whether we want this box to be potentially grouped with other boxes
  // to track together. This is useful for tracking small boxes that lie
  // on a plane. For example, when we detect a plane,
  // track the plane, then all boxes within the plane can share the same
  // homography transform.
  optional bool request_grouping = 12 [default = false];
}
// A list of timed boxes, e.g. all boxes tracked for one frame or request.
message TimedBoxProtoList {
  repeated TimedBoxProto box = 1;
}

View File

@ -0,0 +1,117 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/box_tracker.h"
#include "mediapipe/framework/deps/file_path.h"
#include "mediapipe/framework/port/gtest.h"
namespace mediapipe {
namespace {
constexpr double kWidth = 1280.0;
constexpr double kHeight = 720.0;
// Ground truth test; testing tracking accuracy and multi-thread load testing.
TEST(BoxTrackerTest, MovingBoxTest) {
  const std::string cache_dir =
      file::JoinPath("./", "/mediapipe/util/tracking/testdata/box_tracker");
  BoxTracker box_tracker(cache_dir, BoxTrackerOptions());

  // Ground truth positions of the overlay (linear in between).
  // @ 0: (50, 100)
  // @ 3000: (50, 400)
  // @ 6000: (500, 400)
  // @ 9000: (1000, 50)
  // @ 12000: (50, 100)
  // @ 15000: (1000, 400)
  //
  // size of overlay: 220 x 252
  std::vector<Vector2_d> positions{
      {50.0 / kWidth, 100.0 / kHeight},   {50.0 / kWidth, 400.0 / kHeight},
      {500.0 / kWidth, 400.0 / kHeight},  {1000.0 / kWidth, 50.0 / kHeight},
      {50.0 / kWidth, 100.0 / kHeight},   {1000.0 / kWidth, 400.0 / kHeight},
  };
  const Vector2_d overlay_sz(220.0 / kWidth, 252.0 / kHeight);

  // Start tracking from the second ground-truth keyframe (t = 3000 ms).
  TimedBox initial_pos;
  initial_pos.left = positions[1].x();
  initial_pos.top = positions[1].y();
  initial_pos.right = initial_pos.left + overlay_sz.x();
  initial_pos.bottom = initial_pos.top + overlay_sz.y();
  initial_pos.time_msec = 3000;

  // Test multithreading under load, ensure this does not crash or stall.
  box_tracker.NewBoxTrack(initial_pos, 0);

  // Cancel right after issuing.
  box_tracker.CancelAllOngoingTracks();

  // Should not be scheduled.
  box_tracker.NewBoxTrack(initial_pos, 0);
  EXPECT_FALSE(box_tracker.IsTrackingOngoing());
  box_tracker.ResumeTracking();
  box_tracker.NewBoxTrack(initial_pos, 0);

  // Two cancelations in a row should not block.
  box_tracker.CancelAllOngoingTracks();
  box_tracker.CancelAllOngoingTracks();
  box_tracker.ResumeTracking();

  // Start again for real this time.
  box_tracker.NewBoxTrack(initial_pos, 0);

  // Wait to terminate.
  box_tracker.WaitForAllOngoingTracks();

  // Check that tracking did not abort.
  EXPECT_EQ(0, box_tracker.TrackInterval(0).first);
  EXPECT_GT(box_tracker.TrackInterval(0).second, 15000);

  // Boxes match if timestamps agree exactly and every edge is within
  // kAccuracy in normalized coordinates.
  auto boxes_equal = [](const TimedBox& lhs, const TimedBox& rhs) {
    constexpr float kAccuracy = 0.015f;
    return (lhs.time_msec == rhs.time_msec &&
            std::abs(lhs.top - rhs.top) < kAccuracy &&
            std::abs(lhs.left - rhs.left) < kAccuracy &&
            std::abs(lhs.right - rhs.right) < kAccuracy &&
            std::abs(lhs.bottom - rhs.bottom) < kAccuracy);
  };

  // Sample the track roughly once per frame (33 ms) and compare against the
  // piecewise-linear ground truth.
  for (int k = 0; k < 15000; k += 33) {
    TimedBox box;
    EXPECT_TRUE(box_tracker.GetTimedPosition(0, k, &box));

    // One groundtruth position every 3s, linear in between.
    const int rect_pos = k / 3000;
    const int dt = k - rect_pos * 3000;
    const int next_rect_pos = dt == 0 ? rect_pos : (rect_pos + 1);

    // Blend to get initial position.
    const double alpha = dt / 3000.0;
    const Vector2_d gt_pos =
        (1.0 - alpha) * positions[rect_pos] + alpha * positions[next_rect_pos];

    TimedBox gt_box;
    gt_box.time_msec = k;
    gt_box.top = gt_pos.y();
    gt_box.left = gt_pos.x();
    gt_box.right = gt_box.left + overlay_sz.x();
    gt_box.bottom = gt_box.top + overlay_sz.y();
    EXPECT_TRUE(boxes_equal(gt_box, box));
  }
}
} // namespace
} // namespace mediapipe

View File

@ -0,0 +1,408 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/camera_motion.h"
#include <numeric>
#include "absl/strings/str_format.h"
#include "mediapipe/util/tracking/region_flow.h"
namespace mediapipe {
// Extracts the translation component of camera_motion into model.
// If no translation is present, model is reset to its cleared (identity)
// state.
void CameraMotionToTranslation(const CameraMotion& camera_motion,
                               TranslationModel* model) {
  if (!camera_motion.has_translation()) {
    model->Clear();
    return;
  }
  *model = camera_motion.translation();
}
void CameraMotionToLinearSimilarity(const CameraMotion& camera_motion,
LinearSimilarityModel* model) {
if (camera_motion.has_linear_similarity()) {
model->CopyFrom(camera_motion.linear_similarity());
} else {
TranslationModel translation;
CameraMotionToTranslation(camera_motion, &translation);
model->CopyFrom(LinearSimilarityAdapter::Embed(translation));
}
}
void CameraMotionToAffine(const CameraMotion& camera_motion,
AffineModel* model) {
if (camera_motion.has_affine()) {
model->CopyFrom(camera_motion.affine());
} else {
LinearSimilarityModel similarity;
CameraMotionToLinearSimilarity(camera_motion, &similarity);
model->CopyFrom(AffineAdapter::Embed(similarity));
}
}
void CameraMotionToHomography(const CameraMotion& camera_motion,
Homography* model) {
if (camera_motion.has_homography()) {
model->CopyFrom(camera_motion.homography());
} else {
AffineModel affine;
CameraMotionToAffine(camera_motion, &affine);
model->CopyFrom(HomographyAdapter::Embed(affine));
}
}
void CameraMotionToMixtureHomography(const CameraMotion& camera_motion,
MixtureHomography* model) {
if (camera_motion.has_mixture_homography()) {
model->CopyFrom(camera_motion.mixture_homography());
} else {
Homography homography;
CameraMotionToHomography(camera_motion, &homography);
model->CopyFrom(MixtureHomographyAdapter::Embed(homography, 1));
}
}
// Composes two CameraMotions: each present motion model is composed via
// ModelCompose2(lhs, rhs), and quality/stability metrics are merged
// conservatively (max of errors/instability, min of inlier ratios).
// Both motions must refer to the same frame dimensions.
CameraMotion ComposeCameraMotion(const CameraMotion& lhs,
                                 const CameraMotion& rhs) {
  CHECK_EQ(lhs.frame_width(), rhs.frame_width());
  CHECK_EQ(lhs.frame_height(), rhs.frame_height());

  // Start from rhs so fields not explicitly merged below carry over from it.
  CameraMotion result = rhs;
  if (lhs.has_translation() || rhs.has_translation()) {
    *result.mutable_translation() =
        ModelCompose2(lhs.translation(), rhs.translation());
  }

  if (lhs.has_similarity() || rhs.has_similarity()) {
    *result.mutable_similarity() =
        ModelCompose2(lhs.similarity(), rhs.similarity());
  }

  if (lhs.has_linear_similarity() || rhs.has_linear_similarity()) {
    *result.mutable_linear_similarity() =
        ModelCompose2(lhs.linear_similarity(), rhs.linear_similarity());
  }

  if (lhs.has_affine() || rhs.has_affine()) {
    *result.mutable_affine() = ModelCompose2(lhs.affine(), rhs.affine());
  }

  if (lhs.has_homography() || rhs.has_homography()) {
    *result.mutable_homography() =
        ModelCompose2(lhs.homography(), rhs.homography());
  }

  if (rhs.has_mixture_homography()) {
    if (lhs.has_mixture_homography()) {
      LOG(ERROR) << "Mixture homographies are not closed under composition, "
                 << "Only rhs mixtures composed with lhs homographies "
                 << "are supported.";
    } else if (lhs.type() <= CameraMotion::UNSTABLE_SIM) {
      // We only composit base model when stability is sufficient.
      *result.mutable_mixture_homography() =
          MixtureHomographyAdapter::ComposeLeft(rhs.mixture_homography(),
                                                lhs.homography());
    }
  } else if (lhs.has_mixture_homography()) {
    LOG(ERROR) << "Only rhs mixtures supported.";
  }

  // Select max unstable type.
  result.set_type(std::max(lhs.type(), rhs.type()));
  result.set_average_magnitude(lhs.average_magnitude() +
                               rhs.average_magnitude());
  result.set_translation_variance(
      std::max(lhs.translation_variance(), rhs.translation_variance()));
  result.set_similarity_inlier_ratio(
      std::min(lhs.similarity_inlier_ratio(), rhs.similarity_inlier_ratio()));
  result.set_similarity_strict_inlier_ratio(
      std::min(lhs.similarity_strict_inlier_ratio(),
               rhs.similarity_strict_inlier_ratio()));
  result.set_average_homography_error(
      std::max(lhs.average_homography_error(), rhs.average_homography_error()));
  result.set_homography_inlier_coverage(std::min(
      lhs.homography_inlier_coverage(), rhs.homography_inlier_coverage()));
  result.set_homography_strict_inlier_coverage(
      std::min(lhs.homography_strict_inlier_coverage(),
               rhs.homography_strict_inlier_coverage()));

  // TODO: Overlay stuff.
  result.set_flags(lhs.flags() | rhs.flags());
  result.set_timestamp_usec(
      std::max(lhs.timestamp_usec(), rhs.timestamp_usec()));
  result.set_match_frame(lhs.match_frame() + rhs.match_frame());

  // TODO: Rest.
  return result;
}
// Returns a copy of |motion| in which every parametric model that is present
// has been replaced by its inverse. Mixture homographies are not closed under
// inversion; they are copied unchanged and an error is logged.
CameraMotion InvertCameraMotion(const CameraMotion& motion) {
  CameraMotion result = motion;
  if (motion.has_translation()) {
    *result.mutable_translation() = ModelInvert(motion.translation());
  }
  if (motion.has_similarity()) {
    *result.mutable_similarity() = ModelInvert(motion.similarity());
  }
  if (motion.has_linear_similarity()) {
    *result.mutable_linear_similarity() =
        ModelInvert(motion.linear_similarity());
  }
  if (motion.has_affine()) {
    *result.mutable_affine() = ModelInvert(motion.affine());
  }
  if (motion.has_homography()) {
    *result.mutable_homography() = ModelInvert(motion.homography());
  }
  if (motion.has_mixture_homography()) {
    LOG(ERROR) << "Mixture homographies are not closed under inversion.";
  }
  return result;
}
// Subtracts the estimated background (camera) motion from each feature's flow
// vector in place, leaving only residual foreground motion.
// If the first camera motion carries a mixture homography, mixtures are used
// for all frames, with row weights derived from the first feature list's
// frame height; otherwise per-frame homographies are used.
// camera_motions may be larger than feature_lists; extra motions are ignored.
void SubtractCameraMotionFromFeatures(
    const std::vector<CameraMotion>& camera_motions,
    std::vector<RegionFlowFeatureList*>* feature_lists) {
  CHECK(feature_lists != nullptr);
  CHECK_GE(camera_motions.size(), feature_lists->size());
  if (feature_lists->empty()) {
    return;
  }
  // Mixture usage is decided from the first frame only, and a single set of
  // row weights is shared across all frames below.
  // NOTE(review): this assumes all frames share the first frame's height --
  // confirm with callers.
  bool use_mixtures = camera_motions[0].has_mixture_homography();
  std::unique_ptr<MixtureRowWeights> row_weights;
  if (use_mixtures) {
    row_weights.reset(MixtureRowWeightsFromCameraMotion(
        camera_motions[0], (*feature_lists)[0]->frame_height()));
  }
  for (int k = 0; k < feature_lists->size(); ++k) {
    Homography background_model;
    MixtureHomography background_model_mixture;
    if (use_mixtures) {
      CameraMotionToMixtureHomography(camera_motions[k],
                                      &background_model_mixture);
    } else {
      CameraMotionToHomography(camera_motions[k], &background_model);
    }
    // Remove motion due to camera motion, leaving only foreground motion.
    for (auto& feature : *(*feature_lists)[k]->mutable_feature()) {
      // Background motion = displacement of the feature's location under the
      // background model.
      const Vector2_f background_motion =
          (!use_mixtures ? HomographyAdapter::TransformPoint(
                               background_model, FeatureLocation(feature))
                         : MixtureHomographyAdapter::TransformPoint(
                               background_model_mixture, *row_weights,
                               FeatureLocation(feature))) -
          FeatureLocation(feature);
      const Vector2_f object_motion = FeatureFlow(feature) - background_motion;
      feature.set_dx(object_motion.x());
      feature.set_dy(object_motion.y());
    }
  }
}
float ForegroundMotion(const CameraMotion& camera_motion,
const RegionFlowFeatureList& feature_list) {
if (camera_motion.has_mixture_homography()) {
LOG(WARNING) << "Mixture homographies are present but function is only "
<< "using homographies. Truncation error likely.";
}
Homography background_motion;
CameraMotionToHomography(camera_motion, &background_motion);
float foreground_motion = 0;
for (auto& feature : feature_list.feature()) {
const float error = (FeatureMatchLocation(feature) -
HomographyAdapter::TransformPoint(
background_motion, FeatureLocation(feature)))
.Norm();
foreground_motion += error;
}
if (feature_list.feature_size() > 0) {
foreground_motion *= 1.0f / feature_list.feature_size();
}
return foreground_motion;
}
// Seeds |camera_motion| with the per-frame metadata carried by
// |feature_list|. No motion models are set here.
void InitCameraMotionFromFeatureList(const RegionFlowFeatureList& feature_list,
                                     CameraMotion* camera_motion) {
  // Frame geometry.
  camera_motion->set_frame_width(feature_list.frame_width());
  camera_motion->set_frame_height(feature_list.frame_height());
  // Temporal information.
  camera_motion->set_timestamp_usec(feature_list.timestamp_usec());
  camera_motion->set_match_frame(feature_list.match_frame());
  // Feature quality measures.
  camera_motion->set_blur_score(feature_list.blur_score());
  camera_motion->set_frac_long_features_rejected(
      feature_list.frac_long_features_rejected());
}
// Renders the bit flags of |camera_motion| as a '|'-terminated concatenation
// of flag names, in a fixed order. BLURRY_FRAME additionally reports the
// bluriness value. Flags without a string form here (e.g. CENTER_FRAME) are
// not rendered.
std::string CameraMotionFlagToString(const CameraMotion& camera_motion) {
  const int flags = camera_motion.flags();
  std::string text;
  if (flags & CameraMotion::FLAG_SHOT_BOUNDARY) {
    text += "SHOT_BOUNDARY|";
  }
  // Blurriness carries a value, so it is formatted separately.
  if (flags & CameraMotion::FLAG_BLURRY_FRAME) {
    text += absl::StrFormat("BLURRY_FRAME %.2f|", camera_motion.bluriness());
  }
  // Remaining value-less flags, in the original append order.
  static const struct {
    int flag;
    const char* name;
  } kFlagNames[] = {
      {CameraMotion::FLAG_MAJOR_OVERLAY, "MAJOR_OVERLAY|"},
      {CameraMotion::FLAG_SHARP_FRAME, "SHARP_FRAME|"},
      {CameraMotion::FLAG_SHOT_FADE, "SHOT_FADE|"},
      {CameraMotion::FLAG_DUPLICATED, "DUPLICATED|"},
  };
  for (const auto& entry : kFlagNames) {
    if (flags & entry.flag) {
      text += entry.name;
    }
  }
  return text;
}
std::string CameraMotionTypeToString(const CameraMotion& motion) {
switch (motion.type()) {
case CameraMotion::VALID:
return "VALID";
case CameraMotion::UNSTABLE_HOMOG:
return "UNSTABLE_HOMOG";
case CameraMotion::UNSTABLE_SIM:
return "UNSTABLE_SIM";
case CameraMotion::UNSTABLE:
return "UNSTABLE";
case CameraMotion::INVALID:
return "INVALID";
}
return "NEVER HAPPENS WITH CLANG";
}
// Returns inlier coverage in [0, 1]: the mean of the per-block mixture
// coverages when present (and not overridden by use_homography_coverage),
// otherwise the homography coverage. Returns 1.0f when no homography is
// available.
// NOTE(review): the header comment states 0 is returned when neither model is
// present, but this implementation returns 1.0f -- confirm intended fallback.
float InlierCoverage(const CameraMotion& camera_motion,
                     bool use_homography_coverage) {
  const int num_block_coverages = camera_motion.mixture_inlier_coverage_size();
  const bool use_mixture =
      num_block_coverages > 0 && !use_homography_coverage;
  if (use_mixture) {
    // Average the per-block coverages.
    const float sum =
        std::accumulate(camera_motion.mixture_inlier_coverage().begin(),
                        camera_motion.mixture_inlier_coverage().end(), 0.0f);
    return sum * (1.0f / num_block_coverages);
  }
  return camera_motion.has_homography()
             ? camera_motion.homography_inlier_coverage()
             : 1.0f;
}
// Computes a replacement motion for the first frame such that it equals the
// accumulated inverse of the motions over frames [1, N), enabling a perfect
// loop. A model type is only replaced if (a) the first motion has it, and
// (b) every subsequent frame estimated it stably; otherwise the first
// frame's original model is retained.
template <class CameraMotionContainer>
CameraMotion FirstCameraMotionForLooping(
    const CameraMotionContainer& camera_motions) {
  if (camera_motions.size() < 2) {
    LOG(ERROR) << "Not enough camera motions for refinement.";
    return CameraMotion();
  }
  CameraMotion loop_motion = camera_motions[0];
  // Only update motions present in first camera motion.
  bool has_translation = loop_motion.has_translation();
  bool has_similarity = loop_motion.has_linear_similarity();
  bool has_homography = loop_motion.has_homography();
  // Accumulators for the inverse motions; start at identity.
  TranslationModel translation;
  LinearSimilarityModel similarity;
  Homography homography;
  for (int i = 1; i < camera_motions.size(); ++i) {
    const CameraMotion& motion = camera_motions[i];
    if (motion.has_mixture_homography()) {
      // TODO: Implement
      LOG(WARNING) << "This function does not validly apply mixtures; "
                   << "which are currently not closed under composition. ";
    }
    // Demote availability based on this frame's stability: higher
    // degree-of-freedom models require every frame in the chain to be stable.
    switch (motion.type()) {
      case CameraMotion::INVALID:
        has_translation = false;
        has_similarity = false;
        has_homography = false;
        break;
      case CameraMotion::UNSTABLE:
        has_similarity = false;
        has_homography = false;
        break;
      case CameraMotion::UNSTABLE_SIM:
        has_homography = false;
        break;
      case CameraMotion::VALID:
      case CameraMotion::UNSTABLE_HOMOG:
        break;
      default:
        LOG(FATAL) << "Unknown CameraMotion::type.";
    }
    // Only accumulate motions which are valid for the entire chain, otherwise
    // keep the pre-initialized motions.
    // Note: relies on && short-circuiting so AccumulateInvertedModel is
    // skipped once a model type has been invalidated.
    has_translation =
        has_translation && motion.has_translation() &&
        AccumulateInvertedModel(motion.translation(), &translation);
    has_similarity =
        has_similarity && motion.has_linear_similarity() &&
        AccumulateInvertedModel(motion.linear_similarity(), &similarity);
    has_homography = has_homography && motion.has_homography() &&
                     AccumulateInvertedModel(motion.homography(), &homography);
  }
  if (has_translation) {
    *loop_motion.mutable_translation() = translation;
  }
  // Similarity and homography are only replaced if their accumulated result
  // is stably invertible.
  if (has_similarity && IsInverseStable(similarity)) {
    *loop_motion.mutable_linear_similarity() = similarity;
  }
  if (has_homography && IsInverseStable(homography)) {
    *loop_motion.mutable_homography() = homography;
  }
  VLOG(1) << "Looping camera motion refinement for "
          << " translation:" << (has_translation ? "successful" : "failed")
          << " similarity:" << (has_similarity ? "successful" : "failed")
          << " homography:" << (has_homography ? "successful" : "failed");
  return loop_motion;
}
// Explicit instantiations for the container types used by callers.
template CameraMotion FirstCameraMotionForLooping<std::vector<CameraMotion>>(
    const std::vector<CameraMotion>&);
template CameraMotion FirstCameraMotionForLooping<std::deque<CameraMotion>>(
    const std::deque<CameraMotion>&);
} // namespace mediapipe

View File

@ -0,0 +1,336 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_UTIL_TRACKING_CAMERA_MOTION_H_
#define MEDIAPIPE_UTIL_TRACKING_CAMERA_MOTION_H_
#include <vector>
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
// Helper functions to extract specific models from CameraMotion.
// Returned is always the requested model. In case, model is not present (i.e.
// has_<motion model> fails), the highest degree of freedom model
// (lower or equal to the requested model) that is present is embedded in the
// requested model.
// Presence of the model depends on which models were requested to be
// estimated (via MotionEstimationOptions, to initialize requested models to
// identity, use ResetMotionModels above). For example, assume linear similarity
// was not requested to be estimated, but affine was requested. If
// CameraMotionToLinearSimilarity is called, has_linear_similarity would be
// false and the function fall back returning a translation model.
void CameraMotionToTranslation(const CameraMotion& camera_motion,
TranslationModel* model);
void CameraMotionToLinearSimilarity(const CameraMotion& camera_motion,
LinearSimilarityModel* model);
void CameraMotionToAffine(const CameraMotion& camera_motion,
AffineModel* model);
void CameraMotionToHomography(const CameraMotion& camera_motion,
Homography* homography);
void CameraMotionToMixtureHomography(const CameraMotion& camera_motion,
MixtureHomography* mixture);
// TODO: Under development ...
// Returns camera motion lhs * rhs. Initial camera motion is set to rhs
// before composition.
CameraMotion ComposeCameraMotion(const CameraMotion& lhs,
const CameraMotion& rhs);
// Inverts every motion model that is set in CameraMotion.
CameraMotion InvertCameraMotion(const CameraMotion& motion);
// Templated wrapper for above calls.
template <class Model>
Model CameraMotionToModel(const CameraMotion& camera_motion);
// Returns model from passed CameraMotion specified by unstable_type
// (which must name a type != VALID, CHECK-ed) and embeds it in the specified
// Model.
template <class Model>
Model UnstableCameraMotionToModel(const CameraMotion& camera_motion,
CameraMotion::Type unstable_type);
// Projects passed model to lower degree of freedom model (embedded in original
// type), as specified type. In case type is valid, function is effectively
// identity function.
// Only implemented for the following models:
// - Translation
// - LinearSimilarity
// - AffineModel
template <class Model>
Model ProjectToTypeModel(const Model& model, float frame_width,
float frame_height, CameraMotion::Type type);
// Subtract camera motion (specifically the highest degree of freedom model
// that has been estimated reliably) from feature lists. Operates on vectors
// for improved performance. Size of camera_motions can be larger than
// feature_lists, in this case last camera motions are ignored.
void SubtractCameraMotionFromFeatures(
const std::vector<CameraMotion>& camera_motions,
std::vector<RegionFlowFeatureList*>* feature_lists);
// Returns average motion magnitude after subtracting camera motion.
float ForegroundMotion(const CameraMotion& camera_motion,
const RegionFlowFeatureList& feature_list);
// Initializes a CameraMotion with its corresponding fields from a
// RegionFlowFeatureList.
void InitCameraMotionFromFeatureList(const RegionFlowFeatureList& feature_list,
CameraMotion* camera_motion);
// Converts Camera motion flag to std::string.
std::string CameraMotionFlagToString(const CameraMotion& motion);
// Converts Camera motion type to std::string. Used instead of builtin proto
// function for mobile support.
std::string CameraMotionTypeToString(const CameraMotion& motion);
// Returns inlier coverage either based on mixture (if present, in this case
// return mean of block coverages) or else homography.
// If neither is present, returns 0 to signal insufficient inliers.
// If use_homography_coverage is set, uses homography even when mixture is
// present.
float InlierCoverage(const CameraMotion& camera_motion,
bool use_homography_coverage);
// Downsamples passed motion models temporally by specified downsample_scale,
// i.e. for models F_0, F_1, F_2, F_3, F_4 and downsample_scale of 2, models:
// F_0 * F_1, F_2 * F_3 and F_4 are returned.
// Optionally also performs downsampling of corresponding model_type returning
// the least unstable for each composition.
template <class Model>
void DownsampleMotionModels(
const std::vector<Model>& models,
const std::vector<CameraMotion::Type>* model_type, // optional.
int downsample_scale, std::vector<Model>* downsampled_models,
std::vector<CameraMotion::Type>* downsampled_types);
// Compatible subsampling method to above DownsampleMotionModels.
// Note, when downsampling for example:
// F_0, F_1, F_2, F_3, F_4 by factor 3 via above function, downsampled result
// will be F_0 * F_1 * F_2, F_3 * F_4
// so we would need to pick entities at F_2 and F_4.
// Template class Container must be SequenceContainer, like
// std::vector, std::deque.
template <class Container>
void SubsampleEntities(const Container& input, int downsample_scale,
Container* output);
// For perfect looping, this function computes the motion in the first frame
// to be the inverse of the accumulated motion from frame 1 to N.
// If a particular motion type is not available or not invertible at any
// frame pair, the original motion for that type is retained.
// Does not work if mixtures are present.
template <class CameraMotionContainer> // STL container of CameraMotion's
CameraMotion FirstCameraMotionForLooping(
const CameraMotionContainer& container);
// Template implementation functions.
// Extracts the highest reliable lower degree-of-freedom model implied by
// |unstable_type| and embeds it into the requested Model type.
// |unstable_type| must not be VALID (LOG(FATAL) otherwise).
template <class Model>
Model UnstableCameraMotionToModel(const CameraMotion& camera_motion,
                                  CameraMotion::Type unstable_type) {
  switch (unstable_type) {
    case CameraMotion::INVALID:
      return Model();  // Identity.
    case CameraMotion::UNSTABLE: {
      // Similarity unreliable -> fall back to translation.
      return ModelAdapter<Model>::Embed(
          CameraMotionToModel<TranslationModel>(camera_motion));
    }
    case CameraMotion::UNSTABLE_SIM: {
      // Homography unreliable -> fall back to linear similarity.
      return ModelAdapter<Model>::Embed(
          CameraMotionToModel<LinearSimilarityModel>(camera_motion));
    }
    case CameraMotion::UNSTABLE_HOMOG: {
      // Mixture unreliable -> fall back to homography.
      return ModelAdapter<Model>::Embed(
          CameraMotionToModel<Homography>(camera_motion));
    }
    case CameraMotion::VALID:
      LOG(FATAL) << "Specify a type != VALID";
      return Model();
  }
}
// Translation specialization: translation is already the lowest
// degree-of-freedom model, so only an INVALID motion degrades further
// (to identity).
template <>
inline TranslationModel ProjectToTypeModel(const TranslationModel& model,
                                           float frame_width,
                                           float frame_height,
                                           CameraMotion::Type type) {
  if (type == CameraMotion::INVALID) {
    return TranslationModel();  // Identity.
  }
  return model;
}
// LinearSimilarity specialization: degrades to identity for INVALID and to
// a translation (re-embedded as a similarity) for UNSTABLE; otherwise the
// model is returned unchanged.
template <>
inline LinearSimilarityModel ProjectToTypeModel(
    const LinearSimilarityModel& model, float frame_width, float frame_height,
    CameraMotion::Type type) {
  if (type == CameraMotion::INVALID) {
    return LinearSimilarityModel();  // Identity.
  }
  if (type == CameraMotion::UNSTABLE) {
    return LinearSimilarityAdapter::Embed(
        TranslationAdapter::ProjectFrom(model, frame_width, frame_height));
  }
  return model;
}
// Generic version: projects |model| down to the degree of freedom implied by
// |type| and embeds the result back into Model.
template <class Model>
Model ProjectToTypeModel(const Model& model, float frame_width,
                         float frame_height, CameraMotion::Type type) {
  if (type == CameraMotion::INVALID) {
    return Model();  // Identity.
  }
  if (type == CameraMotion::UNSTABLE) {
    return ModelAdapter<Model>::Embed(
        TranslationAdapter::ProjectFrom(model, frame_width, frame_height));
  }
  if (type == CameraMotion::UNSTABLE_SIM) {
    return ModelAdapter<Model>::Embed(LinearSimilarityAdapter::ProjectFrom(
        model, frame_width, frame_height));
  }
  // case UNSTABLE_HOMOG does not occur except for mixtures; VALID keeps the
  // model unchanged.
  return model;
}
// Mixtures cannot be projected to lower degree-of-freedom models; invoking
// this specialization is a programming error (LOG(FATAL)).
template <>
inline MixtureHomography ProjectToTypeModel(const MixtureHomography&, float,
                                            float, CameraMotion::Type) {
  LOG(FATAL) << "Projection not supported for mixtures.";
  return MixtureHomography();
}
// Temporally downsamples |models| by composing each run of
// |downsample_scale| consecutive models into one (see header for details).
// If |model_type| is supplied, also emits the least stable type within each
// window into |downsampled_types|.
template <class Model>
void DownsampleMotionModels(
    const std::vector<Model>& models,
    const std::vector<CameraMotion::Type>* model_type, int downsample_scale,
    std::vector<Model>* downsampled_models,
    std::vector<CameraMotion::Type>* downsampled_types) {
  CHECK(downsampled_models) << "Expecting output models.";
  if (model_type) {
    CHECK_EQ(models.size(), model_type->size());
    // The loop below writes to downsampled_types whenever model_type is
    // given; check explicitly instead of dereferencing null.
    CHECK(downsampled_types) << "Expecting output types.";
  }
  downsampled_models->clear();
  if (downsampled_types) {
    downsampled_types->clear();
  }
  const int num_models = models.size();
  for (int model_idx = 0; model_idx < num_models;
       model_idx += downsample_scale) {
    // Window covers [model_idx, last_idx], clipped at the end of the input.
    const int last_idx =
        std::min<int>(model_idx + downsample_scale, num_models) - 1;
    CameraMotion::Type sampled_type = CameraMotion::VALID;
    if (model_type) {
      // Get least stable model within downsample window (max operation).
      for (int i = model_idx; i <= last_idx; ++i) {
        sampled_type = std::max(sampled_type, model_type->at(i));
      }
      downsampled_types->push_back(sampled_type);
    }
    // Concatenate models; result is models[model_idx] * ... * models[last_idx].
    Model composed = models[last_idx];
    for (int i = last_idx - 1; i >= model_idx; --i) {
      composed = ModelCompose2(models[i], composed);
    }
    downsampled_models->push_back(composed);
  }
}
// Subsamples |input| by |downsample_factor|, keeping entries at indices
// downsample_factor - 1, 2 * downsample_factor - 1, ... If the input size is
// not a multiple of the factor, the last entry is appended as termination
// (compatible with DownsampleMotionModels above).
template <class Container>
void SubsampleEntities(const Container& input, int downsample_factor,
                       Container* output) {
  CHECK(output);
  output->clear();
  if (input.empty()) {
    return;
  }
  for (int idx = downsample_factor - 1; idx < input.size();
       idx += downsample_factor) {
    output->push_back(input[idx]);
  }
  // We need to add last constraint as termination.
  if (input.size() % downsample_factor != 0) {
    output->push_back(input.back());
  }
}
// CameraMotionToModel specializations: each delegates to the corresponding
// CameraMotionTo* function declared above, which embeds a lower
// degree-of-freedom model into the requested type if the requested model is
// absent.
template <>
inline TranslationModel CameraMotionToModel(const CameraMotion& camera_motion) {
  TranslationModel model;
  CameraMotionToTranslation(camera_motion, &model);
  return model;
}
template <>
inline LinearSimilarityModel CameraMotionToModel(
    const CameraMotion& camera_motion) {
  LinearSimilarityModel model;
  CameraMotionToLinearSimilarity(camera_motion, &model);
  return model;
}
template <>
inline AffineModel CameraMotionToModel(const CameraMotion& camera_motion) {
  AffineModel model;
  CameraMotionToAffine(camera_motion, &model);
  return model;
}
template <>
inline Homography CameraMotionToModel(const CameraMotion& camera_motion) {
  Homography model;
  CameraMotionToHomography(camera_motion, &model);
  return model;
}
template <>
inline MixtureHomography CameraMotionToModel(
    const CameraMotion& camera_motion) {
  MixtureHomography model;
  CameraMotionToMixtureHomography(camera_motion, &model);
  return model;
}
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_CAMERA_MOTION_H_

View File

@ -0,0 +1,198 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Describes camera motion between two frames with various degree of freedom
// parametric motion models.
// In addition, stores features describing how reliable the estimated motion
// model is.
// Flags indicate several properties derived from the camera motion, e.g. if a
// frame is sharp, blurry or contains overlays.
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/motion_models.proto";
// Next tag: 33
message CameraMotion {
// Background motion expressed in various models.
// These are per-frame pair motions (from current to previous frame).
// Models are expressed in the un-normalized domain frame_width x frame_height
// that is passed to MotionEstimation (stored below).
optional TranslationModel translation = 1;
optional SimilarityModel similarity = 2;
optional LinearSimilarityModel linear_similarity = 3;
optional AffineModel affine = 4;
optional Homography homography = 5;
optional MixtureHomography mixture_homography = 8;
// Frame dimensions camera motion was computed over.
optional float frame_width = 31;
optional float frame_height = 32;
// Mixture homographies computed w.r.t. exponentially increasing
// regularizers. Above mixture_homography member is selected from spectrum
// based on amount of rolling shutter present in the video.
repeated MixtureHomography mixture_homography_spectrum = 23;
// Relative row sigma w.r.t. frame_height for mixture models.
optional float mixture_row_sigma = 10;
// Average of all motion vector magnitudes (without accounting for any motion
// model), within 10th to 90th percentile (to remove outliers).
optional float average_magnitude = 24 [default = 0.0];
// Inlier-weighted variance of the translation model.
// Specified, w.r.t. unnormalized video domain that motion models
// are computed for.
optional float translation_variance = 25 [default = 0.0];
// Ratio of inliers w.r.t. regular and stricter thresholds. In [0, 1].
optional float similarity_inlier_ratio = 29 [default = 0];
optional float similarity_strict_inlier_ratio = 30 [default = 0];
// Average registration error of homography in pixels.
// Note: These two parameters default to zero in-case homographies have not
// been estimated.
optional float average_homography_error = 11;
// Fraction, in [0,1], of homography inliers.
optional float homography_inlier_coverage = 12;
// Same as above but with stricter threshold.
// (For details, see: MotionEstimationOptions::strict_coverage_scale).
// Coverage is designed to measure the amount of significant outliers,
// which can affect the validity of the estimated homography.
// However, it does not discount small outliers, which occur in case
// of small rolling shutter wobbles. For this a stricter version of coverage
// is needed, which is essential for computing the rolling_shutter_guess,
// i.e. the increase in coverage by using mixtures vs. homographies.
optional float homography_strict_inlier_coverage = 22;
// Per-block inlier fraction for mixtures.
repeated float mixture_inlier_coverage = 13;
// Set based on stability analysis indicating if frame is likely to originate
// from a rolling shutter camera. (-1 is used to indicate frame was not
// tested, e.g. due to mixture deemed unstable for analysis).
// Guess is a scalar indicating by how much the mixture models (suitable for
// rolling shutter distortions) increased inlier coverage compared to a single
// homography. For example a value, of 1.3 indicates, that the mixture models
// increased inlier coverage by 30%.
// If not -1, range is in [0, inf] (values slightly smaller than 1 are
// possible due to suppression of noisy feature tracks during estimation).
optional float rolling_shutter_guess = 14;
// Indicating if CameraMotion is deemed to originate from rolling
// shutter camera (index >= 0), and if so, denotes the index in the
// mixture_homography_spectrum, where higher indices correspond to heavier
// regularized motions. If motion is not deemed to originate from a rolling
// shutter camera, index is set to -1.
optional int32 rolling_shutter_motion_index = 16 [default = -1];
// List of overlay indices (cell locations in column major format) over domain
// of size overlay_domain x overlay_domain, where
// overlay_domain is set by MotionEstimation to
// MotionEstimationOptions::OverlayDetectionOptions::analysis_mask_size.
// Overlay analysis is performed over chunk of frames, as specified by
// MotionEstimationOptions::overlay_analysis_chunk_size, with the resulting
// overlay indices being assigned to each frame of the chunk.
// Consequently it suffices to store the result only for the first frame
// of every chunk. Subsequent frames store a single negative index relative
// to the first chunk frame indicating where to locate the overlay indices.
// Specifically if for frame f, overlay_indices(0) == -2, overlay indices for
// corresponding chunk can be found at frame f - 2.
// For details about how overlay indices are used to flag a frame to contain
// an overlay, see MotionFilterOptions::OverlayOptions.
repeated int32 overlay_indices = 17;
optional int32 overlay_domain = 18 [default = 10];
// CameraMotion type indicates whether highest degree of freedom (DOF)
// model estimation was deemed stable, in which case CameraMotion::Type is set
// to VALID.
// If a model was deemed not stable (according to *StabilityBounds in
// MotionEstimationOptions), it is set to the lower dof type which was deemed
// stable.
enum Type {
VALID = 0; // All requested motion models estimated reliably.
UNSTABLE_HOMOG = 1; // Fallback to homographies, mixture unreliable.
UNSTABLE_SIM = 2; // Fallback to similarity model, homography
// unreliable.
UNSTABLE = 3; // Fallback to translation model, similarity
// unreliable, legacy naming.
INVALID = 4; // Identity model, translation unreliable.
}
optional Type type = 6 [default = VALID];
// If set, stores original type in case it was overridden (by filtering
// functions, etc.).
optional Type overridden_type = 15 [default = VALID];
// Set of optional *bit* flags set for various purposes.
enum Flags {
FLAG_SHOT_BOUNDARY = 1; // Set to indicate presence of a
// shot boundary.
FLAG_BLURRY_FRAME = 2;
FLAG_MAJOR_OVERLAY = 4;
FLAG_SHARP_FRAME = 8; // Set if frame is considered sharp
// in a neighborhood of frames.
FLAG_SINGULAR_ESTIMATION = 16; // Indicates that estimation resulted
// in singular optimization problem.
// Used internally by MotionEstimation.
// Indicates if shot boundary is part of a fade. If so, all frames of the
// fade will be labeled with the FLAG but only the begin and end of the fade
// will have the FLAG_SHOT_BOUNDARY set.
FLAG_SHOT_FADE = 32;
FLAG_DUPLICATED = 64; // Set if frame is exact duplicate of
// previous frame.
FLAG_CENTER_FRAME = 128; // Indicates this frame is at the
// center of the sequence. Currently
// used to constrain stabilizing crop
// transform.
}
optional int32 flags = 19 [default = 0];
// Same as in RegionFlowFeatureList (from region_flow.proto), measures blur
// as average cornerness over textured areas. As it depends on the image
// content, should only be used relative.
optional float blur_score = 20;
// Quantifies amount of blur. Specified as ratio w.r.t. sharpest matching
// frame, i.e. 1 indicates no blur, values > 1 amount of blur w.r.t. sharpest
// frame.
optional float bluriness = 21 [default = 0.0];
// Same as in RegionFlowFeatureList (from region_flow.proto). Stores fraction
// of long feature tracks that got rejected for this frame.
optional float frac_long_features_rejected = 26;
// Same as in RegionFlowFeatureList (from region_flow.proto).
// Timestamp in micro seconds of the underlying frame.
optional int64 timestamp_usec = 27 [default = 0];
// Same as in RegionFlowFeatureList (from region_flow.proto).
// Denotes frame that motion was computed w.r.t. to, locally to the current
// frame. Values < 0 indicate backward tracking, while values > 0 indicate
// forward tracking. For example, match_frame = -1, indicates tracking is
// from current to previous frame.
optional int32 match_frame = 28 [default = 0];
// Deprecated fields.
extensions 9;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,168 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_UTIL_TRACKING_FLOW_PACKAGER_H_
#define MEDIAPIPE_UTIL_TRACKING_FLOW_PACKAGER_H_
#include <string>
#include <vector>
#include "absl/strings/string_view.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/util/tracking/flow_packager.pb.h"
#include "mediapipe/util/tracking/motion_estimation.pb.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
// Usage (output):
// FlowPackager flow_packager((FlowPackagerOptions()));
//
// // Input: Feature lists and optional camera motion
// vector<RegionFlowFeatureList> input_features; // Externally supplied.
// vector<CameraMotion> input_motions;
//
// const int num_frames = input_features.size();
//
// // Can encode to either TrackingContainerFormat or use protos.
// TrackingContainerFormat container;
// TrackingContainerProto proto;
//
// for (int f = 0; f < num_frames; ++f) {
// // Obtain tracking data.
// TrackingData track_data;
// flow_packager.PackFlow(input_features[f],
// input_motions[f],
// &track_data);
//
// // Encode tracking data.
// BinaryTrackingData binary_data;
// flow_packager.EncodeTrackingData(track_data,
// &binary_data);
//
// // Add to either container format or proto.
// // Container:
// TrackingContainer* track_data_encoded = container.add_track_data();
// flow_packager.BinaryTrackingDataToContainer(binary_data,
// track_data_encoded);
// // Proto:
// proto.add_track_data()->CopyFrom(binary_data);
// }
//
// // Write meta and term containers.
// flow_packager.FinalizeTrackingContainerFormat(&container);
// flow_packager.FinalizeTrackingProto(&proto);
//
// // Convert to binary std::string to stream out.
// std::string output;
// flow_packager.TrackingContainerFormatToBinary(container, &output);
// // OR:
// proto.SerializeToString(&output);
// Usage (input):
// std::string input;
// FlowPackager flow_packager((FlowPackagerOptions()));
// TrackingContainerFormat container;
// flow_packager.TrackingContainerFormatFromBinary(input, &container);
//
// // Obtain Track data.
// vector<TrackingData> tracking_data; // To be used by tracking.
// for (const auto& track_data_encoded : container.track_data()) {
// tracking_data.push_back(TrackingData());
// flow_packager.DecodeTrackingData(track_data_encoded,
// &tracking_data.back());
// }
//
// // Use tracking_data with Tracker.
class CameraMotion;
class RegionFlowFeatureList;
// Packs region flow features (and optional camera motion) into compact
// TrackingData, and converts tracking data to/from the binary container and
// proto representations. See the usage examples above.
class FlowPackager {
 public:
  explicit FlowPackager(const FlowPackagerOptions& options);
  FlowPackager(const FlowPackager&) = delete;
  FlowPackager& operator=(const FlowPackager&) = delete;
  // Packs feature_list and optional camera_motion into tracking_data.
  void PackFlow(const RegionFlowFeatureList& feature_list,
                const CameraMotion* camera_motion,  // optional.
                TrackingData* tracking_data) const;
  // Converts TrackingData to condensed binary representation.
  void EncodeTrackingData(const TrackingData& tracking_data,
                          BinaryTrackingData* binary_data) const;
  // Inverse of EncodeTrackingData.
  void DecodeTrackingData(const BinaryTrackingData& data,
                          TrackingData* tracking_data) const;
  // Wraps encoded tracking data in a TrackingContainer, and vice versa.
  void BinaryTrackingDataToContainer(const BinaryTrackingData& binary_data,
                                     TrackingContainer* container) const;
  void BinaryTrackingDataFromContainer(const TrackingContainer& container,
                                       BinaryTrackingData* binary_data) const;
  void DecodeMetaData(const TrackingContainer& data, MetaData* meta_data) const;
  // Fills in meta (first container) and termination data (last container).
  // Optionally, pass timestamps for each frame.
  void FinalizeTrackingContainerFormat(
      std::vector<uint32>* timestamps,  // optional, can be null.
      TrackingContainerFormat* container_format);
  void FinalizeTrackingContainerProto(
      std::vector<uint32>* timestamps,  // optional, can be null.
      TrackingContainerProto* proto);
  // Fast encode to binary representation.
  void TrackingContainerFormatToBinary(
      const TrackingContainerFormat& container_format, std::string* binary);
  // Fast decode from binary representation.
  void TrackingContainerFormatFromBinary(
      const std::string& binary, TrackingContainerFormat* container_format);
  // Checks whether tracking data can be encoded in high profile mode without
  // duplicating any features. This occurs if the horizontal distance between
  // two features is less than 64.
  bool CompatibleForEncodeWithoutDuplication(
      const TrackingData& tracking_data) const;
  // Helper function for test. Sorts according to scaled, integer based
  // lexicographical ordering.
  // Declared public for access by test.
  void SortRegionFlowFeatureList(float scale_x, float scale_y,
                                 RegionFlowFeatureList* feature_list) const;
  // Removes binary encoded container from std::string and parses it to
  // container. Returns header std::string of the parsed container. Useful for
  // random seek.
  std::string SplitContainerFromString(absl::string_view* binary_data,
                                       TrackingContainer* container);

 private:
  // Fills meta_data from per-frame timestamps (msecs) and encoded data sizes.
  void InitializeMetaData(int num_frames, const std::vector<uint32>& msecs,
                          const std::vector<int>& data_sizes,
                          MetaData* meta_data) const;
  // Serializes container to binary std::string and adds it to binary_data.
  void AddContainerToString(const TrackingContainer& container,
                            std::string* binary_data);

 private:
  FlowPackagerOptions options_;
};
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_FLOW_PACKAGER_H_

View File

@ -0,0 +1,319 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/motion_models.proto";
import "mediapipe/util/tracking/region_flow.proto";
// Messages encapsulating compressed and uncompressed TrackingData.
//
// Uncompressed tracking data can be aggregated via an TrackingDataChunk
// (e.g. to be cached to file per chunk). The whole chunk persists in memory
// after reading.
//
// Compressed tracking data can be aggregated as binary encoded TrackingData
// messages into two container formats (with support for random seeking):
// 1) TrackingContainerProto:
// Encoding using proto buffer wire format, using default proto
// serialization and de-serialization to binary string.
// The container uses the MetaData message to store the stream offsets and
// milliseconds for each frame of TrackingData. TrackingData itself is custom
// encoded to binary using FlowPackager::EncodeTrackingData and the resulting
// binary blob wrapped in a BinaryTrackingData message.
// 2) TrackingContainerFormat:
// Encoding without any dependencies to protobuffers, for clients without
// proto buffer support.
// Encoding is based on encoding binary blobs of data wrapped into repeated
// containers. The layout of a container is described by the message
// TrackingContainer and serialized to binary data as described below
// (without using proto encoding). Therefore, message TrackingContainer is
// mostly for documentation purposes than for direct use.
// The format is described by the proto message TrackingContainerFormat (used
// internally by FlowPackager) however serialization and de-serialization
// to binary string is performed using custom methods supplied by
// FlowPackager (TrackingContainerFormatToBinary and
// TrackingContainerFormatFromBinary).
// The format stores the MetaData first as above, although using custom
// encoding. TrackingData is encoded to binary as above using
// FlowPackager::EncodeTrackingData and the resulting binary blob is stored
// within a TrackingContainer.
// Next flag: 9
// Resolution-independent tracking data for a single frame.
message TrackingData {
  // Bit flags (OR'ed together into frame_flags below) describing how this
  // frame's data was encoded.
  enum FrameFlags {
    FLAG_PROFILE_BASELINE = 0;
    FLAG_PROFILE_HIGH = 1;
    FLAG_HIGH_FIDELITY_VECTORS = 2;
    FLAG_BACKGROUND_UNSTABLE = 4;  // Background model could not be estimated.
    FLAG_DUPLICATED = 8;           // Frame is duplicated, i.e. identical to
                                   // previous one.

    // Indicates the beginning of a new chunk. In this case the track_id's
    // are not compatible w.r.t. previous one.
    FLAG_CHUNK_BOUNDARY = 16;
  }

  optional int32 frame_flags = 1 [default = 0];

  // Tracking data is resolution independent specified w.r.t.
  // specified domain.
  optional int32 domain_width = 2;
  optional int32 domain_height = 3;

  // Aspect ratio (w/h) of the original frame tracking data was computed from.
  optional float frame_aspect = 6 [default = 1.0];

  optional Homography background_model = 4;

  // Stores num_elements vectors of motion data. (x,y) position encoded via
  // row_indices and col_starts, as compressed sparse column matrix storage
  // format:
  // (https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29),
  // Vector data is stored as (dx, dy) position. Optionally we store the fitting
  // error and track id for each feature.
  message MotionData {
    optional int32 num_elements = 1;

    // #num_elements pairs (flow_x, flow_y) densely packed.
    repeated float vector_data = 2 [packed = true];

    // Stores corresponding track index for each feature. Features belonging
    // to the same track over time are assigned the same id.
    // NOTE: Due to size, tracking ids are never stored as compressed binary
    // tracking data.
    repeated int32 track_id = 3 [packed = true];

    // # num_elements row indices.
    repeated int32 row_indices = 4 [packed = true];

    // Start index in above array for each column (#domain_width + 1 entries).
    repeated int32 col_starts = 5 [packed = true];

    // Feature descriptors for num_elements feature points.
    repeated BinaryFeatureDescriptor feature_descriptors = 6;
  }

  optional MotionData motion_data = 5;

  // Total number of features in our analysis.
  optional uint32 global_feature_count = 7;

  // Average of all motion vector magnitudes (without accounting for any motion
  // model), within 10th to 90th percentile (to remove outliers).
  optional float average_motion_magnitude = 8;
}
// Aggregates uncompressed per-frame TrackingData items (e.g. to be cached to
// file per chunk); the whole chunk persists in memory after reading.
message TrackingDataChunk {
  // One frame of tracking data plus its position in the original stream.
  message Item {
    optional TrackingData tracking_data = 1;
    // Global frame index.
    optional int32 frame_idx = 2;
    // Corresponding timestamp.
    optional int64 timestamp_usec = 3;
    // Previous frame timestamp.
    optional int64 prev_timestamp_usec = 4;
  }
  repeated Item item = 1;
  // Set as marker for last chunk.
  optional bool last_chunk = 2 [default = false];
  // Set as marker for first chunk.
  optional bool first_chunk = 3 [default = false];
}
// TrackingData in compressed binary format. Obtainable via
// FlowPackager::EncodeTrackingData. Details of binary encode are below.
message BinaryTrackingData {  // TrackingContainer::header = "TRAK"
  // Custom binary encoding of a single TrackingData (format documented below).
  optional bytes data = 1;
}
// Detailed explanation of binary Tracking data encode (LITTLE ENDIAN encode!)
// TrackingData is stored in binary as a struct of the above fields and the
// compressed motion data in sparse column matrix storage format.
// (https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29)
// Specifically, TrackingData is encoded as:
// { frame_flags : 32 bit int (from member)
// domain_width : 32 bit int (from member)
// domain_height : 32 bit int (from member)
// frame_aspect : 32 bit float (from member)
//
// background_model : 6 * 32 bit float (dx, dy, a, b, c, d of AffineModel)
// scale : 32 bit float (scale vectors are multiplied with)
// num_vectors : 32 bit int (from member num_elements)
//
// col_start_delta : (domain_width + 1) * 8 bit uint (col starts delta
// encoded)
// row_idx_size : 32 bit int (size of row_idx array <= num_vectors)
// row_idx : row_idx_size * 8 bit uint
// vector_size : 32 bit int (size of vector_data)
// vector_data : vector_size * [8 bit | 16 bit] int
// (depending on FLAG_HIGH_FIDELITY_VECTORS)
// }
//
// >> Baseline encode <<
// Scale is determined such that maximum vector value (maximum across x and y)
// is mapped to highest 8 bit or 16 bit SIGNED int
// (i.e. 7 or 15 bit resolution respectively).
// Vector values are multiplied by this scale (storing float in int with
// truncation) and (dx, dy) is packed as [dy | dx] into a 16bit or 32 bit word.
// Unpacking requires therefore dividing the vector values by scale.
//
// Column starts are delta compressed, that is, col_start_delta[i] stores
// col_starts(i) - col_starts(i - 1) from MotionData.
//
// Row indices are directly stored at 8 bit uints, that is row_idx_size ==
// num_vectors in this case.
//
//
// >> High profile encode <<
// Scale is determined as above but for maximum vector deltas (maximum across x
// and y of magnitude in difference between two adjacent vectors). Vector value
// deltas are multiplied by this scale before encoding.
//
// Encoding is more complex compared to baseline. Instead of vector value, delta
// vector values (difference in dx = ddx, difference in dy = ddy)
// are multiplied by scale and stored packed as [ddy | ddx] into a 16bit or
// 32bit word. Compression algorithm accounts for error accumulation, so
// unpacking should first add deltas in integer domain (for x and y separately)
// and then divide by scale to yield (an approximation) of the
// original vector value.
// Most importantly, not every vector value is stored, but only if the delta is
// above the FlowPackagerOptions::high_profile_reuse_threshold, in which case we
// advance to the next vector data. Otherwise the previous vector is used.
//
// The information whether to advance is stored for each vector in the
// highest bit of the row index (FlowPackagerOptions::ADVANCE_FLAG). Row
// indices are not stored as in the baseline profile directly, but as deltas
// (reset at the beginning of every column). As deltas are small it is often
// possible to store two deltas (if both are < 8) in a single byte. This is
// indicated by the second highest flag in the row index
// (FlowPackagerOptions::DOUBLE_INDEX_ENCODE). If set, row index stores
// [row_delta_1 | row_delta_2] in the lower 6 bit. Note, that the advance flag
// applies uniformly to both deltas in this case.
// Sidenote (edge case): Due to the use of the top 2 bits as flags,
// at times we cannot store the full row delta in the lower 6 bits.
// In this case the vector is duplicated (using the ADVANCE_FLAG)
// until the delta sum of duplicated vectors reaches the original delta.
// Consequently, the compressed vector field in high profile may contain a few
// vectors more than the original.
//
// Column starts are delta compressed as in baseline, but account for double
// index encodes. Therefore each column delta is reduced by the number of double
// index encodes occurring for this column. This has to be replicated on the
// decoding side, each delta needs to be increased by the number of double index
// encodes encountered during encoding.
// Stores offsets for random seek and time offsets for each frame of
// TrackingData. Stream offsets are specified relative w.r.t. end of metadata
// blob.
// Offsets specify start of the corresponding binary encoded TrackingContainer
// (for TrackingContainerFormat) or BinaryTrackingData proto (for
// TrackingContainerProto).
message MetaData {  // TrackingContainer::header = "META"
  // Number of TrackingData frames described by track_offsets below.
  optional fixed32 num_frames = 2;

  // Seek information for a single frame of TrackingData.
  message TrackOffset {
    optional fixed32 msec = 1;  // Time offset of the metadata in msec.
    optional fixed32 stream_offset = 2;  // Offset of TrackingContainer or
                                         // respectively BinaryTrackingData
                                         // in stream.
                                         // Specified w.r.t. end of the
                                         // Metadata.
  }
  repeated TrackOffset track_offsets = 3;
}
// TrackingContainer is self-describing container format to store arbitrary
// chunks of binary data. Each container is typed via its 4 character header,
// versioned via an int, and followed by the size of the binary data and the
// actual data. Designed for clients without availability of protobuffer
// support.
// Note: This message is mainly used for documentation purposes and uses custom
// encoding as specified by FlowPackager::TrackingContainerFormatToBinary.
// Default binary size of a TrackingContainer (DO NOT CHANGE!):
// header:  4 byte +
// version: 4 byte +
// size:    4 byte +
// data     #size
// SUM:     12 + #size.
message TrackingContainer {
  optional string header = 1;                  // 4 character header.
  optional fixed32 version = 2 [default = 1];  // Version information.
  optional fixed32 size = 3;  // Size of binary data held by container
  optional bytes data = 4;    // Binary data encoded.

  // DO NOT alter layout of TrackingContainer.
  // Use version to extend or alter encoded binary data.
}
// Container format for clients without proto support (written via
// FlowPackager::TrackingContainerFormatToBinary and read via
// FlowPackager::TrackingContainerFormatFromBinary).
// Proto here is intermediate format for documentation and internal use.
// Stores multiple TrackingContainers of different types.
// Meta data is stored first, to facilitate random seek (via stream offset
// positions) to arbitrary binary TrackingData. Termination container signals
// end of stream.
message TrackingContainerFormat {
  optional TrackingContainer meta_data = 1;  // Wraps binary meta data, via
                                             // custom encode.
  repeated TrackingContainer track_data = 2;  // Wraps BinaryTrackingData.

  // Add new TrackingContainers above before end of stream indicator.
  // Zero sized termination container with TrackingContainer::header = "TERM".
  optional TrackingContainer term_data = 3;
}
// Simplified proto format of above TrackingContainerFormat. Instead of using
// self-describing TrackingContainer's, we simply use the proto wire format for
// encoding and decoding (proto format is typed and versioned via ids).
message TrackingContainerProto {
  // Seek/time metadata for the frames in track_data below.
  optional MetaData meta_data = 1;
  // One custom-encoded TrackingData blob per frame.
  repeated BinaryTrackingData track_data = 2;
}
// Options controlling compression and encoding.
message FlowPackagerOptions {
  // Tracking data is resolution independent specified w.r.t.
  // specified domain. Only values <= 256 are supported if binary tracking data
  // is requested to be supported (see below).
  optional int32 domain_width = 1 [default = 256];
  optional int32 domain_height = 2 [default = 192];

  // Needs to be set for calls to FlowPackager::EncodeTrackingData. If encoding
  // is not required, can be set to false in which case a higher domain_width
  // can be used.
  optional bool binary_tracking_data_support = 6 [default = true];

  // If set, uses the delta-based high profile encode instead of baseline.
  optional bool use_high_profile = 3 [default = false];

  // If set uses 16 bit encode for vector data, in BinaryTrackingData,
  // otherwise only 8 bits are used.
  optional bool high_fidelity_16bit_encode = 4 [default = true];

  // In high profile encode, re-use previously encoded vector when absolute
  // difference to current vector is below threshold.
  optional float high_profile_reuse_threshold = 5 [default = 0.5];

  // High profile encoding flags (stored in the top bits of each row index).
  enum HighProfileEncoding {
    ADVANCE_FLAG = 0x80;         // Advance to the next vector datum.
    DOUBLE_INDEX_ENCODE = 0x40;  // Two row deltas packed into one byte.
    INDEX_MASK = 0x3F;           // Lower 6 bits hold the row index delta(s).
  }
}

View File

@ -0,0 +1,93 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/camera_motion.proto";
import "mediapipe/util/tracking/frame_selection_solution_evaluator.proto";
import "mediapipe/util/tracking/region_flow.proto";
// Stores selected timestamps and corresponding frame index.
message FrameSelectionTimestamp {
  optional int64 timestamp = 1;  // Timestamp of the selected frame.

  // Frame index of the selected frame in the initial video stream. If this
  // timestamp was manufactured, this will be the index of the initial frame.
  optional int32 frame_idx = 2;

  // If this timestamp was manufactured, the timestamp of the original frame.
  optional int64 processed_from_timestamp = 3 [default = -1];
}
// Stores the result of the frame selection, with composited features.
// Next index: 6
message FrameSelectionResult {
  optional int64 timestamp = 1;  // Timestamp of the selected frame.

  // Frame index of the selected frame in the initial video stream. If this
  // timestamp was manufactured, this will be the index of the initial frame.
  optional int32 frame_idx = 2;

  // CameraMotion from selected item to previous selected item.
  optional CameraMotion camera_motion = 3;

  // Features from selected item to previous selected item.
  optional RegionFlowFeatureList features = 4;

  // If this FrameSelectionResult was the result of processing a previous one,
  // the timestamp of the original frame.
  optional int64 processed_from_timestamp = 5 [default = -1];
}
// Next index: 7
// A single criterion driving which frames get selected.
message FrameSelectionCriterion {
  // Interval at which frames should be sampled; set to zero if sampling should
  // not be enforced (i.e. selection is performed w.r.t. other criteria).
  optional int32 sampling_rate = 1 [default = 0];

  // Bandwidth used during dynamic programming. The larger the bandwidth the
  // more accurate the result w.r.t. the specified sampling rate. Smaller
  // bandwidths bias the solution suboptimally to center around the mean
  // frame numbers of the sampling rate.
  // If in (0, 1), assumed to specify fraction of total number of input frames,
  // otherwise must be an integer.
  optional float bandwidth_frames = 2 [default = 50];

  // Search radius for dynamic programming (how many frames you are allowed to
  // search around the previous frame).
  optional int32 search_radius_frames = 3 [default = 1];

  // Allows one to specify custom solution selection criteria (i.e. different
  // way to choose the best row of the computed cost matrix).
  optional FrameSelectionSolutionEvaluatorType solution_evaluator = 4;

  // Outputs a fixed number of frames and automatically sets the appropriate
  // sampling rate. Set to 0 by default (i.e. not enabled).
  optional int32 max_output_frames = 5 [default = 0];
}
// Options for computing frame selection.
// TODO: Support multiple criteria if required. Currently uses only the
// first one.
message FrameSelectionOptions {
  // Selection criteria; currently only the first entry is used (see TODO).
  repeated FrameSelectionCriterion criterion = 1;

  // FrameSelection buffers incoming CameraMotions for specified chunk size
  // and creates cost matrices upon reaching the limit.
  // TODO: Implement if necessary (currently nothing is cleared upon
  // reaching the limit).
  optional int32 chunk_size = 2 [default = 100];
}

View File

@ -0,0 +1,27 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
// Extension point: options for custom solution evaluator implementations.
message FrameSelectionSolutionEvaluatorOptions {
  extensions 20000 to max;
}
// Specifies a solution evaluator by class name, with optional options.
message FrameSelectionSolutionEvaluatorType {
  // Class of type FrameSelectionSolution that computes the best row.
  optional string class_name = 1 [default = "FrameSelectionSolutionEvaluator"];
  // Options forwarded to the instantiated evaluator.
  optional FrameSelectionSolutionEvaluatorOptions options = 2;
}

View File

@ -0,0 +1,116 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/image_util.h"
#include <algorithm>
#include <vector>
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/util/tracking/region_flow.h"
namespace mediapipe {
// Returns the median of the per-pixel L1 color distance between img_1 and
// img_2. Both images must have the same dimensions and channel count.
float FrameDifferenceMedian(const cv::Mat& img_1, const cv::Mat& img_2) {
  CHECK(img_1.size() == img_2.size());
  CHECK_EQ(img_1.channels(), img_2.channels());

  const int num_channels = img_1.channels();
  const float channel_norm = 1.0f / num_channels;

  // One channel-averaged absolute difference per pixel.
  std::vector<float> per_pixel_diffs;
  per_pixel_diffs.reserve(img_1.cols * img_1.rows);

  for (int row = 0; row < img_1.rows; ++row) {
    const uint8* ptr_1 = img_1.ptr<uint8>(row);
    const uint8* ptr_2 = img_2.ptr<uint8>(row);
    const int row_elems = num_channels * img_1.cols;
    int idx = 0;
    while (idx < row_elems) {
      float diff_sum = 0.0f;
      for (int c = 0; c < num_channels; ++c, ++idx) {
        diff_sum += std::abs(static_cast<int>(ptr_1[idx]) -
                             static_cast<int>(ptr_2[idx]));
      }
      per_pixel_diffs.push_back(diff_sum * channel_norm);
    }
  }

  // Partial sort up to the middle element is sufficient for the median.
  auto median_it = per_pixel_diffs.begin() + per_pixel_diffs.size() / 2;
  std::nth_element(per_pixel_diffs.begin(), median_it, per_pixel_diffs.end());
  return *median_it;
}
// Fills `color_map` with `steps` entries of Matlab's jet color map, each
// component scaled to [0, 255].
void JetColoring(int steps, std::vector<Vector3_f>* color_map) {
  CHECK(color_map != nullptr);
  color_map->resize(steps);
  const float inv_steps = 1.0f / steps;
  for (int idx = 0; idx < steps; ++idx) {
    // Normalized position of this bin in [-1, 1].
    const float frac = 2.0f * (idx * inv_steps - 0.5f);
    if (frac < -0.8f) {
      (*color_map)[idx] = Vector3_f(0, 0, 0.6f + (frac + 1.0) * 2.0f) * 255.0f;
    } else if (frac >= -0.8f && frac < -0.25f) {
      (*color_map)[idx] = Vector3_f(0, (frac + 0.8f) * 1.82f, 1.0f) * 255.0f;
    } else if (frac >= -0.25f && frac < 0.25f) {
      (*color_map)[idx] =
          Vector3_f((frac + 0.25f) * 2.0f, 1.0f, 1.0 + (frac + 0.25f) * -2.0f) *
          255.0f;
    } else if (frac >= 0.25f && frac < 0.8f) {
      (*color_map)[idx] =
          Vector3_f(1.0f, 1.0f + (frac - 0.25f) * -1.81f, 0.0f) * 255.0f;
    } else if (frac >= 0.8f) {
      (*color_map)[idx] =
          Vector3_f(1.0f + (frac - 0.8f) * -2.0f, 0.0f, 0.0f) * 255.0f;
    } else {
      // All real-valued fracs are covered above; reaching here indicates a
      // non-finite value.
      LOG(ERROR) << "Out of bound value. Should not occur.";
    }
  }
}
// Draws each salient point with positive weight in `salient_points` as an
// ellipse into `image`; optionally also renders the (rotated) bounding box
// of each ellipse.
void RenderSaliency(const SalientPointFrame& salient_points,
                    const cv::Scalar& line_color, int line_thickness,
                    bool render_bounding_box, cv::Mat* image) {
  // Visualize salient points.
  for (const auto& point : salient_points.point()) {
    if (point.weight() > 0) {
      // Scale normalized coordinates to the pixel space of `image`.
      SalientPoint copy = point;
      ScaleSalientPoint(image->cols, image->rows, &copy);
      Vector2_f pt(copy.norm_point_x(), copy.norm_point_y());
      cv::ellipse(*image, cv::Point(pt.x(), pt.y()),
                  cv::Size(copy.norm_major(), copy.norm_minor()),
                  copy.angle() / M_PI * 180.0,  // rotation in degrees.
                  0,    // start angle
                  360,  // end angle
                  line_color, line_thickness);
      if (render_bounding_box) {
        // Compute the 4 corners of the ellipse's bounding box and connect
        // them with anti-aliased lines.
        std::vector<Vector2_f> ellipse_bounding_box;
        BoundingBoxFromEllipse(pt, copy.norm_major(), copy.norm_minor(),
                               copy.angle(), &ellipse_bounding_box);
        std::vector<cv::Point> corners;
        corners.reserve(4);
        for (const Vector2_f& corner : ellipse_bounding_box) {
          corners.emplace_back(cv::Point(corner.x(), corner.y()));
        }
        for (int k = 0; k < 4; ++k) {
          cv::line(*image, corners[k], corners[(k + 1) % 4], line_color,
                   line_thickness, CV_AA);
        }
      }
    }
  }
}

View File

@ -0,0 +1,192 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_UTIL_TRACKING_IMAGE_UTIL_H_
#define MEDIAPIPE_UTIL_TRACKING_IMAGE_UTIL_H_
#include <vector>
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/framework/port/vector.h"
#include "mediapipe/util/tracking/motion_models.pb.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
// Returns median of the L1 color distance between img_1 and img_2.
float FrameDifferenceMedian(const cv::Mat& img_1, const cv::Mat& img_2);
// Matlab's jet color map (returned assuming RGB channel order in [0, 1]
// normalized intensity domain). For details: http://goo.gl/gmHKZ
// Returned as map with num_entries entries.
void JetColoring(int num_entries, std::vector<Vector3_f>* color_map);
// Draws a saliency point frame to a single frame.
// Optionally renders axis aligned bounding box for each SalientPointFrame.
void RenderSaliency(const SalientPointFrame& salient_points,
const cv::Scalar& line_color, int line_thickness,
bool render_bounding_box, cv::Mat* image);
// Templated CopyBorder methods for increased speed. In-place border copy
// for specified Mat of type T with channels. Passed matrix is assumed to be of
// full size, that is we copy the content at [border, cols - 2 * border] x
// [border, rows - 2 * border] to the full size.
template <typename T, int border, int channels>
void CopyMatBorder(cv::Mat* mat);
// Same as above for copying border only in X or Y
template <typename T, int border, int channels>
void CopyMatBorderX(cv::Mat* mat);
template <typename T, int border, int channels>
void CopyMatBorderY(cv::Mat* mat);
// In-place border fill for `mat` (assumed to be full size): replicates the
// interior region [border, cols - 2 * border) x [border, rows - 2 * border)
// outward into the border pixels by reflecting interior rows/columns, with
// reads clamped to the valid interior for small images.
template <typename T, int border, int channels>
void CopyMatBorder(cv::Mat* mat) {
  const int width = mat->cols - 2 * border;
  const int height = mat->rows - 2 * border;
  // Maximum values we clamp at to avoid going out of bound small images.
  const int max_w = width - 1;
  const int max_h = height - 1;
  // Top rows: destination row (border - 1 - r) mirrors interior row
  // (border + r), clamped via max_h.
  for (int r = 0; r < border; ++r) {
    const T* src_ptr =
        mat->ptr<T>(border + std::min(r, max_h)) + border * channels;
    T* dst_ptr = mat->ptr<T>(border - 1 - r);
    // Top left elems.
    for (int i = 0; i < border; ++i, dst_ptr += channels) {
      for (int j = 0; j < channels; ++j) {
        dst_ptr[j] = src_ptr[std::min(max_w, border - 1 - i) * channels + j];
      }
    }
    // src and dst should point to same column from here.
    DCHECK_EQ(0, (src_ptr - dst_ptr) * sizeof(T) % mat->step[0]);
    // Top row copy.
    memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0]));
    src_ptr += width * channels;  // Points one behind end.
    dst_ptr += width * channels;
    // Top right elems.
    for (int i = 0; i < border; ++i, dst_ptr += channels) {
      if (i <= max_w) {
        src_ptr -= channels;
      }
      for (int j = 0; j < channels; ++j) {
        dst_ptr[j] = src_ptr[j];
      }
    }
  }
  // Left and right border.
  for (int r = 0; r < height; ++r) {
    // Get pointers to left most and right most column within image.
    T* left_ptr = mat->ptr<T>(r + border) + border * channels;
    T* right_ptr = left_ptr + (width - 1) * channels;
    for (int i = 0; i < border; ++i) {
      for (int j = 0; j < channels; ++j) {
        left_ptr[-(i + 1) * channels + j] =
            left_ptr[std::min(max_w, i) * channels + j];
        right_ptr[(i + 1) * channels + j] =
            right_ptr[-std::min(max_w, i) * channels + j];
      }
    }
  }
  // Bottom rows: destination row (border + height + r) mirrors interior row
  // (border + height - 1 - r), clamped via max_h.
  for (int r = 0; r < border; ++r) {
    const T* src_ptr = mat->ptr<T>(border + height - 1 - std::min(r, max_h)) +
                       border * channels;
    T* dst_ptr = mat->ptr<T>(border + height + r);
    // Bottom left elems.
    for (int i = 0; i < border; ++i, dst_ptr += channels) {
      for (int j = 0; j < channels; ++j) {
        dst_ptr[j] = src_ptr[(border - 1 - std::min(max_w, i)) * channels + j];
      }
    }
    // src and dst should point to same column from here.
    DCHECK_EQ(0, (dst_ptr - src_ptr) * sizeof(T) % mat->step[0]);
    memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0]));
    src_ptr += width * channels;  // Points one behind the end.
    dst_ptr += width * channels;
    // Bottom right elems.
    for (int i = 0; i < border; ++i, dst_ptr += channels) {
      if (i <= max_w) {
        src_ptr -= channels;
      }
      for (int j = 0; j < channels; ++j) {
        dst_ptr[j] = src_ptr[j];
      }
    }
  }
}
// Same as CopyMatBorder, but only fills the left and right border columns
// (rows outside the interior are not touched).
template <typename T, int border, int channels>
void CopyMatBorderX(cv::Mat* mat) {
  const int width = mat->cols - 2 * border;
  const int height = mat->rows - 2 * border;
  // Maximum values we clamp at to avoid going out of bound small images.
  const int max_w = width - 1;
  // Left and right border.
  for (int r = 0; r < height; ++r) {
    // Pointers to the left-most and right-most interior column in this row.
    T* left_ptr = mat->ptr<T>(r + border) + border * channels;
    T* right_ptr = left_ptr + (width - 1) * channels;
    for (int i = 0; i < border; ++i) {
      for (int j = 0; j < channels; ++j) {
        // Reflect interior columns outward, clamped to the valid area.
        left_ptr[-(i + 1) * channels + j] =
            left_ptr[std::min(i, max_w) * channels + j];
        right_ptr[(i + 1) * channels + j] =
            right_ptr[-std::min(max_w, i) * channels + j];
      }
    }
  }
}
// Same as CopyMatBorder, but only fills the top and bottom border rows via
// row-wise memcpy of interior rows. Corner regions (outside the interior
// column span) are not touched.
template <typename T, int border, int channels>
void CopyMatBorderY(cv::Mat* mat) {
  const int width = mat->cols - 2 * border;
  const int height = mat->rows - 2 * border;
  // Maximum values we clamp at to avoid going out of bound small images.
  const int max_h = height - 1;
  // Top rows: row (border - 1 - r) mirrors interior row (border + r).
  for (int r = 0; r < border; ++r) {
    const T* src_ptr =
        mat->ptr<T>(border + std::min(max_h, r)) + border * channels;
    T* dst_ptr = mat->ptr<T>(border - 1 - r) + border * channels;
    memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0]));
  }
  // Bottom rows: row (border + height + r) mirrors interior row
  // (border + height - 1 - r).
  for (int r = 0; r < border; ++r) {
    const T* src_ptr = mat->ptr<T>(border + height - 1 - std::min(max_h, r)) +
                       border * channels;
    T* dst_ptr = mat->ptr<T>(border + height + r) + border * channels;
    memcpy(dst_ptr, src_ptr, width * channels * sizeof(dst_ptr[0]));
  }
}
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_IMAGE_UTIL_H_

View File

@ -0,0 +1,93 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/image_util.h"
#include "mediapipe/framework/port/gtest.h"
namespace mediapipe {
// Helper: fills the interior of `full_size` (CV_32FC3) with per-pixel values
// derived from the column index, runs CopyMatBorder<float, border, 3>, and
// verifies both that the interior is untouched and that every border pixel
// holds the expected clamped/reflected value.
template <int border>
void TestCopyBorder(cv::Mat* full_size) {
  full_size->setTo(0);
  const int width = full_size->cols - 2 * border;
  // Fill interior with per-pixel value (x - border) + {0.1, 0.2, 0.3} per
  // channel, so each column is uniquely identifiable.
  for (int y = border; y < full_size->rows - border; ++y) {
    float* row_ptr = full_size->ptr<float>(y);
    for (int x = border; x < full_size->cols - border; ++x) {
      int value = x - border;
      row_ptr[3 * x] = value + 0.1f;
      row_ptr[3 * x + 1] = value + 0.2f;
      row_ptr[3 * x + 2] = value + 0.3f;
    }
  }
  CopyMatBorder<float, border, 3>(full_size);
  // Main part should not be modified.
  for (int y = border; y < full_size->rows - border; ++y) {
    float* row_ptr = full_size->ptr<float>(y);
    for (int x = border; x < full_size->cols - border; ++x) {
      int value = x - border;
      EXPECT_EQ(row_ptr[3 * x], value + 0.1f);
      EXPECT_EQ(row_ptr[3 * x + 1], value + 0.2f);
      EXPECT_EQ(row_ptr[3 * x + 2], value + 0.3f);
    }
  }
  // Check all pixels, including the border, against the expected
  // column-derived value.
  for (int y = 0; y < full_size->rows; ++y) {
    const float* row_ptr = full_size->ptr<float>(y);
    for (int x = 0; x < full_size->cols; ++x) {
      // Memory copy, expect exact floating point equality.
      int value = x - border;
      if (x < border) {
        // Cap to valid area.
        value = std::min(width - 1, (border - 1) - x);
      } else if (x >= full_size->cols - border) {
        // Last column minus distance from frame boundary, capped
        // to valid area.
        value = std::max(0, width - 1 - (x - full_size->cols + border));
      }
      EXPECT_EQ(row_ptr[3 * x], value + 0.1f);
      EXPECT_EQ(row_ptr[3 * x + 1], value + 0.2f);
      EXPECT_EQ(row_ptr[3 * x + 2], value + 0.3f);
    }
  }
}
// Border copy on a regular-sized 3-channel float frame, for border widths
// 1 through 5 (interior always much larger than the border).
TEST(PushPullFilteringTest, CopyBorder) {
  cv::Mat full_size(100, 50, CV_32FC3);
  TestCopyBorder<1>(&full_size);
  TestCopyBorder<2>(&full_size);
  TestCopyBorder<3>(&full_size);
  TestCopyBorder<4>(&full_size);
  TestCopyBorder<5>(&full_size);
}
// Degenerate case: frame dimension equals 2 * border + 1, i.e. the interior
// is a single row/column, so the capping logic in the border copy is hit.
TEST(PushPullFilteringTest, CopyBorderSmallFrame) {
  cv::Mat full_size(3, 3, CV_32FC3);
  TestCopyBorder<1>(&full_size);

  full_size.create(5, 5, CV_32FC3);
  TestCopyBorder<2>(&full_size);

  full_size.create(7, 7, CV_32FC3);
  TestCopyBorder<3>(&full_size);

  full_size.create(9, 9, CV_32FC3);
  TestCopyBorder<4>(&full_size);
}
} // namespace mediapipe

View File

@ -0,0 +1,22 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/measure_time.h"

// Global toggle read by the MEASURE_TIME macro (declared extern in
// measure_time.h). Enabled only when SET_FLAG_MEASURE_TIME is defined at
// build time, e.g. --copt=-DSET_FLAG_MEASURE_TIME.
#ifdef SET_FLAG_MEASURE_TIME
// If set to true, outputs time measurements to LOG(INFO).
bool flags_measure_time = true;
#else
// Disabled by default: MEASURE_TIME statements become no-ops.
bool flags_measure_time = false;
#endif

View File

@ -0,0 +1,169 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Helper class and macro to take time measurements within current scope.
// Takes time measurement within current scope. Outputs to LOG(INFO) if
// flag --measure_time is set or if build flag SET_FLAG_MEASURE_TIME is
// defined (add --copt=-DSET_FLAG_MEASURE_TIME to your build command).
// Additionally you can limit time measurements to specific files,
// via the flag
// --measure_time_filter="<comma separated list of substrings of file names>"
// Example:
// { // Scope to be measured
// MEASURE_TIME << "Some additional logging and answers : " << 42;
// ... // instructions.
// }
#ifndef MEDIAPIPE_UTIL_TRACKING_MEASURE_TIME_H_
#define MEDIAPIPE_UTIL_TRACKING_MEASURE_TIME_H_
#include <memory>
#include <sstream>
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/logging.h"
// Defined in measure_time.cc; true when timing output is enabled at build
// time via SET_FLAG_MEASURE_TIME.
extern bool flags_measure_time;

namespace mediapipe {
// Forward declaration; full definition below.
class MeasureTimeFilter;
}  // namespace mediapipe

// Measures wall time of the current scope and logs it, together with any
// message streamed into the macro, when flags_measure_time is set.
#define MEASURE_TIME \
  MEASURE_TIME_PRE_IMPL(flags_measure_time, __LINE__, __FILE__)

// Add level of indirection to trigger __LINE__ macro expansion.
#define MEASURE_TIME_PRE_IMPL(show_output, line, file) \
  MEASURE_TIME_IMPL(show_output, line, file)

// Creates one static Accumulator per call site (intentionally never freed,
// so averages persist across invocations) and a ScopedWallTimer that logs
// on destruction; the trailing if-statement routes any `<< ...` message
// into the timer's buffering stream.
#define MEASURE_TIME_IMPL(show_output, line, file)                      \
  std::unique_ptr<mediapipe::ScopedWallTimer> scoped_wall_timer_##line; \
  const bool activated##line = show_output;                             \
  if (activated##line) {                                                \
    static mediapipe::ScopedWallTimer::Accumulator*                     \
        scoped_wall_timer_accum_##line =                                \
            new mediapipe::ScopedWallTimer::Accumulator;                \
    scoped_wall_timer_##line.reset(new mediapipe::ScopedWallTimer(      \
        file, line, show_output, scoped_wall_timer_accum_##line));      \
  }                                                                     \
  if (activated##line) /* NOLINT */                                     \
  scoped_wall_timer_##line->stream()
namespace mediapipe {
// Measures wall time from construction to destruction and logs the elapsed
// time plus a per-call-site running average (kept in an Accumulator shared
// across invocations). Instantiated via the MEASURE_TIME macro above.
class ScopedWallTimer {
 public:
  // Helper class for accumulating time across multiple calls to a scoped wall
  // timer. Thread-safe (except on ANDROID, for which Mutex is mocked out).
  class Accumulator {
   public:
    Accumulator() : accum_time_(0.0f), count_(0) {}
    Accumulator(const Accumulator&) = delete;
    Accumulator& operator=(const Accumulator&) = delete;

    // Accumulates passed_time into accumulators. Returns total time and number
    // of calls made.
    void Accumulate(double passed_time, double* accum_time, int* count) {
      absl::MutexLock lock(&mutex_);
      accum_time_ += passed_time;
      ++count_;
      *accum_time = accum_time_;
      *count = count_;
    }

   private:
    double accum_time_;  // Total accumulated time, in nanoseconds.
    int count_;          // Number of measurements accumulated.
    absl::Mutex mutex_;
  };

  // Creates a new ScopedWallTimer for current file and line. LogMessage is only
  // initialized if show_output is set to true.
  ScopedWallTimer(const char* file, int line, bool show_output,
                  Accumulator* accumulator)
      : file_(file),
        line_(line),
        show_output_(show_output),
        accumulator_(accumulator) {
    if (show_output_) {
      CHECK(accumulator_);
      // start_time_ is only sampled (and later read) when output is on.
      start_time_ = GetWallTime();
    }
  }

  ScopedWallTimer(const ScopedWallTimer&) = delete;
  ScopedWallTimer& operator=(const ScopedWallTimer&) = delete;

  // Destructor measures time and outputs to stream.
  ~ScopedWallTimer() {
    if (show_output_) {
      double passed_time = GetWallTime() - start_time_;
      double accum_time = 0.0;
      int count = 0;
      accumulator_->Accumulate(passed_time, &accum_time, &count);
      // Times are in nanoseconds; * 1e-6 converts to milliseconds.
      LOG(INFO) << stream_.str() << " TIMES: [Curr: " << passed_time * 1e-6
                << " ms, "
                << "Avg: " << accum_time * 1e-6 / std::max(1, count) << " ms, "
                << count << " calls]";
    }
  }

  // Stream callers use to append contextual text to the logged line.
  std::ostream& stream() { return stream_; }

 private:
  const char* file_;  // Call-site file (not currently printed).
  int line_;          // Call-site line (not currently printed).
  bool show_output_;

  // We need to buffer information passed via stream operator <<
  // While LogMessage is adequate for this, no good equivalent exists on
  // Android, so we employ a portable ostringstream for buffering.
  std::ostringstream stream_;

  int64 start_time_;  // Wall time at construction, in nanoseconds.

  Accumulator* accumulator_;  // Not owned (never deleted here).

  // Returns current wall time in nanoseconds.
  int64 GetWallTime() { return absl::GetCurrentTimeNanos(); }
};
// Substring filter over file names, used to restrict which call sites emit
// timing output. NOTE(review): the singleton returned by get() is
// default-constructed, so its pattern list is empty and Matches() always
// returns false; the string-taking constructor is currently unreachable.
class MeasureTimeFilter {
 public:
  // Returns the process-wide singleton instance (never destroyed).
  static const MeasureTimeFilter* get() {
    static MeasureTimeFilter instance;
    return &instance;
  }

  MeasureTimeFilter(const MeasureTimeFilter&) = delete;
  MeasureTimeFilter& operator=(const MeasureTimeFilter&) = delete;

  // True iff any configured pattern occurs as a substring of item.
  bool Matches(const std::string& item) const {
    for (auto pattern = match_items_.begin(); pattern != match_items_.end();
         ++pattern) {
      if (item.find(*pattern) != std::string::npos) {
        return true;
      }
    }
    return false;
  }

 private:
  explicit MeasureTimeFilter() {}

  // Builds the filter from a comma-separated list of substrings.
  explicit MeasureTimeFilter(const std::string& filter) {
    match_items_ = absl::StrSplit(filter, absl::ByChar(','));
  }

  std::vector<std::string> match_items_;
};
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_MEASURE_TIME_H_

View File

@ -0,0 +1,850 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/motion_analysis.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include <deque>
#include <memory>
#include "absl/strings/str_format.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/vector.h"
#include "mediapipe/util/tracking/camera_motion.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/image_util.h"
#include "mediapipe/util/tracking/measure_time.h"
#include "mediapipe/util/tracking/motion_saliency.pb.h"
#include "mediapipe/util/tracking/region_flow.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
#include "mediapipe/util/tracking/region_flow_computation.h"
#include "mediapipe/util/tracking/region_flow_computation.pb.h"
#include "mediapipe/util/tracking/region_flow_visualization.h"
namespace mediapipe {
// Builds the analysis pipeline for frames of frame_width x frame_height:
// applies the selected analysis policy to the options, then instantiates
// region flow computation, motion estimation, optional saliency, and the
// streaming buffer used to stage per-frame results.
MotionAnalysis::MotionAnalysis(const MotionAnalysisOptions& options,
                               int frame_width, int frame_height)
    : options_(options),
      frame_width_(frame_width),
      frame_height_(frame_height) {
  // Init options by policy.
  InitPolicyOptions();

  // Merge back in any overridden options (explicit user settings win over
  // policy defaults).
  options_.MergeFrom(options);

  region_flow_computation_.reset(new RegionFlowComputation(
      options_.flow_options(), frame_width_, frame_height_));

  motion_estimation_.reset(new MotionEstimation(options_.motion_options(),
                                                frame_width_, frame_height_));

  if (options_.compute_motion_saliency()) {
    motion_saliency_.reset(new MotionSaliency(options_.saliency_options(),
                                              frame_width_, frame_height_));

    // Compute overlap needed between clips if filtering or smoothing of
    // saliency is requested.
    if (options_.select_saliency_inliers()) {
      // 1.65 sigmas in each direction = 90% of variance is captured.
      overlap_size_ = std::max<int>(
          overlap_size_, options_.saliency_options().selection_frame_radius());
    }

    if (options_.filter_saliency()) {
      overlap_size_ = std::max<int>(
          overlap_size_,
          options_.saliency_options().filtering_sigma_time() * 1.65f);
    }
  }

  long_feature_stream_.reset(new LongFeatureStream);

  frame_num_ = 0;

  // Determine if feature descriptors need to be computed.
  // Required for irls smoothing, overlay detection and mixture homographies.
  const bool compute_mixtures =
      options_.motion_options().mix_homography_estimation() !=
      MotionEstimationOptions::ESTIMATION_HOMOG_MIX_NONE;
  const bool use_spatial_bias =
      options_.motion_options().estimation_policy() ==
          MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS &&
      options_.motion_options().long_feature_bias_options().use_spatial_bias();
  compute_feature_descriptors_ =
      options_.post_irls_smoothing() ||
      options_.motion_options().overlay_detection() || compute_mixtures ||
      use_spatial_bias;

  if (compute_feature_descriptors_) {
    CHECK_EQ(RegionFlowComputationOptions::FORMAT_RGB,
             options_.flow_options().image_format())
        << "Feature descriptors only support RGB currently.";
    // Previous frame is cached so match descriptors can be computed against
    // it in AddFrameGeneric.
    prev_frame_.reset(new cv::Mat(frame_height_, frame_width_, CV_8UC3));
  }

  // Setup streaming buffer. By default we buffer features and motion.
  // If saliency is computed, also buffer saliency and filtered/smoothed
  // output_saliency.
  std::vector<TaggedType> data_config{
      TaggedPointerType<RegionFlowFeatureList>("features"),
      TaggedPointerType<CameraMotion>("motion")};

  std::vector<TaggedType> data_config_saliency = data_config;
  data_config_saliency.push_back(
      TaggedPointerType<SalientPointFrame>("saliency"));
  data_config_saliency.push_back(
      TaggedPointerType<SalientPointFrame>("output_saliency"));

  // Store twice the overlap.
  buffer_.reset(new StreamingBuffer(
      options_.compute_motion_saliency() ? data_config_saliency : data_config,
      2 * overlap_size_));
}
// Overwrites options_ with the preset implied by the selected analysis
// policy. Called from the constructor before explicit user options are
// merged back in, so user-set fields still take precedence.
void MotionAnalysis::InitPolicyOptions() {
  auto* flow_options = options_.mutable_flow_options();
  auto* tracking_options = flow_options->mutable_tracking_options();
  auto* motion_options = options_.mutable_motion_options();
  auto* feature_bias_options =
      motion_options->mutable_long_feature_bias_options();

  auto* translation_bounds =
      motion_options->mutable_stable_translation_bounds();
  auto* similarity_bounds = motion_options->mutable_stable_similarity_bounds();
  auto* homography_bounds = motion_options->mutable_stable_homography_bounds();

  switch (options_.analysis_policy()) {
    case MotionAnalysisOptions::ANALYSIS_POLICY_LEGACY:
      // Legacy: keep the proto defaults untouched.
      break;

    case MotionAnalysisOptions::ANALYSIS_POLICY_VIDEO:
      // Offline video: quality-oriented settings.
      // Long track settings. Temporally consistent.
      options_.set_estimation_clip_size(64);
      tracking_options->set_internal_tracking_direction(
          TrackingOptions::FORWARD);
      tracking_options->set_tracking_policy(
          TrackingOptions::POLICY_LONG_TRACKS);
      feature_bias_options->set_use_spatial_bias(false);
      feature_bias_options->set_seed_priors_from_bias(true);
      similarity_bounds->set_min_inlier_fraction(0.15f);
      homography_bounds->set_min_inlier_coverage(0.25f);
      flow_options->set_verify_long_features(false);

      // Better features.
      tracking_options->set_adaptive_features_levels(3);
      tracking_options->set_use_cv_tracking_algorithm(true);

      // Speed.
      flow_options->set_downsample_mode(
          RegionFlowComputationOptions::DOWNSAMPLE_BY_SCHEDULE);

      motion_options->set_estimation_policy(
          MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS);
      motion_options->set_feature_density_normalization(true);
      motion_options->set_overlay_detection(true);
      motion_options->set_domain_limited_irls_scaling(true);

      motion_options->set_irls_weights_preinitialized(true);
      motion_options->mutable_irls_initialization()->set_activated(true);
      motion_options->mutable_long_feature_initialization()->set_activated(
          true);
      break;

    case MotionAnalysisOptions::ANALYSIS_POLICY_VIDEO_MOBILE:
      // On-device video: same long-track approach, tuned down for speed.
      // Long track settings. Temporally consistent.
      options_.set_estimation_clip_size(32);
      tracking_options->set_internal_tracking_direction(
          TrackingOptions::FORWARD);
      tracking_options->set_tracking_policy(
          TrackingOptions::POLICY_LONG_TRACKS);

      motion_options->set_estimation_policy(
          MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS);
      motion_options->set_feature_density_normalization(true);
      motion_options->set_domain_limited_irls_scaling(true);
      motion_options->mutable_irls_initialization()->set_activated(true);
      motion_options->mutable_long_feature_initialization()->set_activated(
          true);

      feature_bias_options->set_use_spatial_bias(false);
      feature_bias_options->set_seed_priors_from_bias(true);
      similarity_bounds->set_inlier_threshold(2.0);
      similarity_bounds->set_min_inlier_fraction(0.15f);

      // Speed for mobile.
      tracking_options->set_tracking_window_size(8);
      tracking_options->set_tracking_iterations(8);
      tracking_options->set_fractional_tracking_distance(0.1);
      tracking_options->set_reuse_features_max_frame_distance(15);
      tracking_options->set_max_features(500);
      tracking_options->set_use_cv_tracking_algorithm(true);

      flow_options->set_downsample_mode(
          RegionFlowComputationOptions::DOWNSAMPLE_TO_MIN_SIZE);
      flow_options->set_round_downsample_factor(true);
      flow_options->set_downsampling_size(256);
      flow_options->set_pre_blur_sigma(0);
      flow_options->set_top_inlier_sets(1);
      flow_options->set_ransac_rounds_per_region(10);
      flow_options->mutable_visual_consistency_options()
          ->set_compute_consistency(false);
      flow_options->set_verify_long_features(false);

      // Avoid resetting features if a large amount is flagged.
      // In that case verify feature tracks to avoid pertubing the estimation.
      flow_options->set_verify_long_feature_acceleration(true);
      flow_options->set_verification_distance(5.0);
      flow_options->set_verify_long_feature_trigger_ratio(0.1);

      motion_options->set_use_exact_homography_estimation(false);
      motion_options->set_use_highest_accuracy_for_normal_equations(false);
      break;

    case MotionAnalysisOptions::ANALYSIS_POLICY_CAMERA_MOBILE:
      // Live camera on device: lowest latency (clip size 1).
      // Long track settings.
      tracking_options->set_internal_tracking_direction(
          TrackingOptions::FORWARD);
      tracking_options->set_tracking_policy(
          TrackingOptions::POLICY_LONG_TRACKS);

      motion_options->set_estimation_policy(
          MotionEstimationOptions::TEMPORAL_IRLS_MASK);
      motion_options->set_feature_density_normalization(true);
      motion_options->set_domain_limited_irls_scaling(true);
      motion_options->mutable_irls_initialization()->set_activated(true);
      motion_options->mutable_irls_initialization()->set_rounds(50);

      feature_bias_options->set_use_spatial_bias(false);
      feature_bias_options->set_seed_priors_from_bias(true);
      similarity_bounds->set_inlier_threshold(2.0);
      similarity_bounds->set_min_inlier_fraction(0.15f);

      // Speed for mobile.
      tracking_options->set_tracking_window_size(8);
      tracking_options->set_tracking_iterations(8);
      tracking_options->set_fractional_tracking_distance(0.1);
      tracking_options->set_reuse_features_max_frame_distance(10);
      tracking_options->set_reuse_features_min_survived_frac(0.6);
      tracking_options->set_max_features(240);
      tracking_options->set_adaptive_tracking_distance(true);
      tracking_options->set_use_cv_tracking_algorithm(true);

      // Assumes downsampled input.
      flow_options->set_pre_blur_sigma(0);
      flow_options->set_top_inlier_sets(1);
      flow_options->set_ransac_rounds_per_region(10);
      flow_options->mutable_visual_consistency_options()
          ->set_compute_consistency(false);
      flow_options->set_verify_long_features(false);

      motion_options->set_use_exact_homography_estimation(false);
      motion_options->set_use_highest_accuracy_for_normal_equations(false);

      // Low latency.
      options_.set_estimation_clip_size(1);
      break;

    case MotionAnalysisOptions::ANALYSIS_POLICY_HYPERLAPSE:
      // Hyperlapse second pass: relaxed stability bounds, expects large
      // inter-frame motion; overlay detection already done in first pass.
      // Long track settings. Temporally consistent.
      options_.set_estimation_clip_size(64);
      tracking_options->set_internal_tracking_direction(
          TrackingOptions::FORWARD);
      tracking_options->set_tracking_policy(
          TrackingOptions::POLICY_LONG_TRACKS);
      feature_bias_options->set_use_spatial_bias(false);
      feature_bias_options->set_seed_priors_from_bias(true);

      // Relaxed stability bounds.
      translation_bounds->set_max_motion_stdev(0.15);
      translation_bounds->set_max_acceleration(60.0);
      translation_bounds->set_frac_max_motion_magnitude(0.25);
      similarity_bounds->set_min_inlier_fraction(0.02f);
      similarity_bounds->set_min_inliers(5);
      similarity_bounds->set_lower_scale(0.5);
      similarity_bounds->set_upper_scale(2.0);
      similarity_bounds->set_limit_rotation(0.5);
      homography_bounds->set_lower_scale(0.5);
      homography_bounds->set_upper_scale(2.0);
      homography_bounds->set_limit_rotation(0.5);
      homography_bounds->set_limit_perspective(0.004);
      homography_bounds->set_min_inlier_coverage(0.1);

      // Relax constraining of homographies.
      motion_options->set_lin_sim_inlier_threshold(0.02);

      // Anticipate much higher registration errors.
      motion_options->set_irls_motion_magnitude_fraction(0.2);

      flow_options->set_verify_long_features(false);

      // Better features.
      tracking_options->set_adaptive_features_levels(3);

      // Speed.
      tracking_options->set_use_cv_tracking_algorithm(true);
      flow_options->set_downsample_mode(
          RegionFlowComputationOptions::DOWNSAMPLE_BY_SCHEDULE);

      // Less quality features.
      flow_options->set_absolute_inlier_error_threshold(4);
      flow_options->set_relative_inlier_error_threshold(0.2);

      motion_options->set_estimation_policy(
          MotionEstimationOptions::TEMPORAL_LONG_FEATURE_BIAS);
      motion_options->set_feature_density_normalization(true);
      // TODO: This needs to be activated and then tighten relaxed
      // thresholds above again.
      // motion_options->set_domain_limited_irls_scaling(true);

      motion_options->set_irls_weights_preinitialized(true);
      motion_options->mutable_irls_initialization()->set_activated(true);

      // More weight towards long feature tracks.
      motion_options->mutable_long_feature_initialization()->set_activated(
          true);
      motion_options->mutable_long_feature_initialization()
          ->set_min_length_percentile(0.975);
      motion_options->mutable_long_feature_initialization()
          ->set_upweight_multiplier(50);

      // 2nd pass, no need to do overlay again.
      motion_options->set_overlay_detection(false);
      break;
  }
}
// Adds a frame with no motion seed (identity homography). Returns false on
// region flow computation failure. See AddFrameWithSeed.
bool MotionAnalysis::AddFrame(const cv::Mat& frame, int64 timestamp_usec,
                              RegionFlowFeatureList* feature_list) {
  return AddFrameWithSeed(frame, timestamp_usec, Homography(), feature_list);
}
// Adds a frame with an initial transform used to seed tracking; delegates
// to AddFrameGeneric with no rejection transform, no external features and
// no feature-modification callback.
bool MotionAnalysis::AddFrameWithSeed(const cv::Mat& frame,
                                      int64 timestamp_usec,
                                      const Homography& initial_transform,
                                      RegionFlowFeatureList* feature_list) {
  return AddFrameGeneric(frame, timestamp_usec, initial_transform,
                         nullptr,  // rejection_transform
                         nullptr,  // external features
                         nullptr,  // feature modification function
                         feature_list);
}
// Core per-frame path: computes region flow for the frame, retrieves the
// feature list (with descriptors when needed), optionally merges external
// features, filters by a rejection transform, and buffers the result.
// Returns false if region flow computation or feature retrieval fails.
// output_feature_list (optional) receives a copy of the features BEFORE
// modify_features is applied.
bool MotionAnalysis::AddFrameGeneric(
    const cv::Mat& frame, int64 timestamp_usec,
    const Homography& initial_transform, const Homography* rejection_transform,
    const RegionFlowFeatureList* external_features,
    std::function<void(RegionFlowFeatureList*)>* modify_features,
    RegionFlowFeatureList* output_feature_list) {
  // Don't check input sizes here, RegionFlowComputation does that based
  // on its internal options.
  CHECK(feature_computation_) << "Calls to AddFrame* can NOT be mixed "
                              << "with AddFeatures";

  // Compute RegionFlow.
  {
    MEASURE_TIME << "CALL RegionFlowComputation::AddImage";
    if (!region_flow_computation_->AddImageWithSeed(frame, timestamp_usec,
                                                    initial_transform)) {
      LOG(ERROR) << "Error while computing region flow.";
      return false;
    }
  }

  // Buffer features.
  std::unique_ptr<RegionFlowFeatureList> feature_list;
  {
    MEASURE_TIME << "CALL RegionFlowComputation::RetrieveRegionFlowFeatureList";
    // Match descriptors need a previous frame, hence frame_num_ > 0.
    const bool compute_feature_match_descriptors =
        compute_feature_descriptors_ && frame_num_ > 0;

    // Determine which region flow should be computed in case multi frame
    // tracking has been requested.
    int max_track_index = 0;
    if (options_.flow_options().tracking_options().tracking_policy() ==
        TrackingOptions::POLICY_MULTI_FRAME) {
      max_track_index = std::min(
          options_.flow_options().tracking_options().multi_frames_to_track() -
              1,
          options_.track_index());
    }

    feature_list.reset(
        region_flow_computation_->RetrieveMultiRegionFlowFeatureList(
            std::min(std::max(0, frame_num_ - 1), max_track_index),
            compute_feature_descriptors_, compute_feature_match_descriptors,
            &frame,
            compute_feature_match_descriptors ? prev_frame_.get() : nullptr));

    if (feature_list == nullptr) {
      LOG(ERROR) << "Error retrieving feature list.";
      return false;
    }
  }

  if (external_features) {
    // Offset external track ids to keep them disjoint from internal ones.
    constexpr int kTrackIdShift = 1 << 20;
    for (const auto& external_feat : external_features->feature()) {
      auto* new_feat = feature_list->add_feature();
      *new_feat = external_feat;
      if (new_feat->track_id() >= 0) {
        new_feat->set_track_id(new_feat->track_id() + kTrackIdShift);
      }
    }
  }

  if (rejection_transform) {
    // Keep only features whose match agrees with the rejection transform
    // within rejection_transform_threshold.
    RegionFlowFeatureList tmp_list;
    tmp_list.mutable_feature()->Swap(feature_list->mutable_feature());
    for (const auto& feature : tmp_list.feature()) {
      const Vector2_f diff =
          TransformPoint(*rejection_transform, FeatureLocation(feature)) -
          FeatureMatchLocation(feature);
      if (diff.Norm() < options_.rejection_transform_threshold()) {
        *feature_list->add_feature() = feature;
      }
    }
  }

  if (output_feature_list) {
    *output_feature_list = *feature_list;
  }

  if (modify_features) {
    (*modify_features)(feature_list.get());
  }

  buffer_->EmplaceDatum("features", feature_list.release());

  // Store frame for next call.
  if (compute_feature_descriptors_) {
    frame.copyTo(*prev_frame_);
  }

  ++frame_num_;
  return true;
}
// Enqueues externally computed features directly, bypassing internal region
// flow computation. Disables the AddFrame* path for this instance.
void MotionAnalysis::AddFeatures(const RegionFlowFeatureList& features) {
  feature_computation_ = false;
  buffer_->EmplaceDatum("features", new RegionFlowFeatureList(features));
  ++frame_num_;
}
// Enqueues a precomputed (features, motion) pair, skipping both region flow
// and motion estimation. Requires features and motions to stay in lockstep,
// so it can not be mixed with the other Add* calls.
void MotionAnalysis::EnqueueFeaturesAndMotions(
    const RegionFlowFeatureList& features, const CameraMotion& motion) {
  feature_computation_ = false;
  CHECK(buffer_->HaveEqualSize({"motion", "features"}))
      << "Can not be mixed with other Add* calls";
  buffer_->EmplaceDatum("features", new RegionFlowFeatureList(features));
  buffer_->EmplaceDatum("motion", new CameraMotion(motion));
}
// Forwards to RegionFlowComputation: grayscale frame of the latest results.
cv::Mat MotionAnalysis::GetGrayscaleFrameFromResults() {
  return region_flow_computation_->GetGrayscaleFrameFromResults();
}
// Runs motion estimation (and optionally saliency) over newly buffered
// feature lists, then outputs finished frames. Unless flush is set, waits
// until at least estimation_clip_size() new frames are buffered and returns
// 0 otherwise. Returns the number of frames output.
// saliency must be non-null iff compute_motion_saliency() is set.
int MotionAnalysis::GetResults(
    bool flush, std::vector<std::unique_ptr<RegionFlowFeatureList>>* features,
    std::vector<std::unique_ptr<CameraMotion>>* camera_motion,
    std::vector<std::unique_ptr<SalientPointFrame>>* saliency) {
  MEASURE_TIME << "GetResults";
  const int num_features_lists = buffer_->BufferSize("features");
  // Frames past overlap_start_ have not been processed yet.
  const int num_new_feature_lists = num_features_lists - overlap_start_;
  CHECK_GE(num_new_feature_lists, 0);

  if (!flush && num_new_feature_lists < options_.estimation_clip_size()) {
    // Nothing to compute, return.
    return 0;
  }

  const bool compute_saliency = options_.compute_motion_saliency();
  CHECK_EQ(compute_saliency, saliency != nullptr)
      << "Computing saliency requires saliency output and vice versa";

  // Estimate motions for newly buffered RegionFlowFeatureLists, which also
  // computes IRLS feature weights for foreground estimation, if needed
  // (otherwise could be externally added).
  const int num_motions_to_compute =
      buffer_->BufferSize("features") - buffer_->BufferSize("motion");
  if (num_motions_to_compute > 0) {
    std::vector<CameraMotion> camera_motions;
    std::vector<RegionFlowFeatureList*> feature_lists;
    for (int k = overlap_start_; k < num_features_lists; ++k) {
      feature_lists.push_back(
          buffer_->GetMutableDatum<RegionFlowFeatureList>("features", k));
    }

    // TODO: Result should be vector of unique_ptr.
    motion_estimation_->EstimateMotionsParallel(
        options_.post_irls_smoothing(), &feature_lists, &camera_motions);

    // Add solution to buffer.
    for (const auto& motion : camera_motions) {
      buffer_->EmplaceDatum("motion", new CameraMotion(motion));
    }
  }

  CHECK(buffer_->HaveEqualSize({"features", "motion"}));

  if (compute_saliency) {
    ComputeSaliency();
  }

  return OutputResults(flush, features, camera_motion, saliency);
}
// Moves finished frames out of the streaming buffer into the output
// vectors, keeping overlap_size_ trailing frames buffered (unless flushing)
// so the next chunk can be filtered consistently. Frames inside the
// retained overlap are output as copies; all earlier frames transfer
// ownership. Returns the number of frames output.
int MotionAnalysis::OutputResults(
    bool flush, std::vector<std::unique_ptr<RegionFlowFeatureList>>* features,
    std::vector<std::unique_ptr<CameraMotion>>* camera_motion,
    std::vector<std::unique_ptr<SalientPointFrame>>* saliency) {
  const bool compute_saliency = options_.compute_motion_saliency();
  CHECK_EQ(compute_saliency, saliency != nullptr)
      << "Computing saliency requires saliency output and vice versa";
  CHECK(buffer_->HaveEqualSize({"features", "motion"}));

  // Discard prev. overlap (already output, just used for filtering here).
  buffer_->DiscardData(buffer_->AllTags(), prev_overlap_start_);
  prev_overlap_start_ = 0;

  // Output only frames not part of the overlap.
  const int num_output_frames =
      std::max(0, buffer_->MaxBufferSize() - (flush ? 0 : overlap_size_));

  if (features) {
    features->reserve(num_output_frames);
  }
  if (camera_motion) {
    camera_motion->reserve(num_output_frames);
  }
  if (saliency) {
    saliency->reserve(num_output_frames);
  }

  // Need to retain twice the overlap, for frames in that segment we need
  // to create a copy.
  const int new_overlap_start =
      std::max(0, num_output_frames - (flush ? 0 : overlap_size_));

  // Populate output.
  for (int k = 0; k < num_output_frames; ++k) {
    std::unique_ptr<RegionFlowFeatureList> out_features;
    std::unique_ptr<CameraMotion> out_motion;
    std::unique_ptr<SalientPointFrame> out_saliency;
    if (k >= new_overlap_start) {
      // Create copy.
      out_features.reset(new RegionFlowFeatureList(
          *buffer_->GetDatum<RegionFlowFeatureList>("features", k)));
      out_motion.reset(
          new CameraMotion(*buffer_->GetDatum<CameraMotion>("motion", k)));
    } else {
      // Release datum.
      out_features =
          buffer_->ReleaseDatum<RegionFlowFeatureList>("features", k);
      out_motion = buffer_->ReleaseDatum<CameraMotion>("motion", k);
    }

    // output_saliency is temporary so we never need to buffer it.
    if (compute_saliency) {
      out_saliency =
          buffer_->ReleaseDatum<SalientPointFrame>("output_saliency", k);
    }

    if (options_.subtract_camera_motion_from_features()) {
      std::vector<RegionFlowFeatureList*> feature_view{out_features.get()};
      SubtractCameraMotionFromFeatures({*out_motion}, &feature_view);
    }

    if (features != nullptr) {
      features->push_back(std::move(out_features));
    }
    if (camera_motion != nullptr) {
      camera_motion->push_back(std::move(out_motion));
    }
    if (saliency != nullptr) {
      saliency->push_back(std::move(out_saliency));
    }
  }

  // Reset for next chunk.
  prev_overlap_start_ = num_output_frames - new_overlap_start;
  CHECK_GE(prev_overlap_start_, 0);

  CHECK(buffer_->TruncateBuffer(flush));
  overlap_start_ = buffer_->MaxBufferSize();
  return num_output_frames;
}
// Renders debug visualizations into rendered_results (which must match the
// analysis frame dimensions): feature tracks, salient points, the
// blur-analysis region and a textual stats HUD, each gated by the
// corresponding visualization option. saliency may be null.
// Compiled to LOG(FATAL) when NO_RENDERING is defined.
void MotionAnalysis::RenderResults(const RegionFlowFeatureList& feature_list,
                                   const CameraMotion& motion,
                                   const SalientPointFrame* saliency,
                                   cv::Mat* rendered_results) {
#ifndef NO_RENDERING
  CHECK(rendered_results != nullptr);
  CHECK_EQ(frame_width_, rendered_results->cols);
  CHECK_EQ(frame_height_, rendered_results->rows);

  // Bind by reference: `auto` here copied the whole VisualizationOptions
  // proto on every rendered frame.
  const auto& viz_options = options_.visualization_options();

  // Visualize flow features if requested.
  if (viz_options.visualize_region_flow_features()) {
    cv::Scalar inlier_color(0, 255, 0);
    cv::Scalar outlier_color(255, 0, 0);
    if (feature_list.long_tracks()) {
      long_feature_stream_->AddFeatures(feature_list,
                                        true,   // Check connectivity.
                                        true);  // Purge non present ones.
      VisualizeLongFeatureStream(
          *long_feature_stream_, inlier_color, outlier_color,
          viz_options.min_long_feature_track(),
          viz_options.max_long_feature_points(), 1.0f, 1.0f, rendered_results);
    } else {
      VisualizeRegionFlowFeatures(feature_list, inlier_color, outlier_color,
                                  true, 1.0f, 1.0f, rendered_results);
    }
  }

  if (saliency != nullptr && viz_options.visualize_salient_points()) {
    static const cv::Scalar kColor(255, 0, 0);
    RenderSaliency(*saliency, kColor, viz_options.line_thickness(), false,
                   rendered_results);
  }

  if (viz_options.visualize_blur_analysis_region()) {
    VisualizeBlurAnalysisRegions(rendered_results);
  }

  if (viz_options.visualize_stats()) {
    // Output general stats.
    std::string hud_text = absl::StrFormat(
        "H-cvg %.2f | H-err %4.2f | Avg.t %3.1f | dx: %+2.1f dy: %+2.1f "
        "Feat# %4d | %s | ",
        motion.homography_inlier_coverage(), motion.average_homography_error(),
        motion.average_magnitude(), motion.translation().dx(),
        motion.translation().dy(), feature_list.feature_size(),
        CameraMotionTypeToString(motion));
    hud_text += CameraMotionFlagToString(motion);

    // Text size scales with frame width so the HUD stays readable.
    const float text_scale = frame_width_ * 5.e-4;
    cv::putText(*rendered_results, hud_text,
                cv::Point(0.02 * frame_width_, 0.975 * frame_height_),
                cv::FONT_HERSHEY_SIMPLEX, text_scale, cv::Scalar::all(255),
                text_scale * 3, CV_AA);

    // Timestamp (ms) in the top-right corner.
    cv::putText(*rendered_results,
                absl::StrFormat("%6d", motion.timestamp_usec() / 1000),
                cv::Point(0.9 * frame_width_, 0.05 * frame_height_),
                cv::FONT_HERSHEY_SIMPLEX, text_scale, cv::Scalar::all(255),
                text_scale * 3, CV_AA);
  }
#else
  LOG(FATAL) << "Code stripped out because of NO_RENDERING";
#endif
}
// Interpolates sparse per-feature foreground weights into a dense 8-bit
// mask (0..255) of the frame size using push-pull filtering. Lazily creates
// the push-pull filter on first use.
void MotionAnalysis::ComputeDenseForeground(
    const RegionFlowFeatureList& feature_list,
    const CameraMotion& camera_motion, cv::Mat* foreground_mask) {
  const auto& foreground_options = options_.foreground_options();
  if (foreground_push_pull_ == nullptr) {
    foreground_push_pull_.reset(
        new PushPullFilteringC1(cv::Size(frame_width_, frame_height_),
                                PushPullFilteringC1::BINOMIAL_5X5, false,
                                nullptr,    // Gaussian filter weights only.
                                nullptr,    // No mip map visualizer.
                                nullptr));  // No weight adjustment.
  }

  // Determine foreground weights for each features.
  std::vector<float> foreground_weights;
  ForegroundWeightsFromFeatures(
      feature_list, foreground_options.foreground_threshold(),
      foreground_options.foreground_gamma(),
      foreground_options.threshold_coverage_scaling() ? &camera_motion
                                                      : nullptr,
      &foreground_weights);

  // Setup push pull map (with border). Ensure constructor used the right type.
  CHECK(foreground_push_pull_->filter_type() ==
            PushPullFilteringC1::BINOMIAL_5X5 ||
        foreground_push_pull_->filter_type() ==
            PushPullFilteringC1::GAUSSIAN_5X5);
  // +4 in each dimension: 2-pixel border on each side for the 5x5 filters.
  cv::Mat foreground_map(frame_height_ + 4, frame_width_ + 4, CV_32FC2);

  // Collect locations and weights of features not marked as outliers.
  std::vector<Vector2_f> feature_locations;
  std::vector<cv::Vec<float, 1>> feature_irls;
  for (int feat_idx = 0; feat_idx < foreground_weights.size(); ++feat_idx) {
    // Skip marked outliers.
    if (foreground_weights[feat_idx] == 0) {
      continue;
    }
    feature_locations.push_back(
        FeatureLocation(feature_list.feature(feat_idx)));
    feature_irls.push_back(cv::Vec<float, 1>(foreground_weights[feat_idx]));
  }

  // Run push pull.
  foreground_push_pull_->PerformPushPull(feature_locations, feature_irls, 0.2,
                                         cv::Point2i(0, 0),
                                         0,        // Default read out level.
                                         nullptr,  // Uniform weights.
                                         nullptr,  // No bilateral term.
                                         &foreground_map);

  // Convert to grayscale output.
  foreground_mask->create(frame_height_, frame_width_, CV_8U);
  for (int i = 0; i < frame_height_; ++i) {
    const float* src_ptr = foreground_map.ptr<float>(i);
    uint8* dst_ptr = foreground_mask->ptr<uint8>(i);
    for (int j = 0; j < frame_width_; ++j) {
      // Result is in first channel (second is confidence).
      dst_ptr[j] =
          std::max(0, std::min(255, static_cast<int>(src_ptr[2 * j] * 255.0f)));
    }
  }
}
// Overlays the foreground mask onto *output via burn-in alpha compositing.
// Colors are taken from a jet color map if foreground_jet_coloring is set in
// the visualization options, otherwise plain green is used.
void MotionAnalysis::VisualizeDenseForeground(const cv::Mat& foreground_mask,
                                              cv::Mat* output) {
  CHECK(output != nullptr);
  CHECK(foreground_mask.size() == output->size());

  // Build the color lookup table mapping foreground magnitude to color.
  std::vector<Vector3_f> color_map;
  if (options_.visualization_options().foreground_jet_coloring()) {
    JetColoring(1000, &color_map);
  } else {
    color_map.resize(1000, Vector3_f(0, 255, 0));
  }

  // Clamps an int to the valid 8-bit pixel range [0, 255].
  auto to_pixel = [](int value) -> int {
    return std::max(0, std::min(255, value));
  };

  // Burn-in alpha compositing.
  const float alpha = 1.3f;
  for (int y = 0; y < frame_height_; ++y) {
    uint8* out_row = output->ptr<uint8>(y);
    const uint8* mask_row = foreground_mask.ptr<uint8>(y);
    for (int x = 0; x < frame_width_; ++x) {
      const float norm_foreground = mask_row[x] * (1.0 / 255.0f);
      const float foreground = norm_foreground * alpha;
      const float alpha_denom = 1.0f / (1.0f + foreground);
      const Vector3_f color =
          color_map[static_cast<int>(norm_foreground * (color_map.size() - 1))];
      // Composite each of the 3 interleaved channels in place.
      for (int c = 0; c < 3; ++c) {
        out_row[3 * x + c] =
            to_pixel((out_row[3 * x + c] + color[c] * foreground) *
                     alpha_denom);
      }
    }
  }
}
// Blends the blur-analysis mask over the input frame, dimming regions that
// are not used for blur analysis.
void MotionAnalysis::VisualizeBlurAnalysisRegions(cv::Mat* input_view) {
  CHECK(input_view != nullptr);

  // Corner response (minimum eigenvalue) computed on the grayscale frame.
  cv::Mat grayscale;
  cv::cvtColor(*input_view, grayscale, CV_RGB2GRAY);
  cv::Mat corner_response;
  cv::cornerMinEigenVal(grayscale, corner_response, 3);

  // Mask of pixels that participate in blur analysis.
  cv::Mat blur_mask;
  region_flow_computation_->ComputeBlurMask(*input_view, &corner_response,
                                            &blur_mask);

  // Replicate mask to 3 channels and blend 50/50 with the frame (with a
  // -128 offset to darken masked-out regions).
  cv::Mat blur_mask_rgb;
  cv::cvtColor(blur_mask, blur_mask_rgb, CV_GRAY2RGB);
  cv::addWeighted(*input_view, 0.5, blur_mask_rgb, 0.5, -128, *input_view);
}
// Computes motion saliency (likely moving-foreground regions) for every newly
// buffered frame, then stages filtered copies in the "output_saliency" buffer
// for output. The buffered originals are left unmodified; only the copies are
// inlier-selected / filtered.
void MotionAnalysis::ComputeSaliency() {
  MEASURE_TIME << "Saliency computation.";
  // Saliency must have been computed exactly up to the start of the overlap
  // region; we only append results for frames past it.
  CHECK_EQ(overlap_start_, buffer_->BufferSize("saliency"));
  const int num_features_lists = buffer_->BufferSize("features");
  // Compute saliency only for newly buffered RegionFlowFeatureLists.
  for (int k = overlap_start_; k < num_features_lists; ++k) {
    // Per-feature foreground weights; optionally scaled by the corresponding
    // CameraMotion when threshold_coverage_scaling is enabled.
    std::vector<float> foreground_weights;
    ForegroundWeightsFromFeatures(
        *buffer_->GetDatum<RegionFlowFeatureList>("features", k),
        options_.foreground_options().foreground_threshold(),
        options_.foreground_options().foreground_gamma(),
        options_.foreground_options().threshold_coverage_scaling()
            ? buffer_->GetDatum<CameraMotion>("motion", k)
            : nullptr,
        &foreground_weights);
    std::unique_ptr<SalientPointFrame> saliency(new SalientPointFrame());
    motion_saliency_->SaliencyFromFeatures(
        *buffer_->GetDatum<RegionFlowFeatureList>("features", k),
        &foreground_weights, saliency.get());
    buffer_->AddDatum("saliency", std::move(saliency));
  }
  CHECK(buffer_->HaveEqualSize({"features", "motion", "saliency"}));
  // Clear output saliency and copy from saliency. Copies (not the buffered
  // originals) receive the destructive filtering below.
  buffer_->DiscardDatum("output_saliency",
                        buffer_->BufferSize("output_saliency"));
  for (int k = 0; k < buffer_->BufferSize("saliency"); ++k) {
    std::unique_ptr<SalientPointFrame> copy(new SalientPointFrame());
    *copy = *buffer_->GetDatum<SalientPointFrame>("saliency", k);
    buffer_->AddDatum("output_saliency", std::move(copy));
  }
  // Create view.
  std::vector<SalientPointFrame*> saliency_view =
      buffer_->GetMutableDatumVector<SalientPointFrame>("output_saliency");
  // saliency_frames are filtered after this point and ready for output.
  // Keep only salient points with sufficient spatial and temporal support.
  if (options_.select_saliency_inliers()) {
    motion_saliency_->SelectSaliencyInliers(&saliency_view, false);
  }
  // Spatio-temporal smoothing, applied after inlier selection (if any).
  if (options_.filter_saliency()) {
    motion_saliency_->FilterMotionSaliency(&saliency_view);
  }
}
} // namespace mediapipe

View File

@ -0,0 +1,229 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Module for performing motion analysis on a video stream including computing
// locally filtered (robust) feature tracking, camera motion estimation, and
// dense foreground saliency estimation.
// Module buffers frames internally (using an adaptive overlap to achieve
// temporal consistency):
//
// Usage example:
//
// MotionAnalysisOptions options();
// Should always be a multiple of 8 for optimal parallel performance
// options.set_estimation_clip_size(16);
// MotionAnalysis motion_analysis(options, 960, 540);
// std::vector<cv::Mat> input_frames(N);
// // Define output vectors.
// std::vector<std::unique_ptr<RegionFlowFeatureList>> features;
// std::vector<std::unique_ptr<CameraMotion>> camera_motion;
// std::vector<std::unique_ptr<SalientPointFrame>> saliency;
// std::vector<cv::Mat> rendered_results; // Should be initialized with
// // frame.
//
// for (int k = 0; k < N; ++k) {
// motion_analysis.AddFrame(input_frames[k], 0);
// // Outputs results, if new ones are available.
// // Output will be all of the same lengths (Length returned by function).
// if (motion_analysis.GetResults(k + 1 == N, // Flush, force output.
// &features,
// &camera_motion,
// &saliency) > 0) { // Optional.
// // Optionally render at i'th frame.
// motion_analysis.RenderResults(*features[i],
// *camera_motion[i],
// saliency[i].get(),
// &rendered_results[i]);
//
// // Output results...
// }
// }
#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_ANALYSIS_H_
#define MEDIAPIPE_UTIL_TRACKING_MOTION_ANALYSIS_H_
#include <memory>
#include <string>
#include <vector>
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/motion_analysis.pb.h"
#include "mediapipe/util/tracking/motion_estimation.h"
#include "mediapipe/util/tracking/motion_estimation.pb.h"
#include "mediapipe/util/tracking/motion_saliency.h"
#include "mediapipe/util/tracking/push_pull_filtering.h"
#include "mediapipe/util/tracking/region_flow.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
#include "mediapipe/util/tracking/region_flow_computation.h"
#include "mediapipe/util/tracking/streaming_buffer.h"
namespace mediapipe {
typedef PushPullFiltering<1, FilterWeightMultiplierOne> PushPullFlowC1;
// Performs motion analysis (robust feature tracking, camera motion
// estimation, optional foreground saliency) on a stream of frames. Frames
// are buffered internally; see file-level comment for a usage example.
class MotionAnalysis {
 public:
  MotionAnalysis(const MotionAnalysisOptions& options, int frame_width,
                 int frame_height);
  ~MotionAnalysis() = default;

  MotionAnalysis(const MotionAnalysis&) = delete;
  MotionAnalysis& operator=(const MotionAnalysis&) = delete;

  // Call with every frame. Timestamp is optional (set to zero if not needed).
  // Optionally outputs list of features extracted from this frame.
  // Returns true on success.
  bool AddFrame(const cv::Mat& frame, int64 timestamp_usec,
                RegionFlowFeatureList* feature_list = nullptr);

  // Same as above, but uses specified initial transform to seed
  // feature locations.
  bool AddFrameWithSeed(const cv::Mat& frame, int64 timestamp_usec,
                        const Homography& initial_transform,
                        RegionFlowFeatureList* feature_list = nullptr);

  // Generic function to perform motion analysis on the passed frame,
  // with initial_transform used as seed.
  // Optionally accepts external_feature to be added to the computed ones,
  // and to reject all features that do not agree with the rejection transform
  // within a specified threshold (rejection_transform_threshold in options).
  // Also allows to modify feature locations before motion estimation by
  // supplying appropriate callback - this is invoked *after* the rejection
  // transform.
  // Returns list of features extracted from this frame, *before* any
  // modification is applied. To yield modified features, simply
  // apply modify_features function to returned result.
  bool AddFrameGeneric(
      const cv::Mat& frame, int64 timestamp_usec,
      const Homography& initial_transform,
      const Homography* rejection_transform = nullptr,
      const RegionFlowFeatureList* external_features = nullptr,
      std::function<void(RegionFlowFeatureList*)>* modify_features = nullptr,
      RegionFlowFeatureList* feature_list = nullptr);

  // Instead of tracking passed frames, uses result directly as supplied by
  // features. Can not be mixed with above AddFrame* calls.
  void AddFeatures(const RegionFlowFeatureList& features);

  // Instead of tracking and computing camera motions, simply adds precomputed
  // features and camera motions to the internal buffers. Can not be mixed
  // with above Add* calls.
  // This is useful for just computing saliency via GetResults.
  void EnqueueFeaturesAndMotions(const RegionFlowFeatureList& features,
                                 const CameraMotion& motion);

  // Returns motion results (features, camera motions and saliency, all
  // optional).
  // Call after every AddFrame for optimal performance.
  // Returns number of available results. Note, this call will often return
  // zero, and only return results (multiple in this case) when chunk
  // boundaries are reached. The actual number returned depends on various
  // smoothing settings for saliency and features.
  // Set flush to true, to force output of all results (e.g. when the end of
  // the video stream is reached).
  // Note: Passing a non-zero argument for saliency, requires
  // MotionAnalysisOptions::compute_motion_saliency to be set and
  // vice versa. (CHECKED)
  int GetResults(
      bool flush,  // Forces output.
      std::vector<std::unique_ptr<RegionFlowFeatureList>>* features = nullptr,
      std::vector<std::unique_ptr<CameraMotion>>* camera_motion = nullptr,
      std::vector<std::unique_ptr<SalientPointFrame>>* saliency = nullptr);

  // Exposes the grayscale image frame from the most recently created region
  // flow tracking data.
  cv::Mat GetGrayscaleFrameFromResults();

  // Renders features and saliency to rendered_results based on
  // VisualizationOptions onto pre-initialized rendered_results (in most cases
  // you want to create a copy of the input frame).
  // NOTE:
  // If features are requested to be rendered, this function should be
  // called serially with each frame, or wrong feature location might be
  // rendered.
  void RenderResults(const RegionFlowFeatureList& features,
                     const CameraMotion& camera_motion,
                     const SalientPointFrame* saliency,  // Optional.
                     cv::Mat* rendered_results);

  // Determines dense foreground mask from features.
  // Returns foreground mask as CV_8U image, indicating probability of
  // foreground.
  void ComputeDenseForeground(const RegionFlowFeatureList& feature_list,
                              const CameraMotion& camera_motion,
                              cv::Mat* foreground_mask);

  // Overlays foreground mask over output as green burn in, or with
  // jet_coloring (based on options).
  void VisualizeDenseForeground(const cv::Mat& foreground_mask,
                                cv::Mat* output);

  // Masks out regions from input that are not used for blur analysis.
  void VisualizeBlurAnalysisRegions(cv::Mat* input);

  // Number of frames/features added so far.
  int NumFrames() const { return frame_num_; }

 private:
  // Applies the pre-configured analysis policy (if any) to options_.
  void InitPolicyOptions();

  // Compute saliency from buffered features and motions.
  void ComputeSaliency();

  // Outputs computed results from the streaming buffer to the optional
  // output args. Also performs overlap handling.
  int OutputResults(
      bool flush,  // Forces output.
      std::vector<std::unique_ptr<RegionFlowFeatureList>>* features = nullptr,
      std::vector<std::unique_ptr<CameraMotion>>* camera_motion = nullptr,
      std::vector<std::unique_ptr<SalientPointFrame>>* saliency = nullptr);

  MotionAnalysisOptions options_;
  int frame_width_ = 0;
  int frame_height_ = 0;
  int frame_num_ = 0;

  // Internal objects for actual motion analysis.
  std::unique_ptr<RegionFlowComputation> region_flow_computation_;
  std::unique_ptr<MotionEstimation> motion_estimation_;
  std::unique_ptr<MotionSaliency> motion_saliency_;
  std::unique_ptr<PushPullFlowC1> foreground_push_pull_;

  // Used for visualization if long feature tracks are present.
  std::unique_ptr<LongFeatureStream> long_feature_stream_;
  std::unique_ptr<StreamingBuffer> buffer_;

  // Indicates where previous overlap in above buffers starts (earlier data is
  // just to improve smoothing).
  int prev_overlap_start_ = 0;

  // Indicates where actual overlap starts (data after this has not been
  // output).
  int overlap_start_ = 0;

  // Buffers previous frame.
  std::unique_ptr<cv::Mat> prev_frame_;
  bool compute_feature_descriptors_ = false;

  // Amount of overlap between clips. Determined from saliency smoothing
  // and filtering options.
  int overlap_size_ = 0;

  // NOTE(review): presumably false once features are supplied externally via
  // AddFeatures / EnqueueFeaturesAndMotions — confirm against the .cc.
  bool feature_computation_ = true;
};
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_ANALYSIS_H_

View File

@ -0,0 +1,154 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/motion_estimation.proto";
import "mediapipe/util/tracking/motion_saliency.proto";
import "mediapipe/util/tracking/region_flow_computation.proto";
// Settings for MotionAnalysis. This class computes sparse, locally consistent
// flow (referred to as region flow), camera motions, and foreground saliency
// (i.e. likely foreground objects moving different from the background).
// Next tag: 16
message MotionAnalysisOptions {
  // Pre-configured policies for MotionAnalysis.
  // For general use, it is recommended to select an appropriate policy
  // instead of customizing flow and motion options by hand.
  // Policies are being kept up to date with appropriate settings.
  enum AnalysisPolicy {
    // Default legacy options. Effectively a no-op.
    ANALYSIS_POLICY_LEGACY = 0;

    // Use for video.
    ANALYSIS_POLICY_VIDEO = 1;

    // Use for video on mobile.
    ANALYSIS_POLICY_VIDEO_MOBILE = 2;

    // Use if applied to camera stream on mobile, e.g.
    // low latency and high throughput.
    // ASSUMES DOWNSAMPLED INPUT, e.g. from GPU.
    ANALYSIS_POLICY_CAMERA_MOBILE = 3;

    // Use for sped up video / hyperlapse when adding frames with seeds
    // and rejection transforms. Mostly ups temporal consistency weights
    // and relaxes stability constraints.
    // Only recommended to be used as second pass after initial MotionAnalysis
    // and FrameSelection.
    ANALYSIS_POLICY_HYPERLAPSE = 4;
  }

  optional AnalysisPolicy analysis_policy = 14
      [default = ANALYSIS_POLICY_LEGACY];

  // Options for the actual motion stabilization
  // (in order of object usage).
  optional RegionFlowComputationOptions flow_options = 1;
  optional MotionEstimationOptions motion_options = 2;
  optional MotionSaliencyOptions saliency_options = 3;

  // Clip-size used for (parallelized) motion estimation.
  optional int32 estimation_clip_size = 4 [default = 16];

  // If set, camera motion is subtracted from features before output.
  // Effectively outputs, residual motion w.r.t. background.
  optional bool subtract_camera_motion_from_features = 5 [default = false];

  // If flow_options().tracking_options().tracking_policy() equals
  // POLICY_MULTI_FRAME, this flag indicates which RegionFlowFeatureList to
  // use. Specifically, for frame C, we use the motion from
  // C to C - 1 - track_index.
  optional int32 track_index = 6 [default = 0];

  // If set, compute motion saliency (regions of moving foreground).
  optional bool compute_motion_saliency = 7 [default = false];

  // Selects saliency inliers (only saliency locations with sufficient
  // spatial and temporal support are kept).
  // Only applied when compute_motion_saliency is set.
  optional bool select_saliency_inliers = 8 [default = true];

  // Performs spatio-temporal filtering of extracted foreground saliency. If
  // used with above selection of saliency inliers, filtering is performed
  // *after* inlier selection.
  // Only applied when compute_motion_saliency is set.
  optional bool filter_saliency = 9 [default = true];

  // If set, irls weights of motion estimation are spatio-temporally smoothed
  // after model estimation.
  optional bool post_irls_smoothing = 10 [default = false];

  // If a rejection_transform is passed to AddFrameGeneric, features that
  // do not agree with the transform within below threshold are removed.
  optional float rejection_transform_threshold = 13 [default = 20.0];

  // Adapts visualization for rendered_results when passed to GetResults.
  message VisualizationOptions {
    // Visualizes tracked region flow features, colored w.r.t. fitting error.
    optional bool visualize_region_flow_features = 1 [default = true];

    // Visualizes salient points. Only applicable if compute_motion_saliency
    // is set to true.
    optional bool visualize_salient_points = 2 [default = false];

    // Line thickness of ellipse when rendering salient points.
    optional int32 line_thickness = 5 [default = 4];

    // Instead of green burn in uses jet coloring to indicate magnitude of
    // foreground motion.
    optional bool foreground_jet_coloring = 3 [default = false];

    // If set, only keeps masks of pixels that is used for blur analysis,
    // rest is set to zero.
    optional bool visualize_blur_analysis_region = 4 [default = false];

    optional bool visualize_stats = 6 [default = true];

    // Only long feature tracks with specified minimum length are rendered.
    // Set to zero to consider all tracks.
    optional int32 min_long_feature_track = 7 [default = 0];

    // Only the last N points of a long feature track are rendered. Set to
    // zero to render all points.
    optional int32 max_long_feature_points = 8 [default = 0];
  }

  optional VisualizationOptions visualization_options = 11;

  // Describes how to compute foreground from features.
  message ForegroundOptions {
    // Indicates the *inverse* registration error (i.e. the irls weight)
    // that is deemed a complete inlier.
    // Weights in the interval [0, foreground_threshold] (corresponding to
    // pixel errors in the interval [1 / foreground_threshold, inf])
    // are mapped to 1 - [0, 1], i.e. foreground threshold is mapped to zero
    // with weights below the threshold being assigned values > 0.
    // Therefore, larger values will increase amount of detected foreground
    // as well as noise.
    optional float foreground_threshold = 1 [default = 0.5];

    // By using foreground_gamma < 1.0 you can increase resolution of small
    // foreground motion at the expense of the resolution of large foreground
    // motions.
    optional float foreground_gamma = 2 [default = 1.0];

    // Threshold is scaled by coverage, i.e. for frames with large
    // registration error less foreground is visualized.
    optional bool threshold_coverage_scaling = 3 [default = true];
  }

  optional ForegroundOptions foreground_options = 12;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,758 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Fits several linear motion models to the tracked features obtained
// from RegionFlowComputation.
//
// --- Multi-threaded usage (parallel motion estimation over frames) ---
// assume input: vector<RegionFlowFeatureList*> feature_lists
// // Can be obtained after getting RegionFlowFrame from RegionFlowComputation
// // and executing GetRegionFlowFeatureList (region_flow.h)
// MotionEstimation motion_estimation(MotionEstimationOptions(),
// frame_width,
// frame_height);
// vector<CameraMotion> camera_motions;
// motion_estimation.EstimateMotionsParallel(false, // no IRLS smoothing.
// &feature_lists,
// &camera_motions);
// // RegionFlowFeatureList can be discarded or passed to Cropper.
//
//
// --- DEPRECATED, per-frame usage ---
// assume input: RegionFlowFrame* flow_frame // from RegionFlowComputation.
//
// // Initialize with standard options.
// MotionEstimation motion_estimation(MotionEstimationOptions(),
// frame_width,
// frame_height);
// CameraMotion estimated_motion;
// motion_estimation.EstimateMotion(flow_frame,
// NULL, // deprecated param.
// NULL, // deprecated param.
// &estimated_motion);
//
// // If features are not needed anymore flow_frame can be discarded now.
//
// // Pass motion models in estimated_motion onto MotionStabilization,
// // if stabilization is desired.
#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_ESTIMATION_H_
#define MEDIAPIPE_UTIL_TRACKING_MOTION_ESTIMATION_H_
#include <algorithm>
#include <deque>
#include <list>
#include <memory>
#include <unordered_map>
#include <vector>
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/vector.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/motion_estimation.pb.h"
#include "mediapipe/util/tracking/motion_models.pb.h"
#include "mediapipe/util/tracking/region_flow.h"
namespace mediapipe {
class Homography;
class LinearSimilarityModel;
class MixtureHomography;
class MixtureRowWeights;
class RegionFlowFeature;
class RegionFlowFeatureList;
class RegionFlowFrame;
class EstimateMotionIRLSInvoker;
class InlierMask;
class IrlsInitializationInvoker;
// Thread local storage for pre-allocated memory.
class MotionEstimationThreadStorage;
class TrackFilterInvoker;
class MotionEstimation {
public:
MotionEstimation(const MotionEstimationOptions& options, int frame_width,
int frame_height);
virtual ~MotionEstimation();
MotionEstimation(const MotionEstimation&) = delete;
MotionEstimation& operator=(const MotionEstimation&) = delete;
// Can be used to re-initialize options between EstimateMotion /
// EstimateMotionsParallel calls.
void InitializeWithOptions(const MotionEstimationOptions& options);
// Estimates motion models from RegionFlowFeatureLists based on
// MotionEstimationOptions, in a multithreaded manner (frame parallel).
// The computed IRLS weights used on the last iteration of the highest
// degree of freedom model are *written* to the irls_weight member for each
// RegionFlowFeature in RegionFlowFeatureList which can be a useful
// feature for later processing.
// In addition the returned irls weights can be smoothed spatially
// and temporally before they are output.
// Note: The actual vector feature_lists is not modified.
virtual void EstimateMotionsParallel(
bool post_irls_weight_smoothing,
std::vector<RegionFlowFeatureList*>* feature_lists,
std::vector<CameraMotion>* camera_motions) const;
// DEPRECATED function, estimating Camera motion from a single
// RegionFlowFrame.
virtual void EstimateMotion(const RegionFlowFrame& region_flow_frame,
const int*, // deprecated, must be NULL.
const int*, // deprecated, must be NULL.
CameraMotion* camera_motion) const;
// Public facing API to directly estimate motion models (as opposed to
// a cascade of increasing degree of freedom motion models with appropriate
// stability analysis via above EstimateMotionsParallel).
// Use this if all you need is just a specific motion
// model describing/summarizing the motion of the RegionFlowFeatureList.
// Returns false if model estimation failed (in this case an identity model
// is set in camera_motion).
// NOTE: All direct estimation functions assume normalized feature input,
// i.e. transformed via NormalizeRegionFlowFeatureList.
//
// NOTE 2: For easy direct use see Fit* functions below class.
bool EstimateTranslationModel(RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion);
bool EstimateLinearSimilarityModel(RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion);
bool EstimateAffineModel(RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion);
bool EstimateHomography(RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion);
bool EstimateMixtureHomography(RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion);
// Static function which sets motion models (requested in options) to identity
// models.
static void ResetMotionModels(const MotionEstimationOptions& options,
CameraMotion* camera_motion);
// The following functions ResetTo* functions reset all models that are
// present in camera_motion (tested via has_*) to identity or
// the passed model (which is embedded in higher degree of freedom models
// if applicable). CameraMotion::Type is set in accordance to function name.
static void ResetToIdentity(CameraMotion* camera_motion,
bool consider_valid = false); // Set to true
// for type VALID
// Resets every specified model to embedded translation model.
// CameraMotion type is set to UNSTABLE.
static void ResetToTranslation(const TranslationModel& translation,
CameraMotion* camera_motion);
// Resets every specified model with more or equal DOF than a similarity
// to the passed model.
// Camera Motion type is set to UNSTABLE_SIM.
static void ResetToSimilarity(const LinearSimilarityModel& model,
CameraMotion* camera_motion);
// Resets every specified model with more or equal DOF than a homography
// to the passed model. If flag_as_unstable_model is set, camera motion type
// is set to UNSTABLE_HOMOG.
static void ResetToHomography(const Homography& model,
bool flag_as_unstable_model,
CameraMotion* camera_motion);
private:
// Simple enum indicating with motion model should be estimated, mapped from
// MotionEstimationOptions.
enum MotionType {
MODEL_AVERAGE_MAGNITUDE = 0,
MODEL_TRANSLATION = 1,
MODEL_LINEAR_SIMILARITY = 2,
MODEL_AFFINE = 3,
MODEL_HOMOGRAPHY = 4,
MODEL_MIXTURE_HOMOGRAPHY = 5,
// ... internal enum values used for mixture spectrum (up to 10 mixtures are
// supported). Do not use directly.
// Change value if new motions are added.
MODEL_NUM_VALUES = 16,
};
// Determines shot boundaries from estimated motion models and input
// feature_lists by setting the corresponding flag in CameraMotion.
// Make sure, this function is called after motion estimation.
void DetermineShotBoundaries(
const std::vector<RegionFlowFeatureList*>& feature_lists,
std::vector<CameraMotion>* camera_motions) const;
// Implementation function to estimate CameraMotion's from
// RegionFlowFeatureLists.
void EstimateMotionsParallelImpl(
bool irls_weights_preinitialized,
std::vector<RegionFlowFeatureList*>* feature_lists,
std::vector<CameraMotion>* camera_motions) const;
struct SingleTrackClipData;
struct EstimateModelOptions;
EstimateModelOptions DefaultModelOptions() const;
// Implementation function to estimate all motions for a specific type
// across multiple single tracks. Motions are only estimated for those
// CameraMotions with type less or equal to max_unstable_type. Flag
// irls_weights_preinitialized enables some optimizations in case it is
// set to false as features are not pre-initialized.
// Optionally pass thread_storage.
// Returns true if requested type was attempt to be estimated
// (based on options) , false otherwise.
bool EstimateMotionModels(
const MotionType& max_type, const CameraMotion::Type& max_unstable_type,
const EstimateModelOptions& options,
const MotionEstimationThreadStorage* thread_storage, // optional.
std::vector<SingleTrackClipData>* clip_datas) const;
// Multiplies input irls_weights by an upweight multiplier for each feature
// that is part of a sufficiently large track (contribution of each track
// length is by track_length_multiplier, mapping each track length
// to an importance weight in [0, 1]).
void LongFeatureInitialization(
const RegionFlowFeatureList& feature_list,
const LongFeatureInfo& feature_info,
const std::vector<float>& track_length_importance,
std::vector<float>* irls_weights) const;
// Multiplies input irls_weights by normalization factor that downweights
// features is areas of high density.
void FeatureDensityNormalization(const RegionFlowFeatureList& feature_list,
std::vector<float>* irls_weights) const;
// Initializes irls weights (if
// MotionEstimationOptions::irls_initialization::activated is true),
// based on a multitude of options (RANSAC based pre-fitting of motion
// models, homography initialization, etc.).
// Processes one frame or all (if frame = -1) within a clip_data,
void IrlsInitialization(const MotionType& type, int max_unstable_type,
int frame, // Specify -1 for all frames.
const EstimateModelOptions& options,
SingleTrackClipData* clip_data) const;
// Estimation functions for models, called via options by EstimateMotion and
// EstimateMotionsParallel.
// NOTE: All direct estimation functions assume normalized feature input,
// i.e. transformed via
// TransformRegionFlowFeatureList(normalization_transform, feature_list);
// where normalization_transform =
// LinearSimilarityAdapter::NormalizationTransform(frame_width,
// frame_height);
//
// Direct estimation functions perform estimation via iterated reweighted
// least squares (IRLS). In this case specify number of iterations (10 is a
// good default), and optionally the PriorFeatureWeights for each iteration.
// The alphas specify, how much weight should be given to the
// prior weight that the feature has before optimization. An alpha of zero
// indicates, no prior weighting, whereas as an alpha of one corresponds to
// full prior weighting. The actual prior is stored in priors.
// Each iteration is reweighted by numerator / error, where error is the L2
// fitting error after estimation and
// numerator = (1.0 - alpha) * 1.0 + alpha * prior
struct PriorFeatureWeights {
explicit PriorFeatureWeights(int num_iterations)
: alphas(num_iterations, 0.0f) {}
PriorFeatureWeights(int num_iterations, int num_features)
: alphas(num_iterations, 0.0f), priors(num_features, 1.0f) {}
// Tests for correct dimensions of PriorFeatureWeights.
bool HasCorrectDimension(int num_iterations, int num_features) const {
return alphas.size() == num_iterations && priors.size() == num_features;
}
// Returns true if at least one alpha is non-zero.
bool HasNonZeroAlpha() const {
return !alphas.empty() &&
*std::max_element(alphas.begin(), alphas.end()) > 0;
}
// Returns true, if a prior was specified.
bool HasPrior() const { return !priors.empty(); }
std::vector<float> alphas; // Alpha for each IRLS round.
std::vector<float> priors; // Prior weight for each feature.
// If set, above alpha are not adjusted with iterations, but always set to
// 1.0, given full weight to the prior.
bool use_full_prior = false;
};
// In addition, each estimation function can compute its corresponding
// stability features and store it in CameraMotion. These features are needed
// to test via the IsStable* functions further below.
// Estimates 2 DOF translation model.
// Note: feature_list is assumed to be normalized/transformed by
// LinearSimilarity::NormalizationTransform N. Returned irls weights and
// linear similarity are expressed in original frame, i.e. for estimated model
// M, M' = N^(-1) M N is returned.
void EstimateTranslationModelIRLS(
int irls_rounds, bool compute_stability,
RegionFlowFeatureList* feature_list,
const PriorFeatureWeights* prior_weights, // optional.
CameraMotion* camera_motion) const;
// Estimates linear similarity from feature_list using irls_rounds iterative
// reweighted least squares iterations. For L2 estimation, use irls_round = 1.
// The irls_weight member of each RegionFlowFeature in feature_list will be
// set to the inverse residual w.r.t. estimated LinearSimilarityModel.
// Note: feature_list is assumed to be normalized/transformed by
// LinearSimilarity::NormalizationTransform N. Returned irls weights and
// linear similarity are expressed in original frame, i.e. for estimated model
// M, M' = N^(-1) M N is returned.
// Returns true if estimation was successful, otherwise returns false and sets
// the CameraMotion::type to INVALID.
bool EstimateLinearSimilarityModelIRLS(
int irls_rounds, bool compute_stability,
RegionFlowFeatureList* feature_list,
const PriorFeatureWeights* prior_weights, // optional.
CameraMotion* camera_motion) const;
// Same as above for affine motion.
// Note: feature_list is assumed to be normalized/transformed by
// LinearSimilarity::NormalizationTransform N. Returned irls weights and
// affine model are expressed in original frame, i.e. for estimated model
// M, M' = N^(-1) M N is returned.
bool EstimateAffineModelIRLS(int irls_rounds,
RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion) const;
// Same as above for homography.
// Note: feature_list is assumed to be normalized/transformed by
// LinearSimilarity::NormalizationTransform N. Returned irls weights and
// homography are expressed in original frame, i.e. for estimated model
// M, M' = N^(-1) M N is returned.
// Returns true if estimation was successful, otherwise returns false and sets
// the CameraMotion::type to INVALID.
bool EstimateHomographyIRLS(
int irls_rounds, bool compute_stability,
const PriorFeatureWeights* prior_weights, // optional.
MotionEstimationThreadStorage* thread_storage, // optional.
RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const;
// Same as above for mixture homography.
// Note: feature_list is assumed to be normalized/transformed by
// LinearSimilarity::NormalizationTransform N. Returned irls weights and
// mixture homography are expressed in original frame, i.e. for estimated
// model M, M' = N^(-1) M N is returned.
// Mixture model estimation customized by MotionEstimationOptions.
// Returns true if estimation was successful, otherwise returns false and sets
// the CameraMotion::type to INVALID.
// Supports computation for mixture spectrum, i.e. mixtures with different
// regularizers. For default regularizer pass
// MotionEstimationOptions::mixture_regularizer. Estimated motion will be
// stored in CameraMotion::mixture_homography_spectrum(spectrum_idx).
bool EstimateMixtureHomographyIRLS(
int irls_rounds, bool compute_stability, float regularizer,
int spectrum_idx, // 0 by default.
const PriorFeatureWeights* prior_weights, // optional.
MotionEstimationThreadStorage* thread_storage, // optional.
RegionFlowFeatureList* feature_list, CameraMotion* camera_motion) const;
// Returns weighted variance for mean translation from feature_list (assumed
// to be in normalized coordinates). Returned variance is in unnormalized
// domain.
float TranslationVariance(const RegionFlowFeatureList& feature_list,
const Vector2_f& translation) const;
// Replace each features irls weight by robust min-filtered irls weight
// across each track.
void MinFilterIrlsWeightByTrack(SingleTrackClipData* clip_data) const;
// Performs filtering of irls weight across several tracking clip datas,
// to yield consistent irls weights.
void EnforceTrackConsistency(
std::vector<SingleTrackClipData>* clip_datas) const;
// Initializes or modifies prior_weights for passed feature_list by
// biasing toward previous (filtered) IRLS weight for that feature.
// This enables temporal coherence.
void BiasLongFeatures(RegionFlowFeatureList* feature_list, MotionType type,
const EstimateModelOptions& model_options,
PriorFeatureWeights* prior_weights) const;
// Called by above function to determine the bias each feature is multiplied
// with.
void BiasFromFeatures(const RegionFlowFeatureList& feature_list,
MotionType type,
const EstimateModelOptions& model_options,
std::vector<float>* bias) const;
// Maps track index to tuple of spatial bias and number of similar
// looking long tracks.
typedef std::unordered_map<int, std::pair<float, float>> SpatialBiasMap;
void ComputeSpatialBias(MotionType type,
const EstimateModelOptions& model_options,
RegionFlowFeatureList* feature_list,
SpatialBiasMap* spatial_bias) const;
// Updates features weights in feature_list by temporally consistent bias.
void UpdateLongFeatureBias(MotionType type,
const EstimateModelOptions& model_options,
bool remove_terminated_tracks,
bool update_irls_observation,
RegionFlowFeatureList* feature_list) const;
// Bilateral filtering of irls weights across the passed list.
void SmoothIRLSWeights(std::deque<float>* irls) const;
// Helper function. Returns number of irls iterations for passed MotionType
// derived from current MotionEstimationOptions. Returns zero, if no
// estimation should be attempted.
int IRLSRoundsFromSettings(const MotionType& type) const;
// Partitions irls_rounds into several rounds of irls_per_round iterations
// each, based on MotionEstimationOptions::EstimationPolicy.
// Post-condition: total_rounds * irls_per_round == irls_rounds.
void PolicyToIRLSRounds(int irls_rounds, int* total_rounds,
int* irls_per_round) const;
// Checks for the specified MotionType whether the estimated model is stable.
// If not, resets the features' irls weights to reset_irls_weights (optional)
// and resets the motion model in camera_motion to a lower degree of freedom
// model. In this case, CameraMotion::Type is flagged as UNSTABLE_* where *
// denotes the lower degree of freedom model.
// The model is only checked for those CameraMotions with type less than or
// equal to max_unstable_type.
void CheckModelStability(
const MotionType& type, const CameraMotion::Type& max_unstable_type,
const std::vector<std::vector<float>>* reset_irls_weights,
std::vector<RegionFlowFeatureList*>* feature_lists,
std::vector<CameraMotion>* camera_motions) const;
// Implementation function called by above function, to check for a single
// model.
void CheckSingleModelStability(const MotionType& type,
const CameraMotion::Type& max_unstable_type,
const std::vector<float>* reset_irls_weights,
RegionFlowFeatureList* feature_list,
CameraMotion* camera_motion) const;
// Projects motion model specified by type to lower degree of freedom models.
void ProjectMotionsDown(const MotionType& type,
std::vector<CameraMotion>* camera_motions) const;
// Filters passed feature_lists based on
// MotionEstimationOptions::irls_weight_filter.
void IRLSWeightFilter(
std::vector<RegionFlowFeatureList*>* feature_lists) const;
// Inlier scale based on average motion magnitude and the fraction
// of the magnitude that is still considered an inlier.
// In general a residual of 1 pixel is assigned an IRLS weight of 1,
// this function returns a residual scale, such that a residual
// of distance_fraction * translation_magnitude equals an IRLS weight of 1
// if multiplied by returned scale.
float GetIRLSResidualScale(const float avg_motion_magnitude,
float distance_fraction) const;
const LinearSimilarityModel& InverseNormalizationTransform() const {
return inv_normalization_transform_;
}
const LinearSimilarityModel& NormalizationTransform() const {
return normalization_transform_;
}
// Returns domain normalized features fall in.
Vector2_f NormalizedDomain() const { return normalized_domain_; }
// Returns index within the inlier mask for each feature point.
// Also returns for each bin normalizer to account for different number of
// features per bin during weighting.
void ComputeFeatureMask(const RegionFlowFeatureList& feature_list,
std::vector<int>* mask_indices,
std::vector<float>* bin_normalizer) const;
// Runs multiple rounds of RANSAC, resetting outlier IRLS weight to
// a low score.
// Optionally can perform temporally consistent selection if inlier_mask is
// specified.
// Returns best model across all iterations in best_model and true if
// estimated model was deemed stable.
bool GetTranslationIrlsInitialization(
RegionFlowFeatureList* feature_list,
const EstimateModelOptions& model_options, float avg_camera_motion,
InlierMask* inlier_mask, // optional.
TranslationModel* best_model) const;
// Same as above for linear similarities.
bool GetSimilarityIrlsInitialization(
RegionFlowFeatureList* feature_list,
const EstimateModelOptions& model_options, float avg_camera_motion,
InlierMask* inlier_mask, // optional.
LinearSimilarityModel* best_model) const;
// Computes number of inliers and strict inliers (satisfying much stricter
// threshold) for a given feature list after model fitting.
void ComputeSimilarityInliers(const RegionFlowFeatureList& feature_list,
int* num_inliers,
int* num_strict_inliers) const;
// Initializes irls weights based on setting
// MotionEstimationOptions::homography_irls_weight_initialization.
void GetHomographyIRLSCenterWeights(const RegionFlowFeatureList& feature_list,
std::vector<float>* center_weights) const;
// Checks for unreasonably large accelerations between frames as specified by
// MotionEstimationOptions::StableTranslationBounds.
void CheckTranslationAcceleration(
std::vector<CameraMotion>* camera_motions) const;
// Functions below, test passed model is deemed stable according to
// several heuristics set by Stable[MODEL]Bounds in MotionEstimationOptions.
bool IsStableTranslation(const TranslationModel& normalized_translation,
float translation_variance,
const RegionFlowFeatureList& features) const;
// Tests if passed similarity is stable. Pass number of inliers from
// ComputeSimilarityInliers.
bool IsStableSimilarity(const LinearSimilarityModel& model,
const RegionFlowFeatureList& features,
int num_inliers) const;
bool IsStableHomography(const Homography& homography,
float average_homography_error,
float inlier_coverage) const;
bool IsStableMixtureHomography(
const MixtureHomography& homography, float min_block_inlier_coverage,
const std::vector<float>& block_inlier_coverage) const;
// Computes fraction (in [0, 1]) of inliers w.r.t. frame area using a grid of
// occupancy cells. A feature is considered an inlier if its irls_weight is
// larger than or equal to min_inlier_score.
float GridCoverage(const RegionFlowFeatureList& feature_list,
float min_inlier_score,
MotionEstimationThreadStorage* thread_storage) const;
// Estimates per scanline-block coverage of mixture. If
// assume_rolling_shutter_camera is set, low textured features are allowed to
// have higher error as registration errors would not be as visible here.
void ComputeMixtureCoverage(const RegionFlowFeatureList& feature_list,
float min_inlier_score,
bool assume_rolling_shutter_camera,
MotionEstimationThreadStorage* thread_storage,
CameraMotion* camera_motion) const;
// Returns average motion magnitude as mean of the translation magnitude from
// the 10th to 90th percentile.
void EstimateAverageMotionMagnitude(const RegionFlowFeatureList& feature_list,
CameraMotion* camera_motion) const;
// Returns per iteration weight of the feature's irls weight initialization.
float IRLSPriorWeight(int iteration, int irls_rounds) const;
// Implementation function for above function. Estimates mixture homography
// from features and returns true if estimation was non-degenerate.
bool MixtureHomographyFromFeature(
const TranslationModel& translation, int irls_rounds, float regularizer,
const PriorFeatureWeights* prior_weights, // optional.
RegionFlowFeatureList* feature_list,
MixtureHomography* mix_homography) const;
// Determines overlay indices (spatial bin locations that are likely to be
// affected by overlays) and stores them in corresponding member in
// CameraMotion. Features that fall within these bins will be assigned a
// weight of zero.
void DetermineOverlayIndices(
bool irls_weights_preinitialized,
std::vector<CameraMotion>* camera_motions,
std::vector<RegionFlowFeatureList*>* feature_lists) const;
// Determine features likely to be part of a static overlay, by setting their
// irls weight to zero.
// Returns fraction of the image domain that is considered to be occupied by
// overlays and specific overlay cell indices in overlay_indices.
float OverlayAnalysis(const std::vector<TranslationModel>& translations,
std::vector<RegionFlowFeatureList*>* feature_lists,
std::vector<int>* overlay_indices) const;
// Smooths feature's irls_weights spatio-temporally.
void PostIRLSSmoothing(
const std::vector<CameraMotion>& camera_motions,
std::vector<RegionFlowFeatureList*>* feature_lists) const;
// Initializes LUT for gaussian weighting. By default discretizes the domain
// [0, max_range] into 4K bins, returning scale to map from a value in the
// domain to the corresponding bin. If scale is nullptr max_range bins are
// created instead (in this case scale would be 1.0, i.e. value equals bin
// index).
void InitGaussLUT(float sigma, float max_range, std::vector<float>* lut,
float* scale) const;
// Performs fast volumetric smoothing / filtering of irls weights. Weights are
// expected to be already binned using BuildFeatureGrid.
void RunTemporalIRLSSmoothing(
const std::vector<FeatureGrid<RegionFlowFeature>>& feature_grid,
const std::vector<std::vector<int>>& feature_taps_3,
const std::vector<std::vector<int>>& feature_taps_5,
const std::vector<float>& frame_confidence,
std::vector<RegionFlowFeatureView>* feature_views) const;
private:
MotionEstimationOptions options_;
int frame_width_;
int frame_height_;
LinearSimilarityModel normalization_transform_;
LinearSimilarityModel inv_normalization_transform_;
// Transform from normalized features to irls domain.
LinearSimilarityModel irls_transform_;
// Frame dimensions transformed by normalization transform.
Vector2_f normalized_domain_;
std::unique_ptr<MixtureRowWeights> row_weights_;
// For initialization biased towards previous frame.
std::unique_ptr<InlierMask> inlier_mask_;
// Stores current bias for each track and the last K irls observations.
// bias holds a pixel error, i.e. the inverse of an IRLS weight; it is
// maintained by UpdateLongFeatureBias.
struct LongFeatureBias {
  // Seeds bias to initial_weight and records its inverse as the first
  // IRLS observation.
  explicit LongFeatureBias(float initial_weight) : bias(initial_weight) {
    irls_values.push_back(1.0f / initial_weight);
  }
  // Default: neutral bias of 1.0 (one pixel of error).
  LongFeatureBias() : LongFeatureBias(1.0f) {}
  float bias = 1.0f;               // Current bias, stores pixel error,
                                   // i.e. 1 / IRLS.
  std::vector<float> irls_values;  // Recently observed IRLS values;
                                   // ring buffer (only the most recent
                                   // observations are kept).
  // Number of observations over the track's lifetime; starts at 1 for the
  // seed value pushed by the constructor.
  int total_observations = 1;
};
// Maps track id to LongFeatureBias.
typedef std::unordered_map<int, LongFeatureBias> LongFeatureBiasMap;
// Bias map indexed by MotionType.
mutable std::vector<LongFeatureBiasMap> long_feature_bias_maps_ =
std::vector<LongFeatureBiasMap>(static_cast<int>(MODEL_NUM_VALUES));
// Lookup tables and scales for FeatureBias computation. Each *_scale maps a
// value of the corresponding domain to a bin index of its LUT (see
// InitGaussLUT). Scales are zero-initialized so a not-yet-initialized LUT
// cannot be read through an indeterminate scale (previously these floats
// were left uninitialized).
struct FeatureBiasLUT {
  // For ComputeSpatialBias weighting.
  std::vector<float> spatial_lut;
  float spatial_scale = 0.0f;
  std::vector<float> color_lut;
  float color_scale = 0.0f;
  // For BiasFromFeature computation.
  std::vector<float> bias_weight_lut;
  float bias_weight_scale = 0.0f;
};
FeatureBiasLUT feature_bias_lut_;
// Counts the number of consecutive duplicate frames for each motion model.
mutable std::vector<int> num_duplicate_frames_ =
std::vector<int>(static_cast<int>(MODEL_NUM_VALUES));
friend class EstimateMotionIRLSInvoker;
friend class IrlsInitializationInvoker;
friend class TrackFilterInvoker;
friend class MotionEstimationThreadStorage;
};
// Meta-data set in the header of filter streams to communicate information
// used during camera motion estimation.
struct CameraMotionStreamHeader {
  CameraMotionStreamHeader() = default;
  // Dimensions of the analyzed frames in pixels; zero until set.
  int32 frame_width = 0;
  int32 frame_height = 0;
};
// Direct fitting functions. Each fits the named motion model to the feature
// correspondences contained in features.
TranslationModel FitTranslationModel(const RegionFlowFeatureList& features);
LinearSimilarityModel FitLinearSimilarityModel(
    const RegionFlowFeatureList& features);
AffineModel FitAffineModel(const RegionFlowFeatureList& features);
Homography FitHomography(const RegionFlowFeatureList& features);
MixtureHomography FitMixtureHomography(const RegionFlowFeatureList& features);

// Templated fitting functions. FitModel<Model> dispatches at compile time to
// the matching direct fitting function above; used by generic code such as
// ProjectViaFit below.
template <class Model>
Model FitModel(const RegionFlowFeatureList& features);

template <>
inline TranslationModel FitModel(const RegionFlowFeatureList& features) {
  return FitTranslationModel(features);
}

template <>
inline LinearSimilarityModel FitModel(const RegionFlowFeatureList& features) {
  return FitLinearSimilarityModel(features);
}

template <>
inline AffineModel FitModel(const RegionFlowFeatureList& features) {
  return FitAffineModel(features);
}

template <>
inline Homography FitModel(const RegionFlowFeatureList& features) {
  return FitHomography(features);
}

template <>
inline MixtureHomography FitModel(const RegionFlowFeatureList& features) {
  return FitMixtureHomography(features);
}
// Generic projection function that projects models in an arbitrary direction
// (that is from lower to higher or vice versa) via fast model fits, without
// any error bound checking.
// MixtureRowWeights are only necessary for ToModel == MixtureHomography.
template <class ToModel, class FromModel>
ToModel ProjectViaFit(const FromModel& model, int frame_width, int frame_height,
MixtureRowWeights* row_weights = nullptr,
int grid_dim = 10) {
// Build a grid of features.
const float dx = frame_width * 1.0f / grid_dim;
const float dy = frame_height * 1.0f / grid_dim;
// Create region flow from grid.
RegionFlowFeatureList grid_features;
grid_features.set_frame_width(frame_width);
grid_features.set_frame_height(frame_height);
grid_features.set_match_frame(-1);
for (int k = 0; k <= grid_dim; ++k) {
for (int l = 0; l <= grid_dim; ++l) {
auto* feat = grid_features.add_feature();
feat->set_x(l * dx);
feat->set_y(k * dy);
}
}
RegionFlowFeatureListViaTransform(model, &grid_features, 1.0f,
0.0f, // Replace flow.
false, // Don't change feature loc.
row_weights);
return FitModel<ToModel>(grid_features);
}
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_ESTIMATION_H_

View File

@ -0,0 +1,670 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Settings for MotionEstimation.
syntax = "proto2";
package mediapipe;
// Note: In general for Estimation modes, the prefix are used as follows:
// L2: minimize squared norm of error
// IRLS: iterative reweighted least square, L2 minimization using multiple
// iterations, downweighting outliers.
// Next tag: 69
message MotionEstimationOptions {
// Specifies which camera models should be estimated, translation is always
// estimated.
optional bool estimate_translation_irls = 1 [default = true];
enum LinearSimilarityEstimation {
  ESTIMATION_LS_NONE = 0;  // Linear similarity is not estimated.
  ESTIMATION_LS_L2 = 1;    // L2 estimation.
  ESTIMATION_LS_IRLS = 4;  // Good performance, robust to outliers.
  // DEPRECATED modes.
  ESTIMATION_LS_L2_RANSAC = 2;  // DEPRECATED, use IRLS instead.
  ESTIMATION_LS_L1 = 3;         // DEPRECATED, use IRLS instead, or static
                                // function MotionEstimation::
                                // EstimateLinearSimilarityL1
}
optional LinearSimilarityEstimation linear_similarity_estimation = 3
[default = ESTIMATION_LS_IRLS];
enum AffineEstimation {
  ESTIMATION_AFFINE_NONE = 0;  // Affine model is not estimated.
  ESTIMATION_AFFINE_L2 = 1;    // L2 estimation.
  ESTIMATION_AFFINE_IRLS = 2;  // Iteratively reweighted least squares.
}
optional AffineEstimation affine_estimation = 30
[default = ESTIMATION_AFFINE_NONE];
enum HomographyEstimation {
  ESTIMATION_HOMOG_NONE = 0;  // Homography is not estimated.
  ESTIMATION_HOMOG_L2 = 1;    // L2 estimation.
  ESTIMATION_HOMOG_IRLS = 2;  // Iteratively reweighted least squares.
}
optional HomographyEstimation homography_estimation = 5
[default = ESTIMATION_HOMOG_IRLS];
// By default, homography estimation minimizes an objective that is not
// strictly the L2 distance between matched points. If the flag is set, each
// row of the linear system is scaled with the exact denominator which results
// in an objective that minimizes the L2 distance.
optional bool homography_exact_denominator_scaling = 53 [default = false];
// Per default, we use exact solver for over-determined system using
// well-conditioned QR decomposition. For better speed, set value to false to
// use estimation via normal equations.
optional bool use_exact_homography_estimation = 54 [default = true];
// If set uses double instead of float when computing normal equations.
optional bool use_highest_accuracy_for_normal_equations = 55 [default = true];
// Regularizer for perspective part of the homography. If zero, no
// regularization is performed. Should be >= 0.
optional float homography_perspective_regularizer = 61 [default = 0];
// Note: Mixture models have high DOF and are much more affected by outliers
// than the models above. It is recommended that if IRLS estimation is NOT
// used, mixture_regularizer is increased by a factor >= 3.
enum MixtureHomographyEstimation {
  ESTIMATION_HOMOG_MIX_NONE = 0;  // Mixture homography is not estimated.
  ESTIMATION_HOMOG_MIX_L2 = 1;    // L2 estimation.
  ESTIMATION_HOMOG_MIX_IRLS = 2;  // Robust to outliers.
}
optional MixtureHomographyEstimation mix_homography_estimation = 12
[default = ESTIMATION_HOMOG_MIX_NONE];
// If row-wise mixture models are estimated, determines number of them.
// Note, changing number of mixtures, interpolation sigma and regularizer
// is very likely to impact the stability analysis for mixtures and rolling
// shutter scoring. At least MixtureHomographyBounds would need to be adjusted
// to the new values.
optional int32 num_mixtures = 13 [default = 10];
// If row-wise mixture models are estimated, determines how much each point is
// influenced by its neighboring mixtures. Specified as relative sigma
// (standard deviation) w.r.t. frame_height.
optional float mixture_row_sigma = 14 [default = 0.1];
// Mixture estimation uses L2 regularizer to assure that adjacent mixture
// models are similar.
optional float mixture_regularizer = 15 [default = 1e-4];
// Mixtures are estimated across a spectrum of exponentially increasingly
// regularizers. In particular the regularizer at level L is given as
// mixture_regularizer * mixture_regularizer_base^L.
// A maximum of 10 levels are supported (checked!).
// Note: When changing the number of levels you probably want to adapt the
// MotionStabilizationOptions::rolling_shutter_increment value as well,
// as the number of levels directly controls the highest threshold for
// the rolling shutter index analysis.
optional float mixture_regularizer_levels = 42 [default = 3];
optional float mixture_regularizer_base = 43 [default = 2.2];
optional int32 mixture_rs_analysis_level = 44 [default = 2];
// IRLS rounds to down-weight outliers (default across all models).
// Note: IRLS in combination with full mixture models (as opposed to the
// default reduced ones) is somewhat expensive.
optional int32 irls_rounds = 17 [default = 10];
// If set to > 0 (always needs to be less than 1.0), influence of supplied
// prior irls weights is linearly decreased from the specified prior scale
// (weight 1.0) to prior_scale. Effectively, biases the solution towards the
// supplied prior features.
// Note: Without irls_weights_preinitialized set to true, this option is
// effectively a no op.
// TODO: Retire this option.
optional float irls_prior_scale = 50 [default = 0.2];
// Determine how to normalize irls weights w.r.t. average motion magnitude.
// In general a residual of 1 pixel is assigned an IRLS weight of 1.
// However as larger motions in general are affected by a larger error, we
// normalize irls weights, such that a residual
// of distance of irls_motion_magnitude_fraction times
// <average translation magnitude> equals an IRLS weight of 1.
// Must be larger than zero.
optional float irls_motion_magnitude_fraction = 31 [default = 0.08];
// Scale that is applied for mixture (where error is expected to be bigger).
optional float irls_mixture_fraction_scale = 68 [default = 1.5];
// By default, irls weight of all features are set uniformly to one before
// estimating EACH model, refining them in subsequent irls iterations.
// If flag below is set, input irls weights are used instead for each motion
// model.
optional bool irls_weights_preinitialized = 39 [default = false];
// If weights are pre-initialized optionally min filter weights along track
// ids when long tracks are used. This can be used to consistently label
// outliers in time before estimation.
optional bool filter_initialized_irls_weights = 67 [default = false];
// If activated, irls weight of outlier features are reset. Outliers are
// defined as those features, for which the best model fit after #rounds
// iterations of RANSAC did NOT yield an error lower than cutoff.
// Only applies to translation and similarity estimation.
message IrlsOutlierInitialization {
  // Whether the RANSAC-based outlier reset is performed at all.
  optional bool activated = 1 [default = false];
  // Number of RANSAC rounds used to determine the best model fit.
  optional int32 rounds = 2 [default = 100];
  // Error cutoff below which a feature counts as an inlier of the best model
  // (presumably w.r.t. normalized coordinates, as for
  // lin_sim_inlier_threshold -- TODO confirm).
  optional float cutoff = 3 [default = 0.003];
}
optional IrlsOutlierInitialization irls_initialization = 56;
// Normalizes feature's irls weights prior to estimation such that
// feature in high density areas are downweighted. Multiplicative in case
// irls_weights_preinitialized is set to true.
optional bool feature_density_normalization = 62 [default = false];
// A regular grid of size feature_mask_size x feature_mask_size
// is used to normalize features w.r.t. their density.
optional int32 feature_mask_size = 63 [default = 10];
// In addition to above outlier and density initialization, long features
// that are present for a specified ratio of the analysis interval can be
// upweighted. This greatly improves temporal consistency.
message LongFeatureInitialization {
  // Whether long-feature upweighting is performed at all.
  optional bool activated = 1 [default = false];
  // Tracks with a length greater or equal to the specified percentile
  // are upweighted by the specified upweight_multiplier.
  optional float min_length_percentile = 2 [default = 0.95];
  // Features passing the above test have their irls weight increased by the
  // specified multiplier prior to estimation.
  optional float upweight_multiplier = 3 [default = 5];
}
optional LongFeatureInitialization long_feature_initialization = 66;
// Irls initialization can be performed in a temporally dependent manner
// (if estimation_policy() == TEMPORAL_IRLS_MASK), where
// the previous frame's motion estimation biases the IrlsInitialization of the
// currently processed frame. In particular the location and magnitude of
// inliers is used during the RANSAC selection stage, to favor those features
// that agree with the prior, represented as confidence mask of inliers
// (using same dimension as above feature_mask_size).
// After estimation, the prior is updated.
message IrlsMaskOptions {
  // Amount the prior is decayed after each iteration.
  optional float decay = 2 [default = 0.7];
  // Score that each inlier adds to the current prior. Specified w.r.t. total
  // number of features, i.e. each feature increases a bin's score by
  // inlier_score.
  optional float inlier_score = 3 [default = 0.4];
  // Each inlier scores at least this value regardless of the inlier mask
  // (additive).
  optional float base_score = 4 [default = 0.2];
  // Motions are scored relative to the previous motion. Threshold denotes
  // the absolute minimum of the denominator.
  optional float min_translation_norm = 5 [default = 2e-3];
  // Translation is updated in every step by blending it with the previously
  // estimated translation. (alpha is within 0 to 1, where 0 indicates to use
  // only the measured translation, i.e. no blending).
  optional float translation_blend_alpha = 6 [default = 0.7];
  // Every time translation is updated, the prior (in [0, 1]) is increased by
  // the specified amount.
  optional float translation_prior_increase = 7 [default = 0.2];
  // Deprecated fields.
  extensions 1;
}
optional IrlsMaskOptions irls_mask_options = 57;
// Describes how long feature tracks are leveraged for joint estimation across
// many frames.
message JointTrackEstimationOptions {
  // For each frame-pair motion model, describing the motion between frame
  // I and I - 1, estimate in addition several additional motion
  // models along long feature tracks describing the motion between frame
  // I and I - k * motion_stride (additional models are not output,
  // but help to filter irls weights).
  // Specifies total number of estimated motion models per frame-pair. Must be
  // greater than zero.
  optional int32 num_motion_models = 1 [default = 3];
  // Spacing in frames for additional motion models.
  optional int32 motion_stride = 2 [default = 15];
  // If set, performs temporal smoothing across frames of the obtained irls
  // weights.
  optional bool temporal_smoothing = 3 [default = false];
  // TODO: Specify which filter is used during temporal smoothing.
  // TODO: Limit joint estimation to static scenes
  //       (don't violate rigidity of alignment in case of wide
  //       baselines). Adapt stride and num_motion_models
  //       accordingly.
}
optional JointTrackEstimationOptions joint_track_estimation = 59;
// Options being used to bias IRLS features if estimation mode
// TEMPORAL_LONG_FEATURE_BIAS is being used.
// Next Tag: 15
message LongFeatureBiasOptions {
  // Estimation is performed multiple times, alternating between model
  // estimation and smooth temporal feature biasing for the specified number
  // of rounds.
  optional int32 total_rounds = 13 [default = 1];
  // Controls how fast the bias for a track gets updated, in case the feature
  // is an inlier. Use higher values for less decay of background motion over
  // time.
  optional float inlier_bias = 1 [default = 0.98];
  // Same as above for outliers (or features with low prior), i.e. those that
  // got recently seeded.
  optional float outlier_bias = 2 [default = 0.7];
  // Number of elements after which we deem estimation to be stable.
  // Used to control the weight of the bias if fewer than the specified number
  // have been observed. Also used as maximum ring buffer size (only the most
  // recent number of observations are kept). Must be > 0.
  optional int32 num_irls_observations = 3 [default = 10];
  // Change in irls weight magnitude (from outlier to inlier) above which we
  // reset the current bias.
  optional float max_irls_change_ratio = 4 [default = 10.0];
  // Irls weight above which we consider a feature to be an inlier for bias
  // update purposes (see above inlier and outlier bias). By default, outliers
  // are allowed to update their bias faster than inliers. Must be > 0.
  optional float inlier_irls_weight = 5 [default = 0.2];
  // Standard deviation used during feature initialization. Current bias of a
  // track is used to pre-weight features via gaussian weighting with the
  // specified standard deviation.
  optional float bias_stdev = 12 [default = 1.0];
  // When seeding new tracks (on the first frame), we bilaterally pool
  // neighboring feature biases as seed. Details are controlled by the options
  // below. If false, the feature's estimation error is used instead
  // (faster, but less spatially smooth).
  // If activated it is advised to use a patch descriptor radius of at least
  // 20 pixels.
  optional bool use_spatial_bias = 6 [default = true];
  // Newly observed tracks' biases are seeded by similar-looking features
  // in close spatial proximity. For efficiency a grid is used to determine
  // proximity.
  // Grid size in normalized coordinates w.r.t. frame domain.
  optional float grid_size = 7 [default = 0.04];
  // Sigmas for combining feature biases.
  optional float spatial_sigma = 8 [default = 0.02];
  optional float color_sigma = 9 [default = 20.0];
  // Defines what we consider to be a long track. Features spawned around
  // locations of similar-looking long tracks are considered to have a
  // high prior, e.g. their initialization is given more weight.
  optional int32 long_track_threshold = 10 [default = 30];
  // Determines what fraction of long tracks is considered to be sufficient
  // for a highly confident bias seed.
  optional float long_track_confidence_fraction = 11 [default = 0.25];
  // If activated, uses the irls weights from the estimation of the lower
  // degree of freedom model to seed the bias of the higher degree of freedom
  // model. This improves rigidity of the computed motion.
  optional bool seed_priors_from_bias = 14 [default = false];
}
optional LongFeatureBiasOptions long_feature_bias_options = 64;
// Controls how multiple models via EstimateMotionsParallel are estimated.
enum EstimationPolicy {
  INDEPENDENT_PARALLEL = 1;        // Models are estimated independently
                                   // across frames in parallel.
  TEMPORAL_IRLS_MASK = 2;          // Previous frame's estimation biases the
                                   // current one, controlled via above
                                   // IrlsMaskOptions.
  TEMPORAL_LONG_FEATURE_BIAS = 4;  // Frame's estimation is biased along
                                   // long features, controlled via above
                                   // LongFeatureBiasOptions.
  JOINTLY_FROM_TRACKS = 3;         // Estimation is performed jointly over
                                   // chunks of frames, exercising a
                                   // consistent irls weight PER track.
                                   // Expects as input
                                   // RegionFlowFeatureList's with
                                   // long_tracks. Controlled via above
                                   // JointTrackEstimationOptions.
}
optional EstimationPolicy estimation_policy = 58
[default = INDEPENDENT_PARALLEL];
optional int32 coverage_grid_size = 51 [default = 10];
// Degree of freedom of estimated homography mixtures. If desired, specific
// parts of the homography can be held constant across the mixture.
// For fast draft TRANSLATION_MIXTURE is recommended, for high quality
// SKEW_ROTATION_MIXTURE.
enum MixtureModelMode {
FULL_MIXTURE = 0; // 8 dof * num_mixtures
TRANSLATION_MIXTURE = 1; // 6 dof + 2 dof * num_mixtures
SKEW_ROTATION_MIXTURE = 2; // 4 dof + 4 dof * num_mixtures
}
optional MixtureModelMode mixture_model_mode = 23
[default = SKEW_ROTATION_MIXTURE];
// If specified, only features that agree with the estimated linear similarity
// will be used to estimate the homography.
// If set, linear_similarity_estimation can not be ESTIMATION_NONE! (checked)
optional bool use_only_lin_sim_inliers_for_homography = 6 [default = true];
// Max. deviation to be considered an inlier w.r.t. estimated similarity for
// above flag. This value is set w.r.t. normalized frame diameter.
// TODO: Should take GetIRLSResidualScale into account.
optional float lin_sim_inlier_threshold = 20 [default = 0.003];
// If any parameter of the input flow or estimated translation exceeds these
// thresholds we deem the motion INVALID.
message TranslationBounds {
// Absolute minimum of features present.
optional int32 min_features = 1 [default = 3];
// Max magnitude of the translation expressed w.r.t. frame diameter
optional float frac_max_motion_magnitude = 2 [default = 0.15];
// Motion magnitude is only tested for if standard deviation of estimated
// translation exceeds threshold.
optional float max_motion_stdev_threshold = 4 [default = 0.01];
// Max standard deviation of the estimated translation (normalized to frame
// diameter).
optional float max_motion_stdev = 3 [default = 0.065];
// Maximum acceleration between frames. Specified relative to minimum
// velocity across two adjacent frames (absolute minimum of 0.001 is
// enforced, ~1 pix for 480p).
// If exceeded for one frame, the whole batch passed to
// EstimateMotionsParallel is labeled unstable.
optional float max_acceleration = 5 [default = 20.0];
}
optional TranslationBounds stable_translation_bounds = 32;
// If any test/bound is violated, the motion is deemed UNSTABLE.
message SimilarityBounds {
// Input frame has to be labeled stable, i.e. enough features and coverage
// present.
optional bool only_stable_input = 1 [default = true];
// Minimum number of inlier features (absolute and as fraction of total
// number of features).
// TODO: Dataset run setting this to 0.15
optional float min_inlier_fraction = 2 [default = 0.2];
optional float min_inliers = 3 [default = 30];
// Bounds on valid similarities. We use larger values compared to
// homographies. Note: Bounds are necessary, to guarantee invertability
// of the resulting similarity.
optional float lower_scale = 4 [default = 0.8];
optional float upper_scale = 5 [default = 1.25]; // 1 / 0.8.
optional float limit_rotation = 6 [default = 0.25]; // 15 degrees.
// Thresholds for a feature to be considered inlier w.r.t similarity
// transform, expressed in terms of pixel residual error. Max of absolute
// and fractional thresholds is used.
    // Ratio of inliers that pass regular and strict thresholds is stored in
// CameraMotion.
//
// TODO: Just use lin_sim_inlier_threshold directly, however that
// recomputes the error, and requires regression testing. Using an extra
// fractional inlier threshold for now.
//
// Absolute in pixels.
optional float inlier_threshold = 7 [default = 4.0];
// Scaled by frame diameter.
optional float frac_inlier_threshold = 8 [default = 0];
// TODO: Revisit after frame selection change.
// Absolute in pixels.
optional float strict_inlier_threshold = 9 [default = 0.5];
}
optional SimilarityBounds stable_similarity_bounds = 33;
// If any parameter of the estimated homography exceeds these bounds,
// we deem it UNSTABLE_SIM and use estimated similarity instead.
message HomographyBounds {
optional float lower_scale = 1 [default = 0.8];
optional float upper_scale = 2 [default = 1.25]; // 1 / 0.8.
optional float limit_rotation = 3 [default = 0.25]; // 15 degrees.
optional float limit_perspective = 4 [default = 0.0004];
// Inlier coverage is only tested for if average homography error exceeds
// registration_thresholds. Max of the following two thresholds is used.
//
// Absolute in pixels.
optional float registration_threshold = 5 [default = 0.1];
// Scaled by frame diameter.
optional float frac_registration_threshold = 8 [default = 0];
// Minimum fraction of inlier features w.r.t. frame area.
optional float min_inlier_coverage = 6 [default = 0.3];
// Grid coverage inlier threshold. Pixel errors below this
// threshold are considered inliers. Defined w.r.t. frame diameter, approx.
// 1.5 for 16:9 SD video (480p), i.e. threshold is multiplied by frame
// diameter.
optional float frac_inlier_threshold = 7 [default = 2.0e-3];
}
optional HomographyBounds stable_homography_bounds = 11;
// If any parameter of the estimated homography mixture exceeds these bounds,
// we deem it UNSTABLE_HOMOG and use the estimated homography instead.
message MixtureHomographyBounds {
// Minimum fraction of inlier features w.r.t. block area.
optional float min_inlier_coverage = 1 [default = 0.4];
// Each block is tested to be stable, regarding the outliers.
// A frame is labeled unstable, if more or equal than the specified adjacent
// blocks are labeled outliers.
optional int32 max_adjacent_outlier_blocks = 2 [default = 5];
// Maximum number of adjacent empty blocks (no inliers).
optional int32 max_adjacent_empty_blocks = 3 [default = 3];
// Grid coverage threshold inlier threshold. See identical parameter in
// HomographyBounds.
optional float frac_inlier_threshold = 7 [default = 2.5e-3];
}
optional MixtureHomographyBounds stable_mixture_homography_bounds = 34;
// Scale for stricter coverage evaluation. Used for rolling shutter guess
// computation, by only using high quality inliers. Larger values reflect
// stricter coverage.
// Specifically, when computing coverage via GridCoverage call,
// frac_inlier_threshold is reduced (divided) by specified scale below.
optional float strict_coverage_scale = 41 [default = 1.333];
// By default frames with zero trackable features (e.g. at the beginning,
// empty frame or shot boundary) are set identity model but still labeled as
// valid. If set to false, these frames are flagged as invalid, which can be
// useful to locate shot boundaries, etc.
optional bool label_empty_frames_as_valid = 22 [default = true];
// Setting for temporal smoothing of irls weights in optional post-processing
// step.
// In normalized coordinates w.r.t. frame domain.
optional float feature_grid_size = 24 [default = 0.05];
optional float spatial_sigma = 25 [default = 0.01];
// Frame diameter across which smoothing is performed.
optional int32 temporal_irls_diameter = 26 [default = 20];
optional float temporal_sigma = 27 [default = 5]; // in frames.
// Bilateral weight (for un-normalized color domain [0, .. 255]).
optional float feature_sigma = 28 [default = 30.0];
// If set to false 3 taps are used.
optional bool filter_5_taps = 29 [default = false];
// If set, during temporal smoothing, each frame is weighted by its
// confidence, defined as the square coverage (or square mean mixture
  // coverage). Therefore, low confidence fits do not erroneously propagate
  // over time. In addition, if the confidence is below the specified
  // confidence_threshold (relative to the maximum coverage observed in the
  // test interval), irls weights are reset to 1, i.e. biased to
  // agree with the (unknown) background motion.
optional bool frame_confidence_weighting = 48 [default = true];
optional float reset_confidence_threshold = 49 [default = 0.4];
// Filters irls weights before smoothing them according to specified
// operation.
enum IRLSWeightFilter {
IRLS_FILTER_NONE = 0;
IRLS_FILTER_TEXTURE = 1;
IRLS_FILTER_CORNER_RESPONSE = 2;
}
// Calls TextureFilteredRegionFlowFeatureIRLSWeights on computed irls weights
// before smoothing them.
optional IRLSWeightFilter irls_weight_filter = 35
[default = IRLS_FILTER_NONE];
// Attempts to detect overlays, i.e. static elements burned-into the video
// that potentially corrupt motion estimation.
optional bool overlay_detection = 36 [default = false];
// Overlay detection is performed over specified number of frames.
optional int32 overlay_analysis_chunk_size = 37 [default = 8];
message OverlayDetectionOptions {
// Potential overlay features are aggregated over a mask with cells
// mask_size x mask_size as specified below.
optional int32 analysis_mask_size = 1 [default = 10];
// There are two types of candidates of overlay features, strict and loose
// ones. Strict candidates are used to determine if a grid bin should be
// considered an overlay cell. If the grid is an overlay cell, *all*
// candidates, i.e. strict and loose ones will be flagged as overlay
// features by setting their corresponding irls weight to zero.
// A feature is a strict overlay feature if its motion is less than
// near_zero_motion and AND less than max_translation_ratio times the
// estimated translation magnitude at that frame AND is texturedness is
// sufficiently high.
optional float strict_near_zero_motion = 2 [default = 0.2];
optional float strict_max_translation_ratio = 3 [default = 0.2];
// Minimum texturedness of a feature to be considered an overlay.
// Motivation: Overlays are mostly text or graphics, i.e. have visually
// distinguished features.
optional float strict_min_texturedness = 5 [default = 0.1];
// A feature is a loose overlay feature if its motion is less than
// loose_near_zero_motion.
optional float loose_near_zero_motion = 4 [default = 1.0];
// Minimum fraction of strict overlay features within a cell to be
// considered an overlay cell.
optional float overlay_min_ratio = 6 [default = 0.3];
// Absolute minimum number of strict overlay features within a cell to be
    // considered an overlay cell.
optional float overlay_min_features = 7 [default = 10];
}
optional OverlayDetectionOptions overlay_detection_options = 38;
// Shot boundaries are introduced in 3 different scenarios:
// a) Frame has zero tracked features w.r.t. previous frame
// b) Estimated motion is deemed invalid (CameraMotion::INVALID).
// c) Visual consistency is above threshold of two adjacent frames.
message ShotBoundaryOptions {
// After cases a & b are determined from features/camera motion, they
// are verified by ensuring visual consistency is above specified threshold,
// if visual consistency has been computed. Only if this is case will the
// frame be labeled as shot boundary. Motivation is, that there should
// always be some (even small) measurable increase in the frame difference
// at a shot boundary.
// Verification is only performed if visual_consistency has been evaluated
// (value >= 0).
optional float motion_consistency_threshold = 1 [default = 0.02];
// Threshold for case c). Sometimes, motion estimation will miss shot
// boundaries. We define shot boundaries for which the visual consistency is
// higher than the specified threshold for at least two adjacent frames.
optional float appearance_consistency_threshold = 2 [default = 0.075];
}
optional ShotBoundaryOptions shot_boundary_options = 60;
// By default, irls weights of each feature are overwritten with refined irls
// weights of the last iteration for the highest degree of freedom model that
// was estimated stable. If set to false, original irls weights are retained.
// Note: If overlay detection is activated, features to be deemed overlays
// have their irls weight set to zero, regardless of this setting.
  // Similarly, an IRLSWeightFilter is applied if requested, regardless
// of this setting.
optional bool output_refined_irls_weights = 40 [default = true];
// Weight initialization for homography estimation. This is to bias homography
// estimation either to foreground or background.
enum HomographyIrlsWeightInitialization {
IRLS_WEIGHT_CONSTANT_ONE = 1; // Constant, treat all features equally.
IRLS_WEIGHT_CENTER_GAUSSIAN = 2; // Weight features in the center higher.
// Tends to lock onto foreground.
IRLS_WEIGHT_PERIMETER_GAUSSIAN = 3; // Weight features around the
// perimeter higher, tends to lock onto
// background and improves rigidity of
// the perspective degree of freedom.
}
// IRLS weights for homography estimation are initialized based on the
// specified options. If, options irls_weights_preinitialized is set,
// weights are multiplied instead of reset.
optional HomographyIrlsWeightInitialization
homography_irls_weight_initialization = 45
[default = IRLS_WEIGHT_PERIMETER_GAUSSIAN];
// If set to false use L1 norm irls weights instead of L0 norm irls weights.
optional bool irls_use_l0_norm = 46 [default = true];
// IRLS weights are determined in a limited domain (in particular helpful
// for stabilization analysis on HD videos).
// TODO: Make this the default.
optional bool domain_limited_irls_scaling = 65 [default = false];
// For comparison and debugging purposes. Simply estimates requested models
// without checking their stability via the stable_*_bounds parameters.
// However, invertibility is still checked to avoid invalid data being passed
// to later stages of the stabilizer.
optional bool deactivate_stable_motion_estimation = 47 [default = false];
// Projects higher order motions if estimated correctly down to lower order
// motions, therefore replacing the previously estimated motions.
optional bool project_valid_motions_down = 52 [default = false];
// DEPRECATED functionality. Use static functions as indicated instead.
//
// Non-linear similarity, use MotionEstimation::EstimateSimilarityModelL2.
optional bool estimate_similarity = 2 [deprecated = true];
// Deprecated fields.
extensions 7, 8, 16;
}

View File

@ -0,0 +1,924 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/motion_models.h"
#include <stdlib.h>
#include <cmath>
#include <string>
#include <utility>
#include "Eigen/Core"
#include "Eigen/Dense"
#include "absl/strings/str_format.h"
// Set to true to use catmull rom mixture weights instead of Gaussian weights
// for homography mixture estimation.
// NOTE: compile-time switch only; not wired to a runtime command-line flag.
bool flags_catmull_rom_mixture_weights = false;
namespace mediapipe {
// Formats the translation as "dx dy" using fixed-width floats.
std::string ModelAdapter<TranslationModel>::ToString(
    const TranslationModel& model) {
  return absl::StrFormat("%7f %7f", model.dx(), model.dy());
}
// Embeds the translation into an affine model with identity linear part.
AffineModel ModelAdapter<TranslationModel>::ToAffine(
    const TranslationModel& model) {
  return AffineAdapter::FromArgs(model.dx(), model.dy(), 1, 0, 0, 1);
}
// Extracts the translation from an affine model. Debug-checks that the
// linear part is exactly identity, so no information is silently dropped.
TranslationModel ModelAdapter<TranslationModel>::FromAffine(
    const AffineModel& model) {
  DCHECK_EQ(model.a(), 1);
  DCHECK_EQ(model.b(), 0);
  DCHECK_EQ(model.c(), 0);
  DCHECK_EQ(model.d(), 1);
  return TranslationAdapter::FromArgs(model.dx(), model.dy());
}
// Embeds the translation into a homography via its affine embedding.
Homography ModelAdapter<TranslationModel>::ToHomography(
    const TranslationModel& model) {
  return HomographyAdapter::FromAffine(ToAffine(model));
}
// Reduces a homography to a translation via the affine adapters; the called
// adapters debug-check that the model is in fact purely translational.
TranslationModel ModelAdapter<TranslationModel>::FromHomography(
    const Homography& model) {
  return TranslationAdapter::FromAffine(HomographyAdapter::ToAffine(model));
}
// Writes the row-major 2x2 Jacobian of the transform w.r.t. the parameters
// (dx, dy) at pt. Since x' = x + dx and y' = y + dy, this is the identity
// matrix, independent of pt.
void ModelAdapter<TranslationModel>::GetJacobianAtPoint(const Vector2_f& pt,
                                                        float* jacobian) {
  DCHECK(jacobian);
  jacobian[0] = 1;
  jacobian[1] = 0;
  jacobian[2] = 0;
  jacobian[3] = 1;
}
// Returns the identity: a pure translation has no scale parameters to
// normalize w.r.t. the frame size.
TranslationModel ModelAdapter<TranslationModel>::NormalizationTransform(
    float frame_width, float frame_height) {
  // Independent of frame size.
  return TranslationModel();
}
// Returns the parameter-wise maximum of the two translations.
TranslationModel ModelAdapter<TranslationModel>::Maximum(
    const TranslationModel& lhs, const TranslationModel& rhs) {
  return FromArgs(std::max(lhs.dx(), rhs.dx()), std::max(lhs.dy(), rhs.dy()));
}
// Projects a linear similarity down to its translational component.
TranslationModel ModelAdapter<TranslationModel>::ProjectFrom(
    const LinearSimilarityModel& model, float frame_width, float frame_height) {
  return LinearSimilarityAdapter::ProjectToTranslation(model, frame_width,
                                                       frame_height);
}
// Projects an affine model to a translation by chaining the
// affine -> linear similarity -> translation projections.
TranslationModel ModelAdapter<TranslationModel>::ProjectFrom(
    const AffineModel& model, float frame_width, float frame_height) {
  return ProjectFrom(AffineAdapter::ProjectToLinearSimilarity(
                         model, frame_width, frame_height),
                     frame_width, frame_height);
}
// Projects a homography to a translation by chaining the
// homography -> affine -> translation projections.
TranslationModel ModelAdapter<TranslationModel>::ProjectFrom(
    const Homography& model, float frame_width, float frame_height) {
  return ProjectFrom(
      HomographyAdapter::ProjectToAffine(model, frame_width, frame_height),
      frame_width, frame_height);
}
// Similarity model.
// Similarity model.
// Builds a similarity from explicit (dx, dy, scale, rotation) parameters.
SimilarityModel ModelAdapter<SimilarityModel>::FromArgs(float dx, float dy,
                                                        float scale,
                                                        float rotation) {
  SimilarityModel model;
  model.set_dx(dx);
  model.set_dy(dy);
  model.set_scale(scale);
  model.set_rotation(rotation);
  return model;
}
// Builds a similarity from 4 packed floats (dx, dy, scale, rotation).
// If identity_parametrization is set, args[2] is an offset from unit scale.
SimilarityModel ModelAdapter<SimilarityModel>::FromFloatPointer(
    const float* args, bool identity_parametrization) {
  DCHECK(args);
  SimilarityModel model;
  model.set_dx(args[0]);
  model.set_dy(args[1]);
  model.set_scale((identity_parametrization ? 1.f : 0.f) + args[2]);
  model.set_rotation(args[3]);
  return model;
}
// Double-precision counterpart of FromFloatPointer; same parameter layout.
SimilarityModel ModelAdapter<SimilarityModel>::FromDoublePointer(
    const double* args, bool identity_parametrization) {
  DCHECK(args);
  SimilarityModel model;
  model.set_dx(args[0]);
  model.set_dy(args[1]);
  model.set_scale((identity_parametrization ? 1.f : 0.f) + args[2]);
  model.set_rotation(args[3]);
  return model;
}
// Applies pt' = scale * R(rotation) * pt + (dx, dy).
Vector2_f ModelAdapter<SimilarityModel>::TransformPoint(
    const SimilarityModel& model, const Vector2_f& pt) {
  // Setup rotation part.
  const float c_r = std::cos(model.rotation());
  const float c_s = std::sin(model.rotation());
  const Vector2_f pt_rot(c_r * pt.x() - c_s * pt.y(),
                         c_s * pt.x() + c_r * pt.y());
  return pt_rot * model.scale() + Vector2_f(model.dx(), model.dy());
}
// Inverts the similarity. On failure (degenerate scale) logs an error and
// falls back to the identity model.
SimilarityModel ModelAdapter<SimilarityModel>::Invert(
    const SimilarityModel& model) {
  bool invertible = true;
  const SimilarityModel inverse = InvertChecked(model, &invertible);
  if (invertible) {
    return inverse;
  }
  LOG(ERROR) << "Model not invertible. Returning identity.";
  return SimilarityModel();
}
// Inverts the similarity: (s, R, t)^-1 = (1/s, R^-1, -(1/s) R^-1 t).
// Sets *success to false and returns identity if |scale| is near zero.
SimilarityModel ModelAdapter<SimilarityModel>::InvertChecked(
    const SimilarityModel& model, bool* success) {
  SimilarityModel inv_model;
  inv_model.set_rotation(-model.rotation());
  if (fabs(model.scale()) > kDetInvertibleEps) {
    inv_model.set_scale(1.0 / model.scale());
    *success = true;
  } else {
    *success = false;
    VLOG(1) << "Model is not invertible.";
    return SimilarityModel();
  }
  // Rotate and scale [dx, dy] by inv_model.
  const float c_r = std::cos(inv_model.rotation());
  const float c_s = std::sin(inv_model.rotation());
  const Vector2_f pt_rot(c_r * model.dx() - c_s * model.dy(),
                         c_s * model.dx() + c_r * model.dy());
  const Vector2_f inv_trans = -pt_rot * inv_model.scale();
  inv_model.set_dx(inv_trans.x());
  inv_model.set_dy(inv_trans.y());
  return inv_model;
}
// Composition lhs o rhs: scales multiply, rotations add, and rhs's
// translation is rotated/scaled by lhs before adding lhs's translation.
SimilarityModel ModelAdapter<SimilarityModel>::Compose(
    const SimilarityModel& lhs, const SimilarityModel& rhs) {
  SimilarityModel result;
  result.set_scale(lhs.scale() * rhs.scale());
  result.set_rotation(lhs.rotation() + rhs.rotation());
  // Apply lhs scale and rot to rhs translation.
  const float c_r = std::cos(lhs.rotation());
  const float c_s = std::sin(lhs.rotation());
  const Vector2_f trans_rot(c_r * rhs.dx() - c_s * rhs.dy(),
                            c_s * rhs.dx() + c_r * rhs.dy());
  const Vector2_f trans_concat =
      trans_rot * lhs.scale() + Vector2_f(lhs.dx(), lhs.dy());
  result.set_dx(trans_concat.x());
  result.set_dy(trans_concat.y());
  return result;
}
// Returns the id-th parameter in the order (dx, dy, scale, rotation).
// LOG(FATAL)s on out-of-range ids.
float ModelAdapter<SimilarityModel>::GetParameter(const SimilarityModel& model,
                                                  int id) {
  switch (id) {
    case 0:
      return model.dx();
    case 1:
      return model.dy();
    case 2:
      return model.scale();
    case 3:
      return model.rotation();
    default:
      LOG(FATAL) << "Parameter id is out of bounds";
  }
  return 0;  // Unreachable; keeps the compiler happy.
}
// Formats the similarity as "dx dy scale rotation" fixed-width floats.
std::string ModelAdapter<SimilarityModel>::ToString(
    const SimilarityModel& model) {
  return absl::StrFormat("%7f %7f %7f %7f", model.dx(), model.dy(),
                         model.scale(), model.rotation());
}
// Returns a similarity that scales by the inverse frame diameter (no
// translation or rotation), used to map models to normalized coordinates.
SimilarityModel ModelAdapter<SimilarityModel>::NormalizationTransform(
    float frame_width, float frame_height) {
  const float scale = std::hypot(frame_width, frame_height);
  DCHECK_NE(scale, 0);
  return SimilarityAdapter::FromArgs(0, 0, 1.0 / scale, 0);
}
// Projects to a translation by converting to a linear similarity first.
TranslationModel ModelAdapter<SimilarityModel>::ProjectToTranslation(
    const SimilarityModel& model, float frame_width, float frame_height) {
  return LinearSimilarityAdapter::ProjectToTranslation(
      LinearSimilarityAdapter::FromSimilarity(model), frame_width,
      frame_height);
}
// Formats the linear similarity as "dx dy a b" fixed-width floats.
std::string ModelAdapter<LinearSimilarityModel>::ToString(
    const LinearSimilarityModel& model) {
  return absl::StrFormat("%7f %7f %7f %7f", model.dx(), model.dy(), model.a(),
                         model.b());
}
// Embeds the linear similarity into an affine model; the linear part is the
// conformal matrix [a -b; b a].
AffineModel ModelAdapter<LinearSimilarityModel>::ToAffine(
    const LinearSimilarityModel& model) {
  return AffineAdapter::FromArgs(model.dx(), model.dy(), model.a(), -model.b(),
                                 model.b(), model.a());
}
// Inverse of ToAffine; debug-checks the affine is of conformal form
// (a == d, b == -c) so the reduction is lossless.
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::FromAffine(
    const AffineModel& model) {
  DCHECK_EQ(model.a(), model.d());
  DCHECK_EQ(model.b(), -model.c());
  return LinearSimilarityAdapter::FromArgs(model.dx(), model.dy(), model.a(),
                                           -model.b());
}
// Embeds the linear similarity into a homography via its affine embedding.
Homography ModelAdapter<LinearSimilarityModel>::ToHomography(
    const LinearSimilarityModel& model) {
  return HomographyAdapter::FromAffine(ToAffine(model));
}
// Reduces a homography to a linear similarity via the affine adapters;
// debug-checks in the called adapters enforce the model has that form.
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::FromHomography(
    const Homography& model) {
  return LinearSimilarityAdapter::FromAffine(
      HomographyAdapter::ToAffine(model));
}
// Converts (a, b) to polar form: scale = |(a, b)|, rotation = atan2(b, a).
SimilarityModel ModelAdapter<LinearSimilarityModel>::ToSimilarity(
    const LinearSimilarityModel& model) {
  const float scale = std::hypot(model.a(), model.b());
  return SimilarityAdapter::FromArgs(model.dx(), model.dy(), scale,
                                     std::atan2(model.b(), model.a()));
}
// Inverse of ToSimilarity: a = s * cos(theta), b = s * sin(theta).
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::FromSimilarity(
    const SimilarityModel& model) {
  return LinearSimilarityAdapter::FromArgs(
      model.dx(), model.dy(), model.scale() * std::cos(model.rotation()),
      model.scale() * std::sin(model.rotation()));
}
// Multiplies every parameter of the model by the given scale factor.
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::ScaleParameters(
    const LinearSimilarityModel& model_in, float scale) {
  LinearSimilarityModel scaled;
  scaled.set_a(model_in.a() * scale);
  scaled.set_b(model_in.b() * scale);
  scaled.set_dx(model_in.dx() * scale);
  scaled.set_dy(model_in.dy() * scale);
  return scaled;
}
// Adds the identity transform to a difference model, i.e. a += 1
// (identity linear similarity has a = 1, all other parameters 0).
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::AddIdentity(
    const LinearSimilarityModel& model_in) {
  LinearSimilarityModel model = model_in;
  model.set_a(model.a() + 1);
  return model;
}
// Writes the row-major 2x4 Jacobian of the transformed point w.r.t. the
// parameters (dx, dy, a, b) at pt, for x' = a*x - b*y + dx,
// y' = b*x + a*y + dy.
void ModelAdapter<LinearSimilarityModel>::GetJacobianAtPoint(
    const Vector2_f& pt, float* jacobian) {
  DCHECK(jacobian);
  // First row.
  jacobian[0] = 1;
  jacobian[1] = 0;
  jacobian[2] = pt.x();
  jacobian[3] = -pt.y();
  // Second row.
  jacobian[4] = 0;
  jacobian[5] = 1;
  jacobian[6] = pt.y();
  jacobian[7] = pt.x();
}
// Returns a linear similarity scaling by the inverse frame diameter, used
// to map models to normalized coordinates.
LinearSimilarityModel
ModelAdapter<LinearSimilarityModel>::NormalizationTransform(
    float frame_width, float frame_height) {
  const float scale = std::hypot(frame_width, frame_height);
  DCHECK_NE(scale, 0);
  return LinearSimilarityAdapter::FromArgs(0, 0, 1.0 / scale, 0);
}
// Returns the parameter-wise maximum of the two models.
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::Maximum(
    const LinearSimilarityModel& lhs, const LinearSimilarityModel& rhs) {
  return FromArgs(std::max(lhs.dx(), rhs.dx()), std::max(lhs.dy(), rhs.dy()),
                  std::max(lhs.a(), rhs.a()), std::max(lhs.b(), rhs.b()));
}
// Projects to a translation by conjugating with a shift to the frame center,
// so the extracted translation is measured at the frame center rather than
// the top-left origin.
TranslationModel ModelAdapter<LinearSimilarityModel>::ProjectToTranslation(
    const LinearSimilarityModel& model, float frame_width, float frame_height) {
  LinearSimilarityModel center_trans = LinearSimilarityAdapter::FromArgs(
      frame_width * 0.5f, frame_height * 0.5f, 1, 0);
  LinearSimilarityModel inv_center_trans = LinearSimilarityAdapter::FromArgs(
      -frame_width * 0.5f, -frame_height * 0.5f, 1, 0);
  // Express model w.r.t. center.
  LinearSimilarityModel center_model =
      ModelCompose3(inv_center_trans, model, center_trans);
  // No need to shift back to top-left after decomposition, as translations
  // are independent from coordinate origin.
  return TranslationAdapter::FromArgs(center_model.dx(), center_model.dy());
}
// Formats the affine model as "dx dy a b c d" fixed-width floats.
std::string ModelAdapter<AffineModel>::ToString(const AffineModel& model) {
  return absl::StrFormat("%7f %7f %7f %7f %7f %7f", model.dx(), model.dy(),
                         model.a(), model.b(), model.c(), model.d());
}
// Returns an affine model scaling isotropically by the inverse frame
// diameter, used to map models to normalized coordinates.
AffineModel ModelAdapter<AffineModel>::NormalizationTransform(
    float frame_width, float frame_height) {
  const float scale = std::hypot(frame_width, frame_height);
  DCHECK_NE(scale, 0);
  return AffineAdapter::FromArgs(0, 0, 1.0f / scale, 0, 0, 1.0f / scale);
}
// Embeds the affine model into a homography with zero perspective part
// (h_20 = h_21 = 0); params are packed in row-major homography order.
Homography ModelAdapter<AffineModel>::ToHomography(const AffineModel& model) {
  float params[8] = {model.a(), model.b(), model.dx(), model.c(),
                     model.d(), model.dy(), 0, 0};
  return HomographyAdapter::FromFloatPointer(params, false);
}
// Inverse of ToHomography; debug-checks the perspective part is zero so the
// reduction is lossless.
AffineModel ModelAdapter<AffineModel>::FromHomography(const Homography& model) {
  DCHECK_EQ(model.h_20(), 0);
  DCHECK_EQ(model.h_21(), 0);
  float params[6] = {model.h_02(), model.h_12(),  // dx, dy
                     model.h_00(), model.h_01(),  // a, b
                     model.h_10(), model.h_11()};  // c, d
  return FromFloatPointer(params, false);
}
// Multiplies every parameter of the affine model by the given scale factor.
AffineModel ModelAdapter<AffineModel>::ScaleParameters(
    const AffineModel& model_in, float scale) {
  AffineModel scaled;
  scaled.set_a(model_in.a() * scale);
  scaled.set_b(model_in.b() * scale);
  scaled.set_c(model_in.c() * scale);
  scaled.set_d(model_in.d() * scale);
  scaled.set_dx(model_in.dx() * scale);
  scaled.set_dy(model_in.dy() * scale);
  return scaled;
}
// Adds the identity transform to a difference model, i.e. a += 1, d += 1
// (identity affine has a = d = 1, all other parameters 0).
AffineModel ModelAdapter<AffineModel>::AddIdentity(
    const AffineModel& model_in) {
  AffineModel model = model_in;
  model.set_a(model.a() + 1);
  model.set_d(model.d() + 1);
  return model;
}
// Writes the row-major 2x6 Jacobian of the transformed point w.r.t. the
// parameters (dx, dy, a, b, c, d) at pt, for x' = a*x + b*y + dx,
// y' = c*x + d*y + dy.
void ModelAdapter<AffineModel>::GetJacobianAtPoint(const Vector2_f& pt,
                                                   float* jacobian) {
  DCHECK(jacobian);
  // First row.
  jacobian[0] = 1;
  jacobian[1] = 0;
  jacobian[2] = pt.x();
  jacobian[3] = pt.y();
  jacobian[4] = 0;
  jacobian[5] = 0;
  // Second row.
  jacobian[6] = 0;
  jacobian[7] = 1;
  jacobian[8] = 0;
  jacobian[9] = 0;
  jacobian[10] = pt.x();
  jacobian[11] = pt.y();
}
// Returns the parameter-wise maximum of the two affine models.
AffineModel ModelAdapter<AffineModel>::Maximum(const AffineModel& lhs,
                                               const AffineModel& rhs) {
  return FromArgs(std::max(lhs.dx(), rhs.dx()), std::max(lhs.dy(), rhs.dy()),
                  std::max(lhs.a(), rhs.a()), std::max(lhs.b(), rhs.b()),
                  std::max(lhs.c(), rhs.c()), std::max(lhs.d(), rhs.d()));
}
// Projects an affine model to its best linear-similarity approximation.
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::ProjectFrom(
    const AffineModel& model, float frame_width, float frame_height) {
  return AffineAdapter::ProjectToLinearSimilarity(model, frame_width,
                                                  frame_height);
}
// Projects a homography to a linear similarity by chaining the
// homography -> affine -> linear similarity projections.
LinearSimilarityModel ModelAdapter<LinearSimilarityModel>::ProjectFrom(
    const Homography& model, float frame_width, float frame_height) {
  return ProjectFrom(
      AffineAdapter::ProjectFrom(model, frame_width, frame_height), frame_width,
      frame_height);
}
// Approximates an affine model by the closest conformal (linear similarity)
// model: the model is re-expressed w.r.t. the frame center, its linear part
// is orthonormalized at the determinant's average scale, and the result is
// mapped back to top-left origin coordinates.
LinearSimilarityModel ModelAdapter<AffineModel>::ProjectToLinearSimilarity(
    const AffineModel& model, float frame_width, float frame_height) {
  AffineModel center_trans = AffineAdapter::FromArgs(
      frame_width * 0.5f, frame_height * 0.5f, 1, 0, 0, 1);
  AffineModel inv_center_trans = AffineAdapter::FromArgs(
      -frame_width * 0.5f, -frame_height * 0.5f, 1, 0, 0, 1);
  // Express model w.r.t. center.
  AffineModel center_model =
      ModelCompose3(inv_center_trans, model, center_trans);
  // Determine average scale.
  const float scale = std::sqrt(AffineAdapter::Determinant(center_model));
  // Goal is approximate matrix:
  // (a b            (a' -b'
  //  c d)   with     b'  a')
  //
  //   :=               :=
  //  v_1 v_2
  // After normalization by the scale, a' = cos(u) and b' = sin(u)
  // therefore, the columns on the right hand side have norm 1 and are
  // orthogonal.
  //
  // ==> Orthonormalize v_1 and v_2
  Vector2_f x_map(center_model.a(), center_model.c());  // == v_1
  Vector2_f y_map(center_model.b(), center_model.d());  // == v_2
  x_map.Normalize();
  y_map.Normalize();
  // Two approaches here
  // A) Perform Gram-Schmidt, i.e. QR decomp / polar decomposition, i.e.:
  //    y_map' = y_map - <y_map, x_map> * x_map;
  //    ==> no error in x_map, error increasing with larger y
  // B) Compute the middle vector between x_map and y_map and create
  //    orthogonal system from that
  //    (rotate by -45': [ 1 1
  //                      -1 1] * 1/sqrt(2)
  //    Error equally distributed between x and y.
  //
  // Comparing approach A and B:
  //
  //                video 1 (gleicher4)    video 2 (pool dance)
  // Method B
  // Max diff :  dx:   1.6359973            4.600112
  //             dy:   2.1076841            11.51579
  //             a:    1.004498             1.01036
  //             b:    0.0047194548         0.027450036
  // Method A
  // Max diff :  dx:   4.3549204            14.145205
  //             dy:   2.4496114            7.7804289
  //             a:    1.0136091            1.043335
  //             b:    0.0024313219         0.0065769218
  // Using method B:
  Vector2_f middle = (x_map + y_map).Normalize();
  Vector2_f a_b = Vector2_f(middle.x() + middle.y(),  // see above matrix.
                            middle.y() - middle.x())
                      .Normalize();
  AffineModel lin_approx = AffineAdapter::FromArgs(
      center_model.dx(), center_model.dy(), scale * a_b.x(), -scale * a_b.y(),
      scale * a_b.y(), scale * a_b.x());
  // Map back to top-left origin.
  return LinearSimilarityAdapter::FromAffine(
      ModelCompose3(center_trans, lin_approx, inv_center_trans));
}
// Projects a homography to its best affine approximation.
AffineModel ModelAdapter<AffineModel>::ProjectFrom(const Homography& model,
                                                   float frame_width,
                                                   float frame_height) {
  return HomographyAdapter::ProjectToAffine(model, frame_width, frame_height);
}
// Inverts the homography (h_22 is implicitly 1). On success returns the
// inverse re-normalized so that its h_22 is again 1; on failure (singular
// matrix or degenerate inverse) sets *success to false and returns identity.
Homography ModelAdapter<Homography>::InvertChecked(const Homography& model,
                                                   bool* success) {
  // Could do adjoint method and do it by hand. Use Eigen for now, not that
  // crucial at this point.
  Eigen::Matrix3d model_mat;
  model_mat(0, 0) = model.h_00();
  model_mat(0, 1) = model.h_01();
  model_mat(0, 2) = model.h_02();
  model_mat(1, 0) = model.h_10();
  model_mat(1, 1) = model.h_11();
  model_mat(1, 2) = model.h_12();
  model_mat(2, 0) = model.h_20();
  model_mat(2, 1) = model.h_21();
  model_mat(2, 2) = 1.0f;
  // Test the magnitude of the determinant: a homography with a large
  // negative determinant is still invertible (the unsigned comparison used
  // previously rejected all negative determinants).
  if (std::abs(model_mat.determinant()) < kDetInvertibleEps) {
    VLOG(1) << "Homography not invertible, det is zero.";
    *success = false;
    return Homography();
  }
  Eigen::Matrix3d inv_model_mat = model_mat.inverse();
  if (inv_model_mat(2, 2) == 0) {
    LOG(ERROR) << "Degenerate homography. See proto.";
    *success = false;
    return Homography();
  }
  *success = true;
  Homography inv_model;
  // Normalize so the implicit h_22 of the result equals 1.
  const float scale = 1.0f / inv_model_mat(2, 2);
  inv_model.set_h_00(inv_model_mat(0, 0) * scale);
  inv_model.set_h_01(inv_model_mat(0, 1) * scale);
  inv_model.set_h_02(inv_model_mat(0, 2) * scale);
  inv_model.set_h_10(inv_model_mat(1, 0) * scale);
  inv_model.set_h_11(inv_model_mat(1, 1) * scale);
  inv_model.set_h_12(inv_model_mat(1, 2) * scale);
  inv_model.set_h_20(inv_model_mat(2, 0) * scale);
  inv_model.set_h_21(inv_model_mat(2, 1) * scale);
  return inv_model;
}
std::string ModelAdapter<Homography>::ToString(const Homography& model) {
  // Space-separated parameters in row-major order (h_22 == 1 is implicit).
  return absl::StrFormat("%7f %7f %7f %7f %7f %7f %7f %7f", model.h_00(),
                         model.h_01(), model.h_02(), model.h_10(),
                         model.h_11(), model.h_12(), model.h_20(),
                         model.h_21());
}
AffineModel ModelAdapter<Homography>::ToAffine(const Homography& model) {
  // Only valid for homographies without perspective terms.
  DCHECK_EQ(model.h_20(), 0);
  DCHECK_EQ(model.h_21(), 0);
  // The upper 2x3 part of the homography maps directly onto the affine
  // parameters: [a b dx; c d dy].
  AffineModel affine_model;
  affine_model.set_dx(model.h_02());
  affine_model.set_dy(model.h_12());
  affine_model.set_a(model.h_00());
  affine_model.set_b(model.h_01());
  affine_model.set_c(model.h_10());
  affine_model.set_d(model.h_11());
  return affine_model;
}
Homography ModelAdapter<Homography>::FromAffine(const AffineModel& model) {
  // Embedding yields a homography with a zero perspective row.
  const Homography embedded = Embed(model);
  return embedded;
}
bool ModelAdapter<Homography>::IsAffine(const Homography& model) {
  // Affine iff the perspective row (h_20, h_21) vanishes.
  const bool no_perspective = (model.h_20() == 0) && (model.h_21() == 0);
  return no_perspective;
}
void ModelAdapter<Homography>::GetJacobianAtPoint(const Vector2_f& pt,
                                                  float* jacobian) {
  DCHECK(jacobian);
  const float x = pt.x();
  const float y = pt.y();
  // Row of partial derivatives for the transformed x coordinate.
  float* row_x = jacobian;
  row_x[0] = x;
  row_x[1] = y;
  row_x[2] = 1;
  row_x[3] = 0;
  row_x[4] = 0;
  row_x[5] = 0;
  row_x[6] = -x * x;
  row_x[7] = -x * y;
  // Row of partial derivatives for the transformed y coordinate.
  float* row_y = jacobian + 8;
  row_y[0] = 0;
  row_y[1] = 0;
  row_y[2] = 0;
  row_y[3] = x;
  row_y[4] = y;
  row_y[5] = 1;
  row_y[6] = -x * y;
  row_y[7] = -y * y;
}
Homography ModelAdapter<Homography>::NormalizationTransform(
    float frame_width, float frame_height) {
  // Isotropic scale by the inverse of the frame diagonal.
  const float diag = std::hypot(frame_width, frame_height);
  DCHECK_NE(diag, 0);
  const float inv_diag = 1.0f / diag;
  return HomographyAdapter::FromArgs(inv_diag, 0, 0, 0, inv_diag, 0, 0, 0);
}
AffineModel ModelAdapter<Homography>::ProjectToAffine(const Homography& model,
                                                      float frame_width,
                                                      float frame_height) {
  // Translations mapping the frame center to the origin and back.
  const float half_width = frame_width * 0.5f;
  const float half_height = frame_height * 0.5f;
  Homography center_trans;
  center_trans.set_h_02(half_width);
  center_trans.set_h_12(half_height);
  Homography inv_center_trans;
  inv_center_trans.set_h_02(-half_width);
  inv_center_trans.set_h_12(-half_height);
  // Express the model w.r.t. the frame center, zero out the perspective row
  // there, then map back to the top-left origin and convert.
  Homography center_model =
      ModelCompose3(inv_center_trans, model, center_trans);
  center_model.set_h_20(0);
  center_model.set_h_21(0);
  return ToAffine(ModelCompose3(center_trans, center_model, inv_center_trans));
}
namespace {
// Returns true if non-empty intersection is present. In this case, start and
// end are clipped to rect, specifically after clipping the line
// [start, end] is inside the rectangle or incident to the boundary
// of the rectangle. If strict is set to true, [start, end] is strictly inside
// the rectangle, i.e. not incident to the boundary (minus start and end
// point which still can be incident).
// Implemented using Liang-Barsky algorithm.
// http://en.wikipedia.org/wiki/Liang%E2%80%93Barsky
inline bool ClipLine(const Vector2_f& rect, bool strict, Vector2_f* start,
                     Vector2_f* end) {
  Vector2_f diff = *end - *start;
  // For each of the 4 clip edges (left, right, top, bottom): p is the signed
  // component of the line direction towards that edge, q the signed distance
  // of the start point from it.
  float p[4] = {-diff.x(), diff.x(), -diff.y(), diff.y()};
  // Bounds are (x_min, y_min) = (0, 0)
  //            (x_max, y_max) = rect
  float q[4] = {start->x() - 0, rect.x() - start->x(), start->y() - 0,
                rect.y() - start->y()};
  // Compute parametric intersection points; the clipped segment is
  // [near, far] in the line's parameter t, with t in [0, 1].
  float near = -1e10;
  float far = 1e10;
  for (int k = 0; k < 4; ++k) {
    if (fabs(p[k]) < 1e-6f) {
      // Line is parallel to one axis of rectangle.
      // In strict mode reject also lines incident to the boundary (q == 0).
      if ((strict && q[k] <= 0.0f) || q[k] < 0.0f) {
        // Line is outside rectangle.
        return false;
      } else {
        // Possible intersection along other dimensions.
        continue;
      }
    } else {
      // Line is not parallel -> compute intersection.
      float intersect = q[k] / p[k];
      // Sign of p determines if near or far parameter.
      if (p[k] < 0.0f) {
        near = std::max(near, intersect);
      } else {
        far = std::min(far, intersect);
      }
    }
  }
  if (near > far) {
    // Line is outside of rectangle.
    return false;
  }
  // Clip near and far to valid line segment interval [0, 1].
  far = std::min(far, 1.0f);
  near = std::max(near, 0.0f);
  if (near <= far) {
    // Non-empty intersection. Single points are considered valid intersection.
    // Note: *end is updated first since both updates read the original *start.
    *end = *start + diff * far;
    *start = *start + diff * near;
    return true;
  } else {
    // Empty intersection.
    return false;
  }
}
}  // namespace.
// Computes the area of the intersection of rect transformed by model_1 with
// rect transformed by model_2, normalized by the average of the two
// transformed rectangle areas. Returns 0 on empty rect, degenerate models,
// or singular model difference.
template <class Model>
float ModelMethods<Model>::NormalizedIntersectionArea(const Model& model_1,
                                                      const Model& model_2,
                                                      const Vector2_f& rect) {
  const float rect_area = rect.x() * rect.y();
  if (rect_area <= 0) {
    LOG(WARNING) << "Empty rectangle passed -> empty intersection.";
    return 0.0f;
  }
  // The four edges of rect, traversed as a closed polygon
  // (0,0) -> (0,h) -> (w,h) -> (w,0) -> (0,0).
  std::vector<std::pair<Vector2_f, Vector2_f>> lines(4);
  lines[0] = std::make_pair(Vector2_f(0, 0), Vector2_f(0, rect.y()));
  lines[1] =
      std::make_pair(Vector2_f(0, rect.y()), Vector2_f(rect.x(), rect.y()));
  lines[2] =
      std::make_pair(Vector2_f(rect.x(), rect.y()), Vector2_f(rect.x(), 0));
  lines[3] = std::make_pair(Vector2_f(rect.x(), 0), Vector2_f(0, 0));
  // Signed areas of the two transformed rectangles via the trapezoidal rule.
  float model_1_area = 0.0f;
  float model_2_area = 0.0f;
  for (int k = 0; k < 4; ++k) {
    Vector2_f start_1 = Adapter::TransformPoint(model_1, lines[k].first);
    Vector2_f end_1 = Adapter::TransformPoint(model_1, lines[k].second);
    // Use trapezoidal rule for polygon area.
    model_1_area += 0.5 * (end_1.y() + start_1.y()) * (end_1.x() - start_1.x());
    Vector2_f start_2 = Adapter::TransformPoint(model_2, lines[k].first);
    Vector2_f end_2 = Adapter::TransformPoint(model_2, lines[k].second);
    model_2_area += 0.5 * (end_2.y() + start_2.y()) * (end_2.x() - start_2.x());
  }
  const float average_area = 0.5f * (model_1_area + model_2_area);
  if (average_area <= 0) {
    LOG(WARNING) << "Degenerative models passed -> empty intersection.";
    return 0.0f;
  }
  // First, clip transformed rectangle against origin defined by model_1.
  // diff maps model_1's coordinate frame onto model_2's.
  bool success = true;
  Model diff = ModelDiffChecked(model_2, model_1, &success);
  if (!success) {
    LOG(WARNING) << "Model difference is singular -> empty intersection.";
    return 0.0f;
  }
  // Accumulate the intersection polygon's area from boundary pieces of both
  // transformed rectangles; each clipped edge contributes a trapezoid term.
  float area = 0.0f;
  for (int k = 0; k < 4; ++k) {
    Vector2_f start_1 = Adapter::TransformPoint(diff, lines[k].first);
    Vector2_f end_1 = Adapter::TransformPoint(diff, lines[k].second);
    if (ClipLine(rect, false, &start_1, &end_1)) {
      // Non-empty intersection.
      // Transform intersection back to world coordinate system.
      const Vector2_f start = Adapter::TransformPoint(model_1, start_1);
      const Vector2_f end = Adapter::TransformPoint(model_1, end_1);
      // Use trapezoidal rule for polygon area without explicit ordering of
      // vertices.
      area += 0.5 * (end.y() + start.y()) * (end.x() - start.x());
    }
  }
  // Second, clip transformed rectangle against origin defined by model_2.
  Model inv_diff = Adapter::InvertChecked(diff, &success);
  if (!success) {
    LOG(WARNING) << "Model difference is singular -> empty intersection.";
    return 0.0f;
  }
  for (int k = 0; k < 4; ++k) {
    Vector2_f start_2 = Adapter::TransformPoint(inv_diff, lines[k].first);
    Vector2_f end_2 = Adapter::TransformPoint(inv_diff, lines[k].second);
    // Use strict comparison to address degenerate case of incident rectangles
    // in which case intersection would be counted twice if non-strict
    // comparison is employed.
    if (ClipLine(rect, true, &start_2, &end_2)) {
      // Transform start and end back to origin.
      const Vector2_f start = Adapter::TransformPoint(model_2, start_2);
      const Vector2_f end = Adapter::TransformPoint(model_2, end_2);
      area += 0.5 * (end.y() + start.y()) * (end.x() - start.x());
    }
  }
  // Normalize w.r.t. average rectangle area.
  return area / average_area;
}
// Precomputes, for every row of the frame (plus optional margin rows), the
// normalized weight of each of the num_models mixture models.
// Two weighting schemes are supported, selected by the flag
// flags_catmull_rom_mixture_weights (defined earlier in this file):
// Catmull-Rom spline weights over evenly spaced control rows, or Gaussian
// weights around per-model mid-point rows.
MixtureRowWeights::MixtureRowWeights(int frame_height, int margin, float sigma,
                                     float y_scale, int num_models)
    : frame_height_(frame_height),
      y_scale_(y_scale),
      margin_(margin),
      sigma_(sigma),
      num_models_(num_models) {
  mid_points_.resize(num_models_);
  // Compute mid_point (row idx) for each model.
  if (flags_catmull_rom_mixture_weights) {
    // Control points span the full frame height, so spacing uses
    // num_models - 1 intervals.
    const float model_height =
        static_cast<float>(frame_height_) / (num_models - 1);
    // Use Catmull-rom spline.
    // Compute weighting matrix.
    weights_.resize(frame_height * num_models);
    float spline_weights[4];
    // No margin support for splines.
    if (margin_ > 0) {
      LOG(WARNING) << "No margin support when flag catmull_rom_mixture_weights "
                   << "is set. Margin is reset to zero, it is recommended "
                   << "that RowWeightsBoundChecked is used to prevent "
                   << "segfaults.";
      margin_ = 0;
    }
    for (int i = 0; i < frame_height; ++i) {
      float* weight_ptr = &weights_[i * num_models];
      // Row position in control-point units: int_pos is the spline segment,
      // dy in [0, 1) the position within it.
      float float_pos = static_cast<float>(i) / model_height;
      int int_pos = float_pos;
      memset(weight_ptr, 0, sizeof(weight_ptr[0]) * num_models);
      float dy = float_pos - int_pos;
      // Weights sum to one, for all choices of dy.
      // For definition see
      // en.wikipedia.org/wiki/Cubic_Hermite_spline#Catmull.E2.80.93Rom_spline
      spline_weights[0] = 0.5f * (dy * ((2.0f - dy) * dy - 1.0f));
      spline_weights[1] = 0.5f * (dy * dy * (3.0f * dy - 5.0f) + 2.0f);
      spline_weights[2] = 0.5f * (dy * ((4.0f - 3.0f * dy) * dy + 1.0f));
      spline_weights[3] = 0.5f * (dy * dy * (dy - 1.0f));
      // Scatter the 4 spline weights onto the control points; at both frame
      // borders the missing neighbor is folded into the nearest control
      // point (double knot).
      weight_ptr[int_pos] += spline_weights[1];
      if (int_pos > 0) {
        weight_ptr[int_pos - 1] += spline_weights[0];
      } else {
        weight_ptr[int_pos] += spline_weights[0];  // Double knot.
      }
      CHECK_LT(int_pos, num_models - 1);
      weight_ptr[int_pos + 1] += spline_weights[2];
      if (int_pos + 1 < num_models - 1) {
        weight_ptr[int_pos + 2] += spline_weights[3];
      } else {
        weight_ptr[int_pos + 1] += spline_weights[3];  // Double knot.
      }
    }
  } else {
    // Gaussian weights.
    const float model_height = static_cast<float>(frame_height_) / num_models;
    for (int i = 0; i < num_models; ++i) {
      mid_points_[i] = (i + 0.5f) * model_height;
    }
    // Compute gaussian weights.
    // Lookup table of exp(-0.5 * d^2 / sigma^2) over row distances d.
    const int num_values = frame_height_ + 2 * margin_;
    std::vector<float> row_dist_weights(num_values);
    const float common = -0.5f / (sigma * sigma);
    for (int i = 0; i < num_values; ++i) {
      row_dist_weights[i] = std::exp(common * i * i);
    }
    // Compute weighting matrix.
    weights_.resize(num_values * num_models);
    for (int i = 0; i < num_values; ++i) {
      float* weight_ptr = &weights_[i * num_models];
      float weight_sum = 0;
      // Gaussian weights via lookup.
      // NOTE(review): mid_points_ is float, so the lookup index is a float
      // distance truncated to an integer -- presumably intended; confirm.
      for (int j = 0; j < num_models; ++j) {
        weight_ptr[j] = row_dist_weights[abs(i - margin_ - mid_points_[j])];
        weight_sum += weight_ptr[j];
      }
      // Normalize.
      DCHECK_GT(weight_sum, 0);
      const float inv_weight_sum = 1.0f / weight_sum;
      for (int j = 0; j < num_models; ++j) {
        weight_ptr[j] *= inv_weight_sum;
      }
    }
  }
}
float MixtureRowWeights::WeightThreshold(float frac_blocks) {
  // Evaluate the first model's weight at a point frac_blocks model-heights
  // away from its mid-point row; that weight serves as the threshold.
  const float model_height = static_cast<float>(frame_height_) / num_models_;
  const float y = mid_points_[0] + model_height * frac_blocks;
  const float* row_weights = RowWeightsClamped(y / y_scale_);
  return row_weights[0];
}
// Explicit instantiations of ModelMethods.
// Keeping these here lets the template member definitions above live in this
// translation unit while callers link against the concrete model types.
template class ModelMethods<TranslationModel>;
template class ModelMethods<SimilarityModel>;
template class ModelMethods<LinearSimilarityModel>;
template class ModelMethods<AffineModel>;
template class ModelMethods<Homography>;
} // namespace mediapipe

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,122 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Motion models, that are supported and estimated by MotionStabilization.
// Note, that transformations represent the motion of the
// feature points from the previous frame to the current one.
// Note that the actual camera movement, is the inverse of this
// transformation.
// Check carefully, which movement (feature or camera) is required.
// Note that for each model the default values always specify an identity
// transform. Follow this rule when adding new models.
syntax = "proto2";
package mediapipe;
// Here x represents a 2D point in the image plane, in the following
// coordinate system:
// *----> x
// |
// |
// v y
// Simple translational model:
// I * x + [dx; dy] with I being 2x2 identity transform.
message TranslationModel {
  // Translation along x, in the image coordinate system above.
  optional float dx = 1 [default = 0];
  // Translation along y.
  optional float dy = 2 [default = 0];
}
// Non-linear similarity model (w.r.t. to its parametrization).
// c_r := cos(rotation);
// s_r := sin(rotation);
// Transformation applied to x:
// [scale 0; * [c_r -s_r; * x + [dx;
// 0 scale] s_r c_r] dy]
message SimilarityModel {
  // Translation along x and y.
  optional float dx = 1 [default = 0];
  optional float dy = 2 [default = 0];
  // Isotropic scale; 1 is the identity.
  optional float scale = 3 [default = 1];
  optional float rotation = 4 [default = 0];  // angle in [-pi, pi].
}
// Linear similarity model:
// [a -b; * x + [dx;
// b a] dy]
message LinearSimilarityModel {
  // Translation along x and y.
  optional float dx = 1 [default = 0];
  optional float dy = 2 [default = 0];
  // Linear part [a -b; b a]; (a, b) = (1, 0) is the identity.
  optional float a = 3 [default = 1];
  optional float b = 4 [default = 0];
}
// Affine according to
// ( [a b * x + [dx;
// ( c d] dy]
message AffineModel {
  // Translation along x and y.
  optional float dx = 1 [default = 0];
  optional float dy = 2 [default = 0];
  // Linear part [a b; c d]; (a, b, c, d) = (1, 0, 0, 1) is the identity.
  optional float a = 3 [default = 1];
  optional float b = 4 [default = 0];
  optional float c = 5 [default = 0];
  optional float d = 6 [default = 1];
}
// Homography according to
// [h_00 h_01 h_02;
// h_10 h_11 h_12;
// h_20 h_21 1];
// Note: The parametrization with h_22 = 1 does not always hold, e.g.
// if the origin (0, 0, 1) gets mapped to the line at infinity
// (0, 0, 1). However for video we expect small perspective
// changes between frames and this parametrization improves
// robustness greatly as it removes an additional DOF.
// Therefore, all methods in motion_stabilization should not be
// used for general wide-baseline matching of frames.
message Homography {
  // Row-major entries of the 3x3 matrix above; h_22 is fixed to 1 and
  // therefore not stored (see comment above on this parametrization).
  optional float h_00 = 1 [default = 1];
  optional float h_01 = 2 [default = 0];
  optional float h_02 = 3 [default = 0];
  optional float h_10 = 4 [default = 0];
  optional float h_11 = 5 [default = 1];
  optional float h_12 = 6 [default = 0];
  // Perspective part; zero for affine homographies.
  optional float h_20 = 7 [default = 0];
  optional float h_21 = 8 [default = 0];
}
// Mixture models with higher degrees of freedom, according to
// \sum_i model(i) * weight(i), where weights are passed during transform and
// are expected to sum to one.
// Mixture of linear similarity models.
message MixtureLinearSimilarity {
  repeated LinearSimilarityModel model = 1;
}
// Mixture of affine models.
message MixtureAffine {
  repeated AffineModel model = 1;
}
// Mixture of homographies.
message MixtureHomography {
  repeated Homography model = 1;
  // Specifies which degree of freedom vary across mixture.
  // Can be used to implement several transformation functions quicker.
  enum VariableDOF {
    ALL_DOF = 0;          // All dof are variable.
    TRANSLATION_DOF = 1;  // Only translation (h_02, h_12) varies.
    SKEW_ROTATION_DOF = 2;  // Translation (h_02, h_12), and skew-rotation
                            // (h_01, h_10) vary.
    CONST_DOF = 3;        // Mixture is constant.
  }
  optional VariableDOF dof = 2 [default = ALL_DOF];
}

View File

@ -0,0 +1,57 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/motion_models_cv.h"
namespace mediapipe {
void ModelCvConvert<TranslationModel>::ToCvMat(const TranslationModel& model,
                                               cv::Mat* matrix) {
  // Embed the translation into a full affine model and reuse its conversion.
  const AffineModel as_affine = ModelAdapter<TranslationModel>::ToAffine(model);
  ModelCvConvert<AffineModel>::ToCvMat(as_affine, matrix);
}
void ModelCvConvert<LinearSimilarityModel>::ToCvMat(
    const LinearSimilarityModel& model, cv::Mat* matrix) {
  // Embed the linear similarity into a full affine model and reuse its
  // conversion.
  const AffineModel as_affine =
      ModelAdapter<LinearSimilarityModel>::ToAffine(model);
  ModelCvConvert<AffineModel>::ToCvMat(as_affine, matrix);
}
// Writes the affine model as the 2x3 row-major matrix [a b dx; c d dy].
void ModelCvConvert<AffineModel>::ToCvMat(const AffineModel& model,
                                          cv::Mat* matrix) {
  // Fail fast on null output; consistent with the Homography overload below,
  // which performs the same check. Previously a null matrix was dereferenced.
  CHECK(matrix != nullptr);
  matrix->create(2, 3, CV_32FC1);
  matrix->at<float>(0, 0) = model.a();
  matrix->at<float>(0, 1) = model.b();
  matrix->at<float>(0, 2) = model.dx();
  matrix->at<float>(1, 0) = model.c();
  matrix->at<float>(1, 1) = model.d();
  matrix->at<float>(1, 2) = model.dy();
}
void ModelCvConvert<Homography>::ToCvMat(const Homography& model,
                                         cv::Mat* matrix) {
  CHECK(matrix != nullptr);
  // 3x3 row-major homography; the bottom-right entry is fixed to 1 per the
  // proto's parametrization.
  matrix->create(3, 3, CV_32FC1);
  const float entries[9] = {model.h_00(), model.h_01(), model.h_02(),
                            model.h_10(), model.h_11(), model.h_12(),
                            model.h_20(), model.h_21(), 1.0f};
  for (int i = 0; i < 9; ++i) {
    matrix->at<float>(i / 3, i % 3) = entries[i];
  }
}
} // namespace mediapipe

View File

@ -0,0 +1,63 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_CV_H_
#define MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_CV_H_
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/util/tracking/motion_models.pb.h" // NOLINT
namespace mediapipe {
// Converter from mediapipe motion models to OpenCV matrices. Only the
// specializations below provide ToCvMat; the primary template is empty.
template <class Model>
class ModelCvConvert {};
// Specialized implementations, with additional functionality if needed.
template <>
class ModelCvConvert<TranslationModel> {
 public:
  // Returns 2x3 floating point cv::Mat with model parameters.
  static void ToCvMat(const TranslationModel& model, cv::Mat* matrix);
};
template <>
class ModelCvConvert<LinearSimilarityModel> {
 public:
  // Returns 2x3 floating point cv::Mat with model parameters.
  static void ToCvMat(const LinearSimilarityModel& model, cv::Mat* matrix);
};
template <>
class ModelCvConvert<AffineModel> {
 public:
  // Returns 2x3 floating point cv::Mat with model parameters.
  static void ToCvMat(const AffineModel& model, cv::Mat* matrix);
};
template <>
class ModelCvConvert<Homography> {
 public:
  // Returns 3x3 floating point cv::Mat with model parameters.
  static void ToCvMat(const Homography& model, cv::Mat* matrix);
};
// Convenience wrapper deducing the model type.
template <class Model>
void ModelToCvMat(const Model& model, cv::Mat* matrix) {
  ModelCvConvert<Model>::ToCvMat(model, matrix);
}
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_MODELS_CV_H_

View File

@ -0,0 +1,335 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/framework/deps/message_matchers.h"
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/parse_text_proto.h"
#include "mediapipe/framework/port/proto_ns.h"
#include "mediapipe/framework/port/vector.h"
#include "mediapipe/util/tracking/motion_estimation.h"
namespace mediapipe {
namespace {
// Shared parameter arrays; each adapter reads only its first NumParameters()
// entries.
static const float kArrayFloat[] = {1, 2, 3, 4, 5, 6, 7, 8};
static const double kArrayDouble[] = {1, 2, 3, 4, 5, 6, 7, 8};
class MotionModelsTest : public ::testing::Test {};
// Test from/to array and parameter indexing functions.
// Parses expected protos from text format and verifies that
// FromFloatPointer / FromDoublePointer reproduce them, both with the
// identity flag unset (model_zero_string) and set (model_identity_string),
// and that GetParameter returns the raw array values.
template <class Model>
void CheckFromArrayAndGetParameter(const char* model_zero_string,
                                   const char* model_identity_string) {
  Model model_zero;
  Model model_identity;
  ASSERT_TRUE(
      proto_ns::TextFormat::ParseFromString(model_zero_string, &model_zero));
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_identity_string,
                                                    &model_identity));
  typedef ModelAdapter<Model> Adapter;
  EXPECT_THAT(Adapter::FromFloatPointer(kArrayFloat, false),
              mediapipe::EqualsProto(model_zero));
  EXPECT_THAT(Adapter::FromFloatPointer(kArrayFloat, true),
              mediapipe::EqualsProto(model_identity));
  EXPECT_THAT(Adapter::FromDoublePointer(kArrayDouble, false),
              mediapipe::EqualsProto(model_zero));
  EXPECT_THAT(Adapter::FromDoublePointer(kArrayDouble, true),
              mediapipe::EqualsProto(model_identity));
  ASSERT_LE(Adapter::NumParameters(), 8);
  for (int i = 0; i < Adapter::NumParameters(); ++i) {
    EXPECT_EQ(kArrayFloat[i], Adapter::GetParameter(model_zero, i));
  }
}
TEST_F(MotionModelsTest, FromArrayAndGetParameter) {
  // The expected "identity" protos show that with the identity flag set, the
  // parameters whose identity value is non-zero (scale, a, d, h_00, h_11)
  // are offset by one relative to the raw array values.
  CheckFromArrayAndGetParameter<TranslationModel>("dx: 1 dy: 2", "dx: 1 dy: 2");
  CheckFromArrayAndGetParameter<SimilarityModel>(
      "dx: 1 dy: 2 scale: 3 rotation: 4", "dx: 1 dy: 2 scale: 4 rotation: 4");
  CheckFromArrayAndGetParameter<LinearSimilarityModel>("dx: 1 dy: 2 a: 3 b: 4",
                                                       "dx: 1 dy: 2 a: 4 b: 4");
  CheckFromArrayAndGetParameter<AffineModel>("dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6",
                                             "dx: 1 dy: 2 a: 4 b: 4 c: 5 d: 7");
  CheckFromArrayAndGetParameter<Homography>(
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: 4 h_11: 5 h_12: 6 "
      "h_20: 7 h_21: 8 ",
      "h_00: 2 h_01: 2 h_02: 3 "
      "h_10: 4 h_11: 6 h_12: 6 "
      "h_20: 7 h_21: 8 ");
}
// Test point transformations.
// Parses a model from text format, applies it to (x_in, y_in) via
// TransformPoint, and checks the result against (x_out, y_out).
template <class Model>
void CheckTransformPoint(const char* model_string, float x_in, float y_in,
                         float x_out, float y_out) {
  Model model;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string, &model));
  const Vector2_f output =
      ModelAdapter<Model>::TransformPoint(model, Vector2_f(x_in, y_in));
  EXPECT_NEAR(x_out, output.x(), 1e-5);
  EXPECT_NEAR(y_out, output.y(), 1e-5);
}
TEST_F(MotionModelsTest, TransformPoint) {
  CheckTransformPoint<TranslationModel>("dx: 0 dy: 0", 1, 1, 1, 1);
  CheckTransformPoint<TranslationModel>("dx: 1 dy: -1", 1, 1, 2, 0);
  // rotation 1.57079633 is pi / 2, i.e. a 90 degree rotation.
  CheckTransformPoint<SimilarityModel>(
      "dx: 0 dy: 0 scale: 1 rotation: 1.57079633", 1, 2, -2, 1);
  CheckTransformPoint<SimilarityModel>(
      "dx: 1 dy: -1 scale: 1 rotation: 1.57079633", 1, 2, -1, 0);
  CheckTransformPoint<SimilarityModel>(
      "dx: 1 dy: -1 scale: 2 rotation: 1.57079633", 1, 2, -3, 1);
  CheckTransformPoint<LinearSimilarityModel>("dx: 0 dy: 0 a: 1 b: -0.5", 1, 2,
                                             2, 1.5);
  CheckTransformPoint<LinearSimilarityModel>("dx: 0.5 dy: -0.5 a: 1 b: 0.5", 1,
                                             2, 0.5, 2);
  CheckTransformPoint<LinearSimilarityModel>("dx: 0.5 dy: -0.5 a: 0.5 b: 0.5",
                                             1, 2, 0, 1);
  CheckTransformPoint<AffineModel>("dx: 0 dy: 0 a: 1 b: 0.5 c: -0.5 d: 1", 1, 2,
                                   2, 1.5);
  CheckTransformPoint<AffineModel>("dx: 0.5 dy: -0.5 a: 2 b: -0.5 c: 0.5 d: 1",
                                   1, 2, 1.5, 2);
  CheckTransformPoint<AffineModel>("dx: 1 dy: -1 a: 2 b: -2 c: 1 d: -1", 1, 2,
                                   -1, -2);
  // Transformations by Homography are followed by division by the 3rd element.
  // Test division by value != 1.
  CheckTransformPoint<Homography>(
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: 4 h_11: 3 h_12: 6 "
      "h_20: 7 h_21: 8 ",
      1, 2, 8.0 / 24.0, 16.0 / 24.0);
  // Test division by 1.
  CheckTransformPoint<Homography>(
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: 4 h_11: 3 h_12: 6 "
      "h_20: 2 h_21: -1 ",
      1, 2, 8.0, 16.0);
}
// Test model inversions.
// Parses a model and its expected inverse from text format, inverts via
// the adapter, and compares all parameters with a small absolute tolerance.
template <class Model>
void CheckInvert(const char* model_string, const char* inv_model_string) {
  Model model;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string, &model));
  Model inv_model_expected;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(inv_model_string,
                                                    &inv_model_expected));
  typedef ModelAdapter<Model> Adapter;
  const Model inv_model_actual = Adapter::Invert(model);
  for (int i = 0; i < Adapter::NumParameters(); ++i) {
    EXPECT_NEAR(Adapter::GetParameter(inv_model_expected, i),
                Adapter::GetParameter(inv_model_actual, i), 1e-5)
        << "Parameter index: " << i << " of total " << Adapter::NumParameters();
  }
}
TEST_F(MotionModelsTest, Invert) {
  CheckInvert<TranslationModel>("dx: 1 dy: -2", "dx: -1 dy: 2");
  CheckInvert<SimilarityModel>("dx: 0 dy: 0 scale: 1 rotation: 1.57079633",
                               "dx: 0 dy: 0 scale: 1 rotation: -1.57079633");
  CheckInvert<SimilarityModel>("dx: 1 dy: -2 scale: 1 rotation: 1.57079633",
                               "dx: 2 dy: 1 scale: 1 rotation: -1.57079633");
  CheckInvert<SimilarityModel>("dx: 1 dy: -2 scale: 0.5 rotation: 1.57079633",
                               "dx: 4 dy: 2 scale: 2 rotation: -1.57079633");
  CheckInvert<LinearSimilarityModel>("dx: 1 dy: 2 a: 3 b: 4 ",
                                     "dx: -0.44 dy: -0.08 a: 0.12 b: -0.16 ");
  // Test division by value != 1.
  CheckInvert<Homography>(
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: -3 h_11: -2 h_12: -1 "
      "h_20: 8 h_21: -1 ",
      "h_00: -0.75 h_01: -1.25 h_02: 1 "
      "h_10: -1.25 h_11: -5.75 h_12: -2 "
      "h_20: 4.75 h_21: 4.25 ");
  // Test division by 1.
  CheckInvert<Homography>(
      "h_00: -0.75 h_01: -1.25 h_02: 1 "
      "h_10: -1.25 h_11: -5.75 h_12: -2 "
      "h_20: 4.75 h_21: 4.25 ",
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: -3 h_11: -2 h_12: -1 "
      "h_20: 8 h_21: -1 ");
}
// Test model compositions.
// Parses two models and the expected composition Compose(model1, model2)
// from text format and compares all parameters with a small tolerance.
template <class Model>
void CheckCompose(const char* model_string1, const char* model_string2,
                  const char* composed_string) {
  Model model1;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string1, &model1));
  Model model2;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string2, &model2));
  Model composed_expected;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(composed_string,
                                                    &composed_expected));
  typedef ModelAdapter<Model> Adapter;
  const Model composed_actual = Adapter::Compose(model1, model2);
  for (int i = 0; i < Adapter::NumParameters(); ++i) {
    EXPECT_NEAR(Adapter::GetParameter(composed_expected, i),
                Adapter::GetParameter(composed_actual, i), 1e-5)
        << "Parameter index: " << i << " of total " << Adapter::NumParameters();
  }
}
TEST_F(MotionModelsTest, Compose) {
  CheckCompose<TranslationModel>("dx: 1 dy: -2", "dx: -3 dy: 4",
                                 "dx: -2 dy: 2");
  CheckCompose<SimilarityModel>(
      "dx: 1 dy: 2 scale: 0.5 rotation: 1.57079633 ",
      "dx: -2 dy: -1 scale: 2 rotation: -1.57079633 ",
      "dx: 1.5 dy: 1 scale: 1 rotation: 0 ");
  CheckCompose<LinearSimilarityModel>("dx: 1 dy: 2 a: 0.5 b: 0.5 ",
                                      "dx: -2 dy: -1 a: 2 b: -0.5 ",
                                      "dx: 0.5 dy: 0.5 a: 1.25 b: 0.75 ");
  // Test division by value != 1.
  CheckCompose<Homography>(
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: 4 h_11: 5 h_12: 6 "
      "h_20: 1 h_21: -1 ",
      "h_00: -3 h_01: -2 h_02: -1 "
      "h_10: -4 h_11: -5 h_12: -2 "
      "h_20: 7 h_21: 8 ",
      "h_00: 5 h_01: 6 h_02: -1 "
      "h_10: 5 h_11: 7.5 h_12: -4 "
      "h_20: 4 h_21: 5.5 ");
  // Test division by 1.
  CheckCompose<Homography>(
      "h_00: 1 h_01: 2 h_02: 3 "
      "h_10: 4 h_11: 5 h_12: 6 "
      "h_20: 2 h_21: -1 ",
      "h_00: -3 h_01: -2 h_02: -1 "
      "h_10: -4 h_11: -5 h_12: -2 "
      "h_20: 7 h_21: 8 ",
      "h_00: 10 h_01: 12 h_02: -2 "
      "h_10: 10 h_11: 15 h_12: -8 "
      "h_20: 5 h_21: 9 ");
}
// Test conversions between models and their affine representations, and
// vice-versa.
// Checks that ToAffine / FromAffine form an exact round trip between the
// model and the given affine proto.
template <class Model>
void CheckToFromAffine(const char* model_string, const char* affine_string) {
  Model model;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(model_string, &model));
  AffineModel affine;
  ASSERT_TRUE(proto_ns::TextFormat::ParseFromString(affine_string, &affine));
  typedef ModelAdapter<Model> Adapter;
  EXPECT_THAT(Adapter::ToAffine(model), mediapipe::EqualsProto(affine));
  EXPECT_THAT(Adapter::FromAffine(affine), mediapipe::EqualsProto(model));
}
TEST_F(MotionModelsTest, ToFromAffine) {
  CheckToFromAffine<TranslationModel>("dx: 1 dy: 2",
                                      "dx: 1 dy: 2 a: 1 b: 0 c: 0 d: 1");
  CheckToFromAffine<LinearSimilarityModel>("dx: 1 dy: 2 a: 3 b: -4",
                                           "dx: 1 dy: 2 a: 3 b: 4 c: -4 d: 3");
  CheckToFromAffine<AffineModel>("dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6",
                                 "dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6");
  CheckToFromAffine<Homography>(
      "h_00: 3 h_01: 4 h_02: 1 "
      "h_10: 5 h_11: 6 h_12: 2 "
      "h_20: 0 h_21: 0 ",
      "dx: 1 dy: 2 a: 3 b: 4 c: 5 d: 6");
  // IsAffine depends only on the perspective part (h_20, h_21).
  Homography homography;
  ASSERT_TRUE(
      proto_ns::TextFormat::ParseFromString("h_00: 3 h_01: 4 h_02: 1 "
                                            "h_10: 5 h_11: 6 h_12: 2 "
                                            "h_20: 0 h_21: 0 ",
                                            &homography));
  EXPECT_TRUE(HomographyAdapter::IsAffine(homography));
  homography.set_h_20(7);
  homography.set_h_21(8);
  EXPECT_FALSE(HomographyAdapter::IsAffine(homography));
}
TEST_F(MotionModelsTest, ProjectModels) {
  // Express models w.r.t. center for easy testing.
  LinearSimilarityModel center_trans =
      LinearSimilarityAdapter::FromArgs(50, 50, 1, 0);
  LinearSimilarityModel inv_center_trans =
      LinearSimilarityAdapter::FromArgs(-50, -50, 1, 0);
  // 20 x 10 translation with scaling of factor 2 and rotation.
  LinearSimilarityModel lin_sim =
      LinearSimilarityAdapter::FromArgs(20, 10, 2 * cos(0.2), 2 * sin(0.2));
  LinearSimilarityModel lin_sim_center =
      ModelCompose3(center_trans, lin_sim, inv_center_trans);
  // Projecting the centered model to a 100x100 frame is expected to recover
  // its translational part (20, 10).
  TranslationModel translation =
      TranslationAdapter::ProjectFrom(lin_sim_center, 100, 100);
  EXPECT_NEAR(translation.dx(), 20, 1e-3);
  EXPECT_NEAR(translation.dy(), 10, 1e-3);
  translation = ProjectViaFit<TranslationModel>(lin_sim_center, 100, 100);
  EXPECT_NEAR(translation.dx(), 20, 1e-3);
  EXPECT_NEAR(translation.dy(), 10, 1e-3);
  Homography homog = HomographyAdapter::FromArgs(
      1, 0, 10, 0, 1, 20, 5e-3, 1e-3);  // Perspective transform: yaw + pitch.
  Homography homog_center =
      ModelCompose3(HomographyAdapter::Embed(center_trans), homog,
                    HomographyAdapter::Embed(inv_center_trans));
  // Rendering:
  // https://www.wolframalpha.com/input/?i=ListPlot%5B%7B+%7B7,-7%7D,+%7B108,16%7D,+%7B104,96%7D,+%7B12.5,+125%7D,+%7B7,-7%7D%5D
  translation = TranslationAdapter::ProjectFrom(homog_center, 100, 100);
  EXPECT_NEAR(translation.dx(), 10, 1e-3);
  EXPECT_NEAR(translation.dy(), 20, 1e-3);
  // TODO: Investigate how ProjectViaFit can yield similar result.
}
} // namespace
} // namespace mediapipe

View File

@ -0,0 +1,700 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/motion_saliency.h"
#include <math.h>
#include <algorithm>
#include <cmath>
#include <deque>
#include <iterator>
#include <list>
#include <memory>
#include <vector>
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/util/tracking/camera_motion.h"
#include "mediapipe/util/tracking/measure_time.h"
#include "mediapipe/util/tracking/region_flow.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
// Stores options and frame dimensions; no other initialization is performed.
MotionSaliency::MotionSaliency(const MotionSaliencyOptions& options,
                               int frame_width, int frame_height)
    : options_(options),
      frame_width_(frame_width),
      frame_height_(frame_height) {}
MotionSaliency::~MotionSaliency() {}
// Converts region flow features (with an optional per-feature IRLS weight
// override in irls_weights) into SalientLocation candidates and clusters
// them into salient points for this frame via DetermineSalientFrame.
// Features with weights below 1% of the maximum weight are discarded;
// if the maximum weight itself is below 1e-2 no salient points are emitted.
void MotionSaliency::SaliencyFromFeatures(
    const RegionFlowFeatureList& feature_list,
    std::vector<float>* irls_weights,  // optional.
    SalientPointFrame* salient_frame) {
  CHECK(salient_frame);
  CHECK_EQ(frame_width_, feature_list.frame_width());
  CHECK_EQ(frame_height_, feature_list.frame_height());
  if (irls_weights) {
    CHECK_EQ(feature_list.feature_size(), irls_weights->size());
  }
  if (feature_list.feature_size() < 1) {
    return;
  }
  // Maximum weight across all features, taken from the override vector when
  // present, otherwise from the features' own irls_weight fields.
  float max_irls_weight = 0;
  if (irls_weights) {
    max_irls_weight =
        *std::max_element(irls_weights->begin(), irls_weights->end());
  } else {
    struct FeatureIRLSComparator {
      bool operator()(const RegionFlowFeature& lhs,
                      const RegionFlowFeature& rhs) const {
        return lhs.irls_weight() < rhs.irls_weight();
      }
    };
    max_irls_weight =
        std::max_element(feature_list.feature().begin(),
                         feature_list.feature().end(), FeatureIRLSComparator())
            ->irls_weight();
  }
  // Max weight is too small for meaningful mode finding, terminate.
  if (max_irls_weight < 1e-2f) {
    return;
  }
  // Discard small weights that just slow clustering down.
  // (cutoff is two orders of magnitude below the maximum weight).
  const float irls_cutoff = max_irls_weight * 1e-2f;
  int feat_idx = 0;
  // Create SalientLocation's from input feature_list.
  std::vector<SalientLocation> features;
  for (const auto& src_feature : feature_list.feature()) {
    const float weight =
        irls_weights ? (*irls_weights)[feat_idx] : src_feature.irls_weight();
    ++feat_idx;
    // Discard all features with small measure or zero weight from mode finding.
    if (weight < irls_cutoff) {
      continue;
    }
    features.push_back(SalientLocation(FeatureLocation(src_feature), weight));
  }
  DetermineSalientFrame(features, salient_frame);
}
void MotionSaliency::SaliencyFromPoints(const std::vector<Vector2_f>* points,
const std::vector<float>* weights,
SalientPointFrame* salient_frame) {
// TODO: Handle vectors of size zero.
CHECK(salient_frame);
CHECK_EQ(points->size(), weights->size());
float max_weight = *std::max_element(weights->begin(), weights->end());
// Max weight is too small for meaningful mode finding, terminate.
if (max_weight < 1e-2f) {
return;
}
// Discard small weights that just slow clustering down.
const float weight_cutoff = max_weight * 1e-2f;
// Create SalientLocation's from input points.
std::vector<SalientLocation> features;
for (int point_idx = 0; point_idx < points->size(); ++point_idx) {
const float weight = (*weights)[point_idx];
// Discard all features with small measure or zero weight from mode finding.
if (weight < weight_cutoff) {
continue;
}
features.push_back(SalientLocation((*points)[point_idx], weight));
}
DetermineSalientFrame(features, salient_frame);
}
// We only keep those salient points that have neighbors along the temporal
// dimension: a point in frame i is an inlier if at least
// selection_minimum_support points in the frames
// [i - selection_frame_radius, i + selection_frame_radius] (excluding i)
// lie within selection_support_distance of it. If
// rescale_to_median_saliency_weight is set, all surviving weights are scaled
// such that the median weight equals options_.saliency_weight().
void MotionSaliency::SelectSaliencyInliers(
    std::vector<SalientPointFrame*>* motion_saliency,
    bool rescale_to_median_saliency_weight) {
  float scale = 1.0;
  if (rescale_to_median_saliency_weight) {
    // Compute median saliency weight across all frames, to rescale saliency.
    std::vector<float> saliency_weights;
    for (int i = 0; i < motion_saliency->size(); ++i) {
      for (const auto& salient_point : (*motion_saliency)[i]->point()) {
        saliency_weights.push_back(salient_point.weight());
      }
    }
    // Nothing to filter in the frame chunk.
    if (saliency_weights.empty()) {
      return;
    }
    // nth_element partially sorts such that the median lands at median_iter.
    auto median_iter = saliency_weights.begin() + saliency_weights.size() / 2;
    std::nth_element(saliency_weights.begin(), median_iter,
                     saliency_weights.end());
    const float median_weight = *median_iter;
    if (median_weight > 0) {
      scale = options_.saliency_weight() / median_weight;
    }
  }
  // Inliers are collected per frame into a scratch list and swapped in at
  // the end, so the support test below always reads the unmodified input.
  SaliencyPointList inlier_saliency(motion_saliency->size());
  const float sq_support_distance = options_.selection_support_distance() *
                                    options_.selection_support_distance();
  // Test each point salient point for inlierness.
  for (int i = 0; i < motion_saliency->size(); ++i) {
    for (const auto& salient_point : (*motion_saliency)[i]->point()) {
      int support = 0;
      Vector2_f salient_location(salient_point.norm_point_x(),
                                 salient_point.norm_point_y());
      // Find supporting points (saliency points close enough to current one)
      // in adjacent frames. Linear Complexity.
      for (int j = std::max<int>(0, i - options_.selection_frame_radius()),
               end_j = std::min<int>(i + options_.selection_frame_radius(),
                                     motion_saliency->size() - 1);
           j <= end_j; ++j) {
        if (i == j) {
          continue;
        }
        for (const auto& compare_point : (*motion_saliency)[j]->point()) {
          Vector2_f compare_location(compare_point.norm_point_x(),
                                     compare_point.norm_point_y());
          // Compare squared distances to avoid a sqrt per pair.
          if ((salient_location - compare_location).Norm2() <=
              sq_support_distance) {
            ++support;
          }
        }
      }  // end neighbor frames iteration.
      if (support >= options_.selection_minimum_support()) {
        SalientPoint* scaled_point = inlier_saliency[i].add_point();
        scaled_point->CopyFrom(salient_point);
        scaled_point->set_weight(scaled_point->weight() * scale);
      }
    }  // end point traversal.
  }    // end frame traversal.
  for (int k = 0; k < motion_saliency->size(); ++k) {
    (*motion_saliency)[k]->Swap(&inlier_saliency[k]);
  }
}
// Smooths salient points in space and time with a bilateral-style Gaussian:
// each point in each frame is replaced by a weighted average over points in
// frames within time_radius, where the weight combines a temporal Gaussian
// (filtering_sigma_time), a spatial Gaussian (filtering_sigma_space) and the
// test point's own weight. Location, bounds, weight and ellipse parameters
// are all filtered; the final angle is wrapped back into [0, pi].
void MotionSaliency::FilterMotionSaliency(
    std::vector<SalientPointFrame*>* saliency_point_list) {
  CHECK(saliency_point_list != nullptr);
  const float sigma_time = options_.filtering_sigma_time();
  const float sigma_space = options_.filtering_sigma_space();
  // Temporal window of +/- 1.5 sigmas.
  const int time_radius = ceil(sigma_time * 1.5);
  const int time_diameter = 2 * time_radius + 1;
  // Create lookup table for weights.
  std::vector<float> time_weights(time_diameter);
  const float time_coeff = -0.5f / (sigma_time * sigma_time);
  for (int i = -time_radius, time_idx = 0; i <= time_radius; ++i, ++time_idx) {
    time_weights[time_idx] = std::exp(time_coeff * i * i);
  }
  // Ignore points further than 1.65 sigmas away (includes 90% of
  // distribution).
  const float space_cutoff = 1.65 * sigma_space;
  const float space_exp_scale = -0.5f / (sigma_space * sigma_space);
  // Copy saliency points into a padded buffer of
  // num_frames + 2 * time_radius frames, so the filter below can read a full
  // temporal window at both ends.
  const int num_frames = saliency_point_list->size();
  std::vector<SalientPointFrame> points(num_frames + 2 * time_radius);
  for (int k = 0; k < saliency_point_list->size(); ++k) {
    points[time_radius + k].CopyFrom(*(*saliency_point_list)[k]);
  }
  // Copy border: mirror the first / last time_radius frames into the padding
  // at the opposite ends (reverse iterators implement the mirroring).
  std::copy(points.rbegin() + time_radius, points.rbegin() + 2 * time_radius,
            points.end() - time_radius);
  std::copy(points.begin() + time_radius, points.begin() + 2 * time_radius,
            points.rend() - time_radius);
  // Apply filter. Results are written into saliency_point_list while the
  // padded copy 'points' provides the unmodified filter input.
  for (int i = time_radius; i < num_frames + time_radius; ++i) {
    const int frame_idx = i - time_radius;
    for (auto& sample_point :
         *(*saliency_point_list)[frame_idx]->mutable_point()) {
      Vector2_f point_sum(0, 0);
      // Sum for left, bottom, right, top tuple.
      // NOTE(review): relies on Vector4_f default-initializing to zero, as
      // Vector2_f/Vector3_f are explicitly zeroed here — confirm.
      Vector4_f bound_sum;
      Vector3_f ellipse_sum(0, 0, 0);  // Captures major, minor and angle.
      float weight_sum = 0;
      float filter_sum = 0;
      const float sample_angle = sample_point.angle();
      for (int k = i - time_radius, time_idx = 0; k <= i + time_radius;
           ++k, ++time_idx) {
        for (const auto& test_point : points[k].point()) {
          // Spatial distance in the normalized domain.
          const float diff = std::hypot(
              test_point.norm_point_y() - sample_point.norm_point_y(),
              test_point.norm_point_x() - sample_point.norm_point_x());
          if (diff > space_cutoff) {
            continue;
          }
          // Combined temporal x saliency x spatial weight.
          const float weight = time_weights[time_idx] * test_point.weight() *
                               std::exp(diff * diff * space_exp_scale);
          filter_sum += weight;
          point_sum +=
              Vector2_f(test_point.norm_point_x(), test_point.norm_point_y()) *
              weight;
          bound_sum += Vector4_f(test_point.left(), test_point.bottom(),
                                 test_point.right(), test_point.top()) *
                       weight;
          weight_sum += test_point.weight() * weight;
          // Ensure test_point and sample are less than pi / 2 apart.
          float test_angle = test_point.angle();
          if (fabs(test_angle - sample_angle) > M_PI / 2) {
            if (sample_angle > M_PI / 2) {
              test_angle += M_PI;
            } else {
              test_angle -= M_PI;
            }
          }
          ellipse_sum += Vector3_f(test_point.norm_major(),
                                   test_point.norm_minor(), test_angle) *
                         weight;
        }
      }
      // Normalize the accumulated sums by the total filter weight.
      if (filter_sum > 0) {
        const float inv_filter_sum = 1.0f / filter_sum;
        point_sum *= inv_filter_sum;
        bound_sum *= inv_filter_sum;
        weight_sum *= inv_filter_sum;
        ellipse_sum *= inv_filter_sum;
      }
      sample_point.set_norm_point_x(point_sum.x());
      sample_point.set_norm_point_y(point_sum.y());
      sample_point.set_left(bound_sum.x());
      sample_point.set_bottom(bound_sum.y());
      sample_point.set_right(bound_sum.z());
      sample_point.set_top(bound_sum.w());
      sample_point.set_weight(weight_sum);
      sample_point.set_norm_major(ellipse_sum.x());
      sample_point.set_norm_minor(ellipse_sum.y());
      sample_point.set_angle(ellipse_sum.z());
      // Wrap the filtered angle back into [0, pi] (the averaging above may
      // have pushed it outside due to the +/- pi adjustment).
      if (sample_point.angle() > M_PI) {
        sample_point.set_angle(sample_point.angle() - M_PI);
      }
      if (sample_point.angle() < 0) {
        sample_point.set_angle(sample_point.angle() + M_PI);
      }
    }
  }
}
// Collapses all salient points of each frame into a single mean point.
// The collapsed point carries the unweighted average location, the supplied
// normalized bounds (left, bottom, right, top) and weight 1. Frames without
// any salient points stay empty in the output.
void MotionSaliency::CollapseMotionSaliency(
    const SaliencyPointList& input_saliency, const Vector4_f& bounds,
    SaliencyPointList* output_saliency) {
  CHECK(output_saliency);
  output_saliency->clear();
  output_saliency->resize(input_saliency.size());

  for (int frame = 0; frame < input_saliency.size(); ++frame) {
    // Accumulate an unweighted sum of all point locations in this frame.
    float sum_x = 0.0f;
    float sum_y = 0.0f;
    float num_points = 0.0f;
    for (const auto& salient_point : input_saliency[frame].point()) {
      sum_x += salient_point.norm_point_x();
      sum_y += salient_point.norm_point_y();
      num_points += 1;
    }

    if (num_points <= 0) {
      continue;  // No salient points in this frame; leave output frame empty.
    }

    SalientPoint* collapsed = (*output_saliency)[frame].add_point();
    collapsed->set_norm_point_x(sum_x / num_points);
    collapsed->set_norm_point_y(sum_y / num_points);
    collapsed->set_left(bounds.x());
    collapsed->set_bottom(bounds.y());
    collapsed->set_right(bounds.z());
    collapsed->set_top(bounds.w());
    collapsed->set_weight(1.0f);
  }
}
namespace {

// Describes feature mode for a feature in a RegionFlowFeatureList stored
// at index feature_idx.
struct FeatureMode {
  Vector2_f location;     // Converged mode location in the image domain.
  float irls_weight;      // Weight of the originating feature.
  int feature_idx;        // Index of the originating feature; -1 flags the
                          // mode as already processed during averaging.
  int mode_bin;           // Grid bin the converged mode falls into.
};

// Determines mode for each feature in feature_view.
// Returns modes as list of pointers in mode_ptrs. Actual modes are stored
// binned into a grid of equal size as the passed FeatureGrid
// (bins of size grid_resolution x grid_resolution).
// Implements weighted mean shift: each feature's position is iteratively
// replaced by the weighted average of all features within the spatial
// Gaussian support (space_lut, indexed via space_scale), until the update
// moves less than 0.1% of the bandwidth or kMaxIter iterations are reached.
void DetermineFeatureModes(
    const FeatureFrame<MotionSaliency::SalientLocation>& features,
    float grid_resolution, const Vector2_i& grid_dims, float band_width,
    const FeatureGrid<MotionSaliency::SalientLocation>& feature_grid,
    const std::vector<std::vector<int>>& feature_taps,
    const std::vector<float>& space_lut, float space_scale,
    std::vector<std::list<FeatureMode>>* mode_grid,
    std::vector<FeatureMode*>* mode_ptrs) {
  CHECK(mode_grid);
  CHECK(mode_ptrs);
  const int num_features = features.size();
  mode_ptrs->reserve(num_features);
  const float grid_scale = 1.0f / grid_resolution;
  int feature_idx = 0;
  const int kMaxIter = 100;
  // Set convergence radius to 0.1% of bandwidth.
  const float sq_conv_radius = band_width * band_width * 1e-6f;
  for (const auto& feature_ptr : features) {
    Vector2_f center = feature_ptr->pt;
    int iter = 0;
    for (; iter < kMaxIter; ++iter) {
      // Locate the grid bin of the current center; feature_taps lists the
      // neighboring bins to visit for this location.
      const int bin_x = center.x() * grid_scale;
      const int bin_y = center.y() * grid_scale;
      const int grid_loc = bin_y * grid_dims.x() + bin_x;
      float sum_weight = 0;
      Vector2_f new_center;
      for (const auto& bin : feature_taps[grid_loc]) {
        for (const auto& test_feat_ptr : feature_grid[bin]) {
          const float dist = (test_feat_ptr->pt - center).Norm();
          // Spatial Gaussian (via LUT) times the feature's saliency weight.
          const float weight = space_lut[static_cast<int>(dist * space_scale)] *
                               test_feat_ptr->weight;
          sum_weight += weight;
          new_center += weight * test_feat_ptr->pt;
        }
      }
      if (sum_weight > 0) {
        new_center *= (1.0f / sum_weight);
        if ((center - new_center).Norm2() < sq_conv_radius) {
          // Converged: displacement below 0.1% of bandwidth.
          center = new_center;
          break;
        } else {
          center = new_center;
        }
      } else {
        LOG(WARNING) << "No features found in band_width radius, "
                     << "should not happen. ";
        break;
      }
    }
    // Bin the converged mode and record a pointer to it for later sorting.
    const int mode_bin_x = center.x() * grid_scale;
    const int mode_bin_y = center.y() * grid_scale;
    const int mode_grid_loc = mode_bin_y * grid_dims.x() + mode_bin_x;
    FeatureMode mode{center, feature_ptr->weight, feature_idx, mode_grid_loc};
    (*mode_grid)[mode_grid_loc].push_back(mode);
    // std::list guarantees the address of the appended element stays stable.
    FeatureMode* added_mode = &(*mode_grid)[mode_grid_loc].back();
    (*mode_ptrs).push_back(added_mode);
    ++feature_idx;
  }
}

}  // namespace.
// Locates modes in the set of SalientLocation's via weighted mean shift
// (see DetermineFeatureModes), then merges per-feature modes that converged
// within band_width of each other, computing for each merged mode its
// weighted mean location, total assignment weight and the covariance-derived
// ellipse (major/minor axis and angle). Only modes whose total weight reaches
// options_.min_irls_mode_weight() are emitted; results are sorted by
// decreasing assignment weight. locations is not mutated logically (only
// pointed to internally).
void MotionSaliency::SalientModeFinding(std::vector<SalientLocation>* locations,
                                        std::vector<SalientMode>* modes) {
  CHECK(modes);
  CHECK(locations);
  if (locations->empty()) {
    return;
  }
  // Scale band_width to image domain.
  const float band_width =
      hypot(frame_width_, frame_height_) * options_.mode_band_width();
  // Select all salient locations with non-zero weight.
  FeatureFrame<SalientLocation> salient_features;
  salient_features.reserve(locations->size());
  for (auto& loc : *locations) {
    if (loc.weight > 1e-6) {
      salient_features.push_back(&loc);
    }
  }
  const int num_features = salient_features.size();
  if (num_features == 0) {
    return;
  }
  // Build feature grid according to bandwidth.
  std::vector<FeatureGrid<SalientLocation>> feature_grids;
  std::vector<std::vector<int>> feature_taps;
  // Guarantee at least 1.5 sigmas in each direction are captured with
  // tap 3 filtering (86 % of the data).
  const float grid_resolution = 1.5f * band_width;
  Vector2_i grid_dims;
  BuildFeatureGrid(
      frame_width_, frame_height_, grid_resolution, {salient_features},
      [](const SalientLocation& l) -> Vector2_f { return l.pt; }, &feature_taps,
      nullptr, &grid_dims, &feature_grids);
  // Just one frame input, expect one grid as output.
  CHECK_EQ(1, feature_grids.size());
  const auto& feature_grid = feature_grids[0];
  // Setup Gaussian LUT for smoothing in space, using 2^10 discretization bins.
  const int lut_bins = 1 << 10;
  std::vector<float> space_lut(lut_bins);
  // Using 3 tap smoothing, max distance is 2 bin diagonals.
  // We use maximum of 2 * sqrt(2) * bin_radius plus 1% room in case maximum
  // value is attained.
  const float max_space_diff = sqrt(2.0) * 2.f * grid_resolution * 1.01f;
  const float space_bin_size = max_space_diff / lut_bins;
  const float space_scale = 1.0f / space_bin_size;
  const float space_coeff = -0.5f / (band_width * band_width);
  for (int i = 0; i < lut_bins; ++i) {
    const float value = i * space_bin_size;
    space_lut[i] = std::exp(value * value * space_coeff);
  }
  // Store modes for each grid bin (to be averaged later).
  std::vector<std::list<FeatureMode>> mode_grid(grid_dims.x() * grid_dims.y());
  std::vector<FeatureMode*> mode_ptrs;
  DetermineFeatureModes(salient_features, grid_resolution, grid_dims,
                        band_width, feature_grid, feature_taps, space_lut,
                        space_scale, &mode_grid, &mode_ptrs);
  // Read out modes, ordered by decreasing weight.
  struct FeatureModeComparator {
    bool operator()(const FeatureMode* mode_lhs,
                    const FeatureMode* mode_rhs) const {
      return mode_lhs->irls_weight > mode_rhs->irls_weight;
    }
  };
  // Sort pointers, to keep order immutable during flagging operations.
  std::sort(mode_ptrs.begin(), mode_ptrs.end(), FeatureModeComparator());
  for (int m = 0; m < mode_ptrs.size(); ++m) {
    // We mark a mode as processed by assigning -1 to its index.
    if (mode_ptrs[m]->feature_idx < 0) {
      continue;
    }
    FeatureMode* mode = mode_ptrs[m];
    // Average modes within band_width based on irls_weight * spatial weight.
    // Accumulate first and second order moments of the contributing feature
    // locations (feat_*) for the covariance / ellipse computation below.
    double sum_weight = mode->irls_weight;
    double mode_x = sum_weight * mode->location.x();
    double mode_y = sum_weight * mode->location.y();
    const Vector2_f& feat_loc = salient_features[mode->feature_idx]->pt;
    double feat_x = sum_weight * feat_loc.x();
    double feat_y = sum_weight * feat_loc.y();
    double feat_xx = sum_weight * feat_loc.x() * feat_loc.x();
    double feat_xy = sum_weight * feat_loc.x() * feat_loc.y();
    double feat_yy = sum_weight * feat_loc.y() * feat_loc.y();
    mode->feature_idx = -1;  // Flag as processed, does not change order
                             // of traversal.
    for (const auto& bin : feature_taps[mode->mode_bin]) {
      for (auto& test_mode : mode_grid[bin]) {
        if (test_mode.feature_idx >= 0) {
          const float dist = (test_mode.location - mode->location).Norm();
          if (dist <= band_width) {
            const Vector2_f test_loc =
                salient_features[test_mode.feature_idx]->pt;
            const float weight =
                space_lut[static_cast<int>(dist * space_scale)] *
                test_mode.irls_weight;
            sum_weight += weight;
            mode_x += weight * test_mode.location.x();
            mode_y += weight * test_mode.location.y();
            const float test_loc_x_w = weight * test_loc.x();
            const float test_loc_y_w = weight * test_loc.y();
            feat_x += test_loc_x_w;
            feat_y += test_loc_y_w;
            feat_xx += test_loc_x_w * test_loc.x();
            feat_xy += test_loc_x_w * test_loc.y();
            feat_yy += test_loc_y_w * test_loc.y();
            // Flag as processed, does not change order of traversal.
            test_mode.feature_idx = -1;
          }
        }
      }
    }
    if (sum_weight >= options_.min_irls_mode_weight()) {
      double inv_sum_weight = 1.0f / sum_weight;
      mode_x *= inv_sum_weight;
      mode_y *= inv_sum_weight;
      feat_x *= inv_sum_weight;
      feat_y *= inv_sum_weight;
      feat_xx *= inv_sum_weight;
      feat_xy *= inv_sum_weight;
      feat_yy *= inv_sum_weight;
      // Covariance matrix of feature locations around the mode center:
      // a, d are the diagonal entries, bc the symmetric off-diagonal one.
      const float a = feat_xx - 2.0 * feat_x * mode_x + mode_x * mode_x;
      const float bc =
          feat_xy - feat_x * mode_y - feat_y * mode_x + mode_x * mode_y;
      const float d = feat_yy - 2.0 * feat_y * mode_y + mode_y * mode_y;
      Vector2_f axis_magnitude;
      float angle;
      if (!EllipseFromCovariance(a, bc, d, &axis_magnitude, &angle)) {
        // Degenerate covariance: fall back to a unit circle.
        angle = 0;
        axis_magnitude = Vector2_f(1, 1);
      } else {
        if (angle < 0) {
          angle += M_PI;
        }
        CHECK_GE(angle, 0);
        CHECK_LE(angle, M_PI + 1e-3);
      }
      SalientMode irls_mode;
      irls_mode.location = Vector2_f(mode_x, mode_y);
      irls_mode.assignment_weight = sum_weight;
      irls_mode.axis_magnitude = axis_magnitude;
      irls_mode.angle = angle;
      modes->push_back(irls_mode);
    }
  }
  // Sort modes by decreasing weight.
  struct ModeWeightCompare {
    bool operator()(const SalientMode& lhs, const SalientMode& rhs) const {
      return lhs.assignment_weight > rhs.assignment_weight;
    }
  };
  std::sort(modes->begin(), modes->end(), ModeWeightCompare());
}
// Determines the salient frame for a list of SalientLocations by performing
// mode finding and scales each point based on frame size.
void MotionSaliency::DetermineSalientFrame(
std::vector<SalientLocation> locations, SalientPointFrame* salient_frame) {
CHECK(salient_frame);
std::vector<SalientMode> modes;
{
MEASURE_TIME << "Mode finding";
SalientModeFinding(&locations, &modes);
}
const float denom_x = 1.0f / frame_width_;
const float denom_y = 1.0f / frame_height_;
// Convert to salient points.
for (int mode_idx = 0,
mode_sz = std::min<int>(modes.size(), options_.num_top_irls_modes());
mode_idx < mode_sz; ++mode_idx) {
SalientPoint* pt = salient_frame->add_point();
pt->set_norm_point_x(modes[mode_idx].location.x());
pt->set_norm_point_y(modes[mode_idx].location.y());
pt->set_left(options_.bound_left());
pt->set_bottom(options_.bound_bottom());
pt->set_right(options_.bound_right());
pt->set_top(options_.bound_top());
pt->set_norm_major(modes[mode_idx].axis_magnitude.x());
pt->set_norm_minor(modes[mode_idx].axis_magnitude.y());
pt->set_angle(modes[mode_idx].angle);
pt->set_weight(modes[mode_idx].assignment_weight *
options_.saliency_weight());
ScaleSalientPoint(denom_x, denom_y, pt);
}
}
// Maps each feature's irls weight to a foreground score in [0, 1] (see the
// header for the full contract). Features flagged as outliers
// (irls_weight == 0) map to 0; all other scores are clamped to at least
// kEpsilon so they are never confused with outliers. If camera_motion is
// given, the threshold is scaled by its inlier coverage, tightening the
// foreground test on unstable frames.
void ForegroundWeightsFromFeatures(const RegionFlowFeatureList& feature_list,
                                   float foreground_threshold,
                                   float foreground_gamma,
                                   const CameraMotion* camera_motion,
                                   std::vector<float>* weights) {
  CHECK(weights != nullptr);
  weights->clear();
  constexpr float kEpsilon = 1e-4f;
  CHECK_GT(foreground_threshold, 0.0f);
  if (camera_motion) {
    foreground_threshold *=
        std::max(kEpsilon, InlierCoverage(*camera_motion, false));
  }
  const float weight_denom = 1.0f / foreground_threshold;
  // Map weights to foreground measure and determine minimum irls weight.
  for (const auto& feature : feature_list.feature()) {
    // Skip marked outliers.
    if (feature.irls_weight() == 0) {
      weights->push_back(0.0f);
      continue;
    }
    // Maps an irls_weight of magnitude weight_denom (from above) to zero,
    // with values below weight_denom assigned linearly mapped (zero is mapped
    // to 1). Avoid mapping to zero as it used to mark outliers.
    const float foreground_measure =
        std::max(0.0f, 1.0f - feature.irls_weight() * weight_denom);
    // Skip the pow() call when gamma is effectively 1.
    if (std::abs(foreground_gamma - 1.0f) < 1e-3f) {
      weights->push_back(std::max(kEpsilon, foreground_measure));
    } else {
      weights->push_back(
          std::max(kEpsilon, std::pow(foreground_measure, foreground_gamma)));
    }
  }
  CHECK_EQ(feature_list.feature_size(), weights->size());
}
} // namespace mediapipe

View File

@ -0,0 +1,150 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Computes MotionSaliency points that can be used for stabilization and
// retargeting.
#ifndef MEDIAPIPE_UTIL_TRACKING_MOTION_SALIENCY_H_
#define MEDIAPIPE_UTIL_TRACKING_MOTION_SALIENCY_H_
#include <utility>
#include <vector>
#include "mediapipe/framework/port/vector.h"
#include "mediapipe/util/tracking/motion_saliency.pb.h"
#include "mediapipe/util/tracking/region_flow.h"
namespace mediapipe {
class RegionFlowFeatureList;
class RegionFlowFrame;
class SalientPointFrame;
} // namespace mediapipe
namespace mediapipe {
// Computes salient points (modes of feature saliency weight) for frames of
// size frame_width x frame_height. See motion_saliency.proto for the tuning
// options.
class MotionSaliency {
 public:
  MotionSaliency(const MotionSaliencyOptions& options, int frame_width,
                 int frame_height);
  ~MotionSaliency();

  // Finds modes in the RegionFlowFeatureList (clusters for high IRLS weight,
  // per default features agreeing with the background motion).
  // Optionally, per feature irls weights can be supplied instead of using the
  // features weight to adapt modes that will be found, e.g. see
  // ForegroundWeightsFromFeatures below.
  void SaliencyFromFeatures(const RegionFlowFeatureList& feature_list,
                            std::vector<float>* irls_weights,  // optional.
                            SalientPointFrame* salient_frame);

  // Finds saliency points (modes) from a list of points and their respective
  // weights, outputs a SalientPointFrame.
  void SaliencyFromPoints(const std::vector<Vector2_f>* points,
                          const std::vector<float>* weights,
                          SalientPointFrame* salient_frame);

  // Selects saliency inliers, by searching for close-by salient points
  // (within fractional MotionSaliencyOptions::selection_support_distance)
  // across adjacent frames (considered are
  // #MotionSaliencyOptions::selection_frame_radius before and after the
  // current frame).
  // If at least #MotionSaliencyOptions::selection_minimum_support
  // supporting points are found the tested salient point is kept, otherwise
  // discarded.
  // If desired performs rescaling, such that the median salient point weight
  // equals MotionSaliencyOptions::saliency_weight().
  void SelectSaliencyInliers(std::vector<SalientPointFrame*>* motion_saliency,
                             bool rescale_to_median_saliency_weight);

  // Averages all salient points (unweighted average) per frame. The resulting
  // mean salient point is assigned weight one, and the specified normalized
  // bounds (as tuple left, bottom, right, top).
  void CollapseMotionSaliency(const SaliencyPointList& input_saliency,
                              const Vector4_f& bounds,
                              SaliencyPointList* output_saliency);

  // Smooths saliency in space and time (Gaussian weighting controlled by
  // MotionSaliencyOptions::filtering_sigma_space / filtering_sigma_time).
  void FilterMotionSaliency(
      std::vector<SalientPointFrame*>* saliency_point_list);

  // Aggregates location in image domain and salient weight.
  struct SalientLocation {
    SalientLocation() {}
    SalientLocation(const Vector2_f& _pt, float _weight)
        : pt(_pt), weight(_weight) {}
    Vector2_f pt;       // Location in the image domain (pixels).
    float weight = 0;   // Saliency weight; zero-weight locations are ignored.
  };

 private:
  // Result of mode finding in a set of SalientLocation's
  // (using mean shift with bilateral weights, i.e. weight * spatial
  // gaussian weighting).
  // Only modes for which the sum of total saliency weight is
  // above min_irls_mode_weight are returned.
  // Modes are in the image domain as 2D points, with the sum of their
  // assignment weights and spatial extent along major and minor axis.
  // Modes are sorted w.r.t. their assignment irls weights (from highest to
  // lowest).
  struct SalientMode {
    Vector2_f location;
    // Total sum of irls weights assigned to this mode.
    float assignment_weight = 0;
    // Magnitude of major and minor axis stored in x and y, respectively.
    Vector2_f axis_magnitude;
    // Angle in radians w.r.t. x-axis.
    float angle = 0;
  };

  // Locates modes in *locations via mean shift.
  // Note: input vector locations is not mutated by function.
  void SalientModeFinding(std::vector<SalientLocation>* locations,
                          std::vector<SalientMode>* modes);

  // Determines the salient frame for a list of SalientLocations by performing
  // mode finding and scaling each point based on frame size.
  void DetermineSalientFrame(std::vector<SalientLocation> locations,
                             SalientPointFrame* salient_frame);

  MotionSaliencyOptions options_;
  int frame_width_;
  int frame_height_;
};
// Returns foregroundness weights in [0, 1] for each feature, by mapping irls
// weight to foreground score in [0, 1].
// In particular, the foreground threshold indicates the *inverse* registration
// error (i.e. the irls weight) that is deemed a complete inlier.
// Weights in the interval [0, foreground_threshold] (corresponding to
// pixel errors in the interval [1 / foreground_threshold, inf])
// are mapped to 1 - [0, 1], i.e. foreground threshold is mapped to zero
// with weights below the threshold being assigned values > 0.
// Therefore, larger values will increase amount of detected foreground
// as well as noise.
// In addition, foreground_gamma's < 1 can be used to increase the resolution
// of small foreground motions (irls weight close to the foreground_threshold)
// at the expense of larger foreground motions (irls weight close to zero).
// If optional parameter camera_motion is specified, the passed foreground
// threshold is scaled by the InlierCoverage of the camera_motion
// (which is in [0, 1]). That is for unstable frames with small coverage,
// the threshold is tighter and fewer features are considered foreground.
void ForegroundWeightsFromFeatures(
    const RegionFlowFeatureList& feature_list,
    float foreground_threshold,  // 0.5 is a good default value.
    float foreground_gamma,      // use 1.0 for default.
    const CameraMotion* camera_motion,  // optional, can be nullptr.
    std::vector<float>* weights);
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_MOTION_SALIENCY_H_

View File

@ -0,0 +1,71 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
// Next tag: 17
message MotionSaliencyOptions {
  // Standard normalized bounds and weights used to initialize salient points.
  // See region_flow.proto for details.
  optional float bound_left = 1 [default = 0.3];
  optional float bound_bottom = 2 [default = 0.3];
  optional float bound_right = 15 [default = 0.3];
  optional float bound_top = 16 [default = 0.3];
  optional float saliency_weight = 3 [default = 20];

  // If set, scales saliency_weight by flow magnitude.
  optional bool scale_weight_by_flow_magnitude = 8 [default = false];

  // Minimum number of features within a region to be considered salient.
  // Only applicable for functions accepting RegionFlowFrames.
  optional int32 min_features = 4 [default = 5];

  // If set, only considers regions flagged as foreground.
  optional bool use_only_foreground_regions = 9 [default = false];

  // Specifies roughly number of foreground features mapped to one mode,
  // for mode to be considered salient.
  optional float min_irls_mode_weight = 10 [default = 10];

  // Only returns the top N irls modes.
  optional int32 num_top_irls_modes = 11 [default = 3];

  // Mode finding is performed with a fraction radius of 10% of frame
  // diameter by default.
  optional float mode_band_width = 12 [default = 0.10];

  // We select salient points along the temporal dimension only, keeping those
  // that have sufficient support (in form of neighboring salient points). For
  // every salient point in frame n, all points in frames
  // [n - selection_frame_radius, n + selection_frame_radius] are tested,
  // whether they support the current test point.
  optional int32 selection_frame_radius = 5 [default = 5];

  // Fractional distance to be considered a supporting salient point for a test
  // point.
  optional float selection_support_distance = 6 [default = 0.2];

  // Minimum number of supporting salient points that need to be present in
  // order for a point to be considered an inlier.
  optional int32 selection_minimum_support = 7 [default = 4];

  // Sigma in space (normalized domain). Used by FilterMotionSaliency.
  optional float filtering_sigma_space = 13 [default = 0.05];

  // Sigma in time (in frames). Used by FilterMotionSaliency.
  optional float filtering_sigma_time = 14 [default = 5];
}

View File

@ -0,0 +1,36 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/parallel_invoker.h"
// Choose between ThreadPool, OpenMP and serial execution.
// Note only one parallel_using_* directive can be active.
// Parallelization backend to use at runtime (a PARALLEL_INVOKER_MODE value).
// NOTE(review): initialized to PARALLEL_INVOKER_MAX_VALUE, which is not a
// concrete backend — presumably callers set this before invoking ParallelFor;
// confirm intended default.
int flags_parallel_invoker_mode = PARALLEL_INVOKER_MAX_VALUE;

// Maximum number of worker threads used by the shared invoker thread pool.
int flags_parallel_invoker_max_threads = 4;
namespace mediapipe {
#if defined(PARALLEL_INVOKER_ACTIVE)
// Returns the process-wide thread pool shared by all parallel invocations.
// The pool is created (and its workers started) on first call with
// flags_parallel_invoker_max_threads threads, and is never destroyed.
ThreadPool* ParallelInvokerThreadPool() {
  static ThreadPool* const pool = [] {
    auto* created =
        new ThreadPool("ParallelInvoker", flags_parallel_invoker_max_threads);
    created->StartWorkers();
    return created;
  }();
  return pool;
}
#endif
} // namespace mediapipe

View File

@ -0,0 +1,507 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Parallel for loop execution.
// For details adapt parallel_using_* flags defined in parallel_invoker.cc.
// Usage example (for 1D):
// Define Functor or lambda function that implements:
// void operator()(const BlockedRange & range) const;
// (in addition functor needs to be copyable).
// Execute a for loop in parallel from 0 to N via:
// ParallelFor(0, // start_index
// num_frames, // end_index, exclusive
// 1 // number of elements processed per iteration
// [](const BlockedRange& range) {
// // Process per-thread sub-range
// for (int i = range.begin(); i < range.end(); ++i) {
// // Process i'th item.
// }
// }
// Specific implementation to copy a vector of images in parallel.
// class CopyInvoker {
// public:
// CopyInvoker(const vector<cv::Mat>& inputs,
// vector<cv::Mat*>* outputs)
// : inputs_(inputs), outputs_(outputs) {
// }
// CopyInvoker(const CopyInvoker& rhs)
// : inputs_(rhs.inputs_), outputs_(rhs.outputs) {
// }
// void operator()(const BlockedRange& range) {
// for (int frame = range.begin(); frame < range.end(); ++frame) {
// inputs_[frame].copyTo(*(*outputs_)[frame]);
// }
// }
// private:
// const vector<cv::Mat>& inputs_;
// vector<cv::Mat*>* outputs_;
// }
// vector<cv::Mat> inputs;
// vector<cv::Mat*> outputs;
// ParallelFor(0, num_frames, 1, CopyInvoker(inputs, &outputs));
//
// OR (with lambdas):
// ParallelFor(0, num_frames, 1,
// [&inputs, &outputs](const BlockedRange& range) {
// for (int frame = range.begin(); frame < range.end(); ++frame) {
// inputs[frame].copyTo(*(outputs)[frame]);
// }
// }
#ifndef MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_
#define MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_
#include <stddef.h>
#include <memory>
#include "absl/synchronization/mutex.h"
#include "mediapipe/framework/port/logging.h"
#ifdef PARALLEL_INVOKER_ACTIVE
#include "mediapipe/framework/port/threadpool.h"
#ifdef __APPLE__
#include <dispatch/dispatch.h>
#include <stdatomic.h>
#endif
#endif // PARALLEL_INVOKER_ACTIVE
// Specifies parallelization implementation to use.
// The selected value is validated and, if unsupported on the current
// platform/build, coerced to a supported mode by CheckAndSetInvokerOptions().
enum PARALLEL_INVOKER_MODE {
  PARALLEL_INVOKER_NONE = 0,         // Uses single threaded execution
  PARALLEL_INVOKER_THREAD_POOL = 1,  // Uses //thread/threadpool
  PARALLEL_INVOKER_OPENMP = 2,       // Uses OpenMP (requires compiler support)
  PARALLEL_INVOKER_GCD = 3,          // Uses GCD (Apple)
  PARALLEL_INVOKER_MAX_VALUE = 4,    // Increase when adding more modes
};
// Requested invoker mode (one of PARALLEL_INVOKER_MODE) and the maximum
// number of worker threads; defined out-of-line in the accompanying .cc.
extern int flags_parallel_invoker_mode;
extern int flags_parallel_invoker_max_threads;
// Note flag: Parallel processing only activated if
// PARALLEL_INVOKER_ACTIVE is defined.
namespace mediapipe {
// Describes the half-open range [begin, end) together with the grain size
// used to partition it into equal blocks (the final block may be smaller
// than grain_size).
class BlockedRange {
 public:
  BlockedRange(int begin, int end, int grain_size) {
    begin_ = begin;
    end_ = end;
    grain_size_ = grain_size;
  }

  // Accessors for the range bounds and partition granularity.
  int begin() const { return begin_; }
  int end() const { return end_; }
  int grain_size() const { return grain_size_; }

 private:
  int begin_ = 0;
  int end_ = 0;
  int grain_size_ = 0;
};
// Partitions the range row_range x col_range into equal
// blocks of size row_range.grain_size() x col_range.grain_size() each
// (except last column and row might be of size less than grain_size in one
// or both of their dimensions).
class BlockedRange2D {
 public:
  // Stores copies of both ranges; safe to keep past the ctor arguments'
  // lifetimes.
  BlockedRange2D(const BlockedRange& rows, const BlockedRange& cols)
      : rows_(rows), cols_(cols) {}

  // Per-dimension sub-ranges.
  const BlockedRange& rows() const { return rows_; }
  const BlockedRange& cols() const { return cols_; }

 private:
  BlockedRange rows_;
  BlockedRange cols_;
};
#ifdef PARALLEL_INVOKER_ACTIVE
// Singleton ThreadPool shared by all ParallelFor / ParallelFor2D calls.
ThreadPool* ParallelInvokerThreadPool();

#ifdef __APPLE__
// Enable to allow GCD as an option beside ThreadPool.
#define USE_PARALLEL_INVOKER_GCD 1
// Non-zero (i.e. in DEBUG builds): GCD contexts count dispatched tasks so
// callers can CHECK that every scheduled block actually ran.
#define CHECK_GCD_PARALLEL_WORK_COUNT DEBUG
// Shared state handed (as an untyped context pointer) to ParallelForGCDTask.
// Holds a private copy of the user's invoker plus the full iteration range;
// each dispatched task derives its own sub-range from it.
template <class Invoker>
class ParallelInvokerGCDContext {
 public:
  ParallelInvokerGCDContext(const Invoker& invoker, const BlockedRange& rows)
      : local_invoker_(invoker), rows_(rows) {
#if CHECK_GCD_PARALLEL_WORK_COUNT
    count_ = 0;
#endif
  }

  // Returns the invoker. In debug builds each retrieval is counted, which
  // implicitly tracks the number of launched tasks (one retrieval per task).
  const Invoker& invoker() {
#if CHECK_GCD_PARALLEL_WORK_COUNT
    // Implicitly tracking the # of launched tasks at invoker retrieval.
    atomic_fetch_add(&count_, 1);
#endif
    return local_invoker_;
  }

  const BlockedRange& rows() const { return rows_; }

#if CHECK_GCD_PARALLEL_WORK_COUNT
  // Number of times invoker() was called, i.e. tasks executed so far.
  const int count() { return atomic_load(&count_); }
#endif

 private:
  Invoker local_invoker_;
  // NOTE(review): stores a reference; the referenced BlockedRange must
  // outlive this context (ParallelFor keeps it on the stack across the
  // blocking dispatch_apply_f call).
  const BlockedRange& rows_;
#if CHECK_GCD_PARALLEL_WORK_COUNT
  _Atomic(int32_t) count_;
#endif
};
// 2D variant of ParallelInvokerGCDContext: additionally carries the column
// range, which is forwarded unpartitioned to every row task.
template <class Invoker>
class ParallelInvokerGCDContext2D : public ParallelInvokerGCDContext<Invoker> {
 public:
  ParallelInvokerGCDContext2D(const Invoker& invoker, const BlockedRange& rows,
                              const BlockedRange& cols)
      : ParallelInvokerGCDContext<Invoker>(invoker, rows), cols_(cols) {}

  const BlockedRange& cols() const { return cols_; }

 private:
  BlockedRange cols_;
};
template <class Invoker>
static void ParallelForGCDTask(void* context, size_t index) {
ParallelInvokerGCDContext<Invoker>* invoker_context =
static_cast<ParallelInvokerGCDContext<Invoker>*>(context);
const BlockedRange& all_tasks = invoker_context->rows();
int start = all_tasks.begin() + index * all_tasks.grain_size();
int end = std::min(all_tasks.end(), start + all_tasks.grain_size());
BlockedRange this_task(start, end, all_tasks.grain_size());
const Invoker& invoker = invoker_context->invoker();
invoker(this_task);
}
template <class Invoker>
static void ParallelForGCDTask2D(void* context, size_t index) {
ParallelInvokerGCDContext2D<Invoker>* invoker_context =
static_cast<ParallelInvokerGCDContext2D<Invoker>*>(context);
// Partitioning across rows.
const BlockedRange& all_tasks = invoker_context->rows();
int start = all_tasks.begin() + index * all_tasks.grain_size();
int end = std::min(all_tasks.end(), start + all_tasks.grain_size());
BlockedRange this_task(start, end, all_tasks.grain_size());
const Invoker& invoker = invoker_context->invoker();
invoker(BlockedRange2D(this_task, invoker_context->cols()));
}
#endif // __APPLE__
#endif // PARALLEL_INVOKER_ACTIVE
// Simple wrapper for compatibility with below ParallelFor function.
template <class Invoker>
void SerialFor(size_t start, size_t end, size_t grain_size,
const Invoker& invoker) {
invoker(BlockedRange(start, end, 1));
}
// Validates flags_parallel_invoker_mode against what the current
// platform/build actually supports and coerces it to a supported mode if
// necessary. Called at the top of ParallelFor / ParallelFor2D before
// dispatching.
inline void CheckAndSetInvokerOptions() {
#if defined(PARALLEL_INVOKER_ACTIVE)
#if defined(__ANDROID__)
  // If unsupported option is selected, force usage of OpenMP if detected, and
  // ThreadPool otherwise.
  if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE &&
      flags_parallel_invoker_mode != PARALLEL_INVOKER_THREAD_POOL &&
      flags_parallel_invoker_mode != PARALLEL_INVOKER_OPENMP) {
#if defined(_OPENMP)
    LOG(WARNING) << "Unsupported invoker mode selected on Android. "
                 << "OpenMP linkage detected, so falling back to OpenMP";
    flags_parallel_invoker_mode = PARALLEL_INVOKER_OPENMP;
#else   // _OPENMP
    // Fallback mode for active parallel invoker without OpenMP is ThreadPool.
    LOG(WARNING) << "Unsupported invoker mode selected on Android. "
                 << "Falling back to ThreadPool";
    flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
#endif  // _OPENMP
  }
#endif  // __ANDROID__

#if defined(__APPLE__) || defined(__EMSCRIPTEN__)
  // Force usage of ThreadPool if unsupported option is selected.
  // (OpenMP is not supported on iOS, due to missing clang support).
  if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE &&
#if defined(USE_PARALLEL_INVOKER_GCD)
      flags_parallel_invoker_mode != PARALLEL_INVOKER_GCD &&
#endif  // USE_PARALLEL_INVOKER_GCD
      flags_parallel_invoker_mode != PARALLEL_INVOKER_THREAD_POOL) {
    LOG(WARNING) << "Unsupported invoker mode selected on iOS. "
                 << "Falling back to ThreadPool mode";
    flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
  }
#endif  // __APPLE__ || __EMSCRIPTEN__

  // Desktop (non-Apple, non-Emscripten, non-Android): ThreadPool is forced
  // unconditionally, regardless of the requested mode.
#if !defined(__APPLE__) && !defined(__EMSCRIPTEN__) && !defined(__ANDROID__)
  flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
#endif  // !__APPLE__ && !__EMSCRIPTEN__ && !__ANDROID__

  // If OpenMP is requested, make sure we can actually use it, and fall back
  // to ThreadPool if not.
  if (flags_parallel_invoker_mode == PARALLEL_INVOKER_OPENMP) {
#if !defined(_OPENMP)
    LOG(ERROR) << "OpenMP invoker mode selected but not compiling with OpenMP "
               << "enabled. Falling back to ThreadPool";
    flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
#endif  // _OPENMP
  }
#else   // PARALLEL_INVOKER_ACTIVE
  // Parallelism compiled out entirely: everything runs single threaded.
  if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE) {
    LOG(ERROR) << "Parallel execution requested but PARALLEL_INVOKER_ACTIVE "
               << "compile flag is not set. Falling back to single threaded "
               << "execution.";
    flags_parallel_invoker_mode = PARALLEL_INVOKER_NONE;
  }
#endif  // PARALLEL_INVOKER_ACTIVE

  CHECK_LT(flags_parallel_invoker_mode, PARALLEL_INVOKER_MAX_VALUE)
      << "Invalid invoker mode specified.";
  CHECK_GE(flags_parallel_invoker_mode, 0) << "Invalid invoker mode specified.";
}
// Performs parallel iteration from [start to end), scheduling grain_size
// iterations per thread. For each iteration
// invoker(BlockedRange(thread_local_start, thread_local_end))
// is called. Each thread is given its local copy of invoker, i.e.
// invoker needs to have copy constructor defined.
// Blocks until all iterations have completed. The effective execution mode
// is flags_parallel_invoker_mode after CheckAndSetInvokerOptions() has
// sanitized it for this platform.
template <class Invoker>
void ParallelFor(size_t start, size_t end, size_t grain_size,
                 const Invoker& invoker) {
#ifdef PARALLEL_INVOKER_ACTIVE
  CheckAndSetInvokerOptions();
  switch (flags_parallel_invoker_mode) {
#if defined(__APPLE__)
    case PARALLEL_INVOKER_GCD: {
      // Number of grain_size-chunks needed to cover [start, end).
      int iterations_remain = (end - start + grain_size - 1) / grain_size;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange(start, std::min(end, start + grain_size), 1));
      } else {
        BlockedRange all_tasks(start, end, grain_size);
        ParallelInvokerGCDContext<Invoker> context(invoker, all_tasks);
        dispatch_queue_t concurrent_queue =
            dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
        // dispatch_apply_f blocks until all chunks have been processed.
        dispatch_apply_f(iterations_remain, concurrent_queue, &context,
                         ParallelForGCDTask<Invoker>);
#if CHECK_GCD_PARALLEL_WORK_COUNT
        // Debug builds: verify exactly one task ran per scheduled chunk.
        CHECK_EQ(iterations_remain, context.count());
#endif
      }
      break;
    }
#endif  // __APPLE__

    case PARALLEL_INVOKER_THREAD_POOL: {
      int iterations_remain = (end - start + grain_size - 1) / grain_size;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange(start, std::min(end, start + grain_size), 1));
        break;
      }

      // Completion is tracked via a mutex-guarded countdown plus condvar.
      struct {
        absl::Mutex mutex;
        absl::CondVar completed;
        int iterations_remain GUARDED_BY(mutex);
      } loop;

      {
        absl::MutexLock lock(&loop.mutex);
        loop.iterations_remain = iterations_remain;
      }

      for (int x = start; x < end; x += grain_size) {
        // Each task receives its own copy of invoker (captured by value).
        auto loop_func = [x, end, grain_size, &loop, invoker]() {
          // Execute invoker.
          invoker(BlockedRange(x, std::min(end, x + grain_size), 1));
          // Decrement counter.
          absl::MutexLock lock(&loop.mutex);
          --loop.iterations_remain;
          if (loop.iterations_remain == 0) {
            loop.completed.SignalAll();
          }
        };

        // Attempt to run in parallel, if busy run serial to avoid deadlocking.
        // This can happen during nested invocation of ParallelFor, as if the
        // loop iteration itself is calling ParallelFor we might deadlock if
        // we can not guarantee for the iteration to be scheduled.
        // NOTE(review): the call below schedules unconditionally; the serial
        // fallback described above is not visible here -- presumably
        // Schedule() handles the busy case internally. Confirm.
        ParallelInvokerThreadPool()->Schedule(loop_func);
      }

      // Wait on termination of all iterations.
      loop.mutex.Lock();
      while (loop.iterations_remain > 0) {
        loop.completed.Wait(&loop.mutex);
      }
      loop.mutex.Unlock();
      break;
    }

    case PARALLEL_INVOKER_OPENMP: {
      // Use thread-local copy of invoker. Note: this path iterates one
      // element per BlockedRange; grain_size is not used for chunking here.
      Invoker local_invoker(invoker);
#pragma omp parallel for firstprivate(local_invoker) \
    num_threads(flags_parallel_invoker_max_threads)
      for (int x = start; x < end; ++x) {
        local_invoker(BlockedRange(x, x + 1, 1));
      }
      break;
    }

    case PARALLEL_INVOKER_NONE: {
      SerialFor(start, end, grain_size, invoker);
      break;
    }

    case PARALLEL_INVOKER_MAX_VALUE: {
      LOG(FATAL) << "Impossible.";
      break;
    }
  }
#else
  SerialFor(start, end, grain_size, invoker);
#endif  // PARALLEL_INVOKER_ACTIVE
}
// Simple wrapper for compatibility with below ParallelFor2D function.
template <class Invoker>
void SerialFor2D(size_t start_row, size_t end_row, size_t start_col,
size_t end_col, size_t grain_size, const Invoker& invoker) {
invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1),
BlockedRange(start_col, end_col, 1)));
}
// Same as above ParallelFor for 2D iteration. Parallelization is across rows
// only; every task receives its row block crossed with the full
// [start_col, end_col) column range.
template <class Invoker>
void ParallelFor2D(size_t start_row, size_t end_row, size_t start_col,
                   size_t end_col, size_t grain_size, const Invoker& invoker) {
#ifdef PARALLEL_INVOKER_ACTIVE
  CheckAndSetInvokerOptions();
  switch (flags_parallel_invoker_mode) {
#if defined(__APPLE__)
    case PARALLEL_INVOKER_GCD: {
      // Number of grain_size row-chunks needed to cover the row range.
      const int iterations_remain =
          (end_row - start_row + grain_size - 1) / grain_size;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1),
                               BlockedRange(start_col, end_col, 1)));
      } else {
        BlockedRange all_tasks(start_row, end_row, grain_size);
        ParallelInvokerGCDContext2D<Invoker> context(
            invoker, all_tasks, BlockedRange(start_col, end_col, grain_size));
        dispatch_queue_t concurrent_queue =
            dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
        // dispatch_apply_f blocks until all row chunks have been processed.
        dispatch_apply_f(iterations_remain, concurrent_queue, &context,
                         ParallelForGCDTask2D<Invoker>);
#if CHECK_GCD_PARALLEL_WORK_COUNT
        // Debug builds: verify exactly one task ran per scheduled chunk.
        CHECK_EQ(iterations_remain, context.count());
#endif
      }
      break;
    }
#endif  // __APPLE__

    case PARALLEL_INVOKER_THREAD_POOL: {
      // One task per row here; grain_size is not used for chunking.
      int iterations_remain = end_row - start_row;  // Guarded by loop_mutex
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1),
                               BlockedRange(start_col, end_col, 1)));
        break;
      }

      absl::Mutex loop_mutex;
      absl::CondVar loop_completed;
      for (int y = start_row; y < end_row; ++y) {
        // Each task processes one row across all columns and receives its
        // own copy of invoker (captured by value).
        auto loop_func = [y, start_col, end_col, &loop_mutex, &loop_completed,
                          &iterations_remain, invoker]() {
          // Execute invoker.
          invoker(BlockedRange2D(BlockedRange(y, y + 1, 1),
                                 BlockedRange(start_col, end_col, 1)));
          // Decrement counter.
          absl::MutexLock lock(&loop_mutex);
          --iterations_remain;
          if (iterations_remain == 0) {
            loop_completed.Signal();
          }
        };

        // Attempt to run in parallel, if busy run serial to avoid deadlocking.
        // NOTE(review): the call below schedules unconditionally; presumably
        // Schedule() handles the busy case internally. Confirm.
        ParallelInvokerThreadPool()->Schedule(loop_func);
      }

      // Wait on termination of all iterations.
      loop_mutex.Lock();
      while (iterations_remain > 0) {
        loop_completed.Wait(&loop_mutex);
      }
      loop_mutex.Unlock();
      break;
    }

    case PARALLEL_INVOKER_OPENMP: {
      // Use thread-local copy of invoker. One row per iteration.
      Invoker local_invoker(invoker);
#pragma omp parallel for firstprivate(local_invoker) \
    num_threads(flags_parallel_invoker_max_threads)
      for (int y = start_row; y < end_row; ++y) {
        local_invoker(BlockedRange2D(BlockedRange(y, y + 1, 1),
                                     BlockedRange(start_col, end_col, 1)));
      }
      break;
    }

    case PARALLEL_INVOKER_NONE: {
      SerialFor2D(start_row, end_row, start_col, end_col, grain_size, invoker);
      break;
    }

    case PARALLEL_INVOKER_MAX_VALUE: {
      LOG(FATAL) << "Impossible.";
      break;
    }
  }
#else
  SerialFor2D(start_row, end_row, start_col, end_col, grain_size, invoker);
#endif  // PARALLEL_INVOKER_ACTIVE
}
} // namespace mediapipe
#endif // MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_

View File

@ -0,0 +1,28 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Guard to ensure clients do not link against both,
// single and parallel version.
#ifdef PARALLEL_INVOKER_ACTIVE
// Deliberately defined with the same name in both the parallel and the
// single-threaded build of this library: linking against both produces a
// duplicate-symbol error at link time instead of silent misbehavior.
int LinkageAgainstBothSingleAndParallelStabilizationVersionsDetected() {
  return 0;
}
#endif  // PARALLEL_INVOKER_ACTIVE

#ifdef PARALLEL_INVOKER_INACTIVE
// Single-threaded flavor of the guard symbol above.
int LinkageAgainstBothSingleAndParallelStabilizationVersionsDetected() {
  return 1;
}
#endif  // PARALLEL_INVOKER_INACTIVE

View File

@ -0,0 +1,60 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/parallel_invoker.h"
#include <algorithm>
#include <numeric>
#include "absl/synchronization/mutex.h"
#include "mediapipe/framework/port/gtest.h"
namespace mediapipe {
namespace {
void RunParallelTest() {
absl::Mutex numbers_mutex;
std::vector<int> numbers;
const int kArraySize = 5000;
// Fill number array in parallel.
ParallelFor(0, kArraySize, 1,
[&numbers_mutex, &numbers](const BlockedRange& b) {
for (int k = b.begin(); k != b.end(); ++k) {
absl::MutexLock lock(&numbers_mutex);
numbers.push_back(k);
}
});
std::vector<int> expected(kArraySize);
std::iota(expected.begin(), expected.end(), 0);
EXPECT_TRUE(
std::is_permutation(expected.begin(), expected.end(), numbers.begin()));
}
TEST(ParallelInvokerTest, PhotosTest) {
  // Requests OpenMP; CheckAndSetInvokerOptions() inside ParallelFor falls
  // back to ThreadPool when OpenMP is not compiled in.
  flags_parallel_invoker_mode = PARALLEL_INVOKER_OPENMP;
  RunParallelTest();
}
TEST(ParallelInvokerTest, ThreadPoolTest) {
  flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
  // Needs to be run in opt mode to pass.
  // NOTE(review): reason not stated in this file -- presumably timing
  // related; confirm.
  RunParallelTest();
}
} // namespace
} // namespace mediapipe

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,39 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
// Options controlling the push-pull filtering stages (bilateral weighting
// and confidence propagation across pyramid levels).
message PushPullOptions {
  // Sigma for color difference.
  optional float bilateral_sigma = 1 [default = 20.0];

  // Determines how fast confident values can propagate. Filters are normalized,
  // such that confidence dissipates quickly instead of propagating.
  // To ensure confidence propagates, the importance weight is scaled by the
  // scalars specified below. Larger values yield quicker propagation.
  optional float pull_propagation_scale = 3 [default = 8.0];
  optional float push_propagation_scale = 4 [default = 8.0];

  // Above bilateral sigma is scaled at each level by the specified scale
  // (for push and pull phase). This is because iterative downsampling of the
  // guidance image introduces errors, making bilateral weighting increasingly
  // erroneous.
  optional float pull_bilateral_scale = 5 [default = 0.7];
  optional float push_bilateral_scale = 6 [default = 0.9];

  // Deprecated fields.
  extensions 2;
}

View File

@ -0,0 +1,799 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/region_flow.h"
#include <stddef.h>
#include <cmath>
#include <memory>
#include <numeric>
#include "absl/strings/str_cat.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/util/tracking/measure_time.h"
#include "mediapipe/util/tracking/parallel_invoker.h"
namespace mediapipe {
namespace {

// Returns true if pt lies at least `bounds` pixels inside the border of a
// frame_width x frame_height frame. A 0.5 pixel margin is added on each side
// so the result is stable under float -> integer rounding of coordinates.
bool IsPointWithinBounds(const Vector2_f& pt, float bounds, int frame_width,
                         int frame_height) {
  // Ensure stability under float -> rounding operations.
  return pt.x() - 0.5f >= bounds && pt.x() + 0.5f <= frame_width - 1 - bounds &&
         pt.y() - 0.5f >= bounds && pt.y() + 0.5f <= frame_height - 1 - bounds;
}

}  // namespace.
// Flattens all features of all regions in region_flow_frame into
// *flow_feature_list, copying frame dimensions, stability flag and blur
// score over. If distance_from_border > 0, features whose location OR match
// location lies closer than that many pixels to the frame border are
// dropped.
void GetRegionFlowFeatureList(const RegionFlowFrame& region_flow_frame,
                              int distance_from_border,
                              RegionFlowFeatureList* flow_feature_list) {
  CHECK(flow_feature_list);
  flow_feature_list->clear_feature();
  const int frame_width = region_flow_frame.frame_width();
  const int frame_height = region_flow_frame.frame_height();
  flow_feature_list->set_frame_width(frame_width);
  flow_feature_list->set_frame_height(frame_height);
  flow_feature_list->set_unstable(region_flow_frame.unstable_frame());
  flow_feature_list->set_distance_from_border(distance_from_border);
  flow_feature_list->set_blur_score(region_flow_frame.blur_score());

  for (const auto& region_flow : region_flow_frame.region_flow()) {
    for (const auto& feature : region_flow.feature()) {
      if (distance_from_border > 0) {
        // Both endpoints of the flow vector must respect the border margin.
        if (!IsPointWithinBounds(FeatureLocation(feature), distance_from_border,
                                 frame_width, frame_height) ||
            !IsPointWithinBounds(FeatureMatchLocation(feature),
                                 distance_from_border, frame_width,
                                 frame_height)) {
          continue;
        }
      }
      flow_feature_list->add_feature()->CopyFrom(feature);
    }
  }
}
// Returns the Euclidean distance between the first three entries (the
// per-channel color means) of the two patch descriptors.
float RegionFlowFeatureDistance(const PatchDescriptor& patch_desc_1,
                                const PatchDescriptor& patch_desc_2) {
  DCHECK_EQ(patch_desc_1.data_size(), patch_desc_2.data_size());
  DCHECK_GE(patch_desc_1.data_size(), 3);
  constexpr int kNumMeans = 3;
  float sum_of_squares = 0;
  int idx = 0;
  while (idx < kNumMeans) {
    const float delta = patch_desc_1.data(idx) - patch_desc_2.data(idx);
    sum_of_squares += delta * delta;
    ++idx;
  }
  return std::sqrt(sum_of_squares);
}
// Overwrites the IRLS weight of every feature in the list with `value`.
void ResetRegionFlowFeatureIRLSWeights(
    float value, RegionFlowFeatureList* flow_feature_list) {
  auto* features = flow_feature_list->mutable_feature();
  for (auto feature = features->begin(); feature != features->end();
       ++feature) {
    feature->set_irls_weight(value);
  }
}
double RegionFlowFeatureIRLSSum(const RegionFlowFeatureList& feature_list) {
double sum = 0.0;
for (const auto& feature : feature_list.feature()) {
sum += feature.irls_weight();
}
return sum;
}
// Clamps every feature's IRLS weight in the view to [lower, upper].
// Features already inside the interval are left untouched (no proto setter
// call, so presence bits stay unchanged).
void ClampRegionFlowFeatureIRLSWeights(float lower, float upper,
                                       RegionFlowFeatureView* feature_view) {
  for (auto& feature_ptr : *feature_view) {
    const float weight = feature_ptr->irls_weight();
    if (weight < lower) {
      feature_ptr->set_irls_weight(lower);
    } else if (weight > upper) {
      feature_ptr->set_irls_weight(upper);
    }
  }
}
// Computes a per-feature "texturedness" score in [0, 1] (0 = low texture,
// 1 = high texture) from the color standard deviation stored in each
// feature's descriptor. Features whose descriptor carries no variance
// information keep the default score of 1.0 (a warning is logged for the
// first two such features). Expects ComputeRegionFlowFeatureDescriptors to
// have been run beforehand.
void ComputeRegionFlowFeatureTexturedness(
    const RegionFlowFeatureList& flow_feature_list, bool use_15percent_as_max,
    std::vector<float>* texturedness) {
  CHECK(texturedness != nullptr);
  // Default every feature to fully textured.
  *texturedness = std::vector<float>(flow_feature_list.feature_size(), 1.0f);

  int texture_idx = 0;
  for (auto feature = flow_feature_list.feature().begin();
       feature != flow_feature_list.feature().end();
       ++feature, ++texture_idx) {
    const float feature_stdev_l1 =
        PatchDescriptorColorStdevL1(feature->feature_descriptor());
    if (feature_stdev_l1 < 0.0f) {
      // Immediately-invoked lambda restricts the warning to the first 2 hits.
      LOG_IF(WARNING,
             []() {
               static int k = 0;
               return k++ < 2;
             }())
          << "Feature descriptor does not contain variance information. Was "
          << "ComputeRegionFlowFeatureDescriptors called?";
      continue;
    }

    // feature_stdev_l1 is within [0, 3 * 128 = 384]
    float alpha = feature_stdev_l1 / 384.f;

    // In [0, 1], 0 = low texture, 1 = high texture. Scale such that around
    // 15% of per channel maximum stdev is considered totally textured
    // (1 / 0.15 * 3 ~ 18);
    if (use_15percent_as_max) {
      alpha = std::min(1.0f, alpha * 18.f);
    }
    (*texturedness)[texture_idx] = alpha;
  }
}
// Reweights IRLS weights using feature texturedness: low-texture features
// whose weight is below low_texture_outlier_clamp are raised to exactly the
// clamp value; all other features get their weight divided by their
// texturedness (plus epsilon), boosting weakly textured features.
// Zero-weight features (hard outliers) are left untouched.
void TextureFilteredRegionFlowFeatureIRLSWeights(
    float low_texture_threshold, float low_texture_outlier_clamp,
    RegionFlowFeatureList* flow_feature_list) {
  std::vector<float> texturedness;
  ComputeRegionFlowFeatureTexturedness(*flow_feature_list, true, &texturedness);
  int texture_idx = 0;
  for (auto feature = flow_feature_list->mutable_feature()->begin();
       feature != flow_feature_list->mutable_feature()->end();
       ++feature, ++texture_idx) {
    if (feature->irls_weight() == 0.0f) {
      continue;
    }
    if (texturedness[texture_idx] < low_texture_threshold &&
        feature->irls_weight() < low_texture_outlier_clamp) {
      feature->set_irls_weight(low_texture_outlier_clamp);
    } else {
      // Epsilon guards against division by zero texturedness.
      feature->set_irls_weight(feature->irls_weight() /
                               (texturedness[texture_idx] + 1.e-6f));
    }
  }
}
// Reweights IRLS weights using corner response, analogous to
// TextureFilteredRegionFlowFeatureIRLSWeights: weak-corner features whose
// weight is below low_corner_outlier_clamp are raised to exactly the clamp
// value; all other features get their weight divided by the corner response
// (plus epsilon). Zero-weight features (hard outliers) are left untouched.
void CornerFilteredRegionFlowFeatureIRLSWeights(
    float low_corner_threshold, float low_corner_outlier_clamp,
    RegionFlowFeatureList* flow_feature_list) {
  for (auto feature = flow_feature_list->mutable_feature()->begin();
       feature != flow_feature_list->mutable_feature()->end(); ++feature) {
    if (feature->irls_weight() == 0.0f) {
      continue;
    }
    const float corner_response = feature->corner_response();
    // BUGFIX: compare the current weight against the clamp value (as done in
    // TextureFilteredRegionFlowFeatureIRLSWeights), not against the
    // corner-response threshold, which lives on a different scale.
    if (corner_response < low_corner_threshold &&
        feature->irls_weight() < low_corner_outlier_clamp) {
      feature->set_irls_weight(low_corner_outlier_clamp);
    } else {
      // Epsilon guards against division by zero response.
      feature->set_irls_weight(feature->irls_weight() /
                               (corner_response + 1.e-6f));
    }
  }
}
// Copies each feature's IRLS weight, in feature order, into *irls_weights.
void GetRegionFlowFeatureIRLSWeights(
    const RegionFlowFeatureList& flow_feature_list,
    std::vector<float>* irls_weights) {
  CHECK(irls_weights != nullptr);
  irls_weights->clear();
  irls_weights->reserve(flow_feature_list.feature_size());
  for (const auto& feature : flow_feature_list.feature()) {
    irls_weights->push_back(feature.irls_weight());
  }
}
// Inverse of GetRegionFlowFeatureIRLSWeights: assigns irls_weights[i] to the
// i'th feature. Sizes must match.
void SetRegionFlowFeatureIRLSWeights(const std::vector<float>& irls_weights,
                                     RegionFlowFeatureList* flow_feature_list) {
  CHECK(flow_feature_list != nullptr);
  CHECK_EQ(irls_weights.size(), flow_feature_list->feature_size());
  int idx = 0;
  for (auto& feature : *flow_feature_list->mutable_feature()) {
    feature.set_irls_weight(irls_weights[idx]);
    ++idx;
  }
}
int CountIgnoredRegionFlowFeatures(
const RegionFlowFeatureList& flow_feature_list, float threshold) {
int count = 0;
for (auto feature = flow_feature_list.feature().begin();
feature != flow_feature_list.feature().end(); ++feature) {
if (feature->irls_weight() <= threshold) {
++count;
}
}
return count;
}
namespace {

// Strict weak ordering of RegionFlow protos by region_id; used for sorting
// and binary search (std::lower_bound) in the lookup functions below.
struct RegionFlowLocator {
  bool operator()(const RegionFlow& lhs, const RegionFlow& rhs) const {
    return lhs.region_id() < rhs.region_id();
  }
};

}  // namespace.
// Returns the RegionFlow with the given region_id via binary search, or
// nullptr if absent. Requires flow_frame.region_flow() to be sorted by
// region_id (see SortRegionFlowById).
const RegionFlow* GetRegionFlowById(int region_id,
                                    const RegionFlowFrame& flow_frame) {
  RegionFlow region_flow;
  region_flow.set_region_id(region_id);
  const auto& region_pos = std::lower_bound(flow_frame.region_flow().begin(),
                                            flow_frame.region_flow().end(),
                                            region_flow, RegionFlowLocator());
  if (region_pos == flow_frame.region_flow().end() ||
      region_pos->region_id() != region_id) {
    return nullptr;  // nullptr for consistency with the rest of this file.
  }
  return &(*region_pos);
}
// Mutable variant of GetRegionFlowById: returns the RegionFlow with the
// given region_id via binary search, or nullptr if absent. Requires the
// region_flow field to be sorted by region_id (see SortRegionFlowById).
RegionFlow* GetMutableRegionFlowById(int region_id,
                                     RegionFlowFrame* flow_frame) {
  RegionFlow region_flow;
  region_flow.set_region_id(region_id);
  auto region_pos = std::lower_bound(flow_frame->mutable_region_flow()->begin(),
                                     flow_frame->mutable_region_flow()->end(),
                                     region_flow, RegionFlowLocator());
  if (region_pos == flow_frame->mutable_region_flow()->end() ||
      region_pos->region_id() != region_id) {
    return nullptr;  // nullptr for consistency with the rest of this file.
  }
  return &(*region_pos);
}
// Sorts regions in place, ascending by region_id, enabling binary search in
// GetRegionFlowById / GetMutableRegionFlowById.
void SortRegionFlowById(RegionFlowFrame* flow_frame) {
  auto* regions = flow_frame->mutable_region_flow();
  std::sort(regions->begin(), regions->end(), RegionFlowLocator());
}
// Writes to *inverted_flow_frame a copy of region_flow_frame with all motion
// reversed: region centroids and feature locations are advanced by their
// flow, and the flow vectors are negated.
void InvertRegionFlow(const RegionFlowFrame& region_flow_frame,
                      RegionFlowFrame* inverted_flow_frame) {
  CHECK(inverted_flow_frame);
  inverted_flow_frame->CopyFrom(region_flow_frame);
  for (auto& region_flow : *inverted_flow_frame->mutable_region_flow()) {
    const float flow_x = region_flow.flow_x();
    const float flow_y = region_flow.flow_y();
    region_flow.set_centroid_x(region_flow.centroid_x() + flow_x);
    region_flow.set_centroid_y(region_flow.centroid_y() + flow_y);
    region_flow.set_flow_x(-flow_x);
    region_flow.set_flow_y(-flow_y);
    for (auto& feature : *region_flow.mutable_feature()) {
      const float dx = feature.dx();
      const float dy = feature.dy();
      feature.set_x(feature.x() + dx);
      feature.set_y(feature.y() + dy);
      feature.set_dx(-dx);
      feature.set_dy(-dy);
    }
  }
}
// Copies feature_list into *inverted_feature_list and inverts every feature
// in place (see InvertRegionFlowFeature).
void InvertRegionFlowFeatureList(const RegionFlowFeatureList& feature_list,
                                 RegionFlowFeatureList* inverted_feature_list) {
  CHECK(inverted_feature_list);
  *inverted_feature_list = feature_list;
  auto* features = inverted_feature_list->mutable_feature();
  for (auto feature = features->begin(); feature != features->end();
       ++feature) {
    InvertRegionFlowFeature(&*feature);
  }
}
// Inverts a single feature in place: its location becomes the match location
// and the flow vector is negated.
void InvertRegionFlowFeature(RegionFlowFeature* feature) {
  const Vector2_f match_location = FeatureMatchLocation(*feature);
  feature->set_x(match_location.x());
  feature->set_y(match_location.y());
  const Vector2_f flow_vector = FeatureFlow(*feature);
  feature->set_dx(-flow_vector.x());
  feature->set_dy(-flow_vector.y());
}
void LimitFeaturesToBounds(int frame_width, int frame_height, float bounds,
RegionFlowFeatureList* feature_list) {
RegionFlowFeatureList limited;
for (const auto& feature : feature_list->feature()) {
if (!IsPointWithinBounds(FeatureLocation(feature), bounds, frame_width,
frame_height)) {
continue;
} else {
limited.add_feature()->CopyFrom(feature);
}
}
feature_list->Swap(&limited);
}
// Applies the frame-size based normalization transform to every feature in
// the list. Inverse of DeNormalizeRegionFlowFeatureList.
void NormalizeRegionFlowFeatureList(RegionFlowFeatureList* feature_list) {
  TransformRegionFlowFeatureList(
      LinearSimilarityAdapter::NormalizationTransform(
          feature_list->frame_width(), feature_list->frame_height()),
      feature_list);
}
// Applies the inverse of the frame-size based normalization transform to
// every feature, undoing NormalizeRegionFlowFeatureList.
void DeNormalizeRegionFlowFeatureList(RegionFlowFeatureList* feature_list) {
  TransformRegionFlowFeatureList(
      ModelInvert(LinearSimilarityAdapter::NormalizationTransform(
          feature_list->frame_width(), feature_list->frame_height())),
      feature_list);
}
// Anisotropically scales a salient point's position and its ellipse axes by
// (scale_x, scale_y). The axes are scaled component-wise in the rotated
// frame, and magnitudes plus angle are re-derived from the scaled vectors;
// the angle is normalized to [0, pi).
void ScaleSalientPoint(float scale_x, float scale_y, SalientPoint* sp) {
  sp->set_norm_point_x(sp->norm_point_x() * scale_x);
  sp->set_norm_point_y(sp->norm_point_y() * scale_y);

  // Build major/minor axis vectors from the current orientation.
  const float cos_angle = std::cos(sp->angle());
  const float sin_angle = std::sin(sp->angle());
  Vector2_f major_axis = Vector2_f(cos_angle, sin_angle) * sp->norm_major();
  Vector2_f minor_axis = Vector2_f(-sin_angle, cos_angle) * sp->norm_minor();

  // Scale each axis component-wise.
  major_axis[0] *= scale_x;
  major_axis[1] *= scale_y;
  minor_axis[0] *= scale_x;
  minor_axis[1] *= scale_y;

  sp->set_norm_major(major_axis.Norm());
  sp->set_norm_minor(minor_axis.Norm());
  sp->set_angle(std::atan2(major_axis.y(), major_axis.x()));

  // Map atan2's (-pi, pi] result into [0, pi) -- ellipse orientation is
  // symmetric under 180 degree rotation.
  if (sp->angle() < 0) {
    sp->set_angle(sp->angle() + M_PI);
  }
}
void ScaleSaliencyList(float scale, bool normalize_to_scale,
SaliencyPointList* saliency_list) {
CHECK(saliency_list != nullptr);
for (auto& point_frame : *saliency_list) {
ScaleSalientPointFrame(scale, normalize_to_scale, &point_frame);
}
}
// Multiplies each salient point's weight by `scale`. If normalize_to_scale
// is set and the weights sum to more than a small epsilon, the weights are
// additionally divided by that sum, i.e. they end up summing to `scale`.
void ScaleSalientPointFrame(float scale, bool normalize_to_scale,
                            SalientPointFrame* saliency) {
  CHECK(saliency != nullptr);
  float effective_scale = scale;
  if (normalize_to_scale) {
    float total_weight = 0.0f;
    for (const auto& salient_point : saliency->point()) {
      total_weight += salient_point.weight();
    }
    // Only normalize for a non-negligible sum to avoid amplifying noise.
    if (total_weight > 1e-6f) {
      effective_scale /= total_weight;
    }
  }
  for (auto& salient_point : *saliency->mutable_point()) {
    salient_point.set_weight(salient_point.weight() * effective_scale);
  }
}
// Overwrites the rectangular bounds of every salient point in every frame
// with the supplied values.
void ResetSaliencyBounds(float left, float bottom, float right, float top,
                         SaliencyPointList* saliency_list) {
  CHECK(saliency_list != nullptr);
  for (auto& point_frame : *saliency_list) {
    auto* points = point_frame.mutable_point();
    for (auto point = points->begin(); point != points->end(); ++point) {
      point->set_left(left);
      point->set_bottom(bottom);
      point->set_right(right);
      point->set_top(top);
    }
  }
}
// Derives ellipse axis magnitudes and orientation from the symmetric 2x2
// covariance matrix [a bc; bc d] via its eigen decomposition: the magnitudes
// are the square roots of the (absolute) eigenvalues, major axis first, and
// the angle is that of the major-axis eigenvector. Returns false (with
// outputs set to unit axes / zero angle, or unreliable values) when the
// ellipse is degenerate: near-zero area (determinant < 4 sq. pixels) or a
// minor eigenvalue below 1.5 (minor axis below ~1.2 pixels).
bool EllipseFromCovariance(float a, float bc, float d,
                           Vector2_f* axis_magnitude, float* angle) {
  CHECK(axis_magnitude != nullptr);
  CHECK(angle != nullptr);
  // Get trace and determinant
  const float trace = a + d;
  const float det = a * d - bc * bc;

  // If area is very small (small axis in at least one direction)
  // axis are unreliable -> return false.
  if (det < 4) {  // Measured in sq. pixels.
    *axis_magnitude = Vector2_f(1, 1);
    *angle = 0;
    return false;
  }

  // Eigenvalues of a 2x2 matrix: trace/2 +/- sqrt(trace^2/4 - det); the
  // max() guards against tiny negative discriminants from rounding.
  const float discriminant = std::max(0.f, trace * trace * 0.25f - det);
  const float sqrt_discrm = std::sqrt(discriminant);

  // Get eigenvalues.
  float eig_1 = trace * 0.5f + sqrt_discrm;
  float eig_2 = trace * 0.5f - sqrt_discrm;

  // Compute eigenvectors.
  Vector2_f vec_1, vec_2;
  if (fabs(bc) < 1e-6) {
    // Right-most case, we already have diagonal matrix.
    vec_1.Set(1, 0);
    vec_2.Set(0, 1);
  } else {
    vec_1.Set(eig_1 - d, bc);
    vec_2.Set(eig_2 - d, bc);
    // Normalize. Norm is always > 0, as bc is > 0 via above if.
    vec_1 /= vec_1.Norm();
    vec_2 /= vec_2.Norm();
  }

  // Select positive eigenvector.
  if (eig_1 < 0) {
    eig_1 *= -1.f;
  }
  if (eig_2 < 0) {
    eig_2 *= -1.f;
  }

  // Major first.
  if (eig_1 < eig_2) {
    using std::swap;
    swap(vec_1, vec_2);
    swap(eig_1, eig_2);
  }

  *axis_magnitude = Vector2_f(std::sqrt(eig_1), std::sqrt(eig_2));
  *angle = std::atan2(vec_1.y(), vec_1.x());
  return eig_2 >= 1.5f;  // Measurement in pixels.
}
// Computes a coarse axis-aligned bounding box (4 corners) for an ellipse at
// `center`. The orientation is only classified as "more horizontal" vs
// "more vertical" (thresholds at 45 / 135 degrees), so the box is not tight
// for intermediate angles. NOTE(review): appears to assume angle in [0, pi]
// as produced by EllipseFromCovariance / ScaleSalientPoint -- confirm.
void BoundingBoxFromEllipse(const Vector2_f& center, float norm_major_axis,
                            float norm_minor_axis, float angle,
                            std::vector<Vector2_f>* bounding_box) {
  CHECK(bounding_box != nullptr);
  float dim_x;
  float dim_y;
  if (angle < M_PI * 0.25 || angle > M_PI * 0.75) {
    dim_x = norm_major_axis;
    dim_y = norm_minor_axis;
  } else {
    dim_y = norm_major_axis;
    dim_x = norm_minor_axis;
  }

  // Construct bounding box for axes aligned ellipse.
  *bounding_box = {
      Vector2_f(-dim_x, -dim_y),
      Vector2_f(-dim_x, dim_y),
      Vector2_f(dim_x, dim_y),
      Vector2_f(dim_x, -dim_y),
  };

  // Translate corners to the requested center.
  for (Vector2_f& corner : *bounding_box) {
    corner += center;
  }
}
// Copies all fields of *src EXCEPT its features into *dst, i.e. *dst ends up
// as a feature-less copy of *src. src is mutable only so its repeated
// feature field can be temporarily swapped out (avoiding a deep copy of the
// features); it is restored to its original state before returning.
void CopyToEmptyFeatureList(RegionFlowFeatureList* src,
                            RegionFlowFeatureList* dst) {
  CHECK(src != nullptr);
  CHECK(dst != nullptr);
  // Swap out features for empty list.
  RegionFlowFeatureList empty_list;
  empty_list.mutable_feature()->Swap(src->mutable_feature());
  // Copy (src's feature list is empty at this point).
  dst->CopyFrom(*src);
  // Swap back.
  src->mutable_feature()->Swap(empty_list.mutable_feature());
  // src_features should be empty as in the beginning.
  CHECK_EQ(0, empty_list.feature_size());
}
// Intersects `from` with `to` by long-track id: *result receives a copy of
// every feature of *from whose track id also occurs in `to`, with its flow
// (dx, dy) overwritten by the displacement from the feature's location in
// *from to the location reported by `to_location_eval` for the matching
// feature in `to`. Non-feature fields of *from are copied to *result as
// well. If source_indices is non-null, the index into from->feature() of
// every kept feature is appended to it. Both lists must have been computed
// with long feature tracks (long_tracks() set).
void IntersectRegionFlowFeatureList(
    const RegionFlowFeatureList& to,
    std::function<Vector2_f(const RegionFlowFeature&)> to_location_eval,
    RegionFlowFeatureList* from, RegionFlowFeatureList* result,
    std::vector<int>* source_indices) {
  CHECK(from != nullptr);
  CHECK(result != nullptr);
  CHECK(from->long_tracks()) << "Intersection only works for long features";
  CHECK(to.long_tracks()) << "Intersection only works for long features";
  // Hash features in to, based on track_id.
  std::unordered_map<int, const RegionFlowFeature*> track_map;
  for (const auto& feature : to.feature()) {
    track_map[feature.track_id()] = &feature;
  }
  // Initialize result as a feature-less copy of *from.
  CopyToEmptyFeatureList(from, result);
  const int num_from_features = from->feature_size();
  result->mutable_feature()->Reserve(num_from_features);
  int feature_idx = 0;
  for (const auto& feature : from->feature()) {
    auto find_location = track_map.find(feature.track_id());
    if (find_location != track_map.end()) {
      // Flow becomes the displacement from this feature to its counterpart.
      const Vector2_f diff =
          to_location_eval(*find_location->second) - FeatureLocation(feature);
      RegionFlowFeature* new_feature = result->add_feature();
      new_feature->CopyFrom(feature);
      new_feature->set_dx(diff.x());
      new_feature->set_dy(diff.y());
      if (source_indices != nullptr) {
        source_indices->push_back(feature_idx);
      }
    }
    ++feature_idx;
  }
}
// Ingests all features of `feature_list` into the per-track-id storage.
// The list must stem from long-track computation with match_frame == +/-1;
// the first frame (match_frame == 0) is skipped, as are features without a
// valid track id. Features are inverted when needed so the stored flow
// agrees with the stream's forward_ direction. With
// purge_non_present_features, tracks with no feature in this frame are
// erased; with check_connectivity, id reuse after removal and positional
// continuity of each track are CHECK-enforced.
void LongFeatureStream::AddFeatures(const RegionFlowFeatureList& feature_list,
                                    bool check_connectivity,
                                    bool purge_non_present_features) {
  if (!feature_list.long_tracks()) {
    LOG(ERROR) << "Feature stream should be used only used with long feature "
               << "tracks. Ensure POLICY_LONG_FEATURE was used for "
               << "RegionFlowComputation.";
    return;
  }
  if (feature_list.match_frame() == 0) {
    // Skip first frame.
    return;
  }
  if (std::abs(feature_list.match_frame()) != 1) {
    LOG(ERROR) << "Only matching frames one frame from current one are "
               << "supported";
    return;
  }
  // Record id of each track that is present in the current feature_list.
  std::unordered_set<int> present_tracks;
  for (auto feature : feature_list.feature()) {  // Copy feature.
    if (feature.track_id() < 0) {
      // Rate-limited warning: immediately-invoked lambda returns true only
      // for the first two occurrences.
      LOG_IF(WARNING, []() {
        static int k = 0;
        return k++ < 2;
      }()) << "Feature does not have a valid track id assigned. Ignoring.";
      continue;
    }
    present_tracks.insert(feature.track_id());
    if (check_connectivity) {
      // A new feature should never have been erased before.
      CHECK(old_ids_.find(feature.track_id()) == old_ids_.end())
          << "Feature : " << feature.track_id() << "was already removed.";
    }
    // Invert the features to be forward or backward according to forward_
    // flag.
    if ((!forward_ && feature_list.match_frame() > 0) ||
        (forward_ && feature_list.match_frame() < 0)) {
      InvertRegionFlowFeature(&feature);
    }
    auto find_pos = tracks_.find(feature.track_id());
    if (find_pos != tracks_.end()) {
      // Track is present, add to it.
      if (check_connectivity) {
        // This feature's match location must coincide with the last stored
        // location of the track.
        CHECK_LT((FeatureLocation(find_pos->second.back()) -
                  FeatureMatchLocation(feature))
                     .Norm2(),
                 1e-4);
      }
      find_pos->second.push_back(feature);
    } else {
      // First observation: start a new track with this feature.
      tracks_[feature.track_id()] = std::vector<RegionFlowFeature>(1, feature);
    }
  }
  if (purge_non_present_features) {
    // Erase every stored track that had no feature in this frame.
    std::vector<int> to_be_removed;
    for (const auto& track : tracks_) {
      if (present_tracks.find(track.first) == present_tracks.end()) {
        to_be_removed.push_back(track.first);
        if (check_connectivity) {
          old_ids_.insert(track.first);
        }
      }
    }
    for (int id : to_be_removed) {
      tracks_.erase(id);
    }
  }
}
void LongFeatureStream::FlattenTrack(
const std::vector<RegionFlowFeature>& features,
std::vector<Vector2_f>* result, std::vector<float>* irls_weight,
std::vector<Vector2_f>* flow) const {
CHECK(result != nullptr);
if (features.empty()) {
return;
}
if (irls_weight) {
irls_weight->clear();
}
if (flow) {
flow->clear();
}
if (!forward_) {
// Backward tracking, add first match.
result->push_back(FeatureMatchLocation(features[0]));
}
for (const auto& feature : features) {
result->push_back(FeatureLocation(feature));
if (flow) {
flow->push_back(FeatureFlow(feature));
}
if (irls_weight) {
irls_weight->push_back(feature.irls_weight());
}
}
if (forward_) {
// Forward tracking, add last match.
result->push_back(FeatureMatchLocation(features.back()));
}
// Replicate last irls weight.
if (irls_weight) {
irls_weight->push_back(irls_weight->back());
}
}
// Returns the stored feature track for `id`, or nullptr if no such track
// exists.
const std::vector<RegionFlowFeature>* LongFeatureStream::TrackById(
    int id) const {
  const auto iter = tracks_.find(id);
  return iter != tracks_.end() ? &iter->second : nullptr;
}
// Returns the flattened point locations for track `id` (see FlattenTrack);
// empty vector if the id is unknown.
std::vector<Vector2_f> LongFeatureStream::FlattenedTrackById(int id) const {
  std::vector<Vector2_f> points;
  if (const auto* track = TrackById(id)) {
    FlattenTrack(*track, &points, nullptr, nullptr);
  }
  return points;
}
void LongFeatureInfo::AddFeatures(const RegionFlowFeatureList& feature_list) {
if (!feature_list.long_tracks()) {
LOG(ERROR) << "Passed feature list was not computed with long tracks. ";
return;
}
for (const auto& feature : feature_list.feature()) {
AddFeature(feature);
}
IncrementFrame();
}
// Records one feature observation: starts a new track entry (start frame =
// current frame count, length 1) on first sight of its track id, otherwise
// increments the track's length. Outlier features (zero irls weight) and
// features without a valid track id are ignored.
void LongFeatureInfo::AddFeature(const RegionFlowFeature& feature) {
  if (feature.irls_weight() == 0) {  // Skip outliers.
    return;
  }
  const int track_id = feature.track_id();
  if (track_id < 0) {  // Skip unassigned ids.
    return;
  }
  auto insert_pos = track_info_.find(track_id);
  if (insert_pos == track_info_.end()) {
    // Cache the reference from a single operator[] call instead of indexing
    // the map once per member assignment.
    auto& info = track_info_[track_id];
    info.start = num_frames_;
    info.length = 1;
  } else {
    ++insert_pos->second.length;
  }
}
// Outputs the recorded track length for every feature of `feature_list`, in
// feature order (0 for unknown track ids).
void LongFeatureInfo::TrackLengths(const RegionFlowFeatureList& feature_list,
                                   std::vector<int>* track_lengths) const {
  CHECK(track_lengths);
  const int num_features = feature_list.feature_size();
  track_lengths->clear();
  track_lengths->reserve(num_features);
  for (int idx = 0; idx < num_features; ++idx) {
    track_lengths->push_back(TrackLength(feature_list.feature(idx)));
  }
}
// Recorded length of the feature's track; 0 when the track id is unknown.
int LongFeatureInfo::TrackLength(const RegionFlowFeature& feature) const {
  const auto iter = track_info_.find(feature.track_id());
  if (iter == track_info_.end()) {
    return 0;
  }
  return iter->second.length;
}
// Frame index (per the internal frame counter) at which the feature's track
// was first recorded; -1 when the track id is unknown.
int LongFeatureInfo::TrackStart(const RegionFlowFeature& feature) const {
  const auto iter = track_info_.find(feature.track_id());
  if (iter == track_info_.end()) {
    return -1;
  }
  return iter->second.start;
}
// Discards all accumulated per-track statistics and restarts the frame
// counter.
void LongFeatureInfo::Reset() {
  track_info_.clear();
  num_frames_ = 0;
}
// Returns the track length at the given percentile (expected in [0, 1]) of
// all recorded track lengths, or 0 when no tracks have been recorded.
int LongFeatureInfo::GlobalTrackLength(float percentile) const {
  std::vector<int> track_lengths;
  track_lengths.reserve(track_info_.size());
  for (const auto& pair : track_info_) {
    track_lengths.push_back(pair.second.length);
  }
  if (track_lengths.empty()) {
    return 0;
  }
  // Clamp the element index to [0, size - 1]: without clamping,
  // percentile >= 1.0 yields end(), and dereferencing it below is undefined
  // behavior; a negative percentile would underflow similarly.
  const int max_index = static_cast<int>(track_lengths.size()) - 1;
  const int index = std::max(
      0, std::min(static_cast<int>(percentile * track_lengths.size()),
                  max_index));
  auto percentile_item = track_lengths.begin() + index;
  // Partial sort: places the index-th smallest element at percentile_item.
  std::nth_element(track_lengths.begin(), percentile_item, track_lengths.end());
  return *percentile_item;
}
// For each cell of a dim_x by dim_y grid, computes the flat indices of all
// cells within tap_radius (Chebyshev distance, clamped to the grid border).
// (*taps)[y * dim_x + x] lists the neighbor indices, in row-major order.
void GridTaps(int dim_x, int dim_y, int tap_radius,
              std::vector<std::vector<int>>* taps) {
  CHECK(taps);
  const int diameter = 2 * tap_radius + 1;
  taps->assign(dim_x * dim_y, std::vector<int>());
  for (int y = 0; y < dim_y; ++y) {
    // Row window clamped to the grid, shared by all cells in this row.
    const int row_begin = std::max(0, y - tap_radius);
    const int row_end = std::min(dim_y - 1, y + tap_radius);
    for (int x = 0; x < dim_x; ++x) {
      const int col_begin = std::max(0, x - tap_radius);
      const int col_end = std::min(dim_x - 1, x + tap_radius);
      std::vector<int>& bin = (*taps)[y * dim_x + x];
      bin.reserve(diameter * diameter);
      for (int row = row_begin; row <= row_end; ++row) {
        for (int col = col_begin; col <= col_end; ++col) {
          bin.push_back(row * dim_x + col);
        }
      }
    }
  }
}
} // namespace mediapipe

Some files were not shown because too many files have changed in this diff Show More