d16cc3be5b
GitOrigin-RevId: d91373b4d4d10abef49cab410caa6aadf0875049
416 lines
18 KiB
Protocol Buffer
416 lines
18 KiB
Protocol Buffer
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

// Homography (used by MotionBoxState.pnp_homography) is not defined in this
// file; presumably it is provided by this import.
import "mediapipe/util/tracking/motion_models.proto";

// State of a single tracked box (MotionBox): its geometry, velocity, tracking
// status and the inlier / outlier statistics gathered during tracking.
//
// Next tag: 38
message MotionBoxState {
  // Position (top-left corner) and fixed size of the current MotionBox,
  // specified w.r.t. normalized domain (in [0, 1] along both dimensions).
  optional float pos_x = 1;
  optional float pos_y = 2;
  optional float width = 3;
  optional float height = 4;

  // Optional degrees of freedom; scale and rotation w.r.t. center of the box,
  // i.e. [pos_x, pos_y] + 0.5 * [width, height].
  // To activate see TrackStepOptions::TrackingDegrees.
  optional float scale = 5 [default = 1.0];
  optional float rotation = 30 [default = 0.0];  // in radians.

  // Quadrilateral given by four 2D vertices, stored as a flat list of
  // interleaved x, y coordinates.
  message Quad {
    // Vertex 0 is according to x_0 = vertices(0), y_0 = vertices(1)
    // Vertex 1 is according to x_1 = vertices(2), y_1 = vertices(3)
    // Vertex 2 is according to x_2 = vertices(4), y_2 = vertices(5)
    // Vertex 3 is according to x_3 = vertices(6), y_3 = vertices(7)
    // Order of vertices should be aligned in counter-clockwise manner
    //   0---------3
    //   |         |
    //   |         |
    //   1---------2
    repeated float vertices = 1;
  }

  // This field is only used when we try to track under
  // TRACKING_DEGREE_OBJECT_PERSPECTIVE.
  optional Quad quad = 34;

  // Aspect ratio (width / height) for the tracked rectangle in physical space.
  optional float aspect_ratio = 35;

  // Whether we want this box to be potentially grouped with other boxes
  // to track together. This is useful for tracking small boxes that lie
  // on a plane. For example, when we detect a plane,
  // track the plane, then all boxes within the plane can share the same
  // homography transform.
  optional bool request_grouping = 37 [default = false];

  // Homography for quad tracking using the pnp (perspective-n-points) solver
  // to track the quad between frames.
  // That mode requires:
  // 1. The quad which is being tracked is a rectangle in the physical world.
  // 2. The `aspect_ratio` field has to be set in MotionBoxState.
  optional Homography pnp_homography = 36;

  // Object velocity in x and y, specified as normalized spatial unit per
  // standard frame period (here calibrated w.r.t. kTrackingDefaultFps = 30
  // FPS), that is 33.3 ms. Object velocity refers to velocity after
  // subtracting camera motion.
  // If the current frame period is 66.67 ms (i.e. 15 fps), the actual velocity
  // is obtained by multiplying with a factor of 2. Similarly, for 60 fps the
  // factor is 0.5.
  // Standard frame period is chosen for legacy reasons to keep TrackStepOptions
  // defaults.
  optional float dx = 7;
  optional float dy = 8;

  // Weighted average of object velocity magnitude of inlier points (expressed
  // in normalized spatial units per standard frame period).
  optional float kinetic_energy = 17;

  // Specifies how valid the prior was in the last step.
  optional float prior_weight = 9;

  // Tracking status indicating the result of tracking.
  enum TrackStatus {
    // Box can not be tracked (either out of bound or too many tracking
    // failures).
    BOX_UNTRACKED = 0;
    // Box has size of <= 0 along at least one of its dimensions (collapsed).
    BOX_EMPTY = 1;
    // No features found within the box, tracking is not possible.
    BOX_NO_FEATURES = 2;
    // Successful tracking.
    BOX_TRACKED = 3;
    // Successfully tracked, but duplicated from previous result as frame was
    // duplicated.
    BOX_DUPLICATED = 4;
    // Successfully tracked, out of bound from screen area. Will advance by
    // camera motion. Only used for static objects.
    BOX_TRACKED_OUT_OF_BOUND = 5;
  }

  optional TrackStatus track_status = 10 [default = BOX_UNTRACKED];

  // Spatial prior (presence of inliers, i.e. where is the object located within
  // the box that is currently being tracked) as a pair of
  // a) prior (in [0, 1]) and
  // b) confidence (number of features converted to score within [0, 1]).
  // Prior is defined over a grid of size spatial_prior_grid_size x
  // spatial_prior_grid_size.
  optional int32 spatial_prior_grid_size = 11 [default = 10];
  repeated float spatial_prior = 12 [packed = true];
  repeated float spatial_confidence = 13 [packed = true];

  // Difference score between previous prior and current prior (in [0, 1]).
  // Currently not used.
  optional float prior_diff = 14;

  // Score determining how much predicted motion disagrees with measured motion.
  // If measured motion deviates strongly from predicted motion, disparity is
  // +/-1, if motion agrees with predicted motion, disparity is 0.
  // Sign indicates measured motion is accelerating (> 0)
  // or decelerating (< 0) w.r.t. predicted motion.
  optional float motion_disparity = 15;

  // Score determining how discriminative estimated motion model is.
  // In [0, 1] where 0 no discrimination w.r.t. background and 1
  // high discrimination.
  optional float background_discrimination = 16;

  // Center of mass for inliers after tracking (center of features that were
  // used for motion estimation).
  optional float inlier_center_x = 18;
  optional float inlier_center_y = 19;

  // Approximate number of inliers (each feature scores a zero [outlier]
  // or one [inlier]).
  optional float inlier_sum = 24;

  // Ratio of above inlier_sum to average inlier_sum across last states.
  optional float inlier_ratio = 25;

  // Extent (width and height of inliers).
  optional float inlier_width = 22;
  optional float inlier_height = 23;

  // Set of current inlier tracking ids.
  repeated uint32 inlier_ids = 26 [packed = true];
  // Corresponding x,y coordinates for each inlier.
  // NOTE(review): how the coordinates are encoded as uint32 values is not
  // visible in this file - confirm against the tracking implementation.
  repeated uint32 inlier_id_match_pos = 31 [packed = true];
  // Corresponding inlier score (currently: length of inlier observed).
  repeated uint32 inlier_length = 27 [packed = true];

  // Set of outlier ids.
  repeated uint32 outlier_ids = 28 [packed = true];
  // Corresponding x,y coordinates for each outlier (see the encoding note on
  // inlier_id_match_pos).
  repeated uint32 outlier_id_match_pos = 32 [packed = true];

  // Confidence of box tracked in the range [0, 1], with 0 being least
  // confident, and 1 being most confident. A reasonable threshold is 0.5
  // to filter out unconfident boxes.
  optional float tracking_confidence = 33;

  // Additional internal state.
  optional MotionBoxInternalState internal = 29;

  // NOTE(review): tag 6 is also unused in this message but is not reserved;
  // confirm whether it was ever assigned before reusing it.
  reserved 20, 21;
}

// Captures additional internal state info about the tracking.
message MotionBoxInternalState {
  // Stores all motion vectors that were used for tracking
  // as packed arrays, capturing position, object motion, camera motion,
  // tracking id and corresponding inlier weight.
  // NOTE(review): the arrays are presumably parallel (entry i of each field
  // describes the same motion vector) - confirm against the writer.
  repeated float pos_x = 1 [packed = true];
  repeated float pos_y = 2 [packed = true];
  repeated float dx = 3 [packed = true];
  repeated float dy = 4 [packed = true];
  repeated float camera_dx = 5 [packed = true];
  repeated float camera_dy = 6 [packed = true];
  repeated int32 track_id = 7 [packed = true];

  // Within [0, 1]. 0 = outlier; 1 = inlier.
  repeated float inlier_score = 8 [packed = true];
}

// Options controlling a single tracking step of a MotionBox.
//
// NOTE(review): tags 5 and 31 are unused in this message but are not reserved;
// confirm whether they were ever assigned before reusing them.
//
// Next tag: 42
message TrackStepOptions {
  // Degrees of freedom being used for tracking. By default tracker only uses
  // translation. Additionally scale and rotation from the camera motion
  // and / or object motion can be taken into account.
  enum TrackingDegrees {
    TRACKING_DEGREE_TRANSLATION = 0;

    // Additional tracking degrees according to camera motion.
    TRACKING_DEGREE_CAMERA_SCALE = 1;
    TRACKING_DEGREE_CAMERA_ROTATION = 2;
    TRACKING_DEGREE_CAMERA_ROTATION_SCALE = 3;
    // TODO: Implement!
    TRACKING_DEGREE_CAMERA_PERSPECTIVE = 4;

    // Tracking degrees modeling object motion. Note that additional
    // object degrees of freedom are only applied when estimation is deemed
    // stable, in particular sufficient inliers are present.
    // By default, does NOT apply camera motion. If that is desired set
    // the flag: track_object_and_camera to true.
    TRACKING_DEGREE_OBJECT_SCALE = 5;
    TRACKING_DEGREE_OBJECT_ROTATION = 6;
    TRACKING_DEGREE_OBJECT_ROTATION_SCALE = 7;
    TRACKING_DEGREE_OBJECT_PERSPECTIVE = 8;
  }

  optional TrackingDegrees tracking_degrees = 28
      [default = TRACKING_DEGREE_TRANSLATION];

  // If set and one of the TRACKING_DEGREE_OBJECT degrees is set, also applies
  // camera motion in addition to the object motion.
  optional bool track_object_and_camera = 32 [default = false];

  // Number of iterations to iteratively estimate model and re-estimate
  // influence of each vector.
  optional int32 irls_iterations = 1 [default = 5];

  // Gaussian spatial prior sigma relative to box size.
  // For motivation, see this plot: http://goo.gl/BCfcy.
  optional float spatial_sigma = 2 [default = 0.15];

  // Gaussian velocity prior sigma. It is computed as the maximum of the
  // absolute minimum sigma (in normalized domain) and the relative sigma
  // w.r.t. previous motion.
  optional float min_motion_sigma = 3 [default = 0.002];
  optional float relative_motion_sigma = 4 [default = 0.3];

  // Settings for motion disparity. Difference between previous and current
  // motion magnitude is scored linearly, from motion_disparity_low_level to
  // motion_disparity_high_level (mapped to score of 0 and 1 respectively).
  // Motivation is to ensure acceleration between frames is within reasonable
  // bounds.
  // Represents a maximum acceleration of around 4 - 5 pixels per frame in 360p
  // video to be unpenalized, with accelerations of around >= 10 pixels being
  // considered inconsistent with prediction.
  optional float motion_disparity_low_level = 6 [default = 8e-3];
  optional float motion_disparity_high_level = 7 [default = 1.6e-2];

  // Motion disparity decays across frames. Disparity of the previous frame
  // decays over time; the larger of the current and the decayed disparity is
  // taken.
  // Motivation is, that if acceleration was unreasonably high (and we likely
  // lost tracking) we enter a stage of trying to regain tracking by looking for
  // vectors that agree with the previous prediction.
  optional float disparity_decay = 8 [default = 0.8];

  // Object motion is given as linear combination of previous and measured
  // motion depending on the motion_disparity (a high disparity is giving high
  // weight to the previous motion).
  // We enforce at least a minimum of the below motion_prior_weight regardless
  // of the motion disparity.
  optional float motion_prior_weight = 9 [default = 0.2];

  // Settings for motion discrimination.
  //
  // Current motion magnitude is scored linearly,
  // from background_discrimination_low_level to
  // background_discrimination_high_level (mapped to score of 0 and 1
  // respectively).
  // Motivation is that high object motions are easy to discriminate from the
  // background, whereas small object motions are virtually indistinguishable.
  // Represents a range of 2 - 4 pixels for 360p video.
  optional float background_discrimination_low_level = 10 [default = 4e-3];
  optional float background_discrimination_high_level = 11 [default = 8e-3];

  // Spring force settings. If difference between predicted center of the box in
  // the next frame and the predicted center of the inliers deviates by more
  // than inlier_center_relative_distance times the box [width|height]
  // a spring force is applied to the box. The amount of force is spring_force
  // times the difference.
  optional float inlier_center_relative_distance = 12 [default = 0.1];
  optional float inlier_spring_force = 13 [default = 0.3];

  // Same as above, but for the center of large motion magnitudes.
  optional float kinetic_center_relative_distance = 14 [default = 0.4];
  optional float kinetic_spring_force = 15 [default = 0.5];

  // Spring force towards large motions is only applied when kinetic energy is
  // above the specified threshold.
  optional float kinetic_spring_force_min_kinetic_energy = 21 [default = 3e-3];

  // Bias of old velocity during update step.
  optional float velocity_update_weight = 16 [default = 0.7];

  // Maximum number of frames considered to be tracking failures ->
  // If over threshold, box is considered untrackable.
  optional int32 max_track_failures = 17 [default = 10];

  // Domain used for tracking is always larger than the current box.
  // If current motion is not negligible, box is expanded in the direction of
  // the motion, otherwise expanded in all directions by the amount specified
  // below (w.r.t. normalized domain).
  optional float expansion_size = 18 [default = 0.05];

  // Features are scored based on the magnitude of their irls weights, mapped to
  // [0, 1] using the following range. The range represents roughly 3 - 1.5
  // pixels error for 360p video.
  optional float inlier_low_weight = 19 [default = 250];
  optional float inlier_high_weight = 20 [default = 500];

  // Kinetic energy decays over time by the specified rate.
  optional float kinetic_energy_decay = 22 [default = 0.98];

  // Amount by which prior is increased/decreased in case of valid/invalid
  // measurements.
  optional float prior_weight_increase = 23 [default = 0.2];

  // We map the amount of present kinetic energy linearly to the domain [0, 1]
  // describing if an object is static (0) or moving (1).
  optional float low_kinetic_energy = 24 [default = 1e-3];   // ~0.4 pix
  optional float high_kinetic_energy = 25 [default = 4e-3];  // ~3 pix

  // Outputs internal state to MotionBoxState.
  optional bool return_internal_state = 26 [default = false];

  // Specifies which weights are stored in the internal state. By default
  // post-estimation weights are stored, otherwise pre-estimation weights
  // are stored.
  optional bool use_post_estimation_weights_for_state = 29 [default = true];

  // Computes spatial grid of inliers and stores it in the MotionBoxState.
  optional bool compute_spatial_prior = 27 [default = false];

  // Irls initialization by performing several rounds of RANSAC to preselect
  // features for motion estimation scoring outliers low and inliers to be at
  // least of median inlier weight.
  message IrlsInitialization {
    optional bool activated = 1 [default = false];

    // Rounds of RANSAC.
    optional int32 rounds = 2 [default = 50];

    // Normalized cutoff threshold for a vector to be considered an inlier.
    optional float cutoff = 3 [default = 0.005];
  }
  optional IrlsInitialization irls_initialization = 30;

  // Ratio between static motion and temporal scale. This is actually
  // the threshold on speed, under which we consider static (non-moving object).
  optional float static_motion_temporal_ratio = 33 [default = 3e-3];

  // Different control parameters to terminate tracking when
  // occlusion occurs.
  message CancelTrackingWithOcclusionOptions {
    optional bool activated = 1 [default = false];
    optional float min_motion_continuity = 2 [default = 0.4];
    optional float min_inlier_ratio = 3 [default = 0.1];
  }
  optional CancelTrackingWithOcclusionOptions
      cancel_tracking_with_occlusion_options = 34;

  // If number of continued inliers is less than this number, then the object
  // motion model will fall back to translation model.
  // Set this min_continued_inliers threshold to a low number to make sure
  // they follow local object rotation and scale, but it may result in un-robust
  // rotation and scale estimation if the threshold is too low. Recommend that
  // you don't set a number < 4.
  optional int32 object_similarity_min_contd_inliers = 35 [default = 30];

  // Maximum acceptable scale component of object similarity transform.
  // Minimum scale is computed as 1.0 / max_scale.
  // Exclusive for tracking a box with similarity.
  optional float box_similarity_max_scale = 36 [default = 1.05];

  // Maximum acceptable object similarity rotation in radians.
  optional float box_similarity_max_rotation = 37 [default = 0.2];

  // Homography transform will first be projected to similarity, and the scale
  // component of the similarity transform should be within the range of
  // [1.0 / max_scale, max_scale].
  optional float quad_homography_max_scale = 38 [default = 1.2];

  // The rotation component of the projected similarity should be smaller than
  // this maximum rotation threshold.
  optional float quad_homography_max_rotation = 39 [default = 0.3];

  // Pre-calibrated camera intrinsics parameters, including focal length, center
  // point, distortion coefficients (only 3 radial factors) and image width /
  // height. The image formation model is described here:
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  // Only used for quad tracking mode. Leave it empty if unknown.
  message CameraIntrinsics {
    optional float fx = 1;
    optional float fy = 2;
    optional float cx = 3;
    optional float cy = 4;
    optional float k0 = 5;
    optional float k1 = 6;
    optional float k2 = 7;
    optional int32 w = 8;
    optional int32 h = 9;
  }
  optional CameraIntrinsics camera_intrinsics = 40;

  // Specifically for quad tracking (aka TRACKING_DEGREE_OBJECT_PERSPECTIVE
  // mode), if aspect_ratio field is set in start pos, pnp tracking will be
  // deployed. If aspect_ratio is unknown (not set), but forced_pnp_tracking is
  // true, we will first estimate the aspect ratio for the 3D quadrangle, then
  // perform pnp tracking. If aspect_ratio is unknown and pnp tracking is not
  // forced, general homography tracking will be deployed.
  optional bool forced_pnp_tracking = 41 [default = false];
}