// mediapipe/mediapipe/util/tracking/tracking.proto

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

import "mediapipe/util/tracking/motion_models.proto";

// State of a single tracked MotionBox: box geometry in the normalized domain,
// object velocity, tracking status and statistics about the inlier features
// used for motion estimation.
//
// NOTE(review): tag 6 is neither used nor reserved by any field visible in
// this message -- confirm whether it belongs in the `reserved` list at the
// bottom.
//
// Next tag: 38
message MotionBoxState {
  // Position (top-left corner) and fixed size of the current MotionBox,
  // specified w.r.t. normalized domain (in [0, 1] along both dimensions).
  optional float pos_x = 1;
  optional float pos_y = 2;
  optional float width = 3;
  optional float height = 4;

  // Optional degrees of freedom; scale and rotation w.r.t. center of the box,
  // i.e. [pos_x, pos_y] + 0.5 * [width, height].
  // To activate see TrackStepOptions::TrackingDegrees.
  optional float scale = 5 [default = 1.0];
  optional float rotation = 30 [default = 0.0];  // in radians.

  // Quadrilateral described by four vertices, stored as a flat list of
  // interleaved coordinates [x_0, y_0, x_1, y_1, x_2, y_2, x_3, y_3].
  message Quad {
    // Vertex 0 is given by x_0 = vertices(0), y_0 = vertices(1)
    // Vertex 1 is given by x_1 = vertices(2), y_1 = vertices(3)
    // Vertex 2 is given by x_2 = vertices(4), y_2 = vertices(5)
    // Vertex 3 is given by x_3 = vertices(6), y_3 = vertices(7)
    // Vertices should be ordered counter-clockwise:
    //          0---------3
    //          |         |
    //          |         |
    //          1---------2
    repeated float vertices = 1;
  }

  // This field is only used when we try to track under
  // TRACKING_DEGREE_OBJECT_PERSPECTIVE.
  optional Quad quad = 34;

  // Aspect ratio (width / height) for the tracked rectangle in physical space.
  optional float aspect_ratio = 35;

  // Whether we want this box to be potentially grouped with other boxes
  // to track together. This is useful for tracking small boxes that lie
  // on a plane. For example, when we detect a plane,
  // track the plane, then all boxes within the plane can share the same
  // homography transform.
  optional bool request_grouping = 37 [default = false];

  // For quad tracking using the pnp (perspective-n-points) solver to track
  // the quad between frames.
  // That mode requires:
  //   1. The quad which is being tracked is a rectangle in the physical world.
  //   2. The `aspect_ratio` field has to be set in MotionBoxState.
  // NOTE(review): this field is a Homography message (presumably declared in
  // the imported motion_models.proto), not a boolean flag -- confirm what the
  // stored homography represents and tighten this description accordingly.
  optional Homography pnp_homography = 36;

  // Object velocity in x and y, specified as normalized spatial unit per
  // standard frame period (here calibrated w.r.t. kTrackingDefaultFps = 30
  // FPS), that is 33.3 ms. Object velocity refers to velocity after
  // subtracting camera motion.
  // If the current frame period is 66.67 ms (i.e. 15 fps), the actual velocity
  // is obtained by multiplying with a factor of 2. Similarly, for 60 fps the
  // factor is 0.5f.
  // Standard frame period is chosen for legacy reasons to keep TrackStepOptions
  // defaults.
  optional float dx = 7;
  optional float dy = 8;

  // Weighted average of object velocity magnitude of inlier points (expressed
  // in normalized spatial units per standard frame period).
  optional float kinetic_energy = 17;

  // Specifies how valid the prior was in the last step.
  optional float prior_weight = 9;

  // Tracking status indicating result of tracking:
  //   BOX_UNTRACKED: Box can not be tracked
  //                  (either out of bound or too many tracking failures).
  //   BOX_EMPTY: Box has size of <= 0 along at least one of its dimensions
  //              (collapsed).
  //   BOX_NO_FEATURES: No features found within the box, tracking is not
  //                    possible.
  //   BOX_TRACKED: Successful tracking.
  //   BOX_DUPLICATED: Successfully tracked, but duplicated from previous
  //                   result as frame was duplicated.
  //   BOX_TRACKED_OUT_OF_BOUND: Successfully tracked, out of bound from screen
  //                             area. Will advance by camera motion. Only used
  //                             for static objects.
  enum TrackStatus {
    BOX_UNTRACKED = 0;
    BOX_EMPTY = 1;
    BOX_NO_FEATURES = 2;
    BOX_TRACKED = 3;
    BOX_DUPLICATED = 4;
    BOX_TRACKED_OUT_OF_BOUND = 5;
  }

  optional TrackStatus track_status = 10 [default = BOX_UNTRACKED];

  // Spatial prior (presence of inliers, i.e. where is the object located within
  // the box that is currently being tracked) as a pair of
  // a) prior (in [0, 1]) and
  // b) confidence (number of features converted to score within
  // [0, 1]).
  // Prior is defined over a grid of size spatial_prior_grid_size x
  // spatial_prior_grid_size.
  // NOTE(review): presumably spatial_prior and spatial_confidence each hold
  // one entry per grid cell -- confirm layout against the tracker.
  optional int32 spatial_prior_grid_size = 11 [default = 10];
  repeated float spatial_prior = 12 [packed = true];
  repeated float spatial_confidence = 13 [packed = true];

  // Difference score between previous prior and current prior (in [0, 1]).
  // Currently not used.
  optional float prior_diff = 14;

  // Score determining how much predicted motion disagrees with measured motion.
  // If measured motion deviates strongly from predicted motion, disparity is
  // +/-1, if motion agrees with predicted motion, disparity is 0.
  // Sign indicates measured motion is accelerating (> 0)
  // or de-accelerating (< 0) w.r.t. predicted motion.
  optional float motion_disparity = 15;

  // Score determining how discriminative estimated motion model is.
  // In [0, 1] where 0 no discrimination w.r.t. background and 1
  // high discrimination.
  optional float background_discrimination = 16;

  // Center of mass for inliers after tracking (center of features that were
  // used for motion estimation).
  optional float inlier_center_x = 18;
  optional float inlier_center_y = 19;

  // Approximate number of inliers (each feature scores a zero [outlier]
  // or one [inlier]).
  optional float inlier_sum = 24;

  // Ratio of above inlier_sum to average inlier_sum across last states.
  optional float inlier_ratio = 25;

  // Extent (width and height of inliers).
  optional float inlier_width = 22;
  optional float inlier_height = 23;

  // Set of current inlier tracking ids.
  repeated uint32 inlier_ids = 26 [packed = true];
  // Corresponding x,y coordinates for each inlier.
  // NOTE(review): how x,y are encoded into the uint32 entries is not visible
  // here -- confirm against the tracker implementation.
  repeated uint32 inlier_id_match_pos = 31 [packed = true];
  // Corresponding inlier score (currently: length of inlier observed).
  repeated uint32 inlier_length = 27 [packed = true];

  // Set of outlier ids.
  repeated uint32 outlier_ids = 28 [packed = true];
  // Corresponding x,y coordinates for each outlier.
  // NOTE(review): same uint32 encoding question as inlier_id_match_pos.
  repeated uint32 outlier_id_match_pos = 32 [packed = true];

  // Confidence of box tracked in the range [0, 1], with 0 being least
  // confident, and 1 being most confident. A reasonable threshold is 0.5
  // to filter out unconfident boxes.
  optional float tracking_confidence = 33;

  // Additional internal state.
  optional MotionBoxInternalState internal = 29;

  // Reserved tags (presumably former fields); must not be reused.
  reserved 20, 21;
}

// Captures additional internal state info about the tracking.
message MotionBoxInternalState {
  // Stores all motion vectors that were used for tracking
  // as packed arrays, capturing position, object motion, camera motion,
  // tracking id and corresponding inlier weight.
  // NOTE(review): the arrays are presumably index-aligned, i.e. entry i of
  // every field describes the same motion vector -- confirm against the code
  // that fills this message.

  // Position of each motion vector.
  repeated float pos_x = 1 [packed = true];
  repeated float pos_y = 2 [packed = true];
  // Object motion of each motion vector.
  repeated float dx = 3 [packed = true];
  repeated float dy = 4 [packed = true];
  // Camera motion of each motion vector.
  repeated float camera_dx = 5 [packed = true];
  repeated float camera_dy = 6 [packed = true];
  // Tracking id of each motion vector.
  repeated int32 track_id = 7 [packed = true];

  // Inlier weight of each motion vector.
  // Within [0, 1]. 0 = outlier; 1 = inlier.
  repeated float inlier_score = 8 [packed = true];
}

// Tunable parameters for box tracking: which degrees of freedom are tracked,
// how motion vectors are weighted and scored, and when tracking is considered
// to have failed. Some options control what is written to MotionBoxState.
//
// NOTE(review): tags 5 and 31 are neither used nor reserved by any field
// visible in this message -- confirm whether they should be reserved.
//
// Next tag: 42
message TrackStepOptions {
  // Degrees of freedom being used for tracking. By default tracker only uses
  // translation. Additionally scale and rotation from the camera motion
  // and / or object motion can be taken into account.
  enum TrackingDegrees {
    TRACKING_DEGREE_TRANSLATION = 0;

    // Additional tracking degrees according to camera motion.
    TRACKING_DEGREE_CAMERA_SCALE = 1;
    TRACKING_DEGREE_CAMERA_ROTATION = 2;
    TRACKING_DEGREE_CAMERA_ROTATION_SCALE = 3;
    // TODO: Implement!
    TRACKING_DEGREE_CAMERA_PERSPECTIVE = 4;

    // Tracking degrees modeling object motion. Note that additional
    // object degrees of freedom are only applied when estimation is deemed
    // stable, in particular sufficient inliers are present.
    // By default, does NOT apply camera motion. If that is desired set
    // the flag: track_object_and_camera to true.
    TRACKING_DEGREE_OBJECT_SCALE = 5;
    TRACKING_DEGREE_OBJECT_ROTATION = 6;
    TRACKING_DEGREE_OBJECT_ROTATION_SCALE = 7;
    TRACKING_DEGREE_OBJECT_PERSPECTIVE = 8;
  }

  optional TrackingDegrees tracking_degrees = 28
      [default = TRACKING_DEGREE_TRANSLATION];

  // If set and one of the TRACKING_DEGREE_OBJECT degrees is set, also applies
  // camera motion in addition to the object motion.
  optional bool track_object_and_camera = 32 [default = false];

  // Number of iterations to iteratively estimate model and re-estimate
  // influence of each vector.
  optional int32 irls_iterations = 1 [default = 5];

  // Gaussian spatial prior sigma relative to box size.
  // For motivation, see this plot: http://goo.gl/BCfcy.
  optional float spatial_sigma = 2 [default = 0.15];

  // Gaussian velocity prior sigma. It is computed as the maximum of the
  // absolute minimum sigma (in normalized domain) and the relative sigma
  // w.r.t. previous motion.
  optional float min_motion_sigma = 3 [default = 0.002];
  optional float relative_motion_sigma = 4 [default = 0.3];

  // Settings for motion disparity. Difference between previous and current
  // motion magnitude is scored linearly, from motion_disparity_low_level to
  // motion_disparity_high_level (mapped to score of 0 and 1 respectively).
  // Motivation is to ensure acceleration between frames is within reasonable
  // bounds.
  // Represents a maximum acceleration of around 4 - 5 pixels per frame in 360p
  // video to be unpenalized, with accelerations of around >= 10 pixels being
  // considered inconsistent with prediction.
  optional float motion_disparity_low_level = 6 [default = 8e-3];
  optional float motion_disparity_high_level = 7 [default = 1.6e-2];

  // Motion disparity decays across frames: the disparity of the previous frame
  // is decayed over time, and the larger of the current and the decayed
  // disparity is taken (i.e. the decayed value is used if the disparity in the
  // current frame is not higher).
  // Motivation is, that if acceleration was unreasonably high (and we likely
  // lost tracking) we enter a stage of trying to regain tracking by looking for
  // vectors that agree with the previous prediction.
  optional float disparity_decay = 8 [default = 0.8];

  // Object motion is given as linear combination of previous and measured
  // motion depending on the motion_disparity (a high disparity is giving high
  // weight to the previous motion).
  // We enforce at least a minimum of the below motion_prior_weight regardless
  // of the motion disparity.
  optional float motion_prior_weight = 9 [default = 0.2];

  // Settings for motion discrimination.
  //
  // Current motion magnitude is scored linearly,
  // from background_discrimination_low_level to
  // background_discrimination_high_level (mapped to score of 0 and 1
  // respectively).
  // Motivation is that high object motions are easy to discriminate from the
  // background, whereas small object motions are virtually indistinguishable.
  // Represents a range of 2 - 4 pixels for 360p video.
  optional float background_discrimination_low_level = 10 [default = 4e-3];
  optional float background_discrimination_high_level = 11 [default = 8e-3];

  // Spring force settings. If difference between predicted center of the box in
  // the next frame and the predicted center of the inliers deviates by more
  // than inlier_center_relative_distance times the box [width|height]
  // a spring force is applied to the box. The amount of force is spring_force
  // times the difference.
  optional float inlier_center_relative_distance = 12 [default = 0.1];
  optional float inlier_spring_force = 13 [default = 0.3];

  // Same as above, but for the center of large motion magnitudes.
  optional float kinetic_center_relative_distance = 14 [default = 0.4];
  optional float kinetic_spring_force = 15 [default = 0.5];

  // Spring force towards large motions is only applied when kinetic energy is
  // above the specified threshold.
  optional float kinetic_spring_force_min_kinetic_energy = 21 [default = 3e-3];

  // Bias of old velocity during update step.
  optional float velocity_update_weight = 16 [default = 0.7];

  // Maximum number of frames considered to be tracking failures ->
  // If over threshold, box is considered untrackable.
  optional int32 max_track_failures = 17 [default = 10];

  // Domain used for tracking is always larger than the current box.
  // If current motion is not negligible, box is expanded in the direction of
  // the motion, otherwise expanded in all directions by the amount specified
  // below (w.r.t. normalized domain).
  optional float expansion_size = 18 [default = 0.05];

  // Features are scored based on the magnitude of their irls weights, mapped to
  // [0, 1] using the following range. The range represents roughly 3 - 1.5
  // pixels error for 360p video.
  optional float inlier_low_weight = 19 [default = 250];
  optional float inlier_high_weight = 20 [default = 500];

  // Kinetic energy decays over time by the specified rate.
  optional float kinetic_energy_decay = 22 [default = 0.98];

  // Amount by which prior is increased/decreased in case of valid/invalid
  // measurements.
  optional float prior_weight_increase = 23 [default = 0.2];

  // We map the amount of present kinetic energy linearly to the domain [0, 1]
  // describing if an object is static (0) or moving (1).
  // NOTE(review): the pixel equivalents in the trailing comments are not
  // proportional to the defaults (1e-3 -> ~0.4 pix but 4e-3 -> ~3 pix) --
  // confirm which figures are intended.
  optional float low_kinetic_energy = 24 [default = 1e-3];   // ~0.4 pix
  optional float high_kinetic_energy = 25 [default = 4e-3];  // ~3 pix

  // Outputs internal state to MotionBoxState.
  optional bool return_internal_state = 26 [default = false];

  // Specifies which weights are stored in the internal state. By default
  // post-estimation weights are stored, otherwise pre-estimation weights
  // are stored.
  optional bool use_post_estimation_weights_for_state = 29 [default = true];

  // Computes spatial grid of inliers and stores it in the MotionBoxState.
  optional bool compute_spatial_prior = 27 [default = false];

  // Irls initialization by performing several rounds of RANSAC to preselect
  // features for motion estimation scoring outliers low and inliers to be at
  // least of median inlier weight.
  message IrlsInitialization {
    // Enables the RANSAC-based initialization; off by default.
    optional bool activated = 1 [default = false];

    // Rounds of RANSAC.
    optional int32 rounds = 2 [default = 50];

    // Normalized cutoff threshold for a vector to be considered an inlier.
    optional float cutoff = 3 [default = 0.005];
  }
  optional IrlsInitialization irls_initialization = 30;

  // Ratio between static motion and temporal scale. This is actually
  // the threshold on speed, under which we consider static (non-moving object).
  optional float static_motion_temporal_ratio = 33 [default = 3e-3];

  // Different control parameters to terminate tracking when
  // occlusion occurs.
  // NOTE(review): the individual thresholds are not documented in this file;
  // presumably tracking is cancelled when motion continuity or inlier ratio
  // drops below the respective minimum -- confirm against the tracker
  // implementation.
  message CancelTrackingWithOcclusionOptions {
    // Enables occlusion-based cancellation; off by default.
    optional bool activated = 1 [default = false];
    optional float min_motion_continuity = 2 [default = 0.4];
    optional float min_inlier_ratio = 3 [default = 0.1];
  }
  optional CancelTrackingWithOcclusionOptions
      cancel_tracking_with_occlusion_options = 34;

  // If number of continued inliers is less than this number, then the object
  // motion model will fall back to translation model.
  // Set this min_continued_inliers threshold to a low number to make sure
  // they follow local object rotation and scale, but it may result in un-robust
  // rotation and scale estimation if the threshold is too low. Recommend that
  // you don't set a number < 4.
  optional int32 object_similarity_min_contd_inliers = 35 [default = 30];

  // Maximum acceptable scale component of object similarity transform.
  // Minimum scale is computed as 1.0 / max_scale.
  // Exclusive for tracking a box with similarity.
  optional float box_similarity_max_scale = 36 [default = 1.05];

  // Maximum acceptable object similarity rotation in radians.
  optional float box_similarity_max_rotation = 37 [default = 0.2];

  // Homography transform will first be projected to similarity, and the scale
  // component of the similarity transform should be within the range of
  // [1.0 / max_scale, max_scale].
  optional float quad_homography_max_scale = 38 [default = 1.2];

  // The rotation component of the projected similarity should be smaller than
  // this maximum rotation threshold.
  optional float quad_homography_max_rotation = 39 [default = 0.3];

  // Pre-calibrated camera intrinsics parameters, including focal length, center
  // point, distortion coefficients (only 3 radial factors) and image width /
  // height. The image formation model is described here:
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  // Only used for quad tracking mode. Leave it empty if unknown.
  // NOTE(review): units are not stated in this file (presumably pixels for
  // focal length, center point and image size) -- confirm with the caller.
  message CameraIntrinsics {
    // Focal length along x and y.
    optional float fx = 1;
    optional float fy = 2;
    // Center point.
    optional float cx = 3;
    optional float cy = 4;
    // Radial distortion coefficients.
    optional float k0 = 5;
    optional float k1 = 6;
    optional float k2 = 7;
    // Image width and height.
    optional int32 w = 8;
    optional int32 h = 9;
  }
  optional CameraIntrinsics camera_intrinsics = 40;

  // Specifically for quad tracking (aka TRACKING_DEGREE_OBJECT_PERSPECTIVE
  // mode), if aspect_ratio field is set in start pos, pnp tracking will be
  // deployed. If aspect_ratio is unknown (not set), but forced_pnp_tracking is
  // true, we will first estimate the aspect ratio for the 3D quadrangle, then
  // perform pnp tracking. If aspect_ratio is unknown and pnp tracking is not
  // forced, general homography tracking will be deployed.
  optional bool forced_pnp_tracking = 41 [default = false];
}