// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

import "mediapipe/util/tracking/motion_models.proto";

// State of a single tracked MotionBox: box geometry, object velocity, tracking
// status, spatial prior and inlier / outlier statistics.
//
// Next tag: 38
// NOTE(review): tag 6 is neither used nor reserved in this message; confirm it
// was never assigned before handing it out to a new field.
message MotionBoxState {
  // Position (top-left corner) and fixed size of the current MotionBox,
  // specified w.r.t. normalized domain (in [0, 1] along both dimensions).
  optional float pos_x = 1;
  optional float pos_y = 2;
  optional float width = 3;
  optional float height = 4;

  // Optional degrees of freedom; scale and rotation w.r.t. center of the box,
  // i.e. [pos_x, pos_y] + 0.5 * [width, height].
  // To activate see TrackStepOptions::TrackingDegrees.
  optional float scale = 5 [default = 1.0];
  optional float rotation = 30 [default = 0.0];  // in radians.

  // Quadrangle given by four vertices, stored as a flat list of coordinates.
  message Quad {
    // Vertex 0 is according to x_0 = vertices(0), y_0 = vertices(1)
    // Vertex 1 is according to x_1 = vertices(2), y_1 = vertices(3)
    // Vertex 2 is according to x_2 = vertices(4), y_2 = vertices(5)
    // Vertex 3 is according to x_3 = vertices(6), y_3 = vertices(7)
    // Order of vertices should be aligned in counter-clockwise manner
    //  0---------3
    //  |         |
    //  |         |
    //  1---------2
    repeated float vertices = 1;
  }

  // This field is only used when we try to track under
  // TRACKING_DEGREE_OBJECT_PERSPECTIVE.
  optional Quad quad = 34;

  // Aspect ratio (width / height) for the tracked rectangle in physical space.
  optional float aspect_ratio = 35;

  // Whether we want this box to be potentially grouped with other boxes
  // to track together. This is useful for tracking small boxes that lie
  // on a plane. For example, when we detect a plane,
  // track the plane, then all boxes within the plane can share the same
  // homography transform.
  optional bool request_grouping = 37 [default = false];

  // For quad tracking using pnp solver.
  // Whether we use perspective-n-points to track quad between frames.
  // NOTE(review): the field is a Homography rather than a flag; presumably its
  // presence signals pnp tracking. Confirm against the tracker implementation.
  // That mode requires:
  // 1. The quad which is being tracked is a rectangle in the physical world.
  // 2. The `aspect_ratio` field has to be set in MotionBoxState.
  optional Homography pnp_homography = 36;

  // Object velocity in x and y, specified as normalized spatial unit per
  // standard frame period (here calibrated w.r.t. kTrackingDefaultFps = 30
  // FPS), that is 33.3 ms. Object velocity refers to velocity after
  // subtracting camera motion.
  // If current frame period is 66.67 ms (i.e. 15 fps), actual velocity is
  // obtained by multiplying with a factor of 2. Similarly, for 60 fps the
  // factor is 0.5f.
  // Standard frame period is chosen for legacy reasons to keep TrackStepOptions
  // defaults.
  optional float dx = 7;
  optional float dy = 8;

  // Weighted average of object velocity magnitude of inlier points (expressed
  // in normalized spatial units per standard frame period).
  optional float kinetic_energy = 17;

  // Specifies how valid the prior was in the last step.
  optional float prior_weight = 9;

  // Tracking status indicating result of tracking:
  // BOX_UNTRACKED:  Box can not be tracked
  //                 (either out of bound or too many tracking failures).
  // BOX_EMPTY:      Box has size of <= 0 along at least one of its dimensions
  //                 (collapsed).
  // BOX_NO_FEATURES: No features found within the box, tracking is not
  //                 possible.
  // BOX_TRACKED:    Successful tracking.
  // BOX_DUPLICATED: Successfully tracked, but duplicated from previous result
  //                 as frame was duplicated.
  // BOX_TRACKED_OUT_OF_BOUND: Successfully tracked, out of bound from screen
  //                 area. Will advance by camera motion. Only used for static
  //                 objects.
  enum TrackStatus {
    BOX_UNTRACKED = 0;
    BOX_EMPTY = 1;
    BOX_NO_FEATURES = 2;
    BOX_TRACKED = 3;
    BOX_DUPLICATED = 4;
    BOX_TRACKED_OUT_OF_BOUND = 5;
  }

  optional TrackStatus track_status = 10 [default = BOX_UNTRACKED];

  // Spatial prior (presence of inliers, i.e. where is the object located within
  // the box that is currently being tracked) as a pair of
  // a) prior (in [0, 1]) and
  // b) confidence (number of features converted to score within
  //    [0, 1]).
  // Prior is defined over a grid of size spatial_prior_grid_size x
  // spatial_prior_grid_size.
  optional int32 spatial_prior_grid_size = 11 [default = 10];
  repeated float spatial_prior = 12 [packed = true];
  repeated float spatial_confidence = 13 [packed = true];

  // Difference score between previous prior and current prior (in [0, 1]).
  // Currently not used.
  optional float prior_diff = 14;

  // Score determining how much predicted motion disagrees with measured motion.
  // If measured motion deviates strongly from predicted motion, disparity is
  // +/-1, if motion agrees with predicted motion, disparity is 0.
  // Sign indicates measured motion is accelerating (> 0)
  // or de-accelerating (< 0) w.r.t. predicted motion.
  optional float motion_disparity = 15;

  // Score determining how discriminative estimated motion model is.
  // In [0, 1] where 0 no discrimination w.r.t. background and 1
  // high discrimination.
  optional float background_discrimination = 16;

  // Center of mass for inliers after tracking (center of features that were
  // used for motion estimation).
  optional float inlier_center_x = 18;
  optional float inlier_center_y = 19;

  // Approximate number of inliers (each feature scores a zero [outlier]
  // or one [inlier]).
  optional float inlier_sum = 24;

  // Ratio of above inlier_sum to average inlier_sum across last states.
  optional float inlier_ratio = 25;

  // Extent (width and height of inliers).
  optional float inlier_width = 22;
  optional float inlier_height = 23;

  // Set of current inlier tracking ids.
  repeated uint32 inlier_ids = 26 [packed = true];
  // Corresponding x,y coordinates for each inlier.
  repeated uint32 inlier_id_match_pos = 31 [packed = true];
  // Corresponding inlier score (currently: length of inlier observed).
  repeated uint32 inlier_length = 27 [packed = true];

  // Set of outlier ids.
  repeated uint32 outlier_ids = 28 [packed = true];
  // Corresponding x,y coordinates for each outlier.
  repeated uint32 outlier_id_match_pos = 32 [packed = true];

  // Confidence of box tracked in the range [0, 1], with 0 being least
  // confident, and 1 being most confident. A reasonable threshold is 0.5
  // to filter out unconfident boxes.
  optional float tracking_confidence = 33;

  // Additional internal state.
  optional MotionBoxInternalState internal = 29;

  // Tags 20 and 21 are reserved and must not be reused for new fields.
  reserved 20, 21;
}

// Captures additional internal state info about the tracking.
message MotionBoxInternalState {
  // Stores all motion vectors that were used for tracking
  // as packed arrays, capturing position, object motion, camera motion,
  // tracking id and corresponding inlier weight.
  // NOTE(review): presumably these are parallel arrays in which index i of
  // every field describes the same motion vector. Confirm against the writer.
  repeated float pos_x = 1 [packed = true];
  repeated float pos_y = 2 [packed = true];
  repeated float dx = 3 [packed = true];
  repeated float dy = 4 [packed = true];
  repeated float camera_dx = 5 [packed = true];
  repeated float camera_dy = 6 [packed = true];
  repeated int32 track_id = 7 [packed = true];
  // Within [0, 1]. 0 = outlier; 1 = inlier.
  repeated float inlier_score = 8 [packed = true];
}

// Options for box tracking: degrees of freedom, priors, thresholds and
// quad / pnp tracking settings.
//
// Next tag: 42
// NOTE(review): tags 5 and 31 are neither used nor reserved in this message;
// confirm they were never assigned before handing them out to new fields.
message TrackStepOptions {
  // Degrees of freedom being used for tracking. By default tracker only uses
  // translation. Additionally scale and rotation from the camera motion
  // and / or object motion can be taken into account.
  enum TrackingDegrees {
    TRACKING_DEGREE_TRANSLATION = 0;

    // Additional tracking degrees according to camera motion.
    TRACKING_DEGREE_CAMERA_SCALE = 1;
    TRACKING_DEGREE_CAMERA_ROTATION = 2;
    TRACKING_DEGREE_CAMERA_ROTATION_SCALE = 3;
    // TODO: Implement!
    TRACKING_DEGREE_CAMERA_PERSPECTIVE = 4;

    // Tracking degrees modeling object motion. Note that additional
    // object degrees of freedom are only applied when estimation is deemed
    // stable, in particular sufficient inliers are present.
    // By default, does NOT apply camera motion. If that is desired set
    // the flag: track_object_and_camera to true.
    TRACKING_DEGREE_OBJECT_SCALE = 5;
    TRACKING_DEGREE_OBJECT_ROTATION = 6;
    TRACKING_DEGREE_OBJECT_ROTATION_SCALE = 7;
    TRACKING_DEGREE_OBJECT_PERSPECTIVE = 8;
  }

  optional TrackingDegrees tracking_degrees = 28
      [default = TRACKING_DEGREE_TRANSLATION];

  // If set and one of the TRACKING_DEGREE_OBJECT degrees is set, also applies
  // camera motion in addition to the object motion.
  optional bool track_object_and_camera = 32 [default = false];

  // Number of iterations to iteratively estimate model and re-estimate
  // influence of each vector.
  optional int32 irls_iterations = 1 [default = 5];

  // Gaussian spatial prior sigma relative to box size.
  // For motivation, see this plot: http://goo.gl/BCfcy.
  optional float spatial_sigma = 2 [default = 0.15];

  // Gaussian velocity prior sigma. It is computed as the maximum of the
  // absolute minimum sigma (in normalized domain) and the relative sigma
  // w.r.t. previous motion.
  optional float min_motion_sigma = 3 [default = 0.002];
  optional float relative_motion_sigma = 4 [default = 0.3];

  // Settings for motion disparity. Difference between previous and current
  // motion magnitude is scored linearly, from motion_disparity_low_level to
  // motion_disparity_high_level (mapped to score of 0 and 1 respectively).
  // Motivation is to ensure acceleration between frames are within reasonable
  // bounds.
  // Represents a maximum acceleration of around 4 - 5 pixels per frame in 360p
  // video to be unpenalized, with accelerations of around >= 10 pixels being
  // considered inconsistent with prediction.
  optional float motion_disparity_low_level = 6 [default = 8e-3];
  optional float motion_disparity_high_level = 7 [default = 1.6e-2];

  // Motion disparity decays across frames: the disparity of the previous frame
  // decays over time, and the larger of the current and the decayed disparity
  // is taken.
  // Motivation is, that if acceleration was unreasonably high (and we likely
  // lost tracking) we enter a stage of trying to regain tracking by looking for
  // vectors that agree with the previous prediction.
  optional float disparity_decay = 8 [default = 0.8];

  // Object motion is given as linear combination of previous and measured
  // motion depending on the motion_disparity (a high disparity is giving high
  // weight to the previous motion).
  // We enforce at least a minimum of the below motion_prior_weight regardless
  // of the motion disparity.
  optional float motion_prior_weight = 9 [default = 0.2];

  // Settings for motion discrimination.
  //
  // Current motion magnitude is scored linearly,
  // from background_discrimination_low_level to
  // background_discrimination_high_level (mapped to score of 0 and 1
  // respectively).
  // Motivation is that high object motions are easy to discriminate from the
  // background, whereas small object motions are virtually indistinguishable.
  // Represents a range of 2 - 4 pixels for 360p video.
  optional float background_discrimination_low_level = 10 [default = 4e-3];
  optional float background_discrimination_high_level = 11 [default = 8e-3];

  // Spring force settings. If difference between predicted center of the box in
  // the next frame and the predicted center of the inliers deviates by more
  // than inlier_center_relative_distance times the box [width|height]
  // a spring force is applied to the box. The amount of force is
  // inlier_spring_force times the difference.
  optional float inlier_center_relative_distance = 12 [default = 0.1];
  optional float inlier_spring_force = 13 [default = 0.3];

  // Same as above, but for the center of large motion magnitudes.
  optional float kinetic_center_relative_distance = 14 [default = 0.4];
  optional float kinetic_spring_force = 15 [default = 0.5];

  // Spring force towards large motions is only applied when kinetic energy is
  // above the specified threshold.
  optional float kinetic_spring_force_min_kinetic_energy = 21 [default = 3e-3];

  // Bias of old velocity during update step.
  optional float velocity_update_weight = 16 [default = 0.7];

  // Maximum number of frames considered to be tracking failures ->
  // If over threshold, box is considered untrackable.
  optional int32 max_track_failures = 17 [default = 10];

  // Domain used for tracking is always larger than the current box.
  // If current motion is not negligible, box is expanded in the direction of
  // the motion, otherwise expanded in all directions by the amount specified
  // below (w.r.t. normalized domain).
  optional float expansion_size = 18 [default = 0.05];

  // Features are scored based on the magnitude of their irls weights, mapped to
  // [0, 1] using the following range. The range represents roughly 3 - 1.5
  // pixels error for 360p video.
  optional float inlier_low_weight = 19 [default = 250];
  optional float inlier_high_weight = 20 [default = 500];

  // Kinetic energy decays over time by the specified rate.
  optional float kinetic_energy_decay = 22 [default = 0.98];

  // Amount by which prior is increased/decreased in case of valid/invalid
  // measurements.
  optional float prior_weight_increase = 23 [default = 0.2];

  // We map the amount of present kinetic energy linearly to the domain [0, 1]
  // describing if an object is static (0) or moving (1).
  optional float low_kinetic_energy = 24 [default = 1e-3];   // ~0.4 pix
  optional float high_kinetic_energy = 25 [default = 4e-3];  // ~3 pix

  // Outputs internal state to MotionBoxState.
  optional bool return_internal_state = 26 [default = false];

  // Specifies which weights are stored in the internal state. By default
  // post-estimation weights are stored, otherwise pre-estimation weights
  // are stored.
  optional bool use_post_estimation_weights_for_state = 29 [default = true];

  // Computes spatial grid of inliers and stores it in the MotionBoxState.
  optional bool compute_spatial_prior = 27 [default = false];

  // Irls initialization by performing several rounds of RANSAC to preselect
  // features for motion estimation scoring outliers low and inliers to be at
  // least of median inlier weight.
  message IrlsInitialization {
    // Enables the RANSAC-based initialization; off by default.
    optional bool activated = 1 [default = false];

    // Rounds of RANSAC.
    optional int32 rounds = 2 [default = 50];

    // Normalized cutoff threshold for a vector to be considered an inlier.
    optional float cutoff = 3 [default = 0.005];
  }

  optional IrlsInitialization irls_initialization = 30;

  // Ratio between static motion and temporal scale. This is actually
  // the threshold on speed, under which we consider static (non-moving object).
  optional float static_motion_temporal_ratio = 33 [default = 3e-3];

  // Different control parameters to terminate tracking when
  // occlusion occurs.
  message CancelTrackingWithOcclusionOptions {
    // Enables occlusion-based cancellation; off by default.
    optional bool activated = 1 [default = false];
    // NOTE(review): the semantics of the two thresholds below are not
    // documented in this file; presumably tracking is cancelled when the
    // measured motion continuity or inlier ratio falls below them. Confirm
    // against the tracker implementation.
    optional float min_motion_continuity = 2 [default = 0.4];
    optional float min_inlier_ratio = 3 [default = 0.1];
  }

  optional CancelTrackingWithOcclusionOptions
      cancel_tracking_with_occlusion_options = 34;

  // If number of continued inliers is less than this number, then the object
  // motion model will fall back to translation model.
  // Set this threshold to a low number to make sure boxes follow local object
  // rotation and scale, but it may result in un-robust rotation and scale
  // estimation if the threshold is too low. Recommend that you don't set a
  // number < 4.
  optional int32 object_similarity_min_contd_inliers = 35 [default = 30];

  // Maximum acceptable scale component of object similarity transform.
  // Minimum scale is computed as 1.0 / max_scale.
  // Exclusive for tracking a box with similarity.
  optional float box_similarity_max_scale = 36 [default = 1.05];

  // Maximum acceptable object similarity rotation in radians.
  optional float box_similarity_max_rotation = 37 [default = 0.2];

  // Homography transform will first be projected to similarity, and the scale
  // component of the similarity transform should be within the range of
  // [1.0 / max_scale, max_scale].
  optional float quad_homography_max_scale = 38 [default = 1.2];

  // The rotation component of the projected similarity should be smaller than
  // this maximum rotation threshold.
  optional float quad_homography_max_rotation = 39 [default = 0.3];

  // Pre-calibrated camera intrinsics parameters, including focal length, center
  // point, distortion coefficients (only 3 radial factors) and image width /
  // height. The image formation model is described here:
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  // Only used for quad tracking mode. Leave it empty if unknown.
  // NOTE(review): units of the focal length and center point are not stated
  // here (presumably pixels, matching w / h); confirm before relying on it.
  message CameraIntrinsics {
    // Focal length along x and y.
    optional float fx = 1;
    optional float fy = 2;
    // Center point.
    optional float cx = 3;
    optional float cy = 4;
    // Radial distortion coefficients.
    optional float k0 = 5;
    optional float k1 = 6;
    optional float k2 = 7;
    // Image width and height.
    optional int32 w = 8;
    optional int32 h = 9;
  }

  optional CameraIntrinsics camera_intrinsics = 40;

  // Specifically for quad tracking (aka TRACKING_DEGREE_OBJECT_PERSPECTIVE
  // mode), if aspect_ratio field is set in start pos, pnp tracking will be
  // deployed. If aspect_ratio is unknown (not set), but forced_pnp_tracking is
  // true, we will first estimate the aspect ratio for the 3D quadrangle, then
  // perform pnp tracking. If aspect_ratio is unknown and pnp tracking is not
  // forced, general homography tracking will be deployed.
  optional bool forced_pnp_tracking = 41 [default = false];
}