// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Options for tracking by pyramidical Lucas-Kanade. Local outlier rejection
// is performed by enforcing translation models over regions (grid-based
// or obtained from segmentation).

syntax = "proto2";

package mediapipe;

import "mediapipe/util/tracking/tone_estimation.proto";

// Next tag: 33
message TrackingOptions {
  // Describes direction of flow during feature tracking and for the output
  // region flow.
  enum FlowDirection {
    FORWARD = 1;        // Tracks are forward, from frame N-k -> frame N
                        // (k > 0).
    BACKWARD = 2;       // Tracks are backward, from frame N -> frame N-k
                        // (k > 0).
    CONSECUTIVELY = 3;  // Try forward and backward tracking consecutively.
                        // Forward tracking is done first. If the flow is
                        // stable, i.e. there are sufficient tracked features,
                        // the region flow computation ends. Otherwise,
                        // backward tracking is performed.
  }

  // Flow direction used internally during tracking features. Forward tracking
  // allows reusing tracked features instead of explicitly tracking them in
  // every frame, and can therefore be faster. See the reuse_features_XXX
  // options below. However, if not reusing features, then it is best to match
  // the direction for both internal tracking and output flow, for performance
  // reasons.
  optional FlowDirection internal_tracking_direction = 19 [default = BACKWARD];

  // Direction of flow vectors that are computed and output by calls to
  // retrieve region flow, tracked features, etc. Note when this is BACKWARD,
  // then the returned flow for frame N contains features tracked *from* frame
  // N to a previous frame N-k. When this is FORWARD, the flow for frame N
  // contains the flow from features in a previous frame N-k, tracked *to*
  // frame N.
  // Note that the output flow direction can only be set to FORWARD or
  // BACKWARD.
  optional FlowDirection output_flow_direction = 20 [default = BACKWARD];

  // Specifies how a feature is tracked w.r.t. previous or next frames
  // (dependent on the FlowDirection options above).
  // Per default, each frame is tracked w.r.t. a single neighboring frame
  // (POLICY_SINGLE_FRAME). If associations across multiple frames are
  // desired, POLICY_MULTI_FRAME creates multiple results for the current
  // frame, by tracking features w.r.t. multiple neighbors. Number of
  // neighbors is specified by multi_frames_to_track.
  // If long feature tracks are desired (i.e. a track across a frame pair
  // that is identified to belong to an earlier known feature), use
  // POLICY_LONG_TRACKS. Maximum track length can be specified by
  // long_tracks_max_frames.
  enum TrackingPolicy {
    POLICY_SINGLE_FRAME = 1;  // Tracks w.r.t. previous or next frame.
    POLICY_MULTI_FRAME = 2;   // Tracks w.r.t. multiple frames.
    POLICY_LONG_TRACKS = 3;   // Create long feature tracks.
                              // Requires internal_tracking_direction to be
                              // FORWARD. Checked against.
  }

  optional TrackingPolicy tracking_policy = 25 [default = POLICY_SINGLE_FRAME];

  // Number of frame-pairs used for POLICY_MULTI_FRAME, ignored for other
  // policies.
  // Value of 1 means we are tracking features in the current frame, w.r.t.
  // the previous one. Value of 2 denotes tracking of features in current
  // w.r.t the previous one and the one before the previous one, etc.
  optional int32 multi_frames_to_track = 1 [default = 1];

  // Maximum length of long feature tracks for POLICY_LONG_TRACKS in frames.
  // Note: This maximum is not hard enforced, to avoid that many long
  // tracks are dropped at the same time. Instead if a feature reaches
  // long_tracks_max_frames * 0.8, it will get dropped with a probability of
  // X, where X is calculated, such that 95% of all qualifying features are
  // dropped within the interval [.8, 1.2] * long_tracks_max_frames.
  optional int32 long_tracks_max_frames = 26 [default = 300];

  // Hard limit of maximum number of features. Control density of features,
  // with min_feature_distance option. This limit is to guarantee that the
  // run-time of RegionFlowComputation does not spiral out of control.
  optional int32 max_features = 2 [default = 2000];

  // Specifies the extraction method for features.
  enum CornerExtractionMethod {
    EXTRACTION_HARRIS = 1;       // Using Harris' approximation of
                                 // EXTRACTION_MIN_EIG_VAL.
    EXTRACTION_MIN_EIG_VAL = 2;  // Exact smallest eigenvalue computation.
    EXTRACTION_FAST = 3;         // Extract using FAST feature detector.
  }

  optional CornerExtractionMethod corner_extraction_method = 27
      [default = EXTRACTION_MIN_EIG_VAL];

  // Settings for above corner extraction methods.
  message MinEigValExtractionSettings {
    // Quality level of features (features with
    // min_eig_value < quality_level * max_eig_value are rejected).
    // Here [min|max]_eig_value denote the minimum and maximum eigen value of
    // the auto-correlation matrix of the patch centered at a feature point.
    // The ratio of eigenvalues denotes the "cornerness", lower means more
    // pronounced corners.
    // (see http://en.wikipedia.org/wiki/Harris-Affine for details.)
    optional float feature_quality_level = 1 [default = 0.01];

    // Features below this quality level are always discarded, even if their
    // score is above feature_quality_level() * local maximum within that grid
    // cell. This prevents us from including very poor features.
    optional float adaptive_lowest_quality_level = 2 [default = 8e-5];
  }

  optional MinEigValExtractionSettings min_eig_val_settings = 28;

  message HarrisExtractionSettings {
    // Same as in MinEigValExtractionSettings.
    optional float feature_quality_level = 1 [default = 2.5e-4];
    // Note, due to Harris response being negative for some pixels,
    // no lowest quality level is enforced.
  }

  optional HarrisExtractionSettings harris_settings = 29;

  message FastExtractionSettings {
    // Threshold on difference between intensity of the central pixel and
    // pixels of a circle around this pixel. Empirically, the larger the
    // threshold, the fewer the keypoints will be detected.
    // Default value set as the same with OpenCV.
    optional int32 threshold = 1 [default = 10];
  }

  optional FastExtractionSettings fast_settings = 31;

  optional int32 tracking_window_size = 4 [default = 10];

  optional int32 tracking_iterations = 5 [default = 10];

  // Fractional tracking distance w.r.t. to frame diameter d. The number of
  // pyramid levels l is chosen such that
  // 2^l * tracking_window_size / 2 >= fractional_tracking_distance * d.
  // Therefore, theoretically it is guaranteed that objects moving less than
  // fractional_tracking_distance * d can be tracked.
  optional float fractional_tracking_distance = 6 [default = 0.15];

  // If set, modifies tracking distance to be 130% of maximum average
  // tracking distances of previous frames.
  optional bool adaptive_tracking_distance = 24 [default = false];

  // Minimum feature distance in pixels. Close features are suppressed. If
  // value < 1, the distance is computed as a fraction of the frame diameter.
  optional float min_feature_distance = 7 [default = 7];

  // By default, when downscaling by factor x, the minimum feature distance
  // is downscaled by a factor of sqrt(x). If set false, no scaling is
  // performed.
  optional bool distance_downscale_sqrt = 21 [default = true];

  // Uses grid based extraction of features. Quality level is local within a
  // grid cell and results are combined over all cells and multiple scales and
  // grid offsets.
  // Default option, setting it to false is deprecated and will fail.
  optional bool adaptive_good_features_to_track = 8 [default = true];

  // Size of each grid cell. Values < 1 are interpreted to be relative to
  // frame_width_ x frame_height_.
  optional float adaptive_features_block_size = 9 [default = .26];

  // Scales / levels employed for feature extraction. Grid cell size is scaled
  // by 0.5 for each level.
  optional int32 adaptive_features_levels = 10 [default = 1];

  // If > 1, feature extraction is carried out at multiple scales by
  // downscaling the image repeatedly, extracting features (eigenvalue images)
  // and upscaling them.
  optional int32 adaptive_extraction_levels = 22 [default = 1];

  // Alternate way of specifying extraction levels: number of levels is
  // automatically computed by downsampling the image until its maximum
  // dimension (width or height) reaches this value. Overrides
  // adaptive_extraction_levels if > 0.
  optional int32 adaptive_extraction_levels_lowest_size = 23 [default = 0];

  // Grid step-size in fraction of width or height used for creating synthetic
  // zero motion tracks with feature points lying on a grid. Can be set based
  // on desired number of total features as 1/sqrt(num_features),
  // e.g. .04 ~= 1/sqrt(600).
  optional float synthetic_zero_motion_grid_step = 13 [default = .04];

  // If set, uses ORB features with brute force matching and ratio test
  // to track frames across larger perspective changes than possible with
  // default KLT features.
  optional bool wide_baseline_matching = 14 [default = false];

  // Only brute force matches with
  // best_match_distance < ratio_test_threshold * second_best_match_distance
  // are retained.
  optional float ratio_test_threshold = 15 [default = 0.8];

  // Refines wide baseline matches by estimating affine transform to
  // wide-baseline matches which is used to seed initial positions for KLT
  // matches.
  optional bool refine_wide_baseline_matches = 16 [default = false];

  // When tracking features, features tracked from frame A to frame B may be
  // reused as the features for frame B when tracking from it (instead of
  // extracting features). The max_frame_distance flag limits the distance
  // between A and B for the features to be reused. Setting it to 0 => no
  // re-use.
  optional int32 reuse_features_max_frame_distance = 17 [default = 0];

  // In conjunction with above, the features are reused in frame B only if
  // they are at-least this fraction of the original features in frame A.
  // Otherwise they are reset and extracted from scratch.
  optional float reuse_features_min_survived_frac = 18 [default = 0.7];

  // If set uses newer OpenCV tracking algorithm.
  // Recommended to be set for all new projects.
  optional bool use_cv_tracking_algorithm = 30 [default = true];

  enum KltTrackerImplementation {
    UNSPECIFIED = 0;
    KLT_OPENCV = 1;  // Use OpenCV's implementation of KLT tracker.
  }

  // Implementation choice of KLT tracker.
  optional KltTrackerImplementation klt_tracker_implementation = 32
      [default = KLT_OPENCV];

  // Deprecated fields.
  extensions 3, 11, 12;
}

// Next tag: 67
message RegionFlowComputationOptions {
  optional TrackingOptions tracking_options = 1;

  // Features are binned into grids of different resolutions (see
  // fast_estimation_block_size below) and retained if they survive a
  // localized translation based RANSAC algorithm and the survivors are at
  // least of size min_feature_inliers. Must be at least 3!
  optional int32 min_feature_inliers = 2 [default = 3];

  // Relative number of inlier features w.r.t. average number of features
  // per grid bin. Maximum of both thresholds is used as actual threshold.
  optional float relative_min_feature_inliers = 46 [default = 0.2];

  // Pre-blur before computing features to reduce noise. Set to zero for no
  // blurring.
  optional float pre_blur_sigma = 33 [default = 0.8];

  // Number of ransac rounds to estimate per region flow vector. This could be
  // adaptive, but the required number of rounds is so low, that estimating
  // the bound is more costly than just running it for a fixed number of
  // times.
  optional int32 ransac_rounds_per_region = 3 [default = 15];

  // Error thresholds for a feature to be considered as an inlier in
  // pixel-distance. The max of all three thresholds below is used as the
  // actual threshold.
  // Absolute in pixels.
  optional float absolute_inlier_error_threshold = 4 [default = 2];

  // Scaled w.r.t. frame diameter.
  optional float frac_inlier_error_threshold = 52 [default = 0];

  // Scaled w.r.t model estimated during each RANSAC round.
  optional float relative_inlier_error_threshold = 44 [default = 0.1];

  // Returns for each grid only the top N inlier sets.
  optional int32 top_inlier_sets = 45 [default = 2];

  // For debugging purposes, uses all tracked features regardless of the above
  // setting.
  optional bool no_estimation_mode = 40 [default = false];

  // Block size in pixels. If fractional block_size is used (0 < size < 1),
  // it is interpreted as fraction of the image dimensions.
  // We use 4 blocks in each dimension by standard.
  optional float fast_estimation_block_size = 6 [default = .25];

  // Minimum block size in pixels (larger dimension) to perform fast
  // estimation on. Pyramid levels are allocated such that
  // block_size * 0.5^(level - 1) = min_block_size.
  // At least two levels are used.
  optional int32 fast_estimation_min_block_size = 25 [default = 100];

  // We use overlapping versions of the grid, next parameters specifies how
  // many in each dimensions (total is therefore, the value squared!).
  optional int32 fast_estimation_overlap_grids = 22 [default = 3];

  // Flow features with motion above this thresholds (w.r.t. frame diameter)
  // are rejected.
  optional float max_magnitude_threshold_ratio = 23 [default = 0.2];

  // Flow features that have a motion that is larger than
  // median_magnitude_bounds times the median magnitude are discarded.
  // If set to zero, test is not enforced.
  optional float median_magnitude_bounds = 51 [default = 0.0];

  // Determines how irls weights for computed features are initialized.
  // In general, more stable features are given higher weight.
  enum IrlsInitialization {
    INIT_UNIFORM = 1;  // All weights equal 1.
    // Feature's irls weight is initialized to a value in [0, 2]
    // indicating how consistent the feature's motion is w.r.t. neighboring
    // features (high values = very consistent). Determined by counting how
    // often a feature is part of the inlier set for a particular bin.
    INIT_CONSISTENCY = 2;
  }

  // If this option is activated, feature's irls weight is initialized to the
  // inverse of its computed flow.
  optional IrlsInitialization irls_initialization = 49
      [default = INIT_CONSISTENCY];

  // We support down-sampling of an incoming frame before running the
  // resolution dependent part of the region flow computation (feature
  // extraction and tracking if desired).
  // Note that in all downsampling modes except for DOWNSAMPLE_TO_INPUT_SIZE,
  // for uneven dimensions after downsampling, we always round up to
  // the nearest even dimension, i.e. 350p with a downsample_factor of 2.0
  // would expect an input of size 176p.
  enum DownsampleMode {
    // No downsampling.
    DOWNSAMPLE_NONE = 1;
    // Downsizes the input frame such that frame_size == downsampling_size,
    // where frame_size := max(width, height).
    DOWNSAMPLE_TO_MAX_SIZE = 2;
    // Downsizes frame by pre-defined factor, downsample_factor below.
    DOWNSAMPLE_BY_FACTOR = 3;
    // Downsampling based on downsampling schedule, see DownsampleSchedule
    // below for details.
    DOWNSAMPLE_BY_SCHEDULE = 4;
    // Downsizes the input frame such that frame_size == downsampling_size,
    // where frame_size := min(width, height).
    DOWNSAMPLE_TO_MIN_SIZE = 5;
    // Input frame is assumed to be already downsampled by the factor
    // specified by downsample_factor below. For example if the original frame
    // is 720p, and downsample_factor is set to 2.0, then we expect as input
    // 360p.
    DOWNSAMPLE_TO_INPUT_SIZE = 6;
  }

  optional DownsampleMode downsample_mode = 11 [default = DOWNSAMPLE_NONE];

  // Specify the size of either dimension here, the frame will be
  // downsampled to fit downsampling_size.
  optional int32 downsampling_size = 12 [default = 256];

  optional float downsample_factor = 18 [default = 2.0];

  // If set, we will force the computed downsampling factor to be the nearest
  // integer, resulting in faster downsampling. This will have no effect for
  // DOWNSAMPLE_TO_INPUT_SIZE, DOWNSAMPLE_BY_FACTOR, and
  // DOWNSAMPLE_BY_SCHEDULE, which should have exact values defined.
  optional bool round_downsample_factor = 62 [default = false];

  // Downsampling schedule. Frame sizes up to which a particular downsampling
  // factor is applied. Factor chosen by comparing actual frame area against
  // standard area (standard_width * standard_height), where standard_width =
  // 16/9 X standard_height.
  message DownSampleSchedule {
    optional float downsample_factor_360p = 1 [default = 1];   // For <= 360p.
    optional float downsample_factor_480p = 2 [default = 1];   // For <= 480p.
    optional float downsample_factor_720p = 3 [default = 2];   // For <= 720p.
    optional float downsample_factor_1080p = 4 [default = 2];  // >= 720p.
  }

  // Used if downsample_mode is DOWNSAMPLE_BY_SCHEDULE.
  optional DownSampleSchedule downsample_schedule = 19;

  // Minimum number of good features that we require to be present.
  // Without good features, the estimated motion models will do more harm than
  // good, so it is better to use simply the identity transform for this
  // frame, and set the flag unstable_models to true in RegionFlow.
  optional int32 min_feature_requirement = 13 [default = 20];

  // We also require features to cover a minimum percentage area of the frame.
  // We use downsampling and plot each feature by a 1 in a grid, this is
  // equivalent to plotting each feature by a rectangle in the original frame.
  optional float min_feature_cover = 14 [default = 0.15];

  // Grid size for above min feature cover.
  optional int32 min_feature_cover_grid = 20 [default = 8];

  // Computes blur score for each frame. Score is proportional to amount of
  // blur present in a frame, i.e. higher scores reflect more blurred frames.
  // Note that the score is dependent on the gradient distribution of the
  // image content, i.e. the score itself is rather meaningless but needs to
  // be compared to scores of neighboring frames.
  optional bool compute_blur_score = 17 [default = false];

  message BlurScoreOptions {
    // Blur score is only computed over image regions of high cornerness
    // (as blur in any direction will always alter these regions). First, the
    // corner image (smallest eigenvalue of 2nd moment matrix) is box
    // filtered, and then thresholded.
    optional int32 box_filter_diam = 1 [default = 3];

    // Specifies relative (w.r.t. maximum) and absolute cornerness threshold
    // for threshold operation.
    optional float relative_cornerness_threshold = 2 [default = 3e-2];
    optional float absolute_cornerness_threshold = 3 [default = 1e-4];

    // Blur score is defined as the reciprocal of a cornerness percentile,
    // i.e. 1.0 / (the median_percentile-th percentile of the cornerness)
    // evaluated over the image regions of high cornerness as specified
    // above. (NOTE(review): original comment text was garbled here, likely
    // by stripped angle-bracket placeholders — confirm against
    // implementation.)
    optional float median_percentile = 5 [default = 0.85];
  }

  optional BlurScoreOptions blur_score_options = 31;

  // Determines how/if visual consistency is computed. If activated,
  // computes the absolute *change* in visual difference between two adjacent
  // frame pairs, i.e. the modulus of the 2nd derivative of the frame
  // appearance. Stores result in RegionFlowFeatureList::visual_consistency.
  message VisualConsistencyOptions {
    // Computation of visual consistency is only performed if activated.
    optional bool compute_consistency = 1 [default = true];

    // Incoming color or gray scale image is scaled to a tiny square image of
    // the specified dimension. Used to compare adjacent images via SSD.
    optional int32 tiny_image_dimension = 2 [default = 20];
  }

  optional VisualConsistencyOptions visual_consistency_options = 55;

  // Radius of patch descriptor computed during RetrieveRegionFlowFeatureList
  // call.
  optional int32 patch_descriptor_radius = 21 [default = 3];

  // Minimum distance from image border. Must be greater or equal to
  // patch_descriptor_radius.
  optional int32 distance_from_border = 50 [default = 3];

  // Corner response is scaled by scalar below and normalized to lie within
  // [0, 1], where 0 is low corner score and 1 high corner score.
  optional float corner_response_scale = 26 [default = 1500];

  // Verifies reliability of features, by back-tracking operation from matched
  // location. If returned location is within verification_distance feature is
  // accepted otherwise discarded.
  optional bool verify_features = 27 [default = false];

  optional float verification_distance = 28 [default = 0.5];

  // If set, consistency of long features is verified (in case
  // tracking_policy is set to POLICY_LONG_TRACKS) by extracting a patch
  // around the feature during the very first observation and comparing the
  // matching patch along the long feature trajectory via SSD. If the
  // difference is above the long_feature_verification_threshold the feature
  // is removed.
  optional bool verify_long_features = 53 [default = true];

  // Maximum average per pixel error (in L1 norm) in the normalized intensity
  // domain for matching patches to be considered to be consistent.
  optional float long_feature_verification_threshold = 54 [default = 0.04];

  // Long features are expected to have limited acceleration over time.
  // If acceleration exceeds specified value based on the setting in
  // verify_long_feature_acceleration either:
  // a) verify_long_feature_acceleration = false
  //    A new track is started instead of continuing the old one.
  //    The track itself is not removed in this case.
  //
  // b) verify_long_feature_acceleration = true
  //    The track is flagged for verification, by back-tracking operation from
  //    matched location. If track fails verification test it is
  //    discarded. This only triggers if at least
  //    verify_long_feature_trigger_ratio of features have been flagged,
  //    otherwise option a is used.
  optional float max_long_feature_acceleration = 56 [default = 5.0];

  optional bool verify_long_feature_acceleration = 63 [default = false];

  optional float verify_long_feature_trigger_ratio = 64 [default = 0.0];

  // If true, histogram equalization is performed to the input image sequence
  // before registration.
  optional bool histogram_equalization = 57 [default = false];

  // If true, synthetic region flows with zero motion are used for all (or
  // just the first) frame.
  optional bool use_synthetic_zero_motion_tracks_all_frames = 34
      [default = false];
  optional bool use_synthetic_zero_motion_tracks_first_frame = 35
      [default = false];

  // Optional gain correction before tracking features. Improves robustness
  // when lighting is changing.
  optional bool gain_correction = 36 [default = false];

  // If set performs gain correction by simply equalizing mean intensity
  // between frames, instead of using ToneEstimation.
  optional bool fast_gain_correction = 61 [default = false];

  // If the multiple hypothesis flag is set, features are tracked using both
  // with and without gain correction, and the hypothesis with more inliers
  // is selected.
  optional bool gain_correction_multiple_hypotheses = 47 [default = true];

  // This flag, when used together with the multiple hypotheses flag,
  // specifies that gain correction should increase the number of inliers by
  // at least this fraction for it to be used instead of default tracking.
  optional float gain_correction_inlier_improvement_frac = 48 [default = 0.1];

  // If set, always uses the brighter frame as reference. This is the
  // preferred direction of correction, to avoid overexposed regions from
  // being corrected which leads to spurious matches.
  optional bool gain_correction_bright_reference = 59 [default = false];

  // Only performs gain correction if number of tracked features falls under
  // specified ratio (w.r.t. previous frame).
  // Set to zero, to always perform gain correction if requested.
  optional float gain_correction_triggering_ratio = 60 [default = 0.0];

  // Gain correction is based on a grid of zero motion features, independent
  // of the underlying motion. Fractional parameter specifies resolution of
  // the grid w.r.t. frame size.
  optional float frac_gain_feature_size = 37 [default = 0.3];

  optional float frac_gain_step = 38 [default = 0.1];

  enum GainCorrectMode {
    GAIN_CORRECT_DEFAULT_USER = 1;  // Uses default or user supplied bounds,
                                    // i.e. gain_bias_bounds is left
                                    // untouched.
    GAIN_CORRECT_VIDEO = 2;         // Uses defaults for video (most strict).
    GAIN_CORRECT_HDR = 3;           // Uses most relaxed settings to track
                                    // across HDR frames, taken at different
                                    // exposures.
    GAIN_CORRECT_PHOTO_BURST = 4;   // More relaxed than video but stricter
                                    // than HDR; use for photo burst where
                                    // exposure between frames can change.
  }

  optional GainCorrectMode gain_correct_mode = 41
      [default = GAIN_CORRECT_DEFAULT_USER];

  // Bounds for the estimated model. If not set externally, will be set
  // based on GainCorrectMode.
  optional ToneEstimationOptions.GainBiasBounds gain_bias_bounds = 39;

  // Supported image formats. All images are converted to grayscale
  // before processing. These image formats only concern AddImage.
  // IMPORTANT: All the Retrieve* methods expect RGB when the descriptors
  // are computed.
  enum ImageFormat {
    FORMAT_GRAYSCALE = 1;
    FORMAT_RGB = 2;
    FORMAT_RGBA = 3;
    FORMAT_BGR = 4;
    FORMAT_BGRA = 5;
  }

  // Image format of the input.
  optional ImageFormat image_format = 58 [default = FORMAT_RGB];

  enum DescriptorExtractorType {
    // http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.370.4395&rep=rep1&type=pdf
    ORB = 0;
  }

  // The descriptor extractor type used.
  optional DescriptorExtractorType descriptor_extractor_type = 65
      [default = ORB];

  // Whether to compute derivatives when building the pyramid. When set to
  // true, it's building a Laplacian pyramid. When set to false, it's building
  // a Gaussian pyramid.
  optional bool compute_derivative_in_pyramid = 66 [default = true];

  // Deprecated fields.
  extensions 5, 7, 8, 9, 10, 15, 16, 24, 29, 30, 32, 42, 43;
}