mediapipe/mediapipe/util/tracking/region_flow.proto

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

option java_package = "com.google.mediapipe.tracking";
option java_multiple_files = true;

// Captures additional information about a RegionFlowFeature's
// surrounding patch.
// When produced via MotionEstimation::RetrieveRegionFlowFeatureList or
// ComputeRegionFlowFeatureDescriptors, the patch descriptor has the following
// layout:
// 9 dimensional: 3 mean intensities followed by a 3x3 covariance matrix, of
// which only the upper half (6 elems) is stored, in column major order, i.e.
// indices for data in the patch descriptor refer to:
// mean: 0 1 2,  covariance: 3 4 5
//                             6 7
//                               8
message PatchDescriptor {
  repeated float data = 1;  // The actual feature descriptor.
}

// Binary feature descriptor for a particular feature.
// For example: ORB
// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.370.4395&rep=rep1&type=pdf
message BinaryFeatureDescriptor {
  // Raw descriptor bytes. Length and bit layout depend on the descriptor type
  // that produced them and are not specified by this message.
  optional bytes data = 1;
}

// Internal data structure used during temporal IRLS smoothing.
// Both fields default to zero.
// NOTE(review): presumably weight_sum accumulates the weights and value_sum
// the correspondingly weighted values over time — confirm against the
// smoothing implementation.
message TemporalIRLSSmoothing {
  optional float weight_sum = 1 [default = 0];
  optional float value_sum = 2 [default = 0];
}

// Tracked feature at location (x, y) with flow (dx, dy) and a patch-based
// error (sum of absolute values of intensity differences).
// NOTE(review): the tracking_error field below documents the same error as
// SSD instead — confirm which metric is actually stored.
// Next tag: 19
message RegionFlowFeature {
  // Location of the feature.
  optional float x = 1 [default = 0];
  optional float y = 2 [default = 0];
  // Flow vector at the feature location.
  optional float dx = 3 [default = 0];
  optional float dy = 4 [default = 0];

  // Features that belong to the same feature track are assigned a unique id
  // and are identified via it.
  // Note, this id is only unique within the lifetime of a RegionFlowComputation
  // object. That is, if distribution or parallelization using multiple
  // instances was used, the ids are only unique within that instance context.
  optional int32 track_id = 13 [default = -1];  // no id.

  // Tracking error as patch intensity residual (SSD).
  optional float tracking_error = 5 [default = 0];

  // Inverse of registration error (in pixels), after parametric motion model
  // fitting. Values are in [0, 1e6].
  // Low values correspond to outliers, high values to inliers.
  // Set by MotionEstimation::EstimateMotions*
  optional float irls_weight = 6 [default = 1.0];

  // Corner response (computed as minimum eigenvalue of
  // block filtered 2nd moment matrix).
  optional float corner_response = 11 [default = 0.0];

  // Patch feature descriptors. *For internal use only*. External clients should
  // not rely on their contents.
  optional PatchDescriptor feature_descriptor = 7;
  optional PatchDescriptor feature_match_descriptor = 8;

  // Internal data structure used temporarily during temporal IRLS smoothing.
  optional TemporalIRLSSmoothing internal_irls = 10;

  // Optional label for debugging purposes.
  optional string label = 14;

  // Flags indicating specific statuses.
  enum Flags {
    FLAG_BROKEN_TRACK = 1;  // Used for long feature tracks if track id
                            // was reset.
  }

  // Status flags of this feature, see enum Flags.
  // NOTE(review): stored as a plain int32, presumably a bitwise combination
  // of Flags values — confirm against the code that sets it.
  optional int32 flags = 15;

  // Unique feature id per RegionFlowComputation object.
  optional int32 feature_id = 16;

  // Octave (pyramid layer) from which the keypoint has been extracted.
  optional int32 octave = 17 [default = 0];

  // Feature descriptor for the current feature.
  optional BinaryFeatureDescriptor binary_feature_descriptor = 18;

  // Deprecated fields. Their numbers are declared as an extension range, so
  // they cannot be reused for regular fields of this message.
  extensions 9, 12;
}

// RegionFlowFrame is an optical flow representation where each region has a
// consistent optical flow (adheres to local translational model).
// Regions are arranged in a regular grid according to BlockDescriptor.
// Next tag: 11.
message RegionFlowFrame {
  // Flow of a single region together with the features it contains.
  // Next tag: 8
  message RegionFlow {
    // Id of the region; region_flow entries in the enclosing frame are sorted
    // by this id. Required, i.e. messages without it fail to parse.
    required int32 region_id = 1;

    // Mean anchor point (centroid) of flow vector and mean flow.
    optional float centroid_x = 2 [default = 0];
    optional float centroid_y = 3 [default = 0];
    optional float flow_x = 4 [default = 0];
    optional float flow_y = 5 [default = 0];

    // Features belonging to this region.
    repeated RegionFlowFeature feature = 7;

    // Deprecated fields. The number is declared as an extension range, so it
    // cannot be reused for a regular field of this message.
    extensions 6;
  }

  // Sorted by id for quick lookup.
  repeated RegionFlow region_flow = 1;

  // Total number of features in all RegionFlow's.
  optional int32 num_total_features = 2 [default = 0];

  // If set, indicates that the frame's region flow is unstable
  // (not enough features or coverage too low).
  optional bool unstable_frame = 4 [default = false];

  // Blur score of the current frame is defined as the n-th percentile
  // of the cornerness of the input frame evaluated over regions of high
  // cornerness. For details see BlurScoreOptions in
  // region_flow_computation.proto.
  // The actual value is pretty meaningless, but relative to the blur score
  // of other frames one can detect blurry frames, e.g. by a 'significant'
  // local maximum in a sequence of blur_scores.
  optional float blur_score = 7;

  // Dimensions of the underlying frame.
  // NOTE(review): presumably in pixels — confirm.
  optional int32 frame_width = 8;
  optional int32 frame_height = 9;

  // Region flow is estimated using a grid of equal sized bins as regions.
  // BlockDescriptor specifies size of bins/blocks.
  message BlockDescriptor {
    // Size of a single block.
    // NOTE(review): presumably in pixels — confirm.
    optional int32 block_width = 1;
    optional int32 block_height = 2;
    // Number of blocks along the x and y direction, respectively.
    optional int32 num_blocks_x = 3 [default = 0];
    optional int32 num_blocks_y = 4 [default = 0];
  }
  // Grid layout used for the regions of this frame.
  optional BlockDescriptor block_descriptor = 10;

  // Deprecated fields. Their numbers are declared as an extension range, so
  // they cannot be reused for regular fields of this message.
  extensions 3, 5, 6;
}

// Encapsulates a list of features with associated flow.
// Can be extracted from RegionFlow via GetRegionFlowFeatureList
// declared in region_flow.h. This is the essential (additional) information
// required by Cropper using wobble_suppression with displacements.
// Next tag: 14
message RegionFlowFeatureList {
  // The features of this list.
  repeated RegionFlowFeature feature = 1;
  // Dimensions of the underlying frame.
  // NOTE(review): presumably in pixels — confirm.
  optional int32 frame_width = 2;
  optional int32 frame_height = 3;

  // Set from corresponding RegionFlowFrame field.
  optional bool unstable = 4 [default = false];

  // Records the minimum distance from the image border for each feature and
  // matching feature (if enforced > 0).
  optional int32 distance_from_border = 5 [default = 0];

  // Set from corresponding RegionFlowFrame field.
  optional float blur_score = 6;

  // If set, indicates that features represent long tracks, i.e. each feature
  // has a valid track_id() >= 0.
  optional bool long_tracks = 7 [default = false];

  // If long_tracks, stores number of long feature tracks that got rejected in
  // this frame, as their patches were deemed inconsistent with the track's very
  // first extracted patch.
  // NOTE(review): the field name and float type suggest this is a fraction
  // rather than an absolute count — confirm against the code that sets it.
  optional float frac_long_features_rejected = 8 [default = 0];

  // Measures visual consistency between adjacent frames. In particular, stores
  // the absolute *change* in visual difference between two adjacent frame
  // pairs, i.e. the modulus of the 2nd derivative of the frame appearance.
  // Normalized w.r.t. number of channels and total pixels of the underlying
  // frame.
  // In particular for sudden changes (e.g. shot boundaries) this value will
  // be significantly non-zero (> 0.05).
  // Negative value per default indicates no consistency has been computed.
  optional float visual_consistency = 9 [default = -1];

  // Timestamp in microseconds of the underlying frame, that is the frame
  // for which the source features (not matching features) were computed.
  optional int64 timestamp_usec = 10 [default = 0];

  // Denotes the frame that flow was computed w.r.t., locally to the current
  // frame. For example, if current frame is N, N + match_frame is the matching
  // frame that flow was computed to.
  // Values < 0 indicate backward tracking, while values > 0 indicate forward
  // tracking. By default, for empty feature lists, matching frame is the
  // same as current frame, i.e. match_frame = 0.
  optional int32 match_frame = 11 [default = 0];

  // Set, if frame is estimated to be an exact duplicate of the previous frame.
  optional bool is_duplicated = 12 [default = false];

  // Stores all the tracked ids that have been actively discarded in this frame.
  // This information will be populated via RegionFlowFeatureList, so that
  // downstream modules can receive it and use it to avoid misjudging
  // tracking continuity.
  // Discard reasons:
  // (1) A tracked feature has too long a track, which might create drift.
  // (2) A tracked feature lies in a very dense area, which provides little
  //     value.
  repeated int32 actively_discarded_tracked_ids = 13;
}

// Salient point location (normalized w.r.t. frame_width and frame_height, i.e.
// specified in the domain [0, 1] x [0, 1]).
//
// For TYPE_INCLUDE:
// During retargeting and stabilization salient points introduce constraints
// that will try to keep the normalized location in the rectangle
// frame_size - normalized bounds.
// Soft constraints are used for this, therefore the weight specifies
// how "important" the salient point is (higher is better).
// In particular for each point p the retargeter introduces two pairs of
// constraints of the form:
//           x - slack < width - right
//    and    x + slack > 0 + left,        with slack > 0
//  where the weight specifies the importance of the slack.
//
// For TYPE_EXCLUDE_*:
// Similar to above, but constraints are introduced to keep
// the point to the left of the left bound OR the right of the right bound.
// In particular:
//          x - slack < left OR
//          x + slack >= right
//  Similar to above, the weight specifies the importance of the slack.
//
// Note: Choosing a too high weight can lead to
// jerkiness as the stabilization essentially starts tracking the salient point.
message SalientPoint {
  // Normalized location of the point (within domain [0, 1] x [0, 1]).
  optional float norm_point_x = 1 [default = 0.0];
  optional float norm_point_y = 2 [default = 0.0];

  // How the salient point constrains the framing, see the message comment.
  enum SalientPointType {
    TYPE_INCLUDE = 1;
    TYPE_EXCLUDE_LEFT = 2;
    TYPE_EXCLUDE_RIGHT = 3;
  }

  // Salient point type. By default we try to frame the salient point within
  // the bounding box specified by left, bottom, right, top. Alternatively, one
  // can choose to exclude the point. For details, see discussion above.
  optional SalientPointType type = 11 [default = TYPE_INCLUDE];

  // Bounds are specified in normalized coordinates [0, 1], FROM the specified
  // border. Opposing bounds (e.g. left and right) may not add to values
  // larger than 1.
  // Default bounds center salient point within centering third of the frame.
  optional float left = 3 [default = 0.3];
  optional float bottom = 4 [default = 0.3];
  optional float right = 9 [default = 0.3];
  optional float top = 10 [default = 0.3];

  // Importance of the soft constraints introduced by this point (higher means
  // more important), see the message comment. Defaults to 15.
  optional float weight = 5 [default = 15];

  // In addition salient point can represent a region of interest (defined as
  // ellipse of size norm_major x norm_minor (normalized to [0, 1] domain)
  // whose orientation is given by angle (in radians in [0, pi])).
  // Due to aspect ratio change of the normalized domain, it is recommended that
  // transformations to other domains are done via the ScaleSalientPoint
  // function.
  optional float norm_major = 6;
  optional float norm_minor = 7;

  // Angle of major axis with x-axis (counter-clockwise, in radians).
  optional float angle = 8;

  // Field numbers from 20000 upwards are available for extensions.
  extensions 20000 to max;
}

// Aggregates SalientPoint's for a frame.
message SalientPointFrame {
  // Salient points of the frame.
  repeated SalientPoint point = 1;

  // Field numbers from 20000 upwards are available for extensions.
  extensions 20000 to max;
}