// mediapipe/mediapipe2/examples/desktop/autoflip/quality/cropping.proto

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe.autoflip;

import "mediapipe/examples/desktop/autoflip/autoflip_messages.proto";
import "mediapipe/examples/desktop/autoflip/quality/kinematic_path_solver.proto";

// All relevant information for key frames, including timestamp and detected
// features. This object should be generated by calling PackKeyFrameInfo() in
// the util namespace. It is passed in to ComputeFrameCropRegion().
message KeyFrameInfo {
  // Frame timestamp (in microseconds).
  // NOTE(review): the field name carries an "_ms" suffix while this comment
  // says microseconds; confirm the actual unit against the code that
  // populates this field before relying on either.
  optional int64 timestamp_ms = 1;
  // Detected features.
  // NOTE(review): DetectionSet is not defined in this file; presumably it
  // comes from the imported autoflip_messages.proto -- confirm.
  optional DetectionSet detections = 2;
}

// User-specified key frame crop options (such as target width and height).
message KeyFrameCropOptions {
  // Target crop width.
  // Note: if you are using the SceneCroppingCalculator, DO NOT set this field
  // or target_height manually, as they will then be overwritten inside the
  // calculator.
  optional int32 target_width = 1;
  // Target crop height. See the note on target_width.
  optional int32 target_height = 2;
  // Option for how region score is aggregated from individual feature scores.
  // TODO: consider merging this enum type into the signal fusing
  // calculator.
  enum ScoreAggregationType {
    // Unknown value (should not be used).
    UNKNOWN = 0;
    // Takes the score of the feature with maximum score.
    MAXIMUM = 1;
    // Takes the sum of the scores of the required regions.
    SUM_REQUIRED = 2;
    // Takes the sum of the scores of all the regions that are fully covered.
    SUM_ALL = 3;
    // Uses a constant score 1.0 for all crop regions.
    CONSTANT = 4;
  }
  // Selected score aggregation method. Defaults to SUM_ALL when unset.
  optional ScoreAggregationType score_aggregation_type = 3 [default = SUM_ALL];
  // Minimum centered coverage fraction (in length, not area) for a non-required
  // region to be included in the crop region. Applies to both dimensions.
  // Defaults to 0.5 when unset.
  optional float non_required_region_min_coverage_fraction = 4 [default = 0.5];
}

// Key frame crop result containing the crop region rectangle, along with
// summary information on the cropping, such as whether all required regions
// could fit inside the target size, and what fraction of non-required regions
// are fully covered. This object is returned by ComputeFrameCropRegion() in
// the FrameCropRegionComputer class.
message KeyFrameCropResult {
  // Whether the crop successfully covers all required features within the
  // target size. If there are no required regions, this field is set to true.
  optional bool are_required_regions_covered_in_target_size = 1;
  // Fraction of non-required features covered.
  optional float fraction_non_required_covered = 2;
  // Whether required crop region is empty (no detections).
  optional bool required_region_is_empty = 3;
  // Whether (full) crop region is empty (no detections).
  optional bool region_is_empty = 4;
  // Computed required crop region.
  // NOTE(review): Rect is not defined in this file; presumably it comes from
  // the imported autoflip_messages.proto -- confirm.
  optional Rect required_region = 5;
  // Computed (full) crop region.
  optional Rect region = 6;
  // Score of the computed crop region based on the detected features.
  optional float region_score = 7;
  // Frame timestamp (in microseconds).
  // NOTE(review): the field name carries an "_ms" suffix while this comment
  // says microseconds; confirm the actual unit with the producing code.
  optional int64 timestamp_ms = 8;
}

// Compact processed scene key frame info containing timestamp, center position,
// and score. Each key frame has one SceneKeyFrameCompactInfo in
// SceneKeyFrameCropSummary (see its key_frame_compact_infos field).
message SceneKeyFrameCompactInfo {
  // Key frame timestamp (in microseconds).
  // NOTE(review): the field name carries an "_ms" suffix while this comment
  // says microseconds; confirm the actual unit with the producing code.
  optional int64 timestamp_ms = 1;
  // Key frame crop region center in the horizontal direction (in pixels).
  optional float center_x = 2;
  // Key frame crop region center in the vertical direction (in pixels).
  optional float center_y = 3;
  // Key frame crop region score.
  optional float score = 4;
}

// Summary information for the key frame crop results in a scene. Computed by
// AnalyzeSceneKeyFrameCropResults() in the SceneCameraMotionAnalyzer class.
// Used to decide camera motion type and populate salient point frames.
message SceneKeyFrameCropSummary {
  // Scene frame width.
  optional int32 scene_frame_width = 1;
  // Scene frame height.
  optional int32 scene_frame_height = 2;

  // Number of key frames in the scene.
  optional int32 num_key_frames = 3;
  // Scene key frame compact infos (one entry per key frame).
  repeated SceneKeyFrameCompactInfo key_frame_compact_infos = 4;

  // The minimum/maximum values of key frames' crop centers in the horizontal
  // (x) and vertical (y) directions.
  optional float key_frame_center_min_x = 5;
  optional float key_frame_center_max_x = 6;
  optional float key_frame_center_min_y = 7;
  optional float key_frame_center_max_y = 8;

  // The union of all the key frame required crop regions. When the camera is
  // steady, the crop window is set to cover this union.
  optional Rect key_frame_required_crop_region_union = 9;

  // The minimum/maximum scores of key frames' crop regions.
  optional float key_frame_min_score = 10;
  optional float key_frame_max_score = 11;

  // Size of the scene's crop window, calculated as the maximum of the target
  // size and the largest size of the key frames' crop regions in the scene.
  optional int32 crop_window_width = 12;
  optional int32 crop_window_height = 13;

  // Indicator for whether the scene has any frame with any salient region.
  optional bool has_salient_region = 14;
  // Indicator for whether the scene has any frame with any required salient
  // region.
  optional bool has_required_salient_region = 15;
  // Percentage of key frames that are successfully cropped (i.e. cover all
  // required regions inside the target size).
  // NOTE(review): "percentage" may actually mean a 0-1 fraction; the scale is
  // not visible in this file -- confirm with the code that sets this field.
  optional float frame_success_rate = 16;
  // Amount of motion in the horizontal direction (i.e. the horizontal range of
  // the key frame crop centers' position as a fraction of frame width).
  optional float horizontal_motion_amount = 17;
  // Amount of motion in the vertical direction (i.e. the vertical range of
  // the key frame crop centers' position as a fraction of frame height).
  optional float vertical_motion_amount = 18;
}

// Scene camera motion determined by the SceneCameraMotionAnalyzer class.
// Exactly one motion type can be set at a time (see the motion_type oneof).
message SceneCameraMotion {
  // Camera focuses on a fixed center throughout the scene.
  message SteadyMotion {
    // Steady look-at center in the horizontal direction (in pixels).
    optional float steady_look_at_center_x = 1;
    // Steady look-at center in the vertical direction (in pixels).
    optional float steady_look_at_center_y = 2;
  }
  // Camera tracks key frame salient region centers.
  // This message is currently an empty placeholder.
  message TrackingMotion {
    // Fields to be added if necessary.
  }
  // Camera sweeps from one point to another.
  message SweepingMotion {
    // Starting center position for camera sweeping (in pixels).
    optional float sweep_start_center_x = 1;
    optional float sweep_start_center_y = 2;
    // Ending center position for camera sweeping (in pixels).
    optional float sweep_end_center_x = 3;
    optional float sweep_end_center_y = 4;
  }
  // The selected camera motion; setting one member clears the others.
  oneof motion_type {
    SteadyMotion steady_motion = 1;
    TrackingMotion tracking_motion = 2;
    SweepingMotion sweeping_motion = 3;
    // Other types that we might support later.
  }
}

// User-specified options for analyzing scene camera motion from a collection of
// key frame crop regions.
message SceneCameraMotionAnalyzerOptions {
  // Field number 9 is reserved so that it cannot be reused.
  // NOTE(review): presumably it belonged to a since-removed field -- confirm.
  reserved 9;
  // If there is small motion within the scene, keep the camera steady at the
  // center. Defaults to 0.30.
  // NOTE(review): despite the "_percent" suffix the default is a fractional
  // value; confirm whether the expected scale is 0-1 or 0-100.
  optional float motion_stabilization_threshold_percent = 1 [default = .30];
  // Snap to center if there is small motion and the camera is already focused
  // close to the center. Defaults to 0.08.
  // NOTE(review): same "_percent" vs. fractional default question as above.
  optional float snap_center_max_distance_percent = 2 [default = .08];
  // Maximum weight for a constraint. Scales scores accordingly so that the
  // maximum score is equal to this weight. Defaults to 100.0.
  optional float maximum_salient_point_weight = 3 [default = 100.0];
  // Normalized bound for SalientPoints in the frame from the border. This is
  // uniformly applied to the left, right, top, and bottom. It should be
  // strictly less than 0.5. A narrower bound (closer to 0.5) gives better
  // constraint enforcement. Defaults to 0.48.
  optional float salient_point_bound = 4 [default = 0.48];
  // Indicator for whether sweeping is allowed. Note that if a scene can be
  // seamlessly padded with solid background color, sweeping will be disabled
  // regardless of the value of this flag. Defaults to true.
  optional bool allow_sweeping = 5 [default = true];
  // Minimal scene time span in seconds to allow camera sweeping. Defaults to
  // 1.0.
  optional float minimum_scene_span_sec_for_sweeping = 6 [default = 1.0];
  // If success rate in a scene is less than this, then use camera sweeping.
  // Defaults to 0.4.
  optional float minimum_success_rate_for_sweeping = 7 [default = 0.4];
  // If true, sweep entire frame. Otherwise, sweep the crop window. Defaults to
  // true.
  optional bool sweep_entire_frame = 8 [default = true];
  // When no salient region is received, the default behavior is to return the
  // camera to the center-focused location. When this flag is set to a value
  // >0, the camera will remain at its last position for this amount of time
  // before recentering (if the last scene camera motion type was steady).
  // No explicit default is declared, so an unset field reads as 0.
  // NOTE(review): the "_us" suffix suggests microseconds -- confirm.
  optional int64 duration_before_centering_us = 10;
}

// Video cropping summary information for debugging/statistics.
message VideoCroppingSummary {
  // Cropping summary for a single scene of the video.
  message SceneCroppingSummary {
    // Start of the scene span, in seconds.
    optional float start_sec = 1;
    // End of the scene span, in seconds.
    optional float end_sec = 2;
    // Indicator for whether this scene was cut at a real physical scene
    // boundary (as opposed to force flush).
    optional bool is_end_of_scene = 3;
    // Scene camera motion.
    optional SceneCameraMotion camera_motion = 4;
    // Indicator for whether the scene is padded.
    optional bool is_padded = 5;
  }
  // Cropping summaries for all the scenes in the video.
  repeated SceneCroppingSummary scene_summaries = 1;
}

// Options selecting the camera model used to compute a smooth camera path.
// At most one of the models in camera_model_oneof is set at a time.
message CameraMotionOptions {
  // Options for the polynomial-regression-based path solver.
  message PolynomialRegressionPathSolver {
    // Number of frames from prior buffer to be used to smooth out camera
    // trajectory when it was a forced flush. Defaults to 30.
    optional int32 prior_frame_buffer_size = 1 [default = 30];
  }
  // The selected camera model; setting one member clears the other.
  oneof camera_model_oneof {
    // Fits a poly line to keypoints to find a smooth camera path.
    PolynomialRegressionPathSolver polynomial_path_solver = 1;
    // Maintains a kinematic state of the camera, updated with keypoints, to
    // find a smooth camera path.  Currently optimized for real-time operation.
    // NOTE(review): KinematicOptions is not defined in this file; presumably
    // it comes from the imported kinematic_path_solver.proto -- confirm.
    KinematicOptions kinematic_options = 2;
  }
}