mediapipe/mediapipe/util/tracking/region_flow_computation.h

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Computes the RegionFlow for a set of frames.
// Specifically, extracts Harris-like features from each frame, tracks these
// between frames and regularizes the tracked features locally (outlier
// rejection) by leveraging fast per-frame segmentation.
// Optionally, features can be assigned to either foreground or background based
// on the computation of the fundamental matrix for a pair of frames.
//
// Basic usage:
// RegionFlowComputation flow_computation(RegionFlowComputationOptions(),
//                                        frame_width,
//                                        frame_height);
//
// std::vector<cv::Mat> input_images;                // Supplied by caller.
// for (int i = 0; i < num_frames; ++i) {
//   flow_computation.AddImage(input_images[i], 0);  // 0: timestamp not needed.
//
//   // Result is owned by this caller.
//   std::unique_ptr<RegionFlow> result(
//       flow_computation.RetrieveRegionFlow());
//
//   // OR
//   std::unique_ptr<RegionFlowFeatureList> result(
//       flow_computation.RetrieveRegionFlowFeatureList(
//           true,              // Compute feature descriptor.
//           false,             // no match descriptor for this example.
//           &input_images[i],
//           nullptr));
//
//   // Do your custom processing or pass on to MotionEstimation.
//
// }

#ifndef MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_COMPUTATION_H_
#define MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_COMPUTATION_H_

#include <deque>
#include <memory>
#include <unordered_map>
#include <vector>

#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/util/tracking/motion_models.pb.h"
#include "mediapipe/util/tracking/region_flow.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
#include "mediapipe/util/tracking/region_flow_computation.pb.h"

namespace mediapipe {
// Forward declarations of the result types handed back by
// RegionFlowComputation's Retrieve* methods.
class RegionFlowFrame;
class RegionFlowFeatureList;
}  // namespace mediapipe

namespace mediapipe {

// Granted friend access by RegionFlowComputation; declared elsewhere.
class MotionAnalysis;

// Single tracked feature; only the name is needed by this header.
struct TrackedFeature;
// Sequence of tracked features, used as input/output of the tracking routines.
using TrackedFeatureList = std::vector<TrackedFeature>;

// Computes region flow for a sequence of frames that are supplied one at a
// time via the AddImage* methods; results are fetched with the Retrieve*
// methods after each AddImage* call. Instances are neither copyable nor
// assignable.
class RegionFlowComputation {
 public:
  // Creates a computation object configured by options.
  // NOTE(review): frame_width and frame_height presumably denote the size of
  // the input frames before any downsampling - confirm against the
  // implementation.
  RegionFlowComputation(const RegionFlowComputationOptions& options,
                        int frame_width, int frame_height);
  virtual ~RegionFlowComputation();
  // Not copyable, not assignable.
  RegionFlowComputation(const RegionFlowComputation&) = delete;
  RegionFlowComputation& operator=(const RegionFlowComputation&) = delete;

  // Performs motion analysis on source w.r.t. the source passed in the
  // previous call. Therefore, the first call will compute empty flow. If
  // RegionFlowComputationOptions::frames_to_track := ftt > 0, motion analysis
  // is performed w.r.t. the previous ftt sources passed via AddImage.
  // Motion analysis uses grid-based regions to enforce locally consistent flow.
  // Source is expected to be 3-channel RGB 8bit image (24bit in total), OR
  // 1-channel Grayscale 8bit image, compatible with
  // RegionFlowComputationOptions::ImageFormat.
  // Pass the frame's timestamp to have it stored in the result or zero if not
  // needed.
  // Returns true on success, false otherwise.
  virtual bool AddImage(const cv::Mat& source, int64 timestamp_usec);

  // Same as above, but seeds the initial feature positions in the matching
  // frame with initial_transform.
  virtual bool AddImageWithSeed(const cv::Mat& source, int64 timestamp_usec,
                                const Homography& initial_transform);

  // Same as AddImage but also accepts an optional source_mask (pass empty
  // cv::Mat to get the same behavior as AddImage). If non-empty, features are
  // only extracted in regions where the mask value is > 0. Mask should be 8-bit
  // grayscale of the same size as source, unless empty.
  virtual bool AddImageWithMask(const cv::Mat& source,
                                const cv::Mat& source_mask,
                                int64 timestamp_usec);

  // Call after AddImage* to retrieve the last downscaled, grayscale image.
  cv::Mat GetGrayscaleFrameFromResults();

  // Returns result as RegionFlowFrame. Result is owned by caller.
  // Will return nullptr if called twice without an AddImage* call in between.
  virtual RegionFlowFrame* RetrieveRegionFlow();

  // Returns result as RegionFlowFeatureList. Result is owned by caller.
  // Will return nullptr if called twice without an AddImage* call in between.
  // Optionally computes feature descriptors (if compute_feature_descriptor
  // is set, in that case curr_color_image must not be nullptr)
  // and additionally matching descriptors (if compute_match_descriptor is set,
  // in this case prev_color_image must not be nullptr).
  // Passed images should be in sync with those passed to AddImage, i.e.
  // source in AddImage and parameter curr_color_image should refer to the same
  // image.
  virtual RegionFlowFeatureList* RetrieveRegionFlowFeatureList(
      bool compute_feature_descriptor, bool compute_match_descriptor,
      const cv::Mat* curr_color_image,   // optional.
      const cv::Mat* prev_color_image);  // optional.

  // Same as above, but returns the specific tracked result from the current
  // frame C to frame C - track_index - 1.
  virtual RegionFlowFeatureList* RetrieveMultiRegionFlowFeatureList(
      int track_index, bool compute_feature_descriptor,
      bool compute_match_descriptor,
      const cv::Mat* curr_color_image,   // optional.
      const cv::Mat* prev_color_image);  // optional.

  // Returns result of a specific RegionFlowFrame in case
  // RegionFlowComputationOptions::frames_to_track() > 1. Result is owned by
  // caller.
  virtual RegionFlowFrame* RetrieveMultiRegionFlow(int frame);

  // Resets computation to ignore any previously added frames. The next frame
  // passed via the AddImage* routines will be treated as the first frame in
  // the sequence.
  virtual void Reset();

  // Creates synthetic tracks with feature points in a grid with zero motion
  // w.r.t. prev frame. Points are located at the center of each grid. Step size
  // is fractional w.r.t. image size.
  static void ZeroMotionGridFeatures(int frame_width, int frame_height,
                                     float frac_grid_step_x,
                                     float frac_grid_step_y,
                                     RegionFlowFeatureList* result);

  // Returns densely sampled zero motion features.
  // Features are centered in a box of size frac_diameter that is shifted by
  // frac_steps_x * frame_width and frac_steps_y * frame_height.
  static void DenseZeroMotionSamples(int frame_width, int frame_height,
                                     float frac_diameter, float frac_steps_x,
                                     float frac_steps_y,
                                     RegionFlowFeatureList* result);

 private:
  // One owned feature list per tracked frame (see region_flow_results_).
  typedef std::vector<std::unique_ptr<RegionFlowFeatureList>>
      RegionFlowFeatureListVector;

  // Non-owning view onto a set of tracked features.
  typedef std::vector<TrackedFeature*> TrackedFeatureView;

  // Indexed via grid bin, each bin contains list of its corresponding features.
  typedef std::vector<TrackedFeatureView> TrackedFeatureMap;

  // Internal helper types; only declared here, defined outside this header.
  struct FrameTrackingData;
  struct LongTrackData;
  struct ORBFeatureDescriptors;

  // Implementation function to retrieve the i-th RegionFlowFeatureList
  // (specified by track_index). Specifically, the i-th feature list denotes
  // the flow from the current frame N to the previous frame
  // N - 1 - track_index.
  // Casts arguments to cv.
  virtual std::unique_ptr<RegionFlowFeatureList>
  RetrieveRegionFlowFeatureListImpl(int track_index,
                                    bool compute_feature_descriptor,
                                    bool compute_match_descriptor,
                                    const cv::Mat* curr_color_image,
                                    const cv::Mat* prev_color_image);

  // Initializes the FrameTrackingData's members from source and source_mask.
  // Returns true on success.
  bool InitFrame(const cv::Mat& source, const cv::Mat& source_mask,
                 FrameTrackingData* data);

  // Adds image to the current buffer and starts tracking.
  bool AddImageAndTrack(const cv::Mat& source, const cv::Mat& source_mask,
                        int64 timestamp_usec,
                        const Homography& initial_transform);

  // Computes *change* in visual difference between adjacent frames. Normalized
  // w.r.t. number of channels and number of pixels. For this to be meaningful
  // it is expected that passed FrameTrackingData's are exactly one frame apart
  // (CHECKED).
  float ComputeVisualConsistency(FrameTrackingData* previous,
                                 FrameTrackingData* current) const;

  // Computes flow regularized based on regions and other options, from frame
  // index "from" to index "to", specified relative to current frame, i.e. index
  // of current frame = 0, prev frame = -1, next frame = 1, etc. Set invert_flow
  // to true if the flow should be inverted after tracking.
  // Optionally, can input the previous result to link features via ids,
  // effectively creating long feature tracks. In this case you usually want to
  // request the current result (same as returned in feature_list) in form of
  // a TrackedFeatureList.
  // NOTE(review): synthetic_tracks is not described here; presumably it
  // selects synthetic zero-motion grid tracks (cf. ZeroMotionGridTracks)
  // instead of real tracking - confirm against the implementation.
  void ComputeRegionFlow(int from, int to, bool synthetic_tracks,
                         bool invert_flow,
                         const TrackedFeatureList* prev_result,  // optional.
                         TrackedFeatureList* curr_result,        // optional.
                         RegionFlowFeatureList* feature_list);

  // Gain corrects input frame w.r.t. reference frame. Returns true iff gain
  // correction succeeds. If false, calibrated_frame is left untouched.
  bool GainCorrectFrame(const cv::Mat& reference_frame,
                        const cv::Mat& input_frame, float reference_mean,
                        float input_mean, cv::Mat* calibrated_frame) const;

  // Feature extraction method.
  // Expects as input an image pyramid of a gray scale image (each subsequent
  // level should be downsampled by a factor of 2 (always rounding up), CHECKED
  // against).
  // For each level extracts corner features across a grid by considering all
  // locations that have a corner response above
  // options_.feature_quality_level() * maximum within the grid bin.
  // Features with high corner response are output first (but corner response is
  // not necessarily monotonic). Feature locations are binned into mask
  // (via downscaling by mask_scale), using a 5x5 patch, to discard features
  // that are too close to each other.
  // Features and corner responses are added to the corresponding vectors in
  // data, i.e. passed data is not cleared and expected to be initialized.
  virtual void AdaptiveGoodFeaturesToTrack(
      const std::vector<cv::Mat>& extraction_pyramid, int max_features,
      float mask_scale, cv::Mat* mask, FrameTrackingData* data);

  // Uses prev_result to remove all features that are not present in data.
  // Uses track_ids, i.e. only works with long feature processing.
  void RemoveAbsentFeatures(const TrackedFeatureList& prev_result,
                            FrameTrackingData* data);

  // Removes features in data that lie outside the feature extraction mask for
  // that frame.
  void RemoveFeaturesOutsideMask(FrameTrackingData* data);

  // Extracts features for tracking from frame corresponding to data.
  // Optionally, may reuse tracked features if available, based on options.
  // Optionally, if new features are extracted, can use feature_list to mask out
  // feature locations that should not be extracted again.
  // NOTE(review): there is no feature_list parameter in this signature; the
  // sentence above presumably refers to prev_result - confirm.
  void ExtractFeatures(const TrackedFeatureList* prev_result,
                       FrameTrackingData* data);

  // Performs inplace feature selection, by evaluating the range
  // [0, data->features.size()] via an Evaluator implementing
  // [](int) -> bool. Only feature indices for which eval returns true are kept
  // (using in place moves), the remainder is discarded. Applies the move
  // operations to FrameTrackingData's feature, track_idx, feature_source_map
  // and neighborhoods. Also applies the moves to any vector<int> and
  // vector<float> that can be optionally supplied.
  // Note: All vectors are assumed to be of the same size (checked in debug
  // mode).
  // NOTE(review): the meaning of the int return value is not documented here;
  // presumably the number of features kept - confirm.
  template <class Evaluator>
  int InplaceFeatureSelection(FrameTrackingData* data,
                              std::vector<std::vector<int>*> int_vecs,
                              std::vector<std::vector<float>*> float_vecs,
                              const Evaluator& eval);

  // Tracks features between two frames (from -> to). Operates on internal data
  // structure FrameTrackingData which stores all frame information relevant for
  // tracking.
  //
  // If gain_correct is true, tracking is carried out between the "from" and the
  // gain-corrected "to" image. It is also an output variable indicating whether
  // gain correction succeeded or failed.
  //
  // Updates internal data structure, so any computation can be reused in
  // successive calls to feature extraction or tracking.
  // NOTE(review): frac_long_features_rejected and results are not described
  // here; presumably both are outputs (fraction of long features rejected and
  // the resulting tracked features, respectively) - confirm.
  void TrackFeatures(FrameTrackingData* from_data_ptr,
                     FrameTrackingData* to_data_ptr, bool* gain_correct,
                     float* frac_long_features_rejected,
                     TrackedFeatureList* results);

  // Wide-baseline version of above function, using feature descriptor matching
  // instead of tracking.
  void WideBaselineMatchFeatures(FrameTrackingData* from_data_ptr,
                                 FrameTrackingData* to_data_ptr,
                                 TrackedFeatureList* results);

  // Fits affine model to TrackedFeatureList via direct call of
  // MotionEstimation::EstimateAffineModelIRLS.
  AffineModel AffineModelFromFeatures(TrackedFeatureList* features) const;

  // Creates synthetic tracks with feature points in a grid with zero motion
  // w.r.t. prev frame. Points are located at the center of each grid. Step size
  // is fractional w.r.t. image size.
  // Returns minimum distance from border across all features.
  static int ZeroMotionGridTracks(int frame_width, int frame_height,
                                  float frac_grid_step_x,
                                  float frac_grid_step_y,
                                  TrackedFeatureList* results);

  // Computes region flow using a rectangular grid of square regions.
  void ComputeBlockBasedFlow(TrackedFeatureList* feature_list,
                             TrackedFeatureView* inlier_features) const;

  // Initializes feature locations for FrameTrackingData at index to,
  // from resulting tracks in from.
  void InitializeFeatureLocationsFromPreviousResult(int from, int to);

  // Initializes feature locations in "to" from initial transform by applying
  // it to every feature of "from".
  void InitializeFeatureLocationsFromTransform(int from, int to,
                                               const Homography& transform);

  // Enforces a translational model within each region, only retaining inliers
  // that are output to inliers.
  void DetermineRegionFlowInliers(const TrackedFeatureMap& region_feature_map,
                                  TrackedFeatureView* inliers) const;

  // Determines number of minimum inliers based on absolute and relative
  // thresholds.
  int GetMinNumFeatureInliers(
      const TrackedFeatureMap& region_feature_map) const;

  // Internal conversion function from a feature list to corresponding frame.
  void RegionFlowFeatureListToRegionFlow(
      const RegionFlowFeatureList& feature_list, RegionFlowFrame* frame) const;

  // Initializes all members except actual features in a RegionFlowFeatureList.
  void InitializeRegionFlowFeatureList(
      RegionFlowFeatureList* region_flow_feature_list) const;

  // Converts TrackedFeatureView to RegionFlowFeatureList, flattening over
  // all bins. Returns average motion magnitude.
  // Optionally, the TrackedFeature's corresponding to each feature output in
  // region_flow_feature_list can be recorded via flattened_feature_list.
  float TrackedFeatureViewToRegionFlowFeatureList(
      const TrackedFeatureView& region_feature_view,
      TrackedFeatureList* flattened_feature_list,
      RegionFlowFeatureList* region_flow_feature_list) const;

  // Determines if sufficient (spatially distributed) features are available.
  bool HasSufficientFeatures(const RegionFlowFeatureList& feature_list);

  // Returns number of required pyramid levels to track the specified distance.
  int PyramidLevelsFromTrackDistance(float track_distance);

  // Returns blur score (inverse of average corner measure) for input image.
  // The higher the value the blurrier the frame.
  float ComputeBlurScore(const cv::Mat& image);

  // Computes binary mask of pixels, for which the corner score (passed in
  // min_eig_vals) can be used as a measure to quantify the amount of blur.
  // For pixels not part of the mask the corner score is not a reliable measure
  // to quantify blur. For example, discards over-exposed regions and regions
  // that do not have sufficient cornerness.
  // Note: Modifies the corner values!
  void ComputeBlurMask(const cv::Mat& input, cv::Mat* min_eig_vals,
                       cv::Mat* mask);

  // Appends features in a sorted manner (by pointer location) while discarding
  // duplicates.
  void AppendUniqueFeaturesSorted(const TrackedFeatureView& to_be_added,
                                  TrackedFeatureView* features) const;

  // NOTE(review): undocumented in this header. Judging by the signature alone,
  // presumably selects the inlier subset of features and outputs it to
  // inliers, with skip_estimation bypassing the model estimation step -
  // confirm against the implementation.
  void GetFeatureTrackInliers(bool skip_estimation,
                              TrackedFeatureList* features,
                              TrackedFeatureView* inliers) const;

  // Returns true iff long track data is present and
  // options_.verify_long_features() is set.
  bool IsVerifyLongFeatures() const {
    return long_track_data_ != nullptr && options_.verify_long_features();
  }

  // Frame width and height after downsampling.
  int DownsampleWidth() const { return frame_width_; }
  int DownsampleHeight() const { return frame_height_; }

  // Returns 1.0 / scale that is being applied to the features for downscaling.
  float DownsampleScale() const { return downsample_scale_; }

  // Data members.
 private:
  RegionFlowComputationOptions options_;

  // Frame width and height after downsampling.
  int frame_width_;
  int frame_height_;

  // Number of frames w.r.t. which each frame is tracked.
  int frames_to_track_;
  // Maximum length of long feature tracks in frames.
  int max_long_track_length_;

  // Original frame width and height.
  int original_width_;
  int original_height_;

  // Scale and state of downsampling.
  float downsample_scale_;
  bool use_downsampling_;

  int pyramid_levels_;
  int extraction_levels_;

  int frame_num_ = 0;
  int max_features_ = 0;
  float curr_blur_score_ = 0;
  // Moving average of number of features across recently computed tracks.
  float curr_num_features_avg_ = 0;

  // Count used to generate unique feature ids.
  int feature_count_ = 0;

  // List of RegionFlow frames of size options_.frames_to_track.
  RegionFlowFeatureListVector region_flow_results_;

  // Gain adapted version.
  std::unique_ptr<cv::Mat> gain_image_;
  std::unique_ptr<cv::Mat> gain_pyramid_;

  // Temporary buffers.
  std::unique_ptr<cv::Mat> corner_values_;
  std::unique_ptr<cv::Mat> corner_filtered_;
  std::unique_ptr<cv::Mat> corner_mask_;

  std::unique_ptr<cv::Mat> curr_color_image_;

  // Temporary images for feature extraction.
  std::unique_ptr<cv::Mat> feature_tmp_image_1_;
  std::unique_ptr<cv::Mat> feature_tmp_image_2_;

  std::vector<uint8> feature_status_;       // Indicates if point could be
                                            // tracked.
  std::vector<float> feature_track_error_;  // Patch-based error.

  // Circular queue to buffer tracking data.
  std::deque<std::unique_ptr<FrameTrackingData>> data_queue_;

  // Global settings for block based flow.
  int block_width_;
  int block_height_;
  int block_levels_;

  // Stores average flow magnitudes for recently processed frames.
  std::deque<float> flow_magnitudes_;

  // Records data for long feature tracks.
  std::unique_ptr<LongTrackData> long_track_data_;

  bool use_cv_tracking_ = false;

  // Counter used for controlling how often we run descriptor extraction.
  // Counts from 0 to options_.extract_descriptor_every_n_frame() - 1.
  // Descriptors are extracted only when counter == 0.
  int cnt_extract_descriptors_ = 0;

  // MotionAnalysis is granted access to the private interface above.
  friend class MotionAnalysis;
};

}  // namespace mediapipe

#endif  // MEDIAPIPE_UTIL_TRACKING_REGION_FLOW_COMPUTATION_H_