// Copyright 2019 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef MEDIAPIPE_UTIL_TRACKING_TRACKING_H_ #define MEDIAPIPE_UTIL_TRACKING_TRACKING_H_ // Performs tracking via rectangular regions (MotionBoxes) from pre-initialized // positions, using metadata from tracked features (TrackingData converted to // MotionVectorFrames), forward and backward in time. #include #include #include #include #include #include "absl/container/flat_hash_set.h" #include "mediapipe/framework/port/vector.h" #include "mediapipe/util/tracking/flow_packager.pb.h" #include "mediapipe/util/tracking/motion_models.h" #include "mediapipe/util/tracking/motion_models.pb.h" #include "mediapipe/util/tracking/tracking.pb.h" namespace mediapipe { // Useful helper functions. // // Clamps values to be within interval [left, right]. inline float Clamp(float value, float left, float right) { return value < left ? left : (value > right ? right : value); } // Standard linear interpolation function. template T Lerp(T a, T b, float alpha) { return static_cast(a * (1.0f - alpha) + b * alpha); } // Approximates sigmoid function with a linear ramp, mapping // x <= lhs to 0, x >= rhs to 1 (for lhs < rhs) linear in between interval // [lhs, rhs]. If lhs > rhs, roles are reversed. inline float LinearRamp(float value, float lhs, float rhs) { return Clamp((value - lhs) / (rhs - lhs), 0, 1); } inline Vector2_f MotionBoxPosition(const MotionBoxState& state) { return Vector2_f(state.pos_x(), state.pos_y()); } inline void SetMotionBoxPosition(const Vector2_f& pos, MotionBoxState* state) { state->set_pos_x(pos.x()); state->set_pos_y(pos.y()); } // TODO: this needs to be changed for quad inline Vector2_f MotionBoxSize(const MotionBoxState& state) { return Vector2_f(state.width(), state.height()); } inline void SetMotionBoxSize(const Vector2_f& size, MotionBoxState* state) { state->set_width(size.x()); state->set_height(size.y()); } inline Vector2_f MotionBoxCenter(const MotionBoxState& state) { return MotionBoxPosition(state) + 0.5f * MotionBoxSize(state); } inline Vector2_f InlierCenter(const MotionBoxState& state) { return Vector2_f(state.inlier_center_x(), state.inlier_center_y()); } inline Vector2_f MotionBoxVelocity(const MotionBoxState& state) { return Vector2_f(state.dx(), state.dy()); } inline void SetMotionBoxVelocity(const Vector2_f& velo, MotionBoxState* state) { state->set_dx(velo.x()); state->set_dy(velo.y()); } // Derive normalization factor from image aspect ratio so that the scale for the // longer edge is 1. scale will be reversed if `invert` is true. void ScaleFromAspect(float aspect, bool invert, float* scale_x, float* scale_y); // Returns 4 corners of the MotionBox as top_left, bottom_left, bottom_right // and top_right. Applies 2D scaling prior to rotation, which is necessary to // preserve orthogonality of the rotation if the scaling is not isotropic. std::array MotionBoxCorners( const MotionBoxState& state, const Vector2_f& scaling = Vector2_f(1.0f, 1.0f)); // Computes corresponding line equations for MotionBoxCorners. // Output line equations on 4 sides. // Returns true if box is normal, false if we encounter abnormal box which // leads to numerical problems. // Applies 2D scaling prior to rotation, which is necessary to // preserve orthogonality of the rotation if the scaling is not isotropic. bool MotionBoxLines(const MotionBoxState& state, const Vector2_f& scaling, std::array* box_lines); // Returns top-left and bottom right corner of the bounding box // of the MotionBoxState. void MotionBoxBoundingBox(const MotionBoxState& state, Vector2_f* top_left, Vector2_f* bottom_right); // Adds all inliers from state to the inlier map (as id, score) tuple. // If id already exist, score is updated to be the maximum of current and // existing score. inline void MotionBoxInliers(const MotionBoxState& state, std::unordered_map* inliers) { CHECK(inliers); const int num_inliers = state.inlier_ids_size(); DCHECK_EQ(num_inliers, state.inlier_length_size()); for (int k = 0; k < num_inliers; ++k) { (*inliers)[state.inlier_ids(k)] = std::max((*inliers)[state.inlier_ids(k)], state.inlier_length(k)); } } // Adds all outliers from state to the outlier map. inline void MotionBoxOutliers(const MotionBoxState& state, std::unordered_set* outliers) { for (int id : state.outlier_ids()) { outliers->insert(id); } } // Returns inlier locations from state (normalized in [0, 1] domain). void MotionBoxInlierLocations(const MotionBoxState& state, std::vector* inlier_pos); // Same for outlier positions. void MotionBoxOutlierLocations(const MotionBoxState& state, std::vector* outlier_pos); // Get corners of rotated rectangle. Note that the quad component in // MotionBoxState is not used in this function. Only the rotated rectangle // is used. // Inputs: // -- state: the MotionBoxState where we extract the rotated rectangle // -- scaling: additional scaling we apply on x and y axis // Output: // corners in counter-clockwise order std::array GetCornersOfRotatedRect(const MotionBoxState& state, const Vector2_f& scaling); // Use the position, width, and height in MotionBoxState to initialize // the quad. Only use it when you want to get homography for tracking. void InitializeQuadInMotionBoxState(MotionBoxState* state); // Initializes inliers and outliers related fields in MotionBoxState from // TrackingData. The box or quad position will be read from `state` so they need // to be set beforehand. void InitializeInliersOutliersInMotionBoxState(const TrackingData& tracking, MotionBoxState* state); // Initializes pnp_homography field in MotionBoxState using perspective // transform between a physical rectangle with specified aspect ratio and a // screen quad. void InitializePnpHomographyInMotionBoxState( const TrackingData& tracking, const TrackStepOptions& track_step_options, MotionBoxState* state); // Represents the motion of a feature at pos between frames, differentiating // object from background motion (supplied via a MotionVectorFrame). struct MotionVector { MotionVector() : pos(0, 0), background(0, 0), object(0, 0) {} MotionVector(const Vector2_f& pos_, const Vector2_f& background_, const Vector2_f& object_) : pos(pos_), background(background_), object(object_) {} Vector2_f Location() const { return pos; } Vector2_f MatchLocation() const { return pos + background + object; } Vector2_f Motion() const { return background + object; } // Position of the feature in normalized domain [0, 1]. Vector2_f pos; // Motion due to background (i.e. camera motion). Vector2_f background; // Motion due to foreground (i.e. object motion in addition to background). // If feature belong to background, object motion is nearly zero. Vector2_f object; int track_id = -1; // Returns the MotionVector stored in the internal state at index. static MotionVector FromInternalState(const MotionBoxInternalState& internal, int index); }; constexpr float kTrackingDefaultFps = 30.0; // Holds motion vectors and background model for each frame. // Note: Specified in the aspect preserving domain under uniform scaling, // longest dimension normalized to 1, i.e. if aspect_ratio >= 1, width is // normalized to 1 otherwise height is normalized to 1. struct MotionVectorFrame { std::vector motion_vectors; Homography background_model; bool valid_background_model = true; bool is_duplicated = false; // Set if frame is duplicated w.r.t. // previous one. bool is_chunk_boundary = false; // Set if this is the first frame in a chunk. float duration_ms = 1000.0f / kTrackingDefaultFps; // Aspect ratio (w/h) of the original frame. float aspect_ratio = 1.0f; // Stores the tracked ids that have been discarded actively. This information // will be used to avoid misjudgement on tracking continuity. absl::flat_hash_set* actively_discarded_tracked_ids = nullptr; }; // Transforms TrackingData to MotionVectorFrame, ready to be used by tracking // algorithm (so the MotionVectorFrame data is denormalized). void MotionVectorFrameFromTrackingData(const TrackingData& tracking_data, MotionVectorFrame* motion_vector_frame); // Transform TrackingData to feature positions and descriptors, ready to be used // by detection (re-acquisition) algorithm (so the "features" is denomalized). // Descriptors with all 0s will be discarded. void FeatureAndDescriptorFromTrackingData( const TrackingData& tracking_data, std::vector* features, std::vector* descriptors); // Inverts MotionVectorFrame (by default defined as motion from current to // previous frame) to hold motion from previous to current frame. void InvertMotionVectorFrame(const MotionVectorFrame& input, MotionVectorFrame* output); // Returns duration in ms for this chunk item. float TrackingDataDurationMs(const TrackingDataChunk::Item& item); // Returns feature indices that are within the given box. If the box size isn't // big enough to cover sufficient features (i.e., min_num_features), this will // iteratively enlarge the box size (up to max_enlarge_size) to include more // features. The argument box_scaling is used in MotionBoxLines() to get // properly scaled box corners. Note: box_scaling and max_enlarge_size need to // be in normalized image space. // TODO: Add unit test. void GetFeatureIndicesWithinBox(const std::vector& features, const MotionBoxState& box_state, const Vector2_f& box_scaling, float max_enlarge_size, int min_num_features, std::vector* inlier_indices); // Represents a moving box over time. Initial position is supplied via // ResetAtFrame, and subsequent positions for previous and next frames are // determined via tracking by TrackStep method. // Example usage: // // Assuming metadata is available: vector mvf; // MotionBoxState box_state; // // Set to center 20%. // box_state.set_pos_x(0.4); // box_state.set_pos_y(0.4); // box_state.set_width(0.2); // box_state.set_height(0.2); // // // Initialize first position at frame 5. // MotionBox motion_box(TrackStepOptions()); // motion_box.ResetAtFrame(4, box_state); // // Track 4 frames backward and forward in time. // for (int i = 0; i < 4; ++i) { // // Tracking steps need to be called contiguously, as otherwise no // // prior location for the track is present and TrackStep will fail. // // Backward. // motion_box.TrackStep(4 - i, mvf[4 -i], false, nullptr); // // Get position -> consume for display, etc. // motion_box.StateAtFrame(4 - i); // // // Forward. // motion_box.TrackStep(4 + i, mvf[4 -i], true, nullptr); // // Get position -> consume. // motion_box.StateAtFrame(4 + i); // } class MotionBox { public: explicit MotionBox(const TrackStepOptions& track_step_options) : options_(track_step_options) {} MotionBox() = default; // Sets and overwrites MotionBoxState at specified frame. Use to supply // initial position. void ResetAtFrame(int frame, const MotionBoxState& state); // Tracks MotionBox from state at from_frame either forward or backward in // time, based on the passed MotionVectorFrame. (MotionVectorFrame has to // correspond to requested tracking direction, this is not checked against). // Returns true if tracking was successful. // Note: It is assumed that from_frame already has a valid location, either // via ResetAtFrame or previous successful execution of TrackStep. That is // TrackStep needs to be called contiguously from a initialized position // via ResetFrame. Otherwise no prior location for the track is present (at // from_frame) and TrackStep will fail (return false). bool TrackStep(int from_frame, const MotionVectorFrame& motion_vectors, bool forward); MotionBoxState StateAtFrame(int frame) const { if (frame < queue_start_ || frame >= queue_start_ + static_cast(states_.size())) { LOG(ERROR) << "Requesting state at unknown frame " << frame << ". Returning UNTRACKED."; MotionBoxState invalid; invalid.set_track_status(MotionBoxState::BOX_UNTRACKED); return invalid; } else { MotionBoxState result = states_[frame - queue_start_]; if (!options_.return_internal_state()) { result.clear_internal(); } return result; } } MotionBoxState* MutableStateAtFrame(int frame) { if (frame < queue_start_ || frame >= queue_start_ + states_.size()) { return NULL; } else { return &states_[frame - queue_start_]; } } bool TrackableFromFrame(int frame) const { return StateAtFrame(frame).track_status() >= MotionBoxState::BOX_TRACKED; } void set_start_track(int frame) { start_track_ = frame; } int start_track() const { return start_track_; } void set_end_track(int frame) { end_track_ = frame; } int end_track() const { return end_track_; } void TrimFront(const int cache_size) { int trim_count = states_.size() - cache_size; if (trim_count > 0) { queue_start_ += trim_count; while (trim_count-- > 0) { states_.pop_front(); } } } void TrimBack(const int cache_size) { int trim_count = states_.size() - cache_size; if (trim_count > 0) { while (trim_count-- > 0) { states_.pop_back(); } } } // If this variable is set to true, then TrackStep would print warning // messages when tracking is failed. // Default value is true and is set in tracking.cc. static bool print_motion_box_warnings_; private: // Determines next position from curr_pos based on tracking data in // motion_vectors. Also receives history of the last N positions. void TrackStepImplDeNormalized( int frome_frame, const MotionBoxState& curr_pos, const MotionVectorFrame& motion_vectors, const std::vector& history, MotionBoxState* next_pos) const; // Pre-normalization wrapper for above function. De-normalizes domain // to aspect preserving domain and velocity to current frame period. void TrackStepImpl(int from_frame, const MotionBoxState& curr_pos, const MotionVectorFrame& motion_frame, const std::vector& history, MotionBoxState* next_pos) const; // Implementation functions for above TrackStepImpl. // Returns bounding box for start position and the expansion magnitude // (normalized) that was applied. void GetStartPosition(const MotionBoxState& curr_pos, float aspect_ratio, float* expand_mag, Vector2_f* top_left, Vector2_f* bottom_right) const; // Outputs spatial sigma in x and y for spatial weighting. // Pass current box_state and inverse box domain size. void GetSpatialGaussWeights(const MotionBoxState& box_state, const Vector2_f& inv_box_domain, float* spatial_gauss_x, float* spatial_gauss_y) const; // Outputs subset of motion_vectors that are within the specified domain // (top_left to bottom_right). Only searches over the range specified via // start and end idx. // Each vector is weighted based on gaussian proximity, similar motion, // track continuity, etc. which forms the prior weight of each feature. // Features are binned into a grid of fixed dimension for density analysis. // Also output number of vectors with good prior weights (> 0.1), and number // of continued inliers. // Returns true on success, false on failure. When it returns false, the // output values are not reliable. bool GetVectorsAndWeights( const std::vector& motion_vectors, int start_idx, int end_idx, const Vector2_f& top_left, const Vector2_f& bottom_right, const MotionBoxState& box_state, bool valid_background_model, bool is_chunk_boundary, float temporal_scale, // Scale for velocity from standard frame period. float expand_mag, const std::vector& history, std::vector* vectors, std::vector* prior_weights, int* number_of_good_prior, int* number_of_cont_inliers) const; // Initializes weights by performing multiple ransac rounds from vectors. // Error is scaled by irls scale along parallel and orthogonal direction. void TranslationIrlsInitialization( const std::vector& vectors, const Vector2_f& irls_scale, std::vector* weights) const; // Wrapper function, estimating object motion w.r.t. various degrees // of freedom. void EstimateObjectMotion( const std::vector& motion_vectors, const std::vector& prior_weights, int num_continued_inliers, const Vector2_f& irls_scale, std::vector* weights, Vector2_f* object_translation, LinearSimilarityModel* object_similarity, Homography* object_homography) const; // Perform IRLS estimation of the passed motion_vector's object motion. // Each vector is weighted by original_weight / estimation_error, where // estimation_error is refined in each estimation round. // Outputs final translation and resulting irls weights (1.0 / // estimation_error, i.e. with prior bias). void EstimateTranslation( const std::vector& motion_vectors, const std::vector& orig_weights, const Vector2_f& irls_scale, std::vector* weights, Vector2_f* translation) const; // Same as above for similarity. Returns false on failure (numerical // instability is most common case here). bool EstimateSimilarity( const std::vector& motion_vectors, const std::vector& orig_weights, const Vector2_f& irls_scale, std::vector* weights, LinearSimilarityModel* lin_sim) const; // Same as above for homograph. bool EstimateHomography( const std::vector& motion_vectors, const std::vector& prior_weights, const Vector2_f& irls_scale, std::vector* weights, Homography* object_homography) const; // Perform 6DoF perspective transform based homography estimation using // motion_vector's object + background motion. // weights are used to determine whether a vector is inlier or outliers. // The perspective solver will exclude those vectors with weights smaller than // kMaxOutlierWeight (0.1). bool EstimatePnpHomography( const MotionBoxState& curr_pos, const std::vector& motion_vectors, const std::vector& weights, float domain_x, float domain_y, Homography* pnp_homography) const; // Apply pre-computed perspective transform based homography to the next pos. void ApplyObjectMotionPerspectively(const MotionBoxState& curr_pos, const Homography& pnp_homography, float domain_x, float domain_y, MotionBoxState* next_pos) const; // Scores every vector after translation estimation into inliers and outliers // (based on post_estimation_weights, inlierness is a measure in [0, 1]), // and records result in next_pos as well as returning inlierness per vector // in inlier_weights. // Also computes the following statistics: // - inlier_density: Local density for each inlier, i.e. measure of how many // other inliers are close to that point. In [0, 1]. // - continued_inliers: Number of inliers that continue to be present already // in curr_pos (same track id) // - swapped_inliers: Number of inliers that are outliers in curr_pos (same // track id). // - motion_inliers: Number of inliers of similar motion as previous state. // This measure is complementary to above continued inliers // in case object is moving significantly, in which case // tracks tend to be short lived. // - kinetic_average: Average object norm of all inliers weighted by // pre_estimation_weights. void ScoreAndRecordInliers(const MotionBoxState& curr_pos, const std::vector& vectors, const std::vector& grid_positions, const std::vector& pre_estimation_weights, const std::vector& post_estimation_weights, float background_discrimination, MotionBoxState* next_pos, std::vector* inlier_weights, std::vector* inlier_density, int* continued_inliers, int* swapped_inliers, float* motion_inliers, float* kinetic_average) const; // Computes motion disparity (in [0, 1]), that is how well does the current // object motion agree with the previous object motion. // 0 indicates perfect match, 1 indicates signicant difference. float ComputeMotionDisparity(const MotionBoxState& curr_pos, const Vector2_f& irls_scale, float continued_inliers, int num_inliers, const Vector2_f& object_motion) const; // Computes inlier center and extent (vector positions weighted by // weights and density). // Sets center inlier center, if inlier_weight is above min_inlier_sum, // else to the Motion box center. void ComputeInlierCenterAndExtent( const std::vector& motion_vectors, const std::vector& weights, const std::vector& density, const MotionBoxState& state, float* min_inlier_sum, Vector2_f* center, Vector2_f* extent) const; float ScaleEstimate(const std::vector& motion_vectors, const std::vector& weights, float min_sum) const; // Applies spring force from box_state's position to center_of_interest, if // difference is above rel_threshold. Correcting force equals difference // above threshold times the spring_force coefficient. void ApplySpringForce(const Vector2_f& center_of_interest, const float rel_threshold, const float spring_force, MotionBoxState* box_state) const; // Compute the tracking confidence and return the value. // The confidence is a float value in [0, 1], with 0 being least confident, // and 1 being most confident. float ComputeTrackingConfidence(const MotionBoxState& motion_box_state) const; private: class ObjectMotionValidator { public: static bool IsValidSimilarity( const LinearSimilarityModel& linear_similarity_model, float max_scale, float max_rotation) { SimilarityModel similarity_model = LinearSimilarityAdapter::ToSimilarity(linear_similarity_model); if (similarity_model.scale() < 1.0f / max_scale || similarity_model.scale() > max_scale || std::abs(similarity_model.rotation()) > max_rotation) { return false; } return true; } static bool IsValidHomography(const Homography& homography, float max_scale, float max_rotation) { // Filter out abnormal homography. Otherwise the determinant of // projected affine matrix will be negative. if (!IsInverseStable(homography)) { LOG(WARNING) << "Homography matrix is not stable."; return false; } LinearSimilarityModel similarity_model = LinearSimilarityAdapter::ProjectFrom(homography, 1.0f, 1.0f); return IsValidSimilarity(similarity_model, max_scale, max_rotation); } // Check if it is a convex quad. static bool IsValidQuad(const MotionBoxState::Quad& quad) { const int kQuadVerticesSize = 8; CHECK_EQ(quad.vertices_size(), kQuadVerticesSize); for (int a = 0; a < kQuadVerticesSize; a += 2) { int b = (a + 2) % kQuadVerticesSize; int c = (a - 2 + kQuadVerticesSize) % kQuadVerticesSize; Vector2_f ab(quad.vertices(b) - quad.vertices(a), quad.vertices(b + 1) - quad.vertices(a + 1)); Vector2_f ac(quad.vertices(c) - quad.vertices(a), quad.vertices(c + 1) - quad.vertices(a + 1)); // Since quad's vertices is defined in counter-clockwise manner, we only // accept negative cross product. if (ab.CrossProd(ac) >= 0) { return false; } } return true; } // Check if all the 4 corners of the quad are out of FOV. static bool IsQuadOutOfFov(const MotionBoxState::Quad& quad, const Vector2_f& fov) { const int kQuadVerticesSize = 8; CHECK_EQ(quad.vertices_size(), kQuadVerticesSize); bool too_far = true; for (int j = 0; j < kQuadVerticesSize; j += 2) { if (quad.vertices(j) < fov.x() && quad.vertices(j) > 0.0f && quad.vertices(j + 1) < fov.y() && quad.vertices(j + 1) > 0.0f) { too_far = false; break; } } return too_far; } }; class DistanceWeightsComputer { public: DistanceWeightsComputer(const MotionBoxState& initial_state, const MotionBoxState& current_state, const TrackStepOptions& options); // Compute distance weight based on input motion vector position. float ComputeDistanceWeight(const MotionVector& test_vector); private: Homography ComputeHomographyFromQuad(const MotionBoxState::Quad& src_quad, const MotionBoxState::Quad& dst_quad); float cos_neg_a_; float sin_neg_a_; float spatial_gauss_x_; float spatial_gauss_y_; Vector2_f inv_box_domain_; Vector2_f box_center_; Vector2_f box_center_transformed_; bool is_large_rotation_ = false; Homography homography_; // homography from current box to initial box TrackStepOptions::TrackingDegrees tracking_degrees_; }; TrackStepOptions options_; std::deque states_; int queue_start_; int start_track_; int end_track_; MotionBoxState initial_state_; }; } // namespace mediapipe. #endif // MEDIAPIPE_UTIL_TRACKING_TRACKING_H_