// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mediapipe/util/tracking/tracking.h"

#include <algorithm>
#include <cmath>
#include <memory>
#include <numeric>
#include <random>
#include <unordered_set>

#include "Eigen/Core"
#include "Eigen/Dense"
#include "Eigen/SVD"
#include "absl/algorithm/container.h"
#include "absl/memory/memory.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/opencv_calib3d_inc.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/util/tracking/flow_packager.pb.h"
#include "mediapipe/util/tracking/measure_time.h"
#include "mediapipe/util/tracking/motion_models.h"

namespace mediapipe {

bool MotionBox::print_motion_box_warnings_ = true;

namespace {

static constexpr int kNormalizationGridSize = 10;
constexpr float kShortScale = 16383.0f;
constexpr float kInvShortScale = 1.0f / kShortScale;

// Motion vectors with weights larger than kMinInlierWeight are classified as
// inliers.
constexpr float kMinInlierWeight = 0.5f;
// Motion vectors with weights smaller than kMaxOutlierWeight are classified as
// outliers.
constexpr float kMaxOutlierWeight = 0.1f;

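// Note on the constants above: kShortScale = 16383 = 2^14 - 1. Inlier and
// outlier match positions in MotionBoxState are stored as integer-scaled
// coordinates (see MotionBoxInlierLocations below, which decodes them with
// kInvShortScale); the scale is presumably chosen so that normalized [0, 1]
// positions fit comfortably into signed 16-bit storage.
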
// Lexicographic (first x, then y) comparator for MotionVector::pos.
struct MotionVectorComparator {
  bool operator()(const MotionVector& lhs, const MotionVector& rhs) const {
    return (lhs.pos.x() < rhs.pos.x()) ||
           (lhs.pos.x() == rhs.pos.x() && lhs.pos.y() < rhs.pos.y());
  }
};

void StoreInternalState(const std::vector<const MotionVector*>& vectors,
                        const std::vector<float>& inlier_weights,
                        float aspect_ratio, MotionBoxInternalState* internal) {
  const int num_vectors = vectors.size();
  CHECK_EQ(num_vectors, inlier_weights.size());

  float scale_x = 1.0f;
  float scale_y = 1.0f;
  ScaleFromAspect(aspect_ratio, true, &scale_x, &scale_y);

  internal->Clear();
  for (int k = 0; k < num_vectors; ++k) {
    internal->add_pos_x(vectors[k]->pos.x() * scale_x);
    internal->add_pos_y(vectors[k]->pos.y() * scale_y);
    internal->add_dx(vectors[k]->object.x() * scale_x);
    internal->add_dy(vectors[k]->object.y() * scale_y);
    internal->add_camera_dx(vectors[k]->background.x() * scale_x);
    internal->add_camera_dy(vectors[k]->background.y() * scale_y);
    internal->add_track_id(vectors[k]->track_id);
    internal->add_inlier_score(inlier_weights[k]);
  }
}

// Proto-lite compatible version of MotionBoxState_TrackStatus_Name.
std::string TrackStatusToString(MotionBoxState::TrackStatus status) {
  switch (status) {
    case MotionBoxState::BOX_UNTRACKED:
      return "BOX_UNTRACKED";
    case MotionBoxState::BOX_EMPTY:
      return "BOX_EMPTY";
    case MotionBoxState::BOX_NO_FEATURES:
      return "BOX_NO_FEATURES";
    case MotionBoxState::BOX_TRACKED:
      return "BOX_TRACKED";
    case MotionBoxState::BOX_DUPLICATED:
      return "BOX_DUPLICATED";
    case MotionBoxState::BOX_TRACKED_OUT_OF_BOUND:
      return "BOX_TRACKED_OUT_OF_BOUND";
  }
  LOG(FATAL) << "Should not happen.";
  return "UNKNOWN";
}

void ClearInlierState(MotionBoxState* state) {
  state->clear_inlier_ids();
  state->clear_inlier_length();
  state->clear_inlier_id_match_pos();
  state->clear_outlier_ids();
  state->clear_outlier_id_match_pos();
}

// Returns orthogonal error system from motion_vec scaled by irls_scale.
std::pair<Vector2_f, Vector2_f> ComputeIrlsErrorSystem(
    const Vector2_f& irls_scale, const Vector2_f& motion_vec) {
  Vector2_f irls_vec = motion_vec.Normalize();
  Vector2_f irls_vec_ortho = irls_vec.Ortho();
  return std::make_pair(irls_vec * irls_scale.x(),
                        irls_vec_ortho * irls_scale.y());
}

// Returns error for a given difference vector and error system.
float ErrorDiff(const Vector2_f& diff,
                const std::pair<Vector2_f, Vector2_f>& error_system) {
  // The error system is an orthogonal system of originally unit vectors that
  // were pre-multiplied by the corresponding irls scale.
  // One can think of this function as the L2 norm *after* scaling the whole
  // vector space w.r.t. the error system.
  //
  // In particular, we project the vector diff onto this system and then scale
  // the magnitude along each direction with the corresponding irls scale.
  // As scalar multiplication commutes with the dot product of vectors,
  // pre-multiplying the scale into the error system is sufficient.
  return Vector2_f(diff.DotProd(error_system.first),
                   diff.DotProd(error_system.second))
      .Norm();
}

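// Worked example for ErrorDiff (illustrative values only): with
// motion_vec = (1, 0) and irls_scale = (2, 1), ComputeIrlsErrorSystem yields
// the system ((2, 0), (0, 1)), up to the sign of the orthogonal axis. A
// difference diff = (3, 4) then projects to (6, 4), giving an error of
// sqrt(6^2 + 4^2) ~= 7.2; deviations along the motion direction are penalized
// twice as much as orthogonal ones.
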
// Returns true if point is within the inlier extent of the passed state
// (within a small margin of 5% of the inlier dimensions).
bool PointWithinInlierExtent(const Vector2_f pt, const MotionBoxState& state) {
  // No extent known, assume to be inside.
  if (state.prior_weight() == 0) {
    return true;
  }

  // Half the inlier extent plus a 5% margin (0.55 = 0.5 + 0.05).
  const float width_radius = state.inlier_width() * 0.55f;
  const float height_radius = state.inlier_height() * 0.55f;
  const float left = state.inlier_center_x() - width_radius;
  const float right = state.inlier_center_x() + width_radius;
  const float top = state.inlier_center_y() - height_radius;
  const float bottom = state.inlier_center_y() + height_radius;

  return pt.x() >= left && pt.x() <= right && pt.y() >= top && pt.y() <= bottom;
}

// Taken from MotionEstimation::LinearSimilarityL2SolveSystem.
bool LinearSimilarityL2Solve(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& weights, LinearSimilarityModel* model) {
  CHECK(model);
  if (motion_vectors.size() < 4) {
    LOG(ERROR) << "Requiring at least 4 input vectors for sufficient solve.";
    return false;
  }

  cv::Mat matrix(4, 4, CV_32F);
  cv::Mat solution(4, 1, CV_32F);
  cv::Mat rhs(4, 1, CV_32F);

  matrix.setTo(0);
  rhs.setTo(0);

  CHECK_EQ(motion_vectors.size(), weights.size());
  for (int k = 0; k < motion_vectors.size(); ++k) {
    const float x = motion_vectors[k]->pos.x();
    const float y = motion_vectors[k]->pos.y();
    const float w = weights[k];

    // double J[2 * 4] = {1, 0, x, -y,
    //                    0, 1, y,  x};
    // Compute J^t * J * w = { 1, 0,     x,    -y,
    //                         0, 1,     y,     x,
    //                         x, y, xx+yy,     0,
    //                        -y, x,     0, xx+yy} * w;

    const float x_w = x * w;
    const float y_w = y * w;
    const float xx_yy_w = (x * x + y * y) * w;
    float* matrix_ptr = matrix.ptr<float>(0);
    matrix_ptr[0] += w;
    matrix_ptr[2] += x_w;
    matrix_ptr[3] += -y_w;

    matrix_ptr += 4;
    matrix_ptr[1] += w;
    matrix_ptr[2] += y_w;
    matrix_ptr[3] += x_w;

    matrix_ptr += 4;
    matrix_ptr[0] += x_w;
    matrix_ptr[1] += y_w;
    matrix_ptr[2] += xx_yy_w;

    matrix_ptr += 4;
    matrix_ptr[0] += -y_w;
    matrix_ptr[1] += x_w;
    matrix_ptr[3] += xx_yy_w;

    float* rhs_ptr = rhs.ptr<float>(0);

    const float m_x = motion_vectors[k]->object.x() * w;
    const float m_y = motion_vectors[k]->object.y() * w;

    rhs_ptr[0] += m_x;
    rhs_ptr[1] += m_y;
    rhs_ptr[2] += x * m_x + y * m_y;
    rhs_ptr[3] += -y * m_x + x * m_y;
  }

  // Solution parameters p.
  if (cv::solve(matrix, rhs, solution)) {
    const float* ptr = solution.ptr<float>(0);
    model->set_dx(ptr[0]);
    model->set_dy(ptr[1]);
    model->set_a(ptr[2] + 1.0);  // Identity parametrization.
    model->set_b(ptr[3]);
    return true;
  } else {
    return false;
  }
}

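// For reference, the similarity solved above parametrizes the motion as
//   x' = [ a  -b ] * x + [ dx ]
//        [ b   a ]       [ dy ],
// i.e. uniform scale and rotation plus translation. The loop accumulates the
// weighted normal equations J^t * W * J * p = J^t * W * b for the Jacobian
// shown in the comments; p[2] is estimated relative to the identity, hence
// set_a(p[2] + 1) when reading back the solution.
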
// Taken from MotionEstimation::HomographyL2NormalEquationSolve.
bool HomographyL2Solve(const std::vector<const MotionVector*>& motion_vectors,
                       const std::vector<float>& weights, Homography* model) {
  CHECK(model);

  cv::Mat matrix(8, 8, CV_32F);
  cv::Mat solution(8, 1, CV_32F);
  cv::Mat rhs(8, 1, CV_32F);

  matrix.setTo(0);
  rhs.setTo(0);

  // Matrix multiplications are hand-coded for speed improvements vs.
  // opencv's cvGEMM calls.
  CHECK_EQ(motion_vectors.size(), weights.size());
  for (int k = 0; k < motion_vectors.size(); ++k) {
    const float x = motion_vectors[k]->pos.x();
    const float y = motion_vectors[k]->pos.y();
    const float w = weights[k];

    const float xw = x * w;
    const float yw = y * w;
    const float xxw = x * x * w;
    const float yyw = y * y * w;
    const float xyw = x * y * w;
    const float mx = x + motion_vectors[k]->object.x();
    const float my = y + motion_vectors[k]->object.y();

    const float mxxyy = mx * mx + my * my;
    // Jacobian:
    // double J[2 * 8] = {x, y, 1, 0, 0, 0, -x * m_x, -y * m_x,
    //                    0, 0, 0, x, y, 1, -x * m_y, -y * m_y}
    //
    // Compute J^t * J * w =
    // (     xx      xy     x      0      0     0    -xx*mx    -xy*mx )
    // (     xy      yy     y      0      0     0    -xy*mx    -yy*mx )
    // (      x       y     1      0      0     0     -x*mx     -y*mx )
    // (      0       0     0     xx     xy     x    -xx*my    -xy*my )
    // (      0       0     0     xy     yy     y    -xy*my    -yy*my )
    // (      0       0     0      x      y     1     -x*my     -y*my )
    // ( -xx*mx  -xy*mx -x*mx -xx*my -xy*my -x*my  xx*mxxyy  xy*mxxyy )
    // ( -xy*mx  -yy*mx -y*mx -xy*my -yy*my -y*my  xy*mxxyy  yy*mxxyy ) * w

    // 1st row: xx xy x 0 0 0 -xx*mx -xy*mx
    float* matrix_ptr = matrix.ptr<float>(0);
    matrix_ptr[0] += xxw;
    matrix_ptr[1] += xyw;
    matrix_ptr[2] += xw;
    matrix_ptr[6] += -xxw * mx;
    matrix_ptr[7] += -xyw * mx;

    // 2nd row: xy yy y 0 0 0 -xy*mx -yy*mx
    matrix_ptr += 8;
    matrix_ptr[0] += xyw;
    matrix_ptr[1] += yyw;
    matrix_ptr[2] += yw;
    matrix_ptr[6] += -xyw * mx;
    matrix_ptr[7] += -yyw * mx;

    // 3rd row: x y 1 0 0 0 -x*mx -y*mx
    matrix_ptr += 8;
    matrix_ptr[0] += xw;
    matrix_ptr[1] += yw;
    matrix_ptr[2] += w;
    matrix_ptr[6] += -xw * mx;
    matrix_ptr[7] += -yw * mx;

    // 4th row: 0 0 0 xx xy x -xx*my -xy*my
    matrix_ptr += 8;
    matrix_ptr[3] += xxw;
    matrix_ptr[4] += xyw;
    matrix_ptr[5] += xw;
    matrix_ptr[6] += -xxw * my;
    matrix_ptr[7] += -xyw * my;

    // 5th row: 0 0 0 xy yy y -xy*my -yy*my
    matrix_ptr += 8;
    matrix_ptr[3] += xyw;
    matrix_ptr[4] += yyw;
    matrix_ptr[5] += yw;
    matrix_ptr[6] += -xyw * my;
    matrix_ptr[7] += -yyw * my;

    // 6th row: 0 0 0 x y 1 -x*my -y*my
    matrix_ptr += 8;
    matrix_ptr[3] += xw;
    matrix_ptr[4] += yw;
    matrix_ptr[5] += w;
    matrix_ptr[6] += -xw * my;
    matrix_ptr[7] += -yw * my;

    // 7th row: -xx*mx -xy*mx -x*mx -xx*my -xy*my -x*my xx*mxxyy xy*mxxyy
    matrix_ptr += 8;
    matrix_ptr[0] += -xxw * mx;
    matrix_ptr[1] += -xyw * mx;
    matrix_ptr[2] += -xw * mx;
    matrix_ptr[3] += -xxw * my;
    matrix_ptr[4] += -xyw * my;
    matrix_ptr[5] += -xw * my;
    matrix_ptr[6] += xxw * mxxyy;
    matrix_ptr[7] += xyw * mxxyy;

    // 8th row: -xy*mx -yy*mx -y*mx -xy*my -yy*my -y*my xy*mxxyy yy*mxxyy
    matrix_ptr += 8;
    matrix_ptr[0] += -xyw * mx;
    matrix_ptr[1] += -yyw * mx;
    matrix_ptr[2] += -yw * mx;
    matrix_ptr[3] += -xyw * my;
    matrix_ptr[4] += -yyw * my;
    matrix_ptr[5] += -yw * my;
    matrix_ptr[6] += xyw * mxxyy;
    matrix_ptr[7] += yyw * mxxyy;

    // Right hand side:
    // b = ( m_x
    //       m_y )
    // Compute J^t * b * w =
    // ( x*mx  y*mx  mx  x*my  y*my  my  -x*mxxyy  -y*mxxyy ) * w

    float* rhs_ptr = rhs.ptr<float>(0);
    rhs_ptr[0] += xw * mx;
    rhs_ptr[1] += yw * mx;
    rhs_ptr[2] += mx * w;
    rhs_ptr[3] += xw * my;
    rhs_ptr[4] += yw * my;
    rhs_ptr[5] += my * w;
    rhs_ptr[6] += -xw * mxxyy;
    rhs_ptr[7] += -yw * mxxyy;
  }

  // Solution parameters p.
  if (cv::solve(matrix, rhs, solution)) {
    const float* ptr = solution.ptr<float>(0);
    *model = HomographyAdapter::FromFloatPointer(ptr, false);
    return true;
  }

  return false;
}

void TransformQuadInMotionBoxState(const MotionBoxState& curr_pos,
                                   const Homography& homography,
                                   MotionBoxState* next_pos) {
  CHECK(next_pos != nullptr);
  if (!curr_pos.has_pos_x() || !curr_pos.has_pos_y() || !curr_pos.has_width() ||
      !curr_pos.has_height()) {
    LOG(ERROR) << "Previous box does not exist, cannot transform!";
    return;
  }
  const int kQuadVerticesSize = 8;
  const MotionBoxState::Quad* curr_quad_ptr = nullptr;
  auto quad = absl::make_unique<MotionBoxState::Quad>();
  if (curr_pos.has_quad() &&
      curr_pos.quad().vertices_size() == kQuadVerticesSize) {
    curr_quad_ptr = &curr_pos.quad();
  } else {
    std::array<Vector2_f, 4> corners =
        GetCornersOfRotatedRect(curr_pos, Vector2_f(1.0f, 1.0f));
    for (const auto& vertex : corners) {
      quad->add_vertices(vertex.x());
      quad->add_vertices(vertex.y());
    }
    curr_quad_ptr = quad.get();
  }

  MotionBoxState::Quad* next_pos_quad = next_pos->mutable_quad();
  bool next_pos_quad_existed = true;
  if (next_pos_quad->vertices_size() != kQuadVerticesSize) {
    next_pos_quad_existed = false;
    next_pos_quad->clear_vertices();
  }
  for (int i = 0; i < kQuadVerticesSize / 2; ++i) {
    const auto& curr_pos_quad_vertex = Vector2_f(
        curr_quad_ptr->vertices(i * 2), curr_quad_ptr->vertices(i * 2 + 1));
    const auto& next_pos_quad_vertex_diff =
        HomographyAdapter::TransformPoint(homography, curr_pos_quad_vertex) -
        curr_pos_quad_vertex;
    if (next_pos_quad_existed) {
      next_pos_quad->set_vertices(i * 2, next_pos_quad->vertices(i * 2) +
                                             next_pos_quad_vertex_diff.x());
      next_pos_quad->set_vertices(
          i * 2 + 1,
          next_pos_quad->vertices(i * 2 + 1) + next_pos_quad_vertex_diff.y());
    } else {
      next_pos_quad->add_vertices(curr_pos_quad_vertex.x() +
                                  next_pos_quad_vertex_diff.x());
      next_pos_quad->add_vertices(curr_pos_quad_vertex.y() +
                                  next_pos_quad_vertex_diff.y());
    }
  }
}

void UpdateStatePositionAndSizeFromStateQuad(MotionBoxState* box_state) {
  Vector2_f top_left, bottom_right;
  MotionBoxBoundingBox(*box_state, &top_left, &bottom_right);
  box_state->set_width(bottom_right.x() - top_left.x());
  box_state->set_height(bottom_right.y() - top_left.y());
  box_state->set_pos_x(top_left.x());
  box_state->set_pos_y(top_left.y());
}

void ApplyCameraTrackingDegrees(const MotionBoxState& prev_state,
                                const Homography& background_model,
                                const TrackStepOptions& options,
                                const Vector2_f& domain,
                                MotionBoxState* next_state) {
  // Determine center translation.
  Vector2_f center(MotionBoxCenter(prev_state));
  const Vector2_f background_motion =
      HomographyAdapter::TransformPoint(background_model, center) - center;

  if (options.tracking_degrees() ==
          TrackStepOptions::TRACKING_DEGREE_TRANSLATION ||
      !options.track_object_and_camera()) {
    SetMotionBoxPosition(MotionBoxPosition(*next_state) + background_motion,
                         next_state);
    return;
  }

  // Transform corners and fit similarity.
  // Overall idea:
  // We take the corners x0, x1, x2, x3 of the rect in the previous location
  // and transform them by the background_model H.
  // Assuming H = [A | t], their target location in the next frame is:
  //   xi' = A * xi + t   for i = 0..3
  // We want to express the location of xi' w.r.t. the translated center,
  // to decouple H from the translation of the center.
  // In particular, we are looking for the translation of the center:
  //   c* = c + t*   and points
  //   xi* = xi + t*
  // Express location of xi' w.r.t. c:
  //   xi' = A(xi* - c*) + c*
  //   Axi + t = A(xi - c) + c + t*
  //   Axi + t = Axi - Ac + c + t*
  //   t* = Ac - c + t
  std::array<Vector2_f, 4> corners = MotionBoxCorners(prev_state);
  std::vector<MotionVector> corner_vecs(4);
  std::vector<const MotionVector*> corner_vec_ptrs(4);

  for (int k = 0; k < 4; ++k) {
    MotionVector v;
    v.pos = corners[k];
    v.object = HomographyAdapter::TransformPoint(background_model, corners[k]) -
               corners[k];
    corner_vecs[k] = v;
    corner_vec_ptrs[k] = &corner_vecs[k];
  }

  LinearSimilarityModel linear_similarity;
  LinearSimilarityL2Solve(corner_vec_ptrs, std::vector<float>(4, 1.0f),
                          &linear_similarity);

  SimilarityModel similarity =
      LinearSimilarityAdapter::ToSimilarity(linear_similarity);

  // See above derivation; the motion of the center is t* = Ac + t - c.
  // One could argue that using background_model instead of linear_similarity
  // would be more accurate here, since linear_similarity is only fitted to it
  // above.
  SetMotionBoxPosition(MotionBoxPosition(*next_state) +
                           TransformPoint(linear_similarity, center) - center,
                       next_state);

  switch (options.tracking_degrees()) {
    case TrackStepOptions::TRACKING_DEGREE_TRANSLATION:
      break;
    case TrackStepOptions::TRACKING_DEGREE_CAMERA_SCALE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE:
      next_state->set_scale(next_state->scale() * similarity.scale());
      break;
    case TrackStepOptions::TRACKING_DEGREE_CAMERA_ROTATION:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION:
      next_state->set_rotation(next_state->rotation() + similarity.rotation());
      break;
    case TrackStepOptions::TRACKING_DEGREE_CAMERA_ROTATION_SCALE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE:
      next_state->set_scale(next_state->scale() * similarity.scale());
      next_state->set_rotation(next_state->rotation() + similarity.rotation());
      break;
    case TrackStepOptions::TRACKING_DEGREE_CAMERA_PERSPECTIVE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE:
      TransformQuadInMotionBoxState(prev_state, background_model, next_state);
      if (prev_state.has_pnp_homography()) {
        *(next_state->mutable_pnp_homography()) = HomographyAdapter::Compose(
            prev_state.pnp_homography(), background_model);
        UpdateStatePositionAndSizeFromStateQuad(next_state);
      }
      break;
  }
}

void ApplyObjectMotion(const MotionBoxState& curr_pos,
                       const Vector2_f& object_translation,
                       const LinearSimilarityModel& object_similarity,
                       const Homography& object_homography,
                       const TrackStepOptions& options,
                       MotionBoxState* next_pos) {
  switch (options.tracking_degrees()) {
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE: {
      Vector2_f center(MotionBoxCenter(curr_pos));
      // See ApplyCameraTrackingDegrees for derivation.
      SetMotionBoxPosition(MotionBoxPosition(*next_pos) +
                               TransformPoint(object_similarity, center) -
                               center,
                           next_pos);
      SimilarityModel similarity =
          LinearSimilarityAdapter::ToSimilarity(object_similarity);
      if (options.tracking_degrees() !=
          TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION) {
        next_pos->set_scale(next_pos->scale() * similarity.scale());
      }
      if (options.tracking_degrees() !=
          TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE) {
        next_pos->set_rotation(next_pos->rotation() + similarity.rotation());
      }
      break;
    }

    case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE: {
      Vector2_f center(MotionBoxCenter(curr_pos));
      SetMotionBoxPosition(
          MotionBoxPosition(*next_pos) +
              HomographyAdapter::TransformPoint(object_homography, center) -
              center,
          next_pos);
      TransformQuadInMotionBoxState(curr_pos, object_homography, next_pos);
      break;
    }
    default:
      // Use translation per default.
      SetMotionBoxPosition(MotionBoxPosition(*next_pos) + object_translation,
                           next_pos);
  }
}

bool IsBoxValid(const MotionBoxState& state) {
  // Relative to the normalized [0, 1] space.
  const float kMaxBoxHeight = 10000.0f;
  const float kMaxBoxWidth = 10000.0f;
  if (state.width() > kMaxBoxWidth) {
    LOG(ERROR) << "box width " << state.width() << " too big";
    return false;
  }
  if (state.height() > kMaxBoxHeight) {
    LOG(ERROR) << "box height " << state.height() << " too big";
    return false;
  }

  return true;
}

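// Background for the function below (standard projective geometry, stated for
// context): for points on the Z = 0 plane, the projection [R | t] * X reduces
// to a homography whose columns are the first two columns of R plus the
// translation t. PnpHomographyFromRotationAndTranslation builds exactly that
// matrix: cv::Rodrigues converts rvec into R, then the third column is
// overwritten with tvec, yielding H = [r1 r2 t] (camera intrinsics are
// handled separately, e.g. via cv::undistortPoints).
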
Homography PnpHomographyFromRotationAndTranslation(const cv::Mat& rvec,
                                                   const cv::Mat& tvec) {
  cv::Mat homography_matrix(3, 3, CV_64F);
  cv::Rodrigues(rvec, homography_matrix);

  for (int c = 0; c < 3; ++c) {
    homography_matrix.at<double>(c, 2) = tvec.at<double>(c);
  }

  // Normalize so that the bottom-right element is 1 (assumed to be non-zero).
  homography_matrix /= homography_matrix.at<double>(2, 2);

  return HomographyAdapter::FromDoublePointer(homography_matrix.ptr<double>(0),
                                              false);
}

// Translate CameraIntrinsics proto to cv format.
void ConvertCameraIntrinsicsToCvMat(
    const TrackStepOptions::CameraIntrinsics& camera_intrinsics,
    cv::Mat* camera_mat, cv::Mat* dist_coef) {
  *camera_mat = cv::Mat::eye(3, 3, CV_64F);
  *dist_coef = cv::Mat::zeros(1, 5, CV_64FC1);
  camera_mat->at<double>(0, 0) = camera_intrinsics.fx();
  camera_mat->at<double>(1, 1) = camera_intrinsics.fy();
  camera_mat->at<double>(0, 2) = camera_intrinsics.cx();
  camera_mat->at<double>(1, 2) = camera_intrinsics.cy();
  // OpenCV orders distortion coefficients as (k1, k2, p1, p2, k3); the three
  // radial terms k0, k1, k2 therefore map to indices 0, 1 and 4.
  dist_coef->at<double>(0) = camera_intrinsics.k0();
  dist_coef->at<double>(1) = camera_intrinsics.k1();
  dist_coef->at<double>(4) = camera_intrinsics.k2();
}

}  // namespace.

void ScaleFromAspect(float aspect, bool invert, float* scale_x,
                     float* scale_y) {
  *scale_x = aspect >= 1.0f ? 1.0f : aspect;
  *scale_y = aspect >= 1.0f ? 1.0f / aspect : 1.0f;
  if (invert) {
    *scale_x = 1.0f / *scale_x;
    *scale_y = 1.0f / *scale_y;
  }
}

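// Worked example for ScaleFromAspect (illustrative): for a 16:9 landscape
// frame, aspect ~= 1.78 >= 1, so scale_x = 1.0 and scale_y = 1 / 1.78 ~=
// 0.5625; multiplying normalized [0, 1] x [0, 1] coordinates by these scales
// maps them into the aspect-preserving domain [0, 1] x [0, 0.5625]. Passing
// invert = true returns the reciprocal scales, undoing that mapping.
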
std::array<Vector2_f, 4> MotionBoxCorners(const MotionBoxState& state,
                                          const Vector2_f& scaling) {
  std::array<Vector2_f, 4> transformed_corners;
  if (state.has_quad() && state.quad().vertices_size() == 8) {
    for (int k = 0; k < 4; ++k) {
      transformed_corners[k] = Vector2_f(state.quad().vertices(2 * k),
                                         state.quad().vertices(2 * k + 1))
                                   .MulComponents(scaling);
    }
  } else {
    transformed_corners = GetCornersOfRotatedRect(state, scaling);
  }

  return transformed_corners;
}

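// MotionBoxLines below encodes each box edge as a homogeneous line
// (n_x, n_y, -n . corner), so that line.DotProd(Vector3_f(x, y, 1)) is the
// signed distance of point (x, y) from that edge. With the corner ordering
// produced by MotionBoxCorners the normals point outward: points inside the
// box yield non-positive values for all four lines, which is the convention
// the inclusion tests elsewhere in this file rely on.
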
bool MotionBoxLines(const MotionBoxState& state, const Vector2_f& scaling,
                    std::array<Vector3_f, 4>* box_lines) {
  CHECK(box_lines);
  std::array<Vector2_f, 4> corners = MotionBoxCorners(state, scaling);
  for (int k = 0; k < 4; ++k) {
    const Vector2_f diff = corners[(k + 1) % 4] - corners[k];
    const Vector2_f normal = diff.Ortho().Normalize();
    box_lines->at(k).Set(normal.x(), normal.y(), -normal.DotProd(corners[k]));
    // Double check that the second point is on the computed line.
    if (box_lines->at(k).DotProd(Vector3_f(corners[(k + 1) % 4].x(),
                                           corners[(k + 1) % 4].y(), 1.0f)) >=
        0.02f) {
      LOG(ERROR) << "box is abnormal. Line equations don't satisfy constraint";
      return false;
    }
  }
  return true;
}

void MotionBoxBoundingBox(const MotionBoxState& state, Vector2_f* top_left,
                          Vector2_f* bottom_right) {
  CHECK(top_left);
  CHECK(bottom_right);

  std::array<Vector2_f, 4> corners = MotionBoxCorners(state);

  // Determine min and max across dimensions.
  *top_left = Vector2_f(std::numeric_limits<float>::max(),
                        std::numeric_limits<float>::max());

  // Initialize with -max() (the lowest float) rather than min(), so the
  // running maximum below is also correct for negative coordinates.
  *bottom_right = Vector2_f(-std::numeric_limits<float>::max(),
                            -std::numeric_limits<float>::max());

  for (const Vector2_f& c : corners) {
    top_left->x(std::min(top_left->x(), c.x()));
    top_left->y(std::min(top_left->y(), c.y()));
    bottom_right->x(std::max(bottom_right->x(), c.x()));
    bottom_right->y(std::max(bottom_right->y(), c.y()));
  }
}

void MotionBoxInlierLocations(const MotionBoxState& state,
                              std::vector<Vector2_f>* inlier_pos) {
  CHECK(inlier_pos);
  inlier_pos->clear();
  for (int k = 0; k < state.inlier_id_match_pos_size(); k += 2) {
    inlier_pos->push_back(
        Vector2_f(state.inlier_id_match_pos(k) * kInvShortScale,
                  state.inlier_id_match_pos(k + 1) * kInvShortScale));
  }
}

void MotionBoxOutlierLocations(const MotionBoxState& state,
                               std::vector<Vector2_f>* outlier_pos) {
  CHECK(outlier_pos);
  outlier_pos->clear();
  for (int k = 0; k < state.outlier_id_match_pos_size(); k += 2) {
    outlier_pos->push_back(
        Vector2_f(state.outlier_id_match_pos(k) * kInvShortScale,
                  state.outlier_id_match_pos(k + 1) * kInvShortScale));
  }
}

std::array<Vector2_f, 4> GetCornersOfRotatedRect(const MotionBoxState& state,
                                                 const Vector2_f& scaling) {
  std::array<Vector2_f, 4> transformed_corners;
  // Scale and rotate the 4 corners w.r.t. the center.
  const Vector2_f center = MotionBoxCenter(state).MulComponents(scaling);
  const Vector2_f top_left = MotionBoxPosition(state).MulComponents(scaling);
  const std::array<Vector2_f, 4> corners{{
      top_left,
      top_left + Vector2_f(0, state.height() * scaling.y()),
      top_left +
          Vector2_f(state.width() * scaling.x(), state.height() * scaling.y()),
      top_left + Vector2_f(state.width() * scaling.x(), 0),
  }};

  const float cos_a = std::cos(state.rotation());
  const float sin_a = std::sin(state.rotation());
  for (int k = 0; k < 4; ++k) {
    // Scale and rotate w.r.t. center.
    const Vector2_f rad = corners[k] - center;
    const Vector2_f rot_rad(cos_a * rad.x() - sin_a * rad.y(),
                            sin_a * rad.x() + cos_a * rad.y());
    const Vector2_f transformed_corner = center + rot_rad * state.scale();
    transformed_corners[k] = transformed_corner;
  }

  return transformed_corners;
}

void InitializeQuadInMotionBoxState(MotionBoxState* state) {
  CHECK(state != nullptr);
  // Every quad has 4 vertices, and each vertex has 2 coordinates (x and y),
  // for a total of 8 floating point values.
  if (state->quad().vertices_size() != 8) {
    MotionBoxState::Quad* quad_ptr = state->mutable_quad();
    quad_ptr->clear_vertices();
    std::array<Vector2_f, 4> corners =
        GetCornersOfRotatedRect(*state, Vector2_f(1.0f, 1.0f));
    for (const auto& vertex : corners) {
      quad_ptr->add_vertices(vertex.x());
      quad_ptr->add_vertices(vertex.y());
    }
  }
}

void InitializeInliersOutliersInMotionBoxState(const TrackingData& tracking,
                                               MotionBoxState* state) {
  MotionVectorFrame mvf;  // Holds motion from current to previous frame.
  MotionVectorFrameFromTrackingData(tracking, &mvf);

  std::array<Vector3_f, 4> box_lines;
  if (!MotionBoxLines(*state, Vector2_f(1.0f, 1.0f), &box_lines)) {
    LOG(ERROR) << "Error in computing MotionBoxLines.";
    return;
  }

  // Scale for motion vectors.
  float scale_x, scale_y;
  ScaleFromAspect(mvf.aspect_ratio, true, &scale_x, &scale_y);

  state->clear_inlier_ids();
  state->clear_inlier_length();
  state->clear_outlier_ids();

  float inlier_center_x = 0.0f;
  float inlier_center_y = 0.0f;
  int cnt_inlier = 0;

  float min_x = std::numeric_limits<float>::max();
  float max_x = -std::numeric_limits<float>::max();
  float min_y = std::numeric_limits<float>::max();
  float max_y = -std::numeric_limits<float>::max();

  for (const auto& motion_vec : mvf.motion_vectors) {
    const float pos_x = motion_vec.pos.x() * scale_x;
    const float pos_y = motion_vec.pos.y() * scale_y;

    bool insider = true;

    for (const Vector3_f& line : box_lines) {
      if (line.DotProd(Vector3_f(pos_x, pos_y, 1.0f)) > 0.0f) {
        insider = false;
        break;
      }
    }

    if (insider) {
      ++cnt_inlier;
      inlier_center_x += pos_x;
      inlier_center_y += pos_y;

      min_x = std::min(min_x, pos_x);
      max_x = std::max(max_x, pos_x);
      min_y = std::min(min_y, pos_y);
      max_y = std::max(max_y, pos_y);

      state->add_inlier_ids(motion_vec.track_id);
      state->add_inlier_length(1.0f);
    } else {
      state->add_outlier_ids(motion_vec.track_id);
    }
  }

  if (cnt_inlier) {
    state->set_prior_weight(1.0f);
    state->set_inlier_center_x(inlier_center_x / cnt_inlier);
    state->set_inlier_center_y(inlier_center_y / cnt_inlier);
    state->set_inlier_width(max_x - min_x);
    state->set_inlier_height(max_y - min_y);
  }
}

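// The aspect-ratio recovery in the function below appears to follow the
// standard projective-depth construction for a planar rectangle (as used,
// e.g., in whiteboard rectification): it solves for per-corner depths s[i]
// (normalized so that opposite corners satisfy s[0] + s[2] = s[1] + s[3] = 2),
// back-projects the corners to 3D, and measures the width/height ratio of the
// reconstructed rectangle, rejecting degenerate or overly skewed solutions.
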
void InitializePnpHomographyInMotionBoxState(
    const TrackingData& tracking, const TrackStepOptions& track_step_options,
    MotionBoxState* state) {
  // Only happens when both `quad` and `aspect_ratio` are specified.
  if (!state->has_quad()) {
    VLOG(1) << "Skip pnp tracking since box does not contain quad info.";
    return;
  }

  const int kQuadCornersSize = 4;
  CHECK_EQ(state->quad().vertices_size(), kQuadCornersSize * 2);
  float scale_x, scale_y;
  ScaleFromAspect(tracking.frame_aspect(), false, &scale_x, &scale_y);
  std::vector<cv::Point2f> corners_2d(kQuadCornersSize);
  if (track_step_options.has_camera_intrinsics()) {
    const auto& camera = track_step_options.camera_intrinsics();
    for (int c = 0; c < kQuadCornersSize; ++c) {
      corners_2d[c].x = state->quad().vertices(c * 2) * camera.w();
      corners_2d[c].y = state->quad().vertices(c * 2 + 1) * camera.h();
    }

    cv::Mat camera_mat, dist_coef;
    ConvertCameraIntrinsicsToCvMat(camera, &camera_mat, &dist_coef);
    cv::undistortPoints(corners_2d, corners_2d, camera_mat, dist_coef);
  } else {
    const float center_x = scale_x * 0.5f;
    const float center_y = scale_y * 0.5f;
    for (int c = 0; c < kQuadCornersSize; ++c) {
      corners_2d[c].x = state->quad().vertices(c * 2) * scale_x - center_x;
      corners_2d[c].y = state->quad().vertices(c * 2 + 1) * scale_y - center_y;
    }
  }

  if (!state->has_aspect_ratio()) {
    if (!track_step_options.forced_pnp_tracking()) {
      VLOG(1) << "Skip pnp tracking since aspect ratio is unknown and "
                 "estimation of it is not forced.";
      return;
    }
    const float u2_u0 = corners_2d[2].x - corners_2d[0].x;
    const float v2_v0 = corners_2d[2].y - corners_2d[0].y;
    const float u3_u1 = corners_2d[3].x - corners_2d[1].x;
    const float v3_v1 = corners_2d[3].y - corners_2d[1].y;

    constexpr float kEpsilon = 1e-6f;
    const float denominator = u2_u0 * v3_v1 - v2_v0 * u3_u1;
    if (std::abs(denominator) < kEpsilon) {
      LOG(WARNING) << "Zero denominator. Failed calculating aspect ratio.";
      return;
    }

    float s[kQuadCornersSize];
    s[0] = ((corners_2d[2].x - corners_2d[3].x) * v3_v1 -
            (corners_2d[2].y - corners_2d[3].y) * u3_u1) *
           2.0f / denominator;
    s[1] = -(u2_u0 * (corners_2d[2].y - corners_2d[3].y) -
             v2_v0 * (corners_2d[2].x - corners_2d[3].x)) *
           2.0f / denominator;
    s[2] = 2.0f - s[0];
    s[3] = 2.0f - s[1];

    std::vector<Vector3_f> corners(kQuadCornersSize);
    for (int i = 0; i < kQuadCornersSize; ++i) {
      if (s[i] <= 0) {
        LOG(WARNING) << "Negative scale. Failed calculating aspect ratio.";
        return;
      }
      corners[i] =
          Vector3_f(corners_2d[i].x * s[i], corners_2d[i].y * s[i], s[i]);
    }

    const Vector3_f width_edge = corners[2] - corners[1];
    const Vector3_f height_edge = corners[0] - corners[1];
    const float height_norm = height_edge.Norm();
    const float width_norm = width_edge.Norm();
    if (height_norm < kEpsilon || width_norm < kEpsilon) {
      LOG(WARNING)
          << "Abnormal 3D quadrangle. Failed calculating aspect ratio.";
      return;
    }

    constexpr float kMaxCosAngle = 0.258819f;  // cos(75 deg).
    if (width_edge.DotProd(height_edge) / height_norm / width_norm >
        kMaxCosAngle) {
      LOG(WARNING)
          << "Abnormal 3D quadrangle. Failed calculating aspect ratio.";
      return;
    }

    state->set_aspect_ratio(width_norm / height_norm);
  }

  CHECK_GT(state->aspect_ratio(), 0.0f);

  const float half_width = state->aspect_ratio();
  const float half_height = 1.0f;
  const std::vector<cv::Point3f> corners_3d{
      cv::Point3f(-half_width, -half_height, 0.0f),
      cv::Point3f(-half_width, half_height, 0.0f),
      cv::Point3f(half_width, half_height, 0.0f),
      cv::Point3f(half_width, -half_height, 0.0f),
  };

  std::vector<MotionVector> motion_vectors(kQuadCornersSize);
  std::vector<const MotionVector*> motion_vector_pointers(kQuadCornersSize);

  for (int c = 0; c < kQuadCornersSize; ++c) {
    motion_vectors[c].pos = Vector2_f(corners_3d[c].x, corners_3d[c].y);
    motion_vectors[c].object =
        Vector2_f(corners_2d[c].x, corners_2d[c].y) - motion_vectors[c].pos;

    motion_vector_pointers[c] = &motion_vectors[c];
  }

  const std::vector<float> weights(kQuadCornersSize, 1.0f);
  HomographyL2Solve(motion_vector_pointers, weights,
                    state->mutable_pnp_homography());
}

// Scales velocity and all other velocity-dependent fields according to
// temporal_scale.
void ScaleStateTemporally(float temporal_scale, MotionBoxState* state) {
  state->set_dx(state->dx() * temporal_scale);
  state->set_dy(state->dy() * temporal_scale);
  state->set_kinetic_energy(state->kinetic_energy() * temporal_scale);
}

void ScaleStateAspect(float aspect, bool invert, MotionBoxState* state) {
  float scale_x = 1.0f;
  float scale_y = 1.0f;
  ScaleFromAspect(aspect, invert, &scale_x, &scale_y);

  if (state->has_quad() && state->quad().vertices_size() == 8) {
    for (int i = 0; i < 4; ++i) {
      float curr_val = state->quad().vertices(i * 2);
      state->mutable_quad()->set_vertices(i * 2, curr_val * scale_x);
      curr_val = state->quad().vertices(i * 2 + 1);
      state->mutable_quad()->set_vertices(i * 2 + 1, curr_val * scale_y);
    }
  }

  state->set_pos_x(state->pos_x() * scale_x);
  state->set_pos_y(state->pos_y() * scale_y);
  state->set_width(state->width() * scale_x);
  state->set_height(state->height() * scale_y);
  state->set_dx(state->dx() * scale_x);
  state->set_dy(state->dy() * scale_y);
  state->set_inlier_center_x(state->inlier_center_x() * scale_x);
  state->set_inlier_center_y(state->inlier_center_y() * scale_y);
  state->set_inlier_width(state->inlier_width() * scale_x);
  state->set_inlier_height(state->inlier_height() * scale_y);
}

MotionVector MotionVector::FromInternalState(
    const MotionBoxInternalState& internal, int index) {
  CHECK_LT(index, internal.pos_x_size());
  MotionVector v;
  v.pos = Vector2_f(internal.pos_x(index), internal.pos_y(index));
  v.object = Vector2_f(internal.dx(index), internal.dy(index));
  v.background =
      Vector2_f(internal.camera_dx(index), internal.camera_dy(index));
  v.track_id = internal.track_id(index);
  return v;
}

void MotionBox::ResetAtFrame(int frame, const MotionBoxState& state) {
  states_.clear();
  queue_start_ = frame;

  states_.push_back(state);
  states_.back().set_track_status(MotionBoxState::BOX_TRACKED);
  // Init inlier dimensions from state if not set.
  if (states_.back().inlier_width() == 0 ||
      states_.back().inlier_height() == 0) {
    states_.back().set_inlier_width(state.width());
    states_.back().set_inlier_height(state.height());
  }

  initial_state_ = state;
}

bool MotionBox::TrackStep(int from_frame,
                          const MotionVectorFrame& motion_vectors,
                          bool forward) {
  if (!TrackableFromFrame(from_frame)) {
    LOG(WARNING) << "Tracking requested for initial position that is not "
                 << "trackable.";
    return false;
  }
  const int queue_pos = from_frame - queue_start_;

  MotionBoxState new_state;
  if (motion_vectors.is_duplicated) {
    // Do not track or update the state, just copy.
    new_state = states_[queue_pos];
    new_state.set_track_status(MotionBoxState::BOX_DUPLICATED);
  } else {
    // Compile history and perform tracking.
    std::vector<const MotionBoxState*> history;
    constexpr int kHistorySize = 10;
    if (forward) {
      for (int k = queue_pos - 1; k >= std::max(0, queue_pos - kHistorySize);
           --k) {
        history.push_back(&states_[k]);
      }
    } else {
      for (int k = queue_pos + 1;
           k <= std::min<int>(states_.size() - 1, queue_pos + kHistorySize);
           ++k) {
        history.push_back(&states_[k]);
      }
    }

    TrackStepImpl(from_frame, states_[queue_pos], motion_vectors, history,
                  &new_state);
  }

  if (new_state.track_status() < MotionBoxState::BOX_TRACKED) {
    new_state.set_tracking_confidence(0.0f);
  }
  if (!new_state.has_tracking_confidence()) {
    // In this case, track status should be >= MotionBoxState::BOX_TRACKED.
    new_state.set_tracking_confidence(1.0f);
  }

  VLOG(1) << "Track status from frame " << from_frame << ": "
          << TrackStatusToString(new_state.track_status())
          << ". Has quad: " << new_state.has_quad();

  constexpr float kFailureDisparity = 0.8f;
  if (new_state.track_status() >= MotionBoxState::BOX_TRACKED) {
    if (forward) {
      const int new_pos = queue_pos + 1;
      if (new_pos < states_.size()) {
        states_[new_pos] = new_state;
      } else {
        states_.push_back(new_state);
      }

      // Check for successive tracking failures of in-bound boxes.
      if (new_pos >= options_.max_track_failures()) {
        int num_track_errors = 0;
        // Cancel at the N + 1 tracking failure.
        for (int f = new_pos - options_.max_track_failures(); f <= new_pos;
             ++f) {
          if (states_[f].track_status() !=
              MotionBoxState::BOX_TRACKED_OUT_OF_BOUND) {
            num_track_errors += (fabs(states_[f].motion_disparity()) *
                                     states_[f].prior_weight() >
                                 kFailureDisparity);
          }
        }

        if (num_track_errors >= options_.max_track_failures()) {
          LOG_IF(INFO, print_motion_box_warnings_)
              << "Tracking failed during max track failure "
              << "verification.";
          states_[new_pos].set_track_status(MotionBoxState::BOX_UNTRACKED);
          return false;
        }
      }
    } else {
      // Backward tracking.
      int new_pos = queue_pos - 1;
      if (new_pos >= 0) {
        states_[new_pos] = new_state;
      } else {
        states_.push_front(new_state);
        --queue_start_;
        new_pos = 0;
      }

      // Check for successive tracking failures.
      if (new_pos + options_.max_track_failures() + 1 < states_.size()) {
        int num_track_errors = 0;
        // Cancel at the N + 1 tracking failure.
        for (int f = new_pos; f <= new_pos + options_.max_track_failures();
             ++f) {
          if (states_[f].track_status() !=
              MotionBoxState::BOX_TRACKED_OUT_OF_BOUND) {
            num_track_errors += (fabs(states_[f].motion_disparity()) *
                                     states_[f].prior_weight() >
                                 kFailureDisparity);
          }
        }

        if (num_track_errors >= options_.max_track_failures()) {
          LOG_IF(INFO, print_motion_box_warnings_)
              << "Tracking failed during max track failure "
              << "verification.";
          states_[new_pos].set_track_status(MotionBoxState::BOX_UNTRACKED);
          return false;
        }
      }
    }

    // Signal track success.
    return true;
  } else {
    LOG_IF(WARNING, print_motion_box_warnings_)
        << "Tracking error at " << from_frame
        << " status : " << TrackStatusToString(new_state.track_status());
    return false;
  }
}

namespace {

Vector2_f SpatialPriorPosition(const Vector2_f& location,
                               const MotionBoxState& state) {
  const int grid_size = state.spatial_prior_grid_size();
  return Vector2_f(
      Clamp((location.x() - state.pos_x()) / state.width(), 0, 1) *
          (grid_size - 1),
      Clamp((location.y() - state.pos_y()) / state.height(), 0, 1) *
          (grid_size - 1));
}

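// Worked example for SpatialPriorPosition (illustrative): with
// spatial_prior_grid_size() == 10, a location at the box center maps to grid
// coordinates (4.5, 4.5) and the box corners map to (0, 0) and (9, 9); i.e.
// box-relative coordinates are clamped to [0, 1] and stretched across the
// grid_size - 1 cell intervals.
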
// Creates a spatial prior for the current set of inlier vectors and blends it
// with the previous prior (based on blend_prior). If interpolate is set to
// true, uses more accurate interpolation into bins instead of nearest
// neighbor interpolation. If use_next_position is set to true, the position
// in the next/previous frame is used instead of the current one.
void ComputeSpatialPrior(bool interpolate, bool use_next_position,
                         float blend_prior, MotionBoxState* update_pos) {
  const int grid_size = update_pos->spatial_prior_grid_size();

  std::vector<float> old_prior(update_pos->spatial_prior().begin(),
                               update_pos->spatial_prior().end());
  std::vector<float> old_confidence(update_pos->spatial_confidence().begin(),
                                    update_pos->spatial_confidence().end());

  CHECK_EQ(old_confidence.size(), old_prior.size());
  CHECK(old_confidence.empty() ||
        grid_size * grid_size == old_confidence.size())
      << "Empty or priors of constant size expected";

  update_pos->clear_spatial_prior();
  update_pos->clear_spatial_confidence();
  auto* spatial_prior = update_pos->mutable_spatial_prior();
  auto* spatial_confidence = update_pos->mutable_spatial_confidence();
  spatial_prior->Reserve(grid_size * grid_size);
  spatial_confidence->Reserve(grid_size * grid_size);

  for (int k = 0; k < grid_size * grid_size; ++k) {
    spatial_prior->AddAlreadyReserved(0);
    spatial_confidence->AddAlreadyReserved(0);
  }

  // Aggregate inlier weights (0 = outlier, 1 = total inlier) across grid.
  const MotionBoxInternalState& internal = update_pos->internal();
  const int num_elems = internal.pos_x_size();

  for (int k = 0; k < num_elems; ++k) {
    MotionVector vec = MotionVector::FromInternalState(internal, k);
    const Vector2_f pos =
        use_next_position ? vec.MatchLocation() : vec.Location();
    float weight = internal.inlier_score(k);

    const Vector2_f grid_pos = SpatialPriorPosition(pos, *update_pos);

    if (use_next_position) {
      // Check for out of bound and skip.
      if (grid_pos.x() < 0 || grid_pos.y() < 0 ||
          grid_pos.x() > update_pos->spatial_prior_grid_size() - 1 ||
          grid_pos.y() > update_pos->spatial_prior_grid_size() - 1) {
        continue;
      }
    }

    if (interpolate) {
      const int int_x = static_cast<int>(grid_pos.x());
      const int int_y = static_cast<int>(grid_pos.y());

      CHECK_GE(grid_pos.x(), 0) << pos.x() << ", " << update_pos->pos_x();
      CHECK_GE(grid_pos.y(), 0);
      CHECK_LE(grid_pos.x(), grid_size - 1);
      CHECK_LE(grid_pos.y(), grid_size - 1);

      const float dx = grid_pos.x() - int_x;
      const float dy = grid_pos.y() - int_y;
      const float dx_1 = 1.0f - dx;
      const float dy_1 = 1.0f - dy;
      const int stride = static_cast<int>(dx != 0);

      int grid_bin = int_y * grid_size + int_x;

      // Bilinear interpolation. Total sum of weights across all 4 additions
      // (for prior and confidence each) is one.
      *spatial_prior->Mutable(grid_bin) += dx_1 * dy_1 * weight;
      *spatial_confidence->Mutable(grid_bin) += dx_1 * dy_1;

      *spatial_prior->Mutable(grid_bin + stride) += dx * dy_1 * weight;
      *spatial_confidence->Mutable(grid_bin + stride) += dx * dy_1;

      grid_bin += (dy != 0) * grid_size;
      *spatial_prior->Mutable(grid_bin) += dx_1 * dy * weight;
      *spatial_confidence->Mutable(grid_bin) += dx_1 * dy;

      *spatial_prior->Mutable(grid_bin + stride) += dx * dy * weight;
      *spatial_confidence->Mutable(grid_bin + stride) += dx * dy;
    } else {
      // Nearest neighbor.
      const int grid_bin = static_cast<int>(grid_pos.y() + 0.5f) * grid_size +
                           static_cast<int>(grid_pos.x() + 0.5f);
      *spatial_prior->Mutable(grid_bin) += weight;
      *spatial_confidence->Mutable(grid_bin) += 1;
    }
  }

  // Normalize, i.e. max truncation.
  float total_prior_difference = 0;
  float weight_sum = 0;
  for (int k = 0; k < grid_size * grid_size; ++k) {
    // Convert aggregated inlier weights to grid cell prior.
    // Here we consider a grid cell to be an inlier if at least 2 inliers
    // were found within that cell.
    *spatial_prior->Mutable(k) = std::min(1.0f, spatial_prior->Get(k) * 0.5f);
    *spatial_confidence->Mutable(k) =
        std::min(1.0f, spatial_confidence->Get(k) * 0.5f);

    if (!old_prior.empty()) {
      // Truncated error, consider a difference of 0.2 within normal
      // update range.
      const float difference = std::max<float>(
          0.f, fabs(update_pos->spatial_prior(k) - old_prior[k]) - 0.2f);
      // Weight error by confidence.
      total_prior_difference += difference * update_pos->spatial_confidence(k);
      weight_sum += update_pos->spatial_confidence(k);

      // Blend confidence with previous confidence.
      const float curr_confidence =
          update_pos->spatial_confidence(k) * (1.0f - blend_prior);
      const float prev_confidence = old_confidence[k] * blend_prior;

      float summed_confidence = curr_confidence + prev_confidence;
      const float denom =
          summed_confidence > 0 ? 1.0f / summed_confidence : 1.0f;

      // Update prior and confidence as weighted linear combination between
      // current and previous prior.
      *spatial_prior->Mutable(k) =
          (update_pos->spatial_prior(k) * curr_confidence +
           old_prior[k] * prev_confidence) *
          denom;

      *spatial_confidence->Mutable(k) =
          (update_pos->spatial_confidence(k) * curr_confidence +
           prev_confidence * prev_confidence) *
          denom;
    }
  }

  update_pos->set_prior_diff(std::sqrt(
      total_prior_difference * (weight_sum > 0 ? 1.0f / weight_sum : 1.0f)));
}

}  // namespace.

void MotionBox::GetStartPosition(const MotionBoxState& curr_pos,
                                 float aspect_ratio, float* expand_mag,
                                 Vector2_f* top_left,
                                 Vector2_f* bottom_right) const {
  CHECK(top_left);
  CHECK(bottom_right);
  CHECK(expand_mag);

  MotionBoxBoundingBox(curr_pos, top_left, bottom_right);

  if (curr_pos.has_pnp_homography()) {
    *expand_mag = 0.0f;
  } else {
    // Expand box by the specified minimum expansion_size. For fast moving
    // objects, we ensure that the magnitude is twice the box velocity, but
    // not more than 1/4 of the box diameter.
    *expand_mag = std::max(options_.expansion_size(),
                           std::min(MotionBoxSize(curr_pos).Norm() * 0.25f,
                                    MotionBoxVelocity(curr_pos).Norm() * 2.0f));
  }

  // The expansion magnitude is deliberately not scaled non-uniformly w.r.t.
  // the aspect ratio, so the inclusion test in GetVectorsAndWeights can assume
  // a uniform expansion magnitude.
  const Vector2_f expand = Vector2_f(*expand_mag, *expand_mag);
  *top_left -= expand;
  *bottom_right += expand;
}

void MotionBox::GetSpatialGaussWeights(const MotionBoxState& box_state,
                                       const Vector2_f& inv_box_domain,
                                       float* spatial_gauss_x,
                                       float* spatial_gauss_y) const {
  CHECK(spatial_gauss_x);
  CHECK(spatial_gauss_y);

  // Space sigma depends on how much the tracked object fills the rectangle.
  // We get this information from the inlier extent of the previous
  // estimation.
  // Motivation: Choose sigma s such that the inlier domain equals 90%
  // coverage, i.e. using a one-sided z-score of 95% = 1.65:
  //   s * 1.65 = domain
  //   ==> s = domain / 1.65f
  const float space_sigma_x = std::max(
      options_.spatial_sigma(), box_state.inlier_width() * inv_box_domain.x() *
                                    0.5f * box_state.prior_weight() / 1.65f);
  const float space_sigma_y = std::max(
      options_.spatial_sigma(), box_state.inlier_height() *
                                    inv_box_domain.y() * 0.5f *
                                    box_state.prior_weight() / 1.65f);

  *spatial_gauss_x = -0.5f / (space_sigma_x * space_sigma_x);
  *spatial_gauss_y = -0.5f / (space_sigma_y * space_sigma_y);
}

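// With the factors above, the distance weighting elsewhere in this file
// evaluates a separable Gaussian:
//   w = exp(-0.5 * ((dx / sigma_x)^2 + (dy / sigma_y)^2))
//     = exp(dx^2 * spatial_gauss_x + dy^2 * spatial_gauss_y),
// where (dx, dy) is the offset from the box center in box-normalized
// coordinates; spatial_gauss_{x,y} simply pre-bake the -0.5 / sigma^2 terms.
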
// Computes for each vector its 2D grid position for a grid spanning
// the domain top_left to bottom_right.
// Note: Passed vectors must lie within the domain, or the function returns
// false to signal an error.
template <int kGridSize>
bool ComputeGridPositions(const Vector2_f& top_left,
                          const Vector2_f& bottom_right,
                          const std::vector<const MotionVector*>& vectors,
                          std::vector<Vector2_f>* grid_positions) {
  CHECK(grid_positions);

  // Slightly larger domain to avoid boundary issues.
  const Vector2_f inv_grid_domain(
      (1.0f - 1e-3f) / (bottom_right.x() - top_left.x()),
      (1.0f - 1e-3f) / (bottom_right.y() - top_left.y()));

  grid_positions->clear();
  grid_positions->reserve(vectors.size());
  for (const MotionVector* vec : vectors) {
    // Get grid position. Note the grid is never rotated, but we only use it
    // for density estimation.
    const Vector2_f grid_pos =
        (vec->pos - top_left).MulComponents(inv_grid_domain) * (kGridSize - 1);
    if (grid_pos.x() < 0 || grid_pos.y() < 0 || grid_pos.x() > kGridSize ||
        grid_pos.y() > kGridSize) {
      return false;
    }

    grid_positions->push_back(grid_pos);
  }

  return true;
}

template <int kGridSize>
void AddToGrid(const Vector2_f& grid_pos, std::vector<float>* grid) {
  const float grid_x = grid_pos.x();
  const float grid_y = grid_pos.y();

  const int int_grid_x = grid_x;
  const int int_grid_y = grid_y;

  const float dx = grid_x - int_grid_x;
  const float dy = grid_y - int_grid_y;
  const float dxdy = dx * dy;
  const float dx_plus_dy = dx + dy;

  const int inc_x = dx != 0;
  const int inc_y = dy != 0;

  int bin_idx = int_grid_y * kGridSize + int_grid_x;
  // (1 - dx)(1 - dy) = 1 - (dx + dy) + dx*dy
  (*grid)[bin_idx] += 1 - dx_plus_dy + dxdy;
  // dx * (1 - dy) = dx - dxdy
  (*grid)[bin_idx + inc_x] += dx - dxdy;

  bin_idx += kGridSize * inc_y;
  // (1 - dx) * dy = dy - dxdy
  (*grid)[bin_idx] += dy - dxdy;
  (*grid)[bin_idx + inc_x] += dxdy;
}

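// Worked example for the bilinear splat above (illustrative): a sample at
// grid position (2.25, 3.5) has dx = 0.25 and dy = 0.5, so it deposits
// 0.75 * 0.5 = 0.375 into bin (2, 3), 0.25 * 0.5 = 0.125 into (3, 3),
// 0.375 into (2, 4) and 0.125 into (3, 4); the four weights always sum to 1.
// SampleFromGrid below reads the density back with the same four-tap
// interpolation.
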
template <int kGridSize>
float SampleFromGrid(const Vector2_f& grid_pos,
                     const std::vector<float>& grid) {
  const float grid_x = grid_pos.x();
  const float grid_y = grid_pos.y();

  const int int_grid_x = grid_x;
  const int int_grid_y = grid_y;

  const float dx = grid_x - int_grid_x;
  const float dy = grid_y - int_grid_y;
  const float dxdy = dx * dy;
  const float dx_plus_dy = dx + dy;
  int inc_x = dx != 0;
  int inc_y = dy != 0;

  float normalizer = 0;
  int bin_idx = int_grid_y * kGridSize + int_grid_x;

  // See above.
  normalizer += grid[bin_idx] * (1 - dx_plus_dy + dxdy);
  normalizer += grid[bin_idx + inc_x] * (dx - dxdy);

  bin_idx += kGridSize * inc_y;
  normalizer += grid[bin_idx] * (dy - dxdy);
  normalizer += grid[bin_idx + inc_x] * dxdy;

  const float inv_normalizer = normalizer > 0 ? 1.0f / normalizer : 0;
  // Density should always decrease the weight, never increase it.
  return std::min(1.0f, inv_normalizer);
}

MotionBox::DistanceWeightsComputer::DistanceWeightsComputer(
    const MotionBoxState& initial_state, const MotionBoxState& current_state,
    const TrackStepOptions& options) {
  tracking_degrees_ = options.tracking_degrees();
  const Vector2_f box_domain(current_state.width() * current_state.scale(),
                             current_state.height() * current_state.scale());
  CHECK_GT(box_domain.x(), 0.0f);
  CHECK_GT(box_domain.y(), 0.0f);
  inv_box_domain_ = Vector2_f(1.0f / box_domain.x(), 1.0f / box_domain.y());

  // Space sigma depends on how much the tracked object fills the rectangle.
  // We get this information from the inlier extent of the previous
  // estimation.
  // Motivation: Choose sigma s such that the inlier domain equals 90%
  // coverage, i.e. using a one-sided z-score of 95% = 1.65:
  //   s * 1.65 = domain
  //   ==> s = domain / 1.65f
  const float space_sigma_x =
      std::max(options.spatial_sigma(),
               current_state.inlier_width() * inv_box_domain_.x() * 0.5f *
                   current_state.prior_weight() / 1.65f);

  const float space_sigma_y =
      std::max(options.spatial_sigma(),
               current_state.inlier_height() * inv_box_domain_.y() * 0.5f *
                   current_state.prior_weight() / 1.65f);

  spatial_gauss_x_ = -0.5f / (space_sigma_x * space_sigma_x);
  spatial_gauss_y_ = -0.5f / (space_sigma_y * space_sigma_y);

  if (tracking_degrees_ == TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION ||
      tracking_degrees_ ==
          TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE) {
    cos_neg_a_ = std::cos(-current_state.rotation());
    sin_neg_a_ = std::sin(-current_state.rotation());
    if (std::abs(current_state.rotation()) > 0.01f) {
      is_large_rotation_ = true;
    }
  }

  // Compute box center as blend between geometric center and inlier center.
  constexpr float kMaxBoxCenterBlendWeight = 0.5f;
  box_center_ =
      Lerp(MotionBoxCenter(current_state), InlierCenter(current_state),
           std::min(kMaxBoxCenterBlendWeight, current_state.prior_weight()));
  if (tracking_degrees_ ==
      TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE) {
    CHECK(initial_state.has_quad());
    CHECK(current_state.has_quad());
    homography_ =
        ComputeHomographyFromQuad(current_state.quad(), initial_state.quad());
    box_center_transformed_ =
        HomographyAdapter::TransformPoint(homography_, box_center_);
  }
}

float MotionBox::DistanceWeightsComputer::ComputeDistanceWeight(
    const MotionVector& test_vector) {
  // Distance weighting.
  Vector2_f diff_center;
  if (tracking_degrees_ ==
      TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE) {
    Vector2_f test_vector_transformed =
        HomographyAdapter::TransformPoint(homography_, test_vector.pos);
    diff_center = test_vector_transformed - box_center_transformed_;
  } else {
    diff_center = test_vector.pos - box_center_;
    if (is_large_rotation_) {
      // Rotate difference vector to normalized domain.
      diff_center.Set(
          cos_neg_a_ * diff_center.x() - sin_neg_a_ * diff_center.y(),
          sin_neg_a_ * diff_center.x() + cos_neg_a_ * diff_center.y());
    }
  }

  const Vector2_f diff = diff_center.MulComponents(inv_box_domain_);
  // Regular Gaussian with a variance in each direction, assuming the
  // directions are independent.
  float weight = std::exp(diff.x() * diff.x() * spatial_gauss_x_ +
                          diff.y() * diff.y() * spatial_gauss_y_);
  return weight;
}

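// The quad-to-quad homography below is computed via the direct linear
// transform (DLT): each of the 4 point correspondences contributes two rows
// to an 8x8 system A * x = b with h_22 fixed to 1, so the 4 correspondences
// give exactly 8 equations for the 8 remaining unknowns, solved here with a
// full-pivoting LU decomposition.
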
Homography MotionBox::DistanceWeightsComputer::ComputeHomographyFromQuad(
    const MotionBoxState::Quad& src_quad,
    const MotionBoxState::Quad& dst_quad) {
  std::vector<float> src_quad_vec(8);
  std::vector<float> dst_quad_vec(8);
  for (int i = 0; i < 8; ++i) {
    src_quad_vec[i] = src_quad.vertices(i);
    dst_quad_vec[i] = dst_quad.vertices(i);
  }
  // Construct the 8x8 system matrix.
  Eigen::Matrix<float, 8, 8> A = Eigen::Matrix<float, 8, 8>::Zero();
  for (int i = 0; i < 4; ++i) {
    const int r0 = 2 * i;
    const int r1 = 2 * i + 1;
    A(r0, 0) = src_quad_vec[r0];
    A(r0, 1) = src_quad_vec[r1];
    A(r0, 2) = 1.f;
    A(r0, 6) = -src_quad_vec[r0] * dst_quad_vec[r0];
    A(r0, 7) = -src_quad_vec[r1] * dst_quad_vec[r0];
    A(r1, 3) = src_quad_vec[r0];
    A(r1, 4) = src_quad_vec[r1];
    A(r1, 5) = 1.f;
    A(r1, 6) = -src_quad_vec[r0] * dst_quad_vec[r1];
    A(r1, 7) = -src_quad_vec[r1] * dst_quad_vec[r1];
  }

  // Map arrays to Eigen vectors without memcpy.
  std::vector<float> homography_vector(8);
  Eigen::Map<Eigen::Matrix<float, 8, 1> > x(homography_vector.data());
  Eigen::Map<const Eigen::Matrix<float, 8, 1> > b(dst_quad_vec.data());

  x = A.fullPivLu().solve(b);
  Homography homography;
  homography.set_h_00(homography_vector[0]);
  homography.set_h_01(homography_vector[1]);
  homography.set_h_02(homography_vector[2]);
  homography.set_h_10(homography_vector[3]);
  homography.set_h_11(homography_vector[4]);
  homography.set_h_12(homography_vector[5]);
  homography.set_h_20(homography_vector[6]);
  homography.set_h_21(homography_vector[7]);
  return homography;
}

bool MotionBox::GetVectorsAndWeights(
    const std::vector<MotionVector>& motion_vectors, int start_idx, int end_idx,
    const Vector2_f& top_left, const Vector2_f& bottom_right,
    const MotionBoxState& box_state, bool valid_background_model,
    bool is_chunk_boundary, float temporal_scale, float expand_mag,
    const std::vector<const MotionBoxState*>& history,
    std::vector<const MotionVector*>* vectors, std::vector<float>* weights,
    int* number_of_good_prior, int* number_of_cont_inliers) const {
  CHECK(weights);
  CHECK(vectors);
  CHECK(number_of_good_prior);
  CHECK(number_of_cont_inliers);

  const int num_max_vectors = end_idx - start_idx;
  weights->clear();
  vectors->clear();
  weights->reserve(num_max_vectors);
  vectors->reserve(num_max_vectors);

  const Vector2_f box_domain(box_state.width() * box_state.scale(),
                             box_state.height() * box_state.scale());

  CHECK_GT(box_domain.x(), 0.0f);
  CHECK_GT(box_domain.y(), 0.0f);
  const Vector2_f inv_box_domain(1.0f / box_domain.x(), 1.0f / box_domain.y());

  // The four lines of the rotated and scaled box.
  std::array<Vector3_f, 4> box_lines;
  if (!MotionBoxLines(box_state, Vector2_f(1.0f, 1.0f), &box_lines)) {
    LOG(ERROR) << "Error in computing MotionBoxLines. Return 0 good inits and "
                  "continued inliers";
    return false;
  }

  // Get list of previous tracking inliers and outliers.
  // Ids are used for non-chunk boundaries (faster matching), locations
  // for chunk boundaries.
  std::unordered_map<int, int> inlier_ids;
  std::unordered_set<int> outlier_ids;
  std::vector<Vector2_f> inlier_locations;
  std::vector<Vector2_f> outlier_locations;

  if (!is_chunk_boundary) {
    MotionBoxInliers(box_state, &inlier_ids);
    MotionBoxOutliers(box_state, &outlier_ids);

    // Never map ids in history across a chunk boundary.
    for (const auto* state_ptr : history) {
      MotionBoxOutliers(*state_ptr, &outlier_ids);
    }
    // Why don't we build the inlier map from a history of inliers?
    // It is unlikely we skip a feature as an inlier; it is either
    // consistently part of the motion model or it is not.
  } else {
    MotionBoxInlierLocations(box_state, &inlier_locations);
    MotionBoxOutlierLocations(box_state, &outlier_locations);
  }

  // Indicator for each vector, if inlier or outlier from prev. estimation.
  std::vector<uchar> is_inlier;
  std::vector<uchar> is_outlier;
  is_inlier.reserve(num_max_vectors);
  is_outlier.reserve(num_max_vectors);
  int num_cont_inliers = 0;

  // Approx. 2 pix at SD resolution.
  constexpr float kSqProximity = 2e-3 * 2e-3;

  for (int k = start_idx; k < end_idx; ++k) {
    // x is within bound due to sorting.
    const MotionVector& test_vector = motion_vectors[k];

    if (test_vector.pos.y() < top_left.y() ||
        test_vector.pos.y() > bottom_right.y()) {
      continue;
    }

    if (std::abs(box_state.rotation()) > 0.01f ||
        options_.tracking_degrees() ==
            TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE) {
      // Test also if vector is within transformed convex area.
      bool accepted = true;
      for (const Vector3_f& line : box_lines) {
        if (line.DotProd(Vector3_f(test_vector.pos.x(), test_vector.pos.y(),
                                   1.0f)) > expand_mag) {
          // Outside, reject.
          accepted = false;
          break;
        }
      }
      if (!accepted) {
        continue;
      }
    }

    vectors->push_back(&motion_vectors[k]);

    auto is_close_to_test_vector = [test_vector,
                                    kSqProximity](const Vector2_f v) -> bool {
      return (v - test_vector.pos).Norm2() < kSqProximity;
    };

    const bool is_inlier_flag =
        inlier_ids.find(test_vector.track_id) != inlier_ids.end() ||
        std::find_if(inlier_locations.begin(), inlier_locations.end(),
                     is_close_to_test_vector) != inlier_locations.end();
    num_cont_inliers += is_inlier_flag;

    const bool is_outlier_flag =
        outlier_ids.find(test_vector.track_id) != outlier_ids.end() ||
        std::find_if(outlier_locations.begin(), outlier_locations.end(),
                     is_close_to_test_vector) != outlier_locations.end();

    is_inlier.push_back(is_inlier_flag);
    is_outlier.push_back(is_outlier_flag);
  }

  CHECK_EQ(vectors->size(), is_inlier.size());
  CHECK_EQ(vectors->size(), is_outlier.size());

  const float prev_motion_mag = MotionBoxVelocity(box_state).Norm();

  // Try to lock on object again, if disparity is high.
  constexpr float kMinPriorMotionWeight = 0.2f;
  const float prior_motion_weight =
      std::max(kMinPriorMotionWeight, std::abs(box_state.motion_disparity())) *
      box_state.prior_weight();

  const float motion_sigma =
      std::max<float>(options_.min_motion_sigma(),
                      prev_motion_mag * options_.relative_motion_sigma());
  const float motion_gaussian_scale = -0.5f / (motion_sigma * motion_sigma);

  // Maps current kinetic energy to [0, 1] quantifying static (0) vs. moving (1)
  // object.
  // Map normalized thresholds to current frame period.
  const float low_kinetic_energy =
      options_.low_kinetic_energy() * temporal_scale;
  const float high_kinetic_energy =
      options_.high_kinetic_energy() * temporal_scale;
  const float kinetic_identity = LinearRamp(
      box_state.kinetic_energy(), low_kinetic_energy, high_kinetic_energy);
  int num_good_inits = 0;

  // Map number of continued inliers to score in [0, 1].
  const float cont_inlier_score = LinearRamp(num_cont_inliers, 10, 30);

  VLOG(1) << "GetVectorsAndWeights, found cont. inliers: " << num_cont_inliers
          << " score: " << cont_inlier_score;

  DistanceWeightsComputer distance_weights_computer(initial_state_, box_state,
                                                    options_);
  for (int k = 0; k < vectors->size(); ++k) {
    const MotionVector& test_vector = *(*vectors)[k];

    float weight = distance_weights_computer.ComputeDistanceWeight(test_vector);

    if (valid_background_model) {
      const float motion_diff =
          fabs(prev_motion_mag - test_vector.object.Norm());
      const float motion_weight =
          std::exp(motion_gaussian_scale * motion_diff * motion_diff);

      // Blend with spatial weight; that is, the higher the disparity
      // (i.e. we lost tracking), the more inclined we are to lock
      // onto vectors of similar motion magnitude regardless of their
      // position.
      // Note: One might feel inclined to always bias towards the previous
      // motion, by multiplying weight with motion_weight. This however fails
      // when tracking objects that start at rest and start moving.
      weight = Lerp(weight, motion_weight, prior_motion_weight);
    }

    // There are two kinds of vectors we are trying to balance here:
    // - inliers from previous estimation
    // - similar vectors
    //
    // Current strategy:
    // - For static objects: Boost inliers a lot, discount outliers a lot,
    //   do not care about similar vectors.
    // - For moving objects: Boost inliers proportional to number of continued
    //   inliers, discount outliers a lot, boost similar vectors and
    //   actively downweight dis-similar objects.
    //
    // Motivation: Inliers are usually not very stable, so if not sufficiently
    // many have been continued, prefer velocity over inliers for moving
    // objects.

    // NOTE: Regarding additive vs. multiplicative weights. We need to
    // multiply the weight here. Adding the weight messes
    // with the gaussian spatial weighting which in turn makes it hard to lock
    // onto moving objects in the first place (as center is assumed to be
    // placed over moving objects, this helps distinguish initial foreground
    // and background).

    // Up-weighting of inlier vectors and vectors of similar motion.
    float upweight = 1.0f;
    if (is_inlier[k]) {
      // Previous track, boost weight significantly.
      //
      // NOTE: Regarding the amount of up-weighting: Long features are not
      // very stable on moving objects. Therefore only upweight strongly for
      // static objects.
      constexpr float kWeakMultiplier = 5.0f;
      constexpr float kStrongMultiplier = 20.0f;

      // Map 0 -> 1 and values >= 0.5 -> 0, because long features are not
      // very stable on moving objects. Therefore only upweight strongly for
      // static objects.
      const float kinetic_alpha =
          std::max(0.0f, 1.0f - 2.0f * kinetic_identity);

      // Choose strong multiplier only when kinetic_alpha OR inlier score
      // support it.
      const float multiplier = Lerp(kWeakMultiplier, kStrongMultiplier,
                                    std::max(cont_inlier_score, kinetic_alpha));
      upweight *= multiplier;
    }

    // Scale weight boost for moving objects by prior, i.e. modulate
    // strength of scale w.r.t. confidence.
    const float kin_scale = Lerp(1, 10, box_state.prior_weight());
    // 80% moving object weighted by prior. This weighting is biasing towards
    // moving object when the prior is low.
    if (kinetic_identity >= 0.8f * box_state.prior_weight() &&
        test_vector.object.Norm() > high_kinetic_energy && !is_outlier[k]) {
      // If we track a moving object, long tracks are less likely to be stable
      // due to appearance variations. In that case boost similar vectors.
      upweight *= 5.f * kin_scale;
    }

    float downweight = 1.0f;
    // Down-weighting of outlier vectors and vectors of different motion.
    if (!is_inlier[k]) {
      // Outlier.
      if (is_outlier[k]) {
        // Note: Outlier ids might overlap with inliers as outliers are built
        // from a history of frames.
        // *Always favor inliers over outliers*! Important to keep!!
        downweight *= 20.0f;
      }

      // Vectors of different motion, for 100% moving object downweight
      // vectors with small motion.
      if (kinetic_identity >= 1.0f * box_state.prior_weight() &&
          test_vector.object.Norm() < low_kinetic_energy) {
        downweight *= 2.f * kin_scale;
      }
    }

    // Cap any kind of up or down weighting so that no vector overwhelms
    // all others.
    const float kMaxWeight = 100.f;
    upweight = std::min(kMaxWeight, upweight);
    downweight = std::min(kMaxWeight, downweight);
    weight *= upweight / downweight;

    num_good_inits += (weight >= 0.1f);
    weights->push_back(weight);
  }

  const int num_vectors = vectors->size();
  CHECK_EQ(num_vectors, weights->size());

  const float weight_sum =
      std::accumulate(weights->begin(), weights->end(), 0.0f);

  // Normalize weights.
  if (weight_sum > 0) {
    const float inv_weight_sum = 1.0f / weight_sum;
    for (auto& weight : *weights) {
      weight *= inv_weight_sum;
    }
  }

  *number_of_good_prior = num_good_inits;
  *number_of_cont_inliers = num_cont_inliers;

  return true;
}

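// RANSAC-style initialization of the IRLS weights for translation: each round
// hypothesizes the translation of one randomly picked vector, scores the
// weighted support within the cutoff, and keeps the best consensus set.
// Features outside the best set are scored near zero; features inside are
// lifted to at least the median inlier weight.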
void MotionBox::TranslationIrlsInitialization(
    const std::vector<const MotionVector*>& vectors,
    const Vector2_f& irls_scale, std::vector<float>* weights) const {
  const int num_features = vectors.size();

  const auto& irls_options = options_.irls_initialization();
  if (!irls_options.activated() || !num_features) {
    return;
  }

  // Bool indicator which features agree with model in each round.
  // In case no RANSAC rounds are performed, consider all features inliers.
  std::vector<uint8> best_features(num_features, 1);
  std::vector<uint8> curr_features(num_features);
  float best_sum = 0;

  unsigned int seed = 900913;
  std::default_random_engine rand_gen(seed);
  std::uniform_int_distribution<> distribution(0, num_features - 1);

  const float cutoff = irls_options.cutoff();
  const float sq_cutoff = cutoff * cutoff;

  for (int rounds = 0; rounds < irls_options.rounds(); ++rounds) {
    float curr_sum = 0;
    // Pick a random vector.
    const int rand_idx = distribution(rand_gen);
    const Vector2_f flow = vectors[rand_idx]->object;
    const auto error_system = ComputeIrlsErrorSystem(irls_scale, flow);

    // curr_features gets set for every feature below; no need to reset.
    for (int i = 0; i < num_features; ++i) {
      const Vector2_f diff = vectors[i]->object - flow;
      const float error = ErrorDiff(diff, error_system);
      curr_features[i] = static_cast<uint8>(error < sq_cutoff);
      if (curr_features[i]) {
        curr_sum += (*weights)[i];
      }
    }

    if (curr_sum > best_sum) {
      best_sum = curr_sum;
      std::swap(best_features, curr_features);
    }
  }

  std::vector<float> inlier_weights;
  inlier_weights.reserve(num_features);

  // Score outliers low.
  int num_inliers = 0;
  for (int i = 0; i < num_features; ++i) {
    if (best_features[i] == 0) {
      (*weights)[i] = 1e-10f;
    } else {
      ++num_inliers;
      inlier_weights.push_back((*weights)[i]);
    }
  }

  if (!inlier_weights.empty()) {
    // Ensure that all selected inlier features have at least median weight.
    auto median = inlier_weights.begin() + inlier_weights.size() / 2;
    std::nth_element(inlier_weights.begin(), median, inlier_weights.end());

    for (int i = 0; i < num_features; ++i) {
      if (best_features[i] != 0) {
        (*weights)[i] = std::max(*median, (*weights)[i]);
      }
    }
  }
}

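// Estimates the object motion model requested by the tracking degrees:
// translation is always estimated via IRLS; similarity or homography is
// estimated on top of it only when enough inliers were continued, and is
// replaced by the embedded translation whenever the estimation fails or the
// resulting model is deemed unstable.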
void MotionBox::EstimateObjectMotion(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& prior_weights, int num_continued_inliers,
    const Vector2_f& irls_scale, std::vector<float>* weights,
    Vector2_f* object_translation, LinearSimilarityModel* object_similarity,
    Homography* object_homography) const {
  CHECK(object_translation);
  CHECK(object_similarity);
  CHECK(object_homography);

  const int num_vectors = motion_vectors.size();
  CHECK_EQ(num_vectors, prior_weights.size());
  CHECK_EQ(num_vectors, weights->size());

  // Create backup of weights if needed.
  std::vector<float> similarity_weights;

  switch (options_.tracking_degrees()) {
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE:
      similarity_weights = *weights;
      break;

    default:
      // Nothing to do here.
      break;
  }

  EstimateTranslation(motion_vectors, prior_weights, irls_scale, weights,
                      object_translation);

  TranslationModel translation_model = TranslationAdapter::FromArgs(
      object_translation->x(), object_translation->y());

  // For any additional degrees of freedom, require a good set of inliers.
  if (num_continued_inliers < options_.object_similarity_min_contd_inliers()) {
    if (options_.tracking_degrees() !=
        TrackStepOptions::TRACKING_DEGREE_TRANSLATION) {
      VLOG(2) << "Falling back to translation!!!";
    }
    VLOG(1) << "num_continued_inliers: " << num_continued_inliers << " < "
            << options_.object_similarity_min_contd_inliers()
            << ", fall back to translation";
    *object_similarity = LinearSimilarityAdapter::Embed(translation_model);
    *object_homography = HomographyAdapter::Embed(translation_model);
    return;
  }

  switch (options_.tracking_degrees()) {
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_SCALE:
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_ROTATION_SCALE: {
      if (EstimateSimilarity(motion_vectors, prior_weights, irls_scale,
                             &similarity_weights, object_similarity)) {
        if (!ObjectMotionValidator::IsValidSimilarity(
                *object_similarity, options_.box_similarity_max_scale(),
                options_.box_similarity_max_rotation())) {
          LOG(WARNING) << "Unstable similarity model - falling back to "
                       << "translation.";
          *object_similarity =
              LinearSimilarityAdapter::Embed(translation_model);
        } else {
          // Good estimation, use weights as output.
          weights->swap(similarity_weights);
        }
      } else {
        *object_similarity = LinearSimilarityAdapter::Embed(translation_model);
      }

      break;
    }
    case TrackStepOptions::TRACKING_DEGREE_OBJECT_PERSPECTIVE:
      if (EstimateHomography(motion_vectors, prior_weights, irls_scale,
                             &similarity_weights, object_homography)) {
        if (!ObjectMotionValidator::IsValidHomography(
                *object_homography, options_.quad_homography_max_scale(),
                options_.quad_homography_max_rotation())) {
          LOG(WARNING) << "Unstable homography model - falling back to "
                       << "translation.";
          *object_homography = HomographyAdapter::Embed(translation_model);
        } else {
          weights->swap(similarity_weights);
        }
      } else {
        *object_homography = HomographyAdapter::Embed(translation_model);
      }
      VLOG(1) << "Got homography: "
              << HomographyAdapter::ToString(*object_homography);
      break;
    default:
      // Plenty of CAMERA_ cases are not handled in this function.
      break;
  }
}

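// IRLS estimation of a pure translation: each iteration computes the weighted
// mean of the object flow and re-weights every vector by the inverse of its
// residual under the current error system; the final iteration drops the
// prior bias from the weights.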
void MotionBox::EstimateTranslation(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& prior_weights, const Vector2_f& irls_scale,
    std::vector<float>* weights, Vector2_f* translation) const {
  CHECK(weights);
  CHECK(translation);

  const int iterations = options_.irls_iterations();

  // NOTE: Floating point accuracy is totally sufficient here.
  // We tried changing to double now 3 times and it just does
  // not matter. Do not do it again. - Past self
  Vector2_f object_translation;
  const int num_vectors = motion_vectors.size();
  const float kEpsilon = 1e-8f;

  VLOG(1) << "Estimating translation for " << num_vectors << " vectors";

  for (int i = 0; i < iterations; ++i) {
    float flow_sum = 0;
    object_translation = Vector2_f(0.0, 0.0);
    for (int k = 0; k < num_vectors; ++k) {
      const MotionVector& motion_vector = *motion_vectors[k];
      const Vector2_f flow = motion_vector.object;
      object_translation += flow * (*weights)[k];
      flow_sum += (*weights)[k];
    }

    if (flow_sum > 0) {
      object_translation *= (1.0 / flow_sum);

      const auto error_system =
          ComputeIrlsErrorSystem(irls_scale, object_translation);

      // Update irls weights.
      for (int k = 0; k < num_vectors; ++k) {
        const MotionVector& motion_vector = *motion_vectors[k];
        Vector2_f diff(motion_vector.object - object_translation);
        const float error = ErrorDiff(diff, error_system);
        // In last iteration compute weights without any prior bias.
        const float numerator = (i + 1 == iterations) ? 1.0f : prior_weights[k];
        (*weights)[k] = numerator / (error + kEpsilon);
      }
    }
  }
  *translation = object_translation;

  VLOG(1) << "Got translation: " << *translation;
}

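// IRLS estimation of a linear similarity via repeated weighted L2 solves;
// residuals are measured against the flow predicted by the current model at
// each vector's position. Returns false if any solve fails.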
bool MotionBox::EstimateSimilarity(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& prior_weights, const Vector2_f& irls_scale,
    std::vector<float>* weights, LinearSimilarityModel* lin_sim) const {
  CHECK(weights);
  CHECK(lin_sim);

  const int iterations = options_.irls_iterations();
  LinearSimilarityModel object_similarity;
  const int num_vectors = motion_vectors.size();
  const float kEpsilon = 1e-8f;

  VLOG(1) << "Estimating similarity for " << num_vectors << " vectors";
  for (int i = 0; i < iterations; ++i) {
    if (LinearSimilarityL2Solve(motion_vectors, *weights, &object_similarity)) {
      // Update irls weights.
      for (int k = 0; k < num_vectors; ++k) {
        const MotionVector& motion_vector = *motion_vectors[k];
        const Vector2_f model_vec =
            TransformPoint(object_similarity, motion_vector.pos) -
            motion_vector.pos;
        const auto error_system = ComputeIrlsErrorSystem(irls_scale, model_vec);

        Vector2_f diff(motion_vector.object - model_vec);
        const float error = ErrorDiff(diff, error_system);
        // In last iteration compute weights without any prior bias.
        const float numerator = (i + 1 == iterations) ? 1.0f : prior_weights[k];
        (*weights)[k] = numerator / (error + kEpsilon);
      }
    } else {
      return false;
    }
  }
  *lin_sim = object_similarity;

  VLOG(1) << "Got similarity: "
          << LinearSimilarityAdapter::ToString(object_similarity);
  return true;
}

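// IRLS estimation of a homography, analogous to EstimateSimilarity but using
// a weighted homography L2 solve per iteration.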
bool MotionBox::EstimateHomography(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& prior_weights, const Vector2_f& irls_scale,
    std::vector<float>* weights, Homography* object_homography) const {
  CHECK(weights);
  CHECK(object_homography);

  const int iterations = options_.irls_iterations();
  Homography homography;
  const int num_vectors = motion_vectors.size();
  const float kEpsilon = 1e-8f;

  VLOG(1) << "Estimating homography for " << num_vectors << " vectors";
  for (int i = 0; i < iterations; ++i) {
    if (HomographyL2Solve(motion_vectors, *weights, &homography)) {
      // Update irls weights.
      for (int k = 0; k < num_vectors; ++k) {
        const MotionVector& motion_vector = *motion_vectors[k];
        const Vector2_f model_vec =
            TransformPoint(homography, motion_vector.pos) - motion_vector.pos;
        const auto error_system = ComputeIrlsErrorSystem(irls_scale, model_vec);

        Vector2_f diff(motion_vector.object - model_vec);
        const float error = ErrorDiff(diff, error_system);
        // In last iteration compute weights without any prior bias.
        const float numerator = (i + 1 == iterations) ? 1.0f : prior_weights[k];
        (*weights)[k] = numerator / (error + kEpsilon);
      }
    } else {
      return false;
    }
  }
  *object_homography = homography;

  return true;
}

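// Solves a perspective-n-point problem for the tracked plane: inliers (by
// weight) are mapped back to the canonical plane through the inverse of the
// current pnp homography to form 3D points (z = 0), matched against their
// propagated 2D locations, optionally undistorted with the camera
// intrinsics, and passed to cv::solvePnP. Returns false with fewer than four
// usable vectors.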
bool MotionBox::EstimatePnpHomography(
    const MotionBoxState& curr_pos,
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& weights, float domain_x, float domain_y,
    Homography* pnp_homography) const {
  constexpr int kMinVectors = 4;
  if (motion_vectors.size() < kMinVectors) return false;

  Homography inv_h = HomographyAdapter::Invert(curr_pos.pnp_homography());

  std::vector<cv::Point3f> vectors_3d;
  vectors_3d.reserve(motion_vectors.size());
  std::vector<cv::Point2f> vectors_2d;
  vectors_2d.reserve(motion_vectors.size());
  if (options_.has_camera_intrinsics()) {
    const auto& camera = options_.camera_intrinsics();
    cv::Mat camera_mat, dist_coef;
    ConvertCameraIntrinsicsToCvMat(camera, &camera_mat, &dist_coef);
    float scale = std::max(camera.w(), camera.h());

    std::vector<cv::Point2f> mv_p;
    mv_p.reserve(motion_vectors.size());
    std::vector<cv::Point2f> mv_q;
    mv_q.reserve(motion_vectors.size());
    for (int j = 0; j < motion_vectors.size(); ++j) {
      if (weights[j] < kMaxOutlierWeight) continue;
      mv_p.emplace_back(motion_vectors[j]->pos.x() * scale,
                        motion_vectors[j]->pos.y() * scale);

      Vector2_f q = motion_vectors[j]->pos + motion_vectors[j]->object +
                    motion_vectors[j]->background;
      mv_q.emplace_back(q.x() * scale, q.y() * scale);
    }

    if (mv_p.size() < kMinVectors) return false;

    cv::undistortPoints(mv_p, mv_p, camera_mat, dist_coef);
    cv::undistortPoints(mv_q, mv_q, camera_mat, dist_coef);

    vectors_3d.resize(mv_p.size());
    vectors_2d.resize(mv_q.size());
    for (int j = 0; j < mv_p.size(); ++j) {
      Vector2_f p = TransformPoint(inv_h, Vector2_f(mv_p[j].x, mv_p[j].y));
      vectors_3d[j] = cv::Point3f(p.x(), p.y(), 0.0f);
      vectors_2d[j] = cv::Point2f(mv_q[j].x, mv_q[j].y);
    }
  } else {
    Vector2_f center(domain_x * 0.5f, domain_y * 0.5f);
    for (int j = 0; j < motion_vectors.size(); ++j) {
      if (weights[j] < kMaxOutlierWeight) continue;
      Vector2_f p = TransformPoint(inv_h, motion_vectors[j]->pos - center);
      vectors_3d.emplace_back(p.x(), p.y(), 0.0f);

      Vector2_f q = motion_vectors[j]->pos + motion_vectors[j]->object +
                    motion_vectors[j]->background - center;
      vectors_2d.emplace_back(q.x(), q.y());
    }

    if (vectors_3d.size() < kMinVectors) return false;
  }

  // TODO: use previous rvec and tvec to initialize the solver.
  cv::Mat rvec, tvec;
  cv::solvePnP(vectors_3d, vectors_2d,
               cv::Mat::eye(3, 3, CV_64F) /* camera_mat */,
               cv::Mat::zeros(1, 5, CV_64FC1) /* dist_coef */, rvec, tvec);
  *pnp_homography = PnpHomographyFromRotationAndTranslation(rvec, tvec);

  return true;
}

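// Propagates the box quad to the next frame by projecting the canonical
// (axis-aligned, aspect-scaled) corners through the estimated pnp homography;
// with camera intrinsics available the corners are re-distorted via
// cv::projectPoints, otherwise they are shifted to the domain center.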
void MotionBox::ApplyObjectMotionPerspectively(const MotionBoxState& curr_pos,
                                               const Homography& pnp_homography,
                                               float domain_x, float domain_y,
                                               MotionBoxState* next_pos) const {
  const float half_width = curr_pos.aspect_ratio();
  const float half_height = 1.0f;

  constexpr int kQuadCornersSize = 4;

  // Omitting the 3rd dimension because it is all zeros.
  const std::vector<Vector2_f> corners_3d{
      Vector2_f(-half_width, -half_height),
      Vector2_f(-half_width, half_height),
      Vector2_f(half_width, half_height),
      Vector2_f(half_width, -half_height),
  };

  std::vector<Vector2_f> corners_2d(kQuadCornersSize);
  for (int c = 0; c < kQuadCornersSize; ++c) {
    corners_2d[c] =
        HomographyAdapter::TransformPoint(pnp_homography, corners_3d[c]);
  }

  if (options_.has_camera_intrinsics()) {
    std::vector<cv::Point3f> cv_points(4);
    for (int c = 0; c < kQuadCornersSize; ++c) {
      cv_points[c] = cv::Point3f(corners_2d[c].x(), corners_2d[c].y(), 1.0);
    }

    const auto& camera = options_.camera_intrinsics();
    cv::Mat camera_mat, dist_coef;
    ConvertCameraIntrinsicsToCvMat(camera, &camera_mat, &dist_coef);
    cv::Mat dummy_zeros = cv::Mat::zeros(1, 3, CV_64FC1);
    std::vector<cv::Point2f> cv_points_distorted;
    cv::projectPoints(cv_points, dummy_zeros /* rvec */, dummy_zeros /* tvec */,
                      camera_mat, dist_coef, cv_points_distorted);
    const float scale = 1.0f / std::max(camera.w(), camera.h());
    for (int c = 0; c < kQuadCornersSize; ++c) {
      next_pos->mutable_quad()->set_vertices(c * 2,
                                             cv_points_distorted[c].x * scale);
      next_pos->mutable_quad()->set_vertices(c * 2 + 1,
                                             cv_points_distorted[c].y * scale);
    }
  } else {
    const float center_x = domain_x * 0.5f;
    const float center_y = domain_y * 0.5f;
    for (int c = 0; c < kQuadCornersSize; ++c) {
      next_pos->mutable_quad()->set_vertices(c * 2,
                                             corners_2d[c].x() + center_x);
      next_pos->mutable_quad()->set_vertices(c * 2 + 1,
                                             corners_2d[c].y() + center_y);
    }
  }

  *next_pos->mutable_pnp_homography() = pnp_homography;
  UpdateStatePositionAndSizeFromStateQuad(next_pos);
}

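// Returns a signed disparity score: the magnitude quantifies how much the
// newly estimated translation deviates from the previous box velocity (capped
// by the fraction of continued inliers and decayed over frames); the sign is
// negative when the object slowed down. A high inlier count pulls the score
// towards zero, indicating a solid lock onto the object.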
float MotionBox::ComputeMotionDisparity(
    const MotionBoxState& curr_pos, const Vector2_f& irls_scale,
    float continued_inliers, int num_inliers,
    const Vector2_f& object_translation) const {
  // Motion disparity does not take into account change of direction,
  // only use parallel irls scale.
  const float curr_velocity = MotionBoxVelocity(curr_pos).Norm();
  const float sign = object_translation.Norm() < curr_velocity ? -1.0f : 1.0f;
  const float motion_diff = fabs(object_translation.Norm() - curr_velocity);

  // Score difference.
  const float measured_motion_disparity = LinearRamp(
      motion_diff * irls_scale.x(), options_.motion_disparity_low_level(),
      options_.motion_disparity_high_level());

  // Cap disparity measurement by inlier ratio, to account for objects
  // suddenly stopping/accelerating. In this case measured disparity might be
  // high, whereas inliers continue to be tracked.
  const float max_disparity = 1.0f - continued_inliers;

  const float capped_disparity =
      std::min(max_disparity, measured_motion_disparity);

  // Take into account large disparity in previous frames. Score by prior of
  // previous motion.
  const float motion_disparity =
      std::max(curr_pos.motion_disparity() * options_.disparity_decay(),
               capped_disparity) *
      curr_pos.prior_weight();

  // Map number of inliers to score in [0, 1], assuming a lot of inliers
  // indicates a lock onto the object.
  const float inlier_score = LinearRamp(num_inliers, 20, 40);

  // Decay motion disparity faster if the number of inliers indicates a lock
  // onto the tracked object has occurred.
  return std::min(1.0f - inlier_score, motion_disparity) * sign;
}

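// Classifies each vector as inlier or outlier from the post-estimation
// weights, records inlier/outlier ids and match positions in next_pos,
// accumulates an inlier density grid, and reports continued inliers, swapped
// inliers (previous outliers turned inliers), the weighted count of vectors
// matching the previous motion magnitude, and the weighted average kinetic
// energy of the inliers.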
void MotionBox::ScoreAndRecordInliers(
    const MotionBoxState& curr_pos,
    const std::vector<const MotionVector*>& vectors,
    const std::vector<Vector2_f>& grid_positions,
    const std::vector<float>& pre_estimation_weights,
    const std::vector<float>& post_estimation_weights,
    float background_discrimination, MotionBoxState* next_pos,
    std::vector<float>* inlier_weights, std::vector<float>* inlier_density,
    int* continued_inliers, int* swapped_inliers, float* motion_inliers_out,
    float* kinetic_average_out) const {
  CHECK(inlier_weights);
  CHECK(inlier_density);
  CHECK(continued_inliers);
  CHECK(swapped_inliers);
  CHECK(motion_inliers_out);
  CHECK(kinetic_average_out);

  std::unordered_map<int, int> prev_inliers;
  MotionBoxInliers(curr_pos, &prev_inliers);

  std::unordered_set<int> prev_outliers;
  MotionBoxOutliers(curr_pos, &prev_outliers);

  ClearInlierState(next_pos);

  // Continued inlier fraction denotes amount of spatial occlusion. Very low
  // values indicate that we are in very difficult tracking territory.
  *continued_inliers = 0;
  *swapped_inliers = 0;
  float kinetic_average = 0;
  float kinetic_average_sum = 0;
  float motion_inliers = 0;
  const int num_vectors = vectors.size();
  inlier_weights->resize(num_vectors);
  inlier_density->resize(num_vectors);

  // Inliers normalization grid.
  std::vector<float> grid_count(kNormalizationGridSize *
                                kNormalizationGridSize);
  const float prev_object_motion = MotionBoxVelocity(curr_pos).Norm();
  // Count inliers moving similarly to the previous object motion.
  const float similar_motion_threshold =
      std::max(2e-3f, prev_object_motion * 0.3f);

  // If background discrimination is low, inliers are ambiguous: Hard to
  // distinguish from earlier outliers. In this case do not record inliers
  // outside our current tracking extent, as everything will look like an
  // inlier.
  //
  // TODO: Compute 2nd moment for inliers and describe as ellipse,
  // improve shape here then.
  bool inlier_ambiguity = background_discrimination < 0.5f;
  int rejected = 0;
  int num_inliers = 0;
  for (int k = 0; k < num_vectors; ++k) {
    (*inlier_weights)[k] =
        LinearRamp(post_estimation_weights[k], options_.inlier_low_weight(),
                   options_.inlier_high_weight());
    const int track_id = vectors[k]->track_id;

    bool is_prev_outlier = prev_outliers.find(track_id) != prev_outliers.end();

    const Vector2_f match_loc = vectors[k]->MatchLocation();
    if ((*inlier_weights)[k] > kMinInlierWeight) {  // Inlier.
      if (is_prev_outlier) {
        ++(*swapped_inliers);
      }

      if (inlier_ambiguity &&
          !PointWithinInlierExtent(vectors[k]->Location(), curr_pos)) {
        ++rejected;
        continue;
      }

      ++num_inliers;

      AddToGrid<kNormalizationGridSize>(grid_positions[k], &grid_count);

      if (track_id >= 0) {
        next_pos->add_inlier_ids(track_id);
        next_pos->add_inlier_id_match_pos(match_loc.x() * kShortScale);
        next_pos->add_inlier_id_match_pos(match_loc.y() * kShortScale);
        auto pos = prev_inliers.find(track_id);
        if (pos != prev_inliers.end()) {
          // Count length of observation.
          next_pos->add_inlier_length(pos->second + 1.0f);
          ++(*continued_inliers);
        } else {
          next_pos->add_inlier_length(1);
        }
      }

      // Note: This should be weighted by the pre-estimation weights; simply
      // adding a 1 for each inlier leads to lower irls averages.
      kinetic_average += vectors[k]->object.Norm() * pre_estimation_weights[k];
      kinetic_average_sum += pre_estimation_weights[k];

      // Count number of inliers that agree with previous kinetic energy
      // estimate.
      if (std::abs(vectors[k]->object.Norm() - prev_object_motion) *
              curr_pos.prior_weight() <
          similar_motion_threshold) {
        motion_inliers += pre_estimation_weights[k];
      }
    } else if ((*inlier_weights)[k] < kMaxOutlierWeight) {  // Outlier.
      next_pos->add_outlier_ids(track_id);
      next_pos->add_outlier_id_match_pos(match_loc.x() * kShortScale);
      next_pos->add_outlier_id_match_pos(match_loc.y() * kShortScale);
    }
  }

  // Read out density of inliers.
  for (int k = 0; k < num_vectors; ++k) {
    if ((*inlier_weights)[k] > kMinInlierWeight) {
      (*inlier_density)[k] = 2 * SampleFromGrid<kNormalizationGridSize>(
                                     grid_positions[k], grid_count);
    } else {
      (*inlier_density)[k] = 0;
    }
  }

  if (kinetic_average_sum > 0) {
    kinetic_average *= 1.0f / kinetic_average_sum;
  }

  VLOG(1) << "num inliers: " << num_inliers << " rejected: " << rejected;

  *kinetic_average_out = kinetic_average;
  *motion_inliers_out = motion_inliers;
}

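// Computes the density-weighted first and second moments of the inlier match
// locations to derive a new inlier center and extent (scaled to capture ~90%
// of the data). Out-of-box inliers contribute with reduced weight. If too few
// inliers are present, the center gravitates back towards the box center
// instead. min_inlier_sum is both the required inlier threshold on input and
// the achieved weight sum on output.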
void MotionBox::ComputeInlierCenterAndExtent(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& weights, const std::vector<float>& density,
    const MotionBoxState& box_state, float* min_inlier_sum, Vector2_f* center,
    Vector2_f* extent) const {
  CHECK(min_inlier_sum);
  CHECK(center);
  CHECK(extent);

  float weight_sum = 0;
  float inlier_sum = 0;
  const int num_vectors = motion_vectors.size();
  CHECK_EQ(num_vectors, weights.size());
  CHECK_EQ(num_vectors, density.size());

  Vector2_f first_moment(0.0f, 0.0f);
  Vector2_f second_moment(0.0f, 0.0f);

  Vector2_f top_left;
  Vector2_f bottom_right;
  MotionBoxBoundingBox(box_state, &top_left, &bottom_right);

  for (int k = 0; k < num_vectors; ++k) {
    const MotionVector& motion_vector = *motion_vectors[k];
    const Vector2_f match = motion_vector.MatchLocation();
    float space_multiplier = 1.0f;
    // Decrease contribution of out of bound inliers.
    // Note: If all inliers are out of bound this down weighting will have no
    // effect. It is designed to prevent skewing the inlier center towards
    // similar moving inliers outside the tracked box.
    if (match.x() < top_left.x() || match.x() > bottom_right.x() ||
        match.y() < top_left.y() || match.y() > bottom_right.y()) {
      space_multiplier = 0.25f;
    }
    const float w = weights[k] * density[k] * space_multiplier;
    if (w > 0) {
      first_moment += w * match;
      second_moment +=
          w * Vector2_f(match.x() * match.x(), match.y() * match.y());
      weight_sum += w;
      inlier_sum += weights[k];
    }
  }

  // Update center if sufficient inliers present.
  if (inlier_sum > *min_inlier_sum) {
    const float inv_weight_sum = 1.0f / weight_sum;
    first_moment *= inv_weight_sum;
    second_moment *= inv_weight_sum;

    *center = first_moment;
    *extent = second_moment - Vector2_f(first_moment.x() * first_moment.x(),
                                        first_moment.y() * first_moment.y());

    // 1.645 sigmas in each direction = 90% of the data captured.
    *extent =
        Vector2_f(std::sqrt(extent->x()) * 3.29, std::sqrt(extent->y()) * 3.29);
  } else {
    // Gravitate back to box center with inlier center.
    *center = Lerp(MotionBoxCenter(box_state), InlierCenter(box_state), 0.5f);
  }

  // Record number of inliers present.
  *min_inlier_sum = weight_sum;
}

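// Estimates an isotropic scale change as the ratio of the weighted standard
// deviation of the match locations to that of the current positions. Returns
// 1 if the accumulated weight does not exceed min_sum.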
float MotionBox::ScaleEstimate(
    const std::vector<const MotionVector*>& motion_vectors,
    const std::vector<float>& weights, float min_sum) const {
  const int num_vectors = motion_vectors.size();
  CHECK_EQ(num_vectors, weights.size());

  float scale_sum = 0;

  // First moments.
  Vector2_d sum_coords(0, 0);
  Vector2_d match_sum_coords(0, 0);

  // Second moments.
  Vector2_d sum_sq_coords(0, 0);
  Vector2_d match_sum_sq_coords(0, 0);

  for (int k = 0; k < num_vectors; ++k) {
    const MotionVector& motion_vector = *motion_vectors[k];

    const Vector2_d pos(motion_vector.pos.x(), motion_vector.pos.y());
    double weight = weights[k];
    sum_coords += weight * pos;
    sum_sq_coords += weight * Vector2_d(pos.x() * pos.x(), pos.y() * pos.y());

    const Vector2_d match =
        Vector2_d::Cast<float>(motion_vector.MatchLocation());
    match_sum_coords += weight * match;
    match_sum_sq_coords +=
        weight * Vector2_d(match.x() * match.x(), match.y() * match.y());
    scale_sum += weights[k];
  }

  if (scale_sum > min_sum) {
    const double denom = 1.0f / scale_sum;
    sum_coords *= denom;
    match_sum_coords *= denom;
    sum_sq_coords *= denom;
    match_sum_sq_coords *= denom;

    const float curr_scale =
        sqrt(sum_sq_coords.x() - sum_coords.x() * sum_coords.x() +
             sum_sq_coords.y() - sum_coords.y() * sum_coords.y());
    const float next_scale = sqrt(
        match_sum_sq_coords.x() - match_sum_coords.x() * match_sum_coords.x() +
        match_sum_sq_coords.y() - match_sum_coords.y() * match_sum_coords.y());
    return next_scale / curr_scale;
  }

  return 1.0f;
}

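// Pulls the box towards the center of interest, but only once the center
// difference exceeds a dead zone of rel_threshold times the box dimensions;
// the correction is proportional to the excess times spring_force.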
void MotionBox::ApplySpringForce(const Vector2_f& center_of_interest,
                                 const float rel_threshold,
                                 const float spring_force,
                                 MotionBoxState* box_state) const {
  // Apply spring force towards center of interest.
  const Vector2_f center = MotionBoxCenter(*box_state);
  const float center_diff_x = center_of_interest.x() - center.x();
  const float center_diff_y = center_of_interest.y() - center.y();

  const float diff_x = fabs(center_diff_x) - box_state->width() * rel_threshold;
  const float diff_y =
      fabs(center_diff_y) - box_state->height() * rel_threshold;

  if (diff_x > 0) {
    const float correction_mag = diff_x * spring_force;
    const float correction =
        center_diff_x < 0 ? -correction_mag : correction_mag;
    box_state->set_pos_x(box_state->pos_x() + correction);
  }

  if (diff_y > 0) {
    const float correction_mag = diff_y * spring_force;
    const float correction =
        center_diff_y < 0 ? -correction_mag : correction_mag;
    box_state->set_pos_y(box_state->pos_y() + correction);
  }
}

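// Wrapper around TrackStepImplDeNormalized: rescales the state's velocity
// fields to the current frame period and aspect ratio, runs the tracking
// step, scales the result back, and marks moving boxes that keep heading out
// of the field of view as untracked.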
void MotionBox::TrackStepImpl(int from_frame, const MotionBoxState& curr_pos,
                              const MotionVectorFrame& motion_frame,
                              const std::vector<const MotionBoxState*>& history,
                              MotionBoxState* next_pos) const {
  // Create new curr pos with velocity scaled to current duration.
  constexpr float kDefaultPeriodMs = 1000.0f / kTrackingDefaultFps;

  // Scale to be applied to velocity related fields in MotionBoxState
  // to transform state from standard frame period to current one.
  float temporal_scale = (motion_frame.duration_ms == 0)
                             ? 1.0f
                             : motion_frame.duration_ms / kDefaultPeriodMs;

  MotionBoxState curr_pos_normalized = curr_pos;
  ScaleStateTemporally(temporal_scale, &curr_pos_normalized);
  ScaleStateAspect(motion_frame.aspect_ratio, false, &curr_pos_normalized);

  TrackStepImplDeNormalized(from_frame, curr_pos_normalized, motion_frame,
                            history, next_pos);

  // Scale back velocity and aspect to normalized domains.
  ScaleStateTemporally(1.0f / temporal_scale, next_pos);
  ScaleStateAspect(motion_frame.aspect_ratio, true, next_pos);

  // Test if out of bound, only for moving objects.
  const float static_motion =
      options_.static_motion_temporal_ratio() * temporal_scale;
  if (MotionBoxVelocity(*next_pos).Norm() > static_motion) {
    // Test if close to boundary and keeps moving towards it.
    constexpr float kRatio = 0.3;
    if ((next_pos->pos_x() < -next_pos->width() * kRatio &&
         next_pos->dx() < -static_motion / 2) ||
        (next_pos->pos_y() < -next_pos->height() * kRatio &&
         next_pos->dy() < -static_motion / 2) ||
        (next_pos->pos_x() > 1.0f - next_pos->width() * (1.0f - kRatio) &&
         next_pos->dx() > static_motion / 2) ||
        (next_pos->pos_y() > 1.0f - next_pos->height() * (1.0f - kRatio) &&
         next_pos->dy() > static_motion / 2)) {
      VLOG(1) << "Tracked box went out of bound.";
      next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
      return;
    }
  }
}

float MotionBox::ComputeTrackingConfidence(
    const MotionBoxState& motion_box_state) const {
  const float inlier_num_lower_bound = 10.0f;
  const float inlier_num_upper_bound = 30.0f;
  const float confidence =
      LinearRamp(motion_box_state.inlier_ids_size(), inlier_num_lower_bound,
                 inlier_num_upper_bound);
  return confidence;
}

// General tracking algorithm overview:
// The tracking algorithm consists of 6 main stages.
// 1.) Selecting features from the passed MotionVectorFrame based on incidence
//     with the rectangle defined by curr_pos.
// 2.) Assigning each vector a prior weight. Vectors are mainly scored by a box
//     centered gaussian, giving more weight to vectors within the center of
//     the box. If the current state is deemed unreliable, vectors with similar
//     velocity to the previously estimated velocity are favored. If the
//     current state indicates tracking of a moving object, high velocity
//     vectors are favored.
// 3.) Estimating a translational model via IRLS, enforcing the prior of step 2
//     in every iteration.
// 4.) Scoring how much the estimated model deviates from the previous motion
//     (termed motion disparity) and how discriminative the motion is
//     from the background motion (termed motion discrimination).
// 5.) Computing the inlier center (position of vectors used for the motion
//     model in the next frame) and center of high velocity vectors. Apply a
//     spring force towards each center based on the motion discrimination.
// 6.) Updating velocity and kinetic energy by blending the current measurement
//     with the previous one.
void MotionBox::TrackStepImplDeNormalized(
    int from_frame, const MotionBoxState& curr_pos,
    const MotionVectorFrame& motion_frame,
    const std::vector<const MotionBoxState*>& history,
    MotionBoxState* next_pos) const {
  CHECK(next_pos);

  constexpr float kDefaultPeriodMs = 1000.0f / kTrackingDefaultFps;
  float temporal_scale = (motion_frame.duration_ms == 0)
                             ? 1.0f
                             : motion_frame.duration_ms / kDefaultPeriodMs;

  // Initialize to current position.
  *next_pos = curr_pos;

  if (!IsBoxValid(curr_pos)) {
    LOG(ERROR) << "curr_pos is not a valid box. Stop tracking!";
    next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
    return;
  }

  Vector2_f top_left, bottom_right;
  float expand_mag = 0.0f;
  GetStartPosition(curr_pos, motion_frame.aspect_ratio, &expand_mag, &top_left,
                   &bottom_right);

  const float aspect_ratio = motion_frame.aspect_ratio;
  float domain_x = 1.0f;
  float domain_y = 1.0f;
  ScaleFromAspect(aspect_ratio, false, &domain_x, &domain_y);

  // Binary search for start and end index (lexicographic search, i.e.
  // x indices are guaranteed to be within bounds, but y coordinates could be
  // outside and need to be checked against the domain of the box via
  // GetVectorsAndWeights below).
  MotionVector search_start;
  MotionVector search_end;
  search_start.pos = top_left;
  search_end.pos = bottom_right;

  int start_idx = std::lower_bound(motion_frame.motion_vectors.begin(),
                                   motion_frame.motion_vectors.end(),
                                   search_start, MotionVectorComparator()) -
                  motion_frame.motion_vectors.begin();

  int end_idx = std::lower_bound(motion_frame.motion_vectors.begin(),
                                 motion_frame.motion_vectors.end(), search_end,
                                 MotionVectorComparator()) -
                motion_frame.motion_vectors.begin();

  const float static_motion =
      options_.static_motion_temporal_ratio() * temporal_scale;
  if (start_idx >= end_idx || top_left.x() >= domain_x - expand_mag ||
      top_left.y() >= domain_y - expand_mag || bottom_right.x() <= expand_mag ||
      bottom_right.y() <= expand_mag) {
    // Empty box, no features found. This can happen if the box is outside the
    // field of view, or there are no features in the box.
    // Move the box by the background model if it has static motion, else
    // return a tracking error.
    if (MotionBoxVelocity(curr_pos).Norm() > static_motion ||
        (!motion_frame.valid_background_model && from_frame != queue_start_)) {
      next_pos->set_track_status(MotionBoxState::BOX_NO_FEATURES);
    } else {
      // Static object, move by background model.
      next_pos->set_track_status(MotionBoxState::BOX_TRACKED_OUT_OF_BOUND);
      ApplyCameraTrackingDegrees(curr_pos, motion_frame.background_model,
                                 options_, Vector2_f(domain_x, domain_y),
                                 next_pos);

      // The further the quad is away from the FOV (range 0 to 1), the larger
      // the scale change applied to the quad by the homography transform.
      // Beyond a certain extent, the positions of vertices will be flipped
      // from positive to negative, or vice versa. Here we reject all quads
      // with abnormal shape via a convexity check.
      if (next_pos->has_quad() &&
          (ObjectMotionValidator::IsQuadOutOfFov(
               next_pos->quad(), Vector2_f(domain_x, domain_y)) ||
           !ObjectMotionValidator::IsValidQuad(next_pos->quad()))) {
        LOG(ERROR) << "Quad is out of fov or not convex. Cancel tracking.";
        next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
        return;
      }
    }
    return;
  }
  float start_x = Clamp(top_left.x(), 0.0f, domain_x);
  float start_y = Clamp(top_left.y(), 0.0f, domain_y);
  float end_x = Clamp(bottom_right.x(), 0.0f, domain_x);
  float end_y = Clamp(bottom_right.y(), 0.0f, domain_y);

  const Vector2_f curr_pos_size = MotionBoxSize(curr_pos);
  constexpr float kMinSize = 1e-3f;  // 1 pix for 1080p.
  if (start_x >= end_x || start_y >= end_y || curr_pos_size.x() < kMinSize ||
      curr_pos_size.y() < kMinSize) {
    next_pos->set_track_status(MotionBoxState::BOX_EMPTY);
    return;
  }

  top_left = Vector2_f(start_x, start_y);
  bottom_right = Vector2_f(end_x, end_y);

  // Get indices of features within box, corresponding priors and position
  // in feature grid.
  std::vector<const MotionVector*> vectors;
  std::vector<float> prior_weights;
  bool valid_background_model = motion_frame.valid_background_model;

  int num_good_inits;
  int num_cont_inliers;
  const bool get_vec_weights_status = GetVectorsAndWeights(
      motion_frame.motion_vectors, start_idx, end_idx, top_left, bottom_right,
      curr_pos, valid_background_model, motion_frame.is_chunk_boundary,
      temporal_scale, expand_mag, history, &vectors, &prior_weights,
      &num_good_inits, &num_cont_inliers);
  if (!get_vec_weights_status) {
    LOG(ERROR) << "error in GetVectorsAndWeights. Terminate tracking.";
    next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
    return;
  }

  // `num_good_inits` comes from motion vector weights, but the pnp solver
  // currently does not depend on weights. So for pnp tracking mode, we don't
  // fall back to camera motion tracking based on num_good_inits.
  if (!curr_pos.has_pnp_homography() &&
      (num_good_inits < 3 &&
       MotionBoxVelocity(curr_pos).Norm() <= static_motion)) {
    // Static object, move by background model.
    next_pos->set_track_status(MotionBoxState::BOX_TRACKED);
    ApplyCameraTrackingDegrees(curr_pos, motion_frame.background_model,
                               options_, Vector2_f(domain_x, domain_y),
                               next_pos);
    VLOG(1) << "No good inits; applying camera motion for static object";

    if (next_pos->has_quad() &&
        !ObjectMotionValidator::IsValidQuad(next_pos->quad())) {
      LOG(ERROR) << "Quad is not convex. Cancel tracking.";
      next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
      return;
    }
    return;
  }
VLOG(1) << "Good inits: " << num_good_inits;
|
|
|
|
const int num_vectors = vectors.size();
|
|
CHECK_EQ(num_vectors, prior_weights.size());
|
|
|
|
Vector2_f object_translation;
|
|
|
|
// Compute a rough estimate of the current motion.
|
|
for (int k = 0; k < num_vectors; ++k) {
|
|
object_translation += vectors[k]->Motion() * prior_weights[k];
|
|
}
|
|
|
|
Vector2_f prev_object_motion = MotionBoxVelocity(curr_pos);
|
|
|
|
// Estimate expected motion magnitude. In case of low prior, skew more
|
|
// towards rough estimate instead of previous motion.
|
|
const float motion_mag_estimate =
|
|
std::max(object_translation.Norm(),
|
|
prev_object_motion.Norm() * curr_pos.prior_weight());
|
|
|
|
// For motivation about this: See MotionEstimation::GetIRLSResidualScale.
|
|
// Assume 1 pixel estimation error for tracked features at 360p video.
|
|
// This serves as absolute minimum of the estimation error, so we do not
|
|
// scale translation fractions below this threshold.
|
|
constexpr float kMinError = 1.25e-3f;
|
|
|
|
// We use a combination of absolute and relative error. If a predefined
|
|
// fraction of the motion exceeds the minimum error, we scale the error
|
|
// such that the relative error equals the min error.
|
|
// We use different thresholds parallel and perpendicular to the estimation
|
|
// direction.
|
|
// Motivation is that we allow for more error perpendicular to an estimation
|
|
// (angular difference) than it is direction (magnitude error).
|
|
|
|
// Scale in parallel, orthogonal direction.
|
|
Vector2_f irls_scale(1.0f, 1.0f);
|
|
|
|
const Vector2_f kMotionPercentage(0.1f, 0.25f);
|
|
const Vector2_f motion_mag_scaled = motion_mag_estimate * kMotionPercentage;
|
|
|
|
if (motion_mag_scaled.x() > kMinError) {
|
|
irls_scale.x(kMinError / motion_mag_scaled.x());
|
|
}
|
|
if (motion_mag_scaled.y() > kMinError) {
|
|
irls_scale.y(kMinError / motion_mag_scaled.y());
|
|
}
|
|
|
|
// Irls init for translation.
|
|
// TODO: Adjust to object tracking degrees.
|
|
TranslationIrlsInitialization(vectors, irls_scale, &prior_weights);
|
|
|
|
  LinearSimilarityModel object_similarity;
  Homography object_homography;
  Homography pnp_homography;

  std::vector<float> weights = prior_weights;
  if (num_good_inits > 0) {
    MEASURE_TIME << "Estimate object motion.";
    EstimateObjectMotion(vectors, prior_weights, num_cont_inliers, irls_scale,
                         &weights, &object_translation, &object_similarity,
                         &object_homography);
  } else {
    // There is no hope to estimate a model in a stable manner here.
    object_translation = prev_object_motion;
    VLOG(1) << "No good inits, reusing prev. motion instead of estimation";
  }

  // Multiplier to quantify how discriminative the object motion is
  // (larger motions are more discriminative).
  // Note: Independent from temporal scale.
  float background_discrimination = curr_pos.background_discrimination();
  if (valid_background_model) {
    background_discrimination =
        LinearRamp(object_translation.Norm(),
                   options_.background_discrimination_low_level(),
                   options_.background_discrimination_high_level());
  }

  // Score weights from motion estimation to determine set of inliers.
  std::vector<float> inlier_weights;
  std::vector<float> inlier_density;

  // Compute grid positions for each vector to determine density of inliers.
  std::vector<Vector2_f> grid_positions;
  ComputeGridPositions<kNormalizationGridSize>(top_left, bottom_right, vectors,
                                               &grid_positions);

  // Continued inlier fraction denotes amount of spatial occlusion. Very low
  // values indicate that we are in very difficult tracking territory.
  int continued_inliers = 0;
  int swapped_inliers = 0;
  float kinetic_average = 0;
  float motion_inliers = 0;
  ScoreAndRecordInliers(curr_pos, vectors, grid_positions, prior_weights,
                        weights, background_discrimination, next_pos,
                        &inlier_weights, &inlier_density, &continued_inliers,
                        &swapped_inliers, &motion_inliers, &kinetic_average);

  const int num_prev_inliers = curr_pos.inlier_ids_size();
  int num_prev_inliers_not_actively_discarded = num_prev_inliers;
  if (motion_frame.actively_discarded_tracked_ids != nullptr) {
    num_prev_inliers_not_actively_discarded = std::count_if(
        curr_pos.inlier_ids().begin(), curr_pos.inlier_ids().end(),
        [&motion_frame](int id) {
          return !motion_frame.actively_discarded_tracked_ids->contains(id);
        });
    motion_frame.actively_discarded_tracked_ids->clear();
  }
  const int num_inliers = next_pos->inlier_ids_size();
  // Must be in [0, 1].
  const float continued_inlier_fraction =
      num_prev_inliers_not_actively_discarded == 0
          ? 1.0f
          : continued_inliers * 1.0f / num_prev_inliers_not_actively_discarded;

  // Within [0, M], where M is the maximum number of features. Values > 1
  // indicate a significant number of inliers were outliers in the previous
  // frame.
  const float swapped_inlier_fraction =
      num_prev_inliers == 0 ? 0.0f : swapped_inliers * 1.0f / num_prev_inliers;

  if (curr_pos.has_pnp_homography()) {
    MEASURE_TIME << "Estimate pnp homography.";

    // Use IRLS homography `inlier_weights` to determine inliers / outliers.
    // The rationale is: solving a homography transform is 20X faster than
    // solving a perspective transform (0.05ms vs 1ms). So, we use 5 iterations
    // of reweighted homography to filter out outliers first, and only use
    // inliers to solve for perspective.
    if (!EstimatePnpHomography(curr_pos, vectors, inlier_weights, domain_x,
                               domain_y, &pnp_homography)) {
      // Here, we can either cancel tracking or apply homography or even
      // translation as our best guess. But since some specific use cases of
      // pnp tracking (for example Augmented Images) prefer high precision
      // over high recall, we choose to cancel tracking once and for all.
      VLOG(1) << "Not enough motion vectors to solve pnp. Cancel tracking.";
      next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
      return;
    }
  }
  // Compute disparity.
  if (num_good_inits > 0) {
    next_pos->set_motion_disparity(ComputeMotionDisparity(
        curr_pos, irls_scale, continued_inliers, num_inliers,
        valid_background_model ? object_translation : prev_object_motion));
  } else {
    // No good features, signal error.
    next_pos->set_motion_disparity(1.0);
  }

  VLOG(1) << "Motion inliers: " << motion_inliers
          << ", continued inliers: " << continued_inliers
          << ", continued ratio: " << continued_inlier_fraction
          << ", swapped fraction: " << swapped_inlier_fraction
          << ", motion disparity: " << next_pos->motion_disparity();

  if (options_.cancel_tracking_with_occlusion_options().activated() &&
      curr_pos.track_status() != MotionBoxState::BOX_DUPLICATED &&
      continued_inlier_fraction <
          options_.cancel_tracking_with_occlusion_options()
              .min_motion_continuity()) {
    next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
    LOG(INFO) << "Occlusion detected. continued_inlier_fraction: "
              << continued_inlier_fraction << " too low. Stop tracking";
    return;
  }

  // Force reset of state when inlier continuity is severely violated,
  // disparity maxes out, or a significant number of inliers were outliers in
  // the previous frame.
  if (std::max(continued_inlier_fraction, motion_inliers) < 0.15f ||
      std::abs(next_pos->motion_disparity()) >= 1.0f ||
      swapped_inlier_fraction >= 2.5) {
    VLOG(1) << "Track error, state reset.";
    // Bad tracking error occurred.
    // Current set of inliers is not reliable.
    ClearInlierState(next_pos);
    next_pos->set_motion_disparity(1.0f);
    inlier_weights.assign(inlier_weights.size(), 0);

    // Reuse previous motion and discrimination.
    object_translation = prev_object_motion;
    background_discrimination = curr_pos.background_discrimination();
  }
  next_pos->set_inlier_sum(
      std::accumulate(inlier_weights.begin(), inlier_weights.end(), 0.0f));
  if (history.empty()) {
    // Assign full confidence on first frame, otherwise all other stats
    // are zero and there is no way to compute it.
    next_pos->set_tracking_confidence(1.0f);
    LOG(INFO) << "no history. confidence : 1.0";
  } else {
    next_pos->set_tracking_confidence(ComputeTrackingConfidence(*next_pos));
    VLOG(1) << "confidence: " << next_pos->tracking_confidence();
  }
  next_pos->set_background_discrimination(background_discrimination);

  // Slowly decay current kinetic energy. Blend with current measurement based
  // on disparity (high = use previous value, low = use current one).
  next_pos->set_kinetic_energy(std::max(
      options_.kinetic_energy_decay() * curr_pos.kinetic_energy(),
      kinetic_average * (1.0f - std::abs(next_pos->motion_disparity()))));

  float inlier_max = curr_pos.inlier_sum();
  int num_tracked_frames_in_history = 0;
  for (auto entry : history) {
    inlier_max = std::max(inlier_max, entry->inlier_sum());
    if (entry->track_status() == MotionBoxState::BOX_TRACKED) {
      num_tracked_frames_in_history++;
    }
  }

  const float inlier_ratio =
      inlier_max > 0 ? (next_pos->inlier_sum() / (inlier_max + 1e-3f)) : 0.0f;

  next_pos->set_inlier_ratio(inlier_ratio);

  const bool is_perfect_fit = inlier_ratio > 0.85f && inlier_ratio < 1.15f;

  // num_tracked_frames_in_history has to be greater than 1, since the first
  // frame is marked as BOX_TRACKED in ResetAtFrame.
  if (options_.cancel_tracking_with_occlusion_options().activated() &&
      curr_pos.track_status() != MotionBoxState::BOX_DUPLICATED &&
      num_tracked_frames_in_history > 1 &&
      inlier_ratio < options_.cancel_tracking_with_occlusion_options()
                         .min_inlier_ratio()) {
    next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
    LOG(INFO) << "inlier_ratio: " << inlier_ratio
              << " too small. Stop tracking. inlier_max: " << inlier_max
              << ". length in history: " << history.size();
    return;
  }
  // Blend measured object motion based on motion disparity, i.e. the more the
  // measured and previous motion agree, the less the smoothing. This is to
  // propagate the box in the direction of the previous object motion
  // in case tracking has been lost.
  // Allow new measurements to propagate slowly.
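  // Assuming Lerp(a, b, t) = a + t * (b - a) (consistent with its other uses
  // here), the blend weight below is min(0.8, |disparity|): full agreement
  // (disparity 0) keeps the measured translation, while even complete
  // disagreement still retains a kMinimumBlend = 0.2 fraction of it.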
  if (valid_background_model && !is_perfect_fit) {
    // Always move some fraction in the direction of the measured object
    // even if deemed in disagreement with previous motion.
    const float kMinimumBlend = 0.2f;
    object_translation = Lerp(
        object_translation, prev_object_motion,
        std::min(1.0f - kMinimumBlend, std::abs(next_pos->motion_disparity())));
  }

  if (curr_pos.has_pnp_homography()) {
    ApplyObjectMotionPerspectively(curr_pos, pnp_homography, domain_x, domain_y,
                                   next_pos);
  } else {
    ApplyObjectMotion(curr_pos, object_translation, object_similarity,
                      object_homography, options_, next_pos);

    ApplyCameraTrackingDegrees(curr_pos, motion_frame.background_model,
                               options_, Vector2_f(domain_x, domain_y),
                               next_pos);
  }

  if (next_pos->has_quad() &&
      !ObjectMotionValidator::IsValidQuad(next_pos->quad())) {
    LOG(ERROR) << "Quad is not convex. Cancel tracking.";
    next_pos->set_track_status(MotionBoxState::BOX_UNTRACKED);
    return;
  }

  // Store weights in the internal state: either the post-estimation inlier
  // weights or the pre-estimation prior weights, depending on options.
  const std::vector<float>& internal_weights =
      options_.use_post_estimation_weights_for_state() ? inlier_weights
                                                       : prior_weights;

  StoreInternalState(vectors, internal_weights, aspect_ratio,
                     next_pos->mutable_internal());

  // Compute center of inliers in next frame and change in scale for inliers.
  Vector2_f inlier_center;
  Vector2_f inlier_extent;
  float min_inlier_weight = 2.0f;  // Only update inlier_center if more
                                   // inliers than specified are found.
  ComputeInlierCenterAndExtent(vectors, inlier_weights, inlier_density,
                               *next_pos, &min_inlier_weight, &inlier_center,
                               &inlier_extent);

  // Determine difference to previous estimate.
  Vector2_f prev_inlier_center(InlierCenter(curr_pos));
  const float rel_inlier_center_diff =
      (inlier_center - prev_inlier_center).Norm() /
      MotionBoxSize(curr_pos).Norm();

  // Smooth with previous location, based on relative inlier difference.
  // A difference of 1.0 is mapped to a weight of 1.0 (total outlier).
  // Blend weight is maxed out at .6f to always allow measurements to propagate
  // over time (assuming high motion discrimination).
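  // Example: with discrimination 1.0 the cap is Lerp(0.95f, 0.6f, 1.0f) = 0.6,
  // so a relative center jump of 0.8 box sizes blends only 0.6 (times the
  // prior weight) towards the previous center instead of 0.8.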
  const float center_blend =
      std::min(Lerp(0.95f, 0.6f, background_discrimination),
               rel_inlier_center_diff) *
      curr_pos.prior_weight();
  inlier_center = Lerp(inlier_center, prev_inlier_center, center_blend);

  next_pos->set_inlier_center_x(inlier_center.x());
  next_pos->set_inlier_center_y(inlier_center.y());

  // Update extent only when sufficient inliers are present.
  // TODO: This is too hacky, evaluate.
  if (min_inlier_weight > 30) {
    Vector2_f prev_inlier_extent(curr_pos.inlier_width(),
                                 curr_pos.inlier_height());
    // Blend with previous extent based on prior and discrimination.
    inlier_extent = Lerp(
        inlier_extent, prev_inlier_extent,
        curr_pos.prior_weight() * Lerp(1.0, 0.85, background_discrimination));
    next_pos->set_inlier_width(inlier_extent.x());
    next_pos->set_inlier_height(inlier_extent.y());
  }

VLOG(1) << "Inlier extent " << next_pos->inlier_width() << " , "
|
|
<< next_pos->inlier_height();
|
|
|
|
  // Spring force applied to the inlier_center is modulated by the background
  // discrimination. Motivation: Low background discrimination leads to an
  // inlier center more biased towards the previous result due to the update
  // weight being tamped down. Always apply a minimum force.
  // TODO: During challenging (low inlier) situations this can
  // save the lock onto objects. Cook up a condition to set min spring force to
  // 0.25 or so.
  constexpr float kMinSpringForceFraction = 0.0;
  ApplySpringForce(inlier_center, options_.inlier_center_relative_distance(),
                   std::min(1.0f, options_.inlier_spring_force() *
                                      std::max(kMinSpringForceFraction,
                                               background_discrimination)),
                   next_pos);

  if (options_.compute_spatial_prior()) {
    // Blend based on object multiplier using high prior weight for low
    // multipliers.
    // Magic update numbers, prior is not important for tracking, only for
    // visualization purposes.
    const float prior_weight = Lerp(0.98, 0.85, background_discrimination);
    ComputeSpatialPrior(true, true, prior_weight, next_pos);
  }

  // Update velocity.
  float velocity_update_weight =
      is_perfect_fit
          ? 0.0f
          : (options_.velocity_update_weight() * curr_pos.prior_weight());
  // Computed object motion is completely random when background model is
  // invalid. Use previous motion in this case.
  if (!valid_background_model) {
    velocity_update_weight = 1.0f;
  }

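  // A weight of 0 adopts the measured object_translation outright; a weight
  // of 1 carries the previous velocity (curr_pos.dx/dy) over unchanged.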
  next_pos->set_dx(
      Lerp(object_translation.x(), curr_pos.dx(), velocity_update_weight));
  next_pos->set_dy(
      Lerp(object_translation.y(), curr_pos.dy(), velocity_update_weight));

  // Update prior.
  if (valid_background_model) {
    next_pos->set_prior_weight(std::min(
        1.0f, curr_pos.prior_weight() + options_.prior_weight_increase()));
  } else {
    next_pos->set_prior_weight(std::max(
        0.0f, curr_pos.prior_weight() - options_.prior_weight_increase()));
  }

  next_pos->set_track_status(MotionBoxState::BOX_TRACKED);
}

void MotionVectorFrameFromTrackingData(const TrackingData& tracking_data,
                                       MotionVectorFrame* motion_vector_frame) {
  CHECK(motion_vector_frame != nullptr);

  const auto& motion_data = tracking_data.motion_data();
  float aspect_ratio = tracking_data.frame_aspect();
  if (aspect_ratio < 0.1 || aspect_ratio > 10.0f) {
    LOG(ERROR) << "Aspect ratio : " << aspect_ratio << " is out of bounds. "
               << "Resetting to 1.0.";
    aspect_ratio = 1.0f;
  }

  float scale_x, scale_y;
  // Normalize longest dimension to 1 under aspect ratio preserving scaling.
  ScaleFromAspect(aspect_ratio, false, &scale_x, &scale_y);

  scale_x /= tracking_data.domain_width();
  scale_y /= tracking_data.domain_height();

  const bool use_background_model =
      !(tracking_data.frame_flags() & TrackingData::FLAG_BACKGROUND_UNSTABLE);

  Homography homog_scale = HomographyAdapter::Embed(
      AffineAdapter::FromArgs(0, 0, scale_x, 0, 0, scale_y));

  Homography inv_homog_scale = HomographyAdapter::Embed(
      AffineAdapter::FromArgs(0, 0, 1.0f / scale_x, 0, 0, 1.0f / scale_y));

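  // Conjugate the background model with the scaling transform so it operates
  // on normalized coordinates: S * H * S^-1 maps a normalized point back to
  // the original domain, applies the model there, and re-normalizes.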
  // Might be just the identity if not set.
  const Homography background_model = tracking_data.background_model();
  const Homography background_model_scaled =
      ModelCompose3(homog_scale, background_model, inv_homog_scale);

  motion_vector_frame->background_model.CopyFrom(background_model_scaled);
  motion_vector_frame->valid_background_model = use_background_model;
  motion_vector_frame->is_duplicated =
      tracking_data.frame_flags() & TrackingData::FLAG_DUPLICATED;
  motion_vector_frame->is_chunk_boundary =
      tracking_data.frame_flags() & TrackingData::FLAG_CHUNK_BOUNDARY;
  motion_vector_frame->aspect_ratio = tracking_data.frame_aspect();
  motion_vector_frame->motion_vectors.reserve(motion_data.row_indices_size());

  motion_vector_frame->motion_vectors.clear();
  const bool long_tracks = motion_data.track_id_size() > 0;

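  // Motion data is stored column-major in a compressed sparse layout akin to
  // CSC matrices: col_starts(c) indexes the first entry of column c within
  // row_indices and vector_data (two floats, dx and dy, per entry).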
  for (int c = 0; c < motion_data.col_starts_size() - 1; ++c) {
    const float x = c;
    const float scaled_x = x * scale_x;

    for (int r = motion_data.col_starts(c),
             r_end = motion_data.col_starts(c + 1);
         r < r_end; ++r) {
      MotionVector motion_vector;

      const float y = motion_data.row_indices(r);
      const float scaled_y = y * scale_y;

      const float dx = motion_data.vector_data(2 * r);
      const float dy = motion_data.vector_data(2 * r + 1);

      if (use_background_model) {
        Vector2_f loc(x, y);
        Vector2_f background_motion =
            HomographyAdapter::TransformPoint(background_model, loc) - loc;
        motion_vector.background = Vector2_f(background_motion.x() * scale_x,
                                             background_motion.y() * scale_y);
      }

      motion_vector.pos = Vector2_f(scaled_x, scaled_y);
      motion_vector.object = Vector2_f(dx * scale_x, dy * scale_y);

      if (long_tracks) {
        motion_vector.track_id = motion_data.track_id(r);
      }
      motion_vector_frame->motion_vectors.push_back(motion_vector);
    }
  }
}

void FeatureAndDescriptorFromTrackingData(
    const TrackingData& tracking_data, std::vector<Vector2_f>* features,
    std::vector<std::string>* descriptors) {
  const auto& motion_data = tracking_data.motion_data();
  float aspect_ratio = tracking_data.frame_aspect();
  if (aspect_ratio < 0.1 || aspect_ratio > 10.0f) {
    LOG(ERROR) << "Aspect ratio : " << aspect_ratio << " is out of bounds. "
               << "Resetting to 1.0.";
    aspect_ratio = 1.0f;
  }

  if (motion_data.feature_descriptors_size() == 0) {
    LOG(WARNING) << "Feature descriptors do not exist.";
    return;
  }

  float scale_x, scale_y;
  // Normalize longest dimension to 1 under aspect ratio preserving scaling.
  ScaleFromAspect(aspect_ratio, false, &scale_x, &scale_y);
  scale_x /= tracking_data.domain_width();
  scale_y /= tracking_data.domain_height();

  features->clear();
  descriptors->clear();

  for (int c = 0; c < motion_data.col_starts_size() - 1; ++c) {
    const float x = c;
    const float scaled_x = x * scale_x;

    for (int r = motion_data.col_starts(c),
             r_end = motion_data.col_starts(c + 1);
         r < r_end; ++r) {
      const std::string& descriptor = motion_data.feature_descriptors(r).data();

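      // An all-zero descriptor carries no information; skip it so features
      // and descriptors stay paired one-to-one.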
      if (absl::c_all_of(descriptor, [](char c) { return c == 0; })) {
        continue;
      }

      const float y = motion_data.row_indices(r);
      const float scaled_y = y * scale_y;

      features->emplace_back(scaled_x, scaled_y);
      descriptors->emplace_back(descriptor);
    }
  }
}

void InvertMotionVectorFrame(const MotionVectorFrame& input,
                             MotionVectorFrame* output) {
  CHECK(output != nullptr);

  output->background_model.CopyFrom(ModelInvert(input.background_model));
  output->valid_background_model = input.valid_background_model;
  output->is_duplicated = input.is_duplicated;
  output->is_chunk_boundary = input.is_chunk_boundary;
  output->duration_ms = input.duration_ms;
  output->aspect_ratio = input.aspect_ratio;
  output->motion_vectors.clear();
  output->motion_vectors.reserve(input.motion_vectors.size());
  output->actively_discarded_tracked_ids = input.actively_discarded_tracked_ids;

  const float aspect_ratio = input.aspect_ratio;
  float domain_x = 1.0f;
  float domain_y = 1.0f;
  ScaleFromAspect(aspect_ratio, false, &domain_x, &domain_y);

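  // Inversion maps each vector to its end point with negated motion: the new
  // anchor is pos + background + object (in original terms), and the negated
  // components point back to the original position.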
  // Explicit copy.
  for (auto motion_vec : input.motion_vectors) {
    motion_vec.background *= -1.0f;
    motion_vec.object *= -1.0f;

    motion_vec.pos -= motion_vec.background + motion_vec.object;

    // Inverted vector might be out of bound.
    if (motion_vec.pos.x() < 0.0f || motion_vec.pos.x() > domain_x ||
        motion_vec.pos.y() < 0.0f || motion_vec.pos.y() > domain_y) {
      continue;
    }

    // Approximately 40 - 60% of all inserts happen at the end, so try that
    // fast path first before falling back to a binary-search insertion that
    // keeps the vectors sorted lexicographically.
    if (output->motion_vectors.empty() ||
        MotionVectorComparator()(output->motion_vectors.back(), motion_vec)) {
      output->motion_vectors.push_back(motion_vec);
    } else {
      output->motion_vectors.insert(
          std::lower_bound(output->motion_vectors.begin(),
                           output->motion_vectors.end(), motion_vec,
                           MotionVectorComparator()),
          motion_vec);
    }
  }
}

float TrackingDataDurationMs(const TrackingDataChunk::Item& item) {
  return (item.timestamp_usec() - item.prev_timestamp_usec()) * 1e-3f;
}

void GetFeatureIndicesWithinBox(const std::vector<Vector2_f>& features,
                                const MotionBoxState& box_state,
                                const Vector2_f& box_scaling,
                                float max_enlarge_size, int min_num_features,
                                std::vector<int>* inlier_indices) {
  CHECK(inlier_indices);
  inlier_indices->clear();

  if (features.empty()) return;
  std::array<Vector3_f, 4> box_lines;
  if (!MotionBoxLines(box_state, box_scaling, &box_lines)) {
    LOG(ERROR) << "Error in computing MotionBoxLines.";
    return;
  }

  // If the box size isn't big enough to cover sufficient features to
  // reacquire the box, the following code iteratively enlarges the box
  // by half of 'max_enlarge_size' to include more features, at most twice.
  float distance_threshold = 0.0f;
  int inliers_count = 0;
  std::vector<bool> chosen(features.size(), false);
  std::vector<float> signed_distance(features.size());

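  // Each box line is stored in homogeneous form (a, b, c); the dot product
  // with (x, y, 1) is the signed distance to that edge. The distance of a
  // feature to the box is the maximum over the four lines: negative inside
  // the box, positive outside by that amount.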
  for (int j = 0; j < features.size(); ++j) {
    float max_dist = std::numeric_limits<float>::lowest();
    for (const Vector3_f& line : box_lines) {
      float dist =
          line.DotProd(Vector3_f(features[j].x(), features[j].y(), 1.0f));
      if (dist > max_enlarge_size) {
        max_dist = dist;
        break;
      }
      max_dist = std::max(dist, max_dist);
    }

    signed_distance[j] = max_dist;
    if (signed_distance[j] < distance_threshold) {
      ++inliers_count;
      chosen[j] = true;
      inlier_indices->push_back(j);
    }
  }

  const float box_enlarge_step = max_enlarge_size * 0.5f;
  while (inliers_count < min_num_features) {
    distance_threshold += box_enlarge_step;
    if (distance_threshold > max_enlarge_size) break;
    for (int j = 0; j < features.size(); ++j) {
      if (chosen[j]) continue;
      if (signed_distance[j] < distance_threshold) {
        ++inliers_count;
        chosen[j] = true;
        inlier_indices->push_back(j);
      }
    }
  }
}

}  // namespace mediapipe.