mediapipe-rs/mediapipe/modules/face_geometry/libs/geometry_pipeline.cc

// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mediapipe/modules/face_geometry/libs/geometry_pipeline.h"

#include <cmath>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

#include "Eigen/Core"
#include "absl/memory/memory.h"
#include "mediapipe/framework/formats/landmark.pb.h"
#include "mediapipe/framework/formats/matrix.h"
#include "mediapipe/framework/formats/matrix_data.pb.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/port/status_macros.h"
#include "mediapipe/framework/port/statusor.h"
#include "mediapipe/modules/face_geometry/libs/mesh_3d_utils.h"
#include "mediapipe/modules/face_geometry/libs/procrustes_solver.h"
#include "mediapipe/modules/face_geometry/libs/validation_utils.h"
#include "mediapipe/modules/face_geometry/protos/environment.pb.h"
#include "mediapipe/modules/face_geometry/protos/face_geometry.pb.h"
#include "mediapipe/modules/face_geometry/protos/geometry_pipeline_metadata.pb.h"
#include "mediapipe/modules/face_geometry/protos/mesh_3d.pb.h"

namespace mediapipe::face_geometry {
namespace {

struct PerspectiveCameraFrustum {
  // NOTE: all arguments must be validated prior to calling this constructor.
  PerspectiveCameraFrustum(const PerspectiveCamera& perspective_camera,
                           int frame_width, int frame_height) {
    static constexpr float kDegreesToRadians = 3.14159265358979323846f / 180.f;

    const float height_at_near =
        2.f * perspective_camera.near() *
        std::tan(0.5f * kDegreesToRadians *
                 perspective_camera.vertical_fov_degrees());

    const float width_at_near = frame_width * height_at_near / frame_height;

    left = -0.5f * width_at_near;
    right = 0.5f * width_at_near;
    bottom = -0.5f * height_at_near;
    top = 0.5f * height_at_near;
    near = perspective_camera.near();
    far = perspective_camera.far();
  }

  float left;
  float right;
  float bottom;
  float top;
  float near;
  float far;
};

class ScreenToMetricSpaceConverter {
 public:
  ScreenToMetricSpaceConverter(
      OriginPointLocation origin_point_location,      //
      InputSource input_source,                       //
      Eigen::Matrix3Xf&& canonical_metric_landmarks,  //
      Eigen::VectorXf&& landmark_weights,             //
      std::unique_ptr<ProcrustesSolver> procrustes_solver)
      : origin_point_location_(origin_point_location),
        input_source_(input_source),
        canonical_metric_landmarks_(std::move(canonical_metric_landmarks)),
        landmark_weights_(std::move(landmark_weights)),
        procrustes_solver_(std::move(procrustes_solver)) {}

  // Converts `screen_landmark_list` into `metric_landmark_list` and estimates
  // the `pose_transform_mat`.
  //
  // Here's the algorithm summary:
  //
  // (1) Project X- and Y- screen landmark coordinates at the Z near plane.
  //
  // (2) Estimate a canonical-to-runtime landmark set scale by running the
  //     Procrustes solver using the screen runtime landmarks.
  //
  //     On this iteration, screen landmarks are used instead of unprojected
  //     metric landmarks as it is not safe to unproject due to the relative
  //     nature of the input screen landmark Z coordinate.
  //
  // (3) Use the canonical-to-runtime scale from (2) to unproject the screen
  //     landmarks. The result is referenced as "intermediate landmarks" because
  //     they are the first estimation of the resuling metric landmarks, but are
  //     not quite there yet.
  //
  // (4) Estimate a canonical-to-runtime landmark set scale by running the
  //     Procrustes solver using the intermediate runtime landmarks.
  //
  // (5) Use the product of the scale factors from (2) and (4) to unproject
  //     the screen landmarks the second time. This is the second and the final
  //     estimation of the metric landmarks.
  //
  // (6) Multiply each of the metric landmarks by the inverse pose
  //     transformation matrix to align the runtime metric face landmarks with
  //     the canonical metric face landmarks.
  //
  // Note: the input screen landmarks are in the left-handed coordinate system,
  //       however any metric landmarks - including the canonical metric
  //       landmarks, the final runtime metric landmarks and any intermediate
  //       runtime metric landmarks - are in the right-handed coordinate system.
  //
  //       To keep the logic correct, the landmark set handedness is changed any
  //       time the screen-to-metric semantic barrier is passed.
  absl::Status Convert(const NormalizedLandmarkList& screen_landmark_list,  //
                       const PerspectiveCameraFrustum& pcf,                 //
                       LandmarkList& metric_landmark_list,                  //
                       Eigen::Matrix4f& pose_transform_mat) const {
    RET_CHECK_EQ(screen_landmark_list.landmark_size(),
                 canonical_metric_landmarks_.cols())
        << "The number of landmarks doesn't match the number passed upon "
           "initialization!";

    Eigen::Matrix3Xf screen_landmarks;
    ConvertLandmarkListToEigenMatrix(screen_landmark_list, screen_landmarks);

    ProjectXY(pcf, screen_landmarks);
    const float depth_offset = screen_landmarks.row(2).mean();

    // 1st iteration: don't unproject XY because it's unsafe to do so due to
    //                the relative nature of the Z coordinate. Instead, run the
    //                first estimation on the projected XY and use that scale to
    //                unproject for the 2nd iteration.
    Eigen::Matrix3Xf intermediate_landmarks(screen_landmarks);
    ChangeHandedness(intermediate_landmarks);

    ASSIGN_OR_RETURN(const float first_iteration_scale,
                     EstimateScale(intermediate_landmarks),
                     _ << "Failed to estimate first iteration scale!");

    // 2nd iteration: unproject XY using the scale from the 1st iteration.
    intermediate_landmarks = screen_landmarks;
    MoveAndRescaleZ(pcf, depth_offset, first_iteration_scale,
                    intermediate_landmarks);
    UnprojectXY(pcf, intermediate_landmarks);
    ChangeHandedness(intermediate_landmarks);

    // For face detection input landmarks, re-write Z-coord from the canonical
    // landmarks.
    if (input_source_ == InputSource::FACE_DETECTION_PIPELINE) {
      Eigen::Matrix4f intermediate_pose_transform_mat;
      MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
          canonical_metric_landmarks_, intermediate_landmarks,
          landmark_weights_, intermediate_pose_transform_mat))
          << "Failed to estimate pose transform matrix!";

      intermediate_landmarks.row(2) =
          (intermediate_pose_transform_mat *
           canonical_metric_landmarks_.colwise().homogeneous())
              .row(2);
    }
    ASSIGN_OR_RETURN(const float second_iteration_scale,
                     EstimateScale(intermediate_landmarks),
                     _ << "Failed to estimate second iteration scale!");

    // Use the total scale to unproject the screen landmarks.
    const float total_scale = first_iteration_scale * second_iteration_scale;
    MoveAndRescaleZ(pcf, depth_offset, total_scale, screen_landmarks);
    UnprojectXY(pcf, screen_landmarks);
    ChangeHandedness(screen_landmarks);

    // At this point, screen landmarks are converted into metric landmarks.
    Eigen::Matrix3Xf& metric_landmarks = screen_landmarks;

    MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
        canonical_metric_landmarks_, metric_landmarks, landmark_weights_,
        pose_transform_mat))
        << "Failed to estimate pose transform matrix!";

    // For face detection input landmarks, re-write Z-coord from the canonical
    // landmarks and run the pose transform estimation again.
    if (input_source_ == InputSource::FACE_DETECTION_PIPELINE) {
      metric_landmarks.row(2) =
          (pose_transform_mat *
           canonical_metric_landmarks_.colwise().homogeneous())
              .row(2);

      MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
          canonical_metric_landmarks_, metric_landmarks, landmark_weights_,
          pose_transform_mat))
          << "Failed to estimate pose transform matrix!";
    }

    // Multiply each of the metric landmarks by the inverse pose
    // transformation matrix to align the runtime metric face landmarks with
    // the canonical metric face landmarks.
    metric_landmarks = (pose_transform_mat.inverse() *
                        metric_landmarks.colwise().homogeneous())
                           .topRows(3);

    ConvertEigenMatrixToLandmarkList(metric_landmarks, metric_landmark_list);

    return absl::OkStatus();
  }

 private:
  void ProjectXY(const PerspectiveCameraFrustum& pcf,
                 Eigen::Matrix3Xf& landmarks) const {
    float x_scale = pcf.right - pcf.left;
    float y_scale = pcf.top - pcf.bottom;
    float x_translation = pcf.left;
    float y_translation = pcf.bottom;

    if (origin_point_location_ == OriginPointLocation::TOP_LEFT_CORNER) {
      landmarks.row(1) = 1.f - landmarks.row(1).array();
    }

    landmarks =
        landmarks.array().colwise() * Eigen::Array3f(x_scale, y_scale, x_scale);
    landmarks.colwise() += Eigen::Vector3f(x_translation, y_translation, 0.f);
  }

  absl::StatusOr<float> EstimateScale(Eigen::Matrix3Xf& landmarks) const {
    Eigen::Matrix4f transform_mat;
    MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
        canonical_metric_landmarks_, landmarks, landmark_weights_,
        transform_mat))
        << "Failed to estimate canonical-to-runtime landmark set transform!";

    return transform_mat.col(0).norm();
  }

  static void MoveAndRescaleZ(const PerspectiveCameraFrustum& pcf,
                              float depth_offset, float scale,
                              Eigen::Matrix3Xf& landmarks) {
    landmarks.row(2) =
        (landmarks.array().row(2) - depth_offset + pcf.near) / scale;
  }

  static void UnprojectXY(const PerspectiveCameraFrustum& pcf,
                          Eigen::Matrix3Xf& landmarks) {
    landmarks.row(0) =
        landmarks.row(0).cwiseProduct(landmarks.row(2)) / pcf.near;
    landmarks.row(1) =
        landmarks.row(1).cwiseProduct(landmarks.row(2)) / pcf.near;
  }

  static void ChangeHandedness(Eigen::Matrix3Xf& landmarks) {
    landmarks.row(2) *= -1.f;
  }

  static void ConvertLandmarkListToEigenMatrix(
      const NormalizedLandmarkList& landmark_list,
      Eigen::Matrix3Xf& eigen_matrix) {
    eigen_matrix = Eigen::Matrix3Xf(3, landmark_list.landmark_size());
    for (int i = 0; i < landmark_list.landmark_size(); ++i) {
      const auto& landmark = landmark_list.landmark(i);
      eigen_matrix(0, i) = landmark.x();
      eigen_matrix(1, i) = landmark.y();
      eigen_matrix(2, i) = landmark.z();
    }
  }

  static void ConvertEigenMatrixToLandmarkList(
      const Eigen::Matrix3Xf& eigen_matrix, LandmarkList& landmark_list) {
    landmark_list.Clear();

    for (int i = 0; i < eigen_matrix.cols(); ++i) {
      auto& landmark = *landmark_list.add_landmark();
      landmark.set_x(eigen_matrix(0, i));
      landmark.set_y(eigen_matrix(1, i));
      landmark.set_z(eigen_matrix(2, i));
    }
  }

  const OriginPointLocation origin_point_location_;
  const InputSource input_source_;
  Eigen::Matrix3Xf canonical_metric_landmarks_;
  Eigen::VectorXf landmark_weights_;

  std::unique_ptr<ProcrustesSolver> procrustes_solver_;
};

class GeometryPipelineImpl : public GeometryPipeline {
 public:
  GeometryPipelineImpl(
      const PerspectiveCamera& perspective_camera,  //
      const Mesh3d& canonical_mesh,                 //
      uint32_t canonical_mesh_vertex_size,          //
      uint32_t canonical_mesh_num_vertices,
      uint32_t canonical_mesh_vertex_position_offset,
      std::unique_ptr<ScreenToMetricSpaceConverter> space_converter)
      : perspective_camera_(perspective_camera),
        canonical_mesh_(canonical_mesh),
        canonical_mesh_vertex_size_(canonical_mesh_vertex_size),
        canonical_mesh_num_vertices_(canonical_mesh_num_vertices),
        canonical_mesh_vertex_position_offset_(
            canonical_mesh_vertex_position_offset),
        space_converter_(std::move(space_converter)) {}

  absl::StatusOr<std::vector<FaceGeometry>> EstimateFaceGeometry(
      const std::vector<NormalizedLandmarkList>& multi_face_landmarks,
      int frame_width, int frame_height) const override {
    MP_RETURN_IF_ERROR(ValidateFrameDimensions(frame_width, frame_height))
        << "Invalid frame dimensions!";

    // Create a perspective camera frustum to be shared for geometry estimation
    // per each face.
    PerspectiveCameraFrustum pcf(perspective_camera_, frame_width,
                                 frame_height);

    std::vector<FaceGeometry> multi_face_geometry;

    // From this point, the meaning of "face landmarks" is clarified further as
    // "screen face landmarks". This is done do distinguish from "metric face
    // landmarks" that are derived during the face geometry estimation process.
    for (const NormalizedLandmarkList& screen_face_landmarks :
         multi_face_landmarks) {
      // Having a too compact screen landmark list will result in numerical
      // instabilities, therefore such faces are filtered.
      if (IsScreenLandmarkListTooCompact(screen_face_landmarks)) {
        continue;
      }

      // Convert the screen landmarks into the metric landmarks and get the pose
      // transformation matrix.
      LandmarkList metric_face_landmarks;
      Eigen::Matrix4f pose_transform_mat;
      MP_RETURN_IF_ERROR(space_converter_->Convert(screen_face_landmarks, pcf,
                                                   metric_face_landmarks,
                                                   pose_transform_mat))
          << "Failed to convert landmarks from the screen to the metric space!";

      // Pack geometry data for this face.
      FaceGeometry face_geometry;
      Mesh3d* mutable_mesh = face_geometry.mutable_mesh();
      // Copy the canonical face mesh as the face geometry mesh.
      mutable_mesh->CopyFrom(canonical_mesh_);
      // Replace XYZ vertex mesh coodinates with the metric landmark positions.
      for (int i = 0; i < canonical_mesh_num_vertices_; ++i) {
        uint32_t vertex_buffer_offset = canonical_mesh_vertex_size_ * i +
                                        canonical_mesh_vertex_position_offset_;

        mutable_mesh->set_vertex_buffer(vertex_buffer_offset,
                                        metric_face_landmarks.landmark(i).x());
        mutable_mesh->set_vertex_buffer(vertex_buffer_offset + 1,
                                        metric_face_landmarks.landmark(i).y());
        mutable_mesh->set_vertex_buffer(vertex_buffer_offset + 2,
                                        metric_face_landmarks.landmark(i).z());
      }
      // Populate the face pose transformation matrix.
      mediapipe::MatrixDataProtoFromMatrix(
          pose_transform_mat, face_geometry.mutable_pose_transform_matrix());

      multi_face_geometry.push_back(face_geometry);
    }

    return multi_face_geometry;
  }

 private:
  static bool IsScreenLandmarkListTooCompact(
      const NormalizedLandmarkList& screen_landmarks) {
    float mean_x = 0.f;
    float mean_y = 0.f;
    for (int i = 0; i < screen_landmarks.landmark_size(); ++i) {
      const auto& landmark = screen_landmarks.landmark(i);
      mean_x += (landmark.x() - mean_x) / static_cast<float>(i + 1);
      mean_y += (landmark.y() - mean_y) / static_cast<float>(i + 1);
    }

    float max_sq_dist = 0.f;
    for (const auto& landmark : screen_landmarks.landmark()) {
      const float d_x = landmark.x() - mean_x;
      const float d_y = landmark.y() - mean_y;
      max_sq_dist = std::max(max_sq_dist, d_x * d_x + d_y * d_y);
    }

    static constexpr float kIsScreenLandmarkListTooCompactThreshold = 1e-3f;
    return std::sqrt(max_sq_dist) <= kIsScreenLandmarkListTooCompactThreshold;
  }

  const PerspectiveCamera perspective_camera_;
  const Mesh3d canonical_mesh_;
  const uint32_t canonical_mesh_vertex_size_;
  const uint32_t canonical_mesh_num_vertices_;
  const uint32_t canonical_mesh_vertex_position_offset_;

  std::unique_ptr<ScreenToMetricSpaceConverter> space_converter_;
};

}  // namespace

absl::StatusOr<std::unique_ptr<GeometryPipeline>> CreateGeometryPipeline(
    const Environment& environment, const GeometryPipelineMetadata& metadata) {
  MP_RETURN_IF_ERROR(ValidateEnvironment(environment))
      << "Invalid environment!";
  MP_RETURN_IF_ERROR(ValidateGeometryPipelineMetadata(metadata))
      << "Invalid geometry pipeline metadata!";

  const auto& canonical_mesh = metadata.canonical_mesh();
  RET_CHECK(HasVertexComponent(canonical_mesh.vertex_type(),
                               VertexComponent::POSITION))
      << "Canonical face mesh must have the `POSITION` vertex component!";
  RET_CHECK(HasVertexComponent(canonical_mesh.vertex_type(),
                               VertexComponent::TEX_COORD))
      << "Canonical face mesh must have the `TEX_COORD` vertex component!";

  uint32_t canonical_mesh_vertex_size =
      GetVertexSize(canonical_mesh.vertex_type());
  uint32_t canonical_mesh_num_vertices =
      canonical_mesh.vertex_buffer_size() / canonical_mesh_vertex_size;
  uint32_t canonical_mesh_vertex_position_offset =
      GetVertexComponentOffset(canonical_mesh.vertex_type(),
                               VertexComponent::POSITION)
          .value();

  // Put the Procrustes landmark basis into Eigen matrices for an easier access.
  Eigen::Matrix3Xf canonical_metric_landmarks =
      Eigen::Matrix3Xf::Zero(3, canonical_mesh_num_vertices);
  Eigen::VectorXf landmark_weights =
      Eigen::VectorXf::Zero(canonical_mesh_num_vertices);

  for (int i = 0; i < canonical_mesh_num_vertices; ++i) {
    uint32_t vertex_buffer_offset =
        canonical_mesh_vertex_size * i + canonical_mesh_vertex_position_offset;

    canonical_metric_landmarks(0, i) =
        canonical_mesh.vertex_buffer(vertex_buffer_offset);
    canonical_metric_landmarks(1, i) =
        canonical_mesh.vertex_buffer(vertex_buffer_offset + 1);
    canonical_metric_landmarks(2, i) =
        canonical_mesh.vertex_buffer(vertex_buffer_offset + 2);
  }

  for (const WeightedLandmarkRef& wlr : metadata.procrustes_landmark_basis()) {
    uint32_t landmark_id = wlr.landmark_id();
    landmark_weights(landmark_id) = wlr.weight();
  }

  std::unique_ptr<GeometryPipeline> result =
      absl::make_unique<GeometryPipelineImpl>(
          environment.perspective_camera(), canonical_mesh,
          canonical_mesh_vertex_size, canonical_mesh_num_vertices,
          canonical_mesh_vertex_position_offset,
          absl::make_unique<ScreenToMetricSpaceConverter>(
              environment.origin_point_location(),
              metadata.input_source() == InputSource::DEFAULT
                  ? InputSource::FACE_LANDMARK_PIPELINE
                  : metadata.input_source(),
              std::move(canonical_metric_landmarks),
              std::move(landmark_weights),
              CreateFloatPrecisionProcrustesSolver()));

  return result;
}

}  // namespace mediapipe::face_geometry