467 lines
20 KiB
C++
467 lines
20 KiB
C++
// Copyright 2020 The MediaPipe Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "mediapipe/modules/face_geometry/libs/geometry_pipeline.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdint>
|
|
#include <memory>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "Eigen/Core"
|
|
#include "absl/memory/memory.h"
|
|
#include "mediapipe/framework/formats/landmark.pb.h"
|
|
#include "mediapipe/framework/formats/matrix.h"
|
|
#include "mediapipe/framework/formats/matrix_data.pb.h"
|
|
#include "mediapipe/framework/port/ret_check.h"
|
|
#include "mediapipe/framework/port/status.h"
|
|
#include "mediapipe/framework/port/status_macros.h"
|
|
#include "mediapipe/framework/port/statusor.h"
|
|
#include "mediapipe/modules/face_geometry/libs/mesh_3d_utils.h"
|
|
#include "mediapipe/modules/face_geometry/libs/procrustes_solver.h"
|
|
#include "mediapipe/modules/face_geometry/libs/validation_utils.h"
|
|
#include "mediapipe/modules/face_geometry/protos/environment.pb.h"
|
|
#include "mediapipe/modules/face_geometry/protos/face_geometry.pb.h"
|
|
#include "mediapipe/modules/face_geometry/protos/geometry_pipeline_metadata.pb.h"
|
|
#include "mediapipe/modules/face_geometry/protos/mesh_3d.pb.h"
|
|
|
|
namespace mediapipe::face_geometry {
|
|
namespace {
|
|
|
|
struct PerspectiveCameraFrustum {
|
|
// NOTE: all arguments must be validated prior to calling this constructor.
|
|
PerspectiveCameraFrustum(const PerspectiveCamera& perspective_camera,
|
|
int frame_width, int frame_height) {
|
|
static constexpr float kDegreesToRadians = 3.14159265358979323846f / 180.f;
|
|
|
|
const float height_at_near =
|
|
2.f * perspective_camera.near() *
|
|
std::tan(0.5f * kDegreesToRadians *
|
|
perspective_camera.vertical_fov_degrees());
|
|
|
|
const float width_at_near = frame_width * height_at_near / frame_height;
|
|
|
|
left = -0.5f * width_at_near;
|
|
right = 0.5f * width_at_near;
|
|
bottom = -0.5f * height_at_near;
|
|
top = 0.5f * height_at_near;
|
|
near = perspective_camera.near();
|
|
far = perspective_camera.far();
|
|
}
|
|
|
|
float left;
|
|
float right;
|
|
float bottom;
|
|
float top;
|
|
float near;
|
|
float far;
|
|
};
|
|
|
|
class ScreenToMetricSpaceConverter {
|
|
public:
|
|
ScreenToMetricSpaceConverter(
|
|
OriginPointLocation origin_point_location, //
|
|
InputSource input_source, //
|
|
Eigen::Matrix3Xf&& canonical_metric_landmarks, //
|
|
Eigen::VectorXf&& landmark_weights, //
|
|
std::unique_ptr<ProcrustesSolver> procrustes_solver)
|
|
: origin_point_location_(origin_point_location),
|
|
input_source_(input_source),
|
|
canonical_metric_landmarks_(std::move(canonical_metric_landmarks)),
|
|
landmark_weights_(std::move(landmark_weights)),
|
|
procrustes_solver_(std::move(procrustes_solver)) {}
|
|
|
|
// Converts `screen_landmark_list` into `metric_landmark_list` and estimates
|
|
// the `pose_transform_mat`.
|
|
//
|
|
// Here's the algorithm summary:
|
|
//
|
|
// (1) Project X- and Y- screen landmark coordinates at the Z near plane.
|
|
//
|
|
// (2) Estimate a canonical-to-runtime landmark set scale by running the
|
|
// Procrustes solver using the screen runtime landmarks.
|
|
//
|
|
// On this iteration, screen landmarks are used instead of unprojected
|
|
// metric landmarks as it is not safe to unproject due to the relative
|
|
// nature of the input screen landmark Z coordinate.
|
|
//
|
|
// (3) Use the canonical-to-runtime scale from (2) to unproject the screen
|
|
// landmarks. The result is referenced as "intermediate landmarks" because
|
|
// they are the first estimation of the resuling metric landmarks, but are
|
|
// not quite there yet.
|
|
//
|
|
// (4) Estimate a canonical-to-runtime landmark set scale by running the
|
|
// Procrustes solver using the intermediate runtime landmarks.
|
|
//
|
|
// (5) Use the product of the scale factors from (2) and (4) to unproject
|
|
// the screen landmarks the second time. This is the second and the final
|
|
// estimation of the metric landmarks.
|
|
//
|
|
// (6) Multiply each of the metric landmarks by the inverse pose
|
|
// transformation matrix to align the runtime metric face landmarks with
|
|
// the canonical metric face landmarks.
|
|
//
|
|
// Note: the input screen landmarks are in the left-handed coordinate system,
|
|
// however any metric landmarks - including the canonical metric
|
|
// landmarks, the final runtime metric landmarks and any intermediate
|
|
// runtime metric landmarks - are in the right-handed coordinate system.
|
|
//
|
|
// To keep the logic correct, the landmark set handedness is changed any
|
|
// time the screen-to-metric semantic barrier is passed.
|
|
absl::Status Convert(const NormalizedLandmarkList& screen_landmark_list, //
|
|
const PerspectiveCameraFrustum& pcf, //
|
|
LandmarkList& metric_landmark_list, //
|
|
Eigen::Matrix4f& pose_transform_mat) const {
|
|
RET_CHECK_EQ(screen_landmark_list.landmark_size(),
|
|
canonical_metric_landmarks_.cols())
|
|
<< "The number of landmarks doesn't match the number passed upon "
|
|
"initialization!";
|
|
|
|
Eigen::Matrix3Xf screen_landmarks;
|
|
ConvertLandmarkListToEigenMatrix(screen_landmark_list, screen_landmarks);
|
|
|
|
ProjectXY(pcf, screen_landmarks);
|
|
const float depth_offset = screen_landmarks.row(2).mean();
|
|
|
|
// 1st iteration: don't unproject XY because it's unsafe to do so due to
|
|
// the relative nature of the Z coordinate. Instead, run the
|
|
// first estimation on the projected XY and use that scale to
|
|
// unproject for the 2nd iteration.
|
|
Eigen::Matrix3Xf intermediate_landmarks(screen_landmarks);
|
|
ChangeHandedness(intermediate_landmarks);
|
|
|
|
ASSIGN_OR_RETURN(const float first_iteration_scale,
|
|
EstimateScale(intermediate_landmarks),
|
|
_ << "Failed to estimate first iteration scale!");
|
|
|
|
// 2nd iteration: unproject XY using the scale from the 1st iteration.
|
|
intermediate_landmarks = screen_landmarks;
|
|
MoveAndRescaleZ(pcf, depth_offset, first_iteration_scale,
|
|
intermediate_landmarks);
|
|
UnprojectXY(pcf, intermediate_landmarks);
|
|
ChangeHandedness(intermediate_landmarks);
|
|
|
|
// For face detection input landmarks, re-write Z-coord from the canonical
|
|
// landmarks.
|
|
if (input_source_ == InputSource::FACE_DETECTION_PIPELINE) {
|
|
Eigen::Matrix4f intermediate_pose_transform_mat;
|
|
MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
|
|
canonical_metric_landmarks_, intermediate_landmarks,
|
|
landmark_weights_, intermediate_pose_transform_mat))
|
|
<< "Failed to estimate pose transform matrix!";
|
|
|
|
intermediate_landmarks.row(2) =
|
|
(intermediate_pose_transform_mat *
|
|
canonical_metric_landmarks_.colwise().homogeneous())
|
|
.row(2);
|
|
}
|
|
ASSIGN_OR_RETURN(const float second_iteration_scale,
|
|
EstimateScale(intermediate_landmarks),
|
|
_ << "Failed to estimate second iteration scale!");
|
|
|
|
// Use the total scale to unproject the screen landmarks.
|
|
const float total_scale = first_iteration_scale * second_iteration_scale;
|
|
MoveAndRescaleZ(pcf, depth_offset, total_scale, screen_landmarks);
|
|
UnprojectXY(pcf, screen_landmarks);
|
|
ChangeHandedness(screen_landmarks);
|
|
|
|
// At this point, screen landmarks are converted into metric landmarks.
|
|
Eigen::Matrix3Xf& metric_landmarks = screen_landmarks;
|
|
|
|
MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
|
|
canonical_metric_landmarks_, metric_landmarks, landmark_weights_,
|
|
pose_transform_mat))
|
|
<< "Failed to estimate pose transform matrix!";
|
|
|
|
// For face detection input landmarks, re-write Z-coord from the canonical
|
|
// landmarks and run the pose transform estimation again.
|
|
if (input_source_ == InputSource::FACE_DETECTION_PIPELINE) {
|
|
metric_landmarks.row(2) =
|
|
(pose_transform_mat *
|
|
canonical_metric_landmarks_.colwise().homogeneous())
|
|
.row(2);
|
|
|
|
MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
|
|
canonical_metric_landmarks_, metric_landmarks, landmark_weights_,
|
|
pose_transform_mat))
|
|
<< "Failed to estimate pose transform matrix!";
|
|
}
|
|
|
|
// Multiply each of the metric landmarks by the inverse pose
|
|
// transformation matrix to align the runtime metric face landmarks with
|
|
// the canonical metric face landmarks.
|
|
metric_landmarks = (pose_transform_mat.inverse() *
|
|
metric_landmarks.colwise().homogeneous())
|
|
.topRows(3);
|
|
|
|
ConvertEigenMatrixToLandmarkList(metric_landmarks, metric_landmark_list);
|
|
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
private:
|
|
void ProjectXY(const PerspectiveCameraFrustum& pcf,
|
|
Eigen::Matrix3Xf& landmarks) const {
|
|
float x_scale = pcf.right - pcf.left;
|
|
float y_scale = pcf.top - pcf.bottom;
|
|
float x_translation = pcf.left;
|
|
float y_translation = pcf.bottom;
|
|
|
|
if (origin_point_location_ == OriginPointLocation::TOP_LEFT_CORNER) {
|
|
landmarks.row(1) = 1.f - landmarks.row(1).array();
|
|
}
|
|
|
|
landmarks =
|
|
landmarks.array().colwise() * Eigen::Array3f(x_scale, y_scale, x_scale);
|
|
landmarks.colwise() += Eigen::Vector3f(x_translation, y_translation, 0.f);
|
|
}
|
|
|
|
absl::StatusOr<float> EstimateScale(Eigen::Matrix3Xf& landmarks) const {
|
|
Eigen::Matrix4f transform_mat;
|
|
MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
|
|
canonical_metric_landmarks_, landmarks, landmark_weights_,
|
|
transform_mat))
|
|
<< "Failed to estimate canonical-to-runtime landmark set transform!";
|
|
|
|
return transform_mat.col(0).norm();
|
|
}
|
|
|
|
static void MoveAndRescaleZ(const PerspectiveCameraFrustum& pcf,
|
|
float depth_offset, float scale,
|
|
Eigen::Matrix3Xf& landmarks) {
|
|
landmarks.row(2) =
|
|
(landmarks.array().row(2) - depth_offset + pcf.near) / scale;
|
|
}
|
|
|
|
static void UnprojectXY(const PerspectiveCameraFrustum& pcf,
|
|
Eigen::Matrix3Xf& landmarks) {
|
|
landmarks.row(0) =
|
|
landmarks.row(0).cwiseProduct(landmarks.row(2)) / pcf.near;
|
|
landmarks.row(1) =
|
|
landmarks.row(1).cwiseProduct(landmarks.row(2)) / pcf.near;
|
|
}
|
|
|
|
static void ChangeHandedness(Eigen::Matrix3Xf& landmarks) {
|
|
landmarks.row(2) *= -1.f;
|
|
}
|
|
|
|
static void ConvertLandmarkListToEigenMatrix(
|
|
const NormalizedLandmarkList& landmark_list,
|
|
Eigen::Matrix3Xf& eigen_matrix) {
|
|
eigen_matrix = Eigen::Matrix3Xf(3, landmark_list.landmark_size());
|
|
for (int i = 0; i < landmark_list.landmark_size(); ++i) {
|
|
const auto& landmark = landmark_list.landmark(i);
|
|
eigen_matrix(0, i) = landmark.x();
|
|
eigen_matrix(1, i) = landmark.y();
|
|
eigen_matrix(2, i) = landmark.z();
|
|
}
|
|
}
|
|
|
|
static void ConvertEigenMatrixToLandmarkList(
|
|
const Eigen::Matrix3Xf& eigen_matrix, LandmarkList& landmark_list) {
|
|
landmark_list.Clear();
|
|
|
|
for (int i = 0; i < eigen_matrix.cols(); ++i) {
|
|
auto& landmark = *landmark_list.add_landmark();
|
|
landmark.set_x(eigen_matrix(0, i));
|
|
landmark.set_y(eigen_matrix(1, i));
|
|
landmark.set_z(eigen_matrix(2, i));
|
|
}
|
|
}
|
|
|
|
const OriginPointLocation origin_point_location_;
|
|
const InputSource input_source_;
|
|
Eigen::Matrix3Xf canonical_metric_landmarks_;
|
|
Eigen::VectorXf landmark_weights_;
|
|
|
|
std::unique_ptr<ProcrustesSolver> procrustes_solver_;
|
|
};
|
|
|
|
class GeometryPipelineImpl : public GeometryPipeline {
|
|
public:
|
|
GeometryPipelineImpl(
|
|
const PerspectiveCamera& perspective_camera, //
|
|
const Mesh3d& canonical_mesh, //
|
|
uint32_t canonical_mesh_vertex_size, //
|
|
uint32_t canonical_mesh_num_vertices,
|
|
uint32_t canonical_mesh_vertex_position_offset,
|
|
std::unique_ptr<ScreenToMetricSpaceConverter> space_converter)
|
|
: perspective_camera_(perspective_camera),
|
|
canonical_mesh_(canonical_mesh),
|
|
canonical_mesh_vertex_size_(canonical_mesh_vertex_size),
|
|
canonical_mesh_num_vertices_(canonical_mesh_num_vertices),
|
|
canonical_mesh_vertex_position_offset_(
|
|
canonical_mesh_vertex_position_offset),
|
|
space_converter_(std::move(space_converter)) {}
|
|
|
|
absl::StatusOr<std::vector<FaceGeometry>> EstimateFaceGeometry(
|
|
const std::vector<NormalizedLandmarkList>& multi_face_landmarks,
|
|
int frame_width, int frame_height) const override {
|
|
MP_RETURN_IF_ERROR(ValidateFrameDimensions(frame_width, frame_height))
|
|
<< "Invalid frame dimensions!";
|
|
|
|
// Create a perspective camera frustum to be shared for geometry estimation
|
|
// per each face.
|
|
PerspectiveCameraFrustum pcf(perspective_camera_, frame_width,
|
|
frame_height);
|
|
|
|
std::vector<FaceGeometry> multi_face_geometry;
|
|
|
|
// From this point, the meaning of "face landmarks" is clarified further as
|
|
// "screen face landmarks". This is done do distinguish from "metric face
|
|
// landmarks" that are derived during the face geometry estimation process.
|
|
for (const NormalizedLandmarkList& screen_face_landmarks :
|
|
multi_face_landmarks) {
|
|
// Having a too compact screen landmark list will result in numerical
|
|
// instabilities, therefore such faces are filtered.
|
|
if (IsScreenLandmarkListTooCompact(screen_face_landmarks)) {
|
|
continue;
|
|
}
|
|
|
|
// Convert the screen landmarks into the metric landmarks and get the pose
|
|
// transformation matrix.
|
|
LandmarkList metric_face_landmarks;
|
|
Eigen::Matrix4f pose_transform_mat;
|
|
MP_RETURN_IF_ERROR(space_converter_->Convert(screen_face_landmarks, pcf,
|
|
metric_face_landmarks,
|
|
pose_transform_mat))
|
|
<< "Failed to convert landmarks from the screen to the metric space!";
|
|
|
|
// Pack geometry data for this face.
|
|
FaceGeometry face_geometry;
|
|
Mesh3d* mutable_mesh = face_geometry.mutable_mesh();
|
|
// Copy the canonical face mesh as the face geometry mesh.
|
|
mutable_mesh->CopyFrom(canonical_mesh_);
|
|
// Replace XYZ vertex mesh coodinates with the metric landmark positions.
|
|
for (int i = 0; i < canonical_mesh_num_vertices_; ++i) {
|
|
uint32_t vertex_buffer_offset = canonical_mesh_vertex_size_ * i +
|
|
canonical_mesh_vertex_position_offset_;
|
|
|
|
mutable_mesh->set_vertex_buffer(vertex_buffer_offset,
|
|
metric_face_landmarks.landmark(i).x());
|
|
mutable_mesh->set_vertex_buffer(vertex_buffer_offset + 1,
|
|
metric_face_landmarks.landmark(i).y());
|
|
mutable_mesh->set_vertex_buffer(vertex_buffer_offset + 2,
|
|
metric_face_landmarks.landmark(i).z());
|
|
}
|
|
// Populate the face pose transformation matrix.
|
|
mediapipe::MatrixDataProtoFromMatrix(
|
|
pose_transform_mat, face_geometry.mutable_pose_transform_matrix());
|
|
|
|
multi_face_geometry.push_back(face_geometry);
|
|
}
|
|
|
|
return multi_face_geometry;
|
|
}
|
|
|
|
private:
|
|
static bool IsScreenLandmarkListTooCompact(
|
|
const NormalizedLandmarkList& screen_landmarks) {
|
|
float mean_x = 0.f;
|
|
float mean_y = 0.f;
|
|
for (int i = 0; i < screen_landmarks.landmark_size(); ++i) {
|
|
const auto& landmark = screen_landmarks.landmark(i);
|
|
mean_x += (landmark.x() - mean_x) / static_cast<float>(i + 1);
|
|
mean_y += (landmark.y() - mean_y) / static_cast<float>(i + 1);
|
|
}
|
|
|
|
float max_sq_dist = 0.f;
|
|
for (const auto& landmark : screen_landmarks.landmark()) {
|
|
const float d_x = landmark.x() - mean_x;
|
|
const float d_y = landmark.y() - mean_y;
|
|
max_sq_dist = std::max(max_sq_dist, d_x * d_x + d_y * d_y);
|
|
}
|
|
|
|
static constexpr float kIsScreenLandmarkListTooCompactThreshold = 1e-3f;
|
|
return std::sqrt(max_sq_dist) <= kIsScreenLandmarkListTooCompactThreshold;
|
|
}
|
|
|
|
const PerspectiveCamera perspective_camera_;
|
|
const Mesh3d canonical_mesh_;
|
|
const uint32_t canonical_mesh_vertex_size_;
|
|
const uint32_t canonical_mesh_num_vertices_;
|
|
const uint32_t canonical_mesh_vertex_position_offset_;
|
|
|
|
std::unique_ptr<ScreenToMetricSpaceConverter> space_converter_;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
absl::StatusOr<std::unique_ptr<GeometryPipeline>> CreateGeometryPipeline(
|
|
const Environment& environment, const GeometryPipelineMetadata& metadata) {
|
|
MP_RETURN_IF_ERROR(ValidateEnvironment(environment))
|
|
<< "Invalid environment!";
|
|
MP_RETURN_IF_ERROR(ValidateGeometryPipelineMetadata(metadata))
|
|
<< "Invalid geometry pipeline metadata!";
|
|
|
|
const auto& canonical_mesh = metadata.canonical_mesh();
|
|
RET_CHECK(HasVertexComponent(canonical_mesh.vertex_type(),
|
|
VertexComponent::POSITION))
|
|
<< "Canonical face mesh must have the `POSITION` vertex component!";
|
|
RET_CHECK(HasVertexComponent(canonical_mesh.vertex_type(),
|
|
VertexComponent::TEX_COORD))
|
|
<< "Canonical face mesh must have the `TEX_COORD` vertex component!";
|
|
|
|
uint32_t canonical_mesh_vertex_size =
|
|
GetVertexSize(canonical_mesh.vertex_type());
|
|
uint32_t canonical_mesh_num_vertices =
|
|
canonical_mesh.vertex_buffer_size() / canonical_mesh_vertex_size;
|
|
uint32_t canonical_mesh_vertex_position_offset =
|
|
GetVertexComponentOffset(canonical_mesh.vertex_type(),
|
|
VertexComponent::POSITION)
|
|
.value();
|
|
|
|
// Put the Procrustes landmark basis into Eigen matrices for an easier access.
|
|
Eigen::Matrix3Xf canonical_metric_landmarks =
|
|
Eigen::Matrix3Xf::Zero(3, canonical_mesh_num_vertices);
|
|
Eigen::VectorXf landmark_weights =
|
|
Eigen::VectorXf::Zero(canonical_mesh_num_vertices);
|
|
|
|
for (int i = 0; i < canonical_mesh_num_vertices; ++i) {
|
|
uint32_t vertex_buffer_offset =
|
|
canonical_mesh_vertex_size * i + canonical_mesh_vertex_position_offset;
|
|
|
|
canonical_metric_landmarks(0, i) =
|
|
canonical_mesh.vertex_buffer(vertex_buffer_offset);
|
|
canonical_metric_landmarks(1, i) =
|
|
canonical_mesh.vertex_buffer(vertex_buffer_offset + 1);
|
|
canonical_metric_landmarks(2, i) =
|
|
canonical_mesh.vertex_buffer(vertex_buffer_offset + 2);
|
|
}
|
|
|
|
for (const WeightedLandmarkRef& wlr : metadata.procrustes_landmark_basis()) {
|
|
uint32_t landmark_id = wlr.landmark_id();
|
|
landmark_weights(landmark_id) = wlr.weight();
|
|
}
|
|
|
|
std::unique_ptr<GeometryPipeline> result =
|
|
absl::make_unique<GeometryPipelineImpl>(
|
|
environment.perspective_camera(), canonical_mesh,
|
|
canonical_mesh_vertex_size, canonical_mesh_num_vertices,
|
|
canonical_mesh_vertex_position_offset,
|
|
absl::make_unique<ScreenToMetricSpaceConverter>(
|
|
environment.origin_point_location(),
|
|
metadata.input_source() == InputSource::DEFAULT
|
|
? InputSource::FACE_LANDMARK_PIPELINE
|
|
: metadata.input_source(),
|
|
std::move(canonical_metric_landmarks),
|
|
std::move(landmark_weights),
|
|
CreateFloatPrecisionProcrustesSolver()));
|
|
|
|
return result;
|
|
}
|
|
|
|
} // namespace mediapipe::face_geometry
|