// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mediapipe/modules/face_geometry/libs/geometry_pipeline.h"

#include <cmath>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

#include "Eigen/Core"
#include "absl/memory/memory.h"
#include "mediapipe/framework/formats/landmark.pb.h"
#include "mediapipe/framework/formats/matrix.h"
#include "mediapipe/framework/formats/matrix_data.pb.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/port/status_macros.h"
#include "mediapipe/framework/port/statusor.h"
#include "mediapipe/modules/face_geometry/libs/mesh_3d_utils.h"
#include "mediapipe/modules/face_geometry/libs/procrustes_solver.h"
#include "mediapipe/modules/face_geometry/libs/validation_utils.h"
#include "mediapipe/modules/face_geometry/protos/environment.pb.h"
#include "mediapipe/modules/face_geometry/protos/face_geometry.pb.h"
#include "mediapipe/modules/face_geometry/protos/geometry_pipeline_metadata.pb.h"
#include "mediapipe/modules/face_geometry/protos/mesh_3d.pb.h"

namespace mediapipe::face_geometry {
namespace {

struct PerspectiveCameraFrustum {
  // NOTE: all arguments must be validated prior to calling this constructor.
  PerspectiveCameraFrustum(const PerspectiveCamera& perspective_camera,
                           int frame_width, int frame_height) {
    static constexpr float kDegreesToRadians = 3.14159265358979323846f / 180.f;

    const float height_at_near =
        2.f * perspective_camera.near() *
        std::tan(0.5f * kDegreesToRadians *
                 perspective_camera.vertical_fov_degrees());

    const float width_at_near = frame_width * height_at_near / frame_height;

    left = -0.5f * width_at_near;
    right = 0.5f * width_at_near;
    bottom = -0.5f * height_at_near;
    top = 0.5f * height_at_near;
    near = perspective_camera.near();
    far = perspective_camera.far();
  }

  float left;
  float right;
  float bottom;
  float top;
  float near;
  float far;
};

class ScreenToMetricSpaceConverter {
 public:
  ScreenToMetricSpaceConverter(
      OriginPointLocation origin_point_location,      //
      InputSource input_source,                       //
      Eigen::Matrix3Xf&& canonical_metric_landmarks,  //
      Eigen::VectorXf&& landmark_weights,             //
      std::unique_ptr<ProcrustesSolver> procrustes_solver)
      : origin_point_location_(origin_point_location),
        input_source_(input_source),
        canonical_metric_landmarks_(std::move(canonical_metric_landmarks)),
        landmark_weights_(std::move(landmark_weights)),
        procrustes_solver_(std::move(procrustes_solver)) {}

  // Converts `screen_landmark_list` into `metric_landmark_list` and estimates
  // the `pose_transform_mat`.
  //
  // Here's the algorithm summary:
  //
  // (1) Project X- and Y- screen landmark coordinates at the Z near plane.
  //
  // (2) Estimate a canonical-to-runtime landmark set scale by running the
  //     Procrustes solver using the screen runtime landmarks.
  //
  //     On this iteration, screen landmarks are used instead of unprojected
  //     metric landmarks as it is not safe to unproject due to the relative
  //     nature of the input screen landmark Z coordinate.
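  //
  //     (Throughout this file, the canonical-to-runtime "scale" refers to the
  //     uniform scale component of the transform estimated by the Procrustes
  //     solver; see `EstimateScale()` below.)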
  //
  // (3) Use the canonical-to-runtime scale from (2) to unproject the screen
  //     landmarks. The result is referenced as "intermediate landmarks"
  //     because they are the first estimation of the resulting metric
  //     landmarks, but are not quite there yet.
  //
  // (4) Estimate a canonical-to-runtime landmark set scale by running the
  //     Procrustes solver using the intermediate runtime landmarks.
  //
  // (5) Use the product of the scale factors from (2) and (4) to unproject
  //     the screen landmarks a second time. This is the second and the final
  //     estimation of the metric landmarks.
  //
  // (6) Multiply each of the metric landmarks by the inverse pose
  //     transformation matrix to align the runtime metric face landmarks with
  //     the canonical metric face landmarks.
  //
  // Note: the input screen landmarks are in the left-handed coordinate
  // system, whereas all metric landmarks - including the canonical metric
  // landmarks, the final runtime metric landmarks and any intermediate
  // runtime metric landmarks - are in the right-handed coordinate system.
  //
  // To keep the logic correct, the landmark set handedness is changed any
  // time the screen-to-metric semantic barrier is passed.
  absl::Status Convert(const NormalizedLandmarkList& screen_landmark_list,  //
                       const PerspectiveCameraFrustum& pcf,                 //
                       LandmarkList& metric_landmark_list,                  //
                       Eigen::Matrix4f& pose_transform_mat) const {
    RET_CHECK_EQ(screen_landmark_list.landmark_size(),
                 canonical_metric_landmarks_.cols())
        << "The number of landmarks doesn't match the number passed upon "
           "initialization!";

    Eigen::Matrix3Xf screen_landmarks;
    ConvertLandmarkListToEigenMatrix(screen_landmark_list, screen_landmarks);

    ProjectXY(pcf, screen_landmarks);
    const float depth_offset = screen_landmarks.row(2).mean();

    // 1st iteration: don't unproject XY because it's unsafe to do so due to
    //                the relative nature of the Z coordinate. Instead, run
    //                the first estimation on the projected XY and use that
    //                scale to unproject for the 2nd iteration.
    Eigen::Matrix3Xf intermediate_landmarks(screen_landmarks);
    ChangeHandedness(intermediate_landmarks);

    ASSIGN_OR_RETURN(const float first_iteration_scale,
                     EstimateScale(intermediate_landmarks),
                     _ << "Failed to estimate first iteration scale!");

    // 2nd iteration: unproject XY using the scale from the 1st iteration.
    intermediate_landmarks = screen_landmarks;
    MoveAndRescaleZ(pcf, depth_offset, first_iteration_scale,
                    intermediate_landmarks);
    UnprojectXY(pcf, intermediate_landmarks);
    ChangeHandedness(intermediate_landmarks);

    // For face detection input landmarks, re-write Z-coord from the canonical
    // landmarks.
    if (input_source_ == InputSource::FACE_DETECTION_PIPELINE) {
      Eigen::Matrix4f intermediate_pose_transform_mat;
      MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
          canonical_metric_landmarks_, intermediate_landmarks,
          landmark_weights_, intermediate_pose_transform_mat))
          << "Failed to estimate pose transform matrix!";

      intermediate_landmarks.row(2) =
          (intermediate_pose_transform_mat *
           canonical_metric_landmarks_.colwise().homogeneous())
              .row(2);
    }

    ASSIGN_OR_RETURN(const float second_iteration_scale,
                     EstimateScale(intermediate_landmarks),
                     _ << "Failed to estimate second iteration scale!");

    // Use the total scale to unproject the screen landmarks.
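    // (The two scale factors multiply because the second estimation is run on
    // landmarks that were already rescaled by the first iteration scale, so it
    // acts as a refinement on top of it.)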
    const float total_scale = first_iteration_scale * second_iteration_scale;

    MoveAndRescaleZ(pcf, depth_offset, total_scale, screen_landmarks);
    UnprojectXY(pcf, screen_landmarks);
    ChangeHandedness(screen_landmarks);

    // At this point, screen landmarks are converted into metric landmarks.
    Eigen::Matrix3Xf& metric_landmarks = screen_landmarks;

    MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
        canonical_metric_landmarks_, metric_landmarks, landmark_weights_,
        pose_transform_mat))
        << "Failed to estimate pose transform matrix!";

    // For face detection input landmarks, re-write Z-coord from the canonical
    // landmarks and run the pose transform estimation again.
    if (input_source_ == InputSource::FACE_DETECTION_PIPELINE) {
      metric_landmarks.row(2) =
          (pose_transform_mat *
           canonical_metric_landmarks_.colwise().homogeneous())
              .row(2);

      MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
          canonical_metric_landmarks_, metric_landmarks, landmark_weights_,
          pose_transform_mat))
          << "Failed to estimate pose transform matrix!";
    }

    // Multiply each of the metric landmarks by the inverse pose
    // transformation matrix to align the runtime metric face landmarks with
    // the canonical metric face landmarks.
    metric_landmarks =
        (pose_transform_mat.inverse() *
         metric_landmarks.colwise().homogeneous())
            .topRows(3);

    ConvertEigenMatrixToLandmarkList(metric_landmarks, metric_landmark_list);

    return absl::OkStatus();
  }

 private:
  void ProjectXY(const PerspectiveCameraFrustum& pcf,
                 Eigen::Matrix3Xf& landmarks) const {
    float x_scale = pcf.right - pcf.left;
    float y_scale = pcf.top - pcf.bottom;
    float x_translation = pcf.left;
    float y_translation = pcf.bottom;

    if (origin_point_location_ == OriginPointLocation::TOP_LEFT_CORNER) {
      landmarks.row(1) = 1.f - landmarks.row(1).array();
    }

    landmarks = landmarks.array().colwise() *
                Eigen::Array3f(x_scale, y_scale, x_scale);
    landmarks.colwise() += Eigen::Vector3f(x_translation, y_translation, 0.f);
  }

  absl::StatusOr<float> EstimateScale(Eigen::Matrix3Xf& landmarks) const {
    Eigen::Matrix4f transform_mat;
    MP_RETURN_IF_ERROR(procrustes_solver_->SolveWeightedOrthogonalProblem(
        canonical_metric_landmarks_, landmarks, landmark_weights_,
        transform_mat))
        << "Failed to estimate canonical-to-runtime landmark set transform!";

    return transform_mat.col(0).norm();
  }

  static void MoveAndRescaleZ(const PerspectiveCameraFrustum& pcf,
                              float depth_offset, float scale,
                              Eigen::Matrix3Xf& landmarks) {
    landmarks.row(2) =
        (landmarks.array().row(2) - depth_offset + pcf.near) / scale;
  }

  static void UnprojectXY(const PerspectiveCameraFrustum& pcf,
                          Eigen::Matrix3Xf& landmarks) {
    landmarks.row(0) =
        landmarks.row(0).cwiseProduct(landmarks.row(2)) / pcf.near;
    landmarks.row(1) =
        landmarks.row(1).cwiseProduct(landmarks.row(2)) / pcf.near;
  }

  static void ChangeHandedness(Eigen::Matrix3Xf& landmarks) {
    landmarks.row(2) *= -1.f;
  }

  static void ConvertLandmarkListToEigenMatrix(
      const NormalizedLandmarkList& landmark_list,
      Eigen::Matrix3Xf& eigen_matrix) {
    eigen_matrix = Eigen::Matrix3Xf(3, landmark_list.landmark_size());
    for (int i = 0; i < landmark_list.landmark_size(); ++i) {
      const auto& landmark = landmark_list.landmark(i);
      eigen_matrix(0, i) = landmark.x();
      eigen_matrix(1, i) = landmark.y();
      eigen_matrix(2, i) = landmark.z();
    }
  }
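  // Copies a 3xN landmark matrix back into a `LandmarkList` proto: column `i`
  // becomes landmark `i`, with rows 0..2 mapped to (x, y, z).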
  static void ConvertEigenMatrixToLandmarkList(
      const Eigen::Matrix3Xf& eigen_matrix, LandmarkList& landmark_list) {
    landmark_list.Clear();

    for (int i = 0; i < eigen_matrix.cols(); ++i) {
      auto& landmark = *landmark_list.add_landmark();
      landmark.set_x(eigen_matrix(0, i));
      landmark.set_y(eigen_matrix(1, i));
      landmark.set_z(eigen_matrix(2, i));
    }
  }

  const OriginPointLocation origin_point_location_;
  const InputSource input_source_;

  Eigen::Matrix3Xf canonical_metric_landmarks_;
  Eigen::VectorXf landmark_weights_;

  std::unique_ptr<ProcrustesSolver> procrustes_solver_;
};

class GeometryPipelineImpl : public GeometryPipeline {
 public:
  GeometryPipelineImpl(
      const PerspectiveCamera& perspective_camera,  //
      const Mesh3d& canonical_mesh,                 //
      uint32_t canonical_mesh_vertex_size,          //
      uint32_t canonical_mesh_num_vertices,
      uint32_t canonical_mesh_vertex_position_offset,
      std::unique_ptr<ScreenToMetricSpaceConverter> space_converter)
      : perspective_camera_(perspective_camera),
        canonical_mesh_(canonical_mesh),
        canonical_mesh_vertex_size_(canonical_mesh_vertex_size),
        canonical_mesh_num_vertices_(canonical_mesh_num_vertices),
        canonical_mesh_vertex_position_offset_(
            canonical_mesh_vertex_position_offset),
        space_converter_(std::move(space_converter)) {}

  absl::StatusOr<std::vector<FaceGeometry>> EstimateFaceGeometry(
      const std::vector<NormalizedLandmarkList>& multi_face_landmarks,
      int frame_width, int frame_height) const override {
    MP_RETURN_IF_ERROR(ValidateFrameDimensions(frame_width, frame_height))
        << "Invalid frame dimensions!";

    // Create a perspective camera frustum to be shared by the geometry
    // estimation for each face.
    PerspectiveCameraFrustum pcf(perspective_camera_, frame_width,
                                 frame_height);

    std::vector<FaceGeometry> multi_face_geometry;

    // From this point, the meaning of "face landmarks" is clarified further as
    // "screen face landmarks". This is done to distinguish them from "metric
    // face landmarks" that are derived during the face geometry estimation
    // process.
    for (const NormalizedLandmarkList& screen_face_landmarks :
         multi_face_landmarks) {
      // An overly compact screen landmark list results in numerical
      // instabilities, therefore such faces are filtered out.
      if (IsScreenLandmarkListTooCompact(screen_face_landmarks)) {
        continue;
      }

      // Convert the screen landmarks into the metric landmarks and get the
      // pose transformation matrix.
      LandmarkList metric_face_landmarks;
      Eigen::Matrix4f pose_transform_mat;
      MP_RETURN_IF_ERROR(
          space_converter_->Convert(screen_face_landmarks, pcf,
                                    metric_face_landmarks, pose_transform_mat))
          << "Failed to convert landmarks from the screen to the metric "
             "space!";

      // Pack geometry data for this face.
      FaceGeometry face_geometry;
      Mesh3d* mutable_mesh = face_geometry.mutable_mesh();
      // Copy the canonical face mesh as the face geometry mesh.
      mutable_mesh->CopyFrom(canonical_mesh_);
      // Replace the XYZ vertex mesh coordinates with the metric landmark
      // positions.
      for (int i = 0; i < canonical_mesh_num_vertices_; ++i) {
        uint32_t vertex_buffer_offset =
            canonical_mesh_vertex_size_ * i +
            canonical_mesh_vertex_position_offset_;

        mutable_mesh->set_vertex_buffer(vertex_buffer_offset,
                                        metric_face_landmarks.landmark(i).x());
        mutable_mesh->set_vertex_buffer(vertex_buffer_offset + 1,
                                        metric_face_landmarks.landmark(i).y());
        mutable_mesh->set_vertex_buffer(vertex_buffer_offset + 2,
                                        metric_face_landmarks.landmark(i).z());
      }

      // Populate the face pose transformation matrix.
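      // (This matrix maps the canonical metric face landmarks onto the
      // runtime metric face landmarks; see
      // `ScreenToMetricSpaceConverter::Convert()`.)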
      mediapipe::MatrixDataProtoFromMatrix(
          pose_transform_mat, face_geometry.mutable_pose_transform_matrix());

      multi_face_geometry.push_back(face_geometry);
    }

    return multi_face_geometry;
  }

 private:
  static bool IsScreenLandmarkListTooCompact(
      const NormalizedLandmarkList& screen_landmarks) {
    float mean_x = 0.f;
    float mean_y = 0.f;
    for (int i = 0; i < screen_landmarks.landmark_size(); ++i) {
      const auto& landmark = screen_landmarks.landmark(i);
      mean_x += (landmark.x() - mean_x) / static_cast<float>(i + 1);
      mean_y += (landmark.y() - mean_y) / static_cast<float>(i + 1);
    }

    float max_sq_dist = 0.f;
    for (const auto& landmark : screen_landmarks.landmark()) {
      const float d_x = landmark.x() - mean_x;
      const float d_y = landmark.y() - mean_y;
      max_sq_dist = std::max(max_sq_dist, d_x * d_x + d_y * d_y);
    }

    static constexpr float kIsScreenLandmarkListTooCompactThreshold = 1e-3f;
    return std::sqrt(max_sq_dist) <= kIsScreenLandmarkListTooCompactThreshold;
  }

  const PerspectiveCamera perspective_camera_;
  const Mesh3d canonical_mesh_;
  const uint32_t canonical_mesh_vertex_size_;
  const uint32_t canonical_mesh_num_vertices_;
  const uint32_t canonical_mesh_vertex_position_offset_;

  std::unique_ptr<ScreenToMetricSpaceConverter> space_converter_;
};

}  // namespace

absl::StatusOr<std::unique_ptr<GeometryPipeline>> CreateGeometryPipeline(
    const Environment& environment, const GeometryPipelineMetadata& metadata) {
  MP_RETURN_IF_ERROR(ValidateEnvironment(environment))
      << "Invalid environment!";
  MP_RETURN_IF_ERROR(ValidateGeometryPipelineMetadata(metadata))
      << "Invalid geometry pipeline metadata!";

  const auto& canonical_mesh = metadata.canonical_mesh();
  RET_CHECK(HasVertexComponent(canonical_mesh.vertex_type(),
                               VertexComponent::POSITION))
      << "Canonical face mesh must have the `POSITION` vertex component!";
  RET_CHECK(HasVertexComponent(canonical_mesh.vertex_type(),
                               VertexComponent::TEX_COORD))
      << "Canonical face mesh must have the `TEX_COORD` vertex component!";

  uint32_t canonical_mesh_vertex_size =
      GetVertexSize(canonical_mesh.vertex_type());
  uint32_t canonical_mesh_num_vertices =
      canonical_mesh.vertex_buffer_size() / canonical_mesh_vertex_size;
  uint32_t canonical_mesh_vertex_position_offset =
      GetVertexComponentOffset(canonical_mesh.vertex_type(),
                               VertexComponent::POSITION)
          .value();

  // Put the Procrustes landmark basis into Eigen matrices for easier access.
  Eigen::Matrix3Xf canonical_metric_landmarks =
      Eigen::Matrix3Xf::Zero(3, canonical_mesh_num_vertices);
  Eigen::VectorXf landmark_weights =
      Eigen::VectorXf::Zero(canonical_mesh_num_vertices);

  for (int i = 0; i < canonical_mesh_num_vertices; ++i) {
    uint32_t vertex_buffer_offset =
        canonical_mesh_vertex_size * i + canonical_mesh_vertex_position_offset;

    canonical_metric_landmarks(0, i) =
        canonical_mesh.vertex_buffer(vertex_buffer_offset);
    canonical_metric_landmarks(1, i) =
        canonical_mesh.vertex_buffer(vertex_buffer_offset + 1);
    canonical_metric_landmarks(2, i) =
        canonical_mesh.vertex_buffer(vertex_buffer_offset + 2);
  }

  for (const WeightedLandmarkRef& wlr : metadata.procrustes_landmark_basis()) {
    uint32_t landmark_id = wlr.landmark_id();
    landmark_weights(landmark_id) = wlr.weight();
  }
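  // Note: an unspecified (`DEFAULT`) input source is treated as
  // `FACE_LANDMARK_PIPELINE`.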
  std::unique_ptr<GeometryPipeline> result =
      absl::make_unique<GeometryPipelineImpl>(
          environment.perspective_camera(), canonical_mesh,
          canonical_mesh_vertex_size, canonical_mesh_num_vertices,
          canonical_mesh_vertex_position_offset,
          absl::make_unique<ScreenToMetricSpaceConverter>(
              environment.origin_point_location(),
              metadata.input_source() == InputSource::DEFAULT
                  ? InputSource::FACE_LANDMARK_PIPELINE
                  : metadata.input_source(),
              std::move(canonical_metric_landmarks),
              std::move(landmark_weights),
              CreateFloatPrecisionProcrustesSolver()));

  return result;
}

}  // namespace mediapipe::face_geometry