Add attention model

Pavlo-Ivan Mykhalevych 2021-12-13 15:10:03 +02:00
parent 1cf04343bc
commit d861abde7c
6 changed files with 118 additions and 33 deletions

View File

@@ -27,9 +27,12 @@ int main(int argc, char **argv) {
"mediapipe/modules/face_detection/face_detection_short_range.tflite";
constexpr char face_landmark_model_path[] =
"mediapipe/modules/face_landmark/face_landmark.tflite";
constexpr char face_landmark_with_attention_model_path[] =
"mediapipe/modules/face_landmark/face_landmark_with_attention.tflite";
constexpr bool with_attention = true;
MPFaceMeshDetector *faceMeshDetector = MPFaceMeshDetectorConstruct(
maxNumFaces, face_detection_model_path, face_landmark_model_path);
maxNumFaces, face_detection_model_path, face_landmark_model_path, with_attention, face_landmark_with_attention_model_path);
// Allocate memory for face landmarks.
auto multiFaceLandmarks = new cv::Point2f *[maxNumFaces];
@@ -73,6 +76,10 @@ int main(int argc, char **argv) {
auto &face_landmarks = multiFaceLandmarks[0];
auto &landmark = face_landmarks[0];
for (auto i = 0; i < 478; ++i) {
cv::circle(camera_frame_raw, face_landmarks[i], 1.2, cv::Scalar(0, 0, 255));
}
LOG(INFO) << "First landmark: x - " << landmark.x << ", y - "
<< landmark.y;
}
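The drawing loop above hardcodes 478, which only matches the attention model. Since the detector exposes its landmark count as MPFaceMeshDetector::kLandmarksNum (see the header change below), a caller that toggles with_attention could size and iterate its buffers from that value instead. A minimal sketch of that pattern; the demo's actual allocation code sits outside this hunk, so this is illustrative only:

```cpp
// Size per-face landmark buffers from the detector's landmark count rather
// than a hardcoded 478. kLandmarksNum must be read after the detector is
// constructed, since the constructor raises it to 478 for attention models.
for (int i = 0; i < maxNumFaces; ++i) {
  multiFaceLandmarks[i] = new cv::Point2f[MPFaceMeshDetector::kLandmarksNum];
}
```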

View File

@@ -1,20 +1,33 @@
#include "face_mesh_lib.h"
int MPFaceMeshDetector::kLandmarksNum = 468;
MPFaceMeshDetector::MPFaceMeshDetector(int numFaces,
const char *face_detection_model_path,
const char *face_landmark_model_path) {
const auto status = InitFaceMeshDetector(numFaces, face_detection_model_path,
face_landmark_model_path);
const char *face_landmark_model_path,
bool with_attention,
const char *face_landmark_with_attention_model_path) {
const auto status = InitFaceMeshDetector(
numFaces,
face_detection_model_path,
face_landmark_model_path,
with_attention,
face_landmark_with_attention_model_path);
if (!status.ok()) {
LOG(INFO) << "Failed constructing FaceMeshDetector.";
LOG(INFO) << status.message();
}
// kLandmarksNum is a class-wide static; set it unconditionally so that a
// detector constructed without attention resets the count correctly even
// after an attention-enabled one.
kLandmarksNum = with_attention ? kLandmarksNumWithAttention
: kLandmarksNumWithoutAttention;
}
absl::Status
MPFaceMeshDetector::InitFaceMeshDetector(int numFaces,
const char *face_detection_model_path,
const char *face_landmark_model_path) {
const char *face_landmark_model_path,
bool with_attention,
const char *face_landmark_with_attention_model_path) {
numFaces = std::max(numFaces, 1);
if (face_detection_model_path == nullptr) {
@@ -22,6 +35,10 @@ MPFaceMeshDetector::InitFaceMeshDetector(int numFaces,
"mediapipe/modules/face_detection/face_detection_short_range.tflite";
}
if (with_attention) {
face_landmark_model_path = face_landmark_with_attention_model_path;
}
if (face_landmark_model_path == nullptr) {
face_landmark_model_path =
"mediapipe/modules/face_landmark/face_landmark.tflite";
@@ -30,6 +47,8 @@ MPFaceMeshDetector::InitFaceMeshDetector(int numFaces,
// Prepare graph config.
auto preparedGraphConfig = absl::StrReplaceAll(
graphConfig, {{"$numFaces", std::to_string(numFaces)}});
preparedGraphConfig = absl::StrReplaceAll(
preparedGraphConfig,
{{"$with_attention", with_attention ? "true" : "false"}});
preparedGraphConfig = absl::StrReplaceAll(
preparedGraphConfig,
{{"$faceDetectionModelPath", face_detection_model_path}});
@@ -268,10 +287,13 @@ void MPFaceMeshDetector::DetectLandmarks(cv::Point3f **multi_face_landmarks,
extern "C" {
DLLEXPORT MPFaceMeshDetector *
MPFaceMeshDetectorConstruct(int numFaces, const char *face_detection_model_path,
const char *face_landmark_model_path) {
MPFaceMeshDetectorConstruct(int numFaces,
const char* face_detection_model_path,
const char* face_landmark_model_path,
bool with_attention,
const char* face_landmark_model_with_attention_path) {
return new MPFaceMeshDetector(numFaces, face_detection_model_path,
face_landmark_model_path);
face_landmark_model_path, with_attention, face_landmark_model_with_attention_path);
}
DLLEXPORT void MPFaceMeshDetectorDestruct(MPFaceMeshDetector *detector) {
@@ -331,10 +353,12 @@ node {
# Defines side packets for further use in the graph.
node {
calculator: "ConstantSidePacketCalculator"
output_side_packet: "PACKET:num_faces"
output_side_packet: "PACKET:0:num_faces"
output_side_packet: "PACKET:1:with_attention"
node_options: {
[type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
packet { int_value: $numFaces }
packet { bool_value: $with_attention }
}
}
}
@@ -374,6 +398,7 @@ node {
input_side_packet: "MODEL_BLOB:face_detection_model_blob"
output_side_packet: "MODEL:face_detection_model"
}
node {
calculator: "TfLiteModelCalculator"
input_side_packet: "MODEL_BLOB:face_landmark_model_blob"
@@ -388,6 +413,7 @@ node {
input_side_packet: "NUM_FACES:num_faces"
input_side_packet: "MODEL:0:face_detection_model"
input_side_packet: "MODEL:1:face_landmark_model"
input_side_packet: "WITH_ATTENTION:with_attention"
output_stream: "LANDMARKS:multi_face_landmarks"
output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
output_stream: "DETECTIONS:face_detections"

View File

@@ -31,8 +31,11 @@
class MPFaceMeshDetector {
public:
MPFaceMeshDetector(int numFaces, const char *face_detection_model_path,
const char *face_landmark_model_path);
MPFaceMeshDetector(int numFaces,
const char *face_detection_model_path,
const char *face_landmark_model_path,
bool with_attention,
const char* face_landmark_model_with_attention_path);
void DetectFaces(const cv::Mat &camera_frame,
cv::Rect *multi_face_bounding_boxes, int *numFaces);
@@ -40,12 +43,16 @@ public:
void DetectLandmarks(cv::Point2f **multi_face_landmarks, int *numFaces);
void DetectLandmarks(cv::Point3f **multi_face_landmarks, int *numFaces);
static constexpr auto kLandmarksNum = 468;
static constexpr auto kLandmarksNumWithoutAttention = 468;
static constexpr auto kLandmarksNumWithAttention = 478;
static int kLandmarksNum;
private:
absl::Status InitFaceMeshDetector(int numFaces,
const char *face_detection_model_path,
const char *face_landmark_model_path);
const char *face_landmark_model_path,
bool with_attention,
const char* face_landmark_model_with_attention_path);
absl::Status DetectFacesWithStatus(const cv::Mat &camera_frame,
cv::Rect *multi_face_bounding_boxes,
int *numFaces);
@@ -79,8 +86,12 @@ extern "C" {
#endif
DLLEXPORT MPFaceMeshDetector *
MPFaceMeshDetectorConstruct(int numFaces, const char *face_detection_model_path,
const char *face_landmark_model_path);
MPFaceMeshDetectorConstruct(int numFaces,
const char *face_detection_model_path,
const char *face_landmark_model_path,
bool with_attention = true,
const char* face_landmark_model_with_attention_path =
"mediapipe/modules/face_landmark/face_landmark_with_attention.tflite");
DLLEXPORT void MPFaceMeshDetectorDestruct(MPFaceMeshDetector *detector);
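Worth noting: the default arguments above are a C++-only convenience; anything calling through a C ABI (e.g. P/Invoke bindings) must pass all five arguments explicitly. A minimal construction/teardown sketch against this header, with the model paths taken from the defaults above:

```cpp
#include "face_mesh_lib.h"

int main() {
  // All five arguments spelled out, as a C caller would have to do.
  MPFaceMeshDetector *detector = MPFaceMeshDetectorConstruct(
      /*numFaces=*/1,
      "mediapipe/modules/face_detection/face_detection_short_range.tflite",
      "mediapipe/modules/face_landmark/face_landmark.tflite",
      /*with_attention=*/true,
      "mediapipe/modules/face_landmark/face_landmark_with_attention.tflite");
  // ... feed frames and read landmarks here ...
  MPFaceMeshDetectorDestruct(detector);
}
```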

View File

@@ -47,14 +47,18 @@ mediapipe_simple_subgraph(
graph = "face_landmark_side_model_cpu.pbtxt",
register_as = "FaceLandmarkSideModelCpu",
deps = [
":tensors_to_face_landmarks",
":tensors_to_face_landmarks_with_attention",
"//mediapipe/calculators/core:gate_calculator",
"//mediapipe/calculators/core:split_vector_calculator",
"//mediapipe/calculators/tensor:image_to_tensor_calculator",
"//mediapipe/calculators/tensor:inference_calculator",
"//mediapipe/calculators/tensor:tensors_to_floats_calculator",
"//mediapipe/calculators/tensor:tensors_to_landmarks_calculator",
"//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator",
"//mediapipe/calculators/util:landmark_projection_calculator",
"//mediapipe/calculators/util:thresholding_calculator",
"//mediapipe/framework/tool:switch_container",
],
)

View File

@@ -31,8 +31,12 @@ input_side_packet: "MODEL:0:face_detection_model"
# NOTE: only the mediapipe/modules/face_landmark/face_landmark.tflite or
# face_landmark_with_attention.tflite models can be passed here; otherwise,
# results are undefined.
input_side_packet: "MODEL:1:face_landmark_model"
# Collection of detected/predicted faces, each represented as a list of 468 face
# Whether to run the face mesh model with attention on lips and eyes. (bool)
# Attention improves landmark accuracy in the lips and eye regions and adds
# iris landmarks.
input_side_packet: "WITH_ATTENTION:with_attention"
# Collection of detected/predicted faces, each represented as a list of face
# landmarks: 478 landmarks per face if with_attention is true, 468 if false.
# (std::vector<NormalizedLandmarkList>)
# NOTE: there will not be an output packet in the LANDMARKS stream for this
# particular timestamp if no faces are detected. However, the MediaPipe
@@ -207,6 +211,7 @@ node {
input_stream: "IMAGE:landmarks_loop_image"
input_stream: "ROI:face_rect"
input_side_packet: "MODEL:face_landmark_model"
input_side_packet: "WITH_ATTENTION:with_attention"
output_stream: "LANDMARKS:face_landmarks"
}
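The ten extra points in the attention output (478 vs. 468) are the iris landmarks; to the best of my knowledge they are appended after the 468 mesh points, so mesh indices are unchanged. A small sketch of that layout assumption:

```cpp
// Assumption (not stated in this commit): the attention model appends the
// 10 iris landmarks (5 per eye) after the 468 mesh landmarks, i.e. at
// indices 468..477.
constexpr int kMeshLandmarks = 468;
constexpr int kLandmarksWithAttention = 478;

inline bool IsIrisLandmark(int index) {
  return index >= kMeshLandmarks && index < kLandmarksWithAttention;
}
```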

View File

@@ -29,7 +29,10 @@ input_stream: "ROI:roi"
# only can be passed here; otherwise, results are undefined.
input_side_packet: "MODEL:face_landmark_model"
# Whether to run the face mesh model with attention on lips and eyes. (bool)
# Attention improves landmark accuracy in the lips and eye regions and adds
# iris landmarks.
input_side_packet: "WITH_ATTENTION:with_attention"
# 468 face landmarks (478 with attention) within the given ROI.
# (NormalizedLandmarkList)
# NOTE: if a face is not present within the given ROI, for this particular
# timestamp there will not be an output packet in the LANDMARKS stream. However,
@@ -55,33 +58,58 @@ node: {
}
}
# Generates a single side packet containing a TensorFlow Lite op resolver that
# supports custom ops needed by the model used in this graph.
node {
calculator: "TfLiteCustomOpResolverCalculator"
output_side_packet: "op_resolver"
}
# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a
# vector of tensors representing, for instance, detection boxes/keypoints and
# scores.
node {
calculator: "InferenceCalculator"
input_stream: "TENSORS:input_tensors"
output_stream: "TENSORS:output_tensors"
input_side_packet: "MODEL:face_landmark_model"
options {
input_side_packet: "CUSTOM_OP_RESOLVER:op_resolver"
output_stream: "TENSORS:output_tensors"
options: {
[mediapipe.InferenceCalculatorOptions.ext] {
delegate { tflite {} }
}
}
}
# Splits a vector of tensors into multiple vectors.
# Splits a vector of tensors into landmark tensors and face flag tensor.
node {
calculator: "SplitTensorVectorCalculator"
calculator: "SwitchContainer"
input_side_packet: "ENABLE:with_attention"
input_stream: "output_tensors"
output_stream: "landmark_tensors"
output_stream: "face_flag_tensor"
options: {
[mediapipe.SwitchContainerOptions.ext] {
contained_node: {
calculator: "SplitTensorVectorCalculator"
options: {
[mediapipe.SplitVectorCalculatorOptions.ext] {
ranges: { begin: 0 end: 1 }
ranges: { begin: 1 end: 2 }
}
}
}
contained_node: {
calculator: "SplitTensorVectorCalculator"
options: {
[mediapipe.SplitVectorCalculatorOptions.ext] {
ranges: { begin: 0 end: 6 }
ranges: { begin: 6 end: 7 }
}
}
}
}
}
}
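The two contained nodes encode how many tensors each model emits: the base model outputs two (one landmark tensor plus the face flag), while the attention model outputs seven (six landmark tensors plus the face flag); SwitchContainer picks the matching split according to the with_attention side packet. A C++ mirror of that selection, for illustration only:

```cpp
#include <utility>

// Each range is a half-open [begin, end) slice of the model's output tensor
// vector, matching the SplitVectorCalculator options above.
struct TensorSplit {
  std::pair<int, int> landmarks;  // landmark tensor(s)
  std::pair<int, int> face_flag;  // face presence flag tensor
};

TensorSplit SelectSplit(bool with_attention) {
  // Base model: 2 output tensors. Attention model: 7 output tensors.
  return with_attention ? TensorSplit{{0, 6}, {6, 7}}
                        : TensorSplit{{0, 1}, {1, 2}};
}
```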
# Converts the face-flag tensor into a float that represents the confidence
@@ -121,14 +149,18 @@ node {
# Decodes the landmark tensors into a vector of landmarks, where the landmark
# coordinates are normalized by the size of the input image to the model.
node {
calculator: "TensorsToLandmarksCalculator"
calculator: "SwitchContainer"
input_side_packet: "ENABLE:with_attention"
input_stream: "TENSORS:ensured_landmark_tensors"
output_stream: "NORM_LANDMARKS:landmarks"
output_stream: "LANDMARKS:landmarks"
options: {
[mediapipe.TensorsToLandmarksCalculatorOptions.ext] {
num_landmarks: 468
input_image_width: 192
input_image_height: 192
[mediapipe.SwitchContainerOptions.ext] {
contained_node: {
calculator: "TensorsToFaceLandmarks"
}
contained_node: {
calculator: "TensorsToFaceLandmarksWithAttention"
}
}
}
}
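Before this change the single TensorsToLandmarksCalculator carried its decode options inline (num_landmarks: 468, input size 192x192); with the SwitchContainer those now live inside the TensorsToFaceLandmarks and TensorsToFaceLandmarksWithAttention subgraphs. A sketch of the normalization those removed options described; the z scaling detail is an assumption, not taken from the commit:

```cpp
// Raw tensor coordinates are divided by the model input size (192x192 per
// the options removed above) to yield normalized values in [0, 1].
struct NormalizedLandmark {
  float x, y, z;
};

NormalizedLandmark Normalize(float raw_x, float raw_y, float raw_z) {
  constexpr float kInputWidth = 192.0f;
  constexpr float kInputHeight = 192.0f;
  // Assumption: z is scaled by the input width, since depth has no separate
  // image dimension to normalize against.
  return {raw_x / kInputWidth, raw_y / kInputHeight, raw_z / kInputWidth};
}
```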