From d861abde7cf4656895d5950d2dbe40c7480c08c5 Mon Sep 17 00:00:00 2001
From: Pavlo-Ivan Mykhalevych <pmykhalevych@tsukat.com>
Date: Mon, 13 Dec 2021 15:10:03 +0200
Subject: [PATCH] Add attention model

---
 .../desktop/face_mesh_dll/face_mesh_cpu.cpp   |  9 ++-
 .../desktop/face_mesh_dll/face_mesh_lib.cpp   | 42 ++++++++++---
 .../desktop/face_mesh_dll/face_mesh_lib.h     | 27 ++++++---
 mediapipe/modules/face_landmark/BUILD         |  4 ++
 ...ont_side_model_cpu_with_face_counter.pbtxt |  9 ++-
 .../face_landmark_side_model_cpu.pbtxt        | 60 ++++++++++++++-----
 6 files changed, 118 insertions(+), 33 deletions(-)

diff --git a/mediapipe/examples/desktop/face_mesh_dll/face_mesh_cpu.cpp b/mediapipe/examples/desktop/face_mesh_dll/face_mesh_cpu.cpp
index 83762a1a1..a982c133d 100644
--- a/mediapipe/examples/desktop/face_mesh_dll/face_mesh_cpu.cpp
+++ b/mediapipe/examples/desktop/face_mesh_dll/face_mesh_cpu.cpp
@@ -27,9 +27,12 @@ int main(int argc, char **argv) {
       "mediapipe/modules/face_detection/face_detection_short_range.tflite";
   constexpr char face_landmark_model_path[] =
       "mediapipe/modules/face_landmark/face_landmark.tflite";
+  constexpr char face_landmark_with_attention_model_path[] =
+      "mediapipe/modules/face_landmark/face_landmark_with_attention.tflite";
+  constexpr bool with_attention = true;
 
   MPFaceMeshDetector *faceMeshDetector = MPFaceMeshDetectorConstruct(
-      maxNumFaces, face_detection_model_path, face_landmark_model_path);
+      maxNumFaces, face_detection_model_path, face_landmark_model_path, with_attention, face_landmark_with_attention_model_path);
 
   // Allocate memory for face landmarks.
   auto multiFaceLandmarks = new cv::Point2f *[maxNumFaces];
@@ -73,6 +76,10 @@ int main(int argc, char **argv) {
       auto &face_landmarks = multiFaceLandmarks[0];
       auto &landmark = face_landmarks[0];
 
+      // Draw every landmark of the first face (attention model: 478 points).
+      for (auto i = 0; i < MPFaceMeshDetector::kLandmarksNumWithAttention; ++i) {
+        cv::circle(camera_frame_raw, face_landmarks[i], 1, cv::Scalar(0, 0, 255));
+      }
       LOG(INFO) << "First landmark: x - " << landmark.x << ", y - "
                 << landmark.y;
     }
diff --git a/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.cpp b/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.cpp
index b3082e58c..0fe85a49e 100644
--- a/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.cpp
+++ b/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.cpp
@@ -1,20 +1,33 @@
 #include "face_mesh_lib.h"
 
+int MPFaceMeshDetector::kLandmarksNum = 468;
+
 MPFaceMeshDetector::MPFaceMeshDetector(int numFaces,
                                        const char *face_detection_model_path,
-                                       const char *face_landmark_model_path) {
-  const auto status = InitFaceMeshDetector(numFaces, face_detection_model_path,
-                                           face_landmark_model_path);
+                                       const char *face_landmark_model_path,
+                                       bool with_attention,
+                                       const char *face_landmark_with_attention_model_path) {
+  const auto status = InitFaceMeshDetector(
+      numFaces,
+      face_detection_model_path,
+      face_landmark_model_path,
+      with_attention,
+      face_landmark_with_attention_model_path);
   if (!status.ok()) {
     LOG(INFO) << "Failed constructing FaceMeshDetector.";
     LOG(INFO) << status.message();
   }
+  // kLandmarksNum is static: set both ways so a prior detector's value never lingers.
+  kLandmarksNum = with_attention ? static_cast<int>(kLandmarksNumWithAttention)
+                                 : static_cast<int>(kLandmarksNumWithoutAttention);
 }
 
 absl::Status
 MPFaceMeshDetector::InitFaceMeshDetector(int numFaces,
                                          const char *face_detection_model_path,
-                                         const char *face_landmark_model_path) {
+                                         const char *face_landmark_model_path,
+                                         bool with_attention,
+                                         const char *face_landmark_with_attention_model_path) {
   numFaces = std::max(numFaces, 1);
 
   if (face_detection_model_path == nullptr) {
@@ -22,6 +35,10 @@ MPFaceMeshDetector::InitFaceMeshDetector(int numFaces,
         "mediapipe/modules/face_detection/face_detection_short_range.tflite";
   }
 
+  if (with_attention)  // Attention graph needs the attention model; default if null.
+    face_landmark_model_path = face_landmark_with_attention_model_path
+        ? face_landmark_with_attention_model_path
+        : "mediapipe/modules/face_landmark/face_landmark_with_attention.tflite";
   if (face_landmark_model_path == nullptr) {
     face_landmark_model_path =
         "mediapipe/modules/face_landmark/face_landmark.tflite";
@@ -30,6 +47,8 @@ MPFaceMeshDetector::InitFaceMeshDetector(int numFaces,
   // Prepare graph config.
   auto preparedGraphConfig = absl::StrReplaceAll(
       graphConfig, {{"$numFaces", std::to_string(numFaces)}});
+  preparedGraphConfig = absl::StrReplaceAll(
+      preparedGraphConfig, {{"$with_attention", with_attention ? "true" : "false"}});
   preparedGraphConfig = absl::StrReplaceAll(
       preparedGraphConfig,
       {{"$faceDetectionModelPath", face_detection_model_path}});
@@ -268,10 +287,13 @@ void MPFaceMeshDetector::DetectLandmarks(cv::Point3f **multi_face_landmarks,
 
 extern "C" {
 DLLEXPORT MPFaceMeshDetector *
-MPFaceMeshDetectorConstruct(int numFaces, const char *face_detection_model_path,
-                            const char *face_landmark_model_path) {
+MPFaceMeshDetectorConstruct(int numFaces,
+                            const char *face_detection_model_path,
+                            const char *face_landmark_model_path,
+                            bool with_attention,
+                            const char *face_landmark_model_with_attention_path) {
   return new MPFaceMeshDetector(numFaces, face_detection_model_path,
-                                face_landmark_model_path);
+                                face_landmark_model_path, with_attention, face_landmark_model_with_attention_path);
 }
 
 DLLEXPORT void MPFaceMeshDetectorDestruct(MPFaceMeshDetector *detector) {
@@ -331,10 +353,12 @@ node {
 # Defines side packets for further use in the graph.
 node {
   calculator: "ConstantSidePacketCalculator"
-  output_side_packet: "PACKET:num_faces"
+  output_side_packet: "PACKET:0:num_faces"
+  output_side_packet: "PACKET:1:with_attention"
   node_options: {
     [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
       packet { int_value: $numFaces }
+      packet { bool_value: $with_attention }
     }
   }
 }
@@ -374,6 +398,7 @@ node {
     input_side_packet: "MODEL_BLOB:face_detection_model_blob"
     output_side_packet: "MODEL:face_detection_model"
 }
+
 node {
     calculator: "TfLiteModelCalculator"
     input_side_packet: "MODEL_BLOB:face_landmark_model_blob"
@@ -388,6 +413,7 @@ node {
   input_side_packet: "NUM_FACES:num_faces"
   input_side_packet: "MODEL:0:face_detection_model"
   input_side_packet: "MODEL:1:face_landmark_model"
+  input_side_packet: "WITH_ATTENTION:with_attention"
   output_stream: "LANDMARKS:multi_face_landmarks"
   output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
   output_stream: "DETECTIONS:face_detections"
diff --git a/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.h b/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.h
index 6705b42e6..84556fb4d 100644
--- a/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.h
+++ b/mediapipe/examples/desktop/face_mesh_dll/face_mesh_lib.h
@@ -31,25 +31,32 @@
 
 class MPFaceMeshDetector {
 public:
-  MPFaceMeshDetector(int numFaces, const char *face_detection_model_path,
-                     const char *face_landmark_model_path);
+  MPFaceMeshDetector(int numFaces,
+                     const char *face_detection_model_path,
+                     const char *face_landmark_model_path,
+                     bool with_attention,
+                     const char* face_landmark_model_with_attention_path);
 
   void DetectFaces(const cv::Mat &camera_frame,
                    cv::Rect *multi_face_bounding_boxes, int *numFaces);
-  
+
   void DetectLandmarks(cv::Point2f **multi_face_landmarks, int *numFaces);
   void DetectLandmarks(cv::Point3f **multi_face_landmarks, int *numFaces);
 
-  static constexpr auto kLandmarksNum = 468;
+  static constexpr auto kLandmarksNumWithoutAttention = 468;
+  static constexpr auto kLandmarksNumWithAttention = 478;
+  static int kLandmarksNum;
 
 private:
   absl::Status InitFaceMeshDetector(int numFaces,
                                     const char *face_detection_model_path,
-                                    const char *face_landmark_model_path);
+                                    const char *face_landmark_model_path,
+                                    bool with_attention,
+                                    const char* face_landmark_model_with_attention_path);
   absl::Status DetectFacesWithStatus(const cv::Mat &camera_frame,
                                      cv::Rect *multi_face_bounding_boxes,
                                      int *numFaces);
-  
+
   absl::Status DetectLandmarksWithStatus(cv::Point2f **multi_face_landmarks);
   absl::Status DetectLandmarksWithStatus(cv::Point3f **multi_face_landmarks);
 
@@ -79,8 +86,12 @@ extern "C" {
 #endif
 
 DLLEXPORT MPFaceMeshDetector *
-MPFaceMeshDetectorConstruct(int numFaces, const char *face_detection_model_path,
-                            const char *face_landmark_model_path);
+MPFaceMeshDetectorConstruct(
+    int numFaces,
+    const char *face_detection_model_path,
+    const char *face_landmark_model_path, bool with_attention = true,
+    const char *face_landmark_model_with_attention_path =
+        "mediapipe/modules/face_landmark/face_landmark_with_attention.tflite");
 
 DLLEXPORT void MPFaceMeshDetectorDestruct(MPFaceMeshDetector *detector);
 
diff --git a/mediapipe/modules/face_landmark/BUILD b/mediapipe/modules/face_landmark/BUILD
index ba2a53198..de16605b8 100644
--- a/mediapipe/modules/face_landmark/BUILD
+++ b/mediapipe/modules/face_landmark/BUILD
@@ -47,14 +47,18 @@ mediapipe_simple_subgraph(
     graph = "face_landmark_side_model_cpu.pbtxt",
     register_as = "FaceLandmarkSideModelCpu",
     deps = [
+        ":tensors_to_face_landmarks",
+        ":tensors_to_face_landmarks_with_attention",
         "//mediapipe/calculators/core:gate_calculator",
         "//mediapipe/calculators/core:split_vector_calculator",
         "//mediapipe/calculators/tensor:image_to_tensor_calculator",
         "//mediapipe/calculators/tensor:inference_calculator",
         "//mediapipe/calculators/tensor:tensors_to_floats_calculator",
         "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator",
+        "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator",
         "//mediapipe/calculators/util:landmark_projection_calculator",
         "//mediapipe/calculators/util:thresholding_calculator",
+        "//mediapipe/framework/tool:switch_container",
     ],
 )
 
diff --git a/mediapipe/modules/face_landmark/face_landmark_front_side_model_cpu_with_face_counter.pbtxt b/mediapipe/modules/face_landmark/face_landmark_front_side_model_cpu_with_face_counter.pbtxt
index dc83f17b7..2d82b9883 100644
--- a/mediapipe/modules/face_landmark/face_landmark_front_side_model_cpu_with_face_counter.pbtxt
+++ b/mediapipe/modules/face_landmark/face_landmark_front_side_model_cpu_with_face_counter.pbtxt
@@ -31,8 +31,12 @@ input_side_packet: "MODEL:0:face_detection_model"
 # NOTE: mediapipe/modules/face_landmark/face_landmark.tflite model
 # only, can be passed here, otherwise - results are undefined.
 input_side_packet: "MODEL:1:face_landmark_model"
-
-# Collection of detected/predicted faces, each represented as a list of 468 face
+# Whether to run face mesh model with attention on lips and eyes. (bool)
+# Attention provides more accuracy on lips and eye regions as well as iris
+# landmarks.
+input_side_packet: "WITH_ATTENTION:with_attention"
+# Collection of detected/predicted faces. The per-face list size depends on
+# with_attention: true yields 478 face landmarks, false yields 468 face
 # landmarks. (std::vector<NormalizedLandmarkList>)
 # NOTE: there will not be an output packet in the LANDMARKS stream for this
 # particular timestamp if none of faces detected. However, the MediaPipe
@@ -207,6 +211,7 @@ node {
   input_stream: "IMAGE:landmarks_loop_image"
   input_stream: "ROI:face_rect"
   input_side_packet: "MODEL:face_landmark_model"
+  input_side_packet: "WITH_ATTENTION:with_attention"
   output_stream: "LANDMARKS:face_landmarks"
 }
 
diff --git a/mediapipe/modules/face_landmark/face_landmark_side_model_cpu.pbtxt b/mediapipe/modules/face_landmark/face_landmark_side_model_cpu.pbtxt
index d8537fd82..f1d8b81a6 100644
--- a/mediapipe/modules/face_landmark/face_landmark_side_model_cpu.pbtxt
+++ b/mediapipe/modules/face_landmark/face_landmark_side_model_cpu.pbtxt
@@ -29,7 +29,10 @@ input_stream: "ROI:roi"
 # only, can be passed here, otherwise - results are undefined.
 input_side_packet: "MODEL:face_landmark_model"
 
-
+# Whether to run face mesh model with attention on lips and eyes. (bool)
+# Attention provides more accuracy on lips and eye regions as well as iris
+# landmarks.
+input_side_packet: "WITH_ATTENTION:with_attention"
 # 468 face landmarks within the given ROI. (NormalizedLandmarkList)
 # NOTE: if a face is not present within the given ROI, for this particular
 # timestamp there will not be an output packet in the LANDMARKS stream. However,
@@ -55,31 +58,56 @@ node: {
   }
 }
 
+# Generates a single side packet containing a TensorFlow Lite op resolver that
+# supports custom ops needed by the model used in this graph.
+node {
+  calculator: "TfLiteCustomOpResolverCalculator"
+  output_side_packet: "op_resolver"
+}
+
 # Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a
 # vector of tensors representing, for instance, detection boxes/keypoints and
 # scores.
 node {
   calculator: "InferenceCalculator"
   input_stream: "TENSORS:input_tensors"
-  output_stream: "TENSORS:output_tensors"
   input_side_packet: "MODEL:face_landmark_model"
-  options {
+  input_side_packet: "CUSTOM_OP_RESOLVER:op_resolver"
+  output_stream: "TENSORS:output_tensors"
+  options: {
     [mediapipe.InferenceCalculatorOptions.ext] {
       delegate { tflite {} }
     }
   }
 }
 
-# Splits a vector of tensors into multiple vectors.
+# Splits a vector of tensors into landmark tensors and face flag tensor.
 node {
-  calculator: "SplitTensorVectorCalculator"
+  calculator: "SwitchContainer"
+  input_side_packet: "ENABLE:with_attention"
   input_stream: "output_tensors"
   output_stream: "landmark_tensors"
   output_stream: "face_flag_tensor"
   options: {
-    [mediapipe.SplitVectorCalculatorOptions.ext] {
-      ranges: { begin: 0 end: 1 }
-      ranges: { begin: 1 end: 2 }
+    [mediapipe.SwitchContainerOptions.ext] {
+      contained_node: {
+        calculator: "SplitTensorVectorCalculator"
+        options: {
+          [mediapipe.SplitVectorCalculatorOptions.ext] {
+            ranges: { begin: 0 end: 1 }
+            ranges: { begin: 1 end: 2 }
+          }
+        }
+      }
+      contained_node: {
+        calculator: "SplitTensorVectorCalculator"
+        options: {
+          [mediapipe.SplitVectorCalculatorOptions.ext] {
+            ranges: { begin: 0 end: 6 }
+            ranges: { begin: 6 end: 7 }
+          }
+        }
+      }
     }
   }
 }
@@ -121,14 +149,18 @@ node {
 # Decodes the landmark tensors into a vector of landmarks, where the landmark
 # coordinates are normalized by the size of the input image to the model.
 node {
-  calculator: "TensorsToLandmarksCalculator"
+  calculator: "SwitchContainer"
+  input_side_packet: "ENABLE:with_attention"
   input_stream: "TENSORS:ensured_landmark_tensors"
-  output_stream: "NORM_LANDMARKS:landmarks"
+  output_stream: "LANDMARKS:landmarks"
   options: {
-    [mediapipe.TensorsToLandmarksCalculatorOptions.ext] {
-      num_landmarks: 468
-      input_image_width: 192
-      input_image_height: 192
+    [mediapipe.SwitchContainerOptions.ext] {
+      contained_node: {
+        calculator: "TensorsToFaceLandmarks"
+      }
+      contained_node: {
+        calculator: "TensorsToFaceLandmarksWithAttention"
+      }
     }
   }
 }