Update face detector graph for downstream face landmarks graph.

PiperOrigin-RevId: 511566984
Authored by MediaPipe Team on 2023-02-22 12:29:51 -08:00; committed by Copybara-Service
parent fbbc13d756
commit 000aeeb036
7 changed files with 188 additions and 52 deletions

View File

@@ -31,9 +31,8 @@ cc_library(
         "//mediapipe/calculators/tensor:tensors_to_detections_calculator_cc_proto",
         "//mediapipe/calculators/tflite:ssd_anchors_calculator",
         "//mediapipe/calculators/tflite:ssd_anchors_calculator_cc_proto",
-        "//mediapipe/calculators/util:detection_label_id_to_text_calculator",
-        "//mediapipe/calculators/util:detection_label_id_to_text_calculator_cc_proto",
         "//mediapipe/calculators/util:detection_projection_calculator",
+        "//mediapipe/calculators/util:detection_transformation_calculator",
         "//mediapipe/calculators/util:detections_to_rects_calculator",
         "//mediapipe/calculators/util:detections_to_rects_calculator_cc_proto",
         "//mediapipe/calculators/util:non_max_suppression_calculator",

View File

@@ -21,7 +21,6 @@ limitations under the License.
 #include "mediapipe/calculators/tensor/image_to_tensor_calculator.pb.h"
 #include "mediapipe/calculators/tensor/tensors_to_detections_calculator.pb.h"
 #include "mediapipe/calculators/tflite/ssd_anchors_calculator.pb.h"
-#include "mediapipe/calculators/util/detection_label_id_to_text_calculator.pb.h"
 #include "mediapipe/calculators/util/detections_to_rects_calculator.pb.h"
 #include "mediapipe/calculators/util/non_max_suppression_calculator.pb.h"
 #include "mediapipe/calculators/util/rect_transformation_calculator.pb.h"
@@ -58,21 +57,40 @@ namespace {
 constexpr char kImageTag[] = "IMAGE";
 constexpr char kNormRectTag[] = "NORM_RECT";
 constexpr char kDetectionsTag[] = "DETECTIONS";
+constexpr char kAnchorsTag[] = "ANCHORS";
+constexpr char kImageSizeTag[] = "IMAGE_SIZE";
+constexpr char kNormRectsTag[] = "NORM_RECTS";
+constexpr char kProjectionMatrixTag[] = "PROJECTION_MATRIX";
+constexpr char kTensorsTag[] = "TENSORS";
+constexpr char kMatrixTag[] = "MATRIX";
+constexpr char kFaceRectsTag[] = "FACE_RECTS";
+constexpr char kExpandedFaceRectsTag[] = "EXPANDED_FACE_RECTS";
+constexpr char kPixelDetectionsTag[] = "PIXEL_DETECTIONS";
+
+struct FaceDetectionOuts {
+  Source<std::vector<Detection>> face_detections;
+  Source<std::vector<NormalizedRect>> face_rects;
+  Source<std::vector<NormalizedRect>> expanded_face_rects;
+  Source<Image> image;
+};

 void ConfigureSsdAnchorsCalculator(
     mediapipe::SsdAnchorsCalculatorOptions* options) {
   // TODO config SSD anchors parameters from metadata.
-  options->set_num_layers(1);
+  options->set_num_layers(4);
   options->set_min_scale(0.1484375);
   options->set_max_scale(0.75);
-  options->set_input_size_height(192);
-  options->set_input_size_width(192);
+  options->set_input_size_height(128);
+  options->set_input_size_width(128);
   options->set_anchor_offset_x(0.5);
   options->set_anchor_offset_y(0.5);
-  options->add_strides(4);
+  options->add_strides(8);
+  options->add_strides(16);
+  options->add_strides(16);
+  options->add_strides(16);
   options->add_aspect_ratios(1.0);
   options->set_fixed_anchor_size(true);
-  options->set_interpolated_scale_aspect_ratio(0.0);
+  options->set_interpolated_scale_aspect_ratio(1.0);
 }
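The new anchor configuration (four layers, strides {8, 16, 16, 16}, 128x128 input) is the standard short-range BlazeFace layout, and it determines the anchor count that `num_boxes` must match in the next hunk. A minimal standalone sketch of the arithmetic, assuming SsdAnchorsCalculator's usual behavior of merging same-stride layers:

#include <cstdio>

int main() {
  const int input_size = 128;
  // Stride-8 grid: 16x16 cells, 2 anchors per cell (one aspect ratio plus
  // the interpolated-scale anchor enabled above).
  const int stride8_anchors = (input_size / 8) * (input_size / 8) * 2;
  // The three stride-16 layers share one 8x8 grid: 3 layers * 2 anchors = 6.
  const int stride16_anchors = (input_size / 16) * (input_size / 16) * 6;
  // 512 + 384 = 896, matching set_num_boxes(896) below.
  std::printf("%d\n", stride8_anchors + stride16_anchors);
  return 0;
}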
@@ -80,7 +98,7 @@ void ConfigureTensorsToDetectionsCalculator(
     mediapipe::TensorsToDetectionsCalculatorOptions* options) {
   // TODO use metadata to configure these fields.
   options->set_num_classes(1);
-  options->set_num_boxes(2304);
+  options->set_num_boxes(896);
   options->set_num_coords(16);
   options->set_box_coord_offset(0);
   options->set_keypoint_coord_offset(4);
@@ -90,10 +108,10 @@ void ConfigureTensorsToDetectionsCalculator(
   options->set_score_clipping_thresh(100.0);
   options->set_reverse_output_order(true);
   options->set_min_score_thresh(tasks_options.min_detection_confidence());
-  options->set_x_scale(192.0);
-  options->set_y_scale(192.0);
-  options->set_w_scale(192.0);
-  options->set_h_scale(192.0);
+  options->set_x_scale(128.0);
+  options->set_y_scale(128.0);
+  options->set_w_scale(128.0);
+  options->set_h_scale(128.0);
 }
@@ -107,8 +125,70 @@ void ConfigureNonMaxSuppressionCalculator(
       mediapipe::NonMaxSuppressionCalculatorOptions::WEIGHTED);
 }

+void ConfigureDetectionsToRectsCalculator(
+    mediapipe::DetectionsToRectsCalculatorOptions* options) {
+  // Left eye.
+  options->set_rotation_vector_start_keypoint_index(0);
+  // Right eye.
+  options->set_rotation_vector_end_keypoint_index(1);
+  options->set_rotation_vector_target_angle_degrees(0);
+}
+
+void ConfigureRectTransformationCalculator(
+    mediapipe::RectTransformationCalculatorOptions* options) {
+  options->set_scale_x(1.5);
+  options->set_scale_y(1.5);
+}
 }  // namespace
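Taken together, the two new helpers define the RoI post-processing: DetectionsToRectsCalculator rotates each detection rect so the keypoint-0 to keypoint-1 (left-eye to right-eye) line is horizontal, and RectTransformationCalculator then scales the rect by 1.5x. A rough sketch of the rotation formula, assuming the calculator's usual atan2-based computation on normalized keypoints (an illustration, not code from this change):

#include <cmath>

// Rotation (in radians) that aligns the left-eye -> right-eye line with the
// X-axis (target angle 0, as configured above). Keypoints are normalized, so
// both axes are rescaled by the image size to handle non-square images.
float FaceRectRotation(float left_eye_x, float left_eye_y, float right_eye_x,
                       float right_eye_y, int image_width, int image_height) {
  const float target_angle = 0.0f;  // rotation_vector_target_angle_degrees
  return target_angle -
         std::atan2(-(right_eye_y - left_eye_y) * image_height,
                    (right_eye_x - left_eye_x) * image_width);
}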
// A "mediapipe.tasks.vision.face_detector.FaceDetectorGraph" performs face
// detection.
//
// Inputs:
// IMAGE - Image
// Image to perform detection on.
// NORM_RECT - NormalizedRect @Optional
// Describes image rotation and region of image to perform detection on. If
// not provided, whole image is used for face detection.
//
// Outputs:
// DETECTIONS - std::vector<Detection>
// Detected face with maximum `num_faces` specified in options.
// FACE_RECTS - std::vector<NormalizedRect>
// Detected face bounding boxes in normalized coordinates.
// EXPANDED_FACE_RECTS - std::vector<NormalizedRect>
// Expanded face bounding boxes in normalized coordinates so that bounding
// boxes likely contain the whole face. This is usually used as RoI for face
// landmarks detection to run on.
// IMAGE - Image
// The input image that the face detector runs on and has the pixel data
// stored on the target storage (CPU vs GPU).
// All returned coordinates are in the unrotated and uncropped input image
// coordinates system.
//
// Example:
// node {
// calculator: "mediapipe.tasks.vision.face_detector.FaceDetectorGraph"
// input_stream: "IMAGE:image"
// input_stream: "NORM_RECT:norm_rect"
// output_stream: "DETECTIONS:palm_detections"
// output_stream: "FACE_RECTS:face_rects"
// output_stream: "EXPANDED_FACE_RECTS:expanded_face_rects"
// output_stream: "IMAGE:image_out"
// options {
// [mediapipe.tasks.vision.face_detector.proto.FaceDetectorGraphOptions.ext]
// {
// base_options {
// model_asset {
// file_name: "face_detection.tflite"
// }
// }
// min_detection_confidence: 0.5
// num_faces: 2
// }
// }
// }
 class FaceDetectorGraph : public core::ModelTaskGraph {
  public:
   absl::StatusOr<CalculatorGraphConfig> GetConfig(
@@ -116,17 +196,24 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
     ASSIGN_OR_RETURN(const auto* model_resources,
                      CreateModelResources<FaceDetectorGraphOptions>(sc));
     Graph graph;
-    ASSIGN_OR_RETURN(auto face_detections,
+    ASSIGN_OR_RETURN(auto outs,
                      BuildFaceDetectionSubgraph(
                          sc->Options<FaceDetectorGraphOptions>(),
                          *model_resources, graph[Input<Image>(kImageTag)],
                          graph[Input<NormalizedRect>(kNormRectTag)], graph));
-    face_detections >> graph[Output<std::vector<Detection>>(kDetectionsTag)];
+    outs.face_detections >>
+        graph.Out(kDetectionsTag).Cast<std::vector<Detection>>();
+    outs.face_rects >>
+        graph.Out(kFaceRectsTag).Cast<std::vector<NormalizedRect>>();
+    outs.expanded_face_rects >>
+        graph.Out(kExpandedFaceRectsTag).Cast<std::vector<NormalizedRect>>();
+    outs.image >> graph.Out(kImageTag).Cast<Image>();
     return graph.GetConfig();
   }

  private:
-  absl::StatusOr<Source<std::vector<Detection>>> BuildFaceDetectionSubgraph(
+  absl::StatusOr<FaceDetectionOuts> BuildFaceDetectionSubgraph(
       const FaceDetectorGraphOptions& subgraph_options,
       const core::ModelResources& model_resources, Source<Image> image_in,
       Source<NormalizedRect> norm_rect_in, Graph& graph) {
@@ -149,17 +236,18 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
     image_to_tensor_options.set_keep_aspect_ratio(true);
     image_to_tensor_options.set_border_mode(
         mediapipe::ImageToTensorCalculatorOptions::BORDER_ZERO);
-    image_in >> preprocessing.In("IMAGE");
-    norm_rect_in >> preprocessing.In("NORM_RECT");
-    auto preprocessed_tensors = preprocessing.Out("TENSORS");
-    auto matrix = preprocessing.Out("MATRIX");
+    image_in >> preprocessing.In(kImageTag);
+    norm_rect_in >> preprocessing.In(kNormRectTag);
+    auto preprocessed_tensors = preprocessing.Out(kTensorsTag);
+    auto matrix = preprocessing.Out(kMatrixTag);
+    auto image_size = preprocessing.Out(kImageSizeTag);

     // Face detection model inference.
     auto& inference = AddInference(
         model_resources, subgraph_options.base_options().acceleration(), graph);
-    preprocessed_tensors >> inference.In("TENSORS");
+    preprocessed_tensors >> inference.In(kTensorsTag);
     auto model_output_tensors =
-        inference.Out("TENSORS").Cast<std::vector<Tensor>>();
+        inference.Out(kTensorsTag).Cast<std::vector<Tensor>>();

     // Generates a single side packet containing a vector of SSD anchors.
     auto& ssd_anchor = graph.AddNode("SsdAnchorsCalculator");
@@ -174,9 +262,9 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
         subgraph_options,
         &tensors_to_detections
              .GetOptions<mediapipe::TensorsToDetectionsCalculatorOptions>());
-    model_output_tensors >> tensors_to_detections.In("TENSORS");
-    anchors >> tensors_to_detections.SideIn("ANCHORS");
-    auto detections = tensors_to_detections.Out("DETECTIONS");
+    model_output_tensors >> tensors_to_detections.In(kTensorsTag);
+    anchors >> tensors_to_detections.SideIn(kAnchorsTag);
+    auto detections = tensors_to_detections.Out(kDetectionsTag);

     // Non maximum suppression removes redundant face detections.
     auto& non_maximum_suppression =
@@ -190,12 +278,60 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
     // Projects detections back into the input image coordinates system.
     auto& detection_projection = graph.AddNode("DetectionProjectionCalculator");
-    nms_detections >> detection_projection.In("DETECTIONS");
-    matrix >> detection_projection.In("PROJECTION_MATRIX");
-    auto face_detections =
-        detection_projection[Output<std::vector<Detection>>("DETECTIONS")];
-    return {face_detections};
+    nms_detections >> detection_projection.In(kDetectionsTag);
+    matrix >> detection_projection.In(kProjectionMatrixTag);
+    auto face_detections = detection_projection.Out(kDetectionsTag);
+
+    // Clips face detections to the maximum number of faces.
+    auto& clip_detection_vector_size =
+        graph.AddNode("ClipDetectionVectorSizeCalculator");
+    clip_detection_vector_size
+        .GetOptions<mediapipe::ClipVectorSizeCalculatorOptions>()
+        .set_max_vec_size(subgraph_options.num_faces());
+    face_detections >> clip_detection_vector_size.In("");
+    auto clipped_face_detections =
+        clip_detection_vector_size.Out("").Cast<std::vector<Detection>>();
+
+    // Converts the face detections into a rectangle (normalized by image
+    // size) that encloses the face and is rotated such that the line
+    // connecting the left eye and the right eye is aligned with the X-axis
+    // of the rectangle.
+    auto& detections_to_rects = graph.AddNode("DetectionsToRectsCalculator");
+    ConfigureDetectionsToRectsCalculator(
+        &detections_to_rects
+             .GetOptions<mediapipe::DetectionsToRectsCalculatorOptions>());
+    image_size >> detections_to_rects.In(kImageSizeTag);
+    clipped_face_detections >> detections_to_rects.In(kDetectionsTag);
+    auto face_rects = detections_to_rects.Out(kNormRectsTag)
+                          .Cast<std::vector<NormalizedRect>>();
+
+    // Expands and shifts the rectangle that contains the face so that it's
+    // likely to cover the entire face.
+    auto& rect_transformation = graph.AddNode("RectTransformationCalculator");
+    ConfigureRectTransformationCalculator(
+        &rect_transformation
+             .GetOptions<mediapipe::RectTransformationCalculatorOptions>());
+    face_rects >> rect_transformation.In(kNormRectsTag);
+    image_size >> rect_transformation.In(kImageSizeTag);
+    auto expanded_face_rects =
+        rect_transformation.Out("").Cast<std::vector<NormalizedRect>>();
+
+    // Converts relative detection bounding boxes into pixel detection
+    // bounding boxes.
+    auto& detection_transformation =
+        graph.AddNode("DetectionTransformationCalculator");
+    detection_projection.Out(kDetectionsTag) >>
+        detection_transformation.In(kDetectionsTag);
+    preprocessing.Out(kImageSizeTag) >>
+        detection_transformation.In(kImageSizeTag);
+    auto face_pixel_detections =
+        detection_transformation.Out(kPixelDetectionsTag)
+            .Cast<std::vector<Detection>>();
+
+    return FaceDetectionOuts{
+        /* face_detections= */ face_pixel_detections,
+        /* face_rects= */ face_rects,
+        /* expanded_face_rects= */ expanded_face_rects,
+        /* image= */ preprocessing.Out(kImageTag).Cast<Image>()};
   }
 };
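Per the commit title, FACE_RECTS and EXPANDED_FACE_RECTS exist to feed a downstream face landmarks graph. A hypothetical wiring sketch follows; the landmarks graph name and its stream tags are illustrative assumptions, not part of this change:

// Hypothetical consumer of the new outputs.
mediapipe::api2::builder::Graph graph;
auto& detector = graph.AddNode(
    "mediapipe.tasks.vision.face_detector.FaceDetectorGraph");
auto& landmarker = graph.AddNode(
    "mediapipe.tasks.vision.face_landmarker.FaceLandmarksDetectorGraph");
graph.In("IMAGE") >> detector.In("IMAGE");
// The 1.5x-expanded rects give the landmarks model margin around each face;
// the tight FACE_RECTS would crop too aggressively.
detector.Out("EXPANDED_FACE_RECTS") >> landmarker.In("NORM_RECT");
detector.Out("IMAGE") >> landmarker.In("IMAGE");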

View File

@@ -74,6 +74,8 @@ constexpr char kTestDataDirectory[] = "/mediapipe/tasks/testdata/vision/";
 constexpr char kFullRangeBlazeFaceModel[] = "face_detection_full_range.tflite";
 constexpr char kFullRangeSparseBlazeFaceModel[] =
     "face_detection_full_range_sparse.tflite";
+constexpr char kShortRangeBlazeFaceModel[] =
+    "face_detection_short_range.tflite";
 constexpr char kPortraitImage[] = "portrait.jpg";
 constexpr char kPortraitExpectedDetection[] =
     "portrait_expected_detection.pbtxt";
@@ -161,14 +163,8 @@ TEST_P(FaceDetectorGraphTest, Succeed) {
 INSTANTIATE_TEST_SUITE_P(
     FaceDetectorGraphTest, FaceDetectorGraphTest,
-    Values(TestParams{.test_name = "FullRange",
-                      .face_detection_model_name = kFullRangeBlazeFaceModel,
-                      .test_image_name = kPortraitImage,
-                      .expected_result = {GetExpectedFaceDetectionResult(
-                          kPortraitExpectedDetection)}},
-           TestParams{
-               .test_name = "FullRangeSparse",
-               .face_detection_model_name = kFullRangeSparseBlazeFaceModel,
+    Values(TestParams{.test_name = "ShortRange",
+                      .face_detection_model_name = kShortRangeBlazeFaceModel,
                       .test_image_name = kPortraitImage,
                       .expected_result = {GetExpectedFaceDetectionResult(
                           kPortraitExpectedDetection)}}),

View File

@@ -39,4 +39,7 @@ message FaceDetectorGraphOptions {
   // IoU threshold ([0.0, 1.0]) for detections to be considered duplicates in
   // non-maximum suppression.
   optional float min_suppression_threshold = 3 [default = 0.5];
+
+  // Maximum number of faces to detect in the image.
+  optional int32 num_faces = 4 [default = 1];
 }

View File

@@ -39,6 +39,7 @@ mediapipe_files(srcs = [
     "deeplabv3.tflite",
     "face_detection_full_range.tflite",
     "face_detection_full_range_sparse.tflite",
+    "face_detection_short_range.tflite",
     "face_landmark.tflite",
     "fist.jpg",
     "fist.png",

@@ -137,6 +138,7 @@ filegroup(
         "deeplabv3.tflite",
         "face_detection_full_range.tflite",
        "face_detection_full_range_sparse.tflite",
+        "face_detection_short_range.tflite",
         "face_landmark.tflite",
         "hand_landmark_full.tflite",
         "hand_landmark_lite.tflite",

View File

@@ -1,12 +1,12 @@
 # proto-file: mediapipe/framework/formats/detection.proto
 # proto-message: Detection
 location_data {
-  format: RELATIVE_BOUNDING_BOX
-  relative_bounding_box {
-    xmin: 0.35494408
-    ymin: 0.1059662
-    width: 0.28768203
-    height: 0.23037356
+  format: BOUNDING_BOX
+  bounding_box {
+    xmin: 283
+    ymin: 115
+    width: 234
+    height: 234
   }
   relative_keypoints {
     x: 0.44416338
@@ -25,7 +25,7 @@ location_data {
     y: 0.2719954
   }
   relative_keypoints {
-    x: 0.37245658
+    x: 0.36063305
     y: 0.20143759
   }
   relative_keypoints {
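The golden file switches from RELATIVE_BOUNDING_BOX to pixel-space BOUNDING_BOX because the graph now returns DetectionTransformationCalculator output; the values themselves also change because the file was presumably regenerated with the short-range model. A minimal sketch of the relative-to-pixel conversion (the rounding choice here is an assumption):

#include <cmath>

struct PixelBox { int xmin, ymin, width, height; };

// Maps a normalized [0, 1] bounding box to pixel coordinates, given the
// IMAGE_SIZE that DetectionTransformationCalculator receives.
PixelBox ToPixelBox(float xmin, float ymin, float width, float height,
                    int image_width, int image_height) {
  return PixelBox{static_cast<int>(std::round(xmin * image_width)),
                  static_cast<int>(std::round(ymin * image_height)),
                  static_cast<int>(std::round(width * image_width)),
                  static_cast<int>(std::round(height * image_height))};
}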

View File

@@ -252,8 +252,8 @@ def external_files():
     http_file(
         name = "com_google_mediapipe_face_detection_short_range_tflite",
-        sha256 = "3bc182eb9f33925d9e58b5c8d59308a760f4adea8f282370e428c51212c26633",
-        urls = ["https://storage.googleapis.com/mediapipe-assets/face_detection_short_range.tflite?generation=1661875748538815"],
+        sha256 = "bbff11cebd1eb27a1e004cae0b0e63ec8c551cbf34a4451148b4908b8db3eca8",
+        urls = ["https://storage.googleapis.com/mediapipe-assets/face_detection_short_range.tflite?generation=1677044301978921"],
     )

     http_file(

@@ -264,8 +264,8 @@ def external_files():
     http_file(
         name = "com_google_mediapipe_face_landmark_with_attention_tflite",
-        sha256 = "883b7411747bac657c30c462d305d312e9dec6adbf8b85e2f5d8d722fca9455d",
-        urls = ["https://storage.googleapis.com/mediapipe-assets/face_landmark_with_attention.tflite?generation=1661875751615925"],
+        sha256 = "e06a804e0144f9929eda782122916b35d60c697c3c9344013ca2bbe76a6ce2b4",
+        urls = ["https://storage.googleapis.com/mediapipe-assets/face_landmark_with_attention.tflite?generation=1676415468821650"],
     )

     http_file(

@@ -714,8 +714,8 @@ def external_files():
     http_file(
         name = "com_google_mediapipe_portrait_expected_detection_pbtxt",
-        sha256 = "bb54e08e87844ef14bb185d5cb808908eb6011bfa6db48bd22d9650f6fda338b",
-        urls = ["https://storage.googleapis.com/mediapipe-assets/portrait_expected_detection.pbtxt?generation=1674261627835475"],
+        sha256 = "ace755f0fd0ba3b2d75e4f8bb1b08d2f65975fd5570898004540dfef735c1c3d",
+        urls = ["https://storage.googleapis.com/mediapipe-assets/portrait_expected_detection.pbtxt?generation=1677044311581104"],
     )

     http_file(