Update face detector graph for downstream face landmarks graph.
PiperOrigin-RevId: 511566984
This commit is contained in:
parent
fbbc13d756
commit
000aeeb036
|
@ -31,9 +31,8 @@ cc_library(
|
|||
"//mediapipe/calculators/tensor:tensors_to_detections_calculator_cc_proto",
|
||||
"//mediapipe/calculators/tflite:ssd_anchors_calculator",
|
||||
"//mediapipe/calculators/tflite:ssd_anchors_calculator_cc_proto",
|
||||
"//mediapipe/calculators/util:detection_label_id_to_text_calculator",
|
||||
"//mediapipe/calculators/util:detection_label_id_to_text_calculator_cc_proto",
|
||||
"//mediapipe/calculators/util:detection_projection_calculator",
|
||||
"//mediapipe/calculators/util:detection_transformation_calculator",
|
||||
"//mediapipe/calculators/util:detections_to_rects_calculator",
|
||||
"//mediapipe/calculators/util:detections_to_rects_calculator_cc_proto",
|
||||
"//mediapipe/calculators/util:non_max_suppression_calculator",
|
||||
|
|
|
@ -21,7 +21,6 @@ limitations under the License.
|
|||
#include "mediapipe/calculators/tensor/image_to_tensor_calculator.pb.h"
|
||||
#include "mediapipe/calculators/tensor/tensors_to_detections_calculator.pb.h"
|
||||
#include "mediapipe/calculators/tflite/ssd_anchors_calculator.pb.h"
|
||||
#include "mediapipe/calculators/util/detection_label_id_to_text_calculator.pb.h"
|
||||
#include "mediapipe/calculators/util/detections_to_rects_calculator.pb.h"
|
||||
#include "mediapipe/calculators/util/non_max_suppression_calculator.pb.h"
|
||||
#include "mediapipe/calculators/util/rect_transformation_calculator.pb.h"
|
||||
|
@ -58,21 +57,40 @@ namespace {
|
|||
constexpr char kImageTag[] = "IMAGE";
|
||||
constexpr char kNormRectTag[] = "NORM_RECT";
|
||||
constexpr char kDetectionsTag[] = "DETECTIONS";
|
||||
constexpr char kAnchorsTag[] = "ANCHORS";
|
||||
constexpr char kImageSizeTag[] = "IMAGE_SIZE";
|
||||
constexpr char kNormRectsTag[] = "NORM_RECTS";
|
||||
constexpr char kProjectionMatrixTag[] = "PROJECTION_MATRIX";
|
||||
constexpr char kTensorsTag[] = "TENSORS";
|
||||
constexpr char kMatrixTag[] = "MATRIX";
|
||||
constexpr char kFaceRectsTag[] = "FACE_RECTS";
|
||||
constexpr char kExpandedFaceRectsTag[] = "EXPANDED_FACE_RECTS";
|
||||
constexpr char kPixelDetectionsTag[] = "PIXEL_DETECTIONS";
|
||||
|
||||
struct FaceDetectionOuts {
|
||||
Source<std::vector<Detection>> face_detections;
|
||||
Source<std::vector<NormalizedRect>> face_rects;
|
||||
Source<std::vector<NormalizedRect>> expanded_face_rects;
|
||||
Source<Image> image;
|
||||
};
|
||||
|
||||
void ConfigureSsdAnchorsCalculator(
|
||||
mediapipe::SsdAnchorsCalculatorOptions* options) {
|
||||
// TODO config SSD anchors parameters from metadata.
|
||||
options->set_num_layers(1);
|
||||
options->set_num_layers(4);
|
||||
options->set_min_scale(0.1484375);
|
||||
options->set_max_scale(0.75);
|
||||
options->set_input_size_height(192);
|
||||
options->set_input_size_width(192);
|
||||
options->set_input_size_height(128);
|
||||
options->set_input_size_width(128);
|
||||
options->set_anchor_offset_x(0.5);
|
||||
options->set_anchor_offset_y(0.5);
|
||||
options->add_strides(4);
|
||||
options->add_strides(8);
|
||||
options->add_strides(16);
|
||||
options->add_strides(16);
|
||||
options->add_strides(16);
|
||||
options->add_aspect_ratios(1.0);
|
||||
options->set_fixed_anchor_size(true);
|
||||
options->set_interpolated_scale_aspect_ratio(0.0);
|
||||
options->set_interpolated_scale_aspect_ratio(1.0);
|
||||
}
|
||||
|
||||
void ConfigureTensorsToDetectionsCalculator(
|
||||
|
@ -80,7 +98,7 @@ void ConfigureTensorsToDetectionsCalculator(
|
|||
mediapipe::TensorsToDetectionsCalculatorOptions* options) {
|
||||
// TODO use metadata to configure these fields.
|
||||
options->set_num_classes(1);
|
||||
options->set_num_boxes(2304);
|
||||
options->set_num_boxes(896);
|
||||
options->set_num_coords(16);
|
||||
options->set_box_coord_offset(0);
|
||||
options->set_keypoint_coord_offset(4);
|
||||
|
@ -90,10 +108,10 @@ void ConfigureTensorsToDetectionsCalculator(
|
|||
options->set_score_clipping_thresh(100.0);
|
||||
options->set_reverse_output_order(true);
|
||||
options->set_min_score_thresh(tasks_options.min_detection_confidence());
|
||||
options->set_x_scale(192.0);
|
||||
options->set_y_scale(192.0);
|
||||
options->set_w_scale(192.0);
|
||||
options->set_h_scale(192.0);
|
||||
options->set_x_scale(128.0);
|
||||
options->set_y_scale(128.0);
|
||||
options->set_w_scale(128.0);
|
||||
options->set_h_scale(128.0);
|
||||
}
|
||||
|
||||
void ConfigureNonMaxSuppressionCalculator(
|
||||
|
@ -107,8 +125,70 @@ void ConfigureNonMaxSuppressionCalculator(
|
|||
mediapipe::NonMaxSuppressionCalculatorOptions::WEIGHTED);
|
||||
}
|
||||
|
||||
void ConfigureDetectionsToRectsCalculator(
|
||||
mediapipe::DetectionsToRectsCalculatorOptions* options) {
|
||||
// Left eye.
|
||||
options->set_rotation_vector_start_keypoint_index(0);
|
||||
// Right ete.
|
||||
options->set_rotation_vector_end_keypoint_index(1);
|
||||
options->set_rotation_vector_target_angle_degrees(0);
|
||||
}
|
||||
|
||||
void ConfigureRectTransformationCalculator(
|
||||
mediapipe::RectTransformationCalculatorOptions* options) {
|
||||
options->set_scale_x(1.5);
|
||||
options->set_scale_y(1.5);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// A "mediapipe.tasks.vision.face_detector.FaceDetectorGraph" performs face
|
||||
// detection.
|
||||
//
|
||||
// Inputs:
|
||||
// IMAGE - Image
|
||||
// Image to perform detection on.
|
||||
// NORM_RECT - NormalizedRect @Optional
|
||||
// Describes image rotation and region of image to perform detection on. If
|
||||
// not provided, whole image is used for face detection.
|
||||
//
|
||||
// Outputs:
|
||||
// DETECTIONS - std::vector<Detection>
|
||||
// Detected face with maximum `num_faces` specified in options.
|
||||
// FACE_RECTS - std::vector<NormalizedRect>
|
||||
// Detected face bounding boxes in normalized coordinates.
|
||||
// EXPANDED_FACE_RECTS - std::vector<NormalizedRect>
|
||||
// Expanded face bounding boxes in normalized coordinates so that bounding
|
||||
// boxes likely contain the whole face. This is usually used as RoI for face
|
||||
// landmarks detection to run on.
|
||||
// IMAGE - Image
|
||||
// The input image that the face detector runs on and has the pixel data
|
||||
// stored on the target storage (CPU vs GPU).
|
||||
// All returned coordinates are in the unrotated and uncropped input image
|
||||
// coordinates system.
|
||||
//
|
||||
// Example:
|
||||
// node {
|
||||
// calculator: "mediapipe.tasks.vision.face_detector.FaceDetectorGraph"
|
||||
// input_stream: "IMAGE:image"
|
||||
// input_stream: "NORM_RECT:norm_rect"
|
||||
// output_stream: "DETECTIONS:palm_detections"
|
||||
// output_stream: "FACE_RECTS:face_rects"
|
||||
// output_stream: "EXPANDED_FACE_RECTS:expanded_face_rects"
|
||||
// output_stream: "IMAGE:image_out"
|
||||
// options {
|
||||
// [mediapipe.tasks.vision.face_detector.proto.FaceDetectorGraphOptions.ext]
|
||||
// {
|
||||
// base_options {
|
||||
// model_asset {
|
||||
// file_name: "face_detection.tflite"
|
||||
// }
|
||||
// }
|
||||
// min_detection_confidence: 0.5
|
||||
// num_faces: 2
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
class FaceDetectorGraph : public core::ModelTaskGraph {
|
||||
public:
|
||||
absl::StatusOr<CalculatorGraphConfig> GetConfig(
|
||||
|
@ -116,17 +196,24 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
|
|||
ASSIGN_OR_RETURN(const auto* model_resources,
|
||||
CreateModelResources<FaceDetectorGraphOptions>(sc));
|
||||
Graph graph;
|
||||
ASSIGN_OR_RETURN(auto face_detections,
|
||||
ASSIGN_OR_RETURN(auto outs,
|
||||
BuildFaceDetectionSubgraph(
|
||||
sc->Options<FaceDetectorGraphOptions>(),
|
||||
*model_resources, graph[Input<Image>(kImageTag)],
|
||||
graph[Input<NormalizedRect>(kNormRectTag)], graph));
|
||||
face_detections >> graph[Output<std::vector<Detection>>(kDetectionsTag)];
|
||||
outs.face_detections >>
|
||||
graph.Out(kDetectionsTag).Cast<std::vector<Detection>>();
|
||||
outs.face_rects >>
|
||||
graph.Out(kFaceRectsTag).Cast<std::vector<NormalizedRect>>();
|
||||
outs.expanded_face_rects >>
|
||||
graph.Out(kExpandedFaceRectsTag).Cast<std::vector<NormalizedRect>>();
|
||||
outs.image >> graph.Out(kImageTag).Cast<Image>();
|
||||
|
||||
return graph.GetConfig();
|
||||
}
|
||||
|
||||
private:
|
||||
absl::StatusOr<Source<std::vector<Detection>>> BuildFaceDetectionSubgraph(
|
||||
absl::StatusOr<FaceDetectionOuts> BuildFaceDetectionSubgraph(
|
||||
const FaceDetectorGraphOptions& subgraph_options,
|
||||
const core::ModelResources& model_resources, Source<Image> image_in,
|
||||
Source<NormalizedRect> norm_rect_in, Graph& graph) {
|
||||
|
@ -149,17 +236,18 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
|
|||
image_to_tensor_options.set_keep_aspect_ratio(true);
|
||||
image_to_tensor_options.set_border_mode(
|
||||
mediapipe::ImageToTensorCalculatorOptions::BORDER_ZERO);
|
||||
image_in >> preprocessing.In("IMAGE");
|
||||
norm_rect_in >> preprocessing.In("NORM_RECT");
|
||||
auto preprocessed_tensors = preprocessing.Out("TENSORS");
|
||||
auto matrix = preprocessing.Out("MATRIX");
|
||||
image_in >> preprocessing.In(kImageTag);
|
||||
norm_rect_in >> preprocessing.In(kNormRectTag);
|
||||
auto preprocessed_tensors = preprocessing.Out(kTensorsTag);
|
||||
auto matrix = preprocessing.Out(kMatrixTag);
|
||||
auto image_size = preprocessing.Out(kImageSizeTag);
|
||||
|
||||
// Face detection model inferece.
|
||||
auto& inference = AddInference(
|
||||
model_resources, subgraph_options.base_options().acceleration(), graph);
|
||||
preprocessed_tensors >> inference.In("TENSORS");
|
||||
preprocessed_tensors >> inference.In(kTensorsTag);
|
||||
auto model_output_tensors =
|
||||
inference.Out("TENSORS").Cast<std::vector<Tensor>>();
|
||||
inference.Out(kTensorsTag).Cast<std::vector<Tensor>>();
|
||||
|
||||
// Generates a single side packet containing a vector of SSD anchors.
|
||||
auto& ssd_anchor = graph.AddNode("SsdAnchorsCalculator");
|
||||
|
@ -174,9 +262,9 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
|
|||
subgraph_options,
|
||||
&tensors_to_detections
|
||||
.GetOptions<mediapipe::TensorsToDetectionsCalculatorOptions>());
|
||||
model_output_tensors >> tensors_to_detections.In("TENSORS");
|
||||
anchors >> tensors_to_detections.SideIn("ANCHORS");
|
||||
auto detections = tensors_to_detections.Out("DETECTIONS");
|
||||
model_output_tensors >> tensors_to_detections.In(kTensorsTag);
|
||||
anchors >> tensors_to_detections.SideIn(kAnchorsTag);
|
||||
auto detections = tensors_to_detections.Out(kDetectionsTag);
|
||||
|
||||
// Non maximum suppression removes redundant face detections.
|
||||
auto& non_maximum_suppression =
|
||||
|
@ -190,12 +278,60 @@ class FaceDetectorGraph : public core::ModelTaskGraph {
|
|||
|
||||
// Projects detections back into the input image coordinates system.
|
||||
auto& detection_projection = graph.AddNode("DetectionProjectionCalculator");
|
||||
nms_detections >> detection_projection.In("DETECTIONS");
|
||||
matrix >> detection_projection.In("PROJECTION_MATRIX");
|
||||
auto face_detections =
|
||||
detection_projection[Output<std::vector<Detection>>("DETECTIONS")];
|
||||
nms_detections >> detection_projection.In(kDetectionsTag);
|
||||
matrix >> detection_projection.In(kProjectionMatrixTag);
|
||||
auto face_detections = detection_projection.Out(kDetectionsTag);
|
||||
|
||||
return {face_detections};
|
||||
// Clip face detections to maximum number of faces;
|
||||
auto& clip_detection_vector_size =
|
||||
graph.AddNode("ClipDetectionVectorSizeCalculator");
|
||||
clip_detection_vector_size
|
||||
.GetOptions<mediapipe::ClipVectorSizeCalculatorOptions>()
|
||||
.set_max_vec_size(subgraph_options.num_faces());
|
||||
face_detections >> clip_detection_vector_size.In("");
|
||||
auto clipped_face_detections =
|
||||
clip_detection_vector_size.Out("").Cast<std::vector<Detection>>();
|
||||
|
||||
// Converts results of face detection into a rectangle (normalized by image
|
||||
// size) that encloses the face and is rotated such that the line connecting
|
||||
// left eye and right eye is aligned with the X-axis of the rectangle.
|
||||
auto& detections_to_rects = graph.AddNode("DetectionsToRectsCalculator");
|
||||
ConfigureDetectionsToRectsCalculator(
|
||||
&detections_to_rects
|
||||
.GetOptions<mediapipe::DetectionsToRectsCalculatorOptions>());
|
||||
image_size >> detections_to_rects.In(kImageSizeTag);
|
||||
clipped_face_detections >> detections_to_rects.In(kDetectionsTag);
|
||||
auto face_rects = detections_to_rects.Out(kNormRectsTag)
|
||||
.Cast<std::vector<NormalizedRect>>();
|
||||
|
||||
// Expands and shifts the rectangle that contains the face so that it's
|
||||
// likely to cover the entire face.
|
||||
auto& rect_transformation = graph.AddNode("RectTransformationCalculator");
|
||||
ConfigureRectTransformationCalculator(
|
||||
&rect_transformation
|
||||
.GetOptions<mediapipe::RectTransformationCalculatorOptions>());
|
||||
face_rects >> rect_transformation.In(kNormRectsTag);
|
||||
image_size >> rect_transformation.In(kImageSizeTag);
|
||||
auto expanded_face_rects =
|
||||
rect_transformation.Out("").Cast<std::vector<NormalizedRect>>();
|
||||
|
||||
// Calculator to convert relative detection bounding boxes to pixel
|
||||
// detection bounding boxes.
|
||||
auto& detection_transformation =
|
||||
graph.AddNode("DetectionTransformationCalculator");
|
||||
detection_projection.Out(kDetectionsTag) >>
|
||||
detection_transformation.In(kDetectionsTag);
|
||||
preprocessing.Out(kImageSizeTag) >>
|
||||
detection_transformation.In(kImageSizeTag);
|
||||
auto face_pixel_detections =
|
||||
detection_transformation.Out(kPixelDetectionsTag)
|
||||
.Cast<std::vector<Detection>>();
|
||||
|
||||
return FaceDetectionOuts{
|
||||
/* face_detections= */ face_pixel_detections,
|
||||
/* face_rects= */ face_rects,
|
||||
/* expanded_face_rects= */ expanded_face_rects,
|
||||
/* image= */ preprocessing.Out("IMAGE").Cast<Image>()};
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -74,6 +74,8 @@ constexpr char kTestDataDirectory[] = "/mediapipe/tasks/testdata/vision/";
|
|||
constexpr char kFullRangeBlazeFaceModel[] = "face_detection_full_range.tflite";
|
||||
constexpr char kFullRangeSparseBlazeFaceModel[] =
|
||||
"face_detection_full_range_sparse.tflite";
|
||||
constexpr char kShortRangeBlazeFaceModel[] =
|
||||
"face_detection_short_range.tflite";
|
||||
constexpr char kPortraitImage[] = "portrait.jpg";
|
||||
constexpr char kPortraitExpectedDetection[] =
|
||||
"portrait_expected_detection.pbtxt";
|
||||
|
@ -161,14 +163,8 @@ TEST_P(FaceDetectorGraphTest, Succeed) {
|
|||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
FaceDetectorGraphTest, FaceDetectorGraphTest,
|
||||
Values(TestParams{.test_name = "FullRange",
|
||||
.face_detection_model_name = kFullRangeBlazeFaceModel,
|
||||
.test_image_name = kPortraitImage,
|
||||
.expected_result = {GetExpectedFaceDetectionResult(
|
||||
kPortraitExpectedDetection)}},
|
||||
TestParams{
|
||||
.test_name = "FullRangeSparse",
|
||||
.face_detection_model_name = kFullRangeSparseBlazeFaceModel,
|
||||
Values(TestParams{.test_name = "ShortRange",
|
||||
.face_detection_model_name = kShortRangeBlazeFaceModel,
|
||||
.test_image_name = kPortraitImage,
|
||||
.expected_result = {GetExpectedFaceDetectionResult(
|
||||
kPortraitExpectedDetection)}}),
|
||||
|
|
|
@ -39,4 +39,7 @@ message FaceDetectorGraphOptions {
|
|||
// IoU threshold ([0,0, 1.0]) for non-maximu-suppression to be considered
|
||||
// duplicate detetions.
|
||||
optional float min_suppression_threshold = 3 [default = 0.5];
|
||||
|
||||
// Maximum number of faces to detect in the image.
|
||||
optional int32 num_faces = 4 [default = 1];
|
||||
}
|
||||
|
|
2
mediapipe/tasks/testdata/vision/BUILD
vendored
2
mediapipe/tasks/testdata/vision/BUILD
vendored
|
@ -39,6 +39,7 @@ mediapipe_files(srcs = [
|
|||
"deeplabv3.tflite",
|
||||
"face_detection_full_range.tflite",
|
||||
"face_detection_full_range_sparse.tflite",
|
||||
"face_detection_short_range.tflite",
|
||||
"face_landmark.tflite",
|
||||
"fist.jpg",
|
||||
"fist.png",
|
||||
|
@ -137,6 +138,7 @@ filegroup(
|
|||
"deeplabv3.tflite",
|
||||
"face_detection_full_range.tflite",
|
||||
"face_detection_full_range_sparse.tflite",
|
||||
"face_detection_short_range.tflite",
|
||||
"face_landmark.tflite",
|
||||
"hand_landmark_full.tflite",
|
||||
"hand_landmark_lite.tflite",
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
# proto-file: mediapipe/framework/formats/detection.proto
|
||||
# proto-message: Detection
|
||||
location_data {
|
||||
format: RELATIVE_BOUNDING_BOX
|
||||
relative_bounding_box {
|
||||
xmin: 0.35494408
|
||||
ymin: 0.1059662
|
||||
width: 0.28768203
|
||||
height: 0.23037356
|
||||
format: BOUNDING_BOX
|
||||
bounding_box {
|
||||
xmin: 283
|
||||
ymin: 115
|
||||
width: 234
|
||||
height: 234
|
||||
}
|
||||
relative_keypoints {
|
||||
x: 0.44416338
|
||||
|
@ -25,7 +25,7 @@ location_data {
|
|||
y: 0.2719954
|
||||
}
|
||||
relative_keypoints {
|
||||
x: 0.37245658
|
||||
x: 0.36063305
|
||||
y: 0.20143759
|
||||
}
|
||||
relative_keypoints {
|
||||
|
|
12
third_party/external_files.bzl
vendored
12
third_party/external_files.bzl
vendored
|
@ -252,8 +252,8 @@ def external_files():
|
|||
|
||||
http_file(
|
||||
name = "com_google_mediapipe_face_detection_short_range_tflite",
|
||||
sha256 = "3bc182eb9f33925d9e58b5c8d59308a760f4adea8f282370e428c51212c26633",
|
||||
urls = ["https://storage.googleapis.com/mediapipe-assets/face_detection_short_range.tflite?generation=1661875748538815"],
|
||||
sha256 = "bbff11cebd1eb27a1e004cae0b0e63ec8c551cbf34a4451148b4908b8db3eca8",
|
||||
urls = ["https://storage.googleapis.com/mediapipe-assets/face_detection_short_range.tflite?generation=1677044301978921"],
|
||||
)
|
||||
|
||||
http_file(
|
||||
|
@ -264,8 +264,8 @@ def external_files():
|
|||
|
||||
http_file(
|
||||
name = "com_google_mediapipe_face_landmark_with_attention_tflite",
|
||||
sha256 = "883b7411747bac657c30c462d305d312e9dec6adbf8b85e2f5d8d722fca9455d",
|
||||
urls = ["https://storage.googleapis.com/mediapipe-assets/face_landmark_with_attention.tflite?generation=1661875751615925"],
|
||||
sha256 = "e06a804e0144f9929eda782122916b35d60c697c3c9344013ca2bbe76a6ce2b4",
|
||||
urls = ["https://storage.googleapis.com/mediapipe-assets/face_landmark_with_attention.tflite?generation=1676415468821650"],
|
||||
)
|
||||
|
||||
http_file(
|
||||
|
@ -714,8 +714,8 @@ def external_files():
|
|||
|
||||
http_file(
|
||||
name = "com_google_mediapipe_portrait_expected_detection_pbtxt",
|
||||
sha256 = "bb54e08e87844ef14bb185d5cb808908eb6011bfa6db48bd22d9650f6fda338b",
|
||||
urls = ["https://storage.googleapis.com/mediapipe-assets/portrait_expected_detection.pbtxt?generation=1674261627835475"],
|
||||
sha256 = "ace755f0fd0ba3b2d75e4f8bb1b08d2f65975fd5570898004540dfef735c1c3d",
|
||||
urls = ["https://storage.googleapis.com/mediapipe-assets/portrait_expected_detection.pbtxt?generation=1677044311581104"],
|
||||
)
|
||||
|
||||
http_file(
|
||||
|
|
Loading…
Reference in New Issue
Block a user