Remove resizing and rotation from face stylizer's postprocessing step.

The resizing and rotation logic is useful when the pipeline needs to paste the stylized face image back onto the original image. When the task only outputs the stylized face image itself, that resizing and rotation can produce strange results.
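For pipelines that still want to paste the stylized face back onto the original frame, that step can be reimplemented outside the graph. Below is a minimal, hypothetical OpenCV sketch of such a paste-back step; the helper name, its parameters, and the assumption that the face rect is available in original-image pixel coordinates are illustrative only and are not part of this change.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

// Hypothetical helper (not part of MediaPipe): pastes a stylized face of the
// model's output size back onto the original frame, redoing the resizing and
// rotation that the graph no longer performs. `center`, `size`, and
// `rotation_deg` describe the face rect in original-image pixel coordinates.
cv::Mat PasteStylizedFace(const cv::Mat& original, const cv::Mat& stylized,
                          cv::Point2f center, cv::Size2f size,
                          float rotation_deg) {
  // Destination corners: the rotated face rectangle in the original image,
  // in OpenCV's RotatedRect vertex order (bottomLeft, topLeft, topRight, ...).
  cv::Point2f dst[4];
  cv::RotatedRect(center, size, rotation_deg).points(dst);

  // Matching source corners on the stylized image; three point pairs are
  // enough to define the affine transform.
  const float w = static_cast<float>(stylized.cols);
  const float h = static_cast<float>(stylized.rows);
  const cv::Point2f src[3] = {{0.f, h}, {0.f, 0.f}, {w, 0.f}};

  cv::Mat transform = cv::getAffineTransform(src, dst);
  cv::Mat output = original.clone();
  // BORDER_TRANSPARENT leaves the original pixels outside the warped face.
  cv::warpAffine(stylized, output, transform, output.size(), cv::INTER_LINEAR,
                 cv::BORDER_TRANSPARENT);
  return output;
}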

PiperOrigin-RevId: 524441372
Jiuqiang Tang 2023-04-14 19:35:58 -07:00 committed by Copybara-Service
parent 8b3395edfb
commit e14a88052a
7 changed files with 55 additions and 132 deletions

View File

@@ -23,18 +23,12 @@ cc_library(
srcs = ["face_stylizer_graph.cc"],
deps = [
"//mediapipe/calculators/core:split_vector_calculator_cc_proto",
"//mediapipe/calculators/image:image_cropping_calculator",
"//mediapipe/calculators/image:image_cropping_calculator_cc_proto",
"//mediapipe/calculators/image:warp_affine_calculator",
"//mediapipe/calculators/image:warp_affine_calculator_cc_proto",
"//mediapipe/calculators/image:image_clone_calculator_cc_proto",
"//mediapipe/calculators/tensor:image_to_tensor_calculator_cc_proto",
"//mediapipe/calculators/tensor:inference_calculator",
"//mediapipe/calculators/util:detections_to_rects_calculator",
"//mediapipe/calculators/util:face_to_rect_calculator",
"//mediapipe/calculators/util:from_image_calculator",
"//mediapipe/calculators/util:inverse_matrix_calculator",
"//mediapipe/calculators/util:landmarks_to_detection_calculator_cc_proto",
"//mediapipe/calculators/util:to_image_calculator",
"//mediapipe/framework/api2:builder",
"//mediapipe/framework/api2:port",
"//mediapipe/framework/formats:image",
@@ -53,7 +47,6 @@ cc_library(
"//mediapipe/tasks/cc/vision/face_landmarker:face_landmarker_graph",
"//mediapipe/tasks/cc/vision/face_landmarker/proto:face_landmarker_graph_options_cc_proto",
"//mediapipe/tasks/cc/vision/face_landmarker/proto:face_landmarks_detector_graph_options_cc_proto",
"//mediapipe/tasks/cc/vision/face_stylizer/calculators:strip_rotation_calculator",
"//mediapipe/tasks/cc/vision/face_stylizer/calculators:tensors_to_image_calculator",
"//mediapipe/tasks/cc/vision/face_stylizer/calculators:tensors_to_image_calculator_cc_proto",
"//mediapipe/tasks/cc/vision/face_stylizer/proto:face_stylizer_graph_options_cc_proto",

View File

@@ -84,9 +84,7 @@ class FaceStylizer : tasks::vision::core::BaseVisionTaskApi {
// The input image can be of any size with format RGB or RGBA.
// When no face is detected on the input image, the method returns a
// std::nullopt. Otherwise, returns the stylized image of the most visible
// face. To ensure that the output image has reasonable quality, the stylized
// output image size is the smaller of the model output size and the size of
// the 'region_of_interest' specified in 'image_processing_options'.
// face. The stylized output image size is the same as the model output size.
absl::StatusOr<std::optional<mediapipe::Image>> Stylize(
mediapipe::Image image,
std::optional<core::ImageProcessingOptions> image_processing_options =
@@ -111,9 +109,7 @@ class FaceStylizer : tasks::vision::core::BaseVisionTaskApi {
// must be monotonically increasing.
// When no face is detected on the input image, the method returns a
// std::nullopt. Otherwise, returns the stylized image of the most visible
// face. To ensure that the output image has reasonable quality, the stylized
// output image size is the smaller of the model output size and the size of
// the 'region_of_interest' specified in 'image_processing_options'.
// face. The stylized output image size is the same as the model output size.
absl::StatusOr<std::optional<mediapipe::Image>> StylizeForVideo(
mediapipe::Image image, int64_t timestamp_ms,
std::optional<core::ImageProcessingOptions> image_processing_options =
@@ -143,10 +139,8 @@ class FaceStylizer : tasks::vision::core::BaseVisionTaskApi {
// The "result_callback" provides:
// - When no face is detected on the input image, the method returns a
// std::nullopt. Otherwise, returns the stylized image of the most visible
// face. To ensure that the output image has reasonable quality, the
// stylized output image size is the smaller of the model output size and
// the size of the 'region_of_interest' specified in
// 'image_processing_options'.
// face. The stylized output image size is the same as the model output
// size.
// - The input timestamp in milliseconds.
absl::Status StylizeAsync(mediapipe::Image image, int64_t timestamp_ms,
std::optional<core::ImageProcessingOptions>
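A hedged usage sketch of the C++ API documented above: only FaceStylizer::Create, Stylize, and the std::optional return value come from this header, while the namespaces, option fields, and model path shown here are assumptions for illustration, not the verified API surface.

#include <memory>
#include <optional>
#include <utility>

#include "absl/status/status.h"
#include "mediapipe/framework/formats/image.h"
#include "mediapipe/tasks/cc/vision/face_stylizer/face_stylizer.h"

// Sketch only: stylize a single image and handle the no-face case. After this
// change the returned image always has the model's output dimensions.
absl::Status StylizeOnce(mediapipe::Image image) {
  using ::mediapipe::tasks::vision::face_stylizer::FaceStylizer;
  using ::mediapipe::tasks::vision::face_stylizer::FaceStylizerOptions;

  auto options = std::make_unique<FaceStylizerOptions>();
  options->base_options.model_asset_path = "face_stylizer.task";  // assumed path

  auto stylizer_or = FaceStylizer::Create(std::move(options));
  if (!stylizer_or.ok()) return stylizer_or.status();
  auto stylizer = std::move(stylizer_or).value();

  auto result_or = stylizer->Stylize(image);
  if (!result_or.ok()) return result_or.status();

  const std::optional<mediapipe::Image>& result = result_or.value();
  if (!result.has_value()) {
    // No face detected: the task returns std::nullopt rather than an image.
    return absl::OkStatus();
  }
  // result->width() and result->height() now match the model output size.
  return absl::OkStatus();
}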

View File

@@ -19,8 +19,7 @@ limitations under the License.
#include "absl/memory/memory.h"
#include "absl/status/statusor.h"
#include "mediapipe/calculators/core/split_vector_calculator.pb.h"
#include "mediapipe/calculators/image/image_cropping_calculator.pb.h"
#include "mediapipe/calculators/image/warp_affine_calculator.pb.h"
#include "mediapipe/calculators/image/image_clone_calculator.pb.h"
#include "mediapipe/calculators/tensor/image_to_tensor_calculator.pb.h"
#include "mediapipe/calculators/util/landmarks_to_detection_calculator.pb.h"
#include "mediapipe/framework/api2/builder.h"
@@ -326,7 +325,6 @@ class FaceStylizerGraph : public core::ModelTaskGraph {
image_in >> preprocessing.In(kImageTag);
face_rect >> preprocessing.In(kNormRectTag);
auto preprocessed_tensors = preprocessing.Out(kTensorsTag);
auto transform_matrix = preprocessing.Out(kMatrixTag);
// Adds inference subgraph and connects its input stream to the output
// tensors produced by the ImageToTensorCalculator.
@@ -344,53 +342,12 @@ class FaceStylizerGraph : public core::ModelTaskGraph {
model_output_tensors >> tensors_to_image.In(kTensorsTag);
auto tensor_image = tensors_to_image.Out(kImageTag);
auto& inverse_matrix = graph.AddNode("InverseMatrixCalculator");
transform_matrix >> inverse_matrix.In(kMatrixTag);
auto inverse_transform_matrix = inverse_matrix.Out(kMatrixTag);
auto& image_converter = graph.AddNode("ImageCloneCalculator");
image_converter.GetOptions<mediapipe::ImageCloneCalculatorOptions>()
.set_output_on_gpu(false);
tensor_image >> image_converter.In("");
auto& warp_affine = graph.AddNode("WarpAffineCalculator");
auto& warp_affine_options =
warp_affine.GetOptions<WarpAffineCalculatorOptions>();
warp_affine_options.set_border_mode(
WarpAffineCalculatorOptions::BORDER_ZERO);
warp_affine_options.set_gpu_origin(mediapipe::GpuOrigin_Mode_TOP_LEFT);
tensor_image >> warp_affine.In(kImageTag);
inverse_transform_matrix >> warp_affine.In(kMatrixTag);
image_size >> warp_affine.In(kOutputSizeTag);
auto image_to_crop = warp_affine.Out(kImageTag);
// The following calculators are for cropping and resizing the output image
// based on the roi and the model output size. As the WarpAffineCalculator
// rotates the image based on the transform matrix, the rotation info in the
// rect proto is stripped to prevent the ImageCroppingCalculator from
// performing extra rotation.
auto& strip_rotation =
graph.AddNode("mediapipe.tasks.StripRotationCalculator");
face_rect >> strip_rotation.In(kNormRectTag);
auto norm_rect_no_rotation = strip_rotation.Out(kNormRectTag);
auto& from_image = graph.AddNode("FromImageCalculator");
image_to_crop >> from_image.In(kImageTag);
auto& image_cropping = graph.AddNode("ImageCroppingCalculator");
auto& image_cropping_opts =
image_cropping.GetOptions<ImageCroppingCalculatorOptions>();
image_cropping_opts.set_output_max_width(
image_to_tensor_options.output_tensor_width());
image_cropping_opts.set_output_max_height(
image_to_tensor_options.output_tensor_height());
norm_rect_no_rotation >> image_cropping.In(kNormRectTag);
auto& to_image = graph.AddNode("ToImageCalculator");
// ImageCroppingCalculator currently doesn't support mediapipe::Image, the
// graph selects its cpu or gpu path based on the image preprocessing
// backend.
if (use_gpu) {
from_image.Out(kImageGpuTag) >> image_cropping.In(kImageGpuTag);
image_cropping.Out(kImageGpuTag) >> to_image.In(kImageGpuTag);
} else {
from_image.Out(kImageCpuTag) >> image_cropping.In(kImageTag);
image_cropping.Out(kImageTag) >> to_image.In(kImageCpuTag);
}
return {{/*stylized_image=*/to_image.Out(kImageTag).Cast<Image>(),
return {{/*stylized_image=*/image_converter.Out("").Cast<Image>(),
/*original_image=*/preprocessing.Out(kImageTag).Cast<Image>()}};
}
};
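Taken together, the retained postprocessing reduces to roughly the fragment below. This is a sketch rather than the full graph: it assumes the surrounding FaceStylizerGraph builder scope (graph, model_output_tensors, preprocessing, and the kTensorsTag/kImageTag constants from the diff above), and the registered name of the tensors-to-image node is assumed.

// Convert the stylized model output tensors straight to an Image.
auto& tensors_to_image =
    graph.AddNode("mediapipe.tasks.TensorsToImageCalculator");  // name assumed
model_output_tensors >> tensors_to_image.In(kTensorsTag);

// Clone to a CPU-backed mediapipe::Image; no inverse warp, rotation
// stripping, or cropping is performed anymore.
auto& image_converter = graph.AddNode("ImageCloneCalculator");
image_converter.GetOptions<mediapipe::ImageCloneCalculatorOptions>()
    .set_output_on_gpu(false);
tensors_to_image.Out(kImageTag) >> image_converter.In("");

// The stylized output keeps the model's output resolution.
return {{/*stylized_image=*/image_converter.Out("").Cast<Image>(),
         /*original_image=*/preprocessing.Out(kImageTag).Cast<Image>()}};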

View File

@@ -198,9 +198,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The image can be of any size. To ensure that the output image has reasonable quality, the
* size of the stylized output is based the model output size and can be smaller than the input
* image.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @throws MediaPipeException if there is an internal error. Or if {@link FaceStylizer} is created
@@ -220,9 +220,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The input image can be of any size. To ensure that the output image has reasonable quality,
* the stylized output image size is the smaller of the model output size and the size of the
* {@link ImageProcessingOptions#regionOfInterest} specified in {@code imageProcessingOptions}.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param imageProcessingOptions the {@link ImageProcessingOptions} specifying how to process the
@@ -256,9 +256,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The image can be of any size. To ensure that the output image has reasonable quality, the
* size of the stylized output is based the model output size and can be smaller than the input
* image.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @throws IllegalArgumentException if the {@link ImageProcessingOptions} specify a
@@ -281,9 +281,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The input image can be of any size. To ensure that the output image has reasonable quality,
* the stylized output image size is the smaller of the model output size and the size of the
* {@link ImageProcessingOptions#regionOfInterest} specified in {@code imageProcessingOptions}.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param imageProcessingOptions the {@link ImageProcessingOptions} specifying how to process the
@@ -320,9 +320,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The image can be of any size. To ensure that the output image has reasonable quality, the
* size of the stylized output is based the model output size and can be smaller than the input
* image.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param timestampMs the input timestamp (in milliseconds).
@@ -346,9 +346,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The input image can be of any size. To ensure that the output image has reasonable quality,
* the stylized output image size is the smaller of the model output size and the size of the
* {@link ImageProcessingOptions#regionOfInterest} specified in {@code imageProcessingOptions}.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}. *
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param imageProcessingOptions the {@link ImageProcessingOptions} specifying how to process the
@@ -387,9 +387,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The image can be of any size. To ensure that the output image has reasonable quality, the
* size of the stylized output is based the model output size and can be smaller than the input
* image.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param timestampMs the input timestamp (in milliseconds).
@@ -414,9 +414,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The input image can be of any size. To ensure that the output image has reasonable quality,
* the stylized output image size is the smaller of the model output size and the size of the
* {@link ImageProcessingOptions#regionOfInterest} specified in {@code imageProcessingOptions}.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param timestampMs the input timestamp (in milliseconds).
@@ -445,9 +445,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
*
* <p>{@link FaceStylizer} supports the following color space types:
*
* <p>The image can be of any size. To ensure that the output image has reasonable quality, the
* size of the stylized output is based the model output * size and can be smaller than the input
* image.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* <ul>
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
@@ -475,9 +475,9 @@ public final class FaceStylizer extends BaseVisionTaskApi {
* <li>{@link android.graphics.Bitmap.Config#ARGB_8888}
* </ul>
*
* <p>The input image can be of any size. To ensure that the output image has reasonable quality,
* the stylized output image size is the smaller of the model output size and the size of the
* {@link ImageProcessingOptions#regionOfInterest} specified in {@code imageProcessingOptions}.
* <p>The input image can be of any size. The output image is the stylized image with the most
* visible face. The stylized output image size is the same as the model output size. When no face
* is detected on the input image, returns {@code Optional.empty()}.
*
* @param image a MediaPipe {@link MPImage} object for processing.
* @param imageProcessingOptions the {@link ImageProcessingOptions} specifying how to process the

View File

@@ -234,8 +234,8 @@ public class FaceStylizerTest {
FaceStylizerResult actualResult = faceStylizer.stylize(inputImage);
MPImage stylizedImage = actualResult.stylizedImage().get();
assertThat(stylizedImage).isNotNull();
assertThat(stylizedImage.getWidth()).isEqualTo(83);
assertThat(stylizedImage.getHeight()).isEqualTo(83);
assertThat(stylizedImage.getWidth()).isEqualTo(modelImageSize);
assertThat(stylizedImage.getHeight()).isEqualTo(modelImageSize);
}
@Test

View File

@@ -176,16 +176,13 @@ class FaceStylizer(base_vision_task_api.BaseVisionTaskApi):
Only use this method when the FaceStylizer is created with the image
running mode.
To ensure that the output image has reasonable quality, the stylized output
image size is the smaller of the model output size and the size of the
`region_of_interest` specified in `image_processing_options`.
Args:
image: MediaPipe Image.
image_processing_options: Options for image processing.
Returns:
The stylized image of the most visible face. None if no face is detected
The stylized image of the most visible face. The stylized output image
size is the same as the model output size. None if no face is detected
on the input image.
Raises:
@@ -217,17 +214,14 @@ class FaceStylizer(base_vision_task_api.BaseVisionTaskApi):
milliseconds) along with the video frame. The input timestamps should be
monotonically increasing for adjacent calls of this method.
To ensure that the output image has reasonable quality, the stylized output
image size is the smaller of the model output size and the size of the
`region_of_interest` specified in `image_processing_options`.
Args:
image: MediaPipe Image.
timestamp_ms: The timestamp of the input video frame in milliseconds.
image_processing_options: Options for image processing.
Returns:
The stylized image of the most visible face. None if no face is detected
The stylized image of the most visible face. The stylized output image
size is the same as the model output size. None if no face is detected
on the input image.
Raises:
@@ -266,12 +260,9 @@ class FaceStylizer(base_vision_task_api.BaseVisionTaskApi):
images if needed. In other words, it's not guaranteed to have output per
input image.
To ensure that the stylized image has reasonable quality, the stylized
output image size is the smaller of the model output size and the size of
the `region_of_interest` specified in `image_processing_options`.
The `result_callback` provides:
- The stylized image of the most visible face. None if no face is detected
- The stylized image of the most visible face. The stylized output image
size is the same as the model output size. None if no face is detected
on the input image.
- The input image that the face stylizer runs on.
- The input timestamp in milliseconds.

View File

@@ -129,10 +129,6 @@ export class FaceStylizer extends VisionTaskRunner {
* synchronously once the callback returns. Only use this method when the
* FaceStylizer is created with the image running mode.
*
* The input image can be of any size. To ensure that the output image has
* reasonable quality, the stylized output image size is determined by the
* model output size.
*
* @param image An image to process.
* @param callback The callback that is invoked with the stylized image. The
* lifetime of the returned data is only guaranteed for the duration of the
@@ -153,11 +149,6 @@ export class FaceStylizer extends VisionTaskRunner {
* If both are specified, the crop around the region-of-interest is extracted
* first, then the specified rotation is applied to the crop.
*
* The input image can be of any size. To ensure that the output image has
* reasonable quality, the stylized output image size is the smaller of the
* model output size and the size of the 'regionOfInterest' specified in
* 'imageProcessingOptions'.
*
* @param image An image to process.
* @param imageProcessingOptions the `ImageProcessingOptions` specifying how
* to process the input image before running inference.
@@ -192,9 +183,6 @@ export class FaceStylizer extends VisionTaskRunner {
* frame's timestamp (in milliseconds). The input timestamps must be
* monotonically increasing.
*
* To ensure that the output image has reasonable quality, the stylized
* output image size is determined by the model output size.
*
* @param videoFrame A video frame to process.
* @param timestamp The timestamp of the current frame, in ms.
* @param callback The callback that is invoked with the stylized image. The
@@ -221,10 +209,6 @@ export class FaceStylizer extends VisionTaskRunner {
* frame's timestamp (in milliseconds). The input timestamps must be
* monotonically increasing.
*
* To ensure that the output image has reasonable quality, the stylized
* output image size is the smaller of the model output size and the size of
* the 'regionOfInterest' specified in 'imageProcessingOptions'.
*
* @param videoFrame A video frame to process.
* @param imageProcessingOptions the `ImageProcessingOptions` specifying how
* to process the input image before running inference.
@@ -278,8 +262,12 @@ export class FaceStylizer extends VisionTaskRunner {
this.graphRunner.attachImageListener(
STYLIZED_IMAGE_STREAM, (image, timestamp) => {
const imageData = this.convertToImageData(image);
this.userCallback(imageData, image.width, image.height);
if (image.data instanceof WebGLTexture) {
this.userCallback(image.data, image.width, image.height);
} else {
const imageData = this.convertToImageData(image);
this.userCallback(imageData, image.width, image.height);
}
this.setLatestOutputTimestamp(timestamp);
});
this.graphRunner.attachEmptyPacketListener(