diff --git a/mediapipe/tasks/cc/vision/core/BUILD b/mediapipe/tasks/cc/vision/core/BUILD
index 12d789901..e8e197a1d 100644
--- a/mediapipe/tasks/cc/vision/core/BUILD
+++ b/mediapipe/tasks/cc/vision/core/BUILD
@@ -21,12 +21,23 @@ cc_library(
     hdrs = ["running_mode.h"],
 )
 
+cc_library(
+    name = "image_processing_options",
+    hdrs = ["image_processing_options.h"],
+    deps = [
+        "//mediapipe/tasks/cc/components/containers:rect",
+    ],
+)
+
 cc_library(
     name = "base_vision_task_api",
     hdrs = ["base_vision_task_api.h"],
     deps = [
+        ":image_processing_options",
         ":running_mode",
         "//mediapipe/calculators/core:flow_limiter_calculator",
+        "//mediapipe/framework/formats:rect_cc_proto",
+        "//mediapipe/tasks/cc/components/containers:rect",
         "//mediapipe/tasks/cc/core:base_task_api",
         "//mediapipe/tasks/cc/core:task_runner",
         "@com_google_absl//absl/status",
diff --git a/mediapipe/tasks/cc/vision/core/base_vision_task_api.h b/mediapipe/tasks/cc/vision/core/base_vision_task_api.h
index 4586cbbdd..c3c0a0261 100644
--- a/mediapipe/tasks/cc/vision/core/base_vision_task_api.h
+++ b/mediapipe/tasks/cc/vision/core/base_vision_task_api.h
@@ -16,15 +16,20 @@ limitations under the License.
 #ifndef MEDIAPIPE_TASKS_CC_VISION_CORE_BASE_VISION_TASK_API_H_
 #define MEDIAPIPE_TASKS_CC_VISION_CORE_BASE_VISION_TASK_API_H_
 
+#include <math.h>
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
+#include "mediapipe/framework/formats/rect.pb.h"
+#include "mediapipe/tasks/cc/components/containers/rect.h"
 #include "mediapipe/tasks/cc/core/base_task_api.h"
 #include "mediapipe/tasks/cc/core/task_runner.h"
+#include "mediapipe/tasks/cc/vision/core/image_processing_options.h"
 #include "mediapipe/tasks/cc/vision/core/running_mode.h"
 
 namespace mediapipe {
@@ -87,6 +92,60 @@ class BaseVisionTaskApi : public tasks::core::BaseTaskApi {
     return runner_->Send(std::move(inputs));
   }
 
+  // Convert from ImageProcessingOptions to NormalizedRect, performing sanity
+  // checks on-the-fly. If the input ImageProcessingOptions is not present,
+  // returns a default NormalizedRect covering the whole image with rotation set
+  // to 0. If 'roi_allowed' is false, an error will be returned if the input
+  // ImageProcessingOptions has its 'region_of_interest' field set.
+  static absl::StatusOr<mediapipe::NormalizedRect> ConvertToNormalizedRect(
+      std::optional<ImageProcessingOptions> options, bool roi_allowed = true) {
+    mediapipe::NormalizedRect normalized_rect;
+    normalized_rect.set_rotation(0);
+    normalized_rect.set_x_center(0.5);
+    normalized_rect.set_y_center(0.5);
+    normalized_rect.set_width(1.0);
+    normalized_rect.set_height(1.0);
+    if (!options.has_value()) {
+      return normalized_rect;
+    }
+
+    if (options->rotation_degrees % 90 != 0) {
+      return CreateStatusWithPayload(
+          absl::StatusCode::kInvalidArgument,
+          "Expected rotation to be a multiple of 90°.",
+          MediaPipeTasksStatus::kImageProcessingInvalidArgumentError);
+    }
+    // Convert to radians counter-clockwise.
+    normalized_rect.set_rotation(-options->rotation_degrees * M_PI / 180.0);
+
+    if (options->region_of_interest.has_value()) {
+      if (!roi_allowed) {
+        return CreateStatusWithPayload(
+            absl::StatusCode::kInvalidArgument,
+            "This task doesn't support region-of-interest.",
+            MediaPipeTasksStatus::kImageProcessingInvalidArgumentError);
+      }
+      auto& roi = *options->region_of_interest;
+      if (roi.left >= roi.right || roi.top >= roi.bottom) {
+        return CreateStatusWithPayload(
+            absl::StatusCode::kInvalidArgument,
+            "Expected Rect with left < right and top < bottom.",
+            MediaPipeTasksStatus::kImageProcessingInvalidArgumentError);
+      }
+      if (roi.left < 0 || roi.top < 0 || roi.right > 1 || roi.bottom > 1) {
+        return CreateStatusWithPayload(
+            absl::StatusCode::kInvalidArgument,
+            "Expected Rect values to be in [0,1].",
+            MediaPipeTasksStatus::kImageProcessingInvalidArgumentError);
+      }
+      normalized_rect.set_x_center((roi.left + roi.right) / 2.0);
+      normalized_rect.set_y_center((roi.top + roi.bottom) / 2.0);
+      normalized_rect.set_width(roi.right - roi.left);
+      normalized_rect.set_height(roi.bottom - roi.top);
+    }
+    return normalized_rect;
+  }
+
  private:
   RunningMode running_mode_;
 };
diff --git a/mediapipe/tasks/cc/vision/core/image_processing_options.h b/mediapipe/tasks/cc/vision/core/image_processing_options.h
new file mode 100644
index 000000000..7e764c1fe
--- /dev/null
+++ b/mediapipe/tasks/cc/vision/core/image_processing_options.h
@@ -0,0 +1,52 @@
+/* Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef MEDIAPIPE_TASKS_CC_VISION_CORE_IMAGE_PROCESSING_OPTIONS_H_
+#define MEDIAPIPE_TASKS_CC_VISION_CORE_IMAGE_PROCESSING_OPTIONS_H_
+
+#include <optional>
+
+#include "mediapipe/tasks/cc/components/containers/rect.h"
+
+namespace mediapipe {
+namespace tasks {
+namespace vision {
+namespace core {
+
+// Options for image processing.
+//
+// If both region-of-interest and rotation are specified, the crop around the
+// region-of-interest is extracted first, then the specified rotation is
+// applied to the crop.
+struct ImageProcessingOptions {
+  // The optional region-of-interest to crop from the image. If not specified,
+  // the full image is used.
+  //
+  // Coordinates must be in [0,1] with 'left' < 'right' and 'top' < 'bottom'.
+  std::optional<components::containers::Rect> region_of_interest = std::nullopt;
+
+  // The rotation to apply to the image (or cropped region-of-interest), in
+  // degrees clockwise.
+  //
+  // The rotation must be a multiple (positive or negative) of 90°.
+ int rotation_degrees = 0; +}; + +} // namespace core +} // namespace vision +} // namespace tasks +} // namespace mediapipe + +#endif // MEDIAPIPE_TASKS_CC_VISION_CORE_IMAGE_PROCESSING_OPTIONS_H_ diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD b/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD index e5b1f0479..a766c6b3f 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD @@ -137,6 +137,7 @@ cc_library( "//mediapipe/tasks/cc/core:utils", "//mediapipe/tasks/cc/core/proto:inference_subgraph_cc_proto", "//mediapipe/tasks/cc/vision/core:base_vision_task_api", + "//mediapipe/tasks/cc/vision/core:image_processing_options", "//mediapipe/tasks/cc/vision/core:running_mode", "//mediapipe/tasks/cc/vision/core:vision_task_api_factory", "//mediapipe/tasks/cc/vision/gesture_recognizer/proto:gesture_recognizer_graph_options_cc_proto", diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc index 333edb6fb..000a2e141 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mediapipe/tasks/cc/core/task_runner.h" #include "mediapipe/tasks/cc/core/utils.h" #include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/vision_task_api_factory.h" #include "mediapipe/tasks/cc/vision/gesture_recognizer/proto/gesture_recognizer_graph_options.pb.h" #include "mediapipe/tasks/cc/vision/gesture_recognizer/proto/hand_gesture_recognizer_graph_options.pb.h" @@ -76,31 +77,6 @@ constexpr char kHandWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kHandWorldLandmarksStreamName[] = "world_landmarks"; constexpr int kMicroSecondsPerMilliSecond = 1000; -// Returns a NormalizedRect filling the whole image. If input is present, its -// rotation is set in the returned NormalizedRect and a check is performed to -// make sure no region-of-interest was provided. Otherwise, rotation is set to -// 0. -absl::StatusOr FillNormalizedRect( - std::optional normalized_rect) { - NormalizedRect result; - if (normalized_rect.has_value()) { - result = *normalized_rect; - } - bool has_coordinates = result.has_x_center() || result.has_y_center() || - result.has_width() || result.has_height(); - if (has_coordinates) { - return CreateStatusWithPayload( - absl::StatusCode::kInvalidArgument, - "GestureRecognizer does not support region-of-interest.", - MediaPipeTasksStatus::kInvalidArgumentError); - } - result.set_x_center(0.5); - result.set_y_center(0.5); - result.set_width(1); - result.set_height(1); - return result; -} - // Creates a MediaPipe graph config that contains a subgraph node of // "mediapipe.tasks.vision.GestureRecognizerGraph". 
If the task is running // in the live stream mode, a "FlowLimiterCalculator" will be added to limit the @@ -248,15 +224,16 @@ absl::StatusOr> GestureRecognizer::Create( absl::StatusOr GestureRecognizer::Recognize( mediapipe::Image image, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, "GPU input images are currently not supported.", MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(NormalizedRect norm_rect, - FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options, /*roi_allowed=*/false)); ASSIGN_OR_RETURN( auto output_packets, ProcessImageData( @@ -283,15 +260,16 @@ absl::StatusOr GestureRecognizer::Recognize( absl::StatusOr GestureRecognizer::RecognizeForVideo( mediapipe::Image image, int64 timestamp_ms, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(NormalizedRect norm_rect, - FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options, /*roi_allowed=*/false)); ASSIGN_OR_RETURN( auto output_packets, ProcessVideoData( @@ -321,15 +299,16 @@ absl::StatusOr GestureRecognizer::RecognizeForVideo( absl::Status GestureRecognizer::RecognizeAsync( mediapipe::Image image, int64 timestamp_ms, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(NormalizedRect norm_rect, - FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options, /*roi_allowed=*/false)); return SendLiveStreamData( {{kImageInStreamName, MakePacket(std::move(image)) diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h index 750a99797..29c8bea7b 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h @@ -23,10 +23,10 @@ limitations under the License. #include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/image.h" #include "mediapipe/framework/formats/landmark.pb.h" -#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/tasks/cc/components/containers/gesture_recognition_result.h" #include "mediapipe/tasks/cc/core/base_options.h" #include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" namespace mediapipe { @@ -129,36 +129,36 @@ class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi { // Only use this method when the GestureRecognizer is created with the image // running mode. // - // image - mediapipe::Image - // Image to perform hand gesture recognition on. 
- // imageProcessingOptions - std::optional - // If provided, can be used to specify the rotation to apply to the image - // before performing classification, by setting its 'rotation' field in - // radians (e.g. 'M_PI / 2' for a 90° anti-clockwise rotation). Note that - // specifying a region-of-interest using the 'x_center', 'y_center', 'width' - // and 'height' fields is NOT supported and will result in an invalid - // argument error being returned. + // The optional 'image_processing_options' parameter can be used to specify + // the rotation to apply to the image before performing recognition, by + // setting its 'rotation_degrees' field. Note that specifying a + // region-of-interest using the 'region_of_interest' field is NOT supported + // and will result in an invalid argument error being returned. // // The image can be of any size with format RGB or RGBA. // TODO: Describes how the input image will be preprocessed // after the yuv support is implemented. - // TODO: use an ImageProcessingOptions struct instead of - // NormalizedRect. absl::StatusOr Recognize( Image image, - std::optional image_processing_options = + std::optional image_processing_options = std::nullopt); // Performs gesture recognition on the provided video frame. // Only use this method when the GestureRecognizer is created with the video // running mode. // + // The optional 'image_processing_options' parameter can be used to specify + // the rotation to apply to the image before performing recognition, by + // setting its 'rotation_degrees' field. Note that specifying a + // region-of-interest using the 'region_of_interest' field is NOT supported + // and will result in an invalid argument error being returned. + // // The image can be of any size with format RGB or RGBA. It's required to // provide the video frame's timestamp (in milliseconds). The input timestamps // must be monotonically increasing. absl::StatusOr RecognizeForVideo(Image image, int64 timestamp_ms, - std::optional + std::optional image_processing_options = std::nullopt); // Sends live image data to perform gesture recognition, and the results will @@ -171,6 +171,12 @@ class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi { // sent to the gesture recognizer. The input timestamps must be monotonically // increasing. // + // The optional 'image_processing_options' parameter can be used to specify + // the rotation to apply to the image before performing recognition, by + // setting its 'rotation_degrees' field. Note that specifying a + // region-of-interest using the 'region_of_interest' field is NOT supported + // and will result in an invalid argument error being returned. + // // The "result_callback" provides // - A vector of GestureRecognitionResult, each is the recognized results // for a input frame. @@ -180,7 +186,7 @@ class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi { // outside of the callback, callers need to make a copy of the image. // - The input timestamp in milliseconds. absl::Status RecognizeAsync(Image image, int64 timestamp_ms, - std::optional + std::optional image_processing_options = std::nullopt); // Shuts down the GestureRecognizer when all works are done. 
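Illustrative call site for the new GestureRecognizer signature (a sketch, not part of the patch; the recognizer and image are assumed to be created elsewhere, and GestureRecognizer is assumed to live directly in mediapipe::tasks::vision as the header above suggests):

```cpp
#include <utility>

#include "absl/status/status.h"
#include "mediapipe/framework/formats/image.h"
#include "mediapipe/tasks/cc/vision/core/image_processing_options.h"
#include "mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h"

// Sketch: recognize gestures on a frame that was captured rotated, using the
// new ImageProcessingOptions parameter. 'recognizer' and 'image' are assumed
// to come from GestureRecognizer::Create() and an image decoding utility.
absl::Status RecognizeWithRotation(
    mediapipe::tasks::vision::GestureRecognizer& recognizer,
    mediapipe::Image image) {
  mediapipe::tasks::vision::core::ImageProcessingOptions options;
  options.rotation_degrees = -90;  // 90° anti-clockwise; multiples of 90 only.
  // Note: region_of_interest must stay unset; this task converts the options
  // with roi_allowed=false and would return kInvalidArgument otherwise.
  auto result = recognizer.Recognize(std::move(image), options);
  if (!result.ok()) return result.status();
  // Consume *result (gestures, handedness, landmarks) here.
  return absl::OkStatus();
}
```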
diff --git a/mediapipe/tasks/cc/vision/image_classifier/BUILD b/mediapipe/tasks/cc/vision/image_classifier/BUILD index dfa77cb96..3d655cd50 100644 --- a/mediapipe/tasks/cc/vision/image_classifier/BUILD +++ b/mediapipe/tasks/cc/vision/image_classifier/BUILD @@ -59,6 +59,7 @@ cc_library( "//mediapipe/tasks/cc/core/proto:base_options_cc_proto", "//mediapipe/tasks/cc/core/proto:inference_subgraph_cc_proto", "//mediapipe/tasks/cc/vision/core:base_vision_task_api", + "//mediapipe/tasks/cc/vision/core:image_processing_options", "//mediapipe/tasks/cc/vision/core:running_mode", "//mediapipe/tasks/cc/vision/core:vision_task_api_factory", "//mediapipe/tasks/cc/vision/image_classifier/proto:image_classifier_graph_options_cc_proto", diff --git a/mediapipe/tasks/cc/vision/image_classifier/image_classifier.cc b/mediapipe/tasks/cc/vision/image_classifier/image_classifier.cc index f3dcdd07d..8a32758f4 100644 --- a/mediapipe/tasks/cc/vision/image_classifier/image_classifier.cc +++ b/mediapipe/tasks/cc/vision/image_classifier/image_classifier.cc @@ -34,6 +34,7 @@ limitations under the License. #include "mediapipe/tasks/cc/core/proto/inference_subgraph.pb.h" #include "mediapipe/tasks/cc/core/task_runner.h" #include "mediapipe/tasks/cc/core/utils.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" #include "mediapipe/tasks/cc/vision/core/vision_task_api_factory.h" #include "mediapipe/tasks/cc/vision/image_classifier/proto/image_classifier_graph_options.pb.h" @@ -59,26 +60,6 @@ constexpr int kMicroSecondsPerMilliSecond = 1000; using ::mediapipe::tasks::components::containers::proto::ClassificationResult; using ::mediapipe::tasks::core::PacketMap; -// Returns a NormalizedRect covering the full image if input is not present. -// Otherwise, makes sure the x_center, y_center, width and height are set in -// case only a rotation was provided in the input. -NormalizedRect FillNormalizedRect( - std::optional normalized_rect) { - NormalizedRect result; - if (normalized_rect.has_value()) { - result = *normalized_rect; - } - bool has_coordinates = result.has_x_center() || result.has_y_center() || - result.has_width() || result.has_height(); - if (!has_coordinates) { - result.set_x_center(0.5); - result.set_y_center(0.5); - result.set_width(1); - result.set_height(1); - } - return result; -} - // Creates a MediaPipe graph config that contains a subgraph node of // type "ImageClassifierGraph". 
If the task is running in the live stream mode, // a "FlowLimiterCalculator" will be added to limit the number of frames in @@ -164,14 +145,16 @@ absl::StatusOr> ImageClassifier::Create( } absl::StatusOr ImageClassifier::Classify( - Image image, std::optional image_processing_options) { + Image image, + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, "GPU input images are currently not supported.", MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - NormalizedRect norm_rect = FillNormalizedRect(image_processing_options); + ASSIGN_OR_RETURN(NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options)); ASSIGN_OR_RETURN( auto output_packets, ProcessImageData( @@ -183,14 +166,15 @@ absl::StatusOr ImageClassifier::Classify( absl::StatusOr ImageClassifier::ClassifyForVideo( Image image, int64 timestamp_ms, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, "GPU input images are currently not supported.", MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - NormalizedRect norm_rect = FillNormalizedRect(image_processing_options); + ASSIGN_OR_RETURN(NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options)); ASSIGN_OR_RETURN( auto output_packets, ProcessVideoData( @@ -206,14 +190,15 @@ absl::StatusOr ImageClassifier::ClassifyForVideo( absl::Status ImageClassifier::ClassifyAsync( Image image, int64 timestamp_ms, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, "GPU input images are currently not supported.", MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - NormalizedRect norm_rect = FillNormalizedRect(image_processing_options); + ASSIGN_OR_RETURN(NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options)); return SendLiveStreamData( {{kImageInStreamName, MakePacket(std::move(image)) diff --git a/mediapipe/tasks/cc/vision/image_classifier/image_classifier.h b/mediapipe/tasks/cc/vision/image_classifier/image_classifier.h index 5dff06cc7..de69b7994 100644 --- a/mediapipe/tasks/cc/vision/image_classifier/image_classifier.h +++ b/mediapipe/tasks/cc/vision/image_classifier/image_classifier.h @@ -22,11 +22,11 @@ limitations under the License. #include "absl/status/statusor.h" #include "mediapipe/framework/formats/image.h" -#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/tasks/cc/components/containers/proto/classifications.pb.h" #include "mediapipe/tasks/cc/components/processors/classifier_options.h" #include "mediapipe/tasks/cc/core/base_options.h" #include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" namespace mediapipe { @@ -109,12 +109,10 @@ class ImageClassifier : tasks::vision::core::BaseVisionTaskApi { // // The optional 'image_processing_options' parameter can be used to specify: // - the rotation to apply to the image before performing classification, by - // setting its 'rotation' field in radians (e.g. 'M_PI / 2' for a 90° - // anti-clockwise rotation). + // setting its 'rotation_degrees' field. 
// and/or // - the region-of-interest on which to perform classification, by setting its - // 'x_center', 'y_center', 'width' and 'height' fields. If none of these is - // set, they will automatically be set to cover the full image. + // 'region_of_interest' field. If not specified, the full image is used. // If both are specified, the crop around the region-of-interest is extracted // first, then the specified rotation is applied to the crop. // @@ -126,19 +124,17 @@ class ImageClassifier : tasks::vision::core::BaseVisionTaskApi { // YUVToImageCalculator is integrated. absl::StatusOr Classify( mediapipe::Image image, - std::optional image_processing_options = + std::optional image_processing_options = std::nullopt); // Performs image classification on the provided video frame. // // The optional 'image_processing_options' parameter can be used to specify: // - the rotation to apply to the image before performing classification, by - // setting its 'rotation' field in radians (e.g. 'M_PI / 2' for a 90° - // anti-clockwise rotation). + // setting its 'rotation_degrees' field. // and/or // - the region-of-interest on which to perform classification, by setting its - // 'x_center', 'y_center', 'width' and 'height' fields. If none of these is - // set, they will automatically be set to cover the full image. + // 'region_of_interest' field. If not specified, the full image is used. // If both are specified, the crop around the region-of-interest is extracted // first, then the specified rotation is applied to the crop. // @@ -150,7 +146,7 @@ class ImageClassifier : tasks::vision::core::BaseVisionTaskApi { // must be monotonically increasing. absl::StatusOr ClassifyForVideo(mediapipe::Image image, int64 timestamp_ms, - std::optional + std::optional image_processing_options = std::nullopt); // Sends live image data to image classification, and the results will be @@ -158,12 +154,10 @@ class ImageClassifier : tasks::vision::core::BaseVisionTaskApi { // // The optional 'image_processing_options' parameter can be used to specify: // - the rotation to apply to the image before performing classification, by - // setting its 'rotation' field in radians (e.g. 'M_PI / 2' for a 90° - // anti-clockwise rotation). + // setting its 'rotation_degrees' field. // and/or // - the region-of-interest on which to perform classification, by setting its - // 'x_center', 'y_center', 'width' and 'height' fields. If none of these is - // set, they will automatically be set to cover the full image. + // 'region_of_interest' field. If not specified, the full image is used. // If both are specified, the crop around the region-of-interest is extracted // first, then the specified rotation is applied to the crop. // @@ -175,7 +169,7 @@ class ImageClassifier : tasks::vision::core::BaseVisionTaskApi { // sent to the object detector. The input timestamps must be monotonically // increasing. // - // The "result_callback" prvoides + // The "result_callback" provides: // - The classification results as a ClassificationResult object. // - The const reference to the corresponding input image that the image // classifier runs on. Note that the const reference to the image will no @@ -183,12 +177,9 @@ class ImageClassifier : tasks::vision::core::BaseVisionTaskApi { // outside of the callback, callers need to make a copy of the image. // - The input timestamp in milliseconds. 
absl::Status ClassifyAsync(mediapipe::Image image, int64 timestamp_ms, - std::optional + std::optional image_processing_options = std::nullopt); - // TODO: add Classify() variants taking a region of interest as - // additional argument. - // Shuts down the ImageClassifier when all works are done. absl::Status Close() { return runner_->Close(); } }; diff --git a/mediapipe/tasks/cc/vision/image_classifier/image_classifier_test.cc b/mediapipe/tasks/cc/vision/image_classifier/image_classifier_test.cc index 55830e520..0c45122c0 100644 --- a/mediapipe/tasks/cc/vision/image_classifier/image_classifier_test.cc +++ b/mediapipe/tasks/cc/vision/image_classifier/image_classifier_test.cc @@ -27,7 +27,6 @@ limitations under the License. #include "absl/strings/str_format.h" #include "mediapipe/framework/deps/file_path.h" #include "mediapipe/framework/formats/image.h" -#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/framework/port/gmock.h" #include "mediapipe/framework/port/gtest.h" #include "mediapipe/framework/port/parse_text_proto.h" @@ -35,6 +34,8 @@ limitations under the License. #include "mediapipe/tasks/cc/common.h" #include "mediapipe/tasks/cc/components/containers/proto/category.pb.h" #include "mediapipe/tasks/cc/components/containers/proto/classifications.pb.h" +#include "mediapipe/tasks/cc/components/containers/rect.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" #include "mediapipe/tasks/cc/vision/utils/image_utils.h" #include "tensorflow/lite/core/api/op_resolver.h" @@ -49,9 +50,11 @@ namespace image_classifier { namespace { using ::mediapipe::file::JoinPath; +using ::mediapipe::tasks::components::containers::Rect; using ::mediapipe::tasks::components::containers::proto::ClassificationEntry; using ::mediapipe::tasks::components::containers::proto::ClassificationResult; using ::mediapipe::tasks::components::containers::proto::Classifications; +using ::mediapipe::tasks::vision::core::ImageProcessingOptions; using ::testing::HasSubstr; using ::testing::Optional; @@ -547,12 +550,9 @@ TEST_F(ImageModeTest, SucceedsWithRegionOfInterest) { options->classifier_options.max_results = 1; MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr image_classifier, ImageClassifier::Create(std::move(options))); - // Crop around the soccer ball. - NormalizedRect image_processing_options; - image_processing_options.set_x_center(0.532); - image_processing_options.set_y_center(0.521); - image_processing_options.set_width(0.164); - image_processing_options.set_height(0.427); + // Region-of-interest around the soccer ball. + Rect roi{/*left=*/0.45, /*top=*/0.3075, /*right=*/0.614, /*bottom=*/0.7345}; + ImageProcessingOptions image_processing_options{roi, /*rotation_degrees=*/0}; MP_ASSERT_OK_AND_ASSIGN(auto results, image_classifier->Classify( image, image_processing_options)); @@ -572,8 +572,8 @@ TEST_F(ImageModeTest, SucceedsWithRotation) { ImageClassifier::Create(std::move(options))); // Specify a 90° anti-clockwise rotation. 
- NormalizedRect image_processing_options; - image_processing_options.set_rotation(M_PI / 2.0); + ImageProcessingOptions image_processing_options; + image_processing_options.rotation_degrees = -90; MP_ASSERT_OK_AND_ASSIGN(auto results, image_classifier->Classify( image, image_processing_options)); @@ -616,13 +616,10 @@ TEST_F(ImageModeTest, SucceedsWithRegionOfInterestAndRotation) { options->classifier_options.max_results = 1; MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr image_classifier, ImageClassifier::Create(std::move(options))); - // Crop around the chair, with 90° anti-clockwise rotation. - NormalizedRect image_processing_options; - image_processing_options.set_x_center(0.2821); - image_processing_options.set_y_center(0.2406); - image_processing_options.set_width(0.5642); - image_processing_options.set_height(0.1286); - image_processing_options.set_rotation(M_PI / 2.0); + // Region-of-interest around the chair, with 90° anti-clockwise rotation. + Rect roi{/*left=*/0.006, /*top=*/0.1763, /*right=*/0.5702, /*bottom=*/0.3049}; + ImageProcessingOptions image_processing_options{roi, + /*rotation_degrees=*/-90}; MP_ASSERT_OK_AND_ASSIGN(auto results, image_classifier->Classify( image, image_processing_options)); @@ -633,7 +630,7 @@ TEST_F(ImageModeTest, SucceedsWithRegionOfInterestAndRotation) { entries { categories { index: 560 - score: 0.6800408 + score: 0.6522213 category_name: "folding chair" } timestamp_ms: 0 @@ -643,6 +640,69 @@ TEST_F(ImageModeTest, SucceedsWithRegionOfInterestAndRotation) { })pb")); } +// Testing all these once with ImageClassifier. +TEST_F(ImageModeTest, FailsWithInvalidImageProcessingOptions) { + MP_ASSERT_OK_AND_ASSIGN(Image image, + DecodeImageFromFile(JoinPath("./", kTestDataDirectory, + "multi_objects.jpg"))); + auto options = std::make_unique(); + options->base_options.model_asset_path = + JoinPath("./", kTestDataDirectory, kMobileNetFloatWithMetadata); + MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr image_classifier, + ImageClassifier::Create(std::move(options))); + + // Invalid: left > right. + Rect roi{/*left=*/0.9, /*top=*/0, /*right=*/0.1, /*bottom=*/1}; + ImageProcessingOptions image_processing_options{roi, + /*rotation_degrees=*/0}; + auto results = image_classifier->Classify(image, image_processing_options); + EXPECT_EQ(results.status().code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(results.status().message(), + HasSubstr("Expected Rect with left < right and top < bottom")); + EXPECT_THAT( + results.status().GetPayload(kMediaPipeTasksPayload), + Optional(absl::Cord(absl::StrCat( + MediaPipeTasksStatus::kImageProcessingInvalidArgumentError)))); + + // Invalid: top > bottom. + roi = {/*left=*/0, /*top=*/0.9, /*right=*/1, /*bottom=*/0.1}; + image_processing_options = {roi, + /*rotation_degrees=*/0}; + results = image_classifier->Classify(image, image_processing_options); + EXPECT_EQ(results.status().code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(results.status().message(), + HasSubstr("Expected Rect with left < right and top < bottom")); + EXPECT_THAT( + results.status().GetPayload(kMediaPipeTasksPayload), + Optional(absl::Cord(absl::StrCat( + MediaPipeTasksStatus::kImageProcessingInvalidArgumentError)))); + + // Invalid: coordinates out of [0,1] range. 
+ roi = {/*left=*/-0.1, /*top=*/0, /*right=*/1, /*bottom=*/1}; + image_processing_options = {roi, + /*rotation_degrees=*/0}; + results = image_classifier->Classify(image, image_processing_options); + EXPECT_EQ(results.status().code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(results.status().message(), + HasSubstr("Expected Rect values to be in [0,1]")); + EXPECT_THAT( + results.status().GetPayload(kMediaPipeTasksPayload), + Optional(absl::Cord(absl::StrCat( + MediaPipeTasksStatus::kImageProcessingInvalidArgumentError)))); + + // Invalid: rotation not a multiple of 90°. + image_processing_options = {/*region_of_interest=*/std::nullopt, + /*rotation_degrees=*/1}; + results = image_classifier->Classify(image, image_processing_options); + EXPECT_EQ(results.status().code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(results.status().message(), + HasSubstr("Expected rotation to be a multiple of 90°")); + EXPECT_THAT( + results.status().GetPayload(kMediaPipeTasksPayload), + Optional(absl::Cord(absl::StrCat( + MediaPipeTasksStatus::kImageProcessingInvalidArgumentError)))); +} + class VideoModeTest : public tflite_shims::testing::Test {}; TEST_F(VideoModeTest, FailsWithCallingWrongMethod) { @@ -732,11 +792,9 @@ TEST_F(VideoModeTest, SucceedsWithRegionOfInterest) { MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr image_classifier, ImageClassifier::Create(std::move(options))); // Crop around the soccer ball. - NormalizedRect image_processing_options; - image_processing_options.set_x_center(0.532); - image_processing_options.set_y_center(0.521); - image_processing_options.set_width(0.164); - image_processing_options.set_height(0.427); + // Region-of-interest around the soccer ball. + Rect roi{/*left=*/0.45, /*top=*/0.3075, /*right=*/0.614, /*bottom=*/0.7345}; + ImageProcessingOptions image_processing_options{roi, /*rotation_degrees=*/0}; for (int i = 0; i < iterations; ++i) { MP_ASSERT_OK_AND_ASSIGN( @@ -877,11 +935,8 @@ TEST_F(LiveStreamModeTest, SucceedsWithRegionOfInterest) { MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr image_classifier, ImageClassifier::Create(std::move(options))); // Crop around the soccer ball. 
- NormalizedRect image_processing_options; - image_processing_options.set_x_center(0.532); - image_processing_options.set_y_center(0.521); - image_processing_options.set_width(0.164); - image_processing_options.set_height(0.427); + Rect roi{/*left=*/0.45, /*top=*/0.3075, /*right=*/0.614, /*bottom=*/0.7345}; + ImageProcessingOptions image_processing_options{roi, /*rotation_degrees=*/0}; for (int i = 0; i < iterations; ++i) { MP_ASSERT_OK( diff --git a/mediapipe/tasks/cc/vision/object_detector/BUILD b/mediapipe/tasks/cc/vision/object_detector/BUILD index 186909509..8220d8b7f 100644 --- a/mediapipe/tasks/cc/vision/object_detector/BUILD +++ b/mediapipe/tasks/cc/vision/object_detector/BUILD @@ -75,6 +75,7 @@ cc_library( "//mediapipe/tasks/cc/core/proto:base_options_cc_proto", "//mediapipe/tasks/cc/core/proto:inference_subgraph_cc_proto", "//mediapipe/tasks/cc/vision/core:base_vision_task_api", + "//mediapipe/tasks/cc/vision/core:image_processing_options", "//mediapipe/tasks/cc/vision/core:running_mode", "//mediapipe/tasks/cc/vision/core:vision_task_api_factory", "//mediapipe/tasks/cc/vision/object_detector/proto:object_detector_options_cc_proto", diff --git a/mediapipe/tasks/cc/vision/object_detector/object_detector.cc b/mediapipe/tasks/cc/vision/object_detector/object_detector.cc index 9149a3cbe..dd19237ff 100644 --- a/mediapipe/tasks/cc/vision/object_detector/object_detector.cc +++ b/mediapipe/tasks/cc/vision/object_detector/object_detector.cc @@ -34,6 +34,7 @@ limitations under the License. #include "mediapipe/tasks/cc/core/proto/base_options.pb.h" #include "mediapipe/tasks/cc/core/proto/inference_subgraph.pb.h" #include "mediapipe/tasks/cc/core/utils.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" #include "mediapipe/tasks/cc/vision/core/vision_task_api_factory.h" #include "mediapipe/tasks/cc/vision/object_detector/proto/object_detector_options.pb.h" @@ -58,31 +59,6 @@ constexpr int kMicroSecondsPerMilliSecond = 1000; using ObjectDetectorOptionsProto = object_detector::proto::ObjectDetectorOptions; -// Returns a NormalizedRect filling the whole image. If input is present, its -// rotation is set in the returned NormalizedRect and a check is performed to -// make sure no region-of-interest was provided. Otherwise, rotation is set to -// 0. -absl::StatusOr FillNormalizedRect( - std::optional normalized_rect) { - NormalizedRect result; - if (normalized_rect.has_value()) { - result = *normalized_rect; - } - bool has_coordinates = result.has_x_center() || result.has_y_center() || - result.has_width() || result.has_height(); - if (has_coordinates) { - return CreateStatusWithPayload( - absl::StatusCode::kInvalidArgument, - "ObjectDetector does not support region-of-interest.", - MediaPipeTasksStatus::kInvalidArgumentError); - } - result.set_x_center(0.5); - result.set_y_center(0.5); - result.set_width(1); - result.set_height(1); - return result; -} - // Creates a MediaPipe graph config that contains a subgraph node of // "mediapipe.tasks.vision.ObjectDetectorGraph". 
If the task is running in the // live stream mode, a "FlowLimiterCalculator" will be added to limit the @@ -170,15 +146,16 @@ absl::StatusOr> ObjectDetector::Create( absl::StatusOr> ObjectDetector::Detect( mediapipe::Image image, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(NormalizedRect norm_rect, - FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options, /*roi_allowed=*/false)); ASSIGN_OR_RETURN( auto output_packets, ProcessImageData( @@ -189,15 +166,16 @@ absl::StatusOr> ObjectDetector::Detect( absl::StatusOr> ObjectDetector::DetectForVideo( mediapipe::Image image, int64 timestamp_ms, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(NormalizedRect norm_rect, - FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options, /*roi_allowed=*/false)); ASSIGN_OR_RETURN( auto output_packets, ProcessVideoData( @@ -212,15 +190,16 @@ absl::StatusOr> ObjectDetector::DetectForVideo( absl::Status ObjectDetector::DetectAsync( Image image, int64 timestamp_ms, - std::optional image_processing_options) { + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(NormalizedRect norm_rect, - FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + NormalizedRect norm_rect, + ConvertToNormalizedRect(image_processing_options, /*roi_allowed=*/false)); return SendLiveStreamData( {{kImageInStreamName, MakePacket(std::move(image)) diff --git a/mediapipe/tasks/cc/vision/object_detector/object_detector.h b/mediapipe/tasks/cc/vision/object_detector/object_detector.h index 2e5ed7b8d..44ce68ed9 100644 --- a/mediapipe/tasks/cc/vision/object_detector/object_detector.h +++ b/mediapipe/tasks/cc/vision/object_detector/object_detector.h @@ -27,9 +27,9 @@ limitations under the License. #include "absl/status/statusor.h" #include "mediapipe/framework/formats/detection.pb.h" #include "mediapipe/framework/formats/image.h" -#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/tasks/cc/core/base_options.h" #include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" namespace mediapipe { @@ -154,10 +154,9 @@ class ObjectDetector : tasks::vision::core::BaseVisionTaskApi { // after the yuv support is implemented. // // The optional 'image_processing_options' parameter can be used to specify - // the rotation to apply to the image before performing classification, by - // setting its 'rotation' field in radians (e.g. 'M_PI / 2' for a 90° - // anti-clockwise rotation). 
Note that specifying a region-of-interest using - // the 'x_center', 'y_center', 'width' and 'height' fields is NOT supported + // the rotation to apply to the image before performing detection, by + // setting its 'rotation_degrees' field. Note that specifying a + // region-of-interest using the 'region_of_interest' field is NOT supported // and will result in an invalid argument error being returned. // // For CPU images, the returned bounding boxes are expressed in the @@ -168,7 +167,7 @@ class ObjectDetector : tasks::vision::core::BaseVisionTaskApi { // images after enabling the gpu support in MediaPipe Tasks. absl::StatusOr> Detect( mediapipe::Image image, - std::optional image_processing_options = + std::optional image_processing_options = std::nullopt); // Performs object detection on the provided video frame. @@ -180,10 +179,9 @@ class ObjectDetector : tasks::vision::core::BaseVisionTaskApi { // must be monotonically increasing. // // The optional 'image_processing_options' parameter can be used to specify - // the rotation to apply to the image before performing classification, by - // setting its 'rotation' field in radians (e.g. 'M_PI / 2' for a 90° - // anti-clockwise rotation). Note that specifying a region-of-interest using - // the 'x_center', 'y_center', 'width' and 'height' fields is NOT supported + // the rotation to apply to the image before performing detection, by + // setting its 'rotation_degrees' field. Note that specifying a + // region-of-interest using the 'region_of_interest' field is NOT supported // and will result in an invalid argument error being returned. // // For CPU images, the returned bounding boxes are expressed in the @@ -192,7 +190,7 @@ class ObjectDetector : tasks::vision::core::BaseVisionTaskApi { // underlying image data. absl::StatusOr> DetectForVideo( mediapipe::Image image, int64 timestamp_ms, - std::optional image_processing_options = + std::optional image_processing_options = std::nullopt); // Sends live image data to perform object detection, and the results will be @@ -206,10 +204,9 @@ class ObjectDetector : tasks::vision::core::BaseVisionTaskApi { // increasing. // // The optional 'image_processing_options' parameter can be used to specify - // the rotation to apply to the image before performing classification, by - // setting its 'rotation' field in radians (e.g. 'M_PI / 2' for a 90° - // anti-clockwise rotation). Note that specifying a region-of-interest using - // the 'x_center', 'y_center', 'width' and 'height' fields is NOT supported + // the rotation to apply to the image before performing detection, by + // setting its 'rotation_degrees' field. Note that specifying a + // region-of-interest using the 'region_of_interest' field is NOT supported // and will result in an invalid argument error being returned. // // The "result_callback" provides @@ -223,7 +220,7 @@ class ObjectDetector : tasks::vision::core::BaseVisionTaskApi { // outside of the callback, callers need to make a copy of the image. // - The input timestamp in milliseconds. absl::Status DetectAsync(mediapipe::Image image, int64 timestamp_ms, - std::optional + std::optional image_processing_options = std::nullopt); // Shuts down the ObjectDetector when all works are done. 
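Illustrative call site for the new ObjectDetector signature (a sketch, not part of the patch; detector and image creation are assumed to happen elsewhere, and Detect() is assumed to keep returning mediapipe::Detection protos, as the header's detection.pb.h include suggests):

```cpp
#include <utility>
#include <vector>

#include "absl/status/statusor.h"
#include "mediapipe/framework/formats/detection.pb.h"
#include "mediapipe/framework/formats/image.h"
#include "mediapipe/tasks/cc/vision/core/image_processing_options.h"
#include "mediapipe/tasks/cc/vision/object_detector/object_detector.h"

// Sketch: detect objects on a frame needing a 90° anti-clockwise correction.
// 'detector' and 'image' are assumed to be created elsewhere (e.g. via
// ObjectDetector::Create() and an image decoding utility).
absl::StatusOr<std::vector<mediapipe::Detection>> DetectWithRotation(
    mediapipe::tasks::vision::ObjectDetector& detector,
    mediapipe::Image image) {
  mediapipe::tasks::vision::core::ImageProcessingOptions options;
  options.rotation_degrees = -90;  // Must be a multiple of 90°.
  // Setting options.region_of_interest would make Detect() fail with
  // kInvalidArgument, since this task converts with roi_allowed=false.
  return detector.Detect(std::move(image), options);
}
```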
diff --git a/mediapipe/tasks/cc/vision/object_detector/object_detector_test.cc b/mediapipe/tasks/cc/vision/object_detector/object_detector_test.cc index 8db3fa767..1747685dd 100644 --- a/mediapipe/tasks/cc/vision/object_detector/object_detector_test.cc +++ b/mediapipe/tasks/cc/vision/object_detector/object_detector_test.cc @@ -31,11 +31,12 @@ limitations under the License. #include "mediapipe/framework/deps/file_path.h" #include "mediapipe/framework/formats/image.h" #include "mediapipe/framework/formats/location_data.pb.h" -#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/framework/port/gmock.h" #include "mediapipe/framework/port/gtest.h" #include "mediapipe/framework/port/parse_text_proto.h" #include "mediapipe/framework/port/status_matchers.h" +#include "mediapipe/tasks/cc/components/containers/rect.h" +#include "mediapipe/tasks/cc/vision/core/image_processing_options.h" #include "mediapipe/tasks/cc/vision/core/running_mode.h" #include "mediapipe/tasks/cc/vision/utils/image_utils.h" #include "tensorflow/lite/c/common.h" @@ -64,6 +65,8 @@ namespace vision { namespace { using ::mediapipe::file::JoinPath; +using ::mediapipe::tasks::components::containers::Rect; +using ::mediapipe::tasks::vision::core::ImageProcessingOptions; using ::testing::HasSubstr; using ::testing::Optional; @@ -532,8 +535,8 @@ TEST_F(ImageModeTest, SucceedsWithRotation) { JoinPath("./", kTestDataDirectory, kMobileSsdWithMetadata); MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr object_detector, ObjectDetector::Create(std::move(options))); - NormalizedRect image_processing_options; - image_processing_options.set_rotation(M_PI / 2.0); + ImageProcessingOptions image_processing_options; + image_processing_options.rotation_degrees = -90; MP_ASSERT_OK_AND_ASSIGN( auto results, object_detector->Detect(image, image_processing_options)); MP_ASSERT_OK(object_detector->Close()); @@ -557,16 +560,17 @@ TEST_F(ImageModeTest, FailsWithRegionOfInterest) { JoinPath("./", kTestDataDirectory, kMobileSsdWithMetadata); MP_ASSERT_OK_AND_ASSIGN(std::unique_ptr object_detector, ObjectDetector::Create(std::move(options))); - NormalizedRect image_processing_options; - image_processing_options.set_x_center(0.5); - image_processing_options.set_y_center(0.5); - image_processing_options.set_width(1.0); - image_processing_options.set_height(1.0); + Rect roi{/*left=*/0.1, /*top=*/0, /*right=*/0.9, /*bottom=*/1}; + ImageProcessingOptions image_processing_options{roi, /*rotation_degrees=*/0}; auto results = object_detector->Detect(image, image_processing_options); EXPECT_EQ(results.status().code(), absl::StatusCode::kInvalidArgument); EXPECT_THAT(results.status().message(), - HasSubstr("ObjectDetector does not support region-of-interest")); + HasSubstr("This task doesn't support region-of-interest")); + EXPECT_THAT( + results.status().GetPayload(kMediaPipeTasksPayload), + Optional(absl::Cord(absl::StrCat( + MediaPipeTasksStatus::kImageProcessingInvalidArgumentError)))); } class VideoModeTest : public tflite_shims::testing::Test {};
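For reference, the ROI values in the updated tests are the old NormalizedRect values re-expressed in corner form. The standalone sketch below (not MediaPipe code) mirrors the arithmetic ConvertToNormalizedRect() applies, using the soccer-ball ROI and the rotation from the tests above:

```cpp
#include <math.h>
#include <stdio.h>

// Sketch of the mapping ConvertToNormalizedRect() performs. The computed
// values match the NormalizedRect the pre-change tests set explicitly.
int main() {
  const float left = 0.45f, top = 0.3075f, right = 0.614f, bottom = 0.7345f;

  const float x_center = (left + right) / 2;  // 0.532
  const float y_center = (top + bottom) / 2;  // 0.521
  const float width = right - left;           // 0.164
  const float height = bottom - top;          // 0.427

  // The rotation tests pass rotation_degrees = -90, which maps to
  // -(-90) * M_PI / 180 = +M_PI / 2, i.e. the old set_rotation(M_PI / 2.0).
  const int rotation_degrees = -90;
  const double rotation = -rotation_degrees * M_PI / 180.0;

  printf("center=(%.3f, %.3f) size=(%.3f, %.3f) rotation=%.4f rad\n", x_center,
         y_center, width, height, rotation);
  return 0;
}
```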