From 4b5df1cb9610e9707184a857b9e5f4e25d73ec0a Mon Sep 17 00:00:00 2001 From: MediaPipe Team Date: Thu, 20 Oct 2022 10:40:56 -0700 Subject: [PATCH] Add support for rotations in GestureRecognizer C++ API. PiperOrigin-RevId: 482533599 --- .../tasks/cc/vision/gesture_recognizer/BUILD | 3 + .../gesture_recognizer/calculators/BUILD | 2 + .../landmarks_to_matrix_calculator.cc | 32 +++ .../landmarks_to_matrix_calculator_test.cc | 44 +++- .../gesture_recognizer/gesture_recognizer.cc | 67 +++++- .../gesture_recognizer/gesture_recognizer.h | 30 ++- .../gesture_recognizer_graph.cc | 16 +- .../hand_gesture_recognizer_graph.cc | 36 ++- mediapipe/tasks/cc/vision/hand_detector/BUILD | 2 +- .../hand_detector/hand_detector_graph.cc | 37 +-- .../hand_detector/hand_detector_graph_test.cc | 34 ++- .../hand_landmarker/hand_landmarker_graph.cc | 21 +- .../hand_landmarker_graph_test.cc | 55 ++++- .../gesturerecognizer/GestureRecognizer.java | 22 +- mediapipe/tasks/testdata/vision/BUILD | 13 + ..._left_down_hand_rotated_landmarks.prototxt | 84 +++++++ ...ed_left_up_hand_rotated_landmarks.prototxt | 84 +++++++ ...and_detector_result_one_hand_rotated.pbtxt | 33 +++ .../pointing_up_rotated_landmarks.pbtxt | 223 ++++++++++++++++++ .../vision/thumb_up_rotated_landmarks.pbtxt | 223 ++++++++++++++++++ third_party/external_files.bzl | 50 +++- 21 files changed, 1048 insertions(+), 63 deletions(-) create mode 100644 mediapipe/tasks/testdata/vision/expected_left_down_hand_rotated_landmarks.prototxt create mode 100644 mediapipe/tasks/testdata/vision/expected_left_up_hand_rotated_landmarks.prototxt create mode 100644 mediapipe/tasks/testdata/vision/hand_detector_result_one_hand_rotated.pbtxt create mode 100644 mediapipe/tasks/testdata/vision/pointing_up_rotated_landmarks.pbtxt create mode 100644 mediapipe/tasks/testdata/vision/thumb_up_rotated_landmarks.pbtxt diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD b/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD index 985c25cfb..e5b1f0479 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/BUILD @@ -56,6 +56,7 @@ cc_library( "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:landmark_cc_proto", "//mediapipe/framework/formats:matrix", + "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/framework/formats:tensor", "//mediapipe/tasks/cc:common", "//mediapipe/tasks/cc/components:image_preprocessing", @@ -91,6 +92,7 @@ cc_library( "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:image", "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/tasks/cc:common", "//mediapipe/tasks/cc/components/processors/proto:classifier_options_cc_proto", "//mediapipe/tasks/cc/core:model_task_graph", @@ -123,6 +125,7 @@ cc_library( "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:image", "//mediapipe/framework/formats:landmark_cc_proto", + "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/tasks/cc:common", "//mediapipe/tasks/cc/components:image_preprocessing", "//mediapipe/tasks/cc/components/containers:gesture_recognition_result", diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/BUILD b/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/BUILD index a6de4f950..08f7f45d0 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/BUILD +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/BUILD 
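At a glance, this change lets C++ callers pass an optional NormalizedRect whose 'rotation' field (radians, anti-clockwise) is applied before recognition. A minimal calling sketch follows; the model asset path and the option field names are illustrative assumptions based on the surrounding headers, not part of this patch:

#include <cmath>
#include <memory>
#include <utility>

#include "mediapipe/framework/formats/image.h"
#include "mediapipe/framework/formats/rect.pb.h"
#include "mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h"

// Sketch only. Error handling is trimmed and the asset path is a placeholder.
absl::Status RecognizeWithRotation(mediapipe::Image image) {
  using ::mediapipe::tasks::vision::gesture_recognizer::GestureRecognizer;
  using ::mediapipe::tasks::vision::gesture_recognizer::
      GestureRecognizerOptions;

  auto options = std::make_unique<GestureRecognizerOptions>();
  options->base_options.model_asset_path = "gesture_recognizer.task";
  ASSIGN_OR_RETURN(auto recognizer,
                   GestureRecognizer::Create(std::move(options)));

  // Radians, anti-clockwise; only 'rotation' may be set. Populating
  // x_center/y_center/width/height (a region of interest) makes the task
  // return kInvalidArgument, per the FillNormalizedRect helper added below.
  mediapipe::NormalizedRect image_processing_options;
  image_processing_options.set_rotation(M_PI / 2);

  ASSIGN_OR_RETURN(auto result,
                   recognizer->Recognize(image, image_processing_options));
  return recognizer->Close();
}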
@@ -69,6 +69,7 @@ cc_library( "//mediapipe/framework:calculator_framework", "//mediapipe/framework/formats:landmark_cc_proto", "//mediapipe/framework/formats:matrix", + "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/framework/port:ret_check", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -86,6 +87,7 @@ cc_test( "//mediapipe/framework:calculator_runner", "//mediapipe/framework/formats:landmark_cc_proto", "//mediapipe/framework/formats:matrix", + "//mediapipe/framework/formats:rect_cc_proto", "//mediapipe/framework/port:gtest_main", "//mediapipe/framework/port:parse_text_proto", "@com_google_absl//absl/strings", diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator.cc index b70689eaf..277bb170a 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator.cc +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include #include @@ -26,6 +27,7 @@ limitations under the License. #include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/formats/landmark.pb.h" #include "mediapipe/framework/formats/matrix.h" +#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/framework/port/ret_check.h" #include "mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator.pb.h" @@ -38,6 +40,7 @@ namespace { constexpr char kLandmarksTag[] = "LANDMARKS"; constexpr char kWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kImageSizeTag[] = "IMAGE_SIZE"; +constexpr char kNormRectTag[] = "NORM_RECT"; constexpr char kLandmarksMatrixTag[] = "LANDMARKS_MATRIX"; constexpr int kFeaturesPerLandmark = 3; @@ -62,6 +65,25 @@ absl::StatusOr NormalizeLandmarkAspectRatio( return normalized_landmarks; } +template +absl::StatusOr RotateLandmarks(const LandmarkListT& landmarks, + float rotation) { + float cos = std::cos(rotation); + // Negate because Y-axis points down and not up. + float sin = std::sin(-rotation); + LandmarkListT rotated_landmarks; + for (int i = 0; i < landmarks.landmark_size(); ++i) { + const auto& old_landmark = landmarks.landmark(i); + float x = old_landmark.x() - 0.5; + float y = old_landmark.y() - 0.5; + auto* new_landmark = rotated_landmarks.add_landmark(); + new_landmark->set_x(x * cos - y * sin + 0.5); + new_landmark->set_y(y * cos + x * sin + 0.5); + new_landmark->set_z(old_landmark.z()); + } + return rotated_landmarks; +} + template absl::StatusOr NormalizeObject(const LandmarkListT& landmarks, int origin_offset) { @@ -134,6 +156,13 @@ absl::Status ProcessLandmarks(LandmarkListT landmarks, CalculatorContext* cc) { NormalizeLandmarkAspectRatio(landmarks, width, height)); } + if (cc->Inputs().HasTag(kNormRectTag)) { + RET_CHECK(!cc->Inputs().Tag(kNormRectTag).IsEmpty()); + const auto rotation = + cc->Inputs().Tag(kNormRectTag).Get().rotation(); + ASSIGN_OR_RETURN(landmarks, RotateLandmarks(landmarks, rotation)); + } + const auto& options = cc->Options(); if (options.object_normalization()) { ASSIGN_OR_RETURN( @@ -163,6 +192,8 @@ absl::Status ProcessLandmarks(LandmarkListT landmarks, CalculatorContext* cc) { // WORLD_LANDMARKS - World 3d landmarks of one object. Use *either* // LANDMARKS or WORLD_LANDMARKS. 
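The RotateLandmarks helper above rotates normalized landmarks about the image center (0.5, 0.5), negating the angle because the y-axis of normalized image coordinates points down. A standalone sanity check of the same formula, not part of the patch:

#include <cassert>
#include <cmath>
#include <utility>

// Same math as RotateLandmarks, for a single normalized (x, y) point.
std::pair<float, float> RotatePoint(float x, float y, float rotation) {
  const float cos = std::cos(rotation);
  const float sin = std::sin(-rotation);  // Y-axis points down, not up.
  x -= 0.5f;
  y -= 0.5f;
  return {x * cos - y * sin + 0.5f, y * cos + x * sin + 0.5f};
}

int main() {
  // A 90° anti-clockwise rotation maps the right-edge midpoint (1.0, 0.5)
  // to the top-edge midpoint (0.5, 0.0), consistent with the new
  // TestWithRotation cases, which feed rotation = M_PI / 2 through NORM_RECT.
  auto [x, y] = RotatePoint(1.0f, 0.5f, M_PI / 2);
  assert(std::abs(x - 0.5f) < 1e-6f && std::abs(y) < 1e-6f);
  return 0;
}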
// IMAGE_SIZE - (width, height) of the image +// NORM_RECT - Optional NormalizedRect object whose 'rotation' field is used +// to rotate the landmarks. // Output: // LANDMARKS_MATRIX - Matrix for the landmarks. // @@ -185,6 +216,7 @@ class LandmarksToMatrixCalculator : public CalculatorBase { cc->Inputs().Tag(kLandmarksTag).Set().Optional(); cc->Inputs().Tag(kWorldLandmarksTag).Set().Optional(); cc->Inputs().Tag(kImageSizeTag).Set>().Optional(); + cc->Inputs().Tag(kNormRectTag).Set().Optional(); cc->Outputs().Tag(kLandmarksMatrixTag).Set(); return absl::OkStatus(); } diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator_test.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator_test.cc index 8a68d8dae..fe6f1162b 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator_test.cc +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/calculators/landmarks_to_matrix_calculator_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include @@ -23,6 +24,7 @@ limitations under the License. #include "mediapipe/framework/calculator_runner.h" #include "mediapipe/framework/formats/landmark.pb.h" #include "mediapipe/framework/formats/matrix.h" +#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/framework/port/gtest.h" #include "mediapipe/framework/port/parse_text_proto.h" #include "mediapipe/framework/port/status_matchers.h" @@ -35,6 +37,7 @@ constexpr char kLandmarksTag[] = "LANDMARKS"; constexpr char kWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kImageSizeTag[] = "IMAGE_SIZE"; constexpr char kLandmarksMatrixTag[] = "LANDMARKS_MATRIX"; +constexpr char kNormRectTag[] = "NORM_RECT"; template LandmarkListT BuildPseudoLandmarks(int num_landmarks, int offset = 0) { @@ -54,6 +57,7 @@ struct Landmarks2dToMatrixCalculatorTestCase { int object_normalization_origin_offset = -1; float expected_cell_0_2; float expected_cell_1_5; + float rotation; }; using Landmarks2dToMatrixCalculatorTest = @@ -68,6 +72,7 @@ TEST_P(Landmarks2dToMatrixCalculatorTest, OutputsCorrectResult) { calculator: "LandmarksToMatrixCalculator" input_stream: "LANDMARKS:landmarks" input_stream: "IMAGE_SIZE:image_size" + input_stream: "NORM_RECT:norm_rect" output_stream: "LANDMARKS_MATRIX:landmarks_matrix" options { [mediapipe.LandmarksToMatrixCalculatorOptions.ext] { @@ -91,6 +96,11 @@ TEST_P(Landmarks2dToMatrixCalculatorTest, OutputsCorrectResult) { runner.MutableInputs() ->Tag(kImageSizeTag) .packets.push_back(Adopt(image_size.release()).At(Timestamp(0))); + auto norm_rect = std::make_unique(); + norm_rect->set_rotation(test_case.rotation); + runner.MutableInputs() + ->Tag(kNormRectTag) + .packets.push_back(Adopt(norm_rect.release()).At(Timestamp(0))); MP_ASSERT_OK(runner.Run()) << "Calculator execution failed."; @@ -109,12 +119,20 @@ INSTANTIATE_TEST_CASE_P( .base_offset = 0, .object_normalization_origin_offset = 0, .expected_cell_0_2 = 0.1f, - .expected_cell_1_5 = 0.1875f}, + .expected_cell_1_5 = 0.1875f, + .rotation = 0}, {.test_name = "TestWithOffset21", .base_offset = 21, .object_normalization_origin_offset = 0, .expected_cell_0_2 = 0.1f, - .expected_cell_1_5 = 0.1875f}}), + .expected_cell_1_5 = 0.1875f, + .rotation = 0}, + {.test_name = "TestWithRotation", + .base_offset = 0, + 
.object_normalization_origin_offset = 0, + .expected_cell_0_2 = 0.075f, + .expected_cell_1_5 = -0.25f, + .rotation = M_PI / 2.0}}), [](const testing::TestParamInfo< Landmarks2dToMatrixCalculatorTest::ParamType>& info) { return info.param.test_name; @@ -126,6 +144,7 @@ struct LandmarksWorld3dToMatrixCalculatorTestCase { int object_normalization_origin_offset = -1; float expected_cell_0_2; float expected_cell_1_5; + float rotation; }; using LandmarksWorld3dToMatrixCalculatorTest = @@ -140,6 +159,7 @@ TEST_P(LandmarksWorld3dToMatrixCalculatorTest, OutputsCorrectResult) { calculator: "LandmarksToMatrixCalculator" input_stream: "WORLD_LANDMARKS:landmarks" input_stream: "IMAGE_SIZE:image_size" + input_stream: "NORM_RECT:norm_rect" output_stream: "LANDMARKS_MATRIX:landmarks_matrix" options { [mediapipe.LandmarksToMatrixCalculatorOptions.ext] { @@ -162,6 +182,11 @@ TEST_P(LandmarksWorld3dToMatrixCalculatorTest, OutputsCorrectResult) { runner.MutableInputs() ->Tag(kImageSizeTag) .packets.push_back(Adopt(image_size.release()).At(Timestamp(0))); + auto norm_rect = std::make_unique(); + norm_rect->set_rotation(test_case.rotation); + runner.MutableInputs() + ->Tag(kNormRectTag) + .packets.push_back(Adopt(norm_rect.release()).At(Timestamp(0))); MP_ASSERT_OK(runner.Run()) << "Calculator execution failed."; @@ -180,17 +205,26 @@ INSTANTIATE_TEST_CASE_P( .base_offset = 0, .object_normalization_origin_offset = 0, .expected_cell_0_2 = 0.1f, - .expected_cell_1_5 = 0.25}, + .expected_cell_1_5 = 0.25, + .rotation = 0}, {.test_name = "TestWithOffset21", .base_offset = 21, .object_normalization_origin_offset = 0, .expected_cell_0_2 = 0.1f, - .expected_cell_1_5 = 0.25}, + .expected_cell_1_5 = 0.25, + .rotation = 0}, {.test_name = "NoObjectNormalization", .base_offset = 0, .object_normalization_origin_offset = -1, .expected_cell_0_2 = 0.021f, - .expected_cell_1_5 = 0.052f}}), + .expected_cell_1_5 = 0.052f, + .rotation = 0}, + {.test_name = "TestWithRotation", + .base_offset = 0, + .object_normalization_origin_offset = 0, + .expected_cell_0_2 = 0.1f, + .expected_cell_1_5 = -0.25f, + .rotation = M_PI / 2.0}}), [](const testing::TestParamInfo< LandmarksWorld3dToMatrixCalculatorTest::ParamType>& info) { return info.param.test_name; diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc index e0d1473c2..333edb6fb 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include #include "absl/memory/memory.h" @@ -27,6 +28,7 @@ limitations under the License. 
#include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/image.h" #include "mediapipe/framework/formats/landmark.pb.h" +#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/framework/packet.h" #include "mediapipe/tasks/cc/common.h" #include "mediapipe/tasks/cc/components/image_preprocessing.h" @@ -62,6 +64,8 @@ constexpr char kHandGestureSubgraphTypeName[] = constexpr char kImageTag[] = "IMAGE"; constexpr char kImageInStreamName[] = "image_in"; constexpr char kImageOutStreamName[] = "image_out"; +constexpr char kNormRectTag[] = "NORM_RECT"; +constexpr char kNormRectStreamName[] = "norm_rect_in"; constexpr char kHandGesturesTag[] = "HAND_GESTURES"; constexpr char kHandGesturesStreamName[] = "hand_gestures"; constexpr char kHandednessTag[] = "HANDEDNESS"; @@ -72,6 +76,31 @@ constexpr char kHandWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kHandWorldLandmarksStreamName[] = "world_landmarks"; constexpr int kMicroSecondsPerMilliSecond = 1000; +// Returns a NormalizedRect filling the whole image. If input is present, its +// rotation is set in the returned NormalizedRect and a check is performed to +// make sure no region-of-interest was provided. Otherwise, rotation is set to +// 0. +absl::StatusOr FillNormalizedRect( + std::optional normalized_rect) { + NormalizedRect result; + if (normalized_rect.has_value()) { + result = *normalized_rect; + } + bool has_coordinates = result.has_x_center() || result.has_y_center() || + result.has_width() || result.has_height(); + if (has_coordinates) { + return CreateStatusWithPayload( + absl::StatusCode::kInvalidArgument, + "GestureRecognizer does not support region-of-interest.", + MediaPipeTasksStatus::kInvalidArgumentError); + } + result.set_x_center(0.5); + result.set_y_center(0.5); + result.set_width(1); + result.set_height(1); + return result; +} + // Creates a MediaPipe graph config that contains a subgraph node of // "mediapipe.tasks.vision.GestureRecognizerGraph". 
If the task is running // in the live stream mode, a "FlowLimiterCalculator" will be added to limit the @@ -83,6 +112,7 @@ CalculatorGraphConfig CreateGraphConfig( auto& subgraph = graph.AddNode(kHandGestureSubgraphTypeName); subgraph.GetOptions().Swap(options.get()); graph.In(kImageTag).SetName(kImageInStreamName); + graph.In(kNormRectTag).SetName(kNormRectStreamName); subgraph.Out(kHandGesturesTag).SetName(kHandGesturesStreamName) >> graph.Out(kHandGesturesTag); subgraph.Out(kHandednessTag).SetName(kHandednessStreamName) >> @@ -93,10 +123,11 @@ CalculatorGraphConfig CreateGraphConfig( graph.Out(kHandWorldLandmarksTag); subgraph.Out(kImageTag).SetName(kImageOutStreamName) >> graph.Out(kImageTag); if (enable_flow_limiting) { - return tasks::core::AddFlowLimiterCalculator(graph, subgraph, {kImageTag}, - kHandGesturesTag); + return tasks::core::AddFlowLimiterCalculator( + graph, subgraph, {kImageTag, kNormRectTag}, kHandGesturesTag); } graph.In(kImageTag) >> subgraph.In(kImageTag); + graph.In(kNormRectTag) >> subgraph.In(kNormRectTag); return graph.GetConfig(); } @@ -216,16 +247,22 @@ absl::StatusOr> GestureRecognizer::Create( } absl::StatusOr GestureRecognizer::Recognize( - mediapipe::Image image) { + mediapipe::Image image, + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, "GPU input images are currently not supported.", MediaPipeTasksStatus::kRunnerUnexpectedInputError); } - ASSIGN_OR_RETURN(auto output_packets, - ProcessImageData({{kImageInStreamName, - MakePacket(std::move(image))}})); + ASSIGN_OR_RETURN(NormalizedRect norm_rect, + FillNormalizedRect(image_processing_options)); + ASSIGN_OR_RETURN( + auto output_packets, + ProcessImageData( + {{kImageInStreamName, MakePacket(std::move(image))}, + {kNormRectStreamName, + MakePacket(std::move(norm_rect))}})); if (output_packets[kHandGesturesStreamName].IsEmpty()) { return {{{}, {}, {}, {}}}; } @@ -245,18 +282,24 @@ absl::StatusOr GestureRecognizer::Recognize( } absl::StatusOr GestureRecognizer::RecognizeForVideo( - mediapipe::Image image, int64 timestamp_ms) { + mediapipe::Image image, int64 timestamp_ms, + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } + ASSIGN_OR_RETURN(NormalizedRect norm_rect, + FillNormalizedRect(image_processing_options)); ASSIGN_OR_RETURN( auto output_packets, ProcessVideoData( {{kImageInStreamName, MakePacket(std::move(image)) + .At(Timestamp(timestamp_ms * kMicroSecondsPerMilliSecond))}, + {kNormRectStreamName, + MakePacket(std::move(norm_rect)) .At(Timestamp(timestamp_ms * kMicroSecondsPerMilliSecond))}})); if (output_packets[kHandGesturesStreamName].IsEmpty()) { return {{{}, {}, {}, {}}}; @@ -276,17 +319,23 @@ absl::StatusOr GestureRecognizer::RecognizeForVideo( }; } -absl::Status GestureRecognizer::RecognizeAsync(mediapipe::Image image, - int64 timestamp_ms) { +absl::Status GestureRecognizer::RecognizeAsync( + mediapipe::Image image, int64 timestamp_ms, + std::optional image_processing_options) { if (image.UsesGpu()) { return CreateStatusWithPayload( absl::StatusCode::kInvalidArgument, absl::StrCat("GPU input images are currently not supported."), MediaPipeTasksStatus::kRunnerUnexpectedInputError); } + ASSIGN_OR_RETURN(NormalizedRect norm_rect, + FillNormalizedRect(image_processing_options)); return SendLiveStreamData( 
{{kImageInStreamName, MakePacket(std::move(image)) + .At(Timestamp(timestamp_ms * kMicroSecondsPerMilliSecond))}, + {kNormRectStreamName, + MakePacket(std::move(norm_rect)) .At(Timestamp(timestamp_ms * kMicroSecondsPerMilliSecond))}}); } diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h index 53b824e25..892b3c16a 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h @@ -17,11 +17,13 @@ limitations under the License. #define MEDIAPIPE_TASKS_CC_VISION_GESTURE_RECOGNIZRER_GESTURE_RECOGNIZER_H_ #include +#include #include "absl/status/statusor.h" #include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/image.h" #include "mediapipe/framework/formats/landmark.pb.h" +#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/tasks/cc/components/containers/gesture_recognition_result.h" #include "mediapipe/tasks/cc/core/base_options.h" #include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h" @@ -93,6 +95,13 @@ struct GestureRecognizerOptions { // Inputs: // Image // - The image that gesture recognition runs on. +// std::optional +// - If provided, can be used to specify the rotation to apply to the image +// before performing gesture recognition, by setting its 'rotation' field +// in radians (e.g. 'M_PI / 2' for a 90° anti-clockwise rotation). Note +// that specifying a region-of-interest using the 'x_center', 'y_center', +// 'width' and 'height' fields is NOT supported and will result in an +// invalid argument error being returned. // Outputs: // GestureRecognitionResult // - The hand gesture recognition results. @@ -122,12 +131,23 @@ class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi { // // image - mediapipe::Image // Image to perform hand gesture recognition on. + // imageProcessingOptions - std::optional + // If provided, can be used to specify the rotation to apply to the image + // before performing classification, by setting its 'rotation' field in + // radians (e.g. 'M_PI / 2' for a 90° anti-clockwise rotation). Note that + // specifying a region-of-interest using the 'x_center', 'y_center', 'width' + // and 'height' fields is NOT supported and will result in an invalid + // argument error being returned. // // The image can be of any size with format RGB or RGBA. // TODO: Describes how the input image will be preprocessed // after the yuv support is implemented. + // TODO: use an ImageProcessingOptions struct instead of + // NormalizedRect. absl::StatusOr Recognize( - Image image); + Image image, + std::optional image_processing_options = + std::nullopt); // Performs gesture recognition on the provided video frame. // Only use this method when the GestureRecognizer is created with the video @@ -137,7 +157,9 @@ class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi { // provide the video frame's timestamp (in milliseconds). The input timestamps // must be monotonically increasing. 
absl::StatusOr - RecognizeForVideo(Image image, int64 timestamp_ms); + RecognizeForVideo(Image image, int64 timestamp_ms, + std::optional + image_processing_options = std::nullopt); // Sends live image data to perform gesture recognition, and the results will // be available via the "result_callback" provided in the @@ -157,7 +179,9 @@ class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi { // longer be valid when the callback returns. To access the image data // outside of the callback, callers need to make a copy of the image. // - The input timestamp in milliseconds. - absl::Status RecognizeAsync(Image image, int64 timestamp_ms); + absl::Status RecognizeAsync(Image image, int64 timestamp_ms, + std::optional + image_processing_options = std::nullopt); // Shuts down the GestureRecognizer when all works are done. absl::Status Close() { return runner_->Close(); } diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc index b4f2af4d8..e02eadde8 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/image.h" #include "mediapipe/framework/formats/landmark.pb.h" +#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/tasks/cc/common.h" #include "mediapipe/tasks/cc/core/model_task_graph.h" #include "mediapipe/tasks/cc/core/utils.h" @@ -53,6 +54,7 @@ using ::mediapipe::tasks::vision::hand_landmarker::proto:: HandLandmarkerGraphOptions; constexpr char kImageTag[] = "IMAGE"; +constexpr char kNormRectTag[] = "NORM_RECT"; constexpr char kLandmarksTag[] = "LANDMARKS"; constexpr char kWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kHandednessTag[] = "HANDEDNESS"; @@ -76,6 +78,9 @@ struct GestureRecognizerOutputs { // Inputs: // IMAGE - Image // Image to perform hand gesture recognition on. +// NORM_RECT - NormalizedRect +// Describes image rotation and region of image to perform landmarks +// detection on. // // Outputs: // HAND_GESTURES - std::vector @@ -93,13 +98,15 @@ struct GestureRecognizerOutputs { // IMAGE - mediapipe::Image // The image that gesture recognizer runs on and has the pixel data stored // on the target storage (CPU vs GPU). -// +// All returned coordinates are in the unrotated and uncropped input image +// coordinates system. 
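The same optional rect applies in video and live-stream modes, as the new RecognizeForVideo/RecognizeAsync overloads above show. A rough usage sketch; 'recognizer' and GetFrame() are hypothetical stand-ins:

// Per-frame rotation in video mode; timestamps are in milliseconds and
// must be monotonically increasing.
mediapipe::NormalizedRect rotate90;
rotate90.set_rotation(M_PI / 2);
for (int64 ts_ms = 0; ts_ms <= 99; ts_ms += 33) {
  ASSIGN_OR_RETURN(auto result, recognizer->RecognizeForVideo(
                                    GetFrame(ts_ms), ts_ms, rotate90));
}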
// // Example: // node { // calculator: // "mediapipe.tasks.vision.gesture_recognizer.GestureRecognizerGraph" // input_stream: "IMAGE:image_in" +// input_stream: "NORM_RECT:norm_rect" // output_stream: "HAND_GESTURES:hand_gestures" // output_stream: "LANDMARKS:hand_landmarks" // output_stream: "WORLD_LANDMARKS:world_hand_landmarks" @@ -132,7 +139,8 @@ class GestureRecognizerGraph : public core::ModelTaskGraph { ASSIGN_OR_RETURN(auto hand_gesture_recognition_output, BuildGestureRecognizerGraph( *sc->MutableOptions(), - graph[Input(kImageTag)], graph)); + graph[Input(kImageTag)], + graph[Input(kNormRectTag)], graph)); hand_gesture_recognition_output.gesture >> graph[Output>(kHandGesturesTag)]; hand_gesture_recognition_output.handedness >> @@ -148,7 +156,7 @@ class GestureRecognizerGraph : public core::ModelTaskGraph { private: absl::StatusOr BuildGestureRecognizerGraph( GestureRecognizerGraphOptions& graph_options, Source image_in, - Graph& graph) { + Source norm_rect_in, Graph& graph) { auto& image_property = graph.AddNode("ImagePropertiesCalculator"); image_in >> image_property.In("IMAGE"); auto image_size = image_property.Out("SIZE"); @@ -162,6 +170,7 @@ class GestureRecognizerGraph : public core::ModelTaskGraph { graph_options.mutable_hand_landmarker_graph_options()); image_in >> hand_landmarker_graph.In(kImageTag); + norm_rect_in >> hand_landmarker_graph.In(kNormRectTag); auto hand_landmarks = hand_landmarker_graph[Output>( kLandmarksTag)]; @@ -187,6 +196,7 @@ class GestureRecognizerGraph : public core::ModelTaskGraph { hand_world_landmarks >> hand_gesture_subgraph.In(kWorldLandmarksTag); handedness >> hand_gesture_subgraph.In(kHandednessTag); image_size >> hand_gesture_subgraph.In(kImageSizeTag); + norm_rect_in >> hand_gesture_subgraph.In(kNormRectTag); hand_landmarks_id >> hand_gesture_subgraph.In(kHandTrackingIdsTag); auto hand_gestures = hand_gesture_subgraph[Output>( diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/hand_gesture_recognizer_graph.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/hand_gesture_recognizer_graph.cc index 8d7e0bc07..4bbe94974 100644 --- a/mediapipe/tasks/cc/vision/gesture_recognizer/hand_gesture_recognizer_graph.cc +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/hand_gesture_recognizer_graph.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mediapipe/framework/formats/classification.pb.h" #include "mediapipe/framework/formats/landmark.pb.h" #include "mediapipe/framework/formats/matrix.h" +#include "mediapipe/framework/formats/rect.pb.h" #include "mediapipe/framework/formats/tensor.h" #include "mediapipe/tasks/cc/common.h" #include "mediapipe/tasks/cc/components/processors/classification_postprocessing_graph.h" @@ -57,6 +58,7 @@ constexpr char kHandednessTag[] = "HANDEDNESS"; constexpr char kLandmarksTag[] = "LANDMARKS"; constexpr char kWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kImageSizeTag[] = "IMAGE_SIZE"; +constexpr char kNormRectTag[] = "NORM_RECT"; constexpr char kHandTrackingIdsTag[] = "HAND_TRACKING_IDS"; constexpr char kHandGesturesTag[] = "HAND_GESTURES"; constexpr char kLandmarksMatrixTag[] = "LANDMARKS_MATRIX"; @@ -92,6 +94,9 @@ Source> ConvertMatrixToTensor(Source matrix, // Detected hand landmarks in world coordinates. // IMAGE_SIZE - std::pair // The size of image from which the landmarks detected from. +// NORM_RECT - NormalizedRect +// NormalizedRect whose 'rotation' field is used to rotate the +// landmarks before processing them. 
// // Outputs: // HAND_GESTURES - ClassificationList @@ -106,6 +111,7 @@ Source> ConvertMatrixToTensor(Source matrix, // input_stream: "LANDMARKS:landmarks" // input_stream: "WORLD_LANDMARKS:world_landmarks" // input_stream: "IMAGE_SIZE:image_size" +// input_stream: "NORM_RECT:norm_rect" // output_stream: "HAND_GESTURES:hand_gestures" // options { // [mediapipe.tasks.vision.gesture_recognizer.proto.HandGestureRecognizerGraphOptions.ext] @@ -133,7 +139,8 @@ class SingleHandGestureRecognizerGraph : public core::ModelTaskGraph { graph[Input(kHandednessTag)], graph[Input(kLandmarksTag)], graph[Input(kWorldLandmarksTag)], - graph[Input>(kImageSizeTag)], graph)); + graph[Input>(kImageSizeTag)], + graph[Input(kNormRectTag)], graph)); hand_gestures >> graph[Output(kHandGesturesTag)]; return graph.GetConfig(); } @@ -145,7 +152,8 @@ class SingleHandGestureRecognizerGraph : public core::ModelTaskGraph { Source handedness, Source hand_landmarks, Source hand_world_landmarks, - Source> image_size, Graph& graph) { + Source> image_size, Source norm_rect, + Graph& graph) { // Converts the ClassificationList to a matrix. auto& handedness_to_matrix = graph.AddNode("HandednessToMatrixCalculator"); handedness >> handedness_to_matrix.In(kHandednessTag); @@ -166,6 +174,7 @@ class SingleHandGestureRecognizerGraph : public core::ModelTaskGraph { landmarks_options; hand_landmarks >> hand_landmarks_to_matrix.In(kLandmarksTag); image_size >> hand_landmarks_to_matrix.In(kImageSizeTag); + norm_rect >> hand_landmarks_to_matrix.In(kNormRectTag); auto hand_landmarks_matrix = hand_landmarks_to_matrix[Output(kLandmarksMatrixTag)]; @@ -181,6 +190,7 @@ class SingleHandGestureRecognizerGraph : public core::ModelTaskGraph { hand_world_landmarks >> hand_world_landmarks_to_matrix.In(kWorldLandmarksTag); image_size >> hand_world_landmarks_to_matrix.In(kImageSizeTag); + norm_rect >> hand_world_landmarks_to_matrix.In(kNormRectTag); auto hand_world_landmarks_matrix = hand_world_landmarks_to_matrix[Output(kLandmarksMatrixTag)]; @@ -239,6 +249,9 @@ REGISTER_MEDIAPIPE_GRAPH( // A vector hand landmarks in world coordinates. // IMAGE_SIZE - std::pair // The size of image from which the landmarks detected from. +// NORM_RECT - NormalizedRect +// NormalizedRect whose 'rotation' field is used to rotate the +// landmarks before processing them. // HAND_TRACKING_IDS - std::vector // A vector of the tracking ids of the hands. The tracking id is the vector // index corresponding to the same hand if the graph runs multiple times. 
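The hunk below splices the new NORM_RECT clone into BeginLoopIntCalculator at index 1, shifting every existing clone stream down by one. For reference, the before/after index mapping (derived from the diff, not stated in it):

  [0] image_size                  ->  [0] image_size
                                      [1] norm_rect (new)
  [1] multi_handedness            ->  [2] multi_handedness
  [2] multi_hand_landmarks        ->  [3] multi_hand_landmarks
  [3] multi_hand_world_landmarks  ->  [4] multi_hand_world_landmarks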
@@ -257,6 +270,7 @@ REGISTER_MEDIAPIPE_GRAPH( // input_stream: "LANDMARKS:landmarks" // input_stream: "WORLD_LANDMARKS:world_landmarks" // input_stream: "IMAGE_SIZE:image_size" +// input_stream: "NORM_RECT:norm_rect" // input_stream: "HAND_TRACKING_IDS:hand_tracking_ids" // output_stream: "HAND_GESTURES:hand_gestures" // options { @@ -283,6 +297,7 @@ class MultipleHandGestureRecognizerGraph : public core::ModelTaskGraph { graph[Input>(kLandmarksTag)], graph[Input>(kWorldLandmarksTag)], graph[Input>(kImageSizeTag)], + graph[Input(kNormRectTag)], graph[Input>(kHandTrackingIdsTag)], graph)); multi_hand_gestures >> graph[Output>(kHandGesturesTag)]; @@ -296,18 +311,20 @@ class MultipleHandGestureRecognizerGraph : public core::ModelTaskGraph { Source> multi_handedness, Source> multi_hand_landmarks, Source> multi_hand_world_landmarks, - Source> image_size, + Source> image_size, Source norm_rect, Source> multi_hand_tracking_ids, Graph& graph) { auto& begin_loop_int = graph.AddNode("BeginLoopIntCalculator"); image_size >> begin_loop_int.In(kCloneTag)[0]; - multi_handedness >> begin_loop_int.In(kCloneTag)[1]; - multi_hand_landmarks >> begin_loop_int.In(kCloneTag)[2]; - multi_hand_world_landmarks >> begin_loop_int.In(kCloneTag)[3]; + norm_rect >> begin_loop_int.In(kCloneTag)[1]; + multi_handedness >> begin_loop_int.In(kCloneTag)[2]; + multi_hand_landmarks >> begin_loop_int.In(kCloneTag)[3]; + multi_hand_world_landmarks >> begin_loop_int.In(kCloneTag)[4]; multi_hand_tracking_ids >> begin_loop_int.In(kIterableTag); auto image_size_clone = begin_loop_int.Out(kCloneTag)[0]; - auto multi_handedness_clone = begin_loop_int.Out(kCloneTag)[1]; - auto multi_hand_landmarks_clone = begin_loop_int.Out(kCloneTag)[2]; - auto multi_hand_world_landmarks_clone = begin_loop_int.Out(kCloneTag)[3]; + auto norm_rect_clone = begin_loop_int.Out(kCloneTag)[1]; + auto multi_handedness_clone = begin_loop_int.Out(kCloneTag)[2]; + auto multi_hand_landmarks_clone = begin_loop_int.Out(kCloneTag)[3]; + auto multi_hand_world_landmarks_clone = begin_loop_int.Out(kCloneTag)[4]; auto hand_tracking_id = begin_loop_int.Out(kItemTag); auto batch_end = begin_loop_int.Out(kBatchEndTag); @@ -341,6 +358,7 @@ class MultipleHandGestureRecognizerGraph : public core::ModelTaskGraph { hand_world_landmarks >> hand_gesture_recognizer_graph.In(kWorldLandmarksTag); image_size_clone >> hand_gesture_recognizer_graph.In(kImageSizeTag); + norm_rect_clone >> hand_gesture_recognizer_graph.In(kNormRectTag); auto hand_gestures = hand_gesture_recognizer_graph.Out(kHandGesturesTag); auto& end_loop_classification_lists = diff --git a/mediapipe/tasks/cc/vision/hand_detector/BUILD b/mediapipe/tasks/cc/vision/hand_detector/BUILD index 433a30471..71cef6270 100644 --- a/mediapipe/tasks/cc/vision/hand_detector/BUILD +++ b/mediapipe/tasks/cc/vision/hand_detector/BUILD @@ -32,7 +32,7 @@ cc_library( "//mediapipe/calculators/tflite:ssd_anchors_calculator_cc_proto", "//mediapipe/calculators/util:detection_label_id_to_text_calculator", "//mediapipe/calculators/util:detection_label_id_to_text_calculator_cc_proto", - "//mediapipe/calculators/util:detection_letterbox_removal_calculator", + "//mediapipe/calculators/util:detection_projection_calculator", "//mediapipe/calculators/util:detections_to_rects_calculator", "//mediapipe/calculators/util:detections_to_rects_calculator_cc_proto", "//mediapipe/calculators/util:non_max_suppression_calculator", diff --git a/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph.cc 
b/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph.cc index 8573d718f..e876d7d09 100644 --- a/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph.cc +++ b/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph.cc @@ -58,6 +58,7 @@ using ::mediapipe::tasks::vision::hand_detector::proto:: HandDetectorGraphOptions; constexpr char kImageTag[] = "IMAGE"; +constexpr char kNormRectTag[] = "NORM_RECT"; constexpr char kPalmDetectionsTag[] = "PALM_DETECTIONS"; constexpr char kHandRectsTag[] = "HAND_RECTS"; constexpr char kPalmRectsTag[] = "PALM_RECTS"; @@ -148,6 +149,9 @@ void ConfigureRectTransformationCalculator( // Inputs: // IMAGE - Image // Image to perform detection on. +// NORM_RECT - NormalizedRect +// Describes image rotation and region of image to perform detection +// on. // // Outputs: // PALM_DETECTIONS - std::vector @@ -159,11 +163,14 @@ void ConfigureRectTransformationCalculator( // IMAGE - Image // The input image that the hand detector runs on and has the pixel data // stored on the target storage (CPU vs GPU). +// All returned coordinates are in the unrotated and uncropped input image +// coordinates system. // // Example: // node { // calculator: "mediapipe.tasks.vision.hand_detector.HandDetectorGraph" // input_stream: "IMAGE:image" +// input_stream: "NORM_RECT:norm_rect" // output_stream: "PALM_DETECTIONS:palm_detections" // output_stream: "HAND_RECTS:hand_rects_from_palm_detections" // output_stream: "PALM_RECTS:palm_rects" @@ -189,11 +196,11 @@ class HandDetectorGraph : public core::ModelTaskGraph { ASSIGN_OR_RETURN(const auto* model_resources, CreateModelResources(sc)); Graph graph; - ASSIGN_OR_RETURN( - auto hand_detection_outs, - BuildHandDetectionSubgraph(sc->Options(), - *model_resources, - graph[Input(kImageTag)], graph)); + ASSIGN_OR_RETURN(auto hand_detection_outs, + BuildHandDetectionSubgraph( + sc->Options(), + *model_resources, graph[Input(kImageTag)], + graph[Input(kNormRectTag)], graph)); hand_detection_outs.palm_detections >> graph[Output>(kPalmDetectionsTag)]; hand_detection_outs.hand_rects >> @@ -216,7 +223,7 @@ class HandDetectorGraph : public core::ModelTaskGraph { absl::StatusOr BuildHandDetectionSubgraph( const HandDetectorGraphOptions& subgraph_options, const core::ModelResources& model_resources, Source image_in, - Graph& graph) { + Source norm_rect_in, Graph& graph) { // Add image preprocessing subgraph. The model expects aspect ratio // unchanged. auto& preprocessing = @@ -233,8 +240,9 @@ class HandDetectorGraph : public core::ModelTaskGraph { &preprocessing .GetOptions())); image_in >> preprocessing.In("IMAGE"); + norm_rect_in >> preprocessing.In("NORM_RECT"); auto preprocessed_tensors = preprocessing.Out("TENSORS"); - auto letterbox_padding = preprocessing.Out("LETTERBOX_PADDING"); + auto matrix = preprocessing.Out("MATRIX"); auto image_size = preprocessing.Out("IMAGE_SIZE"); // Adds SSD palm detection model. @@ -278,17 +286,12 @@ class HandDetectorGraph : public core::ModelTaskGraph { nms_detections >> detection_label_id_to_text.In(""); auto detections_with_text = detection_label_id_to_text.Out(""); - // Adjusts detection locations (already normalized to [0.f, 1.f]) on the - // letterboxed image (after image transformation with the FIT scale mode) to - // the corresponding locations on the same image with the letterbox removed - // (the input image to the graph before image transformation). 
- auto& detection_letterbox_removal = - graph.AddNode("DetectionLetterboxRemovalCalculator"); - detections_with_text >> detection_letterbox_removal.In("DETECTIONS"); - letterbox_padding >> detection_letterbox_removal.In("LETTERBOX_PADDING"); + // Projects detections back into the input image coordinates system. + auto& detection_projection = graph.AddNode("DetectionProjectionCalculator"); + detections_with_text >> detection_projection.In("DETECTIONS"); + matrix >> detection_projection.In("PROJECTION_MATRIX"); auto palm_detections = - detection_letterbox_removal[Output>( - "DETECTIONS")]; + detection_projection[Output>("DETECTIONS")]; // Converts each palm detection into a rectangle (normalized by image size) // that encloses the palm and is rotated such that the line connecting diff --git a/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph_test.cc b/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph_test.cc index 11cfc3026..cbbc0e193 100644 --- a/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph_test.cc +++ b/mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include @@ -75,13 +76,18 @@ using ::testing::proto::Partially; constexpr char kTestDataDirectory[] = "/mediapipe/tasks/testdata/vision/"; constexpr char kPalmDetectionModel[] = "palm_detection_full.tflite"; constexpr char kTestRightHandsImage[] = "right_hands.jpg"; +constexpr char kTestRightHandsRotatedImage[] = "right_hands_rotated.jpg"; constexpr char kTestModelResourcesTag[] = "test_model_resources"; constexpr char kOneHandResultFile[] = "hand_detector_result_one_hand.pbtxt"; +constexpr char kOneHandRotatedResultFile[] = + "hand_detector_result_one_hand_rotated.pbtxt"; constexpr char kTwoHandsResultFile[] = "hand_detector_result_two_hands.pbtxt"; constexpr char kImageTag[] = "IMAGE"; constexpr char kImageName[] = "image"; +constexpr char kNormRectTag[] = "NORM_RECT"; +constexpr char kNormRectName[] = "norm_rect"; constexpr char kPalmDetectionsTag[] = "PALM_DETECTIONS"; constexpr char kPalmDetectionsName[] = "palm_detections"; constexpr char kHandRectsTag[] = "HAND_RECTS"; @@ -117,6 +123,8 @@ absl::StatusOr> CreateTaskRunner( graph[Input(kImageTag)].SetName(kImageName) >> hand_detection.In(kImageTag); + graph[Input(kNormRectTag)].SetName(kNormRectName) >> + hand_detection.In(kNormRectTag); hand_detection.Out(kPalmDetectionsTag).SetName(kPalmDetectionsName) >> graph[Output>(kPalmDetectionsTag)]; @@ -142,6 +150,9 @@ struct TestParams { std::string hand_detection_model_name; // The filename of test image. std::string test_image_name; + // The rotation to apply to the test image before processing, in radians + // counter-clockwise. + float rotation; // The number of maximum detected hands. int num_hands; // The expected hand detector result. 
@@ -154,14 +165,22 @@ TEST_P(HandDetectionTest, DetectTwoHands) { MP_ASSERT_OK_AND_ASSIGN( Image image, DecodeImageFromFile(JoinPath("./", kTestDataDirectory, GetParam().test_image_name))); + NormalizedRect input_norm_rect; + input_norm_rect.set_rotation(GetParam().rotation); + input_norm_rect.set_x_center(0.5); + input_norm_rect.set_y_center(0.5); + input_norm_rect.set_width(1.0); + input_norm_rect.set_height(1.0); MP_ASSERT_OK_AND_ASSIGN( auto model_resources, CreateModelResourcesForModel(GetParam().hand_detection_model_name)); MP_ASSERT_OK_AND_ASSIGN( auto task_runner, CreateTaskRunner(*model_resources, kPalmDetectionModel, GetParam().num_hands)); - auto output_packets = - task_runner->Process({{kImageName, MakePacket(std::move(image))}}); + auto output_packets = task_runner->Process( + {{kImageName, MakePacket(std::move(image))}, + {kNormRectName, + MakePacket(std::move(input_norm_rect))}}); MP_ASSERT_OK(output_packets); const std::vector& palm_detections = (*output_packets)[kPalmDetectionsName].Get>(); @@ -188,15 +207,24 @@ INSTANTIATE_TEST_SUITE_P( Values(TestParams{.test_name = "DetectOneHand", .hand_detection_model_name = kPalmDetectionModel, .test_image_name = kTestRightHandsImage, + .rotation = 0, .num_hands = 1, .expected_result = GetExpectedHandDetectorResult(kOneHandResultFile)}, TestParams{.test_name = "DetectTwoHands", .hand_detection_model_name = kPalmDetectionModel, .test_image_name = kTestRightHandsImage, + .rotation = 0, .num_hands = 2, .expected_result = - GetExpectedHandDetectorResult(kTwoHandsResultFile)}), + GetExpectedHandDetectorResult(kTwoHandsResultFile)}, + TestParams{.test_name = "DetectOneHandWithRotation", + .hand_detection_model_name = kPalmDetectionModel, + .test_image_name = kTestRightHandsRotatedImage, + .rotation = M_PI / 2.0f, + .num_hands = 1, + .expected_result = GetExpectedHandDetectorResult( + kOneHandRotatedResultFile)}), [](const TestParamInfo& info) { return info.param.test_name; }); diff --git a/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph.cc b/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph.cc index 7e199348c..3fbe38c1c 100644 --- a/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph.cc +++ b/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph.cc @@ -64,6 +64,7 @@ using ::mediapipe::tasks::vision::hand_landmarker::proto:: HandLandmarksDetectorGraphOptions; constexpr char kImageTag[] = "IMAGE"; +constexpr char kNormRectTag[] = "NORM_RECT"; constexpr char kLandmarksTag[] = "LANDMARKS"; constexpr char kWorldLandmarksTag[] = "WORLD_LANDMARKS"; constexpr char kHandRectNextFrameTag[] = "HAND_RECT_NEXT_FRAME"; @@ -122,6 +123,9 @@ absl::Status SetSubTaskBaseOptions(const ModelAssetBundleResources& resources, // Inputs: // IMAGE - Image // Image to perform hand landmarks detection on. +// NORM_RECT - NormalizedRect +// Describes image rotation and region of image to perform landmarks +// detection on. // // Outputs: // LANDMARKS: - std::vector @@ -140,11 +144,14 @@ absl::Status SetSubTaskBaseOptions(const ModelAssetBundleResources& resources, // IMAGE - Image // The input image that the hand landmarker runs on and has the pixel data // stored on the target storage (CPU vs GPU). +// All returned coordinates are in the unrotated and uncropped input image +// coordinates system. 
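The rotation tests in this patch all build the same full-image rect by hand (x_center = y_center = 0.5, width = height = 1, plus a rotation). A small helper of the kind one might factor out; hypothetical, not part of the patch:

mediapipe::NormalizedRect FullImageRect(float rotation_radians) {
  mediapipe::NormalizedRect rect;
  rect.set_x_center(0.5f);
  rect.set_y_center(0.5f);
  rect.set_width(1.0f);
  rect.set_height(1.0f);
  rect.set_rotation(rotation_radians);  // Radians, anti-clockwise.
  return rect;
}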
// // Example: // node { // calculator: "mediapipe.tasks.vision.hand_landmarker.HandLandmarkerGraph" // input_stream: "IMAGE:image_in" +// input_stream: "NORM_RECT:norm_rect" // output_stream: "LANDMARKS:hand_landmarks" // output_stream: "WORLD_LANDMARKS:world_hand_landmarks" // output_stream: "HAND_RECT_NEXT_FRAME:hand_rect_next_frame" @@ -198,10 +205,11 @@ class HandLandmarkerGraph : public core::ModelTaskGraph { !sc->Service(::mediapipe::tasks::core::kModelResourcesCacheService) .IsAvailable())); } - ASSIGN_OR_RETURN( - auto hand_landmarker_outputs, - BuildHandLandmarkerGraph(sc->Options(), - graph[Input(kImageTag)], graph)); + ASSIGN_OR_RETURN(auto hand_landmarker_outputs, + BuildHandLandmarkerGraph( + sc->Options(), + graph[Input(kImageTag)], + graph[Input(kNormRectTag)], graph)); hand_landmarker_outputs.landmark_lists >> graph[Output>(kLandmarksTag)]; hand_landmarker_outputs.world_landmark_lists >> @@ -240,7 +248,7 @@ class HandLandmarkerGraph : public core::ModelTaskGraph { // graph: the mediapipe graph instance to be updated. absl::StatusOr BuildHandLandmarkerGraph( const HandLandmarkerGraphOptions& tasks_options, Source image_in, - Graph& graph) { + Source norm_rect_in, Graph& graph) { const int max_num_hands = tasks_options.hand_detector_graph_options().num_hands(); @@ -258,12 +266,15 @@ class HandLandmarkerGraph : public core::ModelTaskGraph { auto image_for_hand_detector = DisallowIf(image_in, has_enough_hands, graph); + auto norm_rect_in_for_hand_detector = + DisallowIf(norm_rect_in, has_enough_hands, graph); auto& hand_detector = graph.AddNode("mediapipe.tasks.vision.hand_detector.HandDetectorGraph"); hand_detector.GetOptions().CopyFrom( tasks_options.hand_detector_graph_options()); image_for_hand_detector >> hand_detector.In("IMAGE"); + norm_rect_in_for_hand_detector >> hand_detector.In("NORM_RECT"); auto hand_rects_from_hand_detector = hand_detector.Out("HAND_RECTS"); auto& hand_association = graph.AddNode("HandAssociationCalculator"); diff --git a/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph_test.cc b/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph_test.cc index 604f37d53..08beb1a1b 100644 --- a/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph_test.cc +++ b/mediapipe/tasks/cc/vision/hand_landmarker/hand_landmarker_graph_test.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include #include #include #include +#include #include "absl/flags/flag.h" #include "absl/status/statusor.h" @@ -67,9 +69,12 @@ using ::testing::proto::Partially; constexpr char kTestDataDirectory[] = "/mediapipe/tasks/testdata/vision/"; constexpr char kHandLandmarkerModelBundle[] = "hand_landmark.task"; constexpr char kLeftHandsImage[] = "left_hands.jpg"; +constexpr char kLeftHandsRotatedImage[] = "left_hands_rotated.jpg"; constexpr char kImageTag[] = "IMAGE"; constexpr char kImageName[] = "image_in"; +constexpr char kNormRectTag[] = "NORM_RECT"; +constexpr char kNormRectName[] = "norm_rect_in"; constexpr char kLandmarksTag[] = "LANDMARKS"; constexpr char kLandmarksName[] = "landmarks"; constexpr char kWorldLandmarksTag[] = "WORLD_LANDMARKS"; @@ -84,6 +89,11 @@ constexpr char kExpectedLeftUpHandLandmarksFilename[] = "expected_left_up_hand_landmarks.prototxt"; constexpr char kExpectedLeftDownHandLandmarksFilename[] = "expected_left_down_hand_landmarks.prototxt"; +// Same but for the rotated image. +constexpr char kExpectedLeftUpHandRotatedLandmarksFilename[] = + "expected_left_up_hand_rotated_landmarks.prototxt"; +constexpr char kExpectedLeftDownHandRotatedLandmarksFilename[] = + "expected_left_down_hand_rotated_landmarks.prototxt"; constexpr float kFullModelFractionDiff = 0.03; // percentage constexpr float kAbsMargin = 0.03; @@ -111,6 +121,8 @@ absl::StatusOr> CreateTaskRunner() { graph[Input(kImageTag)].SetName(kImageName) >> hand_landmarker_graph.In(kImageTag); + graph[Input(kNormRectTag)].SetName(kNormRectName) >> + hand_landmarker_graph.In(kNormRectTag); hand_landmarker_graph.Out(kLandmarksTag).SetName(kLandmarksName) >> graph[Output>(kLandmarksTag)]; hand_landmarker_graph.Out(kWorldLandmarksTag).SetName(kWorldLandmarksName) >> @@ -130,9 +142,16 @@ TEST_F(HandLandmarkerTest, Succeeds) { MP_ASSERT_OK_AND_ASSIGN( Image image, DecodeImageFromFile(JoinPath("./", kTestDataDirectory, kLeftHandsImage))); + NormalizedRect input_norm_rect; + input_norm_rect.set_x_center(0.5); + input_norm_rect.set_y_center(0.5); + input_norm_rect.set_width(1.0); + input_norm_rect.set_height(1.0); MP_ASSERT_OK_AND_ASSIGN(auto task_runner, CreateTaskRunner()); - auto output_packets = - task_runner->Process({{kImageName, MakePacket(std::move(image))}}); + auto output_packets = task_runner->Process( + {{kImageName, MakePacket(std::move(image))}, + {kNormRectName, + MakePacket(std::move(input_norm_rect))}}); const auto& landmarks = (*output_packets)[kLandmarksName] .Get>(); ASSERT_EQ(landmarks.size(), kMaxNumHands); @@ -150,6 +169,38 @@ TEST_F(HandLandmarkerTest, Succeeds) { /*fraction=*/kFullModelFractionDiff)); } +TEST_F(HandLandmarkerTest, SucceedsWithRotation) { + MP_ASSERT_OK_AND_ASSIGN( + Image image, DecodeImageFromFile(JoinPath("./", kTestDataDirectory, + kLeftHandsRotatedImage))); + NormalizedRect input_norm_rect; + input_norm_rect.set_x_center(0.5); + input_norm_rect.set_y_center(0.5); + input_norm_rect.set_width(1.0); + input_norm_rect.set_height(1.0); + input_norm_rect.set_rotation(M_PI / 2.0); + MP_ASSERT_OK_AND_ASSIGN(auto task_runner, CreateTaskRunner()); + auto output_packets = task_runner->Process( + {{kImageName, MakePacket(std::move(image))}, + {kNormRectName, + MakePacket(std::move(input_norm_rect))}}); + const auto& landmarks = (*output_packets)[kLandmarksName] + .Get>(); + ASSERT_EQ(landmarks.size(), kMaxNumHands); + std::vector expected_landmarks = { + 
GetExpectedLandmarkList(kExpectedLeftUpHandRotatedLandmarksFilename), + GetExpectedLandmarkList(kExpectedLeftDownHandRotatedLandmarksFilename)}; + + EXPECT_THAT(landmarks[0], + Approximately(Partially(EqualsProto(expected_landmarks[0])), + /*margin=*/kAbsMargin, + /*fraction=*/kFullModelFractionDiff)); + EXPECT_THAT(landmarks[1], + Approximately(Partially(EqualsProto(expected_landmarks[1])), + /*margin=*/kAbsMargin, + /*fraction=*/kFullModelFractionDiff)); +} + } // namespace } // namespace hand_landmarker diff --git a/mediapipe/tasks/java/com/google/mediapipe/tasks/vision/gesturerecognizer/GestureRecognizer.java b/mediapipe/tasks/java/com/google/mediapipe/tasks/vision/gesturerecognizer/GestureRecognizer.java index 128f6eab3..32473b299 100644 --- a/mediapipe/tasks/java/com/google/mediapipe/tasks/vision/gesturerecognizer/GestureRecognizer.java +++ b/mediapipe/tasks/java/com/google/mediapipe/tasks/vision/gesturerecognizer/GestureRecognizer.java @@ -15,6 +15,7 @@ package com.google.mediapipe.tasks.vision.gesturerecognizer; import android.content.Context; +import android.graphics.RectF; import android.os.ParcelFileDescriptor; import com.google.auto.value.AutoValue; import com.google.mediapipe.formats.proto.LandmarkProto.LandmarkList; @@ -71,8 +72,10 @@ import java.util.Optional; public final class GestureRecognizer extends BaseVisionTaskApi { private static final String TAG = GestureRecognizer.class.getSimpleName(); private static final String IMAGE_IN_STREAM_NAME = "image_in"; + private static final String NORM_RECT_IN_STREAM_NAME = "norm_rect_in"; private static final List INPUT_STREAMS = - Collections.unmodifiableList(Arrays.asList("IMAGE:" + IMAGE_IN_STREAM_NAME)); + Collections.unmodifiableList( + Arrays.asList("IMAGE:" + IMAGE_IN_STREAM_NAME, "NORM_RECT:" + NORM_RECT_IN_STREAM_NAME)); private static final List OUTPUT_STREAMS = Collections.unmodifiableList( Arrays.asList( @@ -205,7 +208,7 @@ public final class GestureRecognizer extends BaseVisionTaskApi { * @param runningMode a mediapipe vision task {@link RunningMode}. */ private GestureRecognizer(TaskRunner taskRunner, RunningMode runningMode) { - super(taskRunner, runningMode, IMAGE_IN_STREAM_NAME); + super(taskRunner, runningMode, IMAGE_IN_STREAM_NAME, NORM_RECT_IN_STREAM_NAME); } /** @@ -223,7 +226,8 @@ public final class GestureRecognizer extends BaseVisionTaskApi { * @throws MediaPipeException if there is an internal error. */ public GestureRecognitionResult recognize(Image inputImage) { - return (GestureRecognitionResult) processImageData(inputImage); + // TODO: add proper support for rotations. + return (GestureRecognitionResult) processImageData(inputImage, buildFullImageRectF()); } /** @@ -244,7 +248,9 @@ public final class GestureRecognizer extends BaseVisionTaskApi { * @throws MediaPipeException if there is an internal error. */ public GestureRecognitionResult recognizeForVideo(Image inputImage, long inputTimestampMs) { - return (GestureRecognitionResult) processVideoData(inputImage, inputTimestampMs); + // TODO: add proper support for rotations. + return (GestureRecognitionResult) + processVideoData(inputImage, buildFullImageRectF(), inputTimestampMs); } /** @@ -266,7 +272,8 @@ public final class GestureRecognizer extends BaseVisionTaskApi { * @throws MediaPipeException if there is an internal error. */ public void recognizeAsync(Image inputImage, long inputTimestampMs) { - sendLiveStreamData(inputImage, inputTimestampMs); + // TODO: add proper support for rotations. 
+ sendLiveStreamData(inputImage, buildFullImageRectF(), inputTimestampMs); } /** Options for setting up an {@link GestureRecognizer}. */ @@ -464,4 +471,9 @@ public final class GestureRecognizer extends BaseVisionTaskApi { .build(); } } + + /** Creates a RectF covering the full image. */ + private static RectF buildFullImageRectF() { + return new RectF(0, 0, 1, 1); + } } diff --git a/mediapipe/tasks/testdata/vision/BUILD b/mediapipe/tasks/testdata/vision/BUILD index 0532458aa..ffb4760d9 100644 --- a/mediapipe/tasks/testdata/vision/BUILD +++ b/mediapipe/tasks/testdata/vision/BUILD @@ -39,6 +39,7 @@ mediapipe_files(srcs = [ "hand_landmark_full.tflite", "hand_landmark_lite.tflite", "left_hands.jpg", + "left_hands_rotated.jpg", "mobilenet_v1_0.25_192_quantized_1_default_1.tflite", "mobilenet_v1_0.25_224_1_default_1.tflite", "mobilenet_v1_0.25_224_1_metadata_1.tflite", @@ -52,7 +53,9 @@ mediapipe_files(srcs = [ "multi_objects_rotated.jpg", "palm_detection_full.tflite", "pointing_up.jpg", + "pointing_up_rotated.jpg", "right_hands.jpg", + "right_hands_rotated.jpg", "segmentation_golden_rotation0.png", "segmentation_input_rotation0.jpg", "selfie_segm_128_128_3.tflite", @@ -65,7 +68,9 @@ mediapipe_files(srcs = [ exports_files( srcs = [ "expected_left_down_hand_landmarks.prototxt", + "expected_left_down_hand_rotated_landmarks.prototxt", "expected_left_up_hand_landmarks.prototxt", + "expected_left_up_hand_rotated_landmarks.prototxt", "expected_right_down_hand_landmarks.prototxt", "expected_right_up_hand_landmarks.prototxt", ], @@ -85,11 +90,14 @@ filegroup( "hand_landmark_full.tflite", "hand_landmark_lite.tflite", "left_hands.jpg", + "left_hands_rotated.jpg", "mozart_square.jpg", "multi_objects.jpg", "multi_objects_rotated.jpg", "pointing_up.jpg", + "pointing_up_rotated.jpg", "right_hands.jpg", + "right_hands_rotated.jpg", "segmentation_golden_rotation0.png", "segmentation_input_rotation0.jpg", "selfie_segm_128_128_3_expected_mask.jpg", @@ -131,12 +139,17 @@ filegroup( name = "test_protos", srcs = [ "expected_left_down_hand_landmarks.prototxt", + "expected_left_down_hand_rotated_landmarks.prototxt", "expected_left_up_hand_landmarks.prototxt", + "expected_left_up_hand_rotated_landmarks.prototxt", "expected_right_down_hand_landmarks.prototxt", "expected_right_up_hand_landmarks.prototxt", "hand_detector_result_one_hand.pbtxt", + "hand_detector_result_one_hand_rotated.pbtxt", "hand_detector_result_two_hands.pbtxt", "pointing_up_landmarks.pbtxt", + "pointing_up_rotated_landmarks.pbtxt", "thumb_up_landmarks.pbtxt", + "thumb_up_rotated_landmarks.pbtxt", ], ) diff --git a/mediapipe/tasks/testdata/vision/expected_left_down_hand_rotated_landmarks.prototxt b/mediapipe/tasks/testdata/vision/expected_left_down_hand_rotated_landmarks.prototxt new file mode 100644 index 000000000..3cbf8804f --- /dev/null +++ b/mediapipe/tasks/testdata/vision/expected_left_down_hand_rotated_landmarks.prototxt @@ -0,0 +1,84 @@ +landmark { + x: 0.9259716 + y: 0.18969846 +} +landmark { + x: 0.88135517 + y: 0.28856543 +} +landmark { + x: 0.7600651 + y: 0.3578236 +} +landmark { + x: 0.62631166 + y: 0.40490413 +} +landmark { + x: 0.5374573 + y: 0.45170194 +} +landmark { + x: 0.57372385 + y: 0.29924914 +} +landmark { + x: 0.36731184 + y: 0.33081773 +} +landmark { + x: 0.24132833 + y: 0.34759054 +} +landmark { + x: 0.13690609 + y: 0.35727677 +} +landmark { + x: 0.5535803 + y: 0.2398035 +} +landmark { + x: 0.31834763 + y: 0.24999242 +} +landmark { + x: 0.16748133 + y: 0.25625145 +} +landmark { + x: 0.050747424 + y: 0.25991398 +} +landmark { + 
x: 0.56593156 + y: 0.1867483 +} +landmark { + x: 0.3543046 + y: 0.17923892 +} +landmark { + x: 0.21360746 + y: 0.17454882 +} +landmark { + x: 0.11110917 + y: 0.17232567 +} +landmark { + x: 0.5948908 + y: 0.14024714 +} +landmark { + x: 0.42692152 + y: 0.11949824 +} +landmark { + x: 0.32239118 + y: 0.106370345 +} +landmark { + x: 0.23672739 + y: 0.09432885 +} diff --git a/mediapipe/tasks/testdata/vision/expected_left_up_hand_rotated_landmarks.prototxt b/mediapipe/tasks/testdata/vision/expected_left_up_hand_rotated_landmarks.prototxt new file mode 100644 index 000000000..42eccbcc5 --- /dev/null +++ b/mediapipe/tasks/testdata/vision/expected_left_up_hand_rotated_landmarks.prototxt @@ -0,0 +1,84 @@ +landmark { + x: 0.06676084 + y: 0.8095678 +} +landmark { + x: 0.11359626 + y: 0.71148247 +} +landmark { + x: 0.23572624 + y: 0.6414506 +} +landmark { + x: 0.37323278 + y: 0.5959156 +} +landmark { + x: 0.46243322 + y: 0.55125874 +} +landmark { + x: 0.4205411 + y: 0.69531494 +} +landmark { + x: 0.62798893 + y: 0.66715276 +} +landmark { + x: 0.7568023 + y: 0.65208924 +} +landmark { + x: 0.86370826 + y: 0.6437276 +} +landmark { + x: 0.445136 + y: 0.75394773 +} +landmark { + x: 0.6787485 + y: 0.745853 +} +landmark { + x: 0.8290694 + y: 0.7412988 +} +landmark { + x: 0.94454145 + y: 0.7384017 +} +landmark { + x: 0.43516788 + y: 0.8082166 +} +landmark { + x: 0.6459554 + y: 0.81768996 +} +landmark { + x: 0.7875173 + y: 0.825062 +} +landmark { + x: 0.89249825 + y: 0.82850707 +} +landmark { + x: 0.40665048 + y: 0.8567925 +} +landmark { + x: 0.57228816 + y: 0.8802181 +} +landmark { + x: 0.6762071 + y: 0.8941581 +} +landmark { + x: 0.76453924 + y: 0.90583205 +} diff --git a/mediapipe/tasks/testdata/vision/hand_detector_result_one_hand_rotated.pbtxt b/mediapipe/tasks/testdata/vision/hand_detector_result_one_hand_rotated.pbtxt new file mode 100644 index 000000000..cec4d6166 --- /dev/null +++ b/mediapipe/tasks/testdata/vision/hand_detector_result_one_hand_rotated.pbtxt @@ -0,0 +1,33 @@ +detections { + label: "Palm" + score: 0.97115 + location_data { + format: RELATIVE_BOUNDING_BOX + relative_bounding_box { + xmin: 0.5198178 + ymin: 0.6467485 + width: 0.42467535 + height: 0.22546273 + } + } +} +detections { + label: "Palm" + score: 0.96701413 + location_data { + format: RELATIVE_BOUNDING_BOX + relative_bounding_box { + xmin: 0.024490356 + ymin: 0.12620124 + width: 0.43832153 + height: 0.23269764 + } + } +} +hand_rects { + x_center: 0.5760683 + y_center: 0.6829921 + height: 0.5862031 + width: 1.1048855 + rotation: -0.8250832 +} diff --git a/mediapipe/tasks/testdata/vision/pointing_up_rotated_landmarks.pbtxt b/mediapipe/tasks/testdata/vision/pointing_up_rotated_landmarks.pbtxt new file mode 100644 index 000000000..65bb11bc8 --- /dev/null +++ b/mediapipe/tasks/testdata/vision/pointing_up_rotated_landmarks.pbtxt @@ -0,0 +1,223 @@ +classifications { + classification { + score: 1.0 + label: "Left" + display_name: "Left" + } +} + +landmarks { + landmark { + x: 0.25546086 + y: 0.47584262 + z: 1.835341e-07 + } + landmark { + x: 0.3363011 + y: 0.54135 + z: -0.041144375 + } + landmark { + x: 0.4375146 + y: 0.57881975 + z: -0.06807727 + } + landmark { + x: 0.49603376 + y: 0.5263966 + z: -0.09387612 + } + landmark { + x: 0.5022822 + y: 0.4413827 + z: -0.1189948 + } + landmark { + x: 0.5569452 + y: 0.4724485 + z: -0.05138246 + } + landmark { + x: 0.6687125 + y: 0.47918057 + z: -0.09121969 + } + landmark { + x: 0.73666537 + y: 0.48318353 + z: -0.11703273 + } + landmark { + x: 0.7998315 + y: 0.4741413 + z: -0.1386424 + } + landmark 
{ + x: 0.5244063 + y: 0.39292705 + z: -0.061040796 + } + landmark { + x: 0.57215345 + y: 0.41514704 + z: -0.11967233 + } + landmark { + x: 0.4724468 + y: 0.45553637 + z: -0.13287684 + } + landmark { + x: 0.43794966 + y: 0.45210314 + z: -0.13210714 + } + landmark { + x: 0.47838163 + y: 0.33329 + z: -0.07421263 + } + landmark { + x: 0.51081127 + y: 0.35479474 + z: -0.13596693 + } + landmark { + x: 0.42433846 + y: 0.40486792 + z: -0.121291734 + } + landmark { + x: 0.40280548 + y: 0.39977497 + z: -0.09928809 + } + landmark { + x: 0.42269367 + y: 0.2798249 + z: -0.09064263 + } + landmark { + x: 0.45849988 + y: 0.3069861 + z: -0.12894689 + } + landmark { + x: 0.40754712 + y: 0.35153976 + z: -0.109160855 + } + landmark { + x: 0.38855004 + y: 0.3467068 + z: -0.08820164 + } +} + +world_landmarks { + landmark { + x: -0.08568013 + y: 0.016593203 + z: 0.036527164 + } + landmark { + x: -0.0565372 + y: 0.041761592 + z: 0.019493781 + } + landmark { + x: -0.031365488 + y: 0.05031186 + z: 0.0025481891 + } + landmark { + x: -0.008534161 + y: 0.04286737 + z: -0.024755282 + } + landmark { + x: -0.0047254 + y: 0.015748458 + z: -0.035581928 + } + landmark { + x: 0.013083893 + y: 0.024668094 + z: 0.0035934823 + } + landmark { + x: 0.04149521 + y: 0.024621274 + z: -0.0030611698 + } + landmark { + x: 0.06257473 + y: 0.025388625 + z: -0.010340984 + } + landmark { + x: 0.08009179 + y: 0.023082614 + z: -0.03162942 + } + landmark { + x: 0.006135068 + y: 0.000696786 + z: 0.0048212176 + } + landmark { + x: 0.01678449 + y: 0.0067061195 + z: -0.029920919 + } + landmark { + x: -0.008948593 + y: 0.016808286 + z: -0.03755109 + } + landmark { + x: -0.01789449 + y: 0.0153161455 + z: -0.012059977 + } + landmark { + x: -0.0061980113 + y: -0.017872887 + z: -0.002366997 + } + landmark { + x: -0.004643807 + y: -0.0108282855 + z: -0.034515083 + } + landmark { + x: -0.027603384 + y: 0.003529715 + z: -0.033665676 + } + landmark { + x: -0.035679806 + y: 0.0038255951 + z: -0.008094264 + } + landmark { + x: -0.02957782 + y: -0.031701155 + z: -0.008180461 + } + landmark { + x: -0.020741666 + y: -0.02506058 + z: -0.026839724 + } + landmark { + x: -0.0310834 + y: -0.009496164 + z: -0.032422185 + } + landmark { + x: -0.037420202 + y: -0.012883307 + z: -0.017971724 + } +} diff --git a/mediapipe/tasks/testdata/vision/thumb_up_rotated_landmarks.pbtxt b/mediapipe/tasks/testdata/vision/thumb_up_rotated_landmarks.pbtxt new file mode 100644 index 000000000..3636e2e4d --- /dev/null +++ b/mediapipe/tasks/testdata/vision/thumb_up_rotated_landmarks.pbtxt @@ -0,0 +1,223 @@ +classifications { + classification { + score: 1.0 + label: "Left" + display_name: "Left" + } +} + +landmarks { + landmark { + x: 0.3283601 + y: 0.63773525 + z: -3.2280354e-07 + } + landmark { + x: 0.46280807 + y: 0.6339767 + z: -0.06408348 + } + landmark { + x: 0.5831279 + y: 0.57430106 + z: -0.08583106 + } + landmark { + x: 0.6689471 + y: 0.49959752 + z: -0.09886064 + } + landmark { + x: 0.74378216 + y: 0.47357544 + z: -0.09680563 + } + landmark { + x: 0.5233122 + y: 0.41020474 + z: -0.038088404 + } + landmark { + x: 0.5296913 + y: 0.3372598 + z: -0.08874837 + } + landmark { + x: 0.49039274 + y: 0.43994758 + z: -0.102315836 + } + landmark { + x: 0.4824569 + y: 0.47969607 + z: -0.1030014 + } + landmark { + x: 0.4451338 + y: 0.39520803 + z: -0.02177739 + } + landmark { + x: 0.4410001 + y: 0.34107083 + z: -0.07294245 + } + landmark { + x: 0.4162798 + y: 0.46102384 + z: -0.07746907 + } + landmark { + x: 0.43492994 + y: 0.47154287 + z: -0.07404131 + } + landmark { + x: 0.37671578 + y: 
0.39535576 + z: -0.016277775 + } + landmark { + x: 0.36978847 + y: 0.34265152 + z: -0.07346253 + } + landmark { + x: 0.3559884 + y: 0.44905427 + z: -0.057693005 + } + landmark { + x: 0.37711847 + y: 0.46414754 + z: -0.03662908 + } + landmark { + x: 0.3142985 + y: 0.3942253 + z: -0.0152847925 + } + landmark { + x: 0.30000874 + y: 0.35543376 + z: -0.046002634 + } + landmark { + x: 0.30002704 + y: 0.42357764 + z: -0.032671776 + } + landmark { + x: 0.31079838 + y: 0.44218025 + z: -0.016200554 + } +} + +world_landmarks { + landmark { + x: -0.030687196 + y: 0.0678545 + z: 0.051061403 + } + landmark { + x: 0.0047719833 + y: 0.06330968 + z: 0.018945374 + } + landmark { + x: 0.039799504 + y: 0.054109577 + z: 0.007930638 + } + landmark { + x: 0.069374144 + y: 0.035063196 + z: 2.2522348e-05 + } + landmark { + x: 0.087818466 + y: 0.018390425 + z: 0.004055788 + } + landmark { + x: 0.02810654 + y: 0.0043561812 + z: -0.0038672548 + } + landmark { + x: 0.025270049 + y: -0.0039896416 + z: -0.032991238 + } + landmark { + x: 0.020414166 + y: 0.006768506 + z: -0.032724563 + } + landmark { + x: 0.016415983 + y: 0.024563588 + z: -0.0058115427 + } + landmark { + x: 0.0038743173 + y: -0.0044466974 + z: 0.0024876352 + } + landmark { + x: 0.0041790796 + y: -0.0115309935 + z: -0.03532454 + } + landmark { + x: -0.0016900161 + y: 0.015519895 + z: -0.03596156 + } + landmark { + x: 0.004309217 + y: 0.01917039 + z: 0.003907912 + } + landmark { + x: -0.016969737 + y: -0.005584497 + z: 0.0034258277 + } + landmark { + x: -0.016737012 + y: -0.01159037 + z: -0.02876696 + } + landmark { + x: -0.018165365 + y: 0.01376111 + z: -0.026835402 + } + landmark { + x: -0.012430167 + y: 0.02064222 + z: -0.00087265146 + } + landmark { + x: -0.043247573 + y: 0.0011161827 + z: 0.0056269006 + } + landmark { + x: -0.038128495 + y: -0.011477032 + z: -0.016374081 + } + landmark { + x: -0.034920715 + y: 0.005510211 + z: -0.029714659 + } + landmark { + x: -0.03815982 + y: 0.011989757 + z: -0.014853194 + } +} diff --git a/third_party/external_files.bzl b/third_party/external_files.bzl index b85d93318..2c92293ff 100644 --- a/third_party/external_files.bzl +++ b/third_party/external_files.bzl @@ -151,7 +151,7 @@ def external_files(): http_file( name = "com_google_mediapipe_dummy_gesture_recognizer_task", sha256 = "18e54586bda33300d459ca140cd045f6daf43d897224ba215a16db3423eae18e", - urls = ["https://storage.googleapis.com/mediapipe-assets/dummy_gesture_recognizer.task?generation=1665524417056146"], + urls = ["https://storage.googleapis.com/mediapipe-assets/dummy_gesture_recognizer.task?generation=1665707319890725"], ) http_file( @@ -166,12 +166,24 @@ def external_files(): urls = ["https://storage.googleapis.com/mediapipe-assets/expected_left_down_hand_landmarks.prototxt?generation=1661875720230540"], ) + http_file( + name = "com_google_mediapipe_expected_left_down_hand_rotated_landmarks_prototxt", + sha256 = "a16d6cb8dd07d60f0678ddeb6a7447b73b9b03d4ddde365c8770b472205bb6cf", + urls = ["https://storage.googleapis.com/mediapipe-assets/expected_left_down_hand_rotated_landmarks.prototxt?generation=1666037061297507"], + ) + http_file( name = "com_google_mediapipe_expected_left_up_hand_landmarks_prototxt", sha256 = "1353ba617c4f048083618587cd23a8a22115f634521c153d4e1bd1ebd4f49dd7", urls = ["https://storage.googleapis.com/mediapipe-assets/expected_left_up_hand_landmarks.prototxt?generation=1661875726008879"], ) + http_file( + name = "com_google_mediapipe_expected_left_up_hand_rotated_landmarks_prototxt", + sha256 = 
"a9b9789c274d48a7cb9cc10af7bc644eb2512bb934529790d0a5404726daa86a", + urls = ["https://storage.googleapis.com/mediapipe-assets/expected_left_up_hand_rotated_landmarks.prototxt?generation=1666037063443676"], + ) + http_file( name = "com_google_mediapipe_expected_right_down_hand_landmarks_prototxt", sha256 = "f281b745175aaa7f458def6cf4c89521fb56302dd61a05642b3b4a4f237ffaa3", @@ -250,6 +262,12 @@ def external_files(): urls = ["https://storage.googleapis.com/mediapipe-assets/hand_detector_result_one_hand.pbtxt?generation=1662745351291628"], ) + http_file( + name = "com_google_mediapipe_hand_detector_result_one_hand_rotated_pbtxt", + sha256 = "ff5ca0654028d78a3380df90054273cae79abe1b7369b164063fd1d5758ec370", + urls = ["https://storage.googleapis.com/mediapipe-assets/hand_detector_result_one_hand_rotated.pbtxt?generation=1666037065601724"], + ) + http_file( name = "com_google_mediapipe_hand_detector_result_two_hands_pbtxt", sha256 = "2589cb08b0ee027dc24649fe597adcfa2156a21d12ea2480f83832714ebdf95f", @@ -352,6 +370,12 @@ def external_files(): urls = ["https://storage.googleapis.com/mediapipe-assets/left_hands.jpg?generation=1661875796949017"], ) + http_file( + name = "com_google_mediapipe_left_hands_rotated_jpg", + sha256 = "8609c6202bca43a99bbf23fa8e687e49fa525e89481152e4c0987f46d60d7931", + urls = ["https://storage.googleapis.com/mediapipe-assets/left_hands_rotated.jpg?generation=1666037068103465"], + ) + http_file( name = "com_google_mediapipe_mobilebert_embedding_with_metadata_tflite", sha256 = "fa47142dcc6f446168bc672f2df9605b6da5d0c0d6264e9be62870282365b95c", @@ -544,6 +568,18 @@ def external_files(): urls = ["https://storage.googleapis.com/mediapipe-assets/pointing_up_landmarks.pbtxt?generation=1665174976408451"], ) + http_file( + name = "com_google_mediapipe_pointing_up_rotated_jpg", + sha256 = "50ff66f50281207072a038e5bb6648c43f4aacbfb8204a4d2591868756aaeff1", + urls = ["https://storage.googleapis.com/mediapipe-assets/pointing_up_rotated.jpg?generation=1666037072219697"], + ) + + http_file( + name = "com_google_mediapipe_pointing_up_rotated_landmarks_pbtxt", + sha256 = "ccf67e5867094ffb6c465a4dfbf2ef1eb3f9db2465803fc25a0b84c958e050de", + urls = ["https://storage.googleapis.com/mediapipe-assets/pointing_up_rotated_landmarks.pbtxt?generation=1666037074376515"], + ) + http_file( name = "com_google_mediapipe_pose_detection_tflite", sha256 = "a63c614bef30d35947f13be361820b1e4e3bec9cfeebf4d11216a18373108e85", @@ -580,6 +616,12 @@ def external_files(): urls = ["https://storage.googleapis.com/mediapipe-assets/right_hands.jpg?generation=1661875908672404"], ) + http_file( + name = "com_google_mediapipe_right_hands_rotated_jpg", + sha256 = "b3bdf692f0d54b86c8b67e6d1286dd0078fbe6e9dfcd507b187e3bd8b398c0f9", + urls = ["https://storage.googleapis.com/mediapipe-assets/right_hands_rotated.jpg?generation=1666037076873345"], + ) + http_file( name = "com_google_mediapipe_score_calibration_file_meta_json", sha256 = "6a3c305620371f662419a496f75be5a10caebca7803b1e99d8d5d22ba51cda94", @@ -724,6 +766,12 @@ def external_files(): urls = ["https://storage.googleapis.com/mediapipe-assets/thumb_up_landmarks.pbtxt?generation=1665174979747784"], ) + http_file( + name = "com_google_mediapipe_thumb_up_rotated_landmarks_pbtxt", + sha256 = "5d0a465959cacbd201ac8dd8fc8a66c5997a172b71809b12d27296db6a28a102", + urls = ["https://storage.googleapis.com/mediapipe-assets/thumb_up_rotated_landmarks.pbtxt?generation=1666037079490527"], + ) + http_file( name = "com_google_mediapipe_two_heads_16000_hz_mono_wav", sha256 = 
"a291a9c22c39bba30138a26915e154a96286ba6ca3b413053123c504a58cce3b",