diff --git a/mediapipe/tasks/cc/components/containers/BUILD b/mediapipe/tasks/cc/components/containers/BUILD
index ca688caf7..33d3e4457 100644
--- a/mediapipe/tasks/cc/components/containers/BUILD
+++ b/mediapipe/tasks/cc/components/containers/BUILD
@@ -20,3 +20,12 @@ cc_library(
     name = "landmarks_detection",
     hdrs = ["landmarks_detection.h"],
 )
+
+cc_library(
+    name = "gesture_recognition_result",
+    hdrs = ["gesture_recognition_result.h"],
+    deps = [
+        "//mediapipe/framework/formats:classification_cc_proto",
+        "//mediapipe/framework/formats:landmark_cc_proto",
+    ],
+)
diff --git a/mediapipe/tasks/cc/components/containers/gesture_recognition_result.h b/mediapipe/tasks/cc/components/containers/gesture_recognition_result.h
new file mode 100644
index 000000000..4e2e8d775
--- /dev/null
+++ b/mediapipe/tasks/cc/components/containers/gesture_recognition_result.h
@@ -0,0 +1,48 @@
+/* Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef MEDIAPIPE_TASKS_CC_COMPONENTS_CONTAINERS_GESTURE_RECOGNITION_RESULT_H_
+#define MEDIAPIPE_TASKS_CC_COMPONENTS_CONTAINERS_GESTURE_RECOGNITION_RESULT_H_
+
+#include <vector>
+
+#include "mediapipe/framework/formats/classification.pb.h"
+#include "mediapipe/framework/formats/landmark.pb.h"
+
+namespace mediapipe {
+namespace tasks {
+namespace components {
+namespace containers {
+
+// The gesture recognition result from GestureRecognizer, where each vector
+// element represents a single hand detected in the image.
+struct GestureRecognitionResult {
+  // Recognized hand gestures with sorted order such that the winning label is
+  // the first item in the list.
+  std::vector<mediapipe::ClassificationList> gestures;
+  // Classification of handedness.
+  std::vector<mediapipe::ClassificationList> handedness;
+  // Detected hand landmarks in normalized image coordinates.
+  std::vector<mediapipe::NormalizedLandmarkList> hand_landmarks;
+  // Detected hand landmarks in world coordinates.
+  std::vector<mediapipe::LandmarkList> hand_world_landmarks;
+};
+
+}  // namespace containers
+}  // namespace components
+}  // namespace tasks
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_TASKS_CC_COMPONENTS_CONTAINERS_GESTURE_RECOGNITION_RESULT_H_
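The four vectors in this container are index-aligned: entry i of each field describes the i-th detected hand. For orientation, a minimal sketch of how a caller might read it; the PrintResult helper is illustrative only and not part of this change.

#include <cstdio>

#include "mediapipe/tasks/cc/components/containers/gesture_recognition_result.h"

// Illustrative helper: prints the winning gesture and handedness per hand.
void PrintResult(
    const mediapipe::tasks::components::containers::GestureRecognitionResult&
        result) {
  for (size_t i = 0; i < result.gestures.size(); ++i) {
    if (result.gestures[i].classification_size() == 0) continue;
    // Gestures are sorted, so classification(0) carries the winning label.
    const auto& top_gesture = result.gestures[i].classification(0);
    const auto& handedness = result.handedness[i].classification(0);
    std::printf("hand %zu (%s): %s, score %.2f, %d landmarks\n", i,
                handedness.label().c_str(), top_gesture.label().c_str(),
                top_gesture.score(), result.hand_landmarks[i].landmark_size());
  }
}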
+ "//mediapipe/tasks/cc/vision/hand_landmarker/proto:hand_landmarker_graph_options_cc_proto", + "//mediapipe/tasks/cc/vision/hand_landmarker/proto:hand_landmarks_detector_graph_options_cc_proto", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@org_tensorflow//tensorflow/lite/core/api:op_resolver", + "@org_tensorflow//tensorflow/lite/kernels:builtin_ops", + ], +) diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc new file mode 100644 index 000000000..ca5deee7f --- /dev/null +++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.cc @@ -0,0 +1,282 @@ +/* Copyright 2022 The MediaPipe Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "mediapipe/framework/api2/builder.h" +#include "mediapipe/framework/api2/port.h" +#include "mediapipe/framework/formats/classification.pb.h" +#include "mediapipe/framework/formats/image.h" +#include "mediapipe/framework/formats/landmark.pb.h" +#include "mediapipe/framework/packet.h" +#include "mediapipe/tasks/cc/common.h" +#include "mediapipe/tasks/cc/components/image_preprocessing.h" +#include "mediapipe/tasks/cc/components/processors/proto/classifier_options.pb.h" +#include "mediapipe/tasks/cc/core/base_task_api.h" +#include "mediapipe/tasks/cc/core/model_resources.h" +#include "mediapipe/tasks/cc/core/proto/inference_subgraph.pb.h" +#include "mediapipe/tasks/cc/core/task_runner.h" +#include "mediapipe/tasks/cc/core/utils.h" +#include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h" +#include "mediapipe/tasks/cc/vision/core/vision_task_api_factory.h" +#include "mediapipe/tasks/cc/vision/gesture_recognizer/proto/gesture_recognizer_graph_options.pb.h" +#include "mediapipe/tasks/cc/vision/gesture_recognizer/proto/hand_gesture_recognizer_graph_options.pb.h" +#include "mediapipe/tasks/cc/vision/hand_detector/proto/hand_detector_graph_options.pb.h" +#include "mediapipe/tasks/cc/vision/hand_landmarker/proto/hand_landmarker_graph_options.pb.h" +#include "mediapipe/tasks/cc/vision/hand_landmarker/proto/hand_landmarks_detector_graph_options.pb.h" + +namespace mediapipe { +namespace tasks { +namespace vision { +namespace gesture_recognizer { + +namespace { + +using GestureRecognizerGraphOptionsProto = ::mediapipe::tasks::vision:: + gesture_recognizer::proto::GestureRecognizerGraphOptions; + +using ::mediapipe::tasks::components::containers::GestureRecognitionResult; + +constexpr char kHandGestureSubgraphTypeName[] = + "mediapipe.tasks.vision.gesture_recognizer.GestureRecognizerGraph"; + +constexpr char kImageTag[] = "IMAGE"; +constexpr char kImageInStreamName[] = "image_in"; 
+constexpr char kImageOutStreamName[] = "image_out";
+constexpr char kHandGesturesTag[] = "HAND_GESTURES";
+constexpr char kHandGesturesStreamName[] = "hand_gestures";
+constexpr char kHandednessTag[] = "HANDEDNESS";
+constexpr char kHandednessStreamName[] = "handedness";
+constexpr char kHandLandmarksTag[] = "LANDMARKS";
+constexpr char kHandLandmarksStreamName[] = "landmarks";
+constexpr char kHandWorldLandmarksTag[] = "WORLD_LANDMARKS";
+constexpr char kHandWorldLandmarksStreamName[] = "world_landmarks";
+constexpr int kMicroSecondsPerMilliSecond = 1000;
+
+// Creates a MediaPipe graph config that contains a subgraph node of
+// "mediapipe.tasks.vision.gesture_recognizer.GestureRecognizerGraph". If the
+// task is running in the live stream mode, a "FlowLimiterCalculator" will be
+// added to limit the number of frames in flight.
+CalculatorGraphConfig CreateGraphConfig(
+    std::unique_ptr<GestureRecognizerGraphOptionsProto> options,
+    bool enable_flow_limiting) {
+  api2::builder::Graph graph;
+  auto& subgraph = graph.AddNode(kHandGestureSubgraphTypeName);
+  subgraph.GetOptions<GestureRecognizerGraphOptionsProto>().Swap(
+      options.get());
+  graph.In(kImageTag).SetName(kImageInStreamName);
+  subgraph.Out(kHandGesturesTag).SetName(kHandGesturesStreamName) >>
+      graph.Out(kHandGesturesTag);
+  subgraph.Out(kHandednessTag).SetName(kHandednessStreamName) >>
+      graph.Out(kHandednessTag);
+  subgraph.Out(kHandLandmarksTag).SetName(kHandLandmarksStreamName) >>
+      graph.Out(kHandLandmarksTag);
+  subgraph.Out(kHandWorldLandmarksTag).SetName(kHandWorldLandmarksStreamName) >>
+      graph.Out(kHandWorldLandmarksTag);
+  subgraph.Out(kImageTag).SetName(kImageOutStreamName) >> graph.Out(kImageTag);
+  if (enable_flow_limiting) {
+    return tasks::core::AddFlowLimiterCalculator(graph, subgraph, {kImageTag},
+                                                 kHandGesturesTag);
+  }
+  graph.In(kImageTag) >> subgraph.In(kImageTag);
+  return graph.GetConfig();
+}
+
+// Converts the user-facing GestureRecognizerOptions struct to the internal
+// GestureRecognizerGraphOptions proto.
+std::unique_ptr<GestureRecognizerGraphOptionsProto>
+ConvertGestureRecognizerGraphOptionsProto(GestureRecognizerOptions* options) {
+  auto options_proto = std::make_unique<GestureRecognizerGraphOptionsProto>();
+
+  bool use_stream_mode = options->running_mode != core::RunningMode::IMAGE;
+
+  // TODO remove these workarounds for base options of subgraphs.
+  // Configure hand detector options.
+  auto base_options_proto_for_hand_detector =
+      std::make_unique<tasks::core::proto::BaseOptions>(
+          tasks::core::ConvertBaseOptionsToProto(
+              &(options->base_options_for_hand_detector)));
+  base_options_proto_for_hand_detector->set_use_stream_mode(use_stream_mode);
+  auto* hand_detector_graph_options =
+      options_proto->mutable_hand_landmarker_graph_options()
+          ->mutable_hand_detector_graph_options();
+  hand_detector_graph_options->mutable_base_options()->Swap(
+      base_options_proto_for_hand_detector.get());
+  hand_detector_graph_options->set_num_hands(options->num_hands);
+  hand_detector_graph_options->set_min_detection_confidence(
+      options->min_hand_detection_confidence);
+
+  // Configure hand landmark detector options.
+  auto base_options_proto_for_hand_landmarker =
+      std::make_unique<tasks::core::proto::BaseOptions>(
+          tasks::core::ConvertBaseOptionsToProto(
+              &(options->base_options_for_hand_landmarker)));
+  base_options_proto_for_hand_landmarker->set_use_stream_mode(use_stream_mode);
+  auto* hand_landmarks_detector_graph_options =
+      options_proto->mutable_hand_landmarker_graph_options()
+          ->mutable_hand_landmarks_detector_graph_options();
+  hand_landmarks_detector_graph_options->mutable_base_options()->Swap(
+      base_options_proto_for_hand_landmarker.get());
+  hand_landmarks_detector_graph_options->set_min_detection_confidence(
+      options->min_hand_presence_confidence);
+
+  auto* hand_landmarker_graph_options =
+      options_proto->mutable_hand_landmarker_graph_options();
+  hand_landmarker_graph_options->set_min_tracking_confidence(
+      options->min_tracking_confidence);
+
+  // Configure hand gesture recognizer options.
+  auto base_options_proto_for_gesture_recognizer =
+      std::make_unique<tasks::core::proto::BaseOptions>(
+          tasks::core::ConvertBaseOptionsToProto(
+              &(options->base_options_for_gesture_recognizer)));
+  base_options_proto_for_gesture_recognizer->set_use_stream_mode(
+      use_stream_mode);
+  auto* hand_gesture_recognizer_graph_options =
+      options_proto->mutable_hand_gesture_recognizer_graph_options();
+  hand_gesture_recognizer_graph_options->mutable_base_options()->Swap(
+      base_options_proto_for_gesture_recognizer.get());
+  if (options->min_gesture_confidence >= 0) {
+    hand_gesture_recognizer_graph_options->mutable_classifier_options()
+        ->set_score_threshold(options->min_gesture_confidence);
+  }
+  return options_proto;
+}
+
+}  // namespace
+
+absl::StatusOr<std::unique_ptr<GestureRecognizer>> GestureRecognizer::Create(
+    std::unique_ptr<GestureRecognizerOptions> options) {
+  auto options_proto = ConvertGestureRecognizerGraphOptionsProto(options.get());
+  tasks::core::PacketsCallback packets_callback = nullptr;
+  if (options->result_callback) {
+    auto result_callback = options->result_callback;
+    packets_callback = [=](absl::StatusOr<tasks::core::PacketMap>
+                               status_or_packets) {
+      if (!status_or_packets.ok()) {
+        Image image;
+        result_callback(status_or_packets.status(), image,
+                        Timestamp::Unset().Value());
+        return;
+      }
+      if (status_or_packets.value()[kImageOutStreamName].IsEmpty()) {
+        return;
+      }
+      Packet gesture_packet =
+          status_or_packets.value()[kHandGesturesStreamName];
+      Packet handedness_packet =
+          status_or_packets.value()[kHandednessStreamName];
+      Packet hand_landmarks_packet =
+          status_or_packets.value()[kHandLandmarksStreamName];
+      Packet hand_world_landmarks_packet =
+          status_or_packets.value()[kHandWorldLandmarksStreamName];
+      Packet image_packet = status_or_packets.value()[kImageOutStreamName];
+      result_callback(
+          {{gesture_packet.Get<std::vector<ClassificationList>>(),
+            handedness_packet.Get<std::vector<ClassificationList>>(),
+            hand_landmarks_packet.Get<std::vector<NormalizedLandmarkList>>(),
+            hand_world_landmarks_packet.Get<std::vector<LandmarkList>>()}},
+          image_packet.Get<Image>(),
+          gesture_packet.Timestamp().Value() / kMicroSecondsPerMilliSecond);
+    };
+  }
+  return core::VisionTaskApiFactory::Create<GestureRecognizer,
+                                            GestureRecognizerGraphOptionsProto>(
+      CreateGraphConfig(
+          std::move(options_proto),
+          options->running_mode == core::RunningMode::LIVE_STREAM),
+      std::move(options->base_options.op_resolver), options->running_mode,
+      std::move(packets_callback));
+}
+
+absl::StatusOr<GestureRecognitionResult> GestureRecognizer::Recognize(
+    mediapipe::Image image) {
+  if (image.UsesGpu()) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "GPU input images are currently not supported.",
+        MediaPipeTasksStatus::kRunnerUnexpectedInputError);
+  }
+  ASSIGN_OR_RETURN(auto output_packets,
+                   ProcessImageData({{kImageInStreamName,
+                                      MakePacket<Image>(std::move(image))}}));
+  return {
+      {/* gestures= */
+       {output_packets[kHandGesturesStreamName]
+            .Get<std::vector<ClassificationList>>()},
+       /* handedness= */
+       {output_packets[kHandednessStreamName]
+            .Get<std::vector<ClassificationList>>()},
+       /* hand_landmarks= */
+       {output_packets[kHandLandmarksStreamName]
+            .Get<std::vector<NormalizedLandmarkList>>()},
+       /* hand_world_landmarks= */
+       {output_packets[kHandWorldLandmarksStreamName]
+            .Get<std::vector<LandmarkList>>()}},
+  };
+}
+
+absl::StatusOr<GestureRecognitionResult> GestureRecognizer::RecognizeForVideo(
+    mediapipe::Image image, int64 timestamp_ms) {
+  if (image.UsesGpu()) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "GPU input images are currently not supported.",
+        MediaPipeTasksStatus::kRunnerUnexpectedInputError);
+  }
+  ASSIGN_OR_RETURN(
+      auto output_packets,
+      ProcessVideoData(
+          {{kImageInStreamName,
+            MakePacket<Image>(std::move(image))
+                .At(Timestamp(timestamp_ms * kMicroSecondsPerMilliSecond))}}));
+  return {
+      {/* gestures= */
+       {output_packets[kHandGesturesStreamName]
+            .Get<std::vector<ClassificationList>>()},
+       /* handedness= */
+       {output_packets[kHandednessStreamName]
+            .Get<std::vector<ClassificationList>>()},
+       /* hand_landmarks= */
+       {output_packets[kHandLandmarksStreamName]
+            .Get<std::vector<NormalizedLandmarkList>>()},
+       /* hand_world_landmarks= */
+       {output_packets[kHandWorldLandmarksStreamName]
+            .Get<std::vector<LandmarkList>>()}},
+  };
+}
+
+absl::Status GestureRecognizer::RecognizeAsync(mediapipe::Image image,
+                                               int64 timestamp_ms) {
+  if (image.UsesGpu()) {
+    return CreateStatusWithPayload(
+        absl::StatusCode::kInvalidArgument,
+        "GPU input images are currently not supported.",
+        MediaPipeTasksStatus::kRunnerUnexpectedInputError);
+  }
+  return SendLiveStreamData(
+      {{kImageInStreamName,
+        MakePacket<Image>(std::move(image))
+            .At(Timestamp(timestamp_ms * kMicroSecondsPerMilliSecond))}});
+}
+
+}  // namespace gesture_recognizer
+}  // namespace vision
+}  // namespace tasks
+}  // namespace mediapipe
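The kMicroSecondsPerMilliSecond constant above fixes the timestamp contract: callers pass milliseconds, and the task scales them onto MediaPipe's microsecond Timestamp. A rough sketch of driving the video mode under that contract, assuming a recognizer already created in video mode; DecodeFrameAt is a hypothetical decoder, and the 33 ms step assumes a roughly 30 fps source.

// Feed decoded frames with monotonically increasing millisecond timestamps.
for (int i = 0; i < num_frames; ++i) {
  mediapipe::Image frame = DecodeFrameAt(i);  // Hypothetical helper.
  const int64 timestamp_ms = static_cast<int64>(i) * 33;  // ~30 fps spacing.
  auto result = recognizer->RecognizeForVideo(frame, timestamp_ms);
  if (!result.ok()) return result.status();
  // result->gestures[h] holds the sorted gesture candidates for hand h.
}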
diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h
new file mode 100644
index 000000000..17c9cc921
--- /dev/null
+++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h
@@ -0,0 +1,173 @@
+/* Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef MEDIAPIPE_TASKS_CC_VISION_GESTURE_RECOGNIZER_GESTURE_RECOGNIZER_H_
+#define MEDIAPIPE_TASKS_CC_VISION_GESTURE_RECOGNIZER_GESTURE_RECOGNIZER_H_
+
+#include <functional>
+#include <memory>
+
+#include "absl/status/statusor.h"
+#include "mediapipe/framework/formats/classification.pb.h"
+#include "mediapipe/framework/formats/image.h"
+#include "mediapipe/framework/formats/landmark.pb.h"
+#include "mediapipe/tasks/cc/components/containers/gesture_recognition_result.h"
+#include "mediapipe/tasks/cc/core/base_options.h"
+#include "mediapipe/tasks/cc/vision/core/base_vision_task_api.h"
+#include "mediapipe/tasks/cc/vision/core/running_mode.h"
+
+namespace mediapipe {
+namespace tasks {
+namespace vision {
+namespace gesture_recognizer {
+
+struct GestureRecognizerOptions {
+  // Base options for configuring the Task library, such as specifying the
+  // TfLite model file with metadata, accelerator options, op resolver, etc.
+  tasks::core::BaseOptions base_options;
+
+  // TODO: remove these. Temporary solutions before bundle asset is
+  // ready.
+  tasks::core::BaseOptions base_options_for_hand_landmarker;
+  tasks::core::BaseOptions base_options_for_hand_detector;
+  tasks::core::BaseOptions base_options_for_gesture_recognizer;
+
+  // The running mode of the task. Defaults to the image mode.
+  // GestureRecognizer has three running modes:
+  // 1) The image mode for recognizing hand gestures on single image inputs.
+  // 2) The video mode for recognizing hand gestures on the decoded frames of
+  //    a video.
+  // 3) The live stream mode for recognizing hand gestures on the live stream
+  //    of input data, such as from a camera. In this mode, the
+  //    "result_callback" below must be specified to receive the detection
+  //    results asynchronously.
+  core::RunningMode running_mode = core::RunningMode::IMAGE;
+
+  // The maximum number of hands that can be detected by the
+  // GestureRecognizer.
+  int num_hands = 1;
+
+  // The minimum confidence score for the hand detection to be considered
+  // successful.
+  float min_hand_detection_confidence = 0.5;
+
+  // The minimum confidence score of hand presence in the hand landmark
+  // detection.
+  float min_hand_presence_confidence = 0.5;
+
+  // The minimum confidence score for the hand tracking to be considered
+  // successful.
+  float min_tracking_confidence = 0.5;
+
+  // The minimum confidence score for the gestures to be considered
+  // successful. If < 0, the gesture confidence thresholds in the model
+  // metadata are used.
+  // TODO Note this option is subject to change, after scoring
+  // merging calculator is implemented.
+  float min_gesture_confidence = -1;
+
+  // The user-defined result callback for processing live stream data.
+  // The result callback should only be specified when the running mode is set
+  // to RunningMode::LIVE_STREAM.
+  std::function<void(
+      absl::StatusOr<components::containers::GestureRecognitionResult>,
+      const Image&, int64)>
+      result_callback = nullptr;
+};
+
+// Performs hand gesture recognition on the given image.
+//
+// TODO add the link to DevSite.
+// This API expects a pre-trained hand gesture model asset bundle, or a
+// custom one created using Model Maker. See .
+//
+// Inputs:
+//   Image
+//     - The image that gesture recognition runs on.
+// Outputs:
+//   GestureRecognitionResult
+//     - The hand gesture recognition results.
+class GestureRecognizer : tasks::vision::core::BaseVisionTaskApi {
+ public:
+  using BaseVisionTaskApi::BaseVisionTaskApi;
+
+  // Creates a GestureRecognizer from a GestureRecognizerOptions to process
+  // image data or streaming data. The gesture recognizer can be created with
+  // one of the following three running modes:
+  // 1) Image mode for recognizing gestures on single image inputs. Users
+  //    provide mediapipe::Image to the `Recognize` method, and will receive
+  //    the recognized hand gesture results as the return value.
+  // 2) Video mode for recognizing gestures on the decoded frames of a video.
+  // 3) Live stream mode for recognizing gestures on the live stream of the
+  //    input data, such as from a camera. Users call `RecognizeAsync` to push
+  //    the image data into the GestureRecognizer; the recognized results,
+  //    along with the input timestamp and the image that the gesture
+  //    recognizer runs on, will be available in the result callback when the
+  //    gesture recognizer finishes the work.
+  static absl::StatusOr<std::unique_ptr<GestureRecognizer>> Create(
+      std::unique_ptr<GestureRecognizerOptions> options);
+
+  // Performs hand gesture recognition on the given image.
+  // Only use this method when the GestureRecognizer is created with the image
+  // running mode.
+  //
+  // image - mediapipe::Image
+  //   Image to perform hand gesture recognition on.
+  //
+  // The image can be of any size with format RGB or RGBA.
+  // TODO: describe how the input image will be preprocessed once YUV support
+  // is implemented.
+  absl::StatusOr<components::containers::GestureRecognitionResult> Recognize(
+      Image image);
+
+  // Performs gesture recognition on the provided video frame.
+  // Only use this method when the GestureRecognizer is created with the video
+  // running mode.
+  //
+  // The image can be of any size with format RGB or RGBA. It's required to
+  // provide the video frame's timestamp (in milliseconds). The input
+  // timestamps must be monotonically increasing.
+  absl::StatusOr<components::containers::GestureRecognitionResult>
+  RecognizeForVideo(Image image, int64 timestamp_ms);
+
+  // Sends live image data to perform gesture recognition, and the results
+  // will be available via the "result_callback" provided in the
+  // GestureRecognizerOptions. Only use this method when the GestureRecognizer
+  // is created with the live stream running mode.
+  //
+  // The image can be of any size with format RGB or RGBA. It's required to
+  // provide a timestamp (in milliseconds) to indicate when the input image is
+  // sent to the gesture recognizer. The input timestamps must be
+  // monotonically increasing.
+  //
+  // The "result_callback" provides
+  //   - A GestureRecognitionResult for the input frame, where each vector
+  //     element in the result corresponds to one detected hand.
+  //   - The const reference to the corresponding input image that the gesture
+  //     recognizer runs on. Note that the const reference to the image will
+  //     no longer be valid when the callback returns. To access the image
+  //     data outside of the callback, callers need to make a copy of the
+  //     image.
+  //   - The input timestamp in milliseconds.
+  absl::Status RecognizeAsync(Image image, int64 timestamp_ms);
+
+  // Shuts down the GestureRecognizer when all the work is done.
+  absl::Status Close() { return runner_->Close(); }
+};
+
+}  // namespace gesture_recognizer
+}  // namespace vision
+}  // namespace tasks
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_TASKS_CC_VISION_GESTURE_RECOGNIZER_GESTURE_RECOGNIZER_H_
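Putting the header together, a minimal image-mode sketch. The model paths are placeholders, the per-subgraph base options mirror the temporary fields called out in the TODO above, and the model_asset_path field name is assumed from the BaseOptions struct in tasks/cc/core/base_options.h.

#include <memory>
#include <utility>

#include "absl/status/status.h"
#include "mediapipe/framework/formats/image.h"
#include "mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer.h"

namespace gr = ::mediapipe::tasks::vision::gesture_recognizer;

absl::Status RunOnImage(mediapipe::Image image) {
  auto options = std::make_unique<gr::GestureRecognizerOptions>();
  // Placeholder asset paths; substitute real model files.
  options->base_options_for_hand_detector.model_asset_path =
      "/path/to/hand_detector.tflite";
  options->base_options_for_hand_landmarker.model_asset_path =
      "/path/to/hand_landmarker.tflite";
  options->base_options_for_gesture_recognizer.model_asset_path =
      "/path/to/gesture_recognizer.tflite";
  options->num_hands = 2;
  // running_mode defaults to IMAGE, so Recognize() is the right entry point.

  auto recognizer = gr::GestureRecognizer::Create(std::move(options));
  if (!recognizer.ok()) return recognizer.status();
  auto result = (*recognizer)->Recognize(std::move(image));
  if (!result.ok()) return result.status();
  // result->gestures, result->handedness, result->hand_landmarks and
  // result->hand_world_landmarks each hold one entry per detected hand.
  return (*recognizer)->Close();
}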
diff --git a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc
index 30c28bee3..b4f2af4d8 100644
--- a/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc
+++ b/mediapipe/tasks/cc/vision/gesture_recognizer/gesture_recognizer_graph.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "mediapipe/framework/formats/image.h"
 #include "mediapipe/framework/formats/landmark.pb.h"
 #include "mediapipe/tasks/cc/common.h"
-#include "mediapipe/tasks/cc/components/containers/proto/classifications.pb.h"
 #include "mediapipe/tasks/cc/core/model_task_graph.h"
 #include "mediapipe/tasks/cc/core/utils.h"
 #include "mediapipe/tasks/cc/vision/gesture_recognizer/proto/gesture_recognizer_graph_options.pb.h"
@@ -46,7 +45,6 @@ using ::mediapipe::api2::Input;
 using ::mediapipe::api2::Output;
 using ::mediapipe::api2::builder::Graph;
 using ::mediapipe::api2::builder::Source;
-using ::mediapipe::tasks::components::containers::proto::ClassificationResult;
 using ::mediapipe::tasks::vision::gesture_recognizer::proto::
     GestureRecognizerGraphOptions;
 using ::mediapipe::tasks::vision::gesture_recognizer::proto::
@@ -63,10 +61,10 @@ constexpr char kHandGesturesTag[] = "HAND_GESTURES";
 constexpr char kHandTrackingIdsTag[] = "HAND_TRACKING_IDS";
 
 struct GestureRecognizerOutputs {
-  Source<std::vector<ClassificationResult>> gesture;
-  Source<std::vector<mediapipe::ClassificationList>> handedness;
-  Source<std::vector<mediapipe::NormalizedLandmarkList>> hand_landmarks;
-  Source<std::vector<mediapipe::LandmarkList>> hand_world_landmarks;
+  Source<std::vector<ClassificationList>> gesture;
+  Source<std::vector<ClassificationList>> handedness;
+  Source<std::vector<NormalizedLandmarkList>> hand_landmarks;
+  Source<std::vector<LandmarkList>> hand_world_landmarks;
   Source<Image> image;
 };
 
@@ -80,7 +78,7 @@ struct GestureRecognizerOutputs {
 //   Image to perform hand gesture recognition on.
 //
 // Outputs:
-//   HAND_GESTURES - std::vector<ClassificationResult>
+//   HAND_GESTURES - std::vector<ClassificationList>
 //     Recognized hand gestures with sorted order such that the winning label
 //     is the first item in the list.
 //   LANDMARKS - std::vector<NormalizedLandmarkList>
@@ -136,15 +134,13 @@ class GestureRecognizerGraph : public core::ModelTaskGraph {
                          *sc->MutableOptions<GestureRecognizerGraphOptions>(),
                          graph[Input<Image>(kImageTag)], graph));
     hand_gesture_recognition_output.gesture >>
-        graph[Output<std::vector<ClassificationResult>>(kHandGesturesTag)];
+        graph[Output<std::vector<ClassificationList>>(kHandGesturesTag)];
     hand_gesture_recognition_output.handedness >>
-        graph[Output<std::vector<mediapipe::ClassificationList>>(
-            kHandednessTag)];
+        graph[Output<std::vector<ClassificationList>>(kHandednessTag)];
     hand_gesture_recognition_output.hand_landmarks >>
-        graph[Output<std::vector<mediapipe::NormalizedLandmarkList>>(
-            kLandmarksTag)];
+        graph[Output<std::vector<NormalizedLandmarkList>>(kLandmarksTag)];
     hand_gesture_recognition_output.hand_world_landmarks >>
-        graph[Output<std::vector<mediapipe::LandmarkList>>(kWorldLandmarksTag)];
+        graph[Output<std::vector<LandmarkList>>(kWorldLandmarksTag)];
     hand_gesture_recognition_output.image >> graph[Output<Image>(kImageTag)];
     return graph.GetConfig();
   }
@@ -193,7 +189,7 @@ class GestureRecognizerGraph : public core::ModelTaskGraph {
     image_size >> hand_gesture_subgraph.In(kImageSizeTag);
     hand_landmarks_id >> hand_gesture_subgraph.In(kHandTrackingIdsTag);
     auto hand_gestures =
-        hand_gesture_subgraph[Output<std::vector<ClassificationResult>>(
+        hand_gesture_subgraph[Output<std::vector<ClassificationList>>(
            kHandGesturesTag)];
 
     return {{.gesture = hand_gestures,
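With this change, HAND_GESTURES carries one ClassificationList per detected hand instead of a single ClassificationResult, which is exactly what the live stream callback in gesture_recognizer.h surfaces. A sketch of consuming it, with the rest of the options setup elided; frames would then be pushed via RecognizeAsync.

using ::mediapipe::tasks::components::containers::GestureRecognitionResult;

options->running_mode =
    mediapipe::tasks::vision::core::RunningMode::LIVE_STREAM;
options->result_callback = [](absl::StatusOr<GestureRecognitionResult> result,
                              const mediapipe::Image& /*frame*/,
                              int64 timestamp_ms) {
  if (!result.ok()) return;  // Recognition failed for this frame.
  // One ClassificationList per hand; entry 0 is the winning gesture.
  for (const auto& gestures : result->gestures) {
    if (gestures.classification_size() > 0) {
      std::printf("%lld ms: %s\n", static_cast<long long>(timestamp_ms),
                  gestures.classification(0).label().c_str());
    }
  }
};
// Later, once per frame: recognizer->RecognizeAsync(frame, timestamp_ms);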
diff --git a/mediapipe/tasks/testdata/vision/BUILD b/mediapipe/tasks/testdata/vision/BUILD
index 5eda42601..290b29016 100644
--- a/mediapipe/tasks/testdata/vision/BUILD
+++ b/mediapipe/tasks/testdata/vision/BUILD
@@ -47,6 +47,7 @@ mediapipe_files(srcs = [
    "mozart_square.jpg",
    "multi_objects.jpg",
    "palm_detection_full.tflite",
+    "pointing_up.jpg",
    "right_hands.jpg",
    "segmentation_golden_rotation0.png",
    "segmentation_input_rotation0.jpg",
@@ -54,6 +55,7 @@
    "selfie_segm_128_128_3_expected_mask.jpg",
    "selfie_segm_144_256_3.tflite",
    "selfie_segm_144_256_3_expected_mask.jpg",
+    "thumb_up.jpg",
 ])
 
 exports_files(
@@ -79,11 +81,13 @@ filegroup(
        "left_hands.jpg",
        "mozart_square.jpg",
        "multi_objects.jpg",
+        "pointing_up.jpg",
        "right_hands.jpg",
        "segmentation_golden_rotation0.png",
        "segmentation_input_rotation0.jpg",
        "selfie_segm_128_128_3_expected_mask.jpg",
        "selfie_segm_144_256_3_expected_mask.jpg",
+        "thumb_up.jpg",
    ],
    visibility = [
        "//mediapipe/python:__subpackages__",
diff --git a/mediapipe/tasks/testdata/vision/pointing_up_landmarks.pbtxt b/mediapipe/tasks/testdata/vision/pointing_up_landmarks.pbtxt
index fdd8b9c8d..05917af3e 100644
--- a/mediapipe/tasks/testdata/vision/pointing_up_landmarks.pbtxt
+++ b/mediapipe/tasks/testdata/vision/pointing_up_landmarks.pbtxt
@@ -8,216 +8,216 @@ classifications {
 landmarks {
   landmark {
-    x: 0.4749803
-    y: 0.76872
-    z: 9.286178e-08
+    x: 0.47923622
+    y: 0.7426044
+    z: 2.3221878e-07
   }
   landmark {
-    x: 0.5466898
-    y: 0.6706463
-    z: -0.03454024
+    x: 0.5403745
+    y: 0.66178805
+    z: -0.044572093
   }
   landmark {
-    x: 0.5890165
-    y: 0.5604909
-    z: -0.055142127
+    x: 0.5774534
+    y: 0.5608346
+    z: -0.07581605
   }
   landmark {
-    x: 0.52780133
-    y: 0.49855334
-    z: -0.07846409
+    x: 0.52648556
+    y: 0.50247055
+    z: -0.105467044
   }
   landmark {
-    x: 0.44487286
-    y: 0.49801928
-    z: -0.10188004
+    x: 0.44289914
+    y: 0.49489295
+    z: -0.13422011
   }
   landmark {
-    x: 0.47572923
-    y: 0.44477755
-    z: -0.028345175
+    x: 0.4728853
+    y: 0.43925008
+    z: -0.058122505
   }
   landmark {
-    x: 0.48013464
-    y: 0.32467923
-    z: -0.06513901
+    x: 0.4803168
+    y: 0.32889345
+    z: -0.101187326
   }
   landmark {
-    x: 0.48351905
-    y: 0.25804192
-    z: -0.086756624
+    x: 0.48436823
+    y: 0.25876504
+    z: -0.12840955
   }
   landmark {
-    x: 0.47760454
-    y: 0.19289327
-    z: -0.10468461
+    x: 0.47388697
+    y: 0.19592366
+    z: -0.15085006
   }
   landmark {
-    x: 0.3993108
-    y: 0.47566867
-    z: -0.040357687
+    x: 0.39129356
+    y: 0.47211456
+    z: -0.06835801
   }
  landmark {
-    x: 0.42361537
-    y: 0.42491958
-    z: -0.103545874
+    x: 0.41798547
+    y: 0.42218646
+    z: -0.12954563
   }
   landmark {
-    x: 0.46059948
-    y: 0.51723665
-    z: -0.1214961
+    x: 0.45758423
+    y: 0.5232461
+    z: -0.14131334
   }
   landmark {
-    x: 0.4580545
-    y: 0.55640894
-    z: -0.12272568
+    x: 0.45100626
+    y: 0.5554065
+    z: -0.13883406
   }
   landmark {
-    x: 0.34109607
-    y: 0.5184511
-    z: -0.056422118
+    x: 0.33133638
+    y: 0.51777464
+    z: -0.08227023
   }
   landmark {
-    x: 0.36177525
-    y: 0.48427337
-    z: -0.12584248
+    x: 0.35698116
+    y: 0.48688585
+    z: -0.14713185
   }
   landmark {
-    x: 0.40706652
-    y: 0.5700621
-    z: -0.11658718
+    x: 0.40754414
+    y: 0.57370347
+    z: -0.12981415
   }
   landmark {
-    x: 0.40535083
-    y: 0.6000496
-    z: -0.09520916
+    x: 0.40011865
+    y: 0.5930706
+    z: -0.10554546
   }
   landmark {
-    x: 0.2872031
-    y: 0.57303333
-    z: -0.074813806
+    x: 0.2783401
+    y: 0.5735568
+    z: -0.09971398
   }
   landmark {
-    x: 0.30961618
-    y: 0.533245
-    z: -0.114366606
+    x: 0.30884498
+    y: 0.5394487
+    z: -0.14033116
   }
   landmark {
-    x: 0.35510173
-    y: 0.5838698
-    z: -0.096521005
+    x: 0.35470563
+    y: 0.5917965
+    z: -0.11820527
   }
   landmark {
-    x: 0.36053744
-    y: 0.608682
-    z: -0.07574715
+    x: 0.34865493
+    y: 0.61057556
+    z: -0.09509217
   }
 }
 world_landmarks {
   landmark {
-    x: 0.018890835
-    y: 0.09005852
-    z: 0.031907097
+    x: 0.016918864
+    y: 0.08634466
+    z: 0.035783045
   }
   landmark {
-    x: 0.04198891
-    y: 0.061256267
-    z: 0.017695501
+    x: 0.04193685
+    y: 0.056667875
+    z: 0.019453367
   }
   landmark {
-    x: 0.05044507
-    y: 0.033841074
-    z: 0.0015051212
+    x: 0.050382353
+    y: 0.031786427
+    z: 0.0023380776
   }
   landmark {
-    x: 0.039822325
-    y: 0.0073827556
-    z: -0.02168335
+    x: 0.043284662
+    y: 0.008976387
+    z: -0.02496663
   }
   landmark {
-    x: 0.012921701
-    y: 0.0025111444
-    z: -0.033813436
+    x: 0.016010094
+    y: 0.004991216
+    z: -0.036876947
   }
   landmark {
-    x: 0.023851154
-    y: -0.011495698
-    z: 0.0066048754
+    x: 0.02450771
+    y: -0.013496464
+    z: 0.0041254223
   }
   landmark {
-    x: 0.023206754
-    y: -0.042496294
-    z: -0.0026847485
+    x: 0.024783865
+    y: -0.041331705
+    z: -0.0028748964
   }
   landmark {
-    x: 0.02298078
-    y: -0.062678955
-    z: -0.013068148
+    x: 0.025917178
+    y: -0.06191107
+    z: -0.010242647
   }
   landmark {
-    x: 0.021972645
-    y: -0.08151748
-    z: -0.03677687
+    x: 0.023101516
+    y: -0.07967696
+    z: -0.03152665
   }
   landmark {
-    x: -0.00016964211
-    y: -0.005549716
-    z: 0.0058569373
+    x: 0.0006629339
+    y: -0.0060150283
+    z: 0.004906766
   }
   landmark {
-    x: 0.0075052455
-    y: -0.020031122
-    z: -0.027775772
+    x: 0.0077093104
+    y: -0.017035034
+    z: -0.029702934
   }
   landmark {
-    x: 0.017835317
-    y: 0.004899453
-    z: -0.037390795
+    x: 0.017517095
+    y: 0.008997183
+    z: -0.03692814
   }
   landmark {
-    x: 0.016913192
-    y: 0.018281722
-    z: -0.019302163
+    x: 0.0145079205
+    y: 0.017461296
+    z: -0.011290487
   }
   landmark {
-    x: -0.018799124
-    y: 0.0053577404
-    z: -0.0040608873
+    x: -0.018095909
+    y: 0.006112392
+    z: -0.0027157406
   }
   landmark {
-    x: -0.00747582
-    y: 0.0019600953
-    z: -0.034023333
+    x: -0.010212201
+    y: 0.0052777785
+    z: -0.034659054
   }
   landmark {
-    x: 0.0035368819
-    y: 0.025736088
-    z: -0.03452471
+    x: 0.0043836404
+    y: 0.028383566
+    z: -0.03296758
   }
   landmark {
-    x: 0.0080153765
-    y: 0.039885145
-    z: -0.013341276
+    x: 0.003886811
+    y: 0.036054
+    z: -0.0074628904
   }
   landmark {
-    x: -0.029628165
-    y: 0.028607829
-    z: -0.011377414
+    x: -0.03178849
+    y: 0.029854178
+    z: -0.008874044
  }
   landmark {
-    x: -0.023356002
-    y: 0.017514031
-    z: -0.029408533
+    x: -0.02403016
+    y: 0.021497255
+    z: -0.027618393
   }
   landmark {
-    x: -0.008503268
-    y: 0.027560957
-    z: -0.035641473
+    x: -0.008522437
+    y: 0.031886857
+    z: -0.032367583
   }
   landmark {
-    x: -0.0070180474
-    y: 0.039056484
-    z: -0.023629948
+    x: -0.012865841
+    y: 0.038687646
+    z: -0.017172804
   }
 }
diff --git a/mediapipe/tasks/testdata/vision/thumb_up_landmarks.pbtxt b/mediapipe/tasks/testdata/vision/thumb_up_landmarks.pbtxt
index 00b47a3da..e73a69d31 100644
--- a/mediapipe/tasks/testdata/vision/thumb_up_landmarks.pbtxt
+++ b/mediapipe/tasks/testdata/vision/thumb_up_landmarks.pbtxt
@@ -8,216 +8,216 @@ classifications {
 landmarks {
   landmark {
-    x: 0.6065784
-    y: 0.7356081
-    z: -5.2289305e-08
+    x: 0.6387502
+    y: 0.67134184
+    z: -3.4044612e-07
   }
   landmark {
-    x: 0.6349347
-    y: 0.5735343
-    z: -0.047243003
+    x: 0.634891
+    y: 0.53670025
+    z: -0.06968865
   }
   landmark {
-    x: 0.5788341
-    y: 0.42688707
-    z: -0.036071796
+    x: 0.5746676
+    y: 0.41283816
+    z: -0.09383486
   }
   landmark {
-    x: 0.51322824
-    y: 0.3153786
-    z: -0.021018881
+    x: 0.49967948
+    y: 0.32550922
+    z: -0.10799447
   }
   landmark {
-    x: 0.49179295
-    y: 0.25291175
-    z: 0.0061425082
+    x: 0.47362617
+    y: 0.25102285
+    z: -0.10590933
   }
   landmark {
-    x: 0.49944243
-    y: 0.45409226
-    z: 0.06513325
+    x: 0.40749234
+    y: 0.47130388
+    z: -0.04694611
   }
   landmark {
-    x: 0.3822241
-    y: 0.45645967
-    z: 0.045028925
+    x: 0.3372087
+    y: 0.46742308
+    z: -0.0997342
   }
   landmark {
-    x: 0.4427338
-    y: 0.49150866
-    z: 0.024395633
+    x: 0.4418445
+    y: 0.50960016
+    z: -0.111206524
   }
   landmark {
-    x: 0.5015556
-    y: 0.4798539
-    z: 0.014423937
+    x: 0.48056933
+    y: 0.5187666
+    z: -0.11022365
   }
   landmark {
-    x: 0.46654877
-    y: 0.5420721
-    z: 0.08380699
+    x: 0.39218128
+    y: 0.5495232
+    z: -0.028925514
   }
   landmark {
-    x: 0.3540949
-    y: 0.545657
-    z: 0.056201216
+    x: 0.34047198
+    y: 0.55610204
+    z: -0.08213869
   }
   landmark {
-    x: 0.43828446
-    y: 0.5723222
-    z: 0.03073385
+    x: 0.46152583
+    y: 0.58310646
+    z: -0.08393028
   }
   landmark {
-    x: 0.4894746
-    y: 0.54662794
-    z: 0.016284892
+    x: 0.47058716
+    y: 0.56413835
+    z: -0.078857616
   }
   landmark {
-    x: 0.44287524
-    y: 0.6153337
-    z: 0.0878331
+    x: 0.39237642
+    y: 0.61864823
+    z: -0.022026168
   }
   landmark {
-    x: 0.3531985
-    y: 0.6305228
-    z: 0.048528627
+    x: 0.34304678
+    y: 0.62800515
+    z: -0.08132204
   }
   landmark {
-    x: 0.42727134
-    y: 0.64344436
-    z: 0.027383275
+    x: 0.45004016
+    y: 0.64300805
+    z: -0.06211204
   }
   landmark {
-    x: 0.46999624
-    y: 0.61115295
-    z: 0.021795912
+    x: 0.4640005
+    y: 0.6221539
+    z: -0.038953774
   }
   landmark {
-    x: 0.43323213
-    y: 0.6734935
-    z: 0.087731235
+    x: 0.39231628
+    y: 0.68187976
+    z: -0.020164328
  }
   landmark {
-    x: 0.3772134
-    y: 0.69590896
-    z: 0.07259013
+    x: 0.35785866
+    y: 0.6985842
+    z: -0.052247807
   }
   landmark {
-    x: 0.42301077
-    y: 0.70083475
-    z: 0.06279105
+    x: 0.42698768
+    y: 0.69892275
+    z: -0.037642766
   }
   landmark {
-    x: 0.45672464
-    y: 0.6844607
-    z: 0.059202813
+    x: 0.44422707
+    y: 0.6876204
+    z: -0.02034688
   }
 }
 world_landmarks {
   landmark {
-    x: 0.047059614
-    y: 0.04719348
-    z: 0.03951376
+    x: 0.06753889
+    y: 0.031051591
+    z: 0.05541924
   }
   landmark {
-    x: 0.050449535
-    y: 0.012183173
-    z: 0.016567508
+    x: 0.06327636
+    y: -0.003913434
+    z: 0.02125023
   }
   landmark {
-    x: 0.04375921
-    y: -0.020305036
-    z: 0.012189768
+    x: 0.05469646
+    y: -0.038668767
+    z: 0.01118496
   }
   landmark {
-    x: 0.022525383
-    y: -0.04830697
-    z: 0.008714083
+    x: 0.03557241
+    y: -0.06865983
+    z: 0.0029562893
   }
   landmark {
-    x: 0.011789754
-    y: -0.06952699
-    z: 0.0029319536
+    x: 0.019069858
+    y: -0.08740239
+    z: 0.007222481
   }
   landmark {
-    x: 0.009532374
-    y: -0.019510617
-    z: 0.0015609035
+    x: 0.0044852756
+    y: -0.02772763
+    z: -0.004234833
   }
   landmark {
-    x: -0.007894232
-    y: -0.022080563
-    z: -0.014592148
+    x: -0.0031203926
+    y: -0.024173645
+    z: -0.033932913
   }
   landmark {
-    x: -0.002826123
-    y: -0.019949362
-    z: -0.009392118
+    x: 0.0080217365
+    y: -0.018939625
+    z: -0.032623816
   }
   landmark {
-    x: 0.009066351
-    y: -0.016403511
-    z: 0.005516675
+    x: 0.025537387
+    y: -0.014517117
+    z: -0.004398854
   }
   landmark {
-    x: -0.0031000748
-    y: -0.003971943
-    z: 0.004851345
+    x: -0.004470923
+    y: -0.0040212176
+    z: 0.0025033879
   }
   landmark {
-    x: -0.016852753
-    y: -0.009905987
-    z: -0.016275175
+    x: -0.010845158
+    y: -0.0031857258
+    z: -0.036282137
   }
   landmark {
-    x: -0.006703893
-    y: -0.0026965735
-    z: -0.015606856
+    x: 0.016729971
+    y: 0.0028876318
+    z: -0.036264844
   }
   landmark {
-    x: 0.007890566
-    y: -0.010418876
-    z: 0.0050479355
+    x: 0.019928008
+    y: -0.0032422952
+    z: 0.004380459
   }
   landmark {
-    x: -0.007842411
-    y: 0.011552694
-    z: -0.0005755241
+    x: -0.005686749
+    y: 0.017101247
+    z: 0.0036791638
   }
   landmark {
-    x: -0.021125216
-    y: 0.009268615
-    z: -0.017993882
+    x: -0.010514952
+    y: 0.017355483
+    z: -0.02882688
   }
   landmark {
-    x: -0.006585305
-    y: 0.013378072
-    z: -0.01709412
+    x: 0.014503509
+    y: 0.019414417
+    z: -0.026207235
   }
   landmark {
-    x: 0.008140431
-    y: 0.008364402
-    z: -0.0051898304
+    x: 0.0211232
+    y: 0.014327417
+    z: 0.0011467658
   }
   landmark {
-    x: -0.01082343
-    y: 0.03213215
-    z: -0.00069864903
+    x: 0.0011399705
+    y: 0.043651186
+    z: 0.0068390737
   }
   landmark {
-    x: -0.0199164
-    y: 0.028296603
-    z: -0.01447433
+    x: -0.010388309
+    y: 0.03904784
+    z: -0.015677728
   }
   landmark {
-    x: -0.00960456
-    y: 0.026734762
-    z: -0.019243335
+    x: 0.006957108
+    y: 0.03613425
+    z: -0.028704688
   }
   landmark {
-    x: 0.0040425956
-    y: 0.025051914
-    z: -0.014775545
+    x: 0.012793289
+    y: 0.03930679
+    z: -0.012465539
   }
 }
diff --git a/third_party/external_files.bzl b/third_party/external_files.bzl
index 24ceba639..c9a499b66 100644
--- a/third_party/external_files.bzl
+++ b/third_party/external_files.bzl
@@ -432,8 +432,8 @@ def external_files():
 
     http_file(
         name = "com_google_mediapipe_pointing_up_landmarks_pbtxt",
-        sha256 = "1255b6ba17b4ef7a9b3ce92c0a139e74fbcec272dc251b049b2f06732f9fed83",
-        urls = ["https://storage.googleapis.com/mediapipe-assets/pointing_up_landmarks.pbtxt?generation=1662650664573638"],
+        sha256 = "a3cd7f088a9e997dbb8f00d91dbf3faaacbdb262c8f2fde3c07a9d0656488065",
+        urls = ["https://storage.googleapis.com/mediapipe-assets/pointing_up_landmarks.pbtxt?generation=1665174976408451"],
    )
 
    http_file(
@@ -588,8 +588,8 @@ def external_files():
 
     http_file(
         name = "com_google_mediapipe_thumb_up_landmarks_pbtxt",
-        sha256 = "bf1913df6ac7cc14b492c10411c827832839985c057b112789e04ce7c1fdd0fa",
-        urls = ["https://storage.googleapis.com/mediapipe-assets/thumb_up_landmarks.pbtxt?generation=1662650669387278"],
+        sha256 = "b129ae0536be4e25d6cdee74aabe9dedf1bcfe87430a40b68be4079db3a4d926",
+        urls = ["https://storage.googleapis.com/mediapipe-assets/thumb_up_landmarks.pbtxt?generation=1665174979747784"],
    )
 
    http_file(