From 8f7e36b3448cc64b92cc970e78d50b58b3dfe864 Mon Sep 17 00:00:00 2001
From: liuyulvv
Date: Fri, 12 Aug 2022 09:59:47 +0800
Subject: [PATCH] holistic: support ONNX Runtime CUDA and TensorRT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../examples/desktop/holistic_tracking/BUILD | 32 +++
 mediapipe/graphs/holistic_tracking/BUILD | 24 ++
 .../holistic_tracking_onnx_cuda.pbtxt | 75 +++++++
 .../holistic_tracking_onnx_tensorrt.pbtxt | 75 +++++++
 mediapipe/modules/hand_landmark/BUILD | 40 ++++
 .../hand_landmark_onnx_cuda.pbtxt | 205 ++++++++++++++++++
 .../hand_landmark_onnx_tensorrt.pbtxt | 205 ++++++++++++++++++
 mediapipe/modules/holistic_landmark/BUILD | 142 ++++++++++++
 .../face_landmarks_from_pose_onnx_cuda.pbtxt | 82 +++++++
 ...ce_landmarks_from_pose_onnx_tensorrt.pbtxt | 82 +++++++
 .../hand_landmarks_from_pose_onnx_cuda.pbtxt | 78 +++++++
 ...nd_landmarks_from_pose_onnx_tensorrt.pbtxt | 78 +++++++
 ...d_landmarks_left_and_right_onnx_cuda.pbtxt | 76 +++++++
 ...ndmarks_left_and_right_onnx_tensorrt.pbtxt | 76 +++++++
 .../hand_recrop_by_roi_onnx_cuda.pbtxt | 137 ++++++++++++
 .../hand_recrop_by_roi_onnx_tensorrt.pbtxt | 137 ++++++++++++
 .../holistic_landmark_onnx_cuda.pbtxt | 146 +++++++++++++
 .../holistic_landmark_onnx_tensorrt.pbtxt | 146 +++++++++++++
 18 files changed, 1836 insertions(+)
 create mode 100644 mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_cuda.pbtxt
 create mode 100644 mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/hand_landmark/hand_landmark_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/hand_landmark/hand_landmark_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/holistic_landmark_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/holistic_landmark/holistic_landmark_onnx_tensorrt.pbtxt

diff --git a/mediapipe/examples/desktop/holistic_tracking/BUILD b/mediapipe/examples/desktop/holistic_tracking/BUILD
index 55c29d118..c00f8b842 100644
--- a/mediapipe/examples/desktop/holistic_tracking/BUILD
+++ b/mediapipe/examples/desktop/holistic_tracking/BUILD
@@ -32,6 +32,38 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "holistic_tracking_onnx_cuda",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main",
+        "//mediapipe/graphs/holistic_tracking:holistic_tracking_onnx_cuda_graph_deps",
+    ],
+)
+
+cc_binary(
+    name = "holistic_tracking_onnx_cuda_fps",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
+        "//mediapipe/graphs/holistic_tracking:holistic_tracking_onnx_cuda_graph_deps",
+    ],
+)
+
+cc_binary(
+    name = "holistic_tracking_onnx_tensorrt",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main",
+        "//mediapipe/graphs/holistic_tracking:holistic_tracking_onnx_tensorrt_graph_deps",
+    ],
+)
+
+cc_binary(
+    name = "holistic_tracking_onnx_tensorrt_fps",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
+        "//mediapipe/graphs/holistic_tracking:holistic_tracking_onnx_tensorrt_graph_deps",
+    ],
+)
+
 # Linux only
 cc_binary(
     name = "holistic_tracking_gpu",
diff --git a/mediapipe/graphs/holistic_tracking/BUILD b/mediapipe/graphs/holistic_tracking/BUILD
index dec521de3..8c6cd5ff1 100644
--- a/mediapipe/graphs/holistic_tracking/BUILD
+++ b/mediapipe/graphs/holistic_tracking/BUILD
@@ -68,3 +68,27 @@ cc_library(
         "//mediapipe/modules/holistic_landmark:holistic_landmark_cpu",
     ],
 )
+
+cc_library(
+    name = "holistic_tracking_onnx_cuda_graph_deps",
+    deps = [
+        ":holistic_tracking_to_render_data",
+        "//mediapipe/calculators/core:constant_side_packet_calculator",
+        "//mediapipe/calculators/core:flow_limiter_calculator",
+        "//mediapipe/calculators/image:image_properties_calculator",
+        "//mediapipe/calculators/util:annotation_overlay_calculator",
+        "//mediapipe/modules/holistic_landmark:holistic_landmark_onnx_cuda",
+    ],
+)
+
+cc_library(
+    name = "holistic_tracking_onnx_tensorrt_graph_deps",
+    deps = [
+        ":holistic_tracking_to_render_data",
+        "//mediapipe/calculators/core:constant_side_packet_calculator",
+        "//mediapipe/calculators/core:flow_limiter_calculator",
+        "//mediapipe/calculators/image:image_properties_calculator",
+        "//mediapipe/calculators/util:annotation_overlay_calculator",
+        "//mediapipe/modules/holistic_landmark:holistic_landmark_onnx_tensorrt",
+    ],
+)
diff --git a/mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_cuda.pbtxt b/mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_cuda.pbtxt
new file mode 100644
index 000000000..c15a23c90
--- /dev/null
+++ b/mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_cuda.pbtxt
@@ -0,0 +1,75 @@
+# Tracks and renders pose + hands + face landmarks.
+
+# CPU image. (ImageFrame)
+input_stream: "input_video"
+
+# CPU image with rendered results. (ImageFrame)
+output_stream: "output_video"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, unwanted
+# in real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+  node_options: {
+    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
+      max_in_flight: 1
+      max_in_queue: 1
+      # Timeout is disabled (set to 0) as first frame processing can take more
+      # than 1 second.
+      in_flight_timeout: 0
+    }
+  }
+}
+
+node {
+  calculator: "HolisticLandmarkOnnxCUDA"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "POSE_LANDMARKS:pose_landmarks"
+  output_stream: "POSE_ROI:pose_roi"
+  output_stream: "POSE_DETECTION:pose_detection"
+  output_stream: "FACE_LANDMARKS:face_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+}
+
+# Gets image size.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "SIZE:image_size"
+}
+
+# Converts pose, hands and face landmarks to a render data vector.
+node {
+  calculator: "HolisticTrackingToRenderData"
+  input_stream: "IMAGE_SIZE:image_size"
+  input_stream: "POSE_LANDMARKS:pose_landmarks"
+  input_stream: "POSE_ROI:pose_roi"
+  input_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+  input_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+  input_stream: "FACE_LANDMARKS:face_landmarks"
+  output_stream: "RENDER_DATA_VECTOR:render_data_vector"
+}
+
+# Draws annotations and overlays them on top of the input images.
+node {
+  calculator: "AnnotationOverlayCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "VECTOR:render_data_vector"
+  output_stream: "IMAGE:output_video"
+}
diff --git a/mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_tensorrt.pbtxt b/mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..4ab2019dd
--- /dev/null
+++ b/mediapipe/graphs/holistic_tracking/holistic_tracking_onnx_tensorrt.pbtxt
@@ -0,0 +1,75 @@
+# Tracks and renders pose + hands + face landmarks.
+
+# CPU image. (ImageFrame)
+input_stream: "input_video"
+
+# CPU image with rendered results. (ImageFrame)
+output_stream: "output_video"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, unwanted
+# in real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+  node_options: {
+    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
+      max_in_flight: 1
+      max_in_queue: 1
+      # Timeout is disabled (set to 0) as first frame processing can take more
+      # than 1 second.
+      in_flight_timeout: 0
+    }
+  }
+}
+
+node {
+  calculator: "HolisticLandmarkOnnxTensorRT"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "POSE_LANDMARKS:pose_landmarks"
+  output_stream: "POSE_ROI:pose_roi"
+  output_stream: "POSE_DETECTION:pose_detection"
+  output_stream: "FACE_LANDMARKS:face_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+}
+
+# Gets image size.
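+# (Note: the size is consumed by HolisticTrackingToRenderData below, e.g. to
+# map normalized landmark coordinates to pixels when building render data.)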
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "SIZE:image_size"
+}
+
+# Converts pose, hands and face landmarks to a render data vector.
+node {
+  calculator: "HolisticTrackingToRenderData"
+  input_stream: "IMAGE_SIZE:image_size"
+  input_stream: "POSE_LANDMARKS:pose_landmarks"
+  input_stream: "POSE_ROI:pose_roi"
+  input_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+  input_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+  input_stream: "FACE_LANDMARKS:face_landmarks"
+  output_stream: "RENDER_DATA_VECTOR:render_data_vector"
+}
+
+# Draws annotations and overlays them on top of the input images.
+node {
+  calculator: "AnnotationOverlayCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "VECTOR:render_data_vector"
+  output_stream: "IMAGE:output_video"
+}
diff --git a/mediapipe/modules/hand_landmark/BUILD b/mediapipe/modules/hand_landmark/BUILD
index 6e5c49390..13a173552 100644
--- a/mediapipe/modules/hand_landmark/BUILD
+++ b/mediapipe/modules/hand_landmark/BUILD
@@ -59,6 +59,46 @@ mediapipe_simple_subgraph(
     ],
 )
 
+mediapipe_simple_subgraph(
+    name = "hand_landmark_onnx_cuda",
+    graph = "hand_landmark_onnx_cuda.pbtxt",
+    register_as = "HandLandmarkOnnxCUDA",
+    deps = [
+        ":hand_landmark_model_loader",
+        "//mediapipe/calculators/core:gate_calculator",
+        "//mediapipe/calculators/core:split_vector_calculator",
+        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
+        "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda",
+        "//mediapipe/calculators/tensor:tensors_to_classification_calculator",
+        "//mediapipe/calculators/tensor:tensors_to_floats_calculator",
+        "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator",
+        "//mediapipe/calculators/util:landmark_letterbox_removal_calculator",
+        "//mediapipe/calculators/util:landmark_projection_calculator",
+        "//mediapipe/calculators/util:thresholding_calculator",
+        "//mediapipe/calculators/util:world_landmark_projection_calculator",
+    ],
+)
+
+mediapipe_simple_subgraph(
+    name = "hand_landmark_onnx_tensorrt",
+    graph = "hand_landmark_onnx_tensorrt.pbtxt",
+    register_as = "HandLandmarkOnnxTensorRT",
+    deps = [
+        ":hand_landmark_model_loader",
+        "//mediapipe/calculators/core:gate_calculator",
+        "//mediapipe/calculators/core:split_vector_calculator",
+        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
+        "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt",
+        "//mediapipe/calculators/tensor:tensors_to_classification_calculator",
+        "//mediapipe/calculators/tensor:tensors_to_floats_calculator",
+        "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator",
+        "//mediapipe/calculators/util:landmark_letterbox_removal_calculator",
+        "//mediapipe/calculators/util:landmark_projection_calculator",
+        "//mediapipe/calculators/util:thresholding_calculator",
+        "//mediapipe/calculators/util:world_landmark_projection_calculator",
+    ],
+)
+
 mediapipe_simple_subgraph(
     name = "hand_landmark_gpu",
     graph = "hand_landmark_gpu.pbtxt",
diff --git a/mediapipe/modules/hand_landmark/hand_landmark_onnx_cuda.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_onnx_cuda.pbtxt
new file mode 100644
index 000000000..88fee74f7
--- /dev/null
+++ b/mediapipe/modules/hand_landmark/hand_landmark_onnx_cuda.pbtxt
@@ -0,0 +1,205 @@
+# MediaPipe graph to detect/predict hand landmarks using ONNX Runtime with the
+# CUDA execution provider.
+
+type: "HandLandmarkOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+# ROI (region of interest) within the given image where a palm/hand is located.
+# (NormalizedRect)
+input_stream: "ROI:hand_rect"
+
+# 21 hand landmarks within the given ROI. (NormalizedLandmarkList)
+# NOTE: if a hand is not present within the given ROI, for this particular
+# timestamp there will not be an output packet in the LANDMARKS stream. However,
+# the MediaPipe framework will internally inform the downstream calculators of
+# the absence of this packet so that they don't wait for it unnecessarily.
+output_stream: "LANDMARKS:hand_landmarks"
+
+# Hand world landmarks within the given ROI. (LandmarkList)
+# World landmarks are real-world 3D coordinates in meters with the origin in the
+# center of the given ROI.
+#
+# WORLD_LANDMARKS shares the same landmark topology as LANDMARKS. However,
+# LANDMARKS provides coordinates (in pixels) of a 3D object projected onto the
+# 2D image surface, while WORLD_LANDMARKS provides coordinates (in meters) of
+# the 3D object itself.
+output_stream: "WORLD_LANDMARKS:hand_world_landmarks"
+
+# Handedness of the detected hand (i.e. whether it is a left or right hand).
+# (ClassificationList)
+output_stream: "HANDEDNESS:handedness"
+
+# Transforms a region of the image into a 224x224 tensor while keeping the
+# aspect ratio, and therefore may result in potential letterboxing.
+node {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:image"
+  input_stream: "NORM_RECT:hand_rect"
+  output_stream: "TENSORS:input_tensor"
+  output_stream: "LETTERBOX_PADDING:letterbox_padding"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 224
+      output_tensor_height: 224
+      keep_aspect_ratio: true
+      output_tensor_float_range {
+        min: 0.0
+        max: 1.0
+      }
+    }
+  }
+}
+
+# Runs inference on the ONNX model with ONNX Runtime (CUDA execution provider).
+# Takes an image tensor and outputs a vector of tensors representing, for
+# instance, detection boxes/keypoints and scores.
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:input_tensor"
+  output_stream: "TENSORS:output_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      model_path: "mediapipe/modules/hand_landmark/hand_landmark_lite.onnx"
+      delegate { cuda {} }
+    }
+  }
+}
+
+# Splits a vector of tensors into multiple vectors according to the ranges
+# specified in the options.
+node {
+  calculator: "SplitTensorVectorCalculator"
+  input_stream: "output_tensors"
+  output_stream: "landmark_tensors"
+  output_stream: "hand_flag_tensor"
+  output_stream: "handedness_tensor"
+  output_stream: "world_landmark_tensor"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 0 end: 1 }
+      ranges: { begin: 1 end: 2 }
+      ranges: { begin: 2 end: 3 }
+      ranges: { begin: 3 end: 4 }
+    }
+  }
+}
+
+# Converts the hand-flag tensor into a float that represents the confidence
+# score of hand presence.
+node {
+  calculator: "TensorsToFloatsCalculator"
+  input_stream: "TENSORS:hand_flag_tensor"
+  output_stream: "FLOAT:hand_presence_score"
+}
+
+# Applies a threshold to the confidence score to determine whether a hand is
+# present.
+node {
+  calculator: "ThresholdingCalculator"
+  input_stream: "FLOAT:hand_presence_score"
+  output_stream: "FLAG:hand_presence"
+  options: {
+    [mediapipe.ThresholdingCalculatorOptions.ext] {
+      threshold: 0.5
+    }
+  }
+}
+
+# Drops handedness tensor if hand is not present.
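+# GateCalculator forwards its input packets only while the ALLOW signal is
+# true, so handedness decoding below is skipped for frames without a hand.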
+node { + calculator: "GateCalculator" + input_stream: "handedness_tensor" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_handedness_tensor" +} + +# Converts the handedness tensor into a float that represents the classification +# score of handedness. +node { + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:ensured_handedness_tensor" + output_stream: "CLASSIFICATIONS:handedness" + options: { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { + top_k: 1 + label_map_path: "mediapipe/modules/hand_landmark/handedness.txt" + binary_classification: true + } + } +} + +# Drops landmarks tensors if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "landmark_tensors" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_landmark_tensors" +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TensorsToLandmarksCalculator" + input_stream: "TENSORS:ensured_landmark_tensors" + output_stream: "NORM_LANDMARKS:landmarks" + options: { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 21 + input_image_width: 224 + input_image_height: 224 + # The additional scaling factor is used to account for the Z coordinate + # distribution in the training data. + normalize_z: 0.4 + } + } +} + +# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand +# image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (hand +# image before image transformation). +node { + calculator: "LandmarkLetterboxRemovalCalculator" + input_stream: "LANDMARKS:landmarks" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "LANDMARKS:scaled_landmarks" +} + +# Projects the landmarks from the cropped hand image to the corresponding +# locations on the full image before cropping (input to the graph). +node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:scaled_landmarks" + input_stream: "NORM_RECT:hand_rect" + output_stream: "NORM_LANDMARKS:hand_landmarks" +} + +# Drops world landmarks tensors if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "world_landmark_tensor" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_world_landmark_tensor" +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TensorsToLandmarksCalculator" + input_stream: "TENSORS:ensured_world_landmark_tensor" + output_stream: "LANDMARKS:unprojected_world_landmarks" + options: { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 21 + } + } +} + +# Projects the world landmarks from the cropped hand image to the corresponding +# locations on the full image before cropping (input to the graph). 
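+# (Note: since world landmarks are metric 3D coordinates, only the rotation of
+# the ROI is applied here, with no scaling, unlike the 2D projection above.)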
+node {
+  calculator: "WorldLandmarkProjectionCalculator"
+  input_stream: "LANDMARKS:unprojected_world_landmarks"
+  input_stream: "NORM_RECT:hand_rect"
+  output_stream: "LANDMARKS:hand_world_landmarks"
+}
diff --git a/mediapipe/modules/hand_landmark/hand_landmark_onnx_tensorrt.pbtxt b/mediapipe/modules/hand_landmark/hand_landmark_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..24578e8ea
--- /dev/null
+++ b/mediapipe/modules/hand_landmark/hand_landmark_onnx_tensorrt.pbtxt
@@ -0,0 +1,205 @@
+# MediaPipe graph to detect/predict hand landmarks using ONNX Runtime with the
+# TensorRT execution provider.
+
+type: "HandLandmarkOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+# ROI (region of interest) within the given image where a palm/hand is located.
+# (NormalizedRect)
+input_stream: "ROI:hand_rect"
+
+# 21 hand landmarks within the given ROI. (NormalizedLandmarkList)
+# NOTE: if a hand is not present within the given ROI, for this particular
+# timestamp there will not be an output packet in the LANDMARKS stream. However,
+# the MediaPipe framework will internally inform the downstream calculators of
+# the absence of this packet so that they don't wait for it unnecessarily.
+output_stream: "LANDMARKS:hand_landmarks"
+
+# Hand world landmarks within the given ROI. (LandmarkList)
+# World landmarks are real-world 3D coordinates in meters with the origin in the
+# center of the given ROI.
+#
+# WORLD_LANDMARKS shares the same landmark topology as LANDMARKS. However,
+# LANDMARKS provides coordinates (in pixels) of a 3D object projected onto the
+# 2D image surface, while WORLD_LANDMARKS provides coordinates (in meters) of
+# the 3D object itself.
+output_stream: "WORLD_LANDMARKS:hand_world_landmarks"
+
+# Handedness of the detected hand (i.e. whether it is a left or right hand).
+# (ClassificationList)
+output_stream: "HANDEDNESS:handedness"
+
+# Transforms a region of the image into a 224x224 tensor while keeping the
+# aspect ratio, and therefore may result in potential letterboxing.
+node {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:image"
+  input_stream: "NORM_RECT:hand_rect"
+  output_stream: "TENSORS:input_tensor"
+  output_stream: "LETTERBOX_PADDING:letterbox_padding"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 224
+      output_tensor_height: 224
+      keep_aspect_ratio: true
+      output_tensor_float_range {
+        min: 0.0
+        max: 1.0
+      }
+    }
+  }
+}
+
+# Runs inference on the ONNX model with ONNX Runtime (TensorRT execution
+# provider). Takes an image tensor and outputs a vector of tensors
+# representing, for instance, detection boxes/keypoints and scores.
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:input_tensor"
+  output_stream: "TENSORS:output_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      model_path: "mediapipe/modules/hand_landmark/hand_landmark_lite.onnx"
+      delegate { tensorrt {} }
+    }
+  }
+}
+
+# Splits a vector of tensors into multiple vectors according to the ranges
+# specified in the options.
+node {
+  calculator: "SplitTensorVectorCalculator"
+  input_stream: "output_tensors"
+  output_stream: "landmark_tensors"
+  output_stream: "hand_flag_tensor"
+  output_stream: "handedness_tensor"
+  output_stream: "world_landmark_tensor"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 0 end: 1 }
+      ranges: { begin: 1 end: 2 }
+      ranges: { begin: 2 end: 3 }
+      ranges: { begin: 3 end: 4 }
+    }
+  }
+}
+
+# Converts the hand-flag tensor into a float that represents the confidence
+# score of hand presence.
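+# (Note: the hand-flag tensor holds a single presence score, so this yields
+# one float that is thresholded below.)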
+node { + calculator: "TensorsToFloatsCalculator" + input_stream: "TENSORS:hand_flag_tensor" + output_stream: "FLOAT:hand_presence_score" +} + +# Applies a threshold to the confidence score to determine whether a hand is +# present. +node { + calculator: "ThresholdingCalculator" + input_stream: "FLOAT:hand_presence_score" + output_stream: "FLAG:hand_presence" + options: { + [mediapipe.ThresholdingCalculatorOptions.ext] { + threshold: 0.5 + } + } +} + +# Drops handedness tensor if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "handedness_tensor" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_handedness_tensor" +} + +# Converts the handedness tensor into a float that represents the classification +# score of handedness. +node { + calculator: "TensorsToClassificationCalculator" + input_stream: "TENSORS:ensured_handedness_tensor" + output_stream: "CLASSIFICATIONS:handedness" + options: { + [mediapipe.TensorsToClassificationCalculatorOptions.ext] { + top_k: 1 + label_map_path: "mediapipe/modules/hand_landmark/handedness.txt" + binary_classification: true + } + } +} + +# Drops landmarks tensors if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "landmark_tensors" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_landmark_tensors" +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TensorsToLandmarksCalculator" + input_stream: "TENSORS:ensured_landmark_tensors" + output_stream: "NORM_LANDMARKS:landmarks" + options: { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 21 + input_image_width: 224 + input_image_height: 224 + # The additional scaling factor is used to account for the Z coordinate + # distribution in the training data. + normalize_z: 0.4 + } + } +} + +# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand +# image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (hand +# image before image transformation). +node { + calculator: "LandmarkLetterboxRemovalCalculator" + input_stream: "LANDMARKS:landmarks" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "LANDMARKS:scaled_landmarks" +} + +# Projects the landmarks from the cropped hand image to the corresponding +# locations on the full image before cropping (input to the graph). +node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:scaled_landmarks" + input_stream: "NORM_RECT:hand_rect" + output_stream: "NORM_LANDMARKS:hand_landmarks" +} + +# Drops world landmarks tensors if hand is not present. +node { + calculator: "GateCalculator" + input_stream: "world_landmark_tensor" + input_stream: "ALLOW:hand_presence" + output_stream: "ensured_world_landmark_tensor" +} + +# Decodes the landmark tensors into a list of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "TensorsToLandmarksCalculator" + input_stream: "TENSORS:ensured_world_landmark_tensor" + output_stream: "LANDMARKS:unprojected_world_landmarks" + options: { + [mediapipe.TensorsToLandmarksCalculatorOptions.ext] { + num_landmarks: 21 + } + } +} + +# Projects the world landmarks from the cropped hand image to the corresponding +# locations on the full image before cropping (input to the graph). 
+node { + calculator: "WorldLandmarkProjectionCalculator" + input_stream: "LANDMARKS:unprojected_world_landmarks" + input_stream: "NORM_RECT:hand_rect" + output_stream: "LANDMARKS:hand_world_landmarks" +} diff --git a/mediapipe/modules/holistic_landmark/BUILD b/mediapipe/modules/holistic_landmark/BUILD index 6c09eb0d4..3546c368d 100644 --- a/mediapipe/modules/holistic_landmark/BUILD +++ b/mediapipe/modules/holistic_landmark/BUILD @@ -53,6 +53,36 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_landmarks_from_pose_onnx_cuda", + graph = "face_landmarks_from_pose_onnx_cuda.pbtxt", + register_as = "FaceLandmarksFromPoseOnnxCUDA", + deps = [ + ":face_detection_front_detections_to_roi", + ":face_landmarks_from_pose_to_recrop_roi", + ":face_tracking", + "//mediapipe/calculators/core:split_proto_list_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/modules/face_detection:face_detection_short_range_by_roi_onnx_cuda", + "//mediapipe/modules/face_landmark:face_landmark_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "face_landmarks_from_pose_onnx_tensorrt", + graph = "face_landmarks_from_pose_onnx_tensorrt.pbtxt", + register_as = "FaceLandmarksFromPoseOnnxTensorRT", + deps = [ + ":face_detection_front_detections_to_roi", + ":face_landmarks_from_pose_to_recrop_roi", + ":face_tracking", + "//mediapipe/calculators/core:split_proto_list_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/modules/face_detection:face_detection_short_range_by_roi_onnx_tensorrt", + "//mediapipe/modules/face_landmark:face_landmark_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "face_landmarks_to_roi", graph = "face_landmarks_to_roi.pbtxt", @@ -126,6 +156,36 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "hand_landmarks_from_pose_onnx_cuda", + graph = "hand_landmarks_from_pose_onnx_cuda.pbtxt", + register_as = "HandLandmarksFromPoseOnnxCUDA", + deps = [ + ":hand_landmarks_from_pose_to_recrop_roi", + ":hand_recrop_by_roi_onnx_cuda", + ":hand_tracking", + ":hand_visibility_from_hand_landmarks_from_pose", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/modules/hand_landmark:hand_landmark_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "hand_landmarks_from_pose_onnx_tensorrt", + graph = "hand_landmarks_from_pose_onnx_tensorrt.pbtxt", + register_as = "HandLandmarksFromPoseOnnxTensorRT", + deps = [ + ":hand_landmarks_from_pose_to_recrop_roi", + ":hand_recrop_by_roi_onnx_tensorrt", + ":hand_tracking", + ":hand_visibility_from_hand_landmarks_from_pose", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/modules/hand_landmark:hand_landmark_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "hand_landmarks_to_roi", graph = "hand_landmarks_to_roi.pbtxt", @@ -170,6 +230,40 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "hand_recrop_by_roi_onnx_cuda", + graph = "hand_recrop_by_roi_onnx_cuda.pbtxt", + register_as = "HandRecropByRoiOnnxCUDA", + deps = [ + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:alignment_points_to_rects_calculator", + 
"//mediapipe/calculators/util:landmark_letterbox_removal_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:landmarks_to_detection_calculator", + "//mediapipe/calculators/util:rect_transformation_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "hand_recrop_by_roi_onnx_tensorrt", + graph = "hand_recrop_by_roi_onnx_tensorrt.pbtxt", + register_as = "HandRecropByRoiOnnxTensorRT", + deps = [ + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:alignment_points_to_rects_calculator", + "//mediapipe/calculators/util:landmark_letterbox_removal_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:landmarks_to_detection_calculator", + "//mediapipe/calculators/util:rect_transformation_calculator", + ], +) + mediapipe_simple_subgraph( name = "hand_tracking", graph = "hand_tracking.pbtxt", @@ -215,6 +309,26 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "hand_landmarks_left_and_right_onnx_cuda", + graph = "hand_landmarks_left_and_right_onnx_cuda.pbtxt", + register_as = "HandLandmarksLeftAndRightOnnxCUDA", + deps = [ + ":hand_landmarks_from_pose_onnx_cuda", + "//mediapipe/calculators/core:split_proto_list_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "hand_landmarks_left_and_right_onnx_tensorrt", + graph = "hand_landmarks_left_and_right_onnx_tensorrt.pbtxt", + register_as = "HandLandmarksLeftAndRightOnnxTensorRT", + deps = [ + ":hand_landmarks_from_pose_onnx_tensorrt", + "//mediapipe/calculators/core:split_proto_list_calculator", + ], +) + mediapipe_simple_subgraph( name = "hand_landmarks_from_pose_to_recrop_roi", graph = "hand_landmarks_from_pose_to_recrop_roi.pbtxt", @@ -264,3 +378,31 @@ mediapipe_simple_subgraph( "//mediapipe/modules/pose_landmark:pose_landmark_cpu", ], ) + +mediapipe_simple_subgraph( + name = "holistic_landmark_onnx_cuda", + graph = "holistic_landmark_onnx_cuda.pbtxt", + register_as = "HolisticLandmarkOnnxCUDA", + visibility = ["//visibility:public"], + deps = [ + ":face_landmarks_from_pose_onnx_cuda", + ":hand_landmarks_left_and_right_onnx_cuda", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/modules/pose_landmark:pose_landmark_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "holistic_landmark_onnx_tensorrt", + graph = "holistic_landmark_onnx_tensorrt.pbtxt", + register_as = "HolisticLandmarkOnnxTensorRT", + visibility = ["//visibility:public"], + deps = [ + ":face_landmarks_from_pose_onnx_tensorrt", + ":hand_landmarks_left_and_right_onnx_tensorrt", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/modules/pose_landmark:pose_landmark_onnx_tensorrt", + ], +) diff --git a/mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_cuda.pbtxt b/mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_cuda.pbtxt new file mode 100644 index 000000000..c77be6aa4 --- /dev/null +++ b/mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_cuda.pbtxt @@ -0,0 +1,82 @@ +# Predicts face landmarks within an ROI derived from face-related pose +# landmarks. 
+ +type: "FaceLandmarksFromPoseOnnxCUDA" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:input_video" +# Face-related pose landmarks. (NormalizedLandmarkList) +input_stream: "FACE_LANDMARKS_FROM_POSE:face_landmarks_from_pose" + +# Whether to run the face landmark model with attention on lips and eyes to +# provide more accuracy, and additionally output iris landmarks. If unspecified, +# functions as set to false. (bool) +input_side_packet: "REFINE_LANDMARKS:refine_landmarks" + +# Face landmarks. (NormalizedLandmarkList) +output_stream: "FACE_LANDMARKS:face_landmarks" + +# Debug outputs. +# Face ROI derived from face-related pose landmarks, which defines the search +# region for the face detection model. (NormalizedRect) +output_stream: "FACE_ROI_FROM_POSE:face_roi_from_pose" +# Refined face crop rectangle predicted by face detection model. +# (NormalizedRect) +output_stream: "FACE_ROI_FROM_DETECTION:face_roi_from_detection" +# Rectangle used to predict face landmarks. (NormalizedRect) +output_stream: "FACE_TRACKING_ROI:face_tracking_roi" + +# TODO: do not predict face when most of the face landmarks from +# pose are invisible. + +# Extracts image size from the input images. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE:input_video" + output_stream: "SIZE:image_size" +} + +# Gets ROI for re-crop model from face-related pose landmarks. +node { + calculator: "FaceLandmarksFromPoseToRecropRoi" + input_stream: "FACE_LANDMARKS_FROM_POSE:face_landmarks_from_pose" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "ROI:face_roi_from_pose" +} + +# Detects faces within the face ROI calculated from pose landmarks. This is done +# to refine face ROI for further landmark detection as ROI calculated from +# pose landmarks may be inaccurate. +node { + calculator: "FaceDetectionShortRangeByRoiOnnxCUDA" + input_stream: "IMAGE:input_video" + input_stream: "ROI:face_roi_from_pose" + output_stream: "DETECTIONS:face_detections" +} + +# Calculates refined face ROI. +node { + calculator: "FaceDetectionFrontDetectionsToRoi" + input_stream: "DETECTIONS:face_detections" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "ROI:face_roi_from_detection" +} + +# Gets face tracking rectangle (either face rectangle from the previous +# frame or face re-crop rectangle from the current frame) for face prediction. +node { + calculator: "FaceTracking" + input_stream: "LANDMARKS:face_landmarks" + input_stream: "FACE_RECROP_ROI:face_roi_from_detection" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "FACE_TRACKING_ROI:face_tracking_roi" +} + +# Predicts face landmarks from the tracking rectangle. +node { + calculator: "FaceLandmarkOnnxCUDA" + input_stream: "IMAGE:input_video" + input_stream: "ROI:face_tracking_roi" + input_side_packet: "WITH_ATTENTION:refine_landmarks" + output_stream: "LANDMARKS:face_landmarks" +} diff --git a/mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_tensorrt.pbtxt b/mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..234a83ee2 --- /dev/null +++ b/mediapipe/modules/holistic_landmark/face_landmarks_from_pose_onnx_tensorrt.pbtxt @@ -0,0 +1,82 @@ +# Predicts face landmarks within an ROI derived from face-related pose +# landmarks. + +type: "FaceLandmarksFromPoseOnnxTensorRT" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:input_video" +# Face-related pose landmarks. 
(NormalizedLandmarkList) +input_stream: "FACE_LANDMARKS_FROM_POSE:face_landmarks_from_pose" + +# Whether to run the face landmark model with attention on lips and eyes to +# provide more accuracy, and additionally output iris landmarks. If unspecified, +# functions as set to false. (bool) +input_side_packet: "REFINE_LANDMARKS:refine_landmarks" + +# Face landmarks. (NormalizedLandmarkList) +output_stream: "FACE_LANDMARKS:face_landmarks" + +# Debug outputs. +# Face ROI derived from face-related pose landmarks, which defines the search +# region for the face detection model. (NormalizedRect) +output_stream: "FACE_ROI_FROM_POSE:face_roi_from_pose" +# Refined face crop rectangle predicted by face detection model. +# (NormalizedRect) +output_stream: "FACE_ROI_FROM_DETECTION:face_roi_from_detection" +# Rectangle used to predict face landmarks. (NormalizedRect) +output_stream: "FACE_TRACKING_ROI:face_tracking_roi" + +# TODO: do not predict face when most of the face landmarks from +# pose are invisible. + +# Extracts image size from the input images. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE:input_video" + output_stream: "SIZE:image_size" +} + +# Gets ROI for re-crop model from face-related pose landmarks. +node { + calculator: "FaceLandmarksFromPoseToRecropRoi" + input_stream: "FACE_LANDMARKS_FROM_POSE:face_landmarks_from_pose" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "ROI:face_roi_from_pose" +} + +# Detects faces within the face ROI calculated from pose landmarks. This is done +# to refine face ROI for further landmark detection as ROI calculated from +# pose landmarks may be inaccurate. +node { + calculator: "FaceDetectionShortRangeByRoiOnnxTensorRT" + input_stream: "IMAGE:input_video" + input_stream: "ROI:face_roi_from_pose" + output_stream: "DETECTIONS:face_detections" +} + +# Calculates refined face ROI. +node { + calculator: "FaceDetectionFrontDetectionsToRoi" + input_stream: "DETECTIONS:face_detections" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "ROI:face_roi_from_detection" +} + +# Gets face tracking rectangle (either face rectangle from the previous +# frame or face re-crop rectangle from the current frame) for face prediction. +node { + calculator: "FaceTracking" + input_stream: "LANDMARKS:face_landmarks" + input_stream: "FACE_RECROP_ROI:face_roi_from_detection" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "FACE_TRACKING_ROI:face_tracking_roi" +} + +# Predicts face landmarks from the tracking rectangle. +node { + calculator: "FaceLandmarkOnnxTensorRT" + input_stream: "IMAGE:input_video" + input_stream: "ROI:face_tracking_roi" + input_side_packet: "WITH_ATTENTION:refine_landmarks" + output_stream: "LANDMARKS:face_landmarks" +} diff --git a/mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_cuda.pbtxt b/mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_cuda.pbtxt new file mode 100644 index 000000000..65166e3b4 --- /dev/null +++ b/mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_cuda.pbtxt @@ -0,0 +1,78 @@ +# Predicts hand landmarks within a ROI derived from hand-related pose landmarks. + +type: "HandLandmarksFromPoseOnnxCUDA" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:input_video" +# Hand-related pose landmarks in [wrist, pinky, index] order. +# (NormalizedLandmarkList) +input_stream: "HAND_LANDMARKS_FROM_POSE:hand_landmarks_from_pose" + +# Hand landmarks. (NormalizedLandmarkList) +output_stream: "HAND_LANDMARKS:hand_landmarks" + +# Debug outputs. 
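+# (The ROI streams below are exposed for visualization and debugging; typical
+# consumers only need HAND_LANDMARKS above.)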
+# Hand ROI derived from hand-related landmarks, which defines the search region
+# for the hand re-crop model. (NormalizedRect)
+output_stream: "HAND_ROI_FROM_POSE:hand_roi_from_pose"
+# Refined hand crop rectangle predicted by hand re-crop model. (NormalizedRect)
+output_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop"
+# Rectangle used to predict hand landmarks. (NormalizedRect)
+output_stream: "HAND_TRACKING_ROI:hand_tracking_roi"
+
+# Gets hand visibility.
+node {
+  calculator: "HandVisibilityFromHandLandmarksFromPose"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:hand_landmarks_from_pose"
+  output_stream: "VISIBILITY:hand_visibility"
+}
+
+# Drops hand-related pose landmarks if the pose wrist is not visible. This
+# prevents hand landmarks from being predicted on the current frame.
+node {
+  calculator: "GateCalculator"
+  input_stream: "hand_landmarks_from_pose"
+  input_stream: "ALLOW:hand_visibility"
+  output_stream: "ensured_hand_landmarks_from_pose"
+}
+
+# Extracts image size from the input images.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:input_video"
+  output_stream: "SIZE:image_size"
+}
+
+# Gets ROI for re-crop model from hand-related pose landmarks.
+node {
+  calculator: "HandLandmarksFromPoseToRecropRoi"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:hand_landmarks_from_pose"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "ROI:hand_roi_from_pose"
+}
+
+# Predicts hand re-crop rectangle on the current frame.
+node {
+  calculator: "HandRecropByRoiOnnxCUDA"
+  input_stream: "IMAGE:input_video"
+  input_stream: "ROI:hand_roi_from_pose"
+  output_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop"
+}
+
+# Gets hand tracking rectangle (either hand rectangle from the previous
+# frame or hand re-crop rectangle from the current frame) for hand prediction.
+node {
+  calculator: "HandTracking"
+  input_stream: "LANDMARKS:hand_landmarks"
+  input_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "HAND_TRACKING_ROI:hand_tracking_roi"
+}
+
+# Predicts hand landmarks from the tracking rectangle.
+node {
+  calculator: "HandLandmarkOnnxCUDA"
+  input_stream: "IMAGE:input_video"
+  input_stream: "ROI:hand_tracking_roi"
+  output_stream: "LANDMARKS:hand_landmarks"
+}
diff --git a/mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_tensorrt.pbtxt b/mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..2231c7fa9
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/hand_landmarks_from_pose_onnx_tensorrt.pbtxt
@@ -0,0 +1,78 @@
+# Predicts hand landmarks within a ROI derived from hand-related pose landmarks.
+
+type: "HandLandmarksFromPoseOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:input_video"
+# Hand-related pose landmarks in [wrist, pinky, index] order.
+# (NormalizedLandmarkList)
+input_stream: "HAND_LANDMARKS_FROM_POSE:hand_landmarks_from_pose"
+
+# Hand landmarks. (NormalizedLandmarkList)
+output_stream: "HAND_LANDMARKS:hand_landmarks"
+
+# Debug outputs.
+# Hand ROI derived from hand-related landmarks, which defines the search region
+# for the hand re-crop model. (NormalizedRect)
+output_stream: "HAND_ROI_FROM_POSE:hand_roi_from_pose"
+# Refined hand crop rectangle predicted by hand re-crop model. (NormalizedRect)
+output_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop"
+# Rectangle used to predict hand landmarks. (NormalizedRect)
+output_stream: "HAND_TRACKING_ROI:hand_tracking_roi"
+
+# Gets hand visibility.
+node {
+  calculator: "HandVisibilityFromHandLandmarksFromPose"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:hand_landmarks_from_pose"
+  output_stream: "VISIBILITY:hand_visibility"
+}
+
+# Drops hand-related pose landmarks if the pose wrist is not visible. This
+# prevents hand landmarks from being predicted on the current frame.
+node {
+  calculator: "GateCalculator"
+  input_stream: "hand_landmarks_from_pose"
+  input_stream: "ALLOW:hand_visibility"
+  output_stream: "ensured_hand_landmarks_from_pose"
+}
+
+# Extracts image size from the input images.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:input_video"
+  output_stream: "SIZE:image_size"
+}
+
+# Gets ROI for re-crop model from hand-related pose landmarks.
+node {
+  calculator: "HandLandmarksFromPoseToRecropRoi"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:hand_landmarks_from_pose"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "ROI:hand_roi_from_pose"
+}
+
+# Predicts hand re-crop rectangle on the current frame.
+node {
+  calculator: "HandRecropByRoiOnnxTensorRT"
+  input_stream: "IMAGE:input_video"
+  input_stream: "ROI:hand_roi_from_pose"
+  output_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop"
+}
+
+# Gets hand tracking rectangle (either hand rectangle from the previous
+# frame or hand re-crop rectangle from the current frame) for hand prediction.
+node {
+  calculator: "HandTracking"
+  input_stream: "LANDMARKS:hand_landmarks"
+  input_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "HAND_TRACKING_ROI:hand_tracking_roi"
+}
+
+# Predicts hand landmarks from the tracking rectangle.
+node {
+  calculator: "HandLandmarkOnnxTensorRT"
+  input_stream: "IMAGE:input_video"
+  input_stream: "ROI:hand_tracking_roi"
+  output_stream: "LANDMARKS:hand_landmarks"
+}
diff --git a/mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_cuda.pbtxt b/mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_cuda.pbtxt
new file mode 100644
index 000000000..547384a56
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_cuda.pbtxt
@@ -0,0 +1,76 @@
+# Predicts left and right hand landmarks within corresponding ROIs derived from
+# hand-related pose landmarks.
+
+type: "HandLandmarksLeftAndRightOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:input_video"
+# Pose landmarks to derive initial hand location from. (NormalizedLandmarkList)
+input_stream: "POSE_LANDMARKS:pose_landmarks"
+
+# Left hand landmarks. (NormalizedLandmarkList)
+output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+# Right hand landmarks. (NormalizedLandmarkList)
+output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+
+# Debug outputs.
+output_stream: "LEFT_HAND_ROI_FROM_POSE:left_hand_roi_from_pose"
+output_stream: "LEFT_HAND_ROI_FROM_RECROP:left_hand_roi_from_recrop"
+output_stream: "LEFT_HAND_TRACKING_ROI:left_hand_tracking_roi"
+output_stream: "RIGHT_HAND_ROI_FROM_POSE:right_hand_roi_from_pose"
+output_stream: "RIGHT_HAND_ROI_FROM_RECROP:right_hand_roi_from_recrop"
+output_stream: "RIGHT_HAND_TRACKING_ROI:right_hand_tracking_roi"
+
+# Extracts left-hand-related landmarks from the pose landmarks.
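+# (In the pose landmark topology, indices 15, 17 and 19 are the left wrist,
+# left pinky and left index landmarks, matching the [wrist, pinky, index]
+# order expected by HandLandmarksFromPose.)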
+node {
+  calculator: "SplitNormalizedLandmarkListCalculator"
+  input_stream: "pose_landmarks"
+  output_stream: "left_hand_landmarks_from_pose"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 15 end: 16 }
+      ranges: { begin: 17 end: 18 }
+      ranges: { begin: 19 end: 20 }
+      combine_outputs: true
+    }
+  }
+}
+
+# Predicts left hand landmarks.
+node {
+  calculator: "HandLandmarksFromPoseOnnxCUDA"
+  input_stream: "IMAGE:input_video"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:left_hand_landmarks_from_pose"
+  output_stream: "HAND_LANDMARKS:left_hand_landmarks"
+  # Debug outputs.
+  output_stream: "HAND_ROI_FROM_POSE:left_hand_roi_from_pose"
+  output_stream: "HAND_ROI_FROM_RECROP:left_hand_roi_from_recrop"
+  output_stream: "HAND_TRACKING_ROI:left_hand_tracking_roi"
+}
+
+# Extracts right-hand-related landmarks from the pose landmarks.
+node {
+  calculator: "SplitNormalizedLandmarkListCalculator"
+  input_stream: "pose_landmarks"
+  output_stream: "right_hand_landmarks_from_pose"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 16 end: 17 }
+      ranges: { begin: 18 end: 19 }
+      ranges: { begin: 20 end: 21 }
+      combine_outputs: true
+    }
+  }
+}
+
+# Predicts right hand landmarks.
+node {
+  calculator: "HandLandmarksFromPoseOnnxCUDA"
+  input_stream: "IMAGE:input_video"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:right_hand_landmarks_from_pose"
+  output_stream: "HAND_LANDMARKS:right_hand_landmarks"
+  # Debug outputs.
+  output_stream: "HAND_ROI_FROM_POSE:right_hand_roi_from_pose"
+  output_stream: "HAND_ROI_FROM_RECROP:right_hand_roi_from_recrop"
+  output_stream: "HAND_TRACKING_ROI:right_hand_tracking_roi"
+}
diff --git a/mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_tensorrt.pbtxt b/mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..eeae2294c
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/hand_landmarks_left_and_right_onnx_tensorrt.pbtxt
@@ -0,0 +1,76 @@
+# Predicts left and right hand landmarks within corresponding ROIs derived from
+# hand-related pose landmarks.
+
+type: "HandLandmarksLeftAndRightOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:input_video"
+# Pose landmarks to derive initial hand location from. (NormalizedLandmarkList)
+input_stream: "POSE_LANDMARKS:pose_landmarks"
+
+# Left hand landmarks. (NormalizedLandmarkList)
+output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+# Right hand landmarks. (NormalizedLandmarkList)
+output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+
+# Debug outputs.
+output_stream: "LEFT_HAND_ROI_FROM_POSE:left_hand_roi_from_pose"
+output_stream: "LEFT_HAND_ROI_FROM_RECROP:left_hand_roi_from_recrop"
+output_stream: "LEFT_HAND_TRACKING_ROI:left_hand_tracking_roi"
+output_stream: "RIGHT_HAND_ROI_FROM_POSE:right_hand_roi_from_pose"
+output_stream: "RIGHT_HAND_ROI_FROM_RECROP:right_hand_roi_from_recrop"
+output_stream: "RIGHT_HAND_TRACKING_ROI:right_hand_tracking_roi"
+
+# Extracts left-hand-related landmarks from the pose landmarks.
+node {
+  calculator: "SplitNormalizedLandmarkListCalculator"
+  input_stream: "pose_landmarks"
+  output_stream: "left_hand_landmarks_from_pose"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 15 end: 16 }
+      ranges: { begin: 17 end: 18 }
+      ranges: { begin: 19 end: 20 }
+      combine_outputs: true
+    }
+  }
+}
+
+# Predicts left hand landmarks.
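+# (The same HandLandmarksFromPose subgraph is instantiated once per hand; the
+# left and right instances differ only in the pose landmark indices used to
+# seed the ROI.)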
+node {
+  calculator: "HandLandmarksFromPoseOnnxTensorRT"
+  input_stream: "IMAGE:input_video"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:left_hand_landmarks_from_pose"
+  output_stream: "HAND_LANDMARKS:left_hand_landmarks"
+  # Debug outputs.
+  output_stream: "HAND_ROI_FROM_POSE:left_hand_roi_from_pose"
+  output_stream: "HAND_ROI_FROM_RECROP:left_hand_roi_from_recrop"
+  output_stream: "HAND_TRACKING_ROI:left_hand_tracking_roi"
+}
+
+# Extracts right-hand-related landmarks from the pose landmarks.
+node {
+  calculator: "SplitNormalizedLandmarkListCalculator"
+  input_stream: "pose_landmarks"
+  output_stream: "right_hand_landmarks_from_pose"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 16 end: 17 }
+      ranges: { begin: 18 end: 19 }
+      ranges: { begin: 20 end: 21 }
+      combine_outputs: true
+    }
+  }
+}
+
+# Predicts right hand landmarks.
+node {
+  calculator: "HandLandmarksFromPoseOnnxTensorRT"
+  input_stream: "IMAGE:input_video"
+  input_stream: "HAND_LANDMARKS_FROM_POSE:right_hand_landmarks_from_pose"
+  output_stream: "HAND_LANDMARKS:right_hand_landmarks"
+  # Debug outputs.
+  output_stream: "HAND_ROI_FROM_POSE:right_hand_roi_from_pose"
+  output_stream: "HAND_ROI_FROM_RECROP:right_hand_roi_from_recrop"
+  output_stream: "HAND_TRACKING_ROI:right_hand_tracking_roi"
+}
diff --git a/mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_cuda.pbtxt b/mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_cuda.pbtxt
new file mode 100644
index 000000000..051b85b33
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_cuda.pbtxt
@@ -0,0 +1,137 @@
+# Predicts more accurate hand location (re-crop ROI) within a given ROI.
+
+type: "HandRecropByRoiOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:input_video"
+# ROI (region of interest) within the given image where a palm/hand is located.
+# (NormalizedRect)
+input_stream: "ROI:roi"
+
+# Refined (more accurate) ROI to use for hand landmark prediction.
+# (NormalizedRect)
+output_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop_refined"
+
+# Transforms hand ROI from the input image to a 256x256 tensor. Preserves aspect
+# ratio, which results in a letterbox padding.
+node {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:input_video"
+  input_stream: "NORM_RECT:roi"
+  output_stream: "TENSORS:initial_crop_tensor"
+  output_stream: "LETTERBOX_PADDING:letterbox_padding"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 256
+      output_tensor_height: 256
+      keep_aspect_ratio: true
+      output_tensor_float_range {
+        min: 0.0
+        max: 1.0
+      }
+      # For OpenGL, the origin should be at the top-left corner.
+      gpu_origin: TOP_LEFT,
+    }
+  }
+}
+
+# Predicts hand re-crop rectangle.
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:initial_crop_tensor"
+  output_stream: "TENSORS:landmark_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      model_path: "mediapipe/modules/holistic_landmark/hand_recrop.onnx"
+      delegate { cuda {} }
+    }
+  }
+}
+
+# Decodes the landmark tensors into a vector of landmarks, where the landmark
+# coordinates are normalized by the size of the input image to the model. Two
+# landmarks represent two virtual points: crop and scale of the new crop.
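+# (In effect, the first point is the crop center and the second encodes its
+# scale and rotation; AlignmentPointsRectsCalculator below converts the pair
+# into a rotated rectangle.)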
+node {
+  calculator: "TensorsToLandmarksCalculator"
+  input_stream: "TENSORS:landmark_tensors"
+  output_stream: "NORM_LANDMARKS:landmarks"
+  options: {
+    [mediapipe.TensorsToLandmarksCalculatorOptions.ext] {
+      num_landmarks: 2
+      input_image_width: 256
+      input_image_height: 256
+    }
+  }
+}
+
+# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand
+# image (after image transformation with the FIT scale mode) to the
+# corresponding locations on the same image with the letterbox removed (hand
+# image before image transformation).
+node {
+  calculator: "LandmarkLetterboxRemovalCalculator"
+  input_stream: "LANDMARKS:landmarks"
+  input_stream: "LETTERBOX_PADDING:letterbox_padding"
+  output_stream: "LANDMARKS:scaled_landmarks"
+}
+
+# Projects the landmarks from the cropped hand image to the corresponding
+# locations on the full image before cropping (input to the graph).
+node {
+  calculator: "LandmarkProjectionCalculator"
+  input_stream: "NORM_LANDMARKS:scaled_landmarks"
+  input_stream: "NORM_RECT:roi"
+  output_stream: "NORM_LANDMARKS:alignment_landmarks"
+}
+
+# Converts hand landmarks to a detection that tightly encloses all landmarks.
+node {
+  calculator: "LandmarksToDetectionCalculator"
+  input_stream: "NORM_LANDMARKS:alignment_landmarks"
+  output_stream: "DETECTION:hand_detection"
+}
+
+# Extracts image size from the input images.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:input_video"
+  output_stream: "SIZE:image_size"
+}
+
+# Converts hand detection into a rectangle based on center and scale alignment
+# points.
+node {
+  calculator: "AlignmentPointsRectsCalculator"
+  input_stream: "DETECTION:hand_detection"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "NORM_RECT:hand_roi_from_recrop"
+  options: {
+    [mediapipe.DetectionsToRectsCalculatorOptions.ext] {
+      rotation_vector_start_keypoint_index: 0
+      rotation_vector_end_keypoint_index: 1
+      rotation_vector_target_angle_degrees: -90
+    }
+  }
+}
+
+# TODO: revise hand recrop roi calculation.
+# Slightly moves the hand re-crop rectangle from the wrist towards the
+# fingertips. Due to the new hand cropping logic, the crop border is too close
+# to the fingertips while a lot of space is left below the wrist. When moving
+# the hand up fast (with fingers pointing up) while tracking with the hand rect
+# from the previous frame, fingertips can get cropped. This adjustment
+# partially solves it, but the hand cropping logic should be reviewed.
+node {
+  calculator: "RectTransformationCalculator"
+  input_stream: "NORM_RECT:hand_roi_from_recrop"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "hand_roi_from_recrop_refined"
+  options: {
+    [mediapipe.RectTransformationCalculatorOptions.ext] {
+      scale_x: 1.0
+      scale_y: 1.0
+      shift_y: -0.1
+      square_long: true
+    }
+  }
+}
diff --git a/mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_tensorrt.pbtxt b/mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..aa76c24f7
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/hand_recrop_by_roi_onnx_tensorrt.pbtxt
@@ -0,0 +1,137 @@
+# Predicts more accurate hand location (re-crop ROI) within a given ROI.
+
+type: "HandRecropByRoiOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:input_video"
+# ROI (region of interest) within the given image where a palm/hand is located.
+# (NormalizedRect)
+input_stream: "ROI:roi"
+
+# Refined (more accurate) ROI to use for hand landmark prediction.
+# (NormalizedRect)
+output_stream: "HAND_ROI_FROM_RECROP:hand_roi_from_recrop_refined"
+
+# Transforms hand ROI from the input image to a 256x256 tensor. Preserves aspect
+# ratio, which results in a letterbox padding.
+node {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:input_video"
+  input_stream: "NORM_RECT:roi"
+  output_stream: "TENSORS:initial_crop_tensor"
+  output_stream: "LETTERBOX_PADDING:letterbox_padding"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 256
+      output_tensor_height: 256
+      keep_aspect_ratio: true
+      output_tensor_float_range {
+        min: 0.0
+        max: 1.0
+      }
+      # For OpenGL, the origin should be at the top-left corner.
+      gpu_origin: TOP_LEFT
+    }
+  }
+}
+
+# Predicts hand re-crop rectangle.
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:initial_crop_tensor"
+  output_stream: "TENSORS:landmark_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      model_path: "mediapipe/modules/holistic_landmark/hand_recrop.onnx"
+      delegate { tensorrt {} }
+    }
+  }
+}
+
+# Decodes the landmark tensors into a vector of landmarks, where the landmark
+# coordinates are normalized by the size of the input image to the model. Two
+# landmarks represent two virtual points: crop and scale of the new crop.
+node {
+  calculator: "TensorsToLandmarksCalculator"
+  input_stream: "TENSORS:landmark_tensors"
+  output_stream: "NORM_LANDMARKS:landmarks"
+  options: {
+    [mediapipe.TensorsToLandmarksCalculatorOptions.ext] {
+      num_landmarks: 2
+      input_image_width: 256
+      input_image_height: 256
+    }
+  }
+}
+
+# Adjusts landmarks (already normalized to [0.f, 1.f]) on the letterboxed hand
+# image (after image transformation with the FIT scale mode) to the
+# corresponding locations on the same image with the letterbox removed (hand
+# image before image transformation).
+node {
+  calculator: "LandmarkLetterboxRemovalCalculator"
+  input_stream: "LANDMARKS:landmarks"
+  input_stream: "LETTERBOX_PADDING:letterbox_padding"
+  output_stream: "LANDMARKS:scaled_landmarks"
+}
+
+# Projects the landmarks from the cropped hand image to the corresponding
+# locations on the full image before cropping (input to the graph).
+node {
+  calculator: "LandmarkProjectionCalculator"
+  input_stream: "NORM_LANDMARKS:scaled_landmarks"
+  input_stream: "NORM_RECT:roi"
+  output_stream: "NORM_LANDMARKS:alignment_landmarks"
+}
+
+# Converts hand landmarks to a detection that tightly encloses all landmarks.
+node {
+  calculator: "LandmarksToDetectionCalculator"
+  input_stream: "NORM_LANDMARKS:alignment_landmarks"
+  output_stream: "DETECTION:hand_detection"
+}
+
+# Extracts image size from the input images.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:input_video"
+  output_stream: "SIZE:image_size"
+}
+
+# Converts hand detection into a rectangle based on center and scale alignment
+# points.
+node {
+  calculator: "AlignmentPointsRectsCalculator"
+  input_stream: "DETECTION:hand_detection"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "NORM_RECT:hand_roi_from_recrop"
+  options: {
+    [mediapipe.DetectionsToRectsCalculatorOptions.ext] {
+      rotation_vector_start_keypoint_index: 0
+      rotation_vector_end_keypoint_index: 1
+      rotation_vector_target_angle_degrees: -90
+    }
+  }
+}
+
+# TODO: revise hand recrop roi calculation.
+# Slightly moves the hand re-crop rectangle from the wrist towards the
+# fingertips. Due to the new hand cropping logic, the crop border is too close
+# to the fingertips while a lot of space is left below the wrist.
+# When moving the hand up fast (with fingers pointing up) and using the hand
+# rect from the previous frame for tracking, the fingertips can be cropped.
+# This adjustment partially solves it, but the hand cropping logic should be
+# reviewed.
+node {
+  calculator: "RectTransformationCalculator"
+  input_stream: "NORM_RECT:hand_roi_from_recrop"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "hand_roi_from_recrop_refined"
+  options: {
+    [mediapipe.RectTransformationCalculatorOptions.ext] {
+      scale_x: 1.0
+      scale_y: 1.0
+      shift_y: -0.1
+      square_long: true
+    }
+  }
+}
diff --git a/mediapipe/modules/holistic_landmark/holistic_landmark_onnx_cuda.pbtxt b/mediapipe/modules/holistic_landmark/holistic_landmark_onnx_cuda.pbtxt
new file mode 100644
index 000000000..e79a34f52
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/holistic_landmark_onnx_cuda.pbtxt
@@ -0,0 +1,146 @@
+# Predicts pose + left/right hand + face landmarks.
+#
+# It is required that:
+# - "face_detection_short_range.onnx" is available at
+# "mediapipe/modules/face_detection/face_detection_short_range.onnx"
+#
+# - "face_landmark.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark.onnx"
+#
+# - "hand_landmark_full.onnx" is available at
+# "mediapipe/modules/hand_landmark/hand_landmark_full.onnx"
+#
+# - "hand_recrop.onnx" is available at
+# "mediapipe/modules/holistic_landmark/hand_recrop.onnx"
+#
+# - "handedness.txt" is available at
+# "mediapipe/modules/hand_landmark/handedness.txt"
+#
+# - "pose_detection.onnx" is available at
+# "mediapipe/modules/pose_detection/pose_detection.onnx"
+#
+# - "pose_landmark_lite.onnx" or "pose_landmark_full.onnx" or
+# "pose_landmark_heavy.onnx" is available at
+# "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_heavy.onnx"
+# path respectively during execution, depending on the specification in the
+# MODEL_COMPLEXITY input side packet.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "HolisticLandmarkOnnxCUDA"
+#     input_stream: "IMAGE:input_video"
+#     input_side_packet: "MODEL_COMPLEXITY:model_complexity"
+#     input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
+#     input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+#     input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
+#     input_side_packet: "REFINE_FACE_LANDMARKS:refine_face_landmarks"
+#     input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+#     output_stream: "POSE_LANDMARKS:pose_landmarks"
+#     output_stream: "FACE_LANDMARKS:face_landmarks"
+#     output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+#     output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+#   }
+#
+# NOTE: if a pose/hand/face output is not present in the image, for this
+# particular timestamp there will not be an output packet in the corresponding
+# output stream below. However, the MediaPipe framework will internally inform
+# the downstream calculators of the absence of this packet so that they don't
+# wait for it unnecessarily.
+
+type: "HolisticLandmarkOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+
+# Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as
+# inference latency generally go up with the model complexity. If unspecified,
+# functions as set to 1. (int)
+input_side_packet: "MODEL_COMPLEXITY:model_complexity"
+
+# Whether to filter landmarks across different input images to reduce jitter.
+# If unspecified, functions as set to true. (bool)
+input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
+
+# Whether to predict the segmentation mask. If unspecified, functions as set to
+# false. (bool)
+input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+
+# Whether to filter segmentation mask across different input images to reduce
+# jitter. If unspecified, functions as set to true. (bool)
+input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
+
+# Whether to run the face landmark model with attention on lips and eyes to
+# provide more accuracy, and additionally output iris landmarks. If unspecified,
+# functions as set to false. (bool)
+input_side_packet: "REFINE_FACE_LANDMARKS:refine_face_landmarks"
+
+# Whether landmarks on the previous image should be used to help localize
+# landmarks on the current image. (bool)
+input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+
+# Pose landmarks. (NormalizedLandmarkList)
+# 33 pose landmarks.
+output_stream: "POSE_LANDMARKS:pose_landmarks"
+# 33 pose world landmarks. (LandmarkList)
+output_stream: "WORLD_LANDMARKS:pose_world_landmarks"
+# 21 left hand landmarks. (NormalizedLandmarkList)
+output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+# 21 right hand landmarks. (NormalizedLandmarkList)
+output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+# 468 face landmarks. (NormalizedLandmarkList)
+output_stream: "FACE_LANDMARKS:face_landmarks"
+
+# Segmentation mask. (ImageFrame in ImageFormat::VEC32F1)
+output_stream: "SEGMENTATION_MASK:segmentation_mask"
+
+# Debug outputs
+output_stream: "POSE_ROI:pose_landmarks_roi"
+output_stream: "POSE_DETECTION:pose_detection"
+
+# Predicts pose landmarks.
+node {
+  calculator: "PoseLandmarkOnnxCUDA"
+  input_stream: "IMAGE:image"
+  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
+  input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
+  input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+  output_stream: "LANDMARKS:pose_landmarks"
+  output_stream: "WORLD_LANDMARKS:pose_world_landmarks"
+  output_stream: "SEGMENTATION_MASK:segmentation_mask"
+  output_stream: "ROI_FROM_LANDMARKS:pose_landmarks_roi"
+  output_stream: "DETECTION:pose_detection"
+}
+
+# Predicts left and right hand landmarks based on the initial pose landmarks.
+node {
+  calculator: "HandLandmarksLeftAndRightOnnxCUDA"
+  input_stream: "IMAGE:image"
+  input_stream: "POSE_LANDMARKS:pose_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+}
+
+# Extracts face-related pose landmarks.
+node {
+  calculator: "SplitNormalizedLandmarkListCalculator"
+  input_stream: "pose_landmarks"
+  output_stream: "face_landmarks_from_pose"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 0 end: 11 }
+    }
+  }
+}
+
+# Predicts face landmarks based on the initial pose landmarks.
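+# (The FACE_LANDMARKS_FROM_POSE input carries the 11 face-related pose
+# landmarks extracted above; the subgraph derives the face ROI from them before
+# running the face landmark model.)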
+node {
+  calculator: "FaceLandmarksFromPoseOnnxCUDA"
+  input_stream: "IMAGE:image"
+  input_stream: "FACE_LANDMARKS_FROM_POSE:face_landmarks_from_pose"
+  input_side_packet: "REFINE_LANDMARKS:refine_face_landmarks"
+  output_stream: "FACE_LANDMARKS:face_landmarks"
+}
diff --git a/mediapipe/modules/holistic_landmark/holistic_landmark_onnx_tensorrt.pbtxt b/mediapipe/modules/holistic_landmark/holistic_landmark_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..404b45f41
--- /dev/null
+++ b/mediapipe/modules/holistic_landmark/holistic_landmark_onnx_tensorrt.pbtxt
@@ -0,0 +1,146 @@
+# Predicts pose + left/right hand + face landmarks.
+#
+# It is required that:
+# - "face_detection_short_range.onnx" is available at
+# "mediapipe/modules/face_detection/face_detection_short_range.onnx"
+#
+# - "face_landmark.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark.onnx"
+#
+# - "hand_landmark_full.onnx" is available at
+# "mediapipe/modules/hand_landmark/hand_landmark_full.onnx"
+#
+# - "hand_recrop.onnx" is available at
+# "mediapipe/modules/holistic_landmark/hand_recrop.onnx"
+#
+# - "handedness.txt" is available at
+# "mediapipe/modules/hand_landmark/handedness.txt"
+#
+# - "pose_detection.onnx" is available at
+# "mediapipe/modules/pose_detection/pose_detection.onnx"
+#
+# - "pose_landmark_lite.onnx" or "pose_landmark_full.onnx" or
+# "pose_landmark_heavy.onnx" is available at
+# "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_heavy.onnx"
+# path respectively during execution, depending on the specification in the
+# MODEL_COMPLEXITY input side packet.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "HolisticLandmarkOnnxTensorRT"
+#     input_stream: "IMAGE:input_video"
+#     input_side_packet: "MODEL_COMPLEXITY:model_complexity"
+#     input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
+#     input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+#     input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
+#     input_side_packet: "REFINE_FACE_LANDMARKS:refine_face_landmarks"
+#     input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+#     output_stream: "POSE_LANDMARKS:pose_landmarks"
+#     output_stream: "FACE_LANDMARKS:face_landmarks"
+#     output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+#     output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+#   }
+#
+# NOTE: if a pose/hand/face output is not present in the image, for this
+# particular timestamp there will not be an output packet in the corresponding
+# output stream below. However, the MediaPipe framework will internally inform
+# the downstream calculators of the absence of this packet so that they don't
+# wait for it unnecessarily.
+
+type: "HolisticLandmarkOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+
+# Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as
+# inference latency generally go up with the model complexity. If unspecified,
+# functions as set to 1. (int)
+input_side_packet: "MODEL_COMPLEXITY:model_complexity"
+
+# Whether to filter landmarks across different input images to reduce jitter.
+# If unspecified, functions as set to true. (bool)
+input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
+
+# Whether to predict the segmentation mask. If unspecified, functions as set to
+# false. (bool)
+input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+
+# Whether to filter segmentation mask across different input images to reduce
+# jitter. If unspecified, functions as set to true. (bool)
+input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
+
+# Whether to run the face landmark model with attention on lips and eyes to
+# provide more accuracy, and additionally output iris landmarks. If unspecified,
+# functions as set to false. (bool)
+input_side_packet: "REFINE_FACE_LANDMARKS:refine_face_landmarks"
+
+# Whether landmarks on the previous image should be used to help localize
+# landmarks on the current image. (bool)
+input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+
+# Pose landmarks. (NormalizedLandmarkList)
+# 33 pose landmarks.
+output_stream: "POSE_LANDMARKS:pose_landmarks"
+# 33 pose world landmarks. (LandmarkList)
+output_stream: "WORLD_LANDMARKS:pose_world_landmarks"
+# 21 left hand landmarks. (NormalizedLandmarkList)
+output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+# 21 right hand landmarks. (NormalizedLandmarkList)
+output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+# 468 face landmarks. (NormalizedLandmarkList)
+output_stream: "FACE_LANDMARKS:face_landmarks"
+
+# Segmentation mask. (ImageFrame in ImageFormat::VEC32F1)
+output_stream: "SEGMENTATION_MASK:segmentation_mask"
+
+# Debug outputs
+output_stream: "POSE_ROI:pose_landmarks_roi"
+output_stream: "POSE_DETECTION:pose_detection"
+
+# Predicts pose landmarks.
+node {
+  calculator: "PoseLandmarkOnnxTensorRT"
+  input_stream: "IMAGE:image"
+  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
+  input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
+  input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+  output_stream: "LANDMARKS:pose_landmarks"
+  output_stream: "WORLD_LANDMARKS:pose_world_landmarks"
+  output_stream: "SEGMENTATION_MASK:segmentation_mask"
+  output_stream: "ROI_FROM_LANDMARKS:pose_landmarks_roi"
+  output_stream: "DETECTION:pose_detection"
+}
+
+# Predicts left and right hand landmarks based on the initial pose landmarks.
+node {
+  calculator: "HandLandmarksLeftAndRightOnnxTensorRT"
+  input_stream: "IMAGE:image"
+  input_stream: "POSE_LANDMARKS:pose_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:right_hand_landmarks"
+}
+
+# Extracts face-related pose landmarks.
+node {
+  calculator: "SplitNormalizedLandmarkListCalculator"
+  input_stream: "pose_landmarks"
+  output_stream: "face_landmarks_from_pose"
+  options: {
+    [mediapipe.SplitVectorCalculatorOptions.ext] {
+      ranges: { begin: 0 end: 11 }
+    }
+  }
+}
+
+# Predicts face landmarks based on the initial pose landmarks.
+node {
+  calculator: "FaceLandmarksFromPoseOnnxTensorRT"
+  input_stream: "IMAGE:image"
+  input_stream: "FACE_LANDMARKS_FROM_POSE:face_landmarks_from_pose"
+  input_side_packet: "REFINE_LANDMARKS:refine_face_landmarks"
+  output_stream: "FACE_LANDMARKS:face_landmarks"
+}
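+
+# Note: this graph mirrors HolisticLandmarkOnnxCUDA
+# (holistic_landmark_onnx_cuda.pbtxt). The CUDA and TensorRT variants in this
+# patch differ only in which InferenceCalculator delegate the leaf subgraphs
+# select, e.g. (snippet from hand_recrop_by_roi_onnx_tensorrt.pbtxt above):
+#
+#   options: {
+#     [mediapipe.InferenceCalculatorOptions.ext] {
+#       model_path: "mediapipe/modules/holistic_landmark/hand_recrop.onnx"
+#       delegate { tensorrt {} }  # or: delegate { cuda {} }
+#     }
+#   }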