From 008ed46ee035da058a9d596b5ea3f27130c054eb Mon Sep 17 00:00:00 2001
From: liuyulvv
Date: Fri, 12 Aug 2022 09:42:03 +0800
Subject: [PATCH] pose detection and landmark: support onnxruntime CUDA and
 TensorRT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../examples/desktop/pose_tracking/BUILD      |  40 +++
 mediapipe/graphs/pose_tracking/BUILD          |  34 +++
 .../pose_tracking/pose_tracking_cpu.pbtxt     |   2 +-
 .../pose_tracking_onnx_cuda.pbtxt             |  63 ++++
 .../pose_tracking_onnx_tensorrt.pbtxt         |  63 ++++
 mediapipe/modules/pose_detection/BUILD        |  28 ++
 .../pose_detection_onnx_cuda.pbtxt            | 157 ++++++++++
 .../pose_detection_onnx_tensorrt.pbtxt        | 157 ++++++++++
 mediapipe/modules/pose_landmark/BUILD         |  76 +++++
 .../pose_landmark_by_roi_onnx_cuda.pbtxt      | 165 +++++++++++
 .../pose_landmark_by_roi_onnx_tensorrt.pbtxt  | 165 +++++++++++
 .../pose_landmark_onnx_cuda.pbtxt             | 268 ++++++++++++++++++
 .../pose_landmark_onnx_tensorrt.pbtxt         | 268 ++++++++++++++++++
 13 files changed, 1485 insertions(+), 1 deletion(-)
 create mode 100644 mediapipe/graphs/pose_tracking/pose_tracking_onnx_cuda.pbtxt
 create mode 100644 mediapipe/graphs/pose_tracking/pose_tracking_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/pose_detection/pose_detection_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/pose_detection/pose_detection_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_tensorrt.pbtxt
 create mode 100644 mediapipe/modules/pose_landmark/pose_landmark_onnx_cuda.pbtxt
 create mode 100644 mediapipe/modules/pose_landmark/pose_landmark_onnx_tensorrt.pbtxt

diff --git a/mediapipe/examples/desktop/pose_tracking/BUILD b/mediapipe/examples/desktop/pose_tracking/BUILD
index 447e2dfdc..6ed97470e 100644
--- a/mediapipe/examples/desktop/pose_tracking/BUILD
+++ b/mediapipe/examples/desktop/pose_tracking/BUILD
@@ -24,6 +24,46 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "pose_tracking_cpu_fps",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
+        "//mediapipe/graphs/pose_tracking:pose_tracking_cpu_deps",
+    ],
+)
+
+cc_binary(
+    name = "pose_tracking_onnx_cuda",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main",
+        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_cuda_deps",
+    ],
+)
+
+cc_binary(
+    name = "pose_tracking_onnx_cuda_fps",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
+        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_cuda_deps",
+    ],
+)
+
+cc_binary(
+    name = "pose_tracking_onnx_tensorrt",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main",
+        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_tensorrt_deps",
+    ],
+)
+
+cc_binary(
+    name = "pose_tracking_onnx_tensorrt_fps",
+    deps = [
+        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
+        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_tensorrt_deps",
+    ],
+)
+
 # Linux only
 cc_binary(
     name = "pose_tracking_gpu",
diff --git a/mediapipe/graphs/pose_tracking/BUILD b/mediapipe/graphs/pose_tracking/BUILD
index 26f607c45..fe58da964 100644
--- a/mediapipe/graphs/pose_tracking/BUILD
+++ b/mediapipe/graphs/pose_tracking/BUILD
@@ -54,3 +54,37 @@ mediapipe_binary_graph(
     output_name = "pose_tracking_cpu.binarypb",
     deps = [":pose_tracking_cpu_deps"],
 )
+
+cc_library(
+    name = "pose_tracking_onnx_cuda_deps",
+    deps = [
+        "//mediapipe/calculators/core:constant_side_packet_calculator",
"//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/graphs/pose_tracking/subgraphs:pose_renderer_cpu", + "//mediapipe/modules/pose_landmark:pose_landmark_onnx_cuda", + ], +) + +mediapipe_binary_graph( + name = "pose_tracking_onnx_cuda_binary_graph", + graph = "pose_tracking_onnx_cuda.pbtxt", + output_name = "pose_tracking_onnx_cuda.binarypb", + deps = [":pose_tracking_onnx_cuda_deps"], +) + +cc_library( + name = "pose_tracking_onnx_tensorrt_deps", + deps = [ + "//mediapipe/calculators/core:constant_side_packet_calculator", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/graphs/pose_tracking/subgraphs:pose_renderer_cpu", + "//mediapipe/modules/pose_landmark:pose_landmark_onnx_tensorrt", + ], +) + +mediapipe_binary_graph( + name = "pose_tracking_onnx_tensorrt_binary_graph", + graph = "pose_tracking_onnx_tensorrt.pbtxt", + output_name = "pose_tracking_onnx_tensorrt.binarypb", + deps = [":pose_tracking_onnx_tensorrt"], +) diff --git a/mediapipe/graphs/pose_tracking/pose_tracking_cpu.pbtxt b/mediapipe/graphs/pose_tracking/pose_tracking_cpu.pbtxt index 31d847eb4..92ab719c6 100644 --- a/mediapipe/graphs/pose_tracking/pose_tracking_cpu.pbtxt +++ b/mediapipe/graphs/pose_tracking/pose_tracking_cpu.pbtxt @@ -14,7 +14,7 @@ node { output_side_packet: "PACKET:enable_segmentation" node_options: { [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: { - packet { bool_value: true } + packet { bool_value: false } } } } diff --git a/mediapipe/graphs/pose_tracking/pose_tracking_onnx_cuda.pbtxt b/mediapipe/graphs/pose_tracking/pose_tracking_onnx_cuda.pbtxt new file mode 100644 index 000000000..5d7ee8556 --- /dev/null +++ b/mediapipe/graphs/pose_tracking/pose_tracking_onnx_cuda.pbtxt @@ -0,0 +1,63 @@ +# MediaPipe graph that performs pose tracking with onnxruntime on cuda. + +# CPU buffer. (ImageFrame) +input_stream: "input_video" + +# Output image with rendered results. (ImageFrame) +output_stream: "output_video" +# Pose landmarks. (NormalizedLandmarkList) +output_stream: "pose_landmarks" + +# Generates side packet to enable segmentation. +node { + calculator: "ConstantSidePacketCalculator" + output_side_packet: "PACKET:enable_segmentation" + node_options: { + [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: { + packet { bool_value: false } + } + } +} + +# Throttles the images flowing downstream for flow control. It passes through +# the very first incoming image unaltered, and waits for downstream nodes +# (calculators and subgraphs) in the graph to finish their tasks before it +# passes through another image. All images that come in while waiting are +# dropped, limiting the number of in-flight images in most part of the graph to +# 1. This prevents the downstream nodes from queuing up incoming images and data +# excessively, which leads to increased latency and memory usage, unwanted in +# real-time mobile applications. It also eliminates unnecessarily computation, +# e.g., the output produced by a node may get dropped downstream if the +# subsequent nodes are still busy processing previous inputs. +node { + calculator: "FlowLimiterCalculator" + input_stream: "input_video" + input_stream: "FINISHED:output_video" + input_stream_info: { + tag_index: "FINISHED" + back_edge: true + } + output_stream: "throttled_input_video" +} + +# Subgraph that detects poses and corresponding landmarks. 
+# Subgraph that detects poses and corresponding landmarks.
+node {
+  calculator: "PoseLandmarkOnnxCUDA"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "LANDMARKS:pose_landmarks"
+  output_stream: "SEGMENTATION_MASK:segmentation_mask"
+  output_stream: "DETECTION:pose_detection"
+  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
+}
+
+# Subgraph that renders pose-landmark annotation onto the input image.
+node {
+  calculator: "PoseRendererCpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "LANDMARKS:pose_landmarks"
+  input_stream: "SEGMENTATION_MASK:segmentation_mask"
+  input_stream: "DETECTION:pose_detection"
+  input_stream: "ROI:roi_from_landmarks"
+  output_stream: "IMAGE:output_video"
+}
diff --git a/mediapipe/graphs/pose_tracking/pose_tracking_onnx_tensorrt.pbtxt b/mediapipe/graphs/pose_tracking/pose_tracking_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..ccc1a3b3f
--- /dev/null
+++ b/mediapipe/graphs/pose_tracking/pose_tracking_onnx_tensorrt.pbtxt
@@ -0,0 +1,63 @@
+# MediaPipe graph that performs pose tracking with onnxruntime on tensorrt.
+
+# CPU buffer. (ImageFrame)
+input_stream: "input_video"
+
+# Output image with rendered results. (ImageFrame)
+output_stream: "output_video"
+# Pose landmarks. (NormalizedLandmarkList)
+output_stream: "pose_landmarks"
+
+# Generates the enable_segmentation side packet (set to false here, i.e.
+# segmentation is off).
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:enable_segmentation"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { bool_value: false }
+    }
+  }
+}
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Subgraph that detects poses and corresponding landmarks.
+node {
+  calculator: "PoseLandmarkOnnxTensorRT"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "LANDMARKS:pose_landmarks"
+  output_stream: "SEGMENTATION_MASK:segmentation_mask"
+  output_stream: "DETECTION:pose_detection"
+  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
+}
+
+# Subgraph that renders pose-landmark annotation onto the input image.
+node {
+  calculator: "PoseRendererCpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "LANDMARKS:pose_landmarks"
+  input_stream: "SEGMENTATION_MASK:segmentation_mask"
+  input_stream: "DETECTION:pose_detection"
+  input_stream: "ROI:roi_from_landmarks"
+  output_stream: "IMAGE:output_video"
+}
diff --git a/mediapipe/modules/pose_detection/BUILD b/mediapipe/modules/pose_detection/BUILD
index f4603007e..71e0a2b6c 100644
--- a/mediapipe/modules/pose_detection/BUILD
+++ b/mediapipe/modules/pose_detection/BUILD
@@ -35,6 +35,34 @@ mediapipe_simple_subgraph(
     ],
 )
 
+mediapipe_simple_subgraph(
+    name = "pose_detection_onnx_cuda",
+    graph = "pose_detection_onnx_cuda.pbtxt",
+    register_as = "PoseDetectionOnnxCUDA",
+    deps = [
+        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
+        "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda",
+        "//mediapipe/calculators/tensor:tensors_to_detections_calculator",
+        "//mediapipe/calculators/tflite:ssd_anchors_calculator",
+        "//mediapipe/calculators/util:detection_letterbox_removal_calculator",
+        "//mediapipe/calculators/util:non_max_suppression_calculator",
+    ],
+)
+
+mediapipe_simple_subgraph(
+    name = "pose_detection_onnx_tensorrt",
+    graph = "pose_detection_onnx_tensorrt.pbtxt",
+    register_as = "PoseDetectionOnnxTensorRT",
+    deps = [
+        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
+        "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt",
+        "//mediapipe/calculators/tensor:tensors_to_detections_calculator",
+        "//mediapipe/calculators/tflite:ssd_anchors_calculator",
+        "//mediapipe/calculators/util:detection_letterbox_removal_calculator",
+        "//mediapipe/calculators/util:non_max_suppression_calculator",
+    ],
+)
+
 mediapipe_simple_subgraph(
     name = "pose_detection_gpu",
     graph = "pose_detection_gpu.pbtxt",
diff --git a/mediapipe/modules/pose_detection/pose_detection_onnx_cuda.pbtxt b/mediapipe/modules/pose_detection/pose_detection_onnx_cuda.pbtxt
new file mode 100644
index 000000000..534d1c6c8
--- /dev/null
+++ b/mediapipe/modules/pose_detection/pose_detection_onnx_cuda.pbtxt
@@ -0,0 +1,157 @@
+# MediaPipe graph to detect poses. (CPU input, and inference is executed with
+# onnxruntime on cuda.)
+#
+# It is required that "pose_detection.onnx" is available at
+# "mediapipe/modules/pose_detection/pose_detection.onnx"
+# path during execution.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "PoseDetectionOnnxCUDA"
+#     input_stream: "IMAGE:image"
+#     output_stream: "DETECTIONS:pose_detections"
+#   }
+
+type: "PoseDetectionOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+
+# Detected poses. (std::vector<Detection>)
+# Bounding box in each pose detection is currently set to the bounding box of
+# the detected face. However, 4 additional key points are available in each
+# detection, which are used to further calculate a (rotated) bounding box that
+# encloses the body region of interest. Among the 4 key points, the first two
+# are for identifying the full-body region, and the second two for upper body
+# only:
+#
+# Key point 0 - mid hip center
+# Key point 1 - point that encodes size & rotation (for full body)
+# Key point 2 - mid shoulder center
+# Key point 3 - point that encodes size & rotation (for upper body)
+#
+# NOTE: there will not be an output packet in the DETECTIONS stream for this
+# particular timestamp if no poses are detected. However, the MediaPipe
+# framework will internally inform the downstream calculators of the absence of
+# this packet so that they don't wait for it unnecessarily.
+output_stream: "DETECTIONS:detections" + +# Transforms the input image into a 224x224 one while keeping the aspect ratio +# (what is expected by the corresponding model), resulting in potential +# letterboxing in the transformed image. +node: { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:image" + output_stream: "TENSORS:input_tensors" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 224 + output_tensor_height: 224 + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + border_mode: BORDER_ZERO + # If this calculator truly operates in the CPU, then gpu_origin is + # ignored, but if some build switch insists on GPU inference, then we will + # still need to set this. + gpu_origin: TOP_LEFT + } + } +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + output_stream: "TENSORS:detection_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/modules/pose_detection/pose_detection.onnx" + delegate { cuda {} } + } + } +} + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + options: { + [mediapipe.SsdAnchorsCalculatorOptions.ext] { + num_layers: 5 + min_scale: 0.1484375 + max_scale: 0.75 + input_size_height: 224 + input_size_width: 224 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + strides: 8 + strides: 16 + strides: 32 + strides: 32 + strides: 32 + aspect_ratios: 1.0 + fixed_anchor_size: true + } + } +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. +node { + calculator: "TensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:unfiltered_detections" + options: { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { + num_classes: 1 + num_boxes: 2254 + num_coords: 12 + box_coord_offset: 0 + keypoint_coord_offset: 4 + num_keypoints: 4 + num_values_per_keypoint: 2 + sigmoid_score: true + score_clipping_thresh: 100.0 + reverse_output_order: true + x_scale: 224.0 + y_scale: 224.0 + h_scale: 224.0 + w_scale: 224.0 + min_score_thresh: 0.5 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "unfiltered_detections" + output_stream: "filtered_detections" + options: { + [mediapipe.NonMaxSuppressionCalculatorOptions.ext] { + min_suppression_threshold: 0.3 + overlap_type: INTERSECTION_OVER_UNION + algorithm: WEIGHTED + } + } +} + +# Adjusts detection locations (already normalized to [0.f, 1.f]) on the +# letterboxed image (after image transformation with the FIT scale mode) to the +# corresponding locations on the same image with the letterbox removed (the +# input image to the graph before image transformation). 
+
+# Adjusts detection locations (already normalized to [0.f, 1.f]) on the
+# letterboxed image (after image transformation with the FIT scale mode) to the
+# corresponding locations on the same image with the letterbox removed (the
+# input image to the graph before image transformation).
+node {
+  calculator: "DetectionLetterboxRemovalCalculator"
+  input_stream: "DETECTIONS:filtered_detections"
+  input_stream: "LETTERBOX_PADDING:letterbox_padding"
+  output_stream: "DETECTIONS:detections"
+}
diff --git a/mediapipe/modules/pose_detection/pose_detection_onnx_tensorrt.pbtxt b/mediapipe/modules/pose_detection/pose_detection_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..078e7f4c6
--- /dev/null
+++ b/mediapipe/modules/pose_detection/pose_detection_onnx_tensorrt.pbtxt
@@ -0,0 +1,157 @@
+# MediaPipe graph to detect poses. (CPU input, and inference is executed with
+# onnxruntime on tensorrt.)
+#
+# It is required that "pose_detection.onnx" is available at
+# "mediapipe/modules/pose_detection/pose_detection.onnx"
+# path during execution.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "PoseDetectionOnnxTensorRT"
+#     input_stream: "IMAGE:image"
+#     output_stream: "DETECTIONS:pose_detections"
+#   }
+
+type: "PoseDetectionOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+
+# Detected poses. (std::vector<Detection>)
+# Bounding box in each pose detection is currently set to the bounding box of
+# the detected face. However, 4 additional key points are available in each
+# detection, which are used to further calculate a (rotated) bounding box that
+# encloses the body region of interest. Among the 4 key points, the first two
+# are for identifying the full-body region, and the second two for upper body
+# only:
+#
+# Key point 0 - mid hip center
+# Key point 1 - point that encodes size & rotation (for full body)
+# Key point 2 - mid shoulder center
+# Key point 3 - point that encodes size & rotation (for upper body)
+#
+# NOTE: there will not be an output packet in the DETECTIONS stream for this
+# particular timestamp if no poses are detected. However, the MediaPipe
+# framework will internally inform the downstream calculators of the absence of
+# this packet so that they don't wait for it unnecessarily.
+output_stream: "DETECTIONS:detections"
+
+# Transforms the input image into a 224x224 one while keeping the aspect ratio
+# (what is expected by the corresponding model), resulting in potential
+# letterboxing in the transformed image.
+node: {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:image"
+  output_stream: "TENSORS:input_tensors"
+  output_stream: "LETTERBOX_PADDING:letterbox_padding"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 224
+      output_tensor_height: 224
+      keep_aspect_ratio: true
+      output_tensor_float_range {
+        min: -1.0
+        max: 1.0
+      }
+      border_mode: BORDER_ZERO
+      # If this calculator truly operates on the CPU, then gpu_origin is
+      # ignored, but if some build switch insists on GPU inference, then we
+      # will still need to set this.
+      gpu_origin: TOP_LEFT
+    }
+  }
+}
+
+# Runs an ONNX model with onnxruntime's TensorRT execution provider. The model
+# takes an image tensor and outputs a vector of tensors representing, for
+# instance, detection boxes/keypoints and scores.
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:input_tensors"
+  output_stream: "TENSORS:detection_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      model_path: "mediapipe/modules/pose_detection/pose_detection.onnx"
+      delegate { tensorrt {} }
+    }
+  }
+}
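+
+# For reference, a sketch of the anchor arithmetic (assuming the usual SSD
+# anchor layout with two anchors per location): strides 8/16/32/32/32 over the
+# 224x224 input give 28x28 + 14x14 + 3 * (7x7) = 1127 anchor locations, and
+# 2 * 1127 = 2254 matches the num_boxes decoded further below.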
+
+# Generates a single side packet containing a vector of SSD anchors based on
+# the specification in the options.
+node {
+  calculator: "SsdAnchorsCalculator"
+  output_side_packet: "anchors"
+  options: {
+    [mediapipe.SsdAnchorsCalculatorOptions.ext] {
+      num_layers: 5
+      min_scale: 0.1484375
+      max_scale: 0.75
+      input_size_height: 224
+      input_size_width: 224
+      anchor_offset_x: 0.5
+      anchor_offset_y: 0.5
+      strides: 8
+      strides: 16
+      strides: 32
+      strides: 32
+      strides: 32
+      aspect_ratios: 1.0
+      fixed_anchor_size: true
+    }
+  }
+}
+
+# Decodes the detection tensors generated by the detection model, based on the
+# SSD anchors and the specification in the options, into a vector of
+# detections. Each detection describes a detected object.
+node {
+  calculator: "TensorsToDetectionsCalculator"
+  input_stream: "TENSORS:detection_tensors"
+  input_side_packet: "ANCHORS:anchors"
+  output_stream: "DETECTIONS:unfiltered_detections"
+  options: {
+    [mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
+      num_classes: 1
+      num_boxes: 2254
+      num_coords: 12
+      box_coord_offset: 0
+      keypoint_coord_offset: 4
+      num_keypoints: 4
+      num_values_per_keypoint: 2
+      sigmoid_score: true
+      score_clipping_thresh: 100.0
+      reverse_output_order: true
+      x_scale: 224.0
+      y_scale: 224.0
+      h_scale: 224.0
+      w_scale: 224.0
+      min_score_thresh: 0.5
+    }
+  }
+}
+
+# Performs non-max suppression to remove excessive detections.
+node {
+  calculator: "NonMaxSuppressionCalculator"
+  input_stream: "unfiltered_detections"
+  output_stream: "filtered_detections"
+  options: {
+    [mediapipe.NonMaxSuppressionCalculatorOptions.ext] {
+      min_suppression_threshold: 0.3
+      overlap_type: INTERSECTION_OVER_UNION
+      algorithm: WEIGHTED
+    }
+  }
+}
+
+# Adjusts detection locations (already normalized to [0.f, 1.f]) on the
+# letterboxed image (after image transformation with the FIT scale mode) to the
+# corresponding locations on the same image with the letterbox removed (the
+# input image to the graph before image transformation).
+node { + calculator: "DetectionLetterboxRemovalCalculator" + input_stream: "DETECTIONS:filtered_detections" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "DETECTIONS:detections" +} diff --git a/mediapipe/modules/pose_landmark/BUILD b/mediapipe/modules/pose_landmark/BUILD index 424579a46..b959ff9ae 100644 --- a/mediapipe/modules/pose_landmark/BUILD +++ b/mediapipe/modules/pose_landmark/BUILD @@ -61,6 +61,35 @@ mediapipe_simple_subgraph( ], ) + +mediapipe_simple_subgraph( + name = "pose_landmark_by_roi_onnx_cuda", + graph = "pose_landmark_by_roi_onnx_cuda.pbtxt", + register_as = "PoseLandmarkByRoiOnnxCUDA", + deps = [ + ":pose_landmark_model_loader", + ":pose_landmarks_and_segmentation_inverse_projection", + ":tensors_to_pose_landmarks_and_segmentation", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "pose_landmark_by_roi_onnx_tensorrt", + graph = "pose_landmark_by_roi_onnx_tensorrt.pbtxt", + register_as = "PoseLandmarkByRoiOnnxTensorRT", + deps = [ + ":pose_landmark_model_loader", + ":pose_landmarks_and_segmentation_inverse_projection", + ":tensors_to_pose_landmarks_and_segmentation", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "tensors_to_pose_landmarks_and_segmentation", graph = "tensors_to_pose_landmarks_and_segmentation.pbtxt", @@ -159,10 +188,57 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "pose_landmark_onnx_cuda", + graph = "pose_landmark_onnx_cuda.pbtxt", + register_as = "PoseLandmarkOnnxCUDA", + deps = [ + ":pose_detection_to_roi", + ":pose_landmark_by_roi_onnx_cuda", + ":pose_landmark_filtering", + ":pose_landmarks_to_roi", + ":pose_segmentation_filtering", + "//mediapipe/calculators/core:constant_side_packet_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:merge_calculator", + "//mediapipe/calculators/core:packet_presence_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/core:split_vector_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:from_image_calculator", + "//mediapipe/modules/pose_detection:pose_detection_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "pose_landmark_onnx_tensorrt", + graph = "pose_landmark_onnx_tensorrt.pbtxt", + register_as = "PoseLandmarkOnnxTensorRT", + deps = [ + ":pose_detection_to_roi", + ":pose_landmark_by_roi_onnx_tensorrt", + ":pose_landmark_filtering", + ":pose_landmarks_to_roi", + ":pose_segmentation_filtering", + "//mediapipe/calculators/core:constant_side_packet_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:merge_calculator", + "//mediapipe/calculators/core:packet_presence_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/core:split_vector_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:from_image_calculator", + "//mediapipe/modules/pose_detection:pose_detection_onnx_tensorrt", + ], +) + exports_files( srcs = [ + "pose_landmark_full.onnx", "pose_landmark_full.tflite", + 
"pose_landmark_heavy.onnx", "pose_landmark_heavy.tflite", + "pose_landmark_lite.onnx", "pose_landmark_lite.tflite", ], ) diff --git a/mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_cuda.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_cuda.pbtxt new file mode 100644 index 000000000..d684e7ffd --- /dev/null +++ b/mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_cuda.pbtxt @@ -0,0 +1,165 @@ +# MediaPipe graph to detect/predict pose landmarks and optionally segmentation +# within an ROI. (CPU input, and inference is executed on CPU.) +# +# It is required that "pose_landmark_lite.onnx" or +# "pose_landmark_full.onnx" or "pose_landmark_heavy.onnx" is available at +# "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx" or +# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" or +# "mediapipe/modules/pose_landmark/pose_landmark_heavy.onnx" +# path respectively during execution, depending on the specification in the +# MODEL_COMPLEXITY input side packet. +# +# EXAMPLE: +# node { +# calculator: "PoseLandmarkByRoiOnnxCUDA" +# input_side_packet: "MODEL_COMPLEXITY:model_complexity" +# input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" +# input_stream: "IMAGE:image" +# input_stream: "ROI:roi" +# output_stream: "LANDMARKS:landmarks" +# output_stream: "SEGMENTATION_MASK:segmentation_mask" +# } + +type: "PoseLandmarkByRoiOnnxCUDA" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" +# ROI (region of interest) within the given image where a pose is located. +# (NormalizedRect) +input_stream: "ROI:roi" + +# Whether to predict the segmentation mask. If unspecified, functions as set to +# false. (bool) +input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + +# Pose landmarks within the given ROI. (NormalizedLandmarkList) +# We have 33 landmarks (see pose_landmark_topology.svg) and there are other +# auxiliary key points. +# 0 - nose +# 1 - left eye (inner) +# 2 - left eye +# 3 - left eye (outer) +# 4 - right eye (inner) +# 5 - right eye +# 6 - right eye (outer) +# 7 - left ear +# 8 - right ear +# 9 - mouth (left) +# 10 - mouth (right) +# 11 - left shoulder +# 12 - right shoulder +# 13 - left elbow +# 14 - right elbow +# 15 - left wrist +# 16 - right wrist +# 17 - left pinky +# 18 - right pinky +# 19 - left index +# 20 - right index +# 21 - left thumb +# 22 - right thumb +# 23 - left hip +# 24 - right hip +# 25 - left knee +# 26 - right knee +# 27 - left ankle +# 28 - right ankle +# 29 - left heel +# 30 - right heel +# 31 - left foot index +# 32 - right foot index +# +# NOTE: If a pose is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:landmarks" +# Auxiliary landmarks for deriving the ROI in the subsequent image. +# (NormalizedLandmarkList) +output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" + +# Pose world landmarks within the given ROI. (LandmarkList) +# World landmarks are real-world 3D coordinates in meters with the origin at the +# center between hips. WORLD_LANDMARKS shares the same landmark topology as +# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object +# projected onto the 2D image surface, while WORLD_LANDMARKS provides +# coordinates (in meters) of the 3D object itself. 
+output_stream: "WORLD_LANDMARKS:world_landmarks" + +# Segmentation mask on CPU in ImageFormat::VEC32F1. (Image) +output_stream: "SEGMENTATION_MASK:segmentation_mask" + +# Retrieves the image size. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_CPU:image" + output_stream: "SIZE:image_size" +} + +# Crops and transforms the specified ROI in the input image into an image patch +# represented as a tensor of dimension expected by the corresponding ML model, +# while maintaining the aspect ratio of the ROI (which can be different from +# that of the image patch). Therefore, there can be letterboxing around the ROI +# in the generated tensor representation. +node: { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:image" + input_stream: "NORM_RECT:roi" + output_stream: "TENSORS:input_tensors" + output_stream: "LETTERBOX_PADDING:letterbox_padding" + output_stream: "MATRIX:transformation_matrix" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 256 + output_tensor_height: 256 + keep_aspect_ratio: true + output_tensor_float_range { + min: 0.0 + max: 1.0 + } + } + } +} + +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + output_stream: "TENSORS:output_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + model_path: "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" + delegate { cuda {} } + } + } +} + +# Decodes the tensors into the corresponding landmark and segmentation mask +# representation. +node { + calculator: "TensorsToPoseLandmarksAndSegmentation" + input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + input_stream: "TENSORS:output_tensors" + output_stream: "LANDMARKS:roi_landmarks" + output_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks" + output_stream: "WORLD_LANDMARKS:roi_world_landmarks" + output_stream: "SEGMENTATION_MASK:roi_segmentation_mask" +} + +# Projects the landmarks and segmentation mask in the local coordinates of the +# (potentially letterboxed) ROI back to the global coordinates of the full input +# image. +node { + calculator: "PoseLandmarksAndSegmentationInverseProjection" + input_stream: "IMAGE_SIZE:image_size" + input_stream: "NORM_RECT:roi" + input_stream: "LETTERBOX_PADDING:letterbox_padding" + input_stream: "MATRIX:transformation_matrix" + input_stream: "LANDMARKS:roi_landmarks" + input_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks" + input_stream: "WORLD_LANDMARKS:roi_world_landmarks" + input_stream: "SEGMENTATION_MASK:roi_segmentation_mask" + output_stream: "LANDMARKS:landmarks" + output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" + output_stream: "WORLD_LANDMARKS:world_landmarks" + output_stream: "SEGMENTATION_MASK:segmentation_mask" +} diff --git a/mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_tensorrt.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..9da639b36 --- /dev/null +++ b/mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_tensorrt.pbtxt @@ -0,0 +1,165 @@ +# MediaPipe graph to detect/predict pose landmarks and optionally segmentation +# within an ROI. (CPU input, and inference is executed on CPU.) 
+# +# It is required that "pose_landmark_lite.onnx" or +# "pose_landmark_full.onnx" or "pose_landmark_heavy.onnx" is available at +# "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx" or +# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" or +# "mediapipe/modules/pose_landmark/pose_landmark_heavy.onnx" +# path respectively during execution, depending on the specification in the +# MODEL_COMPLEXITY input side packet. +# +# EXAMPLE: +# node { +# calculator: "PoseLandmarkByRoiOnnxTensorRT" +# input_side_packet: "MODEL_COMPLEXITY:model_complexity" +# input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" +# input_stream: "IMAGE:image" +# input_stream: "ROI:roi" +# output_stream: "LANDMARKS:landmarks" +# output_stream: "SEGMENTATION_MASK:segmentation_mask" +# } + +type: "PoseLandmarkByRoiOnnxTensorRT" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" +# ROI (region of interest) within the given image where a pose is located. +# (NormalizedRect) +input_stream: "ROI:roi" + +# Whether to predict the segmentation mask. If unspecified, functions as set to +# false. (bool) +input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + +# Pose landmarks within the given ROI. (NormalizedLandmarkList) +# We have 33 landmarks (see pose_landmark_topology.svg) and there are other +# auxiliary key points. +# 0 - nose +# 1 - left eye (inner) +# 2 - left eye +# 3 - left eye (outer) +# 4 - right eye (inner) +# 5 - right eye +# 6 - right eye (outer) +# 7 - left ear +# 8 - right ear +# 9 - mouth (left) +# 10 - mouth (right) +# 11 - left shoulder +# 12 - right shoulder +# 13 - left elbow +# 14 - right elbow +# 15 - left wrist +# 16 - right wrist +# 17 - left pinky +# 18 - right pinky +# 19 - left index +# 20 - right index +# 21 - left thumb +# 22 - right thumb +# 23 - left hip +# 24 - right hip +# 25 - left knee +# 26 - right knee +# 27 - left ankle +# 28 - right ankle +# 29 - left heel +# 30 - right heel +# 31 - left foot index +# 32 - right foot index +# +# NOTE: If a pose is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:landmarks" +# Auxiliary landmarks for deriving the ROI in the subsequent image. +# (NormalizedLandmarkList) +output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks" + +# Pose world landmarks within the given ROI. (LandmarkList) +# World landmarks are real-world 3D coordinates in meters with the origin at the +# center between hips. WORLD_LANDMARKS shares the same landmark topology as +# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object +# projected onto the 2D image surface, while WORLD_LANDMARKS provides +# coordinates (in meters) of the 3D object itself. +output_stream: "WORLD_LANDMARKS:world_landmarks" + +# Segmentation mask on CPU in ImageFormat::VEC32F1. (Image) +output_stream: "SEGMENTATION_MASK:segmentation_mask" + +# Retrieves the image size. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_CPU:image" + output_stream: "SIZE:image_size" +} + +# Crops and transforms the specified ROI in the input image into an image patch +# represented as a tensor of dimension expected by the corresponding ML model, +# while maintaining the aspect ratio of the ROI (which can be different from +# that of the image patch). 
Therefore, there can be letterboxing around the ROI
+# in the generated tensor representation.
+node: {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:image"
+  input_stream: "NORM_RECT:roi"
+  output_stream: "TENSORS:input_tensors"
+  output_stream: "LETTERBOX_PADDING:letterbox_padding"
+  output_stream: "MATRIX:transformation_matrix"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 256
+      output_tensor_height: 256
+      keep_aspect_ratio: true
+      output_tensor_float_range {
+        min: 0.0
+        max: 1.0
+      }
+    }
+  }
+}
+
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:input_tensors"
+  output_stream: "TENSORS:output_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      model_path: "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
+      delegate { tensorrt {} }
+    }
+  }
+}
+
+# Decodes the tensors into the corresponding landmark and segmentation mask
+# representation.
+node {
+  calculator: "TensorsToPoseLandmarksAndSegmentation"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_stream: "TENSORS:output_tensors"
+  output_stream: "LANDMARKS:roi_landmarks"
+  output_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks"
+  output_stream: "WORLD_LANDMARKS:roi_world_landmarks"
+  output_stream: "SEGMENTATION_MASK:roi_segmentation_mask"
+}
+
+# Projects the landmarks and segmentation mask in the local coordinates of the
+# (potentially letterboxed) ROI back to the global coordinates of the full input
+# image.
+node {
+  calculator: "PoseLandmarksAndSegmentationInverseProjection"
+  input_stream: "IMAGE_SIZE:image_size"
+  input_stream: "NORM_RECT:roi"
+  input_stream: "LETTERBOX_PADDING:letterbox_padding"
+  input_stream: "MATRIX:transformation_matrix"
+  input_stream: "LANDMARKS:roi_landmarks"
+  input_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks"
+  input_stream: "WORLD_LANDMARKS:roi_world_landmarks"
+  input_stream: "SEGMENTATION_MASK:roi_segmentation_mask"
+  output_stream: "LANDMARKS:landmarks"
+  output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks"
+  output_stream: "WORLD_LANDMARKS:world_landmarks"
+  output_stream: "SEGMENTATION_MASK:segmentation_mask"
+}
diff --git a/mediapipe/modules/pose_landmark/pose_landmark_onnx_cuda.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_onnx_cuda.pbtxt
new file mode 100644
index 000000000..231857922
--- /dev/null
+++ b/mediapipe/modules/pose_landmark/pose_landmark_onnx_cuda.pbtxt
@@ -0,0 +1,268 @@
+# MediaPipe graph to detect/predict pose landmarks. (CPU input, and inference
+# is executed with onnxruntime on cuda.) This graph tries to skip pose
+# detection as much as possible by using previously detected/predicted
+# landmarks for new images.
+#
+# It is required that "pose_detection.onnx" is available at
+# "mediapipe/modules/pose_detection/pose_detection.onnx"
+# path during execution.
+#
+# It is required that "pose_landmark_lite.onnx" or
+# "pose_landmark_full.onnx" or "pose_landmark_heavy.onnx" is available at
+# "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_heavy.onnx"
+# path respectively during execution, depending on the specification in the
+# MODEL_COMPLEXITY input side packet.
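+#
+# NOTE: while MODEL_COMPLEXITY is declared below for parity with the tflite
+# graphs, the onnx subgraphs in this patch hardcode the "full" model in their
+# InferenceCalculator. To run a different complexity, point model_path at the
+# desired file instead, e.g. (a sketch):
+#
+#   options: {
+#     [mediapipe.InferenceCalculatorOptions.ext] {
+#       model_path: "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx"
+#       delegate { cuda {} }
+#     }
+#   }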
+# +# EXAMPLE: +# node { +# calculator: "PoseLandmarkOnnxCUDA" +# input_side_packet: "MODEL_COMPLEXITY:model_complexity" +# input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks" +# input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" +# input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation" +# input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks" +# input_stream: "IMAGE:image" +# output_stream: "LANDMARKS:pose_landmarks" +# output_stream: "SEGMENTATION_MASK:segmentation_mask" +# } + +type: "PoseLandmarkOnnxCUDA" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" + +# Whether to filter landmarks across different input images to reduce jitter. +# If unspecified, functions as set to true. (bool) +input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks" + +# Whether to predict the segmentation mask. If unspecified, functions as set to +# false. (bool) +input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + +# Whether to filter segmentation mask across different input images to reduce +# jitter. If unspecified, functions as set to true. (bool) +input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation" + +# Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as +# inference latency generally go up with the model complexity. If unspecified, +# functions as set to 1. (int) +input_side_packet: "MODEL_COMPLEXITY:model_complexity" + +# Whether landmarks on the previous image should be used to help localize +# landmarks on the current image. (bool) +input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks" + +# Pose landmarks. (NormalizedLandmarkList) +# We have 33 landmarks (see pose_landmark_topology.svg), and there are other +# auxiliary key points. +# 0 - nose +# 1 - left eye (inner) +# 2 - left eye +# 3 - left eye (outer) +# 4 - right eye (inner) +# 5 - right eye +# 6 - right eye (outer) +# 7 - left ear +# 8 - right ear +# 9 - mouth (left) +# 10 - mouth (right) +# 11 - left shoulder +# 12 - right shoulder +# 13 - left elbow +# 14 - right elbow +# 15 - left wrist +# 16 - right wrist +# 17 - left pinky +# 18 - right pinky +# 19 - left index +# 20 - right index +# 21 - left thumb +# 22 - right thumb +# 23 - left hip +# 24 - right hip +# 25 - left knee +# 26 - right knee +# 27 - left ankle +# 28 - right ankle +# 29 - left heel +# 30 - right heel +# 31 - left foot index +# 32 - right foot index +# +# NOTE: if a pose is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:pose_landmarks" + +# Pose world landmarks. (LandmarkList) +# World landmarks are real-world 3D coordinates in meters with the origin at the +# center between hips. WORLD_LANDMARKS shares the same landmark topology as +# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object +# projected onto the 2D image surface, while WORLD_LANDMARKS provides +# coordinates (in meters) of the 3D object itself. +output_stream: "WORLD_LANDMARKS:pose_world_landmarks" + +# Segmentation mask. (ImageFrame in ImageFormat::VEC32F1) +output_stream: "SEGMENTATION_MASK:segmentation_mask" + +# Extra outputs (for debugging, for instance). +# Detected poses. (Detection) +output_stream: "DETECTION:pose_detection" +# Regions of interest calculated based on landmarks. 
(NormalizedRect) +output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks" +# Regions of interest calculated based on pose detections. (NormalizedRect) +output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection" + +# When the optional input side packet "use_prev_landmarks" is either absent or +# set to true, uses the landmarks on the previous image to help localize +# landmarks on the current image. +node { + calculator: "GateCalculator" + input_side_packet: "ALLOW:use_prev_landmarks" + input_stream: "prev_pose_rect_from_landmarks" + output_stream: "gated_prev_pose_rect_from_landmarks" + options: { + [mediapipe.GateCalculatorOptions.ext] { + allow: true + } + } +} + +# Checks if there's previous pose rect calculated from landmarks. +node: { + calculator: "PacketPresenceCalculator" + input_stream: "PACKET:gated_prev_pose_rect_from_landmarks" + output_stream: "PRESENCE:prev_pose_rect_from_landmarks_is_present" +} + +# Calculates size of the image. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_CPU:image" + output_stream: "SIZE:image_size" +} + +# Drops the incoming image if the pose has already been identified from the +# previous image. Otherwise, passes the incoming image through to trigger a new +# round of pose detection. +node { + calculator: "GateCalculator" + input_stream: "image" + input_stream: "image_size" + input_stream: "DISALLOW:prev_pose_rect_from_landmarks_is_present" + output_stream: "image_for_pose_detection" + output_stream: "image_size_for_pose_detection" + options: { + [mediapipe.GateCalculatorOptions.ext] { + empty_packets_as_allow: true + } + } +} + +# Detects poses. +node { + calculator: "PoseDetectionOnnxCUDA" + input_stream: "IMAGE:image_for_pose_detection" + output_stream: "DETECTIONS:pose_detections" +} + +# Gets the very first detection from "pose_detections" vector. +node { + calculator: "SplitDetectionVectorCalculator" + input_stream: "pose_detections" + output_stream: "pose_detection" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 1 } + element_only: true + } + } +} + +# Calculates region of interest based on pose detection, so that can be used +# to detect landmarks. +node { + calculator: "PoseDetectionToRoi" + input_stream: "DETECTION:pose_detection" + input_stream: "IMAGE_SIZE:image_size_for_pose_detection" + output_stream: "ROI:pose_rect_from_detection" +} + +# Selects either pose rect (or ROI) calculated from detection or from previously +# detected landmarks if available (in this case, calculation of pose rect from +# detection is skipped). +node { + calculator: "MergeCalculator" + input_stream: "pose_rect_from_detection" + input_stream: "gated_prev_pose_rect_from_landmarks" + output_stream: "pose_rect" +} + +# Detects pose landmarks within specified region of interest of the image. +node { + calculator: "PoseLandmarkByRoiOnnxCUDA" + input_side_packet: "MODEL_COMPLEXITY:model_complexity" + input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + input_stream: "IMAGE:image" + input_stream: "ROI:pose_rect" + output_stream: "LANDMARKS:unfiltered_pose_landmarks" + output_stream: "AUXILIARY_LANDMARKS:unfiltered_auxiliary_landmarks" + output_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks" + output_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask" +} + +# Smoothes landmarks to reduce jitter. 
+node {
+  calculator: "PoseLandmarkFiltering"
+  input_side_packet: "ENABLE:smooth_landmarks"
+  input_stream: "IMAGE_SIZE:image_size"
+  input_stream: "NORM_LANDMARKS:unfiltered_pose_landmarks"
+  input_stream: "AUX_NORM_LANDMARKS:unfiltered_auxiliary_landmarks"
+  input_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks"
+  output_stream: "FILTERED_NORM_LANDMARKS:pose_landmarks"
+  output_stream: "FILTERED_AUX_NORM_LANDMARKS:auxiliary_landmarks"
+  output_stream: "FILTERED_WORLD_LANDMARKS:pose_world_landmarks"
+}
+
+# Calculates region of interest based on the auxiliary landmarks, to be used in
+# the subsequent image.
+node {
+  calculator: "PoseLandmarksToRoi"
+  input_stream: "LANDMARKS:auxiliary_landmarks"
+  input_stream: "IMAGE_SIZE:image_size"
+  output_stream: "ROI:pose_rect_from_landmarks"
+}
+
+# Caches pose rects calculated from landmarks, and upon the arrival of the next
+# input image, sends out the cached rects with timestamps replaced by that of
+# the input image, essentially generating a packet that carries the previous
+# pose rects. Note that upon the arrival of the very first input image, a
+# timestamp bound update occurs to jump start the feedback loop.
+node {
+  calculator: "PreviousLoopbackCalculator"
+  input_stream: "MAIN:image"
+  input_stream: "LOOP:pose_rect_from_landmarks"
+  input_stream_info: {
+    tag_index: "LOOP"
+    back_edge: true
+  }
+  output_stream: "PREV_LOOP:prev_pose_rect_from_landmarks"
+}
+
+# Smoothes segmentation to reduce jitter.
+node {
+  calculator: "PoseSegmentationFiltering"
+  input_side_packet: "ENABLE:smooth_segmentation"
+  input_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask"
+  output_stream: "FILTERED_SEGMENTATION_MASK:filtered_segmentation_mask"
+}
+
+# Converts the incoming segmentation mask represented as an Image into the
+# corresponding ImageFrame type.
+node: {
+  calculator: "FromImageCalculator"
+  input_stream: "IMAGE:filtered_segmentation_mask"
+  output_stream: "IMAGE_CPU:segmentation_mask"
+}
diff --git a/mediapipe/modules/pose_landmark/pose_landmark_onnx_tensorrt.pbtxt b/mediapipe/modules/pose_landmark/pose_landmark_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..a21256f12
--- /dev/null
+++ b/mediapipe/modules/pose_landmark/pose_landmark_onnx_tensorrt.pbtxt
@@ -0,0 +1,268 @@
+# MediaPipe graph to detect/predict pose landmarks. (CPU input, and inference
+# is executed with onnxruntime on tensorrt.) This graph tries to skip pose
+# detection as much as possible by using previously detected/predicted
+# landmarks for new images.
+#
+# It is required that "pose_detection.onnx" is available at
+# "mediapipe/modules/pose_detection/pose_detection.onnx"
+# path during execution.
+#
+# It is required that "pose_landmark_lite.onnx" or
+# "pose_landmark_full.onnx" or "pose_landmark_heavy.onnx" is available at
+# "mediapipe/modules/pose_landmark/pose_landmark_lite.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx" or
+# "mediapipe/modules/pose_landmark/pose_landmark_heavy.onnx"
+# path respectively during execution, depending on the specification in the
+# MODEL_COMPLEXITY input side packet.
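+#
+# NOTE: with the TensorRT execution provider, onnxruntime typically builds a
+# TensorRT engine for each model on first use, so the first frames can be
+# noticeably slower than with the CUDA provider (general onnxruntime behavior,
+# not specific to this graph). A sketch of trying this graph through the
+# desktop demo target added in this patch:
+#
+#   bazel build -c opt \
+#     mediapipe/examples/desktop/pose_tracking:pose_tracking_onnx_tensorrt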
+# +# EXAMPLE: +# node { +# calculator: "PoseLandmarkOnnxTensorRT" +# input_side_packet: "MODEL_COMPLEXITY:model_complexity" +# input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks" +# input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" +# input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation" +# input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks" +# input_stream: "IMAGE:image" +# output_stream: "LANDMARKS:pose_landmarks" +# output_stream: "SEGMENTATION_MASK:segmentation_mask" +# } + +type: "PoseLandmarkOnnxTensorRT" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" + +# Whether to filter landmarks across different input images to reduce jitter. +# If unspecified, functions as set to true. (bool) +input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks" + +# Whether to predict the segmentation mask. If unspecified, functions as set to +# false. (bool) +input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + +# Whether to filter segmentation mask across different input images to reduce +# jitter. If unspecified, functions as set to true. (bool) +input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation" + +# Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as +# inference latency generally go up with the model complexity. If unspecified, +# functions as set to 1. (int) +input_side_packet: "MODEL_COMPLEXITY:model_complexity" + +# Whether landmarks on the previous image should be used to help localize +# landmarks on the current image. (bool) +input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks" + +# Pose landmarks. (NormalizedLandmarkList) +# We have 33 landmarks (see pose_landmark_topology.svg), and there are other +# auxiliary key points. +# 0 - nose +# 1 - left eye (inner) +# 2 - left eye +# 3 - left eye (outer) +# 4 - right eye (inner) +# 5 - right eye +# 6 - right eye (outer) +# 7 - left ear +# 8 - right ear +# 9 - mouth (left) +# 10 - mouth (right) +# 11 - left shoulder +# 12 - right shoulder +# 13 - left elbow +# 14 - right elbow +# 15 - left wrist +# 16 - right wrist +# 17 - left pinky +# 18 - right pinky +# 19 - left index +# 20 - right index +# 21 - left thumb +# 22 - right thumb +# 23 - left hip +# 24 - right hip +# 25 - left knee +# 26 - right knee +# 27 - left ankle +# 28 - right ankle +# 29 - left heel +# 30 - right heel +# 31 - left foot index +# 32 - right foot index +# +# NOTE: if a pose is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:pose_landmarks" + +# Pose world landmarks. (LandmarkList) +# World landmarks are real-world 3D coordinates in meters with the origin at the +# center between hips. WORLD_LANDMARKS shares the same landmark topology as +# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object +# projected onto the 2D image surface, while WORLD_LANDMARKS provides +# coordinates (in meters) of the 3D object itself. +output_stream: "WORLD_LANDMARKS:pose_world_landmarks" + +# Segmentation mask. (ImageFrame in ImageFormat::VEC32F1) +output_stream: "SEGMENTATION_MASK:segmentation_mask" + +# Extra outputs (for debugging, for instance). +# Detected poses. (Detection) +output_stream: "DETECTION:pose_detection" +# Regions of interest calculated based on landmarks. 
(NormalizedRect) +output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks" +# Regions of interest calculated based on pose detections. (NormalizedRect) +output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection" + +# When the optional input side packet "use_prev_landmarks" is either absent or +# set to true, uses the landmarks on the previous image to help localize +# landmarks on the current image. +node { + calculator: "GateCalculator" + input_side_packet: "ALLOW:use_prev_landmarks" + input_stream: "prev_pose_rect_from_landmarks" + output_stream: "gated_prev_pose_rect_from_landmarks" + options: { + [mediapipe.GateCalculatorOptions.ext] { + allow: true + } + } +} + +# Checks if there's previous pose rect calculated from landmarks. +node: { + calculator: "PacketPresenceCalculator" + input_stream: "PACKET:gated_prev_pose_rect_from_landmarks" + output_stream: "PRESENCE:prev_pose_rect_from_landmarks_is_present" +} + +# Calculates size of the image. +node { + calculator: "ImagePropertiesCalculator" + input_stream: "IMAGE_CPU:image" + output_stream: "SIZE:image_size" +} + +# Drops the incoming image if the pose has already been identified from the +# previous image. Otherwise, passes the incoming image through to trigger a new +# round of pose detection. +node { + calculator: "GateCalculator" + input_stream: "image" + input_stream: "image_size" + input_stream: "DISALLOW:prev_pose_rect_from_landmarks_is_present" + output_stream: "image_for_pose_detection" + output_stream: "image_size_for_pose_detection" + options: { + [mediapipe.GateCalculatorOptions.ext] { + empty_packets_as_allow: true + } + } +} + +# Detects poses. +node { + calculator: "PoseDetectionOnnxTensorRT" + input_stream: "IMAGE:image_for_pose_detection" + output_stream: "DETECTIONS:pose_detections" +} + +# Gets the very first detection from "pose_detections" vector. +node { + calculator: "SplitDetectionVectorCalculator" + input_stream: "pose_detections" + output_stream: "pose_detection" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 1 } + element_only: true + } + } +} + +# Calculates region of interest based on pose detection, so that can be used +# to detect landmarks. +node { + calculator: "PoseDetectionToRoi" + input_stream: "DETECTION:pose_detection" + input_stream: "IMAGE_SIZE:image_size_for_pose_detection" + output_stream: "ROI:pose_rect_from_detection" +} + +# Selects either pose rect (or ROI) calculated from detection or from previously +# detected landmarks if available (in this case, calculation of pose rect from +# detection is skipped). +node { + calculator: "MergeCalculator" + input_stream: "pose_rect_from_detection" + input_stream: "gated_prev_pose_rect_from_landmarks" + output_stream: "pose_rect" +} + +# Detects pose landmarks within specified region of interest of the image. +node { + calculator: "PoseLandmarkByRoiOnnxTensorRT" + input_side_packet: "MODEL_COMPLEXITY:model_complexity" + input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation" + input_stream: "IMAGE:image" + input_stream: "ROI:pose_rect" + output_stream: "LANDMARKS:unfiltered_pose_landmarks" + output_stream: "AUXILIARY_LANDMARKS:unfiltered_auxiliary_landmarks" + output_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks" + output_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask" +} + +# Smoothes landmarks to reduce jitter. 
+node { + calculator: "PoseLandmarkFiltering" + input_side_packet: "ENABLE:smooth_landmarks" + input_stream: "IMAGE_SIZE:image_size" + input_stream: "NORM_LANDMARKS:unfiltered_pose_landmarks" + input_stream: "AUX_NORM_LANDMARKS:unfiltered_auxiliary_landmarks" + input_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks" + output_stream: "FILTERED_NORM_LANDMARKS:pose_landmarks" + output_stream: "FILTERED_AUX_NORM_LANDMARKS:auxiliary_landmarks" + output_stream: "FILTERED_WORLD_LANDMARKS:pose_world_landmarks" +} + +# Calculates region of interest based on the auxiliary landmarks, to be used in +# the subsequent image. +node { + calculator: "PoseLandmarksToRoi" + input_stream: "LANDMARKS:auxiliary_landmarks" + input_stream: "IMAGE_SIZE:image_size" + output_stream: "ROI:pose_rect_from_landmarks" +} + +# Caches pose rects calculated from landmarks, and upon the arrival of the next +# input image, sends out the cached rects with timestamps replaced by that of +# the input image, essentially generating a packet that carries the previous +# pose rects. Note that upon the arrival of the very first input image, a +# timestamp bound update occurs to jump start the feedback loop. +node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:image" + input_stream: "LOOP:pose_rect_from_landmarks" + input_stream_info: { + tag_index: "LOOP" + back_edge: true + } + output_stream: "PREV_LOOP:prev_pose_rect_from_landmarks" +} + +# Smoothes segmentation to reduce jitter. +node { + calculator: "PoseSegmentationFiltering" + input_side_packet: "ENABLE:smooth_segmentation" + input_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask" + output_stream: "FILTERED_SEGMENTATION_MASK:filtered_segmentation_mask" +} + +# Converts the incoming segmentation mask represented as an Image into the +# corresponding ImageFrame type. +node: { + calculator: "FromImageCalculator" + input_stream: "IMAGE:filtered_segmentation_mask" + output_stream: "IMAGE_CPU:segmentation_mask" +}
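+
+# NOTE: the pose_tracking_onnx_* graphs in this patch construct
+# "enable_segmentation" as a constant false side packet, so the
+# SEGMENTATION_MASK output above will carry no packets in those graphs unless
+# that constant is flipped to true.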