pose detection and landmark: support onnxruntime CUDA and TensorRT
This commit is contained in:
parent f3bf3ab3e3
commit 008ed46ee0
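Note: the new pose_tracking_onnx_cuda / pose_tracking_onnx_tensorrt binaries below reuse the existing demo_run_graph_main driver. For orientation only, that driver reduces to roughly the following MediaPipe C++ pattern; this sketch is not part of the commit, and GrabFrame() is a placeholder for the demo's OpenCV capture loop:

    #include <cstdint>
    #include <memory>
    #include <string>

    #include "mediapipe/framework/calculator_framework.h"
    #include "mediapipe/framework/formats/image_frame.h"
    #include "mediapipe/framework/port/parse_text_proto.h"
    #include "mediapipe/framework/port/status.h"

    // Placeholder for the demo's camera/video capture (illustrative only).
    std::unique_ptr<mediapipe::ImageFrame> GrabFrame();

    // Runs a graph such as pose_tracking_onnx_cuda.pbtxt, whose text is
    // passed in via `graph_text`.
    absl::Status RunPoseTracking(const std::string& graph_text) {
      auto config =
          mediapipe::ParseTextProtoOrDie<mediapipe::CalculatorGraphConfig>(
              graph_text);
      mediapipe::CalculatorGraph graph;
      MP_RETURN_IF_ERROR(graph.Initialize(config));

      // Poll rendered frames from the graph's "output_video" stream.
      ASSIGN_OR_RETURN(mediapipe::OutputStreamPoller poller,
                       graph.AddOutputStreamPoller("output_video"));
      MP_RETURN_IF_ERROR(graph.StartRun({}));

      int64_t ts = 0;
      while (auto frame = GrabFrame()) {
        MP_RETURN_IF_ERROR(graph.AddPacketToInputStream(
            "input_video",
            mediapipe::Adopt(frame.release()).At(mediapipe::Timestamp(ts++))));
        mediapipe::Packet packet;
        if (!poller.Next(&packet)) break;
        const auto& output = packet.Get<mediapipe::ImageFrame>();
        (void)output;  // displayed / encoded in the real demo
      }
      MP_RETURN_IF_ERROR(graph.CloseInputStream("input_video"));
      return graph.WaitUntilDone();
    }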
@@ -24,6 +24,46 @@ cc_binary(
    ],
)

cc_binary(
    name = "pose_tracking_cpu_fps",
    deps = [
        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
        "//mediapipe/graphs/pose_tracking:pose_tracking_cpu_deps",
    ],
)

cc_binary(
    name = "pose_tracking_onnx_cuda",
    deps = [
        "//mediapipe/examples/desktop:demo_run_graph_main",
        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_cuda_deps",
    ],
)

cc_binary(
    name = "pose_tracking_onnx_cuda_fps",
    deps = [
        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_cuda_deps",
    ],
)

cc_binary(
    name = "pose_tracking_onnx_tensorrt",
    deps = [
        "//mediapipe/examples/desktop:demo_run_graph_main",
        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_tensorrt_deps",
    ],
)

cc_binary(
    name = "pose_tracking_onnx_tensorrt_fps",
    deps = [
        "//mediapipe/examples/desktop:demo_run_graph_main_fps",
        "//mediapipe/graphs/pose_tracking:pose_tracking_onnx_tensorrt_deps",
    ],
)

# Linux only
cc_binary(
    name = "pose_tracking_gpu",
@@ -54,3 +54,37 @@ mediapipe_binary_graph(
    output_name = "pose_tracking_cpu.binarypb",
    deps = [":pose_tracking_cpu_deps"],
)

cc_library(
    name = "pose_tracking_onnx_cuda_deps",
    deps = [
        "//mediapipe/calculators/core:constant_side_packet_calculator",
        "//mediapipe/calculators/core:flow_limiter_calculator",
        "//mediapipe/graphs/pose_tracking/subgraphs:pose_renderer_cpu",
        "//mediapipe/modules/pose_landmark:pose_landmark_onnx_cuda",
    ],
)

mediapipe_binary_graph(
    name = "pose_tracking_onnx_cuda_binary_graph",
    graph = "pose_tracking_onnx_cuda.pbtxt",
    output_name = "pose_tracking_onnx_cuda.binarypb",
    deps = [":pose_tracking_onnx_cuda_deps"],
)

cc_library(
    name = "pose_tracking_onnx_tensorrt_deps",
    deps = [
        "//mediapipe/calculators/core:constant_side_packet_calculator",
        "//mediapipe/calculators/core:flow_limiter_calculator",
        "//mediapipe/graphs/pose_tracking/subgraphs:pose_renderer_cpu",
        "//mediapipe/modules/pose_landmark:pose_landmark_onnx_tensorrt",
    ],
)

mediapipe_binary_graph(
    name = "pose_tracking_onnx_tensorrt_binary_graph",
    graph = "pose_tracking_onnx_tensorrt.pbtxt",
    output_name = "pose_tracking_onnx_tensorrt.binarypb",
    deps = [":pose_tracking_onnx_tensorrt_deps"],
)
@@ -14,7 +14,7 @@ node {
   output_side_packet: "PACKET:enable_segmentation"
   node_options: {
     [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
-      packet { bool_value: true }
+      packet { bool_value: false }
     }
   }
 }
mediapipe/graphs/pose_tracking/pose_tracking_onnx_cuda.pbtxt (new file, 63 lines)

@@ -0,0 +1,63 @@
# MediaPipe graph that performs pose tracking with onnxruntime on CUDA.

# CPU buffer. (ImageFrame)
input_stream: "input_video"

# Output image with rendered results. (ImageFrame)
output_stream: "output_video"
# Pose landmarks. (NormalizedLandmarkList)
output_stream: "pose_landmarks"

# Generates side packet to enable segmentation.
node {
  calculator: "ConstantSidePacketCalculator"
  output_side_packet: "PACKET:enable_segmentation"
  node_options: {
    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
      packet { bool_value: false }
    }
  }
}

# Throttles the images flowing downstream for flow control. It passes through
# the very first incoming image unaltered, and waits for downstream nodes
# (calculators and subgraphs) in the graph to finish their tasks before it
# passes through another image. All images that come in while waiting are
# dropped, limiting the number of in-flight images in most part of the graph to
# 1. This prevents the downstream nodes from queuing up incoming images and data
# excessively, which leads to increased latency and memory usage, unwanted in
# real-time mobile applications. It also eliminates unnecessary computation,
# e.g., the output produced by a node may get dropped downstream if the
# subsequent nodes are still busy processing previous inputs.
node {
  calculator: "FlowLimiterCalculator"
  input_stream: "input_video"
  input_stream: "FINISHED:output_video"
  input_stream_info: {
    tag_index: "FINISHED"
    back_edge: true
  }
  output_stream: "throttled_input_video"
}

# Subgraph that detects poses and corresponding landmarks.
node {
  calculator: "PoseLandmarkOnnxCUDA"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "IMAGE:throttled_input_video"
  output_stream: "LANDMARKS:pose_landmarks"
  output_stream: "SEGMENTATION_MASK:segmentation_mask"
  output_stream: "DETECTION:pose_detection"
  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
}

# Subgraph that renders pose-landmark annotation onto the input image.
node {
  calculator: "PoseRendererCpu"
  input_stream: "IMAGE:throttled_input_video"
  input_stream: "LANDMARKS:pose_landmarks"
  input_stream: "SEGMENTATION_MASK:segmentation_mask"
  input_stream: "DETECTION:pose_detection"
  input_stream: "ROI:roi_from_landmarks"
  output_stream: "IMAGE:output_video"
}
mediapipe/graphs/pose_tracking/pose_tracking_onnx_tensorrt.pbtxt (new file, 63 lines)

@@ -0,0 +1,63 @@
# MediaPipe graph that performs pose tracking with onnxruntime on TensorRT.

# CPU buffer. (ImageFrame)
input_stream: "input_video"

# Output image with rendered results. (ImageFrame)
output_stream: "output_video"
# Pose landmarks. (NormalizedLandmarkList)
output_stream: "pose_landmarks"

# Generates side packet to enable segmentation.
node {
  calculator: "ConstantSidePacketCalculator"
  output_side_packet: "PACKET:enable_segmentation"
  node_options: {
    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
      packet { bool_value: false }
    }
  }
}

# Throttles the images flowing downstream for flow control. It passes through
# the very first incoming image unaltered, and waits for downstream nodes
# (calculators and subgraphs) in the graph to finish their tasks before it
# passes through another image. All images that come in while waiting are
# dropped, limiting the number of in-flight images in most part of the graph to
# 1. This prevents the downstream nodes from queuing up incoming images and data
# excessively, which leads to increased latency and memory usage, unwanted in
# real-time mobile applications. It also eliminates unnecessary computation,
# e.g., the output produced by a node may get dropped downstream if the
# subsequent nodes are still busy processing previous inputs.
node {
  calculator: "FlowLimiterCalculator"
  input_stream: "input_video"
  input_stream: "FINISHED:output_video"
  input_stream_info: {
    tag_index: "FINISHED"
    back_edge: true
  }
  output_stream: "throttled_input_video"
}

# Subgraph that detects poses and corresponding landmarks.
node {
  calculator: "PoseLandmarkOnnxTensorRT"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "IMAGE:throttled_input_video"
  output_stream: "LANDMARKS:pose_landmarks"
  output_stream: "SEGMENTATION_MASK:segmentation_mask"
  output_stream: "DETECTION:pose_detection"
  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
}

# Subgraph that renders pose-landmark annotation onto the input image.
node {
  calculator: "PoseRendererCpu"
  input_stream: "IMAGE:throttled_input_video"
  input_stream: "LANDMARKS:pose_landmarks"
  input_stream: "SEGMENTATION_MASK:segmentation_mask"
  input_stream: "DETECTION:pose_detection"
  input_stream: "ROI:roi_from_landmarks"
  output_stream: "IMAGE:output_video"
}
@@ -35,6 +35,34 @@ mediapipe_simple_subgraph(
    ],
)

mediapipe_simple_subgraph(
    name = "pose_detection_onnx_cuda",
    graph = "pose_detection_onnx_cuda.pbtxt",
    register_as = "PoseDetectionOnnxCUDA",
    deps = [
        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
        "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda",
        "//mediapipe/calculators/tensor:tensors_to_detections_calculator",
        "//mediapipe/calculators/tflite:ssd_anchors_calculator",
        "//mediapipe/calculators/util:detection_letterbox_removal_calculator",
        "//mediapipe/calculators/util:non_max_suppression_calculator",
    ],
)

mediapipe_simple_subgraph(
    name = "pose_detection_onnx_tensorrt",
    graph = "pose_detection_onnx_tensorrt.pbtxt",
    register_as = "PoseDetectionOnnxTensorRT",
    deps = [
        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
        "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt",
        "//mediapipe/calculators/tensor:tensors_to_detections_calculator",
        "//mediapipe/calculators/tflite:ssd_anchors_calculator",
        "//mediapipe/calculators/util:detection_letterbox_removal_calculator",
        "//mediapipe/calculators/util:non_max_suppression_calculator",
    ],
)

mediapipe_simple_subgraph(
    name = "pose_detection_gpu",
    graph = "pose_detection_gpu.pbtxt",
mediapipe/modules/pose_detection/pose_detection_onnx_cuda.pbtxt (new file, 157 lines)

@@ -0,0 +1,157 @@
# MediaPipe graph to detect poses. (CPU input, and inference is executed with
# onnxruntime on CUDA.)
#
# It is required that "pose_detection.onnx" is available at
# "mediapipe/modules/pose_detection/pose_detection.onnx"
# path during execution.
#
# EXAMPLE:
#   node {
#     calculator: "PoseDetectionOnnxCUDA"
#     input_stream: "IMAGE:image"
#     output_stream: "DETECTIONS:pose_detections"
#   }

type: "PoseDetectionOnnxCUDA"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"

# Detected poses. (std::vector<Detection>)
# Bounding box in each pose detection is currently set to the bounding box of
# the detected face. However, 4 additional key points are available in each
# detection, which are used to further calculate a (rotated) bounding box that
# encloses the body region of interest. Among the 4 key points, the first two
# are for identifying the full-body region, and the second two for upper body
# only:
#
# Key point 0 - mid hip center
# Key point 1 - point that encodes size & rotation (for full body)
# Key point 2 - mid shoulder center
# Key point 3 - point that encodes size & rotation (for upper body)
#
# NOTE: there will not be an output packet in the DETECTIONS stream for this
# particular timestamp if no poses are detected. However, the MediaPipe
# framework will internally inform the downstream calculators of the absence of
# this packet so that they don't wait for it unnecessarily.
output_stream: "DETECTIONS:detections"

# Transforms the input image into a 224x224 one while keeping the aspect ratio
# (what is expected by the corresponding model), resulting in potential
# letterboxing in the transformed image.
node: {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:image"
  output_stream: "TENSORS:input_tensors"
  output_stream: "LETTERBOX_PADDING:letterbox_padding"
  options: {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      output_tensor_width: 224
      output_tensor_height: 224
      keep_aspect_ratio: true
      output_tensor_float_range {
        min: -1.0
        max: 1.0
      }
      border_mode: BORDER_ZERO
      # If this calculator truly operates in the CPU, then gpu_origin is
      # ignored, but if some build switch insists on GPU inference, then we will
      # still need to set this.
      gpu_origin: TOP_LEFT
    }
  }
}

# Runs the pose detection ONNX model with the onnxruntime CUDA delegate. The
# model takes an image tensor and outputs a vector of tensors representing,
# for instance, detection boxes/keypoints and scores.
node {
  calculator: "InferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:detection_tensors"
  options: {
    [mediapipe.InferenceCalculatorOptions.ext] {
      model_path: "mediapipe/modules/pose_detection/pose_detection.onnx"
      delegate { cuda {} }
    }
  }
}

# Generates a single side packet containing a vector of SSD anchors based on
# the specification in the options.
node {
  calculator: "SsdAnchorsCalculator"
  output_side_packet: "anchors"
  options: {
    [mediapipe.SsdAnchorsCalculatorOptions.ext] {
      num_layers: 5
      min_scale: 0.1484375
      max_scale: 0.75
      input_size_height: 224
      input_size_width: 224
      anchor_offset_x: 0.5
      anchor_offset_y: 0.5
      strides: 8
      strides: 16
      strides: 32
      strides: 32
      strides: 32
      aspect_ratios: 1.0
      fixed_anchor_size: true
    }
  }
}

# Decodes the detection tensors generated by the detection model, based on
# the SSD anchors and the specification in the options, into a vector of
# detections. Each detection describes a detected object.
node {
  calculator: "TensorsToDetectionsCalculator"
  input_stream: "TENSORS:detection_tensors"
  input_side_packet: "ANCHORS:anchors"
  output_stream: "DETECTIONS:unfiltered_detections"
  options: {
    [mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
      num_classes: 1
      num_boxes: 2254
      num_coords: 12
      box_coord_offset: 0
      keypoint_coord_offset: 4
      num_keypoints: 4
      num_values_per_keypoint: 2
      sigmoid_score: true
      score_clipping_thresh: 100.0
      reverse_output_order: true
      x_scale: 224.0
      y_scale: 224.0
      h_scale: 224.0
      w_scale: 224.0
      min_score_thresh: 0.5
    }
  }
}

# Performs non-max suppression to remove excessive detections.
node {
  calculator: "NonMaxSuppressionCalculator"
  input_stream: "unfiltered_detections"
  output_stream: "filtered_detections"
  options: {
    [mediapipe.NonMaxSuppressionCalculatorOptions.ext] {
      min_suppression_threshold: 0.3
      overlap_type: INTERSECTION_OVER_UNION
      algorithm: WEIGHTED
    }
  }
}

# Adjusts detection locations (already normalized to [0.f, 1.f]) on the
# letterboxed image (after image transformation with the FIT scale mode) to the
# corresponding locations on the same image with the letterbox removed (the
# input image to the graph before image transformation).
node {
  calculator: "DetectionLetterboxRemovalCalculator"
  input_stream: "DETECTIONS:filtered_detections"
  input_stream: "LETTERBOX_PADDING:letterbox_padding"
  output_stream: "DETECTIONS:detections"
}
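For orientation, the TensorsToDetectionsCalculator options above (and the identical ones in the TensorRT variant below) amount to roughly the following per-box decode. This is a paraphrase of the calculator's behavior under these options, not code from this commit:

    #include <algorithm>
    #include <cmath>

    struct Box { float xmin, ymin, xmax, ymax; };

    // Decodes one raw box given the anchor center (ax, ay) produced by
    // SsdAnchorsCalculator. With fixed_anchor_size: true the anchor width and
    // height are 1.0, and with reverse_output_order: true the model emits
    // (x, y, w, h) rather than (y, x, h, w). raw[4..11] hold the 4 keypoints.
    Box DecodeBox(const float raw[12], float ax, float ay) {
      const float x_center = raw[0] / 224.0f + ax;  // x_scale: 224
      const float y_center = raw[1] / 224.0f + ay;  // y_scale: 224
      const float w = raw[2] / 224.0f;              // w_scale: 224
      const float h = raw[3] / 224.0f;              // h_scale: 224
      return {x_center - w / 2.0f, y_center - h / 2.0f,
              x_center + w / 2.0f, y_center + h / 2.0f};
    }

    // sigmoid_score: true with score_clipping_thresh: 100.0.
    float DecodeScore(float raw_score) {
      raw_score = std::clamp(raw_score, -100.0f, 100.0f);
      return 1.0f / (1.0f + std::exp(-raw_score));
    }

Boxes whose decoded score falls below min_score_thresh (0.5) are dropped before non-max suppression.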
mediapipe/modules/pose_detection/pose_detection_onnx_tensorrt.pbtxt (new file, 157 lines)

@@ -0,0 +1,157 @@
# MediaPipe graph to detect poses. (CPU input, and inference is executed with
# onnxruntime on TensorRT.)
#
# It is required that "pose_detection.onnx" is available at
# "mediapipe/modules/pose_detection/pose_detection.onnx"
# path during execution.
#
# EXAMPLE:
#   node {
#     calculator: "PoseDetectionOnnxTensorRT"
#     input_stream: "IMAGE:image"
#     output_stream: "DETECTIONS:pose_detections"
#   }

type: "PoseDetectionOnnxTensorRT"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"

# Detected poses. (std::vector<Detection>)
# Bounding box in each pose detection is currently set to the bounding box of
# the detected face. However, 4 additional key points are available in each
# detection, which are used to further calculate a (rotated) bounding box that
# encloses the body region of interest. Among the 4 key points, the first two
# are for identifying the full-body region, and the second two for upper body
# only:
#
# Key point 0 - mid hip center
# Key point 1 - point that encodes size & rotation (for full body)
# Key point 2 - mid shoulder center
# Key point 3 - point that encodes size & rotation (for upper body)
#
# NOTE: there will not be an output packet in the DETECTIONS stream for this
# particular timestamp if no poses are detected. However, the MediaPipe
# framework will internally inform the downstream calculators of the absence of
# this packet so that they don't wait for it unnecessarily.
output_stream: "DETECTIONS:detections"

# Transforms the input image into a 224x224 one while keeping the aspect ratio
# (what is expected by the corresponding model), resulting in potential
# letterboxing in the transformed image.
node: {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:image"
  output_stream: "TENSORS:input_tensors"
  output_stream: "LETTERBOX_PADDING:letterbox_padding"
  options: {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      output_tensor_width: 224
      output_tensor_height: 224
      keep_aspect_ratio: true
      output_tensor_float_range {
        min: -1.0
        max: 1.0
      }
      border_mode: BORDER_ZERO
      # If this calculator truly operates in the CPU, then gpu_origin is
      # ignored, but if some build switch insists on GPU inference, then we will
      # still need to set this.
      gpu_origin: TOP_LEFT
    }
  }
}

# Runs the pose detection ONNX model with the onnxruntime TensorRT delegate.
# The model takes an image tensor and outputs a vector of tensors representing,
# for instance, detection boxes/keypoints and scores.
node {
  calculator: "InferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:detection_tensors"
  options: {
    [mediapipe.InferenceCalculatorOptions.ext] {
      model_path: "mediapipe/modules/pose_detection/pose_detection.onnx"
      delegate { tensorrt {} }
    }
  }
}

# Generates a single side packet containing a vector of SSD anchors based on
# the specification in the options.
node {
  calculator: "SsdAnchorsCalculator"
  output_side_packet: "anchors"
  options: {
    [mediapipe.SsdAnchorsCalculatorOptions.ext] {
      num_layers: 5
      min_scale: 0.1484375
      max_scale: 0.75
      input_size_height: 224
      input_size_width: 224
      anchor_offset_x: 0.5
      anchor_offset_y: 0.5
      strides: 8
      strides: 16
      strides: 32
      strides: 32
      strides: 32
      aspect_ratios: 1.0
      fixed_anchor_size: true
    }
  }
}

# Decodes the detection tensors generated by the detection model, based on
# the SSD anchors and the specification in the options, into a vector of
# detections. Each detection describes a detected object.
node {
  calculator: "TensorsToDetectionsCalculator"
  input_stream: "TENSORS:detection_tensors"
  input_side_packet: "ANCHORS:anchors"
  output_stream: "DETECTIONS:unfiltered_detections"
  options: {
    [mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
      num_classes: 1
      num_boxes: 2254
      num_coords: 12
      box_coord_offset: 0
      keypoint_coord_offset: 4
      num_keypoints: 4
      num_values_per_keypoint: 2
      sigmoid_score: true
      score_clipping_thresh: 100.0
      reverse_output_order: true
      x_scale: 224.0
      y_scale: 224.0
      h_scale: 224.0
      w_scale: 224.0
      min_score_thresh: 0.5
    }
  }
}

# Performs non-max suppression to remove excessive detections.
node {
  calculator: "NonMaxSuppressionCalculator"
  input_stream: "unfiltered_detections"
  output_stream: "filtered_detections"
  options: {
    [mediapipe.NonMaxSuppressionCalculatorOptions.ext] {
      min_suppression_threshold: 0.3
      overlap_type: INTERSECTION_OVER_UNION
      algorithm: WEIGHTED
    }
  }
}

# Adjusts detection locations (already normalized to [0.f, 1.f]) on the
# letterboxed image (after image transformation with the FIT scale mode) to the
# corresponding locations on the same image with the letterbox removed (the
# input image to the graph before image transformation).
node {
  calculator: "DetectionLetterboxRemovalCalculator"
  input_stream: "DETECTIONS:filtered_detections"
  input_stream: "LETTERBOX_PADDING:letterbox_padding"
  output_stream: "DETECTIONS:detections"
}
@@ -61,6 +61,35 @@ mediapipe_simple_subgraph(
    ],
)

mediapipe_simple_subgraph(
    name = "pose_landmark_by_roi_onnx_cuda",
    graph = "pose_landmark_by_roi_onnx_cuda.pbtxt",
    register_as = "PoseLandmarkByRoiOnnxCUDA",
    deps = [
        ":pose_landmark_model_loader",
        ":pose_landmarks_and_segmentation_inverse_projection",
        ":tensors_to_pose_landmarks_and_segmentation",
        "//mediapipe/calculators/image:image_properties_calculator",
        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
        "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda",
    ],
)

mediapipe_simple_subgraph(
    name = "pose_landmark_by_roi_onnx_tensorrt",
    graph = "pose_landmark_by_roi_onnx_tensorrt.pbtxt",
    register_as = "PoseLandmarkByRoiOnnxTensorRT",
    deps = [
        ":pose_landmark_model_loader",
        ":pose_landmarks_and_segmentation_inverse_projection",
        ":tensors_to_pose_landmarks_and_segmentation",
        "//mediapipe/calculators/image:image_properties_calculator",
        "//mediapipe/calculators/tensor:image_to_tensor_calculator",
        "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt",
    ],
)

mediapipe_simple_subgraph(
    name = "tensors_to_pose_landmarks_and_segmentation",
    graph = "tensors_to_pose_landmarks_and_segmentation.pbtxt",
@@ -159,10 +188,57 @@ mediapipe_simple_subgraph(
    ],
)

mediapipe_simple_subgraph(
    name = "pose_landmark_onnx_cuda",
    graph = "pose_landmark_onnx_cuda.pbtxt",
    register_as = "PoseLandmarkOnnxCUDA",
    deps = [
        ":pose_detection_to_roi",
        ":pose_landmark_by_roi_onnx_cuda",
        ":pose_landmark_filtering",
        ":pose_landmarks_to_roi",
        ":pose_segmentation_filtering",
        "//mediapipe/calculators/core:constant_side_packet_calculator",
        "//mediapipe/calculators/core:gate_calculator",
        "//mediapipe/calculators/core:merge_calculator",
        "//mediapipe/calculators/core:packet_presence_calculator",
        "//mediapipe/calculators/core:previous_loopback_calculator",
        "//mediapipe/calculators/core:split_vector_calculator",
        "//mediapipe/calculators/image:image_properties_calculator",
        "//mediapipe/calculators/util:from_image_calculator",
        "//mediapipe/modules/pose_detection:pose_detection_onnx_cuda",
    ],
)

mediapipe_simple_subgraph(
    name = "pose_landmark_onnx_tensorrt",
    graph = "pose_landmark_onnx_tensorrt.pbtxt",
    register_as = "PoseLandmarkOnnxTensorRT",
    deps = [
        ":pose_detection_to_roi",
        ":pose_landmark_by_roi_onnx_tensorrt",
        ":pose_landmark_filtering",
        ":pose_landmarks_to_roi",
        ":pose_segmentation_filtering",
        "//mediapipe/calculators/core:constant_side_packet_calculator",
        "//mediapipe/calculators/core:gate_calculator",
        "//mediapipe/calculators/core:merge_calculator",
        "//mediapipe/calculators/core:packet_presence_calculator",
        "//mediapipe/calculators/core:previous_loopback_calculator",
        "//mediapipe/calculators/core:split_vector_calculator",
        "//mediapipe/calculators/image:image_properties_calculator",
        "//mediapipe/calculators/util:from_image_calculator",
        "//mediapipe/modules/pose_detection:pose_detection_onnx_tensorrt",
    ],
)

exports_files(
    srcs = [
        "pose_landmark_full.onnx",
        "pose_landmark_full.tflite",
        "pose_landmark_heavy.onnx",
        "pose_landmark_heavy.tflite",
        "pose_landmark_lite.onnx",
        "pose_landmark_lite.tflite",
    ],
)
mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_cuda.pbtxt (new file, 165 lines)

@@ -0,0 +1,165 @@
# MediaPipe graph to detect/predict pose landmarks and optionally segmentation
# within an ROI. (CPU input; inference is executed with onnxruntime on CUDA.)
#
# It is required that "pose_landmark_full.onnx" is available at
# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
# path during execution. (The model path is fixed in this graph; it is not
# selected via a MODEL_COMPLEXITY side packet.)
#
# EXAMPLE:
#   node {
#     calculator: "PoseLandmarkByRoiOnnxCUDA"
#     input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
#     input_stream: "IMAGE:image"
#     input_stream: "ROI:roi"
#     output_stream: "LANDMARKS:landmarks"
#     output_stream: "SEGMENTATION_MASK:segmentation_mask"
#   }

type: "PoseLandmarkByRoiOnnxCUDA"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"
# ROI (region of interest) within the given image where a pose is located.
# (NormalizedRect)
input_stream: "ROI:roi"

# Whether to predict the segmentation mask. If unspecified, functions as set to
# false. (bool)
input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"

# Pose landmarks within the given ROI. (NormalizedLandmarkList)
# We have 33 landmarks (see pose_landmark_topology.svg) and there are other
# auxiliary key points.
# 0 - nose
# 1 - left eye (inner)
# 2 - left eye
# 3 - left eye (outer)
# 4 - right eye (inner)
# 5 - right eye
# 6 - right eye (outer)
# 7 - left ear
# 8 - right ear
# 9 - mouth (left)
# 10 - mouth (right)
# 11 - left shoulder
# 12 - right shoulder
# 13 - left elbow
# 14 - right elbow
# 15 - left wrist
# 16 - right wrist
# 17 - left pinky
# 18 - right pinky
# 19 - left index
# 20 - right index
# 21 - left thumb
# 22 - right thumb
# 23 - left hip
# 24 - right hip
# 25 - left knee
# 26 - right knee
# 27 - left ankle
# 28 - right ankle
# 29 - left heel
# 30 - right heel
# 31 - left foot index
# 32 - right foot index
#
# NOTE: If a pose is not present within the given ROI, for this particular
# timestamp there will not be an output packet in the LANDMARKS stream. However,
# the MediaPipe framework will internally inform the downstream calculators of
# the absence of this packet so that they don't wait for it unnecessarily.
output_stream: "LANDMARKS:landmarks"
# Auxiliary landmarks for deriving the ROI in the subsequent image.
# (NormalizedLandmarkList)
output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks"

# Pose world landmarks within the given ROI. (LandmarkList)
# World landmarks are real-world 3D coordinates in meters with the origin at the
# center between hips. WORLD_LANDMARKS shares the same landmark topology as
# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object
# projected onto the 2D image surface, while WORLD_LANDMARKS provides
# coordinates (in meters) of the 3D object itself.
output_stream: "WORLD_LANDMARKS:world_landmarks"

# Segmentation mask on CPU in ImageFormat::VEC32F1. (Image)
output_stream: "SEGMENTATION_MASK:segmentation_mask"

# Retrieves the image size.
node {
  calculator: "ImagePropertiesCalculator"
  input_stream: "IMAGE_CPU:image"
  output_stream: "SIZE:image_size"
}

# Crops and transforms the specified ROI in the input image into an image patch
# represented as a tensor of dimension expected by the corresponding ML model,
# while maintaining the aspect ratio of the ROI (which can be different from
# that of the image patch). Therefore, there can be letterboxing around the ROI
# in the generated tensor representation.
node: {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:image"
  input_stream: "NORM_RECT:roi"
  output_stream: "TENSORS:input_tensors"
  output_stream: "LETTERBOX_PADDING:letterbox_padding"
  output_stream: "MATRIX:transformation_matrix"
  options: {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      output_tensor_width: 256
      output_tensor_height: 256
      keep_aspect_ratio: true
      output_tensor_float_range {
        min: 0.0
        max: 1.0
      }
    }
  }
}

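# Runs the pose landmark ONNX model with the onnxruntime CUDA delegate. The
# model path below is fixed to the "full" model variant.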
node {
  calculator: "InferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:output_tensors"
  options: {
    [mediapipe.InferenceCalculatorOptions.ext] {
      model_path: "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
      delegate { cuda {} }
    }
  }
}

# Decodes the tensors into the corresponding landmark and segmentation mask
# representation.
node {
  calculator: "TensorsToPoseLandmarksAndSegmentation"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "TENSORS:output_tensors"
  output_stream: "LANDMARKS:roi_landmarks"
  output_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks"
  output_stream: "WORLD_LANDMARKS:roi_world_landmarks"
  output_stream: "SEGMENTATION_MASK:roi_segmentation_mask"
}

# Projects the landmarks and segmentation mask in the local coordinates of the
# (potentially letterboxed) ROI back to the global coordinates of the full input
# image.
node {
  calculator: "PoseLandmarksAndSegmentationInverseProjection"
  input_stream: "IMAGE_SIZE:image_size"
  input_stream: "NORM_RECT:roi"
  input_stream: "LETTERBOX_PADDING:letterbox_padding"
  input_stream: "MATRIX:transformation_matrix"
  input_stream: "LANDMARKS:roi_landmarks"
  input_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks"
  input_stream: "WORLD_LANDMARKS:roi_world_landmarks"
  input_stream: "SEGMENTATION_MASK:roi_segmentation_mask"
  output_stream: "LANDMARKS:landmarks"
  output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks"
  output_stream: "WORLD_LANDMARKS:world_landmarks"
  output_stream: "SEGMENTATION_MASK:segmentation_mask"
}
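The inverse-projection subgraph above undoes the letterboxing and the ROI crop. As a rough illustration of the two un-mappings involved (a paraphrase of MediaPipe's letterbox-removal and landmark-projection logic, not code from this commit):

    #include <cmath>

    struct Landmark { float x, y, z; };
    struct Padding { float left, top, right, bottom; };  // LETTERBOX_PADDING
    struct Roi { float x_center, y_center, width, height, rotation; };  // NORM_RECT

    Landmark InverseProject(Landmark lm, const Padding& pad, const Roi& roi) {
      // 1) Remove the letterbox: rescale from the padded tensor back to the crop.
      lm.x = (lm.x - pad.left) / (1.0f - pad.left - pad.right);
      lm.y = (lm.y - pad.top) / (1.0f - pad.top - pad.bottom);
      // 2) Project crop-local coordinates into the full image: rotate about the
      // crop center, then scale and translate by the ROI size and center.
      const float dx = lm.x - 0.5f;
      const float dy = lm.y - 0.5f;
      const float c = std::cos(roi.rotation);
      const float s = std::sin(roi.rotation);
      lm.x = roi.x_center + (dx * c - dy * s) * roi.width;
      lm.y = roi.y_center + (dx * s + dy * c) * roi.height;
      lm.z *= roi.width;  // z is scaled like x, by convention
      return lm;
    }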
mediapipe/modules/pose_landmark/pose_landmark_by_roi_onnx_tensorrt.pbtxt (new file, 165 lines)

@@ -0,0 +1,165 @@
# MediaPipe graph to detect/predict pose landmarks and optionally segmentation
# within an ROI. (CPU input; inference is executed with onnxruntime on
# TensorRT.)
#
# It is required that "pose_landmark_full.onnx" is available at
# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
# path during execution. (The model path is fixed in this graph; it is not
# selected via a MODEL_COMPLEXITY side packet.)
#
# EXAMPLE:
#   node {
#     calculator: "PoseLandmarkByRoiOnnxTensorRT"
#     input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
#     input_stream: "IMAGE:image"
#     input_stream: "ROI:roi"
#     output_stream: "LANDMARKS:landmarks"
#     output_stream: "SEGMENTATION_MASK:segmentation_mask"
#   }

type: "PoseLandmarkByRoiOnnxTensorRT"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"
# ROI (region of interest) within the given image where a pose is located.
# (NormalizedRect)
input_stream: "ROI:roi"

# Whether to predict the segmentation mask. If unspecified, functions as set to
# false. (bool)
input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"

# Pose landmarks within the given ROI. (NormalizedLandmarkList)
# We have 33 landmarks (see pose_landmark_topology.svg) and there are other
# auxiliary key points.
# 0 - nose
# 1 - left eye (inner)
# 2 - left eye
# 3 - left eye (outer)
# 4 - right eye (inner)
# 5 - right eye
# 6 - right eye (outer)
# 7 - left ear
# 8 - right ear
# 9 - mouth (left)
# 10 - mouth (right)
# 11 - left shoulder
# 12 - right shoulder
# 13 - left elbow
# 14 - right elbow
# 15 - left wrist
# 16 - right wrist
# 17 - left pinky
# 18 - right pinky
# 19 - left index
# 20 - right index
# 21 - left thumb
# 22 - right thumb
# 23 - left hip
# 24 - right hip
# 25 - left knee
# 26 - right knee
# 27 - left ankle
# 28 - right ankle
# 29 - left heel
# 30 - right heel
# 31 - left foot index
# 32 - right foot index
#
# NOTE: If a pose is not present within the given ROI, for this particular
# timestamp there will not be an output packet in the LANDMARKS stream. However,
# the MediaPipe framework will internally inform the downstream calculators of
# the absence of this packet so that they don't wait for it unnecessarily.
output_stream: "LANDMARKS:landmarks"
# Auxiliary landmarks for deriving the ROI in the subsequent image.
# (NormalizedLandmarkList)
output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks"

# Pose world landmarks within the given ROI. (LandmarkList)
# World landmarks are real-world 3D coordinates in meters with the origin at the
# center between hips. WORLD_LANDMARKS shares the same landmark topology as
# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object
# projected onto the 2D image surface, while WORLD_LANDMARKS provides
# coordinates (in meters) of the 3D object itself.
output_stream: "WORLD_LANDMARKS:world_landmarks"

# Segmentation mask on CPU in ImageFormat::VEC32F1. (Image)
output_stream: "SEGMENTATION_MASK:segmentation_mask"

# Retrieves the image size.
node {
  calculator: "ImagePropertiesCalculator"
  input_stream: "IMAGE_CPU:image"
  output_stream: "SIZE:image_size"
}

# Crops and transforms the specified ROI in the input image into an image patch
# represented as a tensor of dimension expected by the corresponding ML model,
# while maintaining the aspect ratio of the ROI (which can be different from
# that of the image patch). Therefore, there can be letterboxing around the ROI
# in the generated tensor representation.
node: {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:image"
  input_stream: "NORM_RECT:roi"
  output_stream: "TENSORS:input_tensors"
  output_stream: "LETTERBOX_PADDING:letterbox_padding"
  output_stream: "MATRIX:transformation_matrix"
  options: {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      output_tensor_width: 256
      output_tensor_height: 256
      keep_aspect_ratio: true
      output_tensor_float_range {
        min: 0.0
        max: 1.0
      }
    }
  }
}

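# Runs the pose landmark ONNX model with the onnxruntime TensorRT delegate. The
# model path below is fixed to the "full" model variant.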
node {
  calculator: "InferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:output_tensors"
  options: {
    [mediapipe.InferenceCalculatorOptions.ext] {
      model_path: "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
      delegate { tensorrt {} }
    }
  }
}

# Decodes the tensors into the corresponding landmark and segmentation mask
# representation.
node {
  calculator: "TensorsToPoseLandmarksAndSegmentation"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "TENSORS:output_tensors"
  output_stream: "LANDMARKS:roi_landmarks"
  output_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks"
  output_stream: "WORLD_LANDMARKS:roi_world_landmarks"
  output_stream: "SEGMENTATION_MASK:roi_segmentation_mask"
}

# Projects the landmarks and segmentation mask in the local coordinates of the
# (potentially letterboxed) ROI back to the global coordinates of the full input
# image.
node {
  calculator: "PoseLandmarksAndSegmentationInverseProjection"
  input_stream: "IMAGE_SIZE:image_size"
  input_stream: "NORM_RECT:roi"
  input_stream: "LETTERBOX_PADDING:letterbox_padding"
  input_stream: "MATRIX:transformation_matrix"
  input_stream: "LANDMARKS:roi_landmarks"
  input_stream: "AUXILIARY_LANDMARKS:roi_auxiliary_landmarks"
  input_stream: "WORLD_LANDMARKS:roi_world_landmarks"
  input_stream: "SEGMENTATION_MASK:roi_segmentation_mask"
  output_stream: "LANDMARKS:landmarks"
  output_stream: "AUXILIARY_LANDMARKS:auxiliary_landmarks"
  output_stream: "WORLD_LANDMARKS:world_landmarks"
  output_stream: "SEGMENTATION_MASK:segmentation_mask"
}
mediapipe/modules/pose_landmark/pose_landmark_onnx_cuda.pbtxt (new file, 268 lines)

@@ -0,0 +1,268 @@
# MediaPipe graph to detect/predict pose landmarks. (CPU input; inference is
# executed with onnxruntime on CUDA.) This graph tries to skip pose detection
# as much as possible by using previously detected/predicted landmarks for new
# images.
#
# It is required that "pose_detection.onnx" is available at
# "mediapipe/modules/pose_detection/pose_detection.onnx"
# path during execution.
#
# It is required that "pose_landmark_full.onnx" is available at
# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
# path during execution (the ONNX landmark subgraph currently loads this model
# regardless of the MODEL_COMPLEXITY input side packet).
#
# EXAMPLE:
#   node {
#     calculator: "PoseLandmarkOnnxCUDA"
#     input_side_packet: "MODEL_COMPLEXITY:model_complexity"
#     input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
#     input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
#     input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
#     input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
#     input_stream: "IMAGE:image"
#     output_stream: "LANDMARKS:pose_landmarks"
#     output_stream: "SEGMENTATION_MASK:segmentation_mask"
#   }

type: "PoseLandmarkOnnxCUDA"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"

# Whether to filter landmarks across different input images to reduce jitter.
# If unspecified, functions as set to true. (bool)
input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"

# Whether to predict the segmentation mask. If unspecified, functions as set to
# false. (bool)
input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"

# Whether to filter segmentation mask across different input images to reduce
# jitter. If unspecified, functions as set to true. (bool)
input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"

# Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as
# inference latency generally go up with the model complexity. If unspecified,
# functions as set to 1. (int)
input_side_packet: "MODEL_COMPLEXITY:model_complexity"

# Whether landmarks on the previous image should be used to help localize
# landmarks on the current image. (bool)
input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"

# Pose landmarks. (NormalizedLandmarkList)
# We have 33 landmarks (see pose_landmark_topology.svg), and there are other
# auxiliary key points.
# 0 - nose
# 1 - left eye (inner)
# 2 - left eye
# 3 - left eye (outer)
# 4 - right eye (inner)
# 5 - right eye
# 6 - right eye (outer)
# 7 - left ear
# 8 - right ear
# 9 - mouth (left)
# 10 - mouth (right)
# 11 - left shoulder
# 12 - right shoulder
# 13 - left elbow
# 14 - right elbow
# 15 - left wrist
# 16 - right wrist
# 17 - left pinky
# 18 - right pinky
# 19 - left index
# 20 - right index
# 21 - left thumb
# 22 - right thumb
# 23 - left hip
# 24 - right hip
# 25 - left knee
# 26 - right knee
# 27 - left ankle
# 28 - right ankle
# 29 - left heel
# 30 - right heel
# 31 - left foot index
# 32 - right foot index
#
# NOTE: if a pose is not present within the given ROI, for this particular
# timestamp there will not be an output packet in the LANDMARKS stream. However,
# the MediaPipe framework will internally inform the downstream calculators of
# the absence of this packet so that they don't wait for it unnecessarily.
output_stream: "LANDMARKS:pose_landmarks"

# Pose world landmarks. (LandmarkList)
# World landmarks are real-world 3D coordinates in meters with the origin at the
# center between hips. WORLD_LANDMARKS shares the same landmark topology as
# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object
# projected onto the 2D image surface, while WORLD_LANDMARKS provides
# coordinates (in meters) of the 3D object itself.
output_stream: "WORLD_LANDMARKS:pose_world_landmarks"

# Segmentation mask. (ImageFrame in ImageFormat::VEC32F1)
output_stream: "SEGMENTATION_MASK:segmentation_mask"

# Extra outputs (for debugging, for instance).
# Detected poses. (Detection)
output_stream: "DETECTION:pose_detection"
# Regions of interest calculated based on landmarks. (NormalizedRect)
output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks"
# Regions of interest calculated based on pose detections. (NormalizedRect)
output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection"

# When the optional input side packet "use_prev_landmarks" is either absent or
# set to true, uses the landmarks on the previous image to help localize
# landmarks on the current image.
node {
  calculator: "GateCalculator"
  input_side_packet: "ALLOW:use_prev_landmarks"
  input_stream: "prev_pose_rect_from_landmarks"
  output_stream: "gated_prev_pose_rect_from_landmarks"
  options: {
    [mediapipe.GateCalculatorOptions.ext] {
      allow: true
    }
  }
}

# Checks if there's previous pose rect calculated from landmarks.
node: {
  calculator: "PacketPresenceCalculator"
  input_stream: "PACKET:gated_prev_pose_rect_from_landmarks"
  output_stream: "PRESENCE:prev_pose_rect_from_landmarks_is_present"
}

# Calculates size of the image.
node {
  calculator: "ImagePropertiesCalculator"
  input_stream: "IMAGE_CPU:image"
  output_stream: "SIZE:image_size"
}

# Drops the incoming image if the pose has already been identified from the
# previous image. Otherwise, passes the incoming image through to trigger a new
# round of pose detection.
node {
  calculator: "GateCalculator"
  input_stream: "image"
  input_stream: "image_size"
  input_stream: "DISALLOW:prev_pose_rect_from_landmarks_is_present"
  output_stream: "image_for_pose_detection"
  output_stream: "image_size_for_pose_detection"
  options: {
    [mediapipe.GateCalculatorOptions.ext] {
      empty_packets_as_allow: true
    }
  }
}

# Detects poses.
node {
  calculator: "PoseDetectionOnnxCUDA"
  input_stream: "IMAGE:image_for_pose_detection"
  output_stream: "DETECTIONS:pose_detections"
}

# Gets the very first detection from "pose_detections" vector.
node {
  calculator: "SplitDetectionVectorCalculator"
  input_stream: "pose_detections"
  output_stream: "pose_detection"
  options: {
    [mediapipe.SplitVectorCalculatorOptions.ext] {
      ranges: { begin: 0 end: 1 }
      element_only: true
    }
  }
}

# Calculates region of interest based on pose detection, so that it can be used
# to detect landmarks.
node {
  calculator: "PoseDetectionToRoi"
  input_stream: "DETECTION:pose_detection"
  input_stream: "IMAGE_SIZE:image_size_for_pose_detection"
  output_stream: "ROI:pose_rect_from_detection"
}

# Selects either pose rect (or ROI) calculated from detection or from previously
# detected landmarks if available (in this case, calculation of pose rect from
# detection is skipped).
node {
  calculator: "MergeCalculator"
  input_stream: "pose_rect_from_detection"
  input_stream: "gated_prev_pose_rect_from_landmarks"
  output_stream: "pose_rect"
}

# Detects pose landmarks within specified region of interest of the image.
node {
  calculator: "PoseLandmarkByRoiOnnxCUDA"
  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "IMAGE:image"
  input_stream: "ROI:pose_rect"
  output_stream: "LANDMARKS:unfiltered_pose_landmarks"
  output_stream: "AUXILIARY_LANDMARKS:unfiltered_auxiliary_landmarks"
  output_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks"
  output_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask"
}

# Smoothes landmarks to reduce jitter.
node {
  calculator: "PoseLandmarkFiltering"
  input_side_packet: "ENABLE:smooth_landmarks"
  input_stream: "IMAGE_SIZE:image_size"
  input_stream: "NORM_LANDMARKS:unfiltered_pose_landmarks"
  input_stream: "AUX_NORM_LANDMARKS:unfiltered_auxiliary_landmarks"
  input_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks"
  output_stream: "FILTERED_NORM_LANDMARKS:pose_landmarks"
  output_stream: "FILTERED_AUX_NORM_LANDMARKS:auxiliary_landmarks"
  output_stream: "FILTERED_WORLD_LANDMARKS:pose_world_landmarks"
}

# Calculates region of interest based on the auxiliary landmarks, to be used in
# the subsequent image.
node {
  calculator: "PoseLandmarksToRoi"
  input_stream: "LANDMARKS:auxiliary_landmarks"
  input_stream: "IMAGE_SIZE:image_size"
  output_stream: "ROI:pose_rect_from_landmarks"
}

# Caches pose rects calculated from landmarks, and upon the arrival of the next
# input image, sends out the cached rects with timestamps replaced by that of
# the input image, essentially generating a packet that carries the previous
# pose rects. Note that upon the arrival of the very first input image, a
# timestamp bound update occurs to jump start the feedback loop.
node {
  calculator: "PreviousLoopbackCalculator"
  input_stream: "MAIN:image"
  input_stream: "LOOP:pose_rect_from_landmarks"
  input_stream_info: {
    tag_index: "LOOP"
    back_edge: true
  }
  output_stream: "PREV_LOOP:prev_pose_rect_from_landmarks"
}

# Smoothes segmentation to reduce jitter.
node {
  calculator: "PoseSegmentationFiltering"
  input_side_packet: "ENABLE:smooth_segmentation"
  input_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask"
  output_stream: "FILTERED_SEGMENTATION_MASK:filtered_segmentation_mask"
}

# Converts the incoming segmentation mask represented as an Image into the
# corresponding ImageFrame type.
node: {
  calculator: "FromImageCalculator"
  input_stream: "IMAGE:filtered_segmentation_mask"
  output_stream: "IMAGE_CPU:segmentation_mask"
}
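Taken together, the gate, presence, merge and loopback nodes above implement detection-skipping tracking. A compact restatement of the per-frame control flow in this graph (and its TensorRT twin below); this is illustrative pseudologic with placeholder types, not MediaPipe API and not code from this commit:

    #include <optional>

    struct Frame {};
    struct NormRect {};
    struct Landmarks {};

    // Placeholders for the subgraphs wired up in this file.
    NormRect DetectPoseAndComputeRoi(const Frame&);   // PoseDetectionOnnx* + PoseDetectionToRoi
    std::optional<Landmarks> LandmarksByRoi(const Frame&, const NormRect&);  // PoseLandmarkByRoiOnnx*
    NormRect RoiFromLandmarks(const Landmarks&);      // PoseLandmarksToRoi

    // Fed back across frames by PreviousLoopbackCalculator.
    std::optional<NormRect> prev_roi;

    void ProcessFrame(const Frame& frame) {
      // The GateCalculator drops the frame for the detector whenever a
      // previous ROI exists (empty_packets_as_allow lets the very first
      // frame through).
      const NormRect roi =
          prev_roi.has_value() ? *prev_roi : DetectPoseAndComputeRoi(frame);
      const std::optional<Landmarks> lms = LandmarksByRoi(frame, roi);
      // On success the auxiliary landmarks yield the next frame's ROI; on
      // failure the loopback stays empty, so the next frame re-runs detection.
      prev_roi = lms.has_value()
                     ? std::optional<NormRect>(RoiFromLandmarks(*lms))
                     : std::nullopt;
    }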
mediapipe/modules/pose_landmark/pose_landmark_onnx_tensorrt.pbtxt (new file, 268 lines)

@@ -0,0 +1,268 @@
# MediaPipe graph to detect/predict pose landmarks. (CPU input; inference is
# executed with onnxruntime on TensorRT.) This graph tries to skip pose
# detection as much as possible by using previously detected/predicted
# landmarks for new images.
#
# It is required that "pose_detection.onnx" is available at
# "mediapipe/modules/pose_detection/pose_detection.onnx"
# path during execution.
#
# It is required that "pose_landmark_full.onnx" is available at
# "mediapipe/modules/pose_landmark/pose_landmark_full.onnx"
# path during execution (the ONNX landmark subgraph currently loads this model
# regardless of the MODEL_COMPLEXITY input side packet).
#
# EXAMPLE:
#   node {
#     calculator: "PoseLandmarkOnnxTensorRT"
#     input_side_packet: "MODEL_COMPLEXITY:model_complexity"
#     input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"
#     input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
#     input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"
#     input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
#     input_stream: "IMAGE:image"
#     output_stream: "LANDMARKS:pose_landmarks"
#     output_stream: "SEGMENTATION_MASK:segmentation_mask"
#   }

type: "PoseLandmarkOnnxTensorRT"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"

# Whether to filter landmarks across different input images to reduce jitter.
# If unspecified, functions as set to true. (bool)
input_side_packet: "SMOOTH_LANDMARKS:smooth_landmarks"

# Whether to predict the segmentation mask. If unspecified, functions as set to
# false. (bool)
input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"

# Whether to filter segmentation mask across different input images to reduce
# jitter. If unspecified, functions as set to true. (bool)
input_side_packet: "SMOOTH_SEGMENTATION:smooth_segmentation"

# Complexity of the pose landmark model: 0, 1 or 2. Landmark accuracy as well as
# inference latency generally go up with the model complexity. If unspecified,
# functions as set to 1. (int)
input_side_packet: "MODEL_COMPLEXITY:model_complexity"

# Whether landmarks on the previous image should be used to help localize
# landmarks on the current image. (bool)
input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"

# Pose landmarks. (NormalizedLandmarkList)
# We have 33 landmarks (see pose_landmark_topology.svg), and there are other
# auxiliary key points.
# 0 - nose
# 1 - left eye (inner)
# 2 - left eye
# 3 - left eye (outer)
# 4 - right eye (inner)
# 5 - right eye
# 6 - right eye (outer)
# 7 - left ear
# 8 - right ear
# 9 - mouth (left)
# 10 - mouth (right)
# 11 - left shoulder
# 12 - right shoulder
# 13 - left elbow
# 14 - right elbow
# 15 - left wrist
# 16 - right wrist
# 17 - left pinky
# 18 - right pinky
# 19 - left index
# 20 - right index
# 21 - left thumb
# 22 - right thumb
# 23 - left hip
# 24 - right hip
# 25 - left knee
# 26 - right knee
# 27 - left ankle
# 28 - right ankle
# 29 - left heel
# 30 - right heel
# 31 - left foot index
# 32 - right foot index
#
# NOTE: if a pose is not present within the given ROI, for this particular
# timestamp there will not be an output packet in the LANDMARKS stream. However,
# the MediaPipe framework will internally inform the downstream calculators of
# the absence of this packet so that they don't wait for it unnecessarily.
output_stream: "LANDMARKS:pose_landmarks"

# Pose world landmarks. (LandmarkList)
# World landmarks are real-world 3D coordinates in meters with the origin at the
# center between hips. WORLD_LANDMARKS shares the same landmark topology as
# LANDMARKS. However, LANDMARKS provides coordinates (in pixels) of a 3D object
# projected onto the 2D image surface, while WORLD_LANDMARKS provides
# coordinates (in meters) of the 3D object itself.
output_stream: "WORLD_LANDMARKS:pose_world_landmarks"

# Segmentation mask. (ImageFrame in ImageFormat::VEC32F1)
output_stream: "SEGMENTATION_MASK:segmentation_mask"

# Extra outputs (for debugging, for instance).
# Detected poses. (Detection)
output_stream: "DETECTION:pose_detection"
# Regions of interest calculated based on landmarks. (NormalizedRect)
output_stream: "ROI_FROM_LANDMARKS:pose_rect_from_landmarks"
# Regions of interest calculated based on pose detections. (NormalizedRect)
output_stream: "ROI_FROM_DETECTION:pose_rect_from_detection"

# When the optional input side packet "use_prev_landmarks" is either absent or
# set to true, uses the landmarks on the previous image to help localize
# landmarks on the current image.
node {
  calculator: "GateCalculator"
  input_side_packet: "ALLOW:use_prev_landmarks"
  input_stream: "prev_pose_rect_from_landmarks"
  output_stream: "gated_prev_pose_rect_from_landmarks"
  options: {
    [mediapipe.GateCalculatorOptions.ext] {
      allow: true
    }
  }
}

# Checks if there's previous pose rect calculated from landmarks.
node: {
  calculator: "PacketPresenceCalculator"
  input_stream: "PACKET:gated_prev_pose_rect_from_landmarks"
  output_stream: "PRESENCE:prev_pose_rect_from_landmarks_is_present"
}

# Calculates size of the image.
node {
  calculator: "ImagePropertiesCalculator"
  input_stream: "IMAGE_CPU:image"
  output_stream: "SIZE:image_size"
}

# Drops the incoming image if the pose has already been identified from the
# previous image. Otherwise, passes the incoming image through to trigger a new
# round of pose detection.
node {
  calculator: "GateCalculator"
  input_stream: "image"
  input_stream: "image_size"
  input_stream: "DISALLOW:prev_pose_rect_from_landmarks_is_present"
  output_stream: "image_for_pose_detection"
  output_stream: "image_size_for_pose_detection"
  options: {
    [mediapipe.GateCalculatorOptions.ext] {
      empty_packets_as_allow: true
    }
  }
}

# Detects poses.
node {
  calculator: "PoseDetectionOnnxTensorRT"
  input_stream: "IMAGE:image_for_pose_detection"
  output_stream: "DETECTIONS:pose_detections"
}

# Gets the very first detection from "pose_detections" vector.
node {
  calculator: "SplitDetectionVectorCalculator"
  input_stream: "pose_detections"
  output_stream: "pose_detection"
  options: {
    [mediapipe.SplitVectorCalculatorOptions.ext] {
      ranges: { begin: 0 end: 1 }
      element_only: true
    }
  }
}

# Calculates region of interest based on pose detection, so that it can be used
# to detect landmarks.
node {
  calculator: "PoseDetectionToRoi"
  input_stream: "DETECTION:pose_detection"
  input_stream: "IMAGE_SIZE:image_size_for_pose_detection"
  output_stream: "ROI:pose_rect_from_detection"
}

# Selects either pose rect (or ROI) calculated from detection or from previously
# detected landmarks if available (in this case, calculation of pose rect from
# detection is skipped).
node {
  calculator: "MergeCalculator"
  input_stream: "pose_rect_from_detection"
  input_stream: "gated_prev_pose_rect_from_landmarks"
  output_stream: "pose_rect"
}

# Detects pose landmarks within specified region of interest of the image.
node {
  calculator: "PoseLandmarkByRoiOnnxTensorRT"
  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "IMAGE:image"
  input_stream: "ROI:pose_rect"
  output_stream: "LANDMARKS:unfiltered_pose_landmarks"
  output_stream: "AUXILIARY_LANDMARKS:unfiltered_auxiliary_landmarks"
  output_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks"
  output_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask"
}

# Smoothes landmarks to reduce jitter.
node {
  calculator: "PoseLandmarkFiltering"
  input_side_packet: "ENABLE:smooth_landmarks"
  input_stream: "IMAGE_SIZE:image_size"
  input_stream: "NORM_LANDMARKS:unfiltered_pose_landmarks"
  input_stream: "AUX_NORM_LANDMARKS:unfiltered_auxiliary_landmarks"
  input_stream: "WORLD_LANDMARKS:unfiltered_world_landmarks"
  output_stream: "FILTERED_NORM_LANDMARKS:pose_landmarks"
  output_stream: "FILTERED_AUX_NORM_LANDMARKS:auxiliary_landmarks"
  output_stream: "FILTERED_WORLD_LANDMARKS:pose_world_landmarks"
}

# Calculates region of interest based on the auxiliary landmarks, to be used in
# the subsequent image.
node {
  calculator: "PoseLandmarksToRoi"
  input_stream: "LANDMARKS:auxiliary_landmarks"
  input_stream: "IMAGE_SIZE:image_size"
  output_stream: "ROI:pose_rect_from_landmarks"
}

# Caches pose rects calculated from landmarks, and upon the arrival of the next
# input image, sends out the cached rects with timestamps replaced by that of
# the input image, essentially generating a packet that carries the previous
# pose rects. Note that upon the arrival of the very first input image, a
# timestamp bound update occurs to jump start the feedback loop.
node {
  calculator: "PreviousLoopbackCalculator"
  input_stream: "MAIN:image"
  input_stream: "LOOP:pose_rect_from_landmarks"
  input_stream_info: {
    tag_index: "LOOP"
    back_edge: true
  }
  output_stream: "PREV_LOOP:prev_pose_rect_from_landmarks"
}

# Smoothes segmentation to reduce jitter.
node {
  calculator: "PoseSegmentationFiltering"
  input_side_packet: "ENABLE:smooth_segmentation"
  input_stream: "SEGMENTATION_MASK:unfiltered_segmentation_mask"
  output_stream: "FILTERED_SEGMENTATION_MASK:filtered_segmentation_mask"
}

# Converts the incoming segmentation mask represented as an Image into the
# corresponding ImageFrame type.
node: {
  calculator: "FromImageCalculator"
  input_stream: "IMAGE:filtered_segmentation_mask"
  output_stream: "IMAGE_CPU:segmentation_mask"
}