diff --git a/mediapipe/examples/desktop/face_detection/BUILD b/mediapipe/examples/desktop/face_detection/BUILD index 8cd75b44e..6d131ac68 100644 --- a/mediapipe/examples/desktop/face_detection/BUILD +++ b/mediapipe/examples/desktop/face_detection/BUILD @@ -24,6 +24,46 @@ cc_binary( ], ) +cc_binary( + name = "face_detection_full_range_cpu_fps", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main_fps", + "//mediapipe/graphs/face_detection:face_detection_full_range_desktop_live_deps", + ], +) + +cc_binary( + name = "face_detection_full_range_onnx_cuda", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main", + "//mediapipe/graphs/face_detection:face_detection_full_range_desktop_live_onnx_cuda_deps", + ], +) + +cc_binary( + name = "face_detection_full_range_onnx_cuda_fps", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main_fps", + "//mediapipe/graphs/face_detection:face_detection_full_range_desktop_live_onnx_cuda_deps", + ], +) + +cc_binary( + name = "face_detection_full_range_onnx_tensorrt", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main", + "//mediapipe/graphs/face_detection:face_detection_full_range_desktop_live_onnx_tensorrt_deps", + ], +) + +cc_binary( + name = "face_detection_full_range_onnx_tensorrt_fps", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main_fps", + "//mediapipe/graphs/face_detection:face_detection_full_range_desktop_live_onnx_tensorrt_deps", + ], +) + cc_binary( name = "face_detection_cpu", deps = [ @@ -32,6 +72,46 @@ cc_binary( ], ) +cc_binary( + name = "face_detection_cpu_fps", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main_fps", + "//mediapipe/graphs/face_detection:desktop_live_calculators", + ], +) + +cc_binary( + name = "face_detection_onnx_cuda", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main", + "//mediapipe/graphs/face_detection:desktop_live_onnx_cuda_calculators", + ], +) + +cc_binary( + name = "face_detection_onnx_cuda_fps", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main_fps", + "//mediapipe/graphs/face_detection:desktop_live_onnx_cuda_calculators", + ], +) + +cc_binary( + name = "face_detection_onnx_tensorrt", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main", + "//mediapipe/graphs/face_detection:desktop_live_onnx_tensorrt_calculators", + ], +) + +cc_binary( + name = "face_detection_onnx_tensorrt_fps", + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main_fps", + "//mediapipe/graphs/face_detection:desktop_live_onnx_tensorrt_calculators", + ], +) + # Linux only cc_binary( name = "face_detection_gpu", diff --git a/mediapipe/graphs/face_detection/BUILD b/mediapipe/graphs/face_detection/BUILD index 9e7cf2505..81eec6692 100644 --- a/mediapipe/graphs/face_detection/BUILD +++ b/mediapipe/graphs/face_detection/BUILD @@ -43,6 +43,26 @@ cc_library( ], ) +cc_library( + name = "desktop_live_onnx_cuda_calculators", + deps = [ + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:detections_to_render_data_calculator", + "//mediapipe/modules/face_detection:face_detection_short_range_onnx_cuda", + ], +) + +cc_library( + name = "desktop_live_onnx_tensorrt_calculators", + deps = [ + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:detections_to_render_data_calculator", + "//mediapipe/modules/face_detection:face_detection_short_range_onnx_tensorrt", + 
],
+)
+
 cc_library(
     name = "desktop_live_gpu_calculators",
     deps = [
@@ -93,3 +113,23 @@ cc_library(
         "//mediapipe/modules/face_detection:face_detection_full_range_cpu",
     ],
 )
+
+cc_library(
+    name = "face_detection_full_range_desktop_live_onnx_cuda_deps",
+    deps = [
+        "//mediapipe/calculators/core:flow_limiter_calculator",
+        "//mediapipe/calculators/util:annotation_overlay_calculator",
+        "//mediapipe/calculators/util:detections_to_render_data_calculator",
+        "//mediapipe/modules/face_detection:face_detection_full_range_onnx_cuda",
+    ],
+)
+
+cc_library(
+    name = "face_detection_full_range_desktop_live_onnx_tensorrt_deps",
+    deps = [
+        "//mediapipe/calculators/core:flow_limiter_calculator",
+        "//mediapipe/calculators/util:annotation_overlay_calculator",
+        "//mediapipe/calculators/util:detections_to_render_data_calculator",
+        "//mediapipe/modules/face_detection:face_detection_full_range_onnx_tensorrt",
+    ],
+)
diff --git a/mediapipe/graphs/face_detection/face_detection_desktop_live_onnx_cuda.pbtxt b/mediapipe/graphs/face_detection/face_detection_desktop_live_onnx_cuda.pbtxt
new file mode 100644
index 000000000..367327335
--- /dev/null
+++ b/mediapipe/graphs/face_detection/face_detection_desktop_live_onnx_cuda.pbtxt
@@ -0,0 +1,58 @@
+# MediaPipe graph that performs face detection with ONNX Runtime on CUDA.
+
+# CPU buffer. (ImageFrame)
+input_stream: "input_video"
+
+# Output image with rendered results. (ImageFrame)
+output_stream: "output_video"
+# Detected faces. (std::vector<Detection>)
+output_stream: "face_detections"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, unwanted
+# in real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Subgraph that detects faces.
+node {
+  calculator: "FaceDetectionShortRangeOnnxCUDA"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "DETECTIONS:face_detections"
+}
+
+# Converts the detections to drawing primitives for annotation overlay.
+node {
+  calculator: "DetectionsToRenderDataCalculator"
+  input_stream: "DETECTIONS:face_detections"
+  output_stream: "RENDER_DATA:render_data"
+  node_options: {
+    [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] {
+      thickness: 4.0
+      color { r: 255 g: 0 b: 0 }
+    }
+  }
+}
+
+# Draws annotations and overlays them on top of the input images.
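+# The render data is passed as an untagged input stream alongside the
+# throttled frame, and the annotated output_video also closes the
+# FlowLimiterCalculator's FINISHED back edge above.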
+node {
+  calculator: "AnnotationOverlayCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "render_data"
+  output_stream: "IMAGE:output_video"
+}
diff --git a/mediapipe/graphs/face_detection/face_detection_desktop_live_onnx_tensorrt.pbtxt b/mediapipe/graphs/face_detection/face_detection_desktop_live_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..d3a7f097f
--- /dev/null
+++ b/mediapipe/graphs/face_detection/face_detection_desktop_live_onnx_tensorrt.pbtxt
@@ -0,0 +1,58 @@
+# MediaPipe graph that performs face detection with ONNX Runtime on TensorRT.
+
+# CPU buffer. (ImageFrame)
+input_stream: "input_video"
+
+# Output image with rendered results. (ImageFrame)
+output_stream: "output_video"
+# Detected faces. (std::vector<Detection>)
+output_stream: "face_detections"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, unwanted
+# in real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Subgraph that detects faces.
+node {
+  calculator: "FaceDetectionShortRangeOnnxTensorRT"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "DETECTIONS:face_detections"
+}
+
+# Converts the detections to drawing primitives for annotation overlay.
+node {
+  calculator: "DetectionsToRenderDataCalculator"
+  input_stream: "DETECTIONS:face_detections"
+  output_stream: "RENDER_DATA:render_data"
+  node_options: {
+    [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] {
+      thickness: 4.0
+      color { r: 255 g: 0 b: 0 }
+    }
+  }
+}
+
+# Draws annotations and overlays them on top of the input images.
+node {
+  calculator: "AnnotationOverlayCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "render_data"
+  output_stream: "IMAGE:output_video"
+}
diff --git a/mediapipe/graphs/face_detection/face_detection_full_range_desktop_live_onnx_cuda.pbtxt b/mediapipe/graphs/face_detection/face_detection_full_range_desktop_live_onnx_cuda.pbtxt
new file mode 100644
index 000000000..d33a772a3
--- /dev/null
+++ b/mediapipe/graphs/face_detection/face_detection_full_range_desktop_live_onnx_cuda.pbtxt
@@ -0,0 +1,58 @@
+# MediaPipe graph that performs full-range face detection with ONNX Runtime
+# on CUDA.
+
+# CPU images coming into and out of the graph. (ImageFrame)
+input_stream: "input_video"
+output_stream: "output_video"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for the face detection
+# subgraph downstream in the graph to finish generating the corresponding
+# detections before it passes through another image. All images that come in
+# while waiting are dropped, limiting the number of in-flight images between
+# this calculator and the face detection subgraph to 1. This prevents the
+# nodes in between from queuing up incoming images and data excessively, which
+# leads to increased latency and memory usage, unwanted in real-time mobile
+# applications. It also eliminates unnecessary computation, e.g., an image
+# transformed inside the subgraph may get dropped downstream if the subsequent
+# inference calculator is still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:detections"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Detects faces.
+node {
+  calculator: "FaceDetectionFullRangeOnnxCUDA"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "DETECTIONS:detections"
+}
+
+# Converts the detections to drawing primitives for annotation overlay.
+node {
+  calculator: "DetectionsToRenderDataCalculator"
+  input_stream: "DETECTIONS:detections"
+  output_stream: "RENDER_DATA:render_data"
+  node_options: {
+    [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] {
+      thickness: 4.0
+      color { r: 255 g: 0 b: 0 }
+    }
+  }
+}
+
+# Draws annotations and overlays them on top of the input images.
+node {
+  calculator: "AnnotationOverlayCalculator"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "render_data"
+  output_stream: "IMAGE:output_video"
+}
+
diff --git a/mediapipe/graphs/face_detection/face_detection_full_range_desktop_live_onnx_tensorrt.pbtxt b/mediapipe/graphs/face_detection/face_detection_full_range_desktop_live_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..4db446757
--- /dev/null
+++ b/mediapipe/graphs/face_detection/face_detection_full_range_desktop_live_onnx_tensorrt.pbtxt
@@ -0,0 +1,58 @@
+# MediaPipe graph that performs full-range face detection with ONNX Runtime
+# on TensorRT.
+
+# CPU images coming into and out of the graph. (ImageFrame)
+input_stream: "input_video"
+output_stream: "output_video"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for the face detection
+# subgraph downstream in the graph to finish generating the corresponding
+# detections before it passes through another image. All images that come in
+# while waiting are dropped, limiting the number of in-flight images between
+# this calculator and the face detection subgraph to 1. This prevents the
+# nodes in between from queuing up incoming images and data excessively, which
+# leads to increased latency and memory usage, unwanted in real-time mobile
+# applications. It also eliminates unnecessary computation, e.g., an image
+# transformed inside the subgraph may get dropped downstream if the subsequent
+# inference calculator is still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:detections"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Detects faces.
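+# Per the subgraph options, the full-range detector below runs
+# face_detection_full_range.onnx on a 192x192 input tensor and decodes 2304
+# candidate boxes before non-max suppression.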
+node { + calculator: "FaceDetectionFullRangeOnnxTensorRT" + input_stream: "IMAGE:throttled_input_video" + output_stream: "DETECTIONS:detections" +} + +# Converts the detections to drawing primitives for annotation overlay. +node { + calculator: "DetectionsToRenderDataCalculator" + input_stream: "DETECTIONS:detections" + output_stream: "RENDER_DATA:render_data" + node_options: { + [type.googleapis.com/mediapipe.DetectionsToRenderDataCalculatorOptions] { + thickness: 4.0 + color { r: 255 g: 0 b: 0 } + } + } +} + +# Draws annotations and overlays them on top of the input images. +node { + calculator: "AnnotationOverlayCalculator" + input_stream: "IMAGE:throttled_input_video" + input_stream: "render_data" + output_stream: "IMAGE:output_video" +} + diff --git a/mediapipe/modules/face_detection/BUILD b/mediapipe/modules/face_detection/BUILD index 84c9388ea..d6815d5ac 100644 --- a/mediapipe/modules/face_detection/BUILD +++ b/mediapipe/modules/face_detection/BUILD @@ -17,7 +17,7 @@ load( "mediapipe_simple_subgraph", ) load("//mediapipe/framework/port:build_config.bzl", "mediapipe_proto_library") -load("//mediapipe/framework:mediapipe_cc_test.bzl", "mediapipe_cc_test") +load("//mediapipe/framework:mediapipe_cc_test.bzl", "mediapipe_cc_test") #@unused licenses(["notice"]) @@ -35,6 +35,24 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_detection_short_range_by_roi_onnx_cuda", + graph = "face_detection_short_range_by_roi_onnx_cuda.pbtxt", + register_as = "FaceDetectionShortRangeByRoiOnnxCUDA", + deps = [ + ":face_detection_short_range_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "face_detection_short_range_by_roi_onnx_tensorrt", + graph = "face_detection_short_range_by_roi_onnx_tensorrt.pbtxt", + register_as = "FaceDetectionShortRangeByRoiOnnxTensorRT", + deps = [ + ":face_detection_short_range_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "face_detection_short_range_by_roi_gpu", graph = "face_detection_short_range_by_roi_gpu.pbtxt", @@ -74,6 +92,24 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_detection_short_range_onnx_cuda", + graph = "face_detection_short_range_onnx_cuda.pbtxt", + register_as = "FaceDetectionShortRangeOnnxCUDA", + deps = [ + ":face_detection_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "face_detection_short_range_onnx_tensorrt", + graph = "face_detection_short_range_onnx_tensorrt.pbtxt", + register_as = "FaceDetectionShortRangeOnnxTensorRT", + deps = [ + ":face_detection_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "face_detection_full_range", graph = "face_detection_full_range.pbtxt", @@ -83,6 +119,24 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_detection_full_range_onnx_cuda", + graph = "face_detection_full_range_onnx_cuda.pbtxt", + register_as = "FaceDetectionFullRangeOnnxCUDA", + deps = [ + ":face_detection_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "face_detection_full_range_onnx_tensorrt", + graph = "face_detection_full_range_onnx_tensorrt.pbtxt", + register_as = "FaceDetectionFullRangeOnnxTensorRT", + deps = [ + ":face_detection_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "face_detection_without_roi", graph = "face_detection_without_roi.pbtxt", @@ -110,6 +164,42 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_detection_onnx_cuda", + graph = "face_detection_onnx_cuda.pbtxt", + register_as = "FaceDetectionOnnxCUDA", + deps = [ + ":face_detection_cc_proto", + 
":face_detection_options_lib", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/util:detection_projection_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", + "//mediapipe/calculators/util:to_image_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "face_detection_onnx_tensorrt", + graph = "face_detection_onnx_tensorrt.pbtxt", + register_as = "FaceDetectionOnnxTensorRT", + deps = [ + ":face_detection_cc_proto", + ":face_detection_options_lib", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt", + "//mediapipe/calculators/tensor:tensors_to_detections_calculator", + "//mediapipe/calculators/tflite:ssd_anchors_calculator", + "//mediapipe/calculators/util:detection_projection_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", + "//mediapipe/calculators/util:to_image_calculator", + ], +) + mediapipe_proto_library( name = "face_detection_proto", srcs = ["face_detection.proto"], @@ -168,8 +258,11 @@ mediapipe_simple_subgraph( exports_files( srcs = [ + "face_detection_full_range.onnx", "face_detection_full_range.tflite", + "face_detection_full_range_sparse.onnx", "face_detection_full_range_sparse.tflite", + "face_detection_short_range.onnx", "face_detection_short_range.tflite", ], ) diff --git a/mediapipe/modules/face_detection/face_detection_full_range.onnx b/mediapipe/modules/face_detection/face_detection_full_range.onnx index ef2d8df17..27c3a29b9 100644 Binary files a/mediapipe/modules/face_detection/face_detection_full_range.onnx and b/mediapipe/modules/face_detection/face_detection_full_range.onnx differ diff --git a/mediapipe/modules/face_detection/face_detection_full_range_onnx_cuda.pbtxt b/mediapipe/modules/face_detection/face_detection_full_range_onnx_cuda.pbtxt new file mode 100644 index 000000000..44c13f661 --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_full_range_onnx_cuda.pbtxt @@ -0,0 +1,37 @@ +type: "FaceDetectionFullRangeOnnxCUDA" + +input_stream: "IMAGE:image" + +input_stream: "ROI:roi" + +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node { + calculator: "FaceDetectionOnnxCUDA" + input_stream: "IMAGE:image" + input_stream: "ROI:roi" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + model_path: "mediapipe/modules/face_detection/face_detection_full_range.onnx" + tensor_width: 192 + tensor_height: 192 + + num_layers: 1 + strides: 4 + interpolated_scale_aspect_ratio: 0.0 + + num_boxes: 2304 + x_scale: 192.0 + y_scale: 192.0 + h_scale: 192.0 + w_scale: 192.0 + min_score_thresh: 0.6 + } + } + option_value: "OPTIONS:options" +} \ No newline at end of file diff --git a/mediapipe/modules/face_detection/face_detection_full_range_onnx_tensorrt.pbtxt b/mediapipe/modules/face_detection/face_detection_full_range_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..24dd21772 --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_full_range_onnx_tensorrt.pbtxt @@ -0,0 +1,37 @@ +type: "FaceDetectionFullRangeOnnxTensorRT" + +input_stream: 
"IMAGE:image" + +input_stream: "ROI:roi" + +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node { + calculator: "FaceDetectionOnnxTensorRT" + input_stream: "IMAGE:image" + input_stream: "ROI:roi" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + model_path: "mediapipe/modules/face_detection/face_detection_full_range.onnx" + tensor_width: 192 + tensor_height: 192 + + num_layers: 1 + strides: 4 + interpolated_scale_aspect_ratio: 0.0 + + num_boxes: 2304 + x_scale: 192.0 + y_scale: 192.0 + h_scale: 192.0 + w_scale: 192.0 + min_score_thresh: 0.6 + } + } + option_value: "OPTIONS:options" +} diff --git a/mediapipe/modules/face_detection/face_detection_onnx_cuda.pbtxt b/mediapipe/modules/face_detection/face_detection_onnx_cuda.pbtxt new file mode 100644 index 000000000..5d23d3f16 --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_onnx_cuda.pbtxt @@ -0,0 +1,155 @@ +type: "FaceDetectionOnnxCUDA" + +# The input image, either ImageFrame, GpuBuffer, or (multi-backend) Image. +input_stream: "IMAGE:image" + +# ROI (region of interest) within the given image where faces should be +# detected. (NormalizedRect) +input_stream: "ROI:roi" + +# Detected faces. (std::vector) +# NOTE: there will not be an output packet in the DETECTIONS stream for this +# particular timestamp if none of faces detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +# Converts the input CPU or GPU image to the multi-backend image type (Image). +node: { + calculator: "ToImageCalculator" + input_stream: "IMAGE:image" + output_stream: "IMAGE:multi_backend_image" +} + +# Transforms the input image into a 128x128 tensor while keeping the aspect +# ratio (what is expected by the corresponding face detection model), resulting +# in potential letterboxing in the transformed image. +node: { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:multi_backend_image" + input_stream: "NORM_RECT:roi" + output_stream: "TENSORS:input_tensors" + output_stream: "MATRIX:transform_matrix" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + border_mode: BORDER_ZERO + } + } + option_value: "gpu_origin:options/gpu_origin" + option_value: "output_tensor_width:options/tensor_width" + option_value: "output_tensor_height:options/tensor_height" +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + output_stream: "TENSORS:detection_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + delegate { cuda {} } + } + } + option_value: "model_path:options/model_path" +} + +# Detection tensors. (std::vector) +#input_stream: "TENSORS:detection_tensors" + +# A 4x4 row-major-order matrix that maps a point represented in the detection +# tensors to a desired coordinate system, e.g., in the original input image +# before scaling/cropping. (std::array) +#input_stream: "MATRIX:transform_matrix" + +# Detected faces. 
(std::vector) +# NOTE: there will not be an output packet in the DETECTIONS stream for this +# particular timestamp if none of faces detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +#output_stream: "DETECTIONS:detections" + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + options: { + [mediapipe.SsdAnchorsCalculatorOptions.ext] { + num_layers: 1 + min_scale: 0.1484375 + max_scale: 0.75 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + aspect_ratios: 1.0 + fixed_anchor_size: true + } + } + option_value: "input_size_width:tensor_width" + option_value: "input_size_height:tensor_height" + option_value: "num_layers:num_layers" + option_value: "strides:strides" + option_value: "interpolated_scale_aspect_ratio:interpolated_scale_aspect_ratio" +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. +node { + calculator: "TensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:unfiltered_detections" + options: { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { + num_classes: 1 + num_coords: 16 + box_coord_offset: 0 + keypoint_coord_offset: 4 + num_keypoints: 6 + num_values_per_keypoint: 2 + sigmoid_score: true + score_clipping_thresh: 100.0 + reverse_output_order: true + } + } + option_value: "num_boxes:num_boxes" + option_value: "x_scale:x_scale" + option_value: "y_scale:y_scale" + option_value: "h_scale:h_scale" + option_value: "w_scale:w_scale" + option_value: "min_score_thresh:min_score_thresh" +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "unfiltered_detections" + output_stream: "filtered_detections" + options: { + [mediapipe.NonMaxSuppressionCalculatorOptions.ext] { + min_suppression_threshold: 0.3 + overlap_type: INTERSECTION_OVER_UNION + algorithm: WEIGHTED + } + } +} + +# Projects the detections from input tensor to the corresponding locations on +# the original image (input to the graph). +node { + calculator: "DetectionProjectionCalculator" + input_stream: "DETECTIONS:filtered_detections" + input_stream: "PROJECTION_MATRIX:transform_matrix" + output_stream: "DETECTIONS:detections" +} diff --git a/mediapipe/modules/face_detection/face_detection_onnx_tensorrt.pbtxt b/mediapipe/modules/face_detection/face_detection_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..321736b5f --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_onnx_tensorrt.pbtxt @@ -0,0 +1,165 @@ +# MediaPipe graph to detect faces. +# +# EXAMPLE: +# node { +# calculator: "FaceDetectionFrontCpu" +# input_stream: "IMAGE:image" +# input_stream: "ROI:roi" +# output_stream: "DETECTIONS:face_detections" +# } + +type: "FaceDetectionOnnxTensorRT" + +# The input image, either ImageFrame, GpuBuffer, or (multi-backend) Image. +input_stream: "IMAGE:image" + +# ROI (region of interest) within the given image where faces should be +# detected. (NormalizedRect) +input_stream: "ROI:roi" + +# Detected faces. 
(std::vector) +# NOTE: there will not be an output packet in the DETECTIONS stream for this +# particular timestamp if none of faces detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +# Converts the input CPU or GPU image to the multi-backend image type (Image). +node: { + calculator: "ToImageCalculator" + input_stream: "IMAGE:image" + output_stream: "IMAGE:multi_backend_image" +} + +# Transforms the input image into a 128x128 tensor while keeping the aspect +# ratio (what is expected by the corresponding face detection model), resulting +# in potential letterboxing in the transformed image. +node: { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:multi_backend_image" + input_stream: "NORM_RECT:roi" + output_stream: "TENSORS:input_tensors" + output_stream: "MATRIX:transform_matrix" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + keep_aspect_ratio: true + output_tensor_float_range { + min: -1.0 + max: 1.0 + } + border_mode: BORDER_ZERO + } + } + option_value: "gpu_origin:options/gpu_origin" + option_value: "output_tensor_width:options/tensor_width" + option_value: "output_tensor_height:options/tensor_height" +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + output_stream: "TENSORS:detection_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + delegate { tensorrt {} } + } + } + option_value: "model_path:options/model_path" +} + +# Detection tensors. (std::vector) +#input_stream: "TENSORS:detection_tensors" + +# A 4x4 row-major-order matrix that maps a point represented in the detection +# tensors to a desired coordinate system, e.g., in the original input image +# before scaling/cropping. (std::array) +#input_stream: "MATRIX:transform_matrix" + +# Detected faces. (std::vector) +# NOTE: there will not be an output packet in the DETECTIONS stream for this +# particular timestamp if none of faces detected. However, the MediaPipe +# framework will internally inform the downstream calculators of the absence of +# this packet so that they don't wait for it unnecessarily. +#output_stream: "DETECTIONS:detections" + +# Generates a single side packet containing a vector of SSD anchors based on +# the specification in the options. +node { + calculator: "SsdAnchorsCalculator" + output_side_packet: "anchors" + options: { + [mediapipe.SsdAnchorsCalculatorOptions.ext] { + num_layers: 1 + min_scale: 0.1484375 + max_scale: 0.75 + anchor_offset_x: 0.5 + anchor_offset_y: 0.5 + aspect_ratios: 1.0 + fixed_anchor_size: true + } + } + option_value: "input_size_width:tensor_width" + option_value: "input_size_height:tensor_height" + option_value: "num_layers:num_layers" + option_value: "strides:strides" + option_value: "interpolated_scale_aspect_ratio:interpolated_scale_aspect_ratio" +} + +# Decodes the detection tensors generated by the TensorFlow Lite model, based on +# the SSD anchors and the specification in the options, into a vector of +# detections. Each detection describes a detected object. 
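+# Per the options below, each of the num_boxes detections carries 16 values:
+# 4 box coordinates at box_coord_offset 0, then 6 keypoints of 2 values each
+# starting at keypoint_coord_offset 4.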
+node { + calculator: "TensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + input_side_packet: "ANCHORS:anchors" + output_stream: "DETECTIONS:unfiltered_detections" + options: { + [mediapipe.TensorsToDetectionsCalculatorOptions.ext] { + num_classes: 1 + num_coords: 16 + box_coord_offset: 0 + keypoint_coord_offset: 4 + num_keypoints: 6 + num_values_per_keypoint: 2 + sigmoid_score: true + score_clipping_thresh: 100.0 + reverse_output_order: true + } + } + option_value: "num_boxes:num_boxes" + option_value: "x_scale:x_scale" + option_value: "y_scale:y_scale" + option_value: "h_scale:h_scale" + option_value: "w_scale:w_scale" + option_value: "min_score_thresh:min_score_thresh" +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "unfiltered_detections" + output_stream: "filtered_detections" + options: { + [mediapipe.NonMaxSuppressionCalculatorOptions.ext] { + min_suppression_threshold: 0.3 + overlap_type: INTERSECTION_OVER_UNION + algorithm: WEIGHTED + } + } +} + +# Projects the detections from input tensor to the corresponding locations on +# the original image (input to the graph). +node { + calculator: "DetectionProjectionCalculator" + input_stream: "DETECTIONS:filtered_detections" + input_stream: "PROJECTION_MATRIX:transform_matrix" + output_stream: "DETECTIONS:detections" +} diff --git a/mediapipe/modules/face_detection/face_detection_short_range.onnx b/mediapipe/modules/face_detection/face_detection_short_range.onnx index 638600236..18edac978 100644 Binary files a/mediapipe/modules/face_detection/face_detection_short_range.onnx and b/mediapipe/modules/face_detection/face_detection_short_range.onnx differ diff --git a/mediapipe/modules/face_detection/face_detection_short_range_by_roi_onnx_cuda.pbtxt b/mediapipe/modules/face_detection/face_detection_short_range_by_roi_onnx_cuda.pbtxt new file mode 100644 index 000000000..10dd4774f --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_short_range_by_roi_onnx_cuda.pbtxt @@ -0,0 +1,40 @@ +type: "FaceDetectionShortRangeByRoiOnnxCUDA" + +input_stream: "IMAGE:image" + +input_stream: "ROI:roi" + +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node { + calculator: "FaceDetectionOnnxCUDA" + input_stream: "IMAGE:image" + input_stream: "ROI:roi" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + model_path: "mediapipe/modules/face_detection/face_detection_short_range.onnx" + tensor_width: 128 + tensor_height: 128 + + num_layers: 4 + strides: 8 + strides: 16 + strides: 16 + strides: 16 + interpolated_scale_aspect_ratio: 1.0 + + num_boxes: 896 + x_scale: 128.0 + y_scale: 128.0 + h_scale: 128.0 + w_scale: 128.0 + min_score_thresh: 0.5 + } + } + option_value: "OPTIONS:options" +} diff --git a/mediapipe/modules/face_detection/face_detection_short_range_by_roi_onnx_tensorrt.pbtxt b/mediapipe/modules/face_detection/face_detection_short_range_by_roi_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..9d431912e --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_short_range_by_roi_onnx_tensorrt.pbtxt @@ -0,0 +1,40 @@ +type: "FaceDetectionShortRangeByRoiOnnxTensorRT" + +input_stream: "IMAGE:image" + +input_stream: "ROI:roi" + +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node { + calculator: 
"FaceDetectionOnnxTensorRT" + input_stream: "IMAGE:image" + input_stream: "ROI:roi" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + model_path: "mediapipe/modules/face_detection/face_detection_short_range.onnx" + tensor_width: 128 + tensor_height: 128 + + num_layers: 4 + strides: 8 + strides: 16 + strides: 16 + strides: 16 + interpolated_scale_aspect_ratio: 1.0 + + num_boxes: 896 + x_scale: 128.0 + y_scale: 128.0 + h_scale: 128.0 + w_scale: 128.0 + min_score_thresh: 0.5 + } + } + option_value: "OPTIONS:options" +} diff --git a/mediapipe/modules/face_detection/face_detection_short_range_onnx_cuda.pbtxt b/mediapipe/modules/face_detection/face_detection_short_range_onnx_cuda.pbtxt new file mode 100644 index 000000000..9d79fb6ac --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_short_range_onnx_cuda.pbtxt @@ -0,0 +1,40 @@ +type: "FaceDetectionShortRangeOnnxCUDA" + +input_stream: "IMAGE:image" + +input_stream: "ROI:roi" + +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node { + calculator: "FaceDetectionOnnxCUDA" + input_stream: "IMAGE:image" + input_stream: "ROI:roi" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + model_path: "mediapipe/modules/face_detection/face_detection_short_range.onnx" + tensor_width: 128 + tensor_height: 128 + + num_layers: 4 + strides: 8 + strides: 16 + strides: 16 + strides: 16 + interpolated_scale_aspect_ratio: 1.0 + + num_boxes: 896 + x_scale: 128.0 + y_scale: 128.0 + h_scale: 128.0 + w_scale: 128.0 + min_score_thresh: 0.5 + } + } + option_value: "OPTIONS:options" +} diff --git a/mediapipe/modules/face_detection/face_detection_short_range_onnx_tensorrt.pbtxt b/mediapipe/modules/face_detection/face_detection_short_range_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..c54dff27d --- /dev/null +++ b/mediapipe/modules/face_detection/face_detection_short_range_onnx_tensorrt.pbtxt @@ -0,0 +1,40 @@ +type: "FaceDetectionShortRangeOnnxTensorRT" + +input_stream: "IMAGE:image" + +input_stream: "ROI:roi" + +output_stream: "DETECTIONS:detections" + +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node { + calculator: "FaceDetectionOnnxTensorRT" + input_stream: "IMAGE:image" + input_stream: "ROI:roi" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + model_path: "mediapipe/modules/face_detection/face_detection_short_range.onnx" + tensor_width: 128 + tensor_height: 128 + + num_layers: 4 + strides: 8 + strides: 16 + strides: 16 + strides: 16 + interpolated_scale_aspect_ratio: 1.0 + + num_boxes: 896 + x_scale: 128.0 + y_scale: 128.0 + h_scale: 128.0 + w_scale: 128.0 + min_score_thresh: 0.5 + } + } + option_value: "OPTIONS:options" +} diff --git a/mediapipe/modules/face_landmark/BUILD b/mediapipe/modules/face_landmark/BUILD index f155e46d5..331319fcf 100644 --- a/mediapipe/modules/face_landmark/BUILD +++ b/mediapipe/modules/face_landmark/BUILD @@ -42,6 +42,45 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_landmark_onnx_cuda", + graph = "face_landmark_onnx_cuda.pbtxt", + register_as = "FaceLandmarkOnnxCUDA", + deps = [ + ":tensors_to_face_landmarks", + ":tensors_to_face_landmarks_with_attention", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:split_vector_calculator", + 
"//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_cuda", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:thresholding_calculator", + "//mediapipe/framework/tool:switch_container", + ], +) + +mediapipe_simple_subgraph( + name = "face_landmark_onnx_tensorrt", + graph = "face_landmark_onnx_tensorrt.pbtxt", + register_as = "FaceLandmarkOnnxTensorRT", + deps = [ + ":tensors_to_face_landmarks", + ":tensors_to_face_landmarks_with_attention", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:split_vector_calculator", + "//mediapipe/calculators/tensor:image_to_tensor_calculator", + "//mediapipe/calculators/tensor:inference_calculator", + "//mediapipe/calculators/tensor:inference_calculator_onnx_tensorrt", + "//mediapipe/calculators/tensor:tensors_to_floats_calculator", + "//mediapipe/calculators/tensor:tensors_to_landmarks_calculator", + "//mediapipe/calculators/util:landmark_projection_calculator", + "//mediapipe/calculators/util:thresholding_calculator", + "//mediapipe/framework/tool:switch_container", + ], +) + mediapipe_simple_subgraph( name = "face_landmark_gpu", graph = "face_landmark_gpu.pbtxt", @@ -84,6 +123,48 @@ mediapipe_simple_subgraph( ], ) +mediapipe_simple_subgraph( + name = "face_landmark_front_onnx_cuda", + graph = "face_landmark_front_onnx_cuda.pbtxt", + register_as = "FaceLandmarkFrontOnnxCUDA", + deps = [ + ":face_detection_front_detection_to_roi", + ":face_landmark_landmarks_to_roi", + ":face_landmark_onnx_cuda", + "//mediapipe/calculators/core:begin_loop_calculator", + "//mediapipe/calculators/core:clip_vector_size_calculator", + "//mediapipe/calculators/core:constant_side_packet_calculator", + "//mediapipe/calculators/core:end_loop_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:association_norm_rect_calculator", + "//mediapipe/calculators/util:collection_has_min_size_calculator", + "//mediapipe/modules/face_detection:face_detection_short_range_onnx_cuda", + ], +) + +mediapipe_simple_subgraph( + name = "face_landmark_front_onnx_tensorrt", + graph = "face_landmark_front_onnx_tensorrt.pbtxt", + register_as = "FaceLandmarkFrontOnnxTensorRT", + deps = [ + ":face_detection_front_detection_to_roi", + ":face_landmark_landmarks_to_roi", + ":face_landmark_onnx_tensorrt", + "//mediapipe/calculators/core:begin_loop_calculator", + "//mediapipe/calculators/core:clip_vector_size_calculator", + "//mediapipe/calculators/core:constant_side_packet_calculator", + "//mediapipe/calculators/core:end_loop_calculator", + "//mediapipe/calculators/core:gate_calculator", + "//mediapipe/calculators/core:previous_loopback_calculator", + "//mediapipe/calculators/image:image_properties_calculator", + "//mediapipe/calculators/util:association_norm_rect_calculator", + "//mediapipe/calculators/util:collection_has_min_size_calculator", + "//mediapipe/modules/face_detection:face_detection_short_range_onnx_tensorrt", + ], +) + mediapipe_simple_subgraph( name = "face_landmark_front_gpu", graph = "face_landmark_front_gpu.pbtxt", diff --git a/mediapipe/modules/face_landmark/face_landmark_front_onnx_cuda.pbtxt 
b/mediapipe/modules/face_landmark/face_landmark_front_onnx_cuda.pbtxt
new file mode 100644
index 000000000..fa1283b14
--- /dev/null
+++ b/mediapipe/modules/face_landmark/face_landmark_front_onnx_cuda.pbtxt
@@ -0,0 +1,247 @@
+# MediaPipe graph to detect/predict face landmarks. (CPU input, and inference
+# is executed with ONNX Runtime on CUDA.) This graph tries to skip face
+# detection as much as possible by using previously detected/predicted
+# landmarks for new images.
+#
+# It is required that "face_detection_short_range.onnx" is available at
+# "mediapipe/modules/face_detection/face_detection_short_range.onnx"
+# path during execution.
+#
+# It is required that "face_landmark.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark.onnx"
+# path during execution if `with_attention` is not set or set to `false`.
+#
+# It is required that "face_landmark_with_attention.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark_with_attention.onnx"
+# path during execution if `with_attention` is set to `true`.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "FaceLandmarkFrontOnnxCUDA"
+#     input_stream: "IMAGE:image"
+#     input_side_packet: "NUM_FACES:num_faces"
+#     input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+#     input_side_packet: "WITH_ATTENTION:with_attention"
+#     output_stream: "LANDMARKS:multi_face_landmarks"
+#   }
+
+type: "FaceLandmarkFrontOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+
+# Max number of faces to detect/track. (int)
+input_side_packet: "NUM_FACES:num_faces"
+
+# Whether landmarks on the previous image should be used to help localize
+# landmarks on the current image. (bool)
+input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+
+# Whether to run the face mesh model with attention on lips and eyes. (bool)
+# Attention provides more accuracy on lips and eye regions as well as iris
+# landmarks.
+input_side_packet: "WITH_ATTENTION:with_attention"
+
+# Collection of detected/predicted faces, each represented as a list of 468
+# face landmarks (478 if `with_attention` is set).
+# (std::vector<NormalizedLandmarkList>)
+# NOTE: there will not be an output packet in the LANDMARKS stream for this
+# particular timestamp if no faces are detected. However, the MediaPipe
+# framework will internally inform the downstream calculators of the absence of
+# this packet so that they don't wait for it unnecessarily.
+output_stream: "LANDMARKS:multi_face_landmarks"
+
+# Extra outputs (for debugging, for instance).
+# Detected faces. (std::vector<Detection>)
+output_stream: "DETECTIONS:face_detections"
+# Regions of interest calculated based on landmarks.
+# (std::vector<NormalizedRect>)
+output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
+# Regions of interest calculated based on face detections.
+# (std::vector<NormalizedRect>)
+output_stream: "ROIS_FROM_DETECTIONS:face_rects_from_detections"
+
+# When the optional input side packet "use_prev_landmarks" is either absent or
+# set to true, uses the landmarks on the previous image to help localize
+# landmarks on the current image.
+node {
+  calculator: "GateCalculator"
+  input_side_packet: "ALLOW:use_prev_landmarks"
+  input_stream: "prev_face_rects_from_landmarks"
+  output_stream: "gated_prev_face_rects_from_landmarks"
+  options: {
+    [mediapipe.GateCalculatorOptions.ext] {
+      allow: true
+    }
+  }
+}
+
+# Determines if an input vector of NormalizedRect has a size greater than or
+# equal to the provided num_faces.
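+# The resulting flag drives the DISALLOW gate below, so a fresh face detection
+# pass runs only when the rects tracked from the previous frame no longer
+# cover num_faces faces.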
+node {
+  calculator: "NormalizedRectVectorHasMinSizeCalculator"
+  input_stream: "ITERABLE:gated_prev_face_rects_from_landmarks"
+  input_side_packet: "num_faces"
+  output_stream: "prev_has_enough_faces"
+}
+
+# Drops the incoming image if enough faces have already been identified from
+# the previous image. Otherwise, passes the incoming image through to trigger a
+# new round of face detection.
+node {
+  calculator: "GateCalculator"
+  input_stream: "image"
+  input_stream: "DISALLOW:prev_has_enough_faces"
+  output_stream: "gated_image"
+  options: {
+    [mediapipe.GateCalculatorOptions.ext] {
+      empty_packets_as_allow: true
+    }
+  }
+}
+
+# Detects faces.
+node {
+  calculator: "FaceDetectionShortRangeOnnxCUDA"
+  input_stream: "IMAGE:gated_image"
+  output_stream: "DETECTIONS:all_face_detections"
+}
+
+# Makes sure there are no more detections than the provided num_faces.
+node {
+  calculator: "ClipDetectionVectorSizeCalculator"
+  input_stream: "all_face_detections"
+  output_stream: "face_detections"
+  input_side_packet: "num_faces"
+}
+
+# Calculates the size of the image.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:gated_image"
+  output_stream: "SIZE:gated_image_size"
+}
+
+# Outputs each element of face_detections at a fake timestamp for the rest of
+# the graph to process. Clones the image size packet for each face_detection at
+# the fake timestamp. At the end of the loop, outputs the BATCH_END timestamp
+# for downstream calculators to inform them that all elements in the vector
+# have been processed.
+node {
+  calculator: "BeginLoopDetectionCalculator"
+  input_stream: "ITERABLE:face_detections"
+  input_stream: "CLONE:gated_image_size"
+  output_stream: "ITEM:face_detection"
+  output_stream: "CLONE:detections_loop_image_size"
+  output_stream: "BATCH_END:detections_loop_end_timestamp"
+}
+
+# Calculates a region of interest based on each face detection, so that it can
+# be used to detect landmarks.
+node {
+  calculator: "FaceDetectionFrontDetectionToRoi"
+  input_stream: "DETECTION:face_detection"
+  input_stream: "IMAGE_SIZE:detections_loop_image_size"
+  output_stream: "ROI:face_rect_from_detection"
+}
+
+# Collects a NormalizedRect for each face into a vector. Upon receiving the
+# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END
+# timestamp.
+node {
+  calculator: "EndLoopNormalizedRectCalculator"
+  input_stream: "ITEM:face_rect_from_detection"
+  input_stream: "BATCH_END:detections_loop_end_timestamp"
+  output_stream: "ITERABLE:face_rects_from_detections"
+}
+
+# Performs association between NormalizedRect vector elements from the previous
+# image and rects based on face detections from the current image. This
+# calculator ensures that the output face_rects vector doesn't contain
+# overlapping regions based on the specified min_similarity_threshold.
+node {
+  calculator: "AssociationNormRectCalculator"
+  input_stream: "face_rects_from_detections"
+  input_stream: "gated_prev_face_rects_from_landmarks"
+  output_stream: "face_rects"
+  options: {
+    [mediapipe.AssociationCalculatorOptions.ext] {
+      min_similarity_threshold: 0.5
+    }
+  }
+}
+
+# Calculates the size of the image.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:image"
+  output_stream: "SIZE:image_size"
+}
+
+# Outputs each element of face_rects at a fake timestamp for the rest of the
+# graph to process. Clones image and image size packets for each
+# single_face_rect at the fake timestamp.
+# At the end of the loop, outputs the BATCH_END timestamp for downstream
+# calculators to inform them that all elements in the vector have been
+# processed.
+node {
+  calculator: "BeginLoopNormalizedRectCalculator"
+  input_stream: "ITERABLE:face_rects"
+  input_stream: "CLONE:0:image"
+  input_stream: "CLONE:1:image_size"
+  output_stream: "ITEM:face_rect"
+  output_stream: "CLONE:0:landmarks_loop_image"
+  output_stream: "CLONE:1:landmarks_loop_image_size"
+  output_stream: "BATCH_END:landmarks_loop_end_timestamp"
+}
+
+# Detects face landmarks within the specified region of interest of the image.
+node {
+  calculator: "FaceLandmarkOnnxCUDA"
+  input_stream: "IMAGE:landmarks_loop_image"
+  input_stream: "ROI:face_rect"
+  input_side_packet: "WITH_ATTENTION:with_attention"
+  output_stream: "LANDMARKS:face_landmarks"
+}
+
+# Calculates a region of interest based on face landmarks, so that it can be
+# reused for the subsequent image.
+node {
+  calculator: "FaceLandmarkLandmarksToRoi"
+  input_stream: "LANDMARKS:face_landmarks"
+  input_stream: "IMAGE_SIZE:landmarks_loop_image_size"
+  output_stream: "ROI:face_rect_from_landmarks"
+}
+
+# Collects a set of landmarks for each face into a vector. Upon receiving the
+# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END
+# timestamp.
+node {
+  calculator: "EndLoopNormalizedLandmarkListVectorCalculator"
+  input_stream: "ITEM:face_landmarks"
+  input_stream: "BATCH_END:landmarks_loop_end_timestamp"
+  output_stream: "ITERABLE:multi_face_landmarks"
+}
+
+# Collects a NormalizedRect for each face into a vector. Upon receiving the
+# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END
+# timestamp.
+node {
+  calculator: "EndLoopNormalizedRectCalculator"
+  input_stream: "ITEM:face_rect_from_landmarks"
+  input_stream: "BATCH_END:landmarks_loop_end_timestamp"
+  output_stream: "ITERABLE:face_rects_from_landmarks"
+}
+
+# Caches face rects calculated from landmarks, and upon the arrival of the next
+# input image, sends out the cached rects with timestamps replaced by that of
+# the input image, essentially generating a packet that carries the previous
+# face rects. Note that upon the arrival of the very first input image, a
+# timestamp bound update occurs to jump-start the feedback loop.
+node {
+  calculator: "PreviousLoopbackCalculator"
+  input_stream: "MAIN:image"
+  input_stream: "LOOP:face_rects_from_landmarks"
+  input_stream_info: {
+    tag_index: "LOOP"
+    back_edge: true
+  }
+  output_stream: "PREV_LOOP:prev_face_rects_from_landmarks"
+}
diff --git a/mediapipe/modules/face_landmark/face_landmark_front_onnx_tensorrt.pbtxt b/mediapipe/modules/face_landmark/face_landmark_front_onnx_tensorrt.pbtxt
new file mode 100644
index 000000000..fca5f7105
--- /dev/null
+++ b/mediapipe/modules/face_landmark/face_landmark_front_onnx_tensorrt.pbtxt
@@ -0,0 +1,247 @@
+# MediaPipe graph to detect/predict face landmarks. (CPU input, and inference
+# is executed with ONNX Runtime on TensorRT.) This graph tries to skip face
+# detection as much as possible by using previously detected/predicted
+# landmarks for new images.
+#
+# It is required that "face_detection_short_range.onnx" is available at
+# "mediapipe/modules/face_detection/face_detection_short_range.onnx"
+# path during execution.
+#
+# It is required that "face_landmark.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark.onnx"
+# path during execution if `with_attention` is not set or set to `false`.
+#
+# It is required that "face_landmark_with_attention.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark_with_attention.onnx"
+# path during execution if `with_attention` is set to `true`.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "FaceLandmarkFrontOnnxTensorRT"
+#     input_stream: "IMAGE:image"
+#     input_side_packet: "NUM_FACES:num_faces"
+#     input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+#     input_side_packet: "WITH_ATTENTION:with_attention"
+#     output_stream: "LANDMARKS:multi_face_landmarks"
+#   }
+
+type: "FaceLandmarkFrontOnnxTensorRT"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+
+# Max number of faces to detect/track. (int)
+input_side_packet: "NUM_FACES:num_faces"
+
+# Whether landmarks on the previous image should be used to help localize
+# landmarks on the current image. (bool)
+input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
+
+# Whether to run the face mesh model with attention on lips and eyes. (bool)
+# Attention provides more accuracy on lips and eye regions as well as iris
+# landmarks.
+input_side_packet: "WITH_ATTENTION:with_attention"
+
+# Collection of detected/predicted faces, each represented as a list of 468
+# face landmarks (478 if `with_attention` is set).
+# (std::vector<NormalizedLandmarkList>)
+# NOTE: there will not be an output packet in the LANDMARKS stream for this
+# particular timestamp if no faces are detected. However, the MediaPipe
+# framework will internally inform the downstream calculators of the absence of
+# this packet so that they don't wait for it unnecessarily.
+output_stream: "LANDMARKS:multi_face_landmarks"
+
+# Extra outputs (for debugging, for instance).
+# Detected faces. (std::vector<Detection>)
+output_stream: "DETECTIONS:face_detections"
+# Regions of interest calculated based on landmarks.
+# (std::vector<NormalizedRect>)
+output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
+# Regions of interest calculated based on face detections.
+# (std::vector<NormalizedRect>)
+output_stream: "ROIS_FROM_DETECTIONS:face_rects_from_detections"
+
+# When the optional input side packet "use_prev_landmarks" is either absent or
+# set to true, uses the landmarks on the previous image to help localize
+# landmarks on the current image.
+node {
+  calculator: "GateCalculator"
+  input_side_packet: "ALLOW:use_prev_landmarks"
+  input_stream: "prev_face_rects_from_landmarks"
+  output_stream: "gated_prev_face_rects_from_landmarks"
+  options: {
+    [mediapipe.GateCalculatorOptions.ext] {
+      allow: true
+    }
+  }
+}
+
+# Determines if an input vector of NormalizedRect has a size greater than or
+# equal to the provided num_faces.
+node {
+  calculator: "NormalizedRectVectorHasMinSizeCalculator"
+  input_stream: "ITERABLE:gated_prev_face_rects_from_landmarks"
+  input_side_packet: "num_faces"
+  output_stream: "prev_has_enough_faces"
+}
+
+# Drops the incoming image if enough faces have already been identified from
+# the previous image. Otherwise, passes the incoming image through to trigger a
+# new round of face detection.
+node {
+  calculator: "GateCalculator"
+  input_stream: "image"
+  input_stream: "DISALLOW:prev_has_enough_faces"
+  output_stream: "gated_image"
+  options: {
+    [mediapipe.GateCalculatorOptions.ext] {
+      empty_packets_as_allow: true
+    }
+  }
+}
+
+# Detects faces.
+node {
+  calculator: "FaceDetectionShortRangeOnnxTensorRT"
+  input_stream: "IMAGE:gated_image"
+  output_stream: "DETECTIONS:all_face_detections"
+}
+
+# Makes sure there are no more detections than the provided num_faces.
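+# Clipping the vector here bounds the per-frame work downstream: the landmark
+# loop below runs at most num_faces times per frame.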
+node {
+  calculator: "ClipDetectionVectorSizeCalculator"
+  input_stream: "all_face_detections"
+  output_stream: "face_detections"
+  input_side_packet: "num_faces"
+}
+
+# Calculates the size of the image.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:gated_image"
+  output_stream: "SIZE:gated_image_size"
+}
+
+# Outputs each element of face_detections at a fake timestamp for the rest of
+# the graph to process. Clones the image size packet for each face_detection at
+# the fake timestamp. At the end of the loop, outputs the BATCH_END timestamp
+# for downstream calculators to inform them that all elements in the vector
+# have been processed.
+node {
+  calculator: "BeginLoopDetectionCalculator"
+  input_stream: "ITERABLE:face_detections"
+  input_stream: "CLONE:gated_image_size"
+  output_stream: "ITEM:face_detection"
+  output_stream: "CLONE:detections_loop_image_size"
+  output_stream: "BATCH_END:detections_loop_end_timestamp"
+}
+
+# Calculates a region of interest based on each face detection, so that it can
+# be used to detect landmarks.
+node {
+  calculator: "FaceDetectionFrontDetectionToRoi"
+  input_stream: "DETECTION:face_detection"
+  input_stream: "IMAGE_SIZE:detections_loop_image_size"
+  output_stream: "ROI:face_rect_from_detection"
+}
+
+# Collects a NormalizedRect for each face into a vector. Upon receiving the
+# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END
+# timestamp.
+node {
+  calculator: "EndLoopNormalizedRectCalculator"
+  input_stream: "ITEM:face_rect_from_detection"
+  input_stream: "BATCH_END:detections_loop_end_timestamp"
+  output_stream: "ITERABLE:face_rects_from_detections"
+}
+
+# Performs association between NormalizedRect vector elements from the previous
+# image and rects based on face detections from the current image. This
+# calculator ensures that the output face_rects vector doesn't contain
+# overlapping regions based on the specified min_similarity_threshold.
+node {
+  calculator: "AssociationNormRectCalculator"
+  input_stream: "face_rects_from_detections"
+  input_stream: "gated_prev_face_rects_from_landmarks"
+  output_stream: "face_rects"
+  options: {
+    [mediapipe.AssociationCalculatorOptions.ext] {
+      min_similarity_threshold: 0.5
+    }
+  }
+}
+
+# Calculates the size of the image.
+node {
+  calculator: "ImagePropertiesCalculator"
+  input_stream: "IMAGE:image"
+  output_stream: "SIZE:image_size"
+}
+
+# Outputs each element of face_rects at a fake timestamp for the rest of the
+# graph to process. Clones image and image size packets for each
+# single_face_rect at the fake timestamp. At the end of the loop, outputs the
+# BATCH_END timestamp for downstream calculators to inform them that all
+# elements in the vector have been processed.
+node {
+  calculator: "BeginLoopNormalizedRectCalculator"
+  input_stream: "ITERABLE:face_rects"
+  input_stream: "CLONE:0:image"
+  input_stream: "CLONE:1:image_size"
+  output_stream: "ITEM:face_rect"
+  output_stream: "CLONE:0:landmarks_loop_image"
+  output_stream: "CLONE:1:landmarks_loop_image_size"
+  output_stream: "BATCH_END:landmarks_loop_end_timestamp"
+}
+
+# Detects face landmarks within the specified region of interest of the image.
+node {
+  calculator: "FaceLandmarkOnnxTensorRT"
+  input_stream: "IMAGE:landmarks_loop_image"
+  input_stream: "ROI:face_rect"
+  input_side_packet: "WITH_ATTENTION:with_attention"
+  output_stream: "LANDMARKS:face_landmarks"
+}
+
+# Calculates a region of interest based on face landmarks, so that it can be
+# reused for the subsequent image.
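+# This ROI is collected into face_rects_from_landmarks below and fed back
+# through PreviousLoopbackCalculator, which is what lets the next frame skip
+# the face detector while tracking still covers enough faces.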
+node { + calculator: "FaceLandmarkLandmarksToRoi" + input_stream: "LANDMARKS:face_landmarks" + input_stream: "IMAGE_SIZE:landmarks_loop_image_size" + output_stream: "ROI:face_rect_from_landmarks" +} + +# Collects a set of landmarks for each face into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedLandmarkListVectorCalculator" + input_stream: "ITEM:face_landmarks" + input_stream: "BATCH_END:landmarks_loop_end_timestamp" + output_stream: "ITERABLE:multi_face_landmarks" +} + +# Collects a NormalizedRect for each face into a vector. Upon receiving the +# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END +# timestamp. +node { + calculator: "EndLoopNormalizedRectCalculator" + input_stream: "ITEM:face_rect_from_landmarks" + input_stream: "BATCH_END:landmarks_loop_end_timestamp" + output_stream: "ITERABLE:face_rects_from_landmarks" +} + +# Caches face rects calculated from landmarks, and upon the arrival of the next +# input image, sends out the cached rects with timestamps replaced by that of +# the input image, essentially generating a packet that carries the previous +# face rects. Note that upon the arrival of the very first input image, a +# timestamp bound update occurs to jump start the feedback loop. +node { + calculator: "PreviousLoopbackCalculator" + input_stream: "MAIN:image" + input_stream: "LOOP:face_rects_from_landmarks" + input_stream_info: { + tag_index: "LOOP" + back_edge: true + } + output_stream: "PREV_LOOP:prev_face_rects_from_landmarks" +} diff --git a/mediapipe/modules/face_landmark/face_landmark_onnx_cuda.pbtxt b/mediapipe/modules/face_landmark/face_landmark_onnx_cuda.pbtxt new file mode 100644 index 000000000..a006ea15f --- /dev/null +++ b/mediapipe/modules/face_landmark/face_landmark_onnx_cuda.pbtxt @@ -0,0 +1,166 @@ +# MediaPipe graph to detect/predict face landmarks. (CPU input, and inference is +# executed with onnxruntime on cuda.) +# +# It is required that "face_landmark.onnx" is available at +# "mediapipe/modules/face_landmark/face_landmark.onnx" +# path during execution if `with_attention` is not set or set to `false`. +# +# It is required that "face_landmark_with_attention.onnx" is available at +# "mediapipe/modules/face_landmark/face_landmark_with_attention.onnx" +# path during execution if `with_attention` is set to `true`. +# +# EXAMPLE: +# node { +# calculator: "FaceLandmarkOnnxCUDA" +# input_stream: "IMAGE:image" +# input_stream: "ROI:face_roi" +# input_side_packet: "WITH_ATTENTION:with_attention" +# output_stream: "LANDMARKS:face_landmarks" +# } + +type: "FaceLandmarkOnnxCUDA" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" +# ROI (region of interest) within the given image where a face is located. +# (NormalizedRect) +input_stream: "ROI:roi" +# Whether to run face mesh model with attention on lips and eyes. (bool) +# Attention provides more accuracy on lips and eye regions as well as iris +# landmarks. +input_side_packet: "WITH_ATTENTION:with_attention" + +# 468 or 478 facial landmarks within the given ROI. (NormalizedLandmarkList) +# +# Number of landmarks depends on the WITH_ATTENTION flag. If it's `true` - then +# there will be 478 landmarks with refined lips, eyes and irises (10 extra +# landmarks are for irises), otherwise 468 non-refined landmarks are returned. +# +# NOTE: if a face is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. 
diff --git a/mediapipe/modules/face_landmark/face_landmark_onnx_cuda.pbtxt b/mediapipe/modules/face_landmark/face_landmark_onnx_cuda.pbtxt
new file mode 100644
index 000000000..a006ea15f
--- /dev/null
+++ b/mediapipe/modules/face_landmark/face_landmark_onnx_cuda.pbtxt
@@ -0,0 +1,166 @@
+# MediaPipe graph to detect/predict face landmarks. (CPU input, and inference is
+# executed with ONNX Runtime on CUDA.)
+#
+# It is required that "face_landmark.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark.onnx"
+# path during execution if `with_attention` is not set or set to `false`.
+#
+# It is required that "face_landmark_with_attention.onnx" is available at
+# "mediapipe/modules/face_landmark/face_landmark_with_attention.onnx"
+# path during execution if `with_attention` is set to `true`.
+#
+# EXAMPLE:
+#   node {
+#     calculator: "FaceLandmarkOnnxCUDA"
+#     input_stream: "IMAGE:image"
+#     input_stream: "ROI:face_roi"
+#     input_side_packet: "WITH_ATTENTION:with_attention"
+#     output_stream: "LANDMARKS:face_landmarks"
+#   }

+type: "FaceLandmarkOnnxCUDA"
+
+# CPU image. (ImageFrame)
+input_stream: "IMAGE:image"
+# ROI (region of interest) within the given image where a face is located.
+# (NormalizedRect)
+input_stream: "ROI:roi"
+# Whether to run face mesh model with attention on lips and eyes. (bool)
+# Attention provides more accuracy on lips and eye regions as well as iris
+# landmarks.
+input_side_packet: "WITH_ATTENTION:with_attention"
+
+# 468 or 478 facial landmarks within the given ROI. (NormalizedLandmarkList)
+#
+# Number of landmarks depends on the WITH_ATTENTION flag. If it's `true`, there
+# will be 478 landmarks with refined lips, eyes and irises (10 extra landmarks
+# are for irises); otherwise, 468 non-refined landmarks are returned.
+#
+# NOTE: if a face is not present within the given ROI, for this particular
+# timestamp there will not be an output packet in the LANDMARKS stream. However,
+# the MediaPipe framework will internally inform the downstream calculators of
+# the absence of this packet so that they don't wait for it unnecessarily.
+output_stream: "LANDMARKS:face_landmarks"
+
+# Transforms the input image into a 192x192 tensor.
+node: {
+  calculator: "ImageToTensorCalculator"
+  input_stream: "IMAGE:image"
+  input_stream: "NORM_RECT:roi"
+  output_stream: "TENSORS:input_tensors"
+  options: {
+    [mediapipe.ImageToTensorCalculatorOptions.ext] {
+      output_tensor_width: 192
+      output_tensor_height: 192
+      output_tensor_float_range {
+        min: 0.0
+        max: 1.0
+      }
+    }
+  }
+}
+
+# Runs model inference with ONNX Runtime using the CUDA delegate.
+node {
+  calculator: "InferenceCalculator"
+  input_stream: "TENSORS:input_tensors"
+  output_stream: "TENSORS:output_tensors"
+  options: {
+    [mediapipe.InferenceCalculatorOptions.ext] {
+      delegate { cuda {} }
+      model_path: "mediapipe/modules/face_landmark/face_landmark.onnx"
+    }
+  }
+}
+
+# Splits a vector of tensors into landmark tensors and face flag tensor.
+node {
+  calculator: "SwitchContainer"
+  input_side_packet: "ENABLE:with_attention"
+  input_stream: "output_tensors"
+  output_stream: "landmark_tensors"
+  output_stream: "face_flag_tensor"
+  options: {
+    [mediapipe.SwitchContainerOptions.ext] {
+      contained_node: {
+        calculator: "SplitTensorVectorCalculator"
+        options: {
+          [mediapipe.SplitVectorCalculatorOptions.ext] {
+            ranges: { begin: 0 end: 1 }
+            ranges: { begin: 1 end: 2 }
+          }
+        }
+      }
+      contained_node: {
+        calculator: "SplitTensorVectorCalculator"
+        options: {
+          [mediapipe.SplitVectorCalculatorOptions.ext] {
+            ranges: { begin: 0 end: 6 }
+            ranges: { begin: 6 end: 7 }
+          }
+        }
+      }
+    }
+  }
+}
+
+# Converts the face-flag tensor into a float that represents the confidence
+# score of face presence.
+node {
+  calculator: "TensorsToFloatsCalculator"
+  input_stream: "TENSORS:face_flag_tensor"
+  output_stream: "FLOAT:face_presence_score"
+  options {
+    [mediapipe.TensorsToFloatsCalculatorOptions.ext] {
+      activation: SIGMOID
+    }
+  }
+}
+
+# Applies a threshold to the confidence score to determine whether a face is
+# present.
+node {
+  calculator: "ThresholdingCalculator"
+  input_stream: "FLOAT:face_presence_score"
+  output_stream: "FLAG:face_presence"
+  options: {
+    [mediapipe.ThresholdingCalculatorOptions.ext] {
+      threshold: 0.5
+    }
+  }
+}
+
+# Drops landmark tensors if face is not present.
+node {
+  calculator: "GateCalculator"
+  input_stream: "landmark_tensors"
+  input_stream: "ALLOW:face_presence"
+  output_stream: "ensured_landmark_tensors"
+}
+
+# Decodes the landmark tensors into a vector of landmarks, where the landmark
+# coordinates are normalized by the size of the input image to the model.
+node {
+  calculator: "SwitchContainer"
+  input_side_packet: "ENABLE:with_attention"
+  input_stream: "TENSORS:ensured_landmark_tensors"
+  output_stream: "LANDMARKS:landmarks"
+  options: {
+    [mediapipe.SwitchContainerOptions.ext] {
+      contained_node: {
+        calculator: "TensorsToFaceLandmarks"
+      }
+      contained_node: {
+        calculator: "TensorsToFaceLandmarksWithAttention"
+      }
+    }
+  }
+}
+
+# Projects the landmarks from the cropped face image to the corresponding
+# locations on the full image before cropping (input to the graph).
+node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:landmarks" + input_stream: "NORM_RECT:roi" + output_stream: "NORM_LANDMARKS:face_landmarks" +} diff --git a/mediapipe/modules/face_landmark/face_landmark_onnx_tensorrt.pbtxt b/mediapipe/modules/face_landmark/face_landmark_onnx_tensorrt.pbtxt new file mode 100644 index 000000000..37af474c8 --- /dev/null +++ b/mediapipe/modules/face_landmark/face_landmark_onnx_tensorrt.pbtxt @@ -0,0 +1,166 @@ +# MediaPipe graph to detect/predict face landmarks. (CPU input, and inference is +# executed with onnxruntime on TensorRT.) +# +# It is required that "face_landmark.onnx" is available at +# "mediapipe/modules/face_landmark/face_landmark.onnx" +# path during execution if `with_attention` is not set or set to `false`. +# +# It is required that "face_landmark_with_attention.onnx" is available at +# "mediapipe/modules/face_landmark/face_landmark_with_attention.onnx" +# path during execution if `with_attention` is set to `true`. +# +# EXAMPLE: +# node { +# calculator: "FaceLandmarkOnnxTensorrt" +# input_stream: "IMAGE:image" +# input_stream: "ROI:face_roi" +# input_side_packet: "WITH_ATTENTION:with_attention" +# output_stream: "LANDMARKS:face_landmarks" +# } + +type: "FaceLandmarkOnnxTensorrt" + +# CPU image. (ImageFrame) +input_stream: "IMAGE:image" +# ROI (region of interest) within the given image where a face is located. +# (NormalizedRect) +input_stream: "ROI:roi" +# Whether to run face mesh model with attention on lips and eyes. (bool) +# Attention provides more accuracy on lips and eye regions as well as iris +# landmarks. +input_side_packet: "WITH_ATTENTION:with_attention" + +# 468 or 478 facial landmarks within the given ROI. (NormalizedLandmarkList) +# +# Number of landmarks depends on the WITH_ATTENTION flag. If it's `true` - then +# there will be 478 landmarks with refined lips, eyes and irises (10 extra +# landmarks are for irises), otherwise 468 non-refined landmarks are returned. +# +# NOTE: if a face is not present within the given ROI, for this particular +# timestamp there will not be an output packet in the LANDMARKS stream. However, +# the MediaPipe framework will internally inform the downstream calculators of +# the absence of this packet so that they don't wait for it unnecessarily. +output_stream: "LANDMARKS:face_landmarks" + +# Transforms the input image into a 192x192 tensor. +node: { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:image" + input_stream: "NORM_RECT:roi" + output_stream: "TENSORS:input_tensors" + options: { + [mediapipe.ImageToTensorCalculatorOptions.ext] { + output_tensor_width: 192 + output_tensor_height: 192 + output_tensor_float_range { + min: 0.0 + max: 1.0 + } + } + } +} + +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + output_stream: "TENSORS:output_tensors" + options: { + [mediapipe.InferenceCalculatorOptions.ext] { + delegate { tensorrt {} } + model_path: "mediapipe/modules/face_landmark/face_landmark.onnx" + } + } +} + +# Splits a vector of tensors into landmark tensors and face flag tensor. 
+node { + calculator: "SwitchContainer" + input_side_packet: "ENABLE:with_attention" + input_stream: "output_tensors" + output_stream: "landmark_tensors" + output_stream: "face_flag_tensor" + options: { + [mediapipe.SwitchContainerOptions.ext] { + contained_node: { + calculator: "SplitTensorVectorCalculator" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 1 } + ranges: { begin: 1 end: 2 } + } + } + } + contained_node: { + calculator: "SplitTensorVectorCalculator" + options: { + [mediapipe.SplitVectorCalculatorOptions.ext] { + ranges: { begin: 0 end: 6 } + ranges: { begin: 6 end: 7 } + } + } + } + } + } +} + +# Converts the face-flag tensor into a float that represents the confidence +# score of face presence. +node { + calculator: "TensorsToFloatsCalculator" + input_stream: "TENSORS:face_flag_tensor" + output_stream: "FLOAT:face_presence_score" + options { + [mediapipe.TensorsToFloatsCalculatorOptions.ext] { + activation: SIGMOID + } + } +} + +# Applies a threshold to the confidence score to determine whether a face is +# present. +node { + calculator: "ThresholdingCalculator" + input_stream: "FLOAT:face_presence_score" + output_stream: "FLAG:face_presence" + options: { + [mediapipe.ThresholdingCalculatorOptions.ext] { + threshold: 0.5 + } + } +} + +# Drop landmarks tensors if face is not present. +node { + calculator: "GateCalculator" + input_stream: "landmark_tensors" + input_stream: "ALLOW:face_presence" + output_stream: "ensured_landmark_tensors" +} + +# Decodes the landmark tensors into a vector of landmarks, where the landmark +# coordinates are normalized by the size of the input image to the model. +node { + calculator: "SwitchContainer" + input_side_packet: "ENABLE:with_attention" + input_stream: "TENSORS:ensured_landmark_tensors" + output_stream: "LANDMARKS:landmarks" + options: { + [mediapipe.SwitchContainerOptions.ext] { + contained_node: { + calculator: "TensorsToFaceLandmarks" + } + contained_node: { + calculator: "TensorsToFaceLandmarksWithAttention" + } + } + } +} + +# Projects the landmarks from the cropped face image to the corresponding +# locations on the full image before cropping (input to the graph). +node { + calculator: "LandmarkProjectionCalculator" + input_stream: "NORM_LANDMARKS:landmarks" + input_stream: "NORM_RECT:roi" + output_stream: "NORM_LANDMARKS:face_landmarks" +}