mediapipe-rs/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt

# MediaPipe graph to detect poses. (CPU input, and inference is executed on
# CPU.)
#
# It is required that "pose_detection.tflite" is available at
# "mediapipe/modules/pose_detection/pose_detection.tflite"
# path during execution.
#
# EXAMPLE:
#   node {
#     calculator: "PoseDetectionCpu"
#     input_stream: "IMAGE:image"
#     output_stream: "DETECTIONS:pose_detections"
#   }

type: "PoseDetectionCpu"

# CPU image. (ImageFrame)
input_stream: "IMAGE:image"

# Detected poses. (std::vector<Detection>)
# Bounding box in each pose detection is currently set to the bounding box of
# the detected face. However, 4 additional key points are available in each
# detection, which are used to further calculate a (rotated) bounding box that
# encloses the body region of interest. Among the 4 key points, the first two
# are for identifying the full-body region, and the second two for upper body
# only:
#
# Key point 0 - mid hip center
# Key point 1 - point that encodes size & rotation (for full body)
# Key point 2 - mid shoulder center
# Key point 3 - point that encodes size & rotation (for upper body)
#
# NOTE: there will not be an output packet in the DETECTIONS stream for this
# particular timestamp if none of poses detected. However, the MediaPipe
# framework will internally inform the downstream calculators of the absence of
# this packet so that they don't wait for it unnecessarily.
output_stream: "DETECTIONS:detections"

# Transforms the input image into a 224x224 one while keeping the aspect ratio
# (what is expected by the corresponding model), resulting in potential
# letterboxing in the transformed image.
node: {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:image"
  output_stream: "TENSORS:input_tensors"
  output_stream: "LETTERBOX_PADDING:letterbox_padding"
  options: {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      output_tensor_width: 224
      output_tensor_height: 224
      keep_aspect_ratio: true
      output_tensor_float_range {
        min: -1.0
        max: 1.0
      }
      border_mode: BORDER_ZERO
      # If this calculator truly operates in the CPU, then gpu_origin is
      # ignored, but if some build switch insists on GPU inference, then we will
      # still need to set this.
      gpu_origin: TOP_LEFT
    }
  }
}

# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a
# vector of tensors representing, for instance, detection boxes/keypoints and
# scores.
node {
  calculator: "InferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:detection_tensors"
  options: {
    [mediapipe.InferenceCalculatorOptions.ext] {
      model_path: "mediapipe/modules/pose_detection/pose_detection.tflite"
      delegate {
        xnnpack {}
      }
    }
  }
}

# Generates a single side packet containing a vector of SSD anchors based on
# the specification in the options.
node {
  calculator: "SsdAnchorsCalculator"
  output_side_packet: "anchors"
  options: {
    [mediapipe.SsdAnchorsCalculatorOptions.ext] {
      num_layers: 5
      min_scale: 0.1484375
      max_scale: 0.75
      input_size_height: 224
      input_size_width: 224
      anchor_offset_x: 0.5
      anchor_offset_y: 0.5
      strides: 8
      strides: 16
      strides: 32
      strides: 32
      strides: 32
      aspect_ratios: 1.0
      fixed_anchor_size: true
    }
  }
}

# Decodes the detection tensors generated by the TensorFlow Lite model, based on
# the SSD anchors and the specification in the options, into a vector of
# detections. Each detection describes a detected object.
node {
  calculator: "TensorsToDetectionsCalculator"
  input_stream: "TENSORS:detection_tensors"
  input_side_packet: "ANCHORS:anchors"
  output_stream: "DETECTIONS:unfiltered_detections"
  options: {
    [mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
      num_classes: 1
      num_boxes: 2254
      num_coords: 12
      box_coord_offset: 0
      keypoint_coord_offset: 4
      num_keypoints: 4
      num_values_per_keypoint: 2
      sigmoid_score: true
      score_clipping_thresh: 100.0
      reverse_output_order: true
      x_scale: 224.0
      y_scale: 224.0
      h_scale: 224.0
      w_scale: 224.0
      min_score_thresh: 0.5
    }
  }
}

# Performs non-max suppression to remove excessive detections.
node {
  calculator: "NonMaxSuppressionCalculator"
  input_stream: "unfiltered_detections"
  output_stream: "filtered_detections"
  options: {
    [mediapipe.NonMaxSuppressionCalculatorOptions.ext] {
      min_suppression_threshold: 0.3
      overlap_type: INTERSECTION_OVER_UNION
      algorithm: WEIGHTED
    }
  }
}

# Adjusts detection locations (already normalized to [0.f, 1.f]) on the
# letterboxed image (after image transformation with the FIT scale mode) to the
# corresponding locations on the same image with the letterbox removed (the
# input image to the graph before image transformation).
node {
  calculator: "DetectionLetterboxRemovalCalculator"
  input_stream: "DETECTIONS:filtered_detections"
  input_stream: "LETTERBOX_PADDING:letterbox_padding"
  output_stream: "DETECTIONS:detections"
}