mediapipe-rs/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt

# MediaPipe graph to detect/predict hand landmarks on GPU.
#
# The procedure is done in two steps:
# - locate palms/hands
# - detect landmarks for each palm/hand.
# This graph tries to skip palm detection as much as possible by reusing
# previously detected/predicted landmarks for new images.

type: "HandLandmarkTrackingGpu"

# GPU image. (GpuBuffer)
# Inside this graph the stream is named "image"; it feeds the palm-detection
# gate, an ImagePropertiesCalculator, the per-hand landmark loop and the
# PreviousLoopbackCalculator.
input_stream: "IMAGE:image"

# Max number of hands to detect/track. (int)
# Read by NormalizedRectVectorHasMinSizeCalculator and
# ClipDetectionVectorSizeCalculator.
input_side_packet: "NUM_HANDS:num_hands"

# Complexity of hand landmark and palm detection models: 0 or 1. Accuracy as
# well as inference latency generally go up with the model complexity. If
# unspecified, functions as set to 1. (int)
# Passed to both PalmDetectionGpu and HandLandmarkGpu.
input_side_packet: "MODEL_COMPLEXITY:model_complexity"

# Whether landmarks on the previous image should be used to help localize
# landmarks on the current image. (bool)
# Wired to the ALLOW side packet of the GateCalculator that gates
# "prev_hand_rects_from_landmarks".
input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"

# Collection of detected/predicted hands, each represented as a list of
# landmarks. (std::vector<NormalizedLandmarkList>)
# NOTE: no packet is emitted on the LANDMARKS stream for a given timestamp if
# no hands are detected. However, the MediaPipe framework will internally
# inform the downstream calculators of the absence of this packet so that they
# don't wait for it unnecessarily.
output_stream: "LANDMARKS:multi_hand_landmarks"

# Collection of detected/predicted hand world landmarks.
# (std::vector<LandmarkList>)
#
# World landmarks are real-world 3D coordinates in meters with the origin in the
# center of the hand bounding box calculated from the landmarks.
#
# WORLD_LANDMARKS shares the same landmark topology as LANDMARKS. However,
# LANDMARKS provides coordinates (in pixels) of a 3D object projected onto the
# 2D image surface, while WORLD_LANDMARKS provides coordinates (in meters) of
# the 3D object itself.
# NOTE(review): LANDMARKS is declared as NormalizedLandmarkList, so "in pixels"
# presumably means image-plane coordinates normalized by the image size rather
# than raw pixel values -- confirm.
output_stream: "WORLD_LANDMARKS:multi_hand_world_landmarks"

# Collection of handedness of the detected hands (i.e. is hand left or right),
# each represented as a ClassificationList proto with a single Classification
# entry. (std::vector<ClassificationList>)
# Note that handedness is determined assuming the input image is mirrored,
# i.e., taken with a front-facing/selfie camera with images flipped
# horizontally.
output_stream: "HANDEDNESS:multi_handedness"

# Extra outputs (for debugging, for instance).
# Detected palms, i.e. the output of ClipDetectionVectorSizeCalculator.
# (std::vector<Detection>)
output_stream: "PALM_DETECTIONS:palm_detections"
# Regions of interest calculated based on landmarks.
# (std::vector<NormalizedRect>)
# Note that "hand_rects" is the output of AssociationNormRectCalculator, which
# associates the gated rects derived from the previous image's landmarks with
# the rects from the current image's palm detections.
output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects"
# Regions of interest calculated based on palm detections.
# (std::vector<NormalizedRect>)
output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections"

# Gate for the hand rects carried over from the previous image. The rects are
# forwarded as "gated_prev_hand_rects_from_landmarks" when the optional input
# side packet "use_prev_landmarks" is either absent or set to true, so that the
# landmarks on the previous image can help localize landmarks on the current
# image.
node {
  calculator: "GateCalculator"
  input_stream: "prev_hand_rects_from_landmarks"
  input_side_packet: "ALLOW:use_prev_landmarks"
  output_stream: "gated_prev_hand_rects_from_landmarks"
  options {
    [mediapipe.GateCalculatorOptions.ext] { allow: true }
  }
}

# Determines if an input vector of NormalizedRect has a size greater than or
# equal to the provided num_hands.
# The result, "prev_has_enough_hands", drives the DISALLOW input of the
# GateCalculator that produces "palm_detection_image".
node {
  calculator: "NormalizedRectVectorHasMinSizeCalculator"
  input_stream: "ITERABLE:gated_prev_hand_rects_from_landmarks"
  input_side_packet: "num_hands"
  output_stream: "prev_has_enough_hands"
}

# Decides whether palm detection runs for this image. If enough hands have
# already been identified from the previous image, the incoming image is
# dropped here; otherwise it is passed through as "palm_detection_image" to
# trigger a new round of palm detection.
node {
  calculator: "GateCalculator"
  input_stream: "image"
  input_stream: "DISALLOW:prev_has_enough_hands"
  output_stream: "palm_detection_image"
  options {
    # NOTE(review): presumably lets the image through when no
    # "prev_has_enough_hands" packet is available -- confirm against
    # GateCalculatorOptions.
    [mediapipe.GateCalculatorOptions.ext] { empty_packets_as_allow: true }
  }
}

# Runs palm detection on the image that passed the gate above. All detections
# are emitted as "all_palm_detections"; clipping to num_hands happens later.
node {
  calculator: "PalmDetectionGpu"
  input_stream: "IMAGE:palm_detection_image"
  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
  output_stream: "DETECTIONS:all_palm_detections"
}

# Caps the palm detections so that there are no more of them than the provided
# num_hands.
node {
  calculator: "ClipDetectionVectorSizeCalculator"
  input_stream: "all_palm_detections"
  input_side_packet: "num_hands"
  output_stream: "palm_detections"
}

# Extracts the size of the image that passed the palm-detection gate. The size
# is cloned into the palm loop below, where it is used to convert each palm
# detection into an ROI.
node {
  calculator: "ImagePropertiesCalculator"
  input_stream: "IMAGE_GPU:palm_detection_image"
  output_stream: "SIZE:palm_detection_image_size"
}

# Start of the per-palm loop.
# Outputs each element of palm_detections at a fake timestamp for the rest of
# the graph to process. Clones the image_size packet for each palm_detection at
# the fake timestamp. At the end of the loop, outputs the BATCH_END timestamp
# for downstream calculators to inform them that all elements in the vector have
# been processed.
# ITEM and CLONE feed PalmDetectionDetectionToRoi; BATCH_END feeds the
# "EndLoopForPalmDetections" node that closes this loop.
node {
  calculator: "BeginLoopDetectionCalculator"
  input_stream: "ITERABLE:palm_detections"
  input_stream: "CLONE:palm_detection_image_size"
  output_stream: "ITEM:palm_detection"
  output_stream: "CLONE:image_size_for_palms"
  output_stream: "BATCH_END:palm_detections_timestamp"
}

# Calculates the region of interest (ROI) based on the specified palm
# detection and the size of the image it was detected in.
node {
  calculator: "PalmDetectionDetectionToRoi"
  input_stream: "DETECTION:palm_detection"
  input_stream: "IMAGE_SIZE:image_size_for_palms"
  output_stream: "ROI:hand_rect_from_palm_detection"
}

# End of the per-palm loop.
# Collects a NormalizedRect for each hand into a vector. Upon receiving the
# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END
# timestamp.
# The resulting vector is also exposed as the graph's
# HAND_ROIS_FROM_PALM_DETECTIONS output.
node {
  name: "EndLoopForPalmDetections"
  calculator: "EndLoopNormalizedRectCalculator"
  input_stream: "ITEM:hand_rect_from_palm_detection"
  input_stream: "BATCH_END:palm_detections_timestamp"
  output_stream: "ITERABLE:hand_rects_from_palm_detections"
}

# Associates the rects based on palm detections from the current image with the
# NormalizedRect vector elements carried over from the previous image. The
# calculator ensures that the resulting "hand_rects" vector doesn't contain
# overlapping regions, as judged by the specified min_similarity_threshold.
node {
  calculator: "AssociationNormRectCalculator"
  # The relative order of these two untagged inputs is kept as is.
  input_stream: "hand_rects_from_palm_detections"
  input_stream: "gated_prev_hand_rects_from_landmarks"
  output_stream: "hand_rects"
  options {
    [mediapipe.AssociationCalculatorOptions.ext] {
      min_similarity_threshold: 0.5
    }
  }
}

# Extracts the size of the incoming image. Unlike the palm-detection branch,
# this reads the ungated "image" stream; the size is cloned into the per-hand
# landmark loop below.
node {
  calculator: "ImagePropertiesCalculator"
  input_stream: "IMAGE_GPU:image"
  output_stream: "SIZE:image_size"
}

# Start of the per-hand loop.
# Outputs each element of hand_rects at a fake timestamp for the rest of the
# graph to process. Clones image and image size packets for each
# single_hand_rect at the fake timestamp. At the end of the loop, outputs the
# BATCH_END timestamp for downstream calculators to inform them that all
# elements in the vector have been processed.
# CLONE:0 carries the image and CLONE:1 the image size, on both the input and
# the output side.
node {
  calculator: "BeginLoopNormalizedRectCalculator"
  input_stream: "ITERABLE:hand_rects"
  input_stream: "CLONE:0:image"
  input_stream: "CLONE:1:image_size"
  output_stream: "ITEM:single_hand_rect"
  output_stream: "CLONE:0:image_for_landmarks"
  output_stream: "CLONE:1:image_size_for_landmarks"
  output_stream: "BATCH_END:hand_rects_timestamp"
}

# Runs hand landmark detection for one hand rect at a time, producing that
# hand's landmarks, world landmarks and handedness.
node {
  calculator: "HandLandmarkGpu"
  input_stream: "IMAGE:image_for_landmarks"
  input_stream: "ROI:single_hand_rect"
  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
  output_stream: "LANDMARKS:single_hand_landmarks"
  output_stream: "WORLD_LANDMARKS:single_hand_world_landmarks"
  output_stream: "HANDEDNESS:single_handedness"
}

# Collects the handedness for each single hand into a vector. Upon receiving the
# BATCH_END timestamp, outputs a vector of ClassificationList at the BATCH_END
# timestamp.
# "multi_handedness" is the graph's HANDEDNESS output.
node {
  calculator: "EndLoopClassificationListCalculator"
  input_stream: "ITEM:single_handedness"
  input_stream: "BATCH_END:hand_rects_timestamp"
  output_stream: "ITERABLE:multi_handedness"
}

# Calculates the region of interest (ROI) based on the detected hand landmarks,
# to be reused on the subsequent runs of the graph. The per-hand ROIs are
# collected into "hand_rects_from_landmarks" further down.
node {
  calculator: "HandLandmarkLandmarksToRoi"
  input_stream: "IMAGE_SIZE:image_size_for_landmarks"
  input_stream: "LANDMARKS:single_hand_landmarks"
  output_stream: "ROI:single_hand_rect_from_landmarks"
}

# Collects a set of landmarks for each hand into a vector. Upon receiving the
# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END
# timestamp.
# "multi_hand_landmarks" is the graph's LANDMARKS output.
node {
  calculator: "EndLoopNormalizedLandmarkListVectorCalculator"
  input_stream: "ITEM:single_hand_landmarks"
  input_stream: "BATCH_END:hand_rects_timestamp"
  output_stream: "ITERABLE:multi_hand_landmarks"
}

# Collects a set of world landmarks for each hand into a vector. Upon receiving
# the BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END
# timestamp.
# "multi_hand_world_landmarks" is the graph's WORLD_LANDMARKS output.
node {
  calculator: "EndLoopLandmarkListVectorCalculator"
  input_stream: "ITEM:single_hand_world_landmarks"
  input_stream: "BATCH_END:hand_rects_timestamp"
  output_stream: "ITERABLE:multi_hand_world_landmarks"
}

# Collects the landmark-based NormalizedRect for each hand into a vector. Upon
# receiving the BATCH_END timestamp, outputs the vector of NormalizedRect at
# the BATCH_END timestamp.
# "hand_rects_from_landmarks" feeds the LOOP input of the
# PreviousLoopbackCalculator, closing the feedback loop.
node {
  calculator: "EndLoopNormalizedRectCalculator"
  input_stream: "ITEM:single_hand_rect_from_landmarks"
  input_stream: "BATCH_END:hand_rects_timestamp"
  output_stream: "ITERABLE:hand_rects_from_landmarks"
}

# Caches hand rects calculated from landmarks, and upon the arrival of the next
# input image, sends out the cached rects with timestamps replaced by that of
# the input image, essentially generating a packet that carries the previous
# hand rects. Note that upon the arrival of the very first input image, a
# timestamp bound update occurs to jump start the feedback loop.
node {
  calculator: "PreviousLoopbackCalculator"
  input_stream: "MAIN:image"
  input_stream: "LOOP:hand_rects_from_landmarks"
  # "hand_rects_from_landmarks" is produced by a node that itself depends on
  # this node's output, so the LOOP input is declared as a back edge.
  input_stream_info: {
    tag_index: "LOOP"
    back_edge: true
  }
  # Consumed by the first GateCalculator in this graph.
  output_stream: "PREV_LOOP:prev_hand_rects_from_landmarks"
}
code fill 2022-03-01 13:04:01 +01:00			`# MediaPipe graph to detect/predict hand landmarks on GPU.`
			`#`
			`# The procedure is done in two steps:`
			`# - locate palms/hands`
			`# - detect landmarks for each palm/hand.`
			`# This graph tries to skip palm detection as much as possible by reusing`
			`# previously detected/predicted landmarks for new images.`

			`type: "HandLandmarkTrackingGpu"`

			`# GPU image. (GpuBuffer)`
			`input_stream: "IMAGE:image"`

			`# Max number of hands to detect/track. (int)`
			`input_side_packet: "NUM_HANDS:num_hands"`

			`# Complexity of hand landmark and palm detection models: 0 or 1. Accuracy as`
			`# well as inference latency generally go up with the model complexity. If`
			`# unspecified, functions as set to 1. (int)`
			`input_side_packet: "MODEL_COMPLEXITY:model_complexity"`

			`# Whether landmarks on the previous image should be used to help localize`
			`# landmarks on the current image. (bool)`
			`input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"`

			`# Collection of detected/predicted hands, each represented as a list of`
			`# landmarks. (std::vector<NormalizedLandmarkList>)`
			`# NOTE: there will not be an output packet in the LANDMARKS stream for this`
			`# particular timestamp if no hands are detected. However, the MediaPipe`
			`# framework will internally inform the downstream calculators of the absence of`
			`# this packet so that they don't wait for it unnecessarily.`
			`output_stream: "LANDMARKS:multi_hand_landmarks"`

			`# Collection of detected/predicted hand world landmarks.`
			`# (std::vector<LandmarkList>)`
			`#`
			`# World landmarks are real-world 3D coordinates in meters with the origin in the`
			`# center of the hand bounding box calculated from the landmarks.`
			`#`
			`# WORLD_LANDMARKS shares the same landmark topology as LANDMARKS. However,`
			`# LANDMARKS provides coordinates (in pixels) of a 3D object projected onto the`
			`# 2D image surface, while WORLD_LANDMARKS provides coordinates (in meters) of`
			`# the 3D object itself.`
			`output_stream: "WORLD_LANDMARKS:multi_hand_world_landmarks"`

			`# Collection of handedness of the detected hands (i.e. is hand left or right),`
			`# each represented as a ClassificationList proto with a single Classification`
			`# entry. (std::vector<ClassificationList>)`
			`# Note that handedness is determined assuming the input image is mirrored,`
			`# i.e., taken with a front-facing/selfie camera with images flipped`
			`# horizontally.`
			`output_stream: "HANDEDNESS:multi_handedness"`

			`# Extra outputs (for debugging, for instance).`
			`# Detected palms. (std::vector<Detection>)`
			`output_stream: "PALM_DETECTIONS:palm_detections"`
			`# Regions of interest calculated based on landmarks.`
			`# (std::vector<NormalizedRect>)`
			`output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects"`
			`# Regions of interest calculated based on palm detections.`
			`# (std::vector<NormalizedRect>)`
			`output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections"`

			`# When the optional input side packet "use_prev_landmarks" is either absent or`
			`# set to true, uses the landmarks on the previous image to help localize`
			`# landmarks on the current image.`
			`node {`
			`calculator: "GateCalculator"`
			`input_side_packet: "ALLOW:use_prev_landmarks"`
			`input_stream: "prev_hand_rects_from_landmarks"`
			`output_stream: "gated_prev_hand_rects_from_landmarks"`
			`options: {`
			`[mediapipe.GateCalculatorOptions.ext] {`
			`allow: true`
			`}`
			`}`
			`}`

			`# Determines if an input vector of NormalizedRect has a size greater than or`
			`# equal to the provided num_hands.`
			`node {`
			`calculator: "NormalizedRectVectorHasMinSizeCalculator"`
			`input_stream: "ITERABLE:gated_prev_hand_rects_from_landmarks"`
			`input_side_packet: "num_hands"`
			`output_stream: "prev_has_enough_hands"`
			`}`

			`# Drops the incoming image if enough hands have already been identified from the`
			`# previous image. Otherwise, passes the incoming image through to trigger a new`
			`# round of palm detection.`
			`node {`
			`calculator: "GateCalculator"`
			`input_stream: "image"`
			`input_stream: "DISALLOW:prev_has_enough_hands"`
			`output_stream: "palm_detection_image"`
			`options: {`
			`[mediapipe.GateCalculatorOptions.ext] {`
			`empty_packets_as_allow: true`
			`}`
			`}`
			`}`

			`# Detects palms.`
			`node {`
			`calculator: "PalmDetectionGpu"`
			`input_side_packet: "MODEL_COMPLEXITY:model_complexity"`
			`input_stream: "IMAGE:palm_detection_image"`
			`output_stream: "DETECTIONS:all_palm_detections"`
			`}`

			`# Makes sure there are no more detections than provided num_hands.`
			`node {`
			`calculator: "ClipDetectionVectorSizeCalculator"`
			`input_stream: "all_palm_detections"`
			`output_stream: "palm_detections"`
			`input_side_packet: "num_hands"`
			`}`

			`# Extracts image size.`
			`node {`
			`calculator: "ImagePropertiesCalculator"`
			`input_stream: "IMAGE_GPU:palm_detection_image"`
			`output_stream: "SIZE:palm_detection_image_size"`
			`}`

			`# Outputs each element of palm_detections at a fake timestamp for the rest of`
			`# the graph to process. Clones the image_size packet for each palm_detection at`
			`# the fake timestamp. At the end of the loop, outputs the BATCH_END timestamp`
			`# for downstream calculators to inform them that all elements in the vector have`
			`# been processed.`
			`node {`
			`calculator: "BeginLoopDetectionCalculator"`
			`input_stream: "ITERABLE:palm_detections"`
			`input_stream: "CLONE:palm_detection_image_size"`
			`output_stream: "ITEM:palm_detection"`
			`output_stream: "CLONE:image_size_for_palms"`
			`output_stream: "BATCH_END:palm_detections_timestamp"`
			`}`

			`# Calculates region of interest (ROI) based on the specified palm.`
			`node {`
			`calculator: "PalmDetectionDetectionToRoi"`
			`input_stream: "DETECTION:palm_detection"`
			`input_stream: "IMAGE_SIZE:image_size_for_palms"`
			`output_stream: "ROI:hand_rect_from_palm_detection"`
			`}`

			`# Collects a NormalizedRect for each hand into a vector. Upon receiving the`
			`# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END`
			`# timestamp.`
			`node {`
			`name: "EndLoopForPalmDetections"`
			`calculator: "EndLoopNormalizedRectCalculator"`
			`input_stream: "ITEM:hand_rect_from_palm_detection"`
			`input_stream: "BATCH_END:palm_detections_timestamp"`
			`output_stream: "ITERABLE:hand_rects_from_palm_detections"`
			`}`

			`# Performs association between NormalizedRect vector elements from previous`
			`# image and rects based on palm detections from the current image. This`
			`# calculator ensures that the output hand_rects vector doesn't contain`
			`# overlapping regions based on the specified min_similarity_threshold.`
			`node {`
			`calculator: "AssociationNormRectCalculator"`
			`input_stream: "hand_rects_from_palm_detections"`
			`input_stream: "gated_prev_hand_rects_from_landmarks"`
			`output_stream: "hand_rects"`
			`options: {`
			`[mediapipe.AssociationCalculatorOptions.ext] {`
			`min_similarity_threshold: 0.5`
			`}`
			`}`
			`}`

			`# Extracts image size.`
			`node {`
			`calculator: "ImagePropertiesCalculator"`
			`input_stream: "IMAGE_GPU:image"`
			`output_stream: "SIZE:image_size"`
			`}`

			`# Outputs each element of hand_rects at a fake timestamp for the rest of the`
			`# graph to process. Clones image and image size packets for each`
			`# single_hand_rect at the fake timestamp. At the end of the loop, outputs the`
			`# BATCH_END timestamp for downstream calculators to inform them that all`
			`# elements in the vector have been processed.`
			`node {`
			`calculator: "BeginLoopNormalizedRectCalculator"`
			`input_stream: "ITERABLE:hand_rects"`
			`input_stream: "CLONE:0:image"`
			`input_stream: "CLONE:1:image_size"`
			`output_stream: "ITEM:single_hand_rect"`
			`output_stream: "CLONE:0:image_for_landmarks"`
			`output_stream: "CLONE:1:image_size_for_landmarks"`
			`output_stream: "BATCH_END:hand_rects_timestamp"`
			`}`

			`# Detect hand landmarks for the specific hand rect.`
			`node {`
			`calculator: "HandLandmarkGpu"`
			`input_side_packet: "MODEL_COMPLEXITY:model_complexity"`
			`input_stream: "IMAGE:image_for_landmarks"`
			`input_stream: "ROI:single_hand_rect"`
			`output_stream: "LANDMARKS:single_hand_landmarks"`
			`output_stream: "WORLD_LANDMARKS:single_hand_world_landmarks"`
			`output_stream: "HANDEDNESS:single_handedness"`
			`}`

			`# Collects the handedness for each single hand into a vector. Upon receiving the`
			`# BATCH_END timestamp, outputs a vector of ClassificationList at the BATCH_END`
			`# timestamp.`
			`node {`
			`calculator: "EndLoopClassificationListCalculator"`
			`input_stream: "ITEM:single_handedness"`
			`input_stream: "BATCH_END:hand_rects_timestamp"`
			`output_stream: "ITERABLE:multi_handedness"`
			`}`

			`# Calculate region of interest (ROI) based on detected hand landmarks to reuse`
			`# on the subsequent runs of the graph.`
			`node {`
			`calculator: "HandLandmarkLandmarksToRoi"`
			`input_stream: "IMAGE_SIZE:image_size_for_landmarks"`
			`input_stream: "LANDMARKS:single_hand_landmarks"`
			`output_stream: "ROI:single_hand_rect_from_landmarks"`
			`}`

			`# Collects a set of landmarks for each hand into a vector. Upon receiving the`
			`# BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END`
			`# timestamp.`
			`node {`
			`calculator: "EndLoopNormalizedLandmarkListVectorCalculator"`
			`input_stream: "ITEM:single_hand_landmarks"`
			`input_stream: "BATCH_END:hand_rects_timestamp"`
			`output_stream: "ITERABLE:multi_hand_landmarks"`
			`}`

			`# Collects a set of world landmarks for each hand into a vector. Upon receiving`
			`# the BATCH_END timestamp, outputs the vector of landmarks at the BATCH_END`
			`# timestamp.`
			`node {`
			`calculator: "EndLoopLandmarkListVectorCalculator"`
			`input_stream: "ITEM:single_hand_world_landmarks"`
			`input_stream: "BATCH_END:hand_rects_timestamp"`
			`output_stream: "ITERABLE:multi_hand_world_landmarks"`
			`}`

			`# Collects a NormalizedRect for each hand into a vector. Upon receiving the`
			`# BATCH_END timestamp, outputs the vector of NormalizedRect at the BATCH_END`
			`# timestamp.`
			`node {`
			`calculator: "EndLoopNormalizedRectCalculator"`
			`input_stream: "ITEM:single_hand_rect_from_landmarks"`
			`input_stream: "BATCH_END:hand_rects_timestamp"`
			`output_stream: "ITERABLE:hand_rects_from_landmarks"`
			`}`

			`# Caches hand rects calculated from landmarks, and upon the arrival of the next`
			`# input image, sends out the cached rects with timestamps replaced by that of`
			`# the input image, essentially generating a packet that carries the previous`
			`# hand rects. Note that upon the arrival of the very first input image, a`
			`# timestamp bound update occurs to jump start the feedback loop.`
			`node {`
			`calculator: "PreviousLoopbackCalculator"`
			`input_stream: "MAIN:image"`
			`input_stream: "LOOP:hand_rects_from_landmarks"`
			`input_stream_info: {`
			`tag_index: "LOOP"`
			`back_edge: true`
			`}`
			`output_stream: "PREV_LOOP:prev_hand_rects_from_landmarks"`
			`}`