# MediaPipe graph to detect/predict hand landmarks on CPU.
#
# The procedure is done in two steps:
# - locate palms/hands
# - detect landmarks for each palm/hand.
# This graph tries to skip palm detection as much as possible by reusing
# previously detected/predicted landmarks for new images.

type: "HandLandmarkTrackingCpuImage"

# Input image. (Image)
input_stream: "IMAGE:image"

# Max number of hands to detect/track. (int)
input_side_packet: "NUM_HANDS:num_hands"

# Complexity of hand landmark and palm detection models: 0 or 1. Accuracy as
# well as inference latency generally go up with the model complexity. If
# unspecified, functions as set to 1. (int)
input_side_packet: "MODEL_COMPLEXITY:model_complexity"

# Whether landmarks on the previous image should be used to help localize
# landmarks on the current image. (bool)
input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"

# The throttled input image. (Image)
output_stream: "IMAGE:throttled_image"

# Collection of detected/predicted hands, each represented as a list of
# landmarks. (std::vector<NormalizedLandmarkList>)
# NOTE: there will not be an output packet in the LANDMARKS stream for this
# particular timestamp if no hands are detected. However, the MediaPipe
# framework will internally inform the downstream calculators of the absence of
# this packet so that they don't wait for it unnecessarily.
output_stream: "LANDMARKS:multi_hand_landmarks"

# Collection of detected/predicted hand world landmarks.
# (std::vector<LandmarkList>)
#
# World landmarks are real-world 3D coordinates in meters with the origin in the
# center of the hand bounding box calculated from the landmarks.
#
# WORLD_LANDMARKS shares the same landmark topology as LANDMARKS. However,
# LANDMARKS provides coordinates (in pixels) of a 3D object projected onto the
# 2D image surface, while WORLD_LANDMARKS provides coordinates (in meters) of
# the 3D object itself.
output_stream: "WORLD_LANDMARKS:multi_hand_world_landmarks"

# Collection of handedness of the detected hands (i.e. whether the hand is left
# or right), each represented as a ClassificationList proto with a single
# Classification entry. (std::vector<ClassificationList>)
# Note that handedness is determined assuming the input image is mirrored,
# i.e., taken with a front-facing/selfie camera with images flipped
# horizontally.
output_stream: "HANDEDNESS:multi_handedness"

# Extra outputs (for debugging, for instance).
# Detected palms. (std::vector<Detection>)
output_stream: "PALM_DETECTIONS:palm_detections"
# Regions of interest calculated based on landmarks.
# (std::vector<NormalizedRect>)
output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects"
# Regions of interest calculated based on palm detections.
# (std::vector<NormalizedRect>)
output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections"

# Limits the flow of input images into the rest of the graph.
# multi_hand_landmarks is fed back as the FINISHED signal; it is declared as a
# back edge because it is produced by a node further downstream. The options
# allow at most one image in flight and one image waiting in the queue.
node {
  calculator: "FlowLimiterCalculator"
  input_stream: "image"
  input_stream: "FINISHED:multi_hand_landmarks"
  input_stream_info: {
    tag_index: "FINISHED"
    back_edge: true
  }
  output_stream: "throttled_image"
  options: {
    [mediapipe.FlowLimiterCalculatorOptions.ext] {
      max_in_flight: 1
      max_in_queue: 1
    }
  }
}

# Converts Image to ImageFrame for HandLandmarkTrackingCpu to consume.
node {
  calculator: "FromImageCalculator"
  input_stream: "IMAGE:throttled_image"
  output_stream: "IMAGE_CPU:raw_image_frame"
  output_stream: "SOURCE_ON_GPU:is_gpu_image"
}

# TODO: Remove the extra flipping once adopting MlImage.
# If the source images are on gpu, flip the data vertically before sending them
# into HandLandmarkTrackingCpu. This may be needed because OpenGL represents
# images assuming the image origin is at the bottom-left corner, whereas
# MediaPipe in general assumes the image origin is at the top-left corner.
node {
  calculator: "ImageTransformationCalculator"
  input_stream: "IMAGE:raw_image_frame"
  # The flip flag is the SOURCE_ON_GPU output of FromImageCalculator.
  input_stream: "FLIP_VERTICALLY:is_gpu_image"
  output_stream: "IMAGE:image_frame"
}

# Runs HandLandmarkTrackingCpu on the (possibly flipped) image frame. This
# graph's side packets are passed straight through, and every output stream of
# the node is one of the output streams declared by this graph.
node {
  calculator: "HandLandmarkTrackingCpu"
  input_stream: "IMAGE:image_frame"
  input_side_packet: "NUM_HANDS:num_hands"
  input_side_packet: "MODEL_COMPLEXITY:model_complexity"
  input_side_packet: "USE_PREV_LANDMARKS:use_prev_landmarks"
  output_stream: "LANDMARKS:multi_hand_landmarks"
  output_stream: "WORLD_LANDMARKS:multi_hand_world_landmarks"
  output_stream: "HANDEDNESS:multi_handedness"
  output_stream: "PALM_DETECTIONS:palm_detections"
  output_stream: "HAND_ROIS_FROM_LANDMARKS:hand_rects"
  output_stream: "HAND_ROIS_FROM_PALM_DETECTIONS:hand_rects_from_palm_detections"
}