add multi pose estimation and multi person holistic tracking

parent f405c764b9
commit b72fc70c01
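For orientation, a minimal usage sketch of the two detectors this commit adds. It is not part of the diff; it only uses types and methods introduced below, with frames captured as in the bundled examples:

    use mediapipe::*;
    use opencv::prelude::*;

    fn report(
        poses: &mut pose::MultiPoseDetector,
        holistic: &mut holistic::MultiPersonHolisticDetector,
        frame: &Mat,
    ) {
        // MultiPoseDetector::process returns Vec<Pose>, one entry per person.
        for (i, p) in poses.process(frame).iter().enumerate() {
            let nose = &p.data[pose::PoseLandmark::NOSE as usize];
            println!("pose {}: nose {} {} {}", i, nose.x, nose.y, nose.z);
        }

        // MultiPersonHolisticDetector::process returns Vec<HolisticDetection>;
        // every field is an Option because a person may be only partially
        // detected (e.g. pose without hands).
        for (i, person) in holistic.process(frame).iter().enumerate() {
            if let Some(pose) = &person.pose {
                println!("person {}: {} pose landmarks", i, pose.data.len());
            }
            if let Some(face) = &person.face {
                println!("person {}: {} face landmarks", i, face.data.len());
            }
        }
    }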
@@ -35,7 +35,7 @@ fn face_mesh() -> Result<()> {
             highgui::imshow(window, &mut flip_frame)?;
 
             if !result.is_empty() {
-                let landmark = result[0][0];
+                let landmark = result[0].data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
@@ -35,7 +35,7 @@ pub fn hand_tracking() -> Result<()> {
             highgui::imshow(window, &mut flip_frame)?;
 
             if !result.is_empty() {
-                let landmark = result[0][0];
+                let landmark = result[0].data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
@@ -31,12 +31,11 @@ fn face_mesh() -> Result<()> {
 
             println!("processing");
             let result = detector.process(&flip_frame);
-            println!("received {} types of landmarks", result.len());
 
             highgui::imshow(window, &mut flip_frame)?;
 
-            if !result[0].is_empty() {
-                let landmark = result[0][0][0];
+            if let Some(pose) = result.pose {
+                let landmark = pose.data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {

examples/multi_person_holistic_tracking.rs (new file, 57 lines)
@@ -0,0 +1,57 @@
+use mediapipe::*;
+use opencv::prelude::*;
+use opencv::{highgui, imgproc, videoio, Result};
+
+fn multi_person_holistic_tracking() -> Result<()> {
+    let window = "video capture";
+
+    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;
+
+    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
+    if !cap.is_opened()? {
+        panic!("Unable to open default cam")
+    }
+
+    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
+    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
+    cap.set(videoio::CAP_PROP_FPS, 30.0)?;
+
+    let mut detector = holistic::MultiPersonHolisticDetector::default();
+
+    let mut raw_frame = Mat::default();
+    let mut rgb_frame = Mat::default();
+    let mut flip_frame = Mat::default();
+    loop {
+        cap.read(&mut raw_frame)?;
+
+        let size = raw_frame.size()?;
+        if size.width > 0 && !raw_frame.empty() {
+            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
+            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal
+
+            println!("processing");
+            let result = detector.process(&flip_frame);
+
+            highgui::imshow(window, &mut flip_frame)?;
+
+            if !result.is_empty() {
+                if let Some(pose) = &result[0].pose {
+                    let landmark = pose.data[0];
+                    println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
+                }
+            }
+        } else {
+            println!("WARN: Skip empty frame");
+        }
+
+        let key = highgui::wait_key(10)?;
+        if key > 0 && key != 255 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+fn main() {
+    multi_person_holistic_tracking().unwrap()
+}

examples/multi_pose_estimation.rs (new file, 55 lines)
@@ -0,0 +1,55 @@
+use mediapipe::*;
+use opencv::prelude::*;
+use opencv::{highgui, imgproc, videoio, Result};
+
+pub fn pose_estimation() -> Result<()> {
+    let window = "video capture";
+
+    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;
+
+    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
+    if !cap.is_opened()? {
+        panic!("Unable to open default cam")
+    }
+
+    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
+    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
+    cap.set(videoio::CAP_PROP_FPS, 30.0)?;
+
+    let mut detector = pose::MultiPoseDetector::default();
+
+    let mut raw_frame = Mat::default();
+    let mut rgb_frame = Mat::default();
+    let mut flip_frame = Mat::default();
+    loop {
+        cap.read(&mut raw_frame)?;
+
+        let size = raw_frame.size()?;
+        if size.width > 0 && !raw_frame.empty() {
+            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
+            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal
+
+            println!("processing");
+            let result = detector.process(&flip_frame);
+
+            highgui::imshow(window, &mut flip_frame)?;
+
+            if !result.is_empty() {
+                let landmark = result[0].data[0];
+                println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
+            }
+        } else {
+            println!("WARN: Skip empty frame");
+        }
+
+        let key = highgui::wait_key(10)?;
+        if key > 0 && key != 255 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+fn main() {
+    pose_estimation().unwrap()
+}

@@ -10,7 +10,7 @@ impl FaceMeshDetector {
         let graph = Detector::new(
             include_str!("graphs/face_mesh_desktop_live.pbtxt"),
             vec![Output {
-                type_: FeatureType::Face,
+                type_: FeatureType::Faces,
                 name: "multi_face_landmarks".into(),
             }],
         );
@@ -19,9 +19,17 @@ impl FaceMeshDetector {
     }
 
     /// Processes the input frame, returns a face mesh if detected.
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
+    pub fn process(&mut self, input: &Mat) -> Vec<FaceMesh> {
         let landmarks = self.graph.process(input);
-        landmarks[0].clone()
+        let mut faces = vec![];
+
+        for face_landmarks in landmarks[0].iter() {
+            let mut face = FaceMesh::default();
+            face.data.copy_from_slice(&face_landmarks[..]);
+            faces.push(face);
+        }
+
+        faces
     }
 }
 

src/graphs/multi_person_holistic_tracking_cpu.pbtxt (new file, 55 lines)
@@ -0,0 +1,55 @@
+# Tracks pose + hands + face landmarks.
+
+# CPU image. (ImageFrame)
+input_stream: "input_video"
+
+output_stream: "multi_pose_landmarks"
+
+output_stream: "pose_rois"
+
+output_stream: "pose_detections"
+
+output_stream: "multi_left_hand_landmarks"
+
+output_stream: "multi_right_hand_landmarks"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, unwanted
+# in real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+  node_options: {
+    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
+      max_in_flight: 1
+      max_in_queue: 1
+      # Timeout is disabled (set to 0) as first frame processing can take more
+      # than 1 second.
+      in_flight_timeout: 0
+    }
+  }
+}
+
+node {
+  calculator: "MultiPersonHolisticLandmarkCpu"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "POSE_LANDMARKS:multi_pose_landmarks"
+  output_stream: "POSE_ROI:pose_rois"
+  output_stream: "POSE_DETECTION:pose_detections"
+  output_stream: "FACE_LANDMARKS:multi_face_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:multi_left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:multi_right_hand_landmarks"
+}

src/graphs/multi_person_pose_tracking_cpu.pbtxt (new file, 53 lines)
@@ -0,0 +1,53 @@
+# MediaPipe graph that performs pose tracking with TensorFlow Lite on CPU.
+
+# CPU buffer. (ImageFrame)
+input_stream: "input_video"
+
+# Pose landmarks of each detected person.
+output_stream: "multi_pose_landmarks"
+
+output_stream: "pose_detections"
+
+output_stream: "roi_from_landmarks"
+
+# Generates side packet to enable segmentation.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:enable_segmentation"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { bool_value: true }
+    }
+  }
+}
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, unwanted
+# in real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Subgraph that detects poses and corresponding landmarks.
+node {
+  calculator: "MultiPoseLandmarkCpu"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "LANDMARKS:multi_pose_landmarks"
+  output_stream: "DETECTION:pose_detections"
+  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
+}

src/hands.rs (14 lines changed)
@@ -1,6 +1,8 @@
 //! Hand detection utilities.
 use super::*;
 
+pub const NUM_HAND_LANDMARKS: usize = 21;
+
 /// Hand landmark indices.
 pub enum HandLandmark {
     WRIST = 0,
@@ -44,9 +46,17 @@ impl HandDetector {
     }
 
     /// Processes the input frame, returns a list of hands
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
+    pub fn process(&mut self, input: &Mat) -> Vec<Hand> {
         let result = self.graph.process(input);
-        result[0].clone()
+        let mut hands = vec![];
+
+        for hand_landmarks in result[0].iter() {
+            let mut hand = Hand::default();
+            hand.data.copy_from_slice(&hand_landmarks[..]);
+            hands.push(hand);
+        }
+
+        hands
     }
 }
 
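The new constant mirrors the fixed length of `Hand::data`, so downstream code can size buffers without a magic 21. A hedged sketch (paths assumed from this diff's layout, with `Hand` at the crate root; landmark coordinates assumed to be f32):

    use mediapipe::{hands::NUM_HAND_LANDMARKS, Hand};

    // Flatten detected hands into one x/y/z buffer, pre-sized with the
    // new constant.
    fn flatten(hands: &[Hand]) -> Vec<f32> {
        let mut out = Vec::with_capacity(hands.len() * NUM_HAND_LANDMARKS * 3);
        for hand in hands {
            for lm in hand.data.iter() {
                out.push(lm.x);
                out.push(lm.y);
                out.push(lm.z);
            }
        }
        out
    }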

src/holistic.rs (140 lines changed)
@@ -5,6 +5,14 @@ pub struct HolisticDetector {
     graph: Detector,
 }
 
+#[derive(Clone, Debug)]
+pub struct HolisticDetection {
+    pub pose: Option<Pose>,
+    pub face: Option<FaceMesh>,
+    pub left_hand: Option<Hand>,
+    pub right_hand: Option<Hand>,
+}
+
 impl HolisticDetector {
     pub fn new() -> Self {
         let outputs = vec![
@@ -32,9 +40,44 @@ impl HolisticDetector {
     }
 
     /// Processes the input frame, returns landmarks if detected
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Vec<Landmark>>> {
+    pub fn process(&mut self, input: &Mat) -> HolisticDetection {
         let landmarks = self.graph.process(input);
-        landmarks.clone()
+
+        let mut pose = None;
+        let mut face = None;
+        let mut left_hand = None;
+        let mut right_hand = None;
+
+        if !landmarks[0].is_empty() {
+            let mut p = Pose::default();
+            p.data.copy_from_slice(&landmarks[0][0][..]);
+            pose = Some(p);
+        }
+
+        if !landmarks[1].is_empty() {
+            let mut f = FaceMesh::default();
+            f.data.copy_from_slice(&landmarks[1][0][..]);
+            face = Some(f);
+        }
+
+        if !landmarks[2].is_empty() {
+            let mut l = Hand::default();
+            l.data.copy_from_slice(&landmarks[2][0][..]);
+            left_hand = Some(l);
+        }
+
+        if !landmarks[3].is_empty() {
+            let mut r = Hand::default();
+            r.data.copy_from_slice(&landmarks[3][0][..]);
+            right_hand = Some(r);
+        }
+
+        HolisticDetection {
+            pose,
+            face,
+            left_hand,
+            right_hand,
+        }
     }
 }
 
@@ -43,3 +86,96 @@ impl Default for HolisticDetector {
         Self::new()
     }
 }
+
+pub struct MultiPersonHolisticDetector {
+    graph: Detector,
+}
+
+impl MultiPersonHolisticDetector {
+    pub fn new() -> Self {
+        let outputs = vec![
+            Output {
+                type_: FeatureType::Poses,
+                name: "multi_pose_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Faces,
+                name: "multi_face_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Hands,
+                name: "multi_left_hand_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Hands,
+                name: "multi_right_hand_landmarks".into(),
+            },
+        ];
+
+        let graph = Detector::new(
+            include_str!("graphs/multi_person_holistic_tracking_cpu.pbtxt"),
+            outputs,
+        );
+
+        Self { graph }
+    }
+
+    /// Processes the input frame, returns landmarks if detected
+    pub fn process(&mut self, input: &Mat) -> Vec<HolisticDetection> {
+        let landmarks = self.graph.process(input);
+
+        let max_landmarks = landmarks
+            .iter()
+            .map(|l| l.len())
+            .reduce(|acc, item| acc.max(item))
+            .unwrap();
+
+        let mut detections = vec![];
+
+        for i in 0..max_landmarks {
+            let mut pose = None;
+            let mut face = None;
+            let mut left_hand = None;
+            let mut right_hand = None;
+
+            if landmarks[0].len() > i {
+                let mut p = Pose::default();
+                p.data.copy_from_slice(&landmarks[0][i][..]);
+                pose = Some(p);
+            }
+
+            if landmarks[1].len() > i {
+                let mut f = FaceMesh::default();
+                f.data.copy_from_slice(&landmarks[1][i][..]);
+                face = Some(f);
+            }
+
+            if landmarks[2].len() > i {
+                let mut l = Hand::default();
+                l.data.copy_from_slice(&landmarks[2][i][..]);
+                left_hand = Some(l);
+            }
+
+            if landmarks[3].len() > i {
+                let mut r = Hand::default();
+                r.data.copy_from_slice(&landmarks[3][i][..]);
+                right_hand = Some(r);
+            }
+
+            detections.push(HolisticDetection {
+                pose,
+                face,
+                left_hand,
+                right_hand,
+            });
+        }
+
+        detections
+    }
+}
+
+impl Default for MultiPersonHolisticDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
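Note that `process` here aligns the pose/face/hand output lists by index up to the longest list, so each `HolisticDetection` may carry any subset of features. A hedged sketch of filtering on that:

    use mediapipe::holistic::HolisticDetection;

    // Count detections where every feature was found; any field may be None
    // when one output stream returns fewer entries than the others.
    fn count_complete(detections: &[HolisticDetection]) -> usize {
        detections
            .iter()
            .filter(|d| {
                d.pose.is_some()
                    && d.face.is_some()
                    && d.left_hand.is_some()
                    && d.right_hand.is_some()
            })
            .count()
    }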

@@ -43,7 +43,7 @@ impl FeatureType {
             FeatureType::Face => 478,
             FeatureType::Faces => 478,
             FeatureType::Hand => 21,
-            FeatureType::Hands => 42,
+            FeatureType::Hands => 21,
             FeatureType::Pose => 33,
             FeatureType::Poses => 33,
         }
@@ -100,6 +100,7 @@ impl Default for Landmark {
 
 /// Represents a detected pose, as 33 landmarks.
 /// Landmark names are in [pose::PoseLandmark].
+#[derive(Clone, Debug)]
 pub struct Pose {
     pub data: [Landmark; 33],
 }
@@ -114,12 +115,13 @@ impl Default for Pose {
 
 /// Represents a detected hand, as 21 landmarks.
 /// Landmark names are in [hands::HandLandmark]
-#[derive(Default)]
+#[derive(Clone, Debug, Default)]
 pub struct Hand {
     pub data: [Landmark; 21],
 }
 
 /// Represents a detected face mesh, as 478 landmarks.
+#[derive(Clone, Debug)]
 pub struct FaceMesh {
     pub data: [Landmark; 478],
 }
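The added `Clone`/`Debug` derives are what let detections be retained and logged across frames. A small sketch (assuming, as the derives imply, that `Landmark` itself implements these traits):

    use mediapipe::Pose;

    // Keep a history of poses across frames and log the latest one.
    fn record(history: &mut Vec<Pose>, latest: &Pose) {
        println!("{:?}", latest); // needs Debug
        history.push(latest.clone()); // needs Clone
    }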

src/pose.rs (40 lines changed)
@@ -1,6 +1,8 @@
 //! Pose detection utilities.
 use super::*;
 
+pub const NUM_POSE_LANDMARKS: usize = 33;
+
 /// Pose landmark indices.
 pub enum PoseLandmark {
     NOSE = 0,
@@ -76,3 +78,41 @@ impl Default for PoseDetector {
         Self::new()
     }
 }
+
+pub struct MultiPoseDetector {
+    graph: Detector,
+}
+
+impl MultiPoseDetector {
+    pub fn new() -> Self {
+        let graph = Detector::new(
+            include_str!("graphs/multi_person_pose_tracking_cpu.pbtxt"),
+            vec![Output {
+                type_: FeatureType::Poses,
+                name: "multi_pose_landmarks".into(),
+            }],
+        );
+
+        Self { graph }
+    }
+
+    /// Processes the input frame, returns poses if detected.
+    pub fn process(&mut self, input: &Mat) -> Vec<Pose> {
+        let result = self.graph.process(input);
+        let mut poses = vec![];
+
+        for pose_landmarks in result[0].iter() {
+            let mut pose = Pose::default();
+            pose.data.copy_from_slice(&pose_landmarks[..]);
+            poses.push(pose);
+        }
+
+        poses
+    }
+}
+
+impl Default for MultiPoseDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}