From b72fc70c01aee839e61de04242c0b1cce4051e20 Mon Sep 17 00:00:00 2001
From: Jules Youngberg
Date: Mon, 4 Jul 2022 19:36:32 -0700
Subject: [PATCH] add multi pose estimation and multi person holistic tracking

---
 examples/face_mesh.rs                        |   2 +-
 examples/hand_tracking.rs                    |   2 +-
 examples/holistic_tracking.rs                |   5 +-
 examples/multi_person_holistic_tracking.rs   |  57 +++++++
 examples/multi_pose_estimation.rs            |  55 +++++++
 src/face_mesh.rs                             |  14 +-
 .../multi_person_holistic_tracking_cpu.pbtxt |  55 +++++++
 .../multi_person_pose_tracking_cpu.pbtxt     |  53 +++++++
 src/hands.rs                                 |  14 +-
 src/holistic.rs                              | 140 +++++++++++++++++-
 src/lib.rs                                   |   6 +-
 src/pose.rs                                  |  40 +++++
 12 files changed, 429 insertions(+), 14 deletions(-)
 create mode 100644 examples/multi_person_holistic_tracking.rs
 create mode 100644 examples/multi_pose_estimation.rs
 create mode 100644 src/graphs/multi_person_holistic_tracking_cpu.pbtxt
 create mode 100644 src/graphs/multi_person_pose_tracking_cpu.pbtxt

diff --git a/examples/face_mesh.rs b/examples/face_mesh.rs
index 6ca6b64..c806dfc 100644
--- a/examples/face_mesh.rs
+++ b/examples/face_mesh.rs
@@ -35,7 +35,7 @@ fn face_mesh() -> Result<()> {
             highgui::imshow(window, &mut flip_frame)?;
 
             if !result.is_empty() {
-                let landmark = result[0][0];
+                let landmark = result[0].data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
diff --git a/examples/hand_tracking.rs b/examples/hand_tracking.rs
index 38a136f..540925d 100644
--- a/examples/hand_tracking.rs
+++ b/examples/hand_tracking.rs
@@ -35,7 +35,7 @@ pub fn hand_tracking() -> Result<()> {
             highgui::imshow(window, &mut flip_frame)?;
 
             if !result.is_empty() {
-                let landmark = result[0][0];
+                let landmark = result[0].data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
diff --git a/examples/holistic_tracking.rs b/examples/holistic_tracking.rs
index a663d27..c5e2218 100644
--- a/examples/holistic_tracking.rs
+++ b/examples/holistic_tracking.rs
@@ -31,12 +31,11 @@ fn face_mesh() -> Result<()> {
             println!("processing");
             let result = detector.process(&flip_frame);
 
-            println!("received {} types of landmarks", result.len());
 
             highgui::imshow(window, &mut flip_frame)?;
 
-            if !result[0].is_empty() {
-                let landmark = result[0][0][0];
+            if let Some(pose) = result.pose {
+                let landmark = pose.data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
diff --git a/examples/multi_person_holistic_tracking.rs b/examples/multi_person_holistic_tracking.rs
new file mode 100644
index 0000000..bd61f52
--- /dev/null
+++ b/examples/multi_person_holistic_tracking.rs
@@ -0,0 +1,57 @@
+use mediapipe::*;
+use opencv::prelude::*;
+use opencv::{highgui, imgproc, videoio, Result};
+
+fn multi_person_holistic_tracking() -> Result<()> {
+    let window = "video capture";
+
+    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;
+
+    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
+    if !cap.is_opened()? {
+        panic!("Unable to open default cam")
+    }
+
+    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
+    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
+    cap.set(videoio::CAP_PROP_FPS, 30.0)?;
+
+    let mut detector = holistic::MultiPersonHolisticDetector::default();
+
+    let mut raw_frame = Mat::default();
+    let mut rgb_frame = Mat::default();
+    let mut flip_frame = Mat::default();
+    loop {
+        cap.read(&mut raw_frame)?;
+
+        let size = raw_frame.size()?;
+        if size.width > 0 && !raw_frame.empty() {
+            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
+            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal
+
+            println!("processing");
+            let result = detector.process(&flip_frame);
+
+            highgui::imshow(window, &mut flip_frame)?;
+
+            if !result.is_empty() {
+                if let Some(pose) = &result[0].pose {
+                    let landmark = pose.data[0];
+                    println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
+                }
+            }
+        } else {
+            println!("WARN: Skip empty frame");
+        }
+
+        let key = highgui::wait_key(10)?;
+        if key > 0 && key != 255 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+fn main() {
+    multi_person_holistic_tracking().unwrap()
+}
diff --git a/examples/multi_pose_estimation.rs b/examples/multi_pose_estimation.rs
new file mode 100644
index 0000000..d31f15e
--- /dev/null
+++ b/examples/multi_pose_estimation.rs
@@ -0,0 +1,55 @@
+use mediapipe::*;
+use opencv::prelude::*;
+use opencv::{highgui, imgproc, videoio, Result};
+
+pub fn pose_estimation() -> Result<()> {
+    let window = "video capture";
+
+    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;
+
+    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
+    if !cap.is_opened()? {
+        panic!("Unable to open default cam")
+    }
+
+    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
+    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
+    cap.set(videoio::CAP_PROP_FPS, 30.0)?;
+
+    let mut detector = pose::MultiPoseDetector::default();
+
+    let mut raw_frame = Mat::default();
+    let mut rgb_frame = Mat::default();
+    let mut flip_frame = Mat::default();
+    loop {
+        cap.read(&mut raw_frame)?;
+
+        let size = raw_frame.size()?;
+        if size.width > 0 && !raw_frame.empty() {
+            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
+            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal
+
+            println!("processing");
+            let result = detector.process(&flip_frame);
+
+            highgui::imshow(window, &mut flip_frame)?;
+
+            if !result.is_empty() {
+                let landmark = result[0].data[0];
+                println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
+            }
+        } else {
+            println!("WARN: Skip empty frame");
+        }
+
+        let key = highgui::wait_key(10)?;
+        if key > 0 && key != 255 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+fn main() {
+    pose_estimation().unwrap()
+}
diff --git a/src/face_mesh.rs b/src/face_mesh.rs
index 340b029..91230db 100644
--- a/src/face_mesh.rs
+++ b/src/face_mesh.rs
@@ -10,7 +10,7 @@ impl FaceMeshDetector {
         let graph = Detector::new(
             include_str!("graphs/face_mesh_desktop_live.pbtxt"),
             vec![Output {
-                type_: FeatureType::Face,
+                type_: FeatureType::Faces,
                 name: "multi_face_landmarks".into(),
             }],
         );
@@ -19,9 +19,17 @@ impl FaceMeshDetector {
     }
 
     /// Processes the input frame, returns a face mesh if detected.
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
+    pub fn process(&mut self, input: &Mat) -> Vec<FaceMesh> {
         let landmarks = self.graph.process(input);
-        landmarks[0].clone()
+        let mut faces = vec![];
+
+        for face_landmarks in landmarks[0].iter() {
+            let mut face = FaceMesh::default();
+            face.data.copy_from_slice(&face_landmarks[..]);
+            faces.push(face);
+        }
+
+        faces
     }
 }
diff --git a/src/graphs/multi_person_holistic_tracking_cpu.pbtxt b/src/graphs/multi_person_holistic_tracking_cpu.pbtxt
new file mode 100644
index 0000000..d7393e8
--- /dev/null
+++ b/src/graphs/multi_person_holistic_tracking_cpu.pbtxt
@@ -0,0 +1,55 @@
+# Tracks pose + hands + face landmarks.
+
+# CPU image. (ImageFrame)
+input_stream: "input_video"
+
+output_stream: "multi_pose_landmarks"
+
+output_stream: "pose_rois"
+
+output_stream: "pose_detections"
+
+output_stream: "multi_face_landmarks"
+
+output_stream: "multi_left_hand_landmarks"
+
+output_stream: "multi_right_hand_landmarks"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+  node_options: {
+    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
+      max_in_flight: 1
+      max_in_queue: 1
+      # Timeout is disabled (set to 0) as first frame processing can take more
+      # than 1 second.
+      in_flight_timeout: 0
+    }
+  }
+}
+
+node {
+  calculator: "MultiPersonHolisticLandmarkCpu"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "POSE_LANDMARKS:multi_pose_landmarks"
+  output_stream: "POSE_ROI:pose_rois"
+  output_stream: "POSE_DETECTION:pose_detections"
+  output_stream: "FACE_LANDMARKS:multi_face_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:multi_left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:multi_right_hand_landmarks"
+}
diff --git a/src/graphs/multi_person_pose_tracking_cpu.pbtxt b/src/graphs/multi_person_pose_tracking_cpu.pbtxt
new file mode 100644
index 0000000..cb7f87a
--- /dev/null
+++ b/src/graphs/multi_person_pose_tracking_cpu.pbtxt
@@ -0,0 +1,53 @@
+# MediaPipe graph that performs pose tracking with TensorFlow Lite on CPU.
+
+# CPU buffer. (ImageFrame)
+input_stream: "input_video"
+
+# Pose landmarks for each detected person. (std::vector<NormalizedLandmarkList>)
+output_stream: "multi_pose_landmarks"
+
+output_stream: "pose_detections"
+
+output_stream: "roi_from_landmarks"
+
+# Generates side packet to enable segmentation.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:enable_segmentation"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { bool_value: true }
+    }
+  }
+}
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Subgraph that detects poses and corresponding landmarks.
+node {
+  calculator: "MultiPoseLandmarkCpu"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "LANDMARKS:multi_pose_landmarks"
+  output_stream: "DETECTION:pose_detections"
+  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
+}
diff --git a/src/hands.rs b/src/hands.rs
index 79e69d5..f56cc11 100644
--- a/src/hands.rs
+++ b/src/hands.rs
@@ -1,6 +1,8 @@
 //! Hand detection utilities.
 use super::*;
 
+pub const NUM_HAND_LANDMARKS: usize = 21;
+
 /// Hand landmark indices.
 pub enum HandLandmark {
     WRIST = 0,
@@ -44,9 +46,17 @@ impl HandDetector {
     }
 
     /// Processes the input frame, returns a list of hands
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
+    pub fn process(&mut self, input: &Mat) -> Vec<Hand> {
         let result = self.graph.process(input);
-        result[0].clone()
+        let mut hands = vec![];
+
+        for hand_landmarks in result[0].iter() {
+            let mut hand = Hand::default();
+            hand.data.copy_from_slice(&hand_landmarks[..]);
+            hands.push(hand);
+        }
+
+        hands
     }
 }
diff --git a/src/holistic.rs b/src/holistic.rs
index e9af1b9..a798c9f 100644
--- a/src/holistic.rs
+++ b/src/holistic.rs
@@ -5,6 +5,14 @@ pub struct HolisticDetector {
     graph: Detector,
 }
 
+#[derive(Clone, Debug)]
+pub struct HolisticDetection {
+    pub pose: Option<Pose>,
+    pub face: Option<FaceMesh>,
+    pub left_hand: Option<Hand>,
+    pub right_hand: Option<Hand>,
+}
+
 impl HolisticDetector {
     pub fn new() -> Self {
         let outputs = vec![
@@ -32,9 +40,44 @@ impl HolisticDetector {
     }
 
     /// Processes the input frame, returns landmarks if detected
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Vec<Landmark>>> {
+    pub fn process(&mut self, input: &Mat) -> HolisticDetection {
         let landmarks = self.graph.process(input);
-        landmarks.clone()
+
+        let mut pose = None;
+        let mut face = None;
+        let mut left_hand = None;
+        let mut right_hand = None;
+
+        if !landmarks[0].is_empty() {
+            let mut p = Pose::default();
+            p.data.copy_from_slice(&landmarks[0][0][..]);
+            pose = Some(p);
+        }
+
+        if !landmarks[1].is_empty() {
+            let mut f = FaceMesh::default();
+            f.data.copy_from_slice(&landmarks[1][0][..]);
+            face = Some(f);
+        }
+
+        if !landmarks[2].is_empty() {
+            let mut l = Hand::default();
+            l.data.copy_from_slice(&landmarks[2][0][..]);
+            left_hand = Some(l);
+        }
+
+        if !landmarks[3].is_empty() {
+            let mut r = Hand::default();
+            r.data.copy_from_slice(&landmarks[3][0][..]);
+            right_hand = Some(r);
+        }
+
+        HolisticDetection {
+            pose,
+            face,
+            left_hand,
+            right_hand,
+        }
     }
 }
 
@@ -43,3 +86,96 @@ impl Default for HolisticDetector {
         Self::new()
     }
 }
+
+pub struct MultiPersonHolisticDetector {
+    graph: Detector,
+}
+
+impl MultiPersonHolisticDetector {
+    pub fn new() -> Self {
+        let outputs = vec![
+            Output {
+                type_: FeatureType::Poses,
+                name: "multi_pose_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Faces,
+                name: "multi_face_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Hands,
+                name: "multi_left_hand_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Hands,
+                name: "multi_right_hand_landmarks".into(),
+            },
+        ];
+
+        let graph = Detector::new(
+            include_str!("graphs/multi_person_holistic_tracking_cpu.pbtxt"),
+            outputs,
+        );
+
+        Self { graph }
+    }
+
+    /// Processes the input frame, returns landmarks if detected
+    pub fn process(&mut self, input: &Mat) -> Vec<HolisticDetection> {
+        let landmarks = self.graph.process(input);
+
+        let max_detections = landmarks
+            .iter()
+            .map(|l| l.len())
+            .max()
+            .unwrap_or(0);
+
+        let mut detections = vec![];
+
+        for i in 0..max_detections {
+            let mut pose = None;
+            let mut face = None;
+            let mut left_hand = None;
+            let mut right_hand = None;
+
+            if landmarks[0].len() > i {
+                let mut p = Pose::default();
+                p.data.copy_from_slice(&landmarks[0][i][..]);
+                pose = Some(p);
+            }
+
+            if landmarks[1].len() > i {
+                let mut f = FaceMesh::default();
+                f.data.copy_from_slice(&landmarks[1][i][..]);
+                face = Some(f);
+            }
+
+            if landmarks[2].len() > i {
+                let mut l = Hand::default();
+                l.data.copy_from_slice(&landmarks[2][i][..]);
+                left_hand = Some(l);
+            }
+
+            if landmarks[3].len() > i {
+                let mut r = Hand::default();
+                r.data.copy_from_slice(&landmarks[3][i][..]);
+                right_hand = Some(r);
+            }
+
+            detections.push(HolisticDetection {
+                pose,
+                face,
+                left_hand,
+                right_hand,
+            });
+        }
+
+        detections
+    }
+}
+
+impl Default for MultiPersonHolisticDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 1fdf074..9d8716f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,7 @@ impl FeatureType {
             FeatureType::Face => 478,
             FeatureType::Faces => 478,
             FeatureType::Hand => 21,
-            FeatureType::Hands => 42,
+            FeatureType::Hands => 21,
             FeatureType::Pose => 33,
             FeatureType::Poses => 33,
         }
@@ -100,6 +100,7 @@ impl Default for Landmark {
 
 /// Represents a detected pose, as 33 landmarks.
 /// Landmark names are in [pose::PoseLandmark].
+#[derive(Clone, Debug)]
 pub struct Pose {
     pub data: [Landmark; 33],
 }
@@ -114,12 +115,13 @@ impl Default for Pose {
 
 /// Represents a detected hand, as 21 landmarks.
 /// Landmark names are in [hands::HandLandmark]
-#[derive(Default)]
+#[derive(Clone, Debug, Default)]
 pub struct Hand {
     pub data: [Landmark; 21],
 }
 
 /// Represents a detected face mesh, as 478 landmarks.
+#[derive(Clone, Debug)]
 pub struct FaceMesh {
     pub data: [Landmark; 478],
 }
diff --git a/src/pose.rs b/src/pose.rs
index 9d256de..67752d5 100644
--- a/src/pose.rs
+++ b/src/pose.rs
@@ -1,6 +1,8 @@
 //! Pose detection utilities.
 use super::*;
 
+pub const NUM_POSE_LANDMARKS: usize = 33;
+
 /// Pose landmark indices.
 pub enum PoseLandmark {
     NOSE = 0,
@@ -76,3 +78,41 @@ impl Default for PoseDetector {
         Self::new()
     }
 }
+
+pub struct MultiPoseDetector {
+    graph: Detector,
+}
+
+impl MultiPoseDetector {
+    pub fn new() -> Self {
+        let graph = Detector::new(
+            include_str!("graphs/multi_person_pose_tracking_cpu.pbtxt"),
+            vec![Output {
+                type_: FeatureType::Poses,
+                name: "multi_pose_landmarks".into(),
+            }],
+        );
+
+        Self { graph }
+    }
+
+    /// Processes the input frame, returns poses if detected.
+    pub fn process(&mut self, input: &Mat) -> Vec<Pose> {
+        let result = self.graph.process(input);
+        let mut poses = vec![];
+
+        for pose_landmarks in result[0].iter() {
+            let mut pose = Pose::default();
+            pose.data.copy_from_slice(&pose_landmarks[..]);
+            poses.push(pose);
+        }
+
+        poses
+    }
+}
+
+impl Default for MultiPoseDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
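
Usage sketch for reviewers (not part of the patch): a minimal headless program, following the examples above, that exercises both new detectors on a single image instead of a webcam loop. The "people.jpg" path is hypothetical; any image loaded into a Mat and converted to RGB works.

use mediapipe::*;
use opencv::prelude::*;
use opencv::{imgcodecs, imgproc, Result};

fn main() -> Result<()> {
    // Hypothetical input path; substitute any image containing people.
    let bgr = imgcodecs::imread("people.jpg", imgcodecs::IMREAD_COLOR)?;
    let mut rgb = Mat::default();
    imgproc::cvt_color(&bgr, &mut rgb, imgproc::COLOR_BGR2RGB, 0)?;

    // MultiPoseDetector returns one Pose (NUM_POSE_LANDMARKS = 33 landmarks)
    // per detected person.
    let mut pose_detector = pose::MultiPoseDetector::default();
    for (i, p) in pose_detector.process(&rgb).iter().enumerate() {
        let nose = p.data[pose::PoseLandmark::NOSE as usize];
        println!("person {}: nose at ({}, {}, {})", i, nose.x, nose.y, nose.z);
    }

    // MultiPersonHolisticDetector returns one HolisticDetection per person.
    // Each pose/face/hand field is an Option, since e.g. a hand may be out
    // of frame even while the pose is tracked.
    let mut holistic_detector = holistic::MultiPersonHolisticDetector::default();
    for (i, det) in holistic_detector.process(&rgb).iter().enumerate() {
        println!(
            "person {}: pose={} face={} left_hand={} right_hand={}",
            i,
            det.pose.is_some(),
            det.face.is_some(),
            det.left_hand.is_some(),
            det.right_hand.is_some(),
        );
    }

    Ok(())
}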