From b72fc70c01aee839e61de04242c0b1cce4051e20 Mon Sep 17 00:00:00 2001
From: Jules Youngberg
Date: Mon, 4 Jul 2022 19:36:32 -0700
Subject: [PATCH] add multi pose estimation and multi person holistic tracking

---
 examples/face_mesh.rs                        |   2 +-
 examples/hand_tracking.rs                    |   2 +-
 examples/holistic_tracking.rs                |   5 +-
 examples/multi_person_holistic_tracking.rs   |  57 +++++++
 examples/multi_pose_estimation.rs            |  55 +++++++
 src/face_mesh.rs                             |  14 +-
 .../multi_person_holistic_tracking_cpu.pbtxt |  55 +++++++
 .../multi_person_pose_tracking_cpu.pbtxt     |  53 +++++++
 src/hands.rs                                 |  14 +-
 src/holistic.rs                              | 140 +++++++++++++++++-
 src/lib.rs                                   |   6 +-
 src/pose.rs                                  |  40 +++++
 12 files changed, 429 insertions(+), 14 deletions(-)
 create mode 100644 examples/multi_person_holistic_tracking.rs
 create mode 100644 examples/multi_pose_estimation.rs
 create mode 100644 src/graphs/multi_person_holistic_tracking_cpu.pbtxt
 create mode 100644 src/graphs/multi_person_pose_tracking_cpu.pbtxt

diff --git a/examples/face_mesh.rs b/examples/face_mesh.rs
index 6ca6b64..c806dfc 100644
--- a/examples/face_mesh.rs
+++ b/examples/face_mesh.rs
@@ -35,7 +35,7 @@ fn face_mesh() -> Result<()> {
             highgui::imshow(window, &mut flip_frame)?;
 
             if !result.is_empty() {
-                let landmark = result[0][0];
+                let landmark = result[0].data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
diff --git a/examples/hand_tracking.rs b/examples/hand_tracking.rs
index 38a136f..540925d 100644
--- a/examples/hand_tracking.rs
+++ b/examples/hand_tracking.rs
@@ -35,7 +35,7 @@ pub fn hand_tracking() -> Result<()> {
             highgui::imshow(window, &mut flip_frame)?;
 
             if !result.is_empty() {
-                let landmark = result[0][0];
+                let landmark = result[0].data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
diff --git a/examples/holistic_tracking.rs b/examples/holistic_tracking.rs
index a663d27..c5e2218 100644
--- a/examples/holistic_tracking.rs
+++ b/examples/holistic_tracking.rs
@@ -31,12 +31,11 @@ fn face_mesh() -> Result<()> {
             println!("processing");
             let result = detector.process(&flip_frame);
 
-            println!("received {} types of landmarks", result.len());
 
             highgui::imshow(window, &mut flip_frame)?;
 
-            if !result[0].is_empty() {
-                let landmark = result[0][0][0];
+            if let Some(pose) = result.pose {
+                let landmark = pose.data[0];
                 println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
             }
         } else {
diff --git a/examples/multi_person_holistic_tracking.rs b/examples/multi_person_holistic_tracking.rs
new file mode 100644
index 0000000..bd61f52
--- /dev/null
+++ b/examples/multi_person_holistic_tracking.rs
@@ -0,0 +1,57 @@
+use mediapipe::*;
+use opencv::prelude::*;
+use opencv::{highgui, imgproc, videoio, Result};
+
+fn multi_person_holistic_tracking() -> Result<()> {
+    let window = "video capture";
+
+    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;
+
+    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
+    if !cap.is_opened()? {
+        panic!("Unable to open default cam")
+    }
+
+    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
+    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
+    cap.set(videoio::CAP_PROP_FPS, 30.0)?;
+
+    let mut detector = holistic::MultiPersonHolisticDetector::default();
+
+    let mut raw_frame = Mat::default();
+    let mut rgb_frame = Mat::default();
+    let mut flip_frame = Mat::default();
+    loop {
+        cap.read(&mut raw_frame)?;
+
+        let size = raw_frame.size()?;
+        if size.width > 0 && !raw_frame.empty() {
+            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
+            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal
+
+            println!("processing");
+            let result = detector.process(&flip_frame);
+
+            highgui::imshow(window, &mut flip_frame)?;
+
+            if !result.is_empty() {
+                if let Some(pose) = &result[0].pose {
+                    let landmark = pose.data[0];
+                    println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
+                }
+            }
+        } else {
+            println!("WARN: Skip empty frame");
+        }
+
+        let key = highgui::wait_key(10)?;
+        if key > 0 && key != 255 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+fn main() {
+    multi_person_holistic_tracking().unwrap()
+}
diff --git a/examples/multi_pose_estimation.rs b/examples/multi_pose_estimation.rs
new file mode 100644
index 0000000..d31f15e
--- /dev/null
+++ b/examples/multi_pose_estimation.rs
@@ -0,0 +1,55 @@
+use mediapipe::*;
+use opencv::prelude::*;
+use opencv::{highgui, imgproc, videoio, Result};
+
+pub fn pose_estimation() -> Result<()> {
+    let window = "video capture";
+
+    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;
+
+    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
+    if !cap.is_opened()? {
+        panic!("Unable to open default cam")
+    }
+
+    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
+    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
+    cap.set(videoio::CAP_PROP_FPS, 30.0)?;
+
+    let mut detector = pose::MultiPoseDetector::default();
+
+    let mut raw_frame = Mat::default();
+    let mut rgb_frame = Mat::default();
+    let mut flip_frame = Mat::default();
+    loop {
+        cap.read(&mut raw_frame)?;
+
+        let size = raw_frame.size()?;
+        if size.width > 0 && !raw_frame.empty() {
+            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
+            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal
+
+            println!("processing");
+            let result = detector.process(&flip_frame);
+
+            highgui::imshow(window, &mut flip_frame)?;
+
+            if !result.is_empty() {
+                let landmark = result[0].data[0];
+                println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
+            }
+        } else {
+            println!("WARN: Skip empty frame");
+        }
+
+        let key = highgui::wait_key(10)?;
+        if key > 0 && key != 255 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+fn main() {
+    pose_estimation().unwrap()
+}
diff --git a/src/face_mesh.rs b/src/face_mesh.rs
index 340b029..91230db 100644
--- a/src/face_mesh.rs
+++ b/src/face_mesh.rs
@@ -10,7 +10,7 @@ impl FaceMeshDetector {
         let graph = Detector::new(
             include_str!("graphs/face_mesh_desktop_live.pbtxt"),
             vec![Output {
-                type_: FeatureType::Face,
+                type_: FeatureType::Faces,
                 name: "multi_face_landmarks".into(),
             }],
         );
@@ -19,9 +19,17 @@ impl FaceMeshDetector {
     }
 
     /// Processes the input frame, returns a face mesh if detected.
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
+    pub fn process(&mut self, input: &Mat) -> Vec<FaceMesh> {
         let landmarks = self.graph.process(input);
-        landmarks[0].clone()
+        let mut faces = vec![];
+
+        for face_landmarks in landmarks[0].iter() {
+            let mut face = FaceMesh::default();
+            face.data.copy_from_slice(&face_landmarks[..]);
+            faces.push(face);
+        }
+
+        faces
     }
 }
diff --git a/src/graphs/multi_person_holistic_tracking_cpu.pbtxt b/src/graphs/multi_person_holistic_tracking_cpu.pbtxt
new file mode 100644
index 0000000..d7393e8
--- /dev/null
+++ b/src/graphs/multi_person_holistic_tracking_cpu.pbtxt
@@ -0,0 +1,55 @@
+# Tracks pose + hands + face landmarks.
+
+# CPU image. (ImageFrame)
+input_stream: "input_video"
+
+output_stream: "multi_pose_landmarks"
+
+output_stream: "pose_rois"
+
+output_stream: "pose_detections"
+
+output_stream: "multi_face_landmarks"
+
+output_stream: "multi_left_hand_landmarks"
+
+output_stream: "multi_right_hand_landmarks"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+  node_options: {
+    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
+      max_in_flight: 1
+      max_in_queue: 1
+      # Timeout is disabled (set to 0) as first frame processing can take more
+      # than 1 second.
+      in_flight_timeout: 0
+    }
+  }
+}
+
+node {
+  calculator: "MultiPersonHolisticLandmarkCpu"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "POSE_LANDMARKS:multi_pose_landmarks"
+  output_stream: "POSE_ROI:pose_rois"
+  output_stream: "POSE_DETECTION:pose_detections"
+  output_stream: "FACE_LANDMARKS:multi_face_landmarks"
+  output_stream: "LEFT_HAND_LANDMARKS:multi_left_hand_landmarks"
+  output_stream: "RIGHT_HAND_LANDMARKS:multi_right_hand_landmarks"
+}
diff --git a/src/graphs/multi_person_pose_tracking_cpu.pbtxt b/src/graphs/multi_person_pose_tracking_cpu.pbtxt
new file mode 100644
index 0000000..cb7f87a
--- /dev/null
+++ b/src/graphs/multi_person_pose_tracking_cpu.pbtxt
@@ -0,0 +1,53 @@
+# MediaPipe graph that performs pose tracking with TensorFlow Lite on CPU.
+
+# CPU buffer. (ImageFrame)
+input_stream: "input_video"
+
+# Pose landmarks for each detected person. (std::vector<NormalizedLandmarkList>)
+output_stream: "multi_pose_landmarks"
+
+output_stream: "pose_detections"
+
+output_stream: "roi_from_landmarks"
+
+# Generates side packet to enable segmentation.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:enable_segmentation"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { bool_value: true }
+    }
+  }
+}
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:output_video"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Subgraph that detects poses and corresponding landmarks.
+node {
+  calculator: "MultiPoseLandmarkCpu"
+  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
+  input_stream: "IMAGE:throttled_input_video"
+  output_stream: "LANDMARKS:multi_pose_landmarks"
+  output_stream: "DETECTION:pose_detections"
+  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
+}
diff --git a/src/hands.rs b/src/hands.rs
index 79e69d5..f56cc11 100644
--- a/src/hands.rs
+++ b/src/hands.rs
@@ -1,6 +1,8 @@
 //! Hand detection utilities.
 use super::*;
 
+pub const NUM_HAND_LANDMARKS: usize = 21;
+
 /// Hand landmark indices.
 pub enum HandLandmark {
     WRIST = 0,
@@ -44,9 +46,17 @@ impl HandDetector {
     }
 
     /// Processes the input frame, returns a list of hands
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
+    pub fn process(&mut self, input: &Mat) -> Vec<Hand> {
         let result = self.graph.process(input);
-        result[0].clone()
+        let mut hands = vec![];
+
+        for hand_landmarks in result[0].iter() {
+            let mut hand = Hand::default();
+            hand.data.copy_from_slice(&hand_landmarks[..]);
+            hands.push(hand);
+        }
+
+        hands
     }
 }
diff --git a/src/holistic.rs b/src/holistic.rs
index e9af1b9..a798c9f 100644
--- a/src/holistic.rs
+++ b/src/holistic.rs
@@ -5,6 +5,14 @@ pub struct HolisticDetector {
     graph: Detector,
 }
 
+#[derive(Clone, Debug)]
+pub struct HolisticDetection {
+    pub pose: Option<Pose>,
+    pub face: Option<FaceMesh>,
+    pub left_hand: Option<Hand>,
+    pub right_hand: Option<Hand>,
+}
+
 impl HolisticDetector {
     pub fn new() -> Self {
         let outputs = vec![
@@ -32,9 +40,44 @@ impl HolisticDetector {
     }
 
     /// Processes the input frame, returns landmarks if detected
-    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Vec<Landmark>>> {
+    pub fn process(&mut self, input: &Mat) -> HolisticDetection {
         let landmarks = self.graph.process(input);
-        landmarks.clone()
+
+        let mut pose = None;
+        let mut face = None;
+        let mut left_hand = None;
+        let mut right_hand = None;
+
+        if !landmarks[0].is_empty() {
+            let mut p = Pose::default();
+            p.data.copy_from_slice(&landmarks[0][0][..]);
+            pose = Some(p);
+        }
+
+        if !landmarks[1].is_empty() {
+            let mut f = FaceMesh::default();
+            f.data.copy_from_slice(&landmarks[1][0][..]);
+            face = Some(f);
+        }
+
+        if !landmarks[2].is_empty() {
+            let mut l = Hand::default();
+            l.data.copy_from_slice(&landmarks[2][0][..]);
+            left_hand = Some(l);
+        }
+
+        if !landmarks[3].is_empty() {
+            let mut r = Hand::default();
+            r.data.copy_from_slice(&landmarks[3][0][..]);
+            right_hand = Some(r);
+        }
+
+        HolisticDetection {
+            pose,
+            face,
+            left_hand,
+            right_hand,
+        }
     }
 }
 
@@ -43,3 +86,96 @@ impl Default for HolisticDetector {
         Self::new()
     }
 }
+
+pub struct MultiPersonHolisticDetector {
+    graph: Detector,
+}
+
+impl MultiPersonHolisticDetector {
+    pub fn new() -> Self {
+        let outputs = vec![
+            Output {
+                type_: FeatureType::Poses,
+                name: "multi_pose_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Faces,
+                name: "multi_face_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Hands,
+                name: "multi_left_hand_landmarks".into(),
+            },
+            Output {
+                type_: FeatureType::Hands,
+                name: "multi_right_hand_landmarks".into(),
+            },
+        ];
+
+        let graph = Detector::new(
+            include_str!("graphs/multi_person_holistic_tracking_cpu.pbtxt"),
+            outputs,
+        );
+
+        Self { graph }
+    }
+
+    /// Processes the input frame, returns landmarks if detected
+    pub fn process(&mut self, input: &Mat) -> Vec<HolisticDetection> {
+        let landmarks = self.graph.process(input);
+
+        let max_detections = landmarks
+            .iter()
+            .map(|l| l.len())
+            .max()
+            .unwrap_or(0);
+
+        let mut detections = vec![];
+
+        for i in 0..max_detections {
+            let mut pose = None;
+            let mut face = None;
+            let mut left_hand = None;
+            let mut right_hand = None;
+
+            if landmarks[0].len() > i {
+                let mut p = Pose::default();
+                p.data.copy_from_slice(&landmarks[0][i][..]);
+                pose = Some(p);
+            }
+
+            if landmarks[1].len() > i {
+                let mut f = FaceMesh::default();
+                f.data.copy_from_slice(&landmarks[1][i][..]);
+                face = Some(f);
+            }
+
+            if landmarks[2].len() > i {
+                let mut l = Hand::default();
+                l.data.copy_from_slice(&landmarks[2][i][..]);
+                left_hand = Some(l);
+            }
+
+            if landmarks[3].len() > i {
+                let mut r = Hand::default();
+                r.data.copy_from_slice(&landmarks[3][i][..]);
+                right_hand = Some(r);
+            }
+
+            detections.push(HolisticDetection {
+                pose,
+                face,
+                left_hand,
+                right_hand,
+            });
+        }
+
+        detections
+    }
+}
+
+impl Default for MultiPersonHolisticDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 1fdf074..9d8716f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,7 @@ impl FeatureType {
             FeatureType::Face => 478,
             FeatureType::Faces => 478,
             FeatureType::Hand => 21,
-            FeatureType::Hands => 42,
+            FeatureType::Hands => 21,
             FeatureType::Pose => 33,
             FeatureType::Poses => 33,
         }
@@ -100,6 +100,7 @@ impl Default for Landmark {
 
 /// Represents a detected pose, as 33 landmarks.
 /// Landmark names are in [pose::PoseLandmark].
+#[derive(Clone, Debug)]
 pub struct Pose {
     pub data: [Landmark; 33],
 }
@@ -114,12 +115,13 @@ impl Default for Pose {
 
 /// Represents a detected hand, as 21 landmarks.
 /// Landmark names are in [hands::HandLandmark]
-#[derive(Default)]
+#[derive(Clone, Debug, Default)]
 pub struct Hand {
     pub data: [Landmark; 21],
 }
 
 /// Represents a detected face mesh, as 478 landmarks.
+#[derive(Clone, Debug)]
 pub struct FaceMesh {
     pub data: [Landmark; 478],
 }
diff --git a/src/pose.rs b/src/pose.rs
index 9d256de..67752d5 100644
--- a/src/pose.rs
+++ b/src/pose.rs
@@ -1,6 +1,8 @@
 //! Pose detection utilities.
 use super::*;
 
+pub const NUM_POSE_LANDMARKS: usize = 33;
+
 /// Pose landmark indices.
 pub enum PoseLandmark {
     NOSE = 0,
@@ -76,3 +78,41 @@ impl Default for PoseDetector {
         Self::new()
     }
 }
+
+pub struct MultiPoseDetector {
+    graph: Detector,
+}
+
+impl MultiPoseDetector {
+    pub fn new() -> Self {
+        let graph = Detector::new(
+            include_str!("graphs/multi_person_pose_tracking_cpu.pbtxt"),
+            vec![Output {
+                type_: FeatureType::Poses,
+                name: "multi_pose_landmarks".into(),
+            }],
+        );
+
+        Self { graph }
+    }
+
+    /// Processes the input frame, returns poses if detected.
+    pub fn process(&mut self, input: &Mat) -> Vec<Pose> {
+        let result = self.graph.process(input);
+        let mut poses = vec![];
+
+        for pose_landmarks in result[0].iter() {
+            let mut pose = Pose::default();
+            pose.data.copy_from_slice(&pose_landmarks[..]);
+            poses.push(pose);
+        }
+
+        poses
+    }
+}
+
+impl Default for MultiPoseDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
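
Usage sketch for reviewers (not part of the patch): a minimal headless program, following the examples above, that exercises both new detectors on a single image instead of a webcam loop. The "people.jpg" path is hypothetical; any image loaded into a Mat and converted to RGB works.

use mediapipe::*;
use opencv::prelude::*;
use opencv::{imgcodecs, imgproc, Result};

fn main() -> Result<()> {
    // Hypothetical input path; substitute any image containing people.
    let bgr = imgcodecs::imread("people.jpg", imgcodecs::IMREAD_COLOR)?;
    let mut rgb = Mat::default();
    imgproc::cvt_color(&bgr, &mut rgb, imgproc::COLOR_BGR2RGB, 0)?;

    // MultiPoseDetector returns one Pose (NUM_POSE_LANDMARKS = 33 landmarks)
    // per detected person.
    let mut pose_detector = pose::MultiPoseDetector::default();
    for (i, p) in pose_detector.process(&rgb).iter().enumerate() {
        let nose = p.data[pose::PoseLandmark::NOSE as usize];
        println!("person {}: nose at ({}, {}, {})", i, nose.x, nose.y, nose.z);
    }

    // MultiPersonHolisticDetector returns one HolisticDetection per person.
    // Each pose/face/hand field is an Option, since e.g. a hand may be out
    // of frame even while the pose is tracked.
    let mut holistic_detector = holistic::MultiPersonHolisticDetector::default();
    for (i, det) in holistic_detector.process(&rgb).iter().enumerate() {
        println!(
            "person {}: pose={} face={} left_hand={} right_hand={}",
            i,
            det.pose.is_some(),
            det.face.is_some(),
            det.left_hand.is_some(),
            det.right_hand.is_some(),
        );
    }

    Ok(())
}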