add multi-pose estimation and multi-person holistic tracking

Jules Youngberg 2022-07-04 19:36:32 -07:00
parent f405c764b9
commit b72fc70c01
12 changed files with 429 additions and 14 deletions

View File

@ -35,7 +35,7 @@ fn face_mesh() -> Result<()> {
            highgui::imshow(window, &mut flip_frame)?;
            if !result.is_empty() {
                let landmark = result[0][0];
                let landmark = result[0].data[0];
                println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
            }
        } else {

View File

@ -35,7 +35,7 @@ pub fn hand_tracking() -> Result<()> {
            highgui::imshow(window, &mut flip_frame)?;
            if !result.is_empty() {
                let landmark = result[0][0];
                let landmark = result[0].data[0];
                println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
            }
        } else {

View File

@ -31,12 +31,11 @@ fn face_mesh() -> Result<()> {
println!("processing");
let result = detector.process(&flip_frame);
println!("received {} types of landmarks", result.len());
highgui::imshow(window, &mut flip_frame)?;
if !result[0].is_empty() {
let landmark = result[0][0][0];
if let Some(pose) = result.pose {
let landmark = pose.data[0];
println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
}
} else {

View File

@ -0,0 +1,57 @@
use mediapipe::*;
use opencv::prelude::*;
use opencv::{highgui, imgproc, videoio, Result};

fn face_mesh() -> Result<()> {
    let window = "video capture";
    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;

    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
    if !cap.is_opened()? {
        panic!("Unable to open default cam")
    }
    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
    cap.set(videoio::CAP_PROP_FPS, 30.0)?;

    let mut detector = holistic::MultiPersonHolisticDetector::default();

    let mut raw_frame = Mat::default();
    let mut rgb_frame = Mat::default();
    let mut flip_frame = Mat::default();

    loop {
        cap.read(&mut raw_frame)?;

        let size = raw_frame.size()?;
        if size.width > 0 && !raw_frame.empty() {
            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal

            println!("processing");
            let result = detector.process(&flip_frame);
            highgui::imshow(window, &mut flip_frame)?;

            if !result.is_empty() {
                if let Some(pose) = &result[0].pose {
                    let landmark = pose.data[0];
                    println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
                }
            }
        } else {
            println!("WARN: Skip empty frame");
        }

        let key = highgui::wait_key(10)?;
        if key > 0 && key != 255 {
            break;
        }
    }

    Ok(())
}

fn main() {
    face_mesh().unwrap()
}

View File

@ -0,0 +1,55 @@
use mediapipe::*;
use opencv::prelude::*;
use opencv::{highgui, imgproc, videoio, Result};

pub fn pose_estimation() -> Result<()> {
    let window = "video capture";
    highgui::named_window(window, highgui::WINDOW_AUTOSIZE)?;

    let mut cap = videoio::VideoCapture::new(0, videoio::CAP_ANY)?;
    if !cap.is_opened()? {
        panic!("Unable to open default cam")
    }
    cap.set(videoio::CAP_PROP_FRAME_WIDTH, 640.0)?;
    cap.set(videoio::CAP_PROP_FRAME_HEIGHT, 480.0)?;
    cap.set(videoio::CAP_PROP_FPS, 30.0)?;

    let mut detector = pose::MultiPoseDetector::default();

    let mut raw_frame = Mat::default();
    let mut rgb_frame = Mat::default();
    let mut flip_frame = Mat::default();

    loop {
        cap.read(&mut raw_frame)?;

        let size = raw_frame.size()?;
        if size.width > 0 && !raw_frame.empty() {
            imgproc::cvt_color(&raw_frame, &mut rgb_frame, imgproc::COLOR_BGR2RGB, 0)?;
            opencv::core::flip(&rgb_frame, &mut flip_frame, 1)?; // horizontal

            println!("processing");
            let result = detector.process(&rgb_frame);
            highgui::imshow(window, &mut rgb_frame)?;

            if !result.is_empty() {
                let landmark = result[0].data[0];
                println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
            }
        } else {
            println!("WARN: Skip empty frame");
        }

        let key = highgui::wait_key(10)?;
        if key > 0 && key != 255 {
            break;
        }
    }

    Ok(())
}

fn main() {
    pose_estimation().unwrap()
}

View File

@ -10,7 +10,7 @@ impl FaceMeshDetector {
        let graph = Detector::new(
            include_str!("graphs/face_mesh_desktop_live.pbtxt"),
            vec![Output {
                type_: FeatureType::Face,
                type_: FeatureType::Faces,
                name: "multi_face_landmarks".into(),
            }],
        );
@ -19,9 +19,17 @@ impl FaceMeshDetector {
    }

    /// Processes the input frame, returns a face mesh if detected.
    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
    pub fn process(&mut self, input: &Mat) -> Vec<FaceMesh> {
        let landmarks = self.graph.process(input);
        landmarks[0].clone()
        let mut faces = vec![];
        for face_landmarks in landmarks[0].iter() {
            let mut face = FaceMesh::default();
            face.data.copy_from_slice(&face_landmarks[..]);
            faces.push(face);
        }
        faces
    }
}
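
For reference, a minimal sketch of a call site under the new signature. This snippet is not part of the commit; it assumes use mediapipe::*;, the face_mesh module path, and a captured flip_frame Mat as in the examples above:

// Sketch: process() now returns one FaceMesh per detected face.
let mut detector = face_mesh::FaceMeshDetector::default();
let faces = detector.process(&flip_frame);
for face in &faces {
    // Each FaceMesh holds 478 landmarks in its `data` array.
    let landmark = face.data[0];
    println!("LANDMARK: {} {} {}", landmark.x, landmark.y, landmark.z);
}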

View File

@ -0,0 +1,55 @@
# Tracks pose + hands + face landmarks.

# CPU image. (ImageFrame)
input_stream: "input_video"

output_stream: "multi_pose_landmarks"
output_stream: "pose_rois"
output_stream: "pose_detections"
output_stream: "multi_left_hand_landmarks"
output_stream: "multi_right_hand_landmarks"

# Throttles the images flowing downstream for flow control. It passes through
# the very first incoming image unaltered, and waits for downstream nodes
# (calculators and subgraphs) in the graph to finish their tasks before it
# passes through another image. All images that come in while waiting are
# dropped, limiting the number of in-flight images in most parts of the graph to
# 1. This prevents the downstream nodes from queuing up incoming images and data
# excessively, which leads to increased latency and memory usage, unwanted in
# real-time mobile applications. It also eliminates unnecessary computation,
# e.g., the output produced by a node may get dropped downstream if the
# subsequent nodes are still busy processing previous inputs.
node {
  calculator: "FlowLimiterCalculator"
  input_stream: "input_video"
  input_stream: "FINISHED:output_video"
  input_stream_info: {
    tag_index: "FINISHED"
    back_edge: true
  }
  output_stream: "throttled_input_video"
  node_options: {
    [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] {
      max_in_flight: 1
      max_in_queue: 1
      # Timeout is disabled (set to 0) as first frame processing can take more
      # than 1 second.
      in_flight_timeout: 0
    }
  }
}

node {
  calculator: "MultiPersonHolisticLandmarkCpu"
  input_stream: "IMAGE:throttled_input_video"
  output_stream: "POSE_LANDMARKS:multi_pose_landmarks"
  output_stream: "POSE_ROI:pose_rois"
  output_stream: "POSE_DETECTION:pose_detections"
  output_stream: "FACE_LANDMARKS:multi_face_landmarks"
  output_stream: "LEFT_HAND_LANDMARKS:multi_left_hand_landmarks"
  output_stream: "RIGHT_HAND_LANDMARKS:multi_right_hand_landmarks"
}

View File

@ -0,0 +1,53 @@
# MediaPipe graph that performs pose tracking with TensorFlow Lite on CPU.

# CPU buffer. (ImageFrame)
input_stream: "input_video"

# Output image with rendered results. (ImageFrame)
output_stream: "multi_pose_landmarks"
output_stream: "pose_detections"
output_stream: "roi_from_landmarks"

# Generates side packet to enable segmentation.
node {
  calculator: "ConstantSidePacketCalculator"
  output_side_packet: "PACKET:enable_segmentation"
  node_options: {
    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
      packet { bool_value: true }
    }
  }
}

# Throttles the images flowing downstream for flow control. It passes through
# the very first incoming image unaltered, and waits for downstream nodes
# (calculators and subgraphs) in the graph to finish their tasks before it
# passes through another image. All images that come in while waiting are
# dropped, limiting the number of in-flight images in most parts of the graph to
# 1. This prevents the downstream nodes from queuing up incoming images and data
# excessively, which leads to increased latency and memory usage, unwanted in
# real-time mobile applications. It also eliminates unnecessary computation,
# e.g., the output produced by a node may get dropped downstream if the
# subsequent nodes are still busy processing previous inputs.
node {
  calculator: "FlowLimiterCalculator"
  input_stream: "input_video"
  input_stream: "FINISHED:output_video"
  input_stream_info: {
    tag_index: "FINISHED"
    back_edge: true
  }
  output_stream: "throttled_input_video"
}

# Subgraph that detects poses and corresponding landmarks.
node {
  calculator: "MultiPoseLandmarkCpu"
  input_side_packet: "ENABLE_SEGMENTATION:enable_segmentation"
  input_stream: "IMAGE:throttled_input_video"
  output_stream: "LANDMARKS:multi_pose_landmarks"
  output_stream: "DETECTION:pose_detections"
  output_stream: "ROI_FROM_LANDMARKS:roi_from_landmarks"
}

View File

@ -1,6 +1,8 @@
//! Hand detection utilities.

use super::*;

pub const NUM_HAND_LANDMARKS: usize = 21;

/// Hand landmark indices.
pub enum HandLandmark {
    WRIST = 0,
@ -44,9 +46,17 @@ impl HandDetector {
    }

    /// Processes the input frame, returns a list of hands
    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Landmark>> {
    pub fn process(&mut self, input: &Mat) -> Vec<Hand> {
        let result = self.graph.process(input);
        result[0].clone()
        let mut hands = vec![];
        for hand_landmarks in result[0].iter() {
            let mut hand = Hand::default();
            hand.data.copy_from_slice(&hand_landmarks[..]);
            hands.push(hand);
        }
        hands
    }
}
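
A similar minimal sketch for the Hand-based API. This is hypothetical usage, not part of the commit; it assumes use mediapipe::*;, the hands module path, and a flip_frame Mat as in the hand_tracking example:

// Sketch: iterate every detected hand and index landmarks by name.
let mut detector = hands::HandDetector::default();
let detected_hands = detector.process(&flip_frame);
for hand in &detected_hands {
    let wrist = hand.data[hands::HandLandmark::WRIST as usize];
    println!("WRIST: {} {} {}", wrist.x, wrist.y, wrist.z);
}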

View File

@ -5,6 +5,14 @@ pub struct HolisticDetector {
    graph: Detector,
}

#[derive(Clone, Debug)]
pub struct HolisticDetection {
    pub pose: Option<Pose>,
    pub face: Option<FaceMesh>,
    pub left_hand: Option<Hand>,
    pub right_hand: Option<Hand>,
}

impl HolisticDetector {
    pub fn new() -> Self {
        let outputs = vec![
@ -32,9 +40,44 @@ impl HolisticDetector {
    }

    /// Processes the input frame, returns landmarks if detected
    pub fn process(&mut self, input: &Mat) -> Vec<Vec<Vec<Landmark>>> {
    pub fn process(&mut self, input: &Mat) -> HolisticDetection {
        let landmarks = self.graph.process(input);
        landmarks.clone()
        let mut pose = None;
        let mut face = None;
        let mut left_hand = None;
        let mut right_hand = None;

        if !landmarks[0].is_empty() {
            let mut p = Pose::default();
            p.data.copy_from_slice(&landmarks[0][0][..]);
            pose = Some(p);
        }

        if !landmarks[1].is_empty() {
            let mut f = FaceMesh::default();
            f.data.copy_from_slice(&landmarks[1][0][..]);
            face = Some(f);
        }

        if !landmarks[2].is_empty() {
            let mut l = Hand::default();
            l.data.copy_from_slice(&landmarks[2][0][..]);
            left_hand = Some(l);
        }

        if !landmarks[3].is_empty() {
            let mut r = Hand::default();
            r.data.copy_from_slice(&landmarks[3][0][..]);
            right_hand = Some(r);
        }

        HolisticDetection {
            pose,
            face,
            left_hand,
            right_hand,
        }
    }
}
@ -43,3 +86,96 @@ impl Default for HolisticDetector {
        Self::new()
    }
}

pub struct MultiPersonHolisticDetector {
    graph: Detector,
}

impl MultiPersonHolisticDetector {
    pub fn new() -> Self {
        let outputs = vec![
            Output {
                type_: FeatureType::Poses,
                name: "multi_pose_landmarks".into(),
            },
            Output {
                type_: FeatureType::Faces,
                name: "multi_face_landmarks".into(),
            },
            Output {
                type_: FeatureType::Hands,
                name: "multi_left_hand_landmarks".into(),
            },
            Output {
                type_: FeatureType::Hands,
                name: "multi_right_hand_landmarks".into(),
            },
        ];

        let graph = Detector::new(
            include_str!("graphs/multi_person_holistic_tracking_cpu.pbtxt"),
            outputs,
        );

        Self { graph }
    }

    /// Processes the input frame, returns landmarks if detected
    pub fn process(&mut self, input: &Mat) -> Vec<HolisticDetection> {
        let landmarks = self.graph.process(input);

        let max_landmarks = landmarks
            .iter()
            .map(|l| l.len())
            .reduce(|acc, item| acc.max(item))
            .unwrap();

        let mut detections = vec![];

        for i in 0..max_landmarks {
            let mut pose = None;
            let mut face = None;
            let mut left_hand = None;
            let mut right_hand = None;

            if landmarks[0].len() > i {
                let mut p = Pose::default();
                p.data.copy_from_slice(&landmarks[0][i][..]);
                pose = Some(p);
            }

            if landmarks[1].len() > i {
                let mut f = FaceMesh::default();
                f.data.copy_from_slice(&landmarks[1][i][..]);
                face = Some(f);
            }

            if landmarks[2].len() > i {
                let mut l = Hand::default();
                l.data.copy_from_slice(&landmarks[2][i][..]);
                left_hand = Some(l);
            }

            if landmarks[3].len() > i {
                let mut r = Hand::default();
                r.data.copy_from_slice(&landmarks[3][i][..]);
                right_hand = Some(r);
            }

            detections.push(HolisticDetection {
                pose,
                face,
                left_hand,
                right_hand,
            });
        }

        detections
    }
}

impl Default for MultiPersonHolisticDetector {
    fn default() -> Self {
        Self::new()
    }
}
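
As a usage sketch (hypothetical, not part of this commit), each entry of the returned Vec<HolisticDetection> corresponds to one tracked person, and every feature is optional because a person may be detected without a visible face or hand; it assumes use mediapipe::*; and a flip_frame Mat as in the examples:

let mut detector = holistic::MultiPersonHolisticDetector::default();
let detections = detector.process(&flip_frame);
for (i, person) in detections.iter().enumerate() {
    if let Some(pose) = &person.pose {
        // NOSE = 0 in PoseLandmark.
        let nose = pose.data[pose::PoseLandmark::NOSE as usize];
        println!("person {}: nose {} {} {}", i, nose.x, nose.y, nose.z);
    }
    if person.left_hand.is_none() && person.right_hand.is_none() {
        println!("person {}: no hands detected", i);
    }
}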

View File

@ -43,7 +43,7 @@ impl FeatureType {
            FeatureType::Face => 478,
            FeatureType::Faces => 478,
            FeatureType::Hand => 21,
            FeatureType::Hands => 42,
            FeatureType::Hands => 21,
            FeatureType::Pose => 33,
            FeatureType::Poses => 33,
        }
@ -100,6 +100,7 @@ impl Default for Landmark {
/// Represents a detected pose, as 33 landmarks.
/// Landmark names are in [pose::PoseLandmark].
#[derive(Clone, Debug)]
pub struct Pose {
    pub data: [Landmark; 33],
}
@ -114,12 +115,13 @@ impl Default for Pose {
/// Represents a detected hand, as 21 landmarks.
/// Landmark names are in [hands::HandLandmark]
#[derive(Default)]
#[derive(Clone, Debug, Default)]
pub struct Hand {
    pub data: [Landmark; 21],
}

/// Represents a detected face mesh, as 478 landmarks.
#[derive(Clone, Debug)]
pub struct FaceMesh {
    pub data: [Landmark; 478],
}

View File

@ -1,6 +1,8 @@
//! Pose detection utilities.

use super::*;

pub const NUM_POSE_LANDMARKS: usize = 33;

/// Pose landmark indices.
pub enum PoseLandmark {
    NOSE = 0,
@ -76,3 +78,41 @@ impl Default for PoseDetector {
        Self::new()
    }
}

pub struct MultiPoseDetector {
    graph: Detector,
}

impl MultiPoseDetector {
    pub fn new() -> Self {
        let graph = Detector::new(
            include_str!("graphs/multi_person_pose_tracking_cpu.pbtxt"),
            vec![Output {
                type_: FeatureType::Poses,
                name: "multi_pose_landmarks".into(),
            }],
        );
        Self { graph }
    }

    /// Processes the input frame, returns poses if detected.
    pub fn process(&mut self, input: &Mat) -> Vec<Pose> {
        let result = self.graph.process(input);
        let mut poses = vec![];
        for pose_landmarks in result[0].iter() {
            let mut pose = Pose::default();
            pose.data.copy_from_slice(&pose_landmarks[..]);
            poses.push(pose);
        }
        poses
    }
}

impl Default for MultiPoseDetector {
    fn default() -> Self {
        Self::new()
    }
}
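
And a minimal sketch of MultiPoseDetector at a call site, mirroring the multi-pose example above (hypothetical usage; assumes use mediapipe::*; and an rgb_frame Mat):

let mut detector = pose::MultiPoseDetector::default();
let poses = detector.process(&rgb_frame);
println!("detected {} poses", poses.len());
for p in &poses {
    // Each Pose holds NUM_POSE_LANDMARKS (33) landmarks.
    let nose = p.data[pose::PoseLandmark::NOSE as usize];
    println!("NOSE: {} {} {}", nose.x, nose.y, nose.z);
}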