3D keypoints detection.
parent c5adb57f30
commit d9f11e7d4a
@@ -27,6 +27,9 @@
 #include "mediapipe/framework/port/status.h"
 #include "mediapipe/util/header_util.h"

+#include "mediapipe/framework/formats/image_frame.h"
+#include "mediapipe/gpu/gpu_buffer.h"
+
 #include "json.hpp"

 using json = nlohmann::json;
@@ -43,7 +46,10 @@ public:
   {
     cc->Inputs().Get("LANDMARKS", 0).SetAny();
     cc->Inputs().Get("NORM_RECTS", 0).SetAny();
-    cc->SetInputStreamHandler("ImmediateInputStreamHandler");
+    cc->Inputs().Get("FACE_LANDMARKS", 0).SetAny();
+    cc->Inputs().Get("HAND_DETECTIONS", 0).SetAny();
+    cc->Inputs().Get("IMAGE", 0).SetAny();
+    // cc->SetInputStreamHandler("ImmediateInputStreamHandler");
     return ::mediapipe::OkStatus();
   }

@@ -62,48 +68,112 @@ public:
     {
       const auto &landmarks =
          cc->Inputs().Tag("LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+      PublishJson("HandLandmarks", ConvertLandmarkListsToJson(landmarks));
+    }
+
+    if (!cc->Inputs().Tag("FACE_LANDMARKS").IsEmpty())
+    {
+      const auto &landmark_lists =
+         cc->Inputs().Tag("FACE_LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+
+      PublishJson("FaceLandmarks", ConvertLandmarkListsToJson(landmark_lists));
     }

     if (!cc->Inputs().Tag("NORM_RECTS").IsEmpty())
     {
       const auto &norm_rects =
          cc->Inputs().Tag("NORM_RECTS").Get<std::vector<NormalizedRect>>();
+      const auto &detections =
+         cc->Inputs().Tag("HAND_DETECTIONS").Get<std::vector<Detection>>();
+      const auto &image_frame =
+         cc->Inputs().Tag("IMAGE").Get<mediapipe::GpuBuffer>();
+      const auto &landmark_lists =
+         cc->Inputs().Tag("LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+      const auto &landmark_lists2 = ConvertLandmarkListsToJson(landmark_lists);
+
+      assert(norm_rects.size() == detections.size());
+      if (norm_rects.size() != landmark_lists2.size()) {
+        LOG(INFO) << "BUG";
+      }
       if (norm_rects.size() > 0)
       {
         json data = json({});
         data["hands"] = json::array();

-        for (const auto &norm_rect : norm_rects)
+        for (int i = 0; i < norm_rects.size(); i++)
         {
-          if (norm_rect.width() == 0.0 && norm_rect.height() == 0.0 && norm_rect.x_center() == 0.0 && norm_rect.y_center() == 0.0 && norm_rect.rect_id() == 0) {
+          const auto &norm_rect = norm_rects[i];
+          const auto &detection = detections[i];
+
+          if (norm_rect.width() == 0.0 && norm_rect.height() == 0.0 && norm_rect.x_center() == 0.0 && norm_rect.y_center() == 0.0 && norm_rect.rect_id() == 0)
+          {
             continue;
           }
+          // const auto &landmarks = landmark_lists2[i]["landmarks"];
+          // LOG(INFO) << "Inside" << landmark_lists2;
           json empty_object_explicit = json::object();
           empty_object_explicit["width"] = norm_rect.width();
           empty_object_explicit["height"] = norm_rect.height();
           empty_object_explicit["x_center"] = norm_rect.x_center();
           empty_object_explicit["y_center"] = norm_rect.y_center();
           empty_object_explicit["rect_id"] = norm_rect.rect_id();
+          empty_object_explicit["image_width"] = image_frame.width();
+          empty_object_explicit["image_height"] = image_frame.height();
+          // LOG(INFO) << landmarks;
+          if (landmark_lists2.size() >= (i + 1)) {
+            // LOG(INFO) << (landmark_lists2.size() - 1);
+            // LOG(INFO) << i;
+            const auto &landmarks = landmark_lists2[i]["landmarks"];
+            empty_object_explicit["landmarks"] = landmarks;
+          }
+          // empty_object_explicit["track_id"] = norm_rect.id();
           data["hands"].push_back(empty_object_explicit);
         }
-        std::string s = data.dump();
-        std::string topic = "Detection";
-
-        zmq::message_t message(topic.size());
-        memcpy(message.data(), topic.c_str(), topic.size());
-        socket.send(message, ZMQ_SNDMORE);
-
-        zmq::message_t message2(s.size());
-        memcpy(message2.data(), s.c_str(), s.size());
-        socket.send(message2);
-
-        std::cout << "Publishing" << s << std::endl;
+        data["timestamp"] = cc->InputTimestamp().Microseconds();
+        PublishJson("Detection", data);
       }
     }

     return ::mediapipe::OkStatus();
   }

+  void PublishJson(const std::string &topic, const json &json_data)
+  {
+    std::string s = json_data.dump();
+    // std::string topic = topic;
+
+    zmq::message_t message(topic.size());
+    memcpy(message.data(), topic.c_str(), topic.size());
+    socket.send(message, ZMQ_SNDMORE);
+
+    zmq::message_t message2(s.size());
+    memcpy(message2.data(), s.c_str(), s.size());
+    socket.send(message2);
+
+    // std::cout << "Publishing" << s << std::endl;
+  }
+
+  json ConvertLandmarkListsToJson(const std::vector<NormalizedLandmarkList> &landmark_lists)
+  {
+    json landmark_list_json = json::array();
+    for (const auto &landmark_list : landmark_lists)
+    {
+      json data = json({});
+      data["landmarks"] = json::array();
+      for (int i = 0; i < landmark_list.landmark_size(); ++i)
+      {
+        const NormalizedLandmark &landmark = landmark_list.landmark(i);
+        json landmark_json = json::array();
+        landmark_json.push_back(landmark.x());
+        landmark_json.push_back(landmark.y());
+        landmark_json.push_back(landmark.z());
+        data["landmarks"].push_back(landmark_json);
+      }
+      landmark_list_json.push_back(data);
+    }
+    return landmark_list_json;
+  }
+
 private:
   zmq::context_t context{1};
   zmq::socket_t socket{context, ZMQ_PUB};

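For context, PublishJson sends each update as a two-frame ZeroMQ message: a topic frame ("HandLandmarks", "FaceLandmarks", or "Detection") followed by a JSON frame, e.g. {"hands": [{"width": ..., "height": ..., "x_center": ..., "y_center": ..., "rect_id": ..., "image_width": ..., "image_height": ..., "landmarks": [[x, y, z], ...]}], "timestamp": ...}. Below is a minimal subscriber sketch using the same classic cppzmq API as the calculator; the endpoint tcp://127.0.0.1:5556 is an assumption, since the PUB socket's bind address is not part of this diff.

// zmq_listener_sketch.cc -- hypothetical consumer, not part of the commit.
#include <iostream>
#include <zmq.hpp>
#include "json.hpp"

int main() {
  zmq::context_t context{1};
  zmq::socket_t socket{context, ZMQ_SUB};
  socket.connect("tcp://127.0.0.1:5556");            // assumed endpoint
  socket.setsockopt(ZMQ_SUBSCRIBE, "Detection", 9);  // match the topic frame

  while (true) {
    zmq::message_t topic;
    zmq::message_t payload;
    socket.recv(&topic);    // frame 1: topic string
    socket.recv(&payload);  // frame 2: JSON document
    const char* begin = static_cast<const char*>(payload.data());
    auto data = nlohmann::json::parse(begin, begin + payload.size());
    for (const auto& hand : data["hands"]) {
      std::cout << "hand " << hand["rect_id"] << " has "
                << hand["landmarks"].size() << " keypoints" << std::endl;
    }
  }
}
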
@@ -72,5 +72,7 @@ cc_library(
         "//mediapipe/gpu:gl_calculator_helper",
         "//mediapipe/gpu:gpu_buffer",
         "//mediapipe/gpu:gpu_shared_data_internal",
+        "//mediapipe/framework/formats:video_stream_header",
+        "//mediapipe/calculators/core:packet_cloner_calculator"
     ],
 )

@@ -38,8 +38,10 @@ cc_binary(
     deps = [
         "//mediapipe/examples/desktop:demo_run_graph_main_gpu",
         "//mediapipe/calculators/ipc:zmq_calculator",
+        "//mediapipe/calculators/image:sobel_edges_calculator",
         "//mediapipe/graphs/hand_tracking:multi_hand_mobile_calculators",
         "//mediapipe/graphs/face_mesh:desktop_live_gpu_calculators",
+        "//mediapipe/calculators/video:opencv_video_encoder_calculator",
     ],
 )

@@ -0,0 +1,195 @@
+# MediaPipe graph that performs multi-hand tracking with TensorFlow Lite on GPU.
+# Used in the examples in
+# mediapipe/examples/android/src/java/com/mediapipe/apps/multihandtrackinggpu.
+
+# Images coming into and out of the graph.
+input_stream: "input_video"
+# input_stream: "input_video_header"
+output_stream: "output_video"
+
+# Collection of detected/processed faces, each represented as a list of
+# landmarks. (std::vector<NormalizedLandmarkList>)
+output_stream: "multi_face_landmarks"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:multi_hand_rects"
+  # input_stream: "FINISHED:output_video_1"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# node {
+#   calculator: "FlowLimiterCalculator"
+#   input_stream: "input_video_header"
+#   input_stream: "FINISHED:multi_hand_rects"
+#   input_stream_info: {
+#     tag_index: "FINISHED"
+#     back_edge: true
+#   }
+#   output_stream: "throttled_input_video_header"
+# }
+
+# Subgraph that detects hands (see multi_hand_detection_gpu.pbtxt).
+node {
+  calculator: "MultiHandDetectionSubgraph"
+  input_stream: "throttled_input_video"
+  output_stream: "DETECTIONS:multi_palm_detections"
+  output_stream: "NORM_RECTS:multi_palm_rects"
+}
+
+# Subgraph that localizes hand landmarks for multiple hands (see
+# multi_hand_landmark.pbtxt).
+node {
+  calculator: "MultiHandLandmarkSubgraph"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "NORM_RECTS:multi_hand_rects"
+  output_stream: "LANDMARKS:multi_hand_landmarks"
+  output_stream: "NORM_RECTS:multi_hand_rects_from_landmarks"
+}
+
+# Caches a hand rectangle fed back from MultiHandLandmarkSubgraph, and upon the
+# arrival of the next input image sends out the cached rectangle with the
+# timestamp replaced by that of the input image, essentially generating a packet
+# that carries the previous hand rectangle. Note that upon the arrival of the
+# very first input image, an empty packet is sent out to jump start the
+# feedback loop.
+node {
+  calculator: "PreviousLoopbackCalculator"
+  input_stream: "MAIN:throttled_input_video"
+  input_stream: "LOOP:multi_hand_rects_from_landmarks"
+  input_stream_info: {
+    tag_index: "LOOP"
+    back_edge: true
+  }
+  output_stream: "PREV_LOOP:prev_multi_hand_rects_from_landmarks"
+}
+
+# Performs association between NormalizedRect vector elements from previous
+# frame and those from the current frame if MultiHandDetectionSubgraph runs.
+# This calculator ensures that the output multi_hand_rects vector doesn't
+# contain overlapping regions based on the specified min_similarity_threshold.
+node {
+  calculator: "AssociationNormRectCalculator"
+  input_stream: "prev_multi_hand_rects_from_landmarks"
+  input_stream: "multi_palm_rects"
+  output_stream: "multi_hand_rects"
+  node_options: {
+    [type.googleapis.com/mediapipe.AssociationCalculatorOptions] {
+      min_similarity_threshold: 0.5
+    }
+  }
+}
+
+# Defines side packets for further use in the graph.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:num_faces"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { int_value: 1 }
+    }
+  }
+}
+
+# Subgraph that detects faces and corresponding landmarks.
+node {
+  calculator: "FaceLandmarkFrontGpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_side_packet: "NUM_FACES:num_faces"
+  output_stream: "LANDMARKS:multi_face_landmarks"
+  output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
+  output_stream: "DETECTIONS:face_detections"
+  output_stream: "ROIS_FROM_DETECTIONS:face_rects_from_detections"
+}
+
+node {
+  calculator: "PacketClonerCalculator"
+  input_stream: "multi_hand_landmarks"
+  input_stream: "multi_palm_detections"
+  input_stream: "multi_hand_rects"
+  input_stream: "throttled_input_video"
+  output_stream: "cloned_multi_hand_landmarks"
+  output_stream: "cloned_multi_palm_detections"
+  output_stream: "cloned_multi_hand_rects"
+}
+
+node {
+  calculator: "ZmqCalculator"
+  input_stream: "FACE_LANDMARKS:multi_face_landmarks"
+  input_stream: "LANDMARKS:cloned_multi_hand_landmarks"
+  input_stream: "HAND_DETECTIONS:cloned_multi_palm_detections"
+  input_stream: "NORM_RECTS:cloned_multi_hand_rects"
+  input_stream: "IMAGE:throttled_input_video"
+}
+
+# Subgraph that renders face-landmark annotation onto the input image.
+node {
+  calculator: "FaceRendererGpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "LANDMARKS:multi_face_landmarks"
+  input_stream: "NORM_RECTS:face_rects_from_landmarks"
+  input_stream: "DETECTIONS:face_detections"
+  output_stream: "IMAGE:output_video_1"
+}
+
+# Subgraph that renders annotations and overlays them on top of the input
+# images (see multi_hand_renderer_gpu.pbtxt).
+node {
+  calculator: "MultiHandRendererSubgraph"
+  input_stream: "IMAGE:output_video_1"
+  input_stream: "DETECTIONS:multi_palm_detections"
+  input_stream: "LANDMARKS:multi_hand_landmarks"
+  input_stream: "NORM_RECTS:0:multi_palm_rects"
+  # input_stream: "NORM_RECTS:1:multi_hand_rects"
+  output_stream: "IMAGE:output_video"
+}
+
+# Defines side packets for further use in the graph.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:output_video_path"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { string_value: "hand_tracking.mp4" }
+    }
+  }
+}
+
+node: {
+  calculator: "GpuBufferToImageFrameCalculator"
+  input_stream: "output_video"
+  output_stream: "output_video_cpu"
+}
+
+# Encodes the annotated images into a video file, adopting properties specified
+# in the input video header, e.g., video framerate.
+# node {
+#   calculator: "OpenCvVideoEncoderCalculator"
+#   input_stream: "VIDEO:output_video_cpu"
+#   input_stream: "VIDEO_PRESTREAM:throttled_input_video_header"
+#   input_side_packet: "OUTPUT_FILE_PATH:output_video_path"
+#   node_options: {
+#     [type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: {
+#       codec: "avc1"
+#       video_format: "mp4"
+#     }
+#   }
+# }

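One detail worth calling out in the graph above: multi_hand_landmarks, multi_palm_detections, and multi_hand_rects only produce packets on frames where hands are found, while ZmqCalculator wants time-aligned inputs on every frame. The PacketClonerCalculator node holds the most recent packet from each of its leading streams and re-emits them at the timestamp of each packet on its last (tick) stream, here throttled_input_video. A rough plain-C++ illustration of that idea, not MediaPipe code:

// Sketch of the cloning idea: remember the latest value of a slow stream
// and replay it whenever the fast "tick" stream fires.
#include <optional>

template <typename T>
class LatestValueCloner {
 public:
  void OnSlowPacket(const T& value) { latest_ = value; }  // slow stream update
  std::optional<T> OnTick() const { return latest_; }     // called per frame
 private:
  std::optional<T> latest_;  // empty until the slow stream first fires
};
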
@@ -60,7 +60,8 @@ node {
   output_stream: "TENSORS:output_tensors"
   node_options: {
     [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
-      model_path: "mediapipe/models/hand_landmark.tflite"
+      # model_path: "mediapipe/models/hand_landmark.tflite"
+      model_path: "mediapipe/models/hand_landmark_3d.tflite"
       use_gpu: true
     }
   }
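The model swap above is the "3D" in this commit: hand_landmark_3d.tflite populates each landmark's z, so the [x, y, z] triples published by ZmqCalculator become 3D keypoints. x and y are normalized by image width and height; z is a relative depth on roughly the same scale as x, with smaller values closer to the camera (my reading of the MediaPipe hand model's conventions; verify against the model card). A hypothetical consumer-side helper that scales a triple into pixels using the image_width/image_height fields the calculator now publishes:

// Hypothetical helper, not part of the commit: convert one published
// [x, y, z] landmark triple to pixel coordinates plus relative depth.
#include "json.hpp"

struct Point3 {
  float x;
  float y;
  float z;
};

inline Point3 ToPixels(const nlohmann::json& lm,
                       int image_width, int image_height) {
  Point3 p;
  p.x = lm[0].get<float>() * image_width;   // normalized x -> pixels
  p.y = lm[1].get<float>() * image_height;  // normalized y -> pixels
  p.z = lm[2].get<float>() * image_width;   // relative depth, scaled like x
  return p;
}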