diff --git a/mediapipe/calculators/ipc/zmq_calculator.cc b/mediapipe/calculators/ipc/zmq_calculator.cc
index d146c4046..d3a7e73b9 100644
--- a/mediapipe/calculators/ipc/zmq_calculator.cc
+++ b/mediapipe/calculators/ipc/zmq_calculator.cc
@@ -27,6 +27,9 @@
 #include "mediapipe/framework/port/status.h"
 #include "mediapipe/util/header_util.h"
 
+#include "mediapipe/framework/formats/image_frame.h"
+#include "mediapipe/gpu/gpu_buffer.h"
+
 #include "json.hpp"
 
 using json = nlohmann::json;
@@ -43,7 +46,10 @@ public:
   {
     cc->Inputs().Get("LANDMARKS", 0).SetAny();
     cc->Inputs().Get("NORM_RECTS", 0).SetAny();
-    cc->SetInputStreamHandler("ImmediateInputStreamHandler");
+    cc->Inputs().Get("FACE_LANDMARKS", 0).SetAny();
+    cc->Inputs().Get("HAND_DETECTIONS", 0).SetAny();
+    cc->Inputs().Get("IMAGE", 0).SetAny();
+    // cc->SetInputStreamHandler("ImmediateInputStreamHandler");
     return ::mediapipe::OkStatus();
   }
 
@@ -62,48 +68,112 @@ public:
     {
       const auto &landmarks =
           cc->Inputs().Tag("LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+      PublishJson("HandLandmarks", ConvertLandmarkListsToJson(landmarks));
+    }
+
+    if (!cc->Inputs().Tag("FACE_LANDMARKS").IsEmpty())
+    {
+      const auto &landmark_lists =
+          cc->Inputs().Tag("FACE_LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+
+      PublishJson("FaceLandmarks", ConvertLandmarkListsToJson(landmark_lists));
     }
 
     if (!cc->Inputs().Tag("NORM_RECTS").IsEmpty())
     {
       const auto &norm_rects =
           cc->Inputs().Tag("NORM_RECTS").Get<std::vector<NormalizedRect>>();
+      const auto &detections =
+          cc->Inputs().Tag("HAND_DETECTIONS").Get<std::vector<Detection>>();
+      // The IMAGE stream carries the GPU frame; only its dimensions are read.
+      const auto &image_frame =
+          cc->Inputs().Tag("IMAGE").Get<GpuBuffer>();
+      const auto &landmark_lists =
+          cc->Inputs().Tag("LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+      const auto &landmarks_json = ConvertLandmarkListsToJson(landmark_lists);
+
+      assert(norm_rects.size() == detections.size());
+      if (norm_rects.size() != landmarks_json.size())
+      {
+        LOG(INFO) << "Mismatch between hand rects (" << norm_rects.size()
+                  << ") and landmark lists (" << landmarks_json.size() << ")";
+      }
 
       if (norm_rects.size() > 0)
       {
         json data = json({});
         data["hands"] = json::array();
-        for (const auto &norm_rect : norm_rects)
+        for (size_t i = 0; i < norm_rects.size(); ++i)
         {
-          if (norm_rect.width() == 0.0 && norm_rect.height() == 0.0 && norm_rect.x_center() == 0.0 && norm_rect.y_center() == 0.0 && norm_rect.rect_id() == 0) {
+          const auto &norm_rect = norm_rects[i];
+          const auto &detection = detections[i];
+
+          // Skip the all-zero placeholder rects emitted when no hand is present.
+          if (norm_rect.width() == 0.0 && norm_rect.height() == 0.0 &&
+              norm_rect.x_center() == 0.0 && norm_rect.y_center() == 0.0 &&
+              norm_rect.rect_id() == 0)
+          {
             continue;
           }
+
           json hand = json::object();
           hand["width"] = norm_rect.width();
           hand["height"] = norm_rect.height();
           hand["x_center"] = norm_rect.x_center();
           hand["y_center"] = norm_rect.y_center();
           hand["rect_id"] = norm_rect.rect_id();
+          hand["image_width"] = image_frame.width();
+          hand["image_height"] = image_frame.height();
+          // Attach this hand's landmarks when a matching landmark list exists.
+          if (i < landmarks_json.size())
+          {
+            hand["landmarks"] = landmarks_json[i]["landmarks"];
+          }
+          // hand["track_id"] = norm_rect.id();
           data["hands"].push_back(hand);
         }
-        std::string s = data.dump();
-        std::string topic = "Detection";
-
-        zmq::message_t message(topic.size());
-        memcpy(message.data(), topic.c_str(), topic.size());
-        socket.send(message, ZMQ_SNDMORE);
-
-        zmq::message_t message2(s.size());
-        memcpy(message2.data(), s.c_str(), s.size());
-        socket.send(message2);
-
-        std::cout << "Publishing" << s << std::endl;
+        data["timestamp"] = cc->InputTimestamp().Microseconds();
+        PublishJson("Detection", data);
       }
     }
 
     return ::mediapipe::OkStatus();
   }
 
+  // Publishes |json_data| on the PUB socket as a two-frame message:
+  // the topic string, followed by the serialized JSON payload.
+  void PublishJson(const std::string &topic, const json &json_data)
+  {
+    const std::string payload = json_data.dump();
+
+    zmq::message_t topic_message(topic.size());
+    memcpy(topic_message.data(), topic.c_str(), topic.size());
+    socket.send(topic_message, ZMQ_SNDMORE);
+
+    zmq::message_t payload_message(payload.size());
+    memcpy(payload_message.data(), payload.c_str(), payload.size());
+    socket.send(payload_message);
+  }
+
+  // Converts a vector of NormalizedLandmarkList protos into a JSON array of
+  // objects of the form {"landmarks": [[x, y, z], ...]}.
+  json ConvertLandmarkListsToJson(const std::vector<NormalizedLandmarkList> &landmark_lists)
+  {
+    json landmark_list_json = json::array();
+    for (const auto &landmark_list : landmark_lists)
+    {
+      json data = json({});
+      data["landmarks"] = json::array();
+      for (int i = 0; i < landmark_list.landmark_size(); ++i)
+      {
+        const NormalizedLandmark &landmark = landmark_list.landmark(i);
+        json landmark_json = json::array();
+        landmark_json.push_back(landmark.x());
+        landmark_json.push_back(landmark.y());
+        landmark_json.push_back(landmark.z());
+        data["landmarks"].push_back(landmark_json);
+      }
+      landmark_list_json.push_back(data);
+    }
+    return landmark_list_json;
+  }
+
 private:
   zmq::context_t context{1};
   zmq::socket_t socket{context, ZMQ_PUB};
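For reference, the calculator above publishes every result as a two-frame ZeroMQ message: a topic frame ("HandLandmarks", "FaceLandmarks", or "Detection") followed by a JSON payload. A minimal subscriber sketch is shown below. It assumes cppzmq and nlohmann/json are available; the endpoint string is a placeholder, since the address the calculator's PUB socket binds to is not shown in this hunk.

// Minimal subscriber sketch for the messages published by ZmqCalculator.
// Assumes cppzmq and nlohmann/json; the endpoint is a placeholder and must
// match whatever address the calculator's PUB socket is bound to.
#include <iostream>
#include <string>

#include <zmq.hpp>
#include "json.hpp"

int main() {
  zmq::context_t context{1};
  zmq::socket_t socket{context, ZMQ_SUB};
  socket.connect("tcp://127.0.0.1:5555");            // Placeholder endpoint.
  socket.setsockopt(ZMQ_SUBSCRIBE, "Detection", 9);  // Filter on one topic.

  while (true) {
    zmq::message_t topic_msg;
    zmq::message_t payload_msg;
    socket.recv(&topic_msg);    // Frame 1: topic string.
    socket.recv(&payload_msg);  // Frame 2: JSON payload.

    const std::string payload(static_cast<const char *>(payload_msg.data()),
                              payload_msg.size());
    const auto data = nlohmann::json::parse(payload);
    // "Detection" payloads contain data["hands"] (per-hand rect, image size,
    // and optional landmarks) plus data["timestamp"] in microseconds.
    std::cout << data.dump(2) << std::endl;
  }
}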
diff --git a/mediapipe/examples/desktop/BUILD b/mediapipe/examples/desktop/BUILD
index 0e0335157..768cbdbab 100644
--- a/mediapipe/examples/desktop/BUILD
+++ b/mediapipe/examples/desktop/BUILD
@@ -72,5 +72,7 @@ cc_library(
         "//mediapipe/gpu:gl_calculator_helper",
         "//mediapipe/gpu:gpu_buffer",
         "//mediapipe/gpu:gpu_shared_data_internal",
+        "//mediapipe/framework/formats:video_stream_header",
+        "//mediapipe/calculators/core:packet_cloner_calculator",
     ],
 )
diff --git a/mediapipe/examples/desktop/multi_hand_tracking/BUILD b/mediapipe/examples/desktop/multi_hand_tracking/BUILD
index a6076723c..06cc9ef3a 100644
--- a/mediapipe/examples/desktop/multi_hand_tracking/BUILD
+++ b/mediapipe/examples/desktop/multi_hand_tracking/BUILD
@@ -38,8 +38,10 @@ cc_binary(
     deps = [
         "//mediapipe/examples/desktop:demo_run_graph_main_gpu",
         "//mediapipe/calculators/ipc:zmq_calculator",
+        "//mediapipe/calculators/image:sobel_edges_calculator",
         "//mediapipe/graphs/hand_tracking:multi_hand_mobile_calculators",
         "//mediapipe/graphs/face_mesh:desktop_live_gpu_calculators",
+        "//mediapipe/calculators/video:opencv_video_encoder_calculator",
     ],
 )
diff --git a/mediapipe/graphs/hand_tracking/hand_face_detection_no_gating.pbtxt b/mediapipe/graphs/hand_tracking/hand_face_detection_no_gating.pbtxt
new file mode 100644
index 000000000..661ec1085
--- /dev/null
+++ b/mediapipe/graphs/hand_tracking/hand_face_detection_no_gating.pbtxt
@@ -0,0 +1,195 @@
+# MediaPipe graph that performs multi-hand tracking and face landmark detection
+# with TensorFlow Lite on GPU and publishes the results over ZeroMQ.
+# Adapted from the multi-hand tracking example used in
+# mediapipe/examples/android/src/java/com/mediapipe/apps/multihandtrackinggpu.
+
+# Images coming into and out of the graph.
+input_stream: "input_video"
+# input_stream: "input_video_header"
+output_stream: "output_video"
+
+# Collection of detected/processed faces, each represented as a list of
+# landmarks. (std::vector<NormalizedLandmarkList>)
+output_stream: "multi_face_landmarks"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most parts of the graph
+# to 1. This prevents the downstream nodes from queuing up incoming images and
+# data excessively, which leads to increased latency and memory usage, both
+# unwanted in real-time mobile applications. It also eliminates unnecessary
+# computation, e.g., the output produced by a node may get dropped downstream
+# if the subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:multi_hand_rects"
+  # input_stream: "FINISHED:output_video_1"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# Flow limiter for the (currently disabled) video header stream consumed by the
+# encoder at the end of this graph.
+# node {
+#   calculator: "FlowLimiterCalculator"
+#   input_stream: "input_video_header"
+#   input_stream: "FINISHED:multi_hand_rects"
+#   input_stream_info: {
+#     tag_index: "FINISHED"
+#     back_edge: true
+#   }
+#   output_stream: "throttled_input_video_header"
+# }
+
+# Subgraph that detects hands (see multi_hand_detection_gpu.pbtxt).
+node {
+  calculator: "MultiHandDetectionSubgraph"
+  input_stream: "throttled_input_video"
+  output_stream: "DETECTIONS:multi_palm_detections"
+  output_stream: "NORM_RECTS:multi_palm_rects"
+}
+
+# Subgraph that localizes hand landmarks for multiple hands (see
+# multi_hand_landmark.pbtxt).
+node {
+  calculator: "MultiHandLandmarkSubgraph"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "NORM_RECTS:multi_hand_rects"
+  output_stream: "LANDMARKS:multi_hand_landmarks"
+  output_stream: "NORM_RECTS:multi_hand_rects_from_landmarks"
+}
+
+# Caches a hand rectangle fed back from MultiHandLandmarkSubgraph, and upon the
+# arrival of the next input image sends out the cached rectangle with the
+# timestamp replaced by that of the input image, essentially generating a packet
+# that carries the previous hand rectangle. Note that upon the arrival of the
+# very first input image, an empty packet is sent out to jump-start the
+# feedback loop.
+node {
+  calculator: "PreviousLoopbackCalculator"
+  input_stream: "MAIN:throttled_input_video"
+  input_stream: "LOOP:multi_hand_rects_from_landmarks"
+  input_stream_info: {
+    tag_index: "LOOP"
+    back_edge: true
+  }
+  output_stream: "PREV_LOOP:prev_multi_hand_rects_from_landmarks"
+}
+
+# Performs association between NormalizedRect vector elements from the previous
+# frame and those from the current frame if MultiHandDetectionSubgraph runs.
+# This calculator ensures that the output multi_hand_rects vector doesn't
+# contain overlapping regions based on the specified min_similarity_threshold.
+node {
+  calculator: "AssociationNormRectCalculator"
+  input_stream: "prev_multi_hand_rects_from_landmarks"
+  input_stream: "multi_palm_rects"
+  output_stream: "multi_hand_rects"
+  node_options: {
+    [type.googleapis.com/mediapipe.AssociationCalculatorOptions] {
+      min_similarity_threshold: 0.5
+    }
+  }
+}
+
+# Defines the number of faces to detect as a side packet.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:num_faces"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { int_value: 1 }
+    }
+  }
+}
+
+# Subgraph that detects faces and corresponding landmarks.
+node {
+  calculator: "FaceLandmarkFrontGpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_side_packet: "NUM_FACES:num_faces"
+  output_stream: "LANDMARKS:multi_face_landmarks"
+  output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
+  output_stream: "DETECTIONS:face_detections"
+  output_stream: "ROIS_FROM_DETECTIONS:face_rects_from_detections"
+}
+
+# Clones the most recent hand landmarks, detections, and rectangles whenever a
+# new throttled input frame arrives, so that the ZmqCalculator below receives a
+# time-aligned packet on every stream for every frame.
+node {
+  calculator: "PacketClonerCalculator"
+  input_stream: "multi_hand_landmarks"
+  input_stream: "multi_palm_detections"
+  input_stream: "multi_hand_rects"
+  input_stream: "throttled_input_video"
+  output_stream: "cloned_multi_hand_landmarks"
+  output_stream: "cloned_multi_palm_detections"
+  output_stream: "cloned_multi_hand_rects"
+}
+
+# Publishes the face landmarks, hand landmarks, hand detections, and hand
+# rectangles as JSON over a ZeroMQ PUB socket (see ZmqCalculator).
+node {
+  calculator: "ZmqCalculator"
+  input_stream: "FACE_LANDMARKS:multi_face_landmarks"
+  input_stream: "LANDMARKS:cloned_multi_hand_landmarks"
+  input_stream: "HAND_DETECTIONS:cloned_multi_palm_detections"
+  input_stream: "NORM_RECTS:cloned_multi_hand_rects"
+  input_stream: "IMAGE:throttled_input_video"
+}
+
+# Subgraph that renders face-landmark annotation onto the input image.
+node {
+  calculator: "FaceRendererGpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "LANDMARKS:multi_face_landmarks"
+  input_stream: "NORM_RECTS:face_rects_from_landmarks"
+  input_stream: "DETECTIONS:face_detections"
+  output_stream: "IMAGE:output_video_1"
+}
+
+# Subgraph that renders annotations and overlays them on top of the input
+# images (see multi_hand_renderer_gpu.pbtxt).
+node {
+  calculator: "MultiHandRendererSubgraph"
+  input_stream: "IMAGE:output_video_1"
+  input_stream: "DETECTIONS:multi_palm_detections"
+  input_stream: "LANDMARKS:multi_hand_landmarks"
+  input_stream: "NORM_RECTS:0:multi_palm_rects"
+  # input_stream: "NORM_RECTS:1:multi_hand_rects"
+  output_stream: "IMAGE:output_video"
+}
+
+# Defines the output video path as a side packet for the (currently disabled)
+# encoder below.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:output_video_path"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { string_value: "hand_tracking.mp4" }
+    }
+  }
+}
+
+# Converts the annotated GPU frames to CPU ImageFrames for encoding.
+node: {
+  calculator: "GpuBufferToImageFrameCalculator"
+  input_stream: "output_video"
+  output_stream: "output_video_cpu"
+}
+
+# Encodes the annotated images into a video file, adopting properties specified
+# in the input video header, e.g., video framerate.
+# node {
+#   calculator: "OpenCvVideoEncoderCalculator"
+#   input_stream: "VIDEO:output_video_cpu"
+#   input_stream: "VIDEO_PRESTREAM:throttled_input_video_header"
+#   input_side_packet: "OUTPUT_FILE_PATH:output_video_path"
+#   node_options: {
+#     [type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: {
+#       codec: "avc1"
+#       video_format: "mp4"
+#     }
+#   }
+# }
diff --git a/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt b/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt
index a45e0e1e1..5724fba69 100644
--- a/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt
+++ b/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt
@@ -60,7 +60,8 @@ node {
   output_stream: "TENSORS:output_tensors"
   node_options: {
     [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
-      model_path: "mediapipe/models/hand_landmark.tflite"
+      # model_path: "mediapipe/models/hand_landmark.tflite"
+      model_path: "mediapipe/models/hand_landmark_3d.tflite"
       use_gpu: true
     }
   }
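One way to exercise the new graph with the desktop GPU demo runner is sketched below. It assumes the cc_binary in mediapipe/examples/desktop/multi_hand_tracking/BUILD keeps its upstream name multi_hand_tracking_gpu and that the usual Linux GPU build flags apply; adjust both as needed.

# Build the demo binary that links in ZmqCalculator (target name assumed).
bazel build -c opt --copt -DMESA_EGL_NO_X11_HEADERS \
    mediapipe/examples/desktop/multi_hand_tracking:multi_hand_tracking_gpu

# Run against the new graph; without --input_video_path the demo reads the webcam.
GLOG_logtostderr=1 \
bazel-bin/mediapipe/examples/desktop/multi_hand_tracking/multi_hand_tracking_gpu \
    --calculator_graph_config_file=mediapipe/graphs/hand_tracking/hand_face_detection_no_gating.pbtxt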