3D keypoints detection.
parent c5adb57f30
commit d9f11e7d4a
@@ -27,6 +27,9 @@
 #include "mediapipe/framework/port/status.h"
 #include "mediapipe/util/header_util.h"

+#include "mediapipe/framework/formats/image_frame.h"
+#include "mediapipe/gpu/gpu_buffer.h"
+
 #include "json.hpp"

 using json = nlohmann::json;
@@ -43,7 +46,10 @@ public:
   {
     cc->Inputs().Get("LANDMARKS", 0).SetAny();
     cc->Inputs().Get("NORM_RECTS", 0).SetAny();
-    cc->SetInputStreamHandler("ImmediateInputStreamHandler");
+    cc->Inputs().Get("FACE_LANDMARKS", 0).SetAny();
+    cc->Inputs().Get("HAND_DETECTIONS", 0).SetAny();
+    cc->Inputs().Get("IMAGE", 0).SetAny();
+    // cc->SetInputStreamHandler("ImmediateInputStreamHandler");
     return ::mediapipe::OkStatus();
   }

@@ -62,48 +68,112 @@ public:
     {
       const auto &landmarks =
          cc->Inputs().Tag("LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+      PublishJson("HandLandmarks", ConvertLandmarkListsToJson(landmarks));
+    }
+
+    if (!cc->Inputs().Tag("FACE_LANDMARKS").IsEmpty())
+    {
+      const auto &landmark_lists =
+         cc->Inputs().Tag("FACE_LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+
+      PublishJson("FaceLandmarks", ConvertLandmarkListsToJson(landmark_lists));
     }

     if (!cc->Inputs().Tag("NORM_RECTS").IsEmpty())
     {
       const auto &norm_rects =
          cc->Inputs().Tag("NORM_RECTS").Get<std::vector<NormalizedRect>>();
+      const auto &detections =
+         cc->Inputs().Tag("HAND_DETECTIONS").Get<std::vector<Detection>>();
+      const auto &image_frame =
+         cc->Inputs().Tag("IMAGE").Get<mediapipe::GpuBuffer>();
+      const auto &landmark_lists =
+         cc->Inputs().Tag("LANDMARKS").Get<std::vector<NormalizedLandmarkList>>();
+      const auto &landmark_lists2 = ConvertLandmarkListsToJson(landmark_lists);
+
+      assert(norm_rects.size() == detections.size());
+      if (norm_rects.size() != landmark_lists2.size()) {
+        LOG(INFO) << "BUG";
+      }
       if (norm_rects.size() > 0)
       {
         json data = json({});
         data["hands"] = json::array();

-        for (const auto &norm_rect : norm_rects)
+        for (int i = 0; i < norm_rects.size(); i++)
         {
-          if (norm_rect.width() == 0.0 && norm_rect.height() == 0.0 && norm_rect.x_center() == 0.0 && norm_rect.y_center() == 0.0 && norm_rect.rect_id() == 0) {
+          const auto &norm_rect = norm_rects[i];
+          const auto &detection = detections[i];
+
+          if (norm_rect.width() == 0.0 && norm_rect.height() == 0.0 && norm_rect.x_center() == 0.0 && norm_rect.y_center() == 0.0 && norm_rect.rect_id() == 0)
+          {
             continue;
           }
+          // const auto &landmarks = landmark_lists2[i]["landmarks"];
+          // LOG(INFO) << "Inside" << landmark_lists2;
           json empty_object_explicit = json::object();
           empty_object_explicit["width"] = norm_rect.width();
           empty_object_explicit["height"] = norm_rect.height();
           empty_object_explicit["x_center"] = norm_rect.x_center();
           empty_object_explicit["y_center"] = norm_rect.y_center();
           empty_object_explicit["rect_id"] = norm_rect.rect_id();
+          empty_object_explicit["image_width"] = image_frame.width();
+          empty_object_explicit["image_height"] = image_frame.height();
+          // LOG(INFO) << landmarks;
+          if (landmark_lists2.size() >= (i + 1)) {
+            // LOG(INFO) << (landmark_lists2.size() - 1);
+            // LOG(INFO) << i;
+            const auto &landmarks = landmark_lists2[i]["landmarks"];
+            empty_object_explicit["landmarks"] = landmarks;
+          }
+          // empty_object_explicit["track_id"] = norm_rect.id();
           data["hands"].push_back(empty_object_explicit);
         }
-        std::string s = data.dump();
-        std::string topic = "Detection";
-
-        zmq::message_t message(topic.size());
-        memcpy(message.data(), topic.c_str(), topic.size());
-        socket.send(message, ZMQ_SNDMORE);
-
-        zmq::message_t message2(s.size());
-        memcpy(message2.data(), s.c_str(), s.size());
-        socket.send(message2);
-
-        std::cout << "Publishing" << s << std::endl;
+        data["timestamp"] = cc->InputTimestamp().Microseconds();
+        PublishJson("Detection", data);
       }
     }

     return ::mediapipe::OkStatus();
   }

+  void PublishJson(const std::string &topic, const json &json_data)
+  {
+    std::string s = json_data.dump();
+    // std::string topic = topic;
+
+    zmq::message_t message(topic.size());
+    memcpy(message.data(), topic.c_str(), topic.size());
+    socket.send(message, ZMQ_SNDMORE);
+
+    zmq::message_t message2(s.size());
+    memcpy(message2.data(), s.c_str(), s.size());
+    socket.send(message2);
+
+    // std::cout << "Publishing" << s << std::endl;
+  }
+
+  json ConvertLandmarkListsToJson(const std::vector<NormalizedLandmarkList> &landmark_lists)
+  {
+    json landmark_list_json = json::array();
+    for (const auto &landmark_list : landmark_lists)
+    {
+      json data = json({});
+      data["landmarks"] = json::array();
+      for (int i = 0; i < landmark_list.landmark_size(); ++i)
+      {
+        const NormalizedLandmark &landmark = landmark_list.landmark(i);
+        json landmark_json = json::array();
+        landmark_json.push_back(landmark.x());
+        landmark_json.push_back(landmark.y());
+        landmark_json.push_back(landmark.z());
+        data["landmarks"].push_back(landmark_json);
+      }
+      landmark_list_json.push_back(data);
+    }
+    return landmark_list_json;
+  }
+
 private:
   zmq::context_t context{1};
   zmq::socket_t socket{context, ZMQ_PUB};

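For context, PublishJson sends each update as a two-frame ZeroMQ message: a topic frame ("HandLandmarks", "FaceLandmarks", or "Detection") followed by a JSON frame, e.g. {"hands": [{"width": ..., "height": ..., "x_center": ..., "y_center": ..., "rect_id": ..., "image_width": ..., "image_height": ..., "landmarks": [[x, y, z], ...]}], "timestamp": ...}. Below is a minimal subscriber sketch using the same classic cppzmq API as the calculator; the endpoint tcp://127.0.0.1:5556 is an assumption, since the PUB socket's bind address is not part of this diff.

// zmq_listener_sketch.cc -- hypothetical consumer, not part of the commit.
#include <iostream>
#include <zmq.hpp>
#include "json.hpp"

int main() {
  zmq::context_t context{1};
  zmq::socket_t socket{context, ZMQ_SUB};
  socket.connect("tcp://127.0.0.1:5556");            // assumed endpoint
  socket.setsockopt(ZMQ_SUBSCRIBE, "Detection", 9);  // match the topic frame

  while (true) {
    zmq::message_t topic;
    zmq::message_t payload;
    socket.recv(&topic);    // frame 1: topic string
    socket.recv(&payload);  // frame 2: JSON document
    const char* begin = static_cast<const char*>(payload.data());
    auto data = nlohmann::json::parse(begin, begin + payload.size());
    for (const auto& hand : data["hands"]) {
      std::cout << "hand " << hand["rect_id"] << " has "
                << hand["landmarks"].size() << " keypoints" << std::endl;
    }
  }
}
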
@@ -72,5 +72,7 @@ cc_library(
         "//mediapipe/gpu:gl_calculator_helper",
         "//mediapipe/gpu:gpu_buffer",
         "//mediapipe/gpu:gpu_shared_data_internal",
+        "//mediapipe/framework/formats:video_stream_header",
+        "//mediapipe/calculators/core:packet_cloner_calculator"
     ],
 )

@@ -38,8 +38,10 @@ cc_binary(
     deps = [
         "//mediapipe/examples/desktop:demo_run_graph_main_gpu",
         "//mediapipe/calculators/ipc:zmq_calculator",
+        "//mediapipe/calculators/image:sobel_edges_calculator",
         "//mediapipe/graphs/hand_tracking:multi_hand_mobile_calculators",
         "//mediapipe/graphs/face_mesh:desktop_live_gpu_calculators",
+        "//mediapipe/calculators/video:opencv_video_encoder_calculator",
     ],
 )

@@ -0,0 +1,195 @@
+# MediaPipe graph that performs multi-hand tracking with TensorFlow Lite on GPU.
+# Used in the examples in
+# mediapipe/examples/android/src/java/com/mediapipe/apps/multihandtrackinggpu.
+
+# Images coming into and out of the graph.
+input_stream: "input_video"
+# input_stream: "input_video_header"
+output_stream: "output_video"
+
+# Collection of detected/processed faces, each represented as a list of
+# landmarks. (std::vector<NormalizedLandmarkList>)
+output_stream: "multi_face_landmarks"
+
+# Throttles the images flowing downstream for flow control. It passes through
+# the very first incoming image unaltered, and waits for downstream nodes
+# (calculators and subgraphs) in the graph to finish their tasks before it
+# passes through another image. All images that come in while waiting are
+# dropped, limiting the number of in-flight images in most part of the graph to
+# 1. This prevents the downstream nodes from queuing up incoming images and data
+# excessively, which leads to increased latency and memory usage, unwanted in
+# real-time mobile applications. It also eliminates unnecessary computation,
+# e.g., the output produced by a node may get dropped downstream if the
+# subsequent nodes are still busy processing previous inputs.
+node {
+  calculator: "FlowLimiterCalculator"
+  input_stream: "input_video"
+  input_stream: "FINISHED:multi_hand_rects"
+  # input_stream: "FINISHED:output_video_1"
+  input_stream_info: {
+    tag_index: "FINISHED"
+    back_edge: true
+  }
+  output_stream: "throttled_input_video"
+}
+
+# node {
+#   calculator: "FlowLimiterCalculator"
+#   input_stream: "input_video_header"
+#   input_stream: "FINISHED:multi_hand_rects"
+#   input_stream_info: {
+#     tag_index: "FINISHED"
+#     back_edge: true
+#   }
+#   output_stream: "throttled_input_video_header"
+# }
+
+# Subgraph that detects hands (see multi_hand_detection_gpu.pbtxt).
+node {
+  calculator: "MultiHandDetectionSubgraph"
+  input_stream: "throttled_input_video"
+  output_stream: "DETECTIONS:multi_palm_detections"
+  output_stream: "NORM_RECTS:multi_palm_rects"
+}
+
+# Subgraph that localizes hand landmarks for multiple hands (see
+# multi_hand_landmark.pbtxt).
+node {
+  calculator: "MultiHandLandmarkSubgraph"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "NORM_RECTS:multi_hand_rects"
+  output_stream: "LANDMARKS:multi_hand_landmarks"
+  output_stream: "NORM_RECTS:multi_hand_rects_from_landmarks"
+}
+
+# Caches a hand rectangle fed back from MultiHandLandmarkSubgraph, and upon the
+# arrival of the next input image sends out the cached rectangle with the
+# timestamp replaced by that of the input image, essentially generating a packet
+# that carries the previous hand rectangle. Note that upon the arrival of the
+# very first input image, an empty packet is sent out to jump start the
+# feedback loop.
+node {
+  calculator: "PreviousLoopbackCalculator"
+  input_stream: "MAIN:throttled_input_video"
+  input_stream: "LOOP:multi_hand_rects_from_landmarks"
+  input_stream_info: {
+    tag_index: "LOOP"
+    back_edge: true
+  }
+  output_stream: "PREV_LOOP:prev_multi_hand_rects_from_landmarks"
+}
+
+# Performs association between NormalizedRect vector elements from previous
+# frame and those from the current frame if MultiHandDetectionSubgraph runs.
+# This calculator ensures that the output multi_hand_rects vector doesn't
+# contain overlapping regions based on the specified min_similarity_threshold.
+node {
+  calculator: "AssociationNormRectCalculator"
+  input_stream: "prev_multi_hand_rects_from_landmarks"
+  input_stream: "multi_palm_rects"
+  output_stream: "multi_hand_rects"
+  node_options: {
+    [type.googleapis.com/mediapipe.AssociationCalculatorOptions] {
+      min_similarity_threshold: 0.5
+    }
+  }
+}
+
+# Defines side packets for further use in the graph.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:num_faces"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { int_value: 1 }
+    }
+  }
+}
+
+# Subgraph that detects faces and corresponding landmarks.
+node {
+  calculator: "FaceLandmarkFrontGpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_side_packet: "NUM_FACES:num_faces"
+  output_stream: "LANDMARKS:multi_face_landmarks"
+  output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
+  output_stream: "DETECTIONS:face_detections"
+  output_stream: "ROIS_FROM_DETECTIONS:face_rects_from_detections"
+}
+
+node {
+  calculator: "PacketClonerCalculator"
+  input_stream: "multi_hand_landmarks"
+  input_stream: "multi_palm_detections"
+  input_stream: "multi_hand_rects"
+  input_stream: "throttled_input_video"
+  output_stream: "cloned_multi_hand_landmarks"
+  output_stream: "cloned_multi_palm_detections"
+  output_stream: "cloned_multi_hand_rects"
+}
+
+node {
+  calculator: "ZmqCalculator"
+  input_stream: "FACE_LANDMARKS:multi_face_landmarks"
+  input_stream: "LANDMARKS:cloned_multi_hand_landmarks"
+  input_stream: "HAND_DETECTIONS:cloned_multi_palm_detections"
+  input_stream: "NORM_RECTS:cloned_multi_hand_rects"
+  input_stream: "IMAGE:throttled_input_video"
+}
+
+# Subgraph that renders face-landmark annotation onto the input image.
+node {
+  calculator: "FaceRendererGpu"
+  input_stream: "IMAGE:throttled_input_video"
+  input_stream: "LANDMARKS:multi_face_landmarks"
+  input_stream: "NORM_RECTS:face_rects_from_landmarks"
+  input_stream: "DETECTIONS:face_detections"
+  output_stream: "IMAGE:output_video_1"
+}
+
+# Subgraph that renders annotations and overlays them on top of the input
+# images (see multi_hand_renderer_gpu.pbtxt).
+node {
+  calculator: "MultiHandRendererSubgraph"
+  input_stream: "IMAGE:output_video_1"
+  input_stream: "DETECTIONS:multi_palm_detections"
+  input_stream: "LANDMARKS:multi_hand_landmarks"
+  input_stream: "NORM_RECTS:0:multi_palm_rects"
+  # input_stream: "NORM_RECTS:1:multi_hand_rects"
+  output_stream: "IMAGE:output_video"
+}
+
+# Defines side packets for further use in the graph.
+node {
+  calculator: "ConstantSidePacketCalculator"
+  output_side_packet: "PACKET:output_video_path"
+  node_options: {
+    [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
+      packet { string_value: "hand_tracking.mp4" }
+    }
+  }
+}
+
+node: {
+  calculator: "GpuBufferToImageFrameCalculator"
+  input_stream: "output_video"
+  output_stream: "output_video_cpu"
+}
+
+# Encodes the annotated images into a video file, adopting properties specified
+# in the input video header, e.g., video framerate.
+# node {
+#   calculator: "OpenCvVideoEncoderCalculator"
+#   input_stream: "VIDEO:output_video_cpu"
+#   input_stream: "VIDEO_PRESTREAM:throttled_input_video_header"
+#   input_side_packet: "OUTPUT_FILE_PATH:output_video_path"
+#   node_options: {
+#     [type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: {
+#       codec: "avc1"
+#       video_format: "mp4"
+#     }
+#   }
+# }

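One detail worth calling out in the graph above: multi_hand_landmarks, multi_palm_detections, and multi_hand_rects only produce packets on frames where hands are found, while ZmqCalculator wants time-aligned inputs on every frame. The PacketClonerCalculator node holds the most recent packet from each of its leading streams and re-emits them at the timestamp of each packet on its last (tick) stream, here throttled_input_video. A rough plain-C++ illustration of that idea, not MediaPipe code:

// Sketch of the cloning idea: remember the latest value of a slow stream
// and replay it whenever the fast "tick" stream fires.
#include <optional>

template <typename T>
class LatestValueCloner {
 public:
  void OnSlowPacket(const T& value) { latest_ = value; }  // slow stream update
  std::optional<T> OnTick() const { return latest_; }     // called per frame
 private:
  std::optional<T> latest_;  // empty until the slow stream first fires
};
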
@@ -60,7 +60,8 @@ node {
   output_stream: "TENSORS:output_tensors"
   node_options: {
     [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
-      model_path: "mediapipe/models/hand_landmark.tflite"
+      # model_path: "mediapipe/models/hand_landmark.tflite"
+      model_path: "mediapipe/models/hand_landmark_3d.tflite"
       use_gpu: true
     }
   }
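The model swap above is the "3D" in this commit: hand_landmark_3d.tflite populates each landmark's z, so the [x, y, z] triples published by ZmqCalculator become 3D keypoints. x and y are normalized by image width and height; z is a relative depth on roughly the same scale as x, with smaller values closer to the camera (my reading of the MediaPipe hand model's conventions; verify against the model card). A hypothetical consumer-side helper that scales a triple into pixels using the image_width/image_height fields the calculator now publishes:

// Hypothetical helper, not part of the commit: convert one published
// [x, y, z] landmark triple to pixel coordinates plus relative depth.
#include "json.hpp"

struct Point3 {
  float x;
  float y;
  float z;
};

inline Point3 ToPixels(const nlohmann::json& lm,
                       int image_width, int image_height) {
  Point3 p;
  p.x = lm[0].get<float>() * image_width;   // normalized x -> pixels
  p.y = lm[1].get<float>() * image_height;  // normalized y -> pixels
  p.z = lm[2].get<float>() * image_width;   // relative depth, scaled like x
  return p;
}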