diff --git a/README.md b/README.md index b16793b99..1630df000 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ * [Hair Segmentation](mediapipe/docs/hair_segmentation_mobile_gpu.md) [[Web Demo]](https://viz.mediapipe.dev/runner/demos/hair_segmentation/hair_segmentation.html) * [Object Detection](mediapipe/docs/object_detection_mobile_gpu.md) * [Object Detection and Tracking](mediapipe/docs/object_tracking_mobile_gpu.md) +* [Objectron: 3D Object Detection and Tracking](mediapipe/docs/objectron_mobile_gpu.md) * [AutoFlip](mediapipe/docs/autoflip.md) ![face_detection](mediapipe/docs/images/mobile/face_detection_android_gpu_small.gif) @@ -43,6 +44,8 @@ A web-based visualizer is hosted on [viz.mediapipe.dev](https://viz.mediapipe.de * [YouTube Channel](https://www.youtube.com/channel/UCObqmpuSMx-usADtL_qdMAw) ## Publications +* [MediaPipe Objectron: Real-time 3D Object Detection on Mobile Devices](https://mediapipe.page.link/objectron-aiblog) +* [AutoFlip: An Open Source Framework for Intelligent Video Reframing](https://mediapipe.page.link/autoflip) * [Google Developer Blog: MediaPipe on the Web](https://mediapipe.page.link/webdevblog) * [Google Developer Blog: Object Detection and Tracking using MediaPipe](https://mediapipe.page.link/objecttrackingblog) * [On-Device, Real-Time Hand Tracking with MediaPipe](https://ai.googleblog.com/2019/08/on-device-real-time-hand-tracking-with.html) @@ -63,7 +66,7 @@ A web-based visualizer is hosted on [viz.mediapipe.dev](https://viz.mediapipe.de * [Discuss](https://groups.google.com/forum/#!forum/mediapipe) - General community discussion around MediaPipe ## Alpha Disclaimer -MediaPipe is currently in alpha for v0.6. We are still making breaking API changes and expect to get to stable API by v1.0. +MediaPipe is currently in alpha for v0.7. We are still making breaking API changes and expect to get to stable API by v1.0. ## Contributing We welcome contributions. Please follow these [guidelines](./CONTRIBUTING.md). 
diff --git a/mediapipe/calculators/image/image_cropping_calculator.cc b/mediapipe/calculators/image/image_cropping_calculator.cc
index a9c7ae657..79a3221ea 100644
--- a/mediapipe/calculators/image/image_cropping_calculator.cc
+++ b/mediapipe/calculators/image/image_cropping_calculator.cc
@@ -75,11 +75,28 @@ REGISTER_CALCULATOR(ImageCroppingCalculator);
   }
 #endif  // !MEDIAPIPE_DISABLE_GPU

-  RET_CHECK(cc->Inputs().HasTag(kRectTag) ^ cc->Inputs().HasTag(kNormRectTag) ^
-            (cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
-                 .has_norm_width() &&
-             cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
-                 .has_norm_height()));
+  int flags = 0;
+  if (cc->Inputs().HasTag(kRectTag)) {
+    ++flags;
+  }
+  if (cc->Inputs().HasTag(kWidthTag) && cc->Inputs().HasTag(kHeightTag)) {
+    ++flags;
+  }
+  if (cc->Inputs().HasTag(kNormRectTag)) {
+    ++flags;
+  }
+  if (cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
+          .has_norm_width() &&
+      cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
+          .has_norm_height()) {
+    ++flags;
+  }
+  if (cc->Options<mediapipe::ImageCroppingCalculatorOptions>().has_width() &&
+      cc->Options<mediapipe::ImageCroppingCalculatorOptions>().has_height()) {
+    ++flags;
+  }
+  RET_CHECK(flags == 1) << "Illegal combination of input streams/options.";
+
   if (cc->Inputs().HasTag(kRectTag)) {
     cc->Inputs().Tag(kRectTag).Set<Rect>();
   }
diff --git a/mediapipe/calculators/util/BUILD b/mediapipe/calculators/util/BUILD
index 6f7064e40..b5df9ff04 100644
--- a/mediapipe/calculators/util/BUILD
+++ b/mediapipe/calculators/util/BUILD
@@ -39,6 +39,15 @@ proto_library(
     ],
 )

+proto_library(
+    name = "timed_box_list_id_to_label_calculator_proto",
+    srcs = ["timed_box_list_id_to_label_calculator.proto"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//mediapipe/framework:calculator_proto",
+    ],
+)
+
 proto_library(
     name = "latency_proto",
     srcs = ["latency.proto"],
@@ -113,6 +122,18 @@ mediapipe_cc_proto_library(
     ],
 )

+mediapipe_cc_proto_library(
+    name = "timed_box_list_id_to_label_calculator_cc_proto",
+    srcs = ["timed_box_list_id_to_label_calculator.proto"],
+    cc_deps = [
+        "//mediapipe/framework:calculator_cc_proto",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":timed_box_list_id_to_label_calculator_proto",
+    ],
+)
+
 mediapipe_cc_proto_library(
     name = "latency_cc_proto",
     srcs = ["latency.proto"],
@@ -313,6 +334,34 @@ cc_library(
     alwayslink = 1,
 )

+cc_library(
+    name = "timed_box_list_id_to_label_calculator",
+    srcs = ["timed_box_list_id_to_label_calculator.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":timed_box_list_id_to_label_calculator_cc_proto",
+        "//mediapipe/framework/port:status",
+        "//mediapipe/framework:calculator_framework",
+        "//mediapipe/framework:packet",
+        "//mediapipe/util/tracking:box_tracker_cc_proto",
+        "//mediapipe/util:resource_util",
+    ] + select({
+        "//mediapipe:android": [
+            "//mediapipe/util/android/file/base",
+        ],
+        "//mediapipe:apple": [
+            "//mediapipe/util/android/file/base",
+        ],
+        "//mediapipe:macos": [
+            "//mediapipe/framework/port:file_helpers",
+        ],
+        "//conditions:default": [
+            "//mediapipe/framework/port:file_helpers",
+        ],
+    }),
+    alwayslink = 1,
+)
+
 cc_library(
     name = "non_max_suppression_calculator",
     srcs = ["non_max_suppression_calculator.cc"],
diff --git a/mediapipe/calculators/util/detection_label_id_to_text_calculator.cc b/mediapipe/calculators/util/detection_label_id_to_text_calculator.cc
index 4ac09e5af..4d6d980bf 100644
--- a/mediapipe/calculators/util/detection_label_id_to_text_calculator.cc
+++ b/mediapipe/calculators/util/detection_label_id_to_text_calculator.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "mediapipe//framework/packet.h" #include "mediapipe/calculators/util/detection_label_id_to_text_calculator.pb.h" #include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/packet.h" #include "mediapipe/framework/port/status.h" #include "mediapipe/util/resource_util.h" diff --git a/mediapipe/calculators/util/timed_box_list_id_to_label_calculator.cc b/mediapipe/calculators/util/timed_box_list_id_to_label_calculator.cc new file mode 100644 index 000000000..c01327b9b --- /dev/null +++ b/mediapipe/calculators/util/timed_box_list_id_to_label_calculator.cc @@ -0,0 +1,105 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/calculators/util/timed_box_list_id_to_label_calculator.pb.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/packet.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/util/resource_util.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +#if defined(MEDIAPIPE_MOBILE) +#include "mediapipe/util/android/file/base/file.h" +#include "mediapipe/util/android/file/base/helpers.h" +#else +#include "mediapipe/framework/port/file_helpers.h" +#endif + +namespace mediapipe { + +using mediapipe::TimedBoxProto; +using mediapipe::TimedBoxProtoList; + +// Takes a label map (from label IDs to names), and populate the label field in +// TimedBoxProto according to it's ID. 
+// +// Example usage: +// node { +// calculator: "TimedBoxListIdToLabelCalculator" +// input_stream: "input_timed_box_list" +// output_stream: "output_timed_box_list" +// node_options: { +// [mediapipe.TimedBoxListIdToLabelCalculatorOptions] { +// label_map_path: "labelmap.txt" +// } +// } +// } +class TimedBoxListIdToLabelCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + std::unordered_map label_map_; +}; +REGISTER_CALCULATOR(TimedBoxListIdToLabelCalculator); + +::mediapipe::Status TimedBoxListIdToLabelCalculator::GetContract( + CalculatorContract* cc) { + cc->Inputs().Index(0).Set(); + cc->Outputs().Index(0).Set(); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TimedBoxListIdToLabelCalculator::Open( + CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + + const auto& options = + cc->Options<::mediapipe::TimedBoxListIdToLabelCalculatorOptions>(); + + std::string string_path; + ASSIGN_OR_RETURN(string_path, PathToResourceAsFile(options.label_map_path())); + std::string label_map_string; + MP_RETURN_IF_ERROR(file::GetContents(string_path, &label_map_string)); + + std::istringstream stream(label_map_string); + std::string line; + int i = 0; + while (std::getline(stream, line)) { + label_map_[i++] = line; + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TimedBoxListIdToLabelCalculator::Process( + CalculatorContext* cc) { + const auto& input_list = cc->Inputs().Index(0).Get(); + auto output_list = absl::make_unique(); + for (const auto& input_box : input_list.box()) { + TimedBoxProto* box_ptr = output_list->add_box(); + *box_ptr = input_box; + + if (label_map_.find(input_box.id()) != label_map_.end()) { + box_ptr->set_label(label_map_[input_box.id()]); + } + } + cc->Outputs().Index(0).Add(output_list.release(), cc->InputTimestamp()); + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/calculators/util/timed_box_list_id_to_label_calculator.proto b/mediapipe/calculators/util/timed_box_list_id_to_label_calculator.proto new file mode 100644 index 000000000..95cf76590 --- /dev/null +++ b/mediapipe/calculators/util/timed_box_list_id_to_label_calculator.proto @@ -0,0 +1,28 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message TimedBoxListIdToLabelCalculatorOptions { + extend mediapipe.CalculatorOptions { + optional TimedBoxListIdToLabelCalculatorOptions ext = 297701606; + } + + // Path to a label map file for getting the actual name of detected classes. 
+ optional string label_map_path = 1; +} diff --git a/mediapipe/calculators/util/timed_box_list_to_render_data_calculator.cc b/mediapipe/calculators/util/timed_box_list_to_render_data_calculator.cc index 51c9f2c83..d76498d87 100644 --- a/mediapipe/calculators/util/timed_box_list_to_render_data_calculator.cc +++ b/mediapipe/calculators/util/timed_box_list_to_render_data_calculator.cc @@ -66,6 +66,25 @@ void AddTimedBoxProtoToRenderData( rect->set_bottom(box_proto.bottom()); rect->set_rotation(box_proto.rotation()); } + + if (box_proto.has_label()) { + auto* label_annotation = render_data->add_render_annotations(); + label_annotation->mutable_color()->set_r(options.box_color().r()); + label_annotation->mutable_color()->set_g(options.box_color().g()); + label_annotation->mutable_color()->set_b(options.box_color().b()); + label_annotation->set_thickness(options.thickness()); + RenderAnnotation::Text* text = label_annotation->mutable_text(); + text->set_display_text(box_proto.label()); + text->set_normalized(true); + constexpr float text_left_start = 0.3f; + text->set_left((1.0f - text_left_start) * box_proto.left() + + text_left_start * box_proto.right()); + constexpr float text_baseline = 0.6f; + text->set_baseline(text_baseline * box_proto.bottom() + + (1.0f - text_baseline) * box_proto.top()); + constexpr float text_height = 0.2f; + text->set_font_height((box_proto.bottom() - box_proto.top()) * text_height); + } } } // namespace diff --git a/mediapipe/docs/autoflip.md b/mediapipe/docs/autoflip.md index ac4199d05..b250db3bf 100644 --- a/mediapipe/docs/autoflip.md +++ b/mediapipe/docs/autoflip.md @@ -15,6 +15,9 @@ For overall context on AutoFlip, please read this Run the following command to build the AutoFlip pipeline: +Note: AutoFlip currently only works with OpenCV 3 . Please verify your OpenCV +version beforehand. + ```bash bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/autoflip:run_autoflip ``` diff --git a/mediapipe/docs/examples.md b/mediapipe/docs/examples.md index 3bc6e9617..506564573 100644 --- a/mediapipe/docs/examples.md +++ b/mediapipe/docs/examples.md @@ -44,6 +44,14 @@ graphs can be easily adapted to run on CPU v.s. GPU. [Object Detection and Tracking with GPU](./object_tracking_mobile_gpu.md) illustrates how to use MediaPipe for object detection and tracking. 
+### Objectron: 3D Object Detection and Tracking with GPU
+
+[MediaPipe Objectron: 3D Object Detection and Tracking with GPU](./objectron_mobile_gpu.md)
+illustrates a mobile real-time 3D object detection and tracking pipeline for
+everyday objects like shoes and chairs.
+
+* [Android](./objectron_mobile_gpu.md)
+
 ### Face Detection with GPU

 [Face Detection with GPU](./face_detection_mobile_gpu.md) illustrates how to use
diff --git a/mediapipe/docs/images/mobile/object_detection_3d_android_gpu.png b/mediapipe/docs/images/mobile/object_detection_3d_android_gpu.png
new file mode 100644
index 000000000..4b0372d16
Binary files /dev/null and b/mediapipe/docs/images/mobile/object_detection_3d_android_gpu.png differ
diff --git a/mediapipe/docs/images/mobile/objectron_chair_android_gpu.gif b/mediapipe/docs/images/mobile/objectron_chair_android_gpu.gif
new file mode 100644
index 000000000..abd1652ca
Binary files /dev/null and b/mediapipe/docs/images/mobile/objectron_chair_android_gpu.gif differ
diff --git a/mediapipe/docs/images/mobile/objectron_detection_subgraph.png b/mediapipe/docs/images/mobile/objectron_detection_subgraph.png
new file mode 100644
index 000000000..4d3bbc422
Binary files /dev/null and b/mediapipe/docs/images/mobile/objectron_detection_subgraph.png differ
diff --git a/mediapipe/docs/images/mobile/objectron_shoe_android_gpu.gif b/mediapipe/docs/images/mobile/objectron_shoe_android_gpu.gif
new file mode 100644
index 000000000..117cdc5de
Binary files /dev/null and b/mediapipe/docs/images/mobile/objectron_shoe_android_gpu.gif differ
diff --git a/mediapipe/docs/images/mobile/objectron_tracking_subgraph.png b/mediapipe/docs/images/mobile/objectron_tracking_subgraph.png
new file mode 100644
index 000000000..34296a502
Binary files /dev/null and b/mediapipe/docs/images/mobile/objectron_tracking_subgraph.png differ
diff --git a/mediapipe/docs/install.md b/mediapipe/docs/install.md
index 899f6a5aa..c7b227d76 100644
--- a/mediapipe/docs/install.md
+++ b/mediapipe/docs/install.md
@@ -364,8 +364,10 @@ To build and run iOS apps:

 ### Installing on Windows Subsystem for Linux (WSL)

-Note: WSL has historically not provided access to USB cameras. Mediapipe can use
-a video file as input.
+Note: The pre-built OpenCV packages don't support cameras in WSL. Unless you
+[compile](https://funvision.blogspot.com/2019/12/opencv-web-camera-and-video-streams-in.html)
+OpenCV with FFMPEG and GStreamer in WSL, the live demos won't work with any
+cameras. Alternatively, you can use a video file as input.

 1.  Follow the
     [instruction](https://docs.microsoft.com/en-us/windows/wsl/install-win10) to
@@ -373,7 +375,7 @@ a video file as input.

 2.  Install Windows ADB and start the ADB server in Windows.

-    Note: Window’s and WSL’s adb versions must be the same version, e.g., if WSL
+    Note: Windows' and WSL's adb versions must be the same version, e.g., if WSL
     has ADB 1.0.39, you need to download the corresponding Windows ADB from
     [here](https://dl.google.com/android/repository/platform-tools_r26.0.1-windows.zip).
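As a minimal sketch of what the two WSL notes above mean in practice (not part of the patch itself): the adb check below uses only the standard `adb version` command, and the video-file run follows the desktop object-detection example touched later in this diff; the graph name and media paths are placeholders to adjust to your own checkout and files.

```bash
# Check the adb version in WSL and on the Windows side; the two reported
# versions (e.g., 1.0.39) must be identical for the Android demos to work.
adb version

# With no usable camera in WSL, feed a video file to a CPU-only desktop example
# instead. Paths below are placeholders.
export GLOG_logtostderr=1
bazel-bin/mediapipe/examples/desktop/object_detection/object_detection_tensorflow \
  --calculator_graph_config_file=mediapipe/graphs/object_detection/object_detection_desktop_tensorflow_graph.pbtxt \
  --input_side_packets=input_video_path=/path/to/input.mp4,output_video_path=/path/to/output.mp4
```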
diff --git a/mediapipe/docs/object_detection_desktop.md b/mediapipe/docs/object_detection_desktop.md
index efd23ef86..6ad872927 100644
--- a/mediapipe/docs/object_detection_desktop.md
+++ b/mediapipe/docs/object_detection_desktop.md
@@ -26,6 +26,7 @@ To build and run the TensorFlow example on desktop, run:
 $ bazel build -c opt \
     --define MEDIAPIPE_DISABLE_GPU=1 \
     --define no_aws_support=true \
+    --linkopt=-s \
     mediapipe/examples/desktop/object_detection:object_detection_tensorflow

 # It should print:
diff --git a/mediapipe/docs/objectron_mobile_gpu.md b/mediapipe/docs/objectron_mobile_gpu.md
new file mode 100644
index 000000000..e0dda48c2
--- /dev/null
+++ b/mediapipe/docs/objectron_mobile_gpu.md
@@ -0,0 +1,489 @@
+# MediaPipe Objectron (GPU)
+
+This doc focuses on the
+[below example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt)
+that performs 3D object detection and tracking with TensorFlow Lite on GPU.
+
+Objectron for shoes                                                              | Objectron for chairs
+:-----------------------------------------------------------------------------: | :------------------:
+![objectron_shoe_android_gpu_gif](images/mobile/objectron_shoe_android_gpu.gif) | ![objectron_chair_android_gpu_gif](images/mobile/objectron_chair_android_gpu.gif)
+
+For overall context on MediaPipe Objectron, please read the
+[Google AI Blog](https://mediapipe.page.link/objectron-aiblog). The Objectron's
+ML model (see also the [model card](https://mediapipe.page.link/objectron-mc))
+estimates a 3D bounding box for the detected object.
+
+## Android
+
+[Source](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d)
+
+An arm64 build of Objectron for shoes can be
+[downloaded here](https://drive.google.com/open?id=1S0K4hbWt3o31FfQ4QU3Rz7IHrvOUMx1d),
+and one for chairs can be
+[downloaded here](https://drive.google.com/open?id=1MM8K-13bXLCVS1EHQ-KgkVyEahEPrKej).
+
+To build the Objectron app for shoes:
+
+```bash
+bazel build -c opt --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d
+```
+
+Similarly, to build the Objectron app for chairs, add the **--define
+chair=true** flag to the build command.
+
+```bash
+bazel build -c opt --define chair=true --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d
+```
+
+Once the app is built, install it on an Android device with:
+
+```bash
+adb install bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/objectdetection3d.apk
+```
+
+## Graph
+
+The Objectron main graph internally utilizes the Objectron detection subgraph
+and the Objectron tracking subgraph. To visualize the graph as shown above, copy
+the text specification of the graph below and paste it into
+[MediaPipe Visualizer](https://viz.mediapipe.dev/).
+
+### Main Graph
+
+This is the main graph for the shoe detector. This graph runs detection and
+tracking and renders the output to the display.
+
+![object_detection_mobile_gpu_graph](images/mobile/object_detection_3d_android_gpu.png)
+
+[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt)
+
+```bash
+# MediaPipe object detection 3D with tracking graph.
+
+# Images on GPU coming into and out of the graph.
+input_stream: "input_video" +output_stream: "output_video" + +# Creates a copy of the input_video stream. At the end of the graph, the +# GlAnimationOverlayCalculator will consume the input_video texture and draws +# on top of it. +node: { + calculator: "GlScalerCalculator" + input_stream: "VIDEO:input_video" + output_stream: "VIDEO:input_video_copy" +} + +# Resamples the images by specific frame rate. This calculator is used to +# control the frequecy of subsequent calculators/subgraphs, e.g. less power +# consumption for expensive process. +node { + calculator: "PacketResamplerCalculator" + input_stream: "DATA:input_video_copy" + output_stream: "DATA:sampled_input_video" + node_options: { + [type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] { + frame_rate: 5 + } + } +} + +node { + calculator: "ObjectronDetectionSubgraphGpu" + input_stream: "IMAGE_GPU:sampled_input_video" + output_stream: "ANNOTATIONS:objects" +} + +node { + calculator: "ObjectronTrackingSubgraphGpu" + input_stream: "FRAME_ANNOTATION:objects" + input_stream: "IMAGE_GPU:input_video_copy" + output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" +} + +# The rendering nodes: +# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly +# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask. +# These models are designed using different tools, so we supply a transformation +# to bring both of them to the Objectron's coordinate system. + +# Creates a model matrices for the tracked object given the lifted 3D points. +# This calculator does two things: 1) Estimates object's pose (orientation, +# translation, and scale) from the 3D vertices, and +# 2) bring the object from the objectron's coordinate system to the renderer +# (OpenGL) coordinate system. Since the final goal is to render a mesh file on +# top of the object, we also supply a transformation to bring the mesh to the +# objectron's coordinate system, and rescale mesh to the unit size. +node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_tracked_objects" + output_stream: "MODEL_MATRICES:model_matrices" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: 0.05 + model_scale: 0.05 + model_scale: 0.05 + # Bring the box CAD model to objectron's coordinate system. This + # is equivalent of -pi/2 rotation along the y-axis (right-hand rule): + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY()) + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: -1.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 1.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 1.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 1.0 + } + } +} + +# Compute the model matrices for the CAD model of the shoe, to be used as an +# occlusion mask. The model will be rendered at the exact same location as the +# bounding box. 
+node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_tracked_objects" + output_stream: "MODEL_MATRICES:mask_model_matrices" + #input_side_packet: "MODEL_SCALE:model_scale" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: 0.45 + model_scale: 0.25 + model_scale: 0.15 + # Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This + # is equivalent of -pi/2 rotation along the x-axis (right-hand rule): + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX()) + model_transformation: 1.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 1.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: -1.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 0.0 + model_transformation: 1.0 + } + } +} + +# Render everything together. First we render the 3D bounding box animation, +# then we render the occlusion mask. +node: { + calculator: "GlAnimationOverlayCalculator" + input_stream: "VIDEO:input_video" + input_stream: "MODEL_MATRICES:model_matrices" + input_stream: "MASK_MODEL_MATRICES:mask_model_matrices" + output_stream: "output_video" + input_side_packet: "TEXTURE:box_texture" + input_side_packet: "ANIMATION_ASSET:box_asset_name" + input_side_packet: "MASK_TEXTURE:obj_texture" + input_side_packet: "MASK_ASSET:obj_asset_name" + node_options: { + [type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] { + # Output resolution is 480x640 with the aspect ratio of 0.75 + aspect_ratio: 0.75 + vertical_fov_degrees: 70. + animation_speed_fps: 25 + } + } +} + +``` + +### Objectron Detection Subgraph + +Objectron detection subgraph uses the *TfLiteInferenceCalculator* to run +inference and decodes the output tensor to *FrameAnnotation* protobuf. The +*FrameAnnotation* contains nine keypoints: the bounding box's center, as well as +its eight vertices. The boxes will be passed to the Objectron tracking subgraph. + +![object_detection_subgraph](images/mobile/objectron_detection_subgraph.png) + +[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt) + +```bash +# MediaPipe Objectron detection gpu subgraph + +type: "ObjectronDetectionSubgraphGpu" + +input_stream: "IMAGE_GPU:input_video" +output_stream: "ANNOTATIONS:objects" + +# Transforms the input image on GPU to a 480x640 image. To scale the input +# image, the scale_mode option is set to FIT to preserve the aspect ratio, +# resulting in potential letterboxing in the transformed image. +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 480 + output_height: 640 + scale_mode: FIT + } + } +} + +# Converts the transformed input image on GPU into an image tensor stored as a +# TfLiteTensor. 
+node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE_GPU:transformed_input_video" + output_stream: "TENSORS_GPU:image_tensor" +} + +# Generates a single side packet containing a TensorFlow Lite op resolver that +# supports custom ops needed by the model used in this graph. +node { + calculator: "TfLiteCustomOpResolverCalculator" + output_side_packet: "opresolver" + node_options: { + [type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] { + use_gpu: true + } + } +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS_GPU:image_tensor" + output_stream: "TENSORS:detection_tensors" + input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "object_detection_3d.tflite" + } + } +} + +# Decodes the model's output tensor (the heatmap and the distance fields) to 2D +# keypoints. There are nine 2D keypoints: one center keypoint and eight vertices +# for the 3D bounding box. The calculator parameters determine's the decoder's +# sensitivity. +node { + calculator: "TfLiteTensorsToObjectsCalculator" + input_stream: "TENSORS:detection_tensors" + output_stream: "ANNOTATIONS:objects" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToObjectsCalculatorOptions] { + num_classes: 1 + num_keypoints: 9 + decoder_config { + heatmap_threshold: 0.6 + local_max_distance: 2 + offset_scale_coef: 1.0 + voting_radius: 2 + voting_allowance: 1 + voting_threshold: 0.2 + } + } + } +} +``` + +### Object Tracking Subgraph + +Object tracking subgraph uses a *BoxTracker* calculator which is a generic +tracking library, also used in +[Mediapipe's 2D Object Detection and Tracking](https://github.com/google/mediapipe/tree/master/mediapipe/g3doc/object_tracking_mobile_gpu.md). +The tracking runs every frame and when a new detection is available, it +consolidates the detection and tracking results. The tracker tracks the box with +its 2D keypoints, so at the end we lift the 2D keypoints to 3D using EPnP +algorithm in *Lift2DFrameAnnotationTo3D* Calculator. + +![object_tracking_subgraph](images/mobile/objectron_tracking_subgraph.png) + +[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_tracking_gpu.pbtxt) + +```bash +# MediaPipe Objectron tracking gpu subgraph + +type: "ObjectronTrackingSubgraphGpu" + +input_stream: "FRAME_ANNOTATION:objects" +input_stream: "IMAGE_GPU:input_video" +output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" + + +# Converts the detected keypoints to Boxes, used by the tracking subgraph. +node { + calculator: "FrameAnnotationToTimedBoxListCalculator" + input_stream: "FRAME_ANNOTATION:objects" + output_stream: "BOXES:start_pos" +} + +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:downscaled_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 240 + output_height: 320 + } + } +} + +# Converts GPU buffer to ImageFrame for processing tracking. +node: { + calculator: "GpuBufferToImageFrameCalculator" + input_stream: "downscaled_input_video" + output_stream: "downscaled_input_video_cpu" +} + +# Performs motion analysis on an incoming video stream. 
+node: { + calculator: "MotionAnalysisCalculator" + input_stream: "VIDEO:downscaled_input_video_cpu" + output_stream: "CAMERA:camera_motion" + output_stream: "FLOW:region_flow" + + node_options: { + [type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: { + analysis_options { + analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE + flow_options { + fast_estimation_min_block_size: 100 + top_inlier_sets: 1 + frac_inlier_error_threshold: 3e-3 + downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE + verification_distance: 5.0 + verify_long_feature_acceleration: true + verify_long_feature_trigger_ratio: 0.1 + tracking_options { + max_features: 500 + adaptive_extraction_levels: 2 + min_eig_val_settings { + adaptive_lowest_quality_level: 2e-4 + } + klt_tracker_implementation: KLT_OPENCV + } + } + } + } + } +} + +# Reads optical flow fields defined in +# mediapipe/framework/formats/motion/optical_flow_field.h, +# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized +# to 0-255. +node: { + calculator: "FlowPackagerCalculator" + input_stream: "FLOW:region_flow" + input_stream: "CAMERA:camera_motion" + output_stream: "TRACKING:tracking_data" + + node_options: { + [type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: { + flow_packager_options: { + binary_tracking_data_support: false + } + } + } +} + +# Tracks box positions over time. +node: { + calculator: "BoxTrackerCalculator" + input_stream: "TRACKING:tracking_data" + input_stream: "TRACK_TIME:input_video" + input_stream: "START_POS:start_pos" + input_stream: "CANCEL_OBJECT_ID:cancel_object_id" + input_stream_info: { + tag_index: "CANCEL_OBJECT_ID" + back_edge: true + } + output_stream: "BOXES:boxes" + + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler" + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "TRACKING" + tag_index: "TRACK_TIME" + } + sync_set { + tag_index: "START_POS" + } + sync_set { + tag_index: "CANCEL_OBJECT_ID" + } + } + } + } + + node_options: { + [type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: { + tracker_options: { + track_step_options { + track_object_and_camera: true + tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE + inlier_spring_force: 0.0 + static_motion_temporal_ratio: 3e-2 + } + } + visualize_tracking_data: false + streaming_track_data_cache_size: 100 + } + } +} + +# Consolidates tracking and detection results. +node { + calculator: "FrameAnnotationTrackerCalculator" + input_stream: "FRAME_ANNOTATION:objects" + input_stream: "TRACKED_BOXES:boxes" + output_stream: "TRACKED_FRAME_ANNOTATION:tracked_objects" + output_stream: "CANCEL_OBJECT_ID:cancel_object_id" + node_options: { + [type.googleapis.com/mediapipe.FrameAnnotationTrackerCalculatorOptions] { + img_width: 240 + img_height: 320 + } + } + + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler" + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "FRAME_ANNOTATION" + } + sync_set { + tag_index: "TRACKED_BOXES" + } + } + } + } +} + +# Lift the tracked 2D keypoints to 3D using EPnP algorithm. +node { + calculator: "Lift2DFrameAnnotationTo3DCalculator" + input_stream: "FRAME_ANNOTATION:tracked_objects" + output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" +} +``` diff --git a/mediapipe/docs/youtube_8m.md b/mediapipe/docs/youtube_8m.md index 3b26c2129..3b42aca30 100644 --- a/mediapipe/docs/youtube_8m.md +++ b/mediapipe/docs/youtube_8m.md @@ -61,6 +61,8 @@ videos. 
```bash # cd to the root directory of the MediaPipe repo cd - + + pip3 install tf_slim python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph ``` @@ -78,7 +80,7 @@ videos. 5. Run the MediaPipe binary to extract the features. ```bash - bazel build -c opt \ + bazel build -c opt --linkopt=-s \ --define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \ mediapipe/examples/desktop/youtube8m:extract_yt8m_features @@ -126,13 +128,13 @@ the inference for both local videos and the dataset 2. Build the inference binary. ```bash - bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ + bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \ mediapipe/examples/desktop/youtube8m:model_inference ``` 3. Run the python web server. - Note: pip install absl-py + Note: pip3 install absl-py ```bash python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd` @@ -162,7 +164,7 @@ the inference for both local videos and the dataset 3. Build and run the inference binary. ```bash - bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ + bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \ mediapipe/examples/desktop/youtube8m:model_inference # segment_size is the number of seconds window of frames. diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/AndroidManifest.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/AndroidManifest.xml new file mode 100644 index 000000000..73ade8b08 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/AndroidManifest.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD new file mode 100644 index 000000000..1f1f203b4 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/BUILD @@ -0,0 +1,115 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:private"]) + +cc_binary( + name = "libmediapipe_jni.so", + linkshared = 1, + linkstatic = 1, + deps = [ + "//mediapipe/graphs/object_detection_3d:mobile_calculators", + "//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni", + ], +) + +cc_library( + name = "mediapipe_jni_lib", + srcs = [":libmediapipe_jni.so"], + alwayslink = 1, +) + +# To use the "chair" model instead of the default "shoes" model, +# add "--define chair=true" to the bazel build command. +config_setting( + name = "use_chair_model", + define_values = { + "chair": "true", + }, +) + +# Maps the binary graph to an alias (e.g., the app name) for convenience so that the alias can be +# easily incorporated into the app via, for example, +# MainActivity.BINARY_GRAPH_NAME = "appname.binarypb". 
+genrule( + name = "binary_graph", + srcs = select({ + "//conditions:default": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_shoe"], + ":use_chair_model": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_chair"], + }), + outs = ["objectdetection3d.binarypb"], + cmd = "cp $< $@", +) + +genrule( + name = "model", + srcs = select({ + "//conditions:default": ["//mediapipe/models:object_detection_3d_sneakers.tflite"], + ":use_chair_model": ["//mediapipe/models:object_detection_3d_chair.tflite"], + }), + outs = ["object_detection_3d.tflite"], + cmd = "cp $< $@", +) + +android_library( + name = "mediapipe_lib", + srcs = glob(["*.java"]), + assets = [ + ":binary_graph", + ":model", + "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:box.obj.uuu", + "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:classic_colors.png", + ] + select({ + "//conditions:default": [ + "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:model.obj.uuu", + "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:texture.bmp", + ], + ":use_chair_model": [ + "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:model.obj.uuu", + "//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:texture.bmp", + ], + }), + assets_dir = "", + manifest = "AndroidManifest.xml", + resource_files = glob(["res/**"]), + deps = [ + ":mediapipe_jni_lib", + "//mediapipe/framework/formats:landmark_java_proto_lite", + "//mediapipe/java/com/google/mediapipe/components:android_camerax_helper", + "//mediapipe/java/com/google/mediapipe/components:android_components", + "//mediapipe/java/com/google/mediapipe/framework:android_framework", + "//mediapipe/java/com/google/mediapipe/glutil", + "//third_party:androidx_appcompat", + "//third_party:androidx_constraint_layout", + "//third_party:androidx_legacy_support_v4", + "//third_party:androidx_recyclerview", + "//third_party:opencv", + "@androidx_concurrent_futures//jar", + "@androidx_lifecycle//jar", + "@com_google_code_findbugs//jar", + "@com_google_guava_android//jar", + ], +) + +android_binary( + name = "objectdetection3d", + manifest = "AndroidManifest.xml", + manifest_values = {"applicationId": "com.google.mediapipe.apps.objectdetection3d"}, + multidex = "native", + deps = [ + ":mediapipe_lib", + ], +) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java new file mode 100644 index 000000000..6423cd3da --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/MainActivity.java @@ -0,0 +1,280 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package com.google.mediapipe.apps.objectdetection3d; + +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.graphics.SurfaceTexture; +import android.os.Bundle; +import androidx.appcompat.app.AppCompatActivity; +import android.util.Log; +import android.util.Size; +import android.view.SurfaceHolder; +import android.view.SurfaceView; +import android.view.View; +import android.view.ViewGroup; +import com.google.mediapipe.components.CameraHelper; +import com.google.mediapipe.components.CameraXPreviewHelper; +import com.google.mediapipe.components.ExternalTextureConverter; +import com.google.mediapipe.components.FrameProcessor; +import com.google.mediapipe.components.PermissionHelper; +import com.google.mediapipe.framework.AndroidAssetUtil; +import com.google.mediapipe.framework.AndroidPacketCreator; +import com.google.mediapipe.framework.Packet; +import com.google.mediapipe.glutil.EglManager; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +/** Main activity of MediaPipe example apps. */ +public class MainActivity extends AppCompatActivity { + private static final String TAG = "MainActivity"; + + private static final String BINARY_GRAPH_NAME = "objectdetection3d.binarypb"; + private static final String INPUT_VIDEO_STREAM_NAME = "input_video"; + private static final String OUTPUT_VIDEO_STREAM_NAME = "output_video"; + + private static final String OBJ_TEXTURE = "texture.bmp"; + private static final String OBJ_FILE = "model.obj.uuu"; + private static final String BOX_TEXTURE = "classic_colors.png"; + private static final String BOX_FILE = "box.obj.uuu"; + + private static final CameraHelper.CameraFacing CAMERA_FACING = CameraHelper.CameraFacing.BACK; + + // Flips the camera-preview frames vertically before sending them into FrameProcessor to be + // processed in a MediaPipe graph, and flips the processed frames back when they are displayed. + // This is needed because OpenGL represents images assuming the image origin is at the bottom-left + // corner, whereas MediaPipe in general assumes the image origin is at top-left. + private static final boolean FLIP_FRAMES_VERTICALLY = true; + + // Target resolution should be 4:3 for this application, as expected by the model and tracker. + private static final Size TARGET_RESOLUTION = new Size(1280, 960); + + static { + // Load all native libraries needed by the app. + System.loadLibrary("mediapipe_jni"); + System.loadLibrary("opencv_java3"); + } + + // {@link SurfaceTexture} where the camera-preview frames can be accessed. + private SurfaceTexture previewFrameTexture; + // {@link SurfaceView} that displays the camera-preview frames processed by a MediaPipe graph. + private SurfaceView previewDisplayView; + + // Creates and manages an {@link EGLContext}. + private EglManager eglManager; + // Sends camera-preview frames into a MediaPipe graph for processing, and displays the processed + // frames onto a {@link Surface}. + private FrameProcessor processor; + // Converts the GL_TEXTURE_EXTERNAL_OES texture from Android camera into a regular texture to be + // consumed by {@link FrameProcessor} and the underlying MediaPipe graph. + private ExternalTextureConverter converter; + + // Handles camera access via the {@link CameraX} Jetpack support library. + private CameraXPreviewHelper cameraHelper; + + // Assets. 
+ private Bitmap objTexture = null; + private Bitmap boxTexture = null; + + Size cameraImageSize; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + + previewDisplayView = new SurfaceView(this); + setupPreviewDisplayView(); + + // Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g., + // binary graphs. + AndroidAssetUtil.initializeNativeAssetManager(this); + + eglManager = new EglManager(null); + processor = + new FrameProcessor( + this, + eglManager.getNativeContext(), + BINARY_GRAPH_NAME, + INPUT_VIDEO_STREAM_NAME, + OUTPUT_VIDEO_STREAM_NAME); + processor.getVideoSurfaceOutput().setFlipY(FLIP_FRAMES_VERTICALLY); + + prepareDemoAssets(); + AndroidPacketCreator packetCreator = processor.getPacketCreator(); + Map inputSidePackets = new HashMap<>(); + inputSidePackets.put("obj_asset_name", packetCreator.createString(OBJ_FILE)); + inputSidePackets.put("box_asset_name", packetCreator.createString(BOX_FILE)); + inputSidePackets.put("obj_texture", packetCreator.createRgbaImageFrame(objTexture)); + inputSidePackets.put("box_texture", packetCreator.createRgbaImageFrame(boxTexture)); + processor.setInputSidePackets(inputSidePackets); + + PermissionHelper.checkAndRequestCameraPermissions(this); + } + + @Override + protected void onResume() { + super.onResume(); + converter = new ExternalTextureConverter(eglManager.getContext()); + converter.setFlipY(FLIP_FRAMES_VERTICALLY); + converter.setConsumer(processor); + if (PermissionHelper.cameraPermissionsGranted(this)) { + startCamera(); + } + } + + @Override + protected void onPause() { + super.onPause(); + converter.close(); + } + + @Override + public void onRequestPermissionsResult( + int requestCode, String[] permissions, int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults); + } + + private void setupPreviewDisplayView() { + previewDisplayView.setVisibility(View.GONE); + ViewGroup viewGroup = findViewById(R.id.preview_display_layout); + viewGroup.addView(previewDisplayView); + + previewDisplayView + .getHolder() + .addCallback( + new SurfaceHolder.Callback() { + @Override + public void surfaceCreated(SurfaceHolder holder) { + processor.getVideoSurfaceOutput().setSurface(holder.getSurface()); + } + + @Override + public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) { + // (Re-)Compute the ideal size of the camera-preview display (the area that the + // camera-preview frames get rendered onto, potentially with scaling and rotation) + // based on the size of the SurfaceView that contains the display. + Size viewSize = new Size(height, height * 3 / 4); // Prefer 3:4 aspect ratio. + Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize); + boolean isCameraRotated = cameraHelper.isCameraRotated(); + cameraImageSize = cameraHelper.getFrameSize(); + + // Connect the converter to the camera-preview frames as its input (via + // previewFrameTexture), and configure the output width and height as the computed + // display size. + converter.setSurfaceTextureAndAttachToGLContext( + previewFrameTexture, + isCameraRotated ? displaySize.getHeight() : displaySize.getWidth(), + isCameraRotated ? 
displaySize.getWidth() : displaySize.getHeight()); + processor.setOnWillAddFrameListener( + (timestamp) -> { + try { + int cameraTextureWidth = + isCameraRotated + ? cameraImageSize.getHeight() + : cameraImageSize.getWidth(); + int cameraTextureHeight = + isCameraRotated + ? cameraImageSize.getWidth() + : cameraImageSize.getHeight(); + + // Find limiting side and scale to 3:4 aspect ratio + float aspectRatio = + (float) cameraTextureWidth / (float) cameraTextureHeight; + if (aspectRatio > 3.0 / 4.0) { + // width too big + cameraTextureWidth = (int) ((float) cameraTextureHeight * 3.0 / 4.0); + } else { + // height too big + cameraTextureHeight = (int) ((float) cameraTextureWidth * 4.0 / 3.0); + } + Packet widthPacket = + processor.getPacketCreator().createInt32(cameraTextureWidth); + Packet heightPacket = + processor.getPacketCreator().createInt32(cameraTextureHeight); + + try { + processor + .getGraph() + .addPacketToInputStream("input_width", widthPacket, timestamp); + processor + .getGraph() + .addPacketToInputStream("input_height", heightPacket, timestamp); + } catch (Exception e) { + Log.e( + TAG, + "MediaPipeException encountered adding packets to width and height" + + " input streams."); + } + widthPacket.release(); + heightPacket.release(); + } catch (IllegalStateException ise) { + Log.e( + TAG, + "Exception while adding packets to width and height input streams."); + } + }); + } + + @Override + public void surfaceDestroyed(SurfaceHolder holder) { + processor.getVideoSurfaceOutput().setSurface(null); + } + }); + } + + private void startCamera() { + cameraHelper = new CameraXPreviewHelper(); + cameraHelper.setOnCameraStartedListener( + surfaceTexture -> { + previewFrameTexture = surfaceTexture; + // Make the display view visible to start showing the preview. This triggers the + // SurfaceHolder.Callback added to (the holder of) previewDisplayView. + previewDisplayView.setVisibility(View.VISIBLE); + }); + cameraHelper.startCamera( + this, CAMERA_FACING, /*surfaceTexture=*/ null, /*targetSize=*/ TARGET_RESOLUTION); + cameraImageSize = cameraHelper.getFrameSize(); + } + + private void prepareDemoAssets() { + AndroidAssetUtil.initializeNativeAssetManager(this); + // We render from raw data with openGL, so disable decoding preprocessing + BitmapFactory.Options decodeOptions = new BitmapFactory.Options(); + decodeOptions.inScaled = false; + decodeOptions.inDither = false; + decodeOptions.inPremultiplied = false; + + try { + InputStream inputStream = getAssets().open(OBJ_TEXTURE); + objTexture = BitmapFactory.decodeStream(inputStream, null /*outPadding*/, decodeOptions); + inputStream.close(); + } catch (Exception e) { + Log.e(TAG, "Error parsing object texture; error: " + e); + throw new IllegalStateException(e); + } + + try { + InputStream inputStream = getAssets().open(BOX_TEXTURE); + boxTexture = BitmapFactory.decodeStream(inputStream, null /*outPadding*/, decodeOptions); + inputStream.close(); + } catch (Exception e) { + Log.e(TAG, "Error parsing box texture; error: " + e); + throw new RuntimeException(e); + } + } +} diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD new file mode 100644 index 000000000..ef6f88d65 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/BUILD @@ -0,0 +1,21 @@ +# Copyright 2019 The MediaPipe Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +exports_files( + srcs = glob(["**"]), +) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/box.obj.uuu b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/box.obj.uuu new file mode 100644 index 000000000..80a2aa8ca Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/box.obj.uuu differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD new file mode 100644 index 000000000..ef6f88d65 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/BUILD @@ -0,0 +1,21 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +exports_files( + srcs = glob(["**"]), +) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/model.obj.uuu b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/model.obj.uuu new file mode 100644 index 000000000..e2fc9bc1f Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/model.obj.uuu differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/texture.bmp b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/texture.bmp new file mode 100644 index 000000000..0a4d1187d Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair/texture.bmp differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/classic_colors.png b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/classic_colors.png new file mode 100644 index 000000000..92dad8ef6 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/classic_colors.png differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/colors.bmp b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/colors.bmp new file mode 100644 index 000000000..1bbb1ca07 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/colors.bmp differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD new file mode 100644 index 000000000..ef6f88d65 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/BUILD @@ -0,0 +1,21 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +exports_files( + srcs = glob(["**"]), +) diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/model.obj.uuu b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/model.obj.uuu new file mode 100644 index 000000000..ee5183652 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/model.obj.uuu differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/texture.bmp b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/texture.bmp new file mode 100644 index 000000000..fa6c85a37 Binary files /dev/null and b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker/texture.bmp differ diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/layout/activity_main.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/layout/activity_main.xml new file mode 100644 index 000000000..c19d7e628 --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/layout/activity_main.xml @@ -0,0 +1,20 @@ + + + + + + + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/colors.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/colors.xml new file mode 100644 index 000000000..69b22338c --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/colors.xml @@ -0,0 +1,6 @@ + + + #008577 + #00574B + #D81B60 + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/strings.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/strings.xml new file mode 100644 index 000000000..e9eafc52a --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/strings.xml @@ -0,0 +1,4 @@ + + Object Detection 3D + Please grant camera permissions. + diff --git a/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/styles.xml b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/styles.xml new file mode 100644 index 000000000..5885930df --- /dev/null +++ b/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/res/values/styles.xml @@ -0,0 +1,11 @@ + + + + + + diff --git a/mediapipe/examples/coral/Dockerfile b/mediapipe/examples/coral/Dockerfile index 1640f57df..a2c2a153f 100644 --- a/mediapipe/examples/coral/Dockerfile +++ b/mediapipe/examples/coral/Dockerfile @@ -63,7 +63,7 @@ COPY . 
/mediapipe/ # Install bazel -ARG BAZEL_VERSION=0.29.1 +ARG BAZEL_VERSION=1.1.0 RUN mkdir /bazel && \ wget --no-check-certificate -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \ wget --no-check-certificate -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \ diff --git a/mediapipe/examples/coral/README.md b/mediapipe/examples/coral/README.md index 2289ad825..2e32b7b1a 100644 --- a/mediapipe/examples/coral/README.md +++ b/mediapipe/examples/coral/README.md @@ -1,9 +1,11 @@ # Coral Dev Board Setup (experimental) -**Dislaimer**: Running MediaPipe on Coral is experimental, and this process may +**Disclaimer**: Running MediaPipe on Coral is experimental, and this process may not be exact and is subject to change. These instructions have only been tested -on the [Coral Dev Board](https://coral.ai/products/dev-board/) with Mendel 4.0, -and may vary for different devices and workstations. +on the [Coral Dev Board](https://coral.ai/products/dev-board/) +running [Mendel Enterprise Day 13](https://coral.ai/software/) OS and +using [Diploria2](https://github.com/google-coral/edgetpu/tree/diploria2) +edgetpu libs, and may vary for different devices and workstations. This file describes how to prepare a Coral Dev Board and setup a Linux Docker container for building MediaPipe applications that run on Edge TPU. @@ -16,10 +18,12 @@ Docker container for building MediaPipe applications that run on Edge TPU. * Setup the coral device via [here](https://coral.withgoogle.com/docs/dev-board/get-started/), and ensure the _mdt_ command works + Note: alias mdt="python3 -m mdt.main" may be needed on some systems + * (on coral device) prepare MediaPipe cd ~ - sudo apt-get install -y git + sudo apt-get update && sudo apt-get install -y git git clone https://github.com/google/mediapipe.git mkdir mediapipe/bazel-bin diff --git a/mediapipe/examples/coral/WORKSPACE b/mediapipe/examples/coral/WORKSPACE index 28112f958..3afda1a86 100644 --- a/mediapipe/examples/coral/WORKSPACE +++ b/mediapipe/examples/coral/WORKSPACE @@ -10,19 +10,25 @@ http_archive( sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", ) load("@bazel_skylib//lib:versions.bzl", "versions") -versions.check(minimum_bazel_version = "0.24.1") +versions.check(minimum_bazel_version = "1.0.0", + maximum_bazel_version = "1.2.1") -# ABSL cpp library. + +# ABSL cpp library lts_2020_02_25 http_archive( name = "com_google_absl", - # Head commit on 2019-04-12. - # TODO: Switch to the latest absl version when the problem gets - # fixed. urls = [ - "https://github.com/abseil/abseil-cpp/archive/a02f62f456f2c4a7ecf2be3104fe0c6e16fbad9a.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/20200225.tar.gz", ], - sha256 = "d437920d1434c766d22e85773b899c77c672b8b4865d5dc2cd61a29fdff3cf03", - strip_prefix = "abseil-cpp-a02f62f456f2c4a7ecf2be3104fe0c6e16fbad9a", + # Remove after https://github.com/abseil/abseil-cpp/issues/326 is solved. 
+ patches = [ + "@//third_party:com_google_absl_f863b622fe13612433fdf43f76547d5edda0c93001.diff" + ], + patch_args = [ + "-p1", + ], + strip_prefix = "abseil-cpp-20200225", + sha256 = "728a813291bdec2aa46eab8356ace9f75ac2ed9dfe2df5ab603c4e6c09f1c353" ) http_archive( @@ -72,6 +78,14 @@ http_archive( ], ) +# easyexif +http_archive( + name = "easyexif", + url = "https://github.com/mayanklahiri/easyexif/archive/master.zip", + strip_prefix = "easyexif-master", + build_file = "@//third_party:easyexif.BUILD", +) + # libyuv http_archive( name = "libyuv", @@ -103,15 +117,23 @@ http_archive( ], ) -# 2019-11-12 -_TENSORFLOW_GIT_COMMIT = "a5f9bcd64453ff3d1f64cb4da4786db3d2da7f82" -_TENSORFLOW_SHA256= "f2b6f2ab2ffe63e86eccd3ce4bea6b7197383d726638dfeeebcdc1e7de73f075" +# 2020-02-12 +# The last commit before TensorFlow switched to Bazel 2.0 +_TENSORFLOW_GIT_COMMIT = "77e9ffb9b2bfb1a4f7056e62d84039626923e328" +_TENSORFLOW_SHA256= "176ccd82f7dd17c5e117b50d353603b129c7a6ccbfebd522ca47cc2a40f33f13" http_archive( name = "org_tensorflow", urls = [ "https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT, "https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT, ], + # A compatibility patch + patches = [ + "@//third_party:org_tensorflow_528e22eae8bf3206189a066032c66e9e5c9b4a61.diff" + ], + patch_args = [ + "-p1", + ], strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT, sha256 = _TENSORFLOW_SHA256, ) @@ -119,8 +141,22 @@ http_archive( load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace") tf_workspace(tf_repo_name = "org_tensorflow") +http_archive( + name = "ceres_solver", + url = "https://github.com/ceres-solver/ceres-solver/archive/1.14.0.zip", + patches = [ + "@//third_party:ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff" + ], + patch_args = [ + "-p1", + ], + strip_prefix = "ceres-solver-1.14.0", + sha256 = "5ba6d0db4e784621fda44a50c58bb23b0892684692f0c623e2063f9c19f192f1" +) + # Please run # $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \ +# libopencv-calib3d-dev libopencv-features2d-dev \ # libopencv-imgproc-dev libopencv-video-dev new_local_repository( name = "linux_opencv", @@ -149,11 +185,10 @@ new_local_repository( http_archive( name = "android_opencv", - sha256 = "056b849842e4fa8751d09edbb64530cfa7a63c84ccd232d0ace330e27ba55d0b", build_file = "@//third_party:opencv_android.BUILD", strip_prefix = "OpenCV-android-sdk", type = "zip", - url = "https://github.com/opencv/opencv/releases/download/4.1.0/opencv-4.1.0-android-sdk.zip", + url = "https://github.com/opencv/opencv/releases/download/3.4.3/opencv-3.4.3-android-sdk.zip", ) # After OpenCV 3.2.0, the pre-compiled opencv2.framework has google protobuf symbols, which will @@ -184,13 +219,18 @@ maven_install( artifacts = [ "androidx.annotation:annotation:aar:1.1.0", "androidx.appcompat:appcompat:aar:1.1.0-rc01", + "androidx.camera:camera-core:aar:1.0.0-alpha06", + "androidx.camera:camera-camera2:aar:1.0.0-alpha06", "androidx.constraintlayout:constraintlayout:aar:1.1.3", "androidx.core:core:aar:1.1.0-rc03", "androidx.legacy:legacy-support-v4:aar:1.0.0", "androidx.recyclerview:recyclerview:aar:1.1.0-beta02", "com.google.android.material:material:aar:1.0.0-rc01", ], - repositories = ["https://dl.google.com/dl/android/maven2"], + repositories = [ + "https://dl.google.com/dl/android/maven2", + "https://repo1.maven.org/maven2", + ], ) maven_server( @@ -206,10 +246,10 @@ maven_jar( ) maven_jar( - name = "androidx_concurrent_futures", - 
artifact = "androidx.concurrent:concurrent-futures:1.0.0-alpha03", - sha1 = "b528df95c7e2fefa2210c0c742bf3e491c1818ae", - server = "google_server", + name = "androidx_concurrent_futures", + artifact = "androidx.concurrent:concurrent-futures:1.0.0-alpha03", + sha1 = "b528df95c7e2fefa2210c0c742bf3e491c1818ae", + server = "google_server", ) maven_jar( @@ -285,10 +325,13 @@ http_archive( build_file = "@//third_party:google_toolbox_for_mac.BUILD", ) +### Coral ### -# Coral #COMMIT=$(git ls-remote https://github.com/google-coral/crosstool master | awk '{print $1}') #SHA256=$(curl -L "https://github.com/google-coral/crosstool/archive/${COMMIT}.tar.gz" | sha256sum | awk '{print $1}') +# Oct 2019 +#COMMIT=9e00d5be43bf001f883b5700f5d04882fea00229 +#SHA256=cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb http_archive( name = "coral_crosstool", sha256 = "cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb", diff --git a/mediapipe/examples/coral/setup.sh b/mediapipe/examples/coral/setup.sh index d8680e49c..dabf4c7dd 100755 --- a/mediapipe/examples/coral/setup.sh +++ b/mediapipe/examples/coral/setup.sh @@ -8,7 +8,7 @@ echo ' sh mediapipe/examples/coral/setup.sh ' sleep 3 -mkdir opencv32_arm64_libs +mkdir -p opencv32_arm64_libs cp mediapipe/examples/coral/update_sources.sh update_sources.sh chmod +x update_sources.sh diff --git a/mediapipe/examples/desktop/autoflip/README.md b/mediapipe/examples/desktop/autoflip/README.md index 250fcf4b7..98004a782 100644 --- a/mediapipe/examples/desktop/autoflip/README.md +++ b/mediapipe/examples/desktop/autoflip/README.md @@ -11,6 +11,8 @@ 2. Build and run the run_autoflip binary to process a local video. +Note: AutoFlip currently only works with OpenCV 3 . Please verify your OpenCV version beforehand. 
+ ```bash bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \ mediapipe/examples/desktop/autoflip:run_autoflip diff --git a/mediapipe/examples/desktop/media_sequence/charades_dataset.py b/mediapipe/examples/desktop/media_sequence/charades_dataset.py index 0a3f005f4..9c8540575 100644 --- a/mediapipe/examples/desktop/media_sequence/charades_dataset.py +++ b/mediapipe/examples/desktop/media_sequence/charades_dataset.py @@ -63,12 +63,15 @@ import random import subprocess import sys import tempfile -import urllib import zipfile + from absl import app from absl import flags from absl import logging +from six.moves import range +from six.moves import urllib import tensorflow.compat.v1 as tf + from mediapipe.util.sequence import media_sequence as ms @@ -218,7 +221,7 @@ class Charades(object): return output_dict if split not in SPLITS: - raise ValueError("Split %s not in %s" % split, str(SPLITS.keys())) + raise ValueError("Split %s not in %s" % split, str(list(SPLITS.keys()))) all_shards = tf.io.gfile.glob( os.path.join(self.path_to_data, SPLITS[split][0] + "-*-of-*")) random.shuffle(all_shards) @@ -329,7 +332,7 @@ class Charades(object): if sys.version_info >= (3, 0): urlretrieve = urllib.request.urlretrieve else: - urlretrieve = urllib.urlretrieve + urlretrieve = urllib.request.urlretrieve logging.info("Creating data directory.") tf.io.gfile.makedirs(self.path_to_data) logging.info("Downloading license.") diff --git a/mediapipe/examples/desktop/media_sequence/demo_dataset.py b/mediapipe/examples/desktop/media_sequence/demo_dataset.py index 8279f9bfb..e05568cb7 100644 --- a/mediapipe/examples/desktop/media_sequence/demo_dataset.py +++ b/mediapipe/examples/desktop/media_sequence/demo_dataset.py @@ -57,11 +57,12 @@ import random import subprocess import sys import tempfile -import urllib from absl import app from absl import flags from absl import logging +from six.moves import range +from six.moves import urllib import tensorflow.compat.v1 as tf from mediapipe.util.sequence import media_sequence as ms @@ -198,7 +199,7 @@ class DemoDataset(object): if sys.version_info >= (3, 0): urlretrieve = urllib.request.urlretrieve else: - urlretrieve = urllib.urlretrieve + urlretrieve = urllib.request.urlretrieve for split in SPLITS: reader = csv.DictReader(SPLITS[split].split("\n")) all_metadata = [] diff --git a/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py b/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py index 04e6c59d1..eafe18f77 100644 --- a/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py +++ b/mediapipe/examples/desktop/media_sequence/kinetics_dataset.py @@ -73,11 +73,13 @@ import subprocess import sys import tarfile import tempfile -import urllib from absl import app from absl import flags from absl import logging +from six.moves import range +from six.moves import urllib +from six.moves import zip import tensorflow.compat.v1 as tf from mediapipe.util.sequence import media_sequence as ms @@ -96,15 +98,15 @@ FILEPATTERN = "kinetics_700_%s_25fps_rgb_flow" SPLITS = { "train": { "shards": 1000, - "examples": 540247 + "examples": 538779 }, "validate": { "shards": 100, - "examples": 34610 + "examples": 34499 }, "test": { "shards": 100, - "examples": 69103 + "examples": 68847 }, "custom": { "csv": None, # Add a CSV for your own data here. 
@@ -198,7 +200,7 @@ class Kinetics(object): return output_dict if split not in SPLITS: - raise ValueError("Split %s not in %s" % split, str(SPLITS.keys())) + raise ValueError("Split %s not in %s" % split, str(list(SPLITS.keys()))) all_shards = tf.io.gfile.glob( os.path.join(self.path_to_data, FILEPATTERN % split + "-*-of-*")) random.shuffle(all_shards) @@ -302,11 +304,12 @@ class Kinetics(object): continue # rename the row with a constitent set of names. if len(csv_row) == 5: - row = dict(zip(["label_name", "video", "start", "end", "split"], - csv_row)) + row = dict( + list( + zip(["label_name", "video", "start", "end", "split"], + csv_row))) else: - row = dict(zip(["video", "start", "end", "split"], - csv_row)) + row = dict(list(zip(["video", "start", "end", "split"], csv_row))) metadata = tf.train.SequenceExample() ms.set_example_id(bytes23(row["video"] + "_" + row["start"]), metadata) @@ -328,7 +331,7 @@ class Kinetics(object): if sys.version_info >= (3, 0): urlretrieve = urllib.request.urlretrieve else: - urlretrieve = urllib.urlretrieve + urlretrieve = urllib.request.urlretrieve logging.info("Creating data directory.") tf.io.gfile.makedirs(self.path_to_data) logging.info("Downloading annotations.") @@ -404,7 +407,7 @@ class Kinetics(object): assert NUM_CLASSES == num_keys, ( "Found %d labels for split: %s, should be %d" % ( num_keys, name, NUM_CLASSES)) - label_map = dict(zip(classes, range(len(classes)))) + label_map = dict(list(zip(classes, list(range(len(classes)))))) if SPLITS[name]["examples"] > 0: assert SPLITS[name]["examples"] == num_examples, ( "Found %d examples for split: %s, should be %d" % ( diff --git a/mediapipe/examples/desktop/youtube8m/README.md b/mediapipe/examples/desktop/youtube8m/README.md index 4ae262335..775acc9ff 100644 --- a/mediapipe/examples/desktop/youtube8m/README.md +++ b/mediapipe/examples/desktop/youtube8m/README.md @@ -30,6 +30,8 @@ ```bash # cd to the root directory of the MediaPipe repo cd - + + pip3 install tf_slim python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph ``` @@ -47,7 +49,7 @@ 5. Run the MediaPipe binary to extract the features. ```bash - bazel build -c opt \ + bazel build -c opt --linkopt=-s \ --define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \ mediapipe/examples/desktop/youtube8m:extract_yt8m_features @@ -87,7 +89,7 @@ 3. Build and run the inference binary. ```bash - bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ + bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \ mediapipe/examples/desktop/youtube8m:model_inference GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/youtube8m/model_inference \ @@ -113,13 +115,13 @@ 2. Build the inference binary. ```bash - bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ + bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \ mediapipe/examples/desktop/youtube8m:model_inference ``` 3. Run the python web server. - Note: pip install absl-py + Note: pip3 install absl-py ```bash python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd` @@ -142,7 +144,7 @@ 3. Build and run the inference binary. ```bash - bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ + bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \ mediapipe/examples/desktop/youtube8m:model_inference # segment_size is the number of seconds window of frames. 
diff --git a/mediapipe/examples/desktop/youtube8m/generate_vggish_frozen_graph.py b/mediapipe/examples/desktop/youtube8m/generate_vggish_frozen_graph.py index 179bfb31f..786e2a6e7 100644 --- a/mediapipe/examples/desktop/youtube8m/generate_vggish_frozen_graph.py +++ b/mediapipe/examples/desktop/youtube8m/generate_vggish_frozen_graph.py @@ -25,7 +25,7 @@ import sys from absl import app import tensorflow.compat.v1 as tf -from tensorflow.compat.v1.python.tools import freeze_graph +from tensorflow.python.tools import freeze_graph BASE_DIR = '/tmp/mediapipe/' diff --git a/mediapipe/framework/BUILD b/mediapipe/framework/BUILD index 8d76c2e21..4176bcd9c 100644 --- a/mediapipe/framework/BUILD +++ b/mediapipe/framework/BUILD @@ -1078,10 +1078,16 @@ cc_library( cc_library( name = "port", hdrs = ["port.h"], + defines = select({ + "//conditions:default": [], + }) + select({ + "//conditions:default": [], + "//mediapipe/gpu:disable_gpu": ["MEDIAPIPE_DISABLE_GPU"], + }), visibility = [ "//mediapipe/framework:__subpackages__", "//mediapipe/framework/port:__pkg__", - "//mediapipe/util:__pkg__", + "//mediapipe/util:__subpackages__", ], ) diff --git a/mediapipe/framework/deps/status_macros.h b/mediapipe/framework/deps/status_macros.h index 3e97510f5..8e3ddf2c6 100644 --- a/mediapipe/framework/deps/status_macros.h +++ b/mediapipe/framework/deps/status_macros.h @@ -134,17 +134,21 @@ // Example: Logging the error on failure. // ASSIGN_OR_RETURN(ValueType value, MaybeGetValue(query), _.LogError()); // -#define ASSIGN_OR_RETURN(...) \ - STATUS_MACROS_IMPL_GET_VARIADIC_(__VA_ARGS__, \ - STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_, \ - STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_) \ +#define ASSIGN_OR_RETURN(...) \ + STATUS_MACROS_IMPL_GET_VARIADIC_((__VA_ARGS__, \ + STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_, \ + STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_)) \ (__VA_ARGS__) // ================================================================= // == Implementation details, do not rely on anything below here. == // ================================================================= -#define STATUS_MACROS_IMPL_GET_VARIADIC_(_1, _2, _3, NAME, ...) NAME +// MSVC incorrectly expands variadic macros, splice together a macro call to +// work around the bug. +#define STATUS_MACROS_IMPL_GET_VARIADIC_HELPER_(_1, _2, _3, NAME, ...) 
NAME +#define STATUS_MACROS_IMPL_GET_VARIADIC_(args) \ + STATUS_MACROS_IMPL_GET_VARIADIC_HELPER_ args #define STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_(lhs, rexpr) \ STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, std::move(_)) diff --git a/mediapipe/framework/encode_binary_proto.bzl b/mediapipe/framework/encode_binary_proto.bzl index 6bbfc33e9..030897bce 100644 --- a/mediapipe/framework/encode_binary_proto.bzl +++ b/mediapipe/framework/encode_binary_proto.bzl @@ -99,7 +99,12 @@ def _encode_binary_proto_impl(ctx): ), mnemonic = "EncodeProto", ) - return struct(files = depset([binarypb])) + + output_depset = depset([binarypb]) + return [DefaultInfo( + files = output_depset, + data_runfiles = ctx.runfiles(transitive_files = output_depset), + )] encode_binary_proto = rule( implementation = _encode_binary_proto_impl, diff --git a/mediapipe/framework/profiler/sharded_map.h b/mediapipe/framework/profiler/sharded_map.h index 2a610f312..53c18cd8b 100644 --- a/mediapipe/framework/profiler/sharded_map.h +++ b/mediapipe/framework/profiler/sharded_map.h @@ -131,7 +131,7 @@ class ShardedMap { return *this; } inline bool operator==(const Iterator& other) const { - return iter_ == other.iter_; + return shard_ == other.shard_ && iter_ == other.iter_; } inline bool operator!=(const Iterator& other) const { return !operator==(other); @@ -154,7 +154,10 @@ class ShardedMap { : shard_(shard), iter_(iter), map_(map) {} // Releases all resources. inline void Clear() ABSL_NO_THREAD_SAFETY_ANALYSIS { - if (map_ && iter_ != map_->maps_.back().end()) { + if (!map_) return; + bool is_end = (shard_ == map_->maps_.size() - 1 && + iter_ == map_->maps_[shard_].end()); + if (!is_end) { map_->mutexes_[shard_].Unlock(); } map_ = nullptr; diff --git a/mediapipe/framework/timestamp.h b/mediapipe/framework/timestamp.h index a79c4fd7b..dc574cbdc 100644 --- a/mediapipe/framework/timestamp.h +++ b/mediapipe/framework/timestamp.h @@ -100,7 +100,6 @@ class Timestamp { } // Special values. - static Timestamp Unset(); static Timestamp Unstarted(); static Timestamp PreStream(); diff --git a/mediapipe/framework/tool/subgraph_expansion.cc b/mediapipe/framework/tool/subgraph_expansion.cc index 6d2ce40e9..9b9a50fb5 100644 --- a/mediapipe/framework/tool/subgraph_expansion.cc +++ b/mediapipe/framework/tool/subgraph_expansion.cc @@ -264,6 +264,10 @@ static ::mediapipe::Status PrefixNames(std::string prefix, generator.mutable_input_side_packet(), replace_names)); MP_RETURN_IF_ERROR(TransformStreamNames( generator.mutable_output_side_packet(), replace_names)); + + // Remove input side packets ignored by the subgraph-node. + MP_RETURN_IF_ERROR(RemoveIgnoredStreams( + generator.mutable_input_side_packet(), ignored_input_side_packets)); } return ::mediapipe::OkStatus(); } diff --git a/mediapipe/gpu/gpu_buffer_multi_pool.cc b/mediapipe/gpu/gpu_buffer_multi_pool.cc index a8555819c..0bfc21fb5 100644 --- a/mediapipe/gpu/gpu_buffer_multi_pool.cc +++ b/mediapipe/gpu/gpu_buffer_multi_pool.cc @@ -105,17 +105,27 @@ GpuBuffer GpuBufferMultiPool::GetBuffer(int width, int height, BufferSpec key(width, height, format); auto pool_it = pools_.find(key); if (pool_it == pools_.end()) { - // Discard the oldest pool in order of creation. - // TODO: implement a better policy. + // Discard the least recently used pool in LRU cache. if (pools_.size() >= kMaxPoolCount) { - auto old_spec = buffer_specs_.front(); - buffer_specs_.pop(); + auto old_spec = buffer_specs_.front(); // Front has LRU. 
+ buffer_specs_.pop_front(); pools_.erase(old_spec); } - buffer_specs_.push(key); + buffer_specs_.push_back(key); // Push new spec to back. std::tie(pool_it, std::ignore) = pools_.emplace(std::piecewise_construct, std::forward_as_tuple(key), std::forward_as_tuple(MakeSimplePool(key))); + } else { + // Find and move current 'key' spec to back, keeping others in same order. + auto specs_it = buffer_specs_.begin(); + while (specs_it != buffer_specs_.end()) { + if (*specs_it == key) { + buffer_specs_.erase(specs_it); + break; + } + ++specs_it; + } + buffer_specs_.push_back(key); } return GetBufferFromSimplePool(pool_it->first, pool_it->second); } diff --git a/mediapipe/gpu/gpu_buffer_multi_pool.h b/mediapipe/gpu/gpu_buffer_multi_pool.h index 73a871ade..70c4e68c6 100644 --- a/mediapipe/gpu/gpu_buffer_multi_pool.h +++ b/mediapipe/gpu/gpu_buffer_multi_pool.h @@ -22,8 +22,8 @@ #ifndef MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_ #define MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_ +#include #include -#include #include #include "absl/synchronization/mutex.h" @@ -110,7 +110,7 @@ class GpuBufferMultiPool { ABSL_GUARDED_BY(mutex_); // A queue of BufferSpecs to keep track of the age of each BufferSpec added to // the pool. - std::queue buffer_specs_; + std::deque buffer_specs_; #ifdef __APPLE__ // Texture caches used with this pool. diff --git a/mediapipe/gpu/metal.bzl b/mediapipe/gpu/metal.bzl index 0f19ec2c5..9d5291d95 100644 --- a/mediapipe/gpu/metal.bzl +++ b/mediapipe/gpu/metal.bzl @@ -73,13 +73,15 @@ def _metal_compiler_args(ctx, src, obj, minimum_os_version, copts, diagnostics, def _metal_compiler_inputs(srcs, hdrs, deps = []): """Determines the list of inputs required for a compile action.""" - objc_providers = [x.objc for x in deps if hasattr(x, "objc")] - objc_files = depset() - for objc in objc_providers: - objc_files += objc.header + cc_infos = [dep[CcInfo] for dep in deps if CcInfo in dep] - return srcs + hdrs + objc_files.to_list() + dep_headers = depset(transitive = [ + cc_info.compilation_context.headers + for cc_info in cc_infos + ]) + + return depset(srcs + hdrs, transitive = [dep_headers]) def _metal_library_impl(ctx): """Implementation for metal_library Skylark rule.""" @@ -144,11 +146,22 @@ def _metal_library_impl(ctx): **additional_params ) + cc_infos = [dep[CcInfo] for dep in ctx.attr.deps if CcInfo in dep] + if ctx.files.hdrs: + cc_infos.append( + CcInfo( + compilation_context = cc_common.create_compilation_context( + headers = depset([f for f in ctx.files.hdrs]), + ), + ), + ) + return [ DefaultInfo( files = depset([output_lib]), ), objc_provider, + cc_common.merge_cc_infos(cc_infos = cc_infos), # Return the provider for the new bundling logic of rules_apple. resources.bucketize_typed([output_lib], "unprocessed"), ] @@ -156,7 +169,7 @@ def _metal_library_impl(ctx): METAL_LIBRARY_ATTRS = dicts.add(apple_support.action_required_attrs(), { "srcs": attr.label_list(allow_files = [".metal"], allow_empty = False), "hdrs": attr.label_list(allow_files = [".h"]), - "deps": attr.label_list(providers = [["objc"]]), + "deps": attr.label_list(providers = [["objc", CcInfo]]), "copts": attr.string_list(), "minimum_os_version": attr.string(), }) diff --git a/mediapipe/graphs/object_detection_3d/BUILD b/mediapipe/graphs/object_detection_3d/BUILD new file mode 100644 index 000000000..d8f0a9744 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/BUILD @@ -0,0 +1,56 @@ +# Copyright 2019 The MediaPipe Authors. 
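Two notes on the framework changes above. First, the `ASSIGN_OR_RETURN` edit in `status_macros.h` works around MSVC expanding `__VA_ARGS__` as a single token when it is forwarded to another macro, which breaks selection between the 2-argument and 3-argument forms; parenthesizing the arguments and splicing them back through a helper forces a re-scan. Below is a generic, self-contained illustration of that pattern, not MediaPipe's actual macro; the `MY_MACRO*` and `GET_4TH_ARG*` names are made up.

```cpp
// Overload-by-arity macro dispatch that also expands correctly on MSVC.
// The extra parentheses plus the "splice" step force the preprocessor to
// re-scan the argument list before the 4th token (the target macro) is picked.
#include <cstdio>

#define GET_4TH_ARG_(a1, a2, a3, NAME, ...) NAME
#define GET_4TH_ARG(args) GET_4TH_ARG_ args  // Splice: re-expands "(...)".

#define MY_MACRO_2(a, b) std::printf("two: %d %d\n", (a), (b))
#define MY_MACRO_3(a, b, c) std::printf("three: %d %d %d\n", (a), (b), (c))

#define MY_MACRO(...) \
  GET_4TH_ARG((__VA_ARGS__, MY_MACRO_3, MY_MACRO_2))(__VA_ARGS__)

int main() {
  MY_MACRO(1, 2);     // Dispatches to MY_MACRO_2.
  MY_MACRO(1, 2, 3);  // Dispatches to MY_MACRO_3.
  return 0;
}
```

Second, the `GpuBufferMultiPool` edit replaces the oldest-created eviction policy with least-recently-used bookkeeping: a `std::deque` keeps keys ordered by recency with the LRU key at the front, a cache hit moves the key to the back, and an insert past `kMaxPoolCount` drops the front. A rough standalone sketch of the same policy; `LruPoolIndex` and its string keys are illustrative and not MediaPipe API.

```cpp
#include <cstddef>
#include <deque>
#include <string>
#include <unordered_map>

class LruPoolIndex {
 public:
  explicit LruPoolIndex(std::size_t max_size) : max_size_(max_size) {}

  // Returns the value for `key`, creating it if needed and evicting the
  // least recently used entry once the index is full.
  int& Get(const std::string& key) {
    auto it = entries_.find(key);
    if (it == entries_.end()) {
      if (entries_.size() >= max_size_) {
        entries_.erase(recency_.front());  // Front holds the LRU key.
        recency_.pop_front();
      }
      recency_.push_back(key);
      it = entries_.emplace(key, 0).first;
    } else {
      // Move `key` to the back, keeping the relative order of the others.
      for (auto rit = recency_.begin(); rit != recency_.end(); ++rit) {
        if (*rit == key) {
          recency_.erase(rit);
          break;
        }
      }
      recency_.push_back(key);
    }
    return it->second;
  }

 private:
  std::size_t max_size_;
  std::deque<std::string> recency_;  // LRU at front, MRU at back.
  std::unordered_map<std::string, int> entries_;
};

int main() {
  LruPoolIndex index(/*max_size=*/2);
  index.Get("640x480");
  index.Get("1280x720");
  index.Get("640x480");    // Refreshes 640x480; 1280x720 is now LRU.
  index.Get("1920x1080");  // Evicts 1280x720.
  return 0;
}
```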
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_binary_graph", +) + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +exports_files(glob([ + "*.pbtxt", +])) + +cc_library( + name = "mobile_calculators", + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/calculators/core:packet_resampler_calculator", + "//mediapipe/calculators/image:image_cropping_calculator", + "//mediapipe/gpu:gl_scaler_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:annotations_to_model_matrices_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:gl_animation_overlay_calculator", + "//mediapipe/graphs/object_detection_3d/subgraphs:objectron_detection_gpu", + "//mediapipe/graphs/object_detection_3d/subgraphs:objectron_tracking_gpu", + ], +) + +mediapipe_binary_graph( + name = "mobile_gpu_binary_graph_shoe", + graph = "shoe_classic_occlusion_tracking.pbtxt", + output_name = "mobile_gpu_shoe.binarypb", + visibility = ["//visibility:public"], + deps = [":mobile_calculators"], +) + +mediapipe_binary_graph( + name = "mobile_gpu_binary_graph_chair", + graph = "chair_classic_occlusion_tracking.pbtxt", + output_name = "mobile_gpu_chair.binarypb", + visibility = ["//visibility:public"], + deps = [":mobile_calculators"], +) diff --git a/mediapipe/graphs/object_detection_3d/calculators/BUILD b/mediapipe/graphs/object_detection_3d/calculators/BUILD new file mode 100644 index 000000000..be84c3e40 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/BUILD @@ -0,0 +1,476 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
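The `mediapipe_binary_graph` targets above serialize a `CalculatorGraphConfig` into a `.binarypb` file. As a rough sketch of how such a file can be consumed on the host side: the Android demo actually loads it through the asset manager and `FrameProcessor`, so the file reading, function name, and error handling below are illustrative assumptions only.

```cpp
#include <fstream>
#include <sstream>
#include <string>

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"

// Parses a binary graph produced by mediapipe_binary_graph and runs it.
// A real application would also attach input streams, side packets, and
// output pollers before StartRun.
::mediapipe::Status RunBinaryGraph(const std::string& path) {
  std::ifstream stream(path, std::ios::binary);
  std::stringstream contents;
  contents << stream.rdbuf();

  mediapipe::CalculatorGraphConfig config;
  RET_CHECK(config.ParseFromString(contents.str()))
      << "Failed to parse binary graph: " << path;

  mediapipe::CalculatorGraph graph;
  MP_RETURN_IF_ERROR(graph.Initialize(config));
  MP_RETURN_IF_ERROR(graph.StartRun({}));
  return graph.WaitUntilDone();
}
```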
+ +load("//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library") + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:private"]) + +proto_library( + name = "object_proto", + srcs = [ + "object.proto", + ], +) + +proto_library( + name = "a_r_capture_metadata_proto", + srcs = [ + "a_r_capture_metadata.proto", + ], +) + +proto_library( + name = "annotation_proto", + srcs = [ + "annotation_data.proto", + ], + deps = [ + ":a_r_capture_metadata_proto", + ":object_proto", + ], +) + +proto_library( + name = "belief_decoder_config_proto", + srcs = [ + "belief_decoder_config.proto", + ], +) + +proto_library( + name = "camera_parameters_proto", + srcs = [ + "camera_parameters.proto", + ], +) + +proto_library( + name = "frame_annotation_tracker_calculator_proto", + srcs = ["frame_annotation_tracker_calculator.proto"], + deps = [ + "//mediapipe/framework:calculator_proto", + ], +) + +proto_library( + name = "gl_animation_overlay_calculator_proto", + srcs = ["gl_animation_overlay_calculator.proto"], + visibility = ["//visibility:public"], + deps = ["//mediapipe/framework:calculator_proto"], +) + +proto_library( + name = "tflite_tensors_to_objects_calculator_proto", + srcs = ["tflite_tensors_to_objects_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + ":belief_decoder_config_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +proto_library( + name = "lift_2d_frame_annotation_to_3d_calculator_proto", + srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + ":belief_decoder_config_proto", + "//mediapipe/framework:calculator_proto", + ], +) + +proto_library( + name = "annotations_to_model_matrices_calculator_proto", + srcs = ["annotations_to_model_matrices_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_proto", + ], +) + +proto_library( + name = "model_matrix_proto", + srcs = ["model_matrix.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_proto", + ], +) + +proto_library( + name = "annotations_to_render_data_calculator_proto", + srcs = ["annotations_to_render_data_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_proto", + "//mediapipe/util:color_proto", + ], +) + +mediapipe_cc_proto_library( + name = "object_cc_proto", + srcs = ["object.proto"], + visibility = ["//visibility:public"], + deps = [":object_proto"], +) + +mediapipe_cc_proto_library( + name = "a_r_capture_metadata_cc_proto", + srcs = ["a_r_capture_metadata.proto"], + visibility = ["//visibility:public"], + deps = [":a_r_capture_metadata_proto"], +) + +mediapipe_cc_proto_library( + name = "annotation_cc_proto", + srcs = ["annotation_data.proto"], + cc_deps = [ + ":a_r_capture_metadata_cc_proto", + ":object_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":annotation_proto"], +) + +mediapipe_cc_proto_library( + name = "camera_parameters_cc_proto", + srcs = ["camera_parameters.proto"], + visibility = ["//visibility:public"], + deps = [":camera_parameters_proto"], +) + +mediapipe_cc_proto_library( + name = "frame_annotation_tracker_calculator_cc_proto", + srcs = ["frame_annotation_tracker_calculator.proto"], + cc_deps = [ + "//mediapipe/framework:calculator_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":frame_annotation_tracker_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = 
"gl_animation_overlay_calculator_cc_proto", + srcs = ["gl_animation_overlay_calculator.proto"], + cc_deps = [ + "//mediapipe/framework:calculator_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":gl_animation_overlay_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = "belief_decoder_config_cc_proto", + srcs = ["belief_decoder_config.proto"], + visibility = ["//visibility:public"], + deps = [":belief_decoder_config_proto"], +) + +mediapipe_cc_proto_library( + name = "tflite_tensors_to_objects_calculator_cc_proto", + srcs = ["tflite_tensors_to_objects_calculator.proto"], + cc_deps = [ + ":belief_decoder_config_cc_proto", + "//mediapipe/framework:calculator_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":tflite_tensors_to_objects_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = "lift_2d_frame_annotation_to_3d_calculator_cc_proto", + srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"], + cc_deps = [ + ":belief_decoder_config_cc_proto", + "//mediapipe/framework:calculator_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":lift_2d_frame_annotation_to_3d_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = "annotations_to_model_matrices_calculator_cc_proto", + srcs = ["annotations_to_model_matrices_calculator.proto"], + cc_deps = ["//mediapipe/framework:calculator_cc_proto"], + visibility = ["//visibility:public"], + deps = [":annotations_to_model_matrices_calculator_proto"], +) + +mediapipe_cc_proto_library( + name = "model_matrix_cc_proto", + srcs = ["model_matrix.proto"], + cc_deps = ["//mediapipe/framework:calculator_cc_proto"], + visibility = ["//visibility:public"], + deps = [":model_matrix_proto"], +) + +mediapipe_cc_proto_library( + name = "annotations_to_render_data_calculator_cc_proto", + srcs = ["annotations_to_render_data_calculator.proto"], + cc_deps = [ + "//mediapipe/framework:calculator_cc_proto", + "//mediapipe/util:color_cc_proto", + ], + visibility = ["//visibility:public"], + deps = [":annotations_to_render_data_calculator_proto"], +) + +cc_library( + name = "box_util", + srcs = ["box_util.cc"], + hdrs = ["box_util.h"], + deps = [ + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/util/tracking:box_tracker_cc_proto", + ], +) + +cc_library( + name = "frame_annotation_tracker", + srcs = ["frame_annotation_tracker.cc"], + hdrs = ["frame_annotation_tracker.h"], + deps = [ + ":annotation_cc_proto", + ":box_util", + "//mediapipe/framework/port:integral_types", + "//mediapipe/framework/port:logging", + "//mediapipe/util/tracking:box_tracker_cc_proto", + "@com_google_absl//absl/container:btree", + "@com_google_absl//absl/container:flat_hash_set", + ], +) + +cc_library( + name = "gl_animation_overlay_calculator", + srcs = ["gl_animation_overlay_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":camera_parameters_cc_proto", + ":gl_animation_overlay_calculator_cc_proto", + ":model_matrix_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/gpu:gl_calculator_helper", + "//mediapipe/gpu:shader_util", + "//mediapipe/util/android:asset_manager_util", + ], + alwayslink = 1, +) + +cc_library( + name = "decoder", + srcs = [ + "decoder.cc", + ], + hdrs = [ + "decoder.h", + ], + deps = [ + ":annotation_cc_proto", + ":belief_decoder_config_cc_proto", + 
"//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:status", + "@com_google_absl//absl/status", + "@eigen_archive//:eigen", + ], +) + +cc_library( + name = "tensor_util", + srcs = [ + "tensor_util.cc", + ], + hdrs = [ + "tensor_util.h", + ], + deps = [ + "//mediapipe/framework/port:logging", + "//mediapipe/framework/port:opencv_core", + "@org_tensorflow//tensorflow/lite:framework", + ], +) + +cc_library( + name = "box", + srcs = [ + "box.cc", + "model.cc", + ], + hdrs = [ + "box.h", + "model.h", + "types.h", + ], + deps = [ + ":annotation_cc_proto", + ":object_cc_proto", + "//mediapipe/framework/port:logging", + "@eigen_archive//:eigen", + ], +) + +cc_library( + name = "frame_annotation_to_timed_box_list_calculator", + srcs = ["frame_annotation_to_timed_box_list_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":annotation_cc_proto", + ":box_util", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:opencv_imgproc", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/util/tracking:box_tracker_cc_proto", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, +) + +cc_library( + name = "frame_annotation_tracker_calculator", + srcs = ["frame_annotation_tracker_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":annotation_cc_proto", + ":frame_annotation_tracker", + ":frame_annotation_tracker_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/util/tracking:box_tracker_cc_proto", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/memory", + ], + alwayslink = 1, +) + +cc_library( + name = "tflite_tensors_to_objects_calculator", + srcs = ["tflite_tensors_to_objects_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":annotation_cc_proto", + ":belief_decoder_config_cc_proto", + ":decoder", + ":tensor_util", + ":tflite_tensors_to_objects_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:ret_check", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@eigen_archive//:eigen", + "@org_tensorflow//tensorflow/lite:framework", + ], + alwayslink = 1, +) + +cc_library( + name = "lift_2d_frame_annotation_to_3d_calculator", + srcs = ["lift_2d_frame_annotation_to_3d_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":annotation_cc_proto", + ":belief_decoder_config_cc_proto", + ":decoder", + ":lift_2d_frame_annotation_to_3d_calculator_cc_proto", + ":tensor_util", + ":tflite_tensors_to_objects_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/deps:file_path", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/framework/port:ret_check", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@eigen_archive//:eigen", + "@org_tensorflow//tensorflow/lite:framework", + ], + alwayslink = 1, +) + +cc_library( + name = 
"annotations_to_model_matrices_calculator", + srcs = ["annotations_to_model_matrices_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":annotation_cc_proto", + ":annotations_to_model_matrices_calculator_cc_proto", + ":box", + ":model_matrix_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:calculator_options_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//mediapipe/util:color_cc_proto", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@eigen_archive//:eigen", + ], + alwayslink = 1, +) + +cc_library( + name = "annotations_to_render_data_calculator", + srcs = ["annotations_to_render_data_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":annotation_cc_proto", + ":annotations_to_render_data_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework:calculator_options_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/util:color_cc_proto", + "//mediapipe/util:render_data_cc_proto", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + +cc_test( + name = "box_util_test", + srcs = ["box_util_test.cc"], + deps = [ + ":box_util", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:opencv_core", + "//mediapipe/util/tracking:box_tracker_cc_proto", + ], +) + +cc_test( + name = "frame_annotation_tracker_test", + srcs = ["frame_annotation_tracker_test.cc"], + deps = [ + ":annotation_cc_proto", + ":frame_annotation_tracker", + "//mediapipe/framework/port:gtest_main", + "//mediapipe/framework/port:logging", + "//mediapipe/util/tracking:box_tracker_cc_proto", + "@com_google_absl//absl/container:flat_hash_set", + ], +) diff --git a/mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.proto b/mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.proto new file mode 100644 index 000000000..edc8c4b38 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.proto @@ -0,0 +1,551 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +// Info about the camera characteristics used to capture images and depth data. +// See developer.apple.com/documentation/avfoundation/avcameracalibrationdata +// for more information. +message AVCameraCalibrationData { + // 3x3 row-major matrix relating a camera's internal properties to an ideal + // pinhole-camera model. + // See + // developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881135-intrinsicmatrix + // for detailed usage information. + repeated float intrinsic_matrix = 1 [packed = true]; + + // The image dimensions to which the intrinsic_matrix values are relative. 
+ optional float intrinsic_matrix_reference_dimension_width = 2; + optional float intrinsic_matrix_reference_dimension_height = 3; + + // 3x4 row-major matrix relating a camera's position and orientation to a + // world or scene coordinate system. Consists of a unitless 3x3 rotation + // matrix (R) on the left and a translation (t) 3x1 vector on the right. The + // translation vector's units are millimeters. For example: + // + // |r1,1 r2,1 r3,1 | t1| + // [R | t] = |r1,2 r2,2 r3,2 | t2| + // |r1,3 r2,3 r3,3 | t3| + // + // is stored as [r11, r21, r31, t1, r12, r22, r32, t2, ...] + // + // See + // developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881130-extrinsicmatrix?language=objc + // for more information. + repeated float extrinsic_matrix = 4 [packed = true]; + + // The size, in millimeters, of one image pixel. + optional float pixel_size = 5; + + // A list of floating-point values describing radial distortions imparted by + // the camera lens, for use in rectifying camera images. + // See + // developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881129-lensdistortionlookuptable?language=objc + // for more information. + repeated float lens_distortion_lookup_values = 6 [packed = true]; + + // A list of floating-point values describing radial distortions for use in + // reapplying camera geometry to a rectified image. + // See + // developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881132-inverselensdistortionlookuptable?language=objc + // for more information. + repeated float inverse_lens_distortion_lookup_values = 7 [packed = true]; + + // The offset of the distortion center of the camera lens from the top-left + // corner of the image. + // See + // developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881131-lensdistortioncenter?language=objc + // for more information. + optional float lens_distortion_center_x = 8; + optional float lens_distortion_center_y = 9; +} + +// Container for depth data information. +// See developer.apple.com/documentation/avfoundation/avdepthdata for more info. +message AVDepthData { + // PNG representation of the grayscale depth data map. See discussion about + // depth_data_map_original_minimum_value, below, for information about how + // to interpret the pixel values. + optional bytes depth_data_map = 1; + + // Pixel format type of the original captured depth data. + // See + // developer.apple.com/documentation/corevideo/1563591-pixel_format_identifiers?language=objc + // for the complete list of possible pixel format types. This value represents + // a string for the associated OSType/FourCharCode. + optional string depth_data_type = 2; + + // Indicates the general accuracy of the depth_data_map. + // See developer.apple.com/documentation/avfoundation/avdepthdataaccuracy for + // more information. + enum Accuracy { + UNDEFINED_ACCURACY = 0; + // Values in the depth map are usable for foreground/background separation + // but are not absolutely accurate in the physical world. + RELATIVE = 1; + // Values in the depth map are absolutely accurate in the physical world. + ABSOLUTE = 2; + } + optional Accuracy depth_data_accuracy = 3 [default = RELATIVE]; + + // Indicates whether the depth_data_map contains temporally smoothed data. + optional bool depth_data_filtered = 4; + + // Quality of the depth_data_map. 
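Since `AVCameraCalibrationData` above stores the pinhole intrinsics as a flat, row-major 3x3 `repeated float`, recovering the usual focal lengths and principal point is just an indexing exercise. A minimal sketch, assuming the nine values have already been copied out of the proto; the struct name and the numbers in `main` are made up.

```cpp
#include <array>
#include <cstdio>

// Row-major 3x3 pinhole matrix:
//   | fx  0  cx |
//   |  0 fy  cy |
//   |  0  0   1 |
struct PinholeParams {
  float fx, fy, cx, cy;
};

PinholeParams FromRowMajorIntrinsics(const std::array<float, 9>& k) {
  return {/*fx=*/k[0], /*fy=*/k[4], /*cx=*/k[2], /*cy=*/k[5]};
}

int main() {
  const std::array<float, 9> k = {1450.f, 0.f,    640.f,
                                  0.f,    1450.f, 360.f,
                                  0.f,    0.f,    1.f};
  const PinholeParams p = FromRowMajorIntrinsics(k);
  std::printf("fx=%.1f fy=%.1f cx=%.1f cy=%.1f\n", p.fx, p.fy, p.cx, p.cy);
  return 0;
}
```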
+ enum Quality { + UNDEFINED_QUALITY = 0; + HIGH = 1; + LOW = 2; + } + optional Quality depth_data_quality = 5; + + // Associated calibration data for the depth_data_map. + optional AVCameraCalibrationData camera_calibration_data = 6; + + // The original range of values expressed by the depth_data_map, before + // grayscale normalization. For example, if the minimum and maximum values + // indicate a range of [0.5, 2.2], and the depth_data_type value indicates + // it was a depth map, then white pixels (255, 255, 255) will map to 0.5 and + // black pixels (0, 0, 0) will map to 2.2 with the grayscale range linearly + // interpolated inbetween. Conversely, if the depth_data_type value indicates + // it was a disparity map, then white pixels will map to 2.2 and black pixels + // will map to 0.5. + optional float depth_data_map_original_minimum_value = 7; + optional float depth_data_map_original_maximum_value = 8; + + // The width of the depth buffer map. + optional int32 depth_data_map_width = 9; + + // The height of the depth buffer map. + optional int32 depth_data_map_height = 10; + + // The row-major flattened array of the depth buffer map pixels. This will be + // either a float32 or float16 byte array, depending on 'depth_data_type'. + optional bytes depth_data_map_raw_values = 11; +} + +// Estimated scene lighting information associated with a captured video frame. +// See developer.apple.com/documentation/arkit/arlightestimate for more info. +message ARLightEstimate { + // The estimated intensity, in lumens, of ambient light throughout the scene. + optional double ambient_intensity = 1; + + // The estimated color temperature, in degrees Kelvin, of ambient light + // throughout the scene. + optional double ambient_color_temperature = 2; + + // Data describing the estimated lighting environment in all directions. + // Second-level spherical harmonics in separate red, green, and blue data + // planes. Thus, this buffer contains 3 sets of 9 coefficients, or a total of + // 27 values. + // See + // https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928222-sphericalharmonicscoefficients?language=objc + // for more information. + repeated float spherical_harmonics_coefficients = 3 [packed = true]; + + message DirectionVector { + optional float x = 1; + optional float y = 2; + optional float z = 3; + } + // A vector indicating the orientation of the strongest directional light + // source, normalized in the world-coordinate space. + // See + // https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928221-primarylightdirection?language=objc + // for more information; + optional DirectionVector primary_light_direction = 4; + + // The estimated intensity, in lumens, of the strongest directional light + // source in the scene. + // See + // https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928219-primarylightintensity?language=objc + // for more information. + optional float primary_light_intensity = 5; +} + +// Information about the camera position and imaging characteristics for a +// captured video frame. +// See developer.apple.com/documentation/arkit/arcamera for more information. +message ARCamera { + // The general quality of position tracking available when the camera captured + // a frame. + enum TrackingState { + UNDEFINED_TRACKING_STATE = 0; + // Camera position tracking is not available. + UNAVAILABLE = 1; + // Tracking is available, but the quality of results is questionable. 
+ LIMITED = 2; + // Camera position tracking is providing optimal results. + NORMAL = 3; + } + optional TrackingState tracking_state = 1 [default = UNAVAILABLE]; + + // A possible diagnosis for limited position tracking quality as of when the + // frame was captured. + enum TrackingStateReason { + UNDEFINED_TRACKING_STATE_REASON = 0; + // The current tracking state is not limited. + NONE = 1; + // Not yet enough camera or motion data to provide tracking information. + INITIALIZING = 2; + // The device is moving too fast for accurate image-based position tracking. + EXCESSIVE_MOTION = 3; + // Not enough distinguishable features for image-based position tracking. + INSUFFICIENT_FEATURES = 4; + // Tracking is limited due to a relocalization in progress. + RELOCALIZING = 5; + } + optional TrackingStateReason tracking_state_reason = 2 [default = NONE]; + + // 4x4 row-major matrix expressing position and orientation of the camera in + // world coordinate space. + // See developer.apple.com/documentation/arkit/arcamera/2866108-transform for + // more information. + repeated float transform = 3 [packed = true]; + + // The orientation of the camera, expressed as roll, pitch, and yaw values. + message EulerAngles { + optional float roll = 1; + optional float pitch = 2; + optional float yaw = 3; + } + optional EulerAngles euler_angles = 4; + + // The width and height, in pixels, of the captured camera image. + optional int32 image_resolution_width = 5; + optional int32 image_resolution_height = 6; + + // 3x3 row-major matrix that converts between the 2D camera plane and 3D world + // coordinate space. + // See developer.apple.com/documentation/arkit/arcamera/2875730-intrinsics for + // usage information. + repeated float intrinsics = 7 [packed = true]; + + // 4x4 row-major transform matrix appropriate for rendering 3D content to + // match the image captured by the camera. + // See + // developer.apple.com/documentation/arkit/arcamera/2887458-projectionmatrix + // for usage information. + repeated float projection_matrix = 8 [packed = true]; + + // 4x4 row-major transform matrix appropriate for converting from world-space + // to camera space. Relativized for the captured_image orientation (i.e. + // UILandscapeOrientationRight). + // See + // https://developer.apple.com/documentation/arkit/arcamera/2921672-viewmatrixfororientation?language=objc + // for more information. + repeated float view_matrix = 9 [packed = true]; +} + +// Container for a 3D mesh describing face topology. +message ARFaceGeometry { + // Each vertex represents a 3D point in the face mesh, in the face coordinate + // space. + // See developer.apple.com/documentation/arkit/arfacegeometry/2928201-vertices + // for more information. + message Vertex { + optional float x = 1; + optional float y = 2; + optional float z = 3; + } + repeated Vertex vertices = 1; + + // The number of elements in the vertices list. + optional int32 vertex_count = 2; + + // Each texture coordinate represents UV texture coordinates for the vertex at + // the corresponding index in the vertices buffer. + // See + // developer.apple.com/documentation/arkit/arfacegeometry/2928203-texturecoordinates + // for more information. + message TextureCoordinate { + optional float u = 1; + optional float v = 2; + } + repeated TextureCoordinate texture_coordinates = 3; + + // The number of elements in the texture_coordinates list. 
+ optional int32 texture_coordinate_count = 4; + + // Each integer value in this ordered list represents an index into the + // vertices and texture_coordinates lists. Each set of three indices + // identifies the vertices comprising a single triangle in the mesh. Each set + // of three indices forms a triangle, so the number of indices in the + // triangle_indices buffer is three times the triangle_count value. + // See + // developer.apple.com/documentation/arkit/arfacegeometry/2928199-triangleindices + // for more information. + repeated int32 triangle_indices = 5 [packed = true]; + + // The number of triangles described by the triangle_indices buffer. + // See + // developer.apple.com/documentation/arkit/arfacegeometry/2928207-trianglecount + // for more information. + optional int32 triangle_count = 6; +} + +// Contains a list of blend shape entries wherein each item maps a specific +// blend shape location to its associated coefficient. +message ARBlendShapeMap { + message MapEntry { + // Identifier for the specific facial feature. + // See developer.apple.com/documentation/arkit/arblendshapelocation for a + // complete list of identifiers. + optional string blend_shape_location = 1; + + // Indicates the current position of the feature relative to its neutral + // configuration, ranging from 0.0 (neutral) to 1.0 (maximum movement). + optional float blend_shape_coefficient = 2; + } + repeated MapEntry entries = 1; +} + +// Information about the pose, topology, and expression of a detected face. +// See developer.apple.com/documentation/arkit/arfaceanchor for more info. +message ARFaceAnchor { + // A coarse triangle mesh representing the topology of the detected face. + optional ARFaceGeometry geometry = 1; + + // A map of named coefficients representing the detected facial expression in + // terms of the movement of specific facial features. + optional ARBlendShapeMap blend_shapes = 2; + + // 4x4 row-major matrix encoding the position, orientation, and scale of the + // anchor relative to the world coordinate space. + // See + // https://developer.apple.com/documentation/arkit/aranchor/2867981-transform?language=objc + // for more information. + repeated float transform = 3; + + // Indicates whether the anchor's transform is valid. Frames that have a face + // anchor with this value set to NO should probably be ignored. + optional bool is_tracked = 4; +} + +// Container for a 3D mesh. +message ARPlaneGeometry { + message Vertex { + optional float x = 1; + optional float y = 2; + optional float z = 3; + } + + // Each texture coordinate represents UV texture coordinates for the vertex at + // the corresponding index in the vertices buffer. + // See + // https://developer.apple.com/documentation/arkit/arfacegeometry/2928203-texturecoordinates + // for more information. + message TextureCoordinate { + optional float u = 1; + optional float v = 2; + } + + // A buffer of vertex positions for each point in the plane mesh. + repeated Vertex vertices = 1; + + // The number of elements in the vertices buffer. + optional int32 vertex_count = 2; + + // A buffer of texture coordinate values for each point in the plane mesh. + repeated TextureCoordinate texture_coordinates = 3; + + // The number of elements in the texture_coordinates buffer. + optional int32 texture_coordinate_count = 4; + + // Each integer value in this ordered list represents an index into the + // vertices and texture_coordinates lists. 
Each set of three indices + // identifies the vertices comprising a single triangle in the mesh. Each set + // of three indices forms a triangle, so the number of indices in the + // triangle_indices buffer is three times the triangle_count value. + // See + // https://developer.apple.com/documentation/arkit/arplanegeometry/2941051-triangleindices + // for more information. + repeated int32 triangle_indices = 5 [packed = true]; + + // Each set of three indices forms a triangle, so the number of indices in the + // triangle_indices buffer is three times the triangle_count value. + // See + // https://developer.apple.com/documentation/arkit/arplanegeometry/2941058-trianglecount + // for more information. + optional int32 triangle_count = 6; + + // Each value in this buffer represents the position of a vertex along the + // boundary polygon of the estimated plane. The owning plane anchor's + // transform matrix defines the coordinate system for these points. + // See + // https://developer.apple.com/documentation/arkit/arplanegeometry/2941052-boundaryvertices + // for more information. + repeated Vertex boundary_vertices = 7; + + // The number of elements in the boundary_vertices buffer. + optional int32 boundary_vertex_count = 8; +} + +// Information about the position and orientation of a real-world flat surface. +// See https://developer.apple.com/documentation/arkit/arplaneanchor for more +// information. +message ARPlaneAnchor { + enum Alignment { + UNDEFINED = 0; + // The plane is perpendicular to gravity. + HORIZONTAL = 1; + // The plane is parallel to gravity. + VERTICAL = 2; + } + + // Wrapper for a 3D point / vector within the plane. See extent and center + // values for more information. + message PlaneVector { + optional float x = 1; + optional float y = 2; + optional float z = 3; + } + + enum PlaneClassification { + NONE = 0; + WALL = 1; + FLOOR = 2; + CEILING = 3; + TABLE = 4; + SEAT = 5; + } + + // The classification status for the plane. + enum PlaneClassificationStatus { + // The classfication process for the plane anchor has completed but the + // result is inconclusive. + UNKNOWN = 0; + // No classication information can be provided (set on error or if the + // device does not support plane classification). + UNAVAILABLE = 1; + // The classification process has not completed. + UNDETERMINED = 2; + // The classfication process for the plane anchor has completed. + KNOWN = 3; + } + + // The ID of the plane. + optional string identifier = 1; + + // 4x4 row-major matrix encoding the position, orientation, and scale of the + // anchor relative to the world coordinate space. + // See + // https://developer.apple.com/documentation/arkit/aranchor/2867981-transform + // for more information. + repeated float transform = 2; + + // The general orientation of the detected plane with respect to gravity. + optional Alignment alignment = 3; + + // A coarse triangle mesh representing the general shape of the detected + // plane. + optional ARPlaneGeometry geometry = 4; + + // The center point of the plane relative to its anchor position. + // Although the type of this property is a 3D vector, a plane anchor is always + // two-dimensional, and is always positioned in only the x and z directions + // relative to its transform position. (That is, the y-component of this + // vector is always zero.) + // See + // https://developer.apple.com/documentation/arkit/arplaneanchor/2882056-center + // for more information. 
+ optional PlaneVector center = 5; + + // The estimated width and length of the detected plane. + // See + // https://developer.apple.com/documentation/arkit/arplaneanchor/2882055-extent + // for more information. + optional PlaneVector extent = 6; + + // A Boolean value that indicates whether plane classification is available on + // the current device. On devices without plane classification support, all + // plane anchors report a classification value of NONE + // and a classification_status value of UNAVAILABLE. + optional bool classification_supported = 7; + + // A general characterization of what kind of real-world surface the plane + // anchor represents. + // See + // https://developer.apple.com/documentation/arkit/arplaneanchor/2990936-classification + // for more information. + optional PlaneClassification classification = 8; + + // The current state of ARKit's process for classifying the plane anchor. + // When this property's value is KNOWN, the classification property represents + // ARKit's characterization of the real-world surface corresponding to the + // plane anchor. + // See + // https://developer.apple.com/documentation/arkit/arplaneanchor/2990937-classificationstatus + // for more information. + optional PlaneClassificationStatus classification_status = 9; +} + +// A collection of points in the world coordinate space. +// See https://developer.apple.com/documentation/arkit/arpointcloud for more +// information. +message ARPointCloud { + message Point { + optional float x = 1; + optional float y = 2; + optional float z = 3; + } + + // The number of points in the cloud. + optional int32 count = 1; + + // The list of detected points. + repeated Point point = 2; + + // A list of unique identifiers corresponding to detected feature points. + // Each identifier in this list corresponds to the point at the same index + // in the points array. + repeated int64 identifier = 3 [packed = true]; +} + +// Video image and face position tracking information. +// See developer.apple.com/documentation/arkit/arframe for more information. +message ARFrame { + // The timestamp for the frame. + optional double timestamp = 1; + + // The depth data associated with the frame. Not all frames have depth data. + optional AVDepthData depth_data = 2; + + // The depth data object timestamp associated with the frame. May differ from + // the frame timestamp value. Is only set when the frame has depth_data. + optional double depth_data_timestamp = 3; + + // Camera information associated with the frame. + optional ARCamera camera = 4; + + // Light information associated with the frame. + optional ARLightEstimate light_estimate = 5; + + // Face anchor information associated with the frame. Not all frames have an + // active face anchor. + optional ARFaceAnchor face_anchor = 6; + + // Plane anchors associated with the frame. Not all frames have a plane + // anchor. Plane anchors and face anchors are mutually exclusive. + repeated ARPlaneAnchor plane_anchor = 7; + + // The current intermediate results of the scene analysis used to perform + // world tracking. + // See + // https://developer.apple.com/documentation/arkit/arframe/2887449-rawfeaturepoints + // for more information. 
+  optional ARPointCloud raw_feature_points = 8;
+}
diff --git a/mediapipe/graphs/object_detection_3d/calculators/annotation_data.proto b/mediapipe/graphs/object_detection_3d/calculators/annotation_data.proto
new file mode 100644
index 000000000..5a417cbbd
--- /dev/null
+++ b/mediapipe/graphs/object_detection_3d/calculators/annotation_data.proto
@@ -0,0 +1,92 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package mediapipe;
+
+import "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.proto";
+import "mediapipe/graphs/object_detection_3d/calculators/object.proto";
+
+// Projection of a 3D point on an image, and its metric depth.
+message NormalizedPoint2D {
+  // x-y position of the 2d keypoint in the image coordinate system.
+  // u,v \in [0, 1], where top left corner is (0, 0) and the bottom-right corner
+  // is (1, 1).
+  float x = 1;
+  float y = 2;
+
+  // The depth of the point in the camera coordinate system (in meters).
+  float depth = 3;
+}
+
+// The 3D point in the camera coordinate system; the scale is in meters.
+message Point3D {
+  float x = 1;
+  float y = 2;
+  float z = 3;
+}
+
+message AnnotatedKeyPoint {
+  int32 id = 1;
+  Point3D point_3d = 2;
+  NormalizedPoint2D point_2d = 3;
+}
+
+message ObjectAnnotation {
+  // Reference to the object identifier in ObjectInstance.
+  int32 object_id = 1;
+
+  // For each object, list all the annotated keypoints here.
+  // E.g. for bounding-boxes, we have 8 keypoints, hands = 21 keypoints, etc.
+  // These normalized points are the projections of the Object's 3D keypoints
+  // onto the current frame's camera pose.
+  repeated AnnotatedKeyPoint keypoints = 2;
+
+  // Visibility of this annotation in a frame.
+  float visibility = 3;
+}
+
+message FrameAnnotation {
+  // Unique frame id, corresponds to images.
+  int32 frame_id = 1;
+
+  // List of the annotated objects in this frame. Depending on how many objects
+  // are observable in this frame, we might have none or as many as
+  // sequence.objects_size() annotations.
+  repeated ObjectAnnotation annotations = 2;
+
+  // Information about the camera transformation (in the world coordinate) and
+  // imaging characteristics for a captured video frame.
+  ARCamera camera = 3;
+
+  // The timestamp for the frame.
+  double timestamp = 4;
+
+  // Plane center and normal in camera frame.
+  repeated float plane_center = 5;
+  repeated float plane_normal = 6;
+}
+
+// The sequence protocol contains the annotation data for the entire video clip.
+message Sequence {
+  // List of all the annotated 3D objects in this sequence in the world
+  // coordinate system. Given the camera poses of each frame (also in the world
+  // coordinate system), these objects' bounding boxes can be projected to each
+  // frame to get the per-frame annotation (i.e. image_annotation below).
+  repeated Object objects = 1;
+
+  // List of annotated data for each frame in sequence + frame information.
+ repeated FrameAnnotation frame_annotations = 2; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc new file mode 100644 index 000000000..220869945 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.cc @@ -0,0 +1,209 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "Eigen/Dense" +#include "Eigen/src/Core/util/Constants.h" +#include "Eigen/src/Geometry/Quaternion.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/calculator_options.pb.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/box.h" +#include "mediapipe/graphs/object_detection_3d/calculators/model_matrix.pb.h" +#include "mediapipe/util/color.pb.h" + +namespace mediapipe { + +namespace { + +constexpr char kAnnotationTag[] = "ANNOTATIONS"; +constexpr char kModelMatricesTag[] = "MODEL_MATRICES"; + +using Matrix4fRM = Eigen::Matrix; + +} // namespace + +// Converts the box prediction from Objectron Model to the Model matrices +// to be rendered. +// +// Input: +// ANNOTATIONS - Frame annotations with lifted 3D points, the points are in +// Objectron coordinate system. +// Output: +// MODEL_MATRICES - Result ModelMatrices, in OpenGL coordinate system. 
+// +// Usage example: +// node { +// calculator: "AnnotationsToModelMatricesCalculator" +// input_stream: "ANNOTATIONS:objects" +// output_stream: "MODEL_MATRICES:model_matrices" +//} + +class AnnotationsToModelMatricesCalculator : public CalculatorBase { + public: + AnnotationsToModelMatricesCalculator() {} + ~AnnotationsToModelMatricesCalculator() override {} + AnnotationsToModelMatricesCalculator( + const AnnotationsToModelMatricesCalculator&) = delete; + AnnotationsToModelMatricesCalculator& operator=( + const AnnotationsToModelMatricesCalculator&) = delete; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + ::mediapipe::Status GetModelMatricesForAnnotations( + const FrameAnnotation& annotations, + TimedModelMatrixProtoList* model_matrix_list); + + AnnotationsToModelMatricesCalculatorOptions options_; + Eigen::Vector3f model_scale_; + Matrix4fRM model_transformation_; +}; +REGISTER_CALCULATOR(AnnotationsToModelMatricesCalculator); + +::mediapipe::Status AnnotationsToModelMatricesCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)) << "No input stream found."; + if (cc->Inputs().HasTag(kAnnotationTag)) { + cc->Inputs().Tag(kAnnotationTag).Set(); + } + + if (cc->Outputs().HasTag(kModelMatricesTag)) { + cc->Outputs().Tag(kModelMatricesTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status AnnotationsToModelMatricesCalculator::Open( + CalculatorContext* cc) { + RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)); + + cc->SetOffset(TimestampDiff(0)); + options_ = cc->Options(); + + if (options_.model_scale_size() == 3) { + model_scale_ = + Eigen::Map(options_.model_scale().data()); + } else { + model_scale_.setOnes(); + } + + if (options_.model_transformation_size() == 16) { + model_transformation_ = + Eigen::Map(options_.model_transformation().data()); + } else { + model_transformation_.setIdentity(); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status AnnotationsToModelMatricesCalculator::Process( + CalculatorContext* cc) { + auto model_matrices = std::make_unique(); + + const FrameAnnotation& annotations = + cc->Inputs().Tag(kAnnotationTag).Get(); + + if (!GetModelMatricesForAnnotations(annotations, model_matrices.get()).ok()) { + return ::mediapipe::InvalidArgumentError( + "Error in GetModelMatricesForBoxes"); + } + cc->Outputs() + .Tag(kModelMatricesTag) + .Add(model_matrices.release(), cc->InputTimestamp()); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status +AnnotationsToModelMatricesCalculator::GetModelMatricesForAnnotations( + const FrameAnnotation& annotations, + TimedModelMatrixProtoList* model_matrix_list) { + if (model_matrix_list == nullptr) { + return ::mediapipe::InvalidArgumentError("model_matrix_list is nullptr"); + } + model_matrix_list->clear_model_matrix(); + + Box box("category"); + for (const auto& object : annotations.annotations()) { + TimedModelMatrixProto* model_matrix = model_matrix_list->add_model_matrix(); + model_matrix->set_id(object.object_id()); + + // Fit a box to the original vertices to estimate the scale of the box + std::vector vertices; + for (const auto& keypoint : object.keypoints()) { + const auto& point = keypoint.point_3d(); + Eigen::Vector3f p(point.x(), point.y(), point.z()); + vertices.emplace_back(p); + } + box.Fit(vertices); + + // Re-scale the box if necessary + Eigen::Vector3f estimated_scale = 
box.GetScale(); + vertices.clear(); + for (const auto& keypoint : object.keypoints()) { + const auto& point = keypoint.point_3d(); + Eigen::Vector3f p(point.x(), point.y(), point.z()); + vertices.emplace_back(p); + } + box.Fit(vertices); + + Matrix4fRM object_transformation = box.GetTransformation(); + Matrix4fRM model_view; + Matrix4fRM pursuit_model; + // The reference view is + // + // ref << 0., 0., 1., 0., + // -1., 0., 0., 0., + // 0., -1., 0., 0., + // 0., 0., 0., 1.; + // We have pursuit_model * model = model_view, to get pursuit_model: + // pursuit_model = model_view * model^-1 + // clang-format off + pursuit_model << 0.0, 1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0; + // clang-format on + + // Re-scale the CAD model to the scale of the estimated bounding box. + const Eigen::Vector3f scale = model_scale_.cwiseProduct(estimated_scale); + const Matrix4fRM model = + model_transformation_.array().colwise() * scale.homogeneous().array(); + + // Finally compute the model_view matrix. + model_view = pursuit_model * object_transformation * model; + + for (int i = 0; i < model_view.rows(); ++i) { + for (int j = 0; j < model_view.cols(); ++j) { + model_matrix->add_matrix_entries(model_view(i, j)); + } + } + } + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.proto new file mode 100644 index 000000000..c0159d453 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.proto @@ -0,0 +1,33 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message AnnotationsToModelMatricesCalculatorOptions { + extend CalculatorOptions { + optional AnnotationsToModelMatricesCalculatorOptions ext = 290166283; + } + + // Vector of size 3 indicating the scale vector [x, y, z]. We will re-scale + // the model size with this vector. (Defaults to [1., 1., 1.]) + repeated float model_scale = 1; + + // 4x4 Row major matrix denoting the transformation from the model to the + // Deep Pursuit 3D coordinate system (where front is +z, and up is +y). + repeated float model_transformation = 2; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.cc new file mode 100644 index 000000000..7f8b16009 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.cc @@ -0,0 +1,273 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/calculator_options.pb.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.pb.h" +#include "mediapipe/util/color.pb.h" +#include "mediapipe/util/render_data.pb.h" + +namespace mediapipe { + +namespace { + +constexpr char kAnnotationTag[] = "ANNOTATIONS"; +constexpr char kRenderDataTag[] = "RENDER_DATA"; +constexpr char kKeypointLabel[] = "KEYPOINT"; +constexpr int kMaxLandmarkThickness = 18; + +inline void SetColor(RenderAnnotation* annotation, const Color& color) { + annotation->mutable_color()->set_r(color.r()); + annotation->mutable_color()->set_g(color.g()); + annotation->mutable_color()->set_b(color.b()); +} + +// Remap x from range [lo hi] to range [0 1] then multiply by scale. +inline float Remap(float x, float lo, float hi, float scale) { + return (x - lo) / (hi - lo + 1e-6) * scale; +} + +inline void GetMinMaxZ(const FrameAnnotation& annotations, float* z_min, + float* z_max) { + *z_min = std::numeric_limits::max(); + *z_max = std::numeric_limits::min(); + // Use a global depth scale for all the objects in the scene + for (const auto& object : annotations.annotations()) { + for (const auto& keypoint : object.keypoints()) { + *z_min = std::min(keypoint.point_2d().depth(), *z_min); + *z_max = std::max(keypoint.point_2d().depth(), *z_max); + } + } +} + +void SetColorSizeValueFromZ(float z, float z_min, float z_max, + RenderAnnotation* render_annotation) { + const int color_value = 255 - static_cast(Remap(z, z_min, z_max, 255)); + ::mediapipe::Color color; + color.set_r(color_value); + color.set_g(color_value); + color.set_b(color_value); + SetColor(render_annotation, color); + const int thickness = static_cast((1.f - Remap(z, z_min, z_max, 1)) * + kMaxLandmarkThickness); + render_annotation->set_thickness(thickness); +} + +} // namespace + +// A calculator that converts FrameAnnotation proto to RenderData proto for +// visualization. The input should be the FrameAnnotation proto buffer. It is +// also possible to specify the connections between landmarks. 
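+// For instance, with the 1 + 8 keypoint layout used for the 3D bounding box
+// (center keypoint first, then the 8 corners), the 12 box edges defined in
+// box.cc could be listed as landmark_connections:
+//   [1, 5, 2, 6, 3, 7, 4, 8, 1, 3, 5, 7, 2, 4, 6, 8, 1, 2, 3, 4, 5, 6, 7, 8]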
+// +// Example config: +// node { +// calculator: "AnnotationsToRenderDataCalculator" +// input_stream: "ANNOTATIONS:annotations" +// output_stream: "RENDER_DATA:render_data" +// options { +// [AnnotationsToRenderDataCalculator.ext] { +// landmark_connections: [0, 1, 1, 2] +// landmark_color { r: 0 g: 255 b: 0 } +// connection_color { r: 0 g: 255 b: 0 } +// thickness: 4.0 +// } +// } +// } +class AnnotationsToRenderDataCalculator : public CalculatorBase { + public: + AnnotationsToRenderDataCalculator() {} + ~AnnotationsToRenderDataCalculator() override {} + AnnotationsToRenderDataCalculator(const AnnotationsToRenderDataCalculator&) = + delete; + AnnotationsToRenderDataCalculator& operator=( + const AnnotationsToRenderDataCalculator&) = delete; + + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + + ::mediapipe::Status Process(CalculatorContext* cc) override; + + private: + static void SetRenderAnnotationColorThickness( + const AnnotationsToRenderDataCalculatorOptions& options, + RenderAnnotation* render_annotation); + static RenderAnnotation* AddPointRenderData( + const AnnotationsToRenderDataCalculatorOptions& options, + RenderData* render_data); + + // Add a command to draw a line in the rendering queue. The line is drawn from + // (start_x, start_y) to (end_x, end_y). The input x,y can either be in pixel + // or normalized coordinate [0, 1] as indicated by the normalized flag. + static void AddConnectionToRenderData( + float start_x, float start_y, float end_x, float end_y, + const AnnotationsToRenderDataCalculatorOptions& options, bool normalized, + RenderData* render_data); + + // Same as above function. Instead of using color data to render the line, it + // re-colors the line according to the two depth value. gray_val1 is the color + // of the starting point and gray_val2 is the color of the ending point. The + // line is colored using gradient color from gray_val1 to gray_val2. The + // gray_val ranges from [0 to 255] for black to white. 
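+  // (The gray value is written to all three RGB channels, so the gradient is
+  // rendered in grayscale.)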
+ static void AddConnectionToRenderData( + float start_x, float start_y, float end_x, float end_y, + const AnnotationsToRenderDataCalculatorOptions& options, bool normalized, + int gray_val1, int gray_val2, RenderData* render_data); + + AnnotationsToRenderDataCalculatorOptions options_; +}; +REGISTER_CALCULATOR(AnnotationsToRenderDataCalculator); + +::mediapipe::Status AnnotationsToRenderDataCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)) << "No input stream found."; + if (cc->Inputs().HasTag(kAnnotationTag)) { + cc->Inputs().Tag(kAnnotationTag).Set(); + } + cc->Outputs().Tag(kRenderDataTag).Set(); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status AnnotationsToRenderDataCalculator::Open( + CalculatorContext* cc) { + cc->SetOffset(TimestampDiff(0)); + options_ = cc->Options(); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status AnnotationsToRenderDataCalculator::Process( + CalculatorContext* cc) { + auto render_data = absl::make_unique(); + bool visualize_depth = options_.visualize_landmark_depth(); + float z_min = 0.f; + float z_max = 0.f; + + if (cc->Inputs().HasTag(kAnnotationTag)) { + const auto& annotations = + cc->Inputs().Tag(kAnnotationTag).Get(); + RET_CHECK_EQ(options_.landmark_connections_size() % 2, 0) + << "Number of entries in landmark connections must be a multiple of 2"; + + if (visualize_depth) { + GetMinMaxZ(annotations, &z_min, &z_max); + // Only change rendering if there are actually z values other than 0. + visualize_depth &= ((z_max - z_min) > 1e-3); + } + + for (const auto& object : annotations.annotations()) { + for (const auto& keypoint : object.keypoints()) { + auto* keypoint_data_render = + AddPointRenderData(options_, render_data.get()); + auto* point = keypoint_data_render->mutable_point(); + if (visualize_depth) { + SetColorSizeValueFromZ(keypoint.point_2d().depth(), z_min, z_max, + keypoint_data_render); + } + + point->set_normalized(true); + point->set_x(keypoint.point_2d().x()); + point->set_y(keypoint.point_2d().y()); + } + + // Add edges + for (int i = 0; i < options_.landmark_connections_size(); i += 2) { + const auto& ld0 = + object.keypoints(options_.landmark_connections(i)).point_2d(); + const auto& ld1 = + object.keypoints(options_.landmark_connections(i + 1)).point_2d(); + const bool normalized = true; + + if (visualize_depth) { + const int gray_val1 = + 255 - static_cast(Remap(ld0.depth(), z_min, z_max, 255)); + const int gray_val2 = + 255 - static_cast(Remap(ld1.depth(), z_min, z_max, 255)); + AddConnectionToRenderData(ld0.x(), ld0.y(), ld1.x(), ld1.y(), + options_, normalized, gray_val1, gray_val2, + render_data.get()); + } else { + AddConnectionToRenderData(ld0.x(), ld0.y(), ld1.x(), ld1.y(), + options_, normalized, render_data.get()); + } + } + } + } + + cc->Outputs() + .Tag(kRenderDataTag) + .Add(render_data.release(), cc->InputTimestamp()); + + return ::mediapipe::OkStatus(); +} + +void AnnotationsToRenderDataCalculator::AddConnectionToRenderData( + float start_x, float start_y, float end_x, float end_y, + const AnnotationsToRenderDataCalculatorOptions& options, bool normalized, + int gray_val1, int gray_val2, RenderData* render_data) { + auto* connection_annotation = render_data->add_render_annotations(); + RenderAnnotation::GradientLine* line = + connection_annotation->mutable_gradient_line(); + line->set_x_start(start_x); + line->set_y_start(start_y); + line->set_x_end(end_x); + line->set_y_end(end_y); + line->set_normalized(normalized); + 
line->mutable_color1()->set_r(gray_val1); + line->mutable_color1()->set_g(gray_val1); + line->mutable_color1()->set_b(gray_val1); + line->mutable_color2()->set_r(gray_val2); + line->mutable_color2()->set_g(gray_val2); + line->mutable_color2()->set_b(gray_val2); + connection_annotation->set_thickness(options.thickness()); +} + +void AnnotationsToRenderDataCalculator::AddConnectionToRenderData( + float start_x, float start_y, float end_x, float end_y, + const AnnotationsToRenderDataCalculatorOptions& options, bool normalized, + RenderData* render_data) { + auto* connection_annotation = render_data->add_render_annotations(); + RenderAnnotation::Line* line = connection_annotation->mutable_line(); + line->set_x_start(start_x); + line->set_y_start(start_y); + line->set_x_end(end_x); + line->set_y_end(end_y); + line->set_normalized(normalized); + SetColor(connection_annotation, options.connection_color()); + connection_annotation->set_thickness(options.thickness()); +} + +RenderAnnotation* AnnotationsToRenderDataCalculator::AddPointRenderData( + const AnnotationsToRenderDataCalculatorOptions& options, + RenderData* render_data) { + auto* landmark_data_annotation = render_data->add_render_annotations(); + landmark_data_annotation->set_scene_tag(kKeypointLabel); + SetRenderAnnotationColorThickness(options, landmark_data_annotation); + return landmark_data_annotation; +} + +void AnnotationsToRenderDataCalculator::SetRenderAnnotationColorThickness( + const AnnotationsToRenderDataCalculatorOptions& options, + RenderAnnotation* render_annotation) { + SetColor(render_annotation, options.landmark_color()); + render_annotation->set_thickness(options.thickness()); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.proto new file mode 100644 index 000000000..1e04d955f --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.proto @@ -0,0 +1,43 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; +import "mediapipe/util/color.proto"; + +message AnnotationsToRenderDataCalculatorOptions { + extend CalculatorOptions { + optional AnnotationsToRenderDataCalculatorOptions ext = 267644238; + } + + // Specifies the landmarks to be connected in the drawing. For example, the + // landmark_connections value of [0, 1, 1, 2] specifies two connections: one + // that connects landmarks with index 0 and 1, and another that connects + // landmarks with index 1 and 2. + repeated int32 landmark_connections = 1; + + // Color of the landmarks. + optional Color landmark_color = 2; + // Color of the connections. + optional Color connection_color = 3; + + // Thickness of the drawing of landmarks and connections. 
+  optional double thickness = 4 [default = 1.0];
+
+  // Change color and size of rendered landmarks based on their z values.
+  optional bool visualize_landmark_depth = 5 [default = true];
+}
diff --git a/mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto b/mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto
new file mode 100644
index 000000000..f0f10ae87
--- /dev/null
+++ b/mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto
@@ -0,0 +1,38 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package mediapipe;
+
+message BeliefDecoderConfig {
+  optional float heatmap_threshold = 1 [default = 0.9];
+  // Maximum distance in pixels between two local max heatmap values.
+  optional float local_max_distance = 2 [default = 10.0];
+  // Coefficient of offset_scale.
+  // offset_scale = offset_scale_coef * min(rows, cols).
+  // offset_scale is used to multiply the offset predictions from the network.
+  optional float offset_scale_coef = 3 [default = 0.5, deprecated = true];
+
+  // The radius for vertex voting. Voting is disabled if the radius is less
+  // than or equal to 1. Example: 10.
+  optional int32 voting_radius = 4;
+
+  // The number of pixels to determine whether two points are the same.
+  // Example: 5 (voting_radius / 2).
+  optional int32 voting_allowance = 5;
+
+  // The minimum belief value a pixel needs in order to vote. Example: 0.2.
+  optional float voting_threshold = 6;
+}
diff --git a/mediapipe/graphs/object_detection_3d/calculators/box.cc b/mediapipe/graphs/object_detection_3d/calculators/box.cc
new file mode 100644
index 000000000..31c8ddc1c
--- /dev/null
+++ b/mediapipe/graphs/object_detection_3d/calculators/box.cc
@@ -0,0 +1,255 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
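As a point of reference, a decoder configuration assembled from the example values suggested in the BeliefDecoderConfig comments above might look like the following sketch (MakeExampleDecoderConfig is an illustrative helper, and the numbers are the documented examples, not tuned settings):

#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"

mediapipe::BeliefDecoderConfig MakeExampleDecoderConfig() {
  mediapipe::BeliefDecoderConfig config;
  config.set_heatmap_threshold(0.9f);    // matches the field default
  config.set_local_max_distance(10.0f);  // matches the field default
  config.set_voting_radius(10);          // example value from the field comment
  config.set_voting_allowance(5);        // example: voting_radius / 2
  config.set_voting_threshold(0.2f);     // example value from the field comment
  return config;
}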
+
+#include "mediapipe/graphs/object_detection_3d/calculators/box.h"
+
+#include "Eigen/src/Core/util/Constants.h"
+#include "mediapipe/framework/port/logging.h"
+
+namespace mediapipe {
+
+namespace {
+constexpr int kFrontFaceId = 4;
+constexpr int kTopFaceId = 2;
+constexpr int kNumKeypoints = 8 + 1;
+constexpr int kNumberOfAxis = 3;
+constexpr int kEdgesPerAxis = 4;
+
+}  // namespace
+
+Box::Box(const std::string& category)
+    : Model(kBoundingBox, kNumKeypoints, category),
+      bounding_box_(kNumKeypoints) {
+  transformation_.setIdentity();
+
+  scale_ << 0.1, 0.1, 0.1;
+
+  // The vertices are ordered according to the left-hand rule, so the normal
+  // vector of each face points inward, into the box.
+  faces_.push_back({5, 6, 8, 7});  // +x on yz plane
+  faces_.push_back({1, 3, 4, 2});  // -x on yz plane
+
+  faces_.push_back({3, 7, 8, 4});  // +y on xz plane = top
+  faces_.push_back({1, 2, 6, 5});  // -y on xz plane
+
+  faces_.push_back({2, 4, 8, 6});  // +z on xy plane = front
+  faces_.push_back({1, 5, 7, 3});  // -z on xy plane
+
+  // Add the edges in the cube, they are sorted according to axis (x-y-z).
+  edges_.push_back({1, 5});
+  edges_.push_back({2, 6});
+  edges_.push_back({3, 7});
+  edges_.push_back({4, 8});
+
+  edges_.push_back({1, 3});
+  edges_.push_back({5, 7});
+  edges_.push_back({2, 4});
+  edges_.push_back({6, 8});
+
+  edges_.push_back({1, 2});
+  edges_.push_back({3, 4});
+  edges_.push_back({5, 6});
+  edges_.push_back({7, 8});
+  Update();
+}
+
+void Box::Update() {
+  // Compute the center and the eight vertices of the bounding box from Box's
+  // parameters.
+  auto w = scale_[0] / 2.f;
+  auto h = scale_[1] / 2.f;
+  auto d = scale_[2] / 2.f;
+
+  // Define the local coordinate system, w.r.t. the center of the box.
+  bounding_box_[0] << 0., 0., 0.;
+  bounding_box_[1] << -w, -h, -d;
+  bounding_box_[2] << -w, -h, +d;
+  bounding_box_[3] << -w, +h, -d;
+  bounding_box_[4] << -w, +h, +d;
+  bounding_box_[5] << +w, -h, -d;
+  bounding_box_[6] << +w, -h, +d;
+  bounding_box_[7] << +w, +h, -d;
+  bounding_box_[8] << +w, +h, +d;
+
+  // Convert to world coordinate system.
+  for (int i = 0; i < kNumKeypoints; ++i) {
+    bounding_box_[i] =
+        transformation_.topLeftCorner<3, 3>() * bounding_box_[i] +
+        transformation_.col(3).head<3>();
+  }
+}
+
+void Box::Adjust(const std::vector<float>& variables) {
+  Eigen::Vector3f translation;
+  translation << variables[0], variables[1], variables[2];
+  SetTranslation(translation);
+
+  const float roll = variables[3];
+  const float pitch = variables[4];
+  const float yaw = variables[5];
+  SetRotation(roll, pitch, yaw);
+
+  Eigen::Vector3f scale;
+  scale << variables[6], variables[7], variables[8];
+
+  SetScale(scale);
+  Update();
+}
+
+float* Box::GetVertex(size_t vertex_id) {
+  CHECK_LT(vertex_id, kNumKeypoints);
+  return bounding_box_[vertex_id].data();
+}
+
+const float* Box::GetVertex(size_t vertex_id) const {
+  CHECK_LT(vertex_id, kNumKeypoints);
+  return bounding_box_[vertex_id].data();
+}
+
+bool Box::InsideTest(const Eigen::Vector3f& point, int check_axis) const {
+  const float* v0 = GetVertex(1);
+  const float* v1 = GetVertex(2);
+  const float* v2 = GetVertex(3);
+  const float* v4 = GetVertex(5);
+
+  // Vertex 5 differs from vertex 1 along x, vertex 3 along y, vertex 2 along z.
+  switch (check_axis) {
+    case 1:
+      return (v0[0] <= point[0] && point[0] <= v4[0]);  // X-axis
+    case 2:
+      return (v0[1] <= point[1] && point[1] <= v2[1]);  // Y-axis
+    case 3:
+      return (v0[2] <= point[2] && point[2] <= v1[2]);  // Z-axis
+    default:
+      return false;
+  }
+}
+
+void Box::Deserialize(const Object& obj) {
+  CHECK_EQ(obj.keypoints_size(), kNumKeypoints);
+  Model::Deserialize(obj);
+}
+
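A minimal usage sketch of the Box model (the helper name and all numeric values are hypothetical; per Adjust() above, the parameter vector is translation, then roll/pitch/yaw, then scale):

#include <vector>

#include "mediapipe/graphs/object_detection_3d/calculators/box.h"

void AdjustBoxExample() {
  mediapipe::Box box("chair");  // "chair" is an arbitrary category label.
  // {tx, ty, tz, roll, pitch, yaw, sx, sy, sz}
  const std::vector<float> variables = {0.f,  0.f,  -1.f, 0.f, 0.f,
                                        0.f,  0.5f, 0.8f, 0.5f};
  box.Adjust(variables);
  // After Adjust(), the world-space keypoints are available; index 0 is the
  // center and indices 1-8 are the corners (see Update() above).
  const float* top_corner = box.GetVertex(8);
  (void)top_corner;
}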
+void Box::Serialize(Object* obj) {
+  Model::Serialize(obj);
+  obj->set_type(Object::BOUNDING_BOX);
+  std::vector<Eigen::Vector3f> local_bounding_box(9);
+  // Define the local coordinate system, w.r.t. the center of the box.
+  local_bounding_box[0] << 0., 0., 0.;
+  local_bounding_box[1] << -0.5, -0.5, -0.5;
+  local_bounding_box[2] << -0.5, -0.5, +0.5;
+  local_bounding_box[3] << -0.5, +0.5, -0.5;
+  local_bounding_box[4] << -0.5, +0.5, +0.5;
+  local_bounding_box[5] << +0.5, -0.5, -0.5;
+  local_bounding_box[6] << +0.5, -0.5, +0.5;
+  local_bounding_box[7] << +0.5, +0.5, -0.5;
+  local_bounding_box[8] << +0.5, +0.5, +0.5;
+  for (int i = 0; i < kNumKeypoints; ++i) {
+    KeyPoint* keypoint = obj->add_keypoints();
+    keypoint->set_x(local_bounding_box[i][0]);
+    keypoint->set_y(local_bounding_box[i][1]);
+    keypoint->set_z(local_bounding_box[i][2]);
+    keypoint->set_confidence_radius(0.);
+  }
+}
+
+const Face& Box::GetFrontFace() const { return faces_[kFrontFaceId]; }
+
+const Face& Box::GetTopFace() const { return faces_[kTopFaceId]; }
+
+std::pair<Vector3f, Vector3f> Box::GetGroundPlane() const {
+  const Vector3f gravity = Vector3f(0., 1., 0.);
+  int ground_plane_id = 0;
+  float ground_plane_error = 10.0;
+
+  auto get_face_center = [&](const Face& face) {
+    Vector3f center = Vector3f::Zero();
+    for (const int vertex_id : face) {
+      center += Map<const Vector3f>(GetVertex(vertex_id));
+    }
+    center /= face.size();
+    return center;
+  };
+
+  auto get_face_normal = [&](const Face& face, const Vector3f& center) {
+    Vector3f v1 = Map<const Vector3f>(GetVertex(face[0])) - center;
+    Vector3f v2 = Map<const Vector3f>(GetVertex(face[1])) - center;
+    Vector3f normal = v1.cross(v2);
+    return normal;
+  };
+
+  // The ground plane is defined as a plane aligned with gravity.
+  // gravity is the (0, 1, 0) vector in the world coordinate system.
+  const auto& faces = GetFaces();
+  for (int face_id = 0; face_id < faces.size(); face_id += 2) {
+    const auto& face = faces[face_id];
+    Vector3f center = get_face_center(face);
+    Vector3f normal = get_face_normal(face, center);
+    Vector3f w = gravity.cross(normal);
+    const float w_sq_norm = w.squaredNorm();
+    if (w_sq_norm < ground_plane_error) {
+      ground_plane_error = w_sq_norm;
+      ground_plane_id = face_id;
+    }
+  }
+
+  Vector3f center = get_face_center(faces[ground_plane_id]);
+  Vector3f normal = get_face_normal(faces[ground_plane_id], center);
+
+  // For each face, we also have a parallel face whose normal is also aligned
+  // with the gravity vector. We pick the face with the lower height (y-value).
+  // The face parallel to face 0 is face 1, to face 2 is face 3, and to face 4
+  // is face 5.
+  int parallel_face_id = ground_plane_id + 1;
+  const auto& parallel_face = faces[parallel_face_id];
+  Vector3f parallel_face_center = get_face_center(parallel_face);
+  Vector3f parallel_face_normal =
+      get_face_normal(parallel_face, parallel_face_center);
+  if (parallel_face_center[1] < center[1]) {
+    center = parallel_face_center;
+    normal = parallel_face_normal;
+  }
+  return {center, normal};
+}
+
+template <typename T>
+void Box::Fit(const std::vector<T>& vertices) {
+  CHECK_EQ(vertices.size(), kNumKeypoints);
+  scale_.setZero();
+  // The scale remains invariant under rotation and translation, so we can
+  // safely estimate it from the oriented box.
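+  // Each scale component is estimated as the average length of the four edges
+  // that run parallel to that axis.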
+ for (int axis = 0; axis < kNumberOfAxis; ++axis) { + for (int edge_id = 0; edge_id < kEdgesPerAxis; ++edge_id) { + // The edges are stored in quadruples according to each axis + const std::array& edge = edges_[axis * kEdgesPerAxis + edge_id]; + scale_[axis] += (vertices[edge[0]] - vertices[edge[1]]).norm(); + } + scale_[axis] /= kEdgesPerAxis; + } + // Create a scaled axis-aligned box + transformation_.setIdentity(); + Update(); + + using MatrixN3_RM = Eigen::Matrix; + Eigen::Map v(vertices[0].data()); + Eigen::Map system(bounding_box_[0].data()); + auto system_h = system.rowwise().homogeneous().eval(); + auto system_g = system_h.colPivHouseholderQr(); + auto solution = system_g.solve(v).eval(); + transformation_.topLeftCorner<3, 4>() = solution.transpose(); + Update(); +} + +template void Box::Fit(const std::vector&); +template void Box::Fit>(const std::vector>&); +template void Box::Fit>( + const std::vector>&); +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/box.h b/mediapipe/graphs/object_detection_3d/calculators/box.h new file mode 100644 index 000000000..22839b52b --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/box.h @@ -0,0 +1,132 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_ + +#include + +#include "mediapipe/graphs/object_detection_3d/calculators/model.h" + +namespace mediapipe { + +// Model for the bounding box in 3D +// The box has 9 degrees of freedom, which uniquely defines 8 keypoints in the +// fixed world-coordinate system. +// +// The 8 keypoints are defined as follows +// +// kp-id axis +// 0 000 --- +// 1 001 --+ +// 2 010 -+- +// 3 011 -++ +// 4 100 +-- +// 5 101 +-+ +// 6 110 ++- +// 7 111 +++ +// +// where xyz means positive or negative vector along the axis where the center +// of the box is the origin. The resulting bounding box is +// +// x x +// 0 + + + + + + + + 4 .------- +// +\ +\ |\ +// + \ y + \ z | \ y +// + \ + \ | \ +// + 2 + + + + + + + + 6 +// z + + + + +// + + + + +// + + C + + +// + + + + +// 1 + + + + + + + + 5 + +// \ + \ + +// \ + \ + +// \+ \+ +// 3 + + + + + + + + 7 +// +// World coordinate system: +y is up (aligned with gravity), +// +z is toward the user, +x follows right hand rule. +// The front face is defined as +z axis on xy plane. +// The top face is defined as +y axis on xz plane. 
+//
+
+class Box : public Model {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  explicit Box(const std::string& category);
+  ~Box() override = default;
+
+  bool InsideTest(const Vector3f& point, int check_axis) const;
+
+  const std::vector<Face>& GetFaces() const { return faces_; }
+  const Face& GetFace(size_t face_id) const { return faces_[face_id]; }
+
+  const std::vector<std::array<int, 2>>& GetEdges() const { return edges_; }
+  const std::array<int, 2>& GetEdge(size_t edge_id) const {
+    return edges_[edge_id];
+  }
+
+  // Returns the keypoints for the front face of the box.
+  // The front face is defined as the face with the +z normal vector on the
+  // xy plane. In Box's c'tor, the front face is set to {2, 4, 8, 6}.
+  const Face& GetFrontFace() const;
+
+  // Returns the keypoints for the top face of the box.
+  // The top face is defined as the face with the +y normal vector on the
+  // xz plane. In Box's c'tor, the top face is set to {3, 7, 8, 4}.
+  const Face& GetTopFace() const;
+
+  void Update() override;
+  void Adjust(const std::vector<float>& variables) override;
+  float* GetVertex(size_t vertex_id) override;
+  const float* GetVertex(size_t vertex_id) const override;
+  void Deserialize(const Object& obj) override;
+  void Serialize(Object* obj) override;
+
+  // Computes the plane center and the normal vector for the plane the object
+  // is sitting on in the world coordinate system. The normal vector is roughly
+  // aligned with gravity.
+  std::pair<Vector3f, Vector3f> GetGroundPlane() const;
+
+  // Estimates the box's 9-dof parameters from the given vertices. Directly
+  // computes the scale of the box, then solves for orientation and
+  // translation. Expects a std::vector of size 9 of Eigen::Vector3f or mapped
+  // Vector3f. If mapping proto messages, we recommend using Eigen::Map.
+  // For example:
+  //
+  //   using T = Map<const Vector3f>;
+  //   std::vector<T> vertices;
+  //   for (const auto& point : message) {  // point is a repeated float message.
+  //     T p(point.data());
+  //     vertices.emplace_back(p);
+  //   }
+  //   box.Fit(vertices);
+  //
+  // The points must be arranged as a 1 + 8 vector (the center keypoint
+  // followed by the 8 box vertices). This function will overwrite the scale
+  // and transformation properties of the class.
+  template <typename T = Map<Vector3f>>
+  void Fit(const std::vector<T>& vertices);
+
+ private:
+  std::vector<Face> faces_;
+  std::vector<std::array<int, 2>> edges_;
+  std::vector<Vector3f> bounding_box_;
+};
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
diff --git a/mediapipe/graphs/object_detection_3d/calculators/box_util.cc b/mediapipe/graphs/object_detection_3d/calculators/box_util.cc
new file mode 100644
index 000000000..e07cac54c
--- /dev/null
+++ b/mediapipe/graphs/object_detection_3d/calculators/box_util.cc
@@ -0,0 +1,153 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
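The sketch below illustrates how Fit() can be fed directly from an ObjectAnnotation produced by the decoder, mirroring what AnnotationsToModelMatricesCalculator does above (the function name is illustrative; the keypoint order is the 1 + 8 layout Fit() expects):

#include <vector>

#include "Eigen/Dense"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/box.h"

void FitBoxToAnnotation(const mediapipe::ObjectAnnotation& object,
                        mediapipe::Box* box) {
  std::vector<Eigen::Vector3f> vertices;
  for (const auto& keypoint : object.keypoints()) {
    const auto& point = keypoint.point_3d();
    vertices.emplace_back(point.x(), point.y(), point.z());
  }
  // Overwrites the box's scale and transformation with the fitted values.
  box->Fit(vertices);
}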
+ +#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h" + +#include + +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace mediapipe { +void ComputeBoundingRect(const std::vector& points, + mediapipe::TimedBoxProto* box) { + CHECK(box != nullptr); + float top = 1.0f; + float bottom = 0.0f; + float left = 1.0f; + float right = 0.0f; + for (const auto& point : points) { + top = std::min(top, point.y); + bottom = std::max(bottom, point.y); + left = std::min(left, point.x); + right = std::max(right, point.x); + } + box->set_top(top); + box->set_bottom(bottom); + box->set_left(left); + box->set_right(right); + // We are currently only doing axis aligned bounding box. If we need to + // compute rotated bounding box, then we need the original image aspect ratio, + // map back to original image space, compute cv::convexHull, then for each + // edge of the hull, rotate according to edge orientation, find the box. + box->set_rotation(0.0f); +} + +float ComputeBoxIoU(const TimedBoxProto& box1, const TimedBoxProto& box2) { + cv::Point2f box1_center((box1.left() + box1.right()) * 0.5f, + (box1.top() + box1.bottom()) * 0.5f); + cv::Size2f box1_size(box1.right() - box1.left(), box1.bottom() - box1.top()); + cv::RotatedRect rect1(box1_center, box1_size, + -box1.rotation() * 180.0f / M_PI); + cv::Point2f box2_center((box2.left() + box2.right()) * 0.5f, + (box2.top() + box2.bottom()) * 0.5f); + cv::Size2f box2_size(box2.right() - box2.left(), box2.bottom() - box2.top()); + cv::RotatedRect rect2(box2_center, box2_size, + -box2.rotation() * 180.0f / M_PI); + std::vector intersections_unsorted; + std::vector intersections; + cv::rotatedRectangleIntersection(rect1, rect2, intersections_unsorted); + if (intersections_unsorted.size() < 3) { + return 0.0f; + } + cv::convexHull(intersections_unsorted, intersections); + + // We use Shoelace formula to compute area of polygons. + float intersection_area = 0.0f; + for (int i = 0; i < intersections.size(); ++i) { + const auto& curr_pt = intersections[i]; + const int i_next = (i + 1) == intersections.size() ? 0 : (i + 1); + const auto& next_pt = intersections[i_next]; + intersection_area += (curr_pt.x * next_pt.y - next_pt.x * curr_pt.y); + } + intersection_area = std::abs(intersection_area) * 0.5f; + + // Compute union area + const float union_area = + rect1.size.area() + rect2.size.area() - intersection_area + 1e-5f; + + const float iou = intersection_area / union_area; + return iou; +} + +std::vector ComputeBoxCorners(const TimedBoxProto& box, + float width, float height) { + // Rotate 4 corner w.r.t. center. + const cv::Point2f center(0.5f * (box.left() + box.right()) * width, + 0.5f * (box.top() + box.bottom()) * height); + const std::vector corners{ + cv::Point2f(box.left() * width, box.top() * height), + cv::Point2f(box.left() * width, box.bottom() * height), + cv::Point2f(box.right() * width, box.bottom() * height), + cv::Point2f(box.right() * width, box.top() * height)}; + + const float cos_a = std::cos(box.rotation()); + const float sin_a = std::sin(box.rotation()); + std::vector transformed_corners(4); + for (int k = 0; k < 4; ++k) { + // Scale and rotate w.r.t. center. 
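+    // Each corner is expressed relative to the box center, rotated by
+    // box.rotation() radians, shifted back, and finally re-normalized by the
+    // image width and height.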
+ const cv::Point2f rad = corners[k] - center; + const cv::Point2f rot_rad(cos_a * rad.x - sin_a * rad.y, + sin_a * rad.x + cos_a * rad.y); + transformed_corners[k] = center + rot_rad; + transformed_corners[k].x /= width; + transformed_corners[k].y /= height; + } + return transformed_corners; +} + +cv::Mat PerspectiveTransformBetweenBoxes(const TimedBoxProto& src_box, + const TimedBoxProto& dst_box, + const float aspect_ratio) { + std::vector box1_corners = + ComputeBoxCorners(src_box, /*width*/ aspect_ratio, /*height*/ 1.0f); + std::vector box2_corners = + ComputeBoxCorners(dst_box, /*width*/ aspect_ratio, /*height*/ 1.0f); + cv::Mat affine_transform = cv::getPerspectiveTransform( + /*src*/ box1_corners, /*dst*/ box2_corners); + cv::Mat output_affine; + affine_transform.convertTo(output_affine, CV_32FC1); + return output_affine; +} + +cv::Point2f MapPoint(const TimedBoxProto& src_box, const TimedBoxProto& dst_box, + const cv::Point2f& src_point, float width, float height) { + const cv::Point2f src_center( + 0.5f * (src_box.left() + src_box.right()) * width, + 0.5f * (src_box.top() + src_box.bottom()) * height); + const cv::Point2f dst_center( + 0.5f * (dst_box.left() + dst_box.right()) * width, + 0.5f * (dst_box.top() + dst_box.bottom()) * height); + const float scale_x = + (dst_box.right() - dst_box.left()) / (src_box.right() - src_box.left()); + const float scale_y = + (dst_box.bottom() - dst_box.top()) / (src_box.bottom() - src_box.top()); + const float rotation = dst_box.rotation() - src_box.rotation(); + const cv::Point2f rad = + cv::Point2f(src_point.x * width, src_point.y * height) - src_center; + const float rad_x = rad.x * scale_x; + const float rad_y = rad.y * scale_y; + const float cos_a = std::cos(rotation); + const float sin_a = std::sin(rotation); + const cv::Point2f rot_rad(cos_a * rad_x - sin_a * rad_y, + sin_a * rad_x + cos_a * rad_y); + const cv::Point2f dst_point_image = dst_center + rot_rad; + const cv::Point2f dst_point(dst_point_image.x / width, + dst_point_image.y / height); + return dst_point; +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/box_util.h b/mediapipe/graphs/object_detection_3d/calculators/box_util.h new file mode 100644 index 000000000..4076b156d --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/box_util.h @@ -0,0 +1,50 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_ + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace mediapipe { + +// This function fills the geometry of the TimedBoxProto. Id, timestamp etc. +// need to be set outside this function. +void ComputeBoundingRect(const std::vector& points, + mediapipe::TimedBoxProto* box); + +// This function computes the intersection over union between two boxes. 
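+// Both boxes may be rotated; the overlap is computed on the rotated rectangles
+// and the returned value lies in [0, 1].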
+float ComputeBoxIoU(const TimedBoxProto& box1, const TimedBoxProto& box2); + +// Computes corners of the box. +// width and height are image width and height, which is typically +// needed since the box is in normalized coordinates. +std::vector ComputeBoxCorners(const TimedBoxProto& box, + float width, float height); + +// Computes the perspective transform from box1 to box2. +// The input argument aspect_ratio is width / height of the image. +// The returned matrix should be a 3x3 matrix. +cv::Mat PerspectiveTransformBetweenBoxes(const TimedBoxProto& src_box, + const TimedBoxProto& dst_box, + const float aspect_ratio); + +// Map point according to source and destination box location. +cv::Point2f MapPoint(const TimedBoxProto& src_box, const TimedBoxProto& dst_box, + const cv::Point2f& src_point, float width, float height); + +} // namespace mediapipe + +#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_ diff --git a/mediapipe/graphs/object_detection_3d/calculators/box_util_test.cc b/mediapipe/graphs/object_detection_3d/calculators/box_util_test.cc new file mode 100644 index 000000000..97698cad2 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/box_util_test.cc @@ -0,0 +1,123 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
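A small usage sketch for the helpers declared above (BoxUtilExample is an illustrative name; the boxes use arbitrary normalized coordinates and rotations in radians):

#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"

void BoxUtilExample() {
  mediapipe::TimedBoxProto box;
  box.set_top(0.2f);
  box.set_bottom(0.6f);
  box.set_left(0.1f);
  box.set_right(0.3f);
  box.set_rotation(0.0f);

  mediapipe::TimedBoxProto rotated = box;
  rotated.set_rotation(0.5f);

  // Overlap between the two boxes, in [0, 1].
  const float iou = mediapipe::ComputeBoxIoU(box, rotated);

  // Carry a point from the first box's frame into the second one; only the
  // aspect ratio of the (hypothetical) 640x480 image matters for the math.
  const cv::Point2f mapped = mediapipe::MapPoint(
      box, rotated, cv::Point2f(0.2f, 0.4f), /*width=*/640.0f,
      /*height=*/480.0f);
  (void)iou;
  (void)mapped;
}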
+ +#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h" + +#include "mediapipe/framework/port/gmock.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace mediapipe { +namespace { + +TEST(BoxUtilTest, TestComputeBoundingRect) { + std::vector points{ + cv::Point2f(0.35f, 0.25f), cv::Point2f(0.3f, 0.3f), + cv::Point2f(0.2f, 0.4f), cv::Point2f(0.3f, 0.1f), + cv::Point2f(0.2f, 0.2f), cv::Point2f(0.5f, 0.3f), + cv::Point2f(0.4f, 0.4f), cv::Point2f(0.5f, 0.1f), + cv::Point2f(0.4f, 0.2f)}; + TimedBoxProto box; + ComputeBoundingRect(points, &box); + EXPECT_FLOAT_EQ(0.1f, box.top()); + EXPECT_FLOAT_EQ(0.4f, box.bottom()); + EXPECT_FLOAT_EQ(0.2f, box.left()); + EXPECT_FLOAT_EQ(0.5f, box.right()); +} + +TEST(BoxUtilTest, TestComputeBoxIoU) { + TimedBoxProto box1; + box1.set_top(0.2f); + box1.set_bottom(0.6f); + box1.set_left(0.1f); + box1.set_right(0.3f); + box1.set_rotation(0.0f); + TimedBoxProto box2 = box1; + box2.set_rotation(/*pi/2*/ 1.570796f); + const float box_area = + (box1.bottom() - box1.top()) * (box1.right() - box1.left()); + const float box_intersection = + (box1.right() - box1.left()) * (box1.right() - box1.left()); + const float expected_iou = + box_intersection / (box_area * 2 - box_intersection); + EXPECT_NEAR(expected_iou, ComputeBoxIoU(box1, box2), 3e-5f); + + TimedBoxProto box3; + box3.set_top(0.2f); + box3.set_bottom(0.6f); + box3.set_left(0.5f); + box3.set_right(0.7f); + EXPECT_NEAR(0.0f, ComputeBoxIoU(box1, box3), 3e-5f); +} + +TEST(BoxUtilTest, TestPerspectiveTransformBetweenBoxes) { + TimedBoxProto box1; + const float height = 4.0f; + const float width = 3.0f; + box1.set_top(1.0f / height); + box1.set_bottom(2.0f / height); + box1.set_left(1.0f / width); + box1.set_right(2.0f / width); + TimedBoxProto box2; + box2.set_top(1.0f / height); + box2.set_bottom(2.0f / height); + box2.set_left(1.0f / width); + box2.set_right(2.0f / width); + box2.set_rotation(/*pi/4*/ -0.785398f); + cv::Mat transform = + PerspectiveTransformBetweenBoxes(box1, box2, width / height); + const float kTolerence = 1e-5f; + const cv::Vec3f original_position(1.5f / width, 1.0f / height, 1.0f); + const cv::Mat transformed_position = transform * cv::Mat(original_position); + EXPECT_NEAR( + (1.5f - 0.5f * std::sqrt(2) / 2.0f) / width, + transformed_position.at(0) / transformed_position.at(2), + kTolerence); + EXPECT_NEAR( + (1.5f - 0.5f * std::sqrt(2) / 2.0f) / height, + transformed_position.at(1) / transformed_position.at(2), + kTolerence); +} + +TEST(BoxUtilTest, TestMapPoint) { + const float height = 4.0f; + const float width = 3.0f; + TimedBoxProto box1; + box1.set_top(1.0f / height); + box1.set_bottom(2.0f / height); + box1.set_left(1.0f / width); + box1.set_right(2.0f / width); + TimedBoxProto box2; + box2.set_top(1.0f / height); + box2.set_bottom(2.0f / height); + box2.set_left(1.0f / width); + box2.set_right(2.0f / width); + box2.set_rotation(/*pi/4*/ -0.785398f); + + cv::Point2f src_point1(1.2f / width, 1.4f / height); + cv::Point2f src_point2(1.3f / width, 1.8f / height); + const float distance1 = std::sqrt(0.1 * 0.1 + 0.4 * 0.4); + cv::Point2f dst_point1 = MapPoint(box1, box2, src_point1, width, height); + cv::Point2f dst_point2 = MapPoint(box1, box2, src_point2, width, height); + const float distance2 = + std::sqrt((dst_point1.x * width - dst_point2.x * width) * + (dst_point1.x * width - dst_point2.x * width) + + (dst_point1.y * height - dst_point2.y * height) * + 
(dst_point1.y * height - dst_point2.y * height)); + EXPECT_NEAR(distance1, distance2, 1e-5f); +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/camera_parameters.proto b/mediapipe/graphs/object_detection_3d/calculators/camera_parameters.proto new file mode 100644 index 000000000..f5c843b6e --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/camera_parameters.proto @@ -0,0 +1,47 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +message CameraParametersProto { + // This number is non-negative, it represents camera height above ground + // normalized by focal length. + optional float height_above_ground = 1 [default = 100.0]; + // Width of image in portrait orientation normalized by focal length + optional float portrait_width = 2 [default = 1.0103]; + // Height of image in portrait orientation normalized by focal length + optional float portrait_height = 3 [default = 1.3435]; + enum ImageOrientation { + PORTRAIT_ORIENTATION = 0; + LANDSCAPE_ORIENTATION = 1; + } + // The input image orientation + optional ImageOrientation image_orientation = 4 + [default = PORTRAIT_ORIENTATION]; + + // This defines the projection method from 2D screen to 3D. + enum ProjectionMode { + UNSPECIFIED = 0; + // Projects 2D point to ground plane (horizontal plane). + GROUND_PLANE = 1; + // Projects 2D point to sphere. + SPHERE = 2; + } + optional ProjectionMode projection_mode = 5 [default = GROUND_PLANE]; + // Radius of sphere when using the SPHERE projection mode above. + // The value is normalized by focal length. + optional float projection_sphere_radius = 6 [default = 100.0]; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/decoder.cc b/mediapipe/graphs/object_detection_3d/calculators/decoder.cc new file mode 100644 index 000000000..fdea8d51f --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/decoder.cc @@ -0,0 +1,257 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
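For reference, a CameraParametersProto configured along the lines of the message above might look like this sketch (MakeExampleCameraParameters is an illustrative helper; the orientation, projection mode, and numbers are arbitrary choices, not calibrated values):

#include "mediapipe/graphs/object_detection_3d/calculators/camera_parameters.pb.h"

mediapipe::CameraParametersProto MakeExampleCameraParameters() {
  mediapipe::CameraParametersProto params;
  params.set_height_above_ground(140.0f);  // normalized by focal length
  params.set_image_orientation(
      mediapipe::CameraParametersProto::LANDSCAPE_ORIENTATION);
  params.set_projection_mode(mediapipe::CameraParametersProto::SPHERE);
  params.set_projection_sphere_radius(100.0f);  // normalized by focal length
  return params;
}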
+ +#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h" + +#include + +#include "Eigen/Dense" +#include "mediapipe/framework/port/canonical_errors.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" + +namespace mediapipe { +constexpr int Decoder::kNumOffsetmaps = 16; + +namespace { +void SetPoint3d(float x, float y, float z, Point3D* point_3d) { + point_3d->set_x(x); + point_3d->set_y(y); + point_3d->set_z(z); +} +} // namespace + +FrameAnnotation Decoder::DecodeBoundingBoxKeypoints( + const cv::Mat& heatmap, const cv::Mat& offsetmap) const { + CHECK_EQ(1, heatmap.channels()); + CHECK_EQ(kNumOffsetmaps, offsetmap.channels()); + CHECK_EQ(heatmap.cols, offsetmap.cols); + CHECK_EQ(heatmap.rows, offsetmap.rows); + + const float offset_scale = std::min(offsetmap.cols, offsetmap.rows); + const std::vector center_points = ExtractCenterKeypoints(heatmap); + std::vector boxes; + for (const auto& center_point : center_points) { + BeliefBox box; + box.box_2d.emplace_back(center_point.x, center_point.y); + const int center_x = static_cast(std::round(center_point.x)); + const int center_y = static_cast(std::round(center_point.y)); + box.belief = heatmap.at(center_y, center_x); + if (config_.voting_radius() > 1) { + DecodeByVoting(heatmap, offsetmap, center_x, center_y, offset_scale, + offset_scale, &box); + } else { + DecodeByPeak(offsetmap, center_x, center_y, offset_scale, offset_scale, + &box); + } + if (IsNewBox(&boxes, &box)) { + boxes.push_back(std::move(box)); + } + } + + const float x_scale = 1.0f / offsetmap.cols; + const float y_scale = 1.0f / offsetmap.rows; + FrameAnnotation frame_annotations; + for (const auto& box : boxes) { + auto* object = frame_annotations.add_annotations(); + for (const auto& point : box.box_2d) { + auto* point2d = object->add_keypoints()->mutable_point_2d(); + point2d->set_x(point.first * x_scale); + point2d->set_y(point.second * y_scale); + } + } + return frame_annotations; +} + +void Decoder::DecodeByPeak(const cv::Mat& offsetmap, int center_x, int center_y, + float offset_scale_x, float offset_scale_y, + BeliefBox* box) const { + const auto& offset = offsetmap.at>( + /*row*/ center_y, /*col*/ center_x); + for (int i = 0; i < kNumOffsetmaps / 2; ++i) { + const float x_offset = offset[2 * i] * offset_scale_x; + const float y_offset = offset[2 * i + 1] * offset_scale_y; + box->box_2d.emplace_back(center_x + x_offset, center_y + y_offset); + } +} + +void Decoder::DecodeByVoting(const cv::Mat& heatmap, const cv::Mat& offsetmap, + int center_x, int center_y, float offset_scale_x, + float offset_scale_y, BeliefBox* box) const { + // Votes at the center. + const auto& center_offset = offsetmap.at>( + /*row*/ center_y, /*col*/ center_x); + std::vector center_votes(kNumOffsetmaps, 0.f); + for (int i = 0; i < kNumOffsetmaps / 2; ++i) { + center_votes[2 * i] = center_x + center_offset[2 * i] * offset_scale_x; + center_votes[2 * i + 1] = + center_y + center_offset[2 * i + 1] * offset_scale_y; + } + + // Find voting window. 
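+  // The window is a square of side (2 * voting_radius + 1) centered at the
+  // peak and clamped to the heatmap bounds. Every pixel inside it whose
+  // belief exceeds voting_threshold casts a belief-weighted vote for each
+  // vertex; votes farther than voting_allowance from the center pixel's own
+  // estimate are discarded.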
+ int x_min = std::max(0, center_x - config_.voting_radius()); + int y_min = std::max(0, center_y - config_.voting_radius()); + int width = std::min(heatmap.cols - x_min, config_.voting_radius() * 2 + 1); + int height = std::min(heatmap.rows - y_min, config_.voting_radius() * 2 + 1); + cv::Rect rect(x_min, y_min, width, height); + cv::Mat heat = heatmap(rect); + cv::Mat offset = offsetmap(rect); + + for (int i = 0; i < kNumOffsetmaps / 2; ++i) { + float x_sum = 0.f; + float y_sum = 0.f; + float votes = 0.f; + for (int r = 0; r < heat.rows; ++r) { + for (int c = 0; c < heat.cols; ++c) { + const float belief = heat.at(r, c); + if (belief < config_.voting_threshold()) { + continue; + } + float offset_x = + offset.at>(r, c)[2 * i] * + offset_scale_x; + float offset_y = + offset.at>(r, c)[2 * i + 1] * + offset_scale_y; + float vote_x = c + rect.x + offset_x; + float vote_y = r + rect.y + offset_y; + float x_diff = std::abs(vote_x - center_votes[2 * i]); + float y_diff = std::abs(vote_y - center_votes[2 * i + 1]); + if (x_diff > config_.voting_allowance() || + y_diff > config_.voting_allowance()) { + continue; + } + x_sum += vote_x * belief; + y_sum += vote_y * belief; + votes += belief; + } + } + box->box_2d.emplace_back(x_sum / votes, y_sum / votes); + } +} + +bool Decoder::IsNewBox(std::vector* boxes, BeliefBox* box) const { + for (auto& b : *boxes) { + if (IsIdentical(b, *box)) { + if (b.belief < box->belief) { + std::swap(b, *box); + } + return false; + } + } + return true; +} + +bool Decoder::IsIdentical(const BeliefBox& box_1, + const BeliefBox& box_2) const { + // Skip the center point. + for (int i = 1; i < box_1.box_2d.size(); ++i) { + const float x_diff = + std::abs(box_1.box_2d[i].first - box_2.box_2d[i].first); + const float y_diff = + std::abs(box_1.box_2d[i].second - box_2.box_2d[i].second); + if (x_diff > config_.voting_allowance() || + y_diff > config_.voting_allowance()) { + return false; + } + } + return true; +} + +std::vector Decoder::ExtractCenterKeypoints( + const cv::Mat& center_heatmap) const { + cv::Mat max_filtered_heatmap(center_heatmap.rows, center_heatmap.cols, + center_heatmap.type()); + const int kernel_size = + static_cast(config_.local_max_distance() * 2 + 1 + 0.5f); + const cv::Size morph_size(kernel_size, kernel_size); + cv::dilate(center_heatmap, max_filtered_heatmap, + cv::getStructuringElement(cv::MORPH_RECT, morph_size)); + cv::Mat peak_map; + cv::bitwise_and((center_heatmap >= max_filtered_heatmap), + (center_heatmap >= config_.heatmap_threshold()), peak_map); + std::vector locations; // output, locations of non-zero pixels + cv::findNonZero(peak_map, locations); + return locations; +} + +absl::Status Decoder::Lift2DTo3D( + const Eigen::Matrix& projection_matrix, + bool portrait, FrameAnnotation* estimated_box) const { + CHECK(estimated_box != nullptr); + const float fx = projection_matrix(0, 0); + const float fy = projection_matrix(1, 1); + const float cx = projection_matrix(0, 2); + const float cy = projection_matrix(1, 2); + for (auto& annotation : *estimated_box->mutable_annotations()) { + Eigen::Matrix m = + Eigen::Matrix::Zero(16, 12); + CHECK_EQ(9, annotation.keypoints_size()); + float u, v; + for (int i = 0; i < 8; ++i) { + const auto& keypoint2d = annotation.keypoints(i + 1).point_2d(); + if (portrait) { + // swap x and y given that our image is in portrait orientation + u = keypoint2d.y() * 2 - 1; + v = keypoint2d.x() * 2 - 1; + } else { + u = keypoint2d.x() * 2 - 1; + v = 1 - keypoint2d.y() * 2; // (1 - keypoint2d.y()) * 2 - 1 + } + for (int j 
= 0; j < 4; ++j) { + // For each of the 4 control points, formulate two rows of the + // m matrix (two equations). + const float control_alpha = epnp_alpha_(i, j); + m(i * 2, j * 3) = fx * control_alpha; + m(i * 2, j * 3 + 2) = (cx + u) * control_alpha; + m(i * 2 + 1, j * 3 + 1) = fy * control_alpha; + m(i * 2 + 1, j * 3 + 2) = (cy + v) * control_alpha; + } + } + // This is a self adjoint matrix. Use SelfAdjointEigenSolver for a fast + // and stable solution. + Eigen::Matrix mt_m = m.transpose() * m; + Eigen::SelfAdjointEigenSolver> + eigen_solver(mt_m); + if (eigen_solver.info() != Eigen::Success) { + return absl::AbortedError("Eigen decomposition failed."); + } + CHECK_EQ(12, eigen_solver.eigenvalues().size()); + // Eigenvalues are sorted in increasing order for SelfAdjointEigenSolver + // only! If you use other Eigen Solvers, it's not guaranteed to be in + // increasing order. Here, we just take the eigen vector corresponding + // to first/smallest eigen value, since we used SelfAdjointEigenSolver. + Eigen::VectorXf eigen_vec = eigen_solver.eigenvectors().col(0); + Eigen::Map> control_matrix( + eigen_vec.data()); + if (control_matrix(0, 2) > 0) { + control_matrix = -control_matrix; + } + // First set the center keypoint. + SetPoint3d(control_matrix(0, 0), control_matrix(0, 1), control_matrix(0, 2), + annotation.mutable_keypoints(0)->mutable_point_3d()); + // Then set the 8 vertices. + Eigen::Matrix vertices = + epnp_alpha_ * control_matrix; + for (int i = 0; i < 8; ++i) { + SetPoint3d(vertices(i, 0), vertices(i, 1), vertices(i, 2), + annotation.mutable_keypoints(i + 1)->mutable_point_3d()); + } + } + return absl::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/decoder.h b/mediapipe/graphs/object_detection_3d/calculators/decoder.h new file mode 100644 index 000000000..2d7065062 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/decoder.h @@ -0,0 +1,109 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_ + +#include + +#include "Eigen/Dense" +#include "absl/status/status.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h" + +namespace mediapipe { + +// Decodes 3D bounding box from heatmaps and offset maps. In the future, +// if we want to develop decoder for generic skeleton, then we need to +// generalize this class, and make a few child classes. 
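+//
+// A minimal usage sketch (hypothetical caller, for illustration only):
+//   Decoder decoder(belief_decoder_config);
+//   FrameAnnotation boxes =
+//       decoder.DecodeBoundingBoxKeypoints(heatmap, offsetmap);
+//   auto status = decoder.Lift2DTo3D(projection_matrix, /*portrait=*/true,
+//                                    &boxes);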
+class Decoder { + public: + static const int kNumOffsetmaps; + + explicit Decoder(const BeliefDecoderConfig& config) : config_(config) { + epnp_alpha_ << 4.0f, -1.0f, -1.0f, -1.0f, 2.0f, -1.0f, -1.0f, 1.0f, 2.0f, + -1.0f, 1.0f, -1.0f, 0.0f, -1.0f, 1.0f, 1.0f, 2.0f, 1.0f, -1.0f, -1.0f, + 0.0f, 1.0f, -1.0f, 1.0f, 0.0f, 1.0f, 1.0f, -1.0f, -2.0f, 1.0f, 1.0f, + 1.0f; + } + + // Decodes bounding boxes from predicted heatmap and offset maps. + // Input: + // heatmap: a single channel cv::Mat representing center point heatmap + // offsetmap: a 16 channel cv::Mat representing the 16 offset maps + // (2 for each of the 8 vertices) + // Output: + // Outputs 3D bounding boxes 2D vertices, represented by 'point_2d' field + // in each 'keypoints' field of object annotations. + FrameAnnotation DecodeBoundingBoxKeypoints(const cv::Mat& heatmap, + const cv::Mat& offsetmap) const; + + // Lifts the estimated 2D projections of bounding box vertices to 3D. + // This function uses the EPnP approach described in this paper: + // https://icwww.epfl.ch/~lepetit/papers/lepetit_ijcv08.pdf . + // Input: + // projection_matrix: the projection matrix from 3D coordinate + // to screen coordinate. + // The 2D screen coordinate is defined as: u is along the long + // edge of the device, pointing down; v is along the short edge + // of the device, pointing right. + // portrait: a boolen variable indicating whether our images are + // obtained in portrait orientation or not. + // estimated_box: annotation with point_2d field populated with + // 2d vertices. + // Output: + // estimated_box: annotation with point_3d field populated with + // 3d vertices. + absl::Status Lift2DTo3D( + const Eigen::Matrix& projection_matrix, + bool portrait, FrameAnnotation* estimated_box) const; + + private: + struct BeliefBox { + float belief; + std::vector> box_2d; + }; + + std::vector ExtractCenterKeypoints( + const cv::Mat& center_heatmap) const; + + // Decodes 2D keypoints at the peak point. + void DecodeByPeak(const cv::Mat& offsetmap, int center_x, int center_y, + float offset_scale_x, float offset_scale_y, + BeliefBox* box) const; + + // Decodes 2D keypoints by voting around the peak. + void DecodeByVoting(const cv::Mat& heatmap, const cv::Mat& offsetmap, + int center_x, int center_y, float offset_scale_x, + float offset_scale_y, BeliefBox* box) const; + + // Returns true if it is a new box. Otherwise, it may replace an existing box + // if the new box's belief is higher. + bool IsNewBox(std::vector* boxes, BeliefBox* box) const; + + // Returns true if the two boxes are identical. + bool IsIdentical(const BeliefBox& box_1, const BeliefBox& box_2) const; + + BeliefDecoderConfig config_; + // Following equation (1) in this paper + // https://icwww.epfl.ch/~lepetit/papers/lepetit_ijcv08.pdf, + // this variable denotes the coefficients for the 4 control points + // for each of the 8 3D box vertices. + Eigen::Matrix epnp_alpha_; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_ diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_timed_box_list_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_timed_box_list_calculator.cc new file mode 100644 index 000000000..788456ad0 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_to_timed_box_list_calculator.cc @@ -0,0 +1,115 @@ +// Copyright 2020 The MediaPipe Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "absl/memory/memory.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/opencv_imgproc_inc.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace { +constexpr char kInputStreamTag[] = "FRAME_ANNOTATION"; +constexpr char kOutputStreamTag[] = "BOXES"; +} // namespace + +namespace mediapipe { + +// Convert FrameAnnotation 3d bounding box detections to TimedBoxListProto +// 2d bounding boxes. +// +// Input: +// FRAME_ANNOTATION - 3d bounding box annotation. +// Output: +// BOXES - 2d bounding box enclosing the projection of 3d box. +// +// Usage example: +// node { +// calculator: "FrameAnnotationToTimedBoxListCalculator" +// input_stream: "FRAME_ANNOTATION:frame_annotation" +// output_stream: "BOXES:boxes" +// } +class FrameAnnotationToTimedBoxListCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; +}; +REGISTER_CALCULATOR(FrameAnnotationToTimedBoxListCalculator); + +::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag(kInputStreamTag)) { + cc->Inputs().Tag(kInputStreamTag).Set(); + } + + if (cc->Outputs().HasTag(kOutputStreamTag)) { + cc->Outputs().Tag(kOutputStreamTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Open( + CalculatorContext* cc) { + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().HasTag(kInputStreamTag) && + !cc->Inputs().Tag(kInputStreamTag).IsEmpty()) { + const auto& frame_annotation = + cc->Inputs().Tag(kInputStreamTag).Get(); + auto output_objects = absl::make_unique(); + for (const auto& annotation : frame_annotation.annotations()) { + std::vector key_points; + for (const auto& keypoint : annotation.keypoints()) { + key_points.push_back( + cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y())); + } + TimedBoxProto* added_box = output_objects->add_box(); + ComputeBoundingRect(key_points, added_box); + added_box->set_id(annotation.object_id()); + const int64 time_msec = + static_cast(std::round(frame_annotation.timestamp() / 1000)); + added_box->set_time_msec(time_msec); + } + + // Output + if (cc->Outputs().HasTag(kOutputStreamTag)) { + cc->Outputs() + 
.Tag(kOutputStreamTag) + .Add(output_objects.release(), cc->InputTimestamp()); + } + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Close( + CalculatorContext* cc) { + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.cc b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.cc new file mode 100644 index 000000000..1cfdb2ddc --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.cc @@ -0,0 +1,102 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h" + +#include "absl/container/flat_hash_set.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace mediapipe { + +void FrameAnnotationTracker::AddDetectionResult( + const FrameAnnotation& frame_annotation) { + const int64 time_us = + static_cast(std::round(frame_annotation.timestamp())); + for (const auto& object_annotation : frame_annotation.annotations()) { + detected_objects_[time_us + object_annotation.object_id()] = + object_annotation; + } +} + +FrameAnnotation FrameAnnotationTracker::ConsolidateTrackingResult( + const TimedBoxProtoList& tracked_boxes, + absl::flat_hash_set* cancel_object_ids) { + CHECK(cancel_object_ids != nullptr); + FrameAnnotation frame_annotation; + std::vector keys_to_be_deleted; + for (const auto& detected_obj : detected_objects_) { + const int object_id = detected_obj.second.object_id(); + if (cancel_object_ids->contains(object_id)) { + // Remember duplicated detections' keys. + keys_to_be_deleted.push_back(detected_obj.first); + continue; + } + TimedBoxProto ref_box; + for (const auto& box : tracked_boxes.box()) { + if (box.id() == object_id) { + ref_box = box; + break; + } + } + if (!ref_box.has_id() || ref_box.id() < 0) { + LOG(ERROR) << "Can't find matching tracked box for object id: " + << object_id << ". Likely lost tracking of it."; + keys_to_be_deleted.push_back(detected_obj.first); + continue; + } + + // Find duplicated boxes + for (const auto& box : tracked_boxes.box()) { + if (box.id() != object_id) { + if (ComputeBoxIoU(ref_box, box) > iou_threshold_) { + cancel_object_ids->insert(box.id()); + } + } + } + + // Map ObjectAnnotation from detection to tracked time. + // First, gather all keypoints from source detection. + std::vector key_points; + for (const auto& keypoint : detected_obj.second.keypoints()) { + key_points.push_back( + cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y())); + } + // Second, find source box. 
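+    // (The source box is the axis-aligned rectangle enclosing the detected
+    // keypoints; MapPoint() below uses it together with the tracked box to
+    // transfer each keypoint to its tracked location.)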
+ TimedBoxProto src_box; + ComputeBoundingRect(key_points, &src_box); + ObjectAnnotation* tracked_obj = frame_annotation.add_annotations(); + tracked_obj->set_object_id(ref_box.id()); + // Finally, map all keypoints in the source detection to tracked location. + for (const auto& keypoint : detected_obj.second.keypoints()) { + cv::Point2f dst = MapPoint( + src_box, ref_box, + cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()), + img_width_, img_height_); + auto* dst_point = tracked_obj->add_keypoints()->mutable_point_2d(); + dst_point->set_x(dst.x); + dst_point->set_y(dst.y); + } + } + + for (const auto& key : keys_to_be_deleted) { + detected_objects_.erase(key); + } + + return frame_annotation; +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h new file mode 100644 index 000000000..2113c7711 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h @@ -0,0 +1,62 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_ + +#include + +#include "absl/container/btree_map.h" +#include "absl/container/flat_hash_set.h" +#include "mediapipe/framework/port/integral_types.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace mediapipe { + +class FrameAnnotationTracker { + public: + // If two bounding boxes have IoU over iou_threshold, then we consider them + // describing the same object. + FrameAnnotationTracker(float iou_threshold, float img_width, float img_height) + : iou_threshold_(iou_threshold), + img_width_(img_width), + img_height_(img_height) {} + + // Adds detection results from an external detector. + void AddDetectionResult(const FrameAnnotation& frame_annotation); + + // Consolidates tracking result from an external tracker, associates with + // the detection result by the object id, and produces the corresponding + // result in FrameAnnotation. When there are duplicates, output the ids that + // need to be cancelled in cancel_object_ids. + // Note that the returned FrameAnnotation is missing timestamp. Need to fill + // that field. + FrameAnnotation ConsolidateTrackingResult( + const TimedBoxProtoList& tracked_boxes, + absl::flat_hash_set* cancel_object_ids); + + private: + float iou_threshold_; + float img_width_; + float img_height_; + // Cached detection results over time. + // Key is timestamp_us + object_id. 
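+  // (The sum is assumed to be unique, since object ids are small compared to
+  // the microsecond gaps between detection timestamps.)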
+ absl::btree_map> + detected_objects_; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_ diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.cc new file mode 100644 index 000000000..ef3f4f5d4 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.cc @@ -0,0 +1,137 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/container/flat_hash_set.h" +#include "absl/memory/memory.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h" +#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.pb.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace { +constexpr char kInputFrameAnnotationTag[] = "FRAME_ANNOTATION"; +constexpr char kInputTrackedBoxesTag[] = "TRACKED_BOXES"; +constexpr char kOutputTrackedFrameAnnotationTag[] = "TRACKED_FRAME_ANNOTATION"; +constexpr char kOutputCancelObjectIdTag[] = "CANCEL_OBJECT_ID"; +} // namespace + +namespace mediapipe { + +// Tracks frame annotations seeded/updated by FRAME_ANNOTATION input_stream. +// When using this calculator, make sure FRAME_ANNOTATION and TRACKED_BOXES +// are in different sync set. +// +// Input: +// FRAME_ANNOTATION - frame annotation. +// TRACKED_BOXES - 2d box tracking result +// Output: +// TRACKED_FRAME_ANNOTATION - annotation inferred from 2d tracking result. +// CANCEL_OBJECT_ID - object id that needs to be cancelled from the tracker. 
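+//   (This stream is typically routed back to the upstream box tracker so
+//   that duplicated tracks get cancelled.)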
+// +// Usage example: +// node { +// calculator: "FrameAnnotationTrackerCalculator" +// input_stream: "FRAME_ANNOTATION:frame_annotation" +// input_stream: "TRACKED_BOXES:tracked_boxes" +// output_stream: "TRACKED_FRAME_ANNOTATION:tracked_frame_annotation" +// output_stream: "CANCEL_OBJECT_ID:cancel_object_id" +// } +class FrameAnnotationTrackerCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + std::unique_ptr frame_annotation_tracker_; +}; +REGISTER_CALCULATOR(FrameAnnotationTrackerCalculator); + +::mediapipe::Status FrameAnnotationTrackerCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag(kInputFrameAnnotationTag)) { + cc->Inputs().Tag(kInputFrameAnnotationTag).Set(); + } + if (cc->Inputs().HasTag(kInputTrackedBoxesTag)) { + cc->Inputs().Tag(kInputTrackedBoxesTag).Set(); + } + if (cc->Outputs().HasTag(kOutputTrackedFrameAnnotationTag)) { + cc->Outputs().Tag(kOutputTrackedFrameAnnotationTag).Set(); + } + if (cc->Outputs().HasTag(kOutputCancelObjectIdTag)) { + cc->Outputs().Tag(kOutputCancelObjectIdTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationTrackerCalculator::Open( + CalculatorContext* cc) { + const auto& options = cc->Options(); + frame_annotation_tracker_ = absl::make_unique( + options.iou_threshold(), options.img_width(), options.img_height()); + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationTrackerCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().HasTag(kInputFrameAnnotationTag) && + !cc->Inputs().Tag(kInputFrameAnnotationTag).IsEmpty()) { + frame_annotation_tracker_->AddDetectionResult( + cc->Inputs().Tag(kInputFrameAnnotationTag).Get()); + } + if (cc->Inputs().HasTag(kInputTrackedBoxesTag) && + !cc->Inputs().Tag(kInputTrackedBoxesTag).IsEmpty() && + cc->Outputs().HasTag(kOutputTrackedFrameAnnotationTag)) { + absl::flat_hash_set cancel_object_ids; + auto output_frame_annotation = absl::make_unique(); + *output_frame_annotation = + frame_annotation_tracker_->ConsolidateTrackingResult( + cc->Inputs().Tag(kInputTrackedBoxesTag).Get(), + &cancel_object_ids); + output_frame_annotation->set_timestamp(cc->InputTimestamp().Microseconds()); + + cc->Outputs() + .Tag(kOutputTrackedFrameAnnotationTag) + .Add(output_frame_annotation.release(), cc->InputTimestamp()); + + if (cc->Outputs().HasTag(kOutputCancelObjectIdTag)) { + auto packet_timestamp = cc->InputTimestamp(); + for (const auto& id : cancel_object_ids) { + // The timestamp is incremented (by 1 us) because currently the box + // tracker calculator only accepts one cancel object ID for any given + // timestamp. 
+ cc->Outputs() + .Tag(kOutputCancelObjectIdTag) + .AddPacket(mediapipe::MakePacket(id).At(packet_timestamp++)); + } + } + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status FrameAnnotationTrackerCalculator::Close( + CalculatorContext* cc) { + return ::mediapipe::OkStatus(); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.proto new file mode 100644 index 000000000..f37308a3e --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.proto @@ -0,0 +1,36 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The option proto for the FrameAnnotationTrackerCalculatorOptions. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message FrameAnnotationTrackerCalculatorOptions { + extend CalculatorOptions { + optional FrameAnnotationTrackerCalculatorOptions ext = 291291253; + } + + // The threshold on intersection-over-union (IoU). We consider + // boxes with IoU larger than this threshold to be the duplicates. + optional float iou_threshold = 1 [default = 0.5]; + + // We need image dimension to properly compute annotation locations. + optional float img_width = 2; + + optional float img_height = 3; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_test.cc b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_test.cc new file mode 100644 index 000000000..94d64bbbe --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_test.cc @@ -0,0 +1,143 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h" + +#include "absl/container/flat_hash_set.h" +#include "mediapipe/framework/port/gmock.h" +#include "mediapipe/framework/port/gtest.h" +#include "mediapipe/framework/port/logging.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/util/tracking/box_tracker.pb.h" + +namespace mediapipe { +namespace { + +// Create a new object annotation by shifting a reference +// object annotation. 
+ObjectAnnotation ShiftObject2d(const ObjectAnnotation& ref_obj, float dx, + float dy) { + ObjectAnnotation obj = ref_obj; + for (auto& keypoint : *(obj.mutable_keypoints())) { + const float ref_x = keypoint.point_2d().x(); + const float ref_y = keypoint.point_2d().y(); + keypoint.mutable_point_2d()->set_x(ref_x + dx); + keypoint.mutable_point_2d()->set_y(ref_y + dy); + } + return obj; +} + +TimedBoxProto ShiftBox(const TimedBoxProto& ref_box, float dx, float dy) { + TimedBoxProto box = ref_box; + box.set_top(ref_box.top() + dy); + box.set_bottom(ref_box.bottom() + dy); + box.set_left(ref_box.left() + dx); + box.set_right(ref_box.right() + dx); + return box; +} + +// Constructs a fixed ObjectAnnotation. +ObjectAnnotation ConstructFixedObject( + const std::vector>& points) { + ObjectAnnotation obj; + for (const auto& point : points) { + auto* keypoint = obj.add_keypoints(); + CHECK_EQ(2, point.size()); + keypoint->mutable_point_2d()->set_x(point[0]); + keypoint->mutable_point_2d()->set_y(point[1]); + } + return obj; +} + +TEST(FrameAnnotationTrackerTest, TestConsolidation) { + // Add 4 detections represented by FrameAnnotation, of which 3 correspond + // to the same object. + ObjectAnnotation object1, object2, object3, object4; + // The bounding rectangle for these object keypoints is: + // x: [0.2, 0.5], y: [0.1, 0.4] + object3 = ConstructFixedObject({{0.35f, 0.25f}, + {0.3f, 0.3f}, + {0.2f, 0.4f}, + {0.3f, 0.1f}, + {0.2f, 0.2f}, + {0.5f, 0.3f}, + {0.4f, 0.4f}, + {0.5f, 0.1f}, + {0.4f, 0.2f}}); + object3.set_object_id(3); + object1 = ShiftObject2d(object3, -0.05f, -0.05f); + object1.set_object_id(1); + object2 = ShiftObject2d(object3, 0.05f, 0.05f); + object2.set_object_id(2); + object4 = ShiftObject2d(object3, 0.2f, 0.2f); + object4.set_object_id(4); + FrameAnnotation frame_annotation_1; + frame_annotation_1.set_timestamp(30 * 1000); // 30ms + *(frame_annotation_1.add_annotations()) = object1; + *(frame_annotation_1.add_annotations()) = object4; + FrameAnnotation frame_annotation_2; + frame_annotation_2.set_timestamp(60 * 1000); // 60ms + *(frame_annotation_2.add_annotations()) = object2; + FrameAnnotation frame_annotation_3; + frame_annotation_3.set_timestamp(90 * 1000); // 90ms + *(frame_annotation_3.add_annotations()) = object3; + + FrameAnnotationTracker frame_annotation_tracker(/*iou_threshold*/ 0.5f, 1.0f, + 1.0f); + frame_annotation_tracker.AddDetectionResult(frame_annotation_1); + frame_annotation_tracker.AddDetectionResult(frame_annotation_2); + frame_annotation_tracker.AddDetectionResult(frame_annotation_3); + + TimedBoxProtoList timed_box_proto_list; + TimedBoxProto* timed_box_proto = timed_box_proto_list.add_box(); + timed_box_proto->set_top(0.4f); + timed_box_proto->set_bottom(0.7f); + timed_box_proto->set_left(0.6f); + timed_box_proto->set_right(0.9f); + timed_box_proto->set_id(3); + timed_box_proto->set_time_msec(150); + timed_box_proto = timed_box_proto_list.add_box(); + *timed_box_proto = ShiftBox(timed_box_proto_list.box(0), 0.01f, 0.01f); + timed_box_proto->set_id(1); + timed_box_proto->set_time_msec(150); + timed_box_proto = timed_box_proto_list.add_box(); + *timed_box_proto = ShiftBox(timed_box_proto_list.box(0), -0.01f, -0.01f); + timed_box_proto->set_id(2); + timed_box_proto->set_time_msec(150); + absl::flat_hash_set cancel_object_ids; + FrameAnnotation tracked_detection = + frame_annotation_tracker.ConsolidateTrackingResult(timed_box_proto_list, + &cancel_object_ids); + EXPECT_EQ(2, cancel_object_ids.size()); + EXPECT_EQ(1, cancel_object_ids.count(1)); + 
EXPECT_EQ(1, cancel_object_ids.count(2)); + EXPECT_EQ(1, tracked_detection.annotations_size()); + EXPECT_EQ(3, tracked_detection.annotations(0).object_id()); + EXPECT_EQ(object3.keypoints_size(), + tracked_detection.annotations(0).keypoints_size()); + const float x_offset = 0.4f; + const float y_offset = 0.3f; + const float tolerance = 1e-5f; + for (int i = 0; i < object3.keypoints_size(); ++i) { + const auto& point_2d = + tracked_detection.annotations(0).keypoints(i).point_2d(); + EXPECT_NEAR(point_2d.x(), object3.keypoints(i).point_2d().x() + x_offset, + tolerance); + EXPECT_NEAR(point_2d.y(), object3.keypoints(i).point_2d().y() + y_offset, + tolerance); + } +} + +} // namespace +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc new file mode 100644 index 000000000..be415b3cd --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc @@ -0,0 +1,760 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(__ANDROID__) +#include "mediapipe/util/android/asset_manager_util.h" +#else +#include +#include +#endif + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "mediapipe/gpu/gl_calculator_helper.h" +#include "mediapipe/gpu/shader_util.h" +#include "mediapipe/graphs/object_detection_3d/calculators/camera_parameters.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/model_matrix.pb.h" + +namespace mediapipe { + +namespace { + +#if defined(GL_DEBUG) +#define GLCHECK(command) \ + command; \ + if (int err = glGetError()) LOG(ERROR) << "GL error detected: " << err; +#else +#define GLCHECK(command) command +#endif + +// For ease of use, we prefer ImageFrame on Android and GpuBuffer otherwise. +#if defined(__ANDROID__) +typedef ImageFrame AssetTextureFormat; +#else +typedef GpuBuffer AssetTextureFormat; +#endif + +enum { ATTRIB_VERTEX, ATTRIB_TEXTURE_POSITION, NUM_ATTRIBUTES }; +static const int kNumMatrixEntries = 16; + +// Hard-coded MVP Matrix for testing. +static const float kModelMatrix[] = {0.83704215, -0.36174262, 0.41049102, 0.0, + 0.06146407, 0.8076706, 0.5864218, 0.0, + -0.54367524, -0.4656292, 0.69828844, 0.0, + 0.0, 0.0, -98.64117, 1.0}; + +// Loads a texture from an input side packet, and streams in an animation file +// from a filename given in another input side packet, and renders the animation +// over the screen according to the input timestamp and desired animation FPS. +// +// Inputs: +// VIDEO (GpuBuffer, optional): +// If provided, the input buffer will be assumed to be unique, and will be +// consumed by this calculator and rendered to directly. 
The output video +// buffer will then be the released reference to the input video buffer. +// MODEL_MATRICES (TimedModelMatrixProtoList, optional): +// If provided, will set the model matrices for the objects to be rendered +// during future rendering calls. +// +// Input side packets: +// TEXTURE (ImageFrame on Android / GpuBuffer on iOS, required): +// Texture to use with animation file. +// ANIMATION_ASSET (String, required): +// Path of animation file to load and render. Should be generated by +// //java/com/google/android/apps/motionstills/SimpleObjEncryptor with +// --compressed_mode=true. See comments and documentation there for more +// information on custom .obj.uuu file format. +// CAMERA_PARAMETERS_PROTO_STRING (String, optional): +// Serialized proto std::string of CameraParametersProto. We need this to +// get the right aspect ratio and field of view. +// Options: +// aspect_ratio: the ratio between the rendered image width and height. +// It will be ignored if CAMERA_PARAMETERS_PROTO_STRING input side packet +// is provided. +// vertical_fov_degrees: vertical field of view in degrees. +// It will be ignored if CAMERA_PARAMETERS_PROTO_STRING input side packet +// is provided. +// z_clipping_plane_near: near plane value for z-clipping. +// z_clipping_plane_far: far plane value for z-clipping. +// animation_speed_fps: speed at which to cycle through animation frames (in +// frames per second). +// +// Outputs: +// OUTPUT, or index 0 (GpuBuffer): +// Frames filled with the given texture. + +// Simple helper-struct for containing the parsed geometry data from a 3D +// animation frame for rendering. + +struct TriangleMesh { + int index_count = 0; // Needed for glDrawElements rendering call + std::unique_ptr vertices = nullptr; + std::unique_ptr texture_coords = nullptr; + std::unique_ptr triangle_indices = nullptr; +}; + +typedef std::unique_ptr ModelMatrix; + +} // namespace + +class GlAnimationOverlayCalculator : public CalculatorBase { + public: + GlAnimationOverlayCalculator() {} + ~GlAnimationOverlayCalculator(); + + static ::mediapipe::Status GetContract(CalculatorContract *cc); + + ::mediapipe::Status Open(CalculatorContext *cc) override; + ::mediapipe::Status Process(CalculatorContext *cc) override; + + private: + bool has_video_stream_ = false; + bool has_model_matrix_stream_ = false; + bool has_mask_model_matrix_stream_ = false; + bool has_occlusion_mask_ = false; + + GlCalculatorHelper helper_; + bool initialized_ = false; + GlTexture texture_; + GlTexture mask_texture_; + + GLuint renderbuffer_ = 0; + bool depth_buffer_created_ = false; + + GLuint program_ = 0; + GLint texture_uniform_ = -1; + GLint perspective_matrix_uniform_ = -1; + GLint model_matrix_uniform_ = -1; + + std::vector triangle_meshes_; + std::vector mask_meshes_; + Timestamp animation_start_time_; + int frame_count_ = 0; + float animation_speed_fps_; + + std::vector current_model_matrices_; + std::vector current_mask_model_matrices_; + + // Perspective matrix for rendering, to be applied to all model matrices + // prior to passing through to the shader as a MVP matrix. Initialized during + // first image packet read. 
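+  // Stored in column-major order, as expected by glUniformMatrix4fv; model
+  // matrices are transposed into the same layout in LoadModelMatrices().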
+ float perspective_matrix_[kNumMatrixEntries]; + + void ComputeAspectRatioAndFovFromCameraParameters( + const CameraParametersProto &camera_parameters, float *aspect_ratio, + float *vertical_fov_degrees); + int GetAnimationFrameIndex(Timestamp timestamp); + ::mediapipe::Status GlSetup(); + ::mediapipe::Status GlBind(const TriangleMesh &triangle_mesh, + const GlTexture &texture); + ::mediapipe::Status GlRender(const TriangleMesh &triangle_mesh, + const float *model_matrix); + void InitializePerspectiveMatrix(float aspect_ratio, + float vertical_fov_degrees, float z_near, + float z_far); + void LoadModelMatrices(const TimedModelMatrixProtoList &model_matrices, + std::vector *current_model_matrices); + +#if !defined(__ANDROID__) + // Asset loading routine for all non-Android platforms. + bool LoadAnimation(const std::string &filename); +#else + // Asset loading for all Android platforms. + bool LoadAnimationAndroid(const std::string &filename, + std::vector *mesh); + bool ReadBytesFromAsset(AAsset *asset, void *buffer, int num_bytes_to_read); +#endif +}; +REGISTER_CALCULATOR(GlAnimationOverlayCalculator); + +// static +::mediapipe::Status GlAnimationOverlayCalculator::GetContract( + CalculatorContract *cc) { + MP_RETURN_IF_ERROR( + GlCalculatorHelper::SetupInputSidePackets(&(cc->InputSidePackets()))); + if (cc->Inputs().HasTag("VIDEO")) { + // Currently used only for size and timestamp. + cc->Inputs().Tag("VIDEO").Set(); + } + TagOrIndex(&(cc->Outputs()), "OUTPUT", 0).Set(); + + if (cc->Inputs().HasTag("MODEL_MATRICES")) { + cc->Inputs().Tag("MODEL_MATRICES").Set(); + } + if (cc->Inputs().HasTag("MASK_MODEL_MATRICES")) { + cc->Inputs().Tag("MASK_MODEL_MATRICES").Set(); + } + + cc->InputSidePackets().Tag("TEXTURE").Set(); + cc->InputSidePackets().Tag("ANIMATION_ASSET").Set(); + if (cc->InputSidePackets().HasTag("CAMERA_PARAMETERS_PROTO_STRING")) { + cc->InputSidePackets() + .Tag("CAMERA_PARAMETERS_PROTO_STRING") + .Set(); + } + + if (cc->InputSidePackets().HasTag("MASK_TEXTURE")) { + cc->InputSidePackets().Tag("MASK_TEXTURE").Set(); + } + if (cc->InputSidePackets().HasTag("MASK_ASSET")) { + cc->InputSidePackets().Tag("MASK_ASSET").Set(); + } + + return ::mediapipe::OkStatus(); +} + +// Helper function for initializing our perspective matrix. +void GlAnimationOverlayCalculator::InitializePerspectiveMatrix( + float aspect_ratio, float fov_degrees, float z_near, float z_far) { + // Standard perspective projection matrix calculations. + const float f = 1.0f / std::tan(fov_degrees * M_PI / 360.0f); + for (int i = 0; i < kNumMatrixEntries; i++) { + perspective_matrix_[i] = 0; + } + const float denom = 1.0f / (z_near - z_far); + perspective_matrix_[0] = f / aspect_ratio; + perspective_matrix_[5] = f; + perspective_matrix_[10] = (z_near + z_far) * denom; + perspective_matrix_[11] = -1.0f; + perspective_matrix_[14] = 2.0f * z_far * z_near * denom; +} + +#if defined(__ANDROID__) +// Helper function for reading in a specified number of bytes from an Android +// asset. Returns true if successfully reads in all bytes into buffer. +bool GlAnimationOverlayCalculator::ReadBytesFromAsset(AAsset *asset, + void *buffer, + int num_bytes_to_read) { + // Most file systems use block sizes of 4KB or 8KB; ideally we'd choose a + // small multiple of the block size for best input streaming performance, so + // we go for a reasobably safe buffer size of 8KB = 8*1024 bytes. + static const int kMaxChunkSize = 8192; + + int bytes_left = num_bytes_to_read; + int bytes_read = 1; // any value > 0 here just to start looping. 
+ + // Treat as uint8_t array so we can deal in single byte arithmetic easily. + uint8_t *currBufferIndex = reinterpret_cast(buffer); + while (bytes_read > 0 && bytes_left > 0) { + bytes_read = AAsset_read(asset, (void *)currBufferIndex, + std::min(bytes_left, kMaxChunkSize)); + bytes_left -= bytes_read; + currBufferIndex += bytes_read; + } + // At least log any I/O errors encountered. + if (bytes_read < 0) { + LOG(ERROR) << "Error reading from AAsset: " << bytes_read; + return false; + } + if (bytes_left > 0) { + // Reached EOF before reading in specified number of bytes. + LOG(WARNING) << "Reached EOF before reading in specified number of bytes."; + return false; + } + return true; +} + +// The below asset streaming code is Android-only, making use of the platform +// JNI helper classes AAssetManager and AAsset. +bool GlAnimationOverlayCalculator::LoadAnimationAndroid( + const std::string &filename, std::vector *meshes) { + mediapipe::AssetManager *mediapipe_asset_manager = + Singleton::get(); + AAssetManager *asset_manager = mediapipe_asset_manager->GetAssetManager(); + if (!asset_manager) { + LOG(ERROR) << "Failed to access Android asset manager."; + return false; + } + + // New read-bytes stuff here! First we open file for streaming. + AAsset *asset = AAssetManager_open(asset_manager, filename.c_str(), + AASSET_MODE_STREAMING); + if (!asset) { + LOG(ERROR) << "Failed to open animation asset: " << filename; + return false; + } + + // And now, while we are able to stream in more frames, we do so. + frame_count_ = 0; + int32 lengths[3]; + while (ReadBytesFromAsset(asset, (void *)lengths, sizeof(lengths[0]) * 3)) { + // About to start reading the next animation frame. Stream it in here. + // Each frame stores first the object counts of its three arrays + // (vertices, texture coordinates, triangle indices; respectively), and + // then stores each of those arrays as a byte dump, in order. + meshes->emplace_back(); + TriangleMesh &triangle_mesh = meshes->back(); + // Try to read in vertices (4-byte floats) + triangle_mesh.vertices.reset(new float[lengths[0]]); + if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.vertices.get(), + sizeof(float) * lengths[0])) { + LOG(ERROR) << "Failed to read vertices for frame " << frame_count_; + return false; + } + // Try to read in texture coordinates (4-byte floats) + triangle_mesh.texture_coords.reset(new float[lengths[1]]); + if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.texture_coords.get(), + sizeof(float) * lengths[1])) { + LOG(ERROR) << "Failed to read tex-coords for frame " << frame_count_; + return false; + } + // Try to read in indices (2-byte shorts) + triangle_mesh.index_count = lengths[2]; + triangle_mesh.triangle_indices.reset(new int16[lengths[2]]); + if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.triangle_indices.get(), + sizeof(int16) * lengths[2])) { + LOG(ERROR) << "Failed to read indices for frame " << frame_count_; + return false; + } + frame_count_++; + } + AAsset_close(asset); + + LOG(INFO) << "Finished parsing " << frame_count_ << " animation frames."; + if (meshes->empty()) { + LOG(ERROR) << "No animation frames were parsed! 
Erroring out calculator."; + return false; + } + return true; +} + +#else // defined(__ANDROID__) + +bool GlAnimationOverlayCalculator::LoadAnimation(const std::string &filename) { + std::ifstream infile(filename.c_str(), std::ifstream::binary); + if (!infile) { + LOG(ERROR) << "Error opening asset with filename: " << filename; + return false; + } + + frame_count_ = 0; + int32 lengths[3]; + while (true) { + // See if we have more initial size counts to read in. + infile.read((char *)(lengths), sizeof(lengths[0]) * 3); + if (!infile) { + // No more frames to read. Close out. + infile.close(); + break; + } + + triangle_meshes_.emplace_back(); + TriangleMesh &triangle_mesh = triangle_meshes_.back(); + + // Try to read in vertices (4-byte floats). + triangle_mesh.vertices.reset(new float[lengths[0]]); + infile.read((char *)(triangle_mesh.vertices.get()), + sizeof(float) * lengths[0]); + if (!infile) { + LOG(ERROR) << "Failed to read vertices for frame " << frame_count_; + return false; + } + + // Try to read in texture coordinates (4-byte floats) + triangle_mesh.texture_coords.reset(new float[lengths[1]]); + infile.read((char *)(triangle_mesh.texture_coords.get()), + sizeof(float) * lengths[1]); + if (!infile) { + LOG(ERROR) << "Failed to read texture coordinates for frame " + << frame_count_; + return false; + } + + // Try to read in the triangle indices (2-byte shorts) + triangle_mesh.index_count = lengths[2]; + triangle_mesh.triangle_indices.reset(new int16[lengths[2]]); + infile.read((char *)(triangle_mesh.triangle_indices.get()), + sizeof(int16) * lengths[2]); + if (!infile) { + LOG(ERROR) << "Failed to read triangle indices for frame " + << frame_count_; + return false; + } + frame_count_++; + } + + LOG(INFO) << "Finished parsing " << frame_count_ << " animation frames."; + if (triangle_meshes_.empty()) { + LOG(ERROR) << "No animation frames were parsed! Erroring out calculator."; + return false; + } + return true; +} + +#endif + +void GlAnimationOverlayCalculator::ComputeAspectRatioAndFovFromCameraParameters( + const CameraParametersProto &camera_parameters, float *aspect_ratio, + float *vertical_fov_degrees) { + CHECK(aspect_ratio != nullptr); + CHECK(vertical_fov_degrees != nullptr); + *aspect_ratio = + camera_parameters.portrait_width() / camera_parameters.portrait_height(); + *vertical_fov_degrees = + std::atan(camera_parameters.portrait_height() * 0.5f) * 2 * 180 / M_PI; +} + +::mediapipe::Status GlAnimationOverlayCalculator::Open(CalculatorContext *cc) { + cc->SetOffset(TimestampDiff(0)); + MP_RETURN_IF_ERROR(helper_.Open(cc)); + + const auto &options = cc->Options(); + + animation_speed_fps_ = options.animation_speed_fps(); + + // Construct projection matrix using input side packets or option + float aspect_ratio; + float vertical_fov_degrees; + if (cc->InputSidePackets().HasTag("CAMERA_PARAMETERS_PROTO_STRING")) { + const std::string &camera_parameters_proto_string = + cc->InputSidePackets() + .Tag("CAMERA_PARAMETERS_PROTO_STRING") + .Get(); + CameraParametersProto camera_parameters_proto; + camera_parameters_proto.ParseFromString(camera_parameters_proto_string); + ComputeAspectRatioAndFovFromCameraParameters( + camera_parameters_proto, &aspect_ratio, &vertical_fov_degrees); + } else { + aspect_ratio = options.aspect_ratio(); + vertical_fov_degrees = options.vertical_fov_degrees(); + } + + // when constructing projection matrix. 
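+  // (With the default CameraParametersProto this yields an aspect ratio of
+  // 1.0103 / 1.3435 ~= 0.75 and a vertical FOV of 2 * atan(1.3435 / 2) ~= 68
+  // degrees; the proto values are normalized by focal length, so no pixel
+  // dimensions are needed here.)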
+ InitializePerspectiveMatrix(aspect_ratio, vertical_fov_degrees, + options.z_clipping_plane_near(), + options.z_clipping_plane_far()); + + // See what streams we have. + has_video_stream_ = cc->Inputs().HasTag("VIDEO"); + has_model_matrix_stream_ = cc->Inputs().HasTag("MODEL_MATRICES"); + has_mask_model_matrix_stream_ = cc->Inputs().HasTag("MASK_MODEL_MATRICES"); + + // Try to load in the animation asset in a platform-specific manner. + const std::string &asset_name = + cc->InputSidePackets().Tag("ANIMATION_ASSET").Get(); + bool loaded_animation = false; +#if defined(__ANDROID__) + if (cc->InputSidePackets().HasTag("MASK_ASSET")) { + has_occlusion_mask_ = true; + const std::string &mask_asset_name = + cc->InputSidePackets().Tag("MASK_ASSET").Get(); + loaded_animation = LoadAnimationAndroid(mask_asset_name, &mask_meshes_); + if (!loaded_animation) { + LOG(ERROR) << "Failed to load mask asset."; + return ::mediapipe::UnknownError("Failed to load mask asset."); + } + } + loaded_animation = LoadAnimationAndroid(asset_name, &triangle_meshes_); +#else + loaded_animation = LoadAnimation(asset_name); +#endif + if (!loaded_animation) { + LOG(ERROR) << "Failed to load animation asset."; + return ::mediapipe::UnknownError("Failed to load animation asset."); + } + + return helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status { + if (cc->InputSidePackets().HasTag("MASK_TEXTURE")) { + const auto &mask_texture = + cc->InputSidePackets().Tag("MASK_TEXTURE").Get(); + mask_texture_ = helper_.CreateSourceTexture(mask_texture); + } + + // Load in our asset's texture data + const auto &input_texture = + cc->InputSidePackets().Tag("TEXTURE").Get(); + texture_ = helper_.CreateSourceTexture(input_texture); + VLOG(2) << "Input texture size: " << texture_.width() << ", " + << texture_.height() << std::endl; + + return ::mediapipe::OkStatus(); + }); +} + +int GlAnimationOverlayCalculator::GetAnimationFrameIndex(Timestamp timestamp) { + double seconds_delta = timestamp.Seconds() - animation_start_time_.Seconds(); + int64_t frame_index = + static_cast(seconds_delta * animation_speed_fps_); + frame_index %= frame_count_; + return static_cast(frame_index); +} + +void GlAnimationOverlayCalculator::LoadModelMatrices( + const TimedModelMatrixProtoList &model_matrices, + std::vector *current_model_matrices) { + current_model_matrices->clear(); + for (int i = 0; i < model_matrices.model_matrix_size(); ++i) { + const auto &model_matrix = model_matrices.model_matrix(i); + CHECK(model_matrix.matrix_entries_size() == kNumMatrixEntries) + << "Invalid Model Matrix"; + current_model_matrices->emplace_back(); + ModelMatrix &new_matrix = current_model_matrices->back(); + new_matrix.reset(new float[kNumMatrixEntries]); + for (int j = 0; j < kNumMatrixEntries; j++) { + // Model matrices streamed in using ROW-MAJOR format, but we want + // COLUMN-MAJOR for rendering, so we transpose here. + int col = j % 4; + int row = j / 4; + new_matrix[row + col * 4] = model_matrix.matrix_entries(j); + } + } +} + +::mediapipe::Status GlAnimationOverlayCalculator::Process( + CalculatorContext *cc) { + return helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status { + if (!initialized_) { + MP_RETURN_IF_ERROR(GlSetup()); + initialized_ = true; + animation_start_time_ = cc->InputTimestamp(); + } + + // Process model matrices, if any are being streamed in, and update our + // list. 
+ if (has_model_matrix_stream_ && + !cc->Inputs().Tag("MODEL_MATRICES").IsEmpty()) { + const TimedModelMatrixProtoList &model_matrices = + cc->Inputs().Tag("MODEL_MATRICES").Get(); + LoadModelMatrices(model_matrices, ¤t_model_matrices_); + } + if (has_mask_model_matrix_stream_ && + !cc->Inputs().Tag("MASK_MODEL_MATRICES").IsEmpty()) { + const TimedModelMatrixProtoList &model_matrices = + cc->Inputs() + .Tag("MASK_MODEL_MATRICES") + .Get(); + LoadModelMatrices(model_matrices, ¤t_mask_model_matrices_); + } + + // Arbitrary default width and height for output destination texture, in the + // event that we don't have a valid and unique input buffer to overlay. + int width = 640; + int height = 480; + + GlTexture dst; + std::unique_ptr input_frame(nullptr); + if (has_video_stream_ && !(cc->Inputs().Tag("VIDEO").IsEmpty())) { + auto result = cc->Inputs().Tag("VIDEO").Value().Consume(); + if (result.ok()) { + input_frame = std::move(result).ValueOrDie(); +#if !MEDIAPIPE_GPU_BUFFER_USE_CV_PIXEL_BUFFER + input_frame->GetGlTextureBufferSharedPtr()->Reuse(); +#endif + width = input_frame->width(); + height = input_frame->height(); + dst = helper_.CreateSourceTexture(*input_frame); + } else { + LOG(ERROR) << "Unable to consume input video frame for overlay!"; + LOG(ERROR) << "Status returned was: " << result.status(); + dst = helper_.CreateDestinationTexture(width, height); + } + } else if (!has_video_stream_) { + dst = helper_.CreateDestinationTexture(width, height); + } else { + // We have an input video stream, but not for this frame. Don't render! + return ::mediapipe::OkStatus(); + } + helper_.BindFramebuffer(dst); + + if (!depth_buffer_created_) { + // Create our private depth buffer. + GLCHECK(glGenRenderbuffers(1, &renderbuffer_)); + GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer_)); + GLCHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT16, + width, height)); + GLCHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, + GL_RENDERBUFFER, renderbuffer_)); + GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0)); + depth_buffer_created_ = true; + } + + // Re-bind our depth renderbuffer to our FBO depth attachment here. + GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer_)); + GLCHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, + GL_RENDERBUFFER, renderbuffer_)); + GLenum status = GLCHECK(glCheckFramebufferStatus(GL_FRAMEBUFFER)); + if (status != GL_FRAMEBUFFER_COMPLETE) { + LOG(ERROR) << "Incomplete framebuffer with status: " << status; + } + GLCHECK(glClear(GL_DEPTH_BUFFER_BIT)); + + if (has_occlusion_mask_) { + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + const TriangleMesh &mask_frame = mask_meshes_.front(); + MP_RETURN_IF_ERROR(GlBind(mask_frame, mask_texture_)); + // Draw objects using our latest model matrix stream packet. + for (const ModelMatrix &model_matrix : current_mask_model_matrices_) { + MP_RETURN_IF_ERROR(GlRender(mask_frame, model_matrix.get())); + } + } + + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + int frame_index = GetAnimationFrameIndex(cc->InputTimestamp()); + const TriangleMesh ¤t_frame = triangle_meshes_[frame_index]; + MP_RETURN_IF_ERROR(GlBind(current_frame, texture_)); + if (has_model_matrix_stream_) { + // Draw objects using our latest model matrix stream packet. + for (const ModelMatrix &model_matrix : current_model_matrices_) { + MP_RETURN_IF_ERROR(GlRender(current_frame, model_matrix.get())); + } + } else { + // Just draw one object to a static model matrix. 
+      MP_RETURN_IF_ERROR(GlRender(current_frame, kModelMatrix));
+    }
+
+    // Disable vertex attributes
+    GLCHECK(glDisableVertexAttribArray(ATTRIB_VERTEX));
+    GLCHECK(glDisableVertexAttribArray(ATTRIB_TEXTURE_POSITION));
+
+    // Disable depth test
+    GLCHECK(glDisable(GL_DEPTH_TEST));
+
+    // Unbind texture
+    GLCHECK(glActiveTexture(GL_TEXTURE1));
+    GLCHECK(glBindTexture(texture_.target(), 0));
+
+    // Unbind depth buffer
+    GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0));
+
+    GLCHECK(glFlush());
+
+    auto output = dst.GetFrame();
+    dst.Release();
+    TagOrIndex(&(cc->Outputs()), "OUTPUT", 0)
+        .Add(output.release(), cc->InputTimestamp());
+    GLCHECK(glFrontFace(GL_CCW));
+    return ::mediapipe::OkStatus();
+  });
+}
+
+::mediapipe::Status GlAnimationOverlayCalculator::GlSetup() {
+  // Load vertex and fragment shaders
+  const GLint attr_location[NUM_ATTRIBUTES] = {
+      ATTRIB_VERTEX,
+      ATTRIB_TEXTURE_POSITION,
+  };
+  const GLchar *attr_name[NUM_ATTRIBUTES] = {
+      "position",
+      "texture_coordinate",
+  };
+
+  const GLchar *vert_src = R"(
+    // Perspective projection matrix for rendering / clipping
+    uniform mat4 perspectiveMatrix;
+
+    // Matrix defining the currently rendered object model
+    uniform mat4 modelMatrix;
+
+    // vertex position in 3D space
+    attribute vec4 position;
+
+    // texture coordinate for each vertex in normalized texture space (0..1)
+    attribute mediump vec4 texture_coordinate;
+
+    // texture coordinate for fragment shader (will be interpolated)
+    varying mediump vec2 sample_coordinate;
+
+    void main() {
+      sample_coordinate = texture_coordinate.xy;
+      mat4 mvpMatrix = perspectiveMatrix * modelMatrix;
+      gl_Position = mvpMatrix * position;
+    }
+  )";
+
+  const GLchar *frag_src = R"(
+    precision mediump float;
+
+    varying vec2 sample_coordinate;  // texture coordinate (0..1)
+    uniform sampler2D texture;  // texture to shade with
+
+    void main() {
+      gl_FragColor = texture2D(texture, sample_coordinate);
+    }
+  )";
+
+  // Shader program
+  GLCHECK(GlhCreateProgram(vert_src, frag_src, NUM_ATTRIBUTES,
+                           (const GLchar **)&attr_name[0], attr_location,
+                           &program_));
+  RET_CHECK(program_) << "Problem initializing the program.";
+  texture_uniform_ = GLCHECK(glGetUniformLocation(program_, "texture"));
+  perspective_matrix_uniform_ =
+      GLCHECK(glGetUniformLocation(program_, "perspectiveMatrix"));
+  model_matrix_uniform_ =
+      GLCHECK(glGetUniformLocation(program_, "modelMatrix"));
+  return ::mediapipe::OkStatus();
+}
+
+::mediapipe::Status GlAnimationOverlayCalculator::GlBind(
+    const TriangleMesh &triangle_mesh, const GlTexture &texture) {
+  GLCHECK(glUseProgram(program_));
+
+  // Disable backface culling to allow occlusion effects.
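+  // The GL state configured below is shared by the depth-only occlusion-mask
+  // pass and the visible animation pass in Process(); only the mesh data, the
+  // bound texture, and the per-draw model matrix (set in GlRender) differ.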
+ // Some options for solid arbitrary 3D geometry rendering + GLCHECK(glEnable(GL_BLEND)); + GLCHECK(glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)); + GLCHECK(glEnable(GL_DEPTH_TEST)); + GLCHECK(glFrontFace(GL_CW)); + GLCHECK(glDepthMask(GL_TRUE)); + GLCHECK(glDepthFunc(GL_LESS)); + + // Clear our depth buffer before starting draw calls + GLCHECK(glVertexAttribPointer(ATTRIB_VERTEX, 3, GL_FLOAT, 0, 0, + triangle_mesh.vertices.get())); + GLCHECK(glEnableVertexAttribArray(ATTRIB_VERTEX)); + GLCHECK(glVertexAttribPointer(ATTRIB_TEXTURE_POSITION, 2, GL_FLOAT, 0, 0, + triangle_mesh.texture_coords.get())); + GLCHECK(glEnableVertexAttribArray(ATTRIB_TEXTURE_POSITION)); + GLCHECK(glActiveTexture(GL_TEXTURE1)); + GLCHECK(glBindTexture(texture.target(), texture.name())); + + // We previously bound it to GL_TEXTURE1 + GLCHECK(glUniform1i(texture_uniform_, 1)); + + GLCHECK(glUniformMatrix4fv(perspective_matrix_uniform_, 1, GL_FALSE, + perspective_matrix_)); + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status GlAnimationOverlayCalculator::GlRender( + const TriangleMesh &triangle_mesh, const float *model_matrix) { + GLCHECK(glUniformMatrix4fv(model_matrix_uniform_, 1, GL_FALSE, model_matrix)); + GLCHECK(glDrawElements(GL_TRIANGLES, triangle_mesh.index_count, + GL_UNSIGNED_SHORT, + triangle_mesh.triangle_indices.get())); + return ::mediapipe::OkStatus(); +} + +GlAnimationOverlayCalculator::~GlAnimationOverlayCalculator() { + helper_.RunInGlContext([this] { + if (program_) { + GLCHECK(glDeleteProgram(program_)); + program_ = 0; + } + if (depth_buffer_created_) { + GLCHECK(glDeleteRenderbuffers(1, &renderbuffer_)); + renderbuffer_ = 0; + } + if (texture_.width() > 0) { + texture_.Release(); + } + if (mask_texture_.width() > 0) { + mask_texture_.Release(); + } + }); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.proto new file mode 100644 index 000000000..4966f0ae9 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.proto @@ -0,0 +1,41 @@ +// Copyright 2019 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message GlAnimationOverlayCalculatorOptions { + extend CalculatorOptions { + optional GlAnimationOverlayCalculatorOptions ext = 174760573; + } + + // Default aspect ratio of rendering target width over height. + // This specific value is for 3:4 view. Do not change this default value. + optional float aspect_ratio = 1 [default = 0.75]; + // Default vertical field of view in degrees. This specific default value + // is arbitrary. Do not change this default value. If you want to use + // a different vertical_fov_degrees, set it in the options. 
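+  // For example, a graph can override the default via node_options:
+  //   [type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
+  //     vertical_fov_degrees: 60.0
+  //   }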
+  optional float vertical_fov_degrees = 2 [default = 70.0];
+
+  // Perspective projection matrix z-clipping near plane value.
+  optional float z_clipping_plane_near = 3 [default = 0.1];
+  // Perspective projection matrix z-clipping far plane value.
+  optional float z_clipping_plane_far = 4 [default = 1000.0];
+
+  // Speed at which to play the animation (in frames per second).
+  optional float animation_speed_fps = 5 [default = 25.0];
+}
diff --git a/mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.cc
new file mode 100644
index 000000000..0f1a9966a
--- /dev/null
+++ b/mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.cc
@@ -0,0 +1,168 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+
+#include "Eigen/Dense"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
+#include "absl/types/span.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/deps/file_path.h"
+#include "mediapipe/framework/port/ret_check.h"
+#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
+#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
+#include "mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.pb.h"
+#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
+
+namespace {
+constexpr char kInputStreamTag[] = "FRAME_ANNOTATION";
+constexpr char kOutputStreamTag[] = "LIFTED_FRAME_ANNOTATION";
+
+// Each detected object will be assigned a unique id that starts from 1.
+static int object_id = 0;
+
+inline int GetNextObjectId() { return ++object_id; }
+}  // namespace
+
+namespace mediapipe {
+
+// Lifts the 2D points in a tracked frame annotation to 3D.
+//
+// Input:
+//  FRAME_ANNOTATION - Frame annotation with detected 2D points.
+// Output:
+//  LIFTED_FRAME_ANNOTATION - Resulting FrameAnnotation with lifted 3D points.
+//
+// Usage example:
+// node {
+//   calculator: "Lift2DFrameAnnotationTo3DCalculator"
+//   input_stream: "FRAME_ANNOTATION:tracked_annotations"
+//   output_stream: "LIFTED_FRAME_ANNOTATION:lifted_3d_annotations"
+// }
+class Lift2DFrameAnnotationTo3DCalculator : public CalculatorBase {
+ public:
+  static ::mediapipe::Status GetContract(CalculatorContract* cc);
+
+  ::mediapipe::Status Open(CalculatorContext* cc) override;
+  ::mediapipe::Status Process(CalculatorContext* cc) override;
+  ::mediapipe::Status Close(CalculatorContext* cc) override;
+
+ private:
+  ::mediapipe::Status ProcessCPU(CalculatorContext* cc,
+                                 FrameAnnotation* output_objects);
+  ::mediapipe::Status LoadOptions(CalculatorContext* cc);
+
+  // Increment and assign object ID for each detected object.
+  // In a single MediaPipe session, the IDs are unique.
+ // Also assign timestamp for the FrameAnnotation to be the input packet + // timestamp. + void AssignObjectIdAndTimestamp(int64 timestamp_us, + FrameAnnotation* annotation); + std::unique_ptr decoder_; + ::mediapipe::Lift2DFrameAnnotationTo3DCalculatorOptions options_; + Eigen::Matrix projection_matrix_; +}; +REGISTER_CALCULATOR(Lift2DFrameAnnotationTo3DCalculator); + +::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(cc->Inputs().HasTag(kInputStreamTag)); + RET_CHECK(cc->Outputs().HasTag(kOutputStreamTag)); + cc->Inputs().Tag(kInputStreamTag).Set(); + cc->Outputs().Tag(kOutputStreamTag).Set(); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Open( + CalculatorContext* cc) { + MP_RETURN_IF_ERROR(LoadOptions(cc)); + // clang-format off + projection_matrix_ << + 1.5731, 0, 0, 0, + 0, 2.0975, 0, 0, + 0, 0, -1.0002, -0.2, + 0, 0, -1, 0; + // clang-format on + + decoder_ = absl::make_unique( + BeliefDecoderConfig(options_.decoder_config())); + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().Tag(kInputStreamTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + + auto output_objects = absl::make_unique(); + + MP_RETURN_IF_ERROR(ProcessCPU(cc, output_objects.get())); + + // Output + if (cc->Outputs().HasTag(kOutputStreamTag)) { + cc->Outputs() + .Tag(kOutputStreamTag) + .Add(output_objects.release(), cc->InputTimestamp()); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::ProcessCPU( + CalculatorContext* cc, FrameAnnotation* output_objects) { + const auto& input_frame_annotations = + cc->Inputs().Tag(kInputStreamTag).Get(); + // Copy the input frame annotation to the output + *output_objects = input_frame_annotations; + + auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait*/ true, + output_objects); + if (!status.ok()) { + LOG(ERROR) << status; + return status; + } + AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(), + output_objects); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Close( + CalculatorContext* cc) { + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::LoadOptions( + CalculatorContext* cc) { + // Get calculator options specified in the graph. + options_ = + cc->Options<::mediapipe::Lift2DFrameAnnotationTo3DCalculatorOptions>(); + + return ::mediapipe::OkStatus(); +} + +void Lift2DFrameAnnotationTo3DCalculator::AssignObjectIdAndTimestamp( + int64 timestamp_us, FrameAnnotation* annotation) { + for (auto& ann : *annotation->mutable_annotations()) { + ann.set_object_id(GetNextObjectId()); + } + annotation->set_timestamp(timestamp_us); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.proto new file mode 100644 index 000000000..ccbdf2ee4 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.proto @@ -0,0 +1,30 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The option proto for the Lift2DFrameAnnotationTo3DCalculatorOptions. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; +import "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto"; + +message Lift2DFrameAnnotationTo3DCalculatorOptions { + extend CalculatorOptions { + optional Lift2DFrameAnnotationTo3DCalculatorOptions ext = 290166284; + } + + optional BeliefDecoderConfig decoder_config = 1; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/model.cc b/mediapipe/graphs/object_detection_3d/calculators/model.cc new file mode 100644 index 000000000..e664aebb3 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/model.cc @@ -0,0 +1,101 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mediapipe/graphs/object_detection_3d/calculators/model.h" + +#include "mediapipe/framework/port/logging.h" + +namespace mediapipe { + +void Model::SetTransformation(const Eigen::Matrix4f& transform) { + transformation_ = transform; +} + +void Model::SetTranslation(const Eigen::Vector3f& translation) { + transformation_.col(3).template head<3>() = translation; +} + +void Model::SetRotation(float roll, float pitch, float yaw) { + // In our coordinate system, Y is up. We first rotate the object around Y + // (yaw), then around Z (pitch), and finally around X (roll). 
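+  // i.e. the combined rotation built below is
+  //   R = AngleAxis(yaw, UnitY) * AngleAxis(pitch, UnitZ) * AngleAxis(roll, UnitX).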
+ Eigen::Matrix3f r; + r = Eigen::AngleAxisf(yaw, Eigen::Vector3f::UnitY()) * + Eigen::AngleAxisf(pitch, Eigen::Vector3f::UnitZ()) * + Eigen::AngleAxisf(roll, Eigen::Vector3f::UnitX()); + transformation_.topLeftCorner<3, 3>() = r; +} + +void Model::SetRotation(const Eigen::Matrix3f& rotation) { + transformation_.topLeftCorner<3, 3>() = rotation; +} + +void Model::SetScale(const Eigen::Vector3f& scale) { scale_ = scale; } + +void Model::SetCategory(const std::string& category) { category_ = category; } + +const Eigen::Vector3f Model::GetRotationAngles() const { + Vector3f ypr = transformation_.topLeftCorner<3, 3>().eulerAngles(1, 2, 0); + return Vector3f(ypr(2), ypr(1), ypr(0)); // swap YPR with RPY +} + +const Eigen::Matrix4f& Model::GetTransformation() const { + return transformation_; +} + +const Eigen::Vector3f& Model::GetScale() const { return scale_; } + +const Eigen::Ref Model::GetTranslation() const { + return transformation_.col(3).template head<3>(); +} + +const Eigen::Ref Model::GetRotation() const { + return transformation_.template topLeftCorner<3, 3>(); +} + +const std::string& Model::GetCategory() const { return category_; } + +void Model::Deserialize(const Object& obj) { + CHECK_EQ(obj.rotation_size(), 9); + CHECK_EQ(obj.translation_size(), 3); + CHECK_EQ(obj.scale_size(), 3); + category_ = obj.category(); + + using RotationMatrix = Eigen::Matrix; + transformation_.setIdentity(); + transformation_.topLeftCorner<3, 3>() = + Eigen::Map(obj.rotation().data()); + transformation_.col(3).head<3>() = + Eigen::Map(obj.translation().data()); + scale_ = Eigen::Map(obj.scale().data()); + Update(); +} + +void Model::Serialize(Object* obj) { + obj->set_category(category_); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + obj->add_rotation(transformation_(i, j)); + } + } + + for (int i = 0; i < 3; ++i) { + obj->add_translation(transformation_(i, 3)); + } + + for (int i = 0; i < 3; ++i) { + obj->add_scale(scale_[i]); + } +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/model.h b/mediapipe/graphs/object_detection_3d/calculators/model.h new file mode 100644 index 000000000..301b21d7a --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/model.h @@ -0,0 +1,92 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_ + +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/object.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/types.h" + +namespace mediapipe { + +class Model { + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + + enum Type { + kVisualizationOnly = 0, + kBoundingBox, + kSkeleton, + kShape, // Shape is a virtual object. 
+ kNumModes, + }; + + virtual ~Model() = default; + + virtual void SetTransformation(const Eigen::Matrix4f& transform); + virtual void SetTranslation(const Eigen::Vector3f& translation); + + // Compute the rotation matrix from these angles and update the transformation + // matrix accordingly + virtual void SetRotation(float roll, float pitch, float yaw); + virtual void SetRotation(const Eigen::Matrix3f& rotation); + virtual void SetScale(const Eigen::Vector3f& scale); + virtual void SetCategory(const std::string& category); + virtual size_t GetNumberKeypoints() const { return number_keypoints_; } + + // Gets Euler angles in the order of roll, pitch, yaw. + virtual const Eigen::Vector3f GetRotationAngles() const; + virtual const Eigen::Matrix4f& GetTransformation() const; + virtual const Eigen::Vector3f& GetScale() const; + virtual const Eigen::Ref GetTranslation() const; + virtual const Eigen::Ref GetRotation() const; + virtual const std::string& GetCategory() const; + + // Update the model's keypoints in the world-coordinate system. + // The update includes transforming the model to the world-coordinate system + // as well as scaling the model. + // The user is expected to call this function after Setting the rotation, + // orientation or the scale of the model to get an updated model. + virtual void Update() = 0; + + // Update the model's parameters (orientation, position, and scale) from the + // user-provided variables. + virtual void Adjust(const std::vector& variables) = 0; + + // Returns a pointer to the model's keypoints. + // Use Eigen::Map to cast the pointer back to Vector3 or Vector4 + virtual const float* GetVertex(size_t id) const = 0; + virtual float* GetVertex(size_t id) = 0; + virtual void Deserialize(const Object& obj); + virtual void Serialize(Object* obj); + + // TODO: make member variables protected, and add public apis. + // 4x4 transformation matrix mapping the first keypoint to world coordinate + Eigen::Matrix4f transformation_; + Eigen::Vector3f scale_; // width, height, depth + Type model_type_; + size_t number_keypoints_; + std::string category_; + + protected: + Model(Type type, size_t number_keypoints, const std::string& category) + : model_type_(type), + number_keypoints_(number_keypoints), + category_(category) {} +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_ diff --git a/mediapipe/graphs/object_detection_3d/calculators/model_matrix.proto b/mediapipe/graphs/object_detection_3d/calculators/model_matrix.proto new file mode 100644 index 000000000..406cc9fc9 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/model_matrix.proto @@ -0,0 +1,48 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package mediapipe; + +message TimedModelMatrixProto { + // 4x4 model matrix stored in ROW major order. + repeated float matrix_entries = 1 [packed = true]; + // Timestamp of this model matrix in milliseconds. 
+ optional int64 time_msec = 2 [default = 0]; + // Unique per object id + optional int32 id = 3 [default = -1]; +} + +message TimedModelMatrixProtoList { + repeated TimedModelMatrixProto model_matrix = 1; +} + +// For convenience, when the desired information or transformation can be +// encoded into vectors (e.g. when the matrix represents a scale or Euler-angle- +// based rotation operation.) +message TimedVectorProto { + // The vector values themselves. + repeated float vector_entries = 1 [packed = true]; + + // Timestamp of this vector in milliseconds. + optional int64 time_msec = 2 [default = 0]; + + // Unique per object id + optional int32 id = 3 [default = -1]; +} + +message TimedVectorProtoList { + repeated TimedVectorProto vector_list = 1; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/object.proto b/mediapipe/graphs/object_detection_3d/calculators/object.proto new file mode 100644 index 000000000..a07e83fee --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/object.proto @@ -0,0 +1,124 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package mediapipe; + +message KeyPoint { + // The position of the keypoint in the local coordinate system of the rigid + // object. + float x = 1; + float y = 2; + float z = 3; + + // Sphere around the keypoint, indiciating annotator's confidence of the + // position in meters. + float confidence_radius = 4; + + // The name of the keypoint (e.g. legs, head, etc.). + // Does not have to be unique. + string name = 5; + + // Indicates whether the keypoint is hidden or not. + bool hidden = 6; +} + +message Object { + // Unique object id through a sequence. There might be multiple objects of + // the same label in this sequence. + int32 id = 1; + + // Describes what category an object is. E.g. object class, attribute, + // instance or person identity. This provides additional context for the + // object type. + string category = 2; + + enum Type { + UNDEFINED_TYPE = 0; + BOUNDING_BOX = 1; + SKELETON = 2; + } + + Type type = 3; + + // 3x3 row-major rotation matrix describing the orientation of the rigid + // object's frame of reference in the world-coordinate system. + repeated float rotation = 4; + + // 3x1 vector describing the translation of the rigid object's frame of + // reference in the world-coordinate system in meters. + repeated float translation = 5; + + // 3x1 vector describing the scale of the rigid object's frame of reference in + // the world-coordinate system in meters. + repeated float scale = 6; + + // List of all the key points associated with this object in the object + // coordinate system. + // The first keypoint is always the object's frame of reference, + // e.g. the centroid of the box. + // E.g. 
bounding box with its center as frame of reference, the 9 keypoints : + // {0., 0., 0.}, + // {-.5, -.5, -.5}, {-.5, -.5, +.5}, {-.5, +.5, -.5}, {-.5, +.5, +.5}, + // {+.5, -.5, -.5}, {+.5, -.5, +.5}, {+.5, +.5, -.5}, {+.5, +.5, +.5} + // To get the bounding box in the world-coordinate system, we first scale the + // box then transform the scaled box. + // For example, bounding box in the world coordinate system is + // rotation * scale * keypoints + translation + repeated KeyPoint keypoints = 7; + + // Enum to reflect how this object is created. + enum Method { + UNKNOWN_METHOD = 0; + ANNOTATION = 1; // Created by data annotation. + AUGMENTATION = 2; // Created by data augmentation. + } + Method method = 8; +} + +// The edge connecting two keypoints together +message Edge { + // keypoint id of the edge's source + int32 source = 1; + + // keypoint id of the edge's sink + int32 sink = 2; +} + +// The skeleton template for different objects (e.g. humans, chairs, hands, etc) +// The annotation tool reads the skeleton template dictionary. +message Skeleton { + // The origin keypoint in the object coordinate system. (i.e. Point 0, 0, 0) + int32 reference_keypoint = 1; + + // The skeleton's category (e.g. human, chair, hand.). Should be unique in the + // dictionary. + string category = 2; + + // Initialization value for all the keypoints in the skeleton in the object's + // local coordinate system. Pursuit will transform these points using object's + // transformation to get the keypoint in the world-cooridnate. + repeated KeyPoint keypoints = 3; + + // List of edges connecting keypoints + repeated Edge edges = 4; +} + +// The list of all the modeled skeletons in our library. These models can be +// objects (chairs, desks, etc), humans (full pose, hands, faces, etc), or box. +// We can have multiple skeletons in the same file. +message Skeletons { + repeated Skeleton object = 1; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/tensor_util.cc b/mediapipe/graphs/object_detection_3d/calculators/tensor_util.cc new file mode 100644 index 000000000..728b25f4c --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/tensor_util.cc @@ -0,0 +1,33 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h" + +#include "mediapipe/framework/port/logging.h" + +namespace mediapipe { + +cv::Mat ConvertTfliteTensorToCvMat(const TfLiteTensor& tensor) { + // Check tensor is BxCxWxH (size = 4) and the batch size is one(data[0] = 1) + CHECK(tensor.dims->size == 4 && tensor.dims->data[0] == 1); + CHECK_EQ(kTfLiteFloat32, tensor.type) << "tflite_tensor type is not float"; + + const size_t num_output_channels = tensor.dims->data[3]; + const int dims = 2; + const int sizes[] = {tensor.dims->data[1], tensor.dims->data[2]}; + const int type = CV_MAKETYPE(CV_32F, num_output_channels); + return cv::Mat(dims, sizes, type, reinterpret_cast(tensor.data.f)); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/tensor_util.h b/mediapipe/graphs/object_detection_3d/calculators/tensor_util.h new file mode 100644 index 000000000..0fb5b4933 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/tensor_util.h @@ -0,0 +1,27 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_ + +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "tensorflow/lite/interpreter.h" + +namespace mediapipe { + +// Converts a single channel tflite tensor to a grayscale image +cv::Mat ConvertTfliteTensorToCvMat(const TfLiteTensor& tensor); +} // namespace mediapipe + +#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_ diff --git a/mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.cc b/mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.cc new file mode 100644 index 000000000..3b3e692f0 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.cc @@ -0,0 +1,216 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "Eigen/Dense" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/deps/file_path.h" +#include "mediapipe/framework/port/opencv_core_inc.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h" +#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h" +#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h" +#include "mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.pb.h" +#include "tensorflow/lite/interpreter.h" + +namespace { +constexpr char kInputStreamTag[] = "TENSORS"; +constexpr char kOutputStreamTag[] = "ANNOTATIONS"; + +// Each detection object will be assigned an unique id that starts from 1. +static int object_id = 0; + +inline int GetNextObjectId() { return ++object_id; } +} // namespace + +namespace mediapipe { + +// Convert result TFLite tensors from deep pursuit 3d model into +// FrameAnnotation. +// +// Input: +// TENSORS - Vector of TfLiteTensor of type kTfLiteFloat32. +// Output: +// ANNOTATIONS - Result FrameAnnotation. +// +// Usage example: +// node { +// calculator: "TfLiteTensorsToObjectsCalculator" +// input_stream: "TENSORS:tensors" +// output_stream: "ANNOTATIONS:annotations" +// } +class TfLiteTensorsToObjectsCalculator : public CalculatorBase { + public: + static ::mediapipe::Status GetContract(CalculatorContract* cc); + + ::mediapipe::Status Open(CalculatorContext* cc) override; + ::mediapipe::Status Process(CalculatorContext* cc) override; + ::mediapipe::Status Close(CalculatorContext* cc) override; + + private: + ::mediapipe::Status ProcessCPU(CalculatorContext* cc, + FrameAnnotation* output_objects); + ::mediapipe::Status LoadOptions(CalculatorContext* cc); + // Takes point_3d in FrameAnnotation, projects to 2D, and overwrite the + // point_2d field with the projection. + void Project3DTo2D(bool portrait, FrameAnnotation* annotation) const; + // Increment and assign object ID for each detected object. + // In a single MediaPipe session, the IDs are unique. + // Also assign timestamp for the FrameAnnotation to be the input packet + // timestamp. 
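+  // Note: the counter behind GetNextObjectId() is a file-level static, so ids
+  // keep growing monotonically for the lifetime of the process.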
+ void AssignObjectIdAndTimestamp(int64 timestamp_us, + FrameAnnotation* annotation); + + int num_classes_ = 0; + int num_keypoints_ = 0; + + ::mediapipe::TfLiteTensorsToObjectsCalculatorOptions options_; + std::unique_ptr decoder_; + Eigen::Matrix projection_matrix_; +}; +REGISTER_CALCULATOR(TfLiteTensorsToObjectsCalculator); + +::mediapipe::Status TfLiteTensorsToObjectsCalculator::GetContract( + CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + + if (cc->Inputs().HasTag(kInputStreamTag)) { + cc->Inputs().Tag(kInputStreamTag).Set>(); + } + + if (cc->Outputs().HasTag(kOutputStreamTag)) { + cc->Outputs().Tag(kOutputStreamTag).Set(); + } + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TfLiteTensorsToObjectsCalculator::Open( + CalculatorContext* cc) { + MP_RETURN_IF_ERROR(LoadOptions(cc)); + // clang-format off + projection_matrix_ << + 1.5731, 0, 0, 0, + 0, 2.0975, 0, 0, + 0, 0, -1.0002, -0.2, + 0, 0, -1, 0; + // clang-format on + decoder_ = absl::make_unique( + BeliefDecoderConfig(options_.decoder_config())); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TfLiteTensorsToObjectsCalculator::Process( + CalculatorContext* cc) { + if (cc->Inputs().Tag(kInputStreamTag).IsEmpty()) { + return ::mediapipe::OkStatus(); + } + + auto output_objects = absl::make_unique(); + + MP_RETURN_IF_ERROR(ProcessCPU(cc, output_objects.get())); + + // Output + if (cc->Outputs().HasTag(kOutputStreamTag)) { + cc->Outputs() + .Tag(kOutputStreamTag) + .Add(output_objects.release(), cc->InputTimestamp()); + } + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TfLiteTensorsToObjectsCalculator::ProcessCPU( + CalculatorContext* cc, FrameAnnotation* output_objects) { + const auto& input_tensors = + cc->Inputs().Tag(kInputStreamTag).Get>(); + + cv::Mat prediction_heatmap = ConvertTfliteTensorToCvMat(input_tensors[0]); + cv::Mat offsetmap = ConvertTfliteTensorToCvMat(input_tensors[1]); + + *output_objects = + decoder_->DecodeBoundingBoxKeypoints(prediction_heatmap, offsetmap); + auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait*/ true, + output_objects); + if (!status.ok()) { + LOG(ERROR) << status; + return status; + } + Project3DTo2D(/*portrait*/ true, output_objects); + AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(), + output_objects); + + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TfLiteTensorsToObjectsCalculator::Close( + CalculatorContext* cc) { + return ::mediapipe::OkStatus(); +} + +::mediapipe::Status TfLiteTensorsToObjectsCalculator::LoadOptions( + CalculatorContext* cc) { + // Get calculator options specified in the graph. + options_ = + cc->Options<::mediapipe::TfLiteTensorsToObjectsCalculatorOptions>(); + + num_classes_ = options_.num_classes(); + num_keypoints_ = options_.num_keypoints(); + + // Currently only support 2D when num_values_per_keypoint equals to 2. 
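+  // That is, each decoded keypoint carries only an (x, y) pair; the z value is
+  // recovered afterwards by decoder_->Lift2DTo3D() in ProcessCPU().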
+ CHECK_EQ(options_.num_values_per_keypoint(), 2); + + return ::mediapipe::OkStatus(); +} + +void TfLiteTensorsToObjectsCalculator::Project3DTo2D( + bool portrait, FrameAnnotation* annotation) const { + for (auto& ann : *annotation->mutable_annotations()) { + for (auto& key_point : *ann.mutable_keypoints()) { + Eigen::Vector4f point3d; + point3d << key_point.point_3d().x(), key_point.point_3d().y(), + key_point.point_3d().z(), 1.0f; + Eigen::Vector4f point3d_projection = projection_matrix_ * point3d; + float u, v; + const float inv_w = 1.0f / point3d_projection(3); + if (portrait) { + u = (point3d_projection(1) * inv_w + 1.0f) * 0.5f; + v = (point3d_projection(0) * inv_w + 1.0f) * 0.5f; + } else { + u = (point3d_projection(0) * inv_w + 1.0f) * 0.5f; + v = (1.0f - point3d_projection(1) * inv_w) * 0.5f; + } + key_point.mutable_point_2d()->set_x(u); + key_point.mutable_point_2d()->set_y(v); + } + } +} + +void TfLiteTensorsToObjectsCalculator::AssignObjectIdAndTimestamp( + int64 timestamp_us, FrameAnnotation* annotation) { + for (auto& ann : *annotation->mutable_annotations()) { + ann.set_object_id(GetNextObjectId()); + } + annotation->set_timestamp(timestamp_us); +} + +} // namespace mediapipe diff --git a/mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.proto b/mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.proto new file mode 100644 index 000000000..4adf72f1a --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.proto @@ -0,0 +1,39 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The option proto for the TfLiteTensorsToObjectsCalculatorOptions. + +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; +import "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto"; + +message TfLiteTensorsToObjectsCalculatorOptions { + extend CalculatorOptions { + optional TfLiteTensorsToObjectsCalculatorOptions ext = 263667646; + } + + // The number of output classes predicted by the detection model. + optional int32 num_classes = 1; + + // The number of predicted keypoints. + optional int32 num_keypoints = 2; + // The dimension of each keypoint, e.g. number of values predicted for each + // keypoint. + optional int32 num_values_per_keypoint = 3 [default = 2]; + + optional BeliefDecoderConfig decoder_config = 4; +} diff --git a/mediapipe/graphs/object_detection_3d/calculators/types.h b/mediapipe/graphs/object_detection_3d/calculators/types.h new file mode 100644 index 000000000..db4d14728 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/calculators/types.h @@ -0,0 +1,56 @@ +// Copyright 2020 The MediaPipe Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_ +#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_ + +#include + +#include "Eigen/Geometry" + +namespace mediapipe { + +using Eigen::Map; +using Eigen::Vector2f; +using Eigen::Vector3f; +using Eigen::Vector4f; +using Matrix4f_RM = Eigen::Matrix; +using Matrix3f_RM = Eigen::Matrix; + +using Face = std::array; + +struct SuperPoint { + enum PointSourceType { kPointCloud = 0, kBoundingBox = 1, kSkeleton = 2 }; + // The id of the point in the point-cloud + int reference_point; + // The source of the + PointSourceType source; + // The id of the point in set of points in current frame + int id; + // If source is kBoundingBox or kSkeleton, object_id stores the id of which \ + // object this point belongs to. + int object_id; + // projected u-v value + Vector2f uv; + Vector2f pixel; + // the 3D point + Vector3f point_3d; + // Color + Eigen::Matrix color; + bool rendered; +}; + +} // namespace mediapipe + +#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_ diff --git a/mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt b/mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt new file mode 100644 index 000000000..5700bb1ad --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/chair_classic_occlusion_tracking.pbtxt @@ -0,0 +1,133 @@ +# MediaPipe object detection 3D with tracking graph. + +# Images on GPU coming into and out of the graph. +input_stream: "input_video" +input_stream: "input_width" +input_stream: "input_height" +output_stream: "output_video" + +# Crops the image from the center to the size WIDTHxHEIGHT. +node: { + calculator: "ImageCroppingCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:input_video_4x3" + input_stream: "WIDTH:input_width" + input_stream: "HEIGHT:input_height" + node_options: { + [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { + border_mode: BORDER_REPLICATE + } + } +} + +# Creates a copy of the input_video stream. At the end of the graph, the +# GlAnimationOverlayCalculator will consume the input_video texture and draws +# on top of it. +node: { + calculator: "GlScalerCalculator" + input_stream: "VIDEO:input_video_4x3" + output_stream: "VIDEO:input_video_copy" +} + +# Resamples the images by specific frame rate. This calculator is used to +# control the frequecy of subsequent calculators/subgraphs, e.g. less power +# consumption for expensive process. 
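+# With frame_rate: 5 below, the detection subgraph runs at most five times per
+# second, while tracking continues to run on every incoming camera frame.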
+node { + calculator: "PacketResamplerCalculator" + input_stream: "DATA:input_video_copy" + output_stream: "DATA:sampled_input_video" + node_options: { + [type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] { + frame_rate: 5 + } + } +} + +node { + calculator: "ObjectronDetectionSubgraphGpu" + input_stream: "IMAGE_GPU:sampled_input_video" + output_stream: "ANNOTATIONS:objects" +} + +node { + calculator: "ObjectronTrackingSubgraphGpu" + input_stream: "FRAME_ANNOTATION:objects" + input_stream: "IMAGE_GPU:input_video_copy" + output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" +} + +# The rendering nodes: +# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly +# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask. +# These models are designed using different tools, so we supply a transformation +# to bring both of them to the Objectron's coordinate system. + +# Creates a model matrices for the tracked object given the lifted 3D points. +# This calculator does two things: 1) Estimates object's pose (orientation, +# translation, and scale) from the 3D vertices, and +# 2) bring the object from the objectron's coordinate system to the renderer +# (OpenGL) coordinate system. Since the final goal is to render a mesh file on +# top of the object, we also supply a transformation to bring the mesh to the +# objectron's coordinate system, and rescale mesh to the unit size. +node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_tracked_objects" + output_stream: "MODEL_MATRICES:model_matrices" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: [0.05, 0.05, 0.05] + # Bring the box CAD model to objectron's coordinate system. This + # is equivalent of -pi/2 rotation along the y-axis (right-hand rule): + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY()) + model_transformation: [0.0, 0.0, -1.0, 0.0] + model_transformation: [0.0, 1.0, 0.0, 0.0] + model_transformation: [1.0, 0.0, 0.0, 0.0] + model_transformation: [0.0, 0.0, 0.0, 1.0] + } + } +} + +# Compute the model matrices for the CAD model of the chair, to be used as an +# occlusion mask. The model will be rendered at the exact same location as the +# bounding box. +node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_tracked_objects" + output_stream: "MODEL_MATRICES:mask_model_matrices" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: [0.15, 0.1, 0.15] + # Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This + # is equivalent of -pi/2 rotation along the x-axis: + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX()) + model_transformation: [1.0, 0.0, 0.0, 0.0] + model_transformation: [0.0, 1.0, 0.0, -10.0] + model_transformation: [0.0, 0.0, -1.0, 0.0] + model_transformation: [0.0, 0.0, 0.0, 1.0] + } + } +} + +# Render everything together. First we render the 3D bounding box animation, +# then we render the occlusion mask. 
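+# (Inside GlAnimationOverlayCalculator the mask mesh is drawn as a depth-only
+# pass before the box animation, with the color channels masked off, so it
+# occludes the box geometry without being visible itself.)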
+node:{ + calculator:"GlAnimationOverlayCalculator" + input_stream:"VIDEO:input_video_4x3" + input_stream:"MODEL_MATRICES:model_matrices" + input_stream:"MASK_MODEL_MATRICES:mask_model_matrices" + output_stream:"output_video" + input_side_packet:"TEXTURE:box_texture" + input_side_packet:"ANIMATION_ASSET:box_asset_name" + input_side_packet:"MASK_TEXTURE:obj_texture" + input_side_packet:"MASK_ASSET:obj_asset_name" + node_options: { + [type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] { + # Output resolution is 480x640 with the aspect ratio of 0.75 + aspect_ratio: 0.75 + vertical_fov_degrees: 70. + animation_speed_fps: 25 + } + } +} diff --git a/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt b/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt new file mode 100644 index 000000000..0889c1d5a --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt @@ -0,0 +1,134 @@ +# MediaPipe object detection 3D with tracking graph. + +# Images on GPU coming into and out of the graph. +input_stream: "input_video" +input_stream: "input_width" +input_stream: "input_height" +output_stream: "output_video" + +# Crops the image from the center to the size WIDTHxHEIGHT. +node: { + calculator: "ImageCroppingCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:input_video_4x3" + input_stream: "WIDTH:input_width" + input_stream: "HEIGHT:input_height" + node_options: { + [type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] { + border_mode: BORDER_REPLICATE + } + } +} + +# Creates a copy of the input_video stream. At the end of the graph, the +# GlAnimationOverlayCalculator will consume the input_video texture and draws +# on top of it. +node: { + calculator: "GlScalerCalculator" + input_stream: "VIDEO:input_video_4x3" + output_stream: "VIDEO:input_video_copy" +} + +# Resamples the images by specific frame rate. This calculator is used to +# control the frequecy of subsequent calculators/subgraphs, e.g. less power +# consumption for expensive process. +node { + calculator: "PacketResamplerCalculator" + input_stream: "DATA:input_video_copy" + output_stream: "DATA:sampled_input_video" + node_options: { + [type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] { + frame_rate: 5 + } + } +} + +node { + calculator: "ObjectronDetectionSubgraphGpu" + input_stream: "IMAGE_GPU:sampled_input_video" + output_stream: "ANNOTATIONS:objects" +} + +node { + calculator: "ObjectronTrackingSubgraphGpu" + input_stream: "FRAME_ANNOTATION:objects" + input_stream: "IMAGE_GPU:input_video_copy" + output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" +} + +# The rendering nodes: +# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly +# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask. +# These models are designed using different tools, so we supply a transformation +# to bring both of them to the Objectron's coordinate system. + +# Creates a model matrices for the tracked object given the lifted 3D points. +# This calculator does two things: 1) Estimates object's pose (orientation, +# translation, and scale) from the 3D vertices, and +# 2) bring the object from the objectron's coordinate system to the renderer +# (OpenGL) coordinate system. Since the final goal is to render a mesh file on +# top of the object, we also supply a transformation to bring the mesh to the +# objectron's coordinate system, and rescale mesh to the unit size. 
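+# Note that both AnnotationsToModelMatricesCalculator nodes below consume the
+# same lifted_tracked_objects stream, so the occlusion mesh is always placed at
+# the same estimated pose as the rendered bounding box.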
+node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_tracked_objects" + output_stream: "MODEL_MATRICES:model_matrices" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: [0.05, 0.05, 0.05] + # Bring the box CAD model to objectron's coordinate system. This + # is equivalent of -pi/2 rotation along the y-axis (right-hand rule): + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY()) + model_transformation: [0.0, 0.0, -1.0, 0.0] + model_transformation: [0.0, 1.0, 0.0, 0.0] + model_transformation: [1.0, 0.0, 0.0, 0.0] + model_transformation: [0.0, 0.0, 0.0, 1.0] + } + } +} + +# Compute the model matrices for the CAD model of the shoe, to be used as an +# occlusion mask. The model will be rendered at the exact same location as the +# bounding box. +node { + calculator: "AnnotationsToModelMatricesCalculator" + input_stream: "ANNOTATIONS:lifted_tracked_objects" + output_stream: "MODEL_MATRICES:mask_model_matrices" + #input_side_packet: "MODEL_SCALE:model_scale" + node_options: { + [type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] { + # Re-scale the CAD model to the size of a unit box + model_scale: [0.45, 0.25, 0.15] + # Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This + # is equivalent of -pi/2 rotation along the x-axis (right-hand rule): + # Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX()) + model_transformation: [1.0, 0.0, 0.0, 0.0] + model_transformation: [0.0, 0.0, 1.0, 0.0] + model_transformation: [0.0, -1.0, 0.0, 0.0] + model_transformation: [0.0, 0.0, 0.0, 1.0] + } + } +} + +# Render everything together. First we render the 3D bounding box animation, +# then we render the occlusion mask. +node: { + calculator: "GlAnimationOverlayCalculator" + input_stream: "VIDEO:input_video_4x3" + input_stream: "MODEL_MATRICES:model_matrices" + input_stream: "MASK_MODEL_MATRICES:mask_model_matrices" + output_stream: "output_video" + input_side_packet: "TEXTURE:box_texture" + input_side_packet: "ANIMATION_ASSET:box_asset_name" + input_side_packet: "MASK_TEXTURE:obj_texture" + input_side_packet: "MASK_ASSET:obj_asset_name" + node_options: { + [type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] { + # Output resolution is 480x640 with the aspect ratio of 0.75 + aspect_ratio: 0.75 + vertical_fov_degrees: 70. + animation_speed_fps: 25 + } + } +} diff --git a/mediapipe/graphs/object_detection_3d/subgraphs/BUILD b/mediapipe/graphs/object_detection_3d/subgraphs/BUILD new file mode 100644 index 000000000..f471e4189 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/subgraphs/BUILD @@ -0,0 +1,52 @@ +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_simple_subgraph", +) + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +mediapipe_simple_subgraph( + name = "objectron_detection_gpu", + graph = "objectron_detection_gpu.pbtxt", + register_as = "ObjectronDetectionSubgraphGpu", + deps = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tflite:tflite_converter_calculator", + "//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator", + "//mediapipe/calculators/tflite:tflite_inference_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:tflite_tensors_to_objects_calculator", + ], +) + +mediapipe_simple_subgraph( + name = "objectron_tracking_gpu", + graph = "objectron_tracking_gpu.pbtxt", + register_as = "ObjectronTrackingSubgraphGpu", + deps = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/video:box_tracker_calculator", + "//mediapipe/calculators/video:flow_packager_calculator", + "//mediapipe/calculators/video:motion_analysis_calculator", + "//mediapipe/framework/stream_handler:sync_set_input_stream_handler", + "//mediapipe/gpu:gpu_buffer_to_image_frame_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_to_timed_box_list_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_tracker_calculator", + "//mediapipe/graphs/object_detection_3d/calculators:lift_2d_frame_annotation_to_3d_calculator", + ], +) diff --git a/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt b/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt new file mode 100644 index 000000000..ad0d3653b --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt @@ -0,0 +1,81 @@ +# MediaPipe Objectron detection gpu subgraph + +type: "ObjectronDetectionSubgraphGpu" + +input_stream: "IMAGE_GPU:input_video" +output_stream: "ANNOTATIONS:objects" + +# Transforms the input image on GPU to a 480x640 image. To scale the input +# image, the scale_mode option is set to FIT to preserve the aspect ratio, +# resulting in potential letterboxing in the transformed image. +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 480 + output_height: 640 + scale_mode: FIT + } + } +} + +# Converts the transformed input image on GPU into an image tensor stored as a +# TfLiteTensor. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE_GPU:transformed_input_video" + output_stream: "TENSORS_GPU:image_tensor" +} + +# Generates a single side packet containing a TensorFlow Lite op resolver that +# supports custom ops needed by the model used in this graph. +node { + calculator: "TfLiteCustomOpResolverCalculator" + output_side_packet: "opresolver" + node_options: { + [type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] { + use_gpu: true + } + } +} + +# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. 
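+# For this graph the model is object_detection_3d.tflite, whose output tensors
+# (a keypoint heatmap and offset/distance fields) are decoded by the
+# TfLiteTensorsToObjectsCalculator node further below.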
+node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS_GPU:image_tensor" + output_stream: "TENSORS:detection_tensors" + input_side_packet: "CUSTOM_OP_RESOLVER:opresolver" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "object_detection_3d.tflite" + } + } +} + +# Decodes the model's output tensor (the heatmap and the distance fields) to 2D +# keypoints. There are nine 2D keypoints: one center keypoint and eight vertices +# for the 3D bounding box. The calculator parameters determine's the decoder's +# sensitivity. +node { + calculator: "TfLiteTensorsToObjectsCalculator" + input_stream: "TENSORS:detection_tensors" + output_stream: "ANNOTATIONS:objects" + node_options: { + [type.googleapis.com/mediapipe.TfLiteTensorsToObjectsCalculatorOptions] { + num_classes: 1 + num_keypoints: 9 + decoder_config { + heatmap_threshold: 0.6 + local_max_distance: 2 + offset_scale_coef: 1.0 + voting_radius: 2 + voting_allowance: 1 + voting_threshold: 0.2 + } + } + } +} diff --git a/mediapipe/graphs/object_detection_3d/subgraphs/objectron_tracking_gpu.pbtxt b/mediapipe/graphs/object_detection_3d/subgraphs/objectron_tracking_gpu.pbtxt new file mode 100644 index 000000000..b6c778c44 --- /dev/null +++ b/mediapipe/graphs/object_detection_3d/subgraphs/objectron_tracking_gpu.pbtxt @@ -0,0 +1,170 @@ +# MediaPipe Objectron tracking gpu subgraph + +type: "ObjectronTrackingSubgraphGpu" + +input_stream: "FRAME_ANNOTATION:objects" +input_stream: "IMAGE_GPU:input_video" +output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects" + + +# Converts the detected keypoints to Boxes, used by the tracking subgraph. +node { + calculator: "FrameAnnotationToTimedBoxListCalculator" + input_stream: "FRAME_ANNOTATION:objects" + output_stream: "BOXES:start_pos" +} + +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE_GPU:input_video" + output_stream: "IMAGE_GPU:downscaled_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 240 + output_height: 320 + } + } +} + +# Converts GPU buffer to ImageFrame for processing tracking. +node: { + calculator: "GpuBufferToImageFrameCalculator" + input_stream: "downscaled_input_video" + output_stream: "downscaled_input_video_cpu" +} + +# Performs motion analysis on an incoming video stream. +node: { + calculator: "MotionAnalysisCalculator" + input_stream: "VIDEO:downscaled_input_video_cpu" + output_stream: "CAMERA:camera_motion" + output_stream: "FLOW:region_flow" + + node_options: { + [type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: { + analysis_options { + analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE + flow_options { + fast_estimation_min_block_size: 100 + top_inlier_sets: 1 + frac_inlier_error_threshold: 3e-3 + downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE + verification_distance: 5.0 + verify_long_feature_acceleration: true + verify_long_feature_trigger_ratio: 0.1 + tracking_options { + max_features: 500 + adaptive_extraction_levels: 2 + min_eig_val_settings { + adaptive_lowest_quality_level: 2e-4 + } + klt_tracker_implementation: KLT_OPENCV + } + } + } + } + } +} + +# Reads optical flow fields defined in +# mediapipe/framework/formats/motion/optical_flow_field.h, +# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized +# to 0-255. 
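+# The packaged tracking_data drives the BoxTrackerCalculator below, which keeps
+# the detected boxes up to date between the lower-frequency detector updates.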
+node: {
+  calculator: "FlowPackagerCalculator"
+  input_stream: "FLOW:region_flow"
+  input_stream: "CAMERA:camera_motion"
+  output_stream: "TRACKING:tracking_data"
+
+  node_options: {
+    [type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
+      flow_packager_options: {
+        binary_tracking_data_support: false
+      }
+    }
+  }
+}
+
+# Tracks box positions over time.
+node: {
+  calculator: "BoxTrackerCalculator"
+  input_stream: "TRACKING:tracking_data"
+  input_stream: "TRACK_TIME:input_video"
+  input_stream: "START_POS:start_pos"
+  input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
+  input_stream_info: {
+    tag_index: "CANCEL_OBJECT_ID"
+    back_edge: true
+  }
+  output_stream: "BOXES:boxes"
+
+  input_stream_handler {
+    input_stream_handler: "SyncSetInputStreamHandler"
+    options {
+      [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+        sync_set {
+          tag_index: "TRACKING"
+          tag_index: "TRACK_TIME"
+        }
+        sync_set {
+          tag_index: "START_POS"
+        }
+        sync_set {
+          tag_index: "CANCEL_OBJECT_ID"
+        }
+      }
+    }
+  }
+
+  node_options: {
+    [type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
+      tracker_options: {
+        track_step_options {
+          track_object_and_camera: true
+          tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE
+          inlier_spring_force: 0.0
+          static_motion_temporal_ratio: 3e-2
+        }
+      }
+      visualize_tracking_data: false
+      streaming_track_data_cache_size: 100
+    }
+  }
+}
+
+# Consolidates tracking and detection results.
+node {
+  calculator: "FrameAnnotationTrackerCalculator"
+  input_stream: "FRAME_ANNOTATION:objects"
+  input_stream: "TRACKED_BOXES:boxes"
+  output_stream: "TRACKED_FRAME_ANNOTATION:tracked_objects"
+  output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
+  node_options: {
+    [type.googleapis.com/mediapipe.FrameAnnotationTrackerCalculatorOptions] {
+      img_width: 240
+      img_height: 320
+      iou_threshold: 0.1
+    }
+  }
+
+  input_stream_handler {
+    input_stream_handler: "SyncSetInputStreamHandler"
+    options {
+      [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+        sync_set {
+          tag_index: "FRAME_ANNOTATION"
+        }
+        sync_set {
+          tag_index: "TRACKED_BOXES"
+        }
+      }
+    }
+  }
+}
+
+# Lifts the tracked 2D keypoints to 3D using the EPnP algorithm.
+node {
+  calculator: "Lift2DFrameAnnotationTo3DCalculator"
+  input_stream: "FRAME_ANNOTATION:tracked_objects"
+  output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
+}
diff --git a/mediapipe/java/com/google/mediapipe/components/CameraXPreviewHelper.java b/mediapipe/java/com/google/mediapipe/components/CameraXPreviewHelper.java
index 24ee92aa6..b2b36d351 100644
--- a/mediapipe/java/com/google/mediapipe/components/CameraXPreviewHelper.java
+++ b/mediapipe/java/com/google/mediapipe/components/CameraXPreviewHelper.java
@@ -67,15 +67,19 @@ public class CameraXPreviewHelper extends CameraHelper {
   private int cameraTimestampSource = CameraCharacteristics.SENSOR_INFO_TIMESTAMP_SOURCE_UNKNOWN;
 
   @Override
-  @SuppressWarnings("RestrictTo") // See b/132705545.
   public void startCamera(
       Activity context, CameraFacing cameraFacing, SurfaceTexture surfaceTexture) {
+    startCamera(context, cameraFacing, surfaceTexture, TARGET_SIZE);
+  }
+
+  public void startCamera(
+      Activity context, CameraFacing cameraFacing, SurfaceTexture surfaceTexture, Size targetSize) {
     LensFacing cameraLensFacing =
         cameraFacing == CameraHelper.CameraFacing.FRONT ?
            LensFacing.FRONT : LensFacing.BACK;
     PreviewConfig previewConfig =
         new PreviewConfig.Builder()
             .setLensFacing(cameraLensFacing)
-            .setTargetResolution(TARGET_SIZE)
+            .setTargetResolution(targetSize)
             .build();
     preview = new Preview(previewConfig);
@@ -110,7 +114,6 @@ public class CameraXPreviewHelper extends CameraHelper {
           }
         });
     CameraX.bindToLifecycle(/*lifecycleOwner=*/ (LifecycleOwner) context, preview);
-
   }
 
   @Override
@@ -210,6 +213,10 @@ public class CameraXPreviewHelper extends CameraHelper {
     return focalLengthPixels;
   }
 
+  public Size getFrameSize() {
+    return frameSize;
+  }
+
   // Computes the focal length of the camera in pixels based on lens and sensor properties.
   private float calculateFocalLengthInPixels() {
     // Focal length of the camera in millimeters.
diff --git a/mediapipe/models/object_detection_3d_chair.tflite b/mediapipe/models/object_detection_3d_chair.tflite
new file mode 100644
index 000000000..718dc9766
Binary files /dev/null and b/mediapipe/models/object_detection_3d_chair.tflite differ
diff --git a/mediapipe/models/object_detection_3d_sneakers.tflite b/mediapipe/models/object_detection_3d_sneakers.tflite
new file mode 100644
index 000000000..207711433
Binary files /dev/null and b/mediapipe/models/object_detection_3d_sneakers.tflite differ
diff --git a/mediapipe/util/tflite/BUILD b/mediapipe/util/tflite/BUILD
index da6d432d4..0e13bf667 100644
--- a/mediapipe/util/tflite/BUILD
+++ b/mediapipe/util/tflite/BUILD
@@ -41,3 +41,37 @@ cc_library(
         "@org_tensorflow//tensorflow/lite/kernels:builtin_ops",
     ],
 )
+
+cc_library(
+    name = "tensor_buffer",
+    srcs = ["tensor_buffer.cc"],
+    hdrs = ["tensor_buffer.h"],
+    deps = [
+        "@org_tensorflow//tensorflow/lite:framework",
+        "@com_google_absl//absl/memory",
+        "//mediapipe/framework:port",
+    ] + select({
+        "//mediapipe/gpu:disable_gpu": [],
+        "//mediapipe:ios": [
+            "//mediapipe/gpu:MPPMetalUtil",
+            "//mediapipe/gpu:gl_base",
+        ],
+        "//conditions:default": [
+            "@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_buffer",
+            "//mediapipe/gpu:gl_base",
+            "//mediapipe/gpu:gl_context",
+        ],
+    }),
+)
+
+cc_test(
+    name = "tensor_buffer_test",
+    srcs = ["tensor_buffer_test.cc"],
+    deps = [
+        ":tensor_buffer",
+        "//mediapipe/framework/port:gtest_main",
+    ] + select({
+        "//mediapipe/gpu:disable_gpu": [],
+        "//conditions:default": [],
+    }),
+)
diff --git a/mediapipe/util/tflite/tensor_buffer.cc b/mediapipe/util/tflite/tensor_buffer.cc
new file mode 100644
index 000000000..47b2487ff
--- /dev/null
+++ b/mediapipe/util/tflite/tensor_buffer.cc
@@ -0,0 +1,43 @@
+#include "mediapipe/util/tflite/tensor_buffer.h"
+
+namespace mediapipe {
+
+TensorBuffer::TensorBuffer() {}
+
+TensorBuffer::~TensorBuffer() { uses_gpu_ = false; }
+
+TensorBuffer::TensorBuffer(TfLiteTensor& tensor) {
+  cpu_ = tensor;
+  uses_gpu_ = false;
+}
+
+#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
+TensorBuffer::TensorBuffer(std::shared_ptr<tflite::gpu::gl::GlBuffer> tensor) {
+  gpu_ = std::move(tensor);
+  uses_gpu_ = true;
+}
+// static
+std::shared_ptr<tflite::gpu::gl::GlBuffer> TensorBuffer::CreateGlBuffer(
+    std::shared_ptr<mediapipe::GlContext> context) {
+  std::shared_ptr<tflite::gpu::gl::GlBuffer> ptr(
+      new tflite::gpu::gl::GlBuffer, [context](tflite::gpu::gl::GlBuffer* ref) {
+        if (context) {
+          context->Run([ref]() {
+            if (ref) delete ref;
+          });
+        } else {
+          if (ref) delete ref;  // No context provided.
+        }
+      });
+  return ptr;
+}
+#endif  // MEDIAPIPE_DISABLE_GL_COMPUTE
+
+#if defined(MEDIAPIPE_IOS)
+TensorBuffer::TensorBuffer(id<MTLBuffer> tensor) {
+  gpu_ = tensor;
+  uses_gpu_ = true;
+}
+#endif  // MEDIAPIPE_IOS
+
+}  // namespace mediapipe
diff --git a/mediapipe/util/tflite/tensor_buffer.h b/mediapipe/util/tflite/tensor_buffer.h
new file mode 100644
index 000000000..e41e96ba4
--- /dev/null
+++ b/mediapipe/util/tflite/tensor_buffer.h
@@ -0,0 +1,78 @@
+// Copyright 2020 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_UTIL_TFLITE_TENSOR_BUFFER_H_
+#define MEDIAPIPE_UTIL_TFLITE_TENSOR_BUFFER_H_
+
+#include "absl/memory/memory.h"
+#include "mediapipe/framework/port.h"
+#include "tensorflow/lite/interpreter.h"
+
+#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
+#include "mediapipe/gpu/gl_context.h"
+#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
+#endif  // MEDIAPIPE_DISABLE_GL_COMPUTE
+
+#if defined(MEDIAPIPE_IOS)
+#import <Metal/Metal.h>
+#endif  // MEDIAPIPE_IOS
+
+namespace mediapipe {
+
+class TensorBuffer {
+ public:
+  TensorBuffer();
+  ~TensorBuffer();
+
+  TensorBuffer(TfLiteTensor& tensor);
+  TfLiteTensor* GetTfLiteTensor() { return &cpu_; }
+  const TfLiteTensor* GetTfLiteTensor() const { return &cpu_; }
+
+#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
+  TensorBuffer(std::shared_ptr<tflite::gpu::gl::GlBuffer> tensor);
+  std::shared_ptr<tflite::gpu::gl::GlBuffer> GetGlBuffer() { return gpu_; }
+  const std::shared_ptr<tflite::gpu::gl::GlBuffer> GetGlBuffer() const {
+    return gpu_;
+  }
+  // Example use:
+  // auto tensor_buf = TensorBuffer(TensorBuffer::CreateGlBuffer(gl_context));
+  static std::shared_ptr<tflite::gpu::gl::GlBuffer> CreateGlBuffer(
+      std::shared_ptr<mediapipe::GlContext> context);
+#endif  // MEDIAPIPE_DISABLE_GL_COMPUTE
+
+#if defined(MEDIAPIPE_IOS)
+  TensorBuffer(id<MTLBuffer> tensor);
+  id<MTLBuffer> GetMetalBuffer() { return gpu_; }
+  const id<MTLBuffer> GetMetalBuffer() const { return gpu_; }
+#endif  // MEDIAPIPE_IOS
+
+  bool UsesGpu() const { return uses_gpu_; }
+
+ private:
+  TfLiteTensor cpu_;
+
+#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
+  std::shared_ptr<tflite::gpu::gl::GlBuffer> gpu_;
+#endif  // MEDIAPIPE_DISABLE_GL_COMPUTE
+
+#if defined(MEDIAPIPE_IOS)
+  id<MTLBuffer> gpu_;
+#endif  // MEDIAPIPE_IOS
+
+  bool uses_gpu_ = false;
+};
+
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_UTIL_TFLITE_TENSOR_BUFFER_H_
diff --git a/mediapipe/util/tflite/tensor_buffer_test.cc b/mediapipe/util/tflite/tensor_buffer_test.cc
new file mode 100644
index 000000000..197a60f79
--- /dev/null
+++ b/mediapipe/util/tflite/tensor_buffer_test.cc
@@ -0,0 +1,30 @@
+#include "mediapipe/util/tflite/tensor_buffer.h"
+
+#include "mediapipe/framework/port/gmock.h"
+#include "mediapipe/framework/port/gtest.h"
+
+namespace mediapipe {
+
+TEST(Cpu, BasicTest) {
+  TensorBuffer tb;
+  TfLiteTensor tfl_tb;
+  tb = TensorBuffer(tfl_tb);
+  EXPECT_FALSE(tb.UsesGpu());
+}
+
+#if !defined(MEDIAPIPE_DISABLE_GPU)
+TEST(Gpu, BasicTest) {
+  TensorBuffer tb;
+  std::shared_ptr<tflite::gpu::gl::GlBuffer> tfg_tb =
+      TensorBuffer::CreateGlBuffer(nullptr);
+  tb = TensorBuffer(tfg_tb);
+  EXPECT_TRUE(tb.UsesGpu());
+}
+#endif  // !MEDIAPIPE_DISABLE_GPU
+
+}  // namespace mediapipe
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/mediapipe/util/tracking/BUILD b/mediapipe/util/tracking/BUILD
index 6eb4a16a4..5bc125c34 100644
--- a/mediapipe/util/tracking/BUILD
+++ b/mediapipe/util/tracking/BUILD
@@ -670,6 +670,7 @@ cc_test(
     copts = PARALLEL_COPTS,
     data = ["testdata/stabilize_test.png"],
     linkopts = PARALLEL_LINKOPTS,
+    linkstatic = 1,
     deps = [
         ":region_flow",
         ":region_flow_cc_proto",
diff --git a/mediapipe/util/tracking/box_tracker.proto b/mediapipe/util/tracking/box_tracker.proto
index 1d778da1c..404c1bd3c 100644
--- a/mediapipe/util/tracking/box_tracker.proto
+++ b/mediapipe/util/tracking/box_tracker.proto
@@ -43,7 +43,7 @@ message BoxTrackerOptions {
   optional TrackStepOptions track_step_options = 6;
 }
 
-// Next tag: 13
+// Next tag: 14
 // Proto equivalent of struct TimedBox.
 message TimedBoxProto {
   // Normalized coords - in [0, 1]
@@ -59,6 +59,9 @@ message TimedBoxProto {
   // Unique per object id to disambiguate boxes.
   optional int32 id = 6 [default = -1];
 
+  // Box label name.
+  optional string label = 13;
+
   // Confidence of box tracked in the range [0, 1], with 0 being least
   // confident, and 1 being most confident. A reasonable threshold is 0.5
   // to filter out unconfident boxes.
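
Note for reviewers: the snippet below is a minimal, illustrative sketch of how the new TensorBuffer wrapper introduced in this change can be used; it is not part of the diff. It exercises only the API visible above (the TfLiteTensor constructor, CreateGlBuffer, the GlBuffer constructor, UsesGpu, and the getters). The helper function names, the example namespace, and the choice to pass a null GlContext are assumptions made for the example.

// Illustrative usage sketch for the TensorBuffer wrapper (not part of this
// change). Helper names and the null GlContext argument are assumptions.
#include <memory>

#include "mediapipe/util/tflite/tensor_buffer.h"

namespace tensor_buffer_example {

void WrapCpuTensor(TfLiteTensor& tensor) {
  // Wrapping a plain TfLiteTensor produces a CPU-backed buffer.
  mediapipe::TensorBuffer cpu_buffer(tensor);
  // cpu_buffer.UsesGpu() is false; GetTfLiteTensor() exposes the wrapped tensor.
  TfLiteTensor* raw = cpu_buffer.GetTfLiteTensor();
  (void)raw;
}

#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
void WrapGpuTensor() {
  // CreateGlBuffer() returns a shared_ptr whose deleter runs on the given
  // GlContext; passing nullptr (as tensor_buffer_test.cc does) makes the
  // deleter fall back to deleting the GlBuffer on the calling thread.
  std::shared_ptr<tflite::gpu::gl::GlBuffer> gl_buffer =
      mediapipe::TensorBuffer::CreateGlBuffer(/*context=*/nullptr);
  mediapipe::TensorBuffer gpu_buffer(gl_buffer);
  // gpu_buffer.UsesGpu() is true; GetGlBuffer() returns the shared GlBuffer.
}
#endif  // MEDIAPIPE_DISABLE_GL_COMPUTE

}  // namespace tensor_buffer_example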