Project import generated by Copybara.
GitOrigin-RevId: 4419aaa472eeb91123d1f8576188166ee0e5ea69
|
@ -15,6 +15,7 @@
|
|||
* [Hair Segmentation](mediapipe/docs/hair_segmentation_mobile_gpu.md) [[Web Demo]](https://viz.mediapipe.dev/runner/demos/hair_segmentation/hair_segmentation.html)
|
||||
* [Object Detection](mediapipe/docs/object_detection_mobile_gpu.md)
|
||||
* [Object Detection and Tracking](mediapipe/docs/object_tracking_mobile_gpu.md)
|
||||
* [Objectron: 3D Object Detection and Tracking](mediapipe/docs/objectron_mobile_gpu.md)
|
||||
* [AutoFlip](mediapipe/docs/autoflip.md)
|
||||
|
||||
![face_detection](mediapipe/docs/images/mobile/face_detection_android_gpu_small.gif)
|
||||
|
@ -43,6 +44,8 @@ A web-based visualizer is hosted on [viz.mediapipe.dev](https://viz.mediapipe.de
|
|||
* [YouTube Channel](https://www.youtube.com/channel/UCObqmpuSMx-usADtL_qdMAw)
|
||||
|
||||
## Publications
|
||||
* [MediaPipe Objectron: Real-time 3D Object Detection on Mobile Devices](https://mediapipe.page.link/objectron-aiblog)
|
||||
* [AutoFlip: An Open Source Framework for Intelligent Video Reframing](https://mediapipe.page.link/autoflip)
|
||||
* [Google Developer Blog: MediaPipe on the Web](https://mediapipe.page.link/webdevblog)
|
||||
* [Google Developer Blog: Object Detection and Tracking using MediaPipe](https://mediapipe.page.link/objecttrackingblog)
|
||||
* [On-Device, Real-Time Hand Tracking with MediaPipe](https://ai.googleblog.com/2019/08/on-device-real-time-hand-tracking-with.html)
|
||||
|
@ -63,7 +66,7 @@ A web-based visualizer is hosted on [viz.mediapipe.dev](https://viz.mediapipe.de
|
|||
* [Discuss](https://groups.google.com/forum/#!forum/mediapipe) - General community discussion around MediaPipe
|
||||
|
||||
## Alpha Disclaimer
|
||||
MediaPipe is currently in alpha for v0.6. We are still making breaking API changes and expect to get to stable API by v1.0.
|
||||
MediaPipe is currently in alpha for v0.7. We are still making breaking API changes and expect to get to stable API by v1.0.
|
||||
|
||||
## Contributing
|
||||
We welcome contributions. Please follow these [guidelines](./CONTRIBUTING.md).
|
||||
|
|
|
@ -75,11 +75,28 @@ REGISTER_CALCULATOR(ImageCroppingCalculator);
|
|||
}
|
||||
#endif // !MEDIAPIPE_DISABLE_GPU
|
||||
|
||||
RET_CHECK(cc->Inputs().HasTag(kRectTag) ^ cc->Inputs().HasTag(kNormRectTag) ^
|
||||
(cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
|
||||
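// Count the mutually exclusive ways the crop region can be specified: the RECT
// or NORM_RECT input stream, the WIDTH + HEIGHT input streams, or the
// (normalized) width/height options. Exactly one of them must be present.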
int flags = 0;
|
||||
if (cc->Inputs().HasTag(kRectTag)) {
|
||||
++flags;
|
||||
}
|
||||
if (cc->Inputs().HasTag(kWidthTag) && cc->Inputs().HasTag(kHeightTag)) {
|
||||
++flags;
|
||||
}
|
||||
if (cc->Inputs().HasTag(kNormRectTag)) {
|
||||
++flags;
|
||||
}
|
||||
if (cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
|
||||
.has_norm_width() &&
|
||||
cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
|
||||
.has_norm_height()));
|
||||
.has_norm_height()) {
|
||||
++flags;
|
||||
}
|
||||
if (cc->Options<mediapipe::ImageCroppingCalculatorOptions>().has_width() &&
|
||||
cc->Options<mediapipe::ImageCroppingCalculatorOptions>().has_height()) {
|
||||
++flags;
|
||||
}
|
||||
RET_CHECK(flags == 1) << "Illegal combination of input streams/options.";
|
||||
|
||||
if (cc->Inputs().HasTag(kRectTag)) {
|
||||
cc->Inputs().Tag(kRectTag).Set<Rect>();
|
||||
}
|
||||
|
|
|
@ -39,6 +39,15 @@ proto_library(
|
|||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "timed_box_list_id_to_label_calculator_proto",
|
||||
srcs = ["timed_box_list_id_to_label_calculator.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "latency_proto",
|
||||
srcs = ["latency.proto"],
|
||||
|
@ -113,6 +122,18 @@ mediapipe_cc_proto_library(
|
|||
],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "timed_box_list_id_to_label_calculator_cc_proto",
|
||||
srcs = ["timed_box_list_id_to_label_calculator.proto"],
|
||||
cc_deps = [
|
||||
"//mediapipe/framework:calculator_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":timed_box_list_id_to_label_calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "latency_cc_proto",
|
||||
srcs = ["latency.proto"],
|
||||
|
@ -313,6 +334,34 @@ cc_library(
|
|||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "timed_box_list_id_to_label_calculator",
|
||||
srcs = ["timed_box_list_id_to_label_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":timed_box_list_id_to_label_calculator_cc_proto",
|
||||
"//mediapipe/framework/port:status",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework:packet",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
"//mediapipe/util:resource_util",
|
||||
] + select({
|
||||
"//mediapipe:android": [
|
||||
"//mediapipe/util/android/file/base",
|
||||
],
|
||||
"//mediapipe:apple": [
|
||||
"//mediapipe/util/android/file/base",
|
||||
],
|
||||
"//mediapipe:macos": [
|
||||
"//mediapipe/framework/port:file_helpers",
|
||||
],
|
||||
"//conditions:default": [
|
||||
"//mediapipe/framework/port:file_helpers",
|
||||
],
|
||||
}),
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "non_max_suppression_calculator",
|
||||
srcs = ["non_max_suppression_calculator.cc"],
|
||||
|
|
|
@ -12,10 +12,10 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe//framework/packet.h"
|
||||
#include "mediapipe/calculators/util/detection_label_id_to_text_calculator.pb.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/formats/detection.pb.h"
|
||||
#include "mediapipe/framework/packet.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/util/resource_util.h"
|
||||
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
// Copyright 2019 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/calculators/util/timed_box_list_id_to_label_calculator.pb.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/packet.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/util/resource_util.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
#if defined(MEDIAPIPE_MOBILE)
|
||||
#include "mediapipe/util/android/file/base/file.h"
|
||||
#include "mediapipe/util/android/file/base/helpers.h"
|
||||
#else
|
||||
#include "mediapipe/framework/port/file_helpers.h"
|
||||
#endif
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
using mediapipe::TimedBoxProto;
|
||||
using mediapipe::TimedBoxProtoList;
|
||||
|
||||
// Takes a label map (from label IDs to names), and populates the label field in
|
||||
// TimedBoxProto according to its ID.
|
||||
//
|
||||
// Example usage:
|
||||
// node {
|
||||
// calculator: "TimedBoxListIdToLabelCalculator"
|
||||
// input_stream: "input_timed_box_list"
|
||||
// output_stream: "output_timed_box_list"
|
||||
// node_options: {
|
||||
// [mediapipe.TimedBoxListIdToLabelCalculatorOptions] {
|
||||
// label_map_path: "labelmap.txt"
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
class TimedBoxListIdToLabelCalculator : public CalculatorBase {
|
||||
public:
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
|
||||
private:
|
||||
std::unordered_map<int, std::string> label_map_;
|
||||
};
|
||||
REGISTER_CALCULATOR(TimedBoxListIdToLabelCalculator);
|
||||
|
||||
::mediapipe::Status TimedBoxListIdToLabelCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
cc->Inputs().Index(0).Set<TimedBoxProtoList>();
|
||||
cc->Outputs().Index(0).Set<TimedBoxProtoList>();
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TimedBoxListIdToLabelCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
cc->SetOffset(TimestampDiff(0));
|
||||
|
||||
const auto& options =
|
||||
cc->Options<::mediapipe::TimedBoxListIdToLabelCalculatorOptions>();
|
||||
|
||||
std::string string_path;
|
||||
ASSIGN_OR_RETURN(string_path, PathToResourceAsFile(options.label_map_path()));
|
||||
std::string label_map_string;
|
||||
MP_RETURN_IF_ERROR(file::GetContents(string_path, &label_map_string));
|
||||
|
||||
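// The label map file is expected to contain one label per line; the zero-based
// line index is used as the label ID.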
std::istringstream stream(label_map_string);
|
||||
std::string line;
|
||||
int i = 0;
|
||||
while (std::getline(stream, line)) {
|
||||
label_map_[i++] = line;
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TimedBoxListIdToLabelCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
const auto& input_list = cc->Inputs().Index(0).Get<TimedBoxProtoList>();
|
||||
auto output_list = absl::make_unique<TimedBoxProtoList>();
|
||||
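// Copy every incoming box, attaching the label name whenever the box ID is
// present in the label map; boxes with unknown IDs are passed through as-is.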
for (const auto& input_box : input_list.box()) {
|
||||
TimedBoxProto* box_ptr = output_list->add_box();
|
||||
*box_ptr = input_box;
|
||||
|
||||
if (label_map_.find(input_box.id()) != label_map_.end()) {
|
||||
box_ptr->set_label(label_map_[input_box.id()]);
|
||||
}
|
||||
}
|
||||
cc->Outputs().Index(0).Add(output_list.release(), cc->InputTimestamp());
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,28 @@
|
|||
// Copyright 2019 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
|
||||
message TimedBoxListIdToLabelCalculatorOptions {
|
||||
extend mediapipe.CalculatorOptions {
|
||||
optional TimedBoxListIdToLabelCalculatorOptions ext = 297701606;
|
||||
}
|
||||
|
||||
// Path to a label map file for getting the actual name of detected classes.
|
||||
optional string label_map_path = 1;
|
||||
}
|
|
@ -66,6 +66,25 @@ void AddTimedBoxProtoToRenderData(
|
|||
rect->set_bottom(box_proto.bottom());
|
||||
rect->set_rotation(box_proto.rotation());
|
||||
}
|
||||
|
||||
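// If the box carries a label, draw the label text inside the box: roughly 30%
// in from the left edge, 60% of the way down, with a font height of 20% of
// the box height.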
if (box_proto.has_label()) {
|
||||
auto* label_annotation = render_data->add_render_annotations();
|
||||
label_annotation->mutable_color()->set_r(options.box_color().r());
|
||||
label_annotation->mutable_color()->set_g(options.box_color().g());
|
||||
label_annotation->mutable_color()->set_b(options.box_color().b());
|
||||
label_annotation->set_thickness(options.thickness());
|
||||
RenderAnnotation::Text* text = label_annotation->mutable_text();
|
||||
text->set_display_text(box_proto.label());
|
||||
text->set_normalized(true);
|
||||
constexpr float text_left_start = 0.3f;
|
||||
text->set_left((1.0f - text_left_start) * box_proto.left() +
|
||||
text_left_start * box_proto.right());
|
||||
constexpr float text_baseline = 0.6f;
|
||||
text->set_baseline(text_baseline * box_proto.bottom() +
|
||||
(1.0f - text_baseline) * box_proto.top());
|
||||
constexpr float text_height = 0.2f;
|
||||
text->set_font_height((box_proto.bottom() - box_proto.top()) * text_height);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -15,6 +15,9 @@ For overall context on AutoFlip, please read this
|
|||
|
||||
Run the following command to build the AutoFlip pipeline:
|
||||
|
||||
Note: AutoFlip currently only works with OpenCV 3. Please verify your OpenCV
|
||||
version beforehand.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/autoflip:run_autoflip
|
||||
```
|
||||
|
|
|
@ -44,6 +44,14 @@ graphs can be easily adapted to run on CPU v.s. GPU.
|
|||
[Object Detection and Tracking with GPU](./object_tracking_mobile_gpu.md) illustrates how to
|
||||
use MediaPipe for object detection and tracking.
|
||||
|
||||
### Objectron: 3D Object Detection and Tracking with GPU
|
||||
|
||||
[MediaPipe Objectron: 3D Object Detection with GPU](./objectron_mobile_gpu.md)
|
||||
illustrates a mobile real-time 3D object detection and tracking pipeline for everyday
|
||||
objects like shoes and chairs.
|
||||
|
||||
* [Android](./objectron_mobile_gpu.md)
|
||||
|
||||
### Face Detection with GPU
|
||||
|
||||
[Face Detection with GPU](./face_detection_mobile_gpu.md) illustrates how to use
|
||||
|
|
BIN
mediapipe/docs/images/mobile/object_detection_3d_android_gpu.png
Normal file
After Width: | Height: | Size: 100 KiB |
BIN
mediapipe/docs/images/mobile/objectron_chair_android_gpu.gif
Normal file
After Width: | Height: | Size: 2.5 MiB |
BIN
mediapipe/docs/images/mobile/objectron_detection_subgraph.png
Normal file
After Width: | Height: | Size: 64 KiB |
BIN
mediapipe/docs/images/mobile/objectron_shoe_android_gpu.gif
Normal file
After Width: | Height: | Size: 2.8 MiB |
BIN
mediapipe/docs/images/mobile/objectron_tracking_subgraph.png
Normal file
After Width: | Height: | Size: 113 KiB |
|
@ -364,8 +364,10 @@ To build and run iOS apps:
|
|||
|
||||
### Installing on Windows Subsystem for Linux (WSL)
|
||||
|
||||
Note: WSL has historically not provided access to USB cameras. Mediapipe can use
|
||||
a video file as input.
|
||||
Note: The pre-built OpenCV packages don't support cameras in WSL. Unless you
|
||||
[compile](https://funvision.blogspot.com/2019/12/opencv-web-camera-and-video-streams-in.html)
|
||||
OpenCV with FFMPEG and GStreamer in WSL, the live demos won't work with any
|
||||
cameras. Alternatively, you can use a video file as input.
|
||||
|
||||
1. Follow the
|
||||
[instruction](https://docs.microsoft.com/en-us/windows/wsl/install-win10) to
|
||||
|
@ -373,7 +375,7 @@ a video file as input.
|
|||
|
||||
2. Install Windows ADB and start the ADB server in Windows.
|
||||
|
||||
Note: Window’s and WSL’s adb versions must be the same version, e.g., if WSL
|
||||
Note: Windows' and WSL’s adb versions must be the same version, e.g., if WSL
|
||||
has ADB 1.0.39, you need to download the corresponding Windows ADB from
|
||||
[here](https://dl.google.com/android/repository/platform-tools_r26.0.1-windows.zip).
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@ To build and run the TensorFlow example on desktop, run:
|
|||
$ bazel build -c opt \
|
||||
--define MEDIAPIPE_DISABLE_GPU=1 \
|
||||
--define no_aws_support=true \
|
||||
--linkopt=-s \
|
||||
mediapipe/examples/desktop/object_detection:object_detection_tensorflow
|
||||
|
||||
# It should print:
|
||||
|
|
489
mediapipe/docs/objectron_mobile_gpu.md
Normal file
|
@ -0,0 +1,489 @@
|
|||
# MediaPipe Objectron (GPU)
|
||||
|
||||
This doc focuses on the
|
||||
[below example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt)
|
||||
that performs 3D object detection and tracking with TensorFlow Lite on GPU.
|
||||
|
||||
Objectron for shoes | Objectron for chairs
|
||||
:-----------------------------------------------------------------------------: | :------------------:
|
||||
![objectron_shoe_android_gpu_gif](images/mobile/objectron_shoe_android_gpu.gif) | ![objectron_chair_android_gpu_gif](images/mobile/objectron_chair_android_gpu.gif)
|
||||
|
||||
For overall context on MediaPipe Objectron, please read the
|
||||
[Google AI Blog](https://mediapipe.page.link/objectron-aiblog). The Objectron's
|
||||
ML model (see also the [model card](https://mediapipe.page.link/objectron-mc))
|
||||
estimates a 3D bounding box for the detected object.
|
||||
|
||||
## Android
|
||||
|
||||
[Source](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d)
|
||||
|
||||
An arm64 build of Objectron for shoes can be
|
||||
[downloaded here](https://drive.google.com/open?id=1S0K4hbWt3o31FfQ4QU3Rz7IHrvOUMx1d),
|
||||
and for chairs can be
|
||||
[downloaded here](https://drive.google.com/open?id=1MM8K-13bXLCVS1EHQ-KgkVyEahEPrKej).
|
||||
|
||||
To build the Objectron app for shoes, run:
|
||||
|
||||
```bash
|
||||
bazel build -c opt --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d
|
||||
```
|
||||
|
||||
Similarly, to build the Objectron app for chairs, add the **--define
|
||||
chair=true** flag to the build command.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define chair=true --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d
|
||||
```
|
||||
|
||||
Once the app is built, install it on an Android device with:
|
||||
|
||||
```bash
|
||||
adb install bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/objectdetection3d.apk
|
||||
```
|
||||
|
||||
## Graph
|
||||
|
||||
The Objectron main graph internally utilizes the Objectron detection subgraph,
|
||||
and the Objectron tracking subgraph. To visualize the graph, copy
|
||||
the text specification of the graph below and paste it into
|
||||
[MediaPipe Visualizer](https://viz.mediapipe.dev/).
|
||||
|
||||
### Main Graph
|
||||
|
||||
This is the main graph for the shoe detector. This graph runs detection and
|
||||
tracking and renders the output to the display.
|
||||
|
||||
![object_detection_mobile_gpu_graph](images/mobile/object_detection_3d_android_gpu.png)
|
||||
|
||||
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt)
|
||||
|
||||
```bash
|
||||
# MediaPipe object detection 3D with tracking graph.
|
||||
|
||||
# Images on GPU coming into and out of the graph.
|
||||
input_stream: "input_video"
|
||||
output_stream: "output_video"
|
||||
|
||||
# Creates a copy of the input_video stream. At the end of the graph, the
|
||||
# GlAnimationOverlayCalculator will consume the input_video texture and draw
|
||||
# on top of it.
|
||||
node: {
|
||||
calculator: "GlScalerCalculator"
|
||||
input_stream: "VIDEO:input_video"
|
||||
output_stream: "VIDEO:input_video_copy"
|
||||
}
|
||||
|
||||
# Resamples the images at a specific frame rate. This calculator is used to
|
||||
# control the frequency of subsequent calculators/subgraphs, e.g., less power
|
||||
# consumption for expensive processing.
|
||||
node {
|
||||
calculator: "PacketResamplerCalculator"
|
||||
input_stream: "DATA:input_video_copy"
|
||||
output_stream: "DATA:sampled_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
|
||||
frame_rate: 5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node {
|
||||
calculator: "ObjectronDetectionSubgraphGpu"
|
||||
input_stream: "IMAGE_GPU:sampled_input_video"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
}
|
||||
|
||||
node {
|
||||
calculator: "ObjectronTrackingSubgraphGpu"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "IMAGE_GPU:input_video_copy"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
}
|
||||
|
||||
# The rendering nodes:
|
||||
# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly
|
||||
# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask.
|
||||
# These models are designed using different tools, so we supply a transformation
|
||||
# to bring both of them to the Objectron's coordinate system.
|
||||
|
||||
# Creates model matrices for the tracked object given the lifted 3D points.
|
||||
# This calculator does two things: 1) estimates the object's pose (orientation,
|
||||
# translation, and scale) from the 3D vertices, and
|
||||
# 2) brings the object from the objectron's coordinate system to the renderer
|
||||
# (OpenGL) coordinate system. Since the final goal is to render a mesh file on
|
||||
# top of the object, we also supply a transformation to bring the mesh to the
|
||||
# objectron's coordinate system, and rescale the mesh to unit size.
|
||||
node {
|
||||
calculator: "AnnotationsToModelMatricesCalculator"
|
||||
input_stream: "ANNOTATIONS:lifted_tracked_objects"
|
||||
output_stream: "MODEL_MATRICES:model_matrices"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
|
||||
# Re-scale the CAD model to the size of a unit box
|
||||
model_scale: 0.05
|
||||
model_scale: 0.05
|
||||
model_scale: 0.05
|
||||
# Bring the box CAD model to objectron's coordinate system. This
|
||||
# is equivalent to a -pi/2 rotation about the y-axis (right-hand rule):
|
||||
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY())
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: -1.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 1.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 1.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Compute the model matrices for the CAD model of the shoe, to be used as an
|
||||
# occlusion mask. The model will be rendered at the exact same location as the
|
||||
# bounding box.
|
||||
node {
|
||||
calculator: "AnnotationsToModelMatricesCalculator"
|
||||
input_stream: "ANNOTATIONS:lifted_tracked_objects"
|
||||
output_stream: "MODEL_MATRICES:mask_model_matrices"
|
||||
#input_side_packet: "MODEL_SCALE:model_scale"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
|
||||
# Re-scale the CAD model to the size of a unit box
|
||||
model_scale: 0.45
|
||||
model_scale: 0.25
|
||||
model_scale: 0.15
|
||||
# Bring the shoe CAD model to the objectron's coordinate system. This
|
||||
# is equivalent to a -pi/2 rotation about the x-axis (right-hand rule):
|
||||
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX())
|
||||
model_transformation: 1.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 1.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: -1.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 0.0
|
||||
model_transformation: 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Render everything together. First we render the 3D bounding box animation,
|
||||
# then we render the occlusion mask.
|
||||
node: {
|
||||
calculator: "GlAnimationOverlayCalculator"
|
||||
input_stream: "VIDEO:input_video"
|
||||
input_stream: "MODEL_MATRICES:model_matrices"
|
||||
input_stream: "MASK_MODEL_MATRICES:mask_model_matrices"
|
||||
output_stream: "output_video"
|
||||
input_side_packet: "TEXTURE:box_texture"
|
||||
input_side_packet: "ANIMATION_ASSET:box_asset_name"
|
||||
input_side_packet: "MASK_TEXTURE:obj_texture"
|
||||
input_side_packet: "MASK_ASSET:obj_asset_name"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
|
||||
# Output resolution is 480x640 with the aspect ratio of 0.75
|
||||
aspect_ratio: 0.75
|
||||
vertical_fov_degrees: 70.
|
||||
animation_speed_fps: 25
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### Objectron Detection Subgraph
|
||||
|
||||
The Objectron detection subgraph uses the *TfLiteInferenceCalculator* to run
|
||||
inference and decodes the output tensor into a *FrameAnnotation* protobuf. The
|
||||
*FrameAnnotation* contains nine keypoints: the bounding box's center, as well as
|
||||
its eight vertices. The boxes will be passed to the Objectron tracking subgraph.
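As a rough illustration of the decoding idea only (a simplified sketch with an
assumed tensor layout, not the actual *TfLiteTensorsToObjectsCalculator*, which
additionally performs local-maxima voting as configured further below), a
center heatmap plus per-vertex displacement fields can be turned into the nine
2D keypoints like this:

```c++
#include <utility>
#include <vector>

// Simplified sketch only: decode one box from an H x W center heatmap and an
// H x W x 16 displacement field (dx, dy for each of the eight vertices). The
// tensor layout, naming, and single-peak picking are assumptions made for
// illustration.
std::vector<std::pair<float, float>> DecodeBoxKeypoints(
    const std::vector<float>& heatmap,        // H * W scores in [0, 1].
    const std::vector<float>& displacements,  // H * W * 16 offsets, in pixels.
    int width, int height, float threshold) {
  // 1) The box center is the highest-scoring heatmap cell, if confident enough.
  int best = 0;
  for (int i = 1; i < width * height; ++i) {
    if (heatmap[i] > heatmap[best]) best = i;
  }
  std::vector<std::pair<float, float>> keypoints;
  if (heatmap[best] < threshold) return keypoints;  // No confident detection.
  const float cx = best % width;
  const float cy = best / width;
  keypoints.emplace_back(cx / width, cy / height);  // Keypoint 0: the center.
  // 2) Each remaining keypoint is the center plus the displacement stored at
  //    the center cell for that vertex.
  for (int v = 0; v < 8; ++v) {
    const float dx = displacements[best * 16 + 2 * v];
    const float dy = displacements[best * 16 + 2 * v + 1];
    keypoints.emplace_back((cx + dx) / width, (cy + dy) / height);
  }
  return keypoints;
}
```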
|
||||
|
||||
![object_detection_subgraph](images/mobile/objectron_detection_subgraph.png)
|
||||
|
||||
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt)
|
||||
|
||||
```bash
|
||||
# MediaPipe Objectron detection gpu subgraph
|
||||
|
||||
type: "ObjectronDetectionSubgraphGpu"
|
||||
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
|
||||
# Transforms the input image on GPU to a 480x640 image. To scale the input
|
||||
# image, the scale_mode option is set to FIT to preserve the aspect ratio,
|
||||
# resulting in potential letterboxing in the transformed image.
|
||||
node: {
|
||||
calculator: "ImageTransformationCalculator"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "IMAGE_GPU:transformed_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
|
||||
output_width: 480
|
||||
output_height: 640
|
||||
scale_mode: FIT
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Converts the transformed input image on GPU into an image tensor stored as a
|
||||
# TfLiteTensor.
|
||||
node {
|
||||
calculator: "TfLiteConverterCalculator"
|
||||
input_stream: "IMAGE_GPU:transformed_input_video"
|
||||
output_stream: "TENSORS_GPU:image_tensor"
|
||||
}
|
||||
|
||||
# Generates a single side packet containing a TensorFlow Lite op resolver that
|
||||
# supports custom ops needed by the model used in this graph.
|
||||
node {
|
||||
calculator: "TfLiteCustomOpResolverCalculator"
|
||||
output_side_packet: "opresolver"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] {
|
||||
use_gpu: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
|
||||
# vector of tensors representing, for instance, detection boxes/keypoints and
|
||||
# scores.
|
||||
node {
|
||||
calculator: "TfLiteInferenceCalculator"
|
||||
input_stream: "TENSORS_GPU:image_tensor"
|
||||
output_stream: "TENSORS:detection_tensors"
|
||||
input_side_packet: "CUSTOM_OP_RESOLVER:opresolver"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
|
||||
model_path: "object_detection_3d.tflite"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Decodes the model's output tensor (the heatmap and the distance fields) to 2D
|
||||
# keypoints. There are nine 2D keypoints: one center keypoint and eight vertices
|
||||
# for the 3D bounding box. The calculator parameters determine's the decoder's
|
||||
# sensitivity.
|
||||
node {
|
||||
calculator: "TfLiteTensorsToObjectsCalculator"
|
||||
input_stream: "TENSORS:detection_tensors"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.TfLiteTensorsToObjectsCalculatorOptions] {
|
||||
num_classes: 1
|
||||
num_keypoints: 9
|
||||
decoder_config {
|
||||
heatmap_threshold: 0.6
|
||||
local_max_distance: 2
|
||||
offset_scale_coef: 1.0
|
||||
voting_radius: 2
|
||||
voting_allowance: 1
|
||||
voting_threshold: 0.2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Object Tracking Subgraph
|
||||
|
||||
The Objectron tracking subgraph uses the *BoxTracker* calculator, which is a generic
|
||||
tracking library, also used in
|
||||
[MediaPipe's 2D Object Detection and Tracking](https://github.com/google/mediapipe/tree/master/mediapipe/g3doc/object_tracking_mobile_gpu.md).
|
||||
Tracking runs every frame, and when a new detection is available, the subgraph
|
||||
consolidates the detection and tracking results. The tracker tracks the box with
|
||||
its 2D keypoints, so at the end we lift the 2D keypoints to 3D using the EPnP
|
||||
algorithm in the *Lift2DFrameAnnotationTo3D* calculator, as sketched below.
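As a rough sketch of that lifting step (an illustration under an assumed
unit-box template and known camera intrinsics, not the actual
*Lift2DFrameAnnotationTo3DCalculator* implementation), EPnP can be applied with
OpenCV's `cv::solvePnP`:

```c++
#include <vector>

#include "opencv2/calib3d.hpp"
#include "opencv2/core.hpp"

// Illustrative only: recover a box pose from its eight tracked 2D vertices
// with EPnP. The unit-box template and the availability of camera intrinsics
// are assumptions made for this sketch.
void LiftBoxTo3D(const std::vector<cv::Point2f>& image_keypoints,  // 8 points.
                 const cv::Mat& camera_matrix,                     // 3x3 K.
                 cv::Mat* rotation, cv::Mat* translation) {
  // Eight vertices of a unit box centered at the origin (the 3D template).
  const std::vector<cv::Point3f> box_template = {
      {-0.5f, -0.5f, -0.5f}, {-0.5f, -0.5f, 0.5f}, {-0.5f, 0.5f, -0.5f},
      {-0.5f, 0.5f, 0.5f},   {0.5f, -0.5f, -0.5f}, {0.5f, -0.5f, 0.5f},
      {0.5f, 0.5f, -0.5f},   {0.5f, 0.5f, 0.5f}};
  cv::Mat rvec, tvec;
  // EPnP estimates the rotation/translation that project the 3D template onto
  // the observed 2D keypoints.
  cv::solvePnP(box_template, image_keypoints, camera_matrix,
               /*distCoeffs=*/cv::noArray(), rvec, tvec,
               /*useExtrinsicGuess=*/false, cv::SOLVEPNP_EPNP);
  cv::Rodrigues(rvec, *rotation);  // Angle-axis -> 3x3 rotation matrix.
  *translation = tvec;
  // The lifted 3D vertices are then rotation * template_vertex + translation.
}
```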
|
||||
|
||||
![object_tracking_subgraph](images/mobile/objectron_tracking_subgraph.png)
|
||||
|
||||
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_tracking_gpu.pbtxt)
|
||||
|
||||
```bash
|
||||
# MediaPipe Objectron tracking gpu subgraph
|
||||
|
||||
type: "ObjectronTrackingSubgraphGpu"
|
||||
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
|
||||
|
||||
# Converts the detected keypoints to Boxes, used by the tracking subgraph.
|
||||
node {
|
||||
calculator: "FrameAnnotationToTimedBoxListCalculator"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
output_stream: "BOXES:start_pos"
|
||||
}
|
||||
|
||||
node: {
|
||||
calculator: "ImageTransformationCalculator"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "IMAGE_GPU:downscaled_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
|
||||
output_width: 240
|
||||
output_height: 320
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Converts the GPU buffer to an ImageFrame for tracking.
|
||||
node: {
|
||||
calculator: "GpuBufferToImageFrameCalculator"
|
||||
input_stream: "downscaled_input_video"
|
||||
output_stream: "downscaled_input_video_cpu"
|
||||
}
|
||||
|
||||
# Performs motion analysis on an incoming video stream.
|
||||
node: {
|
||||
calculator: "MotionAnalysisCalculator"
|
||||
input_stream: "VIDEO:downscaled_input_video_cpu"
|
||||
output_stream: "CAMERA:camera_motion"
|
||||
output_stream: "FLOW:region_flow"
|
||||
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: {
|
||||
analysis_options {
|
||||
analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
|
||||
flow_options {
|
||||
fast_estimation_min_block_size: 100
|
||||
top_inlier_sets: 1
|
||||
frac_inlier_error_threshold: 3e-3
|
||||
downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
|
||||
verification_distance: 5.0
|
||||
verify_long_feature_acceleration: true
|
||||
verify_long_feature_trigger_ratio: 0.1
|
||||
tracking_options {
|
||||
max_features: 500
|
||||
adaptive_extraction_levels: 2
|
||||
min_eig_val_settings {
|
||||
adaptive_lowest_quality_level: 2e-4
|
||||
}
|
||||
klt_tracker_implementation: KLT_OPENCV
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Reads optical flow fields defined in
|
||||
# mediapipe/framework/formats/motion/optical_flow_field.h,
|
||||
# and returns a VideoFrame with 2 channels (v_x and v_y); each channel is quantized
|
||||
# to 0-255.
|
||||
node: {
|
||||
calculator: "FlowPackagerCalculator"
|
||||
input_stream: "FLOW:region_flow"
|
||||
input_stream: "CAMERA:camera_motion"
|
||||
output_stream: "TRACKING:tracking_data"
|
||||
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
|
||||
flow_packager_options: {
|
||||
binary_tracking_data_support: false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Tracks box positions over time.
|
||||
node: {
|
||||
calculator: "BoxTrackerCalculator"
|
||||
input_stream: "TRACKING:tracking_data"
|
||||
input_stream: "TRACK_TIME:input_video"
|
||||
input_stream: "START_POS:start_pos"
|
||||
input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
|
||||
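# CANCEL_OBJECT_ID is produced downstream by FrameAnnotationTrackerCalculator
# and fed back into this node, so it is declared as a back edge to avoid
# creating a cycle in the graph.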
input_stream_info: {
|
||||
tag_index: "CANCEL_OBJECT_ID"
|
||||
back_edge: true
|
||||
}
|
||||
output_stream: "BOXES:boxes"
|
||||
|
||||
input_stream_handler {
|
||||
input_stream_handler: "SyncSetInputStreamHandler"
|
||||
options {
|
||||
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
|
||||
sync_set {
|
||||
tag_index: "TRACKING"
|
||||
tag_index: "TRACK_TIME"
|
||||
}
|
||||
sync_set {
|
||||
tag_index: "START_POS"
|
||||
}
|
||||
sync_set {
|
||||
tag_index: "CANCEL_OBJECT_ID"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
|
||||
tracker_options: {
|
||||
track_step_options {
|
||||
track_object_and_camera: true
|
||||
tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE
|
||||
inlier_spring_force: 0.0
|
||||
static_motion_temporal_ratio: 3e-2
|
||||
}
|
||||
}
|
||||
visualize_tracking_data: false
|
||||
streaming_track_data_cache_size: 100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Consolidates tracking and detection results.
|
||||
node {
|
||||
calculator: "FrameAnnotationTrackerCalculator"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "TRACKED_BOXES:boxes"
|
||||
output_stream: "TRACKED_FRAME_ANNOTATION:tracked_objects"
|
||||
output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.FrameAnnotationTrackerCalculatorOptions] {
|
||||
img_width: 240
|
||||
img_height: 320
|
||||
}
|
||||
}
|
||||
|
||||
input_stream_handler {
|
||||
input_stream_handler: "SyncSetInputStreamHandler"
|
||||
options {
|
||||
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
|
||||
sync_set {
|
||||
tag_index: "FRAME_ANNOTATION"
|
||||
}
|
||||
sync_set {
|
||||
tag_index: "TRACKED_BOXES"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Lifts the tracked 2D keypoints to 3D using the EPnP algorithm.
|
||||
node {
|
||||
calculator: "Lift2DFrameAnnotationTo3DCalculator"
|
||||
input_stream: "FRAME_ANNOTATION:tracked_objects"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
}
|
||||
```
|
|
@ -61,6 +61,8 @@ videos.
|
|||
```bash
|
||||
# cd to the root directory of the MediaPipe repo
|
||||
cd -
|
||||
|
||||
pip3 install tf_slim
|
||||
python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph
|
||||
```
|
||||
|
||||
|
@ -78,7 +80,7 @@ videos.
|
|||
5. Run the MediaPipe binary to extract the features.
|
||||
|
||||
```bash
|
||||
bazel build -c opt \
|
||||
bazel build -c opt --linkopt=-s \
|
||||
--define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \
|
||||
mediapipe/examples/desktop/youtube8m:extract_yt8m_features
|
||||
|
||||
|
@ -126,13 +128,13 @@ the inference for both local videos and the dataset
|
|||
2. Build the inference binary.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
|
||||
mediapipe/examples/desktop/youtube8m:model_inference
|
||||
```
|
||||
|
||||
3. Run the python web server.
|
||||
|
||||
Note: pip install absl-py
|
||||
Note: pip3 install absl-py
|
||||
|
||||
```bash
|
||||
python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd`
|
||||
|
@ -162,7 +164,7 @@ the inference for both local videos and the dataset
|
|||
3. Build and run the inference binary.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
|
||||
mediapipe/examples/desktop/youtube8m:model_inference
|
||||
|
||||
# segment_size is the number of seconds window of frames.
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
package="com.google.mediapipe.apps.objectdetection3d">
|
||||
|
||||
<uses-sdk
|
||||
android:minSdkVersion="21"
|
||||
android:targetSdkVersion="27" />
|
||||
|
||||
<!-- For using the camera -->
|
||||
<uses-permission android:name="android.permission.CAMERA" />
|
||||
<uses-feature android:name="android.hardware.camera" />
|
||||
<uses-feature android:name="android.hardware.camera.autofocus" />
|
||||
<!-- For MediaPipe -->
|
||||
<uses-feature android:glEsVersion="0x00020000" android:required="true" />
|
||||
|
||||
|
||||
<application
|
||||
android:allowBackup="true"
|
||||
android:label="@string/app_name"
|
||||
android:supportsRtl="true"
|
||||
android:theme="@style/AppTheme">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:exported="true"
|
||||
android:screenOrientation="portrait">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
<category android:name="android.intent.category.LAUNCHER" />
|
||||
</intent-filter>
|
||||
</activity>
|
||||
</application>
|
||||
|
||||
</manifest>
|
|
@ -0,0 +1,115 @@
|
|||
# Copyright 2019 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:private"])
|
||||
|
||||
cc_binary(
|
||||
name = "libmediapipe_jni.so",
|
||||
linkshared = 1,
|
||||
linkstatic = 1,
|
||||
deps = [
|
||||
"//mediapipe/graphs/object_detection_3d:mobile_calculators",
|
||||
"//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "mediapipe_jni_lib",
|
||||
srcs = [":libmediapipe_jni.so"],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
# To use the "chair" model instead of the default "shoes" model,
|
||||
# add "--define chair=true" to the bazel build command.
|
||||
config_setting(
|
||||
name = "use_chair_model",
|
||||
define_values = {
|
||||
"chair": "true",
|
||||
},
|
||||
)
|
||||
|
||||
# Maps the binary graph to an alias (e.g., the app name) for convenience so that the alias can be
|
||||
# easily incorporated into the app via, for example,
|
||||
# MainActivity.BINARY_GRAPH_NAME = "appname.binarypb".
|
||||
genrule(
|
||||
name = "binary_graph",
|
||||
srcs = select({
|
||||
"//conditions:default": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_shoe"],
|
||||
":use_chair_model": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_chair"],
|
||||
}),
|
||||
outs = ["objectdetection3d.binarypb"],
|
||||
cmd = "cp $< $@",
|
||||
)
|
||||
|
||||
genrule(
|
||||
name = "model",
|
||||
srcs = select({
|
||||
"//conditions:default": ["//mediapipe/models:object_detection_3d_sneakers.tflite"],
|
||||
":use_chair_model": ["//mediapipe/models:object_detection_3d_chair.tflite"],
|
||||
}),
|
||||
outs = ["object_detection_3d.tflite"],
|
||||
cmd = "cp $< $@",
|
||||
)
|
||||
|
||||
android_library(
|
||||
name = "mediapipe_lib",
|
||||
srcs = glob(["*.java"]),
|
||||
assets = [
|
||||
":binary_graph",
|
||||
":model",
|
||||
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:box.obj.uuu",
|
||||
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:classic_colors.png",
|
||||
] + select({
|
||||
"//conditions:default": [
|
||||
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:model.obj.uuu",
|
||||
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:texture.bmp",
|
||||
],
|
||||
":use_chair_model": [
|
||||
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:model.obj.uuu",
|
||||
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:texture.bmp",
|
||||
],
|
||||
}),
|
||||
assets_dir = "",
|
||||
manifest = "AndroidManifest.xml",
|
||||
resource_files = glob(["res/**"]),
|
||||
deps = [
|
||||
":mediapipe_jni_lib",
|
||||
"//mediapipe/framework/formats:landmark_java_proto_lite",
|
||||
"//mediapipe/java/com/google/mediapipe/components:android_camerax_helper",
|
||||
"//mediapipe/java/com/google/mediapipe/components:android_components",
|
||||
"//mediapipe/java/com/google/mediapipe/framework:android_framework",
|
||||
"//mediapipe/java/com/google/mediapipe/glutil",
|
||||
"//third_party:androidx_appcompat",
|
||||
"//third_party:androidx_constraint_layout",
|
||||
"//third_party:androidx_legacy_support_v4",
|
||||
"//third_party:androidx_recyclerview",
|
||||
"//third_party:opencv",
|
||||
"@androidx_concurrent_futures//jar",
|
||||
"@androidx_lifecycle//jar",
|
||||
"@com_google_code_findbugs//jar",
|
||||
"@com_google_guava_android//jar",
|
||||
],
|
||||
)
|
||||
|
||||
android_binary(
|
||||
name = "objectdetection3d",
|
||||
manifest = "AndroidManifest.xml",
|
||||
manifest_values = {"applicationId": "com.google.mediapipe.apps.objectdetection3d"},
|
||||
multidex = "native",
|
||||
deps = [
|
||||
":mediapipe_lib",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,280 @@
|
|||
// Copyright 2019 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package com.google.mediapipe.apps.objectdetection3d;
|
||||
|
||||
import android.graphics.Bitmap;
|
||||
import android.graphics.BitmapFactory;
|
||||
import android.graphics.SurfaceTexture;
|
||||
import android.os.Bundle;
|
||||
import androidx.appcompat.app.AppCompatActivity;
|
||||
import android.util.Log;
|
||||
import android.util.Size;
|
||||
import android.view.SurfaceHolder;
|
||||
import android.view.SurfaceView;
|
||||
import android.view.View;
|
||||
import android.view.ViewGroup;
|
||||
import com.google.mediapipe.components.CameraHelper;
|
||||
import com.google.mediapipe.components.CameraXPreviewHelper;
|
||||
import com.google.mediapipe.components.ExternalTextureConverter;
|
||||
import com.google.mediapipe.components.FrameProcessor;
|
||||
import com.google.mediapipe.components.PermissionHelper;
|
||||
import com.google.mediapipe.framework.AndroidAssetUtil;
|
||||
import com.google.mediapipe.framework.AndroidPacketCreator;
|
||||
import com.google.mediapipe.framework.Packet;
|
||||
import com.google.mediapipe.glutil.EglManager;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/** Main activity of MediaPipe example apps. */
|
||||
public class MainActivity extends AppCompatActivity {
|
||||
private static final String TAG = "MainActivity";
|
||||
|
||||
private static final String BINARY_GRAPH_NAME = "objectdetection3d.binarypb";
|
||||
private static final String INPUT_VIDEO_STREAM_NAME = "input_video";
|
||||
private static final String OUTPUT_VIDEO_STREAM_NAME = "output_video";
|
||||
|
||||
private static final String OBJ_TEXTURE = "texture.bmp";
|
||||
private static final String OBJ_FILE = "model.obj.uuu";
|
||||
private static final String BOX_TEXTURE = "classic_colors.png";
|
||||
private static final String BOX_FILE = "box.obj.uuu";
|
||||
|
||||
private static final CameraHelper.CameraFacing CAMERA_FACING = CameraHelper.CameraFacing.BACK;
|
||||
|
||||
// Flips the camera-preview frames vertically before sending them into FrameProcessor to be
|
||||
// processed in a MediaPipe graph, and flips the processed frames back when they are displayed.
|
||||
// This is needed because OpenGL represents images assuming the image origin is at the bottom-left
|
||||
// corner, whereas MediaPipe in general assumes the image origin is at top-left.
|
||||
private static final boolean FLIP_FRAMES_VERTICALLY = true;
|
||||
|
||||
// Target resolution should be 4:3 for this application, as expected by the model and tracker.
|
||||
private static final Size TARGET_RESOLUTION = new Size(1280, 960);
|
||||
|
||||
static {
|
||||
// Load all native libraries needed by the app.
|
||||
System.loadLibrary("mediapipe_jni");
|
||||
System.loadLibrary("opencv_java3");
|
||||
}
|
||||
|
||||
// {@link SurfaceTexture} where the camera-preview frames can be accessed.
|
||||
private SurfaceTexture previewFrameTexture;
|
||||
// {@link SurfaceView} that displays the camera-preview frames processed by a MediaPipe graph.
|
||||
private SurfaceView previewDisplayView;
|
||||
|
||||
// Creates and manages an {@link EGLContext}.
|
||||
private EglManager eglManager;
|
||||
// Sends camera-preview frames into a MediaPipe graph for processing, and displays the processed
|
||||
// frames onto a {@link Surface}.
|
||||
private FrameProcessor processor;
|
||||
// Converts the GL_TEXTURE_EXTERNAL_OES texture from Android camera into a regular texture to be
|
||||
// consumed by {@link FrameProcessor} and the underlying MediaPipe graph.
|
||||
private ExternalTextureConverter converter;
|
||||
|
||||
// Handles camera access via the {@link CameraX} Jetpack support library.
|
||||
private CameraXPreviewHelper cameraHelper;
|
||||
|
||||
// Assets.
|
||||
private Bitmap objTexture = null;
|
||||
private Bitmap boxTexture = null;
|
||||
|
||||
Size cameraImageSize;
|
||||
|
||||
@Override
|
||||
protected void onCreate(Bundle savedInstanceState) {
|
||||
super.onCreate(savedInstanceState);
|
||||
setContentView(R.layout.activity_main);
|
||||
|
||||
previewDisplayView = new SurfaceView(this);
|
||||
setupPreviewDisplayView();
|
||||
|
||||
// Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g.,
|
||||
// binary graphs.
|
||||
AndroidAssetUtil.initializeNativeAssetManager(this);
|
||||
|
||||
eglManager = new EglManager(null);
|
||||
processor =
|
||||
new FrameProcessor(
|
||||
this,
|
||||
eglManager.getNativeContext(),
|
||||
BINARY_GRAPH_NAME,
|
||||
INPUT_VIDEO_STREAM_NAME,
|
||||
OUTPUT_VIDEO_STREAM_NAME);
|
||||
processor.getVideoSurfaceOutput().setFlipY(FLIP_FRAMES_VERTICALLY);
|
||||
|
||||
prepareDemoAssets();
|
||||
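// The mesh assets and textures below are consumed by the graph's
// GlAnimationOverlayCalculator as input side packets.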
AndroidPacketCreator packetCreator = processor.getPacketCreator();
|
||||
Map<String, Packet> inputSidePackets = new HashMap<>();
|
||||
inputSidePackets.put("obj_asset_name", packetCreator.createString(OBJ_FILE));
|
||||
inputSidePackets.put("box_asset_name", packetCreator.createString(BOX_FILE));
|
||||
inputSidePackets.put("obj_texture", packetCreator.createRgbaImageFrame(objTexture));
|
||||
inputSidePackets.put("box_texture", packetCreator.createRgbaImageFrame(boxTexture));
|
||||
processor.setInputSidePackets(inputSidePackets);
|
||||
|
||||
PermissionHelper.checkAndRequestCameraPermissions(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onResume() {
|
||||
super.onResume();
|
||||
converter = new ExternalTextureConverter(eglManager.getContext());
|
||||
converter.setFlipY(FLIP_FRAMES_VERTICALLY);
|
||||
converter.setConsumer(processor);
|
||||
if (PermissionHelper.cameraPermissionsGranted(this)) {
|
||||
startCamera();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onPause() {
|
||||
super.onPause();
|
||||
converter.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onRequestPermissionsResult(
|
||||
int requestCode, String[] permissions, int[] grantResults) {
|
||||
super.onRequestPermissionsResult(requestCode, permissions, grantResults);
|
||||
PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults);
|
||||
}
|
||||
|
||||
private void setupPreviewDisplayView() {
|
||||
previewDisplayView.setVisibility(View.GONE);
|
||||
ViewGroup viewGroup = findViewById(R.id.preview_display_layout);
|
||||
viewGroup.addView(previewDisplayView);
|
||||
|
||||
previewDisplayView
|
||||
.getHolder()
|
||||
.addCallback(
|
||||
new SurfaceHolder.Callback() {
|
||||
@Override
|
||||
public void surfaceCreated(SurfaceHolder holder) {
|
||||
processor.getVideoSurfaceOutput().setSurface(holder.getSurface());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
|
||||
// (Re-)Compute the ideal size of the camera-preview display (the area that the
|
||||
// camera-preview frames get rendered onto, potentially with scaling and rotation)
|
||||
// based on the size of the SurfaceView that contains the display.
|
||||
Size viewSize = new Size(height, height * 3 / 4); // Prefer 3:4 aspect ratio.
|
||||
Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize);
|
||||
boolean isCameraRotated = cameraHelper.isCameraRotated();
|
||||
cameraImageSize = cameraHelper.getFrameSize();
|
||||
|
||||
// Connect the converter to the camera-preview frames as its input (via
|
||||
// previewFrameTexture), and configure the output width and height as the computed
|
||||
// display size.
|
||||
converter.setSurfaceTextureAndAttachToGLContext(
|
||||
previewFrameTexture,
|
||||
isCameraRotated ? displaySize.getHeight() : displaySize.getWidth(),
|
||||
isCameraRotated ? displaySize.getWidth() : displaySize.getHeight());
|
||||
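// Before each frame is added to the graph, send the (3:4-cropped) camera
// texture dimensions as "input_width"/"input_height" packets.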
processor.setOnWillAddFrameListener(
|
||||
(timestamp) -> {
|
||||
try {
|
||||
int cameraTextureWidth =
|
||||
isCameraRotated
|
||||
? cameraImageSize.getHeight()
|
||||
: cameraImageSize.getWidth();
|
||||
int cameraTextureHeight =
|
||||
isCameraRotated
|
||||
? cameraImageSize.getWidth()
|
||||
: cameraImageSize.getHeight();
|
||||
|
||||
// Find limiting side and scale to 3:4 aspect ratio
|
||||
float aspectRatio =
|
||||
(float) cameraTextureWidth / (float) cameraTextureHeight;
|
||||
if (aspectRatio > 3.0 / 4.0) {
|
||||
// width too big
|
||||
cameraTextureWidth = (int) ((float) cameraTextureHeight * 3.0 / 4.0);
|
||||
} else {
|
||||
// height too big
|
||||
cameraTextureHeight = (int) ((float) cameraTextureWidth * 4.0 / 3.0);
|
||||
}
|
||||
Packet widthPacket =
|
||||
processor.getPacketCreator().createInt32(cameraTextureWidth);
|
||||
Packet heightPacket =
|
||||
processor.getPacketCreator().createInt32(cameraTextureHeight);
|
||||
|
||||
try {
|
||||
processor
|
||||
.getGraph()
|
||||
.addPacketToInputStream("input_width", widthPacket, timestamp);
|
||||
processor
|
||||
.getGraph()
|
||||
.addPacketToInputStream("input_height", heightPacket, timestamp);
|
||||
} catch (Exception e) {
|
||||
Log.e(
|
||||
TAG,
|
||||
"MediaPipeException encountered adding packets to width and height"
|
||||
+ " input streams.");
|
||||
}
|
||||
widthPacket.release();
|
||||
heightPacket.release();
|
||||
} catch (IllegalStateException ise) {
|
||||
Log.e(
|
||||
TAG,
|
||||
"Exception while adding packets to width and height input streams.");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public void surfaceDestroyed(SurfaceHolder holder) {
|
||||
processor.getVideoSurfaceOutput().setSurface(null);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private void startCamera() {
|
||||
cameraHelper = new CameraXPreviewHelper();
|
||||
cameraHelper.setOnCameraStartedListener(
|
||||
surfaceTexture -> {
|
||||
previewFrameTexture = surfaceTexture;
|
||||
// Make the display view visible to start showing the preview. This triggers the
|
||||
// SurfaceHolder.Callback added to (the holder of) previewDisplayView.
|
||||
previewDisplayView.setVisibility(View.VISIBLE);
|
||||
});
|
||||
cameraHelper.startCamera(
|
||||
this, CAMERA_FACING, /*surfaceTexture=*/ null, /*targetSize=*/ TARGET_RESOLUTION);
|
||||
cameraImageSize = cameraHelper.getFrameSize();
|
||||
}
|
||||
|
||||
private void prepareDemoAssets() {
|
||||
AndroidAssetUtil.initializeNativeAssetManager(this);
|
||||
// We render from raw data with OpenGL, so disable preprocessing during bitmap decoding.
|
||||
BitmapFactory.Options decodeOptions = new BitmapFactory.Options();
|
||||
decodeOptions.inScaled = false;
|
||||
decodeOptions.inDither = false;
|
||||
decodeOptions.inPremultiplied = false;
|
||||
|
||||
try {
|
||||
InputStream inputStream = getAssets().open(OBJ_TEXTURE);
|
||||
objTexture = BitmapFactory.decodeStream(inputStream, null /*outPadding*/, decodeOptions);
|
||||
inputStream.close();
|
||||
} catch (Exception e) {
|
||||
Log.e(TAG, "Error parsing object texture; error: " + e);
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
|
||||
try {
|
||||
InputStream inputStream = getAssets().open(BOX_TEXTURE);
|
||||
boxTexture = BitmapFactory.decodeStream(inputStream, null /*outPadding*/, decodeOptions);
|
||||
inputStream.close();
|
||||
} catch (Exception e) {
|
||||
Log.e(TAG, "Error parsing box texture; error: " + e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
# Copyright 2019 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
exports_files(
|
||||
srcs = glob(["**"]),
|
||||
)
|
|
@ -0,0 +1,21 @@
|
|||
# Copyright 2019 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
exports_files(
|
||||
srcs = glob(["**"]),
|
||||
)
|
After Width: | Height: | Size: 6.9 MiB |
After Width: | Height: | Size: 17 KiB |
After Width: | Height: | Size: 410 KiB |
|
@ -0,0 +1,21 @@
|
|||
# Copyright 2019 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
exports_files(
|
||||
srcs = glob(["**"]),
|
||||
)
|
After Width: | Height: | Size: 48 MiB |
|
@ -0,0 +1,20 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:app="http://schemas.android.com/apk/res-auto"
|
||||
xmlns:tools="http://schemas.android.com/tools"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent">
|
||||
|
||||
<FrameLayout
|
||||
android:id="@+id/preview_display_layout"
|
||||
android:layout_width="fill_parent"
|
||||
android:layout_height="fill_parent"
|
||||
android:layout_weight="1">
|
||||
<TextView
|
||||
android:id="@+id/no_camera_access_view"
|
||||
android:layout_height="fill_parent"
|
||||
android:layout_width="fill_parent"
|
||||
android:gravity="center"
|
||||
android:text="@string/no_camera_access" />
|
||||
</FrameLayout>
|
||||
</androidx.constraintlayout.widget.ConstraintLayout>
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<resources>
|
||||
<color name="colorPrimary">#008577</color>
|
||||
<color name="colorPrimaryDark">#00574B</color>
|
||||
<color name="colorAccent">#D81B60</color>
|
||||
</resources>
|
|
@ -0,0 +1,4 @@
|
|||
<resources>
|
||||
<string name="app_name" translatable="false">Object Detection 3D</string>
|
||||
<string name="no_camera_access" translatable="false">Please grant camera permissions.</string>
|
||||
</resources>
|
|
@ -0,0 +1,11 @@
|
|||
<resources>
|
||||
|
||||
<!-- Base application theme. -->
|
||||
<style name="AppTheme" parent="Theme.AppCompat.Light.DarkActionBar">
|
||||
<!-- Customize your theme here. -->
|
||||
<item name="colorPrimary">@color/colorPrimary</item>
|
||||
<item name="colorPrimaryDark">@color/colorPrimaryDark</item>
|
||||
<item name="colorAccent">@color/colorAccent</item>
|
||||
</style>
|
||||
|
||||
</resources>
|
|
@ -63,7 +63,7 @@ COPY . /mediapipe/
|
|||
|
||||
# Install bazel
|
||||
|
||||
ARG BAZEL_VERSION=0.29.1
|
||||
ARG BAZEL_VERSION=1.1.0
|
||||
RUN mkdir /bazel && \
|
||||
wget --no-check-certificate -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
|
||||
wget --no-check-certificate -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
# Coral Dev Board Setup (experimental)
|
||||
|
||||
**Dislaimer**: Running MediaPipe on Coral is experimental, and this process may
|
||||
**Disclaimer**: Running MediaPipe on Coral is experimental, and this process may
|
||||
not be exact and is subject to change. These instructions have only been tested
|
||||
on the [Coral Dev Board](https://coral.ai/products/dev-board/) with Mendel 4.0,
|
||||
and may vary for different devices and workstations.
|
||||
on the [Coral Dev Board](https://coral.ai/products/dev-board/)
|
||||
running [Mendel Enterprise Day 13](https://coral.ai/software/) OS and
|
||||
using [Diploria2](https://github.com/google-coral/edgetpu/tree/diploria2)
|
||||
edgetpu libs, and may vary for different devices and workstations.
|
||||
|
||||
This file describes how to prepare a Coral Dev Board and setup a Linux
|
||||
Docker container for building MediaPipe applications that run on Edge TPU.
|
||||
|
@ -16,10 +18,12 @@ Docker container for building MediaPipe applications that run on Edge TPU.
|
|||
|
||||
* Setup the coral device via [here](https://coral.withgoogle.com/docs/dev-board/get-started/), and ensure the _mdt_ command works
|
||||
|
||||
Note: alias mdt="python3 -m mdt.main" may be needed on some systems
|
||||
|
||||
* (on coral device) prepare MediaPipe
|
||||
|
||||
cd ~
|
||||
sudo apt-get install -y git
|
||||
sudo apt-get update && sudo apt-get install -y git
|
||||
git clone https://github.com/google/mediapipe.git
|
||||
mkdir mediapipe/bazel-bin
|
||||
|
||||
|
|
|
@ -10,19 +10,25 @@ http_archive(
|
|||
sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e",
|
||||
)
|
||||
load("@bazel_skylib//lib:versions.bzl", "versions")
|
||||
versions.check(minimum_bazel_version = "0.24.1")
|
||||
versions.check(minimum_bazel_version = "1.0.0",
|
||||
maximum_bazel_version = "1.2.1")
|
||||
|
||||
# ABSL cpp library.
|
||||
|
||||
# ABSL cpp library lts_2020_02_25
|
||||
http_archive(
|
||||
name = "com_google_absl",
|
||||
# Head commit on 2019-04-12.
|
||||
# TODO: Switch to the latest absl version when the problem gets
|
||||
# fixed.
|
||||
urls = [
|
||||
"https://github.com/abseil/abseil-cpp/archive/a02f62f456f2c4a7ecf2be3104fe0c6e16fbad9a.tar.gz",
|
||||
"https://github.com/abseil/abseil-cpp/archive/20200225.tar.gz",
|
||||
],
|
||||
sha256 = "d437920d1434c766d22e85773b899c77c672b8b4865d5dc2cd61a29fdff3cf03",
|
||||
strip_prefix = "abseil-cpp-a02f62f456f2c4a7ecf2be3104fe0c6e16fbad9a",
|
||||
# Remove after https://github.com/abseil/abseil-cpp/issues/326 is solved.
|
||||
patches = [
|
||||
"@//third_party:com_google_absl_f863b622fe13612433fdf43f76547d5edda0c93001.diff"
|
||||
],
|
||||
patch_args = [
|
||||
"-p1",
|
||||
],
|
||||
strip_prefix = "abseil-cpp-20200225",
|
||||
sha256 = "728a813291bdec2aa46eab8356ace9f75ac2ed9dfe2df5ab603c4e6c09f1c353"
|
||||
)
|
||||
|
||||
http_archive(
|
||||
|
@ -72,6 +78,14 @@ http_archive(
|
|||
],
|
||||
)
|
||||
|
||||
# easyexif
|
||||
http_archive(
|
||||
name = "easyexif",
|
||||
url = "https://github.com/mayanklahiri/easyexif/archive/master.zip",
|
||||
strip_prefix = "easyexif-master",
|
||||
build_file = "@//third_party:easyexif.BUILD",
|
||||
)
|
||||
|
||||
# libyuv
|
||||
http_archive(
|
||||
name = "libyuv",
|
||||
|
@ -103,15 +117,23 @@ http_archive(
|
|||
],
|
||||
)
|
||||
|
||||
# 2019-11-12
|
||||
_TENSORFLOW_GIT_COMMIT = "a5f9bcd64453ff3d1f64cb4da4786db3d2da7f82"
|
||||
_TENSORFLOW_SHA256= "f2b6f2ab2ffe63e86eccd3ce4bea6b7197383d726638dfeeebcdc1e7de73f075"
|
||||
# 2020-02-12
|
||||
# The last commit before TensorFlow switched to Bazel 2.0
|
||||
_TENSORFLOW_GIT_COMMIT = "77e9ffb9b2bfb1a4f7056e62d84039626923e328"
|
||||
_TENSORFLOW_SHA256= "176ccd82f7dd17c5e117b50d353603b129c7a6ccbfebd522ca47cc2a40f33f13"
|
||||
http_archive(
|
||||
name = "org_tensorflow",
|
||||
urls = [
|
||||
"https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT,
|
||||
"https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT,
|
||||
],
|
||||
# A compatibility patch
|
||||
patches = [
|
||||
"@//third_party:org_tensorflow_528e22eae8bf3206189a066032c66e9e5c9b4a61.diff"
|
||||
],
|
||||
patch_args = [
|
||||
"-p1",
|
||||
],
|
||||
strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT,
|
||||
sha256 = _TENSORFLOW_SHA256,
|
||||
)
|
||||
|
@ -119,8 +141,22 @@ http_archive(
|
|||
load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace")
|
||||
tf_workspace(tf_repo_name = "org_tensorflow")
|
||||
|
||||
http_archive(
|
||||
name = "ceres_solver",
|
||||
url = "https://github.com/ceres-solver/ceres-solver/archive/1.14.0.zip",
|
||||
patches = [
|
||||
"@//third_party:ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff"
|
||||
],
|
||||
patch_args = [
|
||||
"-p1",
|
||||
],
|
||||
strip_prefix = "ceres-solver-1.14.0",
|
||||
sha256 = "5ba6d0db4e784621fda44a50c58bb23b0892684692f0c623e2063f9c19f192f1"
|
||||
)
|
||||
|
||||
# Please run
|
||||
# $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \
|
||||
# libopencv-calib3d-dev libopencv-features2d-dev \
|
||||
# libopencv-imgproc-dev libopencv-video-dev
|
||||
new_local_repository(
|
||||
name = "linux_opencv",
|
||||
|
@ -149,11 +185,10 @@ new_local_repository(
|
|||
|
||||
http_archive(
|
||||
name = "android_opencv",
|
||||
sha256 = "056b849842e4fa8751d09edbb64530cfa7a63c84ccd232d0ace330e27ba55d0b",
|
||||
build_file = "@//third_party:opencv_android.BUILD",
|
||||
strip_prefix = "OpenCV-android-sdk",
|
||||
type = "zip",
|
||||
url = "https://github.com/opencv/opencv/releases/download/4.1.0/opencv-4.1.0-android-sdk.zip",
|
||||
url = "https://github.com/opencv/opencv/releases/download/3.4.3/opencv-3.4.3-android-sdk.zip",
|
||||
)
|
||||
|
||||
# After OpenCV 3.2.0, the pre-compiled opencv2.framework has google protobuf symbols, which will
|
||||
|
@ -184,13 +219,18 @@ maven_install(
|
|||
artifacts = [
|
||||
"androidx.annotation:annotation:aar:1.1.0",
|
||||
"androidx.appcompat:appcompat:aar:1.1.0-rc01",
|
||||
"androidx.camera:camera-core:aar:1.0.0-alpha06",
|
||||
"androidx.camera:camera-camera2:aar:1.0.0-alpha06",
|
||||
"androidx.constraintlayout:constraintlayout:aar:1.1.3",
|
||||
"androidx.core:core:aar:1.1.0-rc03",
|
||||
"androidx.legacy:legacy-support-v4:aar:1.0.0",
|
||||
"androidx.recyclerview:recyclerview:aar:1.1.0-beta02",
|
||||
"com.google.android.material:material:aar:1.0.0-rc01",
|
||||
],
|
||||
repositories = ["https://dl.google.com/dl/android/maven2"],
|
||||
repositories = [
|
||||
"https://dl.google.com/dl/android/maven2",
|
||||
"https://repo1.maven.org/maven2",
|
||||
],
|
||||
)
|
||||
|
||||
maven_server(
|
||||
|
@ -285,10 +325,13 @@ http_archive(
|
|||
build_file = "@//third_party:google_toolbox_for_mac.BUILD",
|
||||
)
|
||||
|
||||
### Coral ###
|
||||
|
||||
# Coral
|
||||
#COMMIT=$(git ls-remote https://github.com/google-coral/crosstool master | awk '{print $1}')
|
||||
#SHA256=$(curl -L "https://github.com/google-coral/crosstool/archive/${COMMIT}.tar.gz" | sha256sum | awk '{print $1}')
|
||||
# Oct 2019
|
||||
#COMMIT=9e00d5be43bf001f883b5700f5d04882fea00229
|
||||
#SHA256=cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb
|
||||
http_archive(
|
||||
name = "coral_crosstool",
|
||||
sha256 = "cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb",
|
||||
|
|
|
@ -8,7 +8,7 @@ echo ' sh mediapipe/examples/coral/setup.sh '
|
|||
|
||||
sleep 3
|
||||
|
||||
mkdir opencv32_arm64_libs
|
||||
mkdir -p opencv32_arm64_libs
|
||||
|
||||
cp mediapipe/examples/coral/update_sources.sh update_sources.sh
|
||||
chmod +x update_sources.sh
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
|
||||
2. Build and run the run_autoflip binary to process a local video.
|
||||
|
||||
Note: AutoFlip currently only works with OpenCV 3. Please verify your OpenCV version beforehand.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \
|
||||
mediapipe/examples/desktop/autoflip:run_autoflip
|
||||
|
|
|
@ -63,12 +63,15 @@ import random
|
|||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import urllib
|
||||
import zipfile
|
||||
|
||||
from absl import app
|
||||
from absl import flags
|
||||
from absl import logging
|
||||
from six.moves import range
|
||||
from six.moves import urllib
|
||||
import tensorflow.compat.v1 as tf
|
||||
|
||||
from mediapipe.util.sequence import media_sequence as ms
|
||||
|
||||
|
||||
|
@ -218,7 +221,7 @@ class Charades(object):
|
|||
return output_dict
|
||||
|
||||
if split not in SPLITS:
|
||||
raise ValueError("Split %s not in %s" % split, str(SPLITS.keys()))
|
||||
raise ValueError("Split %s not in %s" % split, str(list(SPLITS.keys())))
|
||||
all_shards = tf.io.gfile.glob(
|
||||
os.path.join(self.path_to_data, SPLITS[split][0] + "-*-of-*"))
|
||||
random.shuffle(all_shards)
|
||||
|
@ -329,7 +332,7 @@ class Charades(object):
|
|||
if sys.version_info >= (3, 0):
|
||||
urlretrieve = urllib.request.urlretrieve
|
||||
else:
|
||||
urlretrieve = urllib.urlretrieve
|
||||
urlretrieve = urllib.request.urlretrieve
|
||||
logging.info("Creating data directory.")
|
||||
tf.io.gfile.makedirs(self.path_to_data)
|
||||
logging.info("Downloading license.")
|
||||
|
|
|
@ -57,11 +57,12 @@ import random
|
|||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import urllib
|
||||
|
||||
from absl import app
|
||||
from absl import flags
|
||||
from absl import logging
|
||||
from six.moves import range
|
||||
from six.moves import urllib
|
||||
import tensorflow.compat.v1 as tf
|
||||
|
||||
from mediapipe.util.sequence import media_sequence as ms
|
||||
|
@ -198,7 +199,7 @@ class DemoDataset(object):
|
|||
if sys.version_info >= (3, 0):
|
||||
urlretrieve = urllib.request.urlretrieve
|
||||
else:
|
||||
urlretrieve = urllib.urlretrieve
|
||||
urlretrieve = urllib.request.urlretrieve
|
||||
for split in SPLITS:
|
||||
reader = csv.DictReader(SPLITS[split].split("\n"))
|
||||
all_metadata = []
|
||||
|
|
|
@ -73,11 +73,13 @@ import subprocess
|
|||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
import urllib
|
||||
|
||||
from absl import app
|
||||
from absl import flags
|
||||
from absl import logging
|
||||
from six.moves import range
|
||||
from six.moves import urllib
|
||||
from six.moves import zip
|
||||
import tensorflow.compat.v1 as tf
|
||||
|
||||
from mediapipe.util.sequence import media_sequence as ms
|
||||
|
@ -96,15 +98,15 @@ FILEPATTERN = "kinetics_700_%s_25fps_rgb_flow"
|
|||
SPLITS = {
|
||||
"train": {
|
||||
"shards": 1000,
|
||||
"examples": 540247
|
||||
"examples": 538779
|
||||
},
|
||||
"validate": {
|
||||
"shards": 100,
|
||||
"examples": 34610
|
||||
"examples": 34499
|
||||
},
|
||||
"test": {
|
||||
"shards": 100,
|
||||
"examples": 69103
|
||||
"examples": 68847
|
||||
},
|
||||
"custom": {
|
||||
"csv": None, # Add a CSV for your own data here.
|
||||
|
@ -198,7 +200,7 @@ class Kinetics(object):
|
|||
return output_dict
|
||||
|
||||
if split not in SPLITS:
|
||||
raise ValueError("Split %s not in %s" % split, str(SPLITS.keys()))
|
||||
raise ValueError("Split %s not in %s" % split, str(list(SPLITS.keys())))
|
||||
all_shards = tf.io.gfile.glob(
|
||||
os.path.join(self.path_to_data, FILEPATTERN % split + "-*-of-*"))
|
||||
random.shuffle(all_shards)
|
||||
|
@ -302,11 +304,12 @@ class Kinetics(object):
|
|||
continue
|
||||
# rename the row with a consistent set of names.
|
||||
if len(csv_row) == 5:
|
||||
row = dict(zip(["label_name", "video", "start", "end", "split"],
|
||||
csv_row))
|
||||
row = dict(
|
||||
list(
|
||||
zip(["label_name", "video", "start", "end", "split"],
|
||||
csv_row)))
|
||||
else:
|
||||
row = dict(zip(["video", "start", "end", "split"],
|
||||
csv_row))
|
||||
row = dict(list(zip(["video", "start", "end", "split"], csv_row)))
|
||||
metadata = tf.train.SequenceExample()
|
||||
ms.set_example_id(bytes23(row["video"] + "_" + row["start"]),
|
||||
metadata)
|
||||
|
@ -328,7 +331,7 @@ class Kinetics(object):
|
|||
if sys.version_info >= (3, 0):
|
||||
urlretrieve = urllib.request.urlretrieve
|
||||
else:
|
||||
urlretrieve = urllib.urlretrieve
|
||||
urlretrieve = urllib.request.urlretrieve
|
||||
logging.info("Creating data directory.")
|
||||
tf.io.gfile.makedirs(self.path_to_data)
|
||||
logging.info("Downloading annotations.")
|
||||
|
@ -404,7 +407,7 @@ class Kinetics(object):
|
|||
assert NUM_CLASSES == num_keys, (
|
||||
"Found %d labels for split: %s, should be %d" % (
|
||||
num_keys, name, NUM_CLASSES))
|
||||
label_map = dict(zip(classes, range(len(classes))))
|
||||
label_map = dict(list(zip(classes, list(range(len(classes))))))
|
||||
if SPLITS[name]["examples"] > 0:
|
||||
assert SPLITS[name]["examples"] == num_examples, (
|
||||
"Found %d examples for split: %s, should be %d" % (
|
||||
|
|
|
@ -30,6 +30,8 @@
|
|||
```bash
|
||||
# cd to the root directory of the MediaPipe repo
|
||||
cd -
|
||||
|
||||
pip3 install tf_slim
|
||||
python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph
|
||||
```
|
||||
|
||||
|
@ -47,7 +49,7 @@
|
|||
5. Run the MediaPipe binary to extract the features.
|
||||
|
||||
```bash
|
||||
bazel build -c opt \
|
||||
bazel build -c opt --linkopt=-s \
|
||||
--define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \
|
||||
mediapipe/examples/desktop/youtube8m:extract_yt8m_features
|
||||
|
||||
|
@ -87,7 +89,7 @@
|
|||
3. Build and run the inference binary.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
|
||||
mediapipe/examples/desktop/youtube8m:model_inference
|
||||
|
||||
GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/youtube8m/model_inference \
|
||||
|
@ -113,13 +115,13 @@
|
|||
2. Build the inference binary.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
|
||||
mediapipe/examples/desktop/youtube8m:model_inference
|
||||
```
|
||||
|
||||
3. Run the python web server.
|
||||
|
||||
Note: pip install absl-py
|
||||
Note: pip3 install absl-py
|
||||
|
||||
```bash
|
||||
python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd`
|
||||
|
@ -142,7 +144,7 @@
|
|||
3. Build and run the inference binary.
|
||||
|
||||
```bash
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \
|
||||
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
|
||||
mediapipe/examples/desktop/youtube8m:model_inference
|
||||
|
||||
# segment_size is the number of seconds window of frames.
|
||||
|
|
|
@ -25,7 +25,7 @@ import sys
|
|||
|
||||
from absl import app
|
||||
import tensorflow.compat.v1 as tf
|
||||
from tensorflow.compat.v1.python.tools import freeze_graph
|
||||
from tensorflow.python.tools import freeze_graph
|
||||
|
||||
BASE_DIR = '/tmp/mediapipe/'
|
||||
|
||||
|
|
|
@ -1078,10 +1078,16 @@ cc_library(
|
|||
cc_library(
|
||||
name = "port",
|
||||
hdrs = ["port.h"],
|
||||
defines = select({
|
||||
"//conditions:default": [],
|
||||
}) + select({
|
||||
"//conditions:default": [],
|
||||
"//mediapipe/gpu:disable_gpu": ["MEDIAPIPE_DISABLE_GPU"],
|
||||
}),
|
||||
visibility = [
|
||||
"//mediapipe/framework:__subpackages__",
|
||||
"//mediapipe/framework/port:__pkg__",
|
||||
"//mediapipe/util:__pkg__",
|
||||
"//mediapipe/util:__subpackages__",
|
||||
],
|
||||
)
|
||||
|
||||
|
|
|
@ -135,16 +135,20 @@
|
|||
// ASSIGN_OR_RETURN(ValueType value, MaybeGetValue(query), _.LogError());
|
||||
//
|
||||
#define ASSIGN_OR_RETURN(...) \
|
||||
STATUS_MACROS_IMPL_GET_VARIADIC_(__VA_ARGS__, \
|
||||
STATUS_MACROS_IMPL_GET_VARIADIC_((__VA_ARGS__, \
|
||||
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_, \
|
||||
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_) \
|
||||
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_)) \
|
||||
(__VA_ARGS__)
|
||||
|
||||
// =================================================================
|
||||
// == Implementation details, do not rely on anything below here. ==
|
||||
// =================================================================
|
||||
|
||||
#define STATUS_MACROS_IMPL_GET_VARIADIC_(_1, _2, _3, NAME, ...) NAME
|
||||
// MSVC incorrectly expands variadic macros, splice together a macro call to
|
||||
// work around the bug.
|
||||
#define STATUS_MACROS_IMPL_GET_VARIADIC_HELPER_(_1, _2, _3, NAME, ...) NAME
|
||||
#define STATUS_MACROS_IMPL_GET_VARIADIC_(args) \
|
||||
STATUS_MACROS_IMPL_GET_VARIADIC_HELPER_ args
|
||||
|
||||
#define STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_(lhs, rexpr) \
|
||||
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, std::move(_))
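The change above swaps the direct `NAME` selection for a parenthesized splice, forcing MSVC to re-scan `__VA_ARGS__` before the arity selector picks an implementation. A minimal, self-contained sketch of the same splice pattern, using toy macro names rather than the MediaPipe ones:

```cpp
// Toy demonstration of the MSVC variadic-splice workaround shown in the hunk
// above. PRINT(...) dispatches to a 1- or 2-argument implementation.
#include <iostream>

#define PRINT_1_(a) std::cout << (a) << "\n"
#define PRINT_2_(a, b) std::cout << (a) << " " << (b) << "\n"

// Without the splice, MSVC hands __VA_ARGS__ to the selector as a single
// token, so _1/_2/NAME do not line up. Wrapping the pack in parentheses and
// expanding the helper against that parenthesized pack forces a re-scan.
#define PRINT_SELECT_HELPER_(_1, _2, NAME, ...) NAME
#define PRINT_SELECT_(args) PRINT_SELECT_HELPER_ args
#define PRINT(...) \
  PRINT_SELECT_((__VA_ARGS__, PRINT_2_, PRINT_1_))(__VA_ARGS__)

int main() {
  PRINT("one");         // expands to PRINT_1_("one")
  PRINT("one", "two");  // expands to PRINT_2_("one", "two")
}
```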
|
||||
|
|
|
@ -99,7 +99,12 @@ def _encode_binary_proto_impl(ctx):
|
|||
),
|
||||
mnemonic = "EncodeProto",
|
||||
)
|
||||
return struct(files = depset([binarypb]))
|
||||
|
||||
output_depset = depset([binarypb])
|
||||
return [DefaultInfo(
|
||||
files = output_depset,
|
||||
data_runfiles = ctx.runfiles(transitive_files = output_depset),
|
||||
)]
|
||||
|
||||
encode_binary_proto = rule(
|
||||
implementation = _encode_binary_proto_impl,
|
||||
|
|
|
@ -131,7 +131,7 @@ class ShardedMap {
|
|||
return *this;
|
||||
}
|
||||
inline bool operator==(const Iterator& other) const {
|
||||
return iter_ == other.iter_;
|
||||
return shard_ == other.shard_ && iter_ == other.iter_;
|
||||
}
|
||||
inline bool operator!=(const Iterator& other) const {
|
||||
return !operator==(other);
|
||||
|
@ -154,7 +154,10 @@ class ShardedMap {
|
|||
: shard_(shard), iter_(iter), map_(map) {}
|
||||
// Releases all resources.
|
||||
inline void Clear() ABSL_NO_THREAD_SAFETY_ANALYSIS {
|
||||
if (map_ && iter_ != map_->maps_.back().end()) {
|
||||
if (!map_) return;
|
||||
bool is_end = (shard_ == map_->maps_.size() - 1 &&
|
||||
iter_ == map_->maps_[shard_].end());
|
||||
if (!is_end) {
|
||||
map_->mutexes_[shard_].Unlock();
|
||||
}
|
||||
map_ = nullptr;
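Both ShardedMap hunks hinge on the same point: an inner map iterator is only meaningful together with the shard it came from, so equality (and end-of-map detection) must consult the shard index first. A small sketch with a hypothetical two-level container, not the actual ShardedMap class:

```cpp
#include <cstddef>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for a sharded container's iterator state.
struct ShardedPos {
  using Shard = std::unordered_map<int, int>;

  size_t shard = 0;      // index of the sub-map
  Shard::iterator iter;  // position inside that sub-map

  // Comparing 'iter' alone would compare iterators from two unrelated maps
  // whenever the shard indices differ, which is undefined behavior; gate on
  // the shard index so 'iter' is only compared within a single map.
  bool operator==(const ShardedPos& other) const {
    return shard == other.shard && iter == other.iter;
  }
  bool operator!=(const ShardedPos& other) const { return !(*this == other); }
};

// Mirrors the Clear() logic above: "end" means the last shard's end(),
// not end() of whichever shard the iterator happens to sit in.
// Assumes 'shards' is non-empty.
bool IsEnd(const ShardedPos& pos, std::vector<ShardedPos::Shard>& shards) {
  return pos.shard == shards.size() - 1 && pos.iter == shards.back().end();
}
```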
|
||||
|
|
|
@ -100,7 +100,6 @@ class Timestamp {
|
|||
}
|
||||
|
||||
// Special values.
|
||||
|
||||
static Timestamp Unset();
|
||||
static Timestamp Unstarted();
|
||||
static Timestamp PreStream();
|
||||
|
|
|
@ -264,6 +264,10 @@ static ::mediapipe::Status PrefixNames(std::string prefix,
|
|||
generator.mutable_input_side_packet(), replace_names));
|
||||
MP_RETURN_IF_ERROR(TransformStreamNames(
|
||||
generator.mutable_output_side_packet(), replace_names));
|
||||
|
||||
// Remove input side packets ignored by the subgraph-node.
|
||||
MP_RETURN_IF_ERROR(RemoveIgnoredStreams(
|
||||
generator.mutable_input_side_packet(), ignored_input_side_packets));
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
|
|
@ -105,17 +105,27 @@ GpuBuffer GpuBufferMultiPool::GetBuffer(int width, int height,
|
|||
BufferSpec key(width, height, format);
|
||||
auto pool_it = pools_.find(key);
|
||||
if (pool_it == pools_.end()) {
|
||||
// Discard the oldest pool in order of creation.
|
||||
// TODO: implement a better policy.
|
||||
// Discard the least recently used pool in LRU cache.
|
||||
if (pools_.size() >= kMaxPoolCount) {
|
||||
auto old_spec = buffer_specs_.front();
|
||||
buffer_specs_.pop();
|
||||
auto old_spec = buffer_specs_.front(); // Front has LRU.
|
||||
buffer_specs_.pop_front();
|
||||
pools_.erase(old_spec);
|
||||
}
|
||||
buffer_specs_.push(key);
|
||||
buffer_specs_.push_back(key); // Push new spec to back.
|
||||
std::tie(pool_it, std::ignore) =
|
||||
pools_.emplace(std::piecewise_construct, std::forward_as_tuple(key),
|
||||
std::forward_as_tuple(MakeSimplePool(key)));
|
||||
} else {
|
||||
// Find and move current 'key' spec to back, keeping others in same order.
|
||||
auto specs_it = buffer_specs_.begin();
|
||||
while (specs_it != buffer_specs_.end()) {
|
||||
if (*specs_it == key) {
|
||||
buffer_specs_.erase(specs_it);
|
||||
break;
|
||||
}
|
||||
++specs_it;
|
||||
}
|
||||
buffer_specs_.push_back(key);
|
||||
}
|
||||
return GetBufferFromSimplePool(pool_it->first, pool_it->second);
|
||||
}
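The rewritten GetBuffer keeps buffer_specs_ in least-recently-used order: on a miss it evicts from the front when the pool count is at its limit, and on a hit it splices the key to the back while preserving the order of the rest. A stripped-down sketch of that bookkeeping with plain STL containers (hypothetical key and pool types, not the real GpuBuffer classes):

```cpp
#include <algorithm>
#include <deque>
#include <string>
#include <unordered_map>

// Hypothetical simplified cache: 'std::string' keys stand in for BufferSpec
// and the mapped strings stand in for per-spec simple pools.
class LruPoolCache {
 public:
  explicit LruPoolCache(size_t max_pools) : max_pools_(max_pools) {}

  std::string& GetPool(const std::string& key) {
    auto it = pools_.find(key);
    if (it == pools_.end()) {
      // Miss: evict the least recently used entry (front of the deque).
      if (pools_.size() >= max_pools_) {
        pools_.erase(lru_order_.front());
        lru_order_.pop_front();
      }
      lru_order_.push_back(key);
      it = pools_.emplace(key, "pool for " + key).first;
    } else {
      // Hit: move this key to the back, keeping the others in order.
      auto pos = std::find(lru_order_.begin(), lru_order_.end(), key);
      if (pos != lru_order_.end()) lru_order_.erase(pos);
      lru_order_.push_back(key);
    }
    return it->second;
  }

 private:
  size_t max_pools_;
  std::deque<std::string> lru_order_;  // front = least recently used
  std::unordered_map<std::string, std::string> pools_;
};
```

As in the calculator change, a hit costs a linear scan of the deque; with a small maximum pool count that is cheaper than maintaining a secondary index.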
|
||||
|
|
|
@ -22,8 +22,8 @@
|
|||
#ifndef MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_
|
||||
#define MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_
|
||||
|
||||
#include <deque>
|
||||
#include <limits>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "absl/synchronization/mutex.h"
|
||||
|
@ -110,7 +110,7 @@ class GpuBufferMultiPool {
|
|||
ABSL_GUARDED_BY(mutex_);
|
||||
// A queue of BufferSpecs to keep track of the age of each BufferSpec added to
|
||||
// the pool.
|
||||
std::queue<BufferSpec> buffer_specs_;
|
||||
std::deque<BufferSpec> buffer_specs_;
|
||||
|
||||
#ifdef __APPLE__
|
||||
// Texture caches used with this pool.
|
||||
|
|
|
@ -73,13 +73,15 @@ def _metal_compiler_args(ctx, src, obj, minimum_os_version, copts, diagnostics,
|
|||
|
||||
def _metal_compiler_inputs(srcs, hdrs, deps = []):
|
||||
"""Determines the list of inputs required for a compile action."""
|
||||
objc_providers = [x.objc for x in deps if hasattr(x, "objc")]
|
||||
|
||||
objc_files = depset()
|
||||
for objc in objc_providers:
|
||||
objc_files += objc.header
|
||||
cc_infos = [dep[CcInfo] for dep in deps if CcInfo in dep]
|
||||
|
||||
return srcs + hdrs + objc_files.to_list()
|
||||
dep_headers = depset(transitive = [
|
||||
cc_info.compilation_context.headers
|
||||
for cc_info in cc_infos
|
||||
])
|
||||
|
||||
return depset(srcs + hdrs, transitive = [dep_headers])
|
||||
|
||||
def _metal_library_impl(ctx):
|
||||
"""Implementation for metal_library Skylark rule."""
|
||||
|
@ -144,11 +146,22 @@ def _metal_library_impl(ctx):
|
|||
**additional_params
|
||||
)
|
||||
|
||||
cc_infos = [dep[CcInfo] for dep in ctx.attr.deps if CcInfo in dep]
|
||||
if ctx.files.hdrs:
|
||||
cc_infos.append(
|
||||
CcInfo(
|
||||
compilation_context = cc_common.create_compilation_context(
|
||||
headers = depset([f for f in ctx.files.hdrs]),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
return [
|
||||
DefaultInfo(
|
||||
files = depset([output_lib]),
|
||||
),
|
||||
objc_provider,
|
||||
cc_common.merge_cc_infos(cc_infos = cc_infos),
|
||||
# Return the provider for the new bundling logic of rules_apple.
|
||||
resources.bucketize_typed([output_lib], "unprocessed"),
|
||||
]
|
||||
|
@ -156,7 +169,7 @@ def _metal_library_impl(ctx):
|
|||
METAL_LIBRARY_ATTRS = dicts.add(apple_support.action_required_attrs(), {
|
||||
"srcs": attr.label_list(allow_files = [".metal"], allow_empty = False),
|
||||
"hdrs": attr.label_list(allow_files = [".h"]),
|
||||
"deps": attr.label_list(providers = [["objc"]]),
|
||||
"deps": attr.label_list(providers = [["objc", CcInfo]]),
|
||||
"copts": attr.string_list(),
|
||||
"minimum_os_version": attr.string(),
|
||||
})
|
||||
|
|
mediapipe/graphs/object_detection_3d/BUILD (new file, 56 lines)
|
@ -0,0 +1,56 @@
|
|||
# Copyright 2019 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
load(
|
||||
"//mediapipe/framework/tool:mediapipe_graph.bzl",
|
||||
"mediapipe_binary_graph",
|
||||
)
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
exports_files(glob([
|
||||
"*.pbtxt",
|
||||
]))
|
||||
|
||||
cc_library(
|
||||
name = "mobile_calculators",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//mediapipe/calculators/core:packet_resampler_calculator",
|
||||
"//mediapipe/calculators/image:image_cropping_calculator",
|
||||
"//mediapipe/gpu:gl_scaler_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/calculators:annotations_to_model_matrices_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/calculators:gl_animation_overlay_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/subgraphs:objectron_detection_gpu",
|
||||
"//mediapipe/graphs/object_detection_3d/subgraphs:objectron_tracking_gpu",
|
||||
],
|
||||
)
|
||||
|
||||
mediapipe_binary_graph(
|
||||
name = "mobile_gpu_binary_graph_shoe",
|
||||
graph = "shoe_classic_occlusion_tracking.pbtxt",
|
||||
output_name = "mobile_gpu_shoe.binarypb",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":mobile_calculators"],
|
||||
)
|
||||
|
||||
mediapipe_binary_graph(
|
||||
name = "mobile_gpu_binary_graph_chair",
|
||||
graph = "chair_classic_occlusion_tracking.pbtxt",
|
||||
output_name = "mobile_gpu_chair.binarypb",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":mobile_calculators"],
|
||||
)
|
mediapipe/graphs/object_detection_3d/calculators/BUILD (new file, 476 lines)
|
@ -0,0 +1,476 @@
|
|||
# Copyright 2020 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
load("//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library")
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:private"])
|
||||
|
||||
proto_library(
|
||||
name = "object_proto",
|
||||
srcs = [
|
||||
"object.proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "a_r_capture_metadata_proto",
|
||||
srcs = [
|
||||
"a_r_capture_metadata.proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "annotation_proto",
|
||||
srcs = [
|
||||
"annotation_data.proto",
|
||||
],
|
||||
deps = [
|
||||
":a_r_capture_metadata_proto",
|
||||
":object_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "belief_decoder_config_proto",
|
||||
srcs = [
|
||||
"belief_decoder_config.proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "camera_parameters_proto",
|
||||
srcs = [
|
||||
"camera_parameters.proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "frame_annotation_tracker_calculator_proto",
|
||||
srcs = ["frame_annotation_tracker_calculator.proto"],
|
||||
deps = [
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "gl_animation_overlay_calculator_proto",
|
||||
srcs = ["gl_animation_overlay_calculator.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = ["//mediapipe/framework:calculator_proto"],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "tflite_tensors_to_objects_calculator_proto",
|
||||
srcs = ["tflite_tensors_to_objects_calculator.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":belief_decoder_config_proto",
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "lift_2d_frame_annotation_to_3d_calculator_proto",
|
||||
srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":belief_decoder_config_proto",
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "annotations_to_model_matrices_calculator_proto",
|
||||
srcs = ["annotations_to_model_matrices_calculator.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "model_matrix_proto",
|
||||
srcs = ["model_matrix.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
],
|
||||
)
|
||||
|
||||
proto_library(
|
||||
name = "annotations_to_render_data_calculator_proto",
|
||||
srcs = ["annotations_to_render_data_calculator.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//mediapipe/framework:calculator_proto",
|
||||
"//mediapipe/util:color_proto",
|
||||
],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "object_cc_proto",
|
||||
srcs = ["object.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":object_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "a_r_capture_metadata_cc_proto",
|
||||
srcs = ["a_r_capture_metadata.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":a_r_capture_metadata_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "annotation_cc_proto",
|
||||
srcs = ["annotation_data.proto"],
|
||||
cc_deps = [
|
||||
":a_r_capture_metadata_cc_proto",
|
||||
":object_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":annotation_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "camera_parameters_cc_proto",
|
||||
srcs = ["camera_parameters.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":camera_parameters_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "frame_annotation_tracker_calculator_cc_proto",
|
||||
srcs = ["frame_annotation_tracker_calculator.proto"],
|
||||
cc_deps = [
|
||||
"//mediapipe/framework:calculator_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":frame_annotation_tracker_calculator_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "gl_animation_overlay_calculator_cc_proto",
|
||||
srcs = ["gl_animation_overlay_calculator.proto"],
|
||||
cc_deps = [
|
||||
"//mediapipe/framework:calculator_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":gl_animation_overlay_calculator_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "belief_decoder_config_cc_proto",
|
||||
srcs = ["belief_decoder_config.proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":belief_decoder_config_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "tflite_tensors_to_objects_calculator_cc_proto",
|
||||
srcs = ["tflite_tensors_to_objects_calculator.proto"],
|
||||
cc_deps = [
|
||||
":belief_decoder_config_cc_proto",
|
||||
"//mediapipe/framework:calculator_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":tflite_tensors_to_objects_calculator_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "lift_2d_frame_annotation_to_3d_calculator_cc_proto",
|
||||
srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"],
|
||||
cc_deps = [
|
||||
":belief_decoder_config_cc_proto",
|
||||
"//mediapipe/framework:calculator_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":lift_2d_frame_annotation_to_3d_calculator_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "annotations_to_model_matrices_calculator_cc_proto",
|
||||
srcs = ["annotations_to_model_matrices_calculator.proto"],
|
||||
cc_deps = ["//mediapipe/framework:calculator_cc_proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":annotations_to_model_matrices_calculator_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "model_matrix_cc_proto",
|
||||
srcs = ["model_matrix.proto"],
|
||||
cc_deps = ["//mediapipe/framework:calculator_cc_proto"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":model_matrix_proto"],
|
||||
)
|
||||
|
||||
mediapipe_cc_proto_library(
|
||||
name = "annotations_to_render_data_calculator_cc_proto",
|
||||
srcs = ["annotations_to_render_data_calculator.proto"],
|
||||
cc_deps = [
|
||||
"//mediapipe/framework:calculator_cc_proto",
|
||||
"//mediapipe/util:color_cc_proto",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":annotations_to_render_data_calculator_proto"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "box_util",
|
||||
srcs = ["box_util.cc"],
|
||||
hdrs = ["box_util.h"],
|
||||
deps = [
|
||||
"//mediapipe/framework/port:logging",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"//mediapipe/framework/port:opencv_imgproc",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "frame_annotation_tracker",
|
||||
srcs = ["frame_annotation_tracker.cc"],
|
||||
hdrs = ["frame_annotation_tracker.h"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":box_util",
|
||||
"//mediapipe/framework/port:integral_types",
|
||||
"//mediapipe/framework/port:logging",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
"@com_google_absl//absl/container:btree",
|
||||
"@com_google_absl//absl/container:flat_hash_set",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "gl_animation_overlay_calculator",
|
||||
srcs = ["gl_animation_overlay_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":camera_parameters_cc_proto",
|
||||
":gl_animation_overlay_calculator_cc_proto",
|
||||
":model_matrix_cc_proto",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"//mediapipe/framework/port:status",
|
||||
"//mediapipe/gpu:gl_calculator_helper",
|
||||
"//mediapipe/gpu:shader_util",
|
||||
"//mediapipe/util/android:asset_manager_util",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "decoder",
|
||||
srcs = [
|
||||
"decoder.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"decoder.h",
|
||||
],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":belief_decoder_config_cc_proto",
|
||||
"//mediapipe/framework/port:logging",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"//mediapipe/framework/port:opencv_imgproc",
|
||||
"//mediapipe/framework/port:status",
|
||||
"@com_google_absl//absl/status",
|
||||
"@eigen_archive//:eigen",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "tensor_util",
|
||||
srcs = [
|
||||
"tensor_util.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"tensor_util.h",
|
||||
],
|
||||
deps = [
|
||||
"//mediapipe/framework/port:logging",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"@org_tensorflow//tensorflow/lite:framework",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "box",
|
||||
srcs = [
|
||||
"box.cc",
|
||||
"model.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"box.h",
|
||||
"model.h",
|
||||
"types.h",
|
||||
],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":object_cc_proto",
|
||||
"//mediapipe/framework/port:logging",
|
||||
"@eigen_archive//:eigen",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "frame_annotation_to_timed_box_list_calculator",
|
||||
srcs = ["frame_annotation_to_timed_box_list_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":box_util",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"//mediapipe/framework/port:opencv_imgproc",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"//mediapipe/framework/port:status",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
"@com_google_absl//absl/memory",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "frame_annotation_tracker_calculator",
|
||||
srcs = ["frame_annotation_tracker_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":frame_annotation_tracker",
|
||||
":frame_annotation_tracker_calculator_cc_proto",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"//mediapipe/framework/port:status",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
"@com_google_absl//absl/container:flat_hash_set",
|
||||
"@com_google_absl//absl/memory",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "tflite_tensors_to_objects_calculator",
|
||||
srcs = ["tflite_tensors_to_objects_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":belief_decoder_config_cc_proto",
|
||||
":decoder",
|
||||
":tensor_util",
|
||||
":tflite_tensors_to_objects_calculator_cc_proto",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework/deps:file_path",
|
||||
"//mediapipe/framework/formats:detection_cc_proto",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/types:span",
|
||||
"@eigen_archive//:eigen",
|
||||
"@org_tensorflow//tensorflow/lite:framework",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "lift_2d_frame_annotation_to_3d_calculator",
|
||||
srcs = ["lift_2d_frame_annotation_to_3d_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":belief_decoder_config_cc_proto",
|
||||
":decoder",
|
||||
":lift_2d_frame_annotation_to_3d_calculator_cc_proto",
|
||||
":tensor_util",
|
||||
":tflite_tensors_to_objects_calculator_cc_proto",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework/deps:file_path",
|
||||
"//mediapipe/framework/formats:detection_cc_proto",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
"@com_google_absl//absl/types:span",
|
||||
"@eigen_archive//:eigen",
|
||||
"@org_tensorflow//tensorflow/lite:framework",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "annotations_to_model_matrices_calculator",
|
||||
srcs = ["annotations_to_model_matrices_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":annotations_to_model_matrices_calculator_cc_proto",
|
||||
":box",
|
||||
":model_matrix_cc_proto",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework:calculator_options_cc_proto",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"//mediapipe/framework/port:status",
|
||||
"//mediapipe/util:color_cc_proto",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@eigen_archive//:eigen",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "annotations_to_render_data_calculator",
|
||||
srcs = ["annotations_to_render_data_calculator.cc"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":annotations_to_render_data_calculator_cc_proto",
|
||||
"//mediapipe/framework:calculator_framework",
|
||||
"//mediapipe/framework:calculator_options_cc_proto",
|
||||
"//mediapipe/framework/port:ret_check",
|
||||
"//mediapipe/util:color_cc_proto",
|
||||
"//mediapipe/util:render_data_cc_proto",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@com_google_absl//absl/strings",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "box_util_test",
|
||||
srcs = ["box_util_test.cc"],
|
||||
deps = [
|
||||
":box_util",
|
||||
"//mediapipe/framework/port:gtest_main",
|
||||
"//mediapipe/framework/port:opencv_core",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "frame_annotation_tracker_test",
|
||||
srcs = ["frame_annotation_tracker_test.cc"],
|
||||
deps = [
|
||||
":annotation_cc_proto",
|
||||
":frame_annotation_tracker",
|
||||
"//mediapipe/framework/port:gtest_main",
|
||||
"//mediapipe/framework/port:logging",
|
||||
"//mediapipe/util/tracking:box_tracker_cc_proto",
|
||||
"@com_google_absl//absl/container:flat_hash_set",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,551 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
// Info about the camera characteristics used to capture images and depth data.
|
||||
// See developer.apple.com/documentation/avfoundation/avcameracalibrationdata
|
||||
// for more information.
|
||||
message AVCameraCalibrationData {
|
||||
// 3x3 row-major matrix relating a camera's internal properties to an ideal
|
||||
// pinhole-camera model.
|
||||
// See
|
||||
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881135-intrinsicmatrix
|
||||
// for detailed usage information.
|
||||
repeated float intrinsic_matrix = 1 [packed = true];
|
||||
|
||||
// The image dimensions to which the intrinsic_matrix values are relative.
|
||||
optional float intrinsic_matrix_reference_dimension_width = 2;
|
||||
optional float intrinsic_matrix_reference_dimension_height = 3;
|
||||
|
||||
// 3x4 row-major matrix relating a camera's position and orientation to a
|
||||
// world or scene coordinate system. Consists of a unitless 3x3 rotation
|
||||
// matrix (R) on the left and a translation (t) 3x1 vector on the right. The
|
||||
// translation vector's units are millimeters. For example:
|
||||
//
|
||||
// |r1,1 r2,1 r3,1 | t1|
|
||||
// [R | t] = |r1,2 r2,2 r3,2 | t2|
|
||||
// |r1,3 r2,3 r3,3 | t3|
|
||||
//
|
||||
// is stored as [r11, r21, r31, t1, r12, r22, r32, t2, ...]
|
||||
//
|
||||
// See
|
||||
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881130-extrinsicmatrix?language=objc
|
||||
// for more information.
|
||||
repeated float extrinsic_matrix = 4 [packed = true];
|
||||
|
||||
// The size, in millimeters, of one image pixel.
|
||||
optional float pixel_size = 5;
|
||||
|
||||
// A list of floating-point values describing radial distortions imparted by
|
||||
// the camera lens, for use in rectifying camera images.
|
||||
// See
|
||||
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881129-lensdistortionlookuptable?language=objc
|
||||
// for more information.
|
||||
repeated float lens_distortion_lookup_values = 6 [packed = true];
|
||||
|
||||
// A list of floating-point values describing radial distortions for use in
|
||||
// reapplying camera geometry to a rectified image.
|
||||
// See
|
||||
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881132-inverselensdistortionlookuptable?language=objc
|
||||
// for more information.
|
||||
repeated float inverse_lens_distortion_lookup_values = 7 [packed = true];
|
||||
|
||||
// The offset of the distortion center of the camera lens from the top-left
|
||||
// corner of the image.
|
||||
// See
|
||||
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881131-lensdistortioncenter?language=objc
|
||||
// for more information.
|
||||
optional float lens_distortion_center_x = 8;
|
||||
optional float lens_distortion_center_y = 9;
|
||||
}
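Since intrinsic_matrix is documented as a row-major 3x3 pinhole matrix, the focal lengths and principal point fall out by index. A short hedged sketch, assuming the generated C++ class for this proto and the usual K = [[fx, s, cx], [0, fy, cy], [0, 0, 1]] layout:

```cpp
// Hypothetical helper: pulls fx, fy, cx, cy out of the row-major 3x3
// intrinsic_matrix field described above. The repeated-field accessor
// intrinsic_matrix(i) is the standard generated-proto accessor.
struct PinholeParams {
  float fx, fy, cx, cy;
};

template <typename CalibrationProto>
PinholeParams GetPinholeParams(const CalibrationProto& calibration) {
  // Row-major K: index 0 = fx, 2 = cx, 4 = fy, 5 = cy.
  return PinholeParams{calibration.intrinsic_matrix(0),
                       calibration.intrinsic_matrix(4),
                       calibration.intrinsic_matrix(2),
                       calibration.intrinsic_matrix(5)};
}
```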
|
||||
|
||||
// Container for depth data information.
|
||||
// See developer.apple.com/documentation/avfoundation/avdepthdata for more info.
|
||||
message AVDepthData {
|
||||
// PNG representation of the grayscale depth data map. See discussion about
|
||||
// depth_data_map_original_minimum_value, below, for information about how
|
||||
// to interpret the pixel values.
|
||||
optional bytes depth_data_map = 1;
|
||||
|
||||
// Pixel format type of the original captured depth data.
|
||||
// See
|
||||
// developer.apple.com/documentation/corevideo/1563591-pixel_format_identifiers?language=objc
|
||||
// for the complete list of possible pixel format types. This value represents
|
||||
// a string for the associated OSType/FourCharCode.
|
||||
optional string depth_data_type = 2;
|
||||
|
||||
// Indicates the general accuracy of the depth_data_map.
|
||||
// See developer.apple.com/documentation/avfoundation/avdepthdataaccuracy for
|
||||
// more information.
|
||||
enum Accuracy {
|
||||
UNDEFINED_ACCURACY = 0;
|
||||
// Values in the depth map are usable for foreground/background separation
|
||||
// but are not absolutely accurate in the physical world.
|
||||
RELATIVE = 1;
|
||||
// Values in the depth map are absolutely accurate in the physical world.
|
||||
ABSOLUTE = 2;
|
||||
}
|
||||
optional Accuracy depth_data_accuracy = 3 [default = RELATIVE];
|
||||
|
||||
// Indicates whether the depth_data_map contains temporally smoothed data.
|
||||
optional bool depth_data_filtered = 4;
|
||||
|
||||
// Quality of the depth_data_map.
|
||||
enum Quality {
|
||||
UNDEFINED_QUALITY = 0;
|
||||
HIGH = 1;
|
||||
LOW = 2;
|
||||
}
|
||||
optional Quality depth_data_quality = 5;
|
||||
|
||||
// Associated calibration data for the depth_data_map.
|
||||
optional AVCameraCalibrationData camera_calibration_data = 6;
|
||||
|
||||
// The original range of values expressed by the depth_data_map, before
|
||||
// grayscale normalization. For example, if the minimum and maximum values
|
||||
// indicate a range of [0.5, 2.2], and the depth_data_type value indicates
|
||||
// it was a depth map, then white pixels (255, 255, 255) will map to 0.5 and
|
||||
// black pixels (0, 0, 0) will map to 2.2 with the grayscale range linearly
|
||||
// interpolated in between. Conversely, if the depth_data_type value indicates
|
||||
// it was a disparity map, then white pixels will map to 2.2 and black pixels
|
||||
// will map to 0.5.
|
||||
optional float depth_data_map_original_minimum_value = 7;
|
||||
optional float depth_data_map_original_maximum_value = 8;
|
||||
|
||||
// The width of the depth buffer map.
|
||||
optional int32 depth_data_map_width = 9;
|
||||
|
||||
// The height of the depth buffer map.
|
||||
optional int32 depth_data_map_height = 10;
|
||||
|
||||
// The row-major flattened array of the depth buffer map pixels. This will be
|
||||
// either a float32 or float16 byte array, depending on 'depth_data_type'.
|
||||
optional bytes depth_data_map_raw_values = 11;
|
||||
}
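The comment on depth_data_map_original_minimum_value pins down how to undo the grayscale normalization: for a depth map, white maps to the minimum and black to the maximum, and the mapping flips for a disparity map, with linear interpolation in between. A sketch of that mapping under those stated assumptions (hypothetical helper, 8-bit grayscale input):

```cpp
#include <cstdint>

// Converts one 8-bit pixel of the normalized depth_data_map PNG back to the
// original metric value, following the convention documented above:
//   depth map:      255 -> min_value, 0 -> max_value
//   disparity map:  255 -> max_value, 0 -> min_value
float DenormalizeDepthPixel(uint8_t gray, float min_value, float max_value,
                            bool is_disparity) {
  const float t = static_cast<float>(gray) / 255.0f;  // 0.0 .. 1.0
  return is_disparity ? min_value + t * (max_value - min_value)
                      : max_value - t * (max_value - min_value);
}
```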
|
||||
|
||||
// Estimated scene lighting information associated with a captured video frame.
|
||||
// See developer.apple.com/documentation/arkit/arlightestimate for more info.
|
||||
message ARLightEstimate {
|
||||
// The estimated intensity, in lumens, of ambient light throughout the scene.
|
||||
optional double ambient_intensity = 1;
|
||||
|
||||
// The estimated color temperature, in degrees Kelvin, of ambient light
|
||||
// throughout the scene.
|
||||
optional double ambient_color_temperature = 2;
|
||||
|
||||
// Data describing the estimated lighting environment in all directions.
|
||||
// Second-level spherical harmonics in separate red, green, and blue data
|
||||
// planes. Thus, this buffer contains 3 sets of 9 coefficients, or a total of
|
||||
// 27 values.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928222-sphericalharmonicscoefficients?language=objc
|
||||
// for more information.
|
||||
repeated float spherical_harmonics_coefficients = 3 [packed = true];
|
||||
|
||||
message DirectionVector {
|
||||
optional float x = 1;
|
||||
optional float y = 2;
|
||||
optional float z = 3;
|
||||
}
|
||||
// A vector indicating the orientation of the strongest directional light
|
||||
// source, normalized in the world-coordinate space.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928221-primarylightdirection?language=objc
|
||||
// for more information.
|
||||
optional DirectionVector primary_light_direction = 4;
|
||||
|
||||
// The estimated intensity, in lumens, of the strongest directional light
|
||||
// source in the scene.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928219-primarylightintensity?language=objc
|
||||
// for more information.
|
||||
optional float primary_light_intensity = 5;
|
||||
}
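spherical_harmonics_coefficients stores the three color planes back to back (9 red, then 9 green, then 9 blue), so a single coefficient is addressed as channel * 9 + i. A tiny accessor illustrating that layout; the plane ordering is taken from the comment above, while the accessor itself is hypothetical:

```cpp
// Reads coefficient i (0..8) for one color channel from the flattened
// 27-element spherical_harmonics_coefficients buffer described above.
enum class ShChannel { kRed = 0, kGreen = 1, kBlue = 2 };

template <typename LightEstimateProto>
float ShCoefficient(const LightEstimateProto& estimate, ShChannel channel,
                    int i) {
  // Layout assumed from the comment: [R0..R8, G0..G8, B0..B8].
  return estimate.spherical_harmonics_coefficients(
      static_cast<int>(channel) * 9 + i);
}
```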
|
||||
|
||||
// Information about the camera position and imaging characteristics for a
|
||||
// captured video frame.
|
||||
// See developer.apple.com/documentation/arkit/arcamera for more information.
|
||||
message ARCamera {
|
||||
// The general quality of position tracking available when the camera captured
|
||||
// a frame.
|
||||
enum TrackingState {
|
||||
UNDEFINED_TRACKING_STATE = 0;
|
||||
// Camera position tracking is not available.
|
||||
UNAVAILABLE = 1;
|
||||
// Tracking is available, but the quality of results is questionable.
|
||||
LIMITED = 2;
|
||||
// Camera position tracking is providing optimal results.
|
||||
NORMAL = 3;
|
||||
}
|
||||
optional TrackingState tracking_state = 1 [default = UNAVAILABLE];
|
||||
|
||||
// A possible diagnosis for limited position tracking quality as of when the
|
||||
// frame was captured.
|
||||
enum TrackingStateReason {
|
||||
UNDEFINED_TRACKING_STATE_REASON = 0;
|
||||
// The current tracking state is not limited.
|
||||
NONE = 1;
|
||||
// Not yet enough camera or motion data to provide tracking information.
|
||||
INITIALIZING = 2;
|
||||
// The device is moving too fast for accurate image-based position tracking.
|
||||
EXCESSIVE_MOTION = 3;
|
||||
// Not enough distinguishable features for image-based position tracking.
|
||||
INSUFFICIENT_FEATURES = 4;
|
||||
// Tracking is limited due to a relocalization in progress.
|
||||
RELOCALIZING = 5;
|
||||
}
|
||||
optional TrackingStateReason tracking_state_reason = 2 [default = NONE];
|
||||
|
||||
// 4x4 row-major matrix expressing position and orientation of the camera in
|
||||
// world coordinate space.
|
||||
// See developer.apple.com/documentation/arkit/arcamera/2866108-transform for
|
||||
// more information.
|
||||
repeated float transform = 3 [packed = true];
|
||||
|
||||
// The orientation of the camera, expressed as roll, pitch, and yaw values.
|
||||
message EulerAngles {
|
||||
optional float roll = 1;
|
||||
optional float pitch = 2;
|
||||
optional float yaw = 3;
|
||||
}
|
||||
optional EulerAngles euler_angles = 4;
|
||||
|
||||
// The width and height, in pixels, of the captured camera image.
|
||||
optional int32 image_resolution_width = 5;
|
||||
optional int32 image_resolution_height = 6;
|
||||
|
||||
// 3x3 row-major matrix that converts between the 2D camera plane and 3D world
|
||||
// coordinate space.
|
||||
// See developer.apple.com/documentation/arkit/arcamera/2875730-intrinsics for
|
||||
// usage information.
|
||||
repeated float intrinsics = 7 [packed = true];
|
||||
|
||||
// 4x4 row-major transform matrix appropriate for rendering 3D content to
|
||||
// match the image captured by the camera.
|
||||
// See
|
||||
// developer.apple.com/documentation/arkit/arcamera/2887458-projectionmatrix
|
||||
// for usage information.
|
||||
repeated float projection_matrix = 8 [packed = true];
|
||||
|
||||
// 4x4 row-major transform matrix appropriate for converting from world-space
|
||||
// to camera space. Relativized for the captured_image orientation (i.e.
|
||||
// UILandscapeOrientationRight).
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arcamera/2921672-viewmatrixfororientation?language=objc
|
||||
// for more information.
|
||||
repeated float view_matrix = 9 [packed = true];
|
||||
}
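Because transform is a row-major 4x4 matrix giving the camera's position and orientation in world space (a camera-to-world transform, as in ARKit), the camera's world-space position is the translation column, i.e. elements 3, 7 and 11 of the flattened array. A small hedged helper showing that indexing, with generated-proto accessor names assumed:

```cpp
#include <array>

// Extracts the camera position (the translation column) from the flattened
// row-major 4x4 'transform' field documented above. Row-major means element
// (row, col) lives at index row * 4 + col, so the translation sits at
// indices 3, 7 and 11.
template <typename CameraProto>
std::array<float, 3> CameraPosition(const CameraProto& camera) {
  return {camera.transform(0 * 4 + 3),   // x
          camera.transform(1 * 4 + 3),   // y
          camera.transform(2 * 4 + 3)};  // z
}
```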
|
||||
|
||||
// Container for a 3D mesh describing face topology.
|
||||
message ARFaceGeometry {
|
||||
// Each vertex represents a 3D point in the face mesh, in the face coordinate
|
||||
// space.
|
||||
// See developer.apple.com/documentation/arkit/arfacegeometry/2928201-vertices
|
||||
// for more information.
|
||||
message Vertex {
|
||||
optional float x = 1;
|
||||
optional float y = 2;
|
||||
optional float z = 3;
|
||||
}
|
||||
repeated Vertex vertices = 1;
|
||||
|
||||
// The number of elements in the vertices list.
|
||||
optional int32 vertex_count = 2;
|
||||
|
||||
// Each texture coordinate represents UV texture coordinates for the vertex at
|
||||
// the corresponding index in the vertices buffer.
|
||||
// See
|
||||
// developer.apple.com/documentation/arkit/arfacegeometry/2928203-texturecoordinates
|
||||
// for more information.
|
||||
message TextureCoordinate {
|
||||
optional float u = 1;
|
||||
optional float v = 2;
|
||||
}
|
||||
repeated TextureCoordinate texture_coordinates = 3;
|
||||
|
||||
// The number of elements in the texture_coordinates list.
|
||||
optional int32 texture_coordinate_count = 4;
|
||||
|
||||
// Each integer value in this ordered list represents an index into the
|
||||
// vertices and texture_coordinates lists. Each set of three indices
|
||||
// identifies the vertices comprising a single triangle in the mesh. Each set
|
||||
// of three indices forms a triangle, so the number of indices in the
|
||||
// triangle_indices buffer is three times the triangle_count value.
|
||||
// See
|
||||
// developer.apple.com/documentation/arkit/arfacegeometry/2928199-triangleindices
|
||||
// for more information.
|
||||
repeated int32 triangle_indices = 5 [packed = true];
|
||||
|
||||
// The number of triangles described by the triangle_indices buffer.
|
||||
// See
|
||||
// developer.apple.com/documentation/arkit/arfacegeometry/2928207-trianglecount
|
||||
// for more information.
|
||||
optional int32 triangle_count = 6;
|
||||
}
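triangle_indices is a flat list in which each consecutive triple indexes into vertices (and texture_coordinates), so walking the mesh is a stride-3 loop over triangle_count triangles. A brief sketch of that traversal; the generated-proto accessors are assumed and the callback is a placeholder:

```cpp
#include <functional>

// Visits each triangle of an ARFaceGeometry-style mesh, resolving the three
// vertex indices that make up triangle t from the flat triangle_indices list.
template <typename FaceGeometryProto>
void ForEachTriangle(
    const FaceGeometryProto& geometry,
    const std::function<void(int v0, int v1, int v2)>& visit) {
  for (int t = 0; t < geometry.triangle_count(); ++t) {
    visit(geometry.triangle_indices(3 * t),
          geometry.triangle_indices(3 * t + 1),
          geometry.triangle_indices(3 * t + 2));
  }
}
```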
|
||||
|
||||
// Contains a list of blend shape entries wherein each item maps a specific
|
||||
// blend shape location to its associated coefficient.
|
||||
message ARBlendShapeMap {
|
||||
message MapEntry {
|
||||
// Identifier for the specific facial feature.
|
||||
// See developer.apple.com/documentation/arkit/arblendshapelocation for a
|
||||
// complete list of identifiers.
|
||||
optional string blend_shape_location = 1;
|
||||
|
||||
// Indicates the current position of the feature relative to its neutral
|
||||
// configuration, ranging from 0.0 (neutral) to 1.0 (maximum movement).
|
||||
optional float blend_shape_coefficient = 2;
|
||||
}
|
||||
repeated MapEntry entries = 1;
|
||||
}
|
||||
|
||||
// Information about the pose, topology, and expression of a detected face.
|
||||
// See developer.apple.com/documentation/arkit/arfaceanchor for more info.
|
||||
message ARFaceAnchor {
|
||||
// A coarse triangle mesh representing the topology of the detected face.
|
||||
optional ARFaceGeometry geometry = 1;
|
||||
|
||||
// A map of named coefficients representing the detected facial expression in
|
||||
// terms of the movement of specific facial features.
|
||||
optional ARBlendShapeMap blend_shapes = 2;
|
||||
|
||||
// 4x4 row-major matrix encoding the position, orientation, and scale of the
|
||||
// anchor relative to the world coordinate space.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/aranchor/2867981-transform?language=objc
|
||||
// for more information.
|
||||
repeated float transform = 3;
|
||||
|
||||
// Indicates whether the anchor's transform is valid. Frames that have a face
|
||||
// anchor with this value set to NO should probably be ignored.
|
||||
optional bool is_tracked = 4;
|
||||
}
|
||||
|
||||
// Container for a 3D mesh.
|
||||
message ARPlaneGeometry {
|
||||
message Vertex {
|
||||
optional float x = 1;
|
||||
optional float y = 2;
|
||||
optional float z = 3;
|
||||
}
|
||||
|
||||
// Each texture coordinate represents UV texture coordinates for the vertex at
|
||||
// the corresponding index in the vertices buffer.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arfacegeometry/2928203-texturecoordinates
|
||||
// for more information.
|
||||
message TextureCoordinate {
|
||||
optional float u = 1;
|
||||
optional float v = 2;
|
||||
}
|
||||
|
||||
// A buffer of vertex positions for each point in the plane mesh.
|
||||
repeated Vertex vertices = 1;
|
||||
|
||||
// The number of elements in the vertices buffer.
|
||||
optional int32 vertex_count = 2;
|
||||
|
||||
// A buffer of texture coordinate values for each point in the plane mesh.
|
||||
repeated TextureCoordinate texture_coordinates = 3;
|
||||
|
||||
// The number of elements in the texture_coordinates buffer.
|
||||
optional int32 texture_coordinate_count = 4;
|
||||
|
||||
// Each integer value in this ordered list represents an index into the
|
||||
// vertices and texture_coordinates lists. Each set of three indices
|
||||
// identifies the vertices comprising a single triangle in the mesh. Each set
|
||||
// of three indices forms a triangle, so the number of indices in the
|
||||
// triangle_indices buffer is three times the triangle_count value.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplanegeometry/2941051-triangleindices
|
||||
// for more information.
|
||||
repeated int32 triangle_indices = 5 [packed = true];
|
||||
|
||||
// The number of triangles described by the triangle_indices buffer.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplanegeometry/2941058-trianglecount
|
||||
// for more information.
|
||||
optional int32 triangle_count = 6;
|
||||
|
||||
// Each value in this buffer represents the position of a vertex along the
|
||||
// boundary polygon of the estimated plane. The owning plane anchor's
|
||||
// transform matrix defines the coordinate system for these points.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplanegeometry/2941052-boundaryvertices
|
||||
// for more information.
|
||||
repeated Vertex boundary_vertices = 7;
|
||||
|
||||
// The number of elements in the boundary_vertices buffer.
|
||||
optional int32 boundary_vertex_count = 8;
|
||||
}
|
||||
|
||||
// Information about the position and orientation of a real-world flat surface.
|
||||
// See https://developer.apple.com/documentation/arkit/arplaneanchor for more
|
||||
// information.
|
||||
message ARPlaneAnchor {
|
||||
enum Alignment {
|
||||
UNDEFINED = 0;
|
||||
// The plane is perpendicular to gravity.
|
||||
HORIZONTAL = 1;
|
||||
// The plane is parallel to gravity.
|
||||
VERTICAL = 2;
|
||||
}
|
||||
|
||||
// Wrapper for a 3D point / vector within the plane. See extent and center
|
||||
// values for more information.
|
||||
message PlaneVector {
|
||||
optional float x = 1;
|
||||
optional float y = 2;
|
||||
optional float z = 3;
|
||||
}
|
||||
|
||||
enum PlaneClassification {
|
||||
NONE = 0;
|
||||
WALL = 1;
|
||||
FLOOR = 2;
|
||||
CEILING = 3;
|
||||
TABLE = 4;
|
||||
SEAT = 5;
|
||||
}
|
||||
|
||||
// The classification status for the plane.
|
||||
enum PlaneClassificationStatus {
|
||||
// The classification process for the plane anchor has completed but the
|
||||
// result is inconclusive.
|
||||
UNKNOWN = 0;
|
||||
// No classification information can be provided (set on error or if the
|
||||
// device does not support plane classification).
|
||||
UNAVAILABLE = 1;
|
||||
// The classification process has not completed.
|
||||
UNDETERMINED = 2;
|
||||
// The classification process for the plane anchor has completed.
|
||||
KNOWN = 3;
|
||||
}
|
||||
|
||||
// The ID of the plane.
|
||||
optional string identifier = 1;
|
||||
|
||||
// 4x4 row-major matrix encoding the position, orientation, and scale of the
|
||||
// anchor relative to the world coordinate space.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/aranchor/2867981-transform
|
||||
// for more information.
|
||||
repeated float transform = 2;
|
||||
|
||||
// The general orientation of the detected plane with respect to gravity.
|
||||
optional Alignment alignment = 3;
|
||||
|
||||
// A coarse triangle mesh representing the general shape of the detected
|
||||
// plane.
|
||||
optional ARPlaneGeometry geometry = 4;
|
||||
|
||||
// The center point of the plane relative to its anchor position.
|
||||
// Although the type of this property is a 3D vector, a plane anchor is always
|
||||
// two-dimensional, and is always positioned in only the x and z directions
|
||||
// relative to its transform position. (That is, the y-component of this
|
||||
// vector is always zero.)
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplaneanchor/2882056-center
|
||||
// for more information.
|
||||
optional PlaneVector center = 5;
|
||||
|
||||
// The estimated width and length of the detected plane.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplaneanchor/2882055-extent
|
||||
// for more information.
|
||||
optional PlaneVector extent = 6;
|
||||
|
||||
// A Boolean value that indicates whether plane classification is available on
|
||||
// the current device. On devices without plane classification support, all
|
||||
// plane anchors report a classification value of NONE
|
||||
// and a classification_status value of UNAVAILABLE.
|
||||
optional bool classification_supported = 7;
|
||||
|
||||
// A general characterization of what kind of real-world surface the plane
|
||||
// anchor represents.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplaneanchor/2990936-classification
|
||||
// for more information.
|
||||
optional PlaneClassification classification = 8;
|
||||
|
||||
// The current state of ARKit's process for classifying the plane anchor.
|
||||
// When this property's value is KNOWN, the classification property represents
|
||||
// ARKit's characterization of the real-world surface corresponding to the
|
||||
// plane anchor.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arplaneanchor/2990937-classificationstatus
|
||||
// for more information.
|
||||
optional PlaneClassificationStatus classification_status = 9;
|
||||
}
|
||||
|
||||
// A collection of points in the world coordinate space.
|
||||
// See https://developer.apple.com/documentation/arkit/arpointcloud for more
|
||||
// information.
|
||||
message ARPointCloud {
|
||||
message Point {
|
||||
optional float x = 1;
|
||||
optional float y = 2;
|
||||
optional float z = 3;
|
||||
}
|
||||
|
||||
// The number of points in the cloud.
|
||||
optional int32 count = 1;
|
||||
|
||||
// The list of detected points.
|
||||
repeated Point point = 2;
|
||||
|
||||
// A list of unique identifiers corresponding to detected feature points.
|
||||
// Each identifier in this list corresponds to the point at the same index
|
||||
// in the points array.
|
||||
repeated int64 identifier = 3 [packed = true];
|
||||
}
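Likewise, a small illustrative sketch (same includes and assumptions about the generated C++ API as the sketch above) showing that the identifier list is parallel to the point list and can be zipped index by index:

void PrintPointCloud(const mediapipe::ARPointCloud& cloud) {
  for (int i = 0; i < cloud.count(); ++i) {
    const auto& p = cloud.point(i);
    std::printf("feature %lld at (%.3f, %.3f, %.3f)\n",
                static_cast<long long>(cloud.identifier(i)),
                p.x(), p.y(), p.z());
  }
}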
|
||||
|
||||
// Video image and face position tracking information.
|
||||
// See developer.apple.com/documentation/arkit/arframe for more information.
|
||||
message ARFrame {
|
||||
// The timestamp for the frame.
|
||||
optional double timestamp = 1;
|
||||
|
||||
// The depth data associated with the frame. Not all frames have depth data.
|
||||
optional AVDepthData depth_data = 2;
|
||||
|
||||
// The depth data object timestamp associated with the frame. May differ from
|
||||
// the frame timestamp value. Is only set when the frame has depth_data.
|
||||
optional double depth_data_timestamp = 3;
|
||||
|
||||
// Camera information associated with the frame.
|
||||
optional ARCamera camera = 4;
|
||||
|
||||
// Light information associated with the frame.
|
||||
optional ARLightEstimate light_estimate = 5;
|
||||
|
||||
// Face anchor information associated with the frame. Not all frames have an
|
||||
// active face anchor.
|
||||
optional ARFaceAnchor face_anchor = 6;
|
||||
|
||||
// Plane anchors associated with the frame. Not all frames have a plane
|
||||
// anchor. Plane anchors and face anchors are mutually exclusive.
|
||||
repeated ARPlaneAnchor plane_anchor = 7;
|
||||
|
||||
// The current intermediate results of the scene analysis used to perform
|
||||
// world tracking.
|
||||
// See
|
||||
// https://developer.apple.com/documentation/arkit/arframe/2887449-rawfeaturepoints
|
||||
// for more information.
|
||||
optional ARPointCloud raw_feature_points = 8;
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.proto";
|
||||
import "mediapipe/graphs/object_detection_3d/calculators/object.proto";
|
||||
|
||||
// Projection of a 3D point on an image, and its metric depth.
|
||||
message NormalizedPoint2D {
|
||||
// x-y position of the 2d keypoint in the image coordinate system.
|
||||
// x, y \in [0, 1], where the top-left corner is (0, 0) and the bottom-right
// corner is (1, 1).
|
||||
float x = 1;
|
||||
float y = 2;
|
||||
|
||||
// The depth of the point in the camera coordinate system (in meters).
|
||||
float depth = 3;
|
||||
}
|
||||
|
||||
// A 3D point in the camera coordinate system; the scale is in meters.
|
||||
message Point3D {
|
||||
float x = 1;
|
||||
float y = 2;
|
||||
float z = 3;
|
||||
}
|
||||
|
||||
message AnnotatedKeyPoint {
|
||||
int32 id = 1;
|
||||
Point3D point_3d = 2;
|
||||
NormalizedPoint2D point_2d = 3;
|
||||
}
|
||||
|
||||
message ObjectAnnotation {
|
||||
// Reference to the object identifier in ObjectInstance.
|
||||
int32 object_id = 1;
|
||||
|
||||
// For each object, list all of its annotated keypoints here.
|
||||
// E.g. for bounding-boxes, we have 8 keypoints, hands = 21 keypoints, etc.
|
||||
// These normalized points are the projections of the object's 3D keypoints
// onto the current frame's camera view.
|
||||
repeated AnnotatedKeyPoint keypoints = 2;
|
||||
|
||||
// Visibility of this annotation in a frame.
|
||||
float visibility = 3;
|
||||
}
|
||||
|
||||
message FrameAnnotation {
|
||||
// Unique frame id, corresponds to images.
|
||||
int32 frame_id = 1;
|
||||
|
||||
// List of the annotated objects in this frame. Depending on how many objects
// are observable in this frame, there may be none or as many as
// sequence.objects_size() annotations.
|
||||
repeated ObjectAnnotation annotations = 2;
|
||||
|
||||
// Information about the camera transformation (in the world coordinate) and
|
||||
// imaging characteristics for a captured video frame.
|
||||
ARCamera camera = 3;
|
||||
|
||||
// The timestamp for the frame.
|
||||
double timestamp = 4;
|
||||
|
||||
// Plane center and normal in camera frame.
|
||||
repeated float plane_center = 5;
|
||||
repeated float plane_normal = 6;
|
||||
}
|
||||
|
||||
// The sequence protocol contains the annotation data for the entire video clip.
|
||||
message Sequence {
|
||||
// List of all the annotated 3D objects in this sequence in the world
// coordinate system. Given the camera poses of each frame (also in the
// world coordinate system), these objects' bounding boxes can be projected to
// each frame to get the per-frame annotations (i.e. frame_annotations below).
|
||||
repeated Object objects = 1;
|
||||
|
||||
// List of annotated data for each frame in the sequence, plus frame
// information.
|
||||
repeated FrameAnnotation frame_annotations = 2;
|
||||
}
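As a hedged usage sketch (assuming the standard protoc-generated C++ API for the messages above; the function name is illustrative), the per-frame 2D projections in a Sequence can be read like this:

#include <cstdio>

#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"  // assumed generated header

void DumpProjections(const mediapipe::Sequence& sequence) {
  for (const auto& frame : sequence.frame_annotations()) {
    for (const auto& object : frame.annotations()) {
      for (const auto& keypoint : object.keypoints()) {
        const auto& p = keypoint.point_2d();
        // Normalized image coordinates in [0, 1] plus metric depth in meters.
        std::printf("frame %d object %d: (%.3f, %.3f), depth %.3f m\n",
                    frame.frame_id(), object.object_id(),
                    p.x(), p.y(), p.depth());
      }
    }
  }
}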
|
|
@ -0,0 +1,209 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "Eigen/src/Core/util/Constants.h"
|
||||
#include "Eigen/src/Geometry/Quaternion.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_join.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/calculator_options.pb.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/box.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/model_matrix.pb.h"
|
||||
#include "mediapipe/util/color.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr char kAnnotationTag[] = "ANNOTATIONS";
|
||||
constexpr char kModelMatricesTag[] = "MODEL_MATRICES";
|
||||
|
||||
using Matrix4fRM = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
|
||||
|
||||
} // namespace
|
||||
|
||||
// Converts box predictions from the Objectron model to the model matrices
// to be rendered.
|
||||
//
|
||||
// Input:
|
||||
// ANNOTATIONS - Frame annotations with lifted 3D points, the points are in
|
||||
// Objectron coordinate system.
|
||||
// Output:
|
||||
// MODEL_MATRICES - Result ModelMatrices, in OpenGL coordinate system.
|
||||
//
|
||||
// Usage example:
|
||||
// node {
|
||||
// calculator: "AnnotationsToModelMatricesCalculator"
|
||||
// input_stream: "ANNOTATIONS:objects"
|
||||
// output_stream: "MODEL_MATRICES:model_matrices"
|
||||
//}
|
||||
|
||||
class AnnotationsToModelMatricesCalculator : public CalculatorBase {
|
||||
public:
|
||||
AnnotationsToModelMatricesCalculator() {}
|
||||
~AnnotationsToModelMatricesCalculator() override {}
|
||||
AnnotationsToModelMatricesCalculator(
|
||||
const AnnotationsToModelMatricesCalculator&) = delete;
|
||||
AnnotationsToModelMatricesCalculator& operator=(
|
||||
const AnnotationsToModelMatricesCalculator&) = delete;
|
||||
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
|
||||
private:
|
||||
::mediapipe::Status GetModelMatricesForAnnotations(
|
||||
const FrameAnnotation& annotations,
|
||||
TimedModelMatrixProtoList* model_matrix_list);
|
||||
|
||||
AnnotationsToModelMatricesCalculatorOptions options_;
|
||||
Eigen::Vector3f model_scale_;
|
||||
Matrix4fRM model_transformation_;
|
||||
};
|
||||
REGISTER_CALCULATOR(AnnotationsToModelMatricesCalculator);
|
||||
|
||||
::mediapipe::Status AnnotationsToModelMatricesCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)) << "No input stream found.";
|
||||
if (cc->Inputs().HasTag(kAnnotationTag)) {
|
||||
cc->Inputs().Tag(kAnnotationTag).Set<FrameAnnotation>();
|
||||
}
|
||||
|
||||
if (cc->Outputs().HasTag(kModelMatricesTag)) {
|
||||
cc->Outputs().Tag(kModelMatricesTag).Set<TimedModelMatrixProtoList>();
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status AnnotationsToModelMatricesCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
RET_CHECK(cc->Inputs().HasTag(kAnnotationTag));
|
||||
|
||||
cc->SetOffset(TimestampDiff(0));
|
||||
options_ = cc->Options<AnnotationsToModelMatricesCalculatorOptions>();
|
||||
|
||||
if (options_.model_scale_size() == 3) {
|
||||
model_scale_ =
|
||||
Eigen::Map<const Eigen::Vector3f>(options_.model_scale().data());
|
||||
} else {
|
||||
model_scale_.setOnes();
|
||||
}
|
||||
|
||||
if (options_.model_transformation_size() == 16) {
|
||||
model_transformation_ =
|
||||
Eigen::Map<const Matrix4fRM>(options_.model_transformation().data());
|
||||
} else {
|
||||
model_transformation_.setIdentity();
|
||||
}
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status AnnotationsToModelMatricesCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
auto model_matrices = std::make_unique<TimedModelMatrixProtoList>();
|
||||
|
||||
const FrameAnnotation& annotations =
|
||||
cc->Inputs().Tag(kAnnotationTag).Get<FrameAnnotation>();
|
||||
|
||||
if (!GetModelMatricesForAnnotations(annotations, model_matrices.get()).ok()) {
|
||||
return ::mediapipe::InvalidArgumentError(
|
||||
"Error in GetModelMatricesForBoxes");
|
||||
}
|
||||
cc->Outputs()
|
||||
.Tag(kModelMatricesTag)
|
||||
.Add(model_matrices.release(), cc->InputTimestamp());
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status
|
||||
AnnotationsToModelMatricesCalculator::GetModelMatricesForAnnotations(
|
||||
const FrameAnnotation& annotations,
|
||||
TimedModelMatrixProtoList* model_matrix_list) {
|
||||
if (model_matrix_list == nullptr) {
|
||||
return ::mediapipe::InvalidArgumentError("model_matrix_list is nullptr");
|
||||
}
|
||||
model_matrix_list->clear_model_matrix();
|
||||
|
||||
Box box("category");
|
||||
for (const auto& object : annotations.annotations()) {
|
||||
TimedModelMatrixProto* model_matrix = model_matrix_list->add_model_matrix();
|
||||
model_matrix->set_id(object.object_id());
|
||||
|
||||
// Fit a box to the original vertices to estimate the scale of the box
|
||||
std::vector<Eigen::Vector3f> vertices;
|
||||
for (const auto& keypoint : object.keypoints()) {
|
||||
const auto& point = keypoint.point_3d();
|
||||
Eigen::Vector3f p(point.x(), point.y(), point.z());
|
||||
vertices.emplace_back(p);
|
||||
}
|
||||
box.Fit(vertices);
|
||||
|
||||
// Re-scale the box if necessary
|
||||
Eigen::Vector3f estimated_scale = box.GetScale();
|
||||
vertices.clear();
|
||||
for (const auto& keypoint : object.keypoints()) {
|
||||
const auto& point = keypoint.point_3d();
|
||||
Eigen::Vector3f p(point.x(), point.y(), point.z());
|
||||
vertices.emplace_back(p);
|
||||
}
|
||||
box.Fit(vertices);
|
||||
|
||||
Matrix4fRM object_transformation = box.GetTransformation();
|
||||
Matrix4fRM model_view;
|
||||
Matrix4fRM pursuit_model;
|
||||
// The reference view is
|
||||
//
|
||||
// ref << 0., 0., 1., 0.,
|
||||
// -1., 0., 0., 0.,
|
||||
// 0., -1., 0., 0.,
|
||||
// 0., 0., 0., 1.;
|
||||
// We have pursuit_model * model = model_view, to get pursuit_model:
|
||||
// pursuit_model = model_view * model^-1
|
||||
// clang-format off
|
||||
pursuit_model << 0.0, 1.0, 0.0, 0.0,
|
||||
1.0, 0.0, 0.0, 0.0,
|
||||
0.0, 0.0, 1.0, 0.0,
|
||||
0.0, 0.0, 0.0, 1.0;
|
||||
// clang-format on
|
||||
|
||||
// Re-scale the CAD model to the scale of the estimated bounding box.
|
||||
const Eigen::Vector3f scale = model_scale_.cwiseProduct(estimated_scale);
|
||||
const Matrix4fRM model =
|
||||
model_transformation_.array().colwise() * scale.homogeneous().array();
|
||||
|
||||
// Finally compute the model_view matrix.
|
||||
model_view = pursuit_model * object_transformation * model;
|
||||
|
||||
for (int i = 0; i < model_view.rows(); ++i) {
|
||||
for (int j = 0; j < model_view.cols(); ++j) {
|
||||
model_matrix->add_matrix_entries(model_view(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,33 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
|
||||
message AnnotationsToModelMatricesCalculatorOptions {
|
||||
extend CalculatorOptions {
|
||||
optional AnnotationsToModelMatricesCalculatorOptions ext = 290166283;
|
||||
}
|
||||
|
||||
// Vector of size 3 indicating the scale vector [x, y, z]. We will re-scale
|
||||
// the model size with this vector. (Defaults to [1., 1., 1.])
|
||||
repeated float model_scale = 1;
|
||||
|
||||
// 4x4 Row major matrix denoting the transformation from the model to the
|
||||
// Deep Pursuit 3D coordinate system (where front is +z, and up is +y).
|
||||
repeated float model_transformation = 2;
|
||||
}
|
|
@ -0,0 +1,273 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_join.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/calculator_options.pb.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.pb.h"
|
||||
#include "mediapipe/util/color.pb.h"
|
||||
#include "mediapipe/util/render_data.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr char kAnnotationTag[] = "ANNOTATIONS";
|
||||
constexpr char kRenderDataTag[] = "RENDER_DATA";
|
||||
constexpr char kKeypointLabel[] = "KEYPOINT";
|
||||
constexpr int kMaxLandmarkThickness = 18;
|
||||
|
||||
inline void SetColor(RenderAnnotation* annotation, const Color& color) {
|
||||
annotation->mutable_color()->set_r(color.r());
|
||||
annotation->mutable_color()->set_g(color.g());
|
||||
annotation->mutable_color()->set_b(color.b());
|
||||
}
|
||||
|
||||
// Remap x from range [lo hi] to range [0 1] then multiply by scale.
|
||||
inline float Remap(float x, float lo, float hi, float scale) {
|
||||
return (x - lo) / (hi - lo + 1e-6) * scale;
|
||||
}
|
||||
|
||||
inline void GetMinMaxZ(const FrameAnnotation& annotations, float* z_min,
|
||||
float* z_max) {
|
||||
*z_min = std::numeric_limits<float>::max();
|
||||
*z_max = std::numeric_limits<float>::lowest();
|
||||
// Use a global depth scale for all the objects in the scene
|
||||
for (const auto& object : annotations.annotations()) {
|
||||
for (const auto& keypoint : object.keypoints()) {
|
||||
*z_min = std::min(keypoint.point_2d().depth(), *z_min);
|
||||
*z_max = std::max(keypoint.point_2d().depth(), *z_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SetColorSizeValueFromZ(float z, float z_min, float z_max,
|
||||
RenderAnnotation* render_annotation) {
|
||||
const int color_value = 255 - static_cast<int>(Remap(z, z_min, z_max, 255));
|
||||
::mediapipe::Color color;
|
||||
color.set_r(color_value);
|
||||
color.set_g(color_value);
|
||||
color.set_b(color_value);
|
||||
SetColor(render_annotation, color);
|
||||
const int thickness = static_cast<int>((1.f - Remap(z, z_min, z_max, 1)) *
|
||||
kMaxLandmarkThickness);
|
||||
render_annotation->set_thickness(thickness);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// A calculator that converts FrameAnnotation proto to RenderData proto for
|
||||
// visualization. The input should be the FrameAnnotation proto buffer. It is
|
||||
// also possible to specify the connections between landmarks.
|
||||
//
|
||||
// Example config:
|
||||
// node {
|
||||
// calculator: "AnnotationsToRenderDataCalculator"
|
||||
// input_stream: "ANNOTATIONS:annotations"
|
||||
// output_stream: "RENDER_DATA:render_data"
|
||||
// options {
|
||||
// [AnnotationsToRenderDataCalculator.ext] {
|
||||
// landmark_connections: [0, 1, 1, 2]
|
||||
// landmark_color { r: 0 g: 255 b: 0 }
|
||||
// connection_color { r: 0 g: 255 b: 0 }
|
||||
// thickness: 4.0
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
class AnnotationsToRenderDataCalculator : public CalculatorBase {
|
||||
public:
|
||||
AnnotationsToRenderDataCalculator() {}
|
||||
~AnnotationsToRenderDataCalculator() override {}
|
||||
AnnotationsToRenderDataCalculator(const AnnotationsToRenderDataCalculator&) =
|
||||
delete;
|
||||
AnnotationsToRenderDataCalculator& operator=(
|
||||
const AnnotationsToRenderDataCalculator&) = delete;
|
||||
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
|
||||
private:
|
||||
static void SetRenderAnnotationColorThickness(
|
||||
const AnnotationsToRenderDataCalculatorOptions& options,
|
||||
RenderAnnotation* render_annotation);
|
||||
static RenderAnnotation* AddPointRenderData(
|
||||
const AnnotationsToRenderDataCalculatorOptions& options,
|
||||
RenderData* render_data);
|
||||
|
||||
// Add a command to draw a line in the rendering queue. The line is drawn from
|
||||
// (start_x, start_y) to (end_x, end_y). The input x,y can either be in pixel
|
||||
// or normalized coordinate [0, 1] as indicated by the normalized flag.
|
||||
static void AddConnectionToRenderData(
|
||||
float start_x, float start_y, float end_x, float end_y,
|
||||
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
|
||||
RenderData* render_data);
|
||||
|
||||
// Same as the function above, except that instead of using the configured
// color it re-colors the line according to the two depth values: gray_val1 is
// the color of the starting point and gray_val2 is the color of the ending
// point, and the line is drawn with a gradient from gray_val1 to gray_val2.
// The gray values range from 0 (black) to 255 (white).
|
||||
static void AddConnectionToRenderData(
|
||||
float start_x, float start_y, float end_x, float end_y,
|
||||
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
|
||||
int gray_val1, int gray_val2, RenderData* render_data);
|
||||
|
||||
AnnotationsToRenderDataCalculatorOptions options_;
|
||||
};
|
||||
REGISTER_CALCULATOR(AnnotationsToRenderDataCalculator);
|
||||
|
||||
::mediapipe::Status AnnotationsToRenderDataCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)) << "No input stream found.";
|
||||
if (cc->Inputs().HasTag(kAnnotationTag)) {
|
||||
cc->Inputs().Tag(kAnnotationTag).Set<FrameAnnotation>();
|
||||
}
|
||||
cc->Outputs().Tag(kRenderDataTag).Set<RenderData>();
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status AnnotationsToRenderDataCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
cc->SetOffset(TimestampDiff(0));
|
||||
options_ = cc->Options<AnnotationsToRenderDataCalculatorOptions>();
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status AnnotationsToRenderDataCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
auto render_data = absl::make_unique<RenderData>();
|
||||
bool visualize_depth = options_.visualize_landmark_depth();
|
||||
float z_min = 0.f;
|
||||
float z_max = 0.f;
|
||||
|
||||
if (cc->Inputs().HasTag(kAnnotationTag)) {
|
||||
const auto& annotations =
|
||||
cc->Inputs().Tag(kAnnotationTag).Get<FrameAnnotation>();
|
||||
RET_CHECK_EQ(options_.landmark_connections_size() % 2, 0)
|
||||
<< "Number of entries in landmark connections must be a multiple of 2";
|
||||
|
||||
if (visualize_depth) {
|
||||
GetMinMaxZ(annotations, &z_min, &z_max);
|
||||
// Only change rendering if there are actually z values other than 0.
|
||||
visualize_depth &= ((z_max - z_min) > 1e-3);
|
||||
}
|
||||
|
||||
for (const auto& object : annotations.annotations()) {
|
||||
for (const auto& keypoint : object.keypoints()) {
|
||||
auto* keypoint_data_render =
|
||||
AddPointRenderData(options_, render_data.get());
|
||||
auto* point = keypoint_data_render->mutable_point();
|
||||
if (visualize_depth) {
|
||||
SetColorSizeValueFromZ(keypoint.point_2d().depth(), z_min, z_max,
|
||||
keypoint_data_render);
|
||||
}
|
||||
|
||||
point->set_normalized(true);
|
||||
point->set_x(keypoint.point_2d().x());
|
||||
point->set_y(keypoint.point_2d().y());
|
||||
}
|
||||
|
||||
// Add edges
|
||||
for (int i = 0; i < options_.landmark_connections_size(); i += 2) {
|
||||
const auto& ld0 =
|
||||
object.keypoints(options_.landmark_connections(i)).point_2d();
|
||||
const auto& ld1 =
|
||||
object.keypoints(options_.landmark_connections(i + 1)).point_2d();
|
||||
const bool normalized = true;
|
||||
|
||||
if (visualize_depth) {
|
||||
const int gray_val1 =
|
||||
255 - static_cast<int>(Remap(ld0.depth(), z_min, z_max, 255));
|
||||
const int gray_val2 =
|
||||
255 - static_cast<int>(Remap(ld1.depth(), z_min, z_max, 255));
|
||||
AddConnectionToRenderData(ld0.x(), ld0.y(), ld1.x(), ld1.y(),
|
||||
options_, normalized, gray_val1, gray_val2,
|
||||
render_data.get());
|
||||
} else {
|
||||
AddConnectionToRenderData(ld0.x(), ld0.y(), ld1.x(), ld1.y(),
|
||||
options_, normalized, render_data.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cc->Outputs()
|
||||
.Tag(kRenderDataTag)
|
||||
.Add(render_data.release(), cc->InputTimestamp());
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
void AnnotationsToRenderDataCalculator::AddConnectionToRenderData(
|
||||
float start_x, float start_y, float end_x, float end_y,
|
||||
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
|
||||
int gray_val1, int gray_val2, RenderData* render_data) {
|
||||
auto* connection_annotation = render_data->add_render_annotations();
|
||||
RenderAnnotation::GradientLine* line =
|
||||
connection_annotation->mutable_gradient_line();
|
||||
line->set_x_start(start_x);
|
||||
line->set_y_start(start_y);
|
||||
line->set_x_end(end_x);
|
||||
line->set_y_end(end_y);
|
||||
line->set_normalized(normalized);
|
||||
line->mutable_color1()->set_r(gray_val1);
|
||||
line->mutable_color1()->set_g(gray_val1);
|
||||
line->mutable_color1()->set_b(gray_val1);
|
||||
line->mutable_color2()->set_r(gray_val2);
|
||||
line->mutable_color2()->set_g(gray_val2);
|
||||
line->mutable_color2()->set_b(gray_val2);
|
||||
connection_annotation->set_thickness(options.thickness());
|
||||
}
|
||||
|
||||
void AnnotationsToRenderDataCalculator::AddConnectionToRenderData(
|
||||
float start_x, float start_y, float end_x, float end_y,
|
||||
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
|
||||
RenderData* render_data) {
|
||||
auto* connection_annotation = render_data->add_render_annotations();
|
||||
RenderAnnotation::Line* line = connection_annotation->mutable_line();
|
||||
line->set_x_start(start_x);
|
||||
line->set_y_start(start_y);
|
||||
line->set_x_end(end_x);
|
||||
line->set_y_end(end_y);
|
||||
line->set_normalized(normalized);
|
||||
SetColor(connection_annotation, options.connection_color());
|
||||
connection_annotation->set_thickness(options.thickness());
|
||||
}
|
||||
|
||||
RenderAnnotation* AnnotationsToRenderDataCalculator::AddPointRenderData(
|
||||
const AnnotationsToRenderDataCalculatorOptions& options,
|
||||
RenderData* render_data) {
|
||||
auto* landmark_data_annotation = render_data->add_render_annotations();
|
||||
landmark_data_annotation->set_scene_tag(kKeypointLabel);
|
||||
SetRenderAnnotationColorThickness(options, landmark_data_annotation);
|
||||
return landmark_data_annotation;
|
||||
}
|
||||
|
||||
void AnnotationsToRenderDataCalculator::SetRenderAnnotationColorThickness(
|
||||
const AnnotationsToRenderDataCalculatorOptions& options,
|
||||
RenderAnnotation* render_annotation) {
|
||||
SetColor(render_annotation, options.landmark_color());
|
||||
render_annotation->set_thickness(options.thickness());
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,43 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
import "mediapipe/util/color.proto";
|
||||
|
||||
message AnnotationsToRenderDataCalculatorOptions {
|
||||
extend CalculatorOptions {
|
||||
optional AnnotationsToRenderDataCalculatorOptions ext = 267644238;
|
||||
}
|
||||
|
||||
// Specifies the landmarks to be connected in the drawing. For example, the
|
||||
// landmark_connections value of [0, 1, 1, 2] specifies two connections: one
|
||||
// that connects landmarks with index 0 and 1, and another that connects
|
||||
// landmarks with index 1 and 2.
|
||||
repeated int32 landmark_connections = 1;
|
||||
|
||||
// Color of the landmarks.
|
||||
optional Color landmark_color = 2;
|
||||
// Color of the connections.
|
||||
optional Color connection_color = 3;
|
||||
|
||||
// Thickness of the drawing of landmarks and connections.
|
||||
optional double thickness = 4 [default = 1.0];
|
||||
|
||||
// Change color and size of rendered landmarks based on their z values.
|
||||
optional bool visualize_landmark_depth = 5 [default = true];
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
message BeliefDecoderConfig {
|
||||
optional float heatmap_threshold = 1 [default = 0.9];
|
||||
// Maximum distance in pixels between two local max heatmap values.
|
||||
optional float local_max_distance = 2 [default = 10.0];
|
||||
// Coefficient of offset_scale.
|
||||
// offset_scale = offset_scale_coef * min(rows, cols).
|
||||
// offset_scale is used to multiply the offset predictions from the network.
|
||||
optional float offset_scale_coef = 3 [default = 0.5, deprecated = true];
|
||||
|
||||
// The radius for vertex voting. Use no voting if the radius is less than or
|
||||
// equal to 1. Example: 10.
|
||||
optional int32 voting_radius = 4;
|
||||
|
||||
// The number of pixels to determine whether two points are the same.
|
||||
// Example: 5 (voting_radius / 2).
|
||||
optional int32 voting_allowance = 5;
|
||||
|
||||
// The threshold of beliefs, with which the points can vote. Example: 0.2.
|
||||
optional float voting_threshold = 6;
|
||||
}
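For reference, a one-line sketch of the (now deprecated) offset_scale relation described above, with illustrative names:

#include <algorithm>

inline float OffsetScale(float offset_scale_coef, int rows, int cols) {
  // offset_scale = offset_scale_coef * min(rows, cols); it multiplies the raw
  // offset predictions from the network.
  return offset_scale_coef * static_cast<float>(std::min(rows, cols));
}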
|
mediapipe/graphs/object_detection_3d/calculators/box.cc
|
@ -0,0 +1,255 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/box.h"
|
||||
|
||||
#include "Eigen/src/Core/util/Constants.h"
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
namespace {
|
||||
constexpr int kFrontFaceId = 4;
|
||||
constexpr int kTopFaceId = 2;
|
||||
constexpr int kNumKeypoints = 8 + 1;
|
||||
constexpr int kNumberOfAxis = 3;
|
||||
constexpr int kEdgesPerAxis = 4;
|
||||
|
||||
} // namespace
|
||||
|
||||
Box::Box(const std::string& category)
|
||||
: Model(kBoundingBox, kNumKeypoints, category),
|
||||
bounding_box_(kNumKeypoints) {
|
||||
transformation_.setIdentity();
|
||||
|
||||
scale_ << 0.1, 0.1, 0.1;
|
||||
|
||||
// The vertices are ordered according to the left-hand rule, so the normal
|
||||
// vector of each face points inward, into the box.
|
||||
faces_.push_back({5, 6, 8, 7}); // +x on yz plane
|
||||
faces_.push_back({1, 3, 4, 2}); // -x on yz plane
|
||||
|
||||
faces_.push_back({3, 7, 8, 4}); // +y on xz plane = top
|
||||
faces_.push_back({1, 2, 6, 5}); // -y on xz plane
|
||||
|
||||
faces_.push_back({2, 4, 8, 6}); // +z on xy plane = front
|
||||
faces_.push_back({1, 5, 7, 3}); // -z on xy plane
|
||||
|
||||
// Add the edges in the cube, they are sorted according to axis (x-y-z).
|
||||
edges_.push_back({1, 5});
|
||||
edges_.push_back({2, 6});
|
||||
edges_.push_back({3, 7});
|
||||
edges_.push_back({4, 8});
|
||||
|
||||
edges_.push_back({1, 3});
|
||||
edges_.push_back({5, 7});
|
||||
edges_.push_back({2, 4});
|
||||
edges_.push_back({6, 8});
|
||||
|
||||
edges_.push_back({1, 2});
|
||||
edges_.push_back({3, 4});
|
||||
edges_.push_back({5, 6});
|
||||
edges_.push_back({7, 8});
|
||||
Update();
|
||||
}
|
||||
|
||||
void Box::Update() {
|
||||
// Compute the eight vertices of the bounding box from Box's parameters
|
||||
auto w = scale_[0] / 2.f;
|
||||
auto h = scale_[1] / 2.f;
|
||||
auto d = scale_[2] / 2.f;
|
||||
|
||||
// Define the local coordinate system, w.r.t. the center of the box.
|
||||
bounding_box_[0] << 0., 0., 0.;
|
||||
bounding_box_[1] << -w, -h, -d;
|
||||
bounding_box_[2] << -w, -h, +d;
|
||||
bounding_box_[3] << -w, +h, -d;
|
||||
bounding_box_[4] << -w, +h, +d;
|
||||
bounding_box_[5] << +w, -h, -d;
|
||||
bounding_box_[6] << +w, -h, +d;
|
||||
bounding_box_[7] << +w, +h, -d;
|
||||
bounding_box_[8] << +w, +h, +d;
|
||||
|
||||
// Convert to world coordinate system
|
||||
for (int i = 0; i < kNumKeypoints; ++i) {
|
||||
bounding_box_[i] =
|
||||
transformation_.topLeftCorner<3, 3>() * bounding_box_[i] +
|
||||
transformation_.col(3).head<3>();
|
||||
}
|
||||
}
|
||||
|
||||
void Box::Adjust(const std::vector<float>& variables) {
|
||||
Eigen::Vector3f translation;
|
||||
translation << variables[0], variables[1], variables[2];
|
||||
SetTranslation(translation);
|
||||
|
||||
const float roll = variables[3];
|
||||
const float pitch = variables[4];
|
||||
const float yaw = variables[5];
|
||||
SetRotation(roll, pitch, yaw);
|
||||
|
||||
Eigen::Vector3f scale;
|
||||
scale << variables[6], variables[7], variables[8];
|
||||
|
||||
SetScale(scale);
|
||||
Update();
|
||||
}
|
||||
|
||||
float* Box::GetVertex(size_t vertex_id) {
|
||||
CHECK_LT(vertex_id, kNumKeypoints);
|
||||
return bounding_box_[vertex_id].data();
|
||||
}
|
||||
|
||||
const float* Box::GetVertex(size_t vertex_id) const {
|
||||
CHECK_LT(vertex_id, kNumKeypoints);
|
||||
return bounding_box_[vertex_id].data();
|
||||
}
|
||||
|
||||
bool Box::InsideTest(const Eigen::Vector3f& point, int check_axis) const {
|
||||
const float* v0 = GetVertex(1);
|
||||
const float* v1 = GetVertex(2);
|
||||
const float* v2 = GetVertex(3);
|
||||
const float* v4 = GetVertex(5);
|
||||
|
||||
switch (check_axis) {
|
||||
case 1:
|
||||
return (v0[0] <= point[0] && point[0] <= v1[0]); // X-axis
|
||||
case 2:
|
||||
return (v0[1] <= point[1] && point[1] <= v2[1]); // Y-axis
|
||||
case 3:
|
||||
return (v0[2] <= point[2] && point[2] <= v4[2]); // Z-axis
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void Box::Deserialize(const Object& obj) {
|
||||
CHECK_EQ(obj.keypoints_size(), kNumKeypoints);
|
||||
Model::Deserialize(obj);
|
||||
}
|
||||
|
||||
void Box::Serialize(Object* obj) {
|
||||
Model::Serialize(obj);
|
||||
obj->set_type(Object::BOUNDING_BOX);
|
||||
std::vector<Vector3f> local_bounding_box(9);
|
||||
// Define the local coordinate system, w.r.t. the center of the box.
|
||||
local_bounding_box[0] << 0., 0., 0.;
|
||||
local_bounding_box[1] << -0.5, -0.5, -0.5;
|
||||
local_bounding_box[2] << -0.5, -0.5, +0.5;
|
||||
local_bounding_box[3] << -0.5, +0.5, -0.5;
|
||||
local_bounding_box[4] << -0.5, +0.5, +0.5;
|
||||
local_bounding_box[5] << +0.5, -0.5, -0.5;
|
||||
local_bounding_box[6] << +0.5, -0.5, +0.5;
|
||||
local_bounding_box[7] << +0.5, +0.5, -0.5;
|
||||
local_bounding_box[8] << +0.5, +0.5, +0.5;
|
||||
for (int i = 0; i < kNumKeypoints; ++i) {
|
||||
KeyPoint* keypoint = obj->add_keypoints();
|
||||
keypoint->set_x(local_bounding_box[i][0]);
|
||||
keypoint->set_y(local_bounding_box[i][1]);
|
||||
keypoint->set_z(local_bounding_box[i][2]);
|
||||
keypoint->set_confidence_radius(0.);
|
||||
}
|
||||
}
|
||||
|
||||
const Face& Box::GetFrontFace() const { return faces_[kFrontFaceId]; }
|
||||
|
||||
const Face& Box::GetTopFace() const { return faces_[kTopFaceId]; }
|
||||
|
||||
std::pair<Vector3f, Vector3f> Box::GetGroundPlane() const {
|
||||
const Vector3f gravity = Vector3f(0., 1., 0.);
|
||||
int ground_plane_id = 0;
|
||||
float ground_plane_error = 10.0;
|
||||
|
||||
auto get_face_center = [&](const Face& face) {
|
||||
Vector3f center = Vector3f::Zero();
|
||||
for (const int vertex_id : face) {
|
||||
center += Map<const Vector3f>(GetVertex(vertex_id));
|
||||
}
|
||||
center /= face.size();
|
||||
return center;
|
||||
};
|
||||
|
||||
auto get_face_normal = [&](const Face& face, const Vector3f& center) {
|
||||
Vector3f v1 = Map<const Vector3f>(GetVertex(face[0])) - center;
|
||||
Vector3f v2 = Map<const Vector3f>(GetVertex(face[1])) - center;
|
||||
Vector3f normal = v1.cross(v2);
|
||||
return normal;
|
||||
};
|
||||
|
||||
// The ground plane is defined as a plane aligned with gravity.
|
||||
// gravity is the (0, 1, 0) vector in the world coordinate system.
|
||||
const auto& faces = GetFaces();
|
||||
for (int face_id = 0; face_id < faces.size(); face_id += 2) {
|
||||
const auto& face = faces[face_id];
|
||||
Vector3f center = get_face_center(face);
|
||||
Vector3f normal = get_face_normal(face, center);
|
||||
Vector3f w = gravity.cross(normal);
|
||||
const float w_sq_norm = w.squaredNorm();
|
||||
if (w_sq_norm < ground_plane_error) {
|
||||
ground_plane_error = w_sq_norm;
|
||||
ground_plane_id = face_id;
|
||||
}
|
||||
}
|
||||
|
||||
Vector3f center = get_face_center(faces[ground_plane_id]);
|
||||
Vector3f normal = get_face_normal(faces[ground_plane_id], center);
|
||||
|
||||
// For each face there is a parallel face whose normal is also aligned with
// the gravity vector. We pick the one with the lower height (y-value).
// The face parallel to face 0 is face 1, to face 2 is face 3, and to face 4
// is face 5.
|
||||
int parallel_face_id = ground_plane_id + 1;
|
||||
const auto& parallel_face = faces[parallel_face_id];
|
||||
Vector3f parallel_face_center = get_face_center(parallel_face);
|
||||
Vector3f parallel_face_normal =
|
||||
get_face_normal(parallel_face, parallel_face_center);
|
||||
if (parallel_face_center[1] < center[1]) {
|
||||
center = parallel_face_center;
|
||||
normal = parallel_face_normal;
|
||||
}
|
||||
return {center, normal};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void Box::Fit(const std::vector<T>& vertices) {
|
||||
CHECK_EQ(vertices.size(), kNumKeypoints);
|
||||
scale_.setZero();
|
||||
// The scale would remain invariant under rotation and translation.
|
||||
// We can safely estimate the scale from the oriented box.
|
||||
for (int axis = 0; axis < kNumberOfAxis; ++axis) {
|
||||
for (int edge_id = 0; edge_id < kEdgesPerAxis; ++edge_id) {
|
||||
// The edges are stored in quadruples according to each axis
|
||||
const std::array<int, 2>& edge = edges_[axis * kEdgesPerAxis + edge_id];
|
||||
scale_[axis] += (vertices[edge[0]] - vertices[edge[1]]).norm();
|
||||
}
|
||||
scale_[axis] /= kEdgesPerAxis;
|
||||
}
|
||||
// Create a scaled axis-aligned box
|
||||
transformation_.setIdentity();
|
||||
Update();
|
||||
|
||||
using MatrixN3_RM = Eigen::Matrix<float, kNumKeypoints, 3, Eigen::RowMajor>;
|
||||
Eigen::Map<const MatrixN3_RM> v(vertices[0].data());
|
||||
Eigen::Map<const MatrixN3_RM> system(bounding_box_[0].data());
|
||||
auto system_h = system.rowwise().homogeneous().eval();
|
||||
auto system_g = system_h.colPivHouseholderQr();
|
||||
auto solution = system_g.solve(v).eval();
|
||||
transformation_.topLeftCorner<3, 4>() = solution.transpose();
|
||||
Update();
|
||||
}
|
||||
|
||||
template void Box::Fit<Vector3f>(const std::vector<Vector3f>&);
|
||||
template void Box::Fit<Map<Vector3f>>(const std::vector<Map<Vector3f>>&);
|
||||
template void Box::Fit<Map<const Vector3f>>(
|
||||
const std::vector<Map<const Vector3f>>&);
|
||||
} // namespace mediapipe
|
mediapipe/graphs/object_detection_3d/calculators/box.h
|
@ -0,0 +1,132 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/model.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Model for the bounding box in 3D
|
||||
// The box has 9 degrees of freedom, which uniquely defines 8 keypoints in the
|
||||
// fixed world-coordinate system.
|
||||
//
|
||||
// The 8 keypoints are defined as follows
|
||||
//
|
||||
// kp-id axis
|
||||
// 0 000 ---
|
||||
// 1 001 --+
|
||||
// 2 010 -+-
|
||||
// 3 011 -++
|
||||
// 4 100 +--
|
||||
// 5 101 +-+
|
||||
// 6 110 ++-
|
||||
// 7 111 +++
|
||||
//
|
||||
// where xyz means positive or negative vector along the axis where the center
|
||||
// of the box is the origin. The resulting bounding box is
|
||||
//
|
||||
// x x
|
||||
// 0 + + + + + + + + 4 .-------
|
||||
// +\ +\ |\
|
||||
// + \ y + \ z | \ y
|
||||
// + \ + \ | \
|
||||
// + 2 + + + + + + + + 6
|
||||
// z + + + +
|
||||
// + + + +
|
||||
// + + C + +
|
||||
// + + + +
|
||||
// 1 + + + + + + + + 5 +
|
||||
// \ + \ +
|
||||
// \ + \ +
|
||||
// \+ \+
|
||||
// 3 + + + + + + + + 7
|
||||
//
|
||||
// World coordinate system: +y is up (aligned with gravity),
|
||||
// +z is toward the user, +x follows right hand rule.
|
||||
// The front face is defined as +z axis on xy plane.
|
||||
// The top face is defined as +y axis on xz plane.
|
||||
//
|
||||
|
||||
class Box : public Model {
|
||||
public:
|
||||
EIGEN_MAKE_ALIGNED_OPERATOR_NEW
|
||||
|
||||
explicit Box(const std::string& category);
|
||||
~Box() override = default;
|
||||
|
||||
bool InsideTest(const Vector3f& point, int check_axis) const;
|
||||
|
||||
const std::vector<Face>& GetFaces() const { return faces_; }
|
||||
const Face& GetFace(size_t face_id) const { return faces_[face_id]; }
|
||||
|
||||
const std::vector<std::array<int, 2>>& GetEdges() const { return edges_; }
|
||||
const std::array<int, 2>& GetEdge(size_t edge_id) const {
|
||||
return edges_[edge_id];
|
||||
}
|
||||
|
||||
// Returns the keypoints for the front face of the box.
|
||||
// The front face is defined as the face with the +z normal vector on the xy
// plane. In Box's c'tor, the front face is set to {2, 4, 8, 6}.
|
||||
const Face& GetFrontFace() const;
|
||||
|
||||
// Returns the keypoints for the top face of the box.
|
||||
// The top face is defined as the face with the +y normal vector on the xz
// plane. In Box's c'tor, the top face is set to {3, 7, 8, 4}.
|
||||
const Face& GetTopFace() const;
|
||||
|
||||
void Update() override;
|
||||
void Adjust(const std::vector<float>& variables) override;
|
||||
float* GetVertex(size_t vertex_id) override;
|
||||
const float* GetVertex(size_t vertex_id) const override;
|
||||
void Deserialize(const Object& obj) override;
|
||||
void Serialize(Object* obj) override;
|
||||
|
||||
// Computes the plane center and the normal vector for the plane the object
|
||||
// is sitting on, in the world coordinate system. The normal vector is roughly
|
||||
// aligned with gravity.
|
||||
std::pair<Vector3f, Vector3f> GetGroundPlane() const;
|
||||
|
||||
// Estimates the box's 9-dof parameters from the given vertices. Directly
// computes the scale of the box, then solves for orientation and translation.
// Expects a std::vector of size 9 of Eigen::Vector3f or mapped Vector3f.
// If mapping proto messages, we recommend using Map<const Vector3f>.
|
||||
// For example:
|
||||
//
|
||||
// using T = Map<const Vector3f>;
|
||||
// std::vector<T> vertices;
|
||||
// for (const auto& point : message) { // point is a repeated float message.
|
||||
// T p(point.data());
|
||||
// vertices.emplace_back(p);
|
||||
// }
|
||||
// box.Fit<T>(vertices);
|
||||
//
|
||||
// The points must be arranged as a 1 + 8 vector (the center keypoint followed
// by the 8 box vertices). This function overwrites the scale and
// transformation properties of the class.
|
||||
template <typename T = Eigen::Map<const Vector3f>>
|
||||
void Fit(const std::vector<T>& vertices);
|
||||
|
||||
private:
|
||||
std::vector<Face> faces_;
|
||||
std::vector<std::array<int, 2>> edges_;
|
||||
std::vector<Vector3f> bounding_box_;
|
||||
};
|
||||
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
|
mediapipe/graphs/object_detection_3d/calculators/box_util.cc
|
@ -0,0 +1,153 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
void ComputeBoundingRect(const std::vector<cv::Point2f>& points,
|
||||
mediapipe::TimedBoxProto* box) {
|
||||
CHECK(box != nullptr);
|
||||
float top = 1.0f;
|
||||
float bottom = 0.0f;
|
||||
float left = 1.0f;
|
||||
float right = 0.0f;
|
||||
for (const auto& point : points) {
|
||||
top = std::min(top, point.y);
|
||||
bottom = std::max(bottom, point.y);
|
||||
left = std::min(left, point.x);
|
||||
right = std::max(right, point.x);
|
||||
}
|
||||
box->set_top(top);
|
||||
box->set_bottom(bottom);
|
||||
box->set_left(left);
|
||||
box->set_right(right);
|
||||
// We currently compute only an axis-aligned bounding box. To compute a
// rotated bounding box we would need the original image aspect ratio, map back
// to the original image space, compute cv::convexHull, and then, for each edge
// of the hull, rotate according to the edge orientation and find the box.
|
||||
box->set_rotation(0.0f);
|
||||
}
|
||||
|
||||
float ComputeBoxIoU(const TimedBoxProto& box1, const TimedBoxProto& box2) {
|
||||
cv::Point2f box1_center((box1.left() + box1.right()) * 0.5f,
|
||||
(box1.top() + box1.bottom()) * 0.5f);
|
||||
cv::Size2f box1_size(box1.right() - box1.left(), box1.bottom() - box1.top());
|
||||
cv::RotatedRect rect1(box1_center, box1_size,
|
||||
-box1.rotation() * 180.0f / M_PI);
|
||||
cv::Point2f box2_center((box2.left() + box2.right()) * 0.5f,
|
||||
(box2.top() + box2.bottom()) * 0.5f);
|
||||
cv::Size2f box2_size(box2.right() - box2.left(), box2.bottom() - box2.top());
|
||||
cv::RotatedRect rect2(box2_center, box2_size,
|
||||
-box2.rotation() * 180.0f / M_PI);
|
||||
std::vector<cv::Point2f> intersections_unsorted;
|
||||
std::vector<cv::Point2f> intersections;
|
||||
cv::rotatedRectangleIntersection(rect1, rect2, intersections_unsorted);
|
||||
if (intersections_unsorted.size() < 3) {
|
||||
return 0.0f;
|
||||
}
|
||||
cv::convexHull(intersections_unsorted, intersections);
|
||||
|
||||
// We use the Shoelace formula to compute the area of polygons.
|
||||
float intersection_area = 0.0f;
|
||||
for (int i = 0; i < intersections.size(); ++i) {
|
||||
const auto& curr_pt = intersections[i];
|
||||
const int i_next = (i + 1) == intersections.size() ? 0 : (i + 1);
|
||||
const auto& next_pt = intersections[i_next];
|
||||
intersection_area += (curr_pt.x * next_pt.y - next_pt.x * curr_pt.y);
|
||||
}
|
||||
intersection_area = std::abs(intersection_area) * 0.5f;
|
||||
|
||||
// Compute union area
|
||||
const float union_area =
|
||||
rect1.size.area() + rect2.size.area() - intersection_area + 1e-5f;
|
||||
|
||||
const float iou = intersection_area / union_area;
|
||||
return iou;
|
||||
}
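// Note (added for clarity): the Shoelace formula used above computes the area
// of a simple polygon with vertices (x_0, y_0), ..., (x_{n-1}, y_{n-1}) as
//   A = 0.5 * |sum_{i=0}^{n-1} (x_i * y_{i+1} - x_{i+1} * y_i)|,
// with indices taken modulo n, matching the accumulation loop above.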
|
||||
|
||||
std::vector<cv::Point2f> ComputeBoxCorners(const TimedBoxProto& box,
|
||||
float width, float height) {
|
||||
// Rotate the 4 corners w.r.t. the center.
|
||||
const cv::Point2f center(0.5f * (box.left() + box.right()) * width,
|
||||
0.5f * (box.top() + box.bottom()) * height);
|
||||
const std::vector<cv::Point2f> corners{
|
||||
cv::Point2f(box.left() * width, box.top() * height),
|
||||
cv::Point2f(box.left() * width, box.bottom() * height),
|
||||
cv::Point2f(box.right() * width, box.bottom() * height),
|
||||
cv::Point2f(box.right() * width, box.top() * height)};
|
||||
|
||||
const float cos_a = std::cos(box.rotation());
|
||||
const float sin_a = std::sin(box.rotation());
|
||||
std::vector<cv::Point2f> transformed_corners(4);
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
// Scale and rotate w.r.t. center.
|
||||
const cv::Point2f rad = corners[k] - center;
|
||||
const cv::Point2f rot_rad(cos_a * rad.x - sin_a * rad.y,
|
||||
sin_a * rad.x + cos_a * rad.y);
|
||||
transformed_corners[k] = center + rot_rad;
|
||||
transformed_corners[k].x /= width;
|
||||
transformed_corners[k].y /= height;
|
||||
}
|
||||
return transformed_corners;
|
||||
}
|
||||
|
||||
cv::Mat PerspectiveTransformBetweenBoxes(const TimedBoxProto& src_box,
|
||||
const TimedBoxProto& dst_box,
|
||||
const float aspect_ratio) {
|
||||
std::vector<cv::Point2f> box1_corners =
|
||||
ComputeBoxCorners(src_box, /*width*/ aspect_ratio, /*height*/ 1.0f);
|
||||
std::vector<cv::Point2f> box2_corners =
|
||||
ComputeBoxCorners(dst_box, /*width*/ aspect_ratio, /*height*/ 1.0f);
|
||||
cv::Mat affine_transform = cv::getPerspectiveTransform(
|
||||
/*src*/ box1_corners, /*dst*/ box2_corners);
|
||||
cv::Mat output_affine;
|
||||
affine_transform.convertTo(output_affine, CV_32FC1);
|
||||
return output_affine;
|
||||
}
|
||||
|
||||
cv::Point2f MapPoint(const TimedBoxProto& src_box, const TimedBoxProto& dst_box,
|
||||
const cv::Point2f& src_point, float width, float height) {
|
||||
const cv::Point2f src_center(
|
||||
0.5f * (src_box.left() + src_box.right()) * width,
|
||||
0.5f * (src_box.top() + src_box.bottom()) * height);
|
||||
const cv::Point2f dst_center(
|
||||
0.5f * (dst_box.left() + dst_box.right()) * width,
|
||||
0.5f * (dst_box.top() + dst_box.bottom()) * height);
|
||||
const float scale_x =
|
||||
(dst_box.right() - dst_box.left()) / (src_box.right() - src_box.left());
|
||||
const float scale_y =
|
||||
(dst_box.bottom() - dst_box.top()) / (src_box.bottom() - src_box.top());
|
||||
const float rotation = dst_box.rotation() - src_box.rotation();
|
||||
const cv::Point2f rad =
|
||||
cv::Point2f(src_point.x * width, src_point.y * height) - src_center;
|
||||
const float rad_x = rad.x * scale_x;
|
||||
const float rad_y = rad.y * scale_y;
|
||||
const float cos_a = std::cos(rotation);
|
||||
const float sin_a = std::sin(rotation);
|
||||
const cv::Point2f rot_rad(cos_a * rad_x - sin_a * rad_y,
|
||||
sin_a * rad_x + cos_a * rad_y);
|
||||
const cv::Point2f dst_point_image = dst_center + rot_rad;
|
||||
const cv::Point2f dst_point(dst_point_image.x / width,
|
||||
dst_point_image.y / height);
|
||||
return dst_point;
|
||||
}
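// (Added note) MapPoint expresses src_point relative to src_box's center in
// pixel space, scales it by the ratio of the two boxes' sizes, rotates it by
// the relative rotation, re-anchors it at dst_box's center, and finally
// normalizes back by width and height.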
|
||||
|
||||
} // namespace mediapipe
|
50
mediapipe/graphs/object_detection_3d/calculators/box_util.h
Normal file
|
@ -0,0 +1,50 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_
|
||||
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// This function fills in the geometry of the TimedBoxProto. The id, timestamp, etc.
|
||||
// need to be set outside this function.
|
||||
void ComputeBoundingRect(const std::vector<cv::Point2f>& points,
|
||||
mediapipe::TimedBoxProto* box);
|
||||
|
||||
// This function computes the intersection over union between two boxes.
|
||||
float ComputeBoxIoU(const TimedBoxProto& box1, const TimedBoxProto& box2);
|
||||
|
||||
// Computes corners of the box.
|
||||
// width and height are the image width and height, which are typically
|
||||
// needed since the box is in normalized coordinates.
|
||||
std::vector<cv::Point2f> ComputeBoxCorners(const TimedBoxProto& box,
|
||||
float width, float height);
|
||||
|
||||
// Computes the perspective transform from box1 to box2.
|
||||
// The input argument aspect_ratio is width / height of the image.
|
||||
// The returned matrix should be a 3x3 matrix.
|
||||
cv::Mat PerspectiveTransformBetweenBoxes(const TimedBoxProto& src_box,
|
||||
const TimedBoxProto& dst_box,
|
||||
const float aspect_ratio);
|
||||
|
||||
// Maps a point according to the source and destination box locations.
|
||||
cv::Point2f MapPoint(const TimedBoxProto& src_box, const TimedBoxProto& dst_box,
|
||||
const cv::Point2f& src_point, float width, float height);
|
||||
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_
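The following is a minimal usage sketch of the helpers declared above, added here for illustration only (box_util.h and the OpenCV port headers are assumed included; the point values and image size are made up, and `points` is assumed to hold normalized 2D keypoints):

std::vector<cv::Point2f> points{cv::Point2f(0.2f, 0.3f), cv::Point2f(0.4f, 0.1f),
                                cv::Point2f(0.5f, 0.5f)};
mediapipe::TimedBoxProto box_a;
mediapipe::ComputeBoundingRect(points, &box_a);  // Axis-aligned box over points.
mediapipe::TimedBoxProto box_b = box_a;
box_b.set_rotation(0.5f);  // Hypothetical rotated copy, for comparison.
const float iou = mediapipe::ComputeBoxIoU(box_a, box_b);
// Map one keypoint from box_a's frame to box_b's frame in a 640x480 image.
const cv::Point2f mapped = mediapipe::MapPoint(box_a, box_b, points[0],
                                               /*width=*/640.0f,
                                               /*height=*/480.0f);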
|
|
@ -0,0 +1,123 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
|
||||
|
||||
#include "mediapipe/framework/port/gmock.h"
|
||||
#include "mediapipe/framework/port/gtest.h"
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
namespace {
|
||||
|
||||
TEST(BoxUtilTest, TestComputeBoundingRect) {
|
||||
std::vector<cv::Point2f> points{
|
||||
cv::Point2f(0.35f, 0.25f), cv::Point2f(0.3f, 0.3f),
|
||||
cv::Point2f(0.2f, 0.4f), cv::Point2f(0.3f, 0.1f),
|
||||
cv::Point2f(0.2f, 0.2f), cv::Point2f(0.5f, 0.3f),
|
||||
cv::Point2f(0.4f, 0.4f), cv::Point2f(0.5f, 0.1f),
|
||||
cv::Point2f(0.4f, 0.2f)};
|
||||
TimedBoxProto box;
|
||||
ComputeBoundingRect(points, &box);
|
||||
EXPECT_FLOAT_EQ(0.1f, box.top());
|
||||
EXPECT_FLOAT_EQ(0.4f, box.bottom());
|
||||
EXPECT_FLOAT_EQ(0.2f, box.left());
|
||||
EXPECT_FLOAT_EQ(0.5f, box.right());
|
||||
}
|
||||
|
||||
TEST(BoxUtilTest, TestComputeBoxIoU) {
|
||||
TimedBoxProto box1;
|
||||
box1.set_top(0.2f);
|
||||
box1.set_bottom(0.6f);
|
||||
box1.set_left(0.1f);
|
||||
box1.set_right(0.3f);
|
||||
box1.set_rotation(0.0f);
|
||||
TimedBoxProto box2 = box1;
|
||||
box2.set_rotation(/*pi/2*/ 1.570796f);
|
||||
const float box_area =
|
||||
(box1.bottom() - box1.top()) * (box1.right() - box1.left());
|
||||
const float box_intersection =
|
||||
(box1.right() - box1.left()) * (box1.right() - box1.left());
|
||||
const float expected_iou =
|
||||
box_intersection / (box_area * 2 - box_intersection);
|
||||
EXPECT_NEAR(expected_iou, ComputeBoxIoU(box1, box2), 3e-5f);
|
||||
|
||||
TimedBoxProto box3;
|
||||
box3.set_top(0.2f);
|
||||
box3.set_bottom(0.6f);
|
||||
box3.set_left(0.5f);
|
||||
box3.set_right(0.7f);
|
||||
EXPECT_NEAR(0.0f, ComputeBoxIoU(box1, box3), 3e-5f);
|
||||
}
|
||||
|
||||
TEST(BoxUtilTest, TestPerspectiveTransformBetweenBoxes) {
|
||||
TimedBoxProto box1;
|
||||
const float height = 4.0f;
|
||||
const float width = 3.0f;
|
||||
box1.set_top(1.0f / height);
|
||||
box1.set_bottom(2.0f / height);
|
||||
box1.set_left(1.0f / width);
|
||||
box1.set_right(2.0f / width);
|
||||
TimedBoxProto box2;
|
||||
box2.set_top(1.0f / height);
|
||||
box2.set_bottom(2.0f / height);
|
||||
box2.set_left(1.0f / width);
|
||||
box2.set_right(2.0f / width);
|
||||
box2.set_rotation(/*pi/4*/ -0.785398f);
|
||||
cv::Mat transform =
|
||||
PerspectiveTransformBetweenBoxes(box1, box2, width / height);
|
||||
const float kTolerance = 1e-5f;
|
||||
const cv::Vec3f original_position(1.5f / width, 1.0f / height, 1.0f);
|
||||
const cv::Mat transformed_position = transform * cv::Mat(original_position);
|
||||
EXPECT_NEAR(
|
||||
(1.5f - 0.5f * std::sqrt(2) / 2.0f) / width,
|
||||
transformed_position.at<float>(0) / transformed_position.at<float>(2),
|
||||
kTolerance);
|
||||
EXPECT_NEAR(
|
||||
(1.5f - 0.5f * std::sqrt(2) / 2.0f) / height,
|
||||
transformed_position.at<float>(1) / transformed_position.at<float>(2),
|
||||
kTolerance);
|
||||
}
|
||||
|
||||
TEST(BoxUtilTest, TestMapPoint) {
|
||||
const float height = 4.0f;
|
||||
const float width = 3.0f;
|
||||
TimedBoxProto box1;
|
||||
box1.set_top(1.0f / height);
|
||||
box1.set_bottom(2.0f / height);
|
||||
box1.set_left(1.0f / width);
|
||||
box1.set_right(2.0f / width);
|
||||
TimedBoxProto box2;
|
||||
box2.set_top(1.0f / height);
|
||||
box2.set_bottom(2.0f / height);
|
||||
box2.set_left(1.0f / width);
|
||||
box2.set_right(2.0f / width);
|
||||
box2.set_rotation(/*pi/4*/ -0.785398f);
|
||||
|
||||
cv::Point2f src_point1(1.2f / width, 1.4f / height);
|
||||
cv::Point2f src_point2(1.3f / width, 1.8f / height);
|
||||
const float distance1 = std::sqrt(0.1 * 0.1 + 0.4 * 0.4);
|
||||
cv::Point2f dst_point1 = MapPoint(box1, box2, src_point1, width, height);
|
||||
cv::Point2f dst_point2 = MapPoint(box1, box2, src_point2, width, height);
|
||||
const float distance2 =
|
||||
std::sqrt((dst_point1.x * width - dst_point2.x * width) *
|
||||
(dst_point1.x * width - dst_point2.x * width) +
|
||||
(dst_point1.y * height - dst_point2.y * height) *
|
||||
(dst_point1.y * height - dst_point2.y * height));
|
||||
EXPECT_NEAR(distance1, distance2, 1e-5f);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,47 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
message CameraParametersProto {
|
||||
// This number is non-negative; it represents the camera height above ground,
|
||||
// normalized by focal length.
|
||||
optional float height_above_ground = 1 [default = 100.0];
|
||||
// Width of image in portrait orientation normalized by focal length
|
||||
optional float portrait_width = 2 [default = 1.0103];
|
||||
// Height of image in portrait orientation normalized by focal length
|
||||
optional float portrait_height = 3 [default = 1.3435];
|
||||
enum ImageOrientation {
|
||||
PORTRAIT_ORIENTATION = 0;
|
||||
LANDSCAPE_ORIENTATION = 1;
|
||||
}
|
||||
// The input image orientation
|
||||
optional ImageOrientation image_orientation = 4
|
||||
[default = PORTRAIT_ORIENTATION];
|
||||
|
||||
// This defines the projection method from 2D screen to 3D.
|
||||
enum ProjectionMode {
|
||||
UNSPECIFIED = 0;
|
||||
// Projects 2D point to ground plane (horizontal plane).
|
||||
GROUND_PLANE = 1;
|
||||
// Projects 2D point to sphere.
|
||||
SPHERE = 2;
|
||||
}
|
||||
optional ProjectionMode projection_mode = 5 [default = GROUND_PLANE];
|
||||
// Radius of sphere when using the SPHERE projection mode above.
|
||||
// The value is normalized by focal length.
|
||||
optional float projection_sphere_radius = 6 [default = 100.0];
|
||||
}
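For illustration, the message above could be populated from C++ as in the following sketch (the values are made up; the field and enum names are taken from the definition above):

mediapipe::CameraParametersProto params;
params.set_height_above_ground(120.0f);  // Hypothetical value, in focal-length units.
params.set_portrait_width(1.0103f);
params.set_portrait_height(1.3435f);
params.set_image_orientation(
    mediapipe::CameraParametersProto::LANDSCAPE_ORIENTATION);
params.set_projection_mode(mediapipe::CameraParametersProto::GROUND_PLANE);
params.set_projection_sphere_radius(100.0f);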
|
257
mediapipe/graphs/object_detection_3d/calculators/decoder.cc
Normal file
|
@ -0,0 +1,257 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "mediapipe/framework/port/canonical_errors.h"
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
constexpr int Decoder::kNumOffsetmaps = 16;
|
||||
|
||||
namespace {
|
||||
void SetPoint3d(float x, float y, float z, Point3D* point_3d) {
|
||||
point_3d->set_x(x);
|
||||
point_3d->set_y(y);
|
||||
point_3d->set_z(z);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
FrameAnnotation Decoder::DecodeBoundingBoxKeypoints(
|
||||
const cv::Mat& heatmap, const cv::Mat& offsetmap) const {
|
||||
CHECK_EQ(1, heatmap.channels());
|
||||
CHECK_EQ(kNumOffsetmaps, offsetmap.channels());
|
||||
CHECK_EQ(heatmap.cols, offsetmap.cols);
|
||||
CHECK_EQ(heatmap.rows, offsetmap.rows);
|
||||
|
||||
const float offset_scale = std::min(offsetmap.cols, offsetmap.rows);
|
||||
const std::vector<cv::Point> center_points = ExtractCenterKeypoints(heatmap);
|
||||
std::vector<BeliefBox> boxes;
|
||||
for (const auto& center_point : center_points) {
|
||||
BeliefBox box;
|
||||
box.box_2d.emplace_back(center_point.x, center_point.y);
|
||||
const int center_x = static_cast<int>(std::round(center_point.x));
|
||||
const int center_y = static_cast<int>(std::round(center_point.y));
|
||||
box.belief = heatmap.at<float>(center_y, center_x);
|
||||
if (config_.voting_radius() > 1) {
|
||||
DecodeByVoting(heatmap, offsetmap, center_x, center_y, offset_scale,
|
||||
offset_scale, &box);
|
||||
} else {
|
||||
DecodeByPeak(offsetmap, center_x, center_y, offset_scale, offset_scale,
|
||||
&box);
|
||||
}
|
||||
if (IsNewBox(&boxes, &box)) {
|
||||
boxes.push_back(std::move(box));
|
||||
}
|
||||
}
|
||||
|
||||
const float x_scale = 1.0f / offsetmap.cols;
|
||||
const float y_scale = 1.0f / offsetmap.rows;
|
||||
FrameAnnotation frame_annotations;
|
||||
for (const auto& box : boxes) {
|
||||
auto* object = frame_annotations.add_annotations();
|
||||
for (const auto& point : box.box_2d) {
|
||||
auto* point2d = object->add_keypoints()->mutable_point_2d();
|
||||
point2d->set_x(point.first * x_scale);
|
||||
point2d->set_y(point.second * y_scale);
|
||||
}
|
||||
}
|
||||
return frame_annotations;
|
||||
}
|
||||
|
||||
void Decoder::DecodeByPeak(const cv::Mat& offsetmap, int center_x, int center_y,
|
||||
float offset_scale_x, float offset_scale_y,
|
||||
BeliefBox* box) const {
|
||||
const auto& offset = offsetmap.at<cv::Vec<float, kNumOffsetmaps>>(
|
||||
/*row*/ center_y, /*col*/ center_x);
|
||||
for (int i = 0; i < kNumOffsetmaps / 2; ++i) {
|
||||
const float x_offset = offset[2 * i] * offset_scale_x;
|
||||
const float y_offset = offset[2 * i + 1] * offset_scale_y;
|
||||
box->box_2d.emplace_back(center_x + x_offset, center_y + y_offset);
|
||||
}
|
||||
}
|
||||
|
||||
void Decoder::DecodeByVoting(const cv::Mat& heatmap, const cv::Mat& offsetmap,
|
||||
int center_x, int center_y, float offset_scale_x,
|
||||
float offset_scale_y, BeliefBox* box) const {
|
||||
// Votes at the center.
|
||||
const auto& center_offset = offsetmap.at<cv::Vec<float, kNumOffsetmaps>>(
|
||||
/*row*/ center_y, /*col*/ center_x);
|
||||
std::vector<float> center_votes(kNumOffsetmaps, 0.f);
|
||||
for (int i = 0; i < kNumOffsetmaps / 2; ++i) {
|
||||
center_votes[2 * i] = center_x + center_offset[2 * i] * offset_scale_x;
|
||||
center_votes[2 * i + 1] =
|
||||
center_y + center_offset[2 * i + 1] * offset_scale_y;
|
||||
}
|
||||
|
||||
// Find voting window.
|
||||
int x_min = std::max(0, center_x - config_.voting_radius());
|
||||
int y_min = std::max(0, center_y - config_.voting_radius());
|
||||
int width = std::min(heatmap.cols - x_min, config_.voting_radius() * 2 + 1);
|
||||
int height = std::min(heatmap.rows - y_min, config_.voting_radius() * 2 + 1);
|
||||
cv::Rect rect(x_min, y_min, width, height);
|
||||
cv::Mat heat = heatmap(rect);
|
||||
cv::Mat offset = offsetmap(rect);
|
||||
|
||||
for (int i = 0; i < kNumOffsetmaps / 2; ++i) {
|
||||
float x_sum = 0.f;
|
||||
float y_sum = 0.f;
|
||||
float votes = 0.f;
|
||||
for (int r = 0; r < heat.rows; ++r) {
|
||||
for (int c = 0; c < heat.cols; ++c) {
|
||||
const float belief = heat.at<float>(r, c);
|
||||
if (belief < config_.voting_threshold()) {
|
||||
continue;
|
||||
}
|
||||
float offset_x =
|
||||
offset.at<cv::Vec<float, kNumOffsetmaps>>(r, c)[2 * i] *
|
||||
offset_scale_x;
|
||||
float offset_y =
|
||||
offset.at<cv::Vec<float, kNumOffsetmaps>>(r, c)[2 * i + 1] *
|
||||
offset_scale_y;
|
||||
float vote_x = c + rect.x + offset_x;
|
||||
float vote_y = r + rect.y + offset_y;
|
||||
float x_diff = std::abs(vote_x - center_votes[2 * i]);
|
||||
float y_diff = std::abs(vote_y - center_votes[2 * i + 1]);
|
||||
if (x_diff > config_.voting_allowance() ||
|
||||
y_diff > config_.voting_allowance()) {
|
||||
continue;
|
||||
}
|
||||
x_sum += vote_x * belief;
|
||||
y_sum += vote_y * belief;
|
||||
votes += belief;
|
||||
}
|
||||
}
|
||||
box->box_2d.emplace_back(x_sum / votes, y_sum / votes);
|
||||
}
|
||||
}
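// (Added summary) For each of the 8 vertices, every pixel in the voting window
// whose belief exceeds voting_threshold casts a belief-weighted vote for the
// vertex location (the pixel position plus its predicted offset). Votes farther
// than voting_allowance from the center pixel's own prediction are discarded,
// and the decoded keypoint is the weighted average of the accepted votes.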
|
||||
|
||||
bool Decoder::IsNewBox(std::vector<BeliefBox>* boxes, BeliefBox* box) const {
|
||||
for (auto& b : *boxes) {
|
||||
if (IsIdentical(b, *box)) {
|
||||
if (b.belief < box->belief) {
|
||||
std::swap(b, *box);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Decoder::IsIdentical(const BeliefBox& box_1,
|
||||
const BeliefBox& box_2) const {
|
||||
// Skip the center point.
|
||||
for (int i = 1; i < box_1.box_2d.size(); ++i) {
|
||||
const float x_diff =
|
||||
std::abs(box_1.box_2d[i].first - box_2.box_2d[i].first);
|
||||
const float y_diff =
|
||||
std::abs(box_1.box_2d[i].second - box_2.box_2d[i].second);
|
||||
if (x_diff > config_.voting_allowance() ||
|
||||
y_diff > config_.voting_allowance()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<cv::Point> Decoder::ExtractCenterKeypoints(
|
||||
const cv::Mat& center_heatmap) const {
|
||||
cv::Mat max_filtered_heatmap(center_heatmap.rows, center_heatmap.cols,
|
||||
center_heatmap.type());
|
||||
const int kernel_size =
|
||||
static_cast<int>(config_.local_max_distance() * 2 + 1 + 0.5f);
|
||||
const cv::Size morph_size(kernel_size, kernel_size);
|
||||
cv::dilate(center_heatmap, max_filtered_heatmap,
|
||||
cv::getStructuringElement(cv::MORPH_RECT, morph_size));
|
||||
cv::Mat peak_map;
|
||||
cv::bitwise_and((center_heatmap >= max_filtered_heatmap),
|
||||
(center_heatmap >= config_.heatmap_threshold()), peak_map);
|
||||
std::vector<cv::Point> locations; // output, locations of non-zero pixels
|
||||
cv::findNonZero(peak_map, locations);
|
||||
return locations;
|
||||
}
|
||||
|
||||
absl::Status Decoder::Lift2DTo3D(
|
||||
const Eigen::Matrix<float, 4, 4, Eigen::RowMajor>& projection_matrix,
|
||||
bool portrait, FrameAnnotation* estimated_box) const {
|
||||
CHECK(estimated_box != nullptr);
|
||||
const float fx = projection_matrix(0, 0);
|
||||
const float fy = projection_matrix(1, 1);
|
||||
const float cx = projection_matrix(0, 2);
|
||||
const float cy = projection_matrix(1, 2);
|
||||
for (auto& annotation : *estimated_box->mutable_annotations()) {
|
||||
Eigen::Matrix<float, 16, 12, Eigen::RowMajor> m =
|
||||
Eigen::Matrix<float, 16, 12, Eigen::RowMajor>::Zero(16, 12);
|
||||
CHECK_EQ(9, annotation.keypoints_size());
|
||||
float u, v;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
const auto& keypoint2d = annotation.keypoints(i + 1).point_2d();
|
||||
if (portrait) {
|
||||
// Swap x and y, given that our image is in portrait orientation.
|
||||
u = keypoint2d.y() * 2 - 1;
|
||||
v = keypoint2d.x() * 2 - 1;
|
||||
} else {
|
||||
u = keypoint2d.x() * 2 - 1;
|
||||
v = 1 - keypoint2d.y() * 2; // (1 - keypoint2d.y()) * 2 - 1
|
||||
}
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
// For each of the 4 control points, formulate two rows of the
|
||||
// m matrix (two equations).
|
||||
const float control_alpha = epnp_alpha_(i, j);
|
||||
m(i * 2, j * 3) = fx * control_alpha;
|
||||
m(i * 2, j * 3 + 2) = (cx + u) * control_alpha;
|
||||
m(i * 2 + 1, j * 3 + 1) = fy * control_alpha;
|
||||
m(i * 2 + 1, j * 3 + 2) = (cy + v) * control_alpha;
|
||||
}
|
||||
}
|
||||
// This is a self-adjoint matrix. Use SelfAdjointEigenSolver for a fast
|
||||
// and stable solution.
|
||||
Eigen::Matrix<float, 12, 12, Eigen::RowMajor> mt_m = m.transpose() * m;
|
||||
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<float, 12, 12, Eigen::RowMajor>>
|
||||
eigen_solver(mt_m);
|
||||
if (eigen_solver.info() != Eigen::Success) {
|
||||
return absl::AbortedError("Eigen decomposition failed.");
|
||||
}
|
||||
CHECK_EQ(12, eigen_solver.eigenvalues().size());
|
||||
// Eigenvalues are sorted in increasing order for SelfAdjointEigenSolver
|
||||
// only! If you use other Eigen solvers, it's not guaranteed to be in
|
||||
// increasing order. Here, we just take the eigenvector corresponding
|
||||
// to the first/smallest eigenvalue, since we used SelfAdjointEigenSolver.
|
||||
Eigen::VectorXf eigen_vec = eigen_solver.eigenvectors().col(0);
|
||||
Eigen::Map<Eigen::Matrix<float, 4, 3, Eigen::RowMajor>> control_matrix(
|
||||
eigen_vec.data());
|
||||
if (control_matrix(0, 2) > 0) {
|
||||
control_matrix = -control_matrix;
|
||||
}
|
||||
// First set the center keypoint.
|
||||
SetPoint3d(control_matrix(0, 0), control_matrix(0, 1), control_matrix(0, 2),
|
||||
annotation.mutable_keypoints(0)->mutable_point_3d());
|
||||
// Then set the 8 vertices.
|
||||
Eigen::Matrix<float, 8, 3, Eigen::RowMajor> vertices =
|
||||
epnp_alpha_ * control_matrix;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
SetPoint3d(vertices(i, 0), vertices(i, 1), vertices(i, 2),
|
||||
annotation.mutable_keypoints(i + 1)->mutable_point_3d());
|
||||
}
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
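// (Added note) Each observed vertex projection (u_i, v_i) contributes the two
// homogeneous linear equations encoded in rows 2i and 2i+1 of m above:
//   fx * sum_j(alpha_ij * c_j.x) + (cx + u_i) * sum_j(alpha_ij * c_j.z) = 0
//   fy * sum_j(alpha_ij * c_j.y) + (cy + v_i) * sum_j(alpha_ij * c_j.z) = 0
// where c_1..c_4 are the unknown control points. The least-squares solution is
// the eigenvector of m^T * m with the smallest eigenvalue, as computed above.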
|
||||
|
||||
} // namespace mediapipe
|
109
mediapipe/graphs/object_detection_3d/calculators/decoder.h
Normal file
|
@ -0,0 +1,109 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "absl/status/status.h"
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Decodes a 3D bounding box from heatmaps and offset maps. In the future,
|
||||
// if we want to develop a decoder for generic skeletons, we will need to
|
||||
// generalize this class and add a few child classes.
|
||||
class Decoder {
|
||||
public:
|
||||
static const int kNumOffsetmaps;
|
||||
|
||||
explicit Decoder(const BeliefDecoderConfig& config) : config_(config) {
|
||||
epnp_alpha_ << 4.0f, -1.0f, -1.0f, -1.0f, 2.0f, -1.0f, -1.0f, 1.0f, 2.0f,
|
||||
-1.0f, 1.0f, -1.0f, 0.0f, -1.0f, 1.0f, 1.0f, 2.0f, 1.0f, -1.0f, -1.0f,
|
||||
0.0f, 1.0f, -1.0f, 1.0f, 0.0f, 1.0f, 1.0f, -1.0f, -2.0f, 1.0f, 1.0f,
|
||||
1.0f;
|
||||
}
|
||||
|
||||
// Decodes bounding boxes from predicted heatmap and offset maps.
|
||||
// Input:
|
||||
// heatmap: a single channel cv::Mat representing center point heatmap
|
||||
// offsetmap: a 16 channel cv::Mat representing the 16 offset maps
|
||||
// (2 for each of the 8 vertices)
|
||||
// Output:
|
||||
// Outputs the 3D bounding boxes' 2D vertices, represented by the 'point_2d' field
|
||||
// in each 'keypoints' field of object annotations.
|
||||
FrameAnnotation DecodeBoundingBoxKeypoints(const cv::Mat& heatmap,
|
||||
const cv::Mat& offsetmap) const;
|
||||
|
||||
// Lifts the estimated 2D projections of bounding box vertices to 3D.
|
||||
// This function uses the EPnP approach described in this paper:
|
||||
// https://icwww.epfl.ch/~lepetit/papers/lepetit_ijcv08.pdf .
|
||||
// Input:
|
||||
// projection_matrix: the projection matrix from 3D coordinate
|
||||
// to screen coordinate.
|
||||
// The 2D screen coordinate is defined as: u is along the long
|
||||
// edge of the device, pointing down; v is along the short edge
|
||||
// of the device, pointing right.
|
||||
// portrait: a boolean variable indicating whether our images are
|
||||
// obtained in portrait orientation or not.
|
||||
// estimated_box: annotation with point_2d field populated with
|
||||
// 2d vertices.
|
||||
// Output:
|
||||
// estimated_box: annotation with point_3d field populated with
|
||||
// 3d vertices.
|
||||
absl::Status Lift2DTo3D(
|
||||
const Eigen::Matrix<float, 4, 4, Eigen::RowMajor>& projection_matrix,
|
||||
bool portrait, FrameAnnotation* estimated_box) const;
|
||||
|
||||
private:
|
||||
struct BeliefBox {
|
||||
float belief;
|
||||
std::vector<std::pair<float, float>> box_2d;
|
||||
};
|
||||
|
||||
std::vector<cv::Point> ExtractCenterKeypoints(
|
||||
const cv::Mat& center_heatmap) const;
|
||||
|
||||
// Decodes 2D keypoints at the peak point.
|
||||
void DecodeByPeak(const cv::Mat& offsetmap, int center_x, int center_y,
|
||||
float offset_scale_x, float offset_scale_y,
|
||||
BeliefBox* box) const;
|
||||
|
||||
// Decodes 2D keypoints by voting around the peak.
|
||||
void DecodeByVoting(const cv::Mat& heatmap, const cv::Mat& offsetmap,
|
||||
int center_x, int center_y, float offset_scale_x,
|
||||
float offset_scale_y, BeliefBox* box) const;
|
||||
|
||||
// Returns true if it is a new box. Otherwise, it may replace an existing box
|
||||
// if the new box's belief is higher.
|
||||
bool IsNewBox(std::vector<BeliefBox>* boxes, BeliefBox* box) const;
|
||||
|
||||
// Returns true if the two boxes are identical.
|
||||
bool IsIdentical(const BeliefBox& box_1, const BeliefBox& box_2) const;
|
||||
|
||||
BeliefDecoderConfig config_;
|
||||
// Following equation (1) in this paper
|
||||
// https://icwww.epfl.ch/~lepetit/papers/lepetit_ijcv08.pdf,
|
||||
// this variable denotes the coefficients for the 4 control points
|
||||
// for each of the 8 3D box vertices.
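// (Added note) Concretely, each of the 8 box vertices is expressed as
//   vertex_i = sum_{j=1}^{4} epnp_alpha_(i, j) * c_j,
// where c_1..c_4 are the control points solved for in Lift2DTo3D.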
|
||||
Eigen::Matrix<float, 8, 4, Eigen::RowMajor> epnp_alpha_;
|
||||
};
|
||||
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_
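A minimal usage sketch of the decoder, added for illustration only (`heatmap` and `offsetmap` are assumed to be cv::Mat instances already extracted from the model output tensors, and the projection matrix is assumed to come from the camera intrinsics):

mediapipe::BeliefDecoderConfig config;  // Thresholds/radii assumed set elsewhere.
mediapipe::Decoder decoder(config);
mediapipe::FrameAnnotation annotations =
    decoder.DecodeBoundingBoxKeypoints(heatmap, offsetmap);
Eigen::Matrix<float, 4, 4, Eigen::RowMajor> projection;  // Camera projection.
const absl::Status status =
    decoder.Lift2DTo3D(projection, /*portrait=*/true, &annotations);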
|
|
@ -0,0 +1,115 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "absl/memory/memory.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace {
|
||||
constexpr char kInputStreamTag[] = "FRAME_ANNOTATION";
|
||||
constexpr char kOutputStreamTag[] = "BOXES";
|
||||
} // namespace
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Converts FrameAnnotation 3d bounding box detections to TimedBoxProtoList
|
||||
// 2d bounding boxes.
|
||||
//
|
||||
// Input:
|
||||
// FRAME_ANNOTATION - 3d bounding box annotation.
|
||||
// Output:
|
||||
// BOXES - 2d bounding box enclosing the projection of 3d box.
|
||||
//
|
||||
// Usage example:
|
||||
// node {
|
||||
// calculator: "FrameAnnotationToTimedBoxListCalculator"
|
||||
// input_stream: "FRAME_ANNOTATION:frame_annotation"
|
||||
// output_stream: "BOXES:boxes"
|
||||
// }
|
||||
class FrameAnnotationToTimedBoxListCalculator : public CalculatorBase {
|
||||
public:
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Close(CalculatorContext* cc) override;
|
||||
};
|
||||
REGISTER_CALCULATOR(FrameAnnotationToTimedBoxListCalculator);
|
||||
|
||||
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
RET_CHECK(!cc->Inputs().GetTags().empty());
|
||||
RET_CHECK(!cc->Outputs().GetTags().empty());
|
||||
|
||||
if (cc->Inputs().HasTag(kInputStreamTag)) {
|
||||
cc->Inputs().Tag(kInputStreamTag).Set<FrameAnnotation>();
|
||||
}
|
||||
|
||||
if (cc->Outputs().HasTag(kOutputStreamTag)) {
|
||||
cc->Outputs().Tag(kOutputStreamTag).Set<TimedBoxProtoList>();
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
if (cc->Inputs().HasTag(kInputStreamTag) &&
|
||||
!cc->Inputs().Tag(kInputStreamTag).IsEmpty()) {
|
||||
const auto& frame_annotation =
|
||||
cc->Inputs().Tag(kInputStreamTag).Get<FrameAnnotation>();
|
||||
auto output_objects = absl::make_unique<TimedBoxProtoList>();
|
||||
for (const auto& annotation : frame_annotation.annotations()) {
|
||||
std::vector<cv::Point2f> key_points;
|
||||
for (const auto& keypoint : annotation.keypoints()) {
|
||||
key_points.push_back(
|
||||
cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()));
|
||||
}
|
||||
TimedBoxProto* added_box = output_objects->add_box();
|
||||
ComputeBoundingRect(key_points, added_box);
|
||||
added_box->set_id(annotation.object_id());
|
||||
const int64 time_msec =
|
||||
static_cast<int64>(std::round(frame_annotation.timestamp() / 1000));
|
||||
added_box->set_time_msec(time_msec);
|
||||
}
|
||||
|
||||
// Output
|
||||
if (cc->Outputs().HasTag(kOutputStreamTag)) {
|
||||
cc->Outputs()
|
||||
.Tag(kOutputStreamTag)
|
||||
.Add(output_objects.release(), cc->InputTimestamp());
|
||||
}
|
||||
}
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Close(
|
||||
CalculatorContext* cc) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
|
||||
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
void FrameAnnotationTracker::AddDetectionResult(
|
||||
const FrameAnnotation& frame_annotation) {
|
||||
const int64 time_us =
|
||||
static_cast<int64>(std::round(frame_annotation.timestamp()));
|
||||
for (const auto& object_annotation : frame_annotation.annotations()) {
|
||||
detected_objects_[time_us + object_annotation.object_id()] =
|
||||
object_annotation;
|
||||
}
|
||||
}
|
||||
|
||||
FrameAnnotation FrameAnnotationTracker::ConsolidateTrackingResult(
|
||||
const TimedBoxProtoList& tracked_boxes,
|
||||
absl::flat_hash_set<int>* cancel_object_ids) {
|
||||
CHECK(cancel_object_ids != nullptr);
|
||||
FrameAnnotation frame_annotation;
|
||||
std::vector<int64> keys_to_be_deleted;
|
||||
for (const auto& detected_obj : detected_objects_) {
|
||||
const int object_id = detected_obj.second.object_id();
|
||||
if (cancel_object_ids->contains(object_id)) {
|
||||
// Remember duplicated detections' keys.
|
||||
keys_to_be_deleted.push_back(detected_obj.first);
|
||||
continue;
|
||||
}
|
||||
TimedBoxProto ref_box;
|
||||
for (const auto& box : tracked_boxes.box()) {
|
||||
if (box.id() == object_id) {
|
||||
ref_box = box;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ref_box.has_id() || ref_box.id() < 0) {
|
||||
LOG(ERROR) << "Can't find matching tracked box for object id: "
|
||||
<< object_id << ". Likely lost tracking of it.";
|
||||
keys_to_be_deleted.push_back(detected_obj.first);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find duplicated boxes
|
||||
for (const auto& box : tracked_boxes.box()) {
|
||||
if (box.id() != object_id) {
|
||||
if (ComputeBoxIoU(ref_box, box) > iou_threshold_) {
|
||||
cancel_object_ids->insert(box.id());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Map ObjectAnnotation from detection to tracked time.
|
||||
// First, gather all keypoints from source detection.
|
||||
std::vector<cv::Point2f> key_points;
|
||||
for (const auto& keypoint : detected_obj.second.keypoints()) {
|
||||
key_points.push_back(
|
||||
cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()));
|
||||
}
|
||||
// Second, find source box.
|
||||
TimedBoxProto src_box;
|
||||
ComputeBoundingRect(key_points, &src_box);
|
||||
ObjectAnnotation* tracked_obj = frame_annotation.add_annotations();
|
||||
tracked_obj->set_object_id(ref_box.id());
|
||||
// Finally, map all keypoints in the source detection to tracked location.
|
||||
for (const auto& keypoint : detected_obj.second.keypoints()) {
|
||||
cv::Point2f dst = MapPoint(
|
||||
src_box, ref_box,
|
||||
cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()),
|
||||
img_width_, img_height_);
|
||||
auto* dst_point = tracked_obj->add_keypoints()->mutable_point_2d();
|
||||
dst_point->set_x(dst.x);
|
||||
dst_point->set_y(dst.y);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& key : keys_to_be_deleted) {
|
||||
detected_objects_.erase(key);
|
||||
}
|
||||
|
||||
return frame_annotation;
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,62 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "absl/container/btree_map.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "mediapipe/framework/port/integral_types.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
class FrameAnnotationTracker {
|
||||
public:
|
||||
// If two bounding boxes have IoU over iou_threshold, then we consider them
|
||||
// to describe the same object.
|
||||
FrameAnnotationTracker(float iou_threshold, float img_width, float img_height)
|
||||
: iou_threshold_(iou_threshold),
|
||||
img_width_(img_width),
|
||||
img_height_(img_height) {}
|
||||
|
||||
// Adds detection results from an external detector.
|
||||
void AddDetectionResult(const FrameAnnotation& frame_annotation);
|
||||
|
||||
// Consolidates the tracking result from an external tracker, associates it with
|
||||
// the detection result by the object id, and produces the corresponding
|
||||
// result in FrameAnnotation. When there are duplicates, output the ids that
|
||||
// need to be cancelled in cancel_object_ids.
|
||||
// Note that the returned FrameAnnotation is missing its timestamp; the caller needs to fill
|
||||
// in that field.
|
||||
FrameAnnotation ConsolidateTrackingResult(
|
||||
const TimedBoxProtoList& tracked_boxes,
|
||||
absl::flat_hash_set<int>* cancel_object_ids);
|
||||
|
||||
private:
|
||||
float iou_threshold_;
|
||||
float img_width_;
|
||||
float img_height_;
|
||||
// Cached detection results over time.
|
||||
// Key is timestamp_us + object_id.
|
||||
absl::btree_map<int64, ObjectAnnotation, std::greater<int64>>
|
||||
detected_objects_;
|
||||
};
|
||||
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_
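A minimal usage sketch, added for illustration only (`detections` and `tracked_boxes` are assumed to be produced by the detector and the 2D box tracker, respectively, and `current_timestamp_us` is a placeholder):

mediapipe::FrameAnnotationTracker tracker(/*iou_threshold=*/0.5f,
                                          /*img_width=*/640.0f,
                                          /*img_height=*/480.0f);
tracker.AddDetectionResult(detections);
absl::flat_hash_set<int> cancel_object_ids;
mediapipe::FrameAnnotation tracked =
    tracker.ConsolidateTrackingResult(tracked_boxes, &cancel_object_ids);
tracked.set_timestamp(current_timestamp_us);  // Caller must set the timestamp.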
|
|
@ -0,0 +1,137 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.pb.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace {
|
||||
constexpr char kInputFrameAnnotationTag[] = "FRAME_ANNOTATION";
|
||||
constexpr char kInputTrackedBoxesTag[] = "TRACKED_BOXES";
|
||||
constexpr char kOutputTrackedFrameAnnotationTag[] = "TRACKED_FRAME_ANNOTATION";
|
||||
constexpr char kOutputCancelObjectIdTag[] = "CANCEL_OBJECT_ID";
|
||||
} // namespace
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Tracks frame annotations seeded/updated by FRAME_ANNOTATION input_stream.
|
||||
// When using this calculator, make sure FRAME_ANNOTATION and TRACKED_BOXES
|
||||
// are in different sync sets.
|
||||
//
|
||||
// Input:
|
||||
// FRAME_ANNOTATION - frame annotation.
|
||||
// TRACKED_BOXES - 2d box tracking result
|
||||
// Output:
|
||||
// TRACKED_FRAME_ANNOTATION - annotation inferred from 2d tracking result.
|
||||
// CANCEL_OBJECT_ID - object id that needs to be cancelled from the tracker.
|
||||
//
|
||||
// Usage example:
|
||||
// node {
|
||||
// calculator: "FrameAnnotationTrackerCalculator"
|
||||
// input_stream: "FRAME_ANNOTATION:frame_annotation"
|
||||
// input_stream: "TRACKED_BOXES:tracked_boxes"
|
||||
// output_stream: "TRACKED_FRAME_ANNOTATION:tracked_frame_annotation"
|
||||
// output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
|
||||
// }
|
||||
class FrameAnnotationTrackerCalculator : public CalculatorBase {
|
||||
public:
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Close(CalculatorContext* cc) override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<FrameAnnotationTracker> frame_annotation_tracker_;
|
||||
};
|
||||
REGISTER_CALCULATOR(FrameAnnotationTrackerCalculator);
|
||||
|
||||
::mediapipe::Status FrameAnnotationTrackerCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
RET_CHECK(!cc->Inputs().GetTags().empty());
|
||||
RET_CHECK(!cc->Outputs().GetTags().empty());
|
||||
|
||||
if (cc->Inputs().HasTag(kInputFrameAnnotationTag)) {
|
||||
cc->Inputs().Tag(kInputFrameAnnotationTag).Set<FrameAnnotation>();
|
||||
}
|
||||
if (cc->Inputs().HasTag(kInputTrackedBoxesTag)) {
|
||||
cc->Inputs().Tag(kInputTrackedBoxesTag).Set<TimedBoxProtoList>();
|
||||
}
|
||||
if (cc->Outputs().HasTag(kOutputTrackedFrameAnnotationTag)) {
|
||||
cc->Outputs().Tag(kOutputTrackedFrameAnnotationTag).Set<FrameAnnotation>();
|
||||
}
|
||||
if (cc->Outputs().HasTag(kOutputCancelObjectIdTag)) {
|
||||
cc->Outputs().Tag(kOutputCancelObjectIdTag).Set<int>();
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status FrameAnnotationTrackerCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
const auto& options = cc->Options<FrameAnnotationTrackerCalculatorOptions>();
|
||||
frame_annotation_tracker_ = absl::make_unique<FrameAnnotationTracker>(
|
||||
options.iou_threshold(), options.img_width(), options.img_height());
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status FrameAnnotationTrackerCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
if (cc->Inputs().HasTag(kInputFrameAnnotationTag) &&
|
||||
!cc->Inputs().Tag(kInputFrameAnnotationTag).IsEmpty()) {
|
||||
frame_annotation_tracker_->AddDetectionResult(
|
||||
cc->Inputs().Tag(kInputFrameAnnotationTag).Get<FrameAnnotation>());
|
||||
}
|
||||
if (cc->Inputs().HasTag(kInputTrackedBoxesTag) &&
|
||||
!cc->Inputs().Tag(kInputTrackedBoxesTag).IsEmpty() &&
|
||||
cc->Outputs().HasTag(kOutputTrackedFrameAnnotationTag)) {
|
||||
absl::flat_hash_set<int> cancel_object_ids;
|
||||
auto output_frame_annotation = absl::make_unique<FrameAnnotation>();
|
||||
*output_frame_annotation =
|
||||
frame_annotation_tracker_->ConsolidateTrackingResult(
|
||||
cc->Inputs().Tag(kInputTrackedBoxesTag).Get<TimedBoxProtoList>(),
|
||||
&cancel_object_ids);
|
||||
output_frame_annotation->set_timestamp(cc->InputTimestamp().Microseconds());
|
||||
|
||||
cc->Outputs()
|
||||
.Tag(kOutputTrackedFrameAnnotationTag)
|
||||
.Add(output_frame_annotation.release(), cc->InputTimestamp());
|
||||
|
||||
if (cc->Outputs().HasTag(kOutputCancelObjectIdTag)) {
|
||||
auto packet_timestamp = cc->InputTimestamp();
|
||||
for (const auto& id : cancel_object_ids) {
|
||||
// The timestamp is incremented (by 1 us) because currently the box
|
||||
// tracker calculator only accepts one cancel object ID for any given
|
||||
// timestamp.
|
||||
cc->Outputs()
|
||||
.Tag(kOutputCancelObjectIdTag)
|
||||
.AddPacket(mediapipe::MakePacket<int>(id).At(packet_timestamp++));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status FrameAnnotationTrackerCalculator::Close(
|
||||
CalculatorContext* cc) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,36 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// The options proto for the FrameAnnotationTrackerCalculator.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
|
||||
message FrameAnnotationTrackerCalculatorOptions {
|
||||
extend CalculatorOptions {
|
||||
optional FrameAnnotationTrackerCalculatorOptions ext = 291291253;
|
||||
}
|
||||
|
||||
// The threshold on intersection-over-union (IoU). We consider
|
||||
// boxes with IoU larger than this threshold to be duplicates.
|
||||
optional float iou_threshold = 1 [default = 0.5];
|
||||
|
||||
// We need the image dimensions to properly compute annotation locations.
|
||||
optional float img_width = 2;
|
||||
|
||||
optional float img_height = 3;
|
||||
}
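For illustration, a graph node that sets these options might look like the following sketch (stream names and option values are placeholders):

node {
  calculator: "FrameAnnotationTrackerCalculator"
  input_stream: "FRAME_ANNOTATION:frame_annotation"
  input_stream: "TRACKED_BOXES:tracked_boxes"
  output_stream: "TRACKED_FRAME_ANNOTATION:tracked_frame_annotation"
  output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
  options: {
    [mediapipe.FrameAnnotationTrackerCalculatorOptions.ext] {
      iou_threshold: 0.5
      img_width: 640
      img_height: 480
    }
  }
}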
|
|
@ -0,0 +1,143 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
|
||||
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "mediapipe/framework/port/gmock.h"
|
||||
#include "mediapipe/framework/port/gtest.h"
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/util/tracking/box_tracker.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
namespace {
|
||||
|
||||
// Create a new object annotation by shifting a reference
|
||||
// object annotation.
|
||||
ObjectAnnotation ShiftObject2d(const ObjectAnnotation& ref_obj, float dx,
|
||||
float dy) {
|
||||
ObjectAnnotation obj = ref_obj;
|
||||
for (auto& keypoint : *(obj.mutable_keypoints())) {
|
||||
const float ref_x = keypoint.point_2d().x();
|
||||
const float ref_y = keypoint.point_2d().y();
|
||||
keypoint.mutable_point_2d()->set_x(ref_x + dx);
|
||||
keypoint.mutable_point_2d()->set_y(ref_y + dy);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
TimedBoxProto ShiftBox(const TimedBoxProto& ref_box, float dx, float dy) {
|
||||
TimedBoxProto box = ref_box;
|
||||
box.set_top(ref_box.top() + dy);
|
||||
box.set_bottom(ref_box.bottom() + dy);
|
||||
box.set_left(ref_box.left() + dx);
|
||||
box.set_right(ref_box.right() + dx);
|
||||
return box;
|
||||
}
|
||||
|
||||
// Constructs a fixed ObjectAnnotation.
|
||||
ObjectAnnotation ConstructFixedObject(
|
||||
const std::vector<std::vector<float>>& points) {
|
||||
ObjectAnnotation obj;
|
||||
for (const auto& point : points) {
|
||||
auto* keypoint = obj.add_keypoints();
|
||||
CHECK_EQ(2, point.size());
|
||||
keypoint->mutable_point_2d()->set_x(point[0]);
|
||||
keypoint->mutable_point_2d()->set_y(point[1]);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
TEST(FrameAnnotationTrackerTest, TestConsolidation) {
|
||||
// Add 4 detections represented by FrameAnnotation, of which 3 correspond
|
||||
// to the same object.
|
||||
ObjectAnnotation object1, object2, object3, object4;
|
||||
// The bounding rectangle for these object keypoints is:
|
||||
// x: [0.2, 0.5], y: [0.1, 0.4]
|
||||
object3 = ConstructFixedObject({{0.35f, 0.25f},
|
||||
{0.3f, 0.3f},
|
||||
{0.2f, 0.4f},
|
||||
{0.3f, 0.1f},
|
||||
{0.2f, 0.2f},
|
||||
{0.5f, 0.3f},
|
||||
{0.4f, 0.4f},
|
||||
{0.5f, 0.1f},
|
||||
{0.4f, 0.2f}});
|
||||
object3.set_object_id(3);
|
||||
object1 = ShiftObject2d(object3, -0.05f, -0.05f);
|
||||
object1.set_object_id(1);
|
||||
object2 = ShiftObject2d(object3, 0.05f, 0.05f);
|
||||
object2.set_object_id(2);
|
||||
object4 = ShiftObject2d(object3, 0.2f, 0.2f);
|
||||
object4.set_object_id(4);
|
||||
FrameAnnotation frame_annotation_1;
|
||||
frame_annotation_1.set_timestamp(30 * 1000); // 30ms
|
||||
*(frame_annotation_1.add_annotations()) = object1;
|
||||
*(frame_annotation_1.add_annotations()) = object4;
|
||||
FrameAnnotation frame_annotation_2;
|
||||
frame_annotation_2.set_timestamp(60 * 1000); // 60ms
|
||||
*(frame_annotation_2.add_annotations()) = object2;
|
||||
FrameAnnotation frame_annotation_3;
|
||||
frame_annotation_3.set_timestamp(90 * 1000); // 90ms
|
||||
*(frame_annotation_3.add_annotations()) = object3;
|
||||
|
||||
FrameAnnotationTracker frame_annotation_tracker(/*iou_threshold*/ 0.5f, 1.0f,
|
||||
1.0f);
|
||||
frame_annotation_tracker.AddDetectionResult(frame_annotation_1);
|
||||
frame_annotation_tracker.AddDetectionResult(frame_annotation_2);
|
||||
frame_annotation_tracker.AddDetectionResult(frame_annotation_3);
|
||||
|
||||
TimedBoxProtoList timed_box_proto_list;
|
||||
TimedBoxProto* timed_box_proto = timed_box_proto_list.add_box();
|
||||
timed_box_proto->set_top(0.4f);
|
||||
timed_box_proto->set_bottom(0.7f);
|
||||
timed_box_proto->set_left(0.6f);
|
||||
timed_box_proto->set_right(0.9f);
|
||||
timed_box_proto->set_id(3);
|
||||
timed_box_proto->set_time_msec(150);
|
||||
timed_box_proto = timed_box_proto_list.add_box();
|
||||
*timed_box_proto = ShiftBox(timed_box_proto_list.box(0), 0.01f, 0.01f);
|
||||
timed_box_proto->set_id(1);
|
||||
timed_box_proto->set_time_msec(150);
|
||||
timed_box_proto = timed_box_proto_list.add_box();
|
||||
*timed_box_proto = ShiftBox(timed_box_proto_list.box(0), -0.01f, -0.01f);
|
||||
timed_box_proto->set_id(2);
|
||||
timed_box_proto->set_time_msec(150);
|
||||
absl::flat_hash_set<int> cancel_object_ids;
|
||||
FrameAnnotation tracked_detection =
|
||||
frame_annotation_tracker.ConsolidateTrackingResult(timed_box_proto_list,
|
||||
&cancel_object_ids);
|
||||
EXPECT_EQ(2, cancel_object_ids.size());
|
||||
EXPECT_EQ(1, cancel_object_ids.count(1));
|
||||
EXPECT_EQ(1, cancel_object_ids.count(2));
|
||||
EXPECT_EQ(1, tracked_detection.annotations_size());
|
||||
EXPECT_EQ(3, tracked_detection.annotations(0).object_id());
|
||||
EXPECT_EQ(object3.keypoints_size(),
|
||||
tracked_detection.annotations(0).keypoints_size());
|
||||
const float x_offset = 0.4f;
|
||||
const float y_offset = 0.3f;
|
||||
const float tolerance = 1e-5f;
|
||||
for (int i = 0; i < object3.keypoints_size(); ++i) {
|
||||
const auto& point_2d =
|
||||
tracked_detection.annotations(0).keypoints(i).point_2d();
|
||||
EXPECT_NEAR(point_2d.x(), object3.keypoints(i).point_2d().x() + x_offset,
|
||||
tolerance);
|
||||
EXPECT_NEAR(point_2d.y(), object3.keypoints(i).point_2d().y() + y_offset,
|
||||
tolerance);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,760 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
#include "mediapipe/util/android/asset_manager_util.h"
|
||||
#else
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#endif
|
||||
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/framework/port/status.h"
|
||||
#include "mediapipe/gpu/gl_calculator_helper.h"
|
||||
#include "mediapipe/gpu/shader_util.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/camera_parameters.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/model_matrix.pb.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
namespace {
|
||||
|
||||
#if defined(GL_DEBUG)
|
||||
#define GLCHECK(command) \
|
||||
command; \
|
||||
if (int err = glGetError()) LOG(ERROR) << "GL error detected: " << err;
|
||||
#else
|
||||
#define GLCHECK(command) command
|
||||
#endif
|
||||
|
||||
// For ease of use, we prefer ImageFrame on Android and GpuBuffer otherwise.
|
||||
#if defined(__ANDROID__)
|
||||
typedef ImageFrame AssetTextureFormat;
|
||||
#else
|
||||
typedef GpuBuffer AssetTextureFormat;
|
||||
#endif
|
||||
|
||||
enum { ATTRIB_VERTEX, ATTRIB_TEXTURE_POSITION, NUM_ATTRIBUTES };
|
||||
static const int kNumMatrixEntries = 16;
|
||||
|
||||
// Hard-coded MVP Matrix for testing.
|
||||
static const float kModelMatrix[] = {0.83704215, -0.36174262, 0.41049102, 0.0,
|
||||
0.06146407, 0.8076706, 0.5864218, 0.0,
|
||||
-0.54367524, -0.4656292, 0.69828844, 0.0,
|
||||
0.0, 0.0, -98.64117, 1.0};
|
||||
|
||||
// Loads a texture from an input side packet, streams in an animation file
|
||||
// from a filename given in another input side packet, and renders the animation
|
||||
// over the screen according to the input timestamp and desired animation FPS.
|
||||
//
|
||||
// Inputs:
|
||||
// VIDEO (GpuBuffer, optional):
|
||||
// If provided, the input buffer will be assumed to be unique, and will be
|
||||
// consumed by this calculator and rendered to directly. The output video
|
||||
// buffer will then be the released reference to the input video buffer.
|
||||
// MODEL_MATRICES (TimedModelMatrixProtoList, optional):
|
||||
// If provided, will set the model matrices for the objects to be rendered
|
||||
// during future rendering calls.
|
||||
//
|
||||
// Input side packets:
|
||||
// TEXTURE (ImageFrame on Android / GpuBuffer on iOS, required):
|
||||
// Texture to use with animation file.
|
||||
// ANIMATION_ASSET (String, required):
|
||||
// Path of animation file to load and render. Should be generated by
|
||||
// //java/com/google/android/apps/motionstills/SimpleObjEncryptor with
|
||||
// --compressed_mode=true. See comments and documentation there for more
|
||||
// information on custom .obj.uuu file format.
|
||||
// CAMERA_PARAMETERS_PROTO_STRING (String, optional):
|
||||
//    Serialized CameraParametersProto string. We need this to
|
||||
// get the right aspect ratio and field of view.
|
||||
// Options:
|
||||
// aspect_ratio: the ratio between the rendered image width and height.
|
||||
// It will be ignored if CAMERA_PARAMETERS_PROTO_STRING input side packet
|
||||
// is provided.
|
||||
// vertical_fov_degrees: vertical field of view in degrees.
|
||||
// It will be ignored if CAMERA_PARAMETERS_PROTO_STRING input side packet
|
||||
// is provided.
|
||||
// z_clipping_plane_near: near plane value for z-clipping.
|
||||
// z_clipping_plane_far: far plane value for z-clipping.
|
||||
// animation_speed_fps: speed at which to cycle through animation frames (in
|
||||
// frames per second).
|
||||
//
|
||||
// Outputs:
|
||||
// OUTPUT, or index 0 (GpuBuffer):
|
||||
// Frames filled with the given texture.
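//
// Usage example (illustrative only; the stream and side-packet names below are
// assumptions, not taken from an existing graph):
//   node {
//     calculator: "GlAnimationOverlayCalculator"
//     input_stream: "VIDEO:input_video"
//     input_stream: "MODEL_MATRICES:model_matrices"
//     output_stream: "OUTPUT:output_video"
//     input_side_packet: "TEXTURE:box_texture"
//     input_side_packet: "ANIMATION_ASSET:box_asset_name"
//   }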
|
||||
|
||||
// Simple helper-struct for containing the parsed geometry data from a 3D
|
||||
// animation frame for rendering.
|
||||
|
||||
struct TriangleMesh {
|
||||
int index_count = 0; // Needed for glDrawElements rendering call
|
||||
std::unique_ptr<float[]> vertices = nullptr;
|
||||
std::unique_ptr<float[]> texture_coords = nullptr;
|
||||
std::unique_ptr<int16[]> triangle_indices = nullptr;
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<float[]> ModelMatrix;
|
||||
|
||||
} // namespace
|
||||
|
||||
class GlAnimationOverlayCalculator : public CalculatorBase {
|
||||
public:
|
||||
GlAnimationOverlayCalculator() {}
|
||||
~GlAnimationOverlayCalculator();
|
||||
|
||||
static ::mediapipe::Status GetContract(CalculatorContract *cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext *cc) override;
|
||||
::mediapipe::Status Process(CalculatorContext *cc) override;
|
||||
|
||||
private:
|
||||
bool has_video_stream_ = false;
|
||||
bool has_model_matrix_stream_ = false;
|
||||
bool has_mask_model_matrix_stream_ = false;
|
||||
bool has_occlusion_mask_ = false;
|
||||
|
||||
GlCalculatorHelper helper_;
|
||||
bool initialized_ = false;
|
||||
GlTexture texture_;
|
||||
GlTexture mask_texture_;
|
||||
|
||||
GLuint renderbuffer_ = 0;
|
||||
bool depth_buffer_created_ = false;
|
||||
|
||||
GLuint program_ = 0;
|
||||
GLint texture_uniform_ = -1;
|
||||
GLint perspective_matrix_uniform_ = -1;
|
||||
GLint model_matrix_uniform_ = -1;
|
||||
|
||||
std::vector<TriangleMesh> triangle_meshes_;
|
||||
std::vector<TriangleMesh> mask_meshes_;
|
||||
Timestamp animation_start_time_;
|
||||
int frame_count_ = 0;
|
||||
float animation_speed_fps_;
|
||||
|
||||
std::vector<ModelMatrix> current_model_matrices_;
|
||||
std::vector<ModelMatrix> current_mask_model_matrices_;
|
||||
|
||||
// Perspective matrix for rendering, to be applied to all model matrices
|
||||
// prior to passing through to the shader as an MVP matrix. Initialized during
|
||||
// first image packet read.
|
||||
float perspective_matrix_[kNumMatrixEntries];
|
||||
|
||||
void ComputeAspectRatioAndFovFromCameraParameters(
|
||||
const CameraParametersProto &camera_parameters, float *aspect_ratio,
|
||||
float *vertical_fov_degrees);
|
||||
int GetAnimationFrameIndex(Timestamp timestamp);
|
||||
::mediapipe::Status GlSetup();
|
||||
::mediapipe::Status GlBind(const TriangleMesh &triangle_mesh,
|
||||
const GlTexture &texture);
|
||||
::mediapipe::Status GlRender(const TriangleMesh &triangle_mesh,
|
||||
const float *model_matrix);
|
||||
void InitializePerspectiveMatrix(float aspect_ratio,
|
||||
float vertical_fov_degrees, float z_near,
|
||||
float z_far);
|
||||
void LoadModelMatrices(const TimedModelMatrixProtoList &model_matrices,
|
||||
std::vector<ModelMatrix> *current_model_matrices);
|
||||
|
||||
#if !defined(__ANDROID__)
|
||||
// Asset loading routine for all non-Android platforms.
|
||||
bool LoadAnimation(const std::string &filename);
|
||||
#else
|
||||
// Asset loading for all Android platforms.
|
||||
bool LoadAnimationAndroid(const std::string &filename,
|
||||
std::vector<TriangleMesh> *mesh);
|
||||
bool ReadBytesFromAsset(AAsset *asset, void *buffer, int num_bytes_to_read);
|
||||
#endif
|
||||
};
|
||||
REGISTER_CALCULATOR(GlAnimationOverlayCalculator);
|
||||
|
||||
// static
|
||||
::mediapipe::Status GlAnimationOverlayCalculator::GetContract(
|
||||
CalculatorContract *cc) {
|
||||
MP_RETURN_IF_ERROR(
|
||||
GlCalculatorHelper::SetupInputSidePackets(&(cc->InputSidePackets())));
|
||||
if (cc->Inputs().HasTag("VIDEO")) {
|
||||
// Currently used only for size and timestamp.
|
||||
cc->Inputs().Tag("VIDEO").Set<GpuBuffer>();
|
||||
}
|
||||
TagOrIndex(&(cc->Outputs()), "OUTPUT", 0).Set<GpuBuffer>();
|
||||
|
||||
if (cc->Inputs().HasTag("MODEL_MATRICES")) {
|
||||
cc->Inputs().Tag("MODEL_MATRICES").Set<TimedModelMatrixProtoList>();
|
||||
}
|
||||
if (cc->Inputs().HasTag("MASK_MODEL_MATRICES")) {
|
||||
cc->Inputs().Tag("MASK_MODEL_MATRICES").Set<TimedModelMatrixProtoList>();
|
||||
}
|
||||
|
||||
cc->InputSidePackets().Tag("TEXTURE").Set<AssetTextureFormat>();
|
||||
cc->InputSidePackets().Tag("ANIMATION_ASSET").Set<std::string>();
|
||||
if (cc->InputSidePackets().HasTag("CAMERA_PARAMETERS_PROTO_STRING")) {
|
||||
cc->InputSidePackets()
|
||||
.Tag("CAMERA_PARAMETERS_PROTO_STRING")
|
||||
.Set<std::string>();
|
||||
}
|
||||
|
||||
if (cc->InputSidePackets().HasTag("MASK_TEXTURE")) {
|
||||
cc->InputSidePackets().Tag("MASK_TEXTURE").Set<AssetTextureFormat>();
|
||||
}
|
||||
if (cc->InputSidePackets().HasTag("MASK_ASSET")) {
|
||||
cc->InputSidePackets().Tag("MASK_ASSET").Set<std::string>();
|
||||
}
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
// Helper function for initializing our perspective matrix.
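// In column-major (OpenGL) layout, with f = 1 / tan(fov_degrees * pi / 360)
// and denom = 1 / (z_near - z_far), the code below builds the standard
// perspective projection matrix:
//
//   [ f/aspect  0   0                      0                     ]
//   [ 0         f   0                      0                     ]
//   [ 0         0   (near+far)/(near-far)  2*far*near/(near-far) ]
//   [ 0         0   -1                     0                     ]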
|
||||
void GlAnimationOverlayCalculator::InitializePerspectiveMatrix(
|
||||
float aspect_ratio, float fov_degrees, float z_near, float z_far) {
|
||||
// Standard perspective projection matrix calculations.
|
||||
const float f = 1.0f / std::tan(fov_degrees * M_PI / 360.0f);
|
||||
for (int i = 0; i < kNumMatrixEntries; i++) {
|
||||
perspective_matrix_[i] = 0;
|
||||
}
|
||||
const float denom = 1.0f / (z_near - z_far);
|
||||
perspective_matrix_[0] = f / aspect_ratio;
|
||||
perspective_matrix_[5] = f;
|
||||
perspective_matrix_[10] = (z_near + z_far) * denom;
|
||||
perspective_matrix_[11] = -1.0f;
|
||||
perspective_matrix_[14] = 2.0f * z_far * z_near * denom;
|
||||
}
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
// Helper function for reading in a specified number of bytes from an Android
|
||||
// asset. Returns true if successfully reads in all bytes into buffer.
|
||||
bool GlAnimationOverlayCalculator::ReadBytesFromAsset(AAsset *asset,
|
||||
void *buffer,
|
||||
int num_bytes_to_read) {
|
||||
// Most file systems use block sizes of 4KB or 8KB; ideally we'd choose a
|
||||
// small multiple of the block size for best input streaming performance, so
|
||||
// we go for a reasonably safe buffer size of 8KB = 8*1024 bytes.
|
||||
static const int kMaxChunkSize = 8192;
|
||||
|
||||
int bytes_left = num_bytes_to_read;
|
||||
int bytes_read = 1; // any value > 0 here just to start looping.
|
||||
|
||||
// Treat as uint8_t array so we can deal in single byte arithmetic easily.
|
||||
uint8_t *currBufferIndex = reinterpret_cast<uint8_t *>(buffer);
|
||||
while (bytes_read > 0 && bytes_left > 0) {
|
||||
bytes_read = AAsset_read(asset, (void *)currBufferIndex,
|
||||
std::min(bytes_left, kMaxChunkSize));
|
||||
bytes_left -= bytes_read;
|
||||
currBufferIndex += bytes_read;
|
||||
}
|
||||
// At least log any I/O errors encountered.
|
||||
if (bytes_read < 0) {
|
||||
LOG(ERROR) << "Error reading from AAsset: " << bytes_read;
|
||||
return false;
|
||||
}
|
||||
if (bytes_left > 0) {
|
||||
// Reached EOF before reading in specified number of bytes.
|
||||
LOG(WARNING) << "Reached EOF before reading in specified number of bytes.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// The below asset streaming code is Android-only, making use of the platform
|
||||
// NDK asset APIs AAssetManager and AAsset.
|
||||
bool GlAnimationOverlayCalculator::LoadAnimationAndroid(
|
||||
const std::string &filename, std::vector<TriangleMesh> *meshes) {
|
||||
mediapipe::AssetManager *mediapipe_asset_manager =
|
||||
Singleton<mediapipe::AssetManager>::get();
|
||||
AAssetManager *asset_manager = mediapipe_asset_manager->GetAssetManager();
|
||||
if (!asset_manager) {
|
||||
LOG(ERROR) << "Failed to access Android asset manager.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Open the asset file for streaming reads.
|
||||
AAsset *asset = AAssetManager_open(asset_manager, filename.c_str(),
|
||||
AASSET_MODE_STREAMING);
|
||||
if (!asset) {
|
||||
LOG(ERROR) << "Failed to open animation asset: " << filename;
|
||||
return false;
|
||||
}
|
||||
|
||||
// And now, while we are able to stream in more frames, we do so.
|
||||
frame_count_ = 0;
|
||||
int32 lengths[3];
|
||||
while (ReadBytesFromAsset(asset, (void *)lengths, sizeof(lengths[0]) * 3)) {
|
||||
// About to start reading the next animation frame. Stream it in here.
|
||||
// Each frame stores first the object counts of its three arrays
|
||||
// (vertices, texture coordinates, triangle indices; respectively), and
|
||||
// then stores each of those arrays as a byte dump, in order.
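// Assumed per-frame binary layout, inferred from the reads below:
//   int32  lengths[3]  (vertex float count, tex-coord float count, index count)
//   float  vertices[lengths[0]]
//   float  texture_coords[lengths[1]]
//   int16  triangle_indices[lengths[2]]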
|
||||
meshes->emplace_back();
|
||||
TriangleMesh &triangle_mesh = meshes->back();
|
||||
// Try to read in vertices (4-byte floats)
|
||||
triangle_mesh.vertices.reset(new float[lengths[0]]);
|
||||
if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.vertices.get(),
|
||||
sizeof(float) * lengths[0])) {
|
||||
LOG(ERROR) << "Failed to read vertices for frame " << frame_count_;
|
||||
return false;
|
||||
}
|
||||
// Try to read in texture coordinates (4-byte floats)
|
||||
triangle_mesh.texture_coords.reset(new float[lengths[1]]);
|
||||
if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.texture_coords.get(),
|
||||
sizeof(float) * lengths[1])) {
|
||||
LOG(ERROR) << "Failed to read tex-coords for frame " << frame_count_;
|
||||
return false;
|
||||
}
|
||||
// Try to read in indices (2-byte shorts)
|
||||
triangle_mesh.index_count = lengths[2];
|
||||
triangle_mesh.triangle_indices.reset(new int16[lengths[2]]);
|
||||
if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.triangle_indices.get(),
|
||||
sizeof(int16) * lengths[2])) {
|
||||
LOG(ERROR) << "Failed to read indices for frame " << frame_count_;
|
||||
return false;
|
||||
}
|
||||
frame_count_++;
|
||||
}
|
||||
AAsset_close(asset);
|
||||
|
||||
LOG(INFO) << "Finished parsing " << frame_count_ << " animation frames.";
|
||||
if (meshes->empty()) {
|
||||
LOG(ERROR) << "No animation frames were parsed! Erroring out calculator.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#else // defined(__ANDROID__)
|
||||
|
||||
bool GlAnimationOverlayCalculator::LoadAnimation(const std::string &filename) {
|
||||
std::ifstream infile(filename.c_str(), std::ifstream::binary);
|
||||
if (!infile) {
|
||||
LOG(ERROR) << "Error opening asset with filename: " << filename;
|
||||
return false;
|
||||
}
|
||||
|
||||
frame_count_ = 0;
|
||||
int32 lengths[3];
|
||||
while (true) {
|
||||
// See if we have more initial size counts to read in.
|
||||
infile.read((char *)(lengths), sizeof(lengths[0]) * 3);
|
||||
if (!infile) {
|
||||
// No more frames to read. Close out.
|
||||
infile.close();
|
||||
break;
|
||||
}
|
||||
|
||||
triangle_meshes_.emplace_back();
|
||||
TriangleMesh &triangle_mesh = triangle_meshes_.back();
|
||||
|
||||
// Try to read in vertices (4-byte floats).
|
||||
triangle_mesh.vertices.reset(new float[lengths[0]]);
|
||||
infile.read((char *)(triangle_mesh.vertices.get()),
|
||||
sizeof(float) * lengths[0]);
|
||||
if (!infile) {
|
||||
LOG(ERROR) << "Failed to read vertices for frame " << frame_count_;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Try to read in texture coordinates (4-byte floats)
|
||||
triangle_mesh.texture_coords.reset(new float[lengths[1]]);
|
||||
infile.read((char *)(triangle_mesh.texture_coords.get()),
|
||||
sizeof(float) * lengths[1]);
|
||||
if (!infile) {
|
||||
LOG(ERROR) << "Failed to read texture coordinates for frame "
|
||||
<< frame_count_;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Try to read in the triangle indices (2-byte shorts)
|
||||
triangle_mesh.index_count = lengths[2];
|
||||
triangle_mesh.triangle_indices.reset(new int16[lengths[2]]);
|
||||
infile.read((char *)(triangle_mesh.triangle_indices.get()),
|
||||
sizeof(int16) * lengths[2]);
|
||||
if (!infile) {
|
||||
LOG(ERROR) << "Failed to read triangle indices for frame "
|
||||
<< frame_count_;
|
||||
return false;
|
||||
}
|
||||
frame_count_++;
|
||||
}
|
||||
|
||||
LOG(INFO) << "Finished parsing " << frame_count_ << " animation frames.";
|
||||
if (triangle_meshes_.empty()) {
|
||||
LOG(ERROR) << "No animation frames were parsed! Erroring out calculator.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void GlAnimationOverlayCalculator::ComputeAspectRatioAndFovFromCameraParameters(
|
||||
const CameraParametersProto &camera_parameters, float *aspect_ratio,
|
||||
float *vertical_fov_degrees) {
|
||||
CHECK(aspect_ratio != nullptr);
|
||||
CHECK(vertical_fov_degrees != nullptr);
|
||||
*aspect_ratio =
|
||||
camera_parameters.portrait_width() / camera_parameters.portrait_height();
|
||||
*vertical_fov_degrees =
|
||||
std::atan(camera_parameters.portrait_height() * 0.5f) * 2 * 180 / M_PI;
|
||||
}
|
||||
|
||||
::mediapipe::Status GlAnimationOverlayCalculator::Open(CalculatorContext *cc) {
|
||||
cc->SetOffset(TimestampDiff(0));
|
||||
MP_RETURN_IF_ERROR(helper_.Open(cc));
|
||||
|
||||
const auto &options = cc->Options<GlAnimationOverlayCalculatorOptions>();
|
||||
|
||||
animation_speed_fps_ = options.animation_speed_fps();
|
||||
|
||||
// Construct projection matrix using input side packets or calculator options.
|
||||
float aspect_ratio;
|
||||
float vertical_fov_degrees;
|
||||
if (cc->InputSidePackets().HasTag("CAMERA_PARAMETERS_PROTO_STRING")) {
|
||||
const std::string &camera_parameters_proto_string =
|
||||
cc->InputSidePackets()
|
||||
.Tag("CAMERA_PARAMETERS_PROTO_STRING")
|
||||
.Get<std::string>();
|
||||
CameraParametersProto camera_parameters_proto;
|
||||
camera_parameters_proto.ParseFromString(camera_parameters_proto_string);
|
||||
ComputeAspectRatioAndFovFromCameraParameters(
|
||||
camera_parameters_proto, &aspect_ratio, &vertical_fov_degrees);
|
||||
} else {
|
||||
aspect_ratio = options.aspect_ratio();
|
||||
vertical_fov_degrees = options.vertical_fov_degrees();
|
||||
}
|
||||
|
||||
// Use the z-clipping plane options when constructing the projection matrix.
|
||||
InitializePerspectiveMatrix(aspect_ratio, vertical_fov_degrees,
|
||||
options.z_clipping_plane_near(),
|
||||
options.z_clipping_plane_far());
|
||||
|
||||
// See what streams we have.
|
||||
has_video_stream_ = cc->Inputs().HasTag("VIDEO");
|
||||
has_model_matrix_stream_ = cc->Inputs().HasTag("MODEL_MATRICES");
|
||||
has_mask_model_matrix_stream_ = cc->Inputs().HasTag("MASK_MODEL_MATRICES");
|
||||
|
||||
// Try to load in the animation asset in a platform-specific manner.
|
||||
const std::string &asset_name =
|
||||
cc->InputSidePackets().Tag("ANIMATION_ASSET").Get<std::string>();
|
||||
bool loaded_animation = false;
|
||||
#if defined(__ANDROID__)
|
||||
if (cc->InputSidePackets().HasTag("MASK_ASSET")) {
|
||||
has_occlusion_mask_ = true;
|
||||
const std::string &mask_asset_name =
|
||||
cc->InputSidePackets().Tag("MASK_ASSET").Get<std::string>();
|
||||
loaded_animation = LoadAnimationAndroid(mask_asset_name, &mask_meshes_);
|
||||
if (!loaded_animation) {
|
||||
LOG(ERROR) << "Failed to load mask asset.";
|
||||
return ::mediapipe::UnknownError("Failed to load mask asset.");
|
||||
}
|
||||
}
|
||||
loaded_animation = LoadAnimationAndroid(asset_name, &triangle_meshes_);
|
||||
#else
|
||||
loaded_animation = LoadAnimation(asset_name);
|
||||
#endif
|
||||
if (!loaded_animation) {
|
||||
LOG(ERROR) << "Failed to load animation asset.";
|
||||
return ::mediapipe::UnknownError("Failed to load animation asset.");
|
||||
}
|
||||
|
||||
return helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status {
|
||||
if (cc->InputSidePackets().HasTag("MASK_TEXTURE")) {
|
||||
const auto &mask_texture =
|
||||
cc->InputSidePackets().Tag("MASK_TEXTURE").Get<AssetTextureFormat>();
|
||||
mask_texture_ = helper_.CreateSourceTexture(mask_texture);
|
||||
}
|
||||
|
||||
// Load in our asset's texture data
|
||||
const auto &input_texture =
|
||||
cc->InputSidePackets().Tag("TEXTURE").Get<AssetTextureFormat>();
|
||||
texture_ = helper_.CreateSourceTexture(input_texture);
|
||||
VLOG(2) << "Input texture size: " << texture_.width() << ", "
|
||||
<< texture_.height() << std::endl;
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
});
|
||||
}
|
||||
|
||||
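// Maps the time elapsed since the animation started to a frame index, wrapping
// around at frame_count_. For example (illustrative numbers only), with
// animation_speed_fps_ = 25 and a 2.5 s delta, 2.5 * 25 = 62.5 truncates to
// frame 62 modulo frame_count_.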
int GlAnimationOverlayCalculator::GetAnimationFrameIndex(Timestamp timestamp) {
|
||||
double seconds_delta = timestamp.Seconds() - animation_start_time_.Seconds();
|
||||
int64_t frame_index =
|
||||
static_cast<int64_t>(seconds_delta * animation_speed_fps_);
|
||||
frame_index %= frame_count_;
|
||||
return static_cast<int>(frame_index);
|
||||
}
|
||||
|
||||
void GlAnimationOverlayCalculator::LoadModelMatrices(
|
||||
const TimedModelMatrixProtoList &model_matrices,
|
||||
std::vector<ModelMatrix> *current_model_matrices) {
|
||||
current_model_matrices->clear();
|
||||
for (int i = 0; i < model_matrices.model_matrix_size(); ++i) {
|
||||
const auto &model_matrix = model_matrices.model_matrix(i);
|
||||
CHECK(model_matrix.matrix_entries_size() == kNumMatrixEntries)
|
||||
<< "Invalid Model Matrix";
|
||||
current_model_matrices->emplace_back();
|
||||
ModelMatrix &new_matrix = current_model_matrices->back();
|
||||
new_matrix.reset(new float[kNumMatrixEntries]);
|
||||
for (int j = 0; j < kNumMatrixEntries; j++) {
|
||||
// Model matrices are streamed in using ROW-MAJOR format, but we want
|
||||
// COLUMN-MAJOR for rendering, so we transpose here.
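// For example, entry j = 1 (row 0, col 1 of the row-major input) is stored
// at index 0 + 1 * 4 = 4 of the column-major output.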
|
||||
int col = j % 4;
|
||||
int row = j / 4;
|
||||
new_matrix[row + col * 4] = model_matrix.matrix_entries(j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
::mediapipe::Status GlAnimationOverlayCalculator::Process(
|
||||
CalculatorContext *cc) {
|
||||
return helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status {
|
||||
if (!initialized_) {
|
||||
MP_RETURN_IF_ERROR(GlSetup());
|
||||
initialized_ = true;
|
||||
animation_start_time_ = cc->InputTimestamp();
|
||||
}
|
||||
|
||||
// Process model matrices, if any are being streamed in, and update our
|
||||
// list.
|
||||
if (has_model_matrix_stream_ &&
|
||||
!cc->Inputs().Tag("MODEL_MATRICES").IsEmpty()) {
|
||||
const TimedModelMatrixProtoList &model_matrices =
|
||||
cc->Inputs().Tag("MODEL_MATRICES").Get<TimedModelMatrixProtoList>();
|
||||
LoadModelMatrices(model_matrices, ¤t_model_matrices_);
|
||||
}
|
||||
if (has_mask_model_matrix_stream_ &&
|
||||
!cc->Inputs().Tag("MASK_MODEL_MATRICES").IsEmpty()) {
|
||||
const TimedModelMatrixProtoList &model_matrices =
|
||||
cc->Inputs()
|
||||
.Tag("MASK_MODEL_MATRICES")
|
||||
.Get<TimedModelMatrixProtoList>();
|
||||
LoadModelMatrices(model_matrices, ¤t_mask_model_matrices_);
|
||||
}
|
||||
|
||||
// Arbitrary default width and height for output destination texture, in the
|
||||
// event that we don't have a valid and unique input buffer to overlay.
|
||||
int width = 640;
|
||||
int height = 480;
|
||||
|
||||
GlTexture dst;
|
||||
std::unique_ptr<GpuBuffer> input_frame(nullptr);
|
||||
if (has_video_stream_ && !(cc->Inputs().Tag("VIDEO").IsEmpty())) {
|
||||
auto result = cc->Inputs().Tag("VIDEO").Value().Consume<GpuBuffer>();
|
||||
if (result.ok()) {
|
||||
input_frame = std::move(result).ValueOrDie();
|
||||
#if !MEDIAPIPE_GPU_BUFFER_USE_CV_PIXEL_BUFFER
|
||||
input_frame->GetGlTextureBufferSharedPtr()->Reuse();
|
||||
#endif
|
||||
width = input_frame->width();
|
||||
height = input_frame->height();
|
||||
dst = helper_.CreateSourceTexture(*input_frame);
|
||||
} else {
|
||||
LOG(ERROR) << "Unable to consume input video frame for overlay!";
|
||||
LOG(ERROR) << "Status returned was: " << result.status();
|
||||
dst = helper_.CreateDestinationTexture(width, height);
|
||||
}
|
||||
} else if (!has_video_stream_) {
|
||||
dst = helper_.CreateDestinationTexture(width, height);
|
||||
} else {
|
||||
// We have an input video stream, but not for this frame. Don't render!
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
helper_.BindFramebuffer(dst);
|
||||
|
||||
if (!depth_buffer_created_) {
|
||||
// Create our private depth buffer.
|
||||
GLCHECK(glGenRenderbuffers(1, &renderbuffer_));
|
||||
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer_));
|
||||
GLCHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT16,
|
||||
width, height));
|
||||
GLCHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT,
|
||||
GL_RENDERBUFFER, renderbuffer_));
|
||||
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0));
|
||||
depth_buffer_created_ = true;
|
||||
}
|
||||
|
||||
// Re-bind our depth renderbuffer to our FBO depth attachment here.
|
||||
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer_));
|
||||
GLCHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT,
|
||||
GL_RENDERBUFFER, renderbuffer_));
|
||||
GLenum status = GLCHECK(glCheckFramebufferStatus(GL_FRAMEBUFFER));
|
||||
if (status != GL_FRAMEBUFFER_COMPLETE) {
|
||||
LOG(ERROR) << "Incomplete framebuffer with status: " << status;
|
||||
}
|
||||
GLCHECK(glClear(GL_DEPTH_BUFFER_BIT));
|
||||
|
||||
if (has_occlusion_mask_) {
|
||||
glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
|
||||
const TriangleMesh &mask_frame = mask_meshes_.front();
|
||||
MP_RETURN_IF_ERROR(GlBind(mask_frame, mask_texture_));
|
||||
// Draw objects using our latest model matrix stream packet.
|
||||
for (const ModelMatrix &model_matrix : current_mask_model_matrices_) {
|
||||
MP_RETURN_IF_ERROR(GlRender(mask_frame, model_matrix.get()));
|
||||
}
|
||||
}
|
||||
|
||||
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
|
||||
int frame_index = GetAnimationFrameIndex(cc->InputTimestamp());
|
||||
const TriangleMesh ¤t_frame = triangle_meshes_[frame_index];
|
||||
MP_RETURN_IF_ERROR(GlBind(current_frame, texture_));
|
||||
if (has_model_matrix_stream_) {
|
||||
// Draw objects using our latest model matrix stream packet.
|
||||
for (const ModelMatrix &model_matrix : current_model_matrices_) {
|
||||
MP_RETURN_IF_ERROR(GlRender(current_frame, model_matrix.get()));
|
||||
}
|
||||
} else {
|
||||
// Just draw one object to a static model matrix.
|
||||
MP_RETURN_IF_ERROR(GlRender(current_frame, kModelMatrix));
|
||||
}
|
||||
|
||||
// Disable vertex attributes
|
||||
GLCHECK(glDisableVertexAttribArray(ATTRIB_VERTEX));
|
||||
GLCHECK(glDisableVertexAttribArray(ATTRIB_TEXTURE_POSITION));
|
||||
|
||||
// Disable depth test
|
||||
GLCHECK(glDisable(GL_DEPTH_TEST));
|
||||
|
||||
// Unbind texture
|
||||
GLCHECK(glActiveTexture(GL_TEXTURE1));
|
||||
GLCHECK(glBindTexture(texture_.target(), 0));
|
||||
|
||||
// Unbind depth buffer
|
||||
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0));
|
||||
|
||||
GLCHECK(glFlush());
|
||||
|
||||
auto output = dst.GetFrame<GpuBuffer>();
|
||||
dst.Release();
|
||||
TagOrIndex(&(cc->Outputs()), "OUTPUT", 0)
|
||||
.Add(output.release(), cc->InputTimestamp());
|
||||
GLCHECK(glFrontFace(GL_CCW));
|
||||
return ::mediapipe::OkStatus();
|
||||
});
|
||||
}
|
||||
|
||||
::mediapipe::Status GlAnimationOverlayCalculator::GlSetup() {
|
||||
// Load vertex and fragment shaders
|
||||
const GLint attr_location[NUM_ATTRIBUTES] = {
|
||||
ATTRIB_VERTEX,
|
||||
ATTRIB_TEXTURE_POSITION,
|
||||
};
|
||||
const GLchar *attr_name[NUM_ATTRIBUTES] = {
|
||||
"position",
|
||||
"texture_coordinate",
|
||||
};
|
||||
|
||||
const GLchar *vert_src = R"(
|
||||
// Perspective projection matrix for rendering / clipping
|
||||
uniform mat4 perspectiveMatrix;
|
||||
|
||||
// Matrix defining the currently rendered object model
|
||||
uniform mat4 modelMatrix;
|
||||
|
||||
// vertex position in three-dimensional space
|
||||
attribute vec4 position;
|
||||
|
||||
// texture coordinate for each vertex in normalized texture space (0..1)
|
||||
attribute mediump vec4 texture_coordinate;
|
||||
|
||||
// texture coordinate for fragment shader (will be interpolated)
|
||||
varying mediump vec2 sample_coordinate;
|
||||
|
||||
void main() {
|
||||
sample_coordinate = texture_coordinate.xy;
|
||||
mat4 mvpMatrix = perspectiveMatrix * modelMatrix;
|
||||
gl_Position = mvpMatrix * position;
|
||||
}
|
||||
)";
|
||||
|
||||
const GLchar *frag_src = R"(
|
||||
precision mediump float;
|
||||
|
||||
varying vec2 sample_coordinate; // texture coordinate (0..1)
|
||||
uniform sampler2D texture; // texture to shade with
|
||||
|
||||
void main() {
|
||||
gl_FragColor = texture2D(texture, sample_coordinate);
|
||||
}
|
||||
)";
|
||||
|
||||
// Shader program
|
||||
GLCHECK(GlhCreateProgram(vert_src, frag_src, NUM_ATTRIBUTES,
|
||||
(const GLchar **)&attr_name[0], attr_location,
|
||||
&program_));
|
||||
RET_CHECK(program_) << "Problem initializing the program.";
|
||||
texture_uniform_ = GLCHECK(glGetUniformLocation(program_, "texture"));
|
||||
perspective_matrix_uniform_ =
|
||||
GLCHECK(glGetUniformLocation(program_, "perspectiveMatrix"));
|
||||
model_matrix_uniform_ =
|
||||
GLCHECK(glGetUniformLocation(program_, "modelMatrix"));
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status GlAnimationOverlayCalculator::GlBind(
|
||||
const TriangleMesh &triangle_mesh, const GlTexture &texture) {
|
||||
GLCHECK(glUseProgram(program_));
|
||||
|
||||
// Disable backface culling to allow occlusion effects.
|
||||
// Some options for solid arbitrary 3D geometry rendering
|
||||
GLCHECK(glEnable(GL_BLEND));
|
||||
GLCHECK(glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA));
|
||||
GLCHECK(glEnable(GL_DEPTH_TEST));
|
||||
GLCHECK(glFrontFace(GL_CW));
|
||||
GLCHECK(glDepthMask(GL_TRUE));
|
||||
GLCHECK(glDepthFunc(GL_LESS));
|
||||
|
||||
// Set up vertex attributes for the mesh before issuing draw calls.
|
||||
GLCHECK(glVertexAttribPointer(ATTRIB_VERTEX, 3, GL_FLOAT, 0, 0,
|
||||
triangle_mesh.vertices.get()));
|
||||
GLCHECK(glEnableVertexAttribArray(ATTRIB_VERTEX));
|
||||
GLCHECK(glVertexAttribPointer(ATTRIB_TEXTURE_POSITION, 2, GL_FLOAT, 0, 0,
|
||||
triangle_mesh.texture_coords.get()));
|
||||
GLCHECK(glEnableVertexAttribArray(ATTRIB_TEXTURE_POSITION));
|
||||
GLCHECK(glActiveTexture(GL_TEXTURE1));
|
||||
GLCHECK(glBindTexture(texture.target(), texture.name()));
|
||||
|
||||
// We previously bound it to GL_TEXTURE1
|
||||
GLCHECK(glUniform1i(texture_uniform_, 1));
|
||||
|
||||
GLCHECK(glUniformMatrix4fv(perspective_matrix_uniform_, 1, GL_FALSE,
|
||||
perspective_matrix_));
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status GlAnimationOverlayCalculator::GlRender(
|
||||
const TriangleMesh &triangle_mesh, const float *model_matrix) {
|
||||
GLCHECK(glUniformMatrix4fv(model_matrix_uniform_, 1, GL_FALSE, model_matrix));
|
||||
GLCHECK(glDrawElements(GL_TRIANGLES, triangle_mesh.index_count,
|
||||
GL_UNSIGNED_SHORT,
|
||||
triangle_mesh.triangle_indices.get()));
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
GlAnimationOverlayCalculator::~GlAnimationOverlayCalculator() {
|
||||
helper_.RunInGlContext([this] {
|
||||
if (program_) {
|
||||
GLCHECK(glDeleteProgram(program_));
|
||||
program_ = 0;
|
||||
}
|
||||
if (depth_buffer_created_) {
|
||||
GLCHECK(glDeleteRenderbuffers(1, &renderbuffer_));
|
||||
renderbuffer_ = 0;
|
||||
}
|
||||
if (texture_.width() > 0) {
|
||||
texture_.Release();
|
||||
}
|
||||
if (mask_texture_.width() > 0) {
|
||||
mask_texture_.Release();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,41 @@
|
|||
// Copyright 2019 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
|
||||
message GlAnimationOverlayCalculatorOptions {
|
||||
extend CalculatorOptions {
|
||||
optional GlAnimationOverlayCalculatorOptions ext = 174760573;
|
||||
}
|
||||
|
||||
// Default aspect ratio of rendering target width over height.
|
||||
// This specific value is for 3:4 view. Do not change this default value.
|
||||
optional float aspect_ratio = 1 [default = 0.75];
|
||||
// Default vertical field of view in degrees. This specific default value
|
||||
// is arbitrary. Do not change this default value. If you want to use
|
||||
// a different vertical_fov_degrees, set it in the options.
|
||||
optional float vertical_fov_degrees = 2 [default = 70.0];
|
||||
|
||||
// Perspective projection matrix z-clipping near plane value.
|
||||
optional float z_clipping_plane_near = 3 [default = 0.1];
|
||||
// Perspective projection matrix z-clipping far plane value.
|
||||
optional float z_clipping_plane_far = 4 [default = 1000.0];
|
||||
|
||||
// Speed at which to play the animation (in frames per second).
|
||||
optional float animation_speed_fps = 5 [default = 25.0];
|
||||
}
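
// Illustrative node options in a CalculatorGraphConfig (the values are
// examples only, not recommendations):
//   options: {
//     [mediapipe.GlAnimationOverlayCalculatorOptions.ext] {
//       animation_speed_fps: 25.0
//       z_clipping_plane_near: 0.1
//       z_clipping_plane_far: 1000.0
//     }
//   }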
|
|
@ -0,0 +1,168 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/types/span.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/deps/file_path.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
|
||||
|
||||
namespace {
|
||||
constexpr char kInputStreamTag[] = "FRAME_ANNOTATION";
|
||||
constexpr char kOutputStreamTag[] = "LIFTED_FRAME_ANNOTATION";
|
||||
|
||||
// Each detection object will be assigned a unique id that starts from 1.
|
||||
static int object_id = 0;
|
||||
|
||||
inline int GetNextObjectId() { return ++object_id; }
|
||||
} // namespace
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Lifts the 2D points in a tracked frame annotation to 3D.
|
||||
//
|
||||
// Input:
|
||||
//  FRAME_ANNOTATION - Frame annotation with detected 2D points.
|
||||
// Output:
|
||||
//  LIFTED_FRAME_ANNOTATION - Result FrameAnnotation with lifted 3D points.
|
||||
//
|
||||
// Usage example:
|
||||
// node {
|
||||
// calculator: "Lift2DFrameAnnotationTo3DCalculator"
|
||||
//   input_stream: "FRAME_ANNOTATION:tracked_annotations"
|
||||
//   output_stream: "LIFTED_FRAME_ANNOTATION:lifted_3d_annotations"
|
||||
// }
|
||||
class Lift2DFrameAnnotationTo3DCalculator : public CalculatorBase {
|
||||
public:
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Close(CalculatorContext* cc) override;
|
||||
|
||||
private:
|
||||
::mediapipe::Status ProcessCPU(CalculatorContext* cc,
|
||||
FrameAnnotation* output_objects);
|
||||
::mediapipe::Status LoadOptions(CalculatorContext* cc);
|
||||
|
||||
// Increment and assign object ID for each detected object.
|
||||
// In a single MediaPipe session, the IDs are unique.
|
||||
// Also assign timestamp for the FrameAnnotation to be the input packet
|
||||
// timestamp.
|
||||
void AssignObjectIdAndTimestamp(int64 timestamp_us,
|
||||
FrameAnnotation* annotation);
|
||||
std::unique_ptr<Decoder> decoder_;
|
||||
::mediapipe::Lift2DFrameAnnotationTo3DCalculatorOptions options_;
|
||||
Eigen::Matrix<float, 4, 4, Eigen::RowMajor> projection_matrix_;
|
||||
};
|
||||
REGISTER_CALCULATOR(Lift2DFrameAnnotationTo3DCalculator);
|
||||
|
||||
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
RET_CHECK(cc->Inputs().HasTag(kInputStreamTag));
|
||||
RET_CHECK(cc->Outputs().HasTag(kOutputStreamTag));
|
||||
cc->Inputs().Tag(kInputStreamTag).Set<FrameAnnotation>();
|
||||
cc->Outputs().Tag(kOutputStreamTag).Set<FrameAnnotation>();
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
MP_RETURN_IF_ERROR(LoadOptions(cc));
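// The hard-coded projection matrix below appears to correspond to a portrait
// 3:4 aspect ratio (1.5731 ~= 2.0975 * 0.75) and near/far clipping planes of
// roughly 0.1 and 1000, since (near+far)/(near-far) ~= -1.0002 and
// 2*near*far/(near-far) ~= -0.2. This is an interpretation of the constants,
// not a statement from the original authors.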
|
||||
// clang-format off
|
||||
projection_matrix_ <<
|
||||
1.5731, 0, 0, 0,
|
||||
0, 2.0975, 0, 0,
|
||||
0, 0, -1.0002, -0.2,
|
||||
0, 0, -1, 0;
|
||||
// clang-format on
|
||||
|
||||
decoder_ = absl::make_unique<Decoder>(
|
||||
BeliefDecoderConfig(options_.decoder_config()));
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
if (cc->Inputs().Tag(kInputStreamTag).IsEmpty()) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
auto output_objects = absl::make_unique<FrameAnnotation>();
|
||||
|
||||
MP_RETURN_IF_ERROR(ProcessCPU(cc, output_objects.get()));
|
||||
|
||||
// Output
|
||||
if (cc->Outputs().HasTag(kOutputStreamTag)) {
|
||||
cc->Outputs()
|
||||
.Tag(kOutputStreamTag)
|
||||
.Add(output_objects.release(), cc->InputTimestamp());
|
||||
}
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::ProcessCPU(
|
||||
CalculatorContext* cc, FrameAnnotation* output_objects) {
|
||||
const auto& input_frame_annotations =
|
||||
cc->Inputs().Tag(kInputStreamTag).Get<FrameAnnotation>();
|
||||
// Copy the input frame annotation to the output
|
||||
*output_objects = input_frame_annotations;
|
||||
|
||||
auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait*/ true,
|
||||
output_objects);
|
||||
if (!status.ok()) {
|
||||
LOG(ERROR) << status;
|
||||
return status;
|
||||
}
|
||||
AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(),
|
||||
output_objects);
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Close(
|
||||
CalculatorContext* cc) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::LoadOptions(
|
||||
CalculatorContext* cc) {
|
||||
// Get calculator options specified in the graph.
|
||||
options_ =
|
||||
cc->Options<::mediapipe::Lift2DFrameAnnotationTo3DCalculatorOptions>();
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
void Lift2DFrameAnnotationTo3DCalculator::AssignObjectIdAndTimestamp(
|
||||
int64 timestamp_us, FrameAnnotation* annotation) {
|
||||
for (auto& ann : *annotation->mutable_annotations()) {
|
||||
ann.set_object_id(GetNextObjectId());
|
||||
}
|
||||
annotation->set_timestamp(timestamp_us);
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,30 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// The option proto for the Lift2DFrameAnnotationTo3DCalculatorOptions.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
import "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto";
|
||||
|
||||
message Lift2DFrameAnnotationTo3DCalculatorOptions {
|
||||
extend CalculatorOptions {
|
||||
optional Lift2DFrameAnnotationTo3DCalculatorOptions ext = 290166284;
|
||||
}
|
||||
|
||||
optional BeliefDecoderConfig decoder_config = 1;
|
||||
}
|
101
mediapipe/graphs/object_detection_3d/calculators/model.cc
Normal file
|
@ -0,0 +1,101 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/model.h"
|
||||
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
void Model::SetTransformation(const Eigen::Matrix4f& transform) {
|
||||
transformation_ = transform;
|
||||
}
|
||||
|
||||
void Model::SetTranslation(const Eigen::Vector3f& translation) {
|
||||
transformation_.col(3).template head<3>() = translation;
|
||||
}
|
||||
|
||||
void Model::SetRotation(float roll, float pitch, float yaw) {
|
||||
// In our coordinate system, Y is up. We first rotate the object around Y
|
||||
// (yaw), then around Z (pitch), and finally around X (roll).
|
||||
Eigen::Matrix3f r;
|
||||
r = Eigen::AngleAxisf(yaw, Eigen::Vector3f::UnitY()) *
|
||||
Eigen::AngleAxisf(pitch, Eigen::Vector3f::UnitZ()) *
|
||||
Eigen::AngleAxisf(roll, Eigen::Vector3f::UnitX());
|
||||
transformation_.topLeftCorner<3, 3>() = r;
|
||||
}
|
||||
|
||||
void Model::SetRotation(const Eigen::Matrix3f& rotation) {
|
||||
transformation_.topLeftCorner<3, 3>() = rotation;
|
||||
}
|
||||
|
||||
void Model::SetScale(const Eigen::Vector3f& scale) { scale_ = scale; }
|
||||
|
||||
void Model::SetCategory(const std::string& category) { category_ = category; }
|
||||
|
||||
const Eigen::Vector3f Model::GetRotationAngles() const {
|
||||
Eigen::Vector3f ypr = transformation_.topLeftCorner<3, 3>().eulerAngles(1, 2, 0);
|
||||
return Eigen::Vector3f(ypr(2), ypr(1), ypr(0));  // swap YPR with RPY
|
||||
}
|
||||
|
||||
const Eigen::Matrix4f& Model::GetTransformation() const {
|
||||
return transformation_;
|
||||
}
|
||||
|
||||
const Eigen::Vector3f& Model::GetScale() const { return scale_; }
|
||||
|
||||
const Eigen::Ref<const Eigen::Vector3f> Model::GetTranslation() const {
|
||||
return transformation_.col(3).template head<3>();
|
||||
}
|
||||
|
||||
const Eigen::Ref<const Eigen::Matrix3f> Model::GetRotation() const {
|
||||
return transformation_.template topLeftCorner<3, 3>();
|
||||
}
|
||||
|
||||
const std::string& Model::GetCategory() const { return category_; }
|
||||
|
||||
void Model::Deserialize(const Object& obj) {
|
||||
CHECK_EQ(obj.rotation_size(), 9);
|
||||
CHECK_EQ(obj.translation_size(), 3);
|
||||
CHECK_EQ(obj.scale_size(), 3);
|
||||
category_ = obj.category();
|
||||
|
||||
using RotationMatrix = Eigen::Matrix<float, 3, 3, Eigen::RowMajor>;
|
||||
transformation_.setIdentity();
|
||||
transformation_.topLeftCorner<3, 3>() =
|
||||
Eigen::Map<const RotationMatrix>(obj.rotation().data());
|
||||
transformation_.col(3).head<3>() =
|
||||
Eigen::Map<const Eigen::Vector3f>(obj.translation().data());
|
||||
scale_ = Eigen::Map<const Eigen::Vector3f>(obj.scale().data());
|
||||
Update();
|
||||
}
|
||||
|
||||
void Model::Serialize(Object* obj) {
|
||||
obj->set_category(category_);
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
obj->add_rotation(transformation_(i, j));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
obj->add_translation(transformation_(i, 3));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
obj->add_scale(scale_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
92
mediapipe/graphs/object_detection_3d/calculators/model.h
Normal file
|
@ -0,0 +1,92 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/object.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/types.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
class Model {
|
||||
public:
|
||||
EIGEN_MAKE_ALIGNED_OPERATOR_NEW
|
||||
|
||||
enum Type {
|
||||
kVisualizationOnly = 0,
|
||||
kBoundingBox,
|
||||
kSkeleton,
|
||||
kShape, // Shape is a virtual object.
|
||||
kNumModes,
|
||||
};
|
||||
|
||||
virtual ~Model() = default;
|
||||
|
||||
virtual void SetTransformation(const Eigen::Matrix4f& transform);
|
||||
virtual void SetTranslation(const Eigen::Vector3f& translation);
|
||||
|
||||
// Compute the rotation matrix from these angles and update the transformation
|
||||
// matrix accordingly
|
||||
virtual void SetRotation(float roll, float pitch, float yaw);
|
||||
virtual void SetRotation(const Eigen::Matrix3f& rotation);
|
||||
virtual void SetScale(const Eigen::Vector3f& scale);
|
||||
virtual void SetCategory(const std::string& category);
|
||||
virtual size_t GetNumberKeypoints() const { return number_keypoints_; }
|
||||
|
||||
// Gets Euler angles in the order of roll, pitch, yaw.
|
||||
virtual const Eigen::Vector3f GetRotationAngles() const;
|
||||
virtual const Eigen::Matrix4f& GetTransformation() const;
|
||||
virtual const Eigen::Vector3f& GetScale() const;
|
||||
virtual const Eigen::Ref<const Eigen::Vector3f> GetTranslation() const;
|
||||
virtual const Eigen::Ref<const Eigen::Matrix3f> GetRotation() const;
|
||||
virtual const std::string& GetCategory() const;
|
||||
|
||||
// Update the model's keypoints in the world-coordinate system.
|
||||
// The update includes transforming the model to the world-coordinate system
|
||||
// as well as scaling the model.
|
||||
// The user is expected to call this function after setting the rotation,
|
||||
// orientation, or the scale of the model to get an updated model.
|
||||
virtual void Update() = 0;
|
||||
|
||||
// Update the model's parameters (orientation, position, and scale) from the
|
||||
// user-provided variables.
|
||||
virtual void Adjust(const std::vector<float>& variables) = 0;
|
||||
|
||||
// Returns a pointer to the model's keypoints.
|
||||
// Use Eigen::Map to cast the pointer back to Vector3 or Vector4
|
||||
virtual const float* GetVertex(size_t id) const = 0;
|
||||
virtual float* GetVertex(size_t id) = 0;
|
||||
virtual void Deserialize(const Object& obj);
|
||||
virtual void Serialize(Object* obj);
|
||||
|
||||
// TODO: make member variables protected, and add public apis.
|
||||
// 4x4 transformation matrix mapping the first keypoint to world coordinate
|
||||
Eigen::Matrix4f transformation_;
|
||||
Eigen::Vector3f scale_; // width, height, depth
|
||||
Type model_type_;
|
||||
size_t number_keypoints_;
|
||||
std::string category_;
|
||||
|
||||
protected:
|
||||
Model(Type type, size_t number_keypoints, const std::string& category)
|
||||
: model_type_(type),
|
||||
number_keypoints_(number_keypoints),
|
||||
category_(category) {}
|
||||
};
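
// Illustrative usage with a hypothetical concrete subclass (not part of this
// header):
//   SomeBoxModel box;                       // derives from Model
//   box.Deserialize(object_proto);          // reads rotation/translation/scale
//   box.SetScale(Eigen::Vector3f(0.1f, 0.2f, 0.3f));
//   box.Update();                           // recompute world-space keypoints
//   box.Serialize(&object_proto);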
|
||||
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_
|
|
@ -0,0 +1,48 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
message TimedModelMatrixProto {
|
||||
// 4x4 model matrix stored in ROW major order.
|
||||
repeated float matrix_entries = 1 [packed = true];
|
||||
// Timestamp of this model matrix in milliseconds.
|
||||
optional int64 time_msec = 2 [default = 0];
|
||||
// Unique per object id
|
||||
optional int32 id = 3 [default = -1];
|
||||
}
|
||||
|
||||
message TimedModelMatrixProtoList {
|
||||
repeated TimedModelMatrixProto model_matrix = 1;
|
||||
}
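
// Illustrative text-format example (the values are made up):
//   model_matrix {
//     id: 1
//     time_msec: 33
//     matrix_entries: [1, 0, 0, 0,
//                      0, 1, 0, 0,
//                      0, 0, 1, -98,
//                      0, 0, 0, 1]
//   }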
|
||||
|
||||
// For convenience, when the desired information or transformation can be
|
||||
// encoded into vectors (e.g. when the matrix represents a scale or Euler-angle-
|
||||
// based rotation operation.)
|
||||
message TimedVectorProto {
|
||||
// The vector values themselves.
|
||||
repeated float vector_entries = 1 [packed = true];
|
||||
|
||||
// Timestamp of this vector in milliseconds.
|
||||
optional int64 time_msec = 2 [default = 0];
|
||||
|
||||
// Unique per object id
|
||||
optional int32 id = 3 [default = -1];
|
||||
}
|
||||
|
||||
message TimedVectorProtoList {
|
||||
repeated TimedVectorProto vector_list = 1;
|
||||
}
|
124
mediapipe/graphs/object_detection_3d/calculators/object.proto
Normal file
|
@ -0,0 +1,124 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
message KeyPoint {
|
||||
// The position of the keypoint in the local coordinate system of the rigid
|
||||
// object.
|
||||
float x = 1;
|
||||
float y = 2;
|
||||
float z = 3;
|
||||
|
||||
// Sphere around the keypoint, indicating the annotator's confidence of the
|
||||
// position in meters.
|
||||
float confidence_radius = 4;
|
||||
|
||||
// The name of the keypoint (e.g. legs, head, etc.).
|
||||
// Does not have to be unique.
|
||||
string name = 5;
|
||||
|
||||
// Indicates whether the keypoint is hidden or not.
|
||||
bool hidden = 6;
|
||||
}
|
||||
|
||||
message Object {
|
||||
// Unique object id through a sequence. There might be multiple objects of
|
||||
// the same label in this sequence.
|
||||
int32 id = 1;
|
||||
|
||||
// Describes what category an object is. E.g. object class, attribute,
|
||||
// instance or person identity. This provides additional context for the
|
||||
// object type.
|
||||
string category = 2;
|
||||
|
||||
enum Type {
|
||||
UNDEFINED_TYPE = 0;
|
||||
BOUNDING_BOX = 1;
|
||||
SKELETON = 2;
|
||||
}
|
||||
|
||||
Type type = 3;
|
||||
|
||||
// 3x3 row-major rotation matrix describing the orientation of the rigid
|
||||
// object's frame of reference in the world-coordinate system.
|
||||
repeated float rotation = 4;
|
||||
|
||||
// 3x1 vector describing the translation of the rigid object's frame of
|
||||
// reference in the world-coordinate system in meters.
|
||||
repeated float translation = 5;
|
||||
|
||||
// 3x1 vector describing the scale of the rigid object's frame of reference in
|
||||
// the world-coordinate system in meters.
|
||||
repeated float scale = 6;
|
||||
|
||||
// List of all the key points associated with this object in the object
|
||||
// coordinate system.
|
||||
// The first keypoint is always the object's frame of reference,
|
||||
// e.g. the centroid of the box.
|
||||
// E.g. for a bounding box with its center as the frame of reference, the 9 keypoints are:
|
||||
// {0., 0., 0.},
|
||||
// {-.5, -.5, -.5}, {-.5, -.5, +.5}, {-.5, +.5, -.5}, {-.5, +.5, +.5},
|
||||
// {+.5, -.5, -.5}, {+.5, -.5, +.5}, {+.5, +.5, -.5}, {+.5, +.5, +.5}
|
||||
// To get the bounding box in the world-coordinate system, we first scale the
|
||||
// box then transform the scaled box.
|
||||
// For example, bounding box in the world coordinate system is
|
||||
// rotation * scale * keypoints + translation
|
||||
repeated KeyPoint keypoints = 7;
|
||||
|
||||
// Enum to reflect how this object is created.
|
||||
enum Method {
|
||||
UNKNOWN_METHOD = 0;
|
||||
ANNOTATION = 1; // Created by data annotation.
|
||||
AUGMENTATION = 2; // Created by data augmentation.
|
||||
}
|
||||
Method method = 8;
|
||||
}
|
||||
|
||||
// The edge connecting two keypoints together
|
||||
message Edge {
|
||||
// keypoint id of the edge's source
|
||||
int32 source = 1;
|
||||
|
||||
// keypoint id of the edge's sink
|
||||
int32 sink = 2;
|
||||
}
|
||||
|
||||
// The skeleton template for different objects (e.g. humans, chairs, hands, etc)
|
||||
// The annotation tool reads the skeleton template dictionary.
|
||||
message Skeleton {
|
||||
// The origin keypoint in the object coordinate system. (i.e. Point 0, 0, 0)
|
||||
int32 reference_keypoint = 1;
|
||||
|
||||
// The skeleton's category (e.g. human, chair, hand.). Should be unique in the
|
||||
// dictionary.
|
||||
string category = 2;
|
||||
|
||||
// Initialization value for all the keypoints in the skeleton in the object's
|
||||
// local coordinate system. Pursuit will transform these points using the object's
|
||||
// transformation to get the keypoints in the world coordinate system.
|
||||
repeated KeyPoint keypoints = 3;
|
||||
|
||||
// List of edges connecting keypoints
|
||||
repeated Edge edges = 4;
|
||||
}
|
||||
|
||||
// The list of all the modeled skeletons in our library. These models can be
|
||||
// objects (chairs, desks, etc), humans (full pose, hands, faces, etc), or box.
|
||||
// We can have multiple skeletons in the same file.
|
||||
message Skeletons {
|
||||
repeated Skeleton object = 1;
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
|
||||
|
||||
#include "mediapipe/framework/port/logging.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
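// Wraps the tensor's float data in a cv::Mat without copying it. For example,
// a 1x40x30x16 kTfLiteFloat32 tensor (an assumed shape, for illustration)
// yields a 40x30 CV_32FC(16) Mat that aliases tensor.data.f.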
cv::Mat ConvertTfliteTensorToCvMat(const TfLiteTensor& tensor) {
|
||||
// Check that the tensor is BxHxWxC (dims->size == 4) with batch size one (data[0] == 1).
|
||||
CHECK(tensor.dims->size == 4 && tensor.dims->data[0] == 1);
|
||||
CHECK_EQ(kTfLiteFloat32, tensor.type) << "tflite_tensor type is not float";
|
||||
|
||||
const size_t num_output_channels = tensor.dims->data[3];
|
||||
const int dims = 2;
|
||||
const int sizes[] = {tensor.dims->data[1], tensor.dims->data[2]};
|
||||
const int type = CV_MAKETYPE(CV_32F, num_output_channels);
|
||||
return cv::Mat(dims, sizes, type, reinterpret_cast<void*>(tensor.data.f));
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_
|
||||
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "tensorflow/lite/interpreter.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Converts a float32 TfLite tensor (batch size 1) to a cv::Mat that wraps the tensor's data.
|
||||
cv::Mat ConvertTfliteTensorToCvMat(const TfLiteTensor& tensor);
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_
|
|
@ -0,0 +1,216 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "Eigen/Dense"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/types/span.h"
|
||||
#include "mediapipe/framework/calculator_framework.h"
|
||||
#include "mediapipe/framework/deps/file_path.h"
|
||||
#include "mediapipe/framework/port/opencv_core_inc.h"
|
||||
#include "mediapipe/framework/port/ret_check.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
|
||||
#include "mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.pb.h"
|
||||
#include "tensorflow/lite/interpreter.h"
|
||||
|
||||
namespace {
|
||||
constexpr char kInputStreamTag[] = "TENSORS";
|
||||
constexpr char kOutputStreamTag[] = "ANNOTATIONS";
|
||||
|
||||
// Each detected object will be assigned a unique id that starts from 1.
|
||||
static int object_id = 0;
|
||||
|
||||
inline int GetNextObjectId() { return ++object_id; }
|
||||
} // namespace
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
// Converts the output TFLite tensors of the deep pursuit 3d model into
|
||||
// FrameAnnotation.
|
||||
//
|
||||
// Input:
|
||||
// TENSORS - Vector of TfLiteTensor of type kTfLiteFloat32.
|
||||
// Output:
|
||||
// ANNOTATIONS - Result FrameAnnotation.
|
||||
//
|
||||
// Usage example:
|
||||
// node {
|
||||
// calculator: "TfLiteTensorsToObjectsCalculator"
|
||||
// input_stream: "TENSORS:tensors"
|
||||
// output_stream: "ANNOTATIONS:annotations"
|
||||
// }
|
||||
class TfLiteTensorsToObjectsCalculator : public CalculatorBase {
|
||||
public:
|
||||
static ::mediapipe::Status GetContract(CalculatorContract* cc);
|
||||
|
||||
::mediapipe::Status Open(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Process(CalculatorContext* cc) override;
|
||||
::mediapipe::Status Close(CalculatorContext* cc) override;
|
||||
|
||||
private:
|
||||
::mediapipe::Status ProcessCPU(CalculatorContext* cc,
|
||||
FrameAnnotation* output_objects);
|
||||
::mediapipe::Status LoadOptions(CalculatorContext* cc);
|
||||
// Takes point_3d in FrameAnnotation, projects it to 2D, and overwrites the
|
||||
// point_2d field with the projection.
|
||||
void Project3DTo2D(bool portrait, FrameAnnotation* annotation) const;
|
||||
// Increment and assign object ID for each detected object.
|
||||
// In a single MediaPipe session, the IDs are unique.
|
||||
// Also assign timestamp for the FrameAnnotation to be the input packet
|
||||
// timestamp.
|
||||
void AssignObjectIdAndTimestamp(int64 timestamp_us,
|
||||
FrameAnnotation* annotation);
|
||||
|
||||
int num_classes_ = 0;
|
||||
int num_keypoints_ = 0;
|
||||
|
||||
::mediapipe::TfLiteTensorsToObjectsCalculatorOptions options_;
|
||||
std::unique_ptr<Decoder> decoder_;
|
||||
Eigen::Matrix<float, 4, 4, Eigen::RowMajor> projection_matrix_;
|
||||
};
|
||||
REGISTER_CALCULATOR(TfLiteTensorsToObjectsCalculator);
|
||||
|
||||
::mediapipe::Status TfLiteTensorsToObjectsCalculator::GetContract(
|
||||
CalculatorContract* cc) {
|
||||
RET_CHECK(!cc->Inputs().GetTags().empty());
|
||||
RET_CHECK(!cc->Outputs().GetTags().empty());
|
||||
|
||||
if (cc->Inputs().HasTag(kInputStreamTag)) {
|
||||
cc->Inputs().Tag(kInputStreamTag).Set<std::vector<TfLiteTensor>>();
|
||||
}
|
||||
|
||||
if (cc->Outputs().HasTag(kOutputStreamTag)) {
|
||||
cc->Outputs().Tag(kOutputStreamTag).Set<FrameAnnotation>();
|
||||
}
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TfLiteTensorsToObjectsCalculator::Open(
|
||||
CalculatorContext* cc) {
|
||||
MP_RETURN_IF_ERROR(LoadOptions(cc));
|
||||
// clang-format off
|
||||
projection_matrix_ <<
|
||||
1.5731, 0, 0, 0,
|
||||
0, 2.0975, 0, 0,
|
||||
0, 0, -1.0002, -0.2,
|
||||
0, 0, -1, 0;
|
||||
// clang-format on
|
||||
decoder_ = absl::make_unique<Decoder>(
|
||||
BeliefDecoderConfig(options_.decoder_config()));
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TfLiteTensorsToObjectsCalculator::Process(
|
||||
CalculatorContext* cc) {
|
||||
if (cc->Inputs().Tag(kInputStreamTag).IsEmpty()) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
auto output_objects = absl::make_unique<FrameAnnotation>();
|
||||
|
||||
MP_RETURN_IF_ERROR(ProcessCPU(cc, output_objects.get()));
|
||||
|
||||
// Output
|
||||
if (cc->Outputs().HasTag(kOutputStreamTag)) {
|
||||
cc->Outputs()
|
||||
.Tag(kOutputStreamTag)
|
||||
.Add(output_objects.release(), cc->InputTimestamp());
|
||||
}
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TfLiteTensorsToObjectsCalculator::ProcessCPU(
|
||||
CalculatorContext* cc, FrameAnnotation* output_objects) {
|
||||
const auto& input_tensors =
|
||||
cc->Inputs().Tag(kInputStreamTag).Get<std::vector<TfLiteTensor>>();
|
||||
|
||||
cv::Mat prediction_heatmap = ConvertTfliteTensorToCvMat(input_tensors[0]);
|
||||
cv::Mat offsetmap = ConvertTfliteTensorToCvMat(input_tensors[1]);
|
||||
|
||||
*output_objects =
|
||||
decoder_->DecodeBoundingBoxKeypoints(prediction_heatmap, offsetmap);
|
||||
auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait*/ true,
|
||||
output_objects);
|
||||
if (!status.ok()) {
|
||||
LOG(ERROR) << status;
|
||||
return status;
|
||||
}
|
||||
Project3DTo2D(/*portrait*/ true, output_objects);
|
||||
AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(),
|
||||
output_objects);
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TfLiteTensorsToObjectsCalculator::Close(
|
||||
CalculatorContext* cc) {
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
::mediapipe::Status TfLiteTensorsToObjectsCalculator::LoadOptions(
|
||||
CalculatorContext* cc) {
|
||||
// Get calculator options specified in the graph.
|
||||
options_ =
|
||||
cc->Options<::mediapipe::TfLiteTensorsToObjectsCalculatorOptions>();
|
||||
|
||||
num_classes_ = options_.num_classes();
|
||||
num_keypoints_ = options_.num_keypoints();
|
||||
|
||||
// Currently only 2D keypoints are supported, i.e. num_values_per_keypoint must be 2.
|
||||
CHECK_EQ(options_.num_values_per_keypoint(), 2);
|
||||
|
||||
return ::mediapipe::OkStatus();
|
||||
}
|
||||
|
||||
void TfLiteTensorsToObjectsCalculator::Project3DTo2D(
|
||||
bool portrait, FrameAnnotation* annotation) const {
|
||||
for (auto& ann : *annotation->mutable_annotations()) {
|
||||
for (auto& key_point : *ann.mutable_keypoints()) {
|
||||
Eigen::Vector4f point3d;
|
||||
point3d << key_point.point_3d().x(), key_point.point_3d().y(),
|
||||
key_point.point_3d().z(), 1.0f;
|
||||
Eigen::Vector4f point3d_projection = projection_matrix_ * point3d;
|
||||
float u, v;
|
||||
const float inv_w = 1.0f / point3d_projection(3);
|
||||
if (portrait) {
|
||||
u = (point3d_projection(1) * inv_w + 1.0f) * 0.5f;
|
||||
v = (point3d_projection(0) * inv_w + 1.0f) * 0.5f;
|
||||
} else {
|
||||
u = (point3d_projection(0) * inv_w + 1.0f) * 0.5f;
|
||||
v = (1.0f - point3d_projection(1) * inv_w) * 0.5f;
|
||||
}
|
||||
key_point.mutable_point_2d()->set_x(u);
|
||||
key_point.mutable_point_2d()->set_y(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TfLiteTensorsToObjectsCalculator::AssignObjectIdAndTimestamp(
|
||||
int64 timestamp_us, FrameAnnotation* annotation) {
|
||||
for (auto& ann : *annotation->mutable_annotations()) {
|
||||
ann.set_object_id(GetNextObjectId());
|
||||
}
|
||||
annotation->set_timestamp(timestamp_us);
|
||||
}
|
||||
|
||||
} // namespace mediapipe
|
|
@ -0,0 +1,39 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// The options proto for TfLiteTensorsToObjectsCalculator.
|
||||
|
||||
syntax = "proto2";
|
||||
|
||||
package mediapipe;
|
||||
|
||||
import "mediapipe/framework/calculator.proto";
|
||||
import "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto";
|
||||
|
||||
message TfLiteTensorsToObjectsCalculatorOptions {
|
||||
extend CalculatorOptions {
|
||||
optional TfLiteTensorsToObjectsCalculatorOptions ext = 263667646;
|
||||
}
|
||||
|
||||
// The number of output classes predicted by the detection model.
|
||||
optional int32 num_classes = 1;
|
||||
|
||||
// The number of predicted keypoints.
|
||||
optional int32 num_keypoints = 2;
|
||||
// The dimension of each keypoint, i.e. the number of values predicted for each
|
||||
// keypoint.
|
||||
optional int32 num_values_per_keypoint = 3 [default = 2];
|
||||
|
||||
optional BeliefDecoderConfig decoder_config = 4;
|
||||
}
|
56
mediapipe/graphs/object_detection_3d/calculators/types.h
Normal file
|
@ -0,0 +1,56 @@
|
|||
// Copyright 2020 The MediaPipe Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_
|
||||
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "Eigen/Geometry"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
using Eigen::Map;
|
||||
using Eigen::Vector2f;
|
||||
using Eigen::Vector3f;
|
||||
using Eigen::Vector4f;
|
||||
using Matrix4f_RM = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
|
||||
using Matrix3f_RM = Eigen::Matrix<float, 3, 3, Eigen::RowMajor>;
|
||||
|
||||
using Face = std::array<int, 4>;
|
||||
|
||||
struct SuperPoint {
|
||||
enum PointSourceType { kPointCloud = 0, kBoundingBox = 1, kSkeleton = 2 };
|
||||
// The id of the point in the point-cloud
|
||||
int reference_point;
|
||||
// The source of the point (point cloud, bounding box, or skeleton).
|
||||
PointSourceType source;
|
||||
// The id of the point in the set of points in the current frame.
|
||||
int id;
|
||||
// If source is kBoundingBox or kSkeleton, object_id stores the id of the
|
||||
// object this point belongs to.
|
||||
int object_id;
|
||||
// projected u-v value
|
||||
Vector2f uv;
|
||||
Vector2f pixel;
|
||||
// the 3D point
|
||||
Vector3f point_3d;
|
||||
// Color
|
||||
Eigen::Matrix<unsigned char, 4, 1> color;
|
||||
bool rendered;
|
||||
};
|
||||
|
||||
} // namespace mediapipe
|
||||
|
||||
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_
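A small illustrative sketch (hypothetical helper, not part of this change) of how a SuperPoint might be populated for a projected bounding-box vertex:

  // Sketch: tag vertex `vertex_id` of object `object_id` as a bounding-box point.
  mediapipe::SuperPoint MakeBoxVertex(int vertex_id, int object_id,
                                      const Eigen::Vector3f& point_3d) {
    mediapipe::SuperPoint point;
    point.source = mediapipe::SuperPoint::kBoundingBox;
    point.id = vertex_id;
    point.object_id = object_id;
    point.point_3d = point_3d;
    point.rendered = false;
    return point;
  }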
|
|
@ -0,0 +1,133 @@
|
|||
# MediaPipe object detection 3D with tracking graph.
|
||||
|
||||
# Images on GPU coming into and out of the graph.
|
||||
input_stream: "input_video"
|
||||
input_stream: "input_width"
|
||||
input_stream: "input_height"
|
||||
output_stream: "output_video"
|
||||
|
||||
# Crops the image from the center to the size WIDTHxHEIGHT.
|
||||
node: {
|
||||
calculator: "ImageCroppingCalculator"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "IMAGE_GPU:input_video_4x3"
|
||||
input_stream: "WIDTH:input_width"
|
||||
input_stream: "HEIGHT:input_height"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] {
|
||||
border_mode: BORDER_REPLICATE
|
||||
}
|
||||
}
|
||||
}
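A hedged host-side sketch (C++, not part of this graph file) of how the input_width and input_height streams above could be fed by an application, assuming the cropping calculator expects plain int packets on its WIDTH and HEIGHT streams:

  #include "mediapipe/framework/calculator_framework.h"

  // Sketch: feed the desired crop size at a given frame timestamp.
  ::mediapipe::Status PushCropSize(mediapipe::CalculatorGraph* graph,
                                   int width, int height,
                                   mediapipe::Timestamp timestamp) {
    auto status = graph->AddPacketToInputStream(
        "input_width", mediapipe::MakePacket<int>(width).At(timestamp));
    if (!status.ok()) return status;
    return graph->AddPacketToInputStream(
        "input_height", mediapipe::MakePacket<int>(height).At(timestamp));
  }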
|
||||
|
||||
# Creates a copy of the input_video stream. At the end of the graph, the
|
||||
# GlAnimationOverlayCalculator will consume the input_video texture and draw
|
||||
# on top of it.
|
||||
node: {
|
||||
calculator: "GlScalerCalculator"
|
||||
input_stream: "VIDEO:input_video_4x3"
|
||||
output_stream: "VIDEO:input_video_copy"
|
||||
}
|
||||
|
||||
# Resamples the images at a specified frame rate. This calculator is used to
|
||||
# control the frequency of subsequent calculators/subgraphs, e.g. to reduce
|
||||
# power consumption for expensive processing.
|
||||
node {
|
||||
calculator: "PacketResamplerCalculator"
|
||||
input_stream: "DATA:input_video_copy"
|
||||
output_stream: "DATA:sampled_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
|
||||
frame_rate: 5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node {
|
||||
calculator: "ObjectronDetectionSubgraphGpu"
|
||||
input_stream: "IMAGE_GPU:sampled_input_video"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
}
|
||||
|
||||
node {
|
||||
calculator: "ObjectronTrackingSubgraphGpu"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "IMAGE_GPU:input_video_copy"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
}
|
||||
|
||||
# The rendering nodes:
|
||||
# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly
|
||||
# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask.
|
||||
# These models are designed using different tools, so we supply a transformation
|
||||
# to bring both of them to the Objectron's coordinate system.
|
||||
|
||||
# Creates the model matrices for the tracked object given the lifted 3D points.
|
||||
# This calculator does two things: 1) estimates the object's pose (orientation,
|
||||
# translation, and scale) from the 3D vertices, and
|
||||
# 2) brings the object from the objectron's coordinate system to the renderer
|
||||
# (OpenGL) coordinate system. Since the final goal is to render a mesh file on
|
||||
# top of the object, we also supply a transformation to bring the mesh to the
|
||||
# objectron's coordinate system, and rescales the mesh to unit size.
|
||||
node {
|
||||
calculator: "AnnotationsToModelMatricesCalculator"
|
||||
input_stream: "ANNOTATIONS:lifted_tracked_objects"
|
||||
output_stream: "MODEL_MATRICES:model_matrices"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
|
||||
# Re-scale the CAD model to the size of a unit box
|
||||
model_scale: [0.05, 0.05, 0.05]
|
||||
# Bring the box CAD model to objectron's coordinate system. This
|
||||
# is equivalent to a -pi/2 rotation about the y-axis (right-hand rule):
|
||||
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY())
|
||||
model_transformation: [0.0, 0.0, -1.0, 0.0]
|
||||
model_transformation: [0.0, 1.0, 0.0, 0.0]
|
||||
model_transformation: [1.0, 0.0, 0.0, 0.0]
|
||||
model_transformation: [0.0, 0.0, 0.0, 1.0]
|
||||
}
|
||||
}
|
||||
}
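As a standalone sanity check (illustrative only, not part of the graph), the upper-left 3x3 block of the model_transformation above can be reproduced with the Eigen expression quoted in the comment:

  #include <cmath>
  #include <iostream>
  #include "Eigen/Geometry"

  int main() {
    // Rotation by -pi/2 about the y-axis, as referenced in the comment above.
    const Eigen::Matrix3f r =
        Eigen::AngleAxisf(static_cast<float>(-M_PI / 2), Eigen::Vector3f::UnitY())
            .toRotationMatrix();
    // Prints (up to rounding):
    //  0  0 -1
    //  0  1  0
    //  1  0  0
    std::cout << r << std::endl;
    return 0;
  }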
|
||||
|
||||
# Computes the model matrices for the CAD model of the chair, to be used as an
|
||||
# occlusion mask. The model will be rendered at the exact same location as the
|
||||
# bounding box.
|
||||
node {
|
||||
calculator: "AnnotationsToModelMatricesCalculator"
|
||||
input_stream: "ANNOTATIONS:lifted_tracked_objects"
|
||||
output_stream: "MODEL_MATRICES:mask_model_matrices"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
|
||||
# Re-scale the CAD model to the size of a unit box
|
||||
model_scale: [0.15, 0.1, 0.15]
|
||||
# Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This
|
||||
# is equivalent to a -pi/2 rotation about the x-axis:
|
||||
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX())
|
||||
model_transformation: [1.0, 0.0, 0.0, 0.0]
|
||||
model_transformation: [0.0, 1.0, 0.0, -10.0]
|
||||
model_transformation: [0.0, 0.0, -1.0, 0.0]
|
||||
model_transformation: [0.0, 0.0, 0.0, 1.0]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Render everything together. First we render the 3D bounding box animation,
|
||||
# then we render the occlusion mask.
|
||||
node: {
|
||||
calculator: "GlAnimationOverlayCalculator"
|
||||
input_stream: "VIDEO:input_video_4x3"
|
||||
input_stream: "MODEL_MATRICES:model_matrices"
|
||||
input_stream: "MASK_MODEL_MATRICES:mask_model_matrices"
|
||||
output_stream: "output_video"
|
||||
input_side_packet: "TEXTURE:box_texture"
|
||||
input_side_packet: "ANIMATION_ASSET:box_asset_name"
|
||||
input_side_packet: "MASK_TEXTURE:obj_texture"
|
||||
input_side_packet: "MASK_ASSET:obj_asset_name"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
|
||||
# Output resolution is 480x640 with the aspect ratio of 0.75
|
||||
aspect_ratio: 0.75
|
||||
vertical_fov_degrees: 70.
|
||||
animation_speed_fps: 25
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
# MediaPipe object detection 3D with tracking graph.
|
||||
|
||||
# Images on GPU coming into and out of the graph.
|
||||
input_stream: "input_video"
|
||||
input_stream: "input_width"
|
||||
input_stream: "input_height"
|
||||
output_stream: "output_video"
|
||||
|
||||
# Crops the image from the center to the size WIDTHxHEIGHT.
|
||||
node: {
|
||||
calculator: "ImageCroppingCalculator"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "IMAGE_GPU:input_video_4x3"
|
||||
input_stream: "WIDTH:input_width"
|
||||
input_stream: "HEIGHT:input_height"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] {
|
||||
border_mode: BORDER_REPLICATE
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Creates a copy of the input_video stream. At the end of the graph, the
|
||||
# GlAnimationOverlayCalculator will consume the input_video texture and draw
|
||||
# on top of it.
|
||||
node: {
|
||||
calculator: "GlScalerCalculator"
|
||||
input_stream: "VIDEO:input_video_4x3"
|
||||
output_stream: "VIDEO:input_video_copy"
|
||||
}
|
||||
|
||||
# Resamples the images at a specified frame rate. This calculator is used to
|
||||
# control the frequency of subsequent calculators/subgraphs, e.g. to reduce
|
||||
# power consumption for expensive processing.
|
||||
node {
|
||||
calculator: "PacketResamplerCalculator"
|
||||
input_stream: "DATA:input_video_copy"
|
||||
output_stream: "DATA:sampled_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
|
||||
frame_rate: 5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node {
|
||||
calculator: "ObjectronDetectionSubgraphGpu"
|
||||
input_stream: "IMAGE_GPU:sampled_input_video"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
}
|
||||
|
||||
node {
|
||||
calculator: "ObjectronTrackingSubgraphGpu"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "IMAGE_GPU:input_video_copy"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
}
|
||||
|
||||
# The rendering nodes:
|
||||
# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly
|
||||
# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask.
|
||||
# These models are designed using different tools, so we supply a transformation
|
||||
# to bring both of them to the Objectron's coordinate system.
|
||||
|
||||
# Creates the model matrices for the tracked object given the lifted 3D points.
|
||||
# This calculator does two things: 1) estimates the object's pose (orientation,
|
||||
# translation, and scale) from the 3D vertices, and
|
||||
# 2) brings the object from the objectron's coordinate system to the renderer
|
||||
# (OpenGL) coordinate system. Since the final goal is to render a mesh file on
|
||||
# top of the object, we also supply a transformation to bring the mesh to the
|
||||
# objectron's coordinate system, and rescales the mesh to unit size.
|
||||
node {
|
||||
calculator: "AnnotationsToModelMatricesCalculator"
|
||||
input_stream: "ANNOTATIONS:lifted_tracked_objects"
|
||||
output_stream: "MODEL_MATRICES:model_matrices"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
|
||||
# Re-scale the CAD model to the size of a unit box
|
||||
model_scale: [0.05, 0.05, 0.05]
|
||||
# Bring the box CAD model to objectron's coordinate system. This
|
||||
# is equivalent to a -pi/2 rotation about the y-axis (right-hand rule):
|
||||
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY())
|
||||
model_transformation: [0.0, 0.0, -1.0, 0.0]
|
||||
model_transformation: [0.0, 1.0, 0.0, 0.0]
|
||||
model_transformation: [1.0, 0.0, 0.0, 0.0]
|
||||
model_transformation: [0.0, 0.0, 0.0, 1.0]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Computes the model matrices for the CAD model of the shoe, to be used as an
|
||||
# occlusion mask. The model will be rendered at the exact same location as the
|
||||
# bounding box.
|
||||
node {
|
||||
calculator: "AnnotationsToModelMatricesCalculator"
|
||||
input_stream: "ANNOTATIONS:lifted_tracked_objects"
|
||||
output_stream: "MODEL_MATRICES:mask_model_matrices"
|
||||
#input_side_packet: "MODEL_SCALE:model_scale"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
|
||||
# Re-scale the CAD model to the size of a unit box
|
||||
model_scale: [0.45, 0.25, 0.15]
|
||||
# Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This
|
||||
# is equivalent to a -pi/2 rotation about the x-axis (right-hand rule):
|
||||
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX())
|
||||
model_transformation: [1.0, 0.0, 0.0, 0.0]
|
||||
model_transformation: [0.0, 0.0, 1.0, 0.0]
|
||||
model_transformation: [0.0, -1.0, 0.0, 0.0]
|
||||
model_transformation: [0.0, 0.0, 0.0, 1.0]
|
||||
}
|
||||
}
|
||||
}
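Likewise, a standalone check (illustrative only, not part of the graph) that the upper-left 3x3 block of the model_transformation above matches the Eigen expression in the comment:

  #include <cmath>
  #include <iostream>
  #include "Eigen/Geometry"

  int main() {
    // Rotation by -pi/2 about the x-axis, as referenced in the comment above.
    const Eigen::Matrix3f r =
        Eigen::AngleAxisf(static_cast<float>(-M_PI / 2), Eigen::Vector3f::UnitX())
            .toRotationMatrix();
    // Prints (up to rounding):
    //  1  0  0
    //  0  0  1
    //  0 -1  0
    std::cout << r << std::endl;
    return 0;
  }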
|
||||
|
||||
# Render everything together. First we render the 3D bounding box animation,
|
||||
# then we render the occlusion mask.
|
||||
node: {
|
||||
calculator: "GlAnimationOverlayCalculator"
|
||||
input_stream: "VIDEO:input_video_4x3"
|
||||
input_stream: "MODEL_MATRICES:model_matrices"
|
||||
input_stream: "MASK_MODEL_MATRICES:mask_model_matrices"
|
||||
output_stream: "output_video"
|
||||
input_side_packet: "TEXTURE:box_texture"
|
||||
input_side_packet: "ANIMATION_ASSET:box_asset_name"
|
||||
input_side_packet: "MASK_TEXTURE:obj_texture"
|
||||
input_side_packet: "MASK_ASSET:obj_asset_name"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
|
||||
# Output resolution is 480x640 with the aspect ratio of 0.75
|
||||
aspect_ratio: 0.75
|
||||
vertical_fov_degrees: 70.
|
||||
animation_speed_fps: 25
|
||||
}
|
||||
}
|
||||
}
|
52
mediapipe/graphs/object_detection_3d/subgraphs/BUILD
Normal file
|
@ -0,0 +1,52 @@
|
|||
# Copyright 2020 The MediaPipe Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
load(
|
||||
"//mediapipe/framework/tool:mediapipe_graph.bzl",
|
||||
"mediapipe_simple_subgraph",
|
||||
)
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
mediapipe_simple_subgraph(
|
||||
name = "objectron_detection_gpu",
|
||||
graph = "objectron_detection_gpu.pbtxt",
|
||||
register_as = "ObjectronDetectionSubgraphGpu",
|
||||
deps = [
|
||||
"//mediapipe/calculators/image:image_transformation_calculator",
|
||||
"//mediapipe/calculators/tflite:tflite_converter_calculator",
|
||||
"//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator",
|
||||
"//mediapipe/calculators/tflite:tflite_inference_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/calculators:tflite_tensors_to_objects_calculator",
|
||||
],
|
||||
)
|
||||
|
||||
mediapipe_simple_subgraph(
|
||||
name = "objectron_tracking_gpu",
|
||||
graph = "objectron_tracking_gpu.pbtxt",
|
||||
register_as = "ObjectronTrackingSubgraphGpu",
|
||||
deps = [
|
||||
"//mediapipe/calculators/image:image_transformation_calculator",
|
||||
"//mediapipe/calculators/video:box_tracker_calculator",
|
||||
"//mediapipe/calculators/video:flow_packager_calculator",
|
||||
"//mediapipe/calculators/video:motion_analysis_calculator",
|
||||
"//mediapipe/framework/stream_handler:sync_set_input_stream_handler",
|
||||
"//mediapipe/gpu:gpu_buffer_to_image_frame_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_to_timed_box_list_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_tracker_calculator",
|
||||
"//mediapipe/graphs/object_detection_3d/calculators:lift_2d_frame_annotation_to_3d_calculator",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,81 @@
|
|||
# MediaPipe Objectron detection gpu subgraph
|
||||
|
||||
type: "ObjectronDetectionSubgraphGpu"
|
||||
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
|
||||
# Transforms the input image on GPU to a 480x640 image. To scale the input
|
||||
# image, the scale_mode option is set to FIT to preserve the aspect ratio,
|
||||
# resulting in potential letterboxing in the transformed image.
|
||||
node: {
|
||||
calculator: "ImageTransformationCalculator"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "IMAGE_GPU:transformed_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
|
||||
output_width: 480
|
||||
output_height: 640
|
||||
scale_mode: FIT
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Converts the transformed input image on GPU into an image tensor stored as a
|
||||
# TfLiteTensor.
|
||||
node {
|
||||
calculator: "TfLiteConverterCalculator"
|
||||
input_stream: "IMAGE_GPU:transformed_input_video"
|
||||
output_stream: "TENSORS_GPU:image_tensor"
|
||||
}
|
||||
|
||||
# Generates a single side packet containing a TensorFlow Lite op resolver that
|
||||
# supports custom ops needed by the model used in this graph.
|
||||
node {
|
||||
calculator: "TfLiteCustomOpResolverCalculator"
|
||||
output_side_packet: "opresolver"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] {
|
||||
use_gpu: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
|
||||
# vector of tensors representing, for instance, detection boxes/keypoints and
|
||||
# scores.
|
||||
node {
|
||||
calculator: "TfLiteInferenceCalculator"
|
||||
input_stream: "TENSORS_GPU:image_tensor"
|
||||
output_stream: "TENSORS:detection_tensors"
|
||||
input_side_packet: "CUSTOM_OP_RESOLVER:opresolver"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
|
||||
model_path: "object_detection_3d.tflite"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Decodes the model's output tensor (the heatmap and the distance fields) to 2D
|
||||
# keypoints. There are nine 2D keypoints: one center keypoint and eight vertices
|
||||
# for the 3D bounding box. The calculator parameters determine the decoder's
|
||||
# sensitivity.
|
||||
node {
|
||||
calculator: "TfLiteTensorsToObjectsCalculator"
|
||||
input_stream: "TENSORS:detection_tensors"
|
||||
output_stream: "ANNOTATIONS:objects"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.TfLiteTensorsToObjectsCalculatorOptions] {
|
||||
num_classes: 1
|
||||
num_keypoints: 9
|
||||
decoder_config {
|
||||
heatmap_threshold: 0.6
|
||||
local_max_distance: 2
|
||||
offset_scale_coef: 1.0
|
||||
voting_radius: 2
|
||||
voting_allowance: 1
|
||||
voting_threshold: 0.2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
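For intuition about heatmap_threshold and local_max_distance above, here is a generic peak-picking sketch (illustrative only; this is not the Decoder used by the calculator). It keeps only pixels of a single-channel float heatmap that exceed the threshold and are the maximum within their local neighborhood:

  #include <vector>
  #include "mediapipe/framework/port/opencv_imgproc_inc.h"

  // Sketch: pick candidate keypoint locations from a CV_32FC1 heatmap.
  std::vector<cv::Point> PickPeaks(const cv::Mat& heatmap, float threshold) {
    cv::Mat local_max;
    cv::dilate(heatmap, local_max, cv::Mat());  // 3x3 neighborhood maximum.
    std::vector<cv::Point> peaks;
    for (int y = 0; y < heatmap.rows; ++y) {
      for (int x = 0; x < heatmap.cols; ++x) {
        const float value = heatmap.at<float>(y, x);
        if (value >= threshold && value == local_max.at<float>(y, x)) {
          peaks.emplace_back(x, y);
        }
      }
    }
    return peaks;
  }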
|
|
@ -0,0 +1,170 @@
|
|||
# MediaPipe Objectron tracking gpu subgraph
|
||||
|
||||
type: "ObjectronTrackingSubgraphGpu"
|
||||
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
|
||||
|
||||
# Converts the detected keypoints to Boxes, used by the tracking subgraph.
|
||||
node {
|
||||
calculator: "FrameAnnotationToTimedBoxListCalculator"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
output_stream: "BOXES:start_pos"
|
||||
}
|
||||
|
||||
node: {
|
||||
calculator: "ImageTransformationCalculator"
|
||||
input_stream: "IMAGE_GPU:input_video"
|
||||
output_stream: "IMAGE_GPU:downscaled_input_video"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
|
||||
output_width: 240
|
||||
output_height: 320
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Converts the GPU buffer to an ImageFrame so that tracking can run on the CPU.
|
||||
node: {
|
||||
calculator: "GpuBufferToImageFrameCalculator"
|
||||
input_stream: "downscaled_input_video"
|
||||
output_stream: "downscaled_input_video_cpu"
|
||||
}
|
||||
|
||||
# Performs motion analysis on an incoming video stream.
|
||||
node: {
|
||||
calculator: "MotionAnalysisCalculator"
|
||||
input_stream: "VIDEO:downscaled_input_video_cpu"
|
||||
output_stream: "CAMERA:camera_motion"
|
||||
output_stream: "FLOW:region_flow"
|
||||
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: {
|
||||
analysis_options {
|
||||
analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
|
||||
flow_options {
|
||||
fast_estimation_min_block_size: 100
|
||||
top_inlier_sets: 1
|
||||
frac_inlier_error_threshold: 3e-3
|
||||
downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
|
||||
verification_distance: 5.0
|
||||
verify_long_feature_acceleration: true
|
||||
verify_long_feature_trigger_ratio: 0.1
|
||||
tracking_options {
|
||||
max_features: 500
|
||||
adaptive_extraction_levels: 2
|
||||
min_eig_val_settings {
|
||||
adaptive_lowest_quality_level: 2e-4
|
||||
}
|
||||
klt_tracker_implementation: KLT_OPENCV
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Reads optical flow fields defined in
|
||||
# mediapipe/framework/formats/motion/optical_flow_field.h,
|
||||
# and returns a VideoFrame with 2 channels (v_x and v_y); each channel is quantized
|
||||
# to 0-255.
|
||||
node: {
|
||||
calculator: "FlowPackagerCalculator"
|
||||
input_stream: "FLOW:region_flow"
|
||||
input_stream: "CAMERA:camera_motion"
|
||||
output_stream: "TRACKING:tracking_data"
|
||||
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
|
||||
flow_packager_options: {
|
||||
binary_tracking_data_support: false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Tracks box positions over time.
|
||||
node: {
|
||||
calculator: "BoxTrackerCalculator"
|
||||
input_stream: "TRACKING:tracking_data"
|
||||
input_stream: "TRACK_TIME:input_video"
|
||||
input_stream: "START_POS:start_pos"
|
||||
input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
|
||||
input_stream_info: {
|
||||
tag_index: "CANCEL_OBJECT_ID"
|
||||
back_edge: true
|
||||
}
|
||||
output_stream: "BOXES:boxes"
|
||||
|
||||
input_stream_handler {
|
||||
input_stream_handler: "SyncSetInputStreamHandler"
|
||||
options {
|
||||
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
|
||||
sync_set {
|
||||
tag_index: "TRACKING"
|
||||
tag_index: "TRACK_TIME"
|
||||
}
|
||||
sync_set {
|
||||
tag_index: "START_POS"
|
||||
}
|
||||
sync_set {
|
||||
tag_index: "CANCEL_OBJECT_ID"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
|
||||
tracker_options: {
|
||||
track_step_options {
|
||||
track_object_and_camera: true
|
||||
tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE
|
||||
inlier_spring_force: 0.0
|
||||
static_motion_temporal_ratio: 3e-2
|
||||
}
|
||||
}
|
||||
visualize_tracking_data: false
|
||||
streaming_track_data_cache_size: 100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Consolidates tracking and detection results.
|
||||
node {
|
||||
calculator: "FrameAnnotationTrackerCalculator"
|
||||
input_stream: "FRAME_ANNOTATION:objects"
|
||||
input_stream: "TRACKED_BOXES:boxes"
|
||||
output_stream: "TRACKED_FRAME_ANNOTATION:tracked_objects"
|
||||
output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
|
||||
node_options: {
|
||||
[type.googleapis.com/mediapipe.FrameAnnotationTrackerCalculatorOptions] {
|
||||
img_width: 240
|
||||
img_height: 320
|
||||
iou_threshold: 0.1
|
||||
}
|
||||
}
|
||||
|
||||
input_stream_handler {
|
||||
input_stream_handler: "SyncSetInputStreamHandler"
|
||||
options {
|
||||
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
|
||||
sync_set {
|
||||
tag_index: "FRAME_ANNOTATION"
|
||||
}
|
||||
sync_set {
|
||||
tag_index: "TRACKED_BOXES"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
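The iou_threshold above refers to an intersection-over-union score, presumably between a detected box and a tracked box. As a generic illustration (not the calculator's implementation), for two axis-aligned boxes given as (xmin, ymin, xmax, ymax):

  #include <algorithm>
  #include <array>

  // Sketch: IoU of two axis-aligned rectangles.
  float IoU(const std::array<float, 4>& a, const std::array<float, 4>& b) {
    const float iw = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
    const float ih = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
    const float intersection = iw * ih;
    const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
    const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
    return intersection / (area_a + area_b - intersection);
  }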
|
||||
|
||||
# Lifts the tracked 2D keypoints to 3D using the EPnP algorithm.
|
||||
node {
|
||||
calculator: "Lift2DFrameAnnotationTo3DCalculator"
|
||||
input_stream: "FRAME_ANNOTATION:tracked_objects"
|
||||
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
|
||||
}
|
|
@ -67,15 +67,19 @@ public class CameraXPreviewHelper extends CameraHelper {
|
|||
private int cameraTimestampSource = CameraCharacteristics.SENSOR_INFO_TIMESTAMP_SOURCE_UNKNOWN;
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("RestrictTo") // See b/132705545.
|
||||
public void startCamera(
|
||||
Activity context, CameraFacing cameraFacing, SurfaceTexture surfaceTexture) {
|
||||
startCamera(context, cameraFacing, surfaceTexture, TARGET_SIZE);
|
||||
}
|
||||
|
||||
public void startCamera(
|
||||
Activity context, CameraFacing cameraFacing, SurfaceTexture surfaceTexture, Size targetSize) {
|
||||
LensFacing cameraLensFacing =
|
||||
cameraFacing == CameraHelper.CameraFacing.FRONT ? LensFacing.FRONT : LensFacing.BACK;
|
||||
PreviewConfig previewConfig =
|
||||
new PreviewConfig.Builder()
|
||||
.setLensFacing(cameraLensFacing)
|
||||
.setTargetResolution(TARGET_SIZE)
|
||||
.setTargetResolution(targetSize)
|
||||
.build();
|
||||
preview = new Preview(previewConfig);
|
||||
|
||||
|
@ -110,7 +114,6 @@ public class CameraXPreviewHelper extends CameraHelper {
|
|||
}
|
||||
});
|
||||
CameraX.bindToLifecycle(/*lifecycleOwner=*/ (LifecycleOwner) context, preview);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -210,6 +213,10 @@ public class CameraXPreviewHelper extends CameraHelper {
|
|||
return focalLengthPixels;
|
||||
}
|
||||
|
||||
public Size getFrameSize() {
|
||||
return frameSize;
|
||||
}
|
||||
|
||||
// Computes the focal length of the camera in pixels based on lens and sensor properties.
|
||||
private float calculateFocalLengthInPixels() {
|
||||
// Focal length of the camera in millimeters.
|
||||
|
|
BIN
mediapipe/models/object_detection_3d_chair.tflite
Normal file
BIN
mediapipe/models/object_detection_3d_sneakers.tflite
Normal file
|
@ -41,3 +41,37 @@ cc_library(
|
|||
"@org_tensorflow//tensorflow/lite/kernels:builtin_ops",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "tensor_buffer",
|
||||
srcs = ["tensor_buffer.cc"],
|
||||
hdrs = ["tensor_buffer.h"],
|
||||
deps = [
|
||||
"@org_tensorflow//tensorflow/lite:framework",
|
||||
"@com_google_absl//absl/memory",
|
||||
"//mediapipe/framework:port",
|
||||
] + select({
|
||||
"//mediapipe/gpu:disable_gpu": [],
|
||||
"//mediapipe:ios": [
|
||||
"//mediapipe/gpu:MPPMetalUtil",
|
||||
"//mediapipe/gpu:gl_base",
|
||||
],
|
||||
"//conditions:default": [
|
||||
"@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_buffer",
|
||||
"//mediapipe/gpu:gl_base",
|
||||
"//mediapipe/gpu:gl_context",
|
||||
],
|
||||
}),
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "tensor_buffer_test",
|
||||
srcs = ["tensor_buffer_test.cc"],
|
||||
deps = [
|
||||
":tensor_buffer",
|
||||
"//mediapipe/framework/port:gtest_main",
|
||||
] + select({
|
||||
"//mediapipe/gpu:disable_gpu": [],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
)
|
||||
|
|
43
mediapipe/util/tflite/tensor_buffer.cc
Normal file
|
@ -0,0 +1,43 @@
|
|||
#include "mediapipe/util/tflite/tensor_buffer.h"
|
||||
|
||||
namespace mediapipe {
|
||||
|
||||
TensorBuffer::TensorBuffer() {}
|
||||
|
||||
TensorBuffer::~TensorBuffer() { uses_gpu_ = false; }
|
||||
|
||||
TensorBuffer::TensorBuffer(TfLiteTensor& tensor) {
|
||||
cpu_ = tensor;
|
||||
uses_gpu_ = false;
|
||||
}
|
||||
|
||||
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
|
||||
TensorBuffer::TensorBuffer(std::shared_ptr<tflite::gpu::gl::GlBuffer> tensor) {
|
||||
gpu_ = std::move(tensor);
|
||||
uses_gpu_ = true;
|
||||
}
|
||||
// static
|
||||
std::shared_ptr<tflite::gpu::gl::GlBuffer> TensorBuffer::CreateGlBuffer(
|
||||
std::shared_ptr<mediapipe::GlContext> context) {
|
||||
std::shared_ptr<tflite::gpu::gl::GlBuffer> ptr(
|
||||
new tflite::gpu::gl::GlBuffer, [context](tflite::gpu::gl::GlBuffer* ref) {
|
||||
if (context) {
|
||||
context->Run([ref]() {
|
||||
if (ref) delete ref;
|
||||
});
|
||||
} else {
|
||||
if (ref) delete ref; // No context provided.
|
||||
}
|
||||
});
|
||||
return ptr;
|
||||
}
|
||||
#endif  // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
|
||||
|
||||
#if defined(MEDIAPIPE_IOS)
|
||||
TensorBuffer::TensorBuffer(id<MTLBuffer> tensor) {
|
||||
gpu_ = tensor;
|
||||
uses_gpu_ = true;
|
||||
}
|
||||
#endif // MEDIAPIPE_IOS
|
||||
|
||||
} // namespace mediapipe
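A minimal usage sketch (hypothetical caller, not part of this change): the custom deleter installed by CreateGlBuffer runs the deletion on the supplied GlContext, so the GL object is released with a current GL context even if the last reference is dropped on another thread.

  // Sketch: create a context-owned GL buffer handle.
  void Example(std::shared_ptr<mediapipe::GlContext> gl_context) {
    std::shared_ptr<tflite::gpu::gl::GlBuffer> buffer =
        mediapipe::TensorBuffer::CreateGlBuffer(gl_context);
    // ... create/fill *buffer while the GL context is current ...
  }  // Dropping `buffer` here schedules the delete via gl_context->Run().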
|