Project import generated by Copybara.

GitOrigin-RevId: 4419aaa472eeb91123d1f8576188166ee0e5ea69
MediaPipe Team 2020-03-10 18:07:12 -07:00 committed by jqtang
parent 252a5713c7
commit 3b6d3c4058
104 changed files with 7441 additions and 88 deletions

View File

@ -15,6 +15,7 @@
* [Hair Segmentation](mediapipe/docs/hair_segmentation_mobile_gpu.md) [[Web Demo]](https://viz.mediapipe.dev/runner/demos/hair_segmentation/hair_segmentation.html)
* [Object Detection](mediapipe/docs/object_detection_mobile_gpu.md)
* [Object Detection and Tracking](mediapipe/docs/object_tracking_mobile_gpu.md)
* [Objectron: 3D Object Detection and Tracking](mediapipe/docs/objectron_mobile_gpu.md)
* [AutoFlip](mediapipe/docs/autoflip.md)
![face_detection](mediapipe/docs/images/mobile/face_detection_android_gpu_small.gif)
@ -43,6 +44,8 @@ A web-based visualizer is hosted on [viz.mediapipe.de
* [YouTube Channel](https://www.youtube.com/channel/UCObqmpuSMx-usADtL_qdMAw)
## Publications
* [MediaPipe Objectron: Real-time 3D Object Detection on Mobile Devices](https://mediapipe.page.link/objectron-aiblog)
* [AutoFlip: An Open Source Framework for Intelligent Video Reframing](https://mediapipe.page.link/autoflip)
* [Google Developer Blog: MediaPipe on the Web](https://mediapipe.page.link/webdevblog)
* [Google Developer Blog: Object Detection and Tracking using MediaPipe](https://mediapipe.page.link/objecttrackingblog)
* [On-Device, Real-Time Hand Tracking with MediaPipe](https://ai.googleblog.com/2019/08/on-device-real-time-hand-tracking-with.html)
@ -63,7 +66,7 @@ A web-based visualizer is hosted on [viz.mediapipe.de
* [Discuss](https://groups.google.com/forum/#!forum/mediapipe) - General community discussion around MediaPipe
## Alpha Disclaimer
MediaPipe is currently in alpha for v0.7. We are still making breaking API changes and expect to get to stable API by v1.0.
## Contributing
We welcome contributions. Please follow these [guidelines](./CONTRIBUTING.md).

View File

@ -75,11 +75,28 @@ REGISTER_CALCULATOR(ImageCroppingCalculator);
}
#endif // !MEDIAPIPE_DISABLE_GPU
int flags = 0;
if (cc->Inputs().HasTag(kRectTag)) {
++flags;
}
if (cc->Inputs().HasTag(kWidthTag) && cc->Inputs().HasTag(kHeightTag)) {
++flags;
}
if (cc->Inputs().HasTag(kNormRectTag)) {
++flags;
}
if (cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
.has_norm_width() &&
cc->Options<mediapipe::ImageCroppingCalculatorOptions>()
.has_norm_height()) {
++flags;
}
if (cc->Options<mediapipe::ImageCroppingCalculatorOptions>().has_width() &&
cc->Options<mediapipe::ImageCroppingCalculatorOptions>().has_height()) {
++flags;
}
RET_CHECK(flags == 1) << "Illegal combination of input streams/options.";
if (cc->Inputs().HasTag(kRectTag)) {
cc->Inputs().Tag(kRectTag).Set<Rect>();
}

View File

@ -39,6 +39,15 @@ proto_library(
],
)
proto_library(
name = "timed_box_list_id_to_label_calculator_proto",
srcs = ["timed_box_list_id_to_label_calculator.proto"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_proto",
],
)
proto_library(
name = "latency_proto",
srcs = ["latency.proto"],
@ -113,6 +122,18 @@ mediapipe_cc_proto_library(
],
)
mediapipe_cc_proto_library(
name = "timed_box_list_id_to_label_calculator_cc_proto",
srcs = ["timed_box_list_id_to_label_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
],
visibility = ["//visibility:public"],
deps = [
":timed_box_list_id_to_label_calculator_proto",
],
)
mediapipe_cc_proto_library(
name = "latency_cc_proto",
srcs = ["latency.proto"],
@ -313,6 +334,34 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "timed_box_list_id_to_label_calculator",
srcs = ["timed_box_list_id_to_label_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":timed_box_list_id_to_label_calculator_cc_proto",
"//mediapipe/framework/port:status",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework:packet",
"//mediapipe/util/tracking:box_tracker_cc_proto",
"//mediapipe/util:resource_util",
] + select({
"//mediapipe:android": [
"//mediapipe/util/android/file/base",
],
"//mediapipe:apple": [
"//mediapipe/util/android/file/base",
],
"//mediapipe:macos": [
"//mediapipe/framework/port:file_helpers",
],
"//conditions:default": [
"//mediapipe/framework/port:file_helpers",
],
}),
alwayslink = 1,
)
cc_library(
name = "non_max_suppression_calculator",
srcs = ["non_max_suppression_calculator.cc"],

View File

@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/calculators/util/detection_label_id_to_text_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/detection.pb.h"
#include "mediapipe/framework/packet.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/resource_util.h"

View File

@ -0,0 +1,105 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/calculators/util/timed_box_list_id_to_label_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/packet.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/resource_util.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
#if defined(MEDIAPIPE_MOBILE)
#include "mediapipe/util/android/file/base/file.h"
#include "mediapipe/util/android/file/base/helpers.h"
#else
#include "mediapipe/framework/port/file_helpers.h"
#endif
namespace mediapipe {
using mediapipe::TimedBoxProto;
using mediapipe::TimedBoxProtoList;
// Takes a label map (from label IDs to names) and populates the label field in
// TimedBoxProto according to its ID.
//
// Example usage:
// node {
// calculator: "TimedBoxListIdToLabelCalculator"
// input_stream: "input_timed_box_list"
// output_stream: "output_timed_box_list"
// node_options: {
// [mediapipe.TimedBoxListIdToLabelCalculatorOptions] {
// label_map_path: "labelmap.txt"
// }
// }
// }
class TimedBoxListIdToLabelCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
private:
std::unordered_map<int, std::string> label_map_;
};
REGISTER_CALCULATOR(TimedBoxListIdToLabelCalculator);
::mediapipe::Status TimedBoxListIdToLabelCalculator::GetContract(
CalculatorContract* cc) {
cc->Inputs().Index(0).Set<TimedBoxProtoList>();
cc->Outputs().Index(0).Set<TimedBoxProtoList>();
return ::mediapipe::OkStatus();
}
::mediapipe::Status TimedBoxListIdToLabelCalculator::Open(
CalculatorContext* cc) {
cc->SetOffset(TimestampDiff(0));
const auto& options =
cc->Options<::mediapipe::TimedBoxListIdToLabelCalculatorOptions>();
std::string string_path;
ASSIGN_OR_RETURN(string_path, PathToResourceAsFile(options.label_map_path()));
std::string label_map_string;
MP_RETURN_IF_ERROR(file::GetContents(string_path, &label_map_string));
std::istringstream stream(label_map_string);
std::string line;
int i = 0;
while (std::getline(stream, line)) {
label_map_[i++] = line;
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status TimedBoxListIdToLabelCalculator::Process(
CalculatorContext* cc) {
const auto& input_list = cc->Inputs().Index(0).Get<TimedBoxProtoList>();
auto output_list = absl::make_unique<TimedBoxProtoList>();
for (const auto& input_box : input_list.box()) {
TimedBoxProto* box_ptr = output_list->add_box();
*box_ptr = input_box;
if (label_map_.find(input_box.id()) != label_map_.end()) {
box_ptr->set_label(label_map_[input_box.id()]);
}
}
cc->Outputs().Index(0).Add(output_list.release(), cc->InputTimestamp());
return ::mediapipe::OkStatus();
}
} // namespace mediapipe
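For readers who want to exercise the new calculator in isolation, here is a minimal, test-style sketch of how it could be driven with MediaPipe's `CalculatorRunner`. This example is not part of the change; the `labelmap.txt` path and the box contents are hypothetical placeholders, and a real label map file would have to exist at that path for the run to succeed.

```cpp
// Hypothetical usage sketch for TimedBoxListIdToLabelCalculator (not part of
// this commit). Feeds one TimedBoxProtoList through the calculator and checks
// that a label was attached to the box.
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/calculator_runner.h"
#include "mediapipe/framework/packet.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/parse_text_proto.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"

namespace mediapipe {

TEST(TimedBoxListIdToLabelCalculatorTest, AttachesLabels) {
  CalculatorRunner runner(ParseTextProtoOrDie<CalculatorGraphConfig::Node>(R"(
    calculator: "TimedBoxListIdToLabelCalculator"
    input_stream: "input_timed_box_list"
    output_stream: "output_timed_box_list"
    options {
      [mediapipe.TimedBoxListIdToLabelCalculatorOptions.ext] {
        label_map_path: "labelmap.txt"  # hypothetical label map file
      }
    }
  )"));

  auto* input = new TimedBoxProtoList();
  input->add_box()->set_id(0);  // Should map to the first line of the file.
  runner.MutableInputs()->Index(0).packets.push_back(
      Adopt(input).At(Timestamp(0)));

  ASSERT_TRUE(runner.Run().ok());
  const auto& output =
      runner.Outputs().Index(0).packets[0].Get<TimedBoxProtoList>();
  EXPECT_FALSE(output.box(0).label().empty());
}

}  // namespace mediapipe
```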

View File

@ -0,0 +1,28 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
message TimedBoxListIdToLabelCalculatorOptions {
extend mediapipe.CalculatorOptions {
optional TimedBoxListIdToLabelCalculatorOptions ext = 297701606;
}
// Path to a label map file for getting the actual name of detected classes.
optional string label_map_path = 1;
}
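As context (not part of the diff), the file referenced by `label_map_path` is read by the calculator's `Open()` method as plain text, one label per line, with the zero-based line number used as the ID. A purely hypothetical example:

```
shoe
chair
```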

View File

@ -66,6 +66,25 @@ void AddTimedBoxProtoToRenderData(
rect->set_bottom(box_proto.bottom());
rect->set_rotation(box_proto.rotation());
}
if (box_proto.has_label()) {
auto* label_annotation = render_data->add_render_annotations();
label_annotation->mutable_color()->set_r(options.box_color().r());
label_annotation->mutable_color()->set_g(options.box_color().g());
label_annotation->mutable_color()->set_b(options.box_color().b());
label_annotation->set_thickness(options.thickness());
RenderAnnotation::Text* text = label_annotation->mutable_text();
text->set_display_text(box_proto.label());
text->set_normalized(true);
constexpr float text_left_start = 0.3f;
text->set_left((1.0f - text_left_start) * box_proto.left() +
text_left_start * box_proto.right());
constexpr float text_baseline = 0.6f;
text->set_baseline(text_baseline * box_proto.bottom() +
(1.0f - text_baseline) * box_proto.top());
constexpr float text_height = 0.2f;
text->set_font_height((box_proto.bottom() - box_proto.top()) * text_height);
}
}
} // namespace

View File

@ -15,6 +15,9 @@ For overall context on AutoFlip, please read this
Run the following command to build the AutoFlip pipeline:
Note: AutoFlip currently only works with OpenCV 3. Please verify your OpenCV
version beforehand.
```bash
bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/autoflip:run_autoflip
```

View File

@ -44,6 +44,14 @@ graphs can be easily adapted to run on CPU v.s. GPU.
[Object Detection and Tracking with GPU](./object_tracking_mobile_gpu.md) illustrates how to
use MediaPipe for object detection and tracking.
### Objectron: 3D Object Detection and Tracking with GPU
[MediaPipe Objectron: 3D Object Detection with GPU](./objectron_mobile_gpu.md)
illustrates a mobile, real-time 3D object detection and tracking pipeline for
everyday objects like shoes and chairs.
* [Android](./objectron_mobile_gpu.md)
### Face Detection with GPU
[Face Detection with GPU](./face_detection_mobile_gpu.md) illustrates how to use

Binary file not shown (added; 100 KiB)

Binary file not shown (added; 2.5 MiB)

Binary file not shown (added; 64 KiB)

Binary file not shown (added; 2.8 MiB)

Binary file not shown (added; 113 KiB)

View File

@ -364,8 +364,10 @@ To build and run iOS apps:
### Installing on Windows Subsystem for Linux (WSL)
Note: The pre-built OpenCV packages don't support cameras in WSL. Unless you
[compile](https://funvision.blogspot.com/2019/12/opencv-web-camera-and-video-streams-in.html)
OpenCV with FFMPEG and GStreamer in WSL, the live demos won't work with any
cameras. Alternatively, you can use a video file as input.
1. Follow the
[instructions](https://docs.microsoft.com/en-us/windows/wsl/install-win10) to
@ -373,7 +375,7 @@ a video file as input.
2. Install Windows ADB and start the ADB server in Windows.
Note: Windows' and WSL's adb versions must be the same version, e.g., if WSL
has ADB 1.0.39, you need to download the corresponding Windows ADB from
[here](https://dl.google.com/android/repository/platform-tools_r26.0.1-windows.zip).

View File

@ -26,6 +26,7 @@ To build and run the TensorFlow example on desktop, run:
$ bazel build -c opt \
--define MEDIAPIPE_DISABLE_GPU=1 \
--define no_aws_support=true \
--linkopt=-s \
mediapipe/examples/desktop/object_detection:object_detection_tensorflow
# It should print:

View File

@ -0,0 +1,489 @@
# MediaPipe Objectron (GPU)
This doc focuses on the
[below example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt)
that performs 3D object detection and tracking with TensorFlow Lite on GPU.
Objectron for shoes | Objectron for chairs
:-----------------------------------------------------------------------------: | :------------------:
![objectron_shoe_android_gpu_gif](images/mobile/objectron_shoe_android_gpu.gif) | ![objectron_chair_android_gpu_gif](images/mobile/objectron_chair_android_gpu.gif)
For overall context on MediaPipe Objectron, please read the
[Google AI Blog](https://mediapipe.page.link/objectron-aiblog). The Objectron's
ML model (see also the [model card](https://mediapipe.page.link/objectron-mc))
estimates a 3D bounding box for the detected object.
## Android
[Source](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d)
An arm64 build of Objectron for shoes can be
[downloaded here](https://drive.google.com/open?id=1S0K4hbWt3o31FfQ4QU3Rz7IHrvOUMx1d),
and for chairs can be
[downloaded here](https://drive.google.com/open?id=1MM8K-13bXLCVS1EHQ-KgkVyEahEPrKej).
To build and install the Objectron app for shoes:
```bash
bazel build -c opt --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d
```
Similarly, to build and install the Objectron app for chairs, add the
**--define chair=true** flag to the build command.
```bash
bazel build -c opt --define chair=true --config android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d:objectdetection3d
```
Once the app is built, install it on an Android device with:
```bash
adb install bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/objectdetection3d.apk
```
## Graph
The Objectron main graph internally utilizes an Objectron detection subgraph
and an Objectron tracking subgraph. To visualize the graph, copy its text
specification below and paste it into the
[MediaPipe Visualizer](https://viz.mediapipe.dev/).
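Before walking through the graph itself, it may help to recall how a MediaPipe graph like this is generally loaded and driven from C++. The sketch below shows only the generic `CalculatorGraph` pattern under simplifying assumptions: the real Objectron app runs this graph on GPU through the Android `FrameProcessor`, supplies several input side packets (textures and asset names), and feeds camera frames, all of which are omitted here, and the graph path is used purely for illustration.

```cpp
// Generic MediaPipe graph-driving pattern (simplified sketch; the actual
// Objectron app uses the GPU path via the Android FrameProcessor instead).
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/file_helpers.h"
#include "mediapipe/framework/port/parse_text_proto.h"
#include "mediapipe/framework/port/status.h"

::mediapipe::Status RunGraph() {
  // Load the text-format graph.
  std::string graph_contents;
  MP_RETURN_IF_ERROR(mediapipe::file::GetContents(
      "mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt",
      &graph_contents));
  auto config =
      mediapipe::ParseTextProtoOrDie<mediapipe::CalculatorGraphConfig>(
          graph_contents);

  mediapipe::CalculatorGraph graph;
  MP_RETURN_IF_ERROR(graph.Initialize(config));

  // Observe the rendered output stream.
  MP_RETURN_IF_ERROR(graph.ObserveOutputStream(
      "output_video", [](const mediapipe::Packet& packet) {
        // Consume the rendered frame here (e.g., display or encode it).
        return ::mediapipe::OkStatus();
      }));

  MP_RETURN_IF_ERROR(graph.StartRun({}));
  // ... feed packets into "input_video" with AddPacketToInputStream() ...
  MP_RETURN_IF_ERROR(graph.CloseInputStream("input_video"));
  return graph.WaitUntilDone();
}
```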
### Main Graph
This is the main graph for the shoe detector. This graph runs detection and
tracking and renders the output to the display.
![object_detection_mobile_gpu_graph](images/mobile/object_detection_3d_android_gpu.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/shoe_classic_occlusion_tracking.pbtxt)
```bash
# MediaPipe object detection 3D with tracking graph.
# Images on GPU coming into and out of the graph.
input_stream: "input_video"
output_stream: "output_video"
# Creates a copy of the input_video stream. At the end of the graph, the
# GlAnimationOverlayCalculator will consume the input_video texture and draw
# on top of it.
node: {
calculator: "GlScalerCalculator"
input_stream: "VIDEO:input_video"
output_stream: "VIDEO:input_video_copy"
}
# Resamples the images to a specific frame rate. This calculator is used to
# control the frequency of subsequent calculators/subgraphs, e.g., to reduce
# power consumption for expensive processing.
node {
calculator: "PacketResamplerCalculator"
input_stream: "DATA:input_video_copy"
output_stream: "DATA:sampled_input_video"
node_options: {
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
frame_rate: 5
}
}
}
node {
calculator: "ObjectronDetectionSubgraphGpu"
input_stream: "IMAGE_GPU:sampled_input_video"
output_stream: "ANNOTATIONS:objects"
}
node {
calculator: "ObjectronTrackingSubgraphGpu"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "IMAGE_GPU:input_video_copy"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
}
# The rendering nodes:
# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly
# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask.
# These models are designed using different tools, so we supply a transformation
# to bring both of them to the Objectron's coordinate system.
# Creates model matrices for the tracked object given the lifted 3D points.
# This calculator does two things: 1) estimates the object's pose (orientation,
# translation, and scale) from the 3D vertices, and 2) brings the object from
# the Objectron coordinate system to the renderer (OpenGL) coordinate system.
# Since the final goal is to render a mesh file on top of the object, we also
# supply a transformation to bring the mesh to the Objectron coordinate system,
# and rescale the mesh to unit size.
node {
calculator: "AnnotationsToModelMatricesCalculator"
input_stream: "ANNOTATIONS:lifted_tracked_objects"
output_stream: "MODEL_MATRICES:model_matrices"
node_options: {
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
# Re-scale the CAD model to the size of a unit box
model_scale: 0.05
model_scale: 0.05
model_scale: 0.05
# Bring the box CAD model to objectron's coordinate system. This
# is equivalent of -pi/2 rotation along the y-axis (right-hand rule):
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY())
model_transformation: 0.0
model_transformation: 0.0
model_transformation: -1.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 1.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 1.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 1.0
}
}
}
# Compute the model matrices for the CAD model of the shoe, to be used as an
# occlusion mask. The model will be rendered at the exact same location as the
# bounding box.
node {
calculator: "AnnotationsToModelMatricesCalculator"
input_stream: "ANNOTATIONS:lifted_tracked_objects"
output_stream: "MODEL_MATRICES:mask_model_matrices"
#input_side_packet: "MODEL_SCALE:model_scale"
node_options: {
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
# Re-scale the CAD model to the size of a unit box
model_scale: 0.45
model_scale: 0.25
model_scale: 0.15
# Bring the shoe CAD model to Deep Pursuit 3D's coordinate system. This
# is equivalent of -pi/2 rotation along the x-axis (right-hand rule):
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX())
model_transformation: 1.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 1.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: -1.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 0.0
model_transformation: 1.0
}
}
}
# Render everything together. First we render the 3D bounding box animation,
# then we render the occlusion mask.
node: {
calculator: "GlAnimationOverlayCalculator"
input_stream: "VIDEO:input_video"
input_stream: "MODEL_MATRICES:model_matrices"
input_stream: "MASK_MODEL_MATRICES:mask_model_matrices"
output_stream: "output_video"
input_side_packet: "TEXTURE:box_texture"
input_side_packet: "ANIMATION_ASSET:box_asset_name"
input_side_packet: "MASK_TEXTURE:obj_texture"
input_side_packet: "MASK_ASSET:obj_asset_name"
node_options: {
[type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
# Output resolution is 480x640 with the aspect ratio of 0.75
aspect_ratio: 0.75
vertical_fov_degrees: 70.
animation_speed_fps: 25
}
}
}
```
### Objectron Detection Subgraph
The Objectron detection subgraph uses the *TfLiteInferenceCalculator* to run
inference and decodes the output tensors into a *FrameAnnotation* protobuf. The
*FrameAnnotation* contains nine 2D keypoints per box: the bounding box's center
and its eight vertices. The detected boxes are then passed to the Objectron
tracking subgraph.
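To make the decoding step concrete, the following is a rough, simplified sketch of the idea: pick heatmap cells above a threshold as box centers, then offset each center by the predicted per-vertex displacements to get the eight remaining keypoints. This is an illustration only, not the actual `TfLiteTensorsToObjectsCalculator` implementation; the tensor layout, types, and thresholding here are assumptions, and the real decoder additionally performs local-maximum suppression and voting (see the `decoder_config` options below).

```cpp
// Simplified illustration of heatmap + displacement-field decoding
// (assumed tensor layout; not the real TfLiteTensorsToObjectsCalculator).
#include <array>
#include <vector>

struct Keypoint2D { float x; float y; };
// One detected box: center + 8 vertices = 9 keypoints.
using BoxKeypoints = std::array<Keypoint2D, 9>;

std::vector<BoxKeypoints> DecodeBoxes(
    const std::vector<float>& heatmap,        // H*W center-ness scores.
    const std::vector<float>& displacements,  // H*W*16: (dx, dy) per vertex.
    int width, int height, float threshold) {
  std::vector<BoxKeypoints> boxes;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int idx = y * width + x;
      if (heatmap[idx] < threshold) continue;  // Not a likely box center.
      BoxKeypoints box;
      box[0] = {static_cast<float>(x), static_cast<float>(y)};  // Center.
      // Each of the 8 vertices is the center plus a predicted offset.
      for (int v = 0; v < 8; ++v) {
        const float dx = displacements[idx * 16 + 2 * v];
        const float dy = displacements[idx * 16 + 2 * v + 1];
        box[v + 1] = {x + dx, y + dy};
      }
      boxes.push_back(box);
      // The real decoder also applies local-maximum suppression and voting;
      // omitted here for brevity.
    }
  }
  return boxes;
}
```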
![object_detection_subgraph](images/mobile/objectron_detection_subgraph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_detection_gpu.pbtxt)
```bash
# MediaPipe Objectron detection gpu subgraph
type: "ObjectronDetectionSubgraphGpu"
input_stream: "IMAGE_GPU:input_video"
output_stream: "ANNOTATIONS:objects"
# Transforms the input image on GPU to a 480x640 image. To scale the input
# image, the scale_mode option is set to FIT to preserve the aspect ratio,
# resulting in potential letterboxing in the transformed image.
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:transformed_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 480
output_height: 640
scale_mode: FIT
}
}
}
# Converts the transformed input image on GPU into an image tensor stored as a
# TfLiteTensor.
node {
calculator: "TfLiteConverterCalculator"
input_stream: "IMAGE_GPU:transformed_input_video"
output_stream: "TENSORS_GPU:image_tensor"
}
# Generates a single side packet containing a TensorFlow Lite op resolver that
# supports custom ops needed by the model used in this graph.
node {
calculator: "TfLiteCustomOpResolverCalculator"
output_side_packet: "opresolver"
node_options: {
[type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] {
use_gpu: true
}
}
}
# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
# vector of tensors representing, for instance, detection boxes/keypoints and
# scores.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS_GPU:image_tensor"
output_stream: "TENSORS:detection_tensors"
input_side_packet: "CUSTOM_OP_RESOLVER:opresolver"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "object_detection_3d.tflite"
}
}
}
# Decodes the model's output tensor (the heatmap and the distance fields) to 2D
# keypoints. There are nine 2D keypoints: one center keypoint and eight vertices
# for the 3D bounding box. The calculator parameters determine the decoder's
# sensitivity.
node {
calculator: "TfLiteTensorsToObjectsCalculator"
input_stream: "TENSORS:detection_tensors"
output_stream: "ANNOTATIONS:objects"
node_options: {
[type.googleapis.com/mediapipe.TfLiteTensorsToObjectsCalculatorOptions] {
num_classes: 1
num_keypoints: 9
decoder_config {
heatmap_threshold: 0.6
local_max_distance: 2
offset_scale_coef: 1.0
voting_radius: 2
voting_allowance: 1
voting_threshold: 0.2
}
}
}
}
```
### Object Tracking Subgraph
The Objectron tracking subgraph uses the *BoxTracker* calculator, a generic
tracking library that is also used in
[MediaPipe's 2D Object Detection and Tracking](https://github.com/google/mediapipe/tree/master/mediapipe/g3doc/object_tracking_mobile_gpu.md).
Tracking runs on every frame, and when a new detection is available the subgraph
consolidates the detection and tracking results. The tracker tracks the box via
its 2D keypoints, so at the end we lift the 2D keypoints back to 3D using the
EPnP algorithm in the *Lift2DFrameAnnotationTo3DCalculator*.
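As a rough sketch of the geometry involved (not the exact formulation inside the lifting calculator), this is a standard perspective-n-point problem: with the camera intrinsics $K$ assumed known, each of the nine tracked 2D keypoints must be the projection of its 3D box vertex.

```latex
% Pinhole projection constraint for each keypoint i = 1..9 (assumed setup):
s_i \begin{bmatrix} u_i \\ v_i \\ 1 \end{bmatrix} = K \, (R\, X_i + t)

% EPnP expresses every vertex as a fixed weighted sum of four control points,
% so the unknowns reduce to the control-point coordinates:
X_i = \sum_{j=1}^{4} \alpha_{ij}\, c_j , \qquad \sum_{j=1}^{4} \alpha_{ij} = 1
```

Substituting the second relation into the first turns the projection constraints into a linear system in the control points, which is what keeps the lifting cheap enough to run per frame on a mobile device.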
![object_tracking_subgraph](images/mobile/objectron_tracking_subgraph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/subgraphs/objectron_tracking_gpu.pbtxt)
```bash
# MediaPipe Objectron tracking gpu subgraph
type: "ObjectronTrackingSubgraphGpu"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "IMAGE_GPU:input_video"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
# Converts the detected keypoints to Boxes, used by the tracking subgraph.
node {
calculator: "FrameAnnotationToTimedBoxListCalculator"
input_stream: "FRAME_ANNOTATION:objects"
output_stream: "BOXES:start_pos"
}
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:downscaled_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 240
output_height: 320
}
}
}
# Converts GPU buffer to ImageFrame for processing tracking.
node: {
calculator: "GpuBufferToImageFrameCalculator"
input_stream: "downscaled_input_video"
output_stream: "downscaled_input_video_cpu"
}
# Performs motion analysis on an incoming video stream.
node: {
calculator: "MotionAnalysisCalculator"
input_stream: "VIDEO:downscaled_input_video_cpu"
output_stream: "CAMERA:camera_motion"
output_stream: "FLOW:region_flow"
node_options: {
[type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: {
analysis_options {
analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
flow_options {
fast_estimation_min_block_size: 100
top_inlier_sets: 1
frac_inlier_error_threshold: 3e-3
downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
verification_distance: 5.0
verify_long_feature_acceleration: true
verify_long_feature_trigger_ratio: 0.1
tracking_options {
max_features: 500
adaptive_extraction_levels: 2
min_eig_val_settings {
adaptive_lowest_quality_level: 2e-4
}
klt_tracker_implementation: KLT_OPENCV
}
}
}
}
}
}
# Reads optical flow fields defined in
# mediapipe/framework/formats/motion/optical_flow_field.h,
# returns a VideoFrame with 2 channels (v_x and v_y), each channel is quantized
# to 0-255.
node: {
calculator: "FlowPackagerCalculator"
input_stream: "FLOW:region_flow"
input_stream: "CAMERA:camera_motion"
output_stream: "TRACKING:tracking_data"
node_options: {
[type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
flow_packager_options: {
binary_tracking_data_support: false
}
}
}
}
# Tracks box positions over time.
node: {
calculator: "BoxTrackerCalculator"
input_stream: "TRACKING:tracking_data"
input_stream: "TRACK_TIME:input_video"
input_stream: "START_POS:start_pos"
input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
input_stream_info: {
tag_index: "CANCEL_OBJECT_ID"
back_edge: true
}
output_stream: "BOXES:boxes"
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "TRACKING"
tag_index: "TRACK_TIME"
}
sync_set {
tag_index: "START_POS"
}
sync_set {
tag_index: "CANCEL_OBJECT_ID"
}
}
}
}
node_options: {
[type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
tracker_options: {
track_step_options {
track_object_and_camera: true
tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE
inlier_spring_force: 0.0
static_motion_temporal_ratio: 3e-2
}
}
visualize_tracking_data: false
streaming_track_data_cache_size: 100
}
}
}
# Consolidates tracking and detection results.
node {
calculator: "FrameAnnotationTrackerCalculator"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "TRACKED_BOXES:boxes"
output_stream: "TRACKED_FRAME_ANNOTATION:tracked_objects"
output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
node_options: {
[type.googleapis.com/mediapipe.FrameAnnotationTrackerCalculatorOptions] {
img_width: 240
img_height: 320
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "FRAME_ANNOTATION"
}
sync_set {
tag_index: "TRACKED_BOXES"
}
}
}
}
}
# Lift the tracked 2D keypoints to 3D using EPnP algorithm.
node {
calculator: "Lift2DFrameAnnotationTo3DCalculator"
input_stream: "FRAME_ANNOTATION:tracked_objects"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
}
```

View File

@ -61,6 +61,8 @@ videos.
```bash
# cd to the root directory of the MediaPipe repo
cd -
pip3 install tf_slim
python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph
```
@ -78,7 +80,7 @@ videos.
5. Run the MediaPipe binary to extract the features.
```bash
bazel build -c opt --linkopt=-s \
--define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \
mediapipe/examples/desktop/youtube8m:extract_yt8m_features
@ -126,13 +128,13 @@ the inference for both local videos and the dataset
2. Build the inference binary.
```bash
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
mediapipe/examples/desktop/youtube8m:model_inference
```
3. Run the python web server.
Note: pip3 install absl-py
```bash
python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd`
@ -162,7 +164,7 @@ the inference for both local videos and the dataset
3. Build and run the inference binary.
```bash
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
mediapipe/examples/desktop/youtube8m:model_inference
# segment_size is the number of seconds window of frames.
View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.google.mediapipe.apps.objectdetection3d">
<uses-sdk
android:minSdkVersion="21"
android:targetSdkVersion="27" />
<!-- For using the camera -->
<uses-permission android:name="android.permission.CAMERA" />
<uses-feature android:name="android.hardware.camera" />
<uses-feature android:name="android.hardware.camera.autofocus" />
<!-- For MediaPipe -->
<uses-feature android:glEsVersion="0x00020000" android:required="true" />
<application
android:allowBackup="true"
android:label="@string/app_name"
android:supportsRtl="true"
android:theme="@style/AppTheme">
<activity
android:name=".MainActivity"
android:exported="true"
android:screenOrientation="portrait">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View File

@ -0,0 +1,115 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:private"])
cc_binary(
name = "libmediapipe_jni.so",
linkshared = 1,
linkstatic = 1,
deps = [
"//mediapipe/graphs/object_detection_3d:mobile_calculators",
"//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni",
],
)
cc_library(
name = "mediapipe_jni_lib",
srcs = [":libmediapipe_jni.so"],
alwayslink = 1,
)
# To use the "chair" model instead of the default "shoes" model,
# add "--define chair=true" to the bazel build command.
config_setting(
name = "use_chair_model",
define_values = {
"chair": "true",
},
)
# Maps the binary graph to an alias (e.g., the app name) for convenience so that the alias can be
# easily incorporated into the app via, for example,
# MainActivity.BINARY_GRAPH_NAME = "appname.binarypb".
genrule(
name = "binary_graph",
srcs = select({
"//conditions:default": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_shoe"],
":use_chair_model": ["//mediapipe/graphs/object_detection_3d:mobile_gpu_binary_graph_chair"],
}),
outs = ["objectdetection3d.binarypb"],
cmd = "cp $< $@",
)
genrule(
name = "model",
srcs = select({
"//conditions:default": ["//mediapipe/models:object_detection_3d_sneakers.tflite"],
":use_chair_model": ["//mediapipe/models:object_detection_3d_chair.tflite"],
}),
outs = ["object_detection_3d.tflite"],
cmd = "cp $< $@",
)
android_library(
name = "mediapipe_lib",
srcs = glob(["*.java"]),
assets = [
":binary_graph",
":model",
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:box.obj.uuu",
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets:classic_colors.png",
] + select({
"//conditions:default": [
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:model.obj.uuu",
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/sneaker:texture.bmp",
],
":use_chair_model": [
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:model.obj.uuu",
"//mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetection3d/assets/chair:texture.bmp",
],
}),
assets_dir = "",
manifest = "AndroidManifest.xml",
resource_files = glob(["res/**"]),
deps = [
":mediapipe_jni_lib",
"//mediapipe/framework/formats:landmark_java_proto_lite",
"//mediapipe/java/com/google/mediapipe/components:android_camerax_helper",
"//mediapipe/java/com/google/mediapipe/components:android_components",
"//mediapipe/java/com/google/mediapipe/framework:android_framework",
"//mediapipe/java/com/google/mediapipe/glutil",
"//third_party:androidx_appcompat",
"//third_party:androidx_constraint_layout",
"//third_party:androidx_legacy_support_v4",
"//third_party:androidx_recyclerview",
"//third_party:opencv",
"@androidx_concurrent_futures//jar",
"@androidx_lifecycle//jar",
"@com_google_code_findbugs//jar",
"@com_google_guava_android//jar",
],
)
android_binary(
name = "objectdetection3d",
manifest = "AndroidManifest.xml",
manifest_values = {"applicationId": "com.google.mediapipe.apps.objectdetection3d"},
multidex = "native",
deps = [
":mediapipe_lib",
],
)

View File

@ -0,0 +1,280 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.mediapipe.apps.objectdetection3d;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.SurfaceTexture;
import android.os.Bundle;
import androidx.appcompat.app.AppCompatActivity;
import android.util.Log;
import android.util.Size;
import android.view.SurfaceHolder;
import android.view.SurfaceView;
import android.view.View;
import android.view.ViewGroup;
import com.google.mediapipe.components.CameraHelper;
import com.google.mediapipe.components.CameraXPreviewHelper;
import com.google.mediapipe.components.ExternalTextureConverter;
import com.google.mediapipe.components.FrameProcessor;
import com.google.mediapipe.components.PermissionHelper;
import com.google.mediapipe.framework.AndroidAssetUtil;
import com.google.mediapipe.framework.AndroidPacketCreator;
import com.google.mediapipe.framework.Packet;
import com.google.mediapipe.glutil.EglManager;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
/** Main activity of MediaPipe example apps. */
public class MainActivity extends AppCompatActivity {
private static final String TAG = "MainActivity";
private static final String BINARY_GRAPH_NAME = "objectdetection3d.binarypb";
private static final String INPUT_VIDEO_STREAM_NAME = "input_video";
private static final String OUTPUT_VIDEO_STREAM_NAME = "output_video";
private static final String OBJ_TEXTURE = "texture.bmp";
private static final String OBJ_FILE = "model.obj.uuu";
private static final String BOX_TEXTURE = "classic_colors.png";
private static final String BOX_FILE = "box.obj.uuu";
private static final CameraHelper.CameraFacing CAMERA_FACING = CameraHelper.CameraFacing.BACK;
// Flips the camera-preview frames vertically before sending them into FrameProcessor to be
// processed in a MediaPipe graph, and flips the processed frames back when they are displayed.
// This is needed because OpenGL represents images assuming the image origin is at the bottom-left
// corner, whereas MediaPipe in general assumes the image origin is at top-left.
private static final boolean FLIP_FRAMES_VERTICALLY = true;
// Target resolution should be 4:3 for this application, as expected by the model and tracker.
private static final Size TARGET_RESOLUTION = new Size(1280, 960);
static {
// Load all native libraries needed by the app.
System.loadLibrary("mediapipe_jni");
System.loadLibrary("opencv_java3");
}
// {@link SurfaceTexture} where the camera-preview frames can be accessed.
private SurfaceTexture previewFrameTexture;
// {@link SurfaceView} that displays the camera-preview frames processed by a MediaPipe graph.
private SurfaceView previewDisplayView;
// Creates and manages an {@link EGLContext}.
private EglManager eglManager;
// Sends camera-preview frames into a MediaPipe graph for processing, and displays the processed
// frames onto a {@link Surface}.
private FrameProcessor processor;
// Converts the GL_TEXTURE_EXTERNAL_OES texture from Android camera into a regular texture to be
// consumed by {@link FrameProcessor} and the underlying MediaPipe graph.
private ExternalTextureConverter converter;
// Handles camera access via the {@link CameraX} Jetpack support library.
private CameraXPreviewHelper cameraHelper;
// Assets.
private Bitmap objTexture = null;
private Bitmap boxTexture = null;
Size cameraImageSize;
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
previewDisplayView = new SurfaceView(this);
setupPreviewDisplayView();
// Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g.,
// binary graphs.
AndroidAssetUtil.initializeNativeAssetManager(this);
eglManager = new EglManager(null);
processor =
new FrameProcessor(
this,
eglManager.getNativeContext(),
BINARY_GRAPH_NAME,
INPUT_VIDEO_STREAM_NAME,
OUTPUT_VIDEO_STREAM_NAME);
processor.getVideoSurfaceOutput().setFlipY(FLIP_FRAMES_VERTICALLY);
prepareDemoAssets();
AndroidPacketCreator packetCreator = processor.getPacketCreator();
Map<String, Packet> inputSidePackets = new HashMap<>();
inputSidePackets.put("obj_asset_name", packetCreator.createString(OBJ_FILE));
inputSidePackets.put("box_asset_name", packetCreator.createString(BOX_FILE));
inputSidePackets.put("obj_texture", packetCreator.createRgbaImageFrame(objTexture));
inputSidePackets.put("box_texture", packetCreator.createRgbaImageFrame(boxTexture));
processor.setInputSidePackets(inputSidePackets);
PermissionHelper.checkAndRequestCameraPermissions(this);
}
@Override
protected void onResume() {
super.onResume();
converter = new ExternalTextureConverter(eglManager.getContext());
converter.setFlipY(FLIP_FRAMES_VERTICALLY);
converter.setConsumer(processor);
if (PermissionHelper.cameraPermissionsGranted(this)) {
startCamera();
}
}
@Override
protected void onPause() {
super.onPause();
converter.close();
}
@Override
public void onRequestPermissionsResult(
int requestCode, String[] permissions, int[] grantResults) {
super.onRequestPermissionsResult(requestCode, permissions, grantResults);
PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults);
}
private void setupPreviewDisplayView() {
previewDisplayView.setVisibility(View.GONE);
ViewGroup viewGroup = findViewById(R.id.preview_display_layout);
viewGroup.addView(previewDisplayView);
previewDisplayView
.getHolder()
.addCallback(
new SurfaceHolder.Callback() {
@Override
public void surfaceCreated(SurfaceHolder holder) {
processor.getVideoSurfaceOutput().setSurface(holder.getSurface());
}
@Override
public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
// (Re-)Compute the ideal size of the camera-preview display (the area that the
// camera-preview frames get rendered onto, potentially with scaling and rotation)
// based on the size of the SurfaceView that contains the display.
Size viewSize = new Size(height, height * 3 / 4); // Prefer 3:4 aspect ratio.
Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize);
boolean isCameraRotated = cameraHelper.isCameraRotated();
cameraImageSize = cameraHelper.getFrameSize();
// Connect the converter to the camera-preview frames as its input (via
// previewFrameTexture), and configure the output width and height as the computed
// display size.
converter.setSurfaceTextureAndAttachToGLContext(
previewFrameTexture,
isCameraRotated ? displaySize.getHeight() : displaySize.getWidth(),
isCameraRotated ? displaySize.getWidth() : displaySize.getHeight());
processor.setOnWillAddFrameListener(
(timestamp) -> {
try {
int cameraTextureWidth =
isCameraRotated
? cameraImageSize.getHeight()
: cameraImageSize.getWidth();
int cameraTextureHeight =
isCameraRotated
? cameraImageSize.getWidth()
: cameraImageSize.getHeight();
// Find limiting side and scale to 3:4 aspect ratio
float aspectRatio =
(float) cameraTextureWidth / (float) cameraTextureHeight;
if (aspectRatio > 3.0 / 4.0) {
// width too big
cameraTextureWidth = (int) ((float) cameraTextureHeight * 3.0 / 4.0);
} else {
// height too big
cameraTextureHeight = (int) ((float) cameraTextureWidth * 4.0 / 3.0);
}
Packet widthPacket =
processor.getPacketCreator().createInt32(cameraTextureWidth);
Packet heightPacket =
processor.getPacketCreator().createInt32(cameraTextureHeight);
try {
processor
.getGraph()
.addPacketToInputStream("input_width", widthPacket, timestamp);
processor
.getGraph()
.addPacketToInputStream("input_height", heightPacket, timestamp);
} catch (Exception e) {
Log.e(
TAG,
"MediaPipeException encountered adding packets to width and height"
+ " input streams.");
}
widthPacket.release();
heightPacket.release();
} catch (IllegalStateException ise) {
Log.e(
TAG,
"Exception while adding packets to width and height input streams.");
}
});
}
@Override
public void surfaceDestroyed(SurfaceHolder holder) {
processor.getVideoSurfaceOutput().setSurface(null);
}
});
}
private void startCamera() {
cameraHelper = new CameraXPreviewHelper();
cameraHelper.setOnCameraStartedListener(
surfaceTexture -> {
previewFrameTexture = surfaceTexture;
// Make the display view visible to start showing the preview. This triggers the
// SurfaceHolder.Callback added to (the holder of) previewDisplayView.
previewDisplayView.setVisibility(View.VISIBLE);
});
cameraHelper.startCamera(
this, CAMERA_FACING, /*surfaceTexture=*/ null, /*targetSize=*/ TARGET_RESOLUTION);
cameraImageSize = cameraHelper.getFrameSize();
}
private void prepareDemoAssets() {
AndroidAssetUtil.initializeNativeAssetManager(this);
// We render from raw data with openGL, so disable decoding preprocessing
BitmapFactory.Options decodeOptions = new BitmapFactory.Options();
decodeOptions.inScaled = false;
decodeOptions.inDither = false;
decodeOptions.inPremultiplied = false;
try {
InputStream inputStream = getAssets().open(OBJ_TEXTURE);
objTexture = BitmapFactory.decodeStream(inputStream, null /*outPadding*/, decodeOptions);
inputStream.close();
} catch (Exception e) {
Log.e(TAG, "Error parsing object texture; error: " + e);
throw new IllegalStateException(e);
}
try {
InputStream inputStream = getAssets().open(BOX_TEXTURE);
boxTexture = BitmapFactory.decodeStream(inputStream, null /*outPadding*/, decodeOptions);
inputStream.close();
} catch (Exception e) {
Log.e(TAG, "Error parsing box texture; error: " + e);
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,21 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
exports_files(
srcs = glob(["**"]),
)

View File

@ -0,0 +1,21 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
exports_files(
srcs = glob(["**"]),
)

Binary file not shown (added; 6.9 MiB)

Binary file not shown (added; 17 KiB)

Binary file not shown (added; 410 KiB)

View File

@ -0,0 +1,21 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
exports_files(
srcs = glob(["**"]),
)

Binary file not shown (added; 48 MiB)

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="utf-8"?>
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:layout_width="match_parent"
android:layout_height="match_parent">
<FrameLayout
android:id="@+id/preview_display_layout"
android:layout_width="fill_parent"
android:layout_height="fill_parent"
android:layout_weight="1">
<TextView
android:id="@+id/no_camera_access_view"
android:layout_height="fill_parent"
android:layout_width="fill_parent"
android:gravity="center"
android:text="@string/no_camera_access" />
</FrameLayout>
</androidx.constraintlayout.widget.ConstraintLayout>

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="colorPrimary">#008577</color>
<color name="colorPrimaryDark">#00574B</color>
<color name="colorAccent">#D81B60</color>
</resources>

View File

@ -0,0 +1,4 @@
<resources>
<string name="app_name" translatable="false">Object Detection 3D</string>
<string name="no_camera_access" translatable="false">Please grant camera permissions.</string>
</resources>

View File

@ -0,0 +1,11 @@
<resources>
<!-- Base application theme. -->
<style name="AppTheme" parent="Theme.AppCompat.Light.DarkActionBar">
<!-- Customize your theme here. -->
<item name="colorPrimary">@color/colorPrimary</item>
<item name="colorPrimaryDark">@color/colorPrimaryDark</item>
<item name="colorAccent">@color/colorAccent</item>
</style>
</resources>

View File

@ -63,7 +63,7 @@ COPY . /mediapipe/
# Install bazel
ARG BAZEL_VERSION=1.1.0
RUN mkdir /bazel && \
wget --no-check-certificate -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
wget --no-check-certificate -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \

View File

@ -1,9 +1,11 @@
# Coral Dev Board Setup (experimental) # Coral Dev Board Setup (experimental)
**Dislaimer**: Running MediaPipe on Coral is experimental, and this process may **Disclaimer**: Running MediaPipe on Coral is experimental, and this process may
not be exact and is subject to change. These instructions have only been tested not be exact and is subject to change. These instructions have only been tested
on the [Coral Dev Board](https://coral.ai/products/dev-board/) with Mendel 4.0, on the [Coral Dev Board](https://coral.ai/products/dev-board/)
and may vary for different devices and workstations. running [Mendel Enterprise Day 13](https://coral.ai/software/) OS and
using [Diploria2](https://github.com/google-coral/edgetpu/tree/diploria2)
edgetpu libs, and may vary for different devices and workstations.
This file describes how to prepare a Coral Dev Board and setup a Linux This file describes how to prepare a Coral Dev Board and setup a Linux
Docker container for building MediaPipe applications that run on Edge TPU. Docker container for building MediaPipe applications that run on Edge TPU.
@ -16,10 +18,12 @@ Docker container for building MediaPipe applications that run on Edge TPU.
* Set up the Coral device via the instructions [here](https://coral.withgoogle.com/docs/dev-board/get-started/), and ensure the _mdt_ command works
Note: alias mdt="python3 -m mdt.main" may be needed on some systems
* (on coral device) prepare MediaPipe
cd ~
sudo apt-get update && sudo apt-get install -y git
git clone https://github.com/google/mediapipe.git
mkdir mediapipe/bazel-bin

View File

@ -10,19 +10,25 @@ http_archive(
sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e",
) )
load("@bazel_skylib//lib:versions.bzl", "versions") load("@bazel_skylib//lib:versions.bzl", "versions")
versions.check(minimum_bazel_version = "0.24.1") versions.check(minimum_bazel_version = "1.0.0",
maximum_bazel_version = "1.2.1")
# ABSL cpp library.
# ABSL cpp library lts_2020_02_25
http_archive( http_archive(
name = "com_google_absl", name = "com_google_absl",
# Head commit on 2019-04-12.
# TODO: Switch to the latest absl version when the problem gets
# fixed.
urls = [ urls = [
"https://github.com/abseil/abseil-cpp/archive/a02f62f456f2c4a7ecf2be3104fe0c6e16fbad9a.tar.gz", "https://github.com/abseil/abseil-cpp/archive/20200225.tar.gz",
], ],
sha256 = "d437920d1434c766d22e85773b899c77c672b8b4865d5dc2cd61a29fdff3cf03", # Remove after https://github.com/abseil/abseil-cpp/issues/326 is solved.
strip_prefix = "abseil-cpp-a02f62f456f2c4a7ecf2be3104fe0c6e16fbad9a", patches = [
"@//third_party:com_google_absl_f863b622fe13612433fdf43f76547d5edda0c93001.diff"
],
patch_args = [
"-p1",
],
strip_prefix = "abseil-cpp-20200225",
sha256 = "728a813291bdec2aa46eab8356ace9f75ac2ed9dfe2df5ab603c4e6c09f1c353"
) )
http_archive( http_archive(
@ -72,6 +78,14 @@ http_archive(
],
)
# easyexif
http_archive(
name = "easyexif",
url = "https://github.com/mayanklahiri/easyexif/archive/master.zip",
strip_prefix = "easyexif-master",
build_file = "@//third_party:easyexif.BUILD",
)
# libyuv
http_archive(
name = "libyuv",
@ -103,15 +117,23 @@ http_archive(
],
)
# 2020-02-12
# The last commit before TensorFlow switched to Bazel 2.0
_TENSORFLOW_GIT_COMMIT = "77e9ffb9b2bfb1a4f7056e62d84039626923e328"
_TENSORFLOW_SHA256= "176ccd82f7dd17c5e117b50d353603b129c7a6ccbfebd522ca47cc2a40f33f13"
http_archive(
name = "org_tensorflow",
urls = [
"https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT,
"https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT,
],
# A compatibility patch
patches = [
"@//third_party:org_tensorflow_528e22eae8bf3206189a066032c66e9e5c9b4a61.diff"
],
patch_args = [
"-p1",
],
strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT, strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT,
sha256 = _TENSORFLOW_SHA256, sha256 = _TENSORFLOW_SHA256,
) )
@ -119,8 +141,22 @@ http_archive(
load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace") load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace")
tf_workspace(tf_repo_name = "org_tensorflow") tf_workspace(tf_repo_name = "org_tensorflow")
http_archive(
name = "ceres_solver",
url = "https://github.com/ceres-solver/ceres-solver/archive/1.14.0.zip",
patches = [
"@//third_party:ceres_solver_9bf9588988236279e1262f75d7f4d85711dfa172.diff"
],
patch_args = [
"-p1",
],
strip_prefix = "ceres-solver-1.14.0",
sha256 = "5ba6d0db4e784621fda44a50c58bb23b0892684692f0c623e2063f9c19f192f1"
)
# Please run # Please run
# $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \ # $ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \
# libopencv-calib3d-dev libopencv-features2d-dev \
# libopencv-imgproc-dev libopencv-video-dev # libopencv-imgproc-dev libopencv-video-dev
new_local_repository( new_local_repository(
name = "linux_opencv", name = "linux_opencv",
@ -149,11 +185,10 @@ new_local_repository(
http_archive( http_archive(
name = "android_opencv", name = "android_opencv",
sha256 = "056b849842e4fa8751d09edbb64530cfa7a63c84ccd232d0ace330e27ba55d0b",
build_file = "@//third_party:opencv_android.BUILD", build_file = "@//third_party:opencv_android.BUILD",
strip_prefix = "OpenCV-android-sdk", strip_prefix = "OpenCV-android-sdk",
type = "zip", type = "zip",
url = "https://github.com/opencv/opencv/releases/download/4.1.0/opencv-4.1.0-android-sdk.zip", url = "https://github.com/opencv/opencv/releases/download/3.4.3/opencv-3.4.3-android-sdk.zip",
) )
# After OpenCV 3.2.0, the pre-compiled opencv2.framework has google protobuf symbols, which will # After OpenCV 3.2.0, the pre-compiled opencv2.framework has google protobuf symbols, which will
@ -184,13 +219,18 @@ maven_install(
artifacts = [ artifacts = [
"androidx.annotation:annotation:aar:1.1.0", "androidx.annotation:annotation:aar:1.1.0",
"androidx.appcompat:appcompat:aar:1.1.0-rc01", "androidx.appcompat:appcompat:aar:1.1.0-rc01",
"androidx.camera:camera-core:aar:1.0.0-alpha06",
"androidx.camera:camera-camera2:aar:1.0.0-alpha06",
"androidx.constraintlayout:constraintlayout:aar:1.1.3", "androidx.constraintlayout:constraintlayout:aar:1.1.3",
"androidx.core:core:aar:1.1.0-rc03", "androidx.core:core:aar:1.1.0-rc03",
"androidx.legacy:legacy-support-v4:aar:1.0.0", "androidx.legacy:legacy-support-v4:aar:1.0.0",
"androidx.recyclerview:recyclerview:aar:1.1.0-beta02", "androidx.recyclerview:recyclerview:aar:1.1.0-beta02",
"com.google.android.material:material:aar:1.0.0-rc01", "com.google.android.material:material:aar:1.0.0-rc01",
], ],
repositories = ["https://dl.google.com/dl/android/maven2"], repositories = [
"https://dl.google.com/dl/android/maven2",
"https://repo1.maven.org/maven2",
],
) )
maven_server( maven_server(
@ -285,10 +325,13 @@ http_archive(
build_file = "@//third_party:google_toolbox_for_mac.BUILD", build_file = "@//third_party:google_toolbox_for_mac.BUILD",
) )
### Coral ###
# Coral
#COMMIT=$(git ls-remote https://github.com/google-coral/crosstool master | awk '{print $1}') #COMMIT=$(git ls-remote https://github.com/google-coral/crosstool master | awk '{print $1}')
#SHA256=$(curl -L "https://github.com/google-coral/crosstool/archive/${COMMIT}.tar.gz" | sha256sum | awk '{print $1}') #SHA256=$(curl -L "https://github.com/google-coral/crosstool/archive/${COMMIT}.tar.gz" | sha256sum | awk '{print $1}')
# Oct 2019
#COMMIT=9e00d5be43bf001f883b5700f5d04882fea00229
#SHA256=cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb
http_archive( http_archive(
name = "coral_crosstool", name = "coral_crosstool",
sha256 = "cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb", sha256 = "cb31b1417ccdcf7dd9fca5ec63e1571672372c30427730255997a547569d2feb",

View File

@ -8,7 +8,7 @@ echo ' sh mediapipe/examples/coral/setup.sh '
sleep 3 sleep 3
mkdir opencv32_arm64_libs mkdir -p opencv32_arm64_libs
cp mediapipe/examples/coral/update_sources.sh update_sources.sh cp mediapipe/examples/coral/update_sources.sh update_sources.sh
chmod +x update_sources.sh chmod +x update_sources.sh

View File

@ -11,6 +11,8 @@
2. Build and run the run_autoflip binary to process a local video. 2. Build and run the run_autoflip binary to process a local video.
Note: AutoFlip currently only works with OpenCV 3. Please verify your OpenCV version beforehand.
```bash ```bash
bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \ bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \
mediapipe/examples/desktop/autoflip:run_autoflip mediapipe/examples/desktop/autoflip:run_autoflip

View File

@ -63,12 +63,15 @@ import random
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import urllib
import zipfile import zipfile
from absl import app from absl import app
from absl import flags from absl import flags
from absl import logging from absl import logging
from six.moves import range
from six.moves import urllib
import tensorflow.compat.v1 as tf import tensorflow.compat.v1 as tf
from mediapipe.util.sequence import media_sequence as ms from mediapipe.util.sequence import media_sequence as ms
@ -218,7 +221,7 @@ class Charades(object):
return output_dict return output_dict
if split not in SPLITS: if split not in SPLITS:
raise ValueError("Split %s not in %s" % split, str(SPLITS.keys())) raise ValueError("Split %s not in %s" % split, str(list(SPLITS.keys())))
all_shards = tf.io.gfile.glob( all_shards = tf.io.gfile.glob(
os.path.join(self.path_to_data, SPLITS[split][0] + "-*-of-*")) os.path.join(self.path_to_data, SPLITS[split][0] + "-*-of-*"))
random.shuffle(all_shards) random.shuffle(all_shards)
@ -329,7 +332,7 @@ class Charades(object):
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
urlretrieve = urllib.request.urlretrieve urlretrieve = urllib.request.urlretrieve
else: else:
urlretrieve = urllib.urlretrieve urlretrieve = urllib.request.urlretrieve
logging.info("Creating data directory.") logging.info("Creating data directory.")
tf.io.gfile.makedirs(self.path_to_data) tf.io.gfile.makedirs(self.path_to_data)
logging.info("Downloading license.") logging.info("Downloading license.")

View File

@ -57,11 +57,12 @@ import random
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import urllib
from absl import app from absl import app
from absl import flags from absl import flags
from absl import logging from absl import logging
from six.moves import range
from six.moves import urllib
import tensorflow.compat.v1 as tf import tensorflow.compat.v1 as tf
from mediapipe.util.sequence import media_sequence as ms from mediapipe.util.sequence import media_sequence as ms
@ -198,7 +199,7 @@ class DemoDataset(object):
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
urlretrieve = urllib.request.urlretrieve urlretrieve = urllib.request.urlretrieve
else: else:
urlretrieve = urllib.urlretrieve urlretrieve = urllib.request.urlretrieve
for split in SPLITS: for split in SPLITS:
reader = csv.DictReader(SPLITS[split].split("\n")) reader = csv.DictReader(SPLITS[split].split("\n"))
all_metadata = [] all_metadata = []

View File

@ -73,11 +73,13 @@ import subprocess
import sys import sys
import tarfile import tarfile
import tempfile import tempfile
import urllib
from absl import app from absl import app
from absl import flags from absl import flags
from absl import logging from absl import logging
from six.moves import range
from six.moves import urllib
from six.moves import zip
import tensorflow.compat.v1 as tf import tensorflow.compat.v1 as tf
from mediapipe.util.sequence import media_sequence as ms from mediapipe.util.sequence import media_sequence as ms
@ -96,15 +98,15 @@ FILEPATTERN = "kinetics_700_%s_25fps_rgb_flow"
SPLITS = { SPLITS = {
"train": { "train": {
"shards": 1000, "shards": 1000,
"examples": 540247 "examples": 538779
}, },
"validate": { "validate": {
"shards": 100, "shards": 100,
"examples": 34610 "examples": 34499
}, },
"test": { "test": {
"shards": 100, "shards": 100,
"examples": 69103 "examples": 68847
}, },
"custom": { "custom": {
"csv": None, # Add a CSV for your own data here. "csv": None, # Add a CSV for your own data here.
@ -198,7 +200,7 @@ class Kinetics(object):
return output_dict return output_dict
if split not in SPLITS: if split not in SPLITS:
raise ValueError("Split %s not in %s" % split, str(SPLITS.keys())) raise ValueError("Split %s not in %s" % split, str(list(SPLITS.keys())))
all_shards = tf.io.gfile.glob( all_shards = tf.io.gfile.glob(
os.path.join(self.path_to_data, FILEPATTERN % split + "-*-of-*")) os.path.join(self.path_to_data, FILEPATTERN % split + "-*-of-*"))
random.shuffle(all_shards) random.shuffle(all_shards)
@ -302,11 +304,12 @@ class Kinetics(object):
continue continue
# rename the row with a consistent set of names. # rename the row with a consistent set of names.
if len(csv_row) == 5: if len(csv_row) == 5:
row = dict(zip(["label_name", "video", "start", "end", "split"], row = dict(
csv_row)) list(
zip(["label_name", "video", "start", "end", "split"],
csv_row)))
else: else:
row = dict(zip(["video", "start", "end", "split"], row = dict(list(zip(["video", "start", "end", "split"], csv_row)))
csv_row))
metadata = tf.train.SequenceExample() metadata = tf.train.SequenceExample()
ms.set_example_id(bytes23(row["video"] + "_" + row["start"]), ms.set_example_id(bytes23(row["video"] + "_" + row["start"]),
metadata) metadata)
@ -328,7 +331,7 @@ class Kinetics(object):
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
urlretrieve = urllib.request.urlretrieve urlretrieve = urllib.request.urlretrieve
else: else:
urlretrieve = urllib.urlretrieve urlretrieve = urllib.request.urlretrieve
logging.info("Creating data directory.") logging.info("Creating data directory.")
tf.io.gfile.makedirs(self.path_to_data) tf.io.gfile.makedirs(self.path_to_data)
logging.info("Downloading annotations.") logging.info("Downloading annotations.")
@ -404,7 +407,7 @@ class Kinetics(object):
assert NUM_CLASSES == num_keys, ( assert NUM_CLASSES == num_keys, (
"Found %d labels for split: %s, should be %d" % ( "Found %d labels for split: %s, should be %d" % (
num_keys, name, NUM_CLASSES)) num_keys, name, NUM_CLASSES))
label_map = dict(zip(classes, range(len(classes)))) label_map = dict(list(zip(classes, list(range(len(classes))))))
if SPLITS[name]["examples"] > 0: if SPLITS[name]["examples"] > 0:
assert SPLITS[name]["examples"] == num_examples, ( assert SPLITS[name]["examples"] == num_examples, (
"Found %d examples for split: %s, should be %d" % ( "Found %d examples for split: %s, should be %d" % (

View File

@ -30,6 +30,8 @@
```bash ```bash
# cd to the root directory of the MediaPipe repo # cd to the root directory of the MediaPipe repo
cd - cd -
pip3 install tf_slim
python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph python -m mediapipe.examples.desktop.youtube8m.generate_vggish_frozen_graph
``` ```
@ -47,7 +49,7 @@
5. Run the MediaPipe binary to extract the features. 5. Run the MediaPipe binary to extract the features.
```bash ```bash
bazel build -c opt \ bazel build -c opt --linkopt=-s \
--define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \ --define MEDIAPIPE_DISABLE_GPU=1 --define no_aws_support=true \
mediapipe/examples/desktop/youtube8m:extract_yt8m_features mediapipe/examples/desktop/youtube8m:extract_yt8m_features
@ -87,7 +89,7 @@
3. Build and run the inference binary. 3. Build and run the inference binary.
```bash ```bash
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
mediapipe/examples/desktop/youtube8m:model_inference mediapipe/examples/desktop/youtube8m:model_inference
GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/youtube8m/model_inference \ GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/youtube8m/model_inference \
@ -113,13 +115,13 @@
2. Build the inference binary. 2. Build the inference binary.
```bash ```bash
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
mediapipe/examples/desktop/youtube8m:model_inference mediapipe/examples/desktop/youtube8m:model_inference
``` ```
3. Run the python web server. 3. Run the python web server.
Note: pip install absl-py Note: pip3 install absl-py
```bash ```bash
python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd` python mediapipe/examples/desktop/youtube8m/viewer/server.py --root `pwd`
@ -142,7 +144,7 @@
3. Build and run the inference binary. 3. Build and run the inference binary.
```bash ```bash
bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' \ bazel build -c opt --define='MEDIAPIPE_DISABLE_GPU=1' --linkopt=-s \
mediapipe/examples/desktop/youtube8m:model_inference mediapipe/examples/desktop/youtube8m:model_inference
# segment_size is the number of seconds window of frames. # segment_size is the number of seconds window of frames.

View File

@ -25,7 +25,7 @@ import sys
from absl import app from absl import app
import tensorflow.compat.v1 as tf import tensorflow.compat.v1 as tf
from tensorflow.compat.v1.python.tools import freeze_graph from tensorflow.python.tools import freeze_graph
BASE_DIR = '/tmp/mediapipe/' BASE_DIR = '/tmp/mediapipe/'

View File

@ -1078,10 +1078,16 @@ cc_library(
cc_library( cc_library(
name = "port", name = "port",
hdrs = ["port.h"], hdrs = ["port.h"],
defines = select({
"//conditions:default": [],
}) + select({
"//conditions:default": [],
"//mediapipe/gpu:disable_gpu": ["MEDIAPIPE_DISABLE_GPU"],
}),
visibility = [ visibility = [
"//mediapipe/framework:__subpackages__", "//mediapipe/framework:__subpackages__",
"//mediapipe/framework/port:__pkg__", "//mediapipe/framework/port:__pkg__",
"//mediapipe/util:__pkg__", "//mediapipe/util:__subpackages__",
], ],
) )

View File

@ -135,16 +135,20 @@
// ASSIGN_OR_RETURN(ValueType value, MaybeGetValue(query), _.LogError()); // ASSIGN_OR_RETURN(ValueType value, MaybeGetValue(query), _.LogError());
// //
#define ASSIGN_OR_RETURN(...) \ #define ASSIGN_OR_RETURN(...) \
STATUS_MACROS_IMPL_GET_VARIADIC_(__VA_ARGS__, \ STATUS_MACROS_IMPL_GET_VARIADIC_((__VA_ARGS__, \
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_, \ STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_, \
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_) \ STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_)) \
(__VA_ARGS__) (__VA_ARGS__)
// ================================================================= // =================================================================
// == Implementation details, do not rely on anything below here. == // == Implementation details, do not rely on anything below here. ==
// ================================================================= // =================================================================
#define STATUS_MACROS_IMPL_GET_VARIADIC_(_1, _2, _3, NAME, ...) NAME // MSVC incorrectly expands variadic macros, splice together a macro call to
// work around the bug.
#define STATUS_MACROS_IMPL_GET_VARIADIC_HELPER_(_1, _2, _3, NAME, ...) NAME
#define STATUS_MACROS_IMPL_GET_VARIADIC_(args) \
STATUS_MACROS_IMPL_GET_VARIADIC_HELPER_ args
#define STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_(lhs, rexpr) \ #define STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_2_(lhs, rexpr) \
STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, std::move(_)) STATUS_MACROS_IMPL_ASSIGN_OR_RETURN_3_(lhs, rexpr, std::move(_))
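The replacement above works around MSVC's non-conforming preprocessor, which forwards `__VA_ARGS__` into a nested macro as a single token. The self-contained sketch below (illustrative macro names, not repository code) applies the same splice pattern to a simpler arity-dispatching macro:

```cpp
// Standalone sketch of the MSVC workaround: selecting an implementation macro
// by argument count. The helper is expanded by juxtaposing its name with an
// already-parenthesized argument list instead of calling it directly.
#include <iostream>

#define PRINT_1_(a) std::cout << (a) << std::endl
#define PRINT_2_(a, b) std::cout << (a) << " " << (b) << std::endl

// Dispatch: wrap the arguments in an extra pair of parentheses, then splice
// the helper name against that parenthesized list.
#define PRINT_GET_VARIADIC_HELPER_(_1, _2, NAME, ...) NAME
#define PRINT_GET_VARIADIC_(args) PRINT_GET_VARIADIC_HELPER_ args
#define PRINT(...) \
  PRINT_GET_VARIADIC_((__VA_ARGS__, PRINT_2_, PRINT_1_))(__VA_ARGS__)

int main() {
  PRINT("one argument");      // Expands to PRINT_1_(...).
  PRINT("two", "arguments");  // Expands to PRINT_2_(...).
  return 0;
}
```

On a conforming preprocessor the direct `HELPER_(__VA_ARGS__, ...)` form works as well; the parenthesized splice only exists to keep MSVC from collapsing the argument pack.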

View File

@ -99,7 +99,12 @@ def _encode_binary_proto_impl(ctx):
), ),
mnemonic = "EncodeProto", mnemonic = "EncodeProto",
) )
return struct(files = depset([binarypb]))
output_depset = depset([binarypb])
return [DefaultInfo(
files = output_depset,
data_runfiles = ctx.runfiles(transitive_files = output_depset),
)]
encode_binary_proto = rule( encode_binary_proto = rule(
implementation = _encode_binary_proto_impl, implementation = _encode_binary_proto_impl,

View File

@ -131,7 +131,7 @@ class ShardedMap {
return *this; return *this;
} }
inline bool operator==(const Iterator& other) const { inline bool operator==(const Iterator& other) const {
return iter_ == other.iter_; return shard_ == other.shard_ && iter_ == other.iter_;
} }
inline bool operator!=(const Iterator& other) const { inline bool operator!=(const Iterator& other) const {
return !operator==(other); return !operator==(other);
@ -154,7 +154,10 @@ class ShardedMap {
: shard_(shard), iter_(iter), map_(map) {} : shard_(shard), iter_(iter), map_(map) {}
// Releases all resources. // Releases all resources.
inline void Clear() ABSL_NO_THREAD_SAFETY_ANALYSIS { inline void Clear() ABSL_NO_THREAD_SAFETY_ANALYSIS {
if (map_ && iter_ != map_->maps_.back().end()) { if (!map_) return;
bool is_end = (shard_ == map_->maps_.size() - 1 &&
iter_ == map_->maps_[shard_].end());
if (!is_end) {
map_->mutexes_[shard_].Unlock(); map_->mutexes_[shard_].Unlock();
} }
map_ = nullptr; map_ = nullptr;
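The iterator fix above compares the shard index before the inner iterator and treats end() of the last shard as the global end. A minimal single-threaded sketch (no locking, not the MediaPipe class) of why both checks matter:

```cpp
// Minimal sketch: a sharded-map iterator must compare the shard index as well
// as the inner iterator, and the global end() is end() of the last shard.
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

class ShardedIntMap {
 public:
  explicit ShardedIntMap(size_t num_shards) : maps_(num_shards) {}

  void Insert(int key, int value) { maps_[key % maps_.size()][key] = value; }

  struct Iterator {
    size_t shard;
    std::map<int, int>::iterator iter;
    // Comparing 'iter' alone is not enough: iterators from different shards
    // must never compare equal, so the shard index is checked first (which
    // also avoids comparing iterators of different underlying maps).
    bool operator==(const Iterator& other) const {
      return shard == other.shard && iter == other.iter;
    }
    bool operator!=(const Iterator& other) const { return !(*this == other); }
  };

  // The global end() is defined as end() of the *last* shard, mirroring the
  // "is_end" check used before releasing resources in the real iterator.
  Iterator end() { return {maps_.size() - 1, maps_.back().end()}; }

  Iterator Find(int key) {
    size_t shard = key % maps_.size();
    auto it = maps_[shard].find(key);
    if (it == maps_[shard].end()) return end();
    return {shard, it};
  }

 private:
  std::vector<std::map<int, int>> maps_;
};

int main() {
  ShardedIntMap m(/*num_shards=*/4);
  m.Insert(1, 10);
  std::cout << (m.Find(1) != m.end()) << std::endl;  // 1: found
  std::cout << (m.Find(2) != m.end()) << std::endl;  // 0: not found
  return 0;
}
```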

View File

@ -100,7 +100,6 @@ class Timestamp {
} }
// Special values. // Special values.
static Timestamp Unset(); static Timestamp Unset();
static Timestamp Unstarted(); static Timestamp Unstarted();
static Timestamp PreStream(); static Timestamp PreStream();

View File

@ -264,6 +264,10 @@ static ::mediapipe::Status PrefixNames(std::string prefix,
generator.mutable_input_side_packet(), replace_names)); generator.mutable_input_side_packet(), replace_names));
MP_RETURN_IF_ERROR(TransformStreamNames( MP_RETURN_IF_ERROR(TransformStreamNames(
generator.mutable_output_side_packet(), replace_names)); generator.mutable_output_side_packet(), replace_names));
// Remove input side packets ignored by the subgraph-node.
MP_RETURN_IF_ERROR(RemoveIgnoredStreams(
generator.mutable_input_side_packet(), ignored_input_side_packets));
} }
return ::mediapipe::OkStatus(); return ::mediapipe::OkStatus();
} }

View File

@ -105,17 +105,27 @@ GpuBuffer GpuBufferMultiPool::GetBuffer(int width, int height,
BufferSpec key(width, height, format); BufferSpec key(width, height, format);
auto pool_it = pools_.find(key); auto pool_it = pools_.find(key);
if (pool_it == pools_.end()) { if (pool_it == pools_.end()) {
// Discard the oldest pool in order of creation. // Discard the least recently used pool in LRU cache.
// TODO: implement a better policy.
if (pools_.size() >= kMaxPoolCount) { if (pools_.size() >= kMaxPoolCount) {
auto old_spec = buffer_specs_.front(); auto old_spec = buffer_specs_.front(); // Front has LRU.
buffer_specs_.pop(); buffer_specs_.pop_front();
pools_.erase(old_spec); pools_.erase(old_spec);
} }
buffer_specs_.push(key); buffer_specs_.push_back(key); // Push new spec to back.
std::tie(pool_it, std::ignore) = std::tie(pool_it, std::ignore) =
pools_.emplace(std::piecewise_construct, std::forward_as_tuple(key), pools_.emplace(std::piecewise_construct, std::forward_as_tuple(key),
std::forward_as_tuple(MakeSimplePool(key))); std::forward_as_tuple(MakeSimplePool(key)));
} else {
// Find and move current 'key' spec to back, keeping others in same order.
auto specs_it = buffer_specs_.begin();
while (specs_it != buffer_specs_.end()) {
if (*specs_it == key) {
buffer_specs_.erase(specs_it);
break;
}
++specs_it;
}
buffer_specs_.push_back(key);
} }
return GetBufferFromSimplePool(pool_it->first, pool_it->second); return GetBufferFromSimplePool(pool_it->first, pool_it->second);
} }
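The bookkeeping above keeps `buffer_specs_` in least-recently-used order: on a miss with a full cache the front (LRU) entry is evicted, and on a hit the spec is moved to the back. A stripped-down sketch of the same policy with plain values in place of GPU buffer pools:

```cpp
// Minimal sketch of the LRU bookkeeping: a deque orders keys from least
// recently used (front) to most recently used (back), next to the pool map.
#include <deque>
#include <iostream>
#include <string>
#include <unordered_map>

class LruPools {
 public:
  explicit LruPools(size_t max_pools) : max_pools_(max_pools) {}

  // Returns the pool value for 'spec', creating it if needed and evicting the
  // least recently used entry when the limit is reached.
  int& Get(const std::string& spec) {
    auto it = pools_.find(spec);
    if (it == pools_.end()) {
      if (pools_.size() >= max_pools_) {
        pools_.erase(specs_.front());  // Front holds the LRU spec.
        specs_.pop_front();
      }
      specs_.push_back(spec);
      it = pools_.emplace(spec, 0).first;
    } else {
      // Move 'spec' to the back, keeping the relative order of the others.
      for (auto s = specs_.begin(); s != specs_.end(); ++s) {
        if (*s == spec) {
          specs_.erase(s);
          break;
        }
      }
      specs_.push_back(spec);
    }
    return it->second;
  }

 private:
  size_t max_pools_;
  std::deque<std::string> specs_;              // LRU order: front = oldest.
  std::unordered_map<std::string, int> pools_;
};

int main() {
  LruPools pools(/*max_pools=*/2);
  pools.Get("640x480");
  pools.Get("1280x720");
  pools.Get("640x480");    // Touch: moves 640x480 to the back.
  pools.Get("1920x1080");  // Evicts 1280x720, the least recently used.
  std::cout << "done" << std::endl;
  return 0;
}
```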

View File

@ -22,8 +22,8 @@
#ifndef MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_ #ifndef MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_
#define MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_ #define MEDIAPIPE_GPU_GPU_BUFFER_MULTI_POOL_H_
#include <deque>
#include <limits> #include <limits>
#include <queue>
#include <unordered_map> #include <unordered_map>
#include "absl/synchronization/mutex.h" #include "absl/synchronization/mutex.h"
@ -110,7 +110,7 @@ class GpuBufferMultiPool {
ABSL_GUARDED_BY(mutex_); ABSL_GUARDED_BY(mutex_);
// A queue of BufferSpecs to keep track of the age of each BufferSpec added to // A queue of BufferSpecs to keep track of the age of each BufferSpec added to
// the pool. // the pool.
std::queue<BufferSpec> buffer_specs_; std::deque<BufferSpec> buffer_specs_;
#ifdef __APPLE__ #ifdef __APPLE__
// Texture caches used with this pool. // Texture caches used with this pool.

View File

@ -73,13 +73,15 @@ def _metal_compiler_args(ctx, src, obj, minimum_os_version, copts, diagnostics,
def _metal_compiler_inputs(srcs, hdrs, deps = []): def _metal_compiler_inputs(srcs, hdrs, deps = []):
"""Determines the list of inputs required for a compile action.""" """Determines the list of inputs required for a compile action."""
objc_providers = [x.objc for x in deps if hasattr(x, "objc")]
objc_files = depset() cc_infos = [dep[CcInfo] for dep in deps if CcInfo in dep]
for objc in objc_providers:
objc_files += objc.header
return srcs + hdrs + objc_files.to_list() dep_headers = depset(transitive = [
cc_info.compilation_context.headers
for cc_info in cc_infos
])
return depset(srcs + hdrs, transitive = [dep_headers])
def _metal_library_impl(ctx): def _metal_library_impl(ctx):
"""Implementation for metal_library Skylark rule.""" """Implementation for metal_library Skylark rule."""
@ -144,11 +146,22 @@ def _metal_library_impl(ctx):
**additional_params **additional_params
) )
cc_infos = [dep[CcInfo] for dep in ctx.attr.deps if CcInfo in dep]
if ctx.files.hdrs:
cc_infos.append(
CcInfo(
compilation_context = cc_common.create_compilation_context(
headers = depset([f for f in ctx.files.hdrs]),
),
),
)
return [ return [
DefaultInfo( DefaultInfo(
files = depset([output_lib]), files = depset([output_lib]),
), ),
objc_provider, objc_provider,
cc_common.merge_cc_infos(cc_infos = cc_infos),
# Return the provider for the new bundling logic of rules_apple. # Return the provider for the new bundling logic of rules_apple.
resources.bucketize_typed([output_lib], "unprocessed"), resources.bucketize_typed([output_lib], "unprocessed"),
] ]
@ -156,7 +169,7 @@ def _metal_library_impl(ctx):
METAL_LIBRARY_ATTRS = dicts.add(apple_support.action_required_attrs(), { METAL_LIBRARY_ATTRS = dicts.add(apple_support.action_required_attrs(), {
"srcs": attr.label_list(allow_files = [".metal"], allow_empty = False), "srcs": attr.label_list(allow_files = [".metal"], allow_empty = False),
"hdrs": attr.label_list(allow_files = [".h"]), "hdrs": attr.label_list(allow_files = [".h"]),
"deps": attr.label_list(providers = [["objc"]]), "deps": attr.label_list(providers = [["objc", CcInfo]]),
"copts": attr.string_list(), "copts": attr.string_list(),
"minimum_os_version": attr.string(), "minimum_os_version": attr.string(),
}) })

View File

@ -0,0 +1,56 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load(
"//mediapipe/framework/tool:mediapipe_graph.bzl",
"mediapipe_binary_graph",
)
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
exports_files(glob([
"*.pbtxt",
]))
cc_library(
name = "mobile_calculators",
visibility = ["//visibility:public"],
deps = [
"//mediapipe/calculators/core:packet_resampler_calculator",
"//mediapipe/calculators/image:image_cropping_calculator",
"//mediapipe/gpu:gl_scaler_calculator",
"//mediapipe/graphs/object_detection_3d/calculators:annotations_to_model_matrices_calculator",
"//mediapipe/graphs/object_detection_3d/calculators:gl_animation_overlay_calculator",
"//mediapipe/graphs/object_detection_3d/subgraphs:objectron_detection_gpu",
"//mediapipe/graphs/object_detection_3d/subgraphs:objectron_tracking_gpu",
],
)
mediapipe_binary_graph(
name = "mobile_gpu_binary_graph_shoe",
graph = "shoe_classic_occlusion_tracking.pbtxt",
output_name = "mobile_gpu_shoe.binarypb",
visibility = ["//visibility:public"],
deps = [":mobile_calculators"],
)
mediapipe_binary_graph(
name = "mobile_gpu_binary_graph_chair",
graph = "chair_classic_occlusion_tracking.pbtxt",
output_name = "mobile_gpu_chair.binarypb",
visibility = ["//visibility:public"],
deps = [":mobile_calculators"],
)

View File

@ -0,0 +1,476 @@
# Copyright 2020 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load("//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library")
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:private"])
proto_library(
name = "object_proto",
srcs = [
"object.proto",
],
)
proto_library(
name = "a_r_capture_metadata_proto",
srcs = [
"a_r_capture_metadata.proto",
],
)
proto_library(
name = "annotation_proto",
srcs = [
"annotation_data.proto",
],
deps = [
":a_r_capture_metadata_proto",
":object_proto",
],
)
proto_library(
name = "belief_decoder_config_proto",
srcs = [
"belief_decoder_config.proto",
],
)
proto_library(
name = "camera_parameters_proto",
srcs = [
"camera_parameters.proto",
],
)
proto_library(
name = "frame_annotation_tracker_calculator_proto",
srcs = ["frame_annotation_tracker_calculator.proto"],
deps = [
"//mediapipe/framework:calculator_proto",
],
)
proto_library(
name = "gl_animation_overlay_calculator_proto",
srcs = ["gl_animation_overlay_calculator.proto"],
visibility = ["//visibility:public"],
deps = ["//mediapipe/framework:calculator_proto"],
)
proto_library(
name = "tflite_tensors_to_objects_calculator_proto",
srcs = ["tflite_tensors_to_objects_calculator.proto"],
visibility = ["//visibility:public"],
deps = [
":belief_decoder_config_proto",
"//mediapipe/framework:calculator_proto",
],
)
proto_library(
name = "lift_2d_frame_annotation_to_3d_calculator_proto",
srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"],
visibility = ["//visibility:public"],
deps = [
":belief_decoder_config_proto",
"//mediapipe/framework:calculator_proto",
],
)
proto_library(
name = "annotations_to_model_matrices_calculator_proto",
srcs = ["annotations_to_model_matrices_calculator.proto"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_proto",
],
)
proto_library(
name = "model_matrix_proto",
srcs = ["model_matrix.proto"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_proto",
],
)
proto_library(
name = "annotations_to_render_data_calculator_proto",
srcs = ["annotations_to_render_data_calculator.proto"],
visibility = ["//visibility:public"],
deps = [
"//mediapipe/framework:calculator_proto",
"//mediapipe/util:color_proto",
],
)
mediapipe_cc_proto_library(
name = "object_cc_proto",
srcs = ["object.proto"],
visibility = ["//visibility:public"],
deps = [":object_proto"],
)
mediapipe_cc_proto_library(
name = "a_r_capture_metadata_cc_proto",
srcs = ["a_r_capture_metadata.proto"],
visibility = ["//visibility:public"],
deps = [":a_r_capture_metadata_proto"],
)
mediapipe_cc_proto_library(
name = "annotation_cc_proto",
srcs = ["annotation_data.proto"],
cc_deps = [
":a_r_capture_metadata_cc_proto",
":object_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":annotation_proto"],
)
mediapipe_cc_proto_library(
name = "camera_parameters_cc_proto",
srcs = ["camera_parameters.proto"],
visibility = ["//visibility:public"],
deps = [":camera_parameters_proto"],
)
mediapipe_cc_proto_library(
name = "frame_annotation_tracker_calculator_cc_proto",
srcs = ["frame_annotation_tracker_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":frame_annotation_tracker_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "gl_animation_overlay_calculator_cc_proto",
srcs = ["gl_animation_overlay_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":gl_animation_overlay_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "belief_decoder_config_cc_proto",
srcs = ["belief_decoder_config.proto"],
visibility = ["//visibility:public"],
deps = [":belief_decoder_config_proto"],
)
mediapipe_cc_proto_library(
name = "tflite_tensors_to_objects_calculator_cc_proto",
srcs = ["tflite_tensors_to_objects_calculator.proto"],
cc_deps = [
":belief_decoder_config_cc_proto",
"//mediapipe/framework:calculator_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":tflite_tensors_to_objects_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "lift_2d_frame_annotation_to_3d_calculator_cc_proto",
srcs = ["lift_2d_frame_annotation_to_3d_calculator.proto"],
cc_deps = [
":belief_decoder_config_cc_proto",
"//mediapipe/framework:calculator_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":lift_2d_frame_annotation_to_3d_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "annotations_to_model_matrices_calculator_cc_proto",
srcs = ["annotations_to_model_matrices_calculator.proto"],
cc_deps = ["//mediapipe/framework:calculator_cc_proto"],
visibility = ["//visibility:public"],
deps = [":annotations_to_model_matrices_calculator_proto"],
)
mediapipe_cc_proto_library(
name = "model_matrix_cc_proto",
srcs = ["model_matrix.proto"],
cc_deps = ["//mediapipe/framework:calculator_cc_proto"],
visibility = ["//visibility:public"],
deps = [":model_matrix_proto"],
)
mediapipe_cc_proto_library(
name = "annotations_to_render_data_calculator_cc_proto",
srcs = ["annotations_to_render_data_calculator.proto"],
cc_deps = [
"//mediapipe/framework:calculator_cc_proto",
"//mediapipe/util:color_cc_proto",
],
visibility = ["//visibility:public"],
deps = [":annotations_to_render_data_calculator_proto"],
)
cc_library(
name = "box_util",
srcs = ["box_util.cc"],
hdrs = ["box_util.h"],
deps = [
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:opencv_imgproc",
"//mediapipe/util/tracking:box_tracker_cc_proto",
],
)
cc_library(
name = "frame_annotation_tracker",
srcs = ["frame_annotation_tracker.cc"],
hdrs = ["frame_annotation_tracker.h"],
deps = [
":annotation_cc_proto",
":box_util",
"//mediapipe/framework/port:integral_types",
"//mediapipe/framework/port:logging",
"//mediapipe/util/tracking:box_tracker_cc_proto",
"@com_google_absl//absl/container:btree",
"@com_google_absl//absl/container:flat_hash_set",
],
)
cc_library(
name = "gl_animation_overlay_calculator",
srcs = ["gl_animation_overlay_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":camera_parameters_cc_proto",
":gl_animation_overlay_calculator_cc_proto",
":model_matrix_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/gpu:gl_calculator_helper",
"//mediapipe/gpu:shader_util",
"//mediapipe/util/android:asset_manager_util",
],
alwayslink = 1,
)
cc_library(
name = "decoder",
srcs = [
"decoder.cc",
],
hdrs = [
"decoder.h",
],
deps = [
":annotation_cc_proto",
":belief_decoder_config_cc_proto",
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:opencv_imgproc",
"//mediapipe/framework/port:status",
"@com_google_absl//absl/status",
"@eigen_archive//:eigen",
],
)
cc_library(
name = "tensor_util",
srcs = [
"tensor_util.cc",
],
hdrs = [
"tensor_util.h",
],
deps = [
"//mediapipe/framework/port:logging",
"//mediapipe/framework/port:opencv_core",
"@org_tensorflow//tensorflow/lite:framework",
],
)
cc_library(
name = "box",
srcs = [
"box.cc",
"model.cc",
],
hdrs = [
"box.h",
"model.h",
"types.h",
],
deps = [
":annotation_cc_proto",
":object_cc_proto",
"//mediapipe/framework/port:logging",
"@eigen_archive//:eigen",
],
)
cc_library(
name = "frame_annotation_to_timed_box_list_calculator",
srcs = ["frame_annotation_to_timed_box_list_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":annotation_cc_proto",
":box_util",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:opencv_imgproc",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/util/tracking:box_tracker_cc_proto",
"@com_google_absl//absl/memory",
],
alwayslink = 1,
)
cc_library(
name = "frame_annotation_tracker_calculator",
srcs = ["frame_annotation_tracker_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":annotation_cc_proto",
":frame_annotation_tracker",
":frame_annotation_tracker_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/util/tracking:box_tracker_cc_proto",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/memory",
],
alwayslink = 1,
)
cc_library(
name = "tflite_tensors_to_objects_calculator",
srcs = ["tflite_tensors_to_objects_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":annotation_cc_proto",
":belief_decoder_config_cc_proto",
":decoder",
":tensor_util",
":tflite_tensors_to_objects_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/deps:file_path",
"//mediapipe/framework/formats:detection_cc_proto",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:ret_check",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/types:span",
"@eigen_archive//:eigen",
"@org_tensorflow//tensorflow/lite:framework",
],
alwayslink = 1,
)
cc_library(
name = "lift_2d_frame_annotation_to_3d_calculator",
srcs = ["lift_2d_frame_annotation_to_3d_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":annotation_cc_proto",
":belief_decoder_config_cc_proto",
":decoder",
":lift_2d_frame_annotation_to_3d_calculator_cc_proto",
":tensor_util",
":tflite_tensors_to_objects_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework/deps:file_path",
"//mediapipe/framework/formats:detection_cc_proto",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/framework/port:ret_check",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/types:span",
"@eigen_archive//:eigen",
"@org_tensorflow//tensorflow/lite:framework",
],
alwayslink = 1,
)
cc_library(
name = "annotations_to_model_matrices_calculator",
srcs = ["annotations_to_model_matrices_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":annotation_cc_proto",
":annotations_to_model_matrices_calculator_cc_proto",
":box",
":model_matrix_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework:calculator_options_cc_proto",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/util:color_cc_proto",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@eigen_archive//:eigen",
],
alwayslink = 1,
)
cc_library(
name = "annotations_to_render_data_calculator",
srcs = ["annotations_to_render_data_calculator.cc"],
visibility = ["//visibility:public"],
deps = [
":annotation_cc_proto",
":annotations_to_render_data_calculator_cc_proto",
"//mediapipe/framework:calculator_framework",
"//mediapipe/framework:calculator_options_cc_proto",
"//mediapipe/framework/port:ret_check",
"//mediapipe/util:color_cc_proto",
"//mediapipe/util:render_data_cc_proto",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
],
alwayslink = 1,
)
cc_test(
name = "box_util_test",
srcs = ["box_util_test.cc"],
deps = [
":box_util",
"//mediapipe/framework/port:gtest_main",
"//mediapipe/framework/port:opencv_core",
"//mediapipe/util/tracking:box_tracker_cc_proto",
],
)
cc_test(
name = "frame_annotation_tracker_test",
srcs = ["frame_annotation_tracker_test.cc"],
deps = [
":annotation_cc_proto",
":frame_annotation_tracker",
"//mediapipe/framework/port:gtest_main",
"//mediapipe/framework/port:logging",
"//mediapipe/util/tracking:box_tracker_cc_proto",
"@com_google_absl//absl/container:flat_hash_set",
],
)

View File

@ -0,0 +1,551 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
// Info about the camera characteristics used to capture images and depth data.
// See developer.apple.com/documentation/avfoundation/avcameracalibrationdata
// for more information.
message AVCameraCalibrationData {
// 3x3 row-major matrix relating a camera's internal properties to an ideal
// pinhole-camera model.
// See
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881135-intrinsicmatrix
// for detailed usage information.
repeated float intrinsic_matrix = 1 [packed = true];
// The image dimensions to which the intrinsic_matrix values are relative.
optional float intrinsic_matrix_reference_dimension_width = 2;
optional float intrinsic_matrix_reference_dimension_height = 3;
// 3x4 row-major matrix relating a camera's position and orientation to a
// world or scene coordinate system. Consists of a unitless 3x3 rotation
// matrix (R) on the left and a translation (t) 3x1 vector on the right. The
// translation vector's units are millimeters. For example:
//
// |r1,1 r2,1 r3,1 | t1|
// [R | t] = |r1,2 r2,2 r3,2 | t2|
// |r1,3 r2,3 r3,3 | t3|
//
// is stored as [r11, r21, r31, t1, r12, r22, r32, t2, ...]
//
// See
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881130-extrinsicmatrix?language=objc
// for more information.
repeated float extrinsic_matrix = 4 [packed = true];
// The size, in millimeters, of one image pixel.
optional float pixel_size = 5;
// A list of floating-point values describing radial distortions imparted by
// the camera lens, for use in rectifying camera images.
// See
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881129-lensdistortionlookuptable?language=objc
// for more information.
repeated float lens_distortion_lookup_values = 6 [packed = true];
// A list of floating-point values describing radial distortions for use in
// reapplying camera geometry to a rectified image.
// See
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881132-inverselensdistortionlookuptable?language=objc
// for more information.
repeated float inverse_lens_distortion_lookup_values = 7 [packed = true];
// The offset of the distortion center of the camera lens from the top-left
// corner of the image.
// See
// developer.apple.com/documentation/avfoundation/avcameracalibrationdata/2881131-lensdistortioncenter?language=objc
// for more information.
optional float lens_distortion_center_x = 8;
optional float lens_distortion_center_y = 9;
}
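Given the row-major layout documented above, element (row, col) of the 3x4 `[R | t]` matrix sits at index `row * 4 + col`, so the translation column occupies indices 3, 7 and 11. A small sketch reading it through the generated C++ API (the `.pb.h` include path is assumed from this proto's directory):

```cpp
// Sketch: reading the row-major 3x4 [R | t] extrinsic matrix. Row i, column j
// lives at index i * 4 + j; column 3 holds the translation in millimeters.
#include <iostream>

#include "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.pb.h"

void PrintTranslation(const mediapipe::AVCameraCalibrationData& data) {
  if (data.extrinsic_matrix_size() != 12) {
    std::cerr << "Expected 12 extrinsic values, got "
              << data.extrinsic_matrix_size() << std::endl;
    return;
  }
  for (int row = 0; row < 3; ++row) {
    float t = data.extrinsic_matrix(row * 4 + 3);  // Column 3 of each row.
    std::cout << "t" << (row + 1) << " = " << t << " mm" << std::endl;
  }
}
```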
// Container for depth data information.
// See developer.apple.com/documentation/avfoundation/avdepthdata for more info.
message AVDepthData {
// PNG representation of the grayscale depth data map. See discussion about
// depth_data_map_original_minimum_value, below, for information about how
// to interpret the pixel values.
optional bytes depth_data_map = 1;
// Pixel format type of the original captured depth data.
// See
// developer.apple.com/documentation/corevideo/1563591-pixel_format_identifiers?language=objc
// for the complete list of possible pixel format types. This value represents
// a string for the associated OSType/FourCharCode.
optional string depth_data_type = 2;
// Indicates the general accuracy of the depth_data_map.
// See developer.apple.com/documentation/avfoundation/avdepthdataaccuracy for
// more information.
enum Accuracy {
UNDEFINED_ACCURACY = 0;
// Values in the depth map are usable for foreground/background separation
// but are not absolutely accurate in the physical world.
RELATIVE = 1;
// Values in the depth map are absolutely accurate in the physical world.
ABSOLUTE = 2;
}
optional Accuracy depth_data_accuracy = 3 [default = RELATIVE];
// Indicates whether the depth_data_map contains temporally smoothed data.
optional bool depth_data_filtered = 4;
// Quality of the depth_data_map.
enum Quality {
UNDEFINED_QUALITY = 0;
HIGH = 1;
LOW = 2;
}
optional Quality depth_data_quality = 5;
// Associated calibration data for the depth_data_map.
optional AVCameraCalibrationData camera_calibration_data = 6;
// The original range of values expressed by the depth_data_map, before
// grayscale normalization. For example, if the minimum and maximum values
// indicate a range of [0.5, 2.2], and the depth_data_type value indicates
// it was a depth map, then white pixels (255, 255, 255) will map to 0.5 and
// black pixels (0, 0, 0) will map to 2.2 with the grayscale range linearly
// interpolated in between. Conversely, if the depth_data_type value indicates
// it was a disparity map, then white pixels will map to 2.2 and black pixels
// will map to 0.5.
optional float depth_data_map_original_minimum_value = 7;
optional float depth_data_map_original_maximum_value = 8;
// The width of the depth buffer map.
optional int32 depth_data_map_width = 9;
// The height of the depth buffer map.
optional int32 depth_data_map_height = 10;
// The row-major flattened array of the depth buffer map pixels. This will be
// either a float32 or float16 byte array, depending on 'depth_data_type'.
optional bytes depth_data_map_raw_values = 11;
}
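A sketch of decoding `depth_data_map_raw_values` for the float32 case described above (float16 data would need an extra conversion step; the include path is assumed from this proto's directory):

```cpp
// Sketch: interpreting depth_data_map_raw_values as a row-major
// width x height buffer of float32 values.
#include <cstring>
#include <iostream>
#include <vector>

#include "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.pb.h"

std::vector<float> DecodeDepthFloat32(const mediapipe::AVDepthData& depth) {
  const int width = depth.depth_data_map_width();
  const int height = depth.depth_data_map_height();
  const std::string& raw = depth.depth_data_map_raw_values();
  std::vector<float> pixels(static_cast<size_t>(width) * height);
  if (raw.size() != pixels.size() * sizeof(float)) {
    std::cerr << "Unexpected buffer size; data may be float16." << std::endl;
    return {};
  }
  std::memcpy(pixels.data(), raw.data(), raw.size());
  // Pixel (x, y) of the row-major map is pixels[y * width + x].
  return pixels;
}
```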
// Estimated scene lighting information associated with a captured video frame.
// See developer.apple.com/documentation/arkit/arlightestimate for more info.
message ARLightEstimate {
// The estimated intensity, in lumens, of ambient light throughout the scene.
optional double ambient_intensity = 1;
// The estimated color temperature, in degrees Kelvin, of ambient light
// throughout the scene.
optional double ambient_color_temperature = 2;
// Data describing the estimated lighting environment in all directions.
// Second-level spherical harmonics in separate red, green, and blue data
// planes. Thus, this buffer contains 3 sets of 9 coefficients, or a total of
// 27 values.
// See
// https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928222-sphericalharmonicscoefficients?language=objc
// for more information.
repeated float spherical_harmonics_coefficients = 3 [packed = true];
message DirectionVector {
optional float x = 1;
optional float y = 2;
optional float z = 3;
}
// A vector indicating the orientation of the strongest directional light
// source, normalized in the world-coordinate space.
// See
// https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928221-primarylightdirection?language=objc
// for more information;
optional DirectionVector primary_light_direction = 4;
// The estimated intensity, in lumens, of the strongest directional light
// source in the scene.
// See
// https://developer.apple.com/documentation/arkit/ardirectionallightestimate/2928219-primarylightintensity?language=objc
// for more information.
optional float primary_light_intensity = 5;
}
// Information about the camera position and imaging characteristics for a
// captured video frame.
// See developer.apple.com/documentation/arkit/arcamera for more information.
message ARCamera {
// The general quality of position tracking available when the camera captured
// a frame.
enum TrackingState {
UNDEFINED_TRACKING_STATE = 0;
// Camera position tracking is not available.
UNAVAILABLE = 1;
// Tracking is available, but the quality of results is questionable.
LIMITED = 2;
// Camera position tracking is providing optimal results.
NORMAL = 3;
}
optional TrackingState tracking_state = 1 [default = UNAVAILABLE];
// A possible diagnosis for limited position tracking quality as of when the
// frame was captured.
enum TrackingStateReason {
UNDEFINED_TRACKING_STATE_REASON = 0;
// The current tracking state is not limited.
NONE = 1;
// Not yet enough camera or motion data to provide tracking information.
INITIALIZING = 2;
// The device is moving too fast for accurate image-based position tracking.
EXCESSIVE_MOTION = 3;
// Not enough distinguishable features for image-based position tracking.
INSUFFICIENT_FEATURES = 4;
// Tracking is limited due to a relocalization in progress.
RELOCALIZING = 5;
}
optional TrackingStateReason tracking_state_reason = 2 [default = NONE];
// 4x4 row-major matrix expressing position and orientation of the camera in
// world coordinate space.
// See developer.apple.com/documentation/arkit/arcamera/2866108-transform for
// more information.
repeated float transform = 3 [packed = true];
// The orientation of the camera, expressed as roll, pitch, and yaw values.
message EulerAngles {
optional float roll = 1;
optional float pitch = 2;
optional float yaw = 3;
}
optional EulerAngles euler_angles = 4;
// The width and height, in pixels, of the captured camera image.
optional int32 image_resolution_width = 5;
optional int32 image_resolution_height = 6;
// 3x3 row-major matrix that converts between the 2D camera plane and 3D world
// coordinate space.
// See developer.apple.com/documentation/arkit/arcamera/2875730-intrinsics for
// usage information.
repeated float intrinsics = 7 [packed = true];
// 4x4 row-major transform matrix appropriate for rendering 3D content to
// match the image captured by the camera.
// See
// developer.apple.com/documentation/arkit/arcamera/2887458-projectionmatrix
// for usage information.
repeated float projection_matrix = 8 [packed = true];
// 4x4 row-major transform matrix appropriate for converting from world-space
// to camera space. Relativized for the captured_image orientation (i.e.
// UILandscapeOrientationRight).
// See
// https://developer.apple.com/documentation/arkit/arcamera/2921672-viewmatrixfororientation?language=objc
// for more information.
repeated float view_matrix = 9 [packed = true];
}
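Since `transform` is stored row-major, element (row, col) is at index `row * 4 + col`. A sketch that copies the camera pose into an Eigen matrix (Eigen is already a dependency of the calculators in this package; the exact include path is an assumption):

```cpp
// Sketch: loading the row-major 4x4 camera pose into an Eigen matrix.
#include "Eigen/Dense"
#include "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.pb.h"

Eigen::Matrix4f CameraPose(const mediapipe::ARCamera& camera) {
  Eigen::Matrix4f pose = Eigen::Matrix4f::Identity();
  if (camera.transform_size() == 16) {
    for (int i = 0; i < 16; ++i) {
      pose(i / 4, i % 4) = camera.transform(i);  // Row-major flattening.
    }
  }
  return pose;
}
```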
// Container for a 3D mesh describing face topology.
message ARFaceGeometry {
// Each vertex represents a 3D point in the face mesh, in the face coordinate
// space.
// See developer.apple.com/documentation/arkit/arfacegeometry/2928201-vertices
// for more information.
message Vertex {
optional float x = 1;
optional float y = 2;
optional float z = 3;
}
repeated Vertex vertices = 1;
// The number of elements in the vertices list.
optional int32 vertex_count = 2;
// Each texture coordinate represents UV texture coordinates for the vertex at
// the corresponding index in the vertices buffer.
// See
// developer.apple.com/documentation/arkit/arfacegeometry/2928203-texturecoordinates
// for more information.
message TextureCoordinate {
optional float u = 1;
optional float v = 2;
}
repeated TextureCoordinate texture_coordinates = 3;
// The number of elements in the texture_coordinates list.
optional int32 texture_coordinate_count = 4;
// Each integer value in this ordered list represents an index into the
// vertices and texture_coordinates lists. Each set of three indices
// identifies the vertices comprising a single triangle in the mesh. Each set
// of three indices forms a triangle, so the number of indices in the
// triangle_indices buffer is three times the triangle_count value.
// See
// developer.apple.com/documentation/arkit/arfacegeometry/2928199-triangleindices
// for more information.
repeated int32 triangle_indices = 5 [packed = true];
// The number of triangles described by the triangle_indices buffer.
// See
// developer.apple.com/documentation/arkit/arfacegeometry/2928207-trianglecount
// for more information.
optional int32 triangle_count = 6;
}
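Each consecutive triple in `triangle_indices` names one triangle, so the buffer holds `3 * triangle_count` indices. A sketch that walks the face mesh accordingly (include path assumed from this proto's directory):

```cpp
// Sketch: iterating the face mesh triangles and printing their centroids.
#include <iostream>

#include "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.pb.h"

void PrintTriangleCentroids(const mediapipe::ARFaceGeometry& geometry) {
  for (int t = 0; t < geometry.triangle_count(); ++t) {
    float cx = 0.f, cy = 0.f, cz = 0.f;
    for (int k = 0; k < 3; ++k) {
      // Indices 3t, 3t+1, 3t+2 select the three vertices of triangle t.
      const auto& v = geometry.vertices(geometry.triangle_indices(3 * t + k));
      cx += v.x() / 3.f;
      cy += v.y() / 3.f;
      cz += v.z() / 3.f;
    }
    std::cout << "triangle " << t << " centroid: (" << cx << ", " << cy
              << ", " << cz << ")" << std::endl;
  }
}
```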
// Contains a list of blend shape entries wherein each item maps a specific
// blend shape location to its associated coefficient.
message ARBlendShapeMap {
message MapEntry {
// Identifier for the specific facial feature.
// See developer.apple.com/documentation/arkit/arblendshapelocation for a
// complete list of identifiers.
optional string blend_shape_location = 1;
// Indicates the current position of the feature relative to its neutral
// configuration, ranging from 0.0 (neutral) to 1.0 (maximum movement).
optional float blend_shape_coefficient = 2;
}
repeated MapEntry entries = 1;
}
// Information about the pose, topology, and expression of a detected face.
// See developer.apple.com/documentation/arkit/arfaceanchor for more info.
message ARFaceAnchor {
// A coarse triangle mesh representing the topology of the detected face.
optional ARFaceGeometry geometry = 1;
// A map of named coefficients representing the detected facial expression in
// terms of the movement of specific facial features.
optional ARBlendShapeMap blend_shapes = 2;
// 4x4 row-major matrix encoding the position, orientation, and scale of the
// anchor relative to the world coordinate space.
// See
// https://developer.apple.com/documentation/arkit/aranchor/2867981-transform?language=objc
// for more information.
repeated float transform = 3;
// Indicates whether the anchor's transform is valid. Frames that have a face
// anchor with this value set to NO should probably be ignored.
optional bool is_tracked = 4;
}
// Container for a 3D mesh.
message ARPlaneGeometry {
message Vertex {
optional float x = 1;
optional float y = 2;
optional float z = 3;
}
// Each texture coordinate represents UV texture coordinates for the vertex at
// the corresponding index in the vertices buffer.
// See
// https://developer.apple.com/documentation/arkit/arfacegeometry/2928203-texturecoordinates
// for more information.
message TextureCoordinate {
optional float u = 1;
optional float v = 2;
}
// A buffer of vertex positions for each point in the plane mesh.
repeated Vertex vertices = 1;
// The number of elements in the vertices buffer.
optional int32 vertex_count = 2;
// A buffer of texture coordinate values for each point in the plane mesh.
repeated TextureCoordinate texture_coordinates = 3;
// The number of elements in the texture_coordinates buffer.
optional int32 texture_coordinate_count = 4;
// Each integer value in this ordered list represents an index into the
// vertices and texture_coordinates lists. Each set of three indices
// identifies the vertices comprising a single triangle in the mesh. Each set
// of three indices forms a triangle, so the number of indices in the
// triangle_indices buffer is three times the triangle_count value.
// See
// https://developer.apple.com/documentation/arkit/arplanegeometry/2941051-triangleindices
// for more information.
repeated int32 triangle_indices = 5 [packed = true];
// Each set of three indices forms a triangle, so the number of indices in the
// triangle_indices buffer is three times the triangle_count value.
// See
// https://developer.apple.com/documentation/arkit/arplanegeometry/2941058-trianglecount
// for more information.
optional int32 triangle_count = 6;
// Each value in this buffer represents the position of a vertex along the
// boundary polygon of the estimated plane. The owning plane anchor's
// transform matrix defines the coordinate system for these points.
// See
// https://developer.apple.com/documentation/arkit/arplanegeometry/2941052-boundaryvertices
// for more information.
repeated Vertex boundary_vertices = 7;
// The number of elements in the boundary_vertices buffer.
optional int32 boundary_vertex_count = 8;
}
// Information about the position and orientation of a real-world flat surface.
// See https://developer.apple.com/documentation/arkit/arplaneanchor for more
// information.
message ARPlaneAnchor {
enum Alignment {
UNDEFINED = 0;
// The plane is perpendicular to gravity.
HORIZONTAL = 1;
// The plane is parallel to gravity.
VERTICAL = 2;
}
// Wrapper for a 3D point / vector within the plane. See extent and center
// values for more information.
message PlaneVector {
optional float x = 1;
optional float y = 2;
optional float z = 3;
}
enum PlaneClassification {
NONE = 0;
WALL = 1;
FLOOR = 2;
CEILING = 3;
TABLE = 4;
SEAT = 5;
}
// The classification status for the plane.
enum PlaneClassificationStatus {
// The classification process for the plane anchor has completed but the
// result is inconclusive.
UNKNOWN = 0;
// No classification information can be provided (set on error or if the
// device does not support plane classification).
UNAVAILABLE = 1;
// The classification process has not completed.
UNDETERMINED = 2;
// The classification process for the plane anchor has completed.
KNOWN = 3;
}
// The ID of the plane.
optional string identifier = 1;
// 4x4 row-major matrix encoding the position, orientation, and scale of the
// anchor relative to the world coordinate space.
// See
// https://developer.apple.com/documentation/arkit/aranchor/2867981-transform
// for more information.
repeated float transform = 2;
// The general orientation of the detected plane with respect to gravity.
optional Alignment alignment = 3;
// A coarse triangle mesh representing the general shape of the detected
// plane.
optional ARPlaneGeometry geometry = 4;
// The center point of the plane relative to its anchor position.
// Although the type of this property is a 3D vector, a plane anchor is always
// two-dimensional, and is always positioned in only the x and z directions
// relative to its transform position. (That is, the y-component of this
// vector is always zero.)
// See
// https://developer.apple.com/documentation/arkit/arplaneanchor/2882056-center
// for more information.
optional PlaneVector center = 5;
// The estimated width and length of the detected plane.
// See
// https://developer.apple.com/documentation/arkit/arplaneanchor/2882055-extent
// for more information.
optional PlaneVector extent = 6;
// A Boolean value that indicates whether plane classification is available on
// the current device. On devices without plane classification support, all
// plane anchors report a classification value of NONE
// and a classification_status value of UNAVAILABLE.
optional bool classification_supported = 7;
// A general characterization of what kind of real-world surface the plane
// anchor represents.
// See
// https://developer.apple.com/documentation/arkit/arplaneanchor/2990936-classification
// for more information.
optional PlaneClassification classification = 8;
// The current state of ARKit's process for classifying the plane anchor.
// When this property's value is KNOWN, the classification property represents
// ARKit's characterization of the real-world surface corresponding to the
// plane anchor.
// See
// https://developer.apple.com/documentation/arkit/arplaneanchor/2990937-classificationstatus
// for more information.
optional PlaneClassificationStatus classification_status = 9;
}
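Reading the comments above, `center` lies in the anchor's x-z plane and `extent` gives the plane's estimated width and length. Assuming `extent` is the full size (as in ARKit), the four corners of the plane in the anchor's local frame can be sketched as follows (include path assumed):

```cpp
// Sketch, under the assumption that 'extent' holds the plane's full width and
// length: the plane's four corners in the anchor's local coordinate frame.
#include <array>

#include "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.pb.h"

struct LocalPoint {
  float x, y, z;
};

std::array<LocalPoint, 4> PlaneCorners(const mediapipe::ARPlaneAnchor& plane) {
  const auto& center = plane.center();
  const auto& extent = plane.extent();
  const float hx = extent.x() / 2.f;
  const float hz = extent.z() / 2.f;
  // The y component of center is always zero, so the corners stay in the
  // anchor's x-z plane; applying 'transform' would move them to world space.
  return {{{center.x() - hx, 0.f, center.z() - hz},
           {center.x() + hx, 0.f, center.z() - hz},
           {center.x() + hx, 0.f, center.z() + hz},
           {center.x() - hx, 0.f, center.z() + hz}}};
}
```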
// A collection of points in the world coordinate space.
// See https://developer.apple.com/documentation/arkit/arpointcloud for more
// information.
message ARPointCloud {
message Point {
optional float x = 1;
optional float y = 2;
optional float z = 3;
}
// The number of points in the cloud.
optional int32 count = 1;
// The list of detected points.
repeated Point point = 2;
// A list of unique identifiers corresponding to detected feature points.
// Each identifier in this list corresponds to the point at the same index
// in the points array.
repeated int64 identifier = 3 [packed = true];
}
// Video image and face position tracking information.
// See developer.apple.com/documentation/arkit/arframe for more information.
message ARFrame {
// The timestamp for the frame.
optional double timestamp = 1;
// The depth data associated with the frame. Not all frames have depth data.
optional AVDepthData depth_data = 2;
// The depth data object timestamp associated with the frame. May differ from
// the frame timestamp value. Only set when the frame has depth_data.
optional double depth_data_timestamp = 3;
// Camera information associated with the frame.
optional ARCamera camera = 4;
// Light information associated with the frame.
optional ARLightEstimate light_estimate = 5;
// Face anchor information associated with the frame. Not all frames have an
// active face anchor.
optional ARFaceAnchor face_anchor = 6;
// Plane anchors associated with the frame. Not all frames have a plane
// anchor. Plane anchors and face anchors are mutually exclusive.
repeated ARPlaneAnchor plane_anchor = 7;
// The current intermediate results of the scene analysis used to perform
// world tracking.
// See
// https://developer.apple.com/documentation/arkit/arframe/2887449-rawfeaturepoints
// for more information.
optional ARPointCloud raw_feature_points = 8;
}
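For reference, a minimal C++ sketch of filling these messages, assuming the standard protobuf-generated API for this file under the mediapipe package (the identifier string and numeric values below are illustrative):
#include "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.pb.h"
// Fills one frame with a single vertical plane anchor and a one-point cloud.
void FillExampleFrame(mediapipe::ARFrame* frame) {
  frame->set_timestamp(0.0333);  // Illustrative timestamp in seconds.
  mediapipe::ARPlaneAnchor* plane = frame->add_plane_anchor();
  plane->set_identifier("plane-0");  // Hypothetical identifier.
  // 4x4 row-major identity transform for the anchor.
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      plane->add_transform(r == c ? 1.0f : 0.0f);
    }
  }
  plane->set_alignment(mediapipe::ARPlaneAnchor::VERTICAL);
  plane->mutable_center()->set_x(0.0f);
  plane->mutable_center()->set_y(0.0f);  // Always zero for a plane anchor.
  plane->mutable_center()->set_z(0.0f);
  mediapipe::ARPointCloud* cloud = frame->mutable_raw_feature_points();
  cloud->set_count(1);
  mediapipe::ARPointCloud::Point* point = cloud->add_point();
  point->set_x(0.1f);
  point->set_y(0.2f);
  point->set_z(0.5f);
  cloud->add_identifier(42);
}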

View File

@ -0,0 +1,92 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package mediapipe;
import "mediapipe/graphs/object_detection_3d/calculators/a_r_capture_metadata.proto";
import "mediapipe/graphs/object_detection_3d/calculators/object.proto";
// Projection of a 3D point on an image, and its metric depth.
message NormalizedPoint2D {
// x-y position of the 2d keypoint in the image coordinate system.
// u,v \in [0, 1], where top left corner is (0, 0) and the bottom-right corner
// is (1, 1).
float x = 1;
float y = 2;
// The depth of the point in the camera coordinate system (in meters).
float depth = 3;
}
// The 3D point in the camera coordinate system; the scale is in meters.
message Point3D {
float x = 1;
float y = 2;
float z = 3;
}
message AnnotatedKeyPoint {
int32 id = 1;
Point3D point_3d = 2;
NormalizedPoint2D point_2d = 3;
}
message ObjectAnnotation {
// Reference to the object identifier in ObjectInstance.
int32 object_id = 1;
// For each object, list all the annotated keypoints here.
// E.g. for bounding boxes we have 8 keypoints, hands have 21 keypoints, etc.
// These normalized points are the projections of the Object's 3D keypoints
// onto the current frame using the frame's camera pose.
repeated AnnotatedKeyPoint keypoints = 2;
// Visibility of this annotation in a frame.
float visibility = 3;
}
message FrameAnnotation {
// Unique frame id, corresponds to images.
int32 frame_id = 1;
// List of the annotated objects in this frame. Depending on how many objects
// are observable in this frame, we might have none or as many as
// sequence.objects_size() annotations.
repeated ObjectAnnotation annotations = 2;
// Information about the camera transformation (in the world coordinate
// system) and imaging characteristics for a captured video frame.
ARCamera camera = 3;
// The timestamp for the frame.
double timestamp = 4;
// Plane center and normal in camera frame.
repeated float plane_center = 5;
repeated float plane_normal = 6;
}
// The Sequence message contains the annotation data for the entire video clip.
message Sequence {
// List of all the annotated 3D objects in this sequence in the world
// coordinate system. Given the camera pose of each frame (also in the world
// coordinate system), these objects' bounding boxes can be projected to each
// frame to get the per-frame annotation (i.e. frame_annotations below).
repeated Object objects = 1;
// List of annotated data for each frame in the sequence, plus frame information.
repeated FrameAnnotation frame_annotations = 2;
}
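A short consumer-side sketch, assuming the protobuf-generated C++ API for this file (the function name is illustrative):
#include <cstdio>
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
// Prints every annotated 2D keypoint in the sequence, frame by frame.
void PrintKeypoints(const mediapipe::Sequence& sequence) {
  for (const mediapipe::FrameAnnotation& frame : sequence.frame_annotations()) {
    for (const mediapipe::ObjectAnnotation& object : frame.annotations()) {
      for (const mediapipe::AnnotatedKeyPoint& keypoint : object.keypoints()) {
        const mediapipe::NormalizedPoint2D& p = keypoint.point_2d();
        std::printf("frame %d object %d keypoint %d: (%.3f, %.3f), depth %.3f m\n",
                    frame.frame_id(), object.object_id(), keypoint.id(),
                    p.x(), p.y(), p.depth());
      }
    }
  }
}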

View File

@ -0,0 +1,209 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "Eigen/Dense"
#include "Eigen/src/Core/util/Constants.h"
#include "Eigen/src/Geometry/Quaternion.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/calculator_options.pb.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotations_to_model_matrices_calculator.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/box.h"
#include "mediapipe/graphs/object_detection_3d/calculators/model_matrix.pb.h"
#include "mediapipe/util/color.pb.h"
namespace mediapipe {
namespace {
constexpr char kAnnotationTag[] = "ANNOTATIONS";
constexpr char kModelMatricesTag[] = "MODEL_MATRICES";
using Matrix4fRM = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
} // namespace
// Converts the box prediction from Objectron Model to the Model matrices
// to be rendered.
//
// Input:
// ANNOTATIONS - Frame annotations with lifted 3D points, the points are in
// Objectron coordinate system.
// Output:
// MODEL_MATRICES - Result ModelMatrices, in OpenGL coordinate system.
//
// Usage example:
// node {
// calculator: "AnnotationsToModelMatricesCalculator"
// input_stream: "ANNOTATIONS:objects"
// output_stream: "MODEL_MATRICES:model_matrices"
//}
class AnnotationsToModelMatricesCalculator : public CalculatorBase {
public:
AnnotationsToModelMatricesCalculator() {}
~AnnotationsToModelMatricesCalculator() override {}
AnnotationsToModelMatricesCalculator(
const AnnotationsToModelMatricesCalculator&) = delete;
AnnotationsToModelMatricesCalculator& operator=(
const AnnotationsToModelMatricesCalculator&) = delete;
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
private:
::mediapipe::Status GetModelMatricesForAnnotations(
const FrameAnnotation& annotations,
TimedModelMatrixProtoList* model_matrix_list);
AnnotationsToModelMatricesCalculatorOptions options_;
Eigen::Vector3f model_scale_;
Matrix4fRM model_transformation_;
};
REGISTER_CALCULATOR(AnnotationsToModelMatricesCalculator);
::mediapipe::Status AnnotationsToModelMatricesCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)) << "No input stream found.";
if (cc->Inputs().HasTag(kAnnotationTag)) {
cc->Inputs().Tag(kAnnotationTag).Set<FrameAnnotation>();
}
if (cc->Outputs().HasTag(kModelMatricesTag)) {
cc->Outputs().Tag(kModelMatricesTag).Set<TimedModelMatrixProtoList>();
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status AnnotationsToModelMatricesCalculator::Open(
CalculatorContext* cc) {
RET_CHECK(cc->Inputs().HasTag(kAnnotationTag));
cc->SetOffset(TimestampDiff(0));
options_ = cc->Options<AnnotationsToModelMatricesCalculatorOptions>();
if (options_.model_scale_size() == 3) {
model_scale_ =
Eigen::Map<const Eigen::Vector3f>(options_.model_scale().data());
} else {
model_scale_.setOnes();
}
if (options_.model_transformation_size() == 16) {
model_transformation_ =
Eigen::Map<const Matrix4fRM>(options_.model_transformation().data());
} else {
model_transformation_.setIdentity();
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status AnnotationsToModelMatricesCalculator::Process(
CalculatorContext* cc) {
auto model_matrices = std::make_unique<TimedModelMatrixProtoList>();
const FrameAnnotation& annotations =
cc->Inputs().Tag(kAnnotationTag).Get<FrameAnnotation>();
if (!GetModelMatricesForAnnotations(annotations, model_matrices.get()).ok()) {
return ::mediapipe::InvalidArgumentError(
"Error in GetModelMatricesForBoxes");
}
cc->Outputs()
.Tag(kModelMatricesTag)
.Add(model_matrices.release(), cc->InputTimestamp());
return ::mediapipe::OkStatus();
}
::mediapipe::Status
AnnotationsToModelMatricesCalculator::GetModelMatricesForAnnotations(
const FrameAnnotation& annotations,
TimedModelMatrixProtoList* model_matrix_list) {
if (model_matrix_list == nullptr) {
return ::mediapipe::InvalidArgumentError("model_matrix_list is nullptr");
}
model_matrix_list->clear_model_matrix();
Box box("category");
for (const auto& object : annotations.annotations()) {
TimedModelMatrixProto* model_matrix = model_matrix_list->add_model_matrix();
model_matrix->set_id(object.object_id());
// Fit a box to the original vertices to estimate the scale of the box
std::vector<Eigen::Vector3f> vertices;
for (const auto& keypoint : object.keypoints()) {
const auto& point = keypoint.point_3d();
Eigen::Vector3f p(point.x(), point.y(), point.z());
vertices.emplace_back(p);
}
box.Fit(vertices);
// Re-scale the box if necessary
Eigen::Vector3f estimated_scale = box.GetScale();
vertices.clear();
for (const auto& keypoint : object.keypoints()) {
const auto& point = keypoint.point_3d();
Eigen::Vector3f p(point.x(), point.y(), point.z());
vertices.emplace_back(p);
}
box.Fit(vertices);
Matrix4fRM object_transformation = box.GetTransformation();
Matrix4fRM model_view;
Matrix4fRM pursuit_model;
// The reference view is
//
// ref << 0., 0., 1., 0.,
// -1., 0., 0., 0.,
// 0., -1., 0., 0.,
// 0., 0., 0., 1.;
// We have pursuit_model * model = model_view, to get pursuit_model:
// pursuit_model = model_view * model^-1
// clang-format off
pursuit_model << 0.0, 1.0, 0.0, 0.0,
1.0, 0.0, 0.0, 0.0,
0.0, 0.0, 1.0, 0.0,
0.0, 0.0, 0.0, 1.0;
// clang-format on
// Re-scale the CAD model to the scale of the estimated bounding box.
const Eigen::Vector3f scale = model_scale_.cwiseProduct(estimated_scale);
const Matrix4fRM model =
model_transformation_.array().colwise() * scale.homogeneous().array();
// Finally compute the model_view matrix.
model_view = pursuit_model * object_transformation * model;
for (int i = 0; i < model_view.rows(); ++i) {
for (int j = 0; j < model_view.cols(); ++j) {
model_matrix->add_matrix_entries(model_view(i, j));
}
}
}
return ::mediapipe::OkStatus();
}
} // namespace mediapipe

View File

@ -0,0 +1,33 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
message AnnotationsToModelMatricesCalculatorOptions {
extend CalculatorOptions {
optional AnnotationsToModelMatricesCalculatorOptions ext = 290166283;
}
// Vector of size 3 indicating the scale vector [x, y, z]. We will re-scale
// the model size with this vector. (Defaults to [1., 1., 1.])
repeated float model_scale = 1;
// 4x4 Row major matrix denoting the transformation from the model to the
// Deep Pursuit 3D coordinate system (where front is +z, and up is +y).
repeated float model_transformation = 2;
}

View File

@ -0,0 +1,273 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/calculator_options.pb.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotations_to_render_data_calculator.pb.h"
#include "mediapipe/util/color.pb.h"
#include "mediapipe/util/render_data.pb.h"
namespace mediapipe {
namespace {
constexpr char kAnnotationTag[] = "ANNOTATIONS";
constexpr char kRenderDataTag[] = "RENDER_DATA";
constexpr char kKeypointLabel[] = "KEYPOINT";
constexpr int kMaxLandmarkThickness = 18;
inline void SetColor(RenderAnnotation* annotation, const Color& color) {
annotation->mutable_color()->set_r(color.r());
annotation->mutable_color()->set_g(color.g());
annotation->mutable_color()->set_b(color.b());
}
// Remap x from range [lo hi] to range [0 1] then multiply by scale.
inline float Remap(float x, float lo, float hi, float scale) {
return (x - lo) / (hi - lo + 1e-6) * scale;
}
inline void GetMinMaxZ(const FrameAnnotation& annotations, float* z_min,
float* z_max) {
*z_min = std::numeric_limits<float>::max();
*z_max = std::numeric_limits<float>::lowest();
// Use a global depth scale for all the objects in the scene
for (const auto& object : annotations.annotations()) {
for (const auto& keypoint : object.keypoints()) {
*z_min = std::min(keypoint.point_2d().depth(), *z_min);
*z_max = std::max(keypoint.point_2d().depth(), *z_max);
}
}
}
void SetColorSizeValueFromZ(float z, float z_min, float z_max,
RenderAnnotation* render_annotation) {
const int color_value = 255 - static_cast<int>(Remap(z, z_min, z_max, 255));
::mediapipe::Color color;
color.set_r(color_value);
color.set_g(color_value);
color.set_b(color_value);
SetColor(render_annotation, color);
const int thickness = static_cast<int>((1.f - Remap(z, z_min, z_max, 1)) *
kMaxLandmarkThickness);
render_annotation->set_thickness(thickness);
}
} // namespace
// A calculator that converts FrameAnnotation proto to RenderData proto for
// visualization. The input should be the FrameAnnotation proto buffer. It is
// also possible to specify the connections between landmarks.
//
// Example config:
// node {
// calculator: "AnnotationsToRenderDataCalculator"
// input_stream: "ANNOTATIONS:annotations"
// output_stream: "RENDER_DATA:render_data"
// options {
// [AnnotationsToRenderDataCalculator.ext] {
// landmark_connections: [0, 1, 1, 2]
// landmark_color { r: 0 g: 255 b: 0 }
// connection_color { r: 0 g: 255 b: 0 }
// thickness: 4.0
// }
// }
// }
class AnnotationsToRenderDataCalculator : public CalculatorBase {
public:
AnnotationsToRenderDataCalculator() {}
~AnnotationsToRenderDataCalculator() override {}
AnnotationsToRenderDataCalculator(const AnnotationsToRenderDataCalculator&) =
delete;
AnnotationsToRenderDataCalculator& operator=(
const AnnotationsToRenderDataCalculator&) = delete;
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
private:
static void SetRenderAnnotationColorThickness(
const AnnotationsToRenderDataCalculatorOptions& options,
RenderAnnotation* render_annotation);
static RenderAnnotation* AddPointRenderData(
const AnnotationsToRenderDataCalculatorOptions& options,
RenderData* render_data);
// Add a command to draw a line in the rendering queue. The line is drawn from
// (start_x, start_y) to (end_x, end_y). The input x,y can either be in pixel
// or normalized coordinate [0, 1] as indicated by the normalized flag.
static void AddConnectionToRenderData(
float start_x, float start_y, float end_x, float end_y,
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
RenderData* render_data);
// Same as the function above, but instead of using the configured color to
// render the line, it colors the line according to the two depth values:
// gray_val1 is the color of the starting point and gray_val2 is the color of
// the ending point. The line is drawn with a gradient from gray_val1 to
// gray_val2. The gray values range from 0 to 255 (black to white).
static void AddConnectionToRenderData(
float start_x, float start_y, float end_x, float end_y,
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
int gray_val1, int gray_val2, RenderData* render_data);
AnnotationsToRenderDataCalculatorOptions options_;
};
REGISTER_CALCULATOR(AnnotationsToRenderDataCalculator);
::mediapipe::Status AnnotationsToRenderDataCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(cc->Inputs().HasTag(kAnnotationTag)) << "No input stream found.";
if (cc->Inputs().HasTag(kAnnotationTag)) {
cc->Inputs().Tag(kAnnotationTag).Set<FrameAnnotation>();
}
cc->Outputs().Tag(kRenderDataTag).Set<RenderData>();
return ::mediapipe::OkStatus();
}
::mediapipe::Status AnnotationsToRenderDataCalculator::Open(
CalculatorContext* cc) {
cc->SetOffset(TimestampDiff(0));
options_ = cc->Options<AnnotationsToRenderDataCalculatorOptions>();
return ::mediapipe::OkStatus();
}
::mediapipe::Status AnnotationsToRenderDataCalculator::Process(
CalculatorContext* cc) {
auto render_data = absl::make_unique<RenderData>();
bool visualize_depth = options_.visualize_landmark_depth();
float z_min = 0.f;
float z_max = 0.f;
if (cc->Inputs().HasTag(kAnnotationTag)) {
const auto& annotations =
cc->Inputs().Tag(kAnnotationTag).Get<FrameAnnotation>();
RET_CHECK_EQ(options_.landmark_connections_size() % 2, 0)
<< "Number of entries in landmark connections must be a multiple of 2";
if (visualize_depth) {
GetMinMaxZ(annotations, &z_min, &z_max);
// Only change rendering if there are actually z values other than 0.
visualize_depth &= ((z_max - z_min) > 1e-3);
}
for (const auto& object : annotations.annotations()) {
for (const auto& keypoint : object.keypoints()) {
auto* keypoint_data_render =
AddPointRenderData(options_, render_data.get());
auto* point = keypoint_data_render->mutable_point();
if (visualize_depth) {
SetColorSizeValueFromZ(keypoint.point_2d().depth(), z_min, z_max,
keypoint_data_render);
}
point->set_normalized(true);
point->set_x(keypoint.point_2d().x());
point->set_y(keypoint.point_2d().y());
}
// Add edges
for (int i = 0; i < options_.landmark_connections_size(); i += 2) {
const auto& ld0 =
object.keypoints(options_.landmark_connections(i)).point_2d();
const auto& ld1 =
object.keypoints(options_.landmark_connections(i + 1)).point_2d();
const bool normalized = true;
if (visualize_depth) {
const int gray_val1 =
255 - static_cast<int>(Remap(ld0.depth(), z_min, z_max, 255));
const int gray_val2 =
255 - static_cast<int>(Remap(ld1.depth(), z_min, z_max, 255));
AddConnectionToRenderData(ld0.x(), ld0.y(), ld1.x(), ld1.y(),
options_, normalized, gray_val1, gray_val2,
render_data.get());
} else {
AddConnectionToRenderData(ld0.x(), ld0.y(), ld1.x(), ld1.y(),
options_, normalized, render_data.get());
}
}
}
}
cc->Outputs()
.Tag(kRenderDataTag)
.Add(render_data.release(), cc->InputTimestamp());
return ::mediapipe::OkStatus();
}
void AnnotationsToRenderDataCalculator::AddConnectionToRenderData(
float start_x, float start_y, float end_x, float end_y,
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
int gray_val1, int gray_val2, RenderData* render_data) {
auto* connection_annotation = render_data->add_render_annotations();
RenderAnnotation::GradientLine* line =
connection_annotation->mutable_gradient_line();
line->set_x_start(start_x);
line->set_y_start(start_y);
line->set_x_end(end_x);
line->set_y_end(end_y);
line->set_normalized(normalized);
line->mutable_color1()->set_r(gray_val1);
line->mutable_color1()->set_g(gray_val1);
line->mutable_color1()->set_b(gray_val1);
line->mutable_color2()->set_r(gray_val2);
line->mutable_color2()->set_g(gray_val2);
line->mutable_color2()->set_b(gray_val2);
connection_annotation->set_thickness(options.thickness());
}
void AnnotationsToRenderDataCalculator::AddConnectionToRenderData(
float start_x, float start_y, float end_x, float end_y,
const AnnotationsToRenderDataCalculatorOptions& options, bool normalized,
RenderData* render_data) {
auto* connection_annotation = render_data->add_render_annotations();
RenderAnnotation::Line* line = connection_annotation->mutable_line();
line->set_x_start(start_x);
line->set_y_start(start_y);
line->set_x_end(end_x);
line->set_y_end(end_y);
line->set_normalized(normalized);
SetColor(connection_annotation, options.connection_color());
connection_annotation->set_thickness(options.thickness());
}
RenderAnnotation* AnnotationsToRenderDataCalculator::AddPointRenderData(
const AnnotationsToRenderDataCalculatorOptions& options,
RenderData* render_data) {
auto* landmark_data_annotation = render_data->add_render_annotations();
landmark_data_annotation->set_scene_tag(kKeypointLabel);
SetRenderAnnotationColorThickness(options, landmark_data_annotation);
return landmark_data_annotation;
}
void AnnotationsToRenderDataCalculator::SetRenderAnnotationColorThickness(
const AnnotationsToRenderDataCalculatorOptions& options,
RenderAnnotation* render_annotation) {
SetColor(render_annotation, options.landmark_color());
render_annotation->set_thickness(options.thickness());
}
} // namespace mediapipe

View File

@ -0,0 +1,43 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
import "mediapipe/util/color.proto";
message AnnotationsToRenderDataCalculatorOptions {
extend CalculatorOptions {
optional AnnotationsToRenderDataCalculatorOptions ext = 267644238;
}
// Specifies the landmarks to be connected in the drawing. For example, the
// landmark_connections value of [0, 1, 1, 2] specifies two connections: one
// that connects landmarks with index 0 and 1, and another that connects
// landmarks with index 1 and 2.
repeated int32 landmark_connections = 1;
// Color of the landmarks.
optional Color landmark_color = 2;
// Color of the connections.
optional Color connection_color = 3;
// Thickness of the drawing of landmarks and connections.
optional double thickness = 4 [default = 1.0];
// Change the color and size of rendered landmarks based on their z values.
optional bool visualize_landmark_depth = 5 [default = true];
}

View File

@ -0,0 +1,38 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
message BeliefDecoderConfig {
optional float heatmap_threshold = 1 [default = 0.9];
// Maximum distance in pixels between two local max heatmap values.
optional float local_max_distance = 2 [default = 10.0];
// Coefficient of offset_scale.
// offset_scale = offset_scale_coef * min(rows, cols).
// offset_scale is used to multiply the offset predictions from the network.
optional float offset_scale_coef = 3 [default = 0.5, deprecated = true];
// The radius for vertex voting. Use no voting if the radius is less than or
// equal to 1. Example: 10.
optional int32 voting_radius = 4;
// The number of pixels to determine whether two points are the same.
// Example: 5 (voting_radius / 2).
optional int32 voting_allowance = 5;
// The minimum belief value a point needs in order to vote. Example: 0.2.
optional float voting_threshold = 6;
}
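A small sketch of populating this configuration from C++, reusing the example values from the comments above (the header path is assumed from the repository layout; the values are illustrative, not tuned recommendations):
#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"
mediapipe::BeliefDecoderConfig MakeExampleDecoderConfig() {
  mediapipe::BeliefDecoderConfig config;
  config.set_heatmap_threshold(0.9f);    // Matches the field default.
  config.set_local_max_distance(10.0f);  // Matches the field default.
  config.set_voting_radius(10);          // > 1 enables vertex voting.
  config.set_voting_allowance(5);        // Roughly voting_radius / 2.
  config.set_voting_threshold(0.2f);     // Minimum belief needed to vote.
  return config;
}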

View File

@ -0,0 +1,255 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/box.h"
#include "Eigen/src/Core/util/Constants.h"
#include "mediapipe/framework/port/logging.h"
namespace mediapipe {
namespace {
constexpr int kFrontFaceId = 4;
constexpr int kTopFaceId = 2;
constexpr int kNumKeypoints = 8 + 1;
constexpr int kNumberOfAxis = 3;
constexpr int kEdgesPerAxis = 4;
} // namespace
Box::Box(const std::string& category)
: Model(kBoundingBox, kNumKeypoints, category),
bounding_box_(kNumKeypoints) {
transformation_.setIdentity();
scale_ << 0.1, 0.1, 0.1;
// The vertices are ordered according to the left-hand rule, so the normal
// vector of each face points into the box.
faces_.push_back({5, 6, 8, 7}); // +x on yz plane
faces_.push_back({1, 3, 4, 2}); // -x on yz plane
faces_.push_back({3, 7, 8, 4}); // +y on xz plane = top
faces_.push_back({1, 2, 6, 5}); // -y on xz plane
faces_.push_back({2, 4, 8, 6}); // +z on xy plane = front
faces_.push_back({1, 5, 7, 3}); // -z on xy plane
// Add the edges in the cube, they are sorted according to axis (x-y-z).
edges_.push_back({1, 5});
edges_.push_back({2, 6});
edges_.push_back({3, 7});
edges_.push_back({4, 8});
edges_.push_back({1, 3});
edges_.push_back({5, 7});
edges_.push_back({2, 4});
edges_.push_back({6, 8});
edges_.push_back({1, 2});
edges_.push_back({3, 4});
edges_.push_back({5, 6});
edges_.push_back({7, 8});
Update();
}
void Box::Update() {
// Compute the eight vertices of the bounding box from Box's parameters
auto w = scale_[0] / 2.f;
auto h = scale_[1] / 2.f;
auto d = scale_[2] / 2.f;
// Define the local coordinate system w.r.t. the center of the box.
bounding_box_[0] << 0., 0., 0.;
bounding_box_[1] << -w, -h, -d;
bounding_box_[2] << -w, -h, +d;
bounding_box_[3] << -w, +h, -d;
bounding_box_[4] << -w, +h, +d;
bounding_box_[5] << +w, -h, -d;
bounding_box_[6] << +w, -h, +d;
bounding_box_[7] << +w, +h, -d;
bounding_box_[8] << +w, +h, +d;
// Convert to world coordinate system
for (int i = 0; i < kNumKeypoints; ++i) {
bounding_box_[i] =
transformation_.topLeftCorner<3, 3>() * bounding_box_[i] +
transformation_.col(3).head<3>();
}
}
void Box::Adjust(const std::vector<float>& variables) {
Eigen::Vector3f translation;
translation << variables[0], variables[1], variables[2];
SetTranslation(translation);
const float roll = variables[3];
const float pitch = variables[4];
const float yaw = variables[5];
SetRotation(roll, pitch, yaw);
Eigen::Vector3f scale;
scale << variables[6], variables[7], variables[8];
SetScale(scale);
Update();
}
float* Box::GetVertex(size_t vertex_id) {
CHECK_LT(vertex_id, kNumKeypoints);
return bounding_box_[vertex_id].data();
}
const float* Box::GetVertex(size_t vertex_id) const {
CHECK_LT(vertex_id, kNumKeypoints);
return bounding_box_[vertex_id].data();
}
bool Box::InsideTest(const Eigen::Vector3f& point, int check_axis) const {
const float* v0 = GetVertex(1);
const float* v1 = GetVertex(2);
const float* v2 = GetVertex(3);
const float* v4 = GetVertex(5);
switch (check_axis) {
case 1:
return (v0[0] <= point[0] && point[0] <= v1[0]); // X-axis
case 2:
return (v0[1] <= point[1] && point[1] <= v2[1]); // Y-axis
case 3:
return (v0[2] <= point[2] && point[2] <= v4[2]); // Z-axis
default:
return false;
}
}
void Box::Deserialize(const Object& obj) {
CHECK_EQ(obj.keypoints_size(), kNumKeypoints);
Model::Deserialize(obj);
}
void Box::Serialize(Object* obj) {
Model::Serialize(obj);
obj->set_type(Object::BOUNDING_BOX);
std::vector<Vector3f> local_bounding_box(9);
// Define the local coordinate system w.r.t. the center of the box.
local_bounding_box[0] << 0., 0., 0.;
local_bounding_box[1] << -0.5, -0.5, -0.5;
local_bounding_box[2] << -0.5, -0.5, +0.5;
local_bounding_box[3] << -0.5, +0.5, -0.5;
local_bounding_box[4] << -0.5, +0.5, +0.5;
local_bounding_box[5] << +0.5, -0.5, -0.5;
local_bounding_box[6] << +0.5, -0.5, +0.5;
local_bounding_box[7] << +0.5, +0.5, -0.5;
local_bounding_box[8] << +0.5, +0.5, +0.5;
for (int i = 0; i < kNumKeypoints; ++i) {
KeyPoint* keypoint = obj->add_keypoints();
keypoint->set_x(local_bounding_box[i][0]);
keypoint->set_y(local_bounding_box[i][1]);
keypoint->set_z(local_bounding_box[i][2]);
keypoint->set_confidence_radius(0.);
}
}
const Face& Box::GetFrontFace() const { return faces_[kFrontFaceId]; }
const Face& Box::GetTopFace() const { return faces_[kTopFaceId]; }
std::pair<Vector3f, Vector3f> Box::GetGroundPlane() const {
const Vector3f gravity = Vector3f(0., 1., 0.);
int ground_plane_id = 0;
float ground_plane_error = 10.0;
auto get_face_center = [&](const Face& face) {
Vector3f center = Vector3f::Zero();
for (const int vertex_id : face) {
center += Map<const Vector3f>(GetVertex(vertex_id));
}
center /= face.size();
return center;
};
auto get_face_normal = [&](const Face& face, const Vector3f& center) {
Vector3f v1 = Map<const Vector3f>(GetVertex(face[0])) - center;
Vector3f v2 = Map<const Vector3f>(GetVertex(face[1])) - center;
Vector3f normal = v1.cross(v2);
return normal;
};
// The ground plane is defined as the face whose normal is aligned with the
// gravity vector, which is (0, 1, 0) in the world coordinate system.
const auto& faces = GetFaces();
for (int face_id = 0; face_id < faces.size(); face_id += 2) {
const auto& face = faces[face_id];
Vector3f center = get_face_center(face);
Vector3f normal = get_face_normal(face, center);
Vector3f w = gravity.cross(normal);
const float w_sq_norm = w.squaredNorm();
if (w_sq_norm < ground_plane_error) {
ground_plane_error = w_sq_norm;
ground_plane_id = face_id;
}
}
Vector3f center = get_face_center(faces[ground_plane_id]);
Vector3f normal = get_face_normal(faces[ground_plane_id], center);
// Each face has a parallel face whose normal is also aligned with the
// gravity vector. We pick the face with the lower height (y-value).
// The face parallel to face 0 is face 1, to face 2 is face 3, and to face 4
// is face 5.
int parallel_face_id = ground_plane_id + 1;
const auto& parallel_face = faces[parallel_face_id];
Vector3f parallel_face_center = get_face_center(parallel_face);
Vector3f parallel_face_normal =
get_face_normal(parallel_face, parallel_face_center);
if (parallel_face_center[1] < center[1]) {
center = parallel_face_center;
normal = parallel_face_normal;
}
return {center, normal};
}
template <typename T>
void Box::Fit(const std::vector<T>& vertices) {
CHECK_EQ(vertices.size(), kNumKeypoints);
scale_.setZero();
// The scale is invariant under rotation and translation, so we can safely
// estimate it from the oriented box.
for (int axis = 0; axis < kNumberOfAxis; ++axis) {
for (int edge_id = 0; edge_id < kEdgesPerAxis; ++edge_id) {
// The edges are stored in quadruples according to each axis
const std::array<int, 2>& edge = edges_[axis * kEdgesPerAxis + edge_id];
scale_[axis] += (vertices[edge[0]] - vertices[edge[1]]).norm();
}
scale_[axis] /= kEdgesPerAxis;
}
// Create a scaled axis-aligned box
transformation_.setIdentity();
Update();
using MatrixN3_RM = Eigen::Matrix<float, kNumKeypoints, 3, Eigen::RowMajor>;
Eigen::Map<const MatrixN3_RM> v(vertices[0].data());
Eigen::Map<const MatrixN3_RM> system(bounding_box_[0].data());
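// Solve the least-squares system  system_h * X = v  for the 4x3 matrix X,
// where system_h holds the homogeneous coordinates of the scaled
// axis-aligned box vertices and v holds the observed vertices. X^T is then
// the 3x4 transform [R | t] that maps the axis-aligned box onto the
// observed keypoints.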
auto system_h = system.rowwise().homogeneous().eval();
auto system_g = system_h.colPivHouseholderQr();
auto solution = system_g.solve(v).eval();
transformation_.topLeftCorner<3, 4>() = solution.transpose();
Update();
}
template void Box::Fit<Vector3f>(const std::vector<Vector3f>&);
template void Box::Fit<Map<Vector3f>>(const std::vector<Map<Vector3f>>&);
template void Box::Fit<Map<const Vector3f>>(
const std::vector<Map<const Vector3f>>&);
} // namespace mediapipe

View File

@ -0,0 +1,132 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
#include <vector>
#include "mediapipe/graphs/object_detection_3d/calculators/model.h"
namespace mediapipe {
// Model for the bounding box in 3D
// The box has 9 degrees of freedom, which uniquely defines 8 keypoints in the
// fixed world-coordinate system.
//
// The 8 keypoints are defined as follows
//
// kp-id axis
// 0 000 ---
// 1 001 --+
// 2 010 -+-
// 3 011 -++
// 4 100 +--
// 5 101 +-+
// 6 110 ++-
// 7 111 +++
//
// where each digit of xyz indicates the negative (-) or positive (+)
// direction along the corresponding axis, with the center of the box as the
// origin. The resulting bounding box is
//
// x x
// 0 + + + + + + + + 4 .-------
// +\ +\ |\
// + \ y + \ z | \ y
// + \ + \ | \
// + 2 + + + + + + + + 6
// z + + + +
// + + + +
// + + C + +
// + + + +
// 1 + + + + + + + + 5 +
// \ + \ +
// \ + \ +
// \+ \+
// 3 + + + + + + + + 7
//
// World coordinate system: +y is up (aligned with gravity),
// +z is toward the user, +x follows right hand rule.
// The front face is defined as +z axis on xy plane.
// The top face is defined as +y axis on xz plane.
//
class Box : public Model {
public:
EIGEN_MAKE_ALIGNED_OPERATOR_NEW
explicit Box(const std::string& category);
~Box() override = default;
bool InsideTest(const Vector3f& point, int check_axis) const;
const std::vector<Face>& GetFaces() const { return faces_; }
const Face& GetFace(size_t face_id) const { return faces_[face_id]; }
const std::vector<std::array<int, 2>>& GetEdges() const { return edges_; }
const std::array<int, 2>& GetEdge(size_t edge_id) const {
return edges_[edge_id];
}
// Returns the keypoints for the front face of the box.
// The front face is defined as the face with the +z normal vector on the xy
// plane. In Box's constructor, the front face is set to {2, 4, 8, 6}.
const Face& GetFrontFace() const;
// Returns the keypoints for the top face of the box.
// The top face is defined as the face with the +y normal vector on the xz
// plane. In Box's constructor, the top face is set to {3, 7, 8, 4}.
const Face& GetTopFace() const;
void Update() override;
void Adjust(const std::vector<float>& variables) override;
float* GetVertex(size_t vertex_id) override;
const float* GetVertex(size_t vertex_id) const override;
void Deserialize(const Object& obj) override;
void Serialize(Object* obj) override;
// Computes the plane center and the normal vector for the plane the object
// is sitting on in the world coordinate system. The normal vector is roughly
// aligned with gravity.
std::pair<Vector3f, Vector3f> GetGroundPlane() const;
// Estimates the box's 9-dof parameters from the given vertices. Directly
// computes the scale of the box, then solves for orientation and translation.
// Expects a std::vector of size 9 of Eigen::Vector3f or mapped Vector3f.
// If mapping proto messages, we recommend using Map<const Vector3f>.
// For example:
//
// using T = Map<const Vector3f>;
// std::vector<T> vertices;
// for (const auto& point : message) { // point is a repeated float message.
// T p(point.data());
// vertices.emplace_back(p);
// }
// box.Fit<T>(vertices);
//
// The points must be arranged as a 1 + 8 vector (the center keypoint followed
// by the 8 box vertices). This function overwrites the scale and
// transformation properties of the class.
template <typename T = Eigen::Map<const Vector3f>>
void Fit(const std::vector<T>& vertices);
private:
std::vector<Face> faces_;
std::vector<std::array<int, 2>> edges_;
std::vector<Vector3f> bounding_box_;
};
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_H_
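A brief usage sketch for the Box model, based on Box::Adjust and Box::GetGroundPlane above (the category label and numeric values are illustrative):
#include <vector>
#include "mediapipe/graphs/object_detection_3d/calculators/box.h"
void BoxExample() {
  mediapipe::Box box("chair");  // Hypothetical category label.
  // 9-dof parameters consumed by Box::Adjust: translation (x, y, z),
  // rotation (roll, pitch, yaw), and scale (x, y, z).
  const std::vector<float> variables = {0.0f, 0.0f, -1.0f,  // translation
                                        0.0f, 0.0f, 0.0f,   // rotation
                                        0.5f, 0.5f, 0.5f};  // scale
  box.Adjust(variables);
  // Plane the object rests on, in world coordinates, as a {center, normal} pair.
  const auto ground_plane = box.GetGroundPlane();
  const float* center_keypoint = box.GetVertex(0);  // Center keypoint data.
  (void)ground_plane;
  (void)center_keypoint;
}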

View File

@ -0,0 +1,153 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
#include <math.h>
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace mediapipe {
void ComputeBoundingRect(const std::vector<cv::Point2f>& points,
mediapipe::TimedBoxProto* box) {
CHECK(box != nullptr);
float top = 1.0f;
float bottom = 0.0f;
float left = 1.0f;
float right = 0.0f;
for (const auto& point : points) {
top = std::min(top, point.y);
bottom = std::max(bottom, point.y);
left = std::min(left, point.x);
right = std::max(right, point.x);
}
box->set_top(top);
box->set_bottom(bottom);
box->set_left(left);
box->set_right(right);
// We currently compute only an axis-aligned bounding box. To compute a
// rotated bounding box instead, we would need the original image aspect
// ratio, map the points back to the original image space, compute
// cv::convexHull, and then, for each edge of the hull, rotate according to
// the edge orientation and find the enclosing box.
box->set_rotation(0.0f);
}
float ComputeBoxIoU(const TimedBoxProto& box1, const TimedBoxProto& box2) {
cv::Point2f box1_center((box1.left() + box1.right()) * 0.5f,
(box1.top() + box1.bottom()) * 0.5f);
cv::Size2f box1_size(box1.right() - box1.left(), box1.bottom() - box1.top());
cv::RotatedRect rect1(box1_center, box1_size,
-box1.rotation() * 180.0f / M_PI);
cv::Point2f box2_center((box2.left() + box2.right()) * 0.5f,
(box2.top() + box2.bottom()) * 0.5f);
cv::Size2f box2_size(box2.right() - box2.left(), box2.bottom() - box2.top());
cv::RotatedRect rect2(box2_center, box2_size,
-box2.rotation() * 180.0f / M_PI);
std::vector<cv::Point2f> intersections_unsorted;
std::vector<cv::Point2f> intersections;
cv::rotatedRectangleIntersection(rect1, rect2, intersections_unsorted);
if (intersections_unsorted.size() < 3) {
return 0.0f;
}
cv::convexHull(intersections_unsorted, intersections);
// We use the shoelace formula to compute the area of the intersection polygon.
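// For a polygon with vertices (x_i, y_i) listed in order, the shoelace
// formula gives area = 0.5 * |sum_i (x_i * y_{i+1} - x_{i+1} * y_i)|, with
// indices taken modulo the number of vertices. The loop below accumulates
// the signed sum; the absolute value and the 0.5 factor are applied after.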
float intersection_area = 0.0f;
for (int i = 0; i < intersections.size(); ++i) {
const auto& curr_pt = intersections[i];
const int i_next = (i + 1) == intersections.size() ? 0 : (i + 1);
const auto& next_pt = intersections[i_next];
intersection_area += (curr_pt.x * next_pt.y - next_pt.x * curr_pt.y);
}
intersection_area = std::abs(intersection_area) * 0.5f;
// Compute union area
const float union_area =
rect1.size.area() + rect2.size.area() - intersection_area + 1e-5f;
const float iou = intersection_area / union_area;
return iou;
}
std::vector<cv::Point2f> ComputeBoxCorners(const TimedBoxProto& box,
float width, float height) {
// Rotate the 4 corners w.r.t. the center.
const cv::Point2f center(0.5f * (box.left() + box.right()) * width,
0.5f * (box.top() + box.bottom()) * height);
const std::vector<cv::Point2f> corners{
cv::Point2f(box.left() * width, box.top() * height),
cv::Point2f(box.left() * width, box.bottom() * height),
cv::Point2f(box.right() * width, box.bottom() * height),
cv::Point2f(box.right() * width, box.top() * height)};
const float cos_a = std::cos(box.rotation());
const float sin_a = std::sin(box.rotation());
std::vector<cv::Point2f> transformed_corners(4);
for (int k = 0; k < 4; ++k) {
// Scale and rotate w.r.t. center.
const cv::Point2f rad = corners[k] - center;
const cv::Point2f rot_rad(cos_a * rad.x - sin_a * rad.y,
sin_a * rad.x + cos_a * rad.y);
transformed_corners[k] = center + rot_rad;
transformed_corners[k].x /= width;
transformed_corners[k].y /= height;
}
return transformed_corners;
}
cv::Mat PerspectiveTransformBetweenBoxes(const TimedBoxProto& src_box,
const TimedBoxProto& dst_box,
const float aspect_ratio) {
std::vector<cv::Point2f> box1_corners =
ComputeBoxCorners(src_box, /*width*/ aspect_ratio, /*height*/ 1.0f);
std::vector<cv::Point2f> box2_corners =
ComputeBoxCorners(dst_box, /*width*/ aspect_ratio, /*height*/ 1.0f);
cv::Mat affine_transform = cv::getPerspectiveTransform(
/*src*/ box1_corners, /*dst*/ box2_corners);
cv::Mat output_affine;
affine_transform.convertTo(output_affine, CV_32FC1);
return output_affine;
}
cv::Point2f MapPoint(const TimedBoxProto& src_box, const TimedBoxProto& dst_box,
const cv::Point2f& src_point, float width, float height) {
const cv::Point2f src_center(
0.5f * (src_box.left() + src_box.right()) * width,
0.5f * (src_box.top() + src_box.bottom()) * height);
const cv::Point2f dst_center(
0.5f * (dst_box.left() + dst_box.right()) * width,
0.5f * (dst_box.top() + dst_box.bottom()) * height);
const float scale_x =
(dst_box.right() - dst_box.left()) / (src_box.right() - src_box.left());
const float scale_y =
(dst_box.bottom() - dst_box.top()) / (src_box.bottom() - src_box.top());
const float rotation = dst_box.rotation() - src_box.rotation();
const cv::Point2f rad =
cv::Point2f(src_point.x * width, src_point.y * height) - src_center;
const float rad_x = rad.x * scale_x;
const float rad_y = rad.y * scale_y;
const float cos_a = std::cos(rotation);
const float sin_a = std::sin(rotation);
const cv::Point2f rot_rad(cos_a * rad_x - sin_a * rad_y,
sin_a * rad_x + cos_a * rad_y);
const cv::Point2f dst_point_image = dst_center + rot_rad;
const cv::Point2f dst_point(dst_point_image.x / width,
dst_point_image.y / height);
return dst_point;
}
} // namespace mediapipe

View File

@ -0,0 +1,50 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace mediapipe {
// This function fills the geometry of the TimedBoxProto. Id, timestamp etc.
// need to be set outside this function.
void ComputeBoundingRect(const std::vector<cv::Point2f>& points,
mediapipe::TimedBoxProto* box);
// This function computes the intersection over union between two boxes.
float ComputeBoxIoU(const TimedBoxProto& box1, const TimedBoxProto& box2);
// Computes corners of the box.
// width and height are the image width and height, which are typically
// needed since the box is in normalized coordinates.
std::vector<cv::Point2f> ComputeBoxCorners(const TimedBoxProto& box,
float width, float height);
// Computes the perspective transform from box1 to box2.
// The input argument aspect_ratio is width / height of the image.
// The returned matrix should be a 3x3 matrix.
cv::Mat PerspectiveTransformBetweenBoxes(const TimedBoxProto& src_box,
const TimedBoxProto& dst_box,
const float aspect_ratio);
// Maps a point according to the source and destination box locations.
cv::Point2f MapPoint(const TimedBoxProto& src_box, const TimedBoxProto& dst_box,
const cv::Point2f& src_point, float width, float height);
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_BOX_UTIL_H_

View File

@ -0,0 +1,123 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace mediapipe {
namespace {
TEST(BoxUtilTest, TestComputeBoundingRect) {
std::vector<cv::Point2f> points{
cv::Point2f(0.35f, 0.25f), cv::Point2f(0.3f, 0.3f),
cv::Point2f(0.2f, 0.4f), cv::Point2f(0.3f, 0.1f),
cv::Point2f(0.2f, 0.2f), cv::Point2f(0.5f, 0.3f),
cv::Point2f(0.4f, 0.4f), cv::Point2f(0.5f, 0.1f),
cv::Point2f(0.4f, 0.2f)};
TimedBoxProto box;
ComputeBoundingRect(points, &box);
EXPECT_FLOAT_EQ(0.1f, box.top());
EXPECT_FLOAT_EQ(0.4f, box.bottom());
EXPECT_FLOAT_EQ(0.2f, box.left());
EXPECT_FLOAT_EQ(0.5f, box.right());
}
TEST(BoxUtilTest, TestComputeBoxIoU) {
TimedBoxProto box1;
box1.set_top(0.2f);
box1.set_bottom(0.6f);
box1.set_left(0.1f);
box1.set_right(0.3f);
box1.set_rotation(0.0f);
TimedBoxProto box2 = box1;
box2.set_rotation(/*pi/2*/ 1.570796f);
const float box_area =
(box1.bottom() - box1.top()) * (box1.right() - box1.left());
const float box_intersection =
(box1.right() - box1.left()) * (box1.right() - box1.left());
const float expected_iou =
box_intersection / (box_area * 2 - box_intersection);
EXPECT_NEAR(expected_iou, ComputeBoxIoU(box1, box2), 3e-5f);
TimedBoxProto box3;
box3.set_top(0.2f);
box3.set_bottom(0.6f);
box3.set_left(0.5f);
box3.set_right(0.7f);
EXPECT_NEAR(0.0f, ComputeBoxIoU(box1, box3), 3e-5f);
}
TEST(BoxUtilTest, TestPerspectiveTransformBetweenBoxes) {
TimedBoxProto box1;
const float height = 4.0f;
const float width = 3.0f;
box1.set_top(1.0f / height);
box1.set_bottom(2.0f / height);
box1.set_left(1.0f / width);
box1.set_right(2.0f / width);
TimedBoxProto box2;
box2.set_top(1.0f / height);
box2.set_bottom(2.0f / height);
box2.set_left(1.0f / width);
box2.set_right(2.0f / width);
box2.set_rotation(/*pi/4*/ -0.785398f);
cv::Mat transform =
PerspectiveTransformBetweenBoxes(box1, box2, width / height);
const float kTolerence = 1e-5f;
const cv::Vec3f original_position(1.5f / width, 1.0f / height, 1.0f);
const cv::Mat transformed_position = transform * cv::Mat(original_position);
EXPECT_NEAR(
(1.5f - 0.5f * std::sqrt(2) / 2.0f) / width,
transformed_position.at<float>(0) / transformed_position.at<float>(2),
kTolerence);
EXPECT_NEAR(
(1.5f - 0.5f * std::sqrt(2) / 2.0f) / height,
transformed_position.at<float>(1) / transformed_position.at<float>(2),
kTolerence);
}
TEST(BoxUtilTest, TestMapPoint) {
const float height = 4.0f;
const float width = 3.0f;
TimedBoxProto box1;
box1.set_top(1.0f / height);
box1.set_bottom(2.0f / height);
box1.set_left(1.0f / width);
box1.set_right(2.0f / width);
TimedBoxProto box2;
box2.set_top(1.0f / height);
box2.set_bottom(2.0f / height);
box2.set_left(1.0f / width);
box2.set_right(2.0f / width);
box2.set_rotation(/*pi/4*/ -0.785398f);
cv::Point2f src_point1(1.2f / width, 1.4f / height);
cv::Point2f src_point2(1.3f / width, 1.8f / height);
const float distance1 = std::sqrt(0.1 * 0.1 + 0.4 * 0.4);
cv::Point2f dst_point1 = MapPoint(box1, box2, src_point1, width, height);
cv::Point2f dst_point2 = MapPoint(box1, box2, src_point2, width, height);
const float distance2 =
std::sqrt((dst_point1.x * width - dst_point2.x * width) *
(dst_point1.x * width - dst_point2.x * width) +
(dst_point1.y * height - dst_point2.y * height) *
(dst_point1.y * height - dst_point2.y * height));
EXPECT_NEAR(distance1, distance2, 1e-5f);
}
} // namespace
} // namespace mediapipe

View File

@ -0,0 +1,47 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
message CameraParametersProto {
// This number is non-negative; it represents the camera height above the
// ground, normalized by focal length.
optional float height_above_ground = 1 [default = 100.0];
// Width of image in portrait orientation normalized by focal length
optional float portrait_width = 2 [default = 1.0103];
// Height of image in portrait orientation normalized by focal length
optional float portrait_height = 3 [default = 1.3435];
enum ImageOrientation {
PORTRAIT_ORIENTATION = 0;
LANDSCAPE_ORIENTATION = 1;
}
// The input image orientation
optional ImageOrientation image_orientation = 4
[default = PORTRAIT_ORIENTATION];
// This defines the projection method from 2D screen to 3D.
enum ProjectionMode {
UNSPECIFIED = 0;
// Projects 2D point to ground plane (horizontal plane).
GROUND_PLANE = 1;
// Projects 2D point to sphere.
SPHERE = 2;
}
optional ProjectionMode projection_mode = 5 [default = GROUND_PLANE];
// Radius of sphere when using the SPHERE projection mode above.
// The value is normalized by focal length.
optional float projection_sphere_radius = 6 [default = 100.0];
}
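A minimal sketch of building these camera parameters from C++, assuming the standard protobuf-generated API (the header path and values are illustrative):
#include "mediapipe/graphs/object_detection_3d/calculators/camera_parameters.pb.h"
mediapipe::CameraParametersProto MakeExampleCameraParameters() {
  mediapipe::CameraParametersProto params;
  params.set_height_above_ground(100.0f);  // Normalized by focal length.
  params.set_image_orientation(
      mediapipe::CameraParametersProto::LANDSCAPE_ORIENTATION);
  // Project 2D screen points onto a sphere instead of the ground plane.
  params.set_projection_mode(mediapipe::CameraParametersProto::SPHERE);
  params.set_projection_sphere_radius(100.0f);  // Normalized by focal length.
  return params;
}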

View File

@ -0,0 +1,257 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
#include <limits>
#include "Eigen/Dense"
#include "mediapipe/framework/port/canonical_errors.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
namespace mediapipe {
constexpr int Decoder::kNumOffsetmaps = 16;
namespace {
void SetPoint3d(float x, float y, float z, Point3D* point_3d) {
point_3d->set_x(x);
point_3d->set_y(y);
point_3d->set_z(z);
}
} // namespace
FrameAnnotation Decoder::DecodeBoundingBoxKeypoints(
const cv::Mat& heatmap, const cv::Mat& offsetmap) const {
CHECK_EQ(1, heatmap.channels());
CHECK_EQ(kNumOffsetmaps, offsetmap.channels());
CHECK_EQ(heatmap.cols, offsetmap.cols);
CHECK_EQ(heatmap.rows, offsetmap.rows);
const float offset_scale = std::min(offsetmap.cols, offsetmap.rows);
const std::vector<cv::Point> center_points = ExtractCenterKeypoints(heatmap);
std::vector<BeliefBox> boxes;
for (const auto& center_point : center_points) {
BeliefBox box;
box.box_2d.emplace_back(center_point.x, center_point.y);
const int center_x = static_cast<int>(std::round(center_point.x));
const int center_y = static_cast<int>(std::round(center_point.y));
box.belief = heatmap.at<float>(center_y, center_x);
if (config_.voting_radius() > 1) {
DecodeByVoting(heatmap, offsetmap, center_x, center_y, offset_scale,
offset_scale, &box);
} else {
DecodeByPeak(offsetmap, center_x, center_y, offset_scale, offset_scale,
&box);
}
if (IsNewBox(&boxes, &box)) {
boxes.push_back(std::move(box));
}
}
const float x_scale = 1.0f / offsetmap.cols;
const float y_scale = 1.0f / offsetmap.rows;
FrameAnnotation frame_annotations;
for (const auto& box : boxes) {
auto* object = frame_annotations.add_annotations();
for (const auto& point : box.box_2d) {
auto* point2d = object->add_keypoints()->mutable_point_2d();
point2d->set_x(point.first * x_scale);
point2d->set_y(point.second * y_scale);
}
}
return frame_annotations;
}
void Decoder::DecodeByPeak(const cv::Mat& offsetmap, int center_x, int center_y,
float offset_scale_x, float offset_scale_y,
BeliefBox* box) const {
const auto& offset = offsetmap.at<cv::Vec<float, kNumOffsetmaps>>(
/*row*/ center_y, /*col*/ center_x);
for (int i = 0; i < kNumOffsetmaps / 2; ++i) {
const float x_offset = offset[2 * i] * offset_scale_x;
const float y_offset = offset[2 * i + 1] * offset_scale_y;
box->box_2d.emplace_back(center_x + x_offset, center_y + y_offset);
}
}
void Decoder::DecodeByVoting(const cv::Mat& heatmap, const cv::Mat& offsetmap,
int center_x, int center_y, float offset_scale_x,
float offset_scale_y, BeliefBox* box) const {
// Votes at the center.
const auto& center_offset = offsetmap.at<cv::Vec<float, kNumOffsetmaps>>(
/*row*/ center_y, /*col*/ center_x);
std::vector<float> center_votes(kNumOffsetmaps, 0.f);
for (int i = 0; i < kNumOffsetmaps / 2; ++i) {
center_votes[2 * i] = center_x + center_offset[2 * i] * offset_scale_x;
center_votes[2 * i + 1] =
center_y + center_offset[2 * i + 1] * offset_scale_y;
}
// Find voting window.
int x_min = std::max(0, center_x - config_.voting_radius());
int y_min = std::max(0, center_y - config_.voting_radius());
int width = std::min(heatmap.cols - x_min, config_.voting_radius() * 2 + 1);
int height = std::min(heatmap.rows - y_min, config_.voting_radius() * 2 + 1);
cv::Rect rect(x_min, y_min, width, height);
cv::Mat heat = heatmap(rect);
cv::Mat offset = offsetmap(rect);
for (int i = 0; i < kNumOffsetmaps / 2; ++i) {
float x_sum = 0.f;
float y_sum = 0.f;
float votes = 0.f;
for (int r = 0; r < heat.rows; ++r) {
for (int c = 0; c < heat.cols; ++c) {
const float belief = heat.at<float>(r, c);
if (belief < config_.voting_threshold()) {
continue;
}
float offset_x =
offset.at<cv::Vec<float, kNumOffsetmaps>>(r, c)[2 * i] *
offset_scale_x;
float offset_y =
offset.at<cv::Vec<float, kNumOffsetmaps>>(r, c)[2 * i + 1] *
offset_scale_y;
float vote_x = c + rect.x + offset_x;
float vote_y = r + rect.y + offset_y;
float x_diff = std::abs(vote_x - center_votes[2 * i]);
float y_diff = std::abs(vote_y - center_votes[2 * i + 1]);
if (x_diff > config_.voting_allowance() ||
y_diff > config_.voting_allowance()) {
continue;
}
x_sum += vote_x * belief;
y_sum += vote_y * belief;
votes += belief;
}
}
box->box_2d.emplace_back(x_sum / votes, y_sum / votes);
}
}
bool Decoder::IsNewBox(std::vector<BeliefBox>* boxes, BeliefBox* box) const {
for (auto& b : *boxes) {
if (IsIdentical(b, *box)) {
if (b.belief < box->belief) {
std::swap(b, *box);
}
return false;
}
}
return true;
}
bool Decoder::IsIdentical(const BeliefBox& box_1,
const BeliefBox& box_2) const {
// Skip the center point.
for (int i = 1; i < box_1.box_2d.size(); ++i) {
const float x_diff =
std::abs(box_1.box_2d[i].first - box_2.box_2d[i].first);
const float y_diff =
std::abs(box_1.box_2d[i].second - box_2.box_2d[i].second);
if (x_diff > config_.voting_allowance() ||
y_diff > config_.voting_allowance()) {
return false;
}
}
return true;
}
std::vector<cv::Point> Decoder::ExtractCenterKeypoints(
const cv::Mat& center_heatmap) const {
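// A pixel is kept as a center candidate if it equals the maximum of its
// (2 * local_max_distance + 1)^2 neighborhood (computed via grayscale
// dilation) and its belief is at least heatmap_threshold.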
cv::Mat max_filtered_heatmap(center_heatmap.rows, center_heatmap.cols,
center_heatmap.type());
const int kernel_size =
static_cast<int>(config_.local_max_distance() * 2 + 1 + 0.5f);
const cv::Size morph_size(kernel_size, kernel_size);
cv::dilate(center_heatmap, max_filtered_heatmap,
cv::getStructuringElement(cv::MORPH_RECT, morph_size));
cv::Mat peak_map;
cv::bitwise_and((center_heatmap >= max_filtered_heatmap),
(center_heatmap >= config_.heatmap_threshold()), peak_map);
std::vector<cv::Point> locations; // output, locations of non-zero pixels
cv::findNonZero(peak_map, locations);
return locations;
}
absl::Status Decoder::Lift2DTo3D(
const Eigen::Matrix<float, 4, 4, Eigen::RowMajor>& projection_matrix,
bool portrait, FrameAnnotation* estimated_box) const {
CHECK(estimated_box != nullptr);
const float fx = projection_matrix(0, 0);
const float fy = projection_matrix(1, 1);
const float cx = projection_matrix(0, 2);
const float cy = projection_matrix(1, 2);
for (auto& annotation : *estimated_box->mutable_annotations()) {
Eigen::Matrix<float, 16, 12, Eigen::RowMajor> m =
Eigen::Matrix<float, 16, 12, Eigen::RowMajor>::Zero(16, 12);
CHECK_EQ(9, annotation.keypoints_size());
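// m stacks two equations per 2D vertex (16 rows for the 8 box corners) in
// the 12 unknowns: the (x, y, z) coordinates of the 4 EPnP control points.
// The homogeneous least-squares solution is the eigenvector of m^T * m with
// the smallest eigenvalue, computed below.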
float u, v;
for (int i = 0; i < 8; ++i) {
const auto& keypoint2d = annotation.keypoints(i + 1).point_2d();
if (portrait) {
// swap x and y given that our image is in portrait orientation
u = keypoint2d.y() * 2 - 1;
v = keypoint2d.x() * 2 - 1;
} else {
u = keypoint2d.x() * 2 - 1;
v = 1 - keypoint2d.y() * 2; // (1 - keypoint2d.y()) * 2 - 1
}
for (int j = 0; j < 4; ++j) {
// For each of the 4 control points, formulate two rows of the
// m matrix (two equations).
const float control_alpha = epnp_alpha_(i, j);
m(i * 2, j * 3) = fx * control_alpha;
m(i * 2, j * 3 + 2) = (cx + u) * control_alpha;
m(i * 2 + 1, j * 3 + 1) = fy * control_alpha;
m(i * 2 + 1, j * 3 + 2) = (cy + v) * control_alpha;
}
}
// This is a self-adjoint matrix. Use SelfAdjointEigenSolver for a fast
// and stable solution.
Eigen::Matrix<float, 12, 12, Eigen::RowMajor> mt_m = m.transpose() * m;
Eigen::SelfAdjointEigenSolver<Eigen::Matrix<float, 12, 12, Eigen::RowMajor>>
eigen_solver(mt_m);
if (eigen_solver.info() != Eigen::Success) {
return absl::AbortedError("Eigen decomposition failed.");
}
CHECK_EQ(12, eigen_solver.eigenvalues().size());
// Eigenvalues are sorted in increasing order for SelfAdjointEigenSolver
// only! If you use other Eigen solvers, the order is not guaranteed to be
// increasing. Here, we simply take the eigenvector corresponding to the
// first/smallest eigenvalue, since we used SelfAdjointEigenSolver.
Eigen::VectorXf eigen_vec = eigen_solver.eigenvectors().col(0);
Eigen::Map<Eigen::Matrix<float, 4, 3, Eigen::RowMajor>> control_matrix(
eigen_vec.data());
if (control_matrix(0, 2) > 0) {
control_matrix = -control_matrix;
}
// First set the center keypoint.
SetPoint3d(control_matrix(0, 0), control_matrix(0, 1), control_matrix(0, 2),
annotation.mutable_keypoints(0)->mutable_point_3d());
// Then set the 8 vertices.
Eigen::Matrix<float, 8, 3, Eigen::RowMajor> vertices =
epnp_alpha_ * control_matrix;
for (int i = 0; i < 8; ++i) {
SetPoint3d(vertices(i, 0), vertices(i, 1), vertices(i, 2),
annotation.mutable_keypoints(i + 1)->mutable_point_3d());
}
}
return absl::OkStatus();
}
} // namespace mediapipe

View File

@ -0,0 +1,109 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_
#include <vector>
#include "Eigen/Dense"
#include "absl/status/status.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"
namespace mediapipe {
// Decodes 3D bounding boxes from heatmaps and offset maps. In the future,
// if we want to develop a decoder for generic skeletons, we will need to
// generalize this class and derive a few child classes.
class Decoder {
public:
static const int kNumOffsetmaps;
explicit Decoder(const BeliefDecoderConfig& config) : config_(config) {
epnp_alpha_ << 4.0f, -1.0f, -1.0f, -1.0f, 2.0f, -1.0f, -1.0f, 1.0f, 2.0f,
-1.0f, 1.0f, -1.0f, 0.0f, -1.0f, 1.0f, 1.0f, 2.0f, 1.0f, -1.0f, -1.0f,
0.0f, 1.0f, -1.0f, 1.0f, 0.0f, 1.0f, 1.0f, -1.0f, -2.0f, 1.0f, 1.0f,
1.0f;
}
// Decodes bounding boxes from predicted heatmap and offset maps.
// Input:
// heatmap: a single channel cv::Mat representing center point heatmap
// offsetmap: a 16 channel cv::Mat representing the 16 offset maps
// (2 for each of the 8 vertices)
// Output:
// Outputs the 3D bounding boxes' 2D vertices, represented by the 'point_2d'
// field in each 'keypoints' field of the object annotations.
FrameAnnotation DecodeBoundingBoxKeypoints(const cv::Mat& heatmap,
const cv::Mat& offsetmap) const;
// Lifts the estimated 2D projections of bounding box vertices to 3D.
// This function uses the EPnP approach described in this paper:
// https://icwww.epfl.ch/~lepetit/papers/lepetit_ijcv08.pdf .
// Input:
// projection_matrix: the projection matrix from 3D coordinates
// to screen coordinates.
// The 2D screen coordinate is defined as: u is along the long
// edge of the device, pointing down; v is along the short edge
// of the device, pointing right.
// portrait: a boolean variable indicating whether the input images
// were captured in portrait orientation.
// estimated_box: annotation with point_2d field populated with
// 2d vertices.
// Output:
// estimated_box: annotation with point_3d field populated with
// 3d vertices.
absl::Status Lift2DTo3D(
const Eigen::Matrix<float, 4, 4, Eigen::RowMajor>& projection_matrix,
bool portrait, FrameAnnotation* estimated_box) const;
private:
struct BeliefBox {
float belief;
std::vector<std::pair<float, float>> box_2d;
};
std::vector<cv::Point> ExtractCenterKeypoints(
const cv::Mat& center_heatmap) const;
// Decodes 2D keypoints at the peak point.
void DecodeByPeak(const cv::Mat& offsetmap, int center_x, int center_y,
float offset_scale_x, float offset_scale_y,
BeliefBox* box) const;
// Decodes 2D keypoints by voting around the peak.
void DecodeByVoting(const cv::Mat& heatmap, const cv::Mat& offsetmap,
int center_x, int center_y, float offset_scale_x,
float offset_scale_y, BeliefBox* box) const;
// Returns true if it is a new box. Otherwise, it may replace an existing box
// if the new box's belief is higher.
bool IsNewBox(std::vector<BeliefBox>* boxes, BeliefBox* box) const;
// Returns true if the two boxes are identical.
bool IsIdentical(const BeliefBox& box_1, const BeliefBox& box_2) const;
BeliefDecoderConfig config_;
// Following equation (1) in this paper
// https://icwww.epfl.ch/~lepetit/papers/lepetit_ijcv08.pdf,
// this variable denotes the coefficients for the 4 control points
// for each of the 8 3D box vertices.
Eigen::Matrix<float, 8, 4, Eigen::RowMajor> epnp_alpha_;
};
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_DECODER_H_
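A minimal usage sketch for the Decoder API above. The BeliefDecoderConfig setter names are assumed from the getters used in decoder.cc and the threshold values are placeholders; the heatmap and offset maps would normally come from the model-inference step upstream, and the projection matrix from the camera.

#include "Eigen/Dense"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"

namespace {

// Decodes 2D box vertices from the network output, then lifts them to 3D.
mediapipe::FrameAnnotation DecodeAndLift(
    const cv::Mat& heatmap,    // Single-channel center-point heatmap.
    const cv::Mat& offsetmap,  // 16-channel vertex offset maps.
    const Eigen::Matrix<float, 4, 4, Eigen::RowMajor>& projection_matrix) {
  mediapipe::BeliefDecoderConfig config;
  config.set_heatmap_threshold(0.6f);  // Setter names assumed from the
  config.set_local_max_distance(2);    // getters used in decoder.cc; the
  config.set_voting_radius(2);         // values here are placeholders.
  config.set_voting_threshold(0.2f);
  config.set_voting_allowance(1);
  mediapipe::Decoder decoder(config);
  // 2D vertices in normalized [0, 1] coordinates, one annotation per box.
  mediapipe::FrameAnnotation annotation =
      decoder.DecodeBoundingBoxKeypoints(heatmap, offsetmap);
  // Populate point_3d for every keypoint; the status is ignored in this
  // sketch.
  decoder.Lift2DTo3D(projection_matrix, /*portrait=*/true, &annotation)
      .IgnoreError();
  return annotation;
}

}  // namespace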

View File

@ -0,0 +1,115 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <utility>
#include "absl/memory/memory.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace {
constexpr char kInputStreamTag[] = "FRAME_ANNOTATION";
constexpr char kOutputStreamTag[] = "BOXES";
} // namespace
namespace mediapipe {
// Converts FrameAnnotation 3d bounding box detections to TimedBoxProtoList
// 2d bounding boxes.
//
// Input:
// FRAME_ANNOTATION - 3d bounding box annotation.
// Output:
// BOXES - 2d bounding boxes enclosing the projections of the 3d boxes.
//
// Usage example:
// node {
// calculator: "FrameAnnotationToTimedBoxListCalculator"
// input_stream: "FRAME_ANNOTATION:frame_annotation"
// output_stream: "BOXES:boxes"
// }
class FrameAnnotationToTimedBoxListCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
::mediapipe::Status Close(CalculatorContext* cc) override;
};
REGISTER_CALCULATOR(FrameAnnotationToTimedBoxListCalculator);
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(!cc->Inputs().GetTags().empty());
RET_CHECK(!cc->Outputs().GetTags().empty());
if (cc->Inputs().HasTag(kInputStreamTag)) {
cc->Inputs().Tag(kInputStreamTag).Set<FrameAnnotation>();
}
if (cc->Outputs().HasTag(kOutputStreamTag)) {
cc->Outputs().Tag(kOutputStreamTag).Set<TimedBoxProtoList>();
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Open(
CalculatorContext* cc) {
return ::mediapipe::OkStatus();
}
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Process(
CalculatorContext* cc) {
if (cc->Inputs().HasTag(kInputStreamTag) &&
!cc->Inputs().Tag(kInputStreamTag).IsEmpty()) {
const auto& frame_annotation =
cc->Inputs().Tag(kInputStreamTag).Get<FrameAnnotation>();
auto output_objects = absl::make_unique<TimedBoxProtoList>();
for (const auto& annotation : frame_annotation.annotations()) {
std::vector<cv::Point2f> key_points;
for (const auto& keypoint : annotation.keypoints()) {
key_points.push_back(
cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()));
}
TimedBoxProto* added_box = output_objects->add_box();
ComputeBoundingRect(key_points, added_box);
added_box->set_id(annotation.object_id());
const int64 time_msec =
static_cast<int64>(std::round(frame_annotation.timestamp() / 1000));
added_box->set_time_msec(time_msec);
}
// Output
if (cc->Outputs().HasTag(kOutputStreamTag)) {
cc->Outputs()
.Tag(kOutputStreamTag)
.Add(output_objects.release(), cc->InputTimestamp());
}
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status FrameAnnotationToTimedBoxListCalculator::Close(
CalculatorContext* cc) {
return ::mediapipe::OkStatus();
}
} // namespace mediapipe

View File

@ -0,0 +1,102 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
#include "absl/container/flat_hash_set.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/box_util.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace mediapipe {
void FrameAnnotationTracker::AddDetectionResult(
const FrameAnnotation& frame_annotation) {
const int64 time_us =
static_cast<int64>(std::round(frame_annotation.timestamp()));
for (const auto& object_annotation : frame_annotation.annotations()) {
detected_objects_[time_us + object_annotation.object_id()] =
object_annotation;
}
}
FrameAnnotation FrameAnnotationTracker::ConsolidateTrackingResult(
const TimedBoxProtoList& tracked_boxes,
absl::flat_hash_set<int>* cancel_object_ids) {
CHECK(cancel_object_ids != nullptr);
FrameAnnotation frame_annotation;
std::vector<int64> keys_to_be_deleted;
for (const auto& detected_obj : detected_objects_) {
const int object_id = detected_obj.second.object_id();
if (cancel_object_ids->contains(object_id)) {
// Remember duplicated detections' keys.
keys_to_be_deleted.push_back(detected_obj.first);
continue;
}
TimedBoxProto ref_box;
for (const auto& box : tracked_boxes.box()) {
if (box.id() == object_id) {
ref_box = box;
break;
}
}
if (!ref_box.has_id() || ref_box.id() < 0) {
LOG(ERROR) << "Can't find matching tracked box for object id: "
<< object_id << ". Likely lost tracking of it.";
keys_to_be_deleted.push_back(detected_obj.first);
continue;
}
// Find duplicated boxes
for (const auto& box : tracked_boxes.box()) {
if (box.id() != object_id) {
if (ComputeBoxIoU(ref_box, box) > iou_threshold_) {
cancel_object_ids->insert(box.id());
}
}
}
// Map ObjectAnnotation from detection to tracked time.
// First, gather all keypoints from source detection.
std::vector<cv::Point2f> key_points;
for (const auto& keypoint : detected_obj.second.keypoints()) {
key_points.push_back(
cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()));
}
// Second, find source box.
TimedBoxProto src_box;
ComputeBoundingRect(key_points, &src_box);
ObjectAnnotation* tracked_obj = frame_annotation.add_annotations();
tracked_obj->set_object_id(ref_box.id());
// Finally, map all keypoints in the source detection to tracked location.
for (const auto& keypoint : detected_obj.second.keypoints()) {
cv::Point2f dst = MapPoint(
src_box, ref_box,
cv::Point2f(keypoint.point_2d().x(), keypoint.point_2d().y()),
img_width_, img_height_);
auto* dst_point = tracked_obj->add_keypoints()->mutable_point_2d();
dst_point->set_x(dst.x);
dst_point->set_y(dst.y);
}
}
for (const auto& key : keys_to_be_deleted) {
detected_objects_.erase(key);
}
return frame_annotation;
}
} // namespace mediapipe

View File

@ -0,0 +1,62 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_
#include <functional>
#include "absl/container/btree_map.h"
#include "absl/container/flat_hash_set.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace mediapipe {
class FrameAnnotationTracker {
public:
// If two bounding boxes have an IoU over iou_threshold, we consider them
// to describe the same object.
FrameAnnotationTracker(float iou_threshold, float img_width, float img_height)
: iou_threshold_(iou_threshold),
img_width_(img_width),
img_height_(img_height) {}
// Adds detection results from an external detector.
void AddDetectionResult(const FrameAnnotation& frame_annotation);
// Consolidates the tracking result from an external tracker, associates it
// with the detection result by object id, and produces the corresponding
// result as a FrameAnnotation. When there are duplicates, the ids that need
// to be cancelled are output in cancel_object_ids.
// Note that the returned FrameAnnotation is missing its timestamp; the
// caller needs to fill in that field.
FrameAnnotation ConsolidateTrackingResult(
const TimedBoxProtoList& tracked_boxes,
absl::flat_hash_set<int>* cancel_object_ids);
private:
float iou_threshold_;
float img_width_;
float img_height_;
// Cached detection results over time.
// Key is timestamp_us + object_id.
absl::btree_map<int64, ObjectAnnotation, std::greater<int64>>
detected_objects_;
};
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_FRAME_ANNOTATION_TRACKER_H_
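A brief sketch of the call pattern described above, mirroring how FrameAnnotationTrackerCalculator drives this class; the IoU threshold and image dimensions are placeholder values.

#include "absl/container/flat_hash_set.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"

namespace {

// Feeds one batch of detections into the tracker and consolidates it with
// the latest 2D tracking result.
mediapipe::FrameAnnotation ConsolidateExample(
    const mediapipe::FrameAnnotation& detections,
    const mediapipe::TimedBoxProtoList& tracked_boxes) {
  mediapipe::FrameAnnotationTracker tracker(/*iou_threshold=*/0.5f,
                                            /*img_width=*/480.0f,
                                            /*img_height=*/640.0f);
  tracker.AddDetectionResult(detections);
  absl::flat_hash_set<int> cancel_object_ids;
  mediapipe::FrameAnnotation consolidated =
      tracker.ConsolidateTrackingResult(tracked_boxes, &cancel_object_ids);
  // cancel_object_ids now holds ids of duplicate boxes that should be removed
  // from the box tracker; the caller must also set the output timestamp.
  return consolidated;
}

}  // namespace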

View File

@ -0,0 +1,137 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "absl/container/flat_hash_set.h"
#include "absl/memory/memory.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker_calculator.pb.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace {
constexpr char kInputFrameAnnotationTag[] = "FRAME_ANNOTATION";
constexpr char kInputTrackedBoxesTag[] = "TRACKED_BOXES";
constexpr char kOutputTrackedFrameAnnotationTag[] = "TRACKED_FRAME_ANNOTATION";
constexpr char kOutputCancelObjectIdTag[] = "CANCEL_OBJECT_ID";
} // namespace
namespace mediapipe {
// Tracks frame annotations seeded/updated by FRAME_ANNOTATION input_stream.
// When using this calculator, make sure FRAME_ANNOTATION and TRACKED_BOXES
// are in different sync sets.
//
// Input:
// FRAME_ANNOTATION - frame annotation.
// TRACKED_BOXES - 2d box tracking result
// Output:
// TRACKED_FRAME_ANNOTATION - annotation inferred from 2d tracking result.
// CANCEL_OBJECT_ID - object id that needs to be cancelled from the tracker.
//
// Usage example:
// node {
// calculator: "FrameAnnotationTrackerCalculator"
// input_stream: "FRAME_ANNOTATION:frame_annotation"
// input_stream: "TRACKED_BOXES:tracked_boxes"
// output_stream: "TRACKED_FRAME_ANNOTATION:tracked_frame_annotation"
// output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
// }
class FrameAnnotationTrackerCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
::mediapipe::Status Close(CalculatorContext* cc) override;
private:
std::unique_ptr<FrameAnnotationTracker> frame_annotation_tracker_;
};
REGISTER_CALCULATOR(FrameAnnotationTrackerCalculator);
::mediapipe::Status FrameAnnotationTrackerCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(!cc->Inputs().GetTags().empty());
RET_CHECK(!cc->Outputs().GetTags().empty());
if (cc->Inputs().HasTag(kInputFrameAnnotationTag)) {
cc->Inputs().Tag(kInputFrameAnnotationTag).Set<FrameAnnotation>();
}
if (cc->Inputs().HasTag(kInputTrackedBoxesTag)) {
cc->Inputs().Tag(kInputTrackedBoxesTag).Set<TimedBoxProtoList>();
}
if (cc->Outputs().HasTag(kOutputTrackedFrameAnnotationTag)) {
cc->Outputs().Tag(kOutputTrackedFrameAnnotationTag).Set<FrameAnnotation>();
}
if (cc->Outputs().HasTag(kOutputCancelObjectIdTag)) {
cc->Outputs().Tag(kOutputCancelObjectIdTag).Set<int>();
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status FrameAnnotationTrackerCalculator::Open(
CalculatorContext* cc) {
const auto& options = cc->Options<FrameAnnotationTrackerCalculatorOptions>();
frame_annotation_tracker_ = absl::make_unique<FrameAnnotationTracker>(
options.iou_threshold(), options.img_width(), options.img_height());
return ::mediapipe::OkStatus();
}
::mediapipe::Status FrameAnnotationTrackerCalculator::Process(
CalculatorContext* cc) {
if (cc->Inputs().HasTag(kInputFrameAnnotationTag) &&
!cc->Inputs().Tag(kInputFrameAnnotationTag).IsEmpty()) {
frame_annotation_tracker_->AddDetectionResult(
cc->Inputs().Tag(kInputFrameAnnotationTag).Get<FrameAnnotation>());
}
if (cc->Inputs().HasTag(kInputTrackedBoxesTag) &&
!cc->Inputs().Tag(kInputTrackedBoxesTag).IsEmpty() &&
cc->Outputs().HasTag(kOutputTrackedFrameAnnotationTag)) {
absl::flat_hash_set<int> cancel_object_ids;
auto output_frame_annotation = absl::make_unique<FrameAnnotation>();
*output_frame_annotation =
frame_annotation_tracker_->ConsolidateTrackingResult(
cc->Inputs().Tag(kInputTrackedBoxesTag).Get<TimedBoxProtoList>(),
&cancel_object_ids);
output_frame_annotation->set_timestamp(cc->InputTimestamp().Microseconds());
cc->Outputs()
.Tag(kOutputTrackedFrameAnnotationTag)
.Add(output_frame_annotation.release(), cc->InputTimestamp());
if (cc->Outputs().HasTag(kOutputCancelObjectIdTag)) {
auto packet_timestamp = cc->InputTimestamp();
for (const auto& id : cancel_object_ids) {
// The timestamp is incremented (by 1 us) because currently the box
// tracker calculator only accepts one cancel object ID for any given
// timestamp.
cc->Outputs()
.Tag(kOutputCancelObjectIdTag)
.AddPacket(mediapipe::MakePacket<int>(id).At(packet_timestamp++));
}
}
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status FrameAnnotationTrackerCalculator::Close(
CalculatorContext* cc) {
return ::mediapipe::OkStatus();
}
} // namespace mediapipe

View File

@ -0,0 +1,36 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The option proto for the FrameAnnotationTrackerCalculatorOptions.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
message FrameAnnotationTrackerCalculatorOptions {
extend CalculatorOptions {
optional FrameAnnotationTrackerCalculatorOptions ext = 291291253;
}
// The threshold on intersection-over-union (IoU). We consider
// boxes with IoU larger than this threshold to be duplicates.
optional float iou_threshold = 1 [default = 0.5];
// We need the image dimensions to properly compute annotation locations.
optional float img_width = 2;
optional float img_height = 3;
}

View File

@ -0,0 +1,143 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/frame_annotation_tracker.h"
#include "absl/container/flat_hash_set.h"
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/util/tracking/box_tracker.pb.h"
namespace mediapipe {
namespace {
// Create a new object annotation by shifting a reference
// object annotation.
ObjectAnnotation ShiftObject2d(const ObjectAnnotation& ref_obj, float dx,
float dy) {
ObjectAnnotation obj = ref_obj;
for (auto& keypoint : *(obj.mutable_keypoints())) {
const float ref_x = keypoint.point_2d().x();
const float ref_y = keypoint.point_2d().y();
keypoint.mutable_point_2d()->set_x(ref_x + dx);
keypoint.mutable_point_2d()->set_y(ref_y + dy);
}
return obj;
}
TimedBoxProto ShiftBox(const TimedBoxProto& ref_box, float dx, float dy) {
TimedBoxProto box = ref_box;
box.set_top(ref_box.top() + dy);
box.set_bottom(ref_box.bottom() + dy);
box.set_left(ref_box.left() + dx);
box.set_right(ref_box.right() + dx);
return box;
}
// Constructs a fixed ObjectAnnotation.
ObjectAnnotation ConstructFixedObject(
const std::vector<std::vector<float>>& points) {
ObjectAnnotation obj;
for (const auto& point : points) {
auto* keypoint = obj.add_keypoints();
CHECK_EQ(2, point.size());
keypoint->mutable_point_2d()->set_x(point[0]);
keypoint->mutable_point_2d()->set_y(point[1]);
}
return obj;
}
TEST(FrameAnnotationTrackerTest, TestConsolidation) {
// Add 4 detections represented by FrameAnnotation, of which 3 correspond
// to the same object.
ObjectAnnotation object1, object2, object3, object4;
// The bounding rectangle for these object keypoints is:
// x: [0.2, 0.5], y: [0.1, 0.4]
object3 = ConstructFixedObject({{0.35f, 0.25f},
{0.3f, 0.3f},
{0.2f, 0.4f},
{0.3f, 0.1f},
{0.2f, 0.2f},
{0.5f, 0.3f},
{0.4f, 0.4f},
{0.5f, 0.1f},
{0.4f, 0.2f}});
object3.set_object_id(3);
object1 = ShiftObject2d(object3, -0.05f, -0.05f);
object1.set_object_id(1);
object2 = ShiftObject2d(object3, 0.05f, 0.05f);
object2.set_object_id(2);
object4 = ShiftObject2d(object3, 0.2f, 0.2f);
object4.set_object_id(4);
FrameAnnotation frame_annotation_1;
frame_annotation_1.set_timestamp(30 * 1000); // 30ms
*(frame_annotation_1.add_annotations()) = object1;
*(frame_annotation_1.add_annotations()) = object4;
FrameAnnotation frame_annotation_2;
frame_annotation_2.set_timestamp(60 * 1000); // 60ms
*(frame_annotation_2.add_annotations()) = object2;
FrameAnnotation frame_annotation_3;
frame_annotation_3.set_timestamp(90 * 1000); // 90ms
*(frame_annotation_3.add_annotations()) = object3;
FrameAnnotationTracker frame_annotation_tracker(/*iou_threshold*/ 0.5f, 1.0f,
1.0f);
frame_annotation_tracker.AddDetectionResult(frame_annotation_1);
frame_annotation_tracker.AddDetectionResult(frame_annotation_2);
frame_annotation_tracker.AddDetectionResult(frame_annotation_3);
TimedBoxProtoList timed_box_proto_list;
TimedBoxProto* timed_box_proto = timed_box_proto_list.add_box();
timed_box_proto->set_top(0.4f);
timed_box_proto->set_bottom(0.7f);
timed_box_proto->set_left(0.6f);
timed_box_proto->set_right(0.9f);
timed_box_proto->set_id(3);
timed_box_proto->set_time_msec(150);
timed_box_proto = timed_box_proto_list.add_box();
*timed_box_proto = ShiftBox(timed_box_proto_list.box(0), 0.01f, 0.01f);
timed_box_proto->set_id(1);
timed_box_proto->set_time_msec(150);
timed_box_proto = timed_box_proto_list.add_box();
*timed_box_proto = ShiftBox(timed_box_proto_list.box(0), -0.01f, -0.01f);
timed_box_proto->set_id(2);
timed_box_proto->set_time_msec(150);
absl::flat_hash_set<int> cancel_object_ids;
FrameAnnotation tracked_detection =
frame_annotation_tracker.ConsolidateTrackingResult(timed_box_proto_list,
&cancel_object_ids);
EXPECT_EQ(2, cancel_object_ids.size());
EXPECT_EQ(1, cancel_object_ids.count(1));
EXPECT_EQ(1, cancel_object_ids.count(2));
EXPECT_EQ(1, tracked_detection.annotations_size());
EXPECT_EQ(3, tracked_detection.annotations(0).object_id());
EXPECT_EQ(object3.keypoints_size(),
tracked_detection.annotations(0).keypoints_size());
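// The tracked box for id 3 spans x: [0.6, 0.9], y: [0.4, 0.7], while the
// detection's bounding rect spans x: [0.2, 0.5], y: [0.1, 0.4], so every
// keypoint is expected to shift by (+0.4, +0.3).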
const float x_offset = 0.4f;
const float y_offset = 0.3f;
const float tolerance = 1e-5f;
for (int i = 0; i < object3.keypoints_size(); ++i) {
const auto& point_2d =
tracked_detection.annotations(0).keypoints(i).point_2d();
EXPECT_NEAR(point_2d.x(), object3.keypoints(i).point_2d().x() + x_offset,
tolerance);
EXPECT_NEAR(point_2d.y(), object3.keypoints(i).point_2d().y() + y_offset,
tolerance);
}
}
} // namespace
} // namespace mediapipe

View File

@ -0,0 +1,760 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(__ANDROID__)
#include "mediapipe/util/android/asset_manager_util.h"
#else
#include <fstream>
#include <iostream>
#endif
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/gpu/gl_calculator_helper.h"
#include "mediapipe/gpu/shader_util.h"
#include "mediapipe/graphs/object_detection_3d/calculators/camera_parameters.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/model_matrix.pb.h"
namespace mediapipe {
namespace {
#if defined(GL_DEBUG)
#define GLCHECK(command) \
command; \
if (int err = glGetError()) LOG(ERROR) << "GL error detected: " << err;
#else
#define GLCHECK(command) command
#endif
// For ease of use, we prefer ImageFrame on Android and GpuBuffer otherwise.
#if defined(__ANDROID__)
typedef ImageFrame AssetTextureFormat;
#else
typedef GpuBuffer AssetTextureFormat;
#endif
enum { ATTRIB_VERTEX, ATTRIB_TEXTURE_POSITION, NUM_ATTRIBUTES };
static const int kNumMatrixEntries = 16;
// Hard-coded MVP Matrix for testing.
static const float kModelMatrix[] = {0.83704215, -0.36174262, 0.41049102, 0.0,
0.06146407, 0.8076706, 0.5864218, 0.0,
-0.54367524, -0.4656292, 0.69828844, 0.0,
0.0, 0.0, -98.64117, 1.0};
// Loads a texture from an input side packet, streams in an animation file
// from a filename given in another input side packet, and renders the
// animation over the screen according to the input timestamp and the desired
// animation FPS.
//
// Inputs:
// VIDEO (GpuBuffer, optional):
// If provided, the input buffer will be assumed to be unique, and will be
// consumed by this calculator and rendered to directly. The output video
// buffer will then be the released reference to the input video buffer.
// MODEL_MATRICES (TimedModelMatrixProtoList, optional):
// If provided, will set the model matrices for the objects to be rendered
// during future rendering calls.
//
// Input side packets:
// TEXTURE (ImageFrame on Android / GpuBuffer on iOS, required):
// Texture to use with animation file.
// ANIMATION_ASSET (String, required):
// Path of animation file to load and render. Should be generated by
// //java/com/google/android/apps/motionstills/SimpleObjEncryptor with
// --compressed_mode=true. See comments and documentation there for more
// information on custom .obj.uuu file format.
// CAMERA_PARAMETERS_PROTO_STRING (String, optional):
// Serialized proto std::string of CameraParametersProto. We need this to
// get the right aspect ratio and field of view.
// Options:
// aspect_ratio: the ratio between the rendered image width and height.
// It will be ignored if CAMERA_PARAMETERS_PROTO_STRING input side packet
// is provided.
// vertical_fov_degrees: vertical field of view in degrees.
// It will be ignored if CAMERA_PARAMETERS_PROTO_STRING input side packet
// is provided.
// z_clipping_plane_near: near plane value for z-clipping.
// z_clipping_plane_far: far plane value for z-clipping.
// animation_speed_fps: speed at which to cycle through animation frames (in
// frames per second).
//
// Outputs:
// OUTPUT, or index 0 (GpuBuffer):
// Frames filled with the given texture.
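//
// Example usage (illustrative only; the stream and side-packet names are
// placeholders):
// node {
//   calculator: "GlAnimationOverlayCalculator"
//   input_stream: "VIDEO:input_video"
//   input_stream: "MODEL_MATRICES:model_matrices"
//   output_stream: "output_video"
//   input_side_packet: "TEXTURE:box_texture"
//   input_side_packet: "ANIMATION_ASSET:box_asset_name"
//   input_side_packet: "CAMERA_PARAMETERS_PROTO_STRING:camera_parameters"
// }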
// Simple helper-struct for containing the parsed geometry data from a 3D
// animation frame for rendering.
struct TriangleMesh {
int index_count = 0; // Needed for glDrawElements rendering call
std::unique_ptr<float[]> vertices = nullptr;
std::unique_ptr<float[]> texture_coords = nullptr;
std::unique_ptr<int16[]> triangle_indices = nullptr;
};
typedef std::unique_ptr<float[]> ModelMatrix;
} // namespace
class GlAnimationOverlayCalculator : public CalculatorBase {
public:
GlAnimationOverlayCalculator() {}
~GlAnimationOverlayCalculator();
static ::mediapipe::Status GetContract(CalculatorContract *cc);
::mediapipe::Status Open(CalculatorContext *cc) override;
::mediapipe::Status Process(CalculatorContext *cc) override;
private:
bool has_video_stream_ = false;
bool has_model_matrix_stream_ = false;
bool has_mask_model_matrix_stream_ = false;
bool has_occlusion_mask_ = false;
GlCalculatorHelper helper_;
bool initialized_ = false;
GlTexture texture_;
GlTexture mask_texture_;
GLuint renderbuffer_ = 0;
bool depth_buffer_created_ = false;
GLuint program_ = 0;
GLint texture_uniform_ = -1;
GLint perspective_matrix_uniform_ = -1;
GLint model_matrix_uniform_ = -1;
std::vector<TriangleMesh> triangle_meshes_;
std::vector<TriangleMesh> mask_meshes_;
Timestamp animation_start_time_;
int frame_count_ = 0;
float animation_speed_fps_;
std::vector<ModelMatrix> current_model_matrices_;
std::vector<ModelMatrix> current_mask_model_matrices_;
// Perspective matrix for rendering, to be applied to all model matrices
// prior to passing through to the shader as a MVP matrix. Initialized during
// first image packet read.
float perspective_matrix_[kNumMatrixEntries];
void ComputeAspectRatioAndFovFromCameraParameters(
const CameraParametersProto &camera_parameters, float *aspect_ratio,
float *vertical_fov_degrees);
int GetAnimationFrameIndex(Timestamp timestamp);
::mediapipe::Status GlSetup();
::mediapipe::Status GlBind(const TriangleMesh &triangle_mesh,
const GlTexture &texture);
::mediapipe::Status GlRender(const TriangleMesh &triangle_mesh,
const float *model_matrix);
void InitializePerspectiveMatrix(float aspect_ratio,
float vertical_fov_degrees, float z_near,
float z_far);
void LoadModelMatrices(const TimedModelMatrixProtoList &model_matrices,
std::vector<ModelMatrix> *current_model_matrices);
#if !defined(__ANDROID__)
// Asset loading routine for all non-Android platforms.
bool LoadAnimation(const std::string &filename);
#else
// Asset loading for all Android platforms.
bool LoadAnimationAndroid(const std::string &filename,
std::vector<TriangleMesh> *mesh);
bool ReadBytesFromAsset(AAsset *asset, void *buffer, int num_bytes_to_read);
#endif
};
REGISTER_CALCULATOR(GlAnimationOverlayCalculator);
// static
::mediapipe::Status GlAnimationOverlayCalculator::GetContract(
CalculatorContract *cc) {
MP_RETURN_IF_ERROR(
GlCalculatorHelper::SetupInputSidePackets(&(cc->InputSidePackets())));
if (cc->Inputs().HasTag("VIDEO")) {
// Currently used only for size and timestamp.
cc->Inputs().Tag("VIDEO").Set<GpuBuffer>();
}
TagOrIndex(&(cc->Outputs()), "OUTPUT", 0).Set<GpuBuffer>();
if (cc->Inputs().HasTag("MODEL_MATRICES")) {
cc->Inputs().Tag("MODEL_MATRICES").Set<TimedModelMatrixProtoList>();
}
if (cc->Inputs().HasTag("MASK_MODEL_MATRICES")) {
cc->Inputs().Tag("MASK_MODEL_MATRICES").Set<TimedModelMatrixProtoList>();
}
cc->InputSidePackets().Tag("TEXTURE").Set<AssetTextureFormat>();
cc->InputSidePackets().Tag("ANIMATION_ASSET").Set<std::string>();
if (cc->InputSidePackets().HasTag("CAMERA_PARAMETERS_PROTO_STRING")) {
cc->InputSidePackets()
.Tag("CAMERA_PARAMETERS_PROTO_STRING")
.Set<std::string>();
}
if (cc->InputSidePackets().HasTag("MASK_TEXTURE")) {
cc->InputSidePackets().Tag("MASK_TEXTURE").Set<AssetTextureFormat>();
}
if (cc->InputSidePackets().HasTag("MASK_ASSET")) {
cc->InputSidePackets().Tag("MASK_ASSET").Set<std::string>();
}
return ::mediapipe::OkStatus();
}
// Helper function for initializing our perspective matrix.
void GlAnimationOverlayCalculator::InitializePerspectiveMatrix(
float aspect_ratio, float fov_degrees, float z_near, float z_far) {
// Standard perspective projection matrix calculations.
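// With f = cot(fov / 2) and denom = 1 / (z_near - z_far), the column-major
// matrix built below corresponds to the standard OpenGL projection:
//   [ f/aspect  0    0                          0                          ]
//   [ 0         f    0                          0                          ]
//   [ 0         0    (z_near + z_far) * denom   2 * z_far * z_near * denom ]
//   [ 0         0   -1                          0                          ]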
const float f = 1.0f / std::tan(fov_degrees * M_PI / 360.0f);
for (int i = 0; i < kNumMatrixEntries; i++) {
perspective_matrix_[i] = 0;
}
const float denom = 1.0f / (z_near - z_far);
perspective_matrix_[0] = f / aspect_ratio;
perspective_matrix_[5] = f;
perspective_matrix_[10] = (z_near + z_far) * denom;
perspective_matrix_[11] = -1.0f;
perspective_matrix_[14] = 2.0f * z_far * z_near * denom;
}
#if defined(__ANDROID__)
// Helper function for reading in a specified number of bytes from an Android
// asset. Returns true if successfully reads in all bytes into buffer.
bool GlAnimationOverlayCalculator::ReadBytesFromAsset(AAsset *asset,
void *buffer,
int num_bytes_to_read) {
// Most file systems use block sizes of 4KB or 8KB; ideally we'd choose a
// small multiple of the block size for best input streaming performance, so
// we go for a reasonably safe buffer size of 8KB = 8*1024 bytes.
static const int kMaxChunkSize = 8192;
int bytes_left = num_bytes_to_read;
int bytes_read = 1; // any value > 0 here just to start looping.
// Treat as uint8_t array so we can deal in single byte arithmetic easily.
uint8_t *currBufferIndex = reinterpret_cast<uint8_t *>(buffer);
while (bytes_read > 0 && bytes_left > 0) {
bytes_read = AAsset_read(asset, (void *)currBufferIndex,
std::min(bytes_left, kMaxChunkSize));
bytes_left -= bytes_read;
currBufferIndex += bytes_read;
}
// At least log any I/O errors encountered.
if (bytes_read < 0) {
LOG(ERROR) << "Error reading from AAsset: " << bytes_read;
return false;
}
if (bytes_left > 0) {
// Reached EOF before reading in specified number of bytes.
LOG(WARNING) << "Reached EOF before reading in specified number of bytes.";
return false;
}
return true;
}
// The asset streaming code below is Android-only, making use of the Android
// NDK asset APIs AAssetManager and AAsset.
bool GlAnimationOverlayCalculator::LoadAnimationAndroid(
const std::string &filename, std::vector<TriangleMesh> *meshes) {
mediapipe::AssetManager *mediapipe_asset_manager =
Singleton<mediapipe::AssetManager>::get();
AAssetManager *asset_manager = mediapipe_asset_manager->GetAssetManager();
if (!asset_manager) {
LOG(ERROR) << "Failed to access Android asset manager.";
return false;
}
// First, open the asset file for streaming.
AAsset *asset = AAssetManager_open(asset_manager, filename.c_str(),
AASSET_MODE_STREAMING);
if (!asset) {
LOG(ERROR) << "Failed to open animation asset: " << filename;
return false;
}
// Stream in animation frames for as long as more data is available.
frame_count_ = 0;
int32 lengths[3];
while (ReadBytesFromAsset(asset, (void *)lengths, sizeof(lengths[0]) * 3)) {
// About to start reading the next animation frame. Stream it in here.
// Each frame first stores the element counts of its three arrays
// (vertices, texture coordinates, and triangle indices, respectively), and
// then stores each of those arrays as a raw byte dump, in order.
meshes->emplace_back();
TriangleMesh &triangle_mesh = meshes->back();
// Try to read in vertices (4-byte floats)
triangle_mesh.vertices.reset(new float[lengths[0]]);
if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.vertices.get(),
sizeof(float) * lengths[0])) {
LOG(ERROR) << "Failed to read vertices for frame " << frame_count_;
return false;
}
// Try to read in texture coordinates (4-byte floats)
triangle_mesh.texture_coords.reset(new float[lengths[1]]);
if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.texture_coords.get(),
sizeof(float) * lengths[1])) {
LOG(ERROR) << "Failed to read tex-coords for frame " << frame_count_;
return false;
}
// Try to read in indices (2-byte shorts)
triangle_mesh.index_count = lengths[2];
triangle_mesh.triangle_indices.reset(new int16[lengths[2]]);
if (!ReadBytesFromAsset(asset, (void *)triangle_mesh.triangle_indices.get(),
sizeof(int16) * lengths[2])) {
LOG(ERROR) << "Failed to read indices for frame " << frame_count_;
return false;
}
frame_count_++;
}
AAsset_close(asset);
LOG(INFO) << "Finished parsing " << frame_count_ << " animation frames.";
if (meshes->empty()) {
LOG(ERROR) << "No animation frames were parsed! Erroring out calculator.";
return false;
}
return true;
}
#else // defined(__ANDROID__)
bool GlAnimationOverlayCalculator::LoadAnimation(const std::string &filename) {
std::ifstream infile(filename.c_str(), std::ifstream::binary);
if (!infile) {
LOG(ERROR) << "Error opening asset with filename: " << filename;
return false;
}
frame_count_ = 0;
int32 lengths[3];
while (true) {
// See if we have more initial size counts to read in.
infile.read((char *)(lengths), sizeof(lengths[0]) * 3);
if (!infile) {
// No more frames to read. Close out.
infile.close();
break;
}
triangle_meshes_.emplace_back();
TriangleMesh &triangle_mesh = triangle_meshes_.back();
// Try to read in vertices (4-byte floats).
triangle_mesh.vertices.reset(new float[lengths[0]]);
infile.read((char *)(triangle_mesh.vertices.get()),
sizeof(float) * lengths[0]);
if (!infile) {
LOG(ERROR) << "Failed to read vertices for frame " << frame_count_;
return false;
}
// Try to read in texture coordinates (4-byte floats)
triangle_mesh.texture_coords.reset(new float[lengths[1]]);
infile.read((char *)(triangle_mesh.texture_coords.get()),
sizeof(float) * lengths[1]);
if (!infile) {
LOG(ERROR) << "Failed to read texture coordinates for frame "
<< frame_count_;
return false;
}
// Try to read in the triangle indices (2-byte shorts)
triangle_mesh.index_count = lengths[2];
triangle_mesh.triangle_indices.reset(new int16[lengths[2]]);
infile.read((char *)(triangle_mesh.triangle_indices.get()),
sizeof(int16) * lengths[2]);
if (!infile) {
LOG(ERROR) << "Failed to read triangle indices for frame "
<< frame_count_;
return false;
}
frame_count_++;
}
LOG(INFO) << "Finished parsing " << frame_count_ << " animation frames.";
if (triangle_meshes_.empty()) {
LOG(ERROR) << "No animation frames were parsed! Erroring out calculator.";
return false;
}
return true;
}
#endif
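// ---------------------------------------------------------------------------
// Illustrative sketch (an assumption, not part of the original change): a
// minimal writer for the per-frame layout the loaders above expect. Each
// frame is three int32 counts (number of vertex floats, number of
// texture-coordinate floats, number of triangle indices) followed by the raw
// byte dumps of those three arrays, in that order. Real assets come from the
// SimpleObjEncryptor tool referenced in the calculator comment and may be
// compressed, which this sketch does not attempt to reproduce.
#if !defined(__ANDROID__)
namespace {
void WriteTriangleMeshFrameForTesting(const TriangleMesh &mesh,
                                      int32 num_vertex_floats,
                                      int32 num_texture_coord_floats,
                                      std::ostream *output) {
  const int32 lengths[3] = {num_vertex_floats, num_texture_coord_floats,
                            mesh.index_count};
  output->write(reinterpret_cast<const char *>(lengths), sizeof(lengths));
  output->write(reinterpret_cast<const char *>(mesh.vertices.get()),
                sizeof(float) * num_vertex_floats);
  output->write(reinterpret_cast<const char *>(mesh.texture_coords.get()),
                sizeof(float) * num_texture_coord_floats);
  output->write(reinterpret_cast<const char *>(mesh.triangle_indices.get()),
                sizeof(int16) * mesh.index_count);
}
}  // namespace
#endif  // !defined(__ANDROID__)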
void GlAnimationOverlayCalculator::ComputeAspectRatioAndFovFromCameraParameters(
const CameraParametersProto &camera_parameters, float *aspect_ratio,
float *vertical_fov_degrees) {
CHECK(aspect_ratio != nullptr);
CHECK(vertical_fov_degrees != nullptr);
*aspect_ratio =
camera_parameters.portrait_width() / camera_parameters.portrait_height();
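// The vertical FOV below follows fov = 2 * atan(portrait_height / 2), which
// presumes portrait_height is expressed in units of the focal length.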
*vertical_fov_degrees =
std::atan(camera_parameters.portrait_height() * 0.5f) * 2 * 180 / M_PI;
}
::mediapipe::Status GlAnimationOverlayCalculator::Open(CalculatorContext *cc) {
cc->SetOffset(TimestampDiff(0));
MP_RETURN_IF_ERROR(helper_.Open(cc));
const auto &options = cc->Options<GlAnimationOverlayCalculatorOptions>();
animation_speed_fps_ = options.animation_speed_fps();
// Construct projection matrix using input side packets or option
float aspect_ratio;
float vertical_fov_degrees;
if (cc->InputSidePackets().HasTag("CAMERA_PARAMETERS_PROTO_STRING")) {
const std::string &camera_parameters_proto_string =
cc->InputSidePackets()
.Tag("CAMERA_PARAMETERS_PROTO_STRING")
.Get<std::string>();
CameraParametersProto camera_parameters_proto;
camera_parameters_proto.ParseFromString(camera_parameters_proto_string);
ComputeAspectRatioAndFovFromCameraParameters(
camera_parameters_proto, &aspect_ratio, &vertical_fov_degrees);
} else {
aspect_ratio = options.aspect_ratio();
vertical_fov_degrees = options.vertical_fov_degrees();
}
// Use the resolved aspect ratio and field of view when constructing the
// projection matrix.
InitializePerspectiveMatrix(aspect_ratio, vertical_fov_degrees,
options.z_clipping_plane_near(),
options.z_clipping_plane_far());
// See what streams we have.
has_video_stream_ = cc->Inputs().HasTag("VIDEO");
has_model_matrix_stream_ = cc->Inputs().HasTag("MODEL_MATRICES");
has_mask_model_matrix_stream_ = cc->Inputs().HasTag("MASK_MODEL_MATRICES");
// Try to load in the animation asset in a platform-specific manner.
const std::string &asset_name =
cc->InputSidePackets().Tag("ANIMATION_ASSET").Get<std::string>();
bool loaded_animation = false;
#if defined(__ANDROID__)
if (cc->InputSidePackets().HasTag("MASK_ASSET")) {
has_occlusion_mask_ = true;
const std::string &mask_asset_name =
cc->InputSidePackets().Tag("MASK_ASSET").Get<std::string>();
loaded_animation = LoadAnimationAndroid(mask_asset_name, &mask_meshes_);
if (!loaded_animation) {
LOG(ERROR) << "Failed to load mask asset.";
return ::mediapipe::UnknownError("Failed to load mask asset.");
}
}
loaded_animation = LoadAnimationAndroid(asset_name, &triangle_meshes_);
#else
loaded_animation = LoadAnimation(asset_name);
#endif
if (!loaded_animation) {
LOG(ERROR) << "Failed to load animation asset.";
return ::mediapipe::UnknownError("Failed to load animation asset.");
}
return helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status {
if (cc->InputSidePackets().HasTag("MASK_TEXTURE")) {
const auto &mask_texture =
cc->InputSidePackets().Tag("MASK_TEXTURE").Get<AssetTextureFormat>();
mask_texture_ = helper_.CreateSourceTexture(mask_texture);
}
// Load in our asset's texture data
const auto &input_texture =
cc->InputSidePackets().Tag("TEXTURE").Get<AssetTextureFormat>();
texture_ = helper_.CreateSourceTexture(input_texture);
VLOG(2) << "Input texture size: " << texture_.width() << ", "
<< texture_.height() << std::endl;
return ::mediapipe::OkStatus();
});
}
int GlAnimationOverlayCalculator::GetAnimationFrameIndex(Timestamp timestamp) {
double seconds_delta = timestamp.Seconds() - animation_start_time_.Seconds();
int64_t frame_index =
static_cast<int64_t>(seconds_delta * animation_speed_fps_);
frame_index %= frame_count_;
return static_cast<int>(frame_index);
}
void GlAnimationOverlayCalculator::LoadModelMatrices(
const TimedModelMatrixProtoList &model_matrices,
std::vector<ModelMatrix> *current_model_matrices) {
current_model_matrices->clear();
for (int i = 0; i < model_matrices.model_matrix_size(); ++i) {
const auto &model_matrix = model_matrices.model_matrix(i);
CHECK(model_matrix.matrix_entries_size() == kNumMatrixEntries)
<< "Invalid Model Matrix";
current_model_matrices->emplace_back();
ModelMatrix &new_matrix = current_model_matrices->back();
new_matrix.reset(new float[kNumMatrixEntries]);
for (int j = 0; j < kNumMatrixEntries; j++) {
// Model matrices are streamed in using ROW-MAJOR format, but we want
// COLUMN-MAJOR for rendering, so we transpose here.
int col = j % 4;
int row = j / 4;
new_matrix[row + col * 4] = model_matrix.matrix_entries(j);
}
}
}
::mediapipe::Status GlAnimationOverlayCalculator::Process(
CalculatorContext *cc) {
return helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status {
if (!initialized_) {
MP_RETURN_IF_ERROR(GlSetup());
initialized_ = true;
animation_start_time_ = cc->InputTimestamp();
}
// Process model matrices, if any are being streamed in, and update our
// list.
if (has_model_matrix_stream_ &&
!cc->Inputs().Tag("MODEL_MATRICES").IsEmpty()) {
const TimedModelMatrixProtoList &model_matrices =
cc->Inputs().Tag("MODEL_MATRICES").Get<TimedModelMatrixProtoList>();
LoadModelMatrices(model_matrices, &current_model_matrices_);
}
if (has_mask_model_matrix_stream_ &&
!cc->Inputs().Tag("MASK_MODEL_MATRICES").IsEmpty()) {
const TimedModelMatrixProtoList &model_matrices =
cc->Inputs()
.Tag("MASK_MODEL_MATRICES")
.Get<TimedModelMatrixProtoList>();
LoadModelMatrices(model_matrices, &current_mask_model_matrices_);
}
// Arbitrary default width and height for output destination texture, in the
// event that we don't have a valid and unique input buffer to overlay.
int width = 640;
int height = 480;
GlTexture dst;
std::unique_ptr<GpuBuffer> input_frame(nullptr);
if (has_video_stream_ && !(cc->Inputs().Tag("VIDEO").IsEmpty())) {
auto result = cc->Inputs().Tag("VIDEO").Value().Consume<GpuBuffer>();
if (result.ok()) {
input_frame = std::move(result).ValueOrDie();
#if !MEDIAPIPE_GPU_BUFFER_USE_CV_PIXEL_BUFFER
input_frame->GetGlTextureBufferSharedPtr()->Reuse();
#endif
width = input_frame->width();
height = input_frame->height();
dst = helper_.CreateSourceTexture(*input_frame);
} else {
LOG(ERROR) << "Unable to consume input video frame for overlay!";
LOG(ERROR) << "Status returned was: " << result.status();
dst = helper_.CreateDestinationTexture(width, height);
}
} else if (!has_video_stream_) {
dst = helper_.CreateDestinationTexture(width, height);
} else {
// We have an input video stream, but not for this frame. Don't render!
return ::mediapipe::OkStatus();
}
helper_.BindFramebuffer(dst);
if (!depth_buffer_created_) {
// Create our private depth buffer.
GLCHECK(glGenRenderbuffers(1, &renderbuffer_));
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer_));
GLCHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT16,
width, height));
GLCHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT,
GL_RENDERBUFFER, renderbuffer_));
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0));
depth_buffer_created_ = true;
}
// Re-bind our depth renderbuffer to our FBO depth attachment here.
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer_));
GLCHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT,
GL_RENDERBUFFER, renderbuffer_));
GLenum status = GLCHECK(glCheckFramebufferStatus(GL_FRAMEBUFFER));
if (status != GL_FRAMEBUFFER_COMPLETE) {
LOG(ERROR) << "Incomplete framebuffer with status: " << status;
}
GLCHECK(glClear(GL_DEPTH_BUFFER_BIT));
if (has_occlusion_mask_) {
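// First render the occlusion mask into the depth buffer only (color writes
// are disabled), so that animation geometry drawn afterwards fails the
// depth test wherever it is behind the mask and appears occluded.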
glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
const TriangleMesh &mask_frame = mask_meshes_.front();
MP_RETURN_IF_ERROR(GlBind(mask_frame, mask_texture_));
// Draw the occlusion mask using our latest mask model matrix stream packet.
for (const ModelMatrix &model_matrix : current_mask_model_matrices_) {
MP_RETURN_IF_ERROR(GlRender(mask_frame, model_matrix.get()));
}
}
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
int frame_index = GetAnimationFrameIndex(cc->InputTimestamp());
const TriangleMesh &current_frame = triangle_meshes_[frame_index];
MP_RETURN_IF_ERROR(GlBind(current_frame, texture_));
if (has_model_matrix_stream_) {
// Draw objects using our latest model matrix stream packet.
for (const ModelMatrix &model_matrix : current_model_matrices_) {
MP_RETURN_IF_ERROR(GlRender(current_frame, model_matrix.get()));
}
} else {
// Just draw one object to a static model matrix.
MP_RETURN_IF_ERROR(GlRender(current_frame, kModelMatrix));
}
// Disable vertex attributes
GLCHECK(glDisableVertexAttribArray(ATTRIB_VERTEX));
GLCHECK(glDisableVertexAttribArray(ATTRIB_TEXTURE_POSITION));
// Disable depth test
GLCHECK(glDisable(GL_DEPTH_TEST));
// Unbind texture
GLCHECK(glActiveTexture(GL_TEXTURE1));
GLCHECK(glBindTexture(texture_.target(), 0));
// Unbind depth buffer
GLCHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0));
GLCHECK(glFlush());
auto output = dst.GetFrame<GpuBuffer>();
dst.Release();
TagOrIndex(&(cc->Outputs()), "OUTPUT", 0)
.Add(output.release(), cc->InputTimestamp());
GLCHECK(glFrontFace(GL_CCW));
return ::mediapipe::OkStatus();
});
}
::mediapipe::Status GlAnimationOverlayCalculator::GlSetup() {
// Load vertex and fragment shaders
const GLint attr_location[NUM_ATTRIBUTES] = {
ATTRIB_VERTEX,
ATTRIB_TEXTURE_POSITION,
};
const GLchar *attr_name[NUM_ATTRIBUTES] = {
"position",
"texture_coordinate",
};
const GLchar *vert_src = R"(
// Perspective projection matrix for rendering / clipping
uniform mat4 perspectiveMatrix;
// Matrix defining the currently rendered object model
uniform mat4 modelMatrix;
// vertex position in threespace
attribute vec4 position;
// texture coordinate for each vertex in normalized texture space (0..1)
attribute mediump vec4 texture_coordinate;
// texture coordinate for fragment shader (will be interpolated)
varying mediump vec2 sample_coordinate;
void main() {
sample_coordinate = texture_coordinate.xy;
mat4 mvpMatrix = perspectiveMatrix * modelMatrix;
gl_Position = mvpMatrix * position;
}
)";
const GLchar *frag_src = R"(
precision mediump float;
varying vec2 sample_coordinate; // texture coordinate (0..1)
uniform sampler2D texture; // texture to shade with
void main() {
gl_FragColor = texture2D(texture, sample_coordinate);
}
)";
// Shader program
GLCHECK(GlhCreateProgram(vert_src, frag_src, NUM_ATTRIBUTES,
(const GLchar **)&attr_name[0], attr_location,
&program_));
RET_CHECK(program_) << "Problem initializing the program.";
texture_uniform_ = GLCHECK(glGetUniformLocation(program_, "texture"));
perspective_matrix_uniform_ =
GLCHECK(glGetUniformLocation(program_, "perspectiveMatrix"));
model_matrix_uniform_ =
GLCHECK(glGetUniformLocation(program_, "modelMatrix"));
return ::mediapipe::OkStatus();
}
::mediapipe::Status GlAnimationOverlayCalculator::GlBind(
const TriangleMesh &triangle_mesh, const GlTexture &texture) {
GLCHECK(glUseProgram(program_));
// Disable backface culling to allow occlusion effects.
// Some options for solid arbitrary 3D geometry rendering
GLCHECK(glEnable(GL_BLEND));
GLCHECK(glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA));
GLCHECK(glEnable(GL_DEPTH_TEST));
GLCHECK(glFrontFace(GL_CW));
GLCHECK(glDepthMask(GL_TRUE));
GLCHECK(glDepthFunc(GL_LESS));
// Set up vertex attributes for the mesh we are about to draw.
GLCHECK(glVertexAttribPointer(ATTRIB_VERTEX, 3, GL_FLOAT, 0, 0,
triangle_mesh.vertices.get()));
GLCHECK(glEnableVertexAttribArray(ATTRIB_VERTEX));
GLCHECK(glVertexAttribPointer(ATTRIB_TEXTURE_POSITION, 2, GL_FLOAT, 0, 0,
triangle_mesh.texture_coords.get()));
GLCHECK(glEnableVertexAttribArray(ATTRIB_TEXTURE_POSITION));
GLCHECK(glActiveTexture(GL_TEXTURE1));
GLCHECK(glBindTexture(texture.target(), texture.name()));
// We previously bound it to GL_TEXTURE1
GLCHECK(glUniform1i(texture_uniform_, 1));
GLCHECK(glUniformMatrix4fv(perspective_matrix_uniform_, 1, GL_FALSE,
perspective_matrix_));
return ::mediapipe::OkStatus();
}
::mediapipe::Status GlAnimationOverlayCalculator::GlRender(
const TriangleMesh &triangle_mesh, const float *model_matrix) {
GLCHECK(glUniformMatrix4fv(model_matrix_uniform_, 1, GL_FALSE, model_matrix));
GLCHECK(glDrawElements(GL_TRIANGLES, triangle_mesh.index_count,
GL_UNSIGNED_SHORT,
triangle_mesh.triangle_indices.get()));
return ::mediapipe::OkStatus();
}
GlAnimationOverlayCalculator::~GlAnimationOverlayCalculator() {
helper_.RunInGlContext([this] {
if (program_) {
GLCHECK(glDeleteProgram(program_));
program_ = 0;
}
if (depth_buffer_created_) {
GLCHECK(glDeleteRenderbuffers(1, &renderbuffer_));
renderbuffer_ = 0;
}
if (texture_.width() > 0) {
texture_.Release();
}
if (mask_texture_.width() > 0) {
mask_texture_.Release();
}
});
}
} // namespace mediapipe

View File

@ -0,0 +1,41 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
message GlAnimationOverlayCalculatorOptions {
extend CalculatorOptions {
optional GlAnimationOverlayCalculatorOptions ext = 174760573;
}
// Default aspect ratio of rendering target width over height.
// This specific value is for 3:4 view. Do not change this default value.
optional float aspect_ratio = 1 [default = 0.75];
// Default vertical field of view in degrees. This specific default value
// is arbitrary. Do not change this default value. If you want to use
// a different vertical_fov_degrees, set it in the options.
optional float vertical_fov_degrees = 2 [default = 70.0];
// Perspective projection matrix z-clipping near plane value.
optional float z_clipping_plane_near = 3 [default = 0.1];
// Perspective projection matrix z-clipping far plane value.
optional float z_clipping_plane_far = 4 [default = 1000.0];
// Speed at which to play the animation (in frames per second).
optional float animation_speed_fps = 5 [default = 25.0];
}
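As a rough reference only (this is a standard OpenGL-style construction, not the calculator's actual implementation), the four options above are sufficient to build the 4x4 perspective matrix that GlAnimationOverlayCalculator uploads via glUniformMatrix4fv:
#include <cmath>

// Column-major, symmetric-frustum perspective matrix from the options above.
void PerspectiveFromOptions(float vertical_fov_degrees, float aspect_ratio,
                            float z_near, float z_far, float matrix[16]) {
  const float f =
      1.0f / std::tan(vertical_fov_degrees * static_cast<float>(M_PI) / 360.0f);
  for (int i = 0; i < 16; ++i) matrix[i] = 0.0f;
  matrix[0] = f / aspect_ratio;   // x scale (aspect_ratio = width / height).
  matrix[5] = f;                  // y scale from the vertical field of view.
  matrix[10] = (z_far + z_near) / (z_near - z_far);
  matrix[14] = 2.0f * z_far * z_near / (z_near - z_far);
  matrix[11] = -1.0f;             // OpenGL-style projection: w' = -z.
}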

View File

@ -0,0 +1,168 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <unordered_map>
#include <vector>
#include "Eigen/Dense"
#include "absl/memory/memory.h"
#include "absl/strings/str_format.h"
#include "absl/types/span.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/deps/file_path.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
#include "mediapipe/graphs/object_detection_3d/calculators/lift_2d_frame_annotation_to_3d_calculator.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
namespace {
constexpr char kInputStreamTag[] = "FRAME_ANNOTATION";
constexpr char kOutputStreamTag[] = "LIFTED_FRAME_ANNOTATION";
// Each detected object will be assigned a unique id that starts from 1.
static int object_id = 0;
inline int GetNextObjectId() { return ++object_id; }
} // namespace
namespace mediapipe {
// Lifts the 2D points in a tracked frame annotation to 3D.
//
// Input:
// FRAME_ANNOTATION - Frame annotations with detected 2D points.
// Output:
// LIFTED_FRAME_ANNOTATION - Resulting FrameAnnotation with lifted 3D points.
//
// Usage example:
// node {
// calculator: "Lift2DFrameAnnotationTo3DCalculator"
// input_stream: "FRAME_ANNOTATIONS:tracked_annotations"
// output_stream: "LIFTED_FRAME_ANNOTATIONS:lifted_3d_annotations"
// }
class Lift2DFrameAnnotationTo3DCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
::mediapipe::Status Close(CalculatorContext* cc) override;
private:
::mediapipe::Status ProcessCPU(CalculatorContext* cc,
FrameAnnotation* output_objects);
::mediapipe::Status LoadOptions(CalculatorContext* cc);
// Increment and assign object ID for each detected object.
// In a single MediaPipe session, the IDs are unique.
// Also assign timestamp for the FrameAnnotation to be the input packet
// timestamp.
void AssignObjectIdAndTimestamp(int64 timestamp_us,
FrameAnnotation* annotation);
std::unique_ptr<Decoder> decoder_;
::mediapipe::Lift2DFrameAnnotationTo3DCalculatorOptions options_;
Eigen::Matrix<float, 4, 4, Eigen::RowMajor> projection_matrix_;
};
REGISTER_CALCULATOR(Lift2DFrameAnnotationTo3DCalculator);
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(cc->Inputs().HasTag(kInputStreamTag));
RET_CHECK(cc->Outputs().HasTag(kOutputStreamTag));
cc->Inputs().Tag(kInputStreamTag).Set<FrameAnnotation>();
cc->Outputs().Tag(kOutputStreamTag).Set<FrameAnnotation>();
return ::mediapipe::OkStatus();
}
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Open(
CalculatorContext* cc) {
MP_RETURN_IF_ERROR(LoadOptions(cc));
// clang-format off
projection_matrix_ <<
1.5731, 0, 0, 0,
0, 2.0975, 0, 0,
0, 0, -1.0002, -0.2,
0, 0, -1, 0;
// clang-format on
decoder_ = absl::make_unique<Decoder>(
BeliefDecoderConfig(options_.decoder_config()));
return ::mediapipe::OkStatus();
}
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Process(
CalculatorContext* cc) {
if (cc->Inputs().Tag(kInputStreamTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
auto output_objects = absl::make_unique<FrameAnnotation>();
MP_RETURN_IF_ERROR(ProcessCPU(cc, output_objects.get()));
// Output
if (cc->Outputs().HasTag(kOutputStreamTag)) {
cc->Outputs()
.Tag(kOutputStreamTag)
.Add(output_objects.release(), cc->InputTimestamp());
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::ProcessCPU(
CalculatorContext* cc, FrameAnnotation* output_objects) {
const auto& input_frame_annotations =
cc->Inputs().Tag(kInputStreamTag).Get<FrameAnnotation>();
// Copy the input frame annotation to the output
*output_objects = input_frame_annotations;
auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait*/ true,
output_objects);
if (!status.ok()) {
LOG(ERROR) << status;
return status;
}
AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(),
output_objects);
return ::mediapipe::OkStatus();
}
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::Close(
CalculatorContext* cc) {
return ::mediapipe::OkStatus();
}
::mediapipe::Status Lift2DFrameAnnotationTo3DCalculator::LoadOptions(
CalculatorContext* cc) {
// Get calculator options specified in the graph.
options_ =
cc->Options<::mediapipe::Lift2DFrameAnnotationTo3DCalculatorOptions>();
return ::mediapipe::OkStatus();
}
void Lift2DFrameAnnotationTo3DCalculator::AssignObjectIdAndTimestamp(
int64 timestamp_us, FrameAnnotation* annotation) {
for (auto& ann : *annotation->mutable_annotations()) {
ann.set_object_id(GetNextObjectId());
}
annotation->set_timestamp(timestamp_us);
}
} // namespace mediapipe

View File

@ -0,0 +1,30 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The option proto for the Lift2DFrameAnnotationTo3DCalculatorOptions.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
import "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto";
message Lift2DFrameAnnotationTo3DCalculatorOptions {
extend CalculatorOptions {
optional Lift2DFrameAnnotationTo3DCalculatorOptions ext = 290166284;
}
optional BeliefDecoderConfig decoder_config = 1;
}

View File

@ -0,0 +1,101 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/model.h"
#include "mediapipe/framework/port/logging.h"
namespace mediapipe {
void Model::SetTransformation(const Eigen::Matrix4f& transform) {
transformation_ = transform;
}
void Model::SetTranslation(const Eigen::Vector3f& translation) {
transformation_.col(3).template head<3>() = translation;
}
void Model::SetRotation(float roll, float pitch, float yaw) {
// In our coordinate system, Y is up. We first rotate the object around Y
// (yaw), then around Z (pitch), and finally around X (roll).
Eigen::Matrix3f r;
r = Eigen::AngleAxisf(yaw, Eigen::Vector3f::UnitY()) *
Eigen::AngleAxisf(pitch, Eigen::Vector3f::UnitZ()) *
Eigen::AngleAxisf(roll, Eigen::Vector3f::UnitX());
transformation_.topLeftCorner<3, 3>() = r;
}
void Model::SetRotation(const Eigen::Matrix3f& rotation) {
transformation_.topLeftCorner<3, 3>() = rotation;
}
void Model::SetScale(const Eigen::Vector3f& scale) { scale_ = scale; }
void Model::SetCategory(const std::string& category) { category_ = category; }
const Eigen::Vector3f Model::GetRotationAngles() const {
Vector3f ypr = transformation_.topLeftCorner<3, 3>().eulerAngles(1, 2, 0);
return Vector3f(ypr(2), ypr(1), ypr(0)); // swap YPR with RPY
}
const Eigen::Matrix4f& Model::GetTransformation() const {
return transformation_;
}
const Eigen::Vector3f& Model::GetScale() const { return scale_; }
const Eigen::Ref<const Eigen::Vector3f> Model::GetTranslation() const {
return transformation_.col(3).template head<3>();
}
const Eigen::Ref<const Eigen::Matrix3f> Model::GetRotation() const {
return transformation_.template topLeftCorner<3, 3>();
}
const std::string& Model::GetCategory() const { return category_; }
void Model::Deserialize(const Object& obj) {
CHECK_EQ(obj.rotation_size(), 9);
CHECK_EQ(obj.translation_size(), 3);
CHECK_EQ(obj.scale_size(), 3);
category_ = obj.category();
using RotationMatrix = Eigen::Matrix<float, 3, 3, Eigen::RowMajor>;
transformation_.setIdentity();
transformation_.topLeftCorner<3, 3>() =
Eigen::Map<const RotationMatrix>(obj.rotation().data());
transformation_.col(3).head<3>() =
Eigen::Map<const Eigen::Vector3f>(obj.translation().data());
scale_ = Eigen::Map<const Eigen::Vector3f>(obj.scale().data());
Update();
}
void Model::Serialize(Object* obj) {
obj->set_category(category_);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
obj->add_rotation(transformation_(i, j));
}
}
for (int i = 0; i < 3; ++i) {
obj->add_translation(transformation_(i, 3));
}
for (int i = 0; i < 3; ++i) {
obj->add_scale(scale_[i]);
}
}
} // namespace mediapipe
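A self-contained Eigen sketch (illustrative only; RoundTripAngles is not part of this file) of the angle convention used by SetRotation() and GetRotationAngles() above: rotations are composed as yaw about Y, then pitch about Z, then roll about X, and eulerAngles(1, 2, 0) recovers the angles in (yaw, pitch, roll) order, which is why the result is reversed.
#include "Eigen/Geometry"

void RoundTripAngles(float roll, float pitch, float yaw) {
  Eigen::Matrix3f r;
  r = Eigen::AngleAxisf(yaw, Eigen::Vector3f::UnitY()) *
      Eigen::AngleAxisf(pitch, Eigen::Vector3f::UnitZ()) *
      Eigen::AngleAxisf(roll, Eigen::Vector3f::UnitX());
  // eulerAngles(1, 2, 0) factors r back into Y * Z * X rotations and returns
  // the angles in that order, i.e. (yaw, pitch, roll).
  const Eigen::Vector3f ypr = r.eulerAngles(1, 2, 0);
  // Reverse to (roll, pitch, yaw), matching Model::GetRotationAngles(). For
  // angles inside Eigen's principal ranges this reproduces the inputs.
  const Eigen::Vector3f rpy(ypr(2), ypr(1), ypr(0));
}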

View File

@ -0,0 +1,92 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/object.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/types.h"
namespace mediapipe {
class Model {
public:
EIGEN_MAKE_ALIGNED_OPERATOR_NEW
enum Type {
kVisualizationOnly = 0,
kBoundingBox,
kSkeleton,
kShape, // Shape is a virtual object.
kNumModes,
};
virtual ~Model() = default;
virtual void SetTransformation(const Eigen::Matrix4f& transform);
virtual void SetTranslation(const Eigen::Vector3f& translation);
// Computes the rotation matrix from these angles and updates the
// transformation matrix accordingly.
virtual void SetRotation(float roll, float pitch, float yaw);
virtual void SetRotation(const Eigen::Matrix3f& rotation);
virtual void SetScale(const Eigen::Vector3f& scale);
virtual void SetCategory(const std::string& category);
virtual size_t GetNumberKeypoints() const { return number_keypoints_; }
// Gets Euler angles in the order of roll, pitch, yaw.
virtual const Eigen::Vector3f GetRotationAngles() const;
virtual const Eigen::Matrix4f& GetTransformation() const;
virtual const Eigen::Vector3f& GetScale() const;
virtual const Eigen::Ref<const Eigen::Vector3f> GetTranslation() const;
virtual const Eigen::Ref<const Eigen::Matrix3f> GetRotation() const;
virtual const std::string& GetCategory() const;
// Update the model's keypoints in the world-coordinate system.
// The update includes transforming the model to the world-coordinate system
// as well as scaling the model.
// The user is expected to call this function after setting the rotation,
// translation, or scale of the model to get an updated model.
virtual void Update() = 0;
// Update the model's parameters (orientation, position, and scale) from the
// user-provided variables.
virtual void Adjust(const std::vector<float>& variables) = 0;
// Returns a pointer to the model's keypoints.
// Use Eigen::Map to cast the pointer back to Vector3 or Vector4
virtual const float* GetVertex(size_t id) const = 0;
virtual float* GetVertex(size_t id) = 0;
virtual void Deserialize(const Object& obj);
virtual void Serialize(Object* obj);
// TODO: make member variables protected, and add public apis.
// 4x4 transformation matrix mapping the first keypoint to world coordinate
Eigen::Matrix4f transformation_;
Eigen::Vector3f scale_; // width, height, depth
Type model_type_;
size_t number_keypoints_;
std::string category_;
protected:
Model(Type type, size_t number_keypoints, const std::string& category)
: model_type_(type),
number_keypoints_(number_keypoints),
category_(category) {}
};
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_MODEL_H_

View File

@ -0,0 +1,48 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
message TimedModelMatrixProto {
// 4x4 model matrix stored in ROW major order.
repeated float matrix_entries = 1 [packed = true];
// Timestamp of this model matrix in milliseconds.
optional int64 time_msec = 2 [default = 0];
// Unique per object id
optional int32 id = 3 [default = -1];
}
message TimedModelMatrixProtoList {
repeated TimedModelMatrixProto model_matrix = 1;
}
// For convenience, when the desired information or transformation can be
// encoded into vectors (e.g. when the matrix represents a scale or Euler-angle-
// based rotation operation).
message TimedVectorProto {
// The vector values themselves.
repeated float vector_entries = 1 [packed = true];
// Timestamp of this vector in milliseconds.
optional int64 time_msec = 2 [default = 0];
// Unique per object id
optional int32 id = 3 [default = -1];
}
message TimedVectorProtoList {
repeated TimedVectorProto vector_list = 1;
}
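For reference, a minimal sketch of filling TimedModelMatrixProto from an Eigen matrix; it assumes the generated C++ bindings for this proto are available, and the PackModelMatrix helper itself is hypothetical.
#include "Eigen/Core"

void PackModelMatrix(const Eigen::Matrix4f& m, int64_t time_msec, int id,
                     mediapipe::TimedModelMatrixProto* proto) {
  proto->clear_matrix_entries();
  // matrix_entries is documented above as row-major, so iterate rows first.
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      proto->add_matrix_entries(m(r, c));
    }
  }
  proto->set_time_msec(time_msec);
  proto->set_id(id);
}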

View File

@ -0,0 +1,124 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package mediapipe;
message KeyPoint {
// The position of the keypoint in the local coordinate system of the rigid
// object.
float x = 1;
float y = 2;
float z = 3;
// Sphere around the keypoint, indicating the annotator's confidence of the
// position, in meters.
float confidence_radius = 4;
// The name of the keypoint (e.g. legs, head, etc.).
// Does not have to be unique.
string name = 5;
// Indicates whether the keypoint is hidden or not.
bool hidden = 6;
}
message Object {
// Unique object id through a sequence. There might be multiple objects of
// the same label in this sequence.
int32 id = 1;
// Describes what category an object is. E.g. object class, attribute,
// instance or person identity. This provides additional context for the
// object type.
string category = 2;
enum Type {
UNDEFINED_TYPE = 0;
BOUNDING_BOX = 1;
SKELETON = 2;
}
Type type = 3;
// 3x3 row-major rotation matrix describing the orientation of the rigid
// object's frame of reference in the world-coordinate system.
repeated float rotation = 4;
// 3x1 vector describing the translation of the rigid object's frame of
// reference in the world-coordinate system in meters.
repeated float translation = 5;
// 3x1 vector describing the scale of the rigid object's frame of reference in
// the world-coordinate system in meters.
repeated float scale = 6;
// List of all the key points associated with this object in the object
// coordinate system.
// The first keypoint is always the object's frame of reference,
// e.g. the centroid of the box.
// E.g. for a bounding box with its center as the frame of reference, the 9 keypoints are:
// {0., 0., 0.},
// {-.5, -.5, -.5}, {-.5, -.5, +.5}, {-.5, +.5, -.5}, {-.5, +.5, +.5},
// {+.5, -.5, -.5}, {+.5, -.5, +.5}, {+.5, +.5, -.5}, {+.5, +.5, +.5}
// To get the bounding box in the world-coordinate system, we first scale the
// box then transform the scaled box.
// For example, bounding box in the world coordinate system is
// rotation * scale * keypoints + translation
repeated KeyPoint keypoints = 7;
// Enum to reflect how this object is created.
enum Method {
UNKNOWN_METHOD = 0;
ANNOTATION = 1; // Created by data annotation.
AUGMENTATION = 2; // Created by data augmentation.
}
Method method = 8;
}
// The edge connecting two keypoints together
message Edge {
// keypoint id of the edge's source
int32 source = 1;
// keypoint id of the edge's sink
int32 sink = 2;
}
// The skeleton template for different objects (e.g. humans, chairs, hands, etc)
// The annotation tool reads the skeleton template dictionary.
message Skeleton {
// The origin keypoint in the object coordinate system. (i.e. Point 0, 0, 0)
int32 reference_keypoint = 1;
// The skeleton's category (e.g. human, chair, hand.). Should be unique in the
// dictionary.
string category = 2;
// Initialization value for all the keypoints in the skeleton in the object's
// local coordinate system. Pursuit will transform these points using object's
// transformation to get the keypoints in the world-coordinate system.
repeated KeyPoint keypoints = 3;
// List of edges connecting keypoints
repeated Edge edges = 4;
}
// The list of all the modeled skeletons in our library. These models can be
// objects (chairs, desks, etc), humans (full pose, hands, faces, etc), or box.
// We can have multiple skeletons in the same file.
message Skeletons {
repeated Skeleton object = 1;
}
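To make the keypoint convention above concrete, here is a small Eigen sketch (a hypothetical helper, assuming the generated C++ bindings for Object are available) that maps the k-th keypoint into the world-coordinate system using the documented formula world = rotation * scale * keypoint + translation.
#include "Eigen/Dense"

Eigen::Vector3f KeypointToWorld(const mediapipe::Object& obj, int k) {
  using RowMat3 = Eigen::Matrix<float, 3, 3, Eigen::RowMajor>;
  // rotation is stored row-major (see the `rotation` field comment above).
  const RowMat3 rotation = Eigen::Map<const RowMat3>(obj.rotation().data());
  const Eigen::Vector3f scale =
      Eigen::Map<const Eigen::Vector3f>(obj.scale().data());
  const Eigen::Vector3f translation =
      Eigen::Map<const Eigen::Vector3f>(obj.translation().data());
  const auto& kp = obj.keypoints(k);
  const Eigen::Vector3f local(kp.x(), kp.y(), kp.z());
  return rotation * scale.asDiagonal() * local + translation;
}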

View File

@ -0,0 +1,33 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
#include "mediapipe/framework/port/logging.h"
namespace mediapipe {
cv::Mat ConvertTfliteTensorToCvMat(const TfLiteTensor& tensor) {
// Check that the tensor is 4-D (BxHxWxC) and that the batch size is one (data[0] == 1).
CHECK(tensor.dims->size == 4 && tensor.dims->data[0] == 1);
CHECK_EQ(kTfLiteFloat32, tensor.type) << "tflite_tensor type is not float";
const size_t num_output_channels = tensor.dims->data[3];
const int dims = 2;
const int sizes[] = {tensor.dims->data[1], tensor.dims->data[2]};
const int type = CV_MAKETYPE(CV_32F, num_output_channels);
return cv::Mat(dims, sizes, type, reinterpret_cast<void*>(tensor.data.f));
}
} // namespace mediapipe
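Usage note with a short sketch (`interpreter` is an assumed, already-invoked tflite::Interpreter): the returned cv::Mat wraps the tensor's float buffer without copying, so the tensor, and therefore the interpreter that owns it, must outlive the Mat.
cv::Mat WrapFirstOutput(const tflite::Interpreter& interpreter) {
  // Output 0 is assumed to be a 1xHxWxC float tensor, as checked above.
  const TfLiteTensor* tensor = interpreter.output_tensor(0);
  return mediapipe::ConvertTfliteTensorToCvMat(*tensor);
}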

View File

@ -0,0 +1,27 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "tensorflow/lite/interpreter.h"
namespace mediapipe {
// Wraps a 4-D float TFLite tensor (1xHxWxC) as a cv::Mat without copying the data.
cv::Mat ConvertTfliteTensorToCvMat(const TfLiteTensor& tensor);
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TENSOR_UTIL_H_

View File

@ -0,0 +1,216 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <unordered_map>
#include <vector>
#include "Eigen/Dense"
#include "absl/memory/memory.h"
#include "absl/strings/str_format.h"
#include "absl/types/span.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/deps/file_path.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/graphs/object_detection_3d/calculators/annotation_data.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.pb.h"
#include "mediapipe/graphs/object_detection_3d/calculators/decoder.h"
#include "mediapipe/graphs/object_detection_3d/calculators/tensor_util.h"
#include "mediapipe/graphs/object_detection_3d/calculators/tflite_tensors_to_objects_calculator.pb.h"
#include "tensorflow/lite/interpreter.h"
namespace {
constexpr char kInputStreamTag[] = "TENSORS";
constexpr char kOutputStreamTag[] = "ANNOTATIONS";
// Each detected object will be assigned a unique id that starts from 1.
static int object_id = 0;
inline int GetNextObjectId() { return ++object_id; }
} // namespace
namespace mediapipe {
// Converts the output TFLite tensors of the deep pursuit 3d model into
// FrameAnnotation.
//
// Input:
// TENSORS - Vector of TfLiteTensor of type kTfLiteFloat32.
// Output:
// ANNOTATIONS - Result FrameAnnotation.
//
// Usage example:
// node {
// calculator: "TfLiteTensorsToObjectsCalculator"
// input_stream: "TENSORS:tensors"
// output_stream: "ANNOTATIONS:annotations"
// }
class TfLiteTensorsToObjectsCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc);
::mediapipe::Status Open(CalculatorContext* cc) override;
::mediapipe::Status Process(CalculatorContext* cc) override;
::mediapipe::Status Close(CalculatorContext* cc) override;
private:
::mediapipe::Status ProcessCPU(CalculatorContext* cc,
FrameAnnotation* output_objects);
::mediapipe::Status LoadOptions(CalculatorContext* cc);
// Takes point_3d in FrameAnnotation, projects to 2D, and overwrite the
// point_2d field with the projection.
void Project3DTo2D(bool portrait, FrameAnnotation* annotation) const;
// Increment and assign object ID for each detected object.
// In a single MediaPipe session, the IDs are unique.
// Also assign timestamp for the FrameAnnotation to be the input packet
// timestamp.
void AssignObjectIdAndTimestamp(int64 timestamp_us,
FrameAnnotation* annotation);
int num_classes_ = 0;
int num_keypoints_ = 0;
::mediapipe::TfLiteTensorsToObjectsCalculatorOptions options_;
std::unique_ptr<Decoder> decoder_;
Eigen::Matrix<float, 4, 4, Eigen::RowMajor> projection_matrix_;
};
REGISTER_CALCULATOR(TfLiteTensorsToObjectsCalculator);
::mediapipe::Status TfLiteTensorsToObjectsCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(!cc->Inputs().GetTags().empty());
RET_CHECK(!cc->Outputs().GetTags().empty());
if (cc->Inputs().HasTag(kInputStreamTag)) {
cc->Inputs().Tag(kInputStreamTag).Set<std::vector<TfLiteTensor>>();
}
if (cc->Outputs().HasTag(kOutputStreamTag)) {
cc->Outputs().Tag(kOutputStreamTag).Set<FrameAnnotation>();
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteTensorsToObjectsCalculator::Open(
CalculatorContext* cc) {
MP_RETURN_IF_ERROR(LoadOptions(cc));
// clang-format off
projection_matrix_ <<
1.5731, 0, 0, 0,
0, 2.0975, 0, 0,
0, 0, -1.0002, -0.2,
0, 0, -1, 0;
// clang-format on
decoder_ = absl::make_unique<Decoder>(
BeliefDecoderConfig(options_.decoder_config()));
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteTensorsToObjectsCalculator::Process(
CalculatorContext* cc) {
if (cc->Inputs().Tag(kInputStreamTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
auto output_objects = absl::make_unique<FrameAnnotation>();
MP_RETURN_IF_ERROR(ProcessCPU(cc, output_objects.get()));
// Output
if (cc->Outputs().HasTag(kOutputStreamTag)) {
cc->Outputs()
.Tag(kOutputStreamTag)
.Add(output_objects.release(), cc->InputTimestamp());
}
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteTensorsToObjectsCalculator::ProcessCPU(
CalculatorContext* cc, FrameAnnotation* output_objects) {
const auto& input_tensors =
cc->Inputs().Tag(kInputStreamTag).Get<std::vector<TfLiteTensor>>();
cv::Mat prediction_heatmap = ConvertTfliteTensorToCvMat(input_tensors[0]);
cv::Mat offsetmap = ConvertTfliteTensorToCvMat(input_tensors[1]);
*output_objects =
decoder_->DecodeBoundingBoxKeypoints(prediction_heatmap, offsetmap);
auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait*/ true,
output_objects);
if (!status.ok()) {
LOG(ERROR) << status;
return status;
}
Project3DTo2D(/*portrait*/ true, output_objects);
AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(),
output_objects);
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteTensorsToObjectsCalculator::Close(
CalculatorContext* cc) {
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteTensorsToObjectsCalculator::LoadOptions(
CalculatorContext* cc) {
// Get calculator options specified in the graph.
options_ =
cc->Options<::mediapipe::TfLiteTensorsToObjectsCalculatorOptions>();
num_classes_ = options_.num_classes();
num_keypoints_ = options_.num_keypoints();
// Currently only 2D keypoints are supported, i.e. num_values_per_keypoint must be 2.
CHECK_EQ(options_.num_values_per_keypoint(), 2);
return ::mediapipe::OkStatus();
}
void TfLiteTensorsToObjectsCalculator::Project3DTo2D(
bool portrait, FrameAnnotation* annotation) const {
for (auto& ann : *annotation->mutable_annotations()) {
for (auto& key_point : *ann.mutable_keypoints()) {
Eigen::Vector4f point3d;
point3d << key_point.point_3d().x(), key_point.point_3d().y(),
key_point.point_3d().z(), 1.0f;
Eigen::Vector4f point3d_projection = projection_matrix_ * point3d;
float u, v;
const float inv_w = 1.0f / point3d_projection(3);
if (portrait) {
u = (point3d_projection(1) * inv_w + 1.0f) * 0.5f;
v = (point3d_projection(0) * inv_w + 1.0f) * 0.5f;
} else {
u = (point3d_projection(0) * inv_w + 1.0f) * 0.5f;
v = (1.0f - point3d_projection(1) * inv_w) * 0.5f;
}
key_point.mutable_point_2d()->set_x(u);
key_point.mutable_point_2d()->set_y(v);
}
}
}
void TfLiteTensorsToObjectsCalculator::AssignObjectIdAndTimestamp(
int64 timestamp_us, FrameAnnotation* annotation) {
for (auto& ann : *annotation->mutable_annotations()) {
ann.set_object_id(GetNextObjectId());
}
annotation->set_timestamp(timestamp_us);
}
} // namespace mediapipe

View File

@ -0,0 +1,39 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The option proto for the TfLiteTensorsToObjectsCalculatorOptions.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
import "mediapipe/graphs/object_detection_3d/calculators/belief_decoder_config.proto";
message TfLiteTensorsToObjectsCalculatorOptions {
extend CalculatorOptions {
optional TfLiteTensorsToObjectsCalculatorOptions ext = 263667646;
}
// The number of output classes predicted by the detection model.
optional int32 num_classes = 1;
// The number of predicted keypoints.
optional int32 num_keypoints = 2;
// The dimension of each keypoint, e.g. number of values predicted for each
// keypoint.
optional int32 num_values_per_keypoint = 3 [default = 2];
optional BeliefDecoderConfig decoder_config = 4;
}

View File

@ -0,0 +1,56 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_
#define MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_
#include <array>
#include "Eigen/Geometry"
namespace mediapipe {
using Eigen::Map;
using Eigen::Vector2f;
using Eigen::Vector3f;
using Eigen::Vector4f;
using Matrix4f_RM = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
using Matrix3f_RM = Eigen::Matrix<float, 3, 3, Eigen::RowMajor>;
using Face = std::array<int, 4>;
struct SuperPoint {
enum PointSourceType { kPointCloud = 0, kBoundingBox = 1, kSkeleton = 2 };
// The id of the point in the point-cloud
int reference_point;
// The source of the point: point cloud, bounding box, or skeleton.
PointSourceType source;
// The id of the point in set of points in current frame
int id;
// If source is kBoundingBox or kSkeleton, object_id stores the id of the
// object this point belongs to.
int object_id;
// projected u-v value
Vector2f uv;
Vector2f pixel;
// the 3D point
Vector3f point_3d;
// Color
Eigen::Matrix<unsigned char, 4, 1> color;
bool rendered;
};
} // namespace mediapipe
#endif // MEDIAPIPE_GRAPHS_OBJECT_DETECTION_3D_TYPES_H_

View File

@ -0,0 +1,133 @@
# MediaPipe object detection 3D with tracking graph.
# Images on GPU coming into and out of the graph.
input_stream: "input_video"
input_stream: "input_width"
input_stream: "input_height"
output_stream: "output_video"
# Crops the image from the center to the size WIDTHxHEIGHT.
node: {
calculator: "ImageCroppingCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:input_video_4x3"
input_stream: "WIDTH:input_width"
input_stream: "HEIGHT:input_height"
node_options: {
[type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] {
border_mode: BORDER_REPLICATE
}
}
}
# Creates a copy of the input_video stream. At the end of the graph, the
# GlAnimationOverlayCalculator will consume the input_video texture and draw
# on top of it.
node: {
calculator: "GlScalerCalculator"
input_stream: "VIDEO:input_video_4x3"
output_stream: "VIDEO:input_video_copy"
}
# Resamples the images to a fixed frame rate. This calculator is used to
# control the frequency of the subsequent calculators/subgraphs, e.g. to reduce
# power consumption for expensive processing.
node {
calculator: "PacketResamplerCalculator"
input_stream: "DATA:input_video_copy"
output_stream: "DATA:sampled_input_video"
node_options: {
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
frame_rate: 5
}
}
}
node {
calculator: "ObjectronDetectionSubgraphGpu"
input_stream: "IMAGE_GPU:sampled_input_video"
output_stream: "ANNOTATIONS:objects"
}
node {
calculator: "ObjectronTrackingSubgraphGpu"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "IMAGE_GPU:input_video_copy"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
}
# The rendering nodes:
# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly
# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask.
# These models are designed using different tools, so we supply a transformation
# to bring both of them to the Objectron's coordinate system.
# Creates model matrices for the tracked objects given the lifted 3D points.
# This calculator does two things: 1) estimates the object's pose (orientation,
# translation, and scale) from the 3D vertices, and 2) brings the object from
# Objectron's coordinate system to the renderer (OpenGL) coordinate system.
# Since the final goal is to render a mesh file on top of the object, we also
# supply a transformation to bring the mesh into Objectron's coordinate system,
# and rescale the mesh to unit size.
node {
calculator: "AnnotationsToModelMatricesCalculator"
input_stream: "ANNOTATIONS:lifted_tracked_objects"
output_stream: "MODEL_MATRICES:model_matrices"
node_options: {
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
# Re-scale the CAD model to the size of a unit box
model_scale: [0.05, 0.05, 0.05]
# Bring the box CAD model into Objectron's coordinate system. This is
# equivalent to a -pi/2 rotation about the y-axis (right-hand rule):
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY())
model_transformation: [0.0, 0.0, -1.0, 0.0]
model_transformation: [0.0, 1.0, 0.0, 0.0]
model_transformation: [1.0, 0.0, 0.0, 0.0]
model_transformation: [0.0, 0.0, 0.0, 1.0]
}
}
}
# Compute the model matrices for the CAD model of the chair, to be used as an
# occlusion mask. The model will be rendered at the exact same location as the
# bounding box.
node {
calculator: "AnnotationsToModelMatricesCalculator"
input_stream: "ANNOTATIONS:lifted_tracked_objects"
output_stream: "MODEL_MATRICES:mask_model_matrices"
node_options: {
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
# Re-scale the CAD model to the size of a unit box
model_scale: [0.15, 0.1, 0.15]
# Bring the shoe CAD model into Deep Pursuit 3D's coordinate system. This is
# equivalent to a -pi/2 rotation about the x-axis:
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX())
model_transformation: [1.0, 0.0, 0.0, 0.0]
model_transformation: [0.0, 1.0, 0.0, -10.0]
model_transformation: [0.0, 0.0, -1.0, 0.0]
model_transformation: [0.0, 0.0, 0.0, 1.0]
}
}
}
# Render everything together. First we render the 3D bounding box animation,
# then we render the occlusion mask.
node:{
calculator:"GlAnimationOverlayCalculator"
input_stream:"VIDEO:input_video_4x3"
input_stream:"MODEL_MATRICES:model_matrices"
input_stream:"MASK_MODEL_MATRICES:mask_model_matrices"
output_stream:"output_video"
input_side_packet:"TEXTURE:box_texture"
input_side_packet:"ANIMATION_ASSET:box_asset_name"
input_side_packet:"MASK_TEXTURE:obj_texture"
input_side_packet:"MASK_ASSET:obj_asset_name"
node_options: {
[type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
# Output resolution is 480x640 with an aspect ratio of 0.75.
aspect_ratio: 0.75
vertical_fov_degrees: 70.
animation_speed_fps: 25
}
}
}
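The model_transformation entries used above can be double-checked against the Eigen expression quoted in the comments. A standalone sketch (not part of the graph, purely illustrative):
#include <cmath>
#include "Eigen/Geometry"

// Returns the 4x4 homogeneous form of a -pi/2 rotation about the y-axis.
// Its row-major entries are [0 0 -1 0; 0 1 0 0; 1 0 0 0; 0 0 0 1], i.e.
// exactly the model_transformation rows passed to
// AnnotationsToModelMatricesCalculator for the box model above.
Eigen::Matrix4f BoxModelTransformation() {
  Eigen::Matrix4f transform = Eigen::Matrix4f::Identity();
  transform.topLeftCorner<3, 3>() =
      Eigen::AngleAxisf(-static_cast<float>(M_PI) / 2.0f,
                        Eigen::Vector3f::UnitY())
          .toRotationMatrix();
  return transform;
}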

View File

@ -0,0 +1,134 @@
# MediaPipe object detection 3D with tracking graph.
# Images on GPU coming into and out of the graph.
input_stream: "input_video"
input_stream: "input_width"
input_stream: "input_height"
output_stream: "output_video"
# Crops the image from the center to the size WIDTHxHEIGHT.
node: {
calculator: "ImageCroppingCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:input_video_4x3"
input_stream: "WIDTH:input_width"
input_stream: "HEIGHT:input_height"
node_options: {
[type.googleapis.com/mediapipe.ImageCroppingCalculatorOptions] {
border_mode: BORDER_REPLICATE
}
}
}
# Creates a copy of the input_video stream. At the end of the graph, the
# GlAnimationOverlayCalculator will consume the input_video texture and draw
# on top of it.
node: {
calculator: "GlScalerCalculator"
input_stream: "VIDEO:input_video_4x3"
output_stream: "VIDEO:input_video_copy"
}
# Resamples the images to a fixed frame rate. This calculator is used to
# control the frequency of the subsequent calculators/subgraphs, e.g. to reduce
# power consumption for expensive processing.
node {
calculator: "PacketResamplerCalculator"
input_stream: "DATA:input_video_copy"
output_stream: "DATA:sampled_input_video"
node_options: {
[type.googleapis.com/mediapipe.PacketResamplerCalculatorOptions] {
frame_rate: 5
}
}
}
node {
calculator: "ObjectronDetectionSubgraphGpu"
input_stream: "IMAGE_GPU:sampled_input_video"
output_stream: "ANNOTATIONS:objects"
}
node {
calculator: "ObjectronTrackingSubgraphGpu"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "IMAGE_GPU:input_video_copy"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
}
# The rendering nodes:
# We are rendering two meshes: 1) a 3D bounding box, which we overlay directly
# on the texture, and 2) a shoe CAD model, which we use as an occlusion mask.
# These models are designed using different tools, so we supply a transformation
# to bring both of them to the Objectron's coordinate system.
# Creates model matrices for the tracked objects given the lifted 3D points.
# This calculator does two things: 1) estimates the object's pose (orientation,
# translation, and scale) from the 3D vertices, and 2) brings the object from
# Objectron's coordinate system to the renderer (OpenGL) coordinate system.
# Since the final goal is to render a mesh file on top of the object, we also
# supply a transformation to bring the mesh into Objectron's coordinate system,
# and rescale the mesh to unit size.
node {
calculator: "AnnotationsToModelMatricesCalculator"
input_stream: "ANNOTATIONS:lifted_tracked_objects"
output_stream: "MODEL_MATRICES:model_matrices"
node_options: {
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
# Re-scale the CAD model to the size of a unit box
model_scale: [0.05, 0.05, 0.05]
# Bring the box CAD model into Objectron's coordinate system. This is
# equivalent to a -pi/2 rotation about the y-axis (right-hand rule):
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitY())
model_transformation: [0.0, 0.0, -1.0, 0.0]
model_transformation: [0.0, 1.0, 0.0, 0.0]
model_transformation: [1.0, 0.0, 0.0, 0.0]
model_transformation: [0.0, 0.0, 0.0, 1.0]
}
}
}
# Compute the model matrices for the CAD model of the shoe, to be used as an
# occlusion mask. The model will be rendered at the exact same location as the
# bounding box.
node {
calculator: "AnnotationsToModelMatricesCalculator"
input_stream: "ANNOTATIONS:lifted_tracked_objects"
output_stream: "MODEL_MATRICES:mask_model_matrices"
#input_side_packet: "MODEL_SCALE:model_scale"
node_options: {
[type.googleapis.com/mediapipe.AnnotationsToModelMatricesCalculatorOptions] {
# Re-scale the CAD model to the size of a unit box
model_scale: [0.45, 0.25, 0.15]
# Bring the shoe CAD model into Deep Pursuit 3D's coordinate system. This is
# equivalent to a -pi/2 rotation about the x-axis (right-hand rule):
# Eigen::AngleAxisf(-M_PI / 2., Eigen::Vector3f::UnitX())
model_transformation: [1.0, 0.0, 0.0, 0.0]
model_transformation: [0.0, 0.0, 1.0, 0.0]
model_transformation: [0.0, -1.0, 0.0, 0.0]
model_transformation: [0.0, 0.0, 0.0, 1.0]
}
}
}
# Render everything together. First we render the 3D bounding box animation,
# then we render the occlusion mask.
node: {
calculator: "GlAnimationOverlayCalculator"
input_stream: "VIDEO:input_video_4x3"
input_stream: "MODEL_MATRICES:model_matrices"
input_stream: "MASK_MODEL_MATRICES:mask_model_matrices"
output_stream: "output_video"
input_side_packet: "TEXTURE:box_texture"
input_side_packet: "ANIMATION_ASSET:box_asset_name"
input_side_packet: "MASK_TEXTURE:obj_texture"
input_side_packet: "MASK_ASSET:obj_asset_name"
node_options: {
[type.googleapis.com/mediapipe.GlAnimationOverlayCalculatorOptions] {
# Output resolution is 480x640 with an aspect ratio of 0.75.
aspect_ratio: 0.75
vertical_fov_degrees: 70.
animation_speed_fps: 25
}
}
}
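The WIDTH/HEIGHT side inputs to the ImageCroppingCalculator above are expected to describe a 3:4 (portrait) center crop of the camera frame, matching the renderer's aspect_ratio of 0.75. A hypothetical helper (not part of this commit) that an application could use to derive those values from the camera frame size:
#include <utility>

// Largest centered crop with width / height == target_aspect that fits the frame.
std::pair<int, int> CropSizeForAspect(int frame_width, int frame_height,
                                      float target_aspect /* e.g. 0.75f */) {
  int height = frame_height;
  int width = static_cast<int>(height * target_aspect);
  if (width > frame_width) {
    width = frame_width;
    height = static_cast<int>(width / target_aspect);
  }
  return {width, height};
}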

View File

@ -0,0 +1,52 @@
# Copyright 2020 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load(
"//mediapipe/framework/tool:mediapipe_graph.bzl",
"mediapipe_simple_subgraph",
)
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
mediapipe_simple_subgraph(
name = "objectron_detection_gpu",
graph = "objectron_detection_gpu.pbtxt",
register_as = "ObjectronDetectionSubgraphGpu",
deps = [
"//mediapipe/calculators/image:image_transformation_calculator",
"//mediapipe/calculators/tflite:tflite_converter_calculator",
"//mediapipe/calculators/tflite:tflite_custom_op_resolver_calculator",
"//mediapipe/calculators/tflite:tflite_inference_calculator",
"//mediapipe/graphs/object_detection_3d/calculators:tflite_tensors_to_objects_calculator",
],
)
mediapipe_simple_subgraph(
name = "objectron_tracking_gpu",
graph = "objectron_tracking_gpu.pbtxt",
register_as = "ObjectronTrackingSubgraphGpu",
deps = [
"//mediapipe/calculators/image:image_transformation_calculator",
"//mediapipe/calculators/video:box_tracker_calculator",
"//mediapipe/calculators/video:flow_packager_calculator",
"//mediapipe/calculators/video:motion_analysis_calculator",
"//mediapipe/framework/stream_handler:sync_set_input_stream_handler",
"//mediapipe/gpu:gpu_buffer_to_image_frame_calculator",
"//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_to_timed_box_list_calculator",
"//mediapipe/graphs/object_detection_3d/calculators:frame_annotation_tracker_calculator",
"//mediapipe/graphs/object_detection_3d/calculators:lift_2d_frame_annotation_to_3d_calculator",
],
)

View File

@ -0,0 +1,81 @@
# MediaPipe Objectron detection gpu subgraph
type: "ObjectronDetectionSubgraphGpu"
input_stream: "IMAGE_GPU:input_video"
output_stream: "ANNOTATIONS:objects"
# Transforms the input image on GPU to a 480x640 image. To scale the input
# image, the scale_mode option is set to FIT to preserve the aspect ratio,
# resulting in potential letterboxing in the transformed image.
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:transformed_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 480
output_height: 640
scale_mode: FIT
}
}
}
# Converts the transformed input image on GPU into an image tensor stored as a
# TfLiteTensor.
node {
calculator: "TfLiteConverterCalculator"
input_stream: "IMAGE_GPU:transformed_input_video"
output_stream: "TENSORS_GPU:image_tensor"
}
# Generates a single side packet containing a TensorFlow Lite op resolver that
# supports custom ops needed by the model used in this graph.
node {
calculator: "TfLiteCustomOpResolverCalculator"
output_side_packet: "opresolver"
node_options: {
[type.googleapis.com/mediapipe.TfLiteCustomOpResolverCalculatorOptions] {
use_gpu: true
}
}
}
# Runs a TensorFlow Lite model on GPU that takes an image tensor and outputs a
# vector of tensors representing, for instance, detection boxes/keypoints and
# scores.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS_GPU:image_tensor"
output_stream: "TENSORS:detection_tensors"
input_side_packet: "CUSTOM_OP_RESOLVER:opresolver"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "object_detection_3d.tflite"
}
}
}
# Decodes the model's output tensor (the heatmap and the distance fields) to 2D
# keypoints. There are nine 2D keypoints: one center keypoint and eight vertices
# for the 3D bounding box. The calculator parameters determine the decoder's
# sensitivity.
node {
calculator: "TfLiteTensorsToObjectsCalculator"
input_stream: "TENSORS:detection_tensors"
output_stream: "ANNOTATIONS:objects"
node_options: {
[type.googleapis.com/mediapipe.TfLiteTensorsToObjectsCalculatorOptions] {
num_classes: 1
num_keypoints: 9
decoder_config {
heatmap_threshold: 0.6
local_max_distance: 2
offset_scale_coef: 1.0
voting_radius: 2
voting_allowance: 1
voting_threshold: 0.2
}
}
}
}
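The Decoder implementation itself is not part of this diff. As a toy illustration only, heatmap_threshold and local_max_distance configure the kind of thresholded local-maximum search sketched below (the voting_* parameters further refine the decoded keypoints and are not modeled here):
#include <vector>
#include "mediapipe/framework/port/opencv_core_inc.h"

std::vector<cv::Point> ExtractPeaks(const cv::Mat& heatmap /* CV_32FC1 */,
                                    float threshold, int radius) {
  std::vector<cv::Point> peaks;
  for (int y = 0; y < heatmap.rows; ++y) {
    for (int x = 0; x < heatmap.cols; ++x) {
      const float value = heatmap.at<float>(y, x);
      if (value < threshold) continue;
      // Keep (x, y) only if it is the maximum within a (2 * radius + 1)^2 window.
      bool is_local_max = true;
      for (int dy = -radius; dy <= radius && is_local_max; ++dy) {
        for (int dx = -radius; dx <= radius && is_local_max; ++dx) {
          const int ny = y + dy;
          const int nx = x + dx;
          if (ny < 0 || nx < 0 || ny >= heatmap.rows || nx >= heatmap.cols) continue;
          if (heatmap.at<float>(ny, nx) > value) is_local_max = false;
        }
      }
      if (is_local_max) peaks.emplace_back(x, y);
    }
  }
  return peaks;
}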

View File

@ -0,0 +1,170 @@
# MediaPipe Objectron tracking gpu subgraph
type: "ObjectronTrackingSubgraphGpu"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "IMAGE_GPU:input_video"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
# Converts the detected keypoints to Boxes, used by the tracking subgraph.
node {
calculator: "FrameAnnotationToTimedBoxListCalculator"
input_stream: "FRAME_ANNOTATION:objects"
output_stream: "BOXES:start_pos"
}
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE_GPU:input_video"
output_stream: "IMAGE_GPU:downscaled_input_video"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 240
output_height: 320
}
}
}
# Converts the GPU buffer to an ImageFrame so that tracking can run on the CPU.
node: {
calculator: "GpuBufferToImageFrameCalculator"
input_stream: "downscaled_input_video"
output_stream: "downscaled_input_video_cpu"
}
# Performs motion analysis on an incoming video stream.
node: {
calculator: "MotionAnalysisCalculator"
input_stream: "VIDEO:downscaled_input_video_cpu"
output_stream: "CAMERA:camera_motion"
output_stream: "FLOW:region_flow"
node_options: {
[type.googleapis.com/mediapipe.MotionAnalysisCalculatorOptions]: {
analysis_options {
analysis_policy: ANALYSIS_POLICY_CAMERA_MOBILE
flow_options {
fast_estimation_min_block_size: 100
top_inlier_sets: 1
frac_inlier_error_threshold: 3e-3
downsample_mode: DOWNSAMPLE_TO_INPUT_SIZE
verification_distance: 5.0
verify_long_feature_acceleration: true
verify_long_feature_trigger_ratio: 0.1
tracking_options {
max_features: 500
adaptive_extraction_levels: 2
min_eig_val_settings {
adaptive_lowest_quality_level: 2e-4
}
klt_tracker_implementation: KLT_OPENCV
}
}
}
}
}
}
# Reads optical flow fields defined in
# mediapipe/framework/formats/motion/optical_flow_field.h,
# and returns a VideoFrame with 2 channels (v_x and v_y), each channel quantized
# to 0-255.
node: {
calculator: "FlowPackagerCalculator"
input_stream: "FLOW:region_flow"
input_stream: "CAMERA:camera_motion"
output_stream: "TRACKING:tracking_data"
node_options: {
[type.googleapis.com/mediapipe.FlowPackagerCalculatorOptions]: {
flow_packager_options: {
binary_tracking_data_support: false
}
}
}
}
# Tracks box positions over time.
node: {
calculator: "BoxTrackerCalculator"
input_stream: "TRACKING:tracking_data"
input_stream: "TRACK_TIME:input_video"
input_stream: "START_POS:start_pos"
input_stream: "CANCEL_OBJECT_ID:cancel_object_id"
input_stream_info: {
tag_index: "CANCEL_OBJECT_ID"
back_edge: true
}
output_stream: "BOXES:boxes"
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "TRACKING"
tag_index: "TRACK_TIME"
}
sync_set {
tag_index: "START_POS"
}
sync_set {
tag_index: "CANCEL_OBJECT_ID"
}
}
}
}
node_options: {
[type.googleapis.com/mediapipe.BoxTrackerCalculatorOptions]: {
tracker_options: {
track_step_options {
track_object_and_camera: true
tracking_degrees: TRACKING_DEGREE_OBJECT_ROTATION_SCALE
inlier_spring_force: 0.0
static_motion_temporal_ratio: 3e-2
}
}
visualize_tracking_data: false
streaming_track_data_cache_size: 100
}
}
}
# Consolidates tracking and detection results.
node {
calculator: "FrameAnnotationTrackerCalculator"
input_stream: "FRAME_ANNOTATION:objects"
input_stream: "TRACKED_BOXES:boxes"
output_stream: "TRACKED_FRAME_ANNOTATION:tracked_objects"
output_stream: "CANCEL_OBJECT_ID:cancel_object_id"
node_options: {
[type.googleapis.com/mediapipe.FrameAnnotationTrackerCalculatorOptions] {
img_width: 240
img_height: 320
iou_threshold: 0.1
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler"
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "FRAME_ANNOTATION"
}
sync_set {
tag_index: "TRACKED_BOXES"
}
}
}
}
}
# Lifts the tracked 2D keypoints to 3D using the EPnP algorithm.
node {
calculator: "Lift2DFrameAnnotationTo3DCalculator"
input_stream: "FRAME_ANNOTATION:tracked_objects"
output_stream: "LIFTED_FRAME_ANNOTATION:lifted_tracked_objects"
}

View File

@ -67,15 +67,19 @@ public class CameraXPreviewHelper extends CameraHelper {
  private int cameraTimestampSource = CameraCharacteristics.SENSOR_INFO_TIMESTAMP_SOURCE_UNKNOWN;
  @Override
  @SuppressWarnings("RestrictTo") // See b/132705545.
  public void startCamera(
      Activity context, CameraFacing cameraFacing, SurfaceTexture surfaceTexture) {
    startCamera(context, cameraFacing, surfaceTexture, TARGET_SIZE);
  }
  public void startCamera(
      Activity context, CameraFacing cameraFacing, SurfaceTexture surfaceTexture, Size targetSize) {
    LensFacing cameraLensFacing =
        cameraFacing == CameraHelper.CameraFacing.FRONT ? LensFacing.FRONT : LensFacing.BACK;
    PreviewConfig previewConfig =
        new PreviewConfig.Builder()
            .setLensFacing(cameraLensFacing)
            .setTargetResolution(targetSize)
            .build();
    preview = new Preview(previewConfig);
@ -110,7 +114,6 @@ public class CameraXPreviewHelper extends CameraHelper {
        }
      });
    CameraX.bindToLifecycle(/*lifecycleOwner=*/ (LifecycleOwner) context, preview);
  }
  @Override
@ -210,6 +213,10 @@ public class CameraXPreviewHelper extends CameraHelper {
    return focalLengthPixels;
  }
  public Size getFrameSize() {
    return frameSize;
  }
  // Computes the focal length of the camera in pixels based on lens and sensor properties.
  private float calculateFocalLengthInPixels() {
    // Focal length of the camera in millimeters.

Binary file not shown.

Binary file not shown.

View File

@ -41,3 +41,37 @@ cc_library(
"@org_tensorflow//tensorflow/lite/kernels:builtin_ops", "@org_tensorflow//tensorflow/lite/kernels:builtin_ops",
], ],
) )
cc_library(
name = "tensor_buffer",
srcs = ["tensor_buffer.cc"],
hdrs = ["tensor_buffer.h"],
deps = [
"@org_tensorflow//tensorflow/lite:framework",
"@com_google_absl//absl/memory",
"//mediapipe/framework:port",
] + select({
"//mediapipe/gpu:disable_gpu": [],
"//mediapipe:ios": [
"//mediapipe/gpu:MPPMetalUtil",
"//mediapipe/gpu:gl_base",
],
"//conditions:default": [
"@org_tensorflow//tensorflow/lite/delegates/gpu/gl:gl_buffer",
"//mediapipe/gpu:gl_base",
"//mediapipe/gpu:gl_context",
],
}),
)
cc_test(
name = "tensor_buffer_test",
srcs = ["tensor_buffer_test.cc"],
deps = [
":tensor_buffer",
"//mediapipe/framework/port:gtest_main",
] + select({
"//mediapipe/gpu:disable_gpu": [],
"//conditions:default": [],
}),
)

View File

@ -0,0 +1,43 @@
#include "mediapipe/util/tflite/tensor_buffer.h"
namespace mediapipe {
TensorBuffer::TensorBuffer() {}
TensorBuffer::~TensorBuffer() { uses_gpu_ = false; }
TensorBuffer::TensorBuffer(TfLiteTensor& tensor) {
cpu_ = tensor;
uses_gpu_ = false;
}
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
TensorBuffer::TensorBuffer(std::shared_ptr<tflite::gpu::gl::GlBuffer> tensor) {
gpu_ = std::move(tensor);
uses_gpu_ = true;
}
// static
std::shared_ptr<tflite::gpu::gl::GlBuffer> TensorBuffer::CreateGlBuffer(
std::shared_ptr<mediapipe::GlContext> context) {
std::shared_ptr<tflite::gpu::gl::GlBuffer> ptr(
new tflite::gpu::gl::GlBuffer, [context](tflite::gpu::gl::GlBuffer* ref) {
if (context) {
context->Run([ref]() {
if (ref) delete ref;
});
} else {
if (ref) delete ref; // No context provided.
}
});
return ptr;
}
#endif  // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
#if defined(MEDIAPIPE_IOS)
TensorBuffer::TensorBuffer(id<MTLBuffer> tensor) {
gpu_ = tensor;
uses_gpu_ = true;
}
#endif // MEDIAPIPE_IOS
} // namespace mediapipe
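A short usage sketch (assumes an already-initialized std::shared_ptr<mediapipe::GlContext> named gl_context and a build with GL compute enabled): the deleter installed by CreateGlBuffer() hops back onto that context's GL thread, so the buffer is always destroyed with the correct GL context current, regardless of which thread drops the last reference.
mediapipe::TensorBuffer MakeGpuTensor(
    std::shared_ptr<mediapipe::GlContext> gl_context) {
  // The returned GlBuffer will be deleted on gl_context's GL thread.
  auto buffer = mediapipe::TensorBuffer::CreateGlBuffer(gl_context);
  // ... create and fill the underlying GL buffer on the GL thread ...
  return mediapipe::TensorBuffer(buffer);  // Wraps the GPU-backed tensor.
}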

Some files were not shown because too many files have changed in this diff.