381 lines
14 KiB
C++
381 lines
14 KiB
C++
// Copyright 2019 The MediaPipe Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <cmath>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "mediapipe/calculators/tflite/ssd_anchors_calculator.pb.h"
|
|
#include "mediapipe/framework/calculator_framework.h"
|
|
#include "mediapipe/framework/formats/object_detection/anchor.pb.h"
|
|
#include "mediapipe/framework/port/ret_check.h"
|
|
|
|
namespace mediapipe {
|
|
|
|
namespace {
|
|
|
|
struct MultiScaleAnchorInfo {
|
|
int32 level;
|
|
std::vector<float> aspect_ratios;
|
|
std::vector<float> scales;
|
|
std::pair<float, float> base_anchor_size;
|
|
std::pair<float, float> anchor_stride;
|
|
};
|
|
|
|
struct FeatureMapDim {
|
|
int height;
|
|
int width;
|
|
};
|
|
|
|
float CalculateScale(float min_scale, float max_scale, int stride_index,
|
|
int num_strides) {
|
|
if (num_strides == 1) {
|
|
return (min_scale + max_scale) * 0.5f;
|
|
} else {
|
|
return min_scale +
|
|
(max_scale - min_scale) * 1.0 * stride_index / (num_strides - 1.0f);
|
|
}
|
|
}
|
|
|
|
int GetNumLayers(const SsdAnchorsCalculatorOptions& options) {
|
|
if (options.multiscale_anchor_generation()) {
|
|
return (options.max_level() - options.min_level() + 1);
|
|
}
|
|
return options.num_layers();
|
|
}
|
|
|
|
FeatureMapDim GetFeatureMapDimensions(
|
|
const SsdAnchorsCalculatorOptions& options, int index) {
|
|
FeatureMapDim feature_map_dims;
|
|
if (options.feature_map_height_size()) {
|
|
feature_map_dims.height = options.feature_map_height(index);
|
|
feature_map_dims.width = options.feature_map_width(index);
|
|
} else {
|
|
const int stride = options.strides(index);
|
|
feature_map_dims.height =
|
|
std::ceil(1.0f * options.input_size_height() / stride);
|
|
feature_map_dims.width =
|
|
std::ceil(1.0f * options.input_size_width() / stride);
|
|
}
|
|
return feature_map_dims;
|
|
}
|
|
|
|
// Although we have stride for both x and y, only one value is used for offset
|
|
// calculation. See
|
|
// tensorflow_models/object_detection/anchor_generators/multiscale_grid_anchor_generator.py;l=121
|
|
std::pair<float, float> GetMultiScaleAnchorOffset(
|
|
const SsdAnchorsCalculatorOptions& options, const float stride,
|
|
const int level) {
|
|
std::pair<float, float> result(0., 0.);
|
|
int denominator = std::pow(2, level);
|
|
if (options.input_size_height() % denominator == 0 ||
|
|
options.input_size_height() == 1) {
|
|
result.first = stride / 2.0;
|
|
}
|
|
if (options.input_size_width() % denominator == 0 ||
|
|
options.input_size_width() == 1) {
|
|
result.second = stride / 2.0;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void NormalizeAnchor(const int input_height, const int input_width,
|
|
Anchor* anchor) {
|
|
anchor->set_h(anchor->h() / (float)input_height);
|
|
anchor->set_w(anchor->w() / (float)input_width);
|
|
anchor->set_y_center(anchor->y_center() / (float)input_height);
|
|
anchor->set_x_center(anchor->x_center() / (float)input_width);
|
|
}
|
|
|
|
Anchor CalculateAnchorBox(const int y_center, const int x_center,
|
|
const float scale, const float aspect_ratio,
|
|
const std::pair<float, float> base_anchor_size,
|
|
// y-height first
|
|
const std::pair<float, float> anchor_stride,
|
|
const std::pair<float, float> anchor_offset) {
|
|
Anchor result;
|
|
float ratio_sqrt = std::sqrt(aspect_ratio);
|
|
result.set_h(scale * base_anchor_size.first / ratio_sqrt);
|
|
result.set_w(scale * ratio_sqrt * base_anchor_size.second);
|
|
result.set_y_center(y_center * anchor_stride.first + anchor_offset.first);
|
|
result.set_x_center(x_center * anchor_stride.second + anchor_offset.second);
|
|
return result;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Generate anchors for SSD object detection model.
|
|
// Output:
|
|
// ANCHORS: A list of anchors. Model generates predictions based on the
|
|
// offsets of these anchors.
|
|
//
|
|
// Usage example:
|
|
// node {
|
|
// calculator: "SsdAnchorsCalculator"
|
|
// output_side_packet: "anchors"
|
|
// options {
|
|
// [mediapipe.SsdAnchorsCalculatorOptions.ext] {
|
|
// num_layers: 6
|
|
// min_scale: 0.2
|
|
// max_scale: 0.95
|
|
// input_size_height: 300
|
|
// input_size_width: 300
|
|
// anchor_offset_x: 0.5
|
|
// anchor_offset_y: 0.5
|
|
// strides: 16
|
|
// strides: 32
|
|
// strides: 64
|
|
// strides: 128
|
|
// strides: 256
|
|
// strides: 512
|
|
// aspect_ratios: 1.0
|
|
// aspect_ratios: 2.0
|
|
// aspect_ratios: 0.5
|
|
// aspect_ratios: 3.0
|
|
// aspect_ratios: 0.3333
|
|
// reduce_boxes_in_lowest_layer: true
|
|
// }
|
|
// }
|
|
// }
|
|
class SsdAnchorsCalculator : public CalculatorBase {
|
|
public:
|
|
static absl::Status GetContract(CalculatorContract* cc) {
|
|
cc->OutputSidePackets().Index(0).Set<std::vector<Anchor>>();
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status Open(CalculatorContext* cc) override {
|
|
cc->SetOffset(TimestampDiff(0));
|
|
|
|
const SsdAnchorsCalculatorOptions& options =
|
|
cc->Options<SsdAnchorsCalculatorOptions>();
|
|
|
|
auto anchors = absl::make_unique<std::vector<Anchor>>();
|
|
if (!options.fixed_anchors().empty()) {
|
|
// Check fields for generating anchors are not set.
|
|
if (options.has_input_size_height() || options.has_input_size_width() ||
|
|
options.has_min_scale() || options.has_max_scale() ||
|
|
options.has_num_layers() || options.multiscale_anchor_generation()) {
|
|
return absl::InvalidArgumentError(
|
|
"Fixed anchors are provided, but fields are set for generating "
|
|
"anchors. When fixed anchors are set, fields for generating "
|
|
"anchors must not be set.");
|
|
}
|
|
anchors->assign(options.fixed_anchors().begin(),
|
|
options.fixed_anchors().end());
|
|
cc->OutputSidePackets().Index(0).Set(Adopt(anchors.release()));
|
|
return absl::OkStatus();
|
|
}
|
|
MP_RETURN_IF_ERROR(GenerateAnchors(anchors.get(), options));
|
|
cc->OutputSidePackets().Index(0).Set(Adopt(anchors.release()));
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status Process(CalculatorContext* cc) override {
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
private:
|
|
static absl::Status GenerateAnchors(
|
|
std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options);
|
|
|
|
static absl::Status GenerateMultiScaleAnchors(
|
|
std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options);
|
|
};
|
|
REGISTER_CALCULATOR(SsdAnchorsCalculator);
|
|
|
|
// Generates grid anchors on the fly corresponding to multiple CNN layers as
|
|
// described in:
|
|
// "Focal Loss for Dense Object Detection" (https://arxiv.org/abs/1708.02002)
|
|
// T.-Y. Lin, P. Goyal, R. Girshick, K. He, P. Dollar
|
|
absl::Status SsdAnchorsCalculator::GenerateMultiScaleAnchors(
|
|
std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options) {
|
|
std::vector<MultiScaleAnchorInfo> anchor_infos;
|
|
for (int i = options.min_level(); i <= options.max_level(); ++i) {
|
|
MultiScaleAnchorInfo current_anchor_info;
|
|
// level
|
|
current_anchor_info.level = i;
|
|
// aspect_ratios
|
|
for (const float aspect_ratio : options.aspect_ratios()) {
|
|
current_anchor_info.aspect_ratios.push_back(aspect_ratio);
|
|
}
|
|
|
|
// scale
|
|
for (int i = 0; i < options.scales_per_octave(); ++i) {
|
|
current_anchor_info.scales.push_back(
|
|
std::pow(2.0, (double)i / (double)options.scales_per_octave()));
|
|
}
|
|
|
|
// anchor stride
|
|
float anchor_stride = std::pow(2.0, i);
|
|
current_anchor_info.anchor_stride =
|
|
std::make_pair(anchor_stride, anchor_stride);
|
|
|
|
// base_anchor_size
|
|
current_anchor_info.base_anchor_size =
|
|
std::make_pair(anchor_stride * options.anchor_scale(),
|
|
anchor_stride * options.anchor_scale());
|
|
anchor_infos.push_back(current_anchor_info);
|
|
}
|
|
|
|
for (unsigned int i = 0; i < anchor_infos.size(); ++i) {
|
|
FeatureMapDim dimensions = GetFeatureMapDimensions(options, i);
|
|
for (int y = 0; y < dimensions.height; ++y) {
|
|
for (int x = 0; x < dimensions.width; ++x) {
|
|
// loop over combination of scale and aspect ratio
|
|
for (unsigned int j = 0; j < anchor_infos[i].aspect_ratios.size();
|
|
++j) {
|
|
for (unsigned int k = 0; k < anchor_infos[i].scales.size(); ++k) {
|
|
Anchor anchor = CalculateAnchorBox(
|
|
/*y_center=*/y, /*x_center=*/x, anchor_infos[i].scales[k],
|
|
anchor_infos[i].aspect_ratios[j],
|
|
anchor_infos[i].base_anchor_size,
|
|
/*anchor_stride=*/anchor_infos[i].anchor_stride,
|
|
/*anchor_offset=*/
|
|
GetMultiScaleAnchorOffset(options,
|
|
anchor_infos[i].anchor_stride.first,
|
|
anchor_infos[i].level));
|
|
if (options.normalize_coordinates()) {
|
|
NormalizeAnchor(options.input_size_height(),
|
|
options.input_size_width(), &anchor);
|
|
}
|
|
anchors->push_back(anchor);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status SsdAnchorsCalculator::GenerateAnchors(
|
|
std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options) {
|
|
// Verify the options.
|
|
if (!options.feature_map_height_size() && !options.strides_size()) {
|
|
return absl::InvalidArgumentError(
|
|
"Both feature map shape and strides are missing. Must provide either "
|
|
"one.");
|
|
}
|
|
const int kNumLayers = GetNumLayers(options);
|
|
|
|
if (options.feature_map_height_size()) {
|
|
if (options.strides_size()) {
|
|
LOG(ERROR) << "Found feature map shapes. Strides will be ignored.";
|
|
}
|
|
CHECK_EQ(options.feature_map_height_size(), kNumLayers);
|
|
CHECK_EQ(options.feature_map_height_size(),
|
|
options.feature_map_width_size());
|
|
} else {
|
|
CHECK_EQ(options.strides_size(), kNumLayers);
|
|
}
|
|
|
|
if (options.multiscale_anchor_generation()) {
|
|
return GenerateMultiScaleAnchors(anchors, options);
|
|
}
|
|
|
|
int layer_id = 0;
|
|
while (layer_id < options.num_layers()) {
|
|
std::vector<float> anchor_height;
|
|
std::vector<float> anchor_width;
|
|
std::vector<float> aspect_ratios;
|
|
std::vector<float> scales;
|
|
|
|
// For same strides, we merge the anchors in the same order.
|
|
int last_same_stride_layer = layer_id;
|
|
while (last_same_stride_layer < options.strides_size() &&
|
|
options.strides(last_same_stride_layer) ==
|
|
options.strides(layer_id)) {
|
|
const float scale =
|
|
CalculateScale(options.min_scale(), options.max_scale(),
|
|
last_same_stride_layer, options.strides_size());
|
|
if (last_same_stride_layer == 0 &&
|
|
options.reduce_boxes_in_lowest_layer()) {
|
|
// For first layer, it can be specified to use predefined anchors.
|
|
aspect_ratios.push_back(1.0);
|
|
aspect_ratios.push_back(2.0);
|
|
aspect_ratios.push_back(0.5);
|
|
scales.push_back(0.1);
|
|
scales.push_back(scale);
|
|
scales.push_back(scale);
|
|
} else {
|
|
for (int aspect_ratio_id = 0;
|
|
aspect_ratio_id < options.aspect_ratios_size();
|
|
++aspect_ratio_id) {
|
|
aspect_ratios.push_back(options.aspect_ratios(aspect_ratio_id));
|
|
scales.push_back(scale);
|
|
}
|
|
if (options.interpolated_scale_aspect_ratio() > 0.0) {
|
|
const float scale_next =
|
|
last_same_stride_layer == options.strides_size() - 1
|
|
? 1.0f
|
|
: CalculateScale(options.min_scale(), options.max_scale(),
|
|
last_same_stride_layer + 1,
|
|
options.strides_size());
|
|
scales.push_back(std::sqrt(scale * scale_next));
|
|
aspect_ratios.push_back(options.interpolated_scale_aspect_ratio());
|
|
}
|
|
}
|
|
last_same_stride_layer++;
|
|
}
|
|
|
|
for (int i = 0; i < aspect_ratios.size(); ++i) {
|
|
const float ratio_sqrts = std::sqrt(aspect_ratios[i]);
|
|
anchor_height.push_back(scales[i] / ratio_sqrts);
|
|
anchor_width.push_back(scales[i] * ratio_sqrts);
|
|
}
|
|
|
|
int feature_map_height = 0;
|
|
int feature_map_width = 0;
|
|
if (options.feature_map_height_size()) {
|
|
feature_map_height = options.feature_map_height(layer_id);
|
|
feature_map_width = options.feature_map_width(layer_id);
|
|
} else {
|
|
const int stride = options.strides(layer_id);
|
|
feature_map_height =
|
|
std::ceil(1.0f * options.input_size_height() / stride);
|
|
feature_map_width = std::ceil(1.0f * options.input_size_width() / stride);
|
|
}
|
|
|
|
for (int y = 0; y < feature_map_height; ++y) {
|
|
for (int x = 0; x < feature_map_width; ++x) {
|
|
for (int anchor_id = 0; anchor_id < anchor_height.size(); ++anchor_id) {
|
|
// TODO: Support specifying anchor_offset_x, anchor_offset_y.
|
|
const float x_center =
|
|
(x + options.anchor_offset_x()) * 1.0f / feature_map_width;
|
|
const float y_center =
|
|
(y + options.anchor_offset_y()) * 1.0f / feature_map_height;
|
|
|
|
Anchor new_anchor;
|
|
new_anchor.set_x_center(x_center);
|
|
new_anchor.set_y_center(y_center);
|
|
|
|
if (options.fixed_anchor_size()) {
|
|
new_anchor.set_w(1.0f);
|
|
new_anchor.set_h(1.0f);
|
|
} else {
|
|
new_anchor.set_w(anchor_width[anchor_id]);
|
|
new_anchor.set_h(anchor_height[anchor_id]);
|
|
}
|
|
anchors->push_back(new_anchor);
|
|
}
|
|
}
|
|
}
|
|
layer_id = last_same_stride_layer;
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
} // namespace mediapipe
|