From 01c64082f133f0703113ea0292332d5a86257a8b Mon Sep 17 00:00:00 2001
From: MediaPipe Team <mediapipe-team@google.com>
Date: Thu, 23 Feb 2023 14:10:21 -0800
Subject: [PATCH] ImageToTensorCalculator decides the output tensor size at
 runtime from the input image size.

PiperOrigin-RevId: 511882195
---
 mediapipe/calculators/tensor/BUILD            |  1 +
 .../tensor/image_to_tensor_calculator.cc      | 10 +--
 .../tensor/image_to_tensor_calculator.proto   |  2 +
 .../tensor/image_to_tensor_calculator_test.cc | 65 ++++++++++++-------
 .../tensor/image_to_tensor_utils.h            | 25 ++++---
 .../tensor/image_to_tensor_utils_test.cc      | 27 ++++++--
 6 files changed, 88 insertions(+), 42 deletions(-)
diff --git a/mediapipe/calculators/tensor/BUILD b/mediapipe/calculators/tensor/BUILD
index 1ac5644c1..c66665b68 100644
--- a/mediapipe/calculators/tensor/BUILD
+++ b/mediapipe/calculators/tensor/BUILD
@@ -1033,6 +1033,7 @@ cc_test(
         "@com_google_absl//absl/flags:flag",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
diff --git a/mediapipe/calculators/tensor/image_to_tensor_calculator.cc b/mediapipe/calculators/tensor/image_to_tensor_calculator.cc
index 5af4cdb60..499b497b0 100644
--- a/mediapipe/calculators/tensor/image_to_tensor_calculator.cc
+++ b/mediapipe/calculators/tensor/image_to_tensor_calculator.cc
@@ -195,8 +195,9 @@ class ImageToTensorCalculator : public Node {
 #endif  // MEDIAPIPE_DISABLE_GPU
 
     RotatedRect roi = GetRoi(image->width(), image->height(), norm_rect);
-    ASSIGN_OR_RETURN(auto padding, PadRoi(options_.output_tensor_width(),
-                                          options_.output_tensor_height(),
+    const int tensor_width = params_.output_width.value_or(image->width());
+    const int tensor_height = params_.output_height.value_or(image->height());
+    ASSIGN_OR_RETURN(auto padding, PadRoi(tensor_width, tensor_height,
                                           options_.keep_aspect_ratio(), &roi));
     if (kOutLetterboxPadding(cc).IsConnected()) {
       kOutLetterboxPadding(cc).Send(padding);
@@ -214,9 +215,8 @@ class ImageToTensorCalculator : public Node {
 
     Tensor::ElementType output_tensor_type =
         GetOutputTensorType(image->UsesGpu(), params_);
-    Tensor tensor(output_tensor_type,
-                  {1, params_.output_height, params_.output_width,
-                   GetNumOutputChannels(*image)});
+    Tensor tensor(output_tensor_type, {1, tensor_height, tensor_width,
+                                       GetNumOutputChannels(*image)});
     MP_RETURN_IF_ERROR((image->UsesGpu() ? gpu_converter_ : cpu_converter_)
                            ->Convert(*image, roi, params_.range_min,
                                      params_.range_max,
diff --git a/mediapipe/calculators/tensor/image_to_tensor_calculator.proto b/mediapipe/calculators/tensor/image_to_tensor_calculator.proto
index 780ee8021..183f933df 100644
--- a/mediapipe/calculators/tensor/image_to_tensor_calculator.proto
+++ b/mediapipe/calculators/tensor/image_to_tensor_calculator.proto
@@ -54,6 +54,8 @@ message ImageToTensorCalculatorOptions {
     BORDER_REPLICATE = 2;
   }
 
+  // The width and height of the output tensor. If not set, the output tensor
+  // uses the width/height of the input image.
   optional int32 output_tensor_width = 1;
   optional int32 output_tensor_height = 2;
 
diff --git a/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc b/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc
index ceb1fc502..ed7d93886 100644
--- a/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc
+++ b/mediapipe/calculators/tensor/image_to_tensor_calculator_test.cc
@@ -13,10 +13,13 @@
 // limitations under the License.
 
 #include <cmath>
+#include <optional>
+#include <string>
 #include <vector>
 
 #include "absl/flags/flag.h"
 #include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/substitute.h"
 #include "mediapipe/calculators/tensor/image_to_tensor_converter.h"
 #include "mediapipe/calculators/tensor/image_to_tensor_utils.h"
@@ -51,13 +54,12 @@ std::string GetFilePath(absl::string_view filename) {
 
 // Image to tensor test template.
 // No processing/assertions should be done after the function is invoked.
-void RunTestWithInputImagePacket(const Packet& input_image_packet,
-                                 cv::Mat expected_result, float range_min,
-                                 float range_max, int tensor_width,
-                                 int tensor_height, bool keep_aspect,
-                                 absl::optional<BorderMode> border_mode,
-                                 const mediapipe::NormalizedRect& roi,
-                                 bool output_int_tensor) {
+void RunTestWithInputImagePacket(
+    const Packet& input_image_packet, cv::Mat expected_result, float range_min,
+    float range_max, std::optional<int> tensor_width,
+    std::optional<int> tensor_height, bool keep_aspect,
+    absl::optional<BorderMode> border_mode,
+    const mediapipe::NormalizedRect& roi, bool output_int_tensor) {
   std::string border_mode_str;
   if (border_mode) {
     switch (*border_mode) {
@@ -93,8 +95,9 @@ void RunTestWithInputImagePacket(const Packet& input_image_packet,
               })",
                                            range_min, range_max);
   }
-  auto graph_config = mediapipe::ParseTextProtoOrDie<CalculatorGraphConfig>(
-      absl::Substitute(R"(
+  auto graph_config =
+      mediapipe::ParseTextProtoOrDie<CalculatorGraphConfig>(absl::Substitute(
+          R"(
         input_stream: "input_image"
         input_stream: "roi"
         node {
@@ -104,8 +107,8 @@ void RunTestWithInputImagePacket(const Packet& input_image_packet,
           output_stream: "TENSORS:tensor"
           options {
             [mediapipe.ImageToTensorCalculatorOptions.ext] {
-              output_tensor_width: $0
-              output_tensor_height: $1
+              $0 # output tensor width
+              $1 # output tensor height
               keep_aspect_ratio: $2
               $3 # output range
               $4 # border mode
@@ -113,11 +116,16 @@ void RunTestWithInputImagePacket(const Packet& input_image_packet,
           }
         }
         )",
-                       /*$0=*/tensor_width,
-                       /*$1=*/tensor_height,
-                       /*$2=*/keep_aspect ? "true" : "false",
-                       /*$3=*/output_tensor_range,
-                       /*$4=*/border_mode_str));
+          /*$0=*/tensor_width.has_value()
+              ? absl::StrFormat("output_tensor_width: %d", tensor_width.value())
+              : "",
+          /*$1=*/tensor_height.has_value()
+              ? absl::StrFormat("output_tensor_height: %d",
+                                tensor_height.value())
+              : "",
+          /*$2=*/keep_aspect ? "true" : "false",
+          /*$3=*/output_tensor_range,
+          /*$4=*/border_mode_str));
 
   std::vector<Packet> output_packets;
   tool::AddVectorSink("tensor", &graph_config, &output_packets);
@@ -149,18 +157,18 @@ void RunTestWithInputImagePacket(const Packet& input_image_packet,
   if (output_int_tensor) {
     if (range_min < 0) {
       EXPECT_EQ(tensor.element_type(), Tensor::ElementType::kInt8);
-      tensor_mat = cv::Mat(tensor_height, tensor_width,
+      tensor_mat = cv::Mat(expected_result.rows, expected_result.cols,
                            channels == 1 ? CV_8SC1 : CV_8SC3,
                            const_cast<int8*>(view.buffer<int8>()));
     } else {
       EXPECT_EQ(tensor.element_type(), Tensor::ElementType::kUInt8);
-      tensor_mat = cv::Mat(tensor_height, tensor_width,
+      tensor_mat = cv::Mat(expected_result.rows, expected_result.cols,
                            channels == 1 ? CV_8UC1 : CV_8UC3,
                            const_cast<uint8*>(view.buffer<uint8>()));
     }
   } else {
     EXPECT_EQ(tensor.element_type(), Tensor::ElementType::kFloat32);
-    tensor_mat = cv::Mat(tensor_height, tensor_width,
+    tensor_mat = cv::Mat(expected_result.rows, expected_result.cols,
                          channels == 1 ? CV_32FC1 : CV_32FC3,
                          const_cast<float*>(view.buffer<float>()));
   }
@@ -216,9 +224,9 @@ const std::vector<InputType> kInputTypesToTest = {InputType::kImageFrame,
 
 void RunTest(cv::Mat input, cv::Mat expected_result,
              std::vector<std::pair<float, float>> float_ranges,
-             std::vector<std::pair<int, int>> int_ranges, int tensor_width,
-             int tensor_height, bool keep_aspect,
-             absl::optional<BorderMode> border_mode,
+             std::vector<std::pair<int, int>> int_ranges,
+             std::optional<int> tensor_width, std::optional<int> tensor_height,
+             bool keep_aspect, absl::optional<BorderMode> border_mode,
              const mediapipe::NormalizedRect& roi) {
   for (auto input_type : kInputTypesToTest) {
     for (auto float_range : float_ranges) {
@@ -486,5 +494,18 @@ TEST(ImageToTensorCalculatorTest, NoOpExceptRangeBorderZero) {
           BorderMode::kZero, roi);
 }
 
+TEST(ImageToTensorCalculatorTest, NoOpExceptRangeAndUseInputImageDims) {
+  mediapipe::NormalizedRect roi;  // Center (0.5, 0.5), size 1.0 x 1.0.
+  roi.set_x_center(0.5f);
+  roi.set_y_center(0.5f);
+  roi.set_width(1.0f);
+  roi.set_height(1.0f);
+  RunTest(GetRgb(GetFilePath("input.jpg")),  // Input image.
+          GetRgb(GetFilePath("noop_except_range.png")),  // Expected result.
+          /*float_ranges=*/{{-1.0f, 1.0f}},
+          /*int_ranges=*/{{0, 255}, {-128, 127}},
+          /*tensor_width=*/std::nullopt, /*tensor_height=*/std::nullopt,
+          /*keep_aspect=*/false, BorderMode::kZero, roi);  // Output size unset.
+}
 }  // namespace
 }  // namespace mediapipe
diff --git a/mediapipe/calculators/tensor/image_to_tensor_utils.h b/mediapipe/calculators/tensor/image_to_tensor_utils.h
index dc38ac7bc..a73529dce 100644
--- a/mediapipe/calculators/tensor/image_to_tensor_utils.h
+++ b/mediapipe/calculators/tensor/image_to_tensor_utils.h
@@ -16,6 +16,7 @@
 #define MEDIAPIPE_CALCULATORS_TENSOR_IMAGE_TO_TENSOR_UTILS_H_
 
 #include <array>
+#include <optional>
 
 #include "absl/types/optional.h"
 #include "mediapipe/calculators/tensor/image_to_tensor_calculator.pb.h"
@@ -51,8 +52,8 @@ enum class BorderMode { kZero, kReplicate };
 // Struct that host commonly accessed parameters used in the
 // ImageTo[Batch]TensorCalculator.
 struct OutputTensorParams {
-  int output_height;
-  int output_width;
+  std::optional<int> output_height;
+  std::optional<int> output_width;
   int output_batch;
   bool is_float_output;
   float range_min;
@@ -161,10 +162,14 @@ absl::Status ValidateOptionOutputDims(const T& options) {
         << "The maximum of the output int tensor range must be less than or "
            "equal to 127.";
   }
-  RET_CHECK_GT(options.output_tensor_width(), 0)
-      << "Valid output tensor width is required.";
-  RET_CHECK_GT(options.output_tensor_height(), 0)
-      << "Valid output tensor height is required.";
+  if (options.has_output_tensor_width()) {
+    RET_CHECK_GT(options.output_tensor_width(), 0)
+        << "Valid output tensor width is required.";
+  }
+  if (options.has_output_tensor_height()) {
+    RET_CHECK_GT(options.output_tensor_height(), 0)
+        << "Valid output tensor height is required.";
+  }
   return absl::OkStatus();
 }
 
@@ -185,8 +190,12 @@ OutputTensorParams GetOutputTensorParams(const T& options) {
     params.range_min = options.output_tensor_float_range().min();
     params.range_max = options.output_tensor_float_range().max();
   }
-  params.output_width = options.output_tensor_width();
-  params.output_height = options.output_tensor_height();
+  if (options.has_output_tensor_width()) {
+    params.output_width = options.output_tensor_width();
+  }
+  if (options.has_output_tensor_height()) {
+    params.output_height = options.output_tensor_height();
+  }
   params.is_float_output = options.has_output_tensor_float_range();
   params.output_batch = 1;
   return params;
diff --git a/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc b/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc
index 450bcba31..70f39d52e 100644
--- a/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc
+++ b/mediapipe/calculators/tensor/image_to_tensor_utils_test.cc
@@ -14,6 +14,8 @@
 
 #include "mediapipe/calculators/tensor/image_to_tensor_utils.h"
 
+#include <optional>
+
 #include "mediapipe/framework/formats/rect.pb.h"
 #include "mediapipe/framework/port/gtest.h"
 #include "mediapipe/framework/port/parse_text_proto.h"
@@ -172,6 +174,10 @@ constexpr char kValidIntProto[] = R"(
   output_tensor_height: 200
 )";
 
+constexpr char kValidNoTensorDimsProto[] = R"(
+  output_tensor_float_range { min: 0 max: 255 }
+)";
+
 TEST(ValidateOptionOutputDims, ImageToTensorCalcOptions) {
   const auto float_options =
       mediapipe::ParseTextProtoOrDie<mediapipe::ImageToTensorCalculatorOptions>(
@@ -193,13 +199,6 @@ TEST(ValidateOptionOutputDims, EmptyProto) {
       ValidateOptionOutputDims(options),
       StatusIs(absl::StatusCode::kInternal,
                HasSubstr("Valid output float tensor range is required")));
-
-  // Output width/height is not set.
-  options.mutable_output_tensor_float_range()->set_min(0.0);
-  options.mutable_output_tensor_float_range()->set_max(1.0);
-  EXPECT_THAT(ValidateOptionOutputDims(options),
-              StatusIs(absl::StatusCode::kInternal,
-                       HasSubstr("Valid output tensor width is required")));
 }
 
 TEST(GetOutputTensorParams, ImageToTensorCalcOptionsSetValues) {
@@ -215,6 +214,20 @@ TEST(GetOutputTensorParams, ImageToTensorCalcOptionsSetValues) {
   EXPECT_EQ(params2.output_height, 200);
 }
 
+TEST(GetOutputTensorParams, ImageToTensorCalcOptionsNoTensorDims) {
+  // Options that set only the float range: output_width/output_height must
+  // stay std::nullopt while the range and batch size are still populated.
+  const auto options =
+      mediapipe::ParseTextProtoOrDie<mediapipe::ImageToTensorCalculatorOptions>(
+          kValidNoTensorDimsProto);
+  const auto params3 = GetOutputTensorParams(options);
+  EXPECT_EQ(params3.range_min, 0.0f);
+  EXPECT_EQ(params3.range_max, 255.0f);
+  EXPECT_EQ(params3.output_batch, 1);
+  EXPECT_EQ(params3.output_width, std::nullopt);
+  EXPECT_EQ(params3.output_height, std::nullopt);
+}
+
 TEST(GetBorderMode, GetBorderMode) {
   // Default to REPLICATE.
   auto border_mode =