Clean up TensorConverterCalculator flipping behavior

Returns an error if gpu_origin and flip_vertically are both specified. If gpu_origin is specified for a CPU image, logs a warning and ignores the option, since CPU images always have a top-left origin. Adds a test for an IMAGE_GPU input to validate flipping. PiperOrigin-RevId: 565311456
2023-09-14 02:53:07 -07:00 · 2023-09-14 02:53:07 -07:00 · 124a4de08d
commit 124a4de08d
parent 21646008d5
4 changed files with 165 additions and 94 deletions
--- a/mediapipe/calculators/tensor/BUILD
+++ b/mediapipe/calculators/tensor/BUILD
@ -660,7 +660,12 @@ cc_library(
        "//mediapipe/gpu:gpu_buffer_format",
        "//mediapipe/gpu:gpu_origin_cc_proto",
        "//mediapipe/util:resource_util",
        "@com_google_absl//absl/log",
        "@com_google_absl//absl/log:absl_check",
        "@com_google_absl//absl/log:absl_log",
        "@com_google_absl//absl/log:check",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings:str_format",
    ] + select({
        "//mediapipe/gpu:disable_gpu": [],
@ -715,6 +720,7 @@ cc_test(
        "//mediapipe/framework/port:parse_text_proto",
        "//mediapipe/framework/tool:validate_type",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings",
    ],
 )
--- a/mediapipe/calculators/tensor/tensor_converter_calculator.cc
+++ b/mediapipe/calculators/tensor/tensor_converter_calculator.cc
@ -17,6 +17,7 @@
 #include <vector>
 #include "absl/log/absl_check.h"
 #include "absl/log/absl_log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
@ -57,11 +58,25 @@ int NumGroups(const int size, const int group_size) {  // NOLINT
 }
 absl::StatusOr<bool> ShouldFlipVertically(
-    const mediapipe::TensorConverterCalculatorOptions& options) {
+    const mediapipe::TensorConverterCalculatorOptions& options, bool use_gpu) {
  if (options.has_flip_vertically() && options.has_gpu_origin()) {
    return absl::FailedPreconditionError(absl::StrFormat(
        "Cannot specify both flip_vertically and gpu_origin options"));
  }
  if (!options.has_gpu_origin()) {
    // Fall back to flip_vertically.
    return options.flip_vertically();
  }
  // Warn if gpu_origin is specified with a CPU input image.
  // Those are always TOP_LEFT, so no flipping is necessary.
  if (!use_gpu) {
    ABSL_LOG(WARNING)
        << "Ignoring gpu_origin option since IMAGE_GPU input is not specified";
    return false;
  }
  switch (options.gpu_origin()) {
    case mediapipe::GpuOrigin::TOP_LEFT:
      return false;
@ -140,7 +155,7 @@ class TensorConverterCalculator : public CalculatorBase {
 private:
  absl::Status InitGpu(CalculatorContext* cc);
-  absl::Status LoadOptions(CalculatorContext* cc);
+  absl::Status LoadOptions(CalculatorContext* cc, bool use_gpu);
  template <class T>
  absl::Status NormalizeImage(const ImageFrame& image_frame,
                              bool flip_vertically, float* tensor_ptr);
@ -176,7 +191,8 @@ absl::Status TensorConverterCalculator::GetContract(CalculatorContract* cc) {
  RET_CHECK(static_cast<int>(cc->Inputs().HasTag(kImageFrameTag)) +
                static_cast<int>(cc->Inputs().HasTag(kGpuBufferTag)) +
                static_cast<int>(cc->Inputs().HasTag(kMatrixTag)) ==
-            1);
+            1)
      << "Only one input tag of {IMAGE, IMAGE_GPU, MATRIX} may be specified";
  if (cc->Inputs().HasTag(kImageFrameTag)) {
    cc->Inputs().Tag(kImageFrameTag).Set<ImageFrame>();
@ -204,8 +220,6 @@ absl::Status TensorConverterCalculator::GetContract(CalculatorContract* cc) {
 absl::Status TensorConverterCalculator::Open(CalculatorContext* cc) {
  cc->SetOffset(TimestampDiff(0));
  MP_RETURN_IF_ERROR(LoadOptions(cc));
 #if !MEDIAPIPE_DISABLE_GPU
  if (cc->Inputs().HasTag(kGpuBufferTag)) {
    use_gpu_ = true;
@ -218,6 +232,8 @@ absl::Status TensorConverterCalculator::Open(CalculatorContext* cc) {
  }
 #endif  // !MEDIAPIPE_DISABLE_GPU
  MP_RETURN_IF_ERROR(LoadOptions(cc, use_gpu_));
  return absl::OkStatus();
 }
@ -436,7 +452,7 @@ absl::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) {
  // Shader to convert GL Texture to Metal Buffer,
  // with normalization to either: [0,1] or [-1,1].
  const std::string shader_source = absl::Substitute(
-      R"(
+      R"glsl(
  #include <metal_stdlib>
  using namespace metal;
@ -455,7 +471,7 @@ absl::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) {
    $3  // g & b channels
    $4  // alpha channel
  }
-      )",
+      )glsl",
      /*$0=*/
      output_range_.has_value()
          ? absl::Substitute("pixel = pixel * half($0) + half($1);",
@ -465,8 +481,8 @@ absl::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) {
      /*$1=*/max_num_channels_,
      /*$2=*/flip_vertically_ ? "(in_tex.get_height() - 1 - gid.y)" : "gid.y",
      /*$3=*/
-      single_channel ? "" : R"(out_buf[linear_index + 1] = pixel.y;
+      single_channel ? "" : R"glsl(out_buf[linear_index + 1] = pixel.y;
-               out_buf[linear_index + 2] = pixel.z;)",
+                                   out_buf[linear_index + 2] = pixel.z;)glsl",
      /*$4=*/include_alpha ? "out_buf[linear_index + 3] = pixel.w;" : "");
  NSString* library_source =
@ -484,17 +500,17 @@ absl::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) {
  RET_CHECK(to_buffer_program_ != nil) << "Couldn't create pipeline state " <<
      [[error localizedDescription] UTF8String];
 #elif MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30
-  MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext([this, &include_alpha,
+  MP_RETURN_IF_ERROR(
      gpu_helper_.RunInGlContext([this, &include_alpha,
 #if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
-                                                 &input,
+                                  &input,
 #endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
-                                                 &single_channel]()
+                                  &single_channel]() -> absl::Status {
                                                    -> absl::Status {
 #if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
-    // Shader to convert GL Texture to Shader Storage Buffer Object (SSBO),
+        // Shader to convert GL Texture to Shader Storage Buffer Object (SSBO),
-    // with normalization to either: [0,1] or [-1,1].
+        // with normalization to either: [0,1] or [-1,1].
-    const std::string shader_source = absl::Substitute(
+        const std::string shader_source = absl::Substitute(
-        R"( #version 310 es
+            R"glsl( #version 310 es
          layout(local_size_x = $0, local_size_y = $0) in;
          layout(binding = 0) uniform sampler2D input_texture;
          layout(std430, binding = 1) buffer Output {float elements[];} output_data;
@ -508,38 +524,40 @@ absl::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) {
            output_data.elements[linear_index + 0] = pixel.x;  // r channel
            $5  // g & b channels
            $6  // alpha channel
-          })",
+          })glsl",
-        /*$0=*/kWorkgroupSize, /*$1=*/input.width(), /*$2=*/input.height(),
+            /*$0=*/kWorkgroupSize, /*$1=*/input.width(), /*$2=*/input.height(),
-        /*$3=*/
+            /*$3=*/
-        output_range_.has_value()
+            output_range_.has_value()
-            ? absl::Substitute("pixel = pixel * float($0) + float($1);",
+                ? absl::Substitute(
-                               (output_range_->second - output_range_->first),
+                      "pixel = pixel * float($0) + float($1);",
-                               output_range_->first)
+                      (output_range_->second - output_range_->first),
-            : "",
+                      output_range_->first)
-        /*$4=*/flip_vertically_ ? "(width_height.y - 1 - gid.y)" : "gid.y",
+                : "",
-        /*$5=*/
+            /*$4=*/flip_vertically_ ? "(width_height.y - 1 - gid.y)" : "gid.y",
-        single_channel ? ""
+            /*$5=*/
-                       : R"(output_data.elements[linear_index + 1] = pixel.y;
+            single_channel
-                            output_data.elements[linear_index + 2] = pixel.z;)",
+                ? ""
-        /*$6=*/
+                : R"glsl(output_data.elements[linear_index + 1] = pixel.y;
-        include_alpha ? "output_data.elements[linear_index + 3] = pixel.w;"
+                     output_data.elements[linear_index + 2] = pixel.z;)glsl",
-                      : "",
+            /*$6=*/
-        /*$7=*/max_num_channels_);
+            include_alpha ? "output_data.elements[linear_index + 3] = pixel.w;"
-    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
+                          : "",
-    const GLchar* sources[] = {shader_source.c_str()};
+            /*$7=*/max_num_channels_);
-    glShaderSource(shader, 1, sources, NULL);
+        GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
-    glCompileShader(shader);
+        const GLchar* sources[] = {shader_source.c_str()};
-    GLint compiled = GL_FALSE;
+        glShaderSource(shader, 1, sources, NULL);
-    glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
+        glCompileShader(shader);
-    RET_CHECK(compiled == GL_TRUE);
+        GLint compiled = GL_FALSE;
-    to_buffer_program_ = glCreateProgram();
+        glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
-    glAttachShader(to_buffer_program_, shader);
+        RET_CHECK(compiled == GL_TRUE);
-    glDeleteShader(shader);
+        to_buffer_program_ = glCreateProgram();
-    glLinkProgram(to_buffer_program_);
+        glAttachShader(to_buffer_program_, shader);
        glDeleteShader(shader);
        glLinkProgram(to_buffer_program_);
 #else
-    // OpenGL ES 3.0 fragment shader Texture2d -> Texture2d conversion.
+        // OpenGL ES 3.0 fragment shader Texture2d -> Texture2d conversion.
-    const std::string shader_source = absl::Substitute(
+        const std::string shader_source = absl::Substitute(
-        R"(
+            R"glsl(
        #if __VERSION__ < 130
          #define in varying
        #endif  // __VERSION__ < 130
@ -565,49 +583,51 @@ absl::Status TensorConverterCalculator::InitGpu(CalculatorContext* cc) {
            fragColor.r = pixel.r;  // r channel
            $3  // g & b channels
            $4  // alpha channel
-          })",
+          })glsl",
-        /*$0=*/single_channel ? "vec1" : "vec4",
+            /*$0=*/single_channel ? "vec1" : "vec4",
-        /*$1=*/
+            /*$1=*/
-        flip_vertically_
+            flip_vertically_
-            ? "vec2(sample_coordinate.x, 1.0 - sample_coordinate.y);"
+                ? "vec2(sample_coordinate.x, 1.0 - sample_coordinate.y);"
-            : "sample_coordinate;",
+                : "sample_coordinate;",
-        /*$2=*/output_range_.has_value()
+            /*$2=*/output_range_.has_value()
-            ? absl::Substitute("pixel = pixel * float($0) + float($1);",
+                ? absl::Substitute(
-                               (output_range_->second - output_range_->first),
+                      "pixel = pixel * float($0) + float($1);",
-                               output_range_->first)
+                      (output_range_->second - output_range_->first),
-            : "",
+                      output_range_->first)
-        /*$3=*/single_channel ? "" : R"(fragColor.g = pixel.g;
+                : "",
-                            fragColor.b = pixel.b;)",
+            /*$3=*/single_channel ? "" : R"glsl(fragColor.g = pixel.g;
-        /*$4=*/
+                                            fragColor.b = pixel.b;)glsl",
-        include_alpha ? "fragColor.a = pixel.a;"
+            /*$4=*/
-                      : (single_channel ? "" : "fragColor.a = 1.0;"));
+            include_alpha ? "fragColor.a = pixel.a;"
                          : (single_channel ? "" : "fragColor.a = 1.0;"));
-    const GLint attr_location[NUM_ATTRIBUTES] = {
+        const GLint attr_location[NUM_ATTRIBUTES] = {
-        ATTRIB_VERTEX,
+            ATTRIB_VERTEX,
-        ATTRIB_TEXTURE_POSITION,
+            ATTRIB_TEXTURE_POSITION,
-    };
+        };
-    const GLchar* attr_name[NUM_ATTRIBUTES] = {
+        const GLchar* attr_name[NUM_ATTRIBUTES] = {
-        "position",
+            "position",
-        "texture_coordinate",
+            "texture_coordinate",
-    };
+        };
-    // shader program and params
+        // shader program and params
-    mediapipe::GlhCreateProgram(
+        mediapipe::GlhCreateProgram(
-        mediapipe::kBasicVertexShader, shader_source.c_str(), NUM_ATTRIBUTES,
+            mediapipe::kBasicVertexShader, shader_source.c_str(),
-        &attr_name[0], attr_location, &to_tex2d_program_);
+            NUM_ATTRIBUTES, &attr_name[0], attr_location, &to_tex2d_program_);
-    RET_CHECK(to_tex2d_program_) << "Problem initializing the program.";
+        RET_CHECK(to_tex2d_program_) << "Problem initializing the program.";
-    glUseProgram(to_tex2d_program_);
+        glUseProgram(to_tex2d_program_);
-    glUniform1i(glGetUniformLocation(to_tex2d_program_, "frame"), 1);
+        glUniform1i(glGetUniformLocation(to_tex2d_program_, "frame"), 1);
-    glGenFramebuffers(1, &framebuffer_);
+        glGenFramebuffers(1, &framebuffer_);
 #endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
-    return absl::OkStatus();
+        return absl::OkStatus();
-  }));
+      }));
 #endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30
 #endif  // !MEDIAPIPE_DISABLE_GPU
  return absl::OkStatus();
 }
-absl::Status TensorConverterCalculator::LoadOptions(CalculatorContext* cc) {
+absl::Status TensorConverterCalculator::LoadOptions(CalculatorContext* cc,
                                                    bool use_gpu) {
  // Get calculator options specified in the graph.
  const auto& options =
      cc->Options<::mediapipe::TensorConverterCalculatorOptions>();
@ -635,7 +655,7 @@ absl::Status TensorConverterCalculator::LoadOptions(CalculatorContext* cc) {
  }
  // Get y-flip mode.
-  ASSIGN_OR_RETURN(flip_vertically_, ShouldFlipVertically(options));
+  ASSIGN_OR_RETURN(flip_vertically_, ShouldFlipVertically(options, use_gpu));
  // Get row_major_matrix mode.
  row_major_matrix_ = options.row_major_matrix();
--- a/mediapipe/calculators/tensor/tensor_converter_calculator.proto
+++ b/mediapipe/calculators/tensor/tensor_converter_calculator.proto
@ -44,12 +44,14 @@ message TensorConverterCalculatorOptions {
  // with a coordinate system where the origin is at the bottom-left corner
  // (e.g., in OpenGL) whereas the ML model expects an image with a top-left
  // origin.
-  // Prefer gpu_origin over this field.
+  // Prefer gpu_origin over this field when using GPU input images.
  optional bool flip_vertically = 2 [default = false];
-  // Determines when the input image should be flipped vertically.
+  // Determines when the input GPU image should be flipped vertically.
  // See GpuOrigin.Mode for more information.
  // Affects only IMAGE_GPU inputs.
  // If unset, falls back to flip_vertically for backwards compatibility.
  // Cannot set both gpu_origin and flip_vertically.
  optional GpuOrigin.Mode gpu_origin = 10;
  // Controls how many channels of the input image get passed through to the
--- a/mediapipe/calculators/tensor/tensor_converter_calculator_test.cc
+++ b/mediapipe/calculators/tensor/tensor_converter_calculator_test.cc
@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <cmath>
 #include <cstdint>
 #include <memory>
 #include <random>
@ -19,6 +20,7 @@
 #include <vector>
 #include "absl/memory/memory.h"
 #include "absl/status/status.h"
 #include "absl/strings/substitute.h"
 #include "mediapipe/framework/calculator_framework.h"
 #include "mediapipe/framework/calculator_runner.h"
@ -45,6 +47,7 @@ constexpr char kTransposeOptionsString[] =
 }  // namespace
 using RandomEngine = std::mt19937_64;
 using ::testing::HasSubstr;
 const uint32_t kSeed = 1234;
 const int kNumSizes = 8;
 const int sizes[kNumSizes][2] = {{1, 1}, {12, 1}, {1, 9},   {2, 2},
@ -57,7 +60,7 @@ class TensorConverterCalculatorTest : public ::testing::Test {
                       bool row_major_matrix = false) {
    RandomEngine random(kSeed);
    std::uniform_real_distribution<> uniform_dist(0, 1.0);
-    auto matrix = ::absl::make_unique<Matrix>();
+    auto matrix = std::make_unique<Matrix>();
    matrix->resize(num_rows, num_columns);
    if (row_major_matrix) {
      for (int y = 0; y < num_rows; ++y) {
@ -105,7 +108,7 @@ TEST_F(TensorConverterCalculatorTest, RandomMatrixColMajor) {
    tool::AddVectorSink("tensor", &graph_config, &output_packets);
    // Run the graph.
-    graph_ = absl::make_unique<CalculatorGraph>();
+    graph_ = std::make_unique<CalculatorGraph>();
    MP_ASSERT_OK(graph_->Initialize(graph_config));
    MP_ASSERT_OK(graph_->StartRun({}));
@ -167,7 +170,7 @@ TEST_F(TensorConverterCalculatorTest, RandomMatrixRowMajor) {
    tool::AddVectorSink("tensor", &graph_config, &output_packets);
    // Run the graph.
-    graph_ = absl::make_unique<CalculatorGraph>();
+    graph_ = std::make_unique<CalculatorGraph>();
    MP_ASSERT_OK(graph_->Initialize(graph_config));
    MP_ASSERT_OK(graph_->StartRun({}));
@ -231,7 +234,7 @@ TEST_F(TensorConverterCalculatorTest, CustomDivAndSub) {
  // Run the graph.
  MP_ASSERT_OK(graph.Initialize(graph_config));
  MP_ASSERT_OK(graph.StartRun({}));
-  auto input_image = absl::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 1);
+  auto input_image = std::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 1);
  cv::Mat mat = mediapipe::formats::MatView(input_image.get());
  mat.at<uint8_t>(0, 0) = 200;
  MP_ASSERT_OK(graph.AddPacketToInputStream(
@ -285,7 +288,7 @@ TEST_F(TensorConverterCalculatorTest, SetOutputRange) {
    // Run the graph.
    MP_ASSERT_OK(graph.Initialize(graph_config));
    MP_ASSERT_OK(graph.StartRun({}));
-    auto input_image = absl::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 1);
+    auto input_image = std::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 1);
    cv::Mat mat = mediapipe::formats::MatView(input_image.get());
    mat.at<uint8_t>(0, 0) = 200;
    MP_ASSERT_OK(graph.AddPacketToInputStream(
@ -341,7 +344,7 @@ TEST_F(TensorConverterCalculatorTest, FlipVertically) {
  // Run the graph.
  MP_ASSERT_OK(graph.Initialize(graph_config));
  MP_ASSERT_OK(graph.StartRun({}));
-  auto input_image = absl::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 2);
+  auto input_image = std::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 2);
  cv::Mat mat = mediapipe::formats::MatView(input_image.get());
  constexpr uint8_t kY0Value = 100;
  constexpr uint8_t kY1Value = 200;
@ -372,7 +375,8 @@ TEST_F(TensorConverterCalculatorTest, FlipVertically) {
  MP_ASSERT_OK(graph.WaitUntilDone());
 }
-TEST_F(TensorConverterCalculatorTest, GpuOriginOverridesFlipVertically) {
+TEST_F(TensorConverterCalculatorTest,
       CannotSpecifyBothFlipVerticallyAndGpuOrigin) {
  CalculatorGraph graph;
  CalculatorGraphConfig graph_config =
      mediapipe::ParseTextProtoOrDie<CalculatorGraphConfig>(R"pb(
@ -396,7 +400,46 @@ TEST_F(TensorConverterCalculatorTest, GpuOriginOverridesFlipVertically) {
  // Run the graph.
  MP_ASSERT_OK(graph.Initialize(graph_config));
  MP_ASSERT_OK(graph.StartRun({}));
-  auto input_image = absl::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 2);
+  auto input_image = std::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 1);
  MP_ASSERT_OK(graph.AddPacketToInputStream(
      "input_image", Adopt(input_image.release()).At(Timestamp(0))));
  // Processing should fail as we specified both flip_vertically and gpu_origin.
  absl::Status status = graph.WaitUntilIdle();
  EXPECT_FALSE(status.ok());
  EXPECT_THAT(status.message(), HasSubstr("flip_vertically and gpu_origin"));
  EXPECT_EQ(output_packets.size(), 0);
  // Fully close graph at end, otherwise calculator+tensors are destroyed
  // after calling WaitUntilDone().
  MP_ASSERT_OK(graph.CloseInputStream("input_image"));
  EXPECT_FALSE(graph.WaitUntilDone().ok());
 }
 TEST_F(TensorConverterCalculatorTest, GpuOriginIsIgnoredWithCpuImage) {
  CalculatorGraph graph;
  CalculatorGraphConfig graph_config =
      mediapipe::ParseTextProtoOrDie<CalculatorGraphConfig>(R"pb(
        input_stream: "input_image"
        node {
          calculator: "TensorConverterCalculator"
          input_stream: "IMAGE:input_image"
          output_stream: "TENSORS:tensor"
          options {
            [mediapipe.TensorConverterCalculatorOptions.ext] {
              gpu_origin: CONVENTIONAL
              output_tensor_float_range { min: 0 max: 255 }
            }
          }
        }
      )pb");
  std::vector<Packet> output_packets;
  tool::AddVectorSink("tensor", &graph_config, &output_packets);
  // Run the graph.
  MP_ASSERT_OK(graph.Initialize(graph_config));
  MP_ASSERT_OK(graph.StartRun({}));
  auto input_image = std::make_unique<ImageFrame>(ImageFormat::GRAY8, 1, 2);
  cv::Mat mat = mediapipe::formats::MatView(input_image.get());
  constexpr uint8_t kY0Value = 100;
  constexpr uint8_t kY1Value = 200;