Internal change

PiperOrigin-RevId: 514580892
2023-03-06 18:11:38 -08:00 · 2023-03-06 18:11:38 -08:00 · bd9a2ee1fc
commit bd9a2ee1fc
parent 0337c7f52f
4 changed files with 639 additions and 6 deletions
--- a/mediapipe/tasks/cc/vision/image_segmenter/calculators/BUILD
+++ b/mediapipe/tasks/cc/vision/image_segmenter/calculators/BUILD
@ -55,6 +55,27 @@ cc_library(
    alwayslink = 1,
 )

+cc_library(
+    name = "segmentation_postprocessor_gl",
+    srcs = ["segmentation_postprocessor_gl.cc"],
+    hdrs = ["segmentation_postprocessor_gl.h"],
+    tags = ["nomac"],
+    deps = [
+        ":tensors_to_segmentation_calculator_cc_proto",
+        "//mediapipe/framework:calculator_framework",
+        "//mediapipe/framework/formats:image",
+        "//mediapipe/framework/formats:tensor",
+        "//mediapipe/framework/port:status",
+        "//mediapipe/gpu:gl_calculator_helper",
+        "//mediapipe/gpu:gl_simple_shaders",
+        "//mediapipe/gpu:shader_util",
+        "//mediapipe/tasks/cc/vision/image_segmenter/proto:segmenter_options_cc_proto",
+        "//mediapipe/tasks/cc/vision/utils:image_utils",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
+    ],
+)
+
 cc_test(
    name = "tensors_to_segmentation_calculator_test",
    srcs = ["tensors_to_segmentation_calculator_test.cc"],
--- a/mediapipe/tasks/cc/vision/image_segmenter/calculators/segmentation_postprocessor_gl.cc
+++ b/mediapipe/tasks/cc/vision/image_segmenter/calculators/segmentation_postprocessor_gl.cc
@ -0,0 +1,502 @@
+#include "mediapipe/tasks/cc/vision/image_segmenter/calculators/segmentation_postprocessor_gl.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/status/status.h"
+#include "absl/strings/str_format.h"
+#include "mediapipe/framework/port/status_macros.h"
+#include "mediapipe/gpu/gl_simple_shaders.h"
+#include "mediapipe/gpu/shader_util.h"
+#include "mediapipe/tasks/cc/vision/image_segmenter/proto/segmenter_options.pb.h"
+
+namespace mediapipe {
+namespace tasks {
+namespace {
+
+using mediapipe::kBasicSquareVertices;
+using mediapipe::kBasicTextureVertices;
+using mediapipe::kBasicVertexShader;
+using ::mediapipe::tasks::vision::Shape;
+using ::mediapipe::tasks::vision::image_segmenter::proto::SegmenterOptions;
+
+enum { ATTRIB_VERTEX, ATTRIB_TEXTURE_POSITION, NUM_ATTRIBUTES };
+
+static constexpr char kActivationFragmentShader[] = R"(
+DEFAULT_PRECISION(mediump, float)
+in vec2 sample_coordinate;
+uniform sampler2D input_texture;
+
+void main() {
+  vec4 in_value = texture2D(input_texture, sample_coordinate);
+
+  // Run activation function over all 4 channels at once.
+  %s
+
+  gl_FragColor = out_value;
+})";
+
+// Trivial passthrough fragment shader; do splitting in a custom vertex shader.
+static constexpr char kPassthroughShader[] = R"(
+DEFAULT_PRECISION(mediump, float)
+in vec2 sample_coordinate;
+uniform sampler2D input_texture;
+
+void main() {
+  gl_FragColor = texture2D(input_texture, sample_coordinate);
+})";
+
+// Vertex shader for splitting; kLayoutAligned means we just move across x-axis.
+static constexpr char kSplitVertexShader[] = R"(
+DEFAULT_PRECISION(highp, float)
+attribute vec4 position;
+attribute vec4 texture_coordinate;
+varying vec2 sample_coordinate;
+
+// We assume kLayoutAligned for now. Everything will be scaled properly, so just
+// need offset for decimation iterations.
+uniform float x_offset;
+
+void main() {
+  sample_coordinate = vec2(texture_coordinate.x + x_offset, texture_coordinate.y);
+  gl_Position = position;
+})";
+
+// TODO: Consider using MRT to speed this up in the future.
+static constexpr char kChannelSelectShader[] = R"(
+DEFAULT_PRECISION(mediump, float)
+in vec2 sample_coordinate;
+uniform sampler2D input_texture;
+uniform int channel_select;
+
+void main() {
+  vec4 in_value = texture2D(input_texture, sample_coordinate);
+  float out_value;
+  if (channel_select == 0) {
+    out_value = in_value.r;
+  } else if (channel_select == 1) {
+    out_value = in_value.g;
+  } else if (channel_select == 2) {
+    out_value = in_value.b;
+  } else {
+    out_value = in_value.a;
+  }
+  gl_FragColor = vec4(out_value, out_value, out_value, out_value);
+})";
+
+// Hard-coded for max of 3 textures for now, so num classes must be <= 12, and
+// the cost of this shader will be higher than necessary for smaller numbers of
+// classes.
+// TODO: Improve this.
+static constexpr char kArgmaxShader[] = R"(
+DEFAULT_PRECISION(mediump, float)
+in vec2 sample_coordinate;
+uniform sampler2D input_texture0;
+uniform sampler2D input_texture1;
+uniform sampler2D input_texture2;
+
+int argmax4(vec4 vec) {
+  float aMax = max(vec.x, vec.y);
+  float bMax = max(vec.z, vec.w);
+  if (aMax >= bMax) {
+    if (vec.x >= vec.y) return 0;
+    return 1;
+  } else if (vec.z >= vec.w) return 2;
+  return 3;
+}
+
+float max4(vec4 vec) {
+  return max(max(vec.x, vec.y), max(vec.z, vec.w));
+}
+
+void main() {
+  // Grab all vecs
+  vec4 pixel0 = texture2D(input_texture0, sample_coordinate);
+  vec4 pixel1 = texture2D(input_texture1, sample_coordinate);
+  vec4 pixel2 = texture2D(input_texture2, sample_coordinate);
+
+  // Find vector which contains maximum value, and return its argmax
+  float max0 = max4(pixel0);
+  float max1 = max4(pixel1);
+  float max2 = max4(pixel2);
+
+  int argmax;
+  float out_value;
+  if (max0 >= max1) {
+    if (max0 >= max2) {
+      argmax = argmax4(pixel0);
+    } else {
+      argmax = argmax4(pixel2) + 8;
+    }
+  } else if (max1 >= max2) {
+    argmax = argmax4(pixel1) + 4;
+  } else {
+    argmax = argmax4(pixel2) + 8;
+  }
+
+  out_value = float(argmax) / 255.0;
+  gl_FragColor = vec4(out_value, out_value, out_value, out_value);
+})";
+
+}  // namespace
+
+// static
+absl::Status SegmentationPostprocessorGl::UpdateContract(
+    CalculatorContract* cc) {
+  return GlCalculatorHelper::UpdateContract(cc);
+}
+
+absl::Status SegmentationPostprocessorGl::Initialize(
+    CalculatorContext* cc,
+    TensorsToSegmentationCalculatorOptions const& options) {
+  options_ = options;  // Just copy for now
+  MP_RETURN_IF_ERROR(helper_.Open(cc));
+  MP_RETURN_IF_ERROR(GlInit());
+  return absl::OkStatus();
+}
+
+absl::Status SegmentationPostprocessorGl::GlInit() {
+  return helper_.RunInGlContext([this]() -> absl::Status {
+    // TODO: This part of the setup code is so common, we should really
+    // refactor to a helper utility.
+    const GLint attr_location[NUM_ATTRIBUTES] = {
+        ATTRIB_VERTEX,
+        ATTRIB_TEXTURE_POSITION,
+    };
+    const GLchar* attr_name[NUM_ATTRIBUTES] = {
+        "position",
+        "texture_coordinate",
+    };
+
+    std::string activation_fn;
+    switch (options_.segmenter_options().activation()) {
+      case SegmenterOptions::SIGMOID:
+        LOG(INFO) << "SIGMOID activation function chosen on GPU";
+        activation_fn = "vec4 out_value = 1.0 / (exp(-in_value) + 1.0);";
+        break;
+      case SegmenterOptions::SOFTMAX:
+        LOG(ERROR) << "SOFTMAX activation function not implemented for GPU";
+        // TODO: Softmax algo per-pixel:
+        // (1) Find max of all channels
+        // (2) For each channel do exp(val - max_value) transform
+        // (3) Find sum over all channels
+        // (4) Divide by this sum
+        break;
+      case SegmenterOptions::NONE:
+        LOG(INFO) << "NONE activation function chosen on GPU";
+        activation_fn = "vec4 out_value = in_value;";
+        break;
+    }
+
+    // TODO: Skip activation step entirely for "NONE" to save a full
+    //     renderpass.  (And same applies for CATEGORY_MASK mode).
+    bool is_category_mask = options_.segmenter_options().output_type() ==
+                            SegmenterOptions::CATEGORY_MASK;
+    if (is_category_mask) {
+      LOG(INFO) << "CATEGORY_MASK requested; using NONE activation function.";
+      activation_fn = "vec4 out_value = in_value;";
+    }
+
+    const std::string activation_shader_source =
+        absl::StrCat(std::string(mediapipe::kMediaPipeFragmentShaderPreamble),
+                     absl::StrFormat(kActivationFragmentShader, activation_fn));
+
+    const std::string split_fragment_shader_source =
+        absl::StrCat(std::string(mediapipe::kMediaPipeFragmentShaderPreamble),
+                     std::string(kPassthroughShader));
+    const std::string split_vertex_shader_source =
+        absl::StrCat(std::string(mediapipe::kMediaPipeVertexShaderPreamble),
+                     std::string(kSplitVertexShader));
+
+    const std::string channel_select_shader_source =
+        absl::StrCat(std::string(mediapipe::kMediaPipeFragmentShaderPreamble),
+                     std::string(kChannelSelectShader));
+
+    const std::string argmax_shader_source =
+        absl::StrCat(std::string(mediapipe::kMediaPipeFragmentShaderPreamble),
+                     std::string(kArgmaxShader));
+
+    // Compile all our shader programs.
+    // Note: we enable `force_log_errors` so that we get full debugging error
+    //   messages when compiling shaders on web, where normally such errors are
+    //   suppressed. See //mediapipe/gpu/shader_util.cc for more
+    //   info.
+    mediapipe::GlhCreateProgram(
+        kBasicVertexShader, activation_shader_source.c_str(), NUM_ATTRIBUTES,
+        &attr_name[0], attr_location, &activation_program_,
+        /* force_log_errors */ true);
+    RET_CHECK(activation_program_)
+        << "Problem initializing the activation program.";
+
+    mediapipe::GlhCreateProgram(split_vertex_shader_source.c_str(),
+                                split_fragment_shader_source.c_str(),
+                                NUM_ATTRIBUTES, &attr_name[0], attr_location,
+                                &split_program_,
+                                /* force_log_errors */ true);
+    RET_CHECK(split_program_) << "Problem initializing the split program.";
+
+    mediapipe::GlhCreateProgram(
+        kBasicVertexShader, channel_select_shader_source.c_str(),
+        NUM_ATTRIBUTES, &attr_name[0], attr_location, &channel_select_program_,
+        /* force_log_errors */ true);
+    RET_CHECK(channel_select_program_)
+        << "Problem initializing the channel select program.";
+
+    mediapipe::GlhCreateProgram(kBasicVertexShader,
+                                argmax_shader_source.c_str(), NUM_ATTRIBUTES,
+                                &attr_name[0], attr_location, &argmax_program_,
+                                /* force_log_errors */ true);
+    RET_CHECK(argmax_program_) << "Problem initializing the argmax program.";
+
+    // Get uniform locations.
+    activation_texture_uniform_ =
+        glGetUniformLocation(activation_program_, "input_texture");
+    RET_CHECK(activation_texture_uniform_ > 0)
+        << "activation input_texture uniform not found.";
+
+    split_texture_uniform_ =
+        glGetUniformLocation(split_program_, "input_texture");
+    RET_CHECK(split_texture_uniform_ > 0)
+        << "split input_texture uniform not found.";
+    split_x_offset_uniform_ = glGetUniformLocation(split_program_, "x_offset");
+    RET_CHECK(split_x_offset_uniform_ > 0)
+        << "split x_offset uniform not found.";
+
+    channel_select_texture_uniform_ =
+        glGetUniformLocation(channel_select_program_, "input_texture");
+    RET_CHECK(channel_select_texture_uniform_ > 0)
+        << "channel select input_texture uniform not found.";
+    channel_select_index_uniform_ =
+        glGetUniformLocation(channel_select_program_, "channel_select");
+    RET_CHECK(channel_select_index_uniform_ > 0)
+        << "channel select indexing uniform not found.";
+
+    argmax_texture0_uniform_ =
+        glGetUniformLocation(argmax_program_, "input_texture0");
+    RET_CHECK(argmax_texture0_uniform_ > 0)
+        << "argmax input_texture0 uniform not found.";
+    argmax_texture1_uniform_ =
+        glGetUniformLocation(argmax_program_, "input_texture1");
+    RET_CHECK(argmax_texture1_uniform_ > 0)
+        << "argmax input_texture1 uniform not found.";
+    argmax_texture2_uniform_ =
+        glGetUniformLocation(argmax_program_, "input_texture2");
+    RET_CHECK(argmax_texture2_uniform_ > 0)
+        << "argmax input_texture2 uniform not found.";
+
+    // TODO: If ES3.0+ only, switch to VAO for handling attributes.
+    glGenBuffers(1, &square_vertices_);
+    glBindBuffer(GL_ARRAY_BUFFER, square_vertices_);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(kBasicSquareVertices),
+                 kBasicSquareVertices, GL_STATIC_DRAW);
+
+    glGenBuffers(1, &texture_vertices_);
+    glBindBuffer(GL_ARRAY_BUFFER, texture_vertices_);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(kBasicTextureVertices),
+                 kBasicTextureVertices, GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    return absl::OkStatus();
+  });
+}
+
+std::vector<std::unique_ptr<Image>>
+SegmentationPostprocessorGl::GetSegmentationResultGpu(const Shape& input_shape,
+                                                      const Shape& output_shape,
+                                                      const Tensor& tensor) {
+  std::vector<std::unique_ptr<Image>> image_outputs;
+  auto status = helper_.RunInGlContext([this, &input_shape, &output_shape,
+                                        &tensor,
+                                        &image_outputs]() -> absl::Status {
+    // Get Tensor input and image output parameters
+    int input_width, input_height;
+
+    if (!tensor.ready_as_opengl_texture_2d()) {
+      LOG(WARNING) << "Tensor wasn't ready on GPU; using slow workaround.";
+      (void)tensor.GetCpuReadView();
+    }
+
+    const auto layout = tensor.GetOpenGlTexture2dReadView().GetLayoutDimensions(
+        tensor.shape(), &input_width, &input_height);
+    if (layout != Tensor::OpenGlTexture2dView::Layout::kAligned) {
+      LOG(ERROR) << "Tensor layout not kAligned! Cannot handle.";
+    }
+
+    bool is_category_mask = options_.segmenter_options().output_type() ==
+                            SegmenterOptions::CATEGORY_MASK;
+
+    const GpuBufferFormat activation_output_format =
+        GpuBufferFormat::kRGBAFloat128;
+    const GpuBufferFormat chunk_output_format = GpuBufferFormat::kRGBAFloat128;
+
+    // Uint8 pipeline and conversions are lacking, so for now we just use F32
+    // textures even for category masks.
+    const GpuBufferFormat final_output_format = GpuBufferFormat::kGrayFloat32;
+    const Tensor::OpenGlTexture2dView read_view =
+        tensor.GetOpenGlTexture2dReadView();
+
+    const int width = input_shape.width;           // Slice width from shape
+    const int height = input_shape.height;         // Slice height from chape
+    const int num_outputs = input_shape.channels;  // One output per channel
+    const int num_chunks = (input_shape.channels + 3) / 4;  // ceil(channels/4)
+    const int output_width = output_shape.width;    // Final output width
+    const int output_height = output_shape.height;  // Final output height
+
+    // We disable blending or else our alpha channel may destroy our other
+    // channels' data.
+    glDisable(GL_BLEND);
+
+    // Step 0: bind buffers / textures
+    glBindBuffer(GL_ARRAY_BUFFER, square_vertices_);
+    glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, nullptr);
+    glEnableVertexAttribArray(ATTRIB_VERTEX);
+
+    glBindBuffer(GL_ARRAY_BUFFER, texture_vertices_);
+    glVertexAttribPointer(ATTRIB_TEXTURE_POSITION, 2, GL_FLOAT, 0, 0, nullptr);
+    glEnableVertexAttribArray(ATTRIB_TEXTURE_POSITION);
+
+    // Step 1: apply activation pass
+    glUseProgram(activation_program_);
+    glUniform1i(activation_texture_uniform_, 1);
+    GlTexture activated_texture = helper_.CreateDestinationTexture(
+        input_width, input_height, activation_output_format);
+    helper_.BindFramebuffer(activated_texture);
+
+    // All our input source textures are just simple GL_TEXTURE_2D types.
+    glActiveTexture(GL_TEXTURE1);
+    glBindTexture(GL_TEXTURE_2D, read_view.name());
+
+    // Render
+    glClear(GL_COLOR_BUFFER_BIT);
+    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+    // Step 2: split megatexture into 4-chunks (assume kLayoutAligned for now).
+    std::vector<GlTexture> chunks;
+    // # chunks: offset in pixels at which taps must be made
+    // 1 chunk: 0
+    // 2 chunks: -0.5, +0.5
+    // 3 chunks: -1,0,1
+    // 4 chunks: -1.5, -.5, .5, 1.5
+    // ...
+    // Step is always 1 pixel, while initial offset is (1 - N) * 0.5
+    glUseProgram(split_program_);
+    glUniform1i(split_texture_uniform_, 1);
+    const float tex_offset = 0.5 * (1.0 - (float)num_chunks);
+    for (int i = 0; i < num_chunks; i++) {
+      chunks.push_back(
+          helper_.CreateDestinationTexture(width, height, chunk_output_format));
+      helper_.BindFramebuffer(chunks.back());
+      glUniform1f(split_x_offset_uniform_,
+                  ((float)i + tex_offset) / (float)(input_width));
+      // Technically duplicated, but fine for now; we want this after the bind
+      glBindTexture(GL_TEXTURE_2D, activated_texture.name());
+      // Disable HW interpolation
+      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+      // Render
+      glClear(GL_COLOR_BUFFER_BIT);
+      glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+    }
+
+    std::vector<GlTexture> outputs;
+    if (is_category_mask) {
+      // Step 3: For CATEGORY, apply argmax shader with up to 3 textures to
+      // extract final index mask.
+      RET_CHECK(num_chunks <= 3)
+          << "Cannot handle more than 12 classes in argmax shader.";
+
+      glUseProgram(argmax_program_);
+      glUniform1i(argmax_texture0_uniform_, 1);
+      glUniform1i(argmax_texture1_uniform_, 2);
+      glUniform1i(argmax_texture2_uniform_, 3);
+      outputs.push_back(helper_.CreateDestinationTexture(
+          output_width, output_height, final_output_format));
+      helper_.BindFramebuffer(outputs.back());
+
+      // Bind however many chunks we have
+      for (int i = 0; i < num_chunks; ++i) {
+        glActiveTexture(GL_TEXTURE1 + i);
+        glBindTexture(GL_TEXTURE_2D, chunks[i].name());
+      }
+
+      for (int i = num_chunks; i < 3; ++i) {  // 3 is hard-coded max chunks
+        glActiveTexture(GL_TEXTURE1 + i);
+        // If texture is unbound, sampling from it should always give zeros.
+        // This is not ideal, but is ok for now for not polluting the argmax
+        // shader results too much.
+        glBindTexture(GL_TEXTURE_2D, 0);
+      }
+
+      glClear(GL_COLOR_BUFFER_BIT);
+      glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+      // Unbind the extra textures here.
+      for (int i = 0; i < num_chunks; ++i) {
+        glActiveTexture(GL_TEXTURE1 + i);
+        glBindTexture(GL_TEXTURE_2D, 0);
+      }
+    } else {
+      // Step 3: For CONFIDENCE, apply channel-select repeatedly to extract
+      // final textures.
+      glUseProgram(channel_select_program_);
+      glUniform1i(channel_select_texture_uniform_, 1);
+      for (int i = 0; i < num_outputs; i++) {
+        glUniform1i(channel_select_index_uniform_, (i % 4));
+        outputs.push_back(helper_.CreateDestinationTexture(
+            output_width, output_height, final_output_format));
+        helper_.BindFramebuffer(outputs.back());
+
+        // We have to rebind constantly because BindFramebuffer seems to
+        // interfere with this.
+        glBindTexture(GL_TEXTURE_2D, chunks[i / 4].name());
+
+        glClear(GL_COLOR_BUFFER_BIT);
+        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+      }
+    }
+
+    // Unbind everything
+    glDisableVertexAttribArray(ATTRIB_VERTEX);
+    glDisableVertexAttribArray(ATTRIB_TEXTURE_POSITION);
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glBindFramebuffer(GL_FRAMEBUFFER, 0);
+    glBindTexture(GL_TEXTURE_2D, 0);
+
+    // Get Image vector from GlTexture vector
+    for (auto& output_texture : outputs) {
+      image_outputs.push_back(output_texture.GetFrame<Image>());
+    }
+
+    return absl::OkStatus();
+  });
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Error with rendering: " << status;
+  }
+
+  return image_outputs;
+}
+
+// Cleanup OpenGL resources on destruction
+SegmentationPostprocessorGl::~SegmentationPostprocessorGl() {
+  helper_.RunInGlContext([this] {
+    glDeleteProgram(activation_program_);
+    glDeleteProgram(argmax_program_);
+    glDeleteProgram(channel_select_program_);
+    glDeleteProgram(split_program_);
+    glDeleteBuffers(1, &square_vertices_);
+    glDeleteBuffers(1, &texture_vertices_);
+    activation_program_ = 0;
+    argmax_program_ = 0;
+    channel_select_program_ = 0;
+    split_program_ = 0;
+    square_vertices_ = 0;
+    texture_vertices_ = 0;
+  });
+}
+
+}  // namespace tasks
+}  // namespace mediapipe
--- a/mediapipe/tasks/cc/vision/image_segmenter/calculators/segmentation_postprocessor_gl.h
+++ b/mediapipe/tasks/cc/vision/image_segmenter/calculators/segmentation_postprocessor_gl.h
@ -0,0 +1,66 @@
+// Copyright 2023 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MEDIAPIPE_TASKS_CC_VISION_IMAGE_SEGMENTER_CALCULATORS_SEGMENTATION_POSTPROCESSOR_GL_H_
+#define MEDIAPIPE_TASKS_CC_VISION_IMAGE_SEGMENTER_CALCULATORS_SEGMENTATION_POSTPROCESSOR_GL_H_
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/formats/image.h"
+#include "mediapipe/framework/formats/tensor.h"
+#include "mediapipe/gpu/gl_calculator_helper.h"
+#include "mediapipe/tasks/cc/vision/image_segmenter/calculators/tensors_to_segmentation_calculator.pb.h"
+#include "mediapipe/tasks/cc/vision/utils/image_utils.h"
+
+namespace mediapipe {
+namespace tasks {
+
+class SegmentationPostprocessorGl {
+ public:
+  ~SegmentationPostprocessorGl();
+
+  static absl::Status UpdateContract(CalculatorContract* cc);
+
+  absl::Status Initialize(
+      CalculatorContext* cc,
+      TensorsToSegmentationCalculatorOptions const& options);
+  std::vector<std::unique_ptr<Image>> GetSegmentationResultGpu(
+      const vision::Shape& input_shape, const vision::Shape& output_shape,
+      const Tensor& tensor);
+
+ private:
+  absl::Status GlInit();
+
+  TensorsToSegmentationCalculatorOptions options_;
+  GlCalculatorHelper helper_;
+
+  // GL references (programs, buffers, uniforms)
+  GLuint activation_program_ = 0;
+  GLuint argmax_program_ = 0;
+  GLuint channel_select_program_ = 0;
+  GLuint split_program_ = 0;
+  GLuint square_vertices_ = 0;
+  GLuint texture_vertices_ = 0;
+  GLint activation_texture_uniform_;
+  GLint argmax_texture0_uniform_;
+  GLint argmax_texture1_uniform_;
+  GLint argmax_texture2_uniform_;
+  GLint channel_select_texture_uniform_;
+  GLint channel_select_index_uniform_;
+  GLint split_texture_uniform_;
+  GLint split_x_offset_uniform_;
+};
+
+}  // namespace tasks
+}  // namespace mediapipe
+
+#endif  // MEDIAPIPE_TASKS_CC_VISION_IMAGE_SEGMENTER_CALCULATORS_SEGMENTATION_POSTPROCESSOR_GL_H_
--- a/mediapipe/tasks/cc/vision/image_segmenter/calculators/tensors_to_segmentation_calculator.cc
+++ b/mediapipe/tasks/cc/vision/image_segmenter/calculators/tensors_to_segmentation_calculator.cc
@ -39,6 +39,10 @@ limitations under the License.
 #include "mediapipe/tasks/cc/vision/utils/image_utils.h"
 #include "mediapipe/util/label_map.pb.h"

+#ifdef __EMSCRIPTEN__
+#include "mediapipe/tasks/cc/vision/image_segmenter/calculators/segmentation_postprocessor_gl.h"
+#endif  // __EMSCRIPTEN__
+
 // TODO: consolidate TensorToSegmentationCalculator.
 namespace mediapipe {
 namespace tasks {
@ -118,23 +122,41 @@ class TensorsToSegmentationCalculator : public Node {
  static constexpr Output<Image>::Multiple kSegmentationOut{"SEGMENTATION"};
  MEDIAPIPE_NODE_CONTRACT(kTensorsIn, kOutputSizeIn, kSegmentationOut);

+  static absl::Status UpdateContract(CalculatorContract* cc);
+
  absl::Status Open(CalculatorContext* cc);
  absl::Status Process(CalculatorContext* cc);

 private:
-  std::vector<Image> GetSegmentationResult(const Shape& input_shape,
+  std::vector<Image> GetSegmentationResultCpu(const Shape& input_shape,
                                              const Shape& output_shape,
                                              const float* tensors_buffer);
-
  TensorsToSegmentationCalculatorOptions options_;
+
+#ifdef __EMSCRIPTEN__
+  SegmentationPostprocessorGl postprocessor_;
+#endif  // __EMSCRIPTEN__
 };

+// static
+absl::Status TensorsToSegmentationCalculator::UpdateContract(
+    CalculatorContract* cc) {
+#ifdef __EMSCRIPTEN__
+  return SegmentationPostprocessorGl::UpdateContract(cc);
+#else
+  return absl::OkStatus();
+#endif  // __EMSCRIPTEN__
+}
+
 absl::Status TensorsToSegmentationCalculator::Open(
    mediapipe::CalculatorContext* cc) {
  options_ = cc->Options<TensorsToSegmentationCalculatorOptions>();
  RET_CHECK_NE(options_.segmenter_options().output_type(),
               SegmenterOptions::UNSPECIFIED)
      << "Must specify output_type as one of [CONFIDENCE_MASK|CATEGORY_MASK].";
+#ifdef __EMSCRIPTEN__
+  MP_RETURN_IF_ERROR(postprocessor_.Initialize(cc, options_));
+#endif  // __EMSCRIPTEN__
  return absl::OkStatus();
 }

@ -167,7 +189,29 @@ absl::Status TensorsToSegmentationCalculator::Process(
          ? 1
          : input_shape.channels};

-  std::vector<Image> segmented_masks = GetSegmentationResult(
+  // Use GPU postprocessing on web when Tensor is there already and has <= 12
+  // categories.
+#ifdef __EMSCRIPTEN__
+  if (input_tensor.ready_as_opengl_texture_2d() && input_shape.channels <= 12) {
+    std::vector<std::unique_ptr<Image>> segmented_masks =
+        postprocessor_.GetSegmentationResultGpu(input_shape, output_shape,
+                                                input_tensor);
+    for (int i = 0; i < segmented_masks.size(); ++i) {
+      // Real output on GPU.
+      // kSegmentationOut(cc)[i].Send(std::move(segmented_masks[i]));
+
+      // Reformat as CPU for now for testing.
+      // TODO: Switch to real GPU output when GPU output pipeline is
+      //     ready.
+      Image new_image(segmented_masks[i]->GetImageFrameSharedPtr());
+      kSegmentationOut(cc)[i].Send(std::move(new_image));
+    }
+    return absl::OkStatus();
+  }
+#endif  // __EMSCRIPTEN__
+
+  // Otherwise, use CPU postprocessing.
+  std::vector<Image> segmented_masks = GetSegmentationResultCpu(
      input_shape, output_shape, input_tensor.GetCpuReadView().buffer<float>());
  for (int i = 0; i < segmented_masks.size(); ++i) {
    kSegmentationOut(cc)[i].Send(std::move(segmented_masks[i]));
@ -175,7 +219,7 @@ absl::Status TensorsToSegmentationCalculator::Process(
  return absl::OkStatus();
 }

-std::vector<Image> TensorsToSegmentationCalculator::GetSegmentationResult(
+std::vector<Image> TensorsToSegmentationCalculator::GetSegmentationResultCpu(
    const Shape& input_shape, const Shape& output_shape,
    const float* tensors_buffer) {
  std::function<void(absl::Span<const float> values,