Project import generated by Copybara.

GitOrigin-RevId: d0039a576e2db9c0fcefffd26a527df74cbe145b
MediaPipe Team 2020-04-21 22:12:12 -04:00 committed by chuoling
parent 024f7bf0f1
commit 7bad8fce62
45 changed files with 1566 additions and 227 deletions

View File

@ -19,6 +19,7 @@
* [Object Detection and Tracking](mediapipe/docs/object_tracking_mobile_gpu.md)
* [Objectron: 3D Object Detection and Tracking](mediapipe/docs/objectron_mobile_gpu.md)
* [AutoFlip: Intelligent Video Reframing](mediapipe/docs/autoflip.md)
* [KNIFT: Template Matching with Neural Image Features](mediapipe/docs/template_matching_mobile_cpu.md)
![face_detection](mediapipe/docs/images/mobile/face_detection_android_gpu_small.gif)
![face_mesh](mediapipe/docs/images/mobile/face_mesh_android_gpu_small.gif)
@ -29,6 +30,7 @@
![object_tracking](mediapipe/docs/images/mobile/object_tracking_android_gpu_small.gif)
![objectron_shoes](mediapipe/docs/images/mobile/objectron_shoe_android_gpu_small.gif)
![objectron_chair](mediapipe/docs/images/mobile/objectron_chair_android_gpu_small.gif)
![template_matching](mediapipe/docs/images/mobile/template_matching_android_cpu_small.gif)
## Installation
Follow these [instructions](mediapipe/docs/install.md).
@ -53,6 +55,7 @@ Search MediaPipe Github repository using [Google Open Source code search](https:
* [YouTube Channel](https://www.youtube.com/channel/UCObqmpuSMx-usADtL_qdMAw)
## Publications
* [MediaPipe KNIFT: Template-based Feature Matching](https://mediapipe.page.link/knift-blog)
* [Alfred Camera: Smart camera features using MediaPipe](https://developers.googleblog.com/2020/03/alfred-camera-smart-camera-features-using-mediapipe.html)
* [MediaPipe Objectron: Real-time 3D Object Detection on Mobile Devices](https://mediapipe.page.link/objectron-aiblog)
* [AutoFlip: An Open Source Framework for Intelligent Video Reframing](https://mediapipe.page.link/autoflip)

View File

@ -19,6 +19,11 @@
#include "mediapipe/gpu/gpu_buffer.h"
#endif // !MEDIAPIPE_DISABLE_GPU
namespace {
constexpr char kImageFrameTag[] = "IMAGE";
constexpr char kGpuBufferTag[] = "IMAGE_GPU";
} // namespace
namespace mediapipe {
// Extracts image properties from the input image and outputs the properties.
@ -40,13 +45,14 @@ namespace mediapipe {
class ImagePropertiesCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc) {
RET_CHECK(cc->Inputs().HasTag("IMAGE") ^ cc->Inputs().HasTag("IMAGE_GPU"));
if (cc->Inputs().HasTag("IMAGE")) {
cc->Inputs().Tag("IMAGE").Set<ImageFrame>();
RET_CHECK(cc->Inputs().HasTag(kImageFrameTag) ^
cc->Inputs().HasTag(kGpuBufferTag));
if (cc->Inputs().HasTag(kImageFrameTag)) {
cc->Inputs().Tag(kImageFrameTag).Set<ImageFrame>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Inputs().HasTag("IMAGE_GPU")) {
cc->Inputs().Tag("IMAGE_GPU").Set<::mediapipe::GpuBuffer>();
if (cc->Inputs().HasTag(kGpuBufferTag)) {
cc->Inputs().Tag(kGpuBufferTag).Set<::mediapipe::GpuBuffer>();
}
#endif // !MEDIAPIPE_DISABLE_GPU
@ -66,16 +72,17 @@ class ImagePropertiesCalculator : public CalculatorBase {
int width;
int height;
if (cc->Inputs().HasTag("IMAGE") && !cc->Inputs().Tag("IMAGE").IsEmpty()) {
const auto& image = cc->Inputs().Tag("IMAGE").Get<ImageFrame>();
if (cc->Inputs().HasTag(kImageFrameTag) &&
!cc->Inputs().Tag(kImageFrameTag).IsEmpty()) {
const auto& image = cc->Inputs().Tag(kImageFrameTag).Get<ImageFrame>();
width = image.Width();
height = image.Height();
}
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Inputs().HasTag("IMAGE_GPU") &&
!cc->Inputs().Tag("IMAGE_GPU").IsEmpty()) {
if (cc->Inputs().HasTag(kGpuBufferTag) &&
!cc->Inputs().Tag(kGpuBufferTag).IsEmpty()) {
const auto& image =
cc->Inputs().Tag("IMAGE_GPU").Get<mediapipe::GpuBuffer>();
cc->Inputs().Tag(kGpuBufferTag).Get<mediapipe::GpuBuffer>();
width = image.width();
height = image.height();
}
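For orientation, here is a minimal sketch (not part of this commit) of wiring this calculator's CPU path into a graph; the SIZE output tag and the parse helper are assumptions based on typical MediaPipe usage and are not shown in this diff:

```cpp
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/parse_text_proto.h"

// Hypothetical graph snippet: an ImageFrame stream feeds the IMAGE tag
// (a GpuBuffer stream would use IMAGE_GPU instead, never both).
mediapipe::CalculatorGraphConfig ImagePropertiesConfig() {
  return mediapipe::ParseTextProtoOrDie<mediapipe::CalculatorGraphConfig>(R"pb(
    input_stream: "input_video"
    node {
      calculator: "ImagePropertiesCalculator"
      input_stream: "IMAGE:input_video"
      output_stream: "SIZE:image_size"  # assumed output tag
    }
  )pb");
}
```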

View File

@ -47,6 +47,9 @@ namespace mediapipe {
#endif // !MEDIAPIPE_DISABLE_GPU
namespace {
constexpr char kImageFrameTag[] = "IMAGE";
constexpr char kGpuBufferTag[] = "IMAGE_GPU";
int RotationModeToDegrees(mediapipe::RotationMode_Mode rotation) {
switch (rotation) {
case mediapipe::RotationMode_Mode_UNKNOWN:
@ -95,7 +98,7 @@ mediapipe::ScaleMode_Mode ParseScaleMode(
// Scales, rotates, and flips images horizontally or vertically.
//
// Input:
// One of the following two tags:
// One of the following tags:
// IMAGE: ImageFrame representing the input image.
// IMAGE_GPU: GpuBuffer representing the input image.
//
@ -113,7 +116,7 @@ mediapipe::ScaleMode_Mode ParseScaleMode(
// corresponding field in the calculator options.
//
// Output:
// One of the following two tags:
// One of the following tags:
// IMAGE - ImageFrame representing the output image.
// IMAGE_GPU - GpuBuffer representing the output image.
//
@ -152,7 +155,8 @@ mediapipe::ScaleMode_Mode ParseScaleMode(
// Note: To enable horizontal or vertical flipping, specify them in the
// calculator options. Flipping is applied after rotation.
//
// Note: Only scale mode STRETCH is currently supported on CPU.
// Note: Input defines output, so only matching types are supported:
// IMAGE -> IMAGE or IMAGE_GPU -> IMAGE_GPU
//
class ImageTransformationCalculator : public CalculatorBase {
public:
@ -186,7 +190,7 @@ class ImageTransformationCalculator : public CalculatorBase {
bool use_gpu_ = false;
#if !defined(MEDIAPIPE_DISABLE_GPU)
GlCalculatorHelper helper_;
GlCalculatorHelper gpu_helper_;
std::unique_ptr<QuadRenderer> rgb_renderer_;
std::unique_ptr<QuadRenderer> yuv_renderer_;
std::unique_ptr<QuadRenderer> ext_rgb_renderer_;
@ -197,21 +201,22 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
// static
::mediapipe::Status ImageTransformationCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(cc->Inputs().HasTag("IMAGE") ^ cc->Inputs().HasTag("IMAGE_GPU"));
RET_CHECK(cc->Outputs().HasTag("IMAGE") ^ cc->Outputs().HasTag("IMAGE_GPU"));
// Only one input can be set, and the output type must match.
RET_CHECK(cc->Inputs().HasTag(kImageFrameTag) ^
cc->Inputs().HasTag(kGpuBufferTag));
bool use_gpu = false;
if (cc->Inputs().HasTag("IMAGE")) {
RET_CHECK(cc->Outputs().HasTag("IMAGE"));
cc->Inputs().Tag("IMAGE").Set<ImageFrame>();
cc->Outputs().Tag("IMAGE").Set<ImageFrame>();
if (cc->Inputs().HasTag(kImageFrameTag)) {
RET_CHECK(cc->Outputs().HasTag(kImageFrameTag));
cc->Inputs().Tag(kImageFrameTag).Set<ImageFrame>();
cc->Outputs().Tag(kImageFrameTag).Set<ImageFrame>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Inputs().HasTag("IMAGE_GPU")) {
RET_CHECK(cc->Outputs().HasTag("IMAGE_GPU"));
cc->Inputs().Tag("IMAGE_GPU").Set<GpuBuffer>();
cc->Outputs().Tag("IMAGE_GPU").Set<GpuBuffer>();
if (cc->Inputs().HasTag(kGpuBufferTag)) {
RET_CHECK(cc->Outputs().HasTag(kGpuBufferTag));
cc->Inputs().Tag(kGpuBufferTag).Set<GpuBuffer>();
cc->Outputs().Tag(kGpuBufferTag).Set<GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
@ -259,7 +264,7 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
options_ = cc->Options<ImageTransformationCalculatorOptions>();
if (cc->Inputs().HasTag("IMAGE_GPU")) {
if (cc->Inputs().HasTag(kGpuBufferTag)) {
use_gpu_ = true;
}
@ -300,7 +305,7 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
if (use_gpu_) {
#if !defined(MEDIAPIPE_DISABLE_GPU)
// Let the helper access the GL context information.
MP_RETURN_IF_ERROR(helper_.Open(cc));
MP_RETURN_IF_ERROR(gpu_helper_.Open(cc));
#else
RET_CHECK_FAIL() << "GPU processing not enabled.";
#endif // !MEDIAPIPE_DISABLE_GPU
@ -328,18 +333,14 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
if (use_gpu_) {
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Inputs().Tag("IMAGE_GPU").IsEmpty()) {
// Image is missing, hence no way to produce output image. (Timestamp
// bound will be updated automatically.)
if (cc->Inputs().Tag(kGpuBufferTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
return helper_.RunInGlContext(
return gpu_helper_.RunInGlContext(
[this, cc]() -> ::mediapipe::Status { return RenderGpu(cc); });
#endif // !MEDIAPIPE_DISABLE_GPU
} else {
if (cc->Inputs().Tag("IMAGE").IsEmpty()) {
// Image is missing, hence no way to produce output image. (Timestamp
// bound will be updated automatically.)
if (cc->Inputs().Tag(kImageFrameTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
return RenderCpu(cc);
@ -354,7 +355,7 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
QuadRenderer* rgb_renderer = rgb_renderer_.release();
QuadRenderer* yuv_renderer = yuv_renderer_.release();
QuadRenderer* ext_rgb_renderer = ext_rgb_renderer_.release();
helper_.RunInGlContext([rgb_renderer, yuv_renderer, ext_rgb_renderer] {
gpu_helper_.RunInGlContext([rgb_renderer, yuv_renderer, ext_rgb_renderer] {
if (rgb_renderer) {
rgb_renderer->GlTeardown();
delete rgb_renderer;
@ -376,17 +377,21 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
::mediapipe::Status ImageTransformationCalculator::RenderCpu(
CalculatorContext* cc) {
const auto& input_img = cc->Inputs().Tag("IMAGE").Get<ImageFrame>();
cv::Mat input_mat = formats::MatView(&input_img);
cv::Mat scaled_mat;
cv::Mat input_mat;
mediapipe::ImageFormat::Format format;
const int input_width = input_img.Width();
const int input_height = input_img.Height();
const auto& input = cc->Inputs().Tag(kImageFrameTag).Get<ImageFrame>();
input_mat = formats::MatView(&input);
format = input.Format();
const int input_width = input_mat.cols;
const int input_height = input_mat.rows;
if (!output_height_ || !output_width_) {
output_height_ = input_height;
output_width_ = input_width;
}
cv::Mat scaled_mat;
if (scale_mode_ == mediapipe::ScaleMode_Mode_STRETCH) {
cv::resize(input_mat, scaled_mat, cv::Size(output_width_, output_height_));
} else {
@ -443,10 +448,12 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
}
std::unique_ptr<ImageFrame> output_frame(
new ImageFrame(input_img.Format(), output_width, output_height));
new ImageFrame(format, output_width, output_height));
cv::Mat output_mat = formats::MatView(output_frame.get());
flipped_mat.copyTo(output_mat);
cc->Outputs().Tag("IMAGE").Add(output_frame.release(), cc->InputTimestamp());
cc->Outputs()
.Tag(kImageFrameTag)
.Add(output_frame.release(), cc->InputTimestamp());
return ::mediapipe::OkStatus();
}
@ -454,7 +461,7 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
::mediapipe::Status ImageTransformationCalculator::RenderGpu(
CalculatorContext* cc) {
#if !defined(MEDIAPIPE_DISABLE_GPU)
const auto& input = cc->Inputs().Tag("IMAGE_GPU").Get<GpuBuffer>();
const auto& input = cc->Inputs().Tag(kGpuBufferTag).Get<GpuBuffer>();
const int input_width = input.width();
const int input_height = input.height();
@ -485,11 +492,11 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
{"video_frame_y", "video_frame_uv"}));
}
renderer = yuv_renderer_.get();
src1 = helper_.CreateSourceTexture(input, 0);
src1 = gpu_helper_.CreateSourceTexture(input, 0);
} else // NOLINT(readability/braces)
#endif // iOS
{
src1 = helper_.CreateSourceTexture(input);
src1 = gpu_helper_.CreateSourceTexture(input);
#if defined(TEXTURE_EXTERNAL_OES)
if (src1.target() == GL_TEXTURE_EXTERNAL_OES) {
if (!ext_rgb_renderer_) {
@ -515,10 +522,10 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
mediapipe::FrameRotation rotation =
mediapipe::FrameRotationFromDegrees(RotationModeToDegrees(rotation_));
auto dst = helper_.CreateDestinationTexture(output_width, output_height,
input.format());
auto dst = gpu_helper_.CreateDestinationTexture(output_width, output_height,
input.format());
helper_.BindFramebuffer(dst); // GL_TEXTURE0
gpu_helper_.BindFramebuffer(dst); // GL_TEXTURE0
glActiveTexture(GL_TEXTURE1);
glBindTexture(src1.target(), src1.name());
@ -533,8 +540,8 @@ REGISTER_CALCULATOR(ImageTransformationCalculator);
// Execute GL commands, before getting result.
glFlush();
auto output = dst.GetFrame<GpuBuffer>();
cc->Outputs().Tag("IMAGE_GPU").Add(output.release(), cc->InputTimestamp());
auto output = dst.template GetFrame<GpuBuffer>();
cc->Outputs().Tag(kGpuBufferTag).Add(output.release(), cc->InputTimestamp());
#endif // !MEDIAPIPE_DISABLE_GPU
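To illustrate the matching-tags rule noted above, here is a hedged sketch of a CPU-only node (IMAGE in, IMAGE out); the option field names are assumptions based on ImageTransformationCalculatorOptions, which is not shown in this diff:

```cpp
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/parse_text_proto.h"

// Hypothetical usage: input defines output, so IMAGE pairs with IMAGE and
// IMAGE_GPU would pair with IMAGE_GPU.
mediapipe::CalculatorGraphConfig ImageTransformationConfig() {
  return mediapipe::ParseTextProtoOrDie<mediapipe::CalculatorGraphConfig>(R"pb(
    input_stream: "input_video"
    node {
      calculator: "ImageTransformationCalculator"
      input_stream: "IMAGE:input_video"
      output_stream: "IMAGE:transformed_video"
      options {
        [mediapipe.ImageTransformationCalculatorOptions.ext] {
          output_width: 256
          output_height: 256
          scale_mode: STRETCH
        }
      }
    }
  )pb");
}
```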

View File

@ -32,6 +32,11 @@
namespace {
enum { ATTRIB_VERTEX, ATTRIB_TEXTURE_POSITION, NUM_ATTRIBUTES };
constexpr char kImageFrameTag[] = "IMAGE";
constexpr char kMaskCpuTag[] = "MASK";
constexpr char kGpuBufferTag[] = "IMAGE_GPU";
constexpr char kMaskGpuTag[] = "MASK_GPU";
} // namespace
namespace mediapipe {
@ -112,39 +117,41 @@ REGISTER_CALCULATOR(RecolorCalculator);
bool use_gpu = false;
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Inputs().HasTag("IMAGE_GPU")) {
cc->Inputs().Tag("IMAGE_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Inputs().HasTag(kGpuBufferTag)) {
cc->Inputs().Tag(kGpuBufferTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
if (cc->Inputs().HasTag("IMAGE")) {
cc->Inputs().Tag("IMAGE").Set<ImageFrame>();
if (cc->Inputs().HasTag(kImageFrameTag)) {
cc->Inputs().Tag(kImageFrameTag).Set<ImageFrame>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Inputs().HasTag("MASK_GPU")) {
cc->Inputs().Tag("MASK_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Inputs().HasTag(kMaskGpuTag)) {
cc->Inputs().Tag(kMaskGpuTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
if (cc->Inputs().HasTag("MASK")) {
cc->Inputs().Tag("MASK").Set<ImageFrame>();
if (cc->Inputs().HasTag(kMaskCpuTag)) {
cc->Inputs().Tag(kMaskCpuTag).Set<ImageFrame>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU)
if (cc->Outputs().HasTag("IMAGE_GPU")) {
cc->Outputs().Tag("IMAGE_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Outputs().HasTag(kGpuBufferTag)) {
cc->Outputs().Tag(kGpuBufferTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
if (cc->Outputs().HasTag("IMAGE")) {
cc->Outputs().Tag("IMAGE").Set<ImageFrame>();
if (cc->Outputs().HasTag(kImageFrameTag)) {
cc->Outputs().Tag(kImageFrameTag).Set<ImageFrame>();
}
// Confirm only one of the input streams is present.
RET_CHECK(cc->Inputs().HasTag("IMAGE") ^ cc->Inputs().HasTag("IMAGE_GPU"));
RET_CHECK(cc->Inputs().HasTag(kImageFrameTag) ^
cc->Inputs().HasTag(kGpuBufferTag));
// Confirm only one of the output streams is present.
RET_CHECK(cc->Outputs().HasTag("IMAGE") ^ cc->Outputs().HasTag("IMAGE_GPU"));
RET_CHECK(cc->Outputs().HasTag(kImageFrameTag) ^
cc->Outputs().HasTag(kGpuBufferTag));
if (use_gpu) {
#if !defined(MEDIAPIPE_DISABLE_GPU)
@ -158,7 +165,7 @@ REGISTER_CALCULATOR(RecolorCalculator);
::mediapipe::Status RecolorCalculator::Open(CalculatorContext* cc) {
cc->SetOffset(TimestampDiff(0));
if (cc->Inputs().HasTag("IMAGE_GPU")) {
if (cc->Inputs().HasTag(kGpuBufferTag)) {
use_gpu_ = true;
#if !defined(MEDIAPIPE_DISABLE_GPU)
MP_RETURN_IF_ERROR(gpu_helper_.Open(cc));
@ -201,12 +208,12 @@ REGISTER_CALCULATOR(RecolorCalculator);
}
::mediapipe::Status RecolorCalculator::RenderCpu(CalculatorContext* cc) {
if (cc->Inputs().Tag("MASK").IsEmpty()) {
if (cc->Inputs().Tag(kMaskCpuTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
// Get inputs and setup output.
const auto& input_img = cc->Inputs().Tag("IMAGE").Get<ImageFrame>();
const auto& mask_img = cc->Inputs().Tag("MASK").Get<ImageFrame>();
const auto& input_img = cc->Inputs().Tag(kImageFrameTag).Get<ImageFrame>();
const auto& mask_img = cc->Inputs().Tag(kMaskCpuTag).Get<ImageFrame>();
cv::Mat input_mat = formats::MatView(&input_img);
cv::Mat mask_mat = formats::MatView(&mask_img);
@ -254,19 +261,21 @@ REGISTER_CALCULATOR(RecolorCalculator);
}
}
cc->Outputs().Tag("IMAGE").Add(output_img.release(), cc->InputTimestamp());
cc->Outputs()
.Tag(kImageFrameTag)
.Add(output_img.release(), cc->InputTimestamp());
return ::mediapipe::OkStatus();
}
::mediapipe::Status RecolorCalculator::RenderGpu(CalculatorContext* cc) {
if (cc->Inputs().Tag("MASK_GPU").IsEmpty()) {
if (cc->Inputs().Tag(kMaskGpuTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
#if !defined(MEDIAPIPE_DISABLE_GPU)
// Get inputs and setup output.
const Packet& input_packet = cc->Inputs().Tag("IMAGE_GPU").Value();
const Packet& mask_packet = cc->Inputs().Tag("MASK_GPU").Value();
const Packet& input_packet = cc->Inputs().Tag(kGpuBufferTag).Value();
const Packet& mask_packet = cc->Inputs().Tag(kMaskGpuTag).Value();
const auto& input_buffer = input_packet.Get<mediapipe::GpuBuffer>();
const auto& mask_buffer = mask_packet.Get<mediapipe::GpuBuffer>();
@ -296,7 +305,7 @@ REGISTER_CALCULATOR(RecolorCalculator);
// Send result image in GPU packet.
auto output = dst_tex.GetFrame<mediapipe::GpuBuffer>();
cc->Outputs().Tag("IMAGE_GPU").Add(output.release(), cc->InputTimestamp());
cc->Outputs().Tag(kGpuBufferTag).Add(output.release(), cc->InputTimestamp());
// Cleanup
img_tex.Release();

View File

@ -243,6 +243,7 @@ cc_library(
"@org_tensorflow//tensorflow/lite/delegates/gpu:metal_delegate_internal",
],
"//conditions:default": [
"//mediapipe/util/tflite:tflite_gpu_runner",
"//mediapipe/gpu:gl_calculator_helper",
"//mediapipe/gpu:gpu_buffer",
"@org_tensorflow//tensorflow/lite/delegates/gpu/common:shape",

View File

@ -63,6 +63,10 @@ typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>
ColMajorMatrixXf;
constexpr char kImageFrameTag[] = "IMAGE";
constexpr char kGpuBufferTag[] = "IMAGE_GPU";
constexpr char kTensorsTag[] = "TENSORS";
constexpr char kTensorsGpuTag[] = "TENSORS_GPU";
} // namespace
namespace mediapipe {
@ -124,6 +128,9 @@ struct GPUData {
// GPU tensors are currently only supported on mobile platforms.
// This calculator uses FixedSizeInputStreamHandler by default.
//
// Note: Input defines output, so only these type sets are supported:
// IMAGE -> TENSORS | IMAGE_GPU -> TENSORS_GPU | MATRIX -> TENSORS
//
class TfLiteConverterCalculator : public CalculatorBase {
public:
static ::mediapipe::Status GetContract(CalculatorContract* cc);
@ -138,9 +145,9 @@ class TfLiteConverterCalculator : public CalculatorBase {
template <class T>
::mediapipe::Status NormalizeImage(const ImageFrame& image_frame,
bool zero_center, bool flip_vertically,
float* tensor_buffer);
float* tensor_ptr);
::mediapipe::Status CopyMatrixToTensor(const Matrix& matrix,
float* tensor_buffer);
float* tensor_ptr);
::mediapipe::Status ProcessCPU(CalculatorContext* cc);
::mediapipe::Status ProcessGPU(CalculatorContext* cc);
@ -166,33 +173,35 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
::mediapipe::Status TfLiteConverterCalculator::GetContract(
CalculatorContract* cc) {
const bool has_image_tag = cc->Inputs().HasTag("IMAGE");
const bool has_image_gpu_tag = cc->Inputs().HasTag("IMAGE_GPU");
const bool has_matrix_tag = cc->Inputs().HasTag("MATRIX");
// Confirm only one of the input streams is present.
RET_CHECK(has_image_tag ^ has_image_gpu_tag ^ has_matrix_tag &&
!(has_image_tag && has_image_gpu_tag && has_matrix_tag));
RET_CHECK(cc->Inputs().HasTag(kImageFrameTag) ^
cc->Inputs().HasTag(kGpuBufferTag) ^ cc->Inputs().HasTag("MATRIX"));
// Confirm only one of the output streams is present.
RET_CHECK(cc->Outputs().HasTag("TENSORS") ^
cc->Outputs().HasTag("TENSORS_GPU"));
RET_CHECK(cc->Outputs().HasTag(kTensorsTag) ^
cc->Outputs().HasTag(kTensorsGpuTag));
bool use_gpu = false;
if (cc->Inputs().HasTag("IMAGE")) cc->Inputs().Tag("IMAGE").Set<ImageFrame>();
if (cc->Inputs().HasTag("MATRIX")) cc->Inputs().Tag("MATRIX").Set<Matrix>();
if (cc->Inputs().HasTag(kImageFrameTag)) {
cc->Inputs().Tag(kImageFrameTag).Set<ImageFrame>();
}
if (cc->Inputs().HasTag("MATRIX")) {
cc->Inputs().Tag("MATRIX").Set<Matrix>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
if (cc->Inputs().HasTag("IMAGE_GPU")) {
cc->Inputs().Tag("IMAGE_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Inputs().HasTag(kGpuBufferTag)) {
cc->Inputs().Tag(kGpuBufferTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
if (cc->Outputs().HasTag("TENSORS"))
cc->Outputs().Tag("TENSORS").Set<std::vector<TfLiteTensor>>();
if (cc->Outputs().HasTag(kTensorsTag)) {
cc->Outputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
if (cc->Outputs().HasTag("TENSORS_GPU")) {
cc->Outputs().Tag("TENSORS_GPU").Set<std::vector<GpuTensor>>();
if (cc->Outputs().HasTag(kTensorsGpuTag)) {
cc->Outputs().Tag(kTensorsGpuTag).Set<std::vector<GpuTensor>>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
@ -216,8 +225,8 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
MP_RETURN_IF_ERROR(LoadOptions(cc));
if (cc->Inputs().HasTag("IMAGE_GPU") ||
cc->Outputs().HasTag("IMAGE_OUT_GPU")) {
if (cc->Inputs().HasTag(kGpuBufferTag) ||
cc->Outputs().HasTag(kGpuBufferTag)) {
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
use_gpu_ = true;
#else
@ -227,8 +236,8 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
if (use_gpu_) {
// Cannot mix CPU/GPU streams.
RET_CHECK(cc->Inputs().HasTag("IMAGE_GPU") &&
cc->Outputs().HasTag("TENSORS_GPU"));
RET_CHECK(cc->Inputs().HasTag(kGpuBufferTag) &&
cc->Outputs().HasTag(kTensorsGpuTag));
// Cannot use quantization.
use_quantized_tensors_ = false;
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
@ -248,7 +257,6 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
::mediapipe::Status TfLiteConverterCalculator::Process(CalculatorContext* cc) {
if (use_gpu_) {
// GpuBuffer to tflite::gpu::GlBuffer conversion.
if (!initialized_) {
MP_RETURN_IF_ERROR(InitGpu(cc));
initialized_ = true;
@ -259,7 +267,6 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
// Convert to CPU tensors or Matrix type.
MP_RETURN_IF_ERROR(ProcessCPU(cc));
}
return ::mediapipe::OkStatus();
}
@ -275,24 +282,26 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
::mediapipe::Status TfLiteConverterCalculator::ProcessCPU(
CalculatorContext* cc) {
if (cc->Inputs().HasTag("IMAGE")) {
if (cc->Inputs().HasTag(kImageFrameTag)) {
// CPU ImageFrame to TfLiteTensor conversion.
const auto& image_frame = cc->Inputs().Tag("IMAGE").Get<ImageFrame>();
const auto& image_frame =
cc->Inputs().Tag(kImageFrameTag).Get<ImageFrame>();
const int height = image_frame.Height();
const int width = image_frame.Width();
const int channels = image_frame.NumberOfChannels();
const int channels_preserved = std::min(channels, max_num_channels_);
const mediapipe::ImageFormat::Format format = image_frame.Format();
if (!initialized_) {
if (!(image_frame.Format() == mediapipe::ImageFormat::SRGBA ||
image_frame.Format() == mediapipe::ImageFormat::SRGB ||
image_frame.Format() == mediapipe::ImageFormat::GRAY8 ||
image_frame.Format() == mediapipe::ImageFormat::VEC32F1))
if (!(format == mediapipe::ImageFormat::SRGBA ||
format == mediapipe::ImageFormat::SRGB ||
format == mediapipe::ImageFormat::GRAY8 ||
format == mediapipe::ImageFormat::VEC32F1))
RET_CHECK_FAIL() << "Unsupported CPU input format.";
TfLiteQuantization quant;
if (use_quantized_tensors_) {
RET_CHECK(image_frame.Format() != mediapipe::ImageFormat::VEC32F1)
RET_CHECK(format != mediapipe::ImageFormat::VEC32F1)
<< "Only 8-bit input images are supported for quantization.";
quant.type = kTfLiteAffineQuantization;
quant.params = nullptr;
@ -349,8 +358,9 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
auto output_tensors = absl::make_unique<std::vector<TfLiteTensor>>();
output_tensors->emplace_back(*tensor);
cc->Outputs().Tag("TENSORS").Add(output_tensors.release(),
cc->InputTimestamp());
cc->Outputs()
.Tag(kTensorsTag)
.Add(output_tensors.release(), cc->InputTimestamp());
} else if (cc->Inputs().HasTag("MATRIX")) {
// CPU Matrix to TfLiteTensor conversion.
@ -371,15 +381,16 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
interpreter_->ResizeInputTensor(tensor_idx, {height, width, channels});
interpreter_->AllocateTensors();
float* tensor_buffer = tensor->data.f;
RET_CHECK(tensor_buffer);
float* tensor_ptr = tensor->data.f;
RET_CHECK(tensor_ptr);
MP_RETURN_IF_ERROR(CopyMatrixToTensor(matrix, tensor_buffer));
MP_RETURN_IF_ERROR(CopyMatrixToTensor(matrix, tensor_ptr));
auto output_tensors = absl::make_unique<std::vector<TfLiteTensor>>();
output_tensors->emplace_back(*tensor);
cc->Outputs().Tag("TENSORS").Add(output_tensors.release(),
cc->InputTimestamp());
cc->Outputs()
.Tag(kTensorsTag)
.Add(output_tensors.release(), cc->InputTimestamp());
}
return ::mediapipe::OkStatus();
@ -389,7 +400,8 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
CalculatorContext* cc) {
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
// GpuBuffer to tflite::gpu::GlBuffer conversion.
const auto& input = cc->Inputs().Tag("IMAGE_GPU").Get<mediapipe::GpuBuffer>();
const auto& input =
cc->Inputs().Tag(kGpuBufferTag).Get<mediapipe::GpuBuffer>();
MP_RETURN_IF_ERROR(
gpu_helper_.RunInGlContext([this, &input]() -> ::mediapipe::Status {
// Convert GL texture into TfLite GlBuffer (SSBO).
@ -421,11 +433,12 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
return ::mediapipe::OkStatus();
}));
cc->Outputs()
.Tag("TENSORS_GPU")
.Tag(kTensorsGpuTag)
.Add(output_tensors.release(), cc->InputTimestamp());
#elif defined(MEDIAPIPE_IOS)
// GpuBuffer to id<MTLBuffer> conversion.
const auto& input = cc->Inputs().Tag("IMAGE_GPU").Get<mediapipe::GpuBuffer>();
const auto& input =
cc->Inputs().Tag(kGpuBufferTag).Get<mediapipe::GpuBuffer>();
id<MTLCommandBuffer> command_buffer = [gpu_helper_ commandBuffer];
id<MTLTexture> src_texture = [gpu_helper_ metalTextureWithGpuBuffer:input];
@ -457,7 +470,7 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
commandBuffer:command_buffer];
cc->Outputs()
.Tag("TENSORS_GPU")
.Tag(kTensorsGpuTag)
.Add(output_tensors.release(), cc->InputTimestamp());
#else
RET_CHECK_FAIL() << "GPU processing is not enabled.";
@ -469,7 +482,8 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
::mediapipe::Status TfLiteConverterCalculator::InitGpu(CalculatorContext* cc) {
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
// Get input image sizes.
const auto& input = cc->Inputs().Tag("IMAGE_GPU").Get<mediapipe::GpuBuffer>();
const auto& input =
cc->Inputs().Tag(kGpuBufferTag).Get<mediapipe::GpuBuffer>();
mediapipe::ImageFormat::Format format =
mediapipe::ImageFormatForGpuBufferFormat(input.format());
gpu_data_out_ = absl::make_unique<GPUData>();
@ -612,7 +626,7 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
CHECK_LE(max_num_channels_, 4);
CHECK_NE(max_num_channels_, 2);
#if defined(MEDIAPIPE_IOS)
if (cc->Inputs().HasTag("IMAGE_GPU"))
if (cc->Inputs().HasTag(kGpuBufferTag))
// Currently on iOS, tflite gpu input tensor must be 4 channels,
// so input image must be 4 channels also (checked in InitGpu).
max_num_channels_ = 4;
@ -627,7 +641,7 @@ REGISTER_CALCULATOR(TfLiteConverterCalculator);
template <class T>
::mediapipe::Status TfLiteConverterCalculator::NormalizeImage(
const ImageFrame& image_frame, bool zero_center, bool flip_vertically,
float* tensor_buffer) {
float* tensor_ptr) {
const int height = image_frame.Height();
const int width = image_frame.Width();
const int channels = image_frame.NumberOfChannels();
@ -651,7 +665,7 @@ template <class T>
(flip_vertically ? height - 1 - i : i) * image_frame.WidthStep());
for (int j = 0; j < width; ++j) {
for (int c = 0; c < channels_preserved; ++c) {
*tensor_buffer++ = *image_ptr++ / div - sub;
*tensor_ptr++ = *image_ptr++ / div - sub;
}
image_ptr += channels_ignored;
}
@ -661,14 +675,14 @@ template <class T>
}
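As a worked example of the per-pixel normalization above: the div and sub values come from the calculator options (not shown in this hunk); a common assumption for an 8-bit input is div = 127.5, sub = 1 with zero-centering (mapping [0, 255] to [-1, 1]) and div = 255, sub = 0 without (mapping to [0, 1]).

```cpp
#include <cstdint>

// Sketch only: assumed div/sub pairs for an 8-bit channel; the real values
// are configured via TfLiteConverterCalculatorOptions.
float NormalizePixel(uint8_t v, bool zero_center) {
  const float div = zero_center ? 127.5f : 255.0f;
  const float sub = zero_center ? 1.0f : 0.0f;
  return v / div - sub;
}
// NormalizePixel(0, true)    == -1.0f
// NormalizePixel(255, true)  ==  1.0f
// NormalizePixel(255, false) ==  1.0f
```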
::mediapipe::Status TfLiteConverterCalculator::CopyMatrixToTensor(
const Matrix& matrix, float* tensor_buffer) {
const Matrix& matrix, float* tensor_ptr) {
if (row_major_matrix_) {
auto matrix_map = Eigen::Map<RowMajorMatrixXf>(tensor_buffer, matrix.rows(),
matrix.cols());
auto matrix_map =
Eigen::Map<RowMajorMatrixXf>(tensor_ptr, matrix.rows(), matrix.cols());
matrix_map = matrix;
} else {
auto matrix_map = Eigen::Map<ColMajorMatrixXf>(tensor_buffer, matrix.rows(),
matrix.cols());
auto matrix_map =
Eigen::Map<ColMajorMatrixXf>(tensor_ptr, matrix.rows(), matrix.cols());
matrix_map = matrix;
}
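A self-contained sketch of the Eigen::Map copy used here: the same flat float buffer is filled in row-major or column-major order depending on the flag, which is what the row_major_matrix_ option controls above.

```cpp
#include "Eigen/Core"

// Copies a matrix into a caller-provided buffer with the requested layout.
void CopyToBuffer(const Eigen::MatrixXf& m, float* out, bool row_major) {
  using RowMajor =
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  using ColMajor =
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
  if (row_major) {
    Eigen::Map<RowMajor>(out, m.rows(), m.cols()) = m;
  } else {
    Eigen::Map<ColMajor>(out, m.rows(), m.cols()) = m;
  }
}
```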

View File

@ -36,6 +36,7 @@
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
#include "mediapipe/gpu/gl_calculator_helper.h"
#include "mediapipe/gpu/gpu_buffer.h"
#include "mediapipe/util/tflite/tflite_gpu_runner.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"
@ -75,6 +76,9 @@ typedef id<MTLBuffer> GpuTensor;
// Round up n to next multiple of m.
size_t RoundUp(size_t n, size_t m) { return ((n + m - 1) / m) * m; } // NOLINT
constexpr char kTensorsTag[] = "TENSORS";
constexpr char kTensorsGpuTag[] = "TENSORS_GPU";
} // namespace
#if defined(MEDIAPIPE_EDGE_TPU)
@ -219,6 +223,7 @@ class TfLiteInferenceCalculator : public CalculatorBase {
::mediapipe::Status LoadModel(CalculatorContext* cc);
::mediapipe::StatusOr<Packet> GetModelAsPacket(const CalculatorContext& cc);
::mediapipe::Status LoadDelegate(CalculatorContext* cc);
::mediapipe::Status InitTFLiteGPURunner();
Packet model_packet_;
std::unique_ptr<tflite::Interpreter> interpreter_;
@ -228,6 +233,7 @@ class TfLiteInferenceCalculator : public CalculatorBase {
mediapipe::GlCalculatorHelper gpu_helper_;
std::vector<std::unique_ptr<GPUData>> gpu_data_in_;
std::vector<std::unique_ptr<GPUData>> gpu_data_out_;
std::unique_ptr<tflite::gpu::TFLiteGPURunner> tflite_gpu_runner_;
#elif defined(MEDIAPIPE_IOS)
MPPMetalHelper* gpu_helper_ = nullptr;
std::vector<std::unique_ptr<GPUData>> gpu_data_in_;
@ -245,6 +251,8 @@ class TfLiteInferenceCalculator : public CalculatorBase {
bool gpu_input_ = false;
bool gpu_output_ = false;
bool use_quantized_tensors_ = false;
bool use_advanced_gpu_api_ = false;
};
REGISTER_CALCULATOR(TfLiteInferenceCalculator);
@ -252,10 +260,10 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
::mediapipe::Status TfLiteInferenceCalculator::GetContract(
CalculatorContract* cc) {
RET_CHECK(cc->Inputs().HasTag("TENSORS") ^
cc->Inputs().HasTag("TENSORS_GPU"));
RET_CHECK(cc->Outputs().HasTag("TENSORS") ^
cc->Outputs().HasTag("TENSORS_GPU"));
RET_CHECK(cc->Inputs().HasTag(kTensorsTag) ^
cc->Inputs().HasTag(kTensorsGpuTag));
RET_CHECK(cc->Outputs().HasTag(kTensorsTag) ^
cc->Outputs().HasTag(kTensorsGpuTag));
const auto& options =
cc->Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
@ -266,26 +274,26 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
bool use_gpu =
options.has_delegate() ? options.delegate().has_gpu() : options.use_gpu();
if (cc->Inputs().HasTag("TENSORS"))
cc->Inputs().Tag("TENSORS").Set<std::vector<TfLiteTensor>>();
if (cc->Inputs().HasTag(kTensorsTag))
cc->Inputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
if (cc->Inputs().HasTag("TENSORS_GPU")) {
if (cc->Inputs().HasTag(kTensorsGpuTag)) {
RET_CHECK(!options.has_delegate() || options.delegate().has_gpu())
<< "GPU input is compatible with GPU delegate only.";
cc->Inputs().Tag("TENSORS_GPU").Set<std::vector<GpuTensor>>();
cc->Inputs().Tag(kTensorsGpuTag).Set<std::vector<GpuTensor>>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
if (cc->Outputs().HasTag("TENSORS"))
cc->Outputs().Tag("TENSORS").Set<std::vector<TfLiteTensor>>();
if (cc->Outputs().HasTag(kTensorsTag))
cc->Outputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
if (cc->Outputs().HasTag("TENSORS_GPU")) {
if (cc->Outputs().HasTag(kTensorsGpuTag)) {
RET_CHECK(!options.has_delegate() || options.delegate().has_gpu())
<< "GPU output is compatible with GPU delegate only.";
cc->Outputs().Tag("TENSORS_GPU").Set<std::vector<GpuTensor>>();
cc->Outputs().Tag(kTensorsGpuTag).Set<std::vector<GpuTensor>>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
@ -320,27 +328,31 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
cc->Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
gpu_inference_ = options.use_gpu();
if (cc->Inputs().HasTag("TENSORS_GPU")) {
if (cc->Inputs().HasTag(kTensorsGpuTag)) {
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
gpu_input_ = true;
gpu_inference_ = true; // Inference must be on GPU also.
#else
RET_CHECK(!cc->Inputs().HasTag("TENSORS_GPU"))
RET_CHECK(!cc->Inputs().HasTag(kTensorsGpuTag))
<< "GPU processing not enabled.";
#endif // !MEDIAPIPE_DISABLE_GPU
}
if (cc->Outputs().HasTag("TENSORS_GPU")) {
if (cc->Outputs().HasTag(kTensorsGpuTag)) {
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
gpu_output_ = true;
RET_CHECK(cc->Inputs().HasTag("TENSORS_GPU"))
RET_CHECK(cc->Inputs().HasTag(kTensorsGpuTag))
<< "GPU output must also have GPU Input.";
#else
RET_CHECK(!cc->Inputs().HasTag("TENSORS_GPU"))
RET_CHECK(!cc->Inputs().HasTag(kTensorsGpuTag))
<< "GPU processing not enabled.";
#endif // !MEDIAPIPE_DISABLE_GPU
}
const auto& calculator_opts =
cc->Options<mediapipe::TfLiteInferenceCalculatorOptions>();
use_advanced_gpu_api_ = false;
MP_RETURN_IF_ERROR(LoadModel(cc));
if (gpu_inference_) {
@ -352,8 +364,12 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
#endif
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext(
[this, &cc]() -> ::mediapipe::Status { return LoadDelegate(cc); }));
MP_RETURN_IF_ERROR(
gpu_helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status {
return use_advanced_gpu_api_ ? InitTFLiteGPURunner()
: LoadDelegate(cc);
}));
if (use_advanced_gpu_api_) return ::mediapipe::OkStatus();
#else
MP_RETURN_IF_ERROR(LoadDelegate(cc));
#endif
@ -365,13 +381,51 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteInferenceCalculator::InitTFLiteGPURunner() {
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
// Create and bind OpenGL buffers for outputs.
// These buffers are created once and later their ids are just passed to the
// calculator outputs.
gpu_data_out_.resize(tflite_gpu_runner_->outputs_size());
for (int i = 0; i < tflite_gpu_runner_->outputs_size(); ++i) {
gpu_data_out_[i] = absl::make_unique<GPUData>();
ASSIGN_OR_RETURN(gpu_data_out_[i]->elements,
tflite_gpu_runner_->GetOutputElements(i));
// Create and bind the output buffer.
RET_CHECK_CALL(::tflite::gpu::gl::CreateReadWriteShaderStorageBuffer<float>(
gpu_data_out_[i]->elements, &gpu_data_out_[i]->buffer));
}
RET_CHECK_CALL(tflite_gpu_runner_->Build());
#endif
return ::mediapipe::OkStatus();
}
::mediapipe::Status TfLiteInferenceCalculator::Process(CalculatorContext* cc) {
// 1. Receive pre-processed tensor inputs.
if (gpu_input_) {
// Read GPU input into SSBO.
if (use_advanced_gpu_api_) {
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
const auto& input_tensors =
cc->Inputs().Tag("TENSORS_GPU").Get<std::vector<GpuTensor>>();
RET_CHECK(!input_tensors.empty());
MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext(
[this, &input_tensors]() -> ::mediapipe::Status {
for (int i = 0; i < input_tensors.size(); ++i) {
MP_RETURN_IF_ERROR(tflite_gpu_runner_->BindSSBOToInputTensor(
input_tensors[i].id(), i));
}
for (int i = 0; i < gpu_data_out_.size(); ++i) {
MP_RETURN_IF_ERROR(tflite_gpu_runner_->BindSSBOToOutputTensor(
gpu_data_out_[i]->buffer.id(), i));
}
return ::mediapipe::OkStatus();
}));
#endif
} else if (gpu_input_) {
// Read GPU input into SSBO.
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
const auto& input_tensors =
cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
RET_CHECK_GT(input_tensors.size(), 0);
MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext(
[this, &input_tensors]() -> ::mediapipe::Status {
@ -386,7 +440,7 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
}));
#elif defined(MEDIAPIPE_IOS)
const auto& input_tensors =
cc->Inputs().Tag("TENSORS_GPU").Get<std::vector<GpuTensor>>();
cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
RET_CHECK_GT(input_tensors.size(), 0);
// Explicit copy input with conversion float 32 bits to 16 bits.
gpu_data_in_.resize(input_tensors.size());
@ -413,7 +467,7 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
} else {
// Read CPU input into tensors.
const auto& input_tensors =
cc->Inputs().Tag("TENSORS").Get<std::vector<TfLiteTensor>>();
cc->Inputs().Tag(kTensorsTag).Get<std::vector<TfLiteTensor>>();
RET_CHECK_GT(input_tensors.size(), 0);
for (int i = 0; i < input_tensors.size(); ++i) {
const TfLiteTensor* input_tensor = &input_tensors[i];
@ -437,7 +491,11 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
MP_RETURN_IF_ERROR(
gpu_helper_.RunInGlContext([this]() -> ::mediapipe::Status {
RET_CHECK_EQ(interpreter_->Invoke(), kTfLiteOk);
if (use_advanced_gpu_api_) {
RET_CHECK(tflite_gpu_runner_->Invoke().ok());
} else {
RET_CHECK_EQ(interpreter_->Invoke(), kTfLiteOk);
}
return ::mediapipe::OkStatus();
}));
#elif defined(MEDIAPIPE_IOS)
@ -448,7 +506,18 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
}
// 3. Output processed tensors.
if (gpu_output_) {
if (use_advanced_gpu_api_) {
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
auto output_tensors = absl::make_unique<std::vector<GpuTensor>>();
output_tensors->resize(gpu_data_out_.size());
for (int i = 0; i < gpu_data_out_.size(); ++i) {
output_tensors->at(i) = gpu_data_out_[i]->buffer.MakeRef();
}
cc->Outputs()
.Tag("TENSORS_GPU")
.Add(output_tensors.release(), cc->InputTimestamp());
#endif
} else if (gpu_output_) {
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
// Output result tensors (GPU).
auto output_tensors = absl::make_unique<std::vector<GpuTensor>>();
@ -464,7 +533,7 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
return ::mediapipe::OkStatus();
}));
cc->Outputs()
.Tag("TENSORS_GPU")
.Tag(kTensorsGpuTag)
.Add(output_tensors.release(), cc->InputTimestamp());
#elif defined(MEDIAPIPE_IOS)
// Output result tensors (GPU).
@ -488,7 +557,7 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
[convert_command endEncoding];
[command_buffer commit];
cc->Outputs()
.Tag("TENSORS_GPU")
.Tag(kTensorsGpuTag)
.Add(output_tensors.release(), cc->InputTimestamp());
#else
RET_CHECK_FAIL() << "GPU processing not enabled.";
@ -501,8 +570,9 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
TfLiteTensor* tensor = interpreter_->tensor(tensor_indexes[i]);
output_tensors->emplace_back(*tensor);
}
cc->Outputs().Tag("TENSORS").Add(output_tensors.release(),
cc->InputTimestamp());
cc->Outputs()
.Tag(kTensorsTag)
.Add(output_tensors.release(), cc->InputTimestamp());
}
return ::mediapipe::OkStatus();
@ -557,6 +627,20 @@ REGISTER_CALCULATOR(TfLiteInferenceCalculator);
.Tag("CUSTOM_OP_RESOLVER")
.Get<tflite::ops::builtin::BuiltinOpResolver>();
}
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
if (use_advanced_gpu_api_) {
tflite::gpu::InferenceOptions options;
options.priority1 = tflite::gpu::InferencePriority::MIN_LATENCY;
options.priority2 = tflite::gpu::InferencePriority::AUTO;
options.priority3 = tflite::gpu::InferencePriority::AUTO;
options.usage = tflite::gpu::InferenceUsage::SUSTAINED_SPEED;
tflite_gpu_runner_ =
std::make_unique<tflite::gpu::TFLiteGPURunner>(options);
return tflite_gpu_runner_->InitializeWithModel(model);
}
#endif
#if defined(MEDIAPIPE_EDGE_TPU)
interpreter_ =
BuildEdgeTpuInterpreter(model, &op_resolver, edgetpu_context_.get());

View File

@ -42,7 +42,11 @@ message TfLiteInferenceCalculatorOptions {
message TfLite {}
// Delegate to run GPU inference depending on the device.
// (Can use OpenGl, OpenCl, Metal depending on the device.)
message Gpu {}
message Gpu {
// Experimental, Android/Linux only. Use TFLite GPU delegate API2 for
// the NN inference.
optional bool use_advanced_gpu_api = 1 [default = false];
}
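A minimal sketch of turning this flag on from a graph definition; the node wiring and model path are hypothetical, and only the option names come from this proto:

```cpp
// Hypothetical pbtxt fragment embedded in C++; GPU tensors in and out, since
// the advanced API is part of the GPU delegate path.
constexpr char kInferenceNode[] = R"pb(
  node {
    calculator: "TfLiteInferenceCalculator"
    input_stream: "TENSORS_GPU:input_tensors"
    output_stream: "TENSORS_GPU:output_tensors"
    options {
      [mediapipe.TfLiteInferenceCalculatorOptions.ext] {
        model_path: "mediapipe/models/your_model.tflite"  # hypothetical path
        delegate { gpu { use_advanced_gpu_api: true } }
      }
    }
  }
)pb";
```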
// Android only.
message Nnapi {}
message Xnnpack {

View File

@ -47,10 +47,11 @@
#endif // iOS
namespace {
constexpr int kNumInputTensorsWithAnchors = 3;
constexpr int kNumCoordsPerBox = 4;
constexpr char kTensorsTag[] = "TENSORS";
constexpr char kTensorsGpuTag[] = "TENSORS_GPU";
} // namespace
namespace mediapipe {
@ -200,13 +201,13 @@ REGISTER_CALCULATOR(TfLiteTensorsToDetectionsCalculator);
bool use_gpu = false;
if (cc->Inputs().HasTag("TENSORS")) {
cc->Inputs().Tag("TENSORS").Set<std::vector<TfLiteTensor>>();
if (cc->Inputs().HasTag(kTensorsTag)) {
cc->Inputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();
}
#if !defined(MEDIAPIPE_DISABLE_GPU) && !defined(__EMSCRIPTEN__)
if (cc->Inputs().HasTag("TENSORS_GPU")) {
cc->Inputs().Tag("TENSORS_GPU").Set<std::vector<GpuTensor>>();
if (cc->Inputs().HasTag(kTensorsGpuTag)) {
cc->Inputs().Tag(kTensorsGpuTag).Set<std::vector<GpuTensor>>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
@ -236,7 +237,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToDetectionsCalculator);
CalculatorContext* cc) {
cc->SetOffset(TimestampDiff(0));
if (cc->Inputs().HasTag("TENSORS_GPU")) {
if (cc->Inputs().HasTag(kTensorsGpuTag)) {
gpu_input_ = true;
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
MP_RETURN_IF_ERROR(gpu_helper_.Open(cc));
@ -258,8 +259,8 @@ REGISTER_CALCULATOR(TfLiteTensorsToDetectionsCalculator);
::mediapipe::Status TfLiteTensorsToDetectionsCalculator::Process(
CalculatorContext* cc) {
if ((!gpu_input_ && cc->Inputs().Tag("TENSORS").IsEmpty()) ||
(gpu_input_ && cc->Inputs().Tag("TENSORS_GPU").IsEmpty())) {
if ((!gpu_input_ && cc->Inputs().Tag(kTensorsTag).IsEmpty()) ||
(gpu_input_ && cc->Inputs().Tag(kTensorsGpuTag).IsEmpty())) {
return ::mediapipe::OkStatus();
}
@ -284,7 +285,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToDetectionsCalculator);
::mediapipe::Status TfLiteTensorsToDetectionsCalculator::ProcessCPU(
CalculatorContext* cc, std::vector<Detection>* output_detections) {
const auto& input_tensors =
cc->Inputs().Tag("TENSORS").Get<std::vector<TfLiteTensor>>();
cc->Inputs().Tag(kTensorsTag).Get<std::vector<TfLiteTensor>>();
if (input_tensors.size() == 2 ||
input_tensors.size() == kNumInputTensorsWithAnchors) {
@ -402,7 +403,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToDetectionsCalculator);
CalculatorContext* cc, std::vector<Detection>* output_detections) {
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
const auto& input_tensors =
cc->Inputs().Tag("TENSORS_GPU").Get<std::vector<GpuTensor>>();
cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
RET_CHECK_GE(input_tensors.size(), 2);
MP_RETURN_IF_ERROR(gpu_helper_.RunInGlContext([this, &input_tensors, &cc,
@ -466,7 +467,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToDetectionsCalculator);
#elif defined(MEDIAPIPE_IOS)
const auto& input_tensors =
cc->Inputs().Tag("TENSORS_GPU").Get<std::vector<GpuTensor>>();
cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
RET_CHECK_GE(input_tensors.size(), 2);
// Copy inputs.

View File

@ -49,6 +49,16 @@ int NumGroups(const int size, const int group_size) { // NOLINT
float Clamp(float val, float min, float max) {
return std::min(std::max(val, min), max);
}
constexpr char kTensorsTag[] = "TENSORS";
constexpr char kTensorsGpuTag[] = "TENSORS_GPU";
constexpr char kSizeImageTag[] = "REFERENCE_IMAGE";
constexpr char kSizeImageGpuTag[] = "REFERENCE_IMAGE_GPU";
constexpr char kMaskTag[] = "MASK";
constexpr char kMaskGpuTag[] = "MASK_GPU";
constexpr char kPrevMaskTag[] = "PREV_MASK";
constexpr char kPrevMaskGpuTag[] = "PREV_MASK_GPU";
} // namespace
namespace mediapipe {
@ -148,39 +158,39 @@ REGISTER_CALCULATOR(TfLiteTensorsToSegmentationCalculator);
bool use_gpu = false;
// Inputs CPU.
if (cc->Inputs().HasTag("TENSORS")) {
cc->Inputs().Tag("TENSORS").Set<std::vector<TfLiteTensor>>();
if (cc->Inputs().HasTag(kTensorsTag)) {
cc->Inputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();
}
if (cc->Inputs().HasTag("PREV_MASK")) {
cc->Inputs().Tag("PREV_MASK").Set<ImageFrame>();
if (cc->Inputs().HasTag(kPrevMaskTag)) {
cc->Inputs().Tag(kPrevMaskTag).Set<ImageFrame>();
}
if (cc->Inputs().HasTag("REFERENCE_IMAGE")) {
cc->Inputs().Tag("REFERENCE_IMAGE").Set<ImageFrame>();
if (cc->Inputs().HasTag(kSizeImageTag)) {
cc->Inputs().Tag(kSizeImageTag).Set<ImageFrame>();
}
// Inputs GPU.
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
if (cc->Inputs().HasTag("TENSORS_GPU")) {
cc->Inputs().Tag("TENSORS_GPU").Set<std::vector<GlBuffer>>();
if (cc->Inputs().HasTag(kTensorsGpuTag)) {
cc->Inputs().Tag(kTensorsGpuTag).Set<std::vector<GlBuffer>>();
use_gpu |= true;
}
if (cc->Inputs().HasTag("PREV_MASK_GPU")) {
cc->Inputs().Tag("PREV_MASK_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Inputs().HasTag(kPrevMaskGpuTag)) {
cc->Inputs().Tag(kPrevMaskGpuTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
if (cc->Inputs().HasTag("REFERENCE_IMAGE_GPU")) {
cc->Inputs().Tag("REFERENCE_IMAGE_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Inputs().HasTag(kSizeImageGpuTag)) {
cc->Inputs().Tag(kSizeImageGpuTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
// Outputs.
if (cc->Outputs().HasTag("MASK")) {
cc->Outputs().Tag("MASK").Set<ImageFrame>();
if (cc->Outputs().HasTag(kMaskTag)) {
cc->Outputs().Tag(kMaskTag).Set<ImageFrame>();
}
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
if (cc->Outputs().HasTag("MASK_GPU")) {
cc->Outputs().Tag("MASK_GPU").Set<mediapipe::GpuBuffer>();
if (cc->Outputs().HasTag(kMaskGpuTag)) {
cc->Outputs().Tag(kMaskGpuTag).Set<mediapipe::GpuBuffer>();
use_gpu |= true;
}
#endif // !MEDIAPIPE_DISABLE_GPU
@ -197,7 +207,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToSegmentationCalculator);
CalculatorContext* cc) {
cc->SetOffset(TimestampDiff(0));
if (cc->Inputs().HasTag("TENSORS_GPU")) {
if (cc->Inputs().HasTag(kTensorsGpuTag)) {
use_gpu_ = true;
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
MP_RETURN_IF_ERROR(gpu_helper_.Open(cc));
@ -255,23 +265,22 @@ REGISTER_CALCULATOR(TfLiteTensorsToSegmentationCalculator);
::mediapipe::Status TfLiteTensorsToSegmentationCalculator::ProcessCpu(
CalculatorContext* cc) {
if (cc->Inputs().Tag("TENSORS").IsEmpty()) {
if (cc->Inputs().Tag(kTensorsTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
// Get input streams.
const auto& input_tensors =
cc->Inputs().Tag("TENSORS").Get<std::vector<TfLiteTensor>>();
const bool has_prev_mask = cc->Inputs().HasTag("PREV_MASK") &&
!cc->Inputs().Tag("PREV_MASK").IsEmpty();
cc->Inputs().Tag(kTensorsTag).Get<std::vector<TfLiteTensor>>();
const bool has_prev_mask = cc->Inputs().HasTag(kPrevMaskTag) &&
!cc->Inputs().Tag(kPrevMaskTag).IsEmpty();
const ImageFrame placeholder;
const auto& input_mask = has_prev_mask
? cc->Inputs().Tag("PREV_MASK").Get<ImageFrame>()
: placeholder;
const auto& input_mask =
has_prev_mask ? cc->Inputs().Tag(kPrevMaskTag).Get<ImageFrame>()
: placeholder;
int output_width = tensor_width_, output_height = tensor_height_;
if (cc->Inputs().HasTag("REFERENCE_IMAGE")) {
const auto& input_image =
cc->Inputs().Tag("REFERENCE_IMAGE").Get<ImageFrame>();
if (cc->Inputs().HasTag(kSizeImageTag)) {
const auto& input_image = cc->Inputs().Tag(kSizeImageTag).Get<ImageFrame>();
output_width = input_image.Width();
output_height = input_image.Height();
}
@ -353,7 +362,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToSegmentationCalculator);
ImageFormat::SRGBA, output_width, output_height);
cv::Mat output_mat = formats::MatView(output_mask.get());
large_mask_mat.copyTo(output_mat);
cc->Outputs().Tag("MASK").Add(output_mask.release(), cc->InputTimestamp());
cc->Outputs().Tag(kMaskTag).Add(output_mask.release(), cc->InputTimestamp());
return ::mediapipe::OkStatus();
}
@ -364,23 +373,23 @@ REGISTER_CALCULATOR(TfLiteTensorsToSegmentationCalculator);
// 3. upsample small mask into output mask to be same size as input image
::mediapipe::Status TfLiteTensorsToSegmentationCalculator::ProcessGpu(
CalculatorContext* cc) {
if (cc->Inputs().Tag("TENSORS_GPU").IsEmpty()) {
if (cc->Inputs().Tag(kTensorsGpuTag).IsEmpty()) {
return ::mediapipe::OkStatus();
}
#if !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
// Get input streams.
const auto& input_tensors =
cc->Inputs().Tag("TENSORS_GPU").Get<std::vector<GlBuffer>>();
const bool has_prev_mask = cc->Inputs().HasTag("PREV_MASK_GPU") &&
!cc->Inputs().Tag("PREV_MASK_GPU").IsEmpty();
cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GlBuffer>>();
const bool has_prev_mask = cc->Inputs().HasTag(kPrevMaskGpuTag) &&
!cc->Inputs().Tag(kPrevMaskGpuTag).IsEmpty();
const auto& input_mask =
has_prev_mask
? cc->Inputs().Tag("PREV_MASK_GPU").Get<mediapipe::GpuBuffer>()
? cc->Inputs().Tag(kPrevMaskGpuTag).Get<mediapipe::GpuBuffer>()
: mediapipe::GpuBuffer();
int output_width = tensor_width_, output_height = tensor_height_;
if (cc->Inputs().HasTag("REFERENCE_IMAGE_GPU")) {
if (cc->Inputs().HasTag(kSizeImageGpuTag)) {
const auto& input_image =
cc->Inputs().Tag("REFERENCE_IMAGE_GPU").Get<mediapipe::GpuBuffer>();
cc->Inputs().Tag(kSizeImageGpuTag).Get<mediapipe::GpuBuffer>();
output_width = input_image.width();
output_height = input_image.height();
}
@ -441,7 +450,7 @@ REGISTER_CALCULATOR(TfLiteTensorsToSegmentationCalculator);
// Send out image as GPU packet.
auto output_image = output_texture.GetFrame<mediapipe::GpuBuffer>();
cc->Outputs()
.Tag("MASK_GPU")
.Tag(kMaskGpuTag)
.Add(output_image.release(), cc->InputTimestamp());
// Cleanup

View File

@ -121,6 +121,14 @@ and model details are described in the
* [Android](./hair_segmentation_mobile_gpu.md)
### Template Matching using KNIFT with CPU
[Template Matching using KNIFT on Mobile](./template_matching_mobile_cpu.md)
shows how to use MediaPipe with a TFLite model for template matching using
KNIFT on mobile CPU.
* [Android](./template_matching_mobile_cpu.md)
## Desktop
### Hello World for C++
@ -171,7 +179,6 @@ on desktop with webcam input.
* [Desktop GPU](./face_mesh_desktop.md)
* [Desktop CPU](./face_mesh_desktop.md)
### Hand Tracking on Desktop with Webcam
[Hand Tracking on Desktop with Webcam](./hand_tracking_desktop.md) shows how to
@ -198,7 +205,7 @@ GPU with live video from a webcam.
* [Desktop GPU](./hair_segmentation_desktop.md)
## Google Coral (machine learning acceleration with Google EdgeTPU)
## Google Coral (ML acceleration with Google EdgeTPU)
Below are code samples on how to run MediaPipe on Google Coral Dev Board.

Binary file not shown (added, 2.6 MiB).

Binary file not shown (added, 1.1 MiB).

Binary file not shown (added, 37 KiB).

Binary file not shown (added, 51 KiB).

View File

@ -0,0 +1,31 @@
# Template Matching using KNIFT on Desktop
This doc focuses on the
[example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/template_matching/template_matching_desktop.pbtxt)
that performs template matching with KNIFT (Keypoint Neural Invariant Feature
Transform) on desktop CPU.
If you are interested in more detail about KNIFT or running the example on
mobile, please see
[Template Matching using KNIFT on Mobile (CPU)](template_matching_mobile_cpu.md).
To build the desktop app, run:
```bash
$ bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \
mediapipe/examples/desktop/template_matching:template_matching_tflite
```
To run the desktop app, please specify a template index file
([example](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_index.pb)) and a
video to be matched. For how to build your own index file, please see
[here](template_matching_mobile_cpu.md#build-index-file).
```bash
$ GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/template_matching/template_matching_tflite \
--calculator_graph_config_file=mediapipe/graphs/template_matching/template_matching_desktop.pbtxt --input_side_packets="input_video_path=<input video path>,output_video_path=<output video path>"
```
## Graph
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/template_matching/template_matching_desktop.pbtxt)

View File

@ -0,0 +1,94 @@
# Template Matching using KNIFT on Mobile (CPU)
This doc focuses on the
[example graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/template_matching/template_matching_mobile_cpu.pbtxt)
that performs template matching with KNIFT (Keypoint Neural Invariant Feature
Transform) on mobile CPU.
![template_matching_mobile_cpu.gif](images/mobile/template_matching_android_cpu.gif)
In the visualization above, the green dots represent detected keypoints on each
frame and the red box represents the targets matched by templates using KNIFT
features (see also [model card](https://mediapipe.page.link/knift-mc)). For more
information, please see
[Google Developers Blog](https://mediapipe.page.link/knift-blog).
## Build Index Files
In MediaPipe, we've already provided an index file,
[knift_index.pb](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_index.pb),
pre-computed from the 3 template images (of USD bills) shown below. If you'd
like to use your own template images, please follow the steps below; otherwise,
you can jump directly to [Android](#android).
![template_matching_mobile_template.jpg](images/mobile/template_matching_mobile_template.jpg)
### Step 1:
Put all template images in a single directory.
### Step 2:
To build the index file for all templates in the directory, run:
```bash
$ bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \
mediapipe/examples/desktop/template_matching:template_matching_tflite
$ bazel-bin/mediapipe/examples/desktop/template_matching/template_matching_tflite \
--calculator_graph_config_file=mediapipe/graphs/template_matching/index_building.pbtxt \
--input_side_packets="file_directory=<template image directory>,file_suffix='png',output_index_filename=<output index filename>"
```
The output index file includes the extracted KNIFT features.
### Step 3:
Replace
[mediapipe/models/knift_index.pb](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_index.pb)
with the index file you generated, and update
[mediapipe/models/knift_labelmap.txt](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_labelmap.txt)
with your own template names.
## Android
[Source](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu)
A prebuilt arm64 APK can be
[downloaded here](https://drive.google.com/open?id=1tSWRfes9rAM4NrzmJBplguNQQvaeBZSa).
To build and install the app yourself, run:
Note: MediaPipe uses OpenCV 3 by default. However, because of
[issues](https://github.com/opencv/opencv/issues/11488) between NDK 17+ and
OpenCV 3 when using
[knnMatch](https://docs.opencv.org/3.4/db/d39/classcv_1_1DescriptorMatcher.html#a378f35c9b1a5dfa4022839a45cdf0e89),
please use the following commands to temporarily switch to OpenCV 4 for the
template matching example on Android, and switch back to OpenCV 3 afterwards.
```bash
# Switch to OpenCV 4
sed -i -e 's:3.4.3/opencv-3.4.3:4.0.1/opencv-4.0.1:g' WORKSPACE
sed -i -e 's:libopencv_java3:libopencv_java4:g' third_party/opencv_android.BUILD
# Build and install app
bazel build -c opt --config=android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu:templatematchingcpu
adb install -r bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu/templatematchingcpu.apk
# Switch back to OpenCV 3
sed -i -e 's:4.0.1/opencv-4.0.1:3.4.3/opencv-3.4.3:g' WORKSPACE
sed -i -e 's:libopencv_java4:libopencv_java3:g' third_party/opencv_android.BUILD
```
## Use XNNPACK Delegate
The example uses the XNNPACK delegate by default. Users can change the
[option in TfLiteInferenceCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/calculators/tflite/tflite_inference_calculator.proto)
to use the default TF Lite inference instead.
## Graph
### Main Graph
![template_matching_mobile_graph](images/mobile/template_matching_mobile_graph.png)
[Source pbtxt file](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/template_matching/template_matching_mobile_cpu.pbtxt)

View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.google.mediapipe.apps.templatematchingcpu">
<uses-sdk
android:minSdkVersion="21"
android:targetSdkVersion="27" />
<!-- For using the camera -->
<uses-permission android:name="android.permission.CAMERA" />
<uses-feature android:name="android.hardware.camera" />
<uses-feature android:name="android.hardware.camera.autofocus" />
<!-- For MediaPipe -->
<uses-feature android:glEsVersion="0x00020000" android:required="true" />
<application
android:allowBackup="true"
android:label="@string/app_name"
android:supportsRtl="true"
android:theme="@style/AppTheme">
<activity
android:name=".MainActivity"
android:exported="true"
android:screenOrientation="portrait">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View File

@ -0,0 +1,82 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:private"])
cc_binary(
name = "libmediapipe_jni.so",
linkshared = 1,
linkstatic = 1,
deps = [
"//mediapipe/graphs/template_matching:mobile_calculators",
"//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni",
],
)
cc_library(
name = "mediapipe_jni_lib",
srcs = [":libmediapipe_jni.so"],
alwayslink = 1,
)
# Maps the binary graph to an alias (e.g., the app name) for convenience so that the alias can be
# easily incorporated into the app via, for example,
# MainActivity.BINARY_GRAPH_NAME = "appname.binarypb".
genrule(
name = "binary_graph",
srcs = ["//mediapipe/graphs/template_matching:mobile_cpu_binary_graph"],
outs = ["templatematching.binarypb"],
cmd = "cp $< $@",
)
android_library(
name = "mediapipe_lib",
srcs = glob(["*.java"]),
assets = [
":binary_graph",
"//mediapipe/models:knift_index.pb",
"//mediapipe/models:knift_float.tflite",
"//mediapipe/models:knift_labelmap.txt",
],
assets_dir = "",
manifest = "AndroidManifest.xml",
resource_files = glob(["res/**"]),
deps = [
":mediapipe_jni_lib",
"//mediapipe/java/com/google/mediapipe/components:android_camerax_helper",
"//mediapipe/java/com/google/mediapipe/components:android_components",
"//mediapipe/java/com/google/mediapipe/framework:android_framework",
"//mediapipe/java/com/google/mediapipe/glutil",
"//third_party:androidx_appcompat",
"//third_party:androidx_constraint_layout",
"//third_party:androidx_legacy_support_v4",
"//third_party:androidx_recyclerview",
"//third_party:opencv",
"@maven//:androidx_concurrent_concurrent_futures",
"@maven//:androidx_lifecycle_lifecycle_common",
"@maven//:com_google_guava_guava",
],
)
android_binary(
name = "templatematchingcpu",
manifest = "AndroidManifest.xml",
manifest_values = {"applicationId": "com.google.mediapipe.apps.templatematchingcpu"},
multidex = "native",
deps = [
":mediapipe_lib",
],
)

View File

@ -0,0 +1,170 @@
// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.mediapipe.apps.templatematchingcpu;
import android.graphics.SurfaceTexture;
import android.os.Bundle;
import androidx.appcompat.app.AppCompatActivity;
import android.util.Size;
import android.view.SurfaceHolder;
import android.view.SurfaceView;
import android.view.View;
import android.view.ViewGroup;
import com.google.mediapipe.components.CameraHelper;
import com.google.mediapipe.components.CameraXPreviewHelper;
import com.google.mediapipe.components.ExternalTextureConverter;
import com.google.mediapipe.components.FrameProcessor;
import com.google.mediapipe.components.PermissionHelper;
import com.google.mediapipe.framework.AndroidAssetUtil;
import com.google.mediapipe.glutil.EglManager;
/** Main activity of MediaPipe example apps. */
public class MainActivity extends AppCompatActivity {
private static final String TAG = "MainActivity";
private static final String BINARY_GRAPH_NAME = "templatematching.binarypb";
private static final String INPUT_VIDEO_STREAM_NAME = "input_video";
private static final String OUTPUT_VIDEO_STREAM_NAME = "output_video";
private static final CameraHelper.CameraFacing CAMERA_FACING = CameraHelper.CameraFacing.BACK;
// Flips the camera-preview frames vertically before sending them into FrameProcessor to be
// processed in a MediaPipe graph, and flips the processed frames back when they are displayed.
// This is needed because OpenGL represents images assuming the image origin is at the bottom-left
// corner, whereas MediaPipe in general assumes the image origin is at top-left.
private static final boolean FLIP_FRAMES_VERTICALLY = true;
static {
// Load all native libraries needed by the app.
System.loadLibrary("mediapipe_jni");
System.loadLibrary("opencv_java4");
}
// {@link SurfaceTexture} where the camera-preview frames can be accessed.
private SurfaceTexture previewFrameTexture;
// {@link SurfaceView} that displays the camera-preview frames processed by a MediaPipe graph.
private SurfaceView previewDisplayView;
// Creates and manages an {@link EGLContext}.
private EglManager eglManager;
// Sends camera-preview frames into a MediaPipe graph for processing, and displays the processed
// frames onto a {@link Surface}.
private FrameProcessor processor;
// Converts the GL_TEXTURE_EXTERNAL_OES texture from Android camera into a regular texture to be
// consumed by {@link FrameProcessor} and the underlying MediaPipe graph.
private ExternalTextureConverter converter;
// Handles camera access via the {@link CameraX} Jetpack support library.
private CameraXPreviewHelper cameraHelper;
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
previewDisplayView = new SurfaceView(this);
setupPreviewDisplayView();
// Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g.,
// binary graphs.
AndroidAssetUtil.initializeNativeAssetManager(this);
eglManager = new EglManager(null);
processor =
new FrameProcessor(
this,
eglManager.getNativeContext(),
BINARY_GRAPH_NAME,
INPUT_VIDEO_STREAM_NAME,
OUTPUT_VIDEO_STREAM_NAME);
processor.getVideoSurfaceOutput().setFlipY(FLIP_FRAMES_VERTICALLY);
PermissionHelper.checkAndRequestCameraPermissions(this);
}
@Override
protected void onResume() {
super.onResume();
converter = new ExternalTextureConverter(eglManager.getContext());
converter.setFlipY(FLIP_FRAMES_VERTICALLY);
converter.setConsumer(processor);
if (PermissionHelper.cameraPermissionsGranted(this)) {
startCamera();
}
}
@Override
protected void onPause() {
super.onPause();
converter.close();
}
@Override
public void onRequestPermissionsResult(
int requestCode, String[] permissions, int[] grantResults) {
super.onRequestPermissionsResult(requestCode, permissions, grantResults);
PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults);
}
private void setupPreviewDisplayView() {
previewDisplayView.setVisibility(View.GONE);
ViewGroup viewGroup = findViewById(R.id.preview_display_layout);
viewGroup.addView(previewDisplayView);
previewDisplayView
.getHolder()
.addCallback(
new SurfaceHolder.Callback() {
@Override
public void surfaceCreated(SurfaceHolder holder) {
processor.getVideoSurfaceOutput().setSurface(holder.getSurface());
}
@Override
public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
// (Re-)Compute the ideal size of the camera-preview display (the area that the
// camera-preview frames get rendered onto, potentially with scaling and rotation)
// based on the size of the SurfaceView that contains the display.
Size viewSize = new Size(width, height);
Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize);
boolean isCameraRotated = cameraHelper.isCameraRotated();
// Connect the converter to the camera-preview frames as its input (via
// previewFrameTexture), and configure the output width and height as the computed
// display size.
converter.setSurfaceTextureAndAttachToGLContext(
previewFrameTexture,
isCameraRotated ? displaySize.getHeight() : displaySize.getWidth(),
isCameraRotated ? displaySize.getWidth() : displaySize.getHeight());
}
@Override
public void surfaceDestroyed(SurfaceHolder holder) {
processor.getVideoSurfaceOutput().setSurface(null);
}
});
}
private void startCamera() {
cameraHelper = new CameraXPreviewHelper();
cameraHelper.setOnCameraStartedListener(
surfaceTexture -> {
previewFrameTexture = surfaceTexture;
// Make the display view visible to start showing the preview. This triggers the
// SurfaceHolder.Callback added to (the holder of) previewDisplayView.
previewDisplayView.setVisibility(View.VISIBLE);
});
cameraHelper.startCamera(this, CAMERA_FACING, /*surfaceTexture=*/ null);
}
}

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="utf-8"?>
<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:app="http://schemas.android.com/apk/res-auto"
xmlns:tools="http://schemas.android.com/tools"
android:layout_width="match_parent"
android:layout_height="match_parent">
<FrameLayout
android:id="@+id/preview_display_layout"
android:layout_width="fill_parent"
android:layout_height="fill_parent"
android:layout_weight="1">
<TextView
android:id="@+id/no_camera_access_view"
android:layout_height="fill_parent"
android:layout_width="fill_parent"
android:gravity="center"
android:text="@string/no_camera_access" />
</FrameLayout>
</androidx.constraintlayout.widget.ConstraintLayout>

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="colorPrimary">#008577</color>
<color name="colorPrimaryDark">#00574B</color>
<color name="colorAccent">#D81B60</color>
</resources>

View File

@ -0,0 +1,4 @@
<resources>
<string name="app_name" translatable="false">Template Matching CPU</string>
<string name="no_camera_access" translatable="false">Please grant camera permissions.</string>
</resources>

View File

@ -0,0 +1,11 @@
<resources>
<!-- Base application theme. -->
<style name="AppTheme" parent="Theme.AppCompat.Light.DarkActionBar">
<!-- Customize your theme here. -->
<item name="colorPrimary">@color/colorPrimary</item>
<item name="colorPrimaryDark">@color/colorPrimaryDark</item>
<item name="colorAccent">@color/colorAccent</item>
</style>
</resources>

View File

@ -0,0 +1,25 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//mediapipe/examples:__subpackages__"])
cc_binary(
name = "template_matching_tflite",
deps = [
"//mediapipe/examples/desktop:simple_run_graph_main",
"//mediapipe/graphs/template_matching:desktop_calculators",
],
)

View File

@ -320,7 +320,7 @@ CalculatorGraph::~CalculatorGraph() {
}
if (!::mediapipe::ContainsKey(executors_, "")) {
MP_RETURN_IF_ERROR(InitializeDefaultExecutor(*default_executor_options,
MP_RETURN_IF_ERROR(InitializeDefaultExecutor(default_executor_options,
use_application_thread));
}
@ -328,7 +328,7 @@ CalculatorGraph::~CalculatorGraph() {
}
::mediapipe::Status CalculatorGraph::InitializeDefaultExecutor(
const ThreadPoolExecutorOptions& default_executor_options,
const ThreadPoolExecutorOptions* default_executor_options,
bool use_application_thread) {
// If specified, run synchronously on the calling thread.
if (use_application_thread) {
@ -341,7 +341,9 @@ CalculatorGraph::~CalculatorGraph() {
}
// Check the number of threads specified in the proto.
int num_threads = default_executor_options.num_threads();
int num_threads = default_executor_options == nullptr
? 0
: default_executor_options->num_threads();
// If the default (0 or -1) was specified, pick a suitable number of threads
// depending on the number of processors in this system and the number of
@ -1215,12 +1217,14 @@ Packet CalculatorGraph::GetServicePacket(const GraphServiceBase& service) {
}
::mediapipe::Status CalculatorGraph::CreateDefaultThreadPool(
const ThreadPoolExecutorOptions& default_executor_options,
const ThreadPoolExecutorOptions* default_executor_options,
int num_threads) {
MediaPipeOptions extendable_options;
ThreadPoolExecutorOptions* options =
extendable_options.MutableExtension(ThreadPoolExecutorOptions::ext);
*options = default_executor_options;
if (default_executor_options != nullptr) {
options->CopyFrom(*default_executor_options);
}
options->set_num_threads(num_threads);
// clang-format off
ASSIGN_OR_RETURN(Executor* executor,

View File

@ -461,13 +461,13 @@ class CalculatorGraph {
//
// Only called by InitializeExecutors().
::mediapipe::Status InitializeDefaultExecutor(
const ThreadPoolExecutorOptions& default_executor_options,
const ThreadPoolExecutorOptions* default_executor_options,
bool use_application_thread);
// Creates a thread pool as the default executor. The num_threads argument
// overrides the num_threads field in default_executor_options.
::mediapipe::Status CreateDefaultThreadPool(
const ThreadPoolExecutorOptions& default_executor_options,
const ThreadPoolExecutorOptions* default_executor_options,
int num_threads);
// Returns true if |name| is a reserved executor name.

View File

@ -274,6 +274,11 @@ bool GlContext::HasGlExtension(absl::string_view extension) const {
}
return Run([this]() -> ::mediapipe::Status {
// Clear any GL errors at this point: as this is a fresh context
// there shouldn't be any, but if we adopted an existing context (e.g. in
// some Emscripten cases), there might be some existing tripped error.
ForceClearExistingGlErrors();
absl::string_view version_string(
reinterpret_cast<const char*>(glGetString(GL_VERSION)));
@ -769,10 +774,18 @@ bool GlContext::SyncTokenIsReady(const std::shared_ptr<GlSyncPoint>& token) {
return token->IsReady();
}
bool GlContext::CheckForGlErrors() {
void GlContext::ForceClearExistingGlErrors() {
LogUncheckedGlErrors(CheckForGlErrors(/*force=*/true));
}
bool GlContext::CheckForGlErrors() { return CheckForGlErrors(false); }
bool GlContext::CheckForGlErrors(bool force) {
#if UNSAFE_EMSCRIPTEN_SKIP_GL_ERROR_HANDLING
LOG_FIRST_N(WARNING, 1) << "MediaPipe OpenGL error checking is disabled";
return false;
if (!force) {
LOG_FIRST_N(WARNING, 1) << "MediaPipe OpenGL error checking is disabled";
return false;
}
#endif
if (!HasContext()) return false;

View File

@ -348,7 +348,20 @@ class GlContext : public std::enable_shared_from_this<GlContext> {
void DestroyContext();
bool HasContext() const;
// This function clears out any tripped GL errors and just logs them. This
// is used by code that needs to check glGetError() to know if it succeeded,
// but can't rely on the existing state to be 'clean'.
void ForceClearExistingGlErrors();
// Returns true if there were any GL errors. Note that this may be a no-op
// for performance reasons in some contexts (specifically Emscripten opt).
bool CheckForGlErrors();
// Same as `CheckForGlErrors()` but with the option of forcing the check
// even if we would otherwise skip for performance reasons.
bool CheckForGlErrors(bool force);
void LogUncheckedGlErrors(bool had_gl_errors);
::mediapipe::Status GetGlExtensions();
::mediapipe::Status GetGlExtensionsCompat();

View File

@ -36,9 +36,9 @@ namespace mediapipe {
// - GlRender(), which is called for each frame.
// - A destructor, to destroy the objects created in GlSetup.
// Note that when GlSetup and GlRender are called, the GL context has already
// been set, but in the destructor it has not. The destructor should have a
// local variable set to ContextAutoSetter() to make sure it is doing the
// destruction in the right GL context.
// been set, but in the destructor it has not. The destructor should use the
// RunInGlContext() helper to make sure it is doing the destruction in the right
// GL context.
//
// Additionally, you can define a GlBind() method, which will be called to
// enable shader programs, bind any additional textures you may need, etc.

View File

@ -87,11 +87,10 @@ bool GlTextureBuffer::CreateInternal(const void* data) {
}
void GlTextureBuffer::Reuse() {
WaitForConsumersOnGpu();
// TODO: should we just do this inside WaitForConsumersOnGpu?
// if we do that, WaitForConsumersOnGpu can be called only once.
absl::MutexLock lock(&consumer_sync_mutex_);
consumer_multi_sync_->WaitOnGpu();
// Reset the sync points.
consumer_multi_sync_ = absl::make_unique<GlMultiSyncPoint>();
// Reset the token.
producer_sync_ = nullptr;
}
@ -102,11 +101,15 @@ void GlTextureBuffer::Updated(std::shared_ptr<GlSyncPoint> prod_token) {
}
void GlTextureBuffer::DidRead(std::shared_ptr<GlSyncPoint> cons_token) {
absl::MutexLock lock(&consumer_sync_mutex_);
consumer_multi_sync_->Add(std::move(cons_token));
}
GlTextureBuffer::~GlTextureBuffer() {
if (deletion_callback_) {
// Note: at this point there are no more consumers that could be added
// to the consumer_multi_sync_, so it no longer needs to be protected
// by our mutex when we hand it to the deletion callback.
deletion_callback_(std::move(consumer_multi_sync_));
}
}
@ -129,10 +132,17 @@ void GlTextureBuffer::WaitOnGpu() {
}
}
void GlTextureBuffer::WaitForConsumers() { consumer_multi_sync_->Wait(); }
void GlTextureBuffer::WaitForConsumers() {
absl::MutexLock lock(&consumer_sync_mutex_);
consumer_multi_sync_->Wait();
}
void GlTextureBuffer::WaitForConsumersOnGpu() {
absl::MutexLock lock(&consumer_sync_mutex_);
consumer_multi_sync_->WaitOnGpu();
// TODO: should we clear the consumer_multi_sync_ here?
// It would mean that WaitForConsumersOnGpu can be called only once, or more
// precisely, on only one GL context.
}
} // namespace mediapipe

View File

@ -121,15 +121,16 @@ class GlTextureBuffer {
friend class GlCalculatorHelperImpl;
GLuint name_ = 0;
int width_ = 0;
int height_ = 0;
GpuBufferFormat format_ = GpuBufferFormat::kUnknown;
GLenum target_ = GL_TEXTURE_2D;
const int width_ = 0;
const int height_ = 0;
const GpuBufferFormat format_ = GpuBufferFormat::kUnknown;
const GLenum target_ = GL_TEXTURE_2D;
// Token tracking changes to this texture. Used by WaitUntilComplete.
std::shared_ptr<GlSyncPoint> producer_sync_;
absl::Mutex consumer_sync_mutex_;
// Tokens tracking the point when consumers finished using this texture.
std::unique_ptr<GlMultiSyncPoint> consumer_multi_sync_ =
absl::make_unique<GlMultiSyncPoint>();
std::unique_ptr<GlMultiSyncPoint> consumer_multi_sync_ ABSL_GUARDED_BY(
consumer_sync_mutex_) = absl::make_unique<GlMultiSyncPoint>();
DeletionCallback deletion_callback_;
};

View File

@ -0,0 +1,67 @@
# Copyright 2019 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
load(
"//mediapipe/framework/tool:mediapipe_graph.bzl",
"mediapipe_binary_graph",
)
licenses(["notice"]) # Apache 2.0
package(default_visibility = ["//visibility:public"])
cc_library(
name = "template_matching_deps",
deps = [
"//mediapipe/calculators/image:feature_detector_calculator",
"//mediapipe/calculators/image:image_properties_calculator",
"//mediapipe/calculators/image:image_transformation_calculator",
"//mediapipe/calculators/tflite:tflite_converter_calculator",
"//mediapipe/calculators/tflite:tflite_inference_calculator",
"//mediapipe/calculators/tflite:tflite_tensors_to_floats_calculator",
"//mediapipe/calculators/util:annotation_overlay_calculator",
"//mediapipe/calculators/util:landmarks_to_render_data_calculator",
"//mediapipe/calculators/util:timed_box_list_id_to_label_calculator",
"//mediapipe/calculators/util:timed_box_list_to_render_data_calculator",
"//mediapipe/calculators/video:box_detector_calculator",
],
)
cc_library(
name = "desktop_calculators",
deps = [
":template_matching_deps",
"//mediapipe/calculators/image:opencv_encoded_image_to_image_frame_calculator",
"//mediapipe/calculators/util:local_file_pattern_contents_calculator",
"//mediapipe/calculators/video:opencv_video_decoder_calculator",
"//mediapipe/calculators/video:opencv_video_encoder_calculator",
],
)
cc_library(
name = "mobile_calculators",
deps = [
":template_matching_deps",
"//mediapipe/calculators/core:flow_limiter_calculator",
"//mediapipe/calculators/image:image_transformation_calculator",
"//mediapipe/gpu:gpu_buffer_to_image_frame_calculator",
],
)
mediapipe_binary_graph(
name = "mobile_cpu_binary_graph",
graph = "template_matching_mobile_cpu.pbtxt",
output_name = "mobile_cpu.binarypb",
deps = [":mobile_calculators"],
)

View File

@ -0,0 +1,79 @@
# MediaPipe graph that builds a feature descriptor index for a specific target.
# max_queue_size limits the number of packets enqueued on any input stream
# by throttling inputs to the graph. This makes the graph process only one
# frame at a time.
max_queue_size: 1
# Reads the encoded template images from the given file directory.
node {
calculator: "LocalFilePatternContentsCalculator"
input_side_packet: "FILE_DIRECTORY:file_directory"
input_side_packet: "FILE_SUFFIX:file_suffix"
output_stream: "CONTENTS:encoded_image"
}
node {
calculator: "OpenCvEncodedImageToImageFrameCalculator"
input_stream: "encoded_image"
output_stream: "image_frame"
}
node {
calculator: "ImagePropertiesCalculator"
input_stream: "IMAGE:image_frame"
output_stream: "SIZE:input_video_size"
}
node {
calculator: "FeatureDetectorCalculator"
input_stream: "IMAGE:image_frame"
output_stream: "FEATURES:features"
output_stream: "LANDMARKS:landmarks"
output_stream: "PATCHES:patches"
node_options: {
[type.googleapis.com/mediapipe.FeatureDetectorCalculatorOptions] {
max_features: 400
}
}
}
# input tensors: 200*32*32*1 float
# output tensors: 200*40 float; only the first keypoint.size()*40 values are
# KNIFT features, the rest is zero padding.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS:patches"
output_stream: "TENSORS:knift_feature_tensors"
input_stream_handler {
input_stream_handler: "DefaultInputStreamHandler"
}
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "mediapipe/models/knift_float_400.tflite"
}
}
}
node {
calculator: "TfLiteTensorsToFloatsCalculator"
input_stream: "TENSORS:knift_feature_tensors"
output_stream: "FLOATS:knift_feature_floats"
}
node {
calculator: "BoxDetectorCalculator"
input_side_packet: "OUTPUT_INDEX_FILENAME:output_index_filename"
input_stream: "FEATURES:features"
input_stream: "IMAGE_SIZE:input_video_size"
input_stream: "DESCRIPTORS:knift_feature_floats"
node_options: {
[type.googleapis.com/mediapipe.BoxDetectorCalculatorOptions] {
detector_options {
index_type: OPENCV_BF
detect_every_n_frame: 1
}
}
}
}

View File

@ -0,0 +1,128 @@
# MediaPipe graph that performs template matching on desktop with TensorFlow Lite
# on CPU.
# Used in the example in
# mediapipe/examples/desktop/template_matching:template_matching_tflite
# max_queue_size limits the number of packets enqueued on any input stream
# by throttling inputs to the graph. This makes the graph process only one
# frame at a time.
max_queue_size: 1
# Decodes an input video file into images and a video header.
node {
calculator: "OpenCvVideoDecoderCalculator"
input_side_packet: "INPUT_FILE_PATH:input_video_path"
output_stream: "VIDEO:input_video"
output_stream: "VIDEO_PRESTREAM:input_video_header"
}
node {
calculator: "ImagePropertiesCalculator"
input_stream: "IMAGE:input_video"
output_stream: "SIZE:input_video_size"
}
node {
calculator: "FeatureDetectorCalculator"
input_stream: "IMAGE:input_video"
output_stream: "FEATURES:features"
output_stream: "LANDMARKS:landmarks"
output_stream: "PATCHES:patches"
}
# input tensors: 200*32*32*1 float
# output tensors: 200*40 float; only the first keypoint.size()*40 values are
# KNIFT features, the rest is zero padding.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS:patches"
output_stream: "TENSORS:knift_feature_tensors"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "mediapipe/models/knift_float.tflite"
}
}
}
node {
calculator: "TfLiteTensorsToFloatsCalculator"
input_stream: "TENSORS:knift_feature_tensors"
output_stream: "FLOATS:knift_feature_floats"
}
node {
calculator: "BoxDetectorCalculator"
input_stream: "FEATURES:features"
input_stream: "IMAGE_SIZE:input_video_size"
input_stream: "DESCRIPTORS:knift_feature_floats"
output_stream: "BOXES:detections"
node_options: {
[type.googleapis.com/mediapipe.BoxDetectorCalculatorOptions] {
detector_options {
index_type: OPENCV_BF
detect_every_n_frame: 1
}
index_proto_filename: "mediapipe/models/knift_index.pb"
}
}
}
node {
calculator: "TimedBoxListIdToLabelCalculator"
input_stream: "detections"
output_stream: "labeled_detections"
node_options: {
[type.googleapis.com/mediapipe.TimedBoxListIdToLabelCalculatorOptions] {
label_map_path: "mediapipe/models/knift_labelmap.txt"
}
}
}
node {
calculator: "TimedBoxListToRenderDataCalculator"
input_stream: "BOX_LIST:labeled_detections"
output_stream: "RENDER_DATA:box_render_data"
node_options: {
[type.googleapis.com/mediapipe.TimedBoxListToRenderDataCalculatorOptions] {
box_color { r: 255 g: 0 b: 0 }
thickness: 5.0
}
}
}
node {
calculator: "LandmarksToRenderDataCalculator"
input_stream: "NORM_LANDMARKS:landmarks"
output_stream: "RENDER_DATA:landmarks_render_data"
node_options: {
[type.googleapis.com/mediapipe.LandmarksToRenderDataCalculatorOptions] {
landmark_color { r: 0 g: 255 b: 0 }
thickness: 2.0
}
}
}
# Draws annotations and overlays them on top of the input images.
node {
calculator: "AnnotationOverlayCalculator"
input_stream: "IMAGE:input_video"
input_stream: "box_render_data"
input_stream: "landmarks_render_data"
output_stream: "IMAGE:output_video"
}
# Encodes the annotated images into a video file, adopting properties specified
# in the input video header, e.g., video framerate.
node {
calculator: "OpenCvVideoEncoderCalculator"
input_stream: "VIDEO:output_video"
input_stream: "VIDEO_PRESTREAM:input_video_header"
input_side_packet: "OUTPUT_FILE_PATH:output_video_path"
node_options: {
[type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: {
codec: "avc1"
video_format: "mp4"
}
}
}

View File

@ -0,0 +1,136 @@
# MediaPipe graph that performs template matching with TensorFlow Lite on CPU.
# Used in the example in
# mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu
# Images on GPU coming into and out of the graph.
input_stream: "input_video"
output_stream: "output_video"
# Throttles the images flowing downstream for flow control.
node {
calculator: "FlowLimiterCalculator"
input_stream: "input_video"
input_stream: "FINISHED:detections"
input_stream_info: {
tag_index: "FINISHED"
back_edge: true
}
output_stream: "throttled_input_video"
}
# Transfers the input image from GPU to CPU memory.
node: {
calculator: "GpuBufferToImageFrameCalculator"
input_stream: "throttled_input_video"
output_stream: "input_video_cpu"
}
# Transforms the input image on CPU to a 480x640 image.
node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE:input_video_cpu"
output_stream: "IMAGE:transformed_input_video_cpu"
node_options: {
[type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] {
output_width: 480
output_height: 640
}
}
}
node {
calculator: "ImagePropertiesCalculator"
input_stream: "IMAGE:transformed_input_video_cpu"
output_stream: "SIZE:input_video_size"
}
node {
calculator: "FeatureDetectorCalculator"
input_stream: "IMAGE:transformed_input_video_cpu"
output_stream: "FEATURES:features"
output_stream: "LANDMARKS:landmarks"
output_stream: "PATCHES:patches"
}
# input tensors: 200*32*32*1 float
# output tensors: 200*40 float; only the first keypoint.size()*40 values are
# KNIFT features, the rest is zero padding.
node {
calculator: "TfLiteInferenceCalculator"
input_stream: "TENSORS:patches"
output_stream: "TENSORS:knift_feature_tensors"
node_options: {
[type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] {
model_path: "mediapipe/models/knift_float.tflite"
delegate { xnnpack {} }
}
}
}
node {
calculator: "TfLiteTensorsToFloatsCalculator"
input_stream: "TENSORS:knift_feature_tensors"
output_stream: "FLOATS:knift_feature_floats"
}
node {
calculator: "BoxDetectorCalculator"
input_stream: "FEATURES:features"
input_stream: "IMAGE_SIZE:input_video_size"
input_stream: "DESCRIPTORS:knift_feature_floats"
output_stream: "BOXES:detections"
node_options: {
[type.googleapis.com/mediapipe.BoxDetectorCalculatorOptions] {
detector_options {
index_type: OPENCV_BF
detect_every_n_frame: 1
}
index_proto_filename: "mediapipe/models/knift_index.pb"
}
}
}
node {
calculator: "TimedBoxListIdToLabelCalculator"
input_stream: "detections"
output_stream: "labeled_detections"
node_options: {
[type.googleapis.com/mediapipe.TimedBoxListIdToLabelCalculatorOptions] {
label_map_path: "mediapipe/models/knift_labelmap.txt"
}
}
}
node {
calculator: "TimedBoxListToRenderDataCalculator"
input_stream: "BOX_LIST:labeled_detections"
output_stream: "RENDER_DATA:box_render_data"
node_options: {
[type.googleapis.com/mediapipe.TimedBoxListToRenderDataCalculatorOptions] {
box_color { r: 255 g: 0 b: 0 }
thickness: 5.0
}
}
}
node {
calculator: "LandmarksToRenderDataCalculator"
input_stream: "NORM_LANDMARKS:landmarks"
output_stream: "RENDER_DATA:landmarks_render_data"
node_options: {
[type.googleapis.com/mediapipe.LandmarksToRenderDataCalculatorOptions] {
landmark_color { r: 0 g: 255 b: 0 }
thickness: 2.0
}
}
}
# Draws annotations and overlays them on top of the input images.
node {
calculator: "AnnotationOverlayCalculator"
input_stream: "IMAGE_GPU:throttled_input_video"
input_stream: "box_render_data"
input_stream: "landmarks_render_data"
output_stream: "IMAGE_GPU:output_video"
}

View File

@ -34,3 +34,10 @@ Here are the descriptions of the models used in the [example applications](../do
* [Model page](https://sites.google.com/corp/view/perception-cv4arvr/hair-segmentation)
* Paper: ["Real-time Hair segmentation and recoloring on Mobile GPUs"](https://arxiv.org/abs/1907.06740)
* [Model card](https://drive.google.com/file/d/1lPwJ8BD_-3UUor4LayQ0xpa_RIC_hoRh/view)
### KNIFT (Keypoint Neural Invariant Feature Transform)
* Up to 200 keypoints: [TFLite model](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_float.tflite)
* Up to 400 keypoints: [TFLite model](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_float_400.tflite)
* [Google Developers Blog post](https://mediapipe.page.link/knift)
* [Model card](https://mediapipe.page.link/knift-mc)

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,3 @@
1USD
20USD
5USD

View File

@ -75,3 +75,31 @@ cc_test(
"//conditions:default": [],
}),
)
cc_library(
name = "tflite_gpu_runner",
srcs = select({
"//mediapipe:ios": [],
"//mediapipe:macos": [],
"//conditions:default": ["tflite_gpu_runner.cc"],
}),
hdrs = select({
"//mediapipe:ios": [],
"//mediapipe:macos": [],
"//conditions:default": ["tflite_gpu_runner.h"],
}),
deps = select({
"//mediapipe:ios": [],
"//mediapipe:macos": [],
"//conditions:default": [
"@com_google_absl//absl/strings",
"//mediapipe/framework/port:ret_check",
"//mediapipe/framework/port:status",
"//mediapipe/framework/port:statusor",
"@org_tensorflow//tensorflow/lite:framework",
"@org_tensorflow//tensorflow/lite/delegates/gpu:api",
"@org_tensorflow//tensorflow/lite/delegates/gpu/common:model",
"@org_tensorflow//tensorflow/lite/delegates/gpu/gl:api2",
],
}),
)

View File

@ -0,0 +1,138 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tflite/tflite_gpu_runner.h"
#include <cstdint>
#include <memory>
#include <utility>
#include "absl/strings/substitute.h"
#include "mediapipe/framework/port/canonical_errors.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/port/status_macros.h"
#include "mediapipe/framework/port/statusor.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/gl/api2.h"
#include "tensorflow/lite/model.h"
namespace tflite {
namespace gpu {
namespace {
ObjectDef GetSSBOObjectDef(int channels) {
ObjectDef gpu_object_def;
gpu_object_def.data_type = DataType::FLOAT32;
gpu_object_def.data_layout = DataLayout::BHWC;
if (channels == 4) {
gpu_object_def.data_layout = DataLayout::DHWC4;
}
gpu_object_def.object_type = ObjectType::OPENGL_SSBO;
gpu_object_def.user_provided = true;
return gpu_object_def;
}
} // namespace
mediapipe::Status TFLiteGPURunner::InitializeWithModel(
const tflite::FlatBufferModel& flatbuffer) {
for (const auto& input : graph_->inputs()) {
input_shapes_.push_back(input->tensor.shape);
}
for (const auto& output : graph_->outputs()) {
output_shapes_.push_back(output->tensor.shape);
}
return absl::OkStatus();
}
mediapipe::StatusOr<int64_t> TFLiteGPURunner::GetInputElements(int id) {
if (id >= input_shapes_.size()) {
return ::mediapipe::InternalError("Wrong input tensor id.");
} else {
return input_shapes_[id].DimensionsProduct();
}
}
mediapipe::StatusOr<int64_t> TFLiteGPURunner::GetOutputElements(int id) {
if (id >= output_shapes_.size()) {
return ::mediapipe::InternalError("Wrong output tensor id.");
} else {
return output_shapes_[id].DimensionsProduct();
}
}
mediapipe::Status TFLiteGPURunner::Build() {
// 1. Prepare inference builder.
std::unique_ptr<InferenceBuilder> builder;
MP_RETURN_IF_ERROR(InitializeOpenGL(&builder));
// 2. Describe output/input objects for created builder.
for (int flow_index = 0; flow_index < input_shapes_.size(); ++flow_index) {
if (input_ssbo_ids_.find(flow_index) == input_ssbo_ids_.end()) {
return absl::AlreadyExistsError(absl::Substitute(
"Couldn't find an OpenGL SSBO for input $0.", flow_index));
}
MP_RETURN_IF_ERROR(builder->SetInputObjectDef(
flow_index, GetSSBOObjectDef(input_shapes_[flow_index].c)));
}
for (int flow_index = 0; flow_index < output_shapes_.size(); ++flow_index) {
if (output_ssbo_ids_.find(flow_index) == output_ssbo_ids_.end()) {
return absl::AlreadyExistsError(absl::Substitute(
"Couldn't find an OpenGL SSBO for output $0.", flow_index));
}
MP_RETURN_IF_ERROR(builder->SetOutputObjectDef(
flow_index, GetSSBOObjectDef(output_shapes_[flow_index].c)));
}
// 3. Build inference runner with the created builder.
return builder->Build(&runner_);
}
mediapipe::Status TFLiteGPURunner::BindSSBOToInputTensor(GLuint ssbo_id,
int input_id) {
OpenGlBuffer buffer;
buffer.id = ssbo_id;
return runner_->SetInputObject(input_id, std::move(buffer));
}
mediapipe::Status TFLiteGPURunner::BindSSBOToOutputTensor(GLuint ssbo_id,
int output_id) {
OpenGlBuffer buffer;
buffer.id = ssbo_id;
return runner_->SetOutputObject(output_id, std::move(buffer));
}
mediapipe::Status TFLiteGPURunner::Invoke() { return runner_->Run(); }
mediapipe::Status TFLiteGPURunner::InitializeOpenGL(
std::unique_ptr<InferenceBuilder>* builder) {
gl::InferenceEnvironmentOptions env_options;
gl::InferenceEnvironmentProperties properties;
gl::InferenceOptions gl_options;
gl_options.priority1 = options_.priority1;
gl_options.priority2 = options_.priority2;
gl_options.priority3 = options_.priority3;
gl_options.usage = options_.usage;
MP_RETURN_IF_ERROR(
NewInferenceEnvironment(env_options, &gl_environment_, &properties));
MP_RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph_),
gl_options, builder));
graph_.release();
return absl::OkStatus();
}
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,90 @@
// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MEDIAPIPE_CALCULATORS_TFLITE_TFLITE_GPU_RUNNER_H_
#define MEDIAPIPE_CALCULATORS_TFLITE_TFLITE_GPU_RUNNER_H_
#include <cstdint>
#include <memory>
#include <vector>
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/port/statusor.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/gl/api2.h"
#include "tensorflow/lite/model.h"
namespace tflite {
namespace gpu {
// Executes GPU based inference using the TFLite GPU delegate api2.
// Currently supports only GPU inputs/outputs.
//
// Typical order of execution:
// 1. Initialize with the flatbuffer model using InitializeWithModel().
// 2. Bind OpenGL SSBO objects as inputs and outputs using
// BindSSBOToInputTensor() and BindSSBOToOutputTensor().
// 3. Build the inference runner with Build() method.
// 4. Invoke() executes the inference, where inputs and outputs are those which
// were specified earlier. Invoke() may be called in a loop.
//
// Note: All of these need to happen inside MediaPipe's RunInGlContext to make
// sure that all steps, from inference construction to execution, are done using
// the same OpenGL context.
class TFLiteGPURunner {
public:
explicit TFLiteGPURunner(const InferenceOptions& options)
: options_(options) {}
mediapipe::Status InitializeWithModel(
const tflite::FlatBufferModel& flatbuffer);
mediapipe::Status BindSSBOToInputTensor(GLuint ssbo_id, int input_id);
mediapipe::Status BindSSBOToOutputTensor(GLuint ssbo_id, int output_id);
int inputs_size() const { return input_shapes_.size(); }
int outputs_size() const { return output_shapes_.size(); }
mediapipe::StatusOr<int64_t> GetInputElements(int id);
mediapipe::StatusOr<int64_t> GetOutputElements(int id);
mediapipe::Status Build();
mediapipe::Status Invoke();
private:
mediapipe::Status InitializeOpenGL(
std::unique_ptr<InferenceBuilder>* builder);
InferenceOptions options_;
std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
// graph_ is maintained temporarily and becomes invalid after runner_ is ready
std::unique_ptr<GraphFloat32> graph_;
std::unique_ptr<InferenceRunner> runner_;
// Store registered OpenGL ssbo ids for the corresponding input/output tensor.
// key: io tensor position, value: OpenGL ssbo id.
std::unordered_map<int, GLuint> input_ssbo_ids_;
std::unordered_map<int, GLuint> output_ssbo_ids_;
// We keep information about input/output shapes, because they are needed
// after graph_ becomes "converted" into runner_.
std::vector<BHWC> input_shapes_;
std::vector<BHWC> output_shapes_;
};
} // namespace gpu
} // namespace tflite
#endif // MEDIAPIPE_CALCULATORS_TFLITE_TFLITE_GPU_RUNNER_H_

View File

@ -68,10 +68,10 @@ message BoxDetectorOptions {
optional int32 min_num_correspondence = 6 [default = 5];
// Reprojection threshold for RANSAC to find inliers.
optional float ransac_reprojection_threshold = 7 [default = 0.02];
optional float ransac_reprojection_threshold = 7 [default = 0.005];
// Max distance to match 2 NIMBY features.
optional float max_match_distance = 8 [default = 0.8];
optional float max_match_distance = 8 [default = 0.9];
// Max perspective change factor.
optional float max_perspective_factor = 9 [default = 0.1];