mediapipe/mediapipe/framework/formats/tensor.h

// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MEDIAPIPE_FRAMEWORK_FORMATS_TENSOR_H_
#define MEDIAPIPE_FRAMEWORK_FORMATS_TENSOR_H_

#include <algorithm>
#include <functional>
#include <initializer_list>
#include <numeric>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/log/absl_check.h"
#include "absl/synchronization/mutex.h"
#include "mediapipe/framework/formats/tensor/internal.h"
#include "mediapipe/framework/port.h"

// Supported use cases for tensor_ahwb:
// 1. Native code running in Android apps.
// 2. Android vendor processes linked against nativewindow.
#if !defined(MEDIAPIPE_NO_JNI) || defined(MEDIAPIPE_ANDROID_LINK_NATIVE_WINDOW)
#if __ANDROID_API__ >= 26 || defined(__ANDROID_UNAVAILABLE_SYMBOLS_ARE_WEAK__)
#define MEDIAPIPE_TENSOR_USE_AHWB 1
#endif  // __ANDROID_API__ >= 26 ||
        // defined(__ANDROID_UNAVAILABLE_SYMBOLS_ARE_WEAK__)
#endif  // !defined(MEDIAPIPE_NO_JNI) ||
        // defined(MEDIAPIPE_ANDROID_LINK_NATIVE_WINDOW)

#ifdef MEDIAPIPE_TENSOR_USE_AHWB
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <android/hardware_buffer.h>
#endif  // MEDIAPIPE_TENSOR_USE_AHWB
#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30
#include "mediapipe/gpu/gl_base.h"
#include "mediapipe/gpu/gl_context.h"
#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30

#if defined __has_builtin
#if __has_builtin(__builtin_LINE)
#define builtin_LINE __builtin_LINE
#endif
#if __has_builtin(__builtin_FILE)
#define builtin_FILE __builtin_FILE
#endif
#endif

#ifndef builtin_LINE
#define builtin_LINE() 0
#endif
#ifndef builtin_FILE
#define builtin_FILE() ""
#endif

namespace mediapipe {
// Tensor is a container of multi-dimensional data that supports sharing the
// content across different backends and APIs, currently: CPU / Metal / OpenGL.
// Texture2DView is limited to 4 dimensions.
// The content is accessible through requesting device specific views.
// Acquiring a view guarantees that the content is not changed by another thread
// until the view is released.
//
// Tensor::MtlBufferView view = tensor.GetMtlBufferWriteView(mtl_device);
// mtl_device is used to create MTLBuffer
// id<MTLBuffer> buffer = view.buffer();
// For OpenGL the code below must be called by a thread with valid OpenGL ES
// context bound:
// GLuint buffer = view.buffer();
// Then the buffer can be bound to the GPU command buffer.
// ...binding the buffer to the command buffer...
// ...committing command buffer and releasing the view...
//
// The following request for the CPU view will be blocked until the GPU view is
// released and the GPU task is finished.
//
// auto view = tensor.GetCpuReadView();
// float* pointer = view.buffer<float>();
// ...reading the cpu memory...

struct MtlResources;
class Tensor {
  class View {
   public:
    // Non-copyable.
    View(const View&) = delete;
    View& operator=(const View&) = delete;
    View(View&& src) = default;

   protected:
    View(std::unique_ptr<absl::MutexLock>&& lock) : lock_(std::move(lock)) {}
    std::unique_ptr<absl::MutexLock> lock_;
  };

 public:
  // No resources are allocated here.
  enum class ElementType {
    kNone,
    kFloat16,
    kFloat32,
    kUInt8,
    kInt8,
    kInt32,
    kChar,
    kBool
  };
  struct Shape {
    Shape() = default;
    Shape(std::initializer_list<int> dimensions) : dims(dimensions) {}
    Shape(const std::vector<int>& dimensions) : dims(dimensions) {}
    Shape(std::initializer_list<int> dimensions, bool is_dynamic)
        : dims(dimensions), is_dynamic(is_dynamic) {}
    Shape(const std::vector<int>& dimensions, bool is_dynamic)
        : dims(dimensions), is_dynamic(is_dynamic) {}
    int num_elements() const {
      return std::accumulate(dims.begin(), dims.end(), 1,
                             std::multiplies<int>());
    }
    std::vector<int> dims;
    // The Tensor has dynamic rather than static shape so the TFLite interpreter
    // needs to be reallocated. Only relevant for CPU.
    bool is_dynamic = false;
  };
  // Quantization parameters corresponding to the zero_point and scale value
  // made available by TfLite quantized (uint8/int8) tensors.
  struct QuantizationParameters {
    QuantizationParameters() = default;
    QuantizationParameters(float scale, int zero_point)
        : scale(scale), zero_point(zero_point) {}
    float scale = 1.0f;
    int zero_point = 0;
  };

  Tensor(ElementType element_type, const Shape& shape);
  Tensor(ElementType element_type, const Shape& shape,
         const QuantizationParameters& quantization_parameters);

  // Non-copyable.
  Tensor(const Tensor&) = delete;
  Tensor& operator=(const Tensor&) = delete;
  // Move-only.
  Tensor(Tensor&& src);
  Tensor& operator=(Tensor&&);
  ~Tensor();

  template <typename T>
  class CpuView : public View {
   public:
    template <typename P>
    auto buffer() const {
      // const and non-const return  type selection.
      return static_cast<typename std::tuple_element<
          std::is_const<T>::value, std::tuple<P*, const P*> >::type>(buffer_);
    }
    CpuView(CpuView&& src) : View(std::move(src)) {
      buffer_ = std::exchange(src.buffer_, nullptr);
      release_callback_ = std::exchange(src.release_callback_, nullptr);
    }
    ~CpuView() {
      if (release_callback_) release_callback_();
    }

   protected:
    friend class Tensor;
    CpuView(T* buffer, std::unique_ptr<absl::MutexLock>&& lock,
            std::function<void()> release_callback = nullptr)
        : View(std::move(lock)),
          buffer_(buffer),
          release_callback_(release_callback) {}
    T* buffer_;
    std::function<void()> release_callback_;
  };
  using CpuReadView = CpuView<const void>;
  CpuReadView GetCpuReadView() const;
  using CpuWriteView = CpuView<void>;
  CpuWriteView GetCpuWriteView(
      uint64_t source_location_hash =
          tensor_internal::FnvHash64(builtin_FILE(), builtin_LINE())) const;

#ifdef MEDIAPIPE_TENSOR_USE_AHWB
  using FinishingFunc = std::function<bool(bool)>;
  class AHardwareBufferView : public View {
   public:
    AHardwareBuffer* handle() const { return handle_; }
    AHardwareBufferView(AHardwareBufferView&& src) : View(std::move(src)) {
      handle_ = std::exchange(src.handle_, nullptr);
      file_descriptor_ = src.file_descriptor_;
      fence_fd_ = std::exchange(src.fence_fd_, nullptr);
      ahwb_written_ = std::exchange(src.ahwb_written_, nullptr);
      release_callback_ = std::exchange(src.release_callback_, nullptr);
    }
    int file_descriptor() const { return file_descriptor_; }
    void SetReadingFinishedFunc(FinishingFunc&& func) {
      ABSL_CHECK(ahwb_written_)
          << "AHWB write view can't accept 'reading finished callback'";
      *ahwb_written_ = std::move(func);
    }
    void SetWritingFinishedFD(int fd, FinishingFunc func = nullptr) {
      ABSL_CHECK(fence_fd_)
          << "AHWB read view can't accept 'writing finished file descriptor'";
      *fence_fd_ = fd;
      *ahwb_written_ = std::move(func);
    }
    // The function is called when the tensor is released.
    void SetReleaseCallback(std::function<void()> callback) {
      *release_callback_ = std::move(callback);
    }

   protected:
    friend class Tensor;
    AHardwareBufferView(AHardwareBuffer* handle, int file_descriptor,
                        int* fence_fd, FinishingFunc* ahwb_written,
                        std::function<void()>* release_callback,
                        std::unique_ptr<absl::MutexLock>&& lock)
        : View(std::move(lock)),
          handle_(handle),
          file_descriptor_(file_descriptor),
          fence_fd_(fence_fd),
          ahwb_written_(ahwb_written),
          release_callback_(release_callback) {}
    AHardwareBuffer* handle_;
    int file_descriptor_;
    // The view sets some Tensor's fields. The view is released prior to tensor.
    int* fence_fd_;
    FinishingFunc* ahwb_written_;
    std::function<void()>* release_callback_;
  };
  AHardwareBufferView GetAHardwareBufferReadView() const;
  // size_alignment is an optional argument to tell the API to allocate
  // a buffer that is padded to multiples of size_alignment bytes.
  // size_alignment must be power of 2, i.e. 2, 4, 8, 16, 64, etc.
  // If size_alignment is 0, then the buffer will not be padded.
  AHardwareBufferView GetAHardwareBufferWriteView(int size_alignment = 0) const;
#endif  // MEDIAPIPE_TENSOR_USE_AHWB

#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30
  // TODO: Use GlTextureView instead.
  // Only float32 textures are supported with 1/2/3/4 depths.
  // OpenGlTexture2dView currently only supports BHWC memory layout.
  class OpenGlTexture2dView : public View {
   public:
    GLuint name() const { return name_; }
    OpenGlTexture2dView(OpenGlTexture2dView&& src)
        : View(std::move(src)), name_(src.name_) {
      src.name_ = GL_INVALID_INDEX;
    }
    // To fit a tensor into a texture two layouts are used:
    // 1. Aligned. Width of the texture = tensor_width * num_slices, where slice
    //    is a group of 4 depth values. Tensor depth is padded to 4.
    // 2. Linearized. If texture width or height with the layout 1. is greater
    //    than the GPU supports then all tensor values are packed into a texture
    //    with fixed width calculated by this method.
    // Must be called with the valid GL context bound to the current thread.
    enum class Layout { kAligned, kLinearized };
    static Layout GetLayoutDimensions(const Tensor::Shape& shape, int* width,
                                      int* height);

   protected:
    friend class Tensor;
    OpenGlTexture2dView(GLuint name, std::unique_ptr<absl::MutexLock>&& lock)
        : View(std::move(lock)), name_(name) {}
    GLuint name_;
  };
  // A valid OpenGL context must be bound to the calling thread due to possible
  // GPU resource allocation.
  OpenGlTexture2dView GetOpenGlTexture2dReadView() const;
  OpenGlTexture2dView GetOpenGlTexture2dWriteView() const;
#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30

#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
  class OpenGlBufferView : public View {
   public:
    GLuint name() const { return name_; }
    OpenGlBufferView(OpenGlBufferView&& src) : View(std::move(src)) {
      name_ = std::exchange(src.name_, GL_INVALID_INDEX);
      ssbo_read_ = std::exchange(src.ssbo_read_, nullptr);
    }
    ~OpenGlBufferView() {
      if (ssbo_read_) {
        *ssbo_read_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
      }
    }

   protected:
    friend class Tensor;
    OpenGlBufferView(GLuint name, std::unique_ptr<absl::MutexLock>&& lock,
                     GLsync* ssbo_read)
        : View(std::move(lock)), name_(name), ssbo_read_(ssbo_read) {}
    GLuint name_;
    GLsync* ssbo_read_;
  };
  // A valid OpenGL context must be bound to the calling thread due to possible
  // GPU resource allocation.
  OpenGlBufferView GetOpenGlBufferReadView() const;
  OpenGlBufferView GetOpenGlBufferWriteView(
      uint64_t source_location_hash =
          tensor_internal::FnvHash64(builtin_FILE(), builtin_LINE())) const;
#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31

  const Shape& shape() const { return shape_; }
  ElementType element_type() const { return element_type_; }
  const QuantizationParameters& quantization_parameters() const {
    return quantization_parameters_;
  }
  int element_size() const {
    switch (element_type_) {
      case ElementType::kNone:
        return 0;
      case ElementType::kFloat16:
        return 2;
      case ElementType::kFloat32:
        return sizeof(float);
      case ElementType::kUInt8:
        return 1;
      case ElementType::kInt8:
        return 1;
      case ElementType::kInt32:
        return sizeof(int32_t);
      case ElementType::kChar:
        return sizeof(char);
      case ElementType::kBool:
        return sizeof(bool);
    }
  }
  int bytes() const { return shape_.num_elements() * element_size(); }

  bool ready_on_cpu() const {
    return valid_ & (kValidAHardwareBuffer | kValidCpu);
  }
  bool ready_on_gpu() const {
    return valid_ & (kValidMetalBuffer | kValidOpenGlBuffer |
                     kValidAHardwareBuffer | kValidOpenGlTexture2d);
  }
  bool ready_as_metal_buffer() const { return valid_ & kValidMetalBuffer; }
  bool ready_as_opengl_buffer() const {
    return valid_ & (kValidAHardwareBuffer | kValidOpenGlBuffer);
  }
  bool ready_as_opengl_texture_2d() const {
    return valid_ & kValidOpenGlTexture2d;
  }

 private:
  friend class MtlBufferView;
  void Move(Tensor*);
  void Invalidate();

  ElementType element_type_;
  Shape shape_;
  QuantizationParameters quantization_parameters_;

  // The flags describe the current source of truth resource type.
  enum {
    kValidNone = 0,
    kValidCpu = 1 << 0,
    kValidMetalBuffer = 1 << 1,
    kValidOpenGlBuffer = 1 << 2,
    kValidOpenGlTexture2d = 1 << 3,
    kValidAHardwareBuffer = 1 << 5,
  };
  // A list of resource which are currently allocated and synchronized between
  // each-other: valid_ = kValidCpu | kValidMetalBuffer;
  mutable int valid_ = 0;
  // The mutex is locked by Get*View and is kept by all Views.
  mutable absl::Mutex view_mutex_;

  mutable void* cpu_buffer_ = nullptr;
  void AllocateCpuBuffer() const;
  // Forward declaration of the MtlResources provides compile-time verification
  // of ODR if this header includes any actual code that uses MtlResources.
  mutable std::unique_ptr<MtlResources> mtl_resources_;

#ifdef MEDIAPIPE_TENSOR_USE_AHWB
  mutable AHardwareBuffer* ahwb_ = nullptr;
  // Signals when GPU finished writing into SSBO so AHWB can be used then. Or
  // signals when writing into AHWB has been finished so GPU can read from SSBO.
  // Sync and FD are bound together.
  mutable EGLSyncKHR fence_sync_ = EGL_NO_SYNC_KHR;
  // This FD signals when the writing into the SSBO has been finished.
  mutable int ssbo_written_ = -1;
  // An externally set FD that is wrapped with the EGL sync then to synchronize
  // AHWB -> OpenGL SSBO.
  mutable int fence_fd_ = -1;
  // Reading from SSBO has been finished so SSBO can be released.
  mutable GLsync ssbo_read_ = 0;
  // An externally set function that signals when it is safe to release AHWB.
  // If the input parameter is 'true' then wait for the writing to be finished.
  mutable FinishingFunc ahwb_written_;
  mutable std::function<void()> release_callback_;
  bool AllocateAHardwareBuffer(int size_alignment = 0) const;
  void CreateEglSyncAndFd() const;
#endif  // MEDIAPIPE_TENSOR_USE_AHWB
  // Use Ahwb for other views: OpenGL / CPU buffer.
  mutable bool use_ahwb_ = false;
  mutable uint64_t ahwb_tracking_key_ = 0;
  // TODO: Tracks all unique tensors. Can grow to a large number. LRU
  // (Least Recently Used) can be more predicted.
  // The value contains the size alignment parameter.
  static inline absl::flat_hash_map<uint64_t, int> ahwb_usage_track_;
  // Expects the target SSBO to be already bound.
  bool AllocateAhwbMapToSsbo() const;
  bool InsertAhwbToSsboFence() const;
  void MoveAhwbStuff(Tensor* src);
  void ReleaseAhwbStuff();
  void* MapAhwbToCpuRead() const;
  void* MapAhwbToCpuWrite() const;
  void MoveCpuOrSsboToAhwb() const;
  // Set current tracking key, set "use ahwb" if the key is already marked.
  void TrackAhwbUsage(uint64_t key) const;

#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30
  mutable std::shared_ptr<mediapipe::GlContext> gl_context_;
  mutable GLuint opengl_texture2d_ = GL_INVALID_INDEX;
  mutable GLuint frame_buffer_ = GL_INVALID_INDEX;
  mutable int texture_width_;
  mutable int texture_height_;
#ifdef __EMSCRIPTEN__
  mutable bool texture_is_half_float_ = false;
#endif  // __EMSCRIPTEN__
  void AllocateOpenGlTexture2d() const;
#if MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
  mutable GLuint opengl_buffer_ = GL_INVALID_INDEX;
  void AllocateOpenGlBuffer() const;
#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_31
  bool NeedsHalfFloatRenderTarget() const;
#endif  // MEDIAPIPE_OPENGL_ES_VERSION >= MEDIAPIPE_OPENGL_ES_30
};

int BhwcBatchFromShape(const Tensor::Shape& shape);
int BhwcHeightFromShape(const Tensor::Shape& shape);
int BhwcWidthFromShape(const Tensor::Shape& shape);
int BhwcDepthFromShape(const Tensor::Shape& shape);

}  // namespace mediapipe

#endif  // MEDIAPIPE_FRAMEWORK_FORMATS_TENSOR_H_