// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/strings/substitute.h"
#include "mediapipe/calculators/tflite/tflite_inference_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/util/tflite/config.h"

#if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__)
#include "mediapipe/util/cpu_util.h"
#endif  // !__EMSCRIPTEN__ || __EMSCRIPTEN_PTHREADS__

#include "mediapipe/util/resource_util.h"
#include "tensorflow/lite/error_reporter.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"

#if MEDIAPIPE_TFLITE_GL_INFERENCE
#include "mediapipe/gpu/gl_calculator_helper.h"
#include "mediapipe/gpu/gpu_buffer.h"
#include "mediapipe/util/tflite/tflite_gpu_runner.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_buffer.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_program.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_shader.h"
#include "tensorflow/lite/delegates/gpu/gl_delegate.h"
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE

#if MEDIAPIPE_TFLITE_METAL_INFERENCE
#import <CoreVideo/CoreVideo.h>
#import <Metal/Metal.h>
#import <MetalKit/MetalKit.h>

#import "mediapipe/gpu/MPPMetalHelper.h"
#include "mediapipe/gpu/MPPMetalUtil.h"
#include "mediapipe/gpu/gpu_buffer.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/metal/buffer_convert.h"
#include "tensorflow/lite/delegates/gpu/metal_delegate.h"
#include "tensorflow/lite/delegates/gpu/metal_delegate_internal.h"
#endif  // MEDIAPIPE_TFLITE_METAL_INFERENCE

#if !defined(MEDIAPIPE_EDGE_TPU)
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
#endif  // !MEDIAPIPE_EDGE_TPU

#if defined(MEDIAPIPE_ANDROID)
#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#endif  // MEDIAPIPE_ANDROID

namespace {
// Commonly used to compute the number of blocks to launch in a kernel.
int NumGroups(const int size, const int group_size) {  // NOLINT
  return (size + group_size - 1) / group_size;
}

// Round up n to the next multiple of m.
size_t RoundUp(size_t n, size_t m) { return ((n + m - 1) / m) * m; }  // NOLINT

constexpr char kTensorsTag[] = "TENSORS";
constexpr char kTensorsGpuTag[] = "TENSORS_GPU";
}  // namespace

#if defined(MEDIAPIPE_EDGE_TPU)
#include "edgetpu.h"

// Creates and returns an Edge TPU interpreter to run the given Edge TPU model.
std::unique_ptr<tflite::Interpreter> BuildEdgeTpuInterpreter(
    const tflite::FlatBufferModel& model,
    tflite::ops::builtin::BuiltinOpResolver* resolver,
    edgetpu::EdgeTpuContext* edgetpu_context) {
  resolver->AddCustom(edgetpu::kCustomOp, edgetpu::RegisterCustomOp());
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (tflite::InterpreterBuilder(model, *resolver)(&interpreter) != kTfLiteOk) {
    std::cerr << "Failed to build edge TPU interpreter." << std::endl;
  }
  interpreter->SetExternalContext(kTfLiteEdgeTpuContext, edgetpu_context);
  interpreter->SetNumThreads(1);
  if (interpreter->AllocateTensors() != kTfLiteOk) {
    std::cerr << "Failed to allocate edge TPU tensors." << std::endl;
  }
  return interpreter;
}
#endif  // MEDIAPIPE_EDGE_TPU

// TfLiteInferenceCalculator File Layout:
//  * Header
//  * Core
//  * Aux
namespace mediapipe {

#if MEDIAPIPE_TFLITE_GL_INFERENCE
using ::tflite::gpu::gl::CopyBuffer;
using ::tflite::gpu::gl::CreateReadWriteShaderStorageBuffer;
using ::tflite::gpu::gl::GlBuffer;
#endif

#if MEDIAPIPE_TFLITE_GPU_SUPPORTED
namespace {
struct GPUData {
  int elements = 1;
  GpuTensor buffer;
  ::tflite::gpu::BHWC shape;
};
}  // namespace
#endif  // MEDIAPIPE_TFLITE_GPU_SUPPORTED

// Returns the number of threads to configure the XNNPACK delegate with:
// the user-provided value if specified, otherwise the number of high
// (performance) cores, hard-coded to 1 for Emscripten without the Threads
// extension.
int GetXnnpackNumThreads(
    const mediapipe::TfLiteInferenceCalculatorOptions& opts) {
  static constexpr int kDefaultNumThreads = -1;
  if (opts.has_delegate() && opts.delegate().has_xnnpack() &&
      opts.delegate().xnnpack().num_threads() != kDefaultNumThreads) {
    return opts.delegate().xnnpack().num_threads();
  }
#if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__)
  return InferHigherCoreIds().size();
#else
  return 1;
#endif  // !__EMSCRIPTEN__ || __EMSCRIPTEN_PTHREADS__
}

// Calculator Header Section

// Runs inference on the provided input TFLite tensors and TFLite model.
//
// Creates an interpreter with the given model and calls Invoke().
// Optionally runs inference on CPU or GPU.
//
// This calculator is designed to be used with the TfLiteConverterCalculator,
// which produces the appropriate inputs.
//
// When the input tensors are on CPU, GPU inference is optional and can be
// specified in the calculator options.
// When the input tensors are on GPU, inference runs on GPU and output can be
// CPU or GPU.
//
// Input:
//  TENSORS - Vector of TfLiteTensor of type kTfLiteFloat32 or kTfLiteUInt8
//  TENSORS_GPU - Vector of GlBuffer or MTLBuffer
//
// Output:
//  TENSORS - Vector of TfLiteTensor of type kTfLiteFloat32 or kTfLiteUInt8
//  TENSORS_GPU - Vector of GlBuffer or MTLBuffer
//
// Input side packet:
//  CUSTOM_OP_RESOLVER (optional) - Use a custom op resolver,
//                                  instead of the builtin one.
//  MODEL (optional) - Use to specify a TfLite model
//                     (std::unique_ptr<tflite::FlatBufferModel,
//                       std::function<void(tflite::FlatBufferModel*)>>)
//
// Example use:
// node {
//   calculator: "TfLiteInferenceCalculator"
//   input_stream: "TENSORS:tensor_image"
//   output_stream: "TENSORS:tensors"
//   options: {
//     [mediapipe.TfLiteInferenceCalculatorOptions.ext] {
//       model_path: "modelname.tflite"
//     }
//   }
// }
//
// or
//
// node {
//   calculator: "TfLiteInferenceCalculator"
//   input_stream: "TENSORS_GPU:tensor_image"
//   input_side_packet: "MODEL:model"
//   output_stream: "TENSORS_GPU:tensors"
//   options: {
//     [mediapipe.TfLiteInferenceCalculatorOptions.ext] {
//       model_path: "modelname.tflite"
//       delegate { gpu {} }
//     }
//   }
// }
//
// IMPORTANT Notes:
//  Tensors are assumed to be ordered correctly (sequentially added to model).
//  Input tensors are assumed to be of the correct size and already normalized.
//  All output TfLiteTensors will be destroyed when the graph closes,
//  (i.e. after calling graph.WaitUntilDone()).
//  GPU tensor support requires OpenGL ES 3.1+.
//  This calculator uses FixedSizeInputStreamHandler by default.
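//
// A hypothetical CPU configuration requesting the XNNPACK delegate is sketched
// below; the delegate/xnnpack fields mirror the options read by
// GetXnnpackNumThreads() and LoadDelegate() in this file, and note that Open()
// only loads a CPU delegate on Emscripten, Android and iOS.
//
// node {
//   calculator: "TfLiteInferenceCalculator"
//   input_stream: "TENSORS:tensor_image"
//   output_stream: "TENSORS:tensors"
//   options: {
//     [mediapipe.TfLiteInferenceCalculatorOptions.ext] {
//       model_path: "modelname.tflite"
//       delegate { xnnpack { num_threads: 4 } }
//     }
//   }
// }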
//
class TfLiteInferenceCalculator : public CalculatorBase {
 public:
  using TfLiteDelegatePtr =
      std::unique_ptr<TfLiteDelegate, std::function<void(TfLiteDelegate*)>>;
  using TfLiteModelPtr =
      std::unique_ptr<tflite::FlatBufferModel,
                      std::function<void(tflite::FlatBufferModel*)>>;

  static ::mediapipe::Status GetContract(CalculatorContract* cc);

  ::mediapipe::Status Open(CalculatorContext* cc) override;
  ::mediapipe::Status Process(CalculatorContext* cc) override;
  ::mediapipe::Status Close(CalculatorContext* cc) override;

 private:
  ::mediapipe::Status LoadModel(CalculatorContext* cc);
  ::mediapipe::StatusOr<Packet> GetModelAsPacket(const CalculatorContext& cc);
  ::mediapipe::Status LoadDelegate(CalculatorContext* cc);
  ::mediapipe::Status InitTFLiteGPURunner(CalculatorContext* cc);
  ::mediapipe::Status ProcessInputsCpu(
      CalculatorContext* cc, std::vector<TfLiteTensor>* output_tensors_cpu);
  ::mediapipe::Status ProcessOutputsCpu(
      CalculatorContext* cc,
      std::unique_ptr<std::vector<TfLiteTensor>> output_tensors_cpu);
  ::mediapipe::Status ProcessInputsGpu(
      CalculatorContext* cc, std::vector<GpuTensor>* output_tensors_gpu);
  ::mediapipe::Status ProcessOutputsGpu(
      CalculatorContext* cc,
      std::unique_ptr<std::vector<TfLiteTensor>> output_tensors_cpu,
      std::unique_ptr<std::vector<GpuTensor>> output_tensors_gpu);

  ::mediapipe::Status RunInContextIfNeeded(
      std::function<::mediapipe::Status(void)> f) {
    if (gpu_inference_) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
      return gpu_helper_.RunInGlContext(std::move(f));
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE
    }
    return f();
  }

  Packet model_packet_;
  std::unique_ptr<tflite::Interpreter> interpreter_;
  TfLiteDelegatePtr delegate_;

#if MEDIAPIPE_TFLITE_GL_INFERENCE
  mediapipe::GlCalculatorHelper gpu_helper_;
  std::vector<std::unique_ptr<GPUData>> gpu_data_in_;
  std::vector<std::unique_ptr<GPUData>> gpu_data_out_;
  std::unique_ptr<tflite::gpu::TFLiteGPURunner> tflite_gpu_runner_;
#elif MEDIAPIPE_TFLITE_METAL_INFERENCE
  MPPMetalHelper* gpu_helper_ = nullptr;
  std::vector<std::unique_ptr<GPUData>> gpu_data_in_;
  std::vector<std::unique_ptr<GPUData>> gpu_data_out_;
  id<MTLComputePipelineState> fp32_to_fp16_program_;
  TFLBufferConvert* converter_from_BPHWC4_ = nil;
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE

#if defined(MEDIAPIPE_EDGE_TPU)
  std::shared_ptr<edgetpu::EdgeTpuContext> edgetpu_context_ =
      edgetpu::EdgeTpuManager::GetSingleton()->OpenDevice();
#endif

  bool gpu_inference_ = false;
  bool gpu_input_ = false;
  bool gpu_output_ = false;
  bool use_quantized_tensors_ = false;

  bool use_advanced_gpu_api_ = false;
};
REGISTER_CALCULATOR(TfLiteInferenceCalculator);

// Calculator Core Section

namespace {
template <class CC>
bool ShouldUseGpu(CC* cc) {
#if MEDIAPIPE_TFLITE_GPU_SUPPORTED
  const auto& options =
      cc->template Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
  return options.use_gpu() ||
         (options.has_delegate() && options.delegate().has_gpu()) ||
         cc->Inputs().HasTag(kTensorsGpuTag) ||
         cc->Outputs().HasTag(kTensorsGpuTag);
#else
  return false;
#endif  // MEDIAPIPE_TFLITE_GPU_SUPPORTED
}
}  // namespace

::mediapipe::Status TfLiteInferenceCalculator::GetContract(
    CalculatorContract* cc) {
  RET_CHECK(cc->Inputs().HasTag(kTensorsTag) ^
            cc->Inputs().HasTag(kTensorsGpuTag));
  RET_CHECK(cc->Outputs().HasTag(kTensorsTag) ^
            cc->Outputs().HasTag(kTensorsGpuTag));

  const auto& options =
      cc->Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
  RET_CHECK(!options.model_path().empty() ^
            cc->InputSidePackets().HasTag("MODEL"))
      << "Either model as side packet or model path in options is required.";

  if (cc->Inputs().HasTag(kTensorsTag))
    cc->Inputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();
  if (cc->Outputs().HasTag(kTensorsTag))
    cc->Outputs().Tag(kTensorsTag).Set<std::vector<TfLiteTensor>>();

  if (cc->Inputs().HasTag(kTensorsGpuTag))
    cc->Inputs().Tag(kTensorsGpuTag).Set<std::vector<GpuTensor>>();
  if (cc->Outputs().HasTag(kTensorsGpuTag))
    cc->Outputs().Tag(kTensorsGpuTag).Set<std::vector<GpuTensor>>();

  if (cc->InputSidePackets().HasTag("CUSTOM_OP_RESOLVER")) {
    cc->InputSidePackets()
        .Tag("CUSTOM_OP_RESOLVER")
        .Set<tflite::ops::builtin::BuiltinOpResolver>();
  }
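
  // The optional MODEL side packet carries an already-loaded model wrapped in
  // a TfLiteModelPtr (see GetModelAsPacket() below); the RET_CHECK above
  // enforces that exactly one of MODEL and options.model_path is provided.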
  if (cc->InputSidePackets().HasTag("MODEL")) {
    cc->InputSidePackets().Tag("MODEL").Set<TfLiteModelPtr>();
  }

  if (ShouldUseGpu(cc)) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    MP_RETURN_IF_ERROR(mediapipe::GlCalculatorHelper::UpdateContract(cc));
#elif MEDIAPIPE_TFLITE_METAL_INFERENCE
    MP_RETURN_IF_ERROR([MPPMetalHelper updateContract:cc]);
#endif
  }

  // Assign this calculator's default InputStreamHandler.
  cc->SetInputStreamHandler("FixedSizeInputStreamHandler");

  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::Open(CalculatorContext* cc) {
  cc->SetOffset(TimestampDiff(0));

  const auto& options =
      cc->Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
  gpu_inference_ = ShouldUseGpu(cc);
  gpu_input_ = cc->Inputs().HasTag(kTensorsGpuTag);
  gpu_output_ = cc->Outputs().HasTag(kTensorsGpuTag);

  use_advanced_gpu_api_ = MEDIAPIPE_TFLITE_GL_INFERENCE &&
                          options.has_delegate() &&
                          options.delegate().has_gpu() &&
                          options.delegate().gpu().use_advanced_gpu_api();
  if (use_advanced_gpu_api_ && !gpu_input_) {
    LOG(WARNING) << "Cannot use advanced GPU APIs, input must be GPU buffers. "
                    "Falling back to the default TFLite API.";
    use_advanced_gpu_api_ = false;
  }
  CHECK(!use_advanced_gpu_api_ || gpu_inference_);

  MP_RETURN_IF_ERROR(LoadModel(cc));

  if (gpu_inference_) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    MP_RETURN_IF_ERROR(gpu_helper_.Open(cc));
    MP_RETURN_IF_ERROR(
        gpu_helper_.RunInGlContext([this, &cc]() -> ::mediapipe::Status {
          return use_advanced_gpu_api_ ? InitTFLiteGPURunner(cc)
                                       : LoadDelegate(cc);
        }));
#elif MEDIAPIPE_TFLITE_METAL_INFERENCE
    gpu_helper_ = [[MPPMetalHelper alloc] initWithCalculatorContext:cc];
    RET_CHECK(gpu_helper_);
    MP_RETURN_IF_ERROR(LoadDelegate(cc));
#endif
  } else {
    // TODO: why only on these platforms?
    // It seems that the XNNPACK delegate fails to load on Linux.
#if defined(__EMSCRIPTEN__) || defined(MEDIAPIPE_ANDROID) || \
    defined(MEDIAPIPE_IOS)
    MP_RETURN_IF_ERROR(LoadDelegate(cc));
#endif  // __EMSCRIPTEN__ || MEDIAPIPE_ANDROID || MEDIAPIPE_IOS
  }

  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::Process(CalculatorContext* cc) {
  return RunInContextIfNeeded([this, cc]() -> ::mediapipe::Status {
    // 0. Declare outputs.
    auto output_tensors_gpu = absl::make_unique<std::vector<GpuTensor>>();
    auto output_tensors_cpu = absl::make_unique<std::vector<TfLiteTensor>>();

    // 1. Receive pre-processed tensor inputs.
    if (gpu_input_) {
      MP_RETURN_IF_ERROR(ProcessInputsGpu(cc, output_tensors_gpu.get()));
    } else {
      MP_RETURN_IF_ERROR(ProcessInputsCpu(cc, output_tensors_cpu.get()));
    }

    // 2. Run inference.
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    if (gpu_inference_ && use_advanced_gpu_api_) {
      RET_CHECK(tflite_gpu_runner_->Invoke().ok());
    } else {
      RET_CHECK_EQ(interpreter_->Invoke(), kTfLiteOk);
    }
#else
    RET_CHECK_EQ(interpreter_->Invoke(), kTfLiteOk);
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE

    // 3. Output processed tensors.
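    // Note: when use_advanced_gpu_api_ is set but the graph asks for CPU
    // output, ProcessOutputsGpu() is still the path taken; it reads the GPU
    // buffers back into TfLiteTensors before emitting them.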
    if (gpu_output_ || use_advanced_gpu_api_) {
      MP_RETURN_IF_ERROR(ProcessOutputsGpu(cc, std::move(output_tensors_cpu),
                                           std::move(output_tensors_gpu)));
    } else {
      MP_RETURN_IF_ERROR(ProcessOutputsCpu(cc, std::move(output_tensors_cpu)));
    }

    return ::mediapipe::OkStatus();
  });
}

::mediapipe::Status TfLiteInferenceCalculator::Close(CalculatorContext* cc) {
  return RunInContextIfNeeded([this]() -> ::mediapipe::Status {
    if (delegate_) {
      interpreter_ = nullptr;
      delegate_ = nullptr;
#if MEDIAPIPE_TFLITE_GPU_SUPPORTED
      if (gpu_inference_) {
        for (int i = 0; i < gpu_data_in_.size(); ++i) {
          gpu_data_in_[i].reset();
        }
        for (int i = 0; i < gpu_data_out_.size(); ++i) {
          gpu_data_out_[i].reset();
        }
      }
#endif  // MEDIAPIPE_TFLITE_GPU_SUPPORTED
    }
#if defined(MEDIAPIPE_EDGE_TPU)
    edgetpu_context_.reset();
#endif
    return ::mediapipe::OkStatus();
  });
}

// Calculator Auxiliary Section

::mediapipe::Status TfLiteInferenceCalculator::ProcessInputsCpu(
    CalculatorContext* cc, std::vector<TfLiteTensor>* output_tensors_cpu) {
  if (cc->Inputs().Tag(kTensorsTag).IsEmpty()) {
    return ::mediapipe::OkStatus();
  }
  // Read CPU input into tensors.
  const auto& input_tensors =
      cc->Inputs().Tag(kTensorsTag).Get<std::vector<TfLiteTensor>>();
  RET_CHECK_GT(input_tensors.size(), 0);
  for (int i = 0; i < input_tensors.size(); ++i) {
    const TfLiteTensor* input_tensor = &input_tensors[i];
    RET_CHECK(input_tensor->data.raw);
    if (use_quantized_tensors_) {
      const uint8* input_tensor_buffer = input_tensor->data.uint8;
      uint8* local_tensor_buffer = interpreter_->typed_input_tensor<uint8>(i);
      std::memcpy(local_tensor_buffer, input_tensor_buffer,
                  input_tensor->bytes);
    } else {
      const float* input_tensor_buffer = input_tensor->data.f;
      float* local_tensor_buffer = interpreter_->typed_input_tensor<float>(i);
      std::memcpy(local_tensor_buffer, input_tensor_buffer,
                  input_tensor->bytes);
    }
  }

  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::ProcessInputsGpu(
    CalculatorContext* cc, std::vector<GpuTensor>* output_tensors_gpu) {
  if (cc->Inputs().Tag(kTensorsGpuTag).IsEmpty()) {
    return ::mediapipe::OkStatus();
  }
  if (use_advanced_gpu_api_) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    const auto& input_tensors =
        cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
    RET_CHECK(!input_tensors.empty());
    for (int i = 0; i < input_tensors.size(); ++i) {
      MP_RETURN_IF_ERROR(
          tflite_gpu_runner_->BindSSBOToInputTensor(input_tensors[i].id(), i));
    }
    if (gpu_output_) {
      // Allocate new output tensors.
      output_tensors_gpu->resize(gpu_data_out_.size());
      for (int i = 0; i < gpu_data_out_.size(); ++i) {
        GpuTensor& tensor = output_tensors_gpu->at(i);
        MP_RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
            gpu_data_out_[i]->elements, &tensor));
        MP_RETURN_IF_ERROR(
            tflite_gpu_runner_->BindSSBOToOutputTensor(tensor.id(), i));
      }
    } else {
      // Re-use internal output tensors.
      for (int i = 0; i < gpu_data_out_.size(); ++i) {
        MP_RETURN_IF_ERROR(tflite_gpu_runner_->BindSSBOToOutputTensor(
            gpu_data_out_[i]->buffer.id(), i));
      }
    }
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE
  } else if (gpu_input_) {
    // Read GPU input into SSBO.
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    const auto& input_tensors =
        cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
    RET_CHECK_GT(input_tensors.size(), 0);
    // Explicit copy input.
    gpu_data_in_.resize(input_tensors.size());
    for (int i = 0; i < input_tensors.size(); ++i) {
      MP_RETURN_IF_ERROR(CopyBuffer(input_tensors[i], gpu_data_in_[i]->buffer));
    }
#elif MEDIAPIPE_TFLITE_METAL_INFERENCE
    const auto& input_tensors =
        cc->Inputs().Tag(kTensorsGpuTag).Get<std::vector<GpuTensor>>();
    RET_CHECK_GT(input_tensors.size(), 0);
    // Explicit copy input with conversion from float 32 bits to 16 bits.
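    // The fp32_to_fp16_program_ pipeline created in LoadDelegate() converts
    // each incoming float32 MTLBuffer into the half-precision buffer that was
    // bound to the corresponding Metal delegate input tensor.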
    gpu_data_in_.resize(input_tensors.size());
    id<MTLCommandBuffer> command_buffer = [gpu_helper_ commandBuffer];
    command_buffer.label = @"TfLiteInferenceCalculatorConvert";
    id<MTLComputeCommandEncoder> compute_encoder =
        [command_buffer computeCommandEncoder];
    [compute_encoder setComputePipelineState:fp32_to_fp16_program_];
    for (int i = 0; i < input_tensors.size(); ++i) {
      [compute_encoder setBuffer:input_tensors[i] offset:0 atIndex:0];
      [compute_encoder setBuffer:gpu_data_in_[i]->buffer offset:0 atIndex:1];
      constexpr int kWorkgroupSize = 64;  // Block size for GPU shader.
      MTLSize threads_per_group = MTLSizeMake(kWorkgroupSize, 1, 1);
      const int threadgroups =
          NumGroups(gpu_data_in_[i]->elements, kWorkgroupSize);
      [compute_encoder dispatchThreadgroups:MTLSizeMake(threadgroups, 1, 1)
                      threadsPerThreadgroup:threads_per_group];
    }
    [compute_encoder endEncoding];
    [command_buffer commit];
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE
  }
  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::ProcessOutputsCpu(
    CalculatorContext* cc,
    std::unique_ptr<std::vector<TfLiteTensor>> output_tensors_cpu) {
  // Output result tensors (CPU).
  const auto& tensor_indexes = interpreter_->outputs();
  for (int i = 0; i < tensor_indexes.size(); ++i) {
    TfLiteTensor* tensor = interpreter_->tensor(tensor_indexes[i]);
    output_tensors_cpu->emplace_back(*tensor);
  }
  cc->Outputs()
      .Tag(kTensorsTag)
      .Add(output_tensors_cpu.release(), cc->InputTimestamp());

  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::ProcessOutputsGpu(
    CalculatorContext* cc,
    std::unique_ptr<std::vector<TfLiteTensor>> output_tensors_cpu,
    std::unique_ptr<std::vector<GpuTensor>> output_tensors_gpu) {
  if (use_advanced_gpu_api_) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    if (gpu_output_) {
      // Send out pre-allocated tensors.
      cc->Outputs()
          .Tag(kTensorsGpuTag)
          .Add(output_tensors_gpu.release(), cc->InputTimestamp());
    } else {
      // Download to CPU for output.
      const auto& tensor_indexes = interpreter_->inputs();
      for (int i = 0; i < tensor_indexes.size(); ++i) {
        TfLiteTensor* tensor = interpreter_->tensor(tensor_indexes[i]);
        std::vector<float> gpu_data(tensor->bytes / sizeof(float));
        MP_RETURN_IF_ERROR(gpu_data_out_[i]->buffer.Read(
            absl::MakeSpan(tensor->data.f, tensor->bytes)));
        output_tensors_cpu->emplace_back(*tensor);
      }
      // Output result tensors (CPU).
      cc->Outputs()
          .Tag(kTensorsTag)
          .Add(output_tensors_cpu.release(), cc->InputTimestamp());
    }
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE
  } else if (gpu_output_) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
    // Output result tensors (GPU).
    output_tensors_gpu->resize(gpu_data_out_.size());
    for (int i = 0; i < gpu_data_out_.size(); ++i) {
      GpuTensor& tensor = output_tensors_gpu->at(i);
      // Allocate output tensor.
      MP_RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
          gpu_data_out_[i]->elements, &tensor));
      MP_RETURN_IF_ERROR(CopyBuffer(gpu_data_out_[i]->buffer, tensor));
    }
    cc->Outputs()
        .Tag(kTensorsGpuTag)
        .Add(output_tensors_gpu.release(), cc->InputTimestamp());
#elif MEDIAPIPE_TFLITE_METAL_INFERENCE
    // Output result tensors (GPU).
    output_tensors_gpu->resize(gpu_data_out_.size());
    id<MTLDevice> device = gpu_helper_.mtlDevice;
    id<MTLCommandBuffer> command_buffer = [gpu_helper_ commandBuffer];
    command_buffer.label = @"TfLiteInferenceBPHWC4Convert";
    id<MTLComputeCommandEncoder> convert_command =
        [command_buffer computeCommandEncoder];
    for (int i = 0; i < gpu_data_out_.size(); ++i) {
      // Allocate output tensor.
      output_tensors_gpu->at(i) =
          [device newBufferWithLength:gpu_data_out_[i]->elements * sizeof(float)
                              options:MTLResourceStorageModeShared];
      // Reshape tensor.
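      // converter_from_BPHWC4_ (created in LoadDelegate()) rewrites the
      // delegate's BPHWC4-layout buffer into the densely packed buffer
      // allocated above, using the dimensions saved in gpu_data_out_[i]->shape.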
      [converter_from_BPHWC4_ convertWithEncoder:convert_command
                                           shape:gpu_data_out_[i]->shape
                                    sourceBuffer:gpu_data_out_[i]->buffer
                                 convertedBuffer:output_tensors_gpu->at(i)];
    }
    [convert_command endEncoding];
    [command_buffer commit];
    cc->Outputs()
        .Tag(kTensorsGpuTag)
        .Add(output_tensors_gpu.release(), cc->InputTimestamp());
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE
  }
  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::InitTFLiteGPURunner(
    CalculatorContext* cc) {
#if MEDIAPIPE_TFLITE_GL_INFERENCE
  ASSIGN_OR_RETURN(model_packet_, GetModelAsPacket(*cc));
  const auto& model = *model_packet_.Get<TfLiteModelPtr>();
  tflite::ops::builtin::BuiltinOpResolver op_resolver;
  if (cc->InputSidePackets().HasTag("CUSTOM_OP_RESOLVER")) {
    op_resolver = cc->InputSidePackets()
                      .Tag("CUSTOM_OP_RESOLVER")
                      .Get<tflite::ops::builtin::BuiltinOpResolver>();
  }

  // Create the runner.
  tflite::gpu::InferenceOptions options;
  options.priority1 = tflite::gpu::InferencePriority::MIN_LATENCY;
  options.priority2 = tflite::gpu::InferencePriority::AUTO;
  options.priority3 = tflite::gpu::InferencePriority::AUTO;
  options.usage = tflite::gpu::InferenceUsage::SUSTAINED_SPEED;
  tflite_gpu_runner_ = std::make_unique<tflite::gpu::TFLiteGPURunner>(options);
  MP_RETURN_IF_ERROR(
      tflite_gpu_runner_->InitializeWithModel(model, op_resolver));

  // Allocate interpreter memory for cpu output.
  if (!gpu_output_) {
    interpreter_ = absl::make_unique<tflite::Interpreter>();
    const int num_outputs = tflite_gpu_runner_->GetOutputShapes().size();
    interpreter_->AddTensors(num_outputs);
    std::vector<int> indices(num_outputs);
    for (int i = 0; i < num_outputs; ++i) indices[i] = i;
    // There is no ResizeOutputTensor(), so we use the 'inputs' space instead.
    interpreter_->SetInputs(indices);
    TfLiteQuantization quant;
    quant.type = kTfLiteNoQuantization;
    quant.params = nullptr;
    for (int i = 0; i < num_outputs; ++i) {
      auto shape = tflite_gpu_runner_->GetOutputShapes()[i];
      const int tensor_idx = interpreter_->inputs()[i];
      interpreter_->SetTensorParametersReadWrite(tensor_idx, kTfLiteFloat32, "",
                                                 {shape.c}, quant);
      CHECK(interpreter_->ResizeInputTensor(
                tensor_idx, {shape.h, shape.w, shape.c}) == kTfLiteOk);
    }
    CHECK(interpreter_->AllocateTensors() == kTfLiteOk);
  }

  // Create and bind OpenGL buffers for outputs.
  // The buffers are created once and their ids are passed to the calculator
  // outputs.
  gpu_data_out_.resize(tflite_gpu_runner_->outputs_size());
  for (int i = 0; i < tflite_gpu_runner_->outputs_size(); ++i) {
    gpu_data_out_[i] = absl::make_unique<GPUData>();
    ASSIGN_OR_RETURN(gpu_data_out_[i]->elements,
                     tflite_gpu_runner_->GetOutputElements(i));
    // Create and bind the output buffer.
    MP_RETURN_IF_ERROR(
        ::tflite::gpu::gl::CreateReadWriteShaderStorageBuffer<float>(
            gpu_data_out_[i]->elements, &gpu_data_out_[i]->buffer));
  }
  MP_RETURN_IF_ERROR(tflite_gpu_runner_->Build());
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE

  return ::mediapipe::OkStatus();
}

::mediapipe::Status TfLiteInferenceCalculator::LoadModel(
    CalculatorContext* cc) {
  if (use_advanced_gpu_api_) {
    // Use InitTFLiteGPURunner for everything.
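    // Model loading and graph building are deferred to InitTFLiteGPURunner(),
    // which Open() runs inside the GL context.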
    return ::mediapipe::OkStatus();
  }

  ASSIGN_OR_RETURN(model_packet_, GetModelAsPacket(*cc));
  const auto& model = *model_packet_.Get<TfLiteModelPtr>();

  tflite::ops::builtin::BuiltinOpResolver op_resolver;
  if (cc->InputSidePackets().HasTag("CUSTOM_OP_RESOLVER")) {
    op_resolver = cc->InputSidePackets()
                      .Tag("CUSTOM_OP_RESOLVER")
                      .Get<tflite::ops::builtin::BuiltinOpResolver>();
  }

#if defined(MEDIAPIPE_EDGE_TPU)
  interpreter_ =
      BuildEdgeTpuInterpreter(model, &op_resolver, edgetpu_context_.get());
#else
  tflite::InterpreterBuilder(model, op_resolver)(&interpreter_);
#endif  // MEDIAPIPE_EDGE_TPU
  RET_CHECK(interpreter_);

#if defined(__EMSCRIPTEN__) || defined(MEDIAPIPE_EDGE_TPU)
  interpreter_->SetNumThreads(1);
#else
  interpreter_->SetNumThreads(
      cc->Options<::mediapipe::TfLiteInferenceCalculatorOptions>()
          .cpu_num_thread());
#endif  // __EMSCRIPTEN__ || MEDIAPIPE_EDGE_TPU

  if (gpu_output_) {
    use_quantized_tensors_ = false;
  } else {
    RET_CHECK_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
    use_quantized_tensors_ =
        (interpreter_->tensor(interpreter_->inputs()[0])->quantization.type ==
         kTfLiteAffineQuantization);
    if (use_quantized_tensors_) gpu_inference_ = false;
  }

  return ::mediapipe::OkStatus();
}

::mediapipe::StatusOr<Packet> TfLiteInferenceCalculator::GetModelAsPacket(
    const CalculatorContext& cc) {
  const auto& options =
      cc.Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
  if (!options.model_path().empty()) {
    std::string model_path = options.model_path();

    ASSIGN_OR_RETURN(model_path, mediapipe::PathToResourceAsFile(model_path));

    auto model = tflite::FlatBufferModel::BuildFromFile(model_path.c_str());
    RET_CHECK(model) << "Failed to load model from path.";
    return MakePacket<TfLiteModelPtr>(TfLiteModelPtr(
        model.release(), [](tflite::FlatBufferModel* model) { delete model; }));
  }
  if (cc.InputSidePackets().HasTag("MODEL")) {
    return cc.InputSidePackets().Tag("MODEL");
  }
  return ::mediapipe::Status(
      ::mediapipe::StatusCode::kNotFound,
      "Must specify TFLite model as path or loaded model.");
}

::mediapipe::Status TfLiteInferenceCalculator::LoadDelegate(
    CalculatorContext* cc) {
  const auto& calculator_opts =
      cc->Options<::mediapipe::TfLiteInferenceCalculatorOptions>();
  if (calculator_opts.has_delegate() &&
      calculator_opts.delegate().has_tflite()) {
    // Default tflite inference requested - no need to modify the graph.
    return ::mediapipe::OkStatus();
  }

  if (!gpu_inference_) {
#if defined(MEDIAPIPE_ANDROID)
    const bool nnapi_requested = calculator_opts.has_delegate()
                                     ? calculator_opts.delegate().has_nnapi()
                                     : calculator_opts.use_nnapi();
    if (nnapi_requested) {
      // Attempt to use NNAPI.
      // If not supported, the default CPU delegate will be created and used.
      interpreter_->SetAllowFp16PrecisionForFp32(1);
      delegate_ =
          TfLiteDelegatePtr(tflite::NnApiDelegate(), [](TfLiteDelegate*) {
            // No need to free according to tflite::NnApiDelegate()
            // documentation.
          });
      RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()),
                   kTfLiteOk);
      return ::mediapipe::OkStatus();
    }
#endif  // MEDIAPIPE_ANDROID

#if defined(__EMSCRIPTEN__)
    const bool xnnpack_requested = true;
#else
    const bool xnnpack_requested = calculator_opts.has_delegate() &&
                                   calculator_opts.delegate().has_xnnpack();
#endif  // __EMSCRIPTEN__

#if !defined(MEDIAPIPE_EDGE_TPU)
    if (xnnpack_requested) {
      TfLiteXNNPackDelegateOptions xnnpack_opts{};
      xnnpack_opts.num_threads = GetXnnpackNumThreads(calculator_opts);
      delegate_ = TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&xnnpack_opts),
                                    &TfLiteXNNPackDelegateDelete);
      RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()),
                   kTfLiteOk);
    }
#endif  // !MEDIAPIPE_EDGE_TPU

    // Return, no need for GPU delegate below.
    return ::mediapipe::OkStatus();
  }

#if MEDIAPIPE_TFLITE_GL_INFERENCE
  // Configure and create the delegate.
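  // When TENSORS_GPU streams are used, the GL path below binds SSBOs directly
  // to the delegate's input and output tensors so data stays on the GPU;
  // ModifyGraphWithDelegate() is called only after every buffer is bound
  // (see "Must call this last" below).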
  TfLiteGpuDelegateOptions options = TfLiteGpuDelegateOptionsDefault();
  options.compile_options.precision_loss_allowed = 1;
  options.compile_options.preferred_gl_object_type =
      TFLITE_GL_OBJECT_TYPE_FASTEST;
  options.compile_options.dynamic_batch_enabled = 0;
  options.compile_options.inline_parameters = 1;
  if (!delegate_)
    delegate_ = TfLiteDelegatePtr(TfLiteGpuDelegateCreate(&options),
                                  &TfLiteGpuDelegateDelete);

  if (gpu_input_) {
    // Get input image sizes.
    const auto& input_indices = interpreter_->inputs();
    gpu_data_in_.resize(input_indices.size());
    for (int i = 0; i < input_indices.size(); ++i) {
      const TfLiteTensor* tensor = interpreter_->tensor(input_indices[i]);
      gpu_data_in_[i] = absl::make_unique<GPUData>();
      gpu_data_in_[i]->elements = 1;
      for (int d = 0; d < tensor->dims->size; ++d) {
        gpu_data_in_[i]->elements *= tensor->dims->data[d];
      }
      // Create and bind the input buffer.
      MP_RETURN_IF_ERROR(
          ::tflite::gpu::gl::CreateReadWriteShaderStorageBuffer<float>(
              gpu_data_in_[i]->elements, &gpu_data_in_[i]->buffer));
      RET_CHECK_EQ(TfLiteGpuDelegateBindBufferToTensor(
                       delegate_.get(), gpu_data_in_[i]->buffer.id(),
                       interpreter_->inputs()[i]),
                   kTfLiteOk);
    }
  }
  if (gpu_output_) {
    // Get output image sizes.
    const auto& output_indices = interpreter_->outputs();
    gpu_data_out_.resize(output_indices.size());
    for (int i = 0; i < gpu_data_out_.size(); ++i) {
      const TfLiteTensor* tensor = interpreter_->tensor(output_indices[i]);
      gpu_data_out_[i] = absl::make_unique<GPUData>();
      gpu_data_out_[i]->elements = 1;
      // TODO: handle *2 properly on some dilated models.
      for (int d = 0; d < tensor->dims->size; ++d) {
        gpu_data_out_[i]->elements *= tensor->dims->data[d];
      }
    }
    // Create and bind output buffers.
    interpreter_->SetAllowBufferHandleOutput(true);
    for (int i = 0; i < gpu_data_out_.size(); ++i) {
      MP_RETURN_IF_ERROR(CreateReadWriteShaderStorageBuffer<float>(
          gpu_data_out_[i]->elements, &gpu_data_out_[i]->buffer));
      RET_CHECK_EQ(TfLiteGpuDelegateBindBufferToTensor(
                       delegate_.get(), gpu_data_out_[i]->buffer.id(),
                       output_indices[i]),
                   kTfLiteOk);
    }
  }

  // Must call this last.
  RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()),
               kTfLiteOk);
#endif  // MEDIAPIPE_TFLITE_GL_INFERENCE

#if MEDIAPIPE_TFLITE_METAL_INFERENCE
  const int kHalfSize = 2;  // sizeof(half)
  // Configure and create the delegate.
  TFLGpuDelegateOptions options;
  options.allow_precision_loss = true;
  options.wait_type = TFLGpuDelegateWaitType::TFLGpuDelegateWaitTypePassive;
  if (!delegate_)
    delegate_ = TfLiteDelegatePtr(TFLGpuDelegateCreate(&options),
                                  &TFLGpuDelegateDelete);
  id<MTLDevice> device = gpu_helper_.mtlDevice;

  if (gpu_input_) {
    // Get input image sizes.
    const auto& input_indices = interpreter_->inputs();
    gpu_data_in_.resize(input_indices.size());
    for (int i = 0; i < input_indices.size(); ++i) {
      const TfLiteTensor* tensor = interpreter_->tensor(input_indices[i]);
      gpu_data_in_[i] = absl::make_unique<GPUData>();
      gpu_data_in_[i]->shape.b = tensor->dims->data[0];
      gpu_data_in_[i]->shape.h = tensor->dims->data[1];
      gpu_data_in_[i]->shape.w = tensor->dims->data[2];
      // On iOS GPU, input must be 4 channels, regardless of what the model
      // expects.
      gpu_data_in_[i]->shape.c = 4;
      gpu_data_in_[i]->elements =
          gpu_data_in_[i]->shape.b * gpu_data_in_[i]->shape.h *
          gpu_data_in_[i]->shape.w * gpu_data_in_[i]->shape.c;
      // Input to the model can be RGBA only.
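      // Sizing note (a sketch of the math above): elements = b * h * w * 4
      // with the channel dimension padded to RGBA, and the MTLBuffer created
      // below holds elements * kHalfSize bytes because the Metal delegate
      // consumes half-precision data.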
      if (tensor->dims->data[3] != 4) {
        LOG(WARNING) << "Please ensure the input GPU tensor is 4 channels.";
      }
      const std::string shader_source =
          absl::Substitute(R"(#include <metal_stdlib>
        using namespace metal;
        kernel void convertKernel(
            device float4* const input_buffer [[buffer(0)]],
            device half4* output_buffer [[buffer(1)]],
            uint gid [[thread_position_in_grid]]) {
          if (gid >= $0) return;
          output_buffer[gid] = half4(input_buffer[gid]);
        })",
                           gpu_data_in_[i]->elements / 4);
      NSString* library_source =
          [NSString stringWithUTF8String:shader_source.c_str()];
      NSError* error = nil;
      id<MTLLibrary> library = [device newLibraryWithSource:library_source
                                                    options:nil
                                                      error:&error];
      RET_CHECK(library != nil) << "Couldn't create shader library "
                                << [[error localizedDescription] UTF8String];
      id<MTLFunction> kernel_func = nil;
      kernel_func = [library newFunctionWithName:@"convertKernel"];
      RET_CHECK(kernel_func != nil) << "Couldn't create kernel function.";
      fp32_to_fp16_program_ =
          [device newComputePipelineStateWithFunction:kernel_func error:&error];
      RET_CHECK(fp32_to_fp16_program_ != nil)
          << "Couldn't create pipeline state "
          << [[error localizedDescription] UTF8String];

      // Create and bind the input buffer.
      gpu_data_in_[i]->buffer =
          [device newBufferWithLength:gpu_data_in_[i]->elements * kHalfSize
                              options:MTLResourceStorageModeShared];
      RET_CHECK_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()),
                   kTfLiteOk);
      RET_CHECK_EQ(
          TFLGpuDelegateBindMetalBufferToTensor(
              delegate_.get(), input_indices[i], gpu_data_in_[i]->buffer),
          true);
    }
  }

  if (gpu_output_) {
    // Get output image sizes.
    const auto& output_indices = interpreter_->outputs();
    gpu_data_out_.resize(output_indices.size());
    for (int i = 0; i < gpu_data_out_.size(); ++i) {
      const TfLiteTensor* tensor = interpreter_->tensor(output_indices[i]);
      gpu_data_out_[i] = absl::make_unique<GPUData>();
      gpu_data_out_[i]->elements = 1;
      // TODO: handle *2 properly on some dilated models.
      for (int d = 0; d < tensor->dims->size; ++d) {
        // Pad each dim for BHWC4 conversion inside the delegate.
        gpu_data_out_[i]->elements *= RoundUp(tensor->dims->data[d], 4);
      }
      // Save dimensions for reshaping back later.
      gpu_data_out_[i]->shape.b = tensor->dims->data[0];
      switch (tensor->dims->size) {
        case 2:
          gpu_data_out_[i]->shape.h = 1;
          gpu_data_out_[i]->shape.w = 1;
          gpu_data_out_[i]->shape.c = tensor->dims->data[1];
          break;
        case 3:
          gpu_data_out_[i]->shape.h = 1;
          gpu_data_out_[i]->shape.w = tensor->dims->data[1];
          gpu_data_out_[i]->shape.c = tensor->dims->data[2];
          break;
        case 4:
          gpu_data_out_[i]->shape.h = tensor->dims->data[1];
          gpu_data_out_[i]->shape.w = tensor->dims->data[2];
          gpu_data_out_[i]->shape.c = tensor->dims->data[3];
          break;
        default:
          return mediapipe::InternalError("Unsupported tensor shape.");
      }
    }
    // Create and bind output buffers.
    interpreter_->SetAllowBufferHandleOutput(true);
    for (int i = 0; i < gpu_data_out_.size(); ++i) {
      gpu_data_out_[i]->buffer =
          [device newBufferWithLength:gpu_data_out_[i]->elements * kHalfSize
                              options:MTLResourceStorageModeShared];
      RET_CHECK_EQ(
          TFLGpuDelegateBindMetalBufferToTensor(
              delegate_.get(), output_indices[i], gpu_data_out_[i]->buffer),
          true);
    }

    // Create a converter for the GPU output.
    converter_from_BPHWC4_ = [[TFLBufferConvert alloc] initWithDevice:device
                                                            isFloat16:true
                                                      convertToPBHWC4:false];
    if (converter_from_BPHWC4_ == nil) {
      return mediapipe::InternalError(
          "Error initializing output buffer converter");
    }
  }
#endif  // MEDIAPIPE_TFLITE_METAL_INFERENCE

  return ::mediapipe::OkStatus();
}

}  // namespace mediapipe