Add gpu to cpu fallback for tensors_to_detections_calculator.

PiperOrigin-RevId: 544480883
2023-06-29 15:34:12 -07:00 · 2023-06-29 15:34:12 -07:00 · 687075e5b8
commit 687075e5b8
parent 0ea54b1461
1 changed files with 60 additions and 15 deletions
--- a/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc
+++ b/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc
@ -256,6 +256,7 @@ class TensorsToDetectionsCalculator : public Node {

  bool gpu_inited_ = false;
  bool gpu_input_ = false;
+  bool gpu_has_enough_work_groups_ = true;
  bool anchors_init_ = false;
 };
 MEDIAPIPE_REGISTER_NODE(TensorsToDetectionsCalculator);
@ -291,7 +292,7 @@ absl::Status TensorsToDetectionsCalculator::Open(CalculatorContext* cc) {
 absl::Status TensorsToDetectionsCalculator::Process(CalculatorContext* cc) {
  auto output_detections = absl::make_unique<std::vector<Detection>>();
  bool gpu_processing = false;
-  if (CanUseGpu()) {
+  if (CanUseGpu() && gpu_has_enough_work_groups_) {
    // Use GPU processing only if at least one input tensor is already on GPU
    // (to avoid CPU->GPU overhead).
    for (const auto& tensor : *kInTensors(cc)) {
@ -321,11 +322,20 @@ absl::Status TensorsToDetectionsCalculator::Process(CalculatorContext* cc) {
    RET_CHECK(!has_custom_box_indices_);
  }

-  if (gpu_processing) {
-    if (!gpu_inited_) {
-      MP_RETURN_IF_ERROR(GpuInit(cc));
+  if (gpu_processing && !gpu_inited_) {
+    auto status = GpuInit(cc);
+    if (status.ok()) {
      gpu_inited_ = true;
+    } else if (status.code() == absl::StatusCode::kFailedPrecondition) {
+      // For initialization error because of hardware limitation, fallback to
+      // CPU processing.
+      LOG(WARNING) << status.message();
+    } else {
+      // For other error, let the error propagates.
+      return status;
    }
+  }
+  if (gpu_processing && gpu_inited_) {
    MP_RETURN_IF_ERROR(ProcessGPU(cc, output_detections.get()));
  } else {
    MP_RETURN_IF_ERROR(ProcessCPU(cc, output_detections.get()));
@ -346,17 +356,41 @@ absl::Status TensorsToDetectionsCalculator::ProcessCPU(
    // TODO: Add flexible input tensor size handling.
    auto raw_box_tensor =
        &input_tensors[tensor_mapping_.detections_tensor_index()];
-    RET_CHECK_EQ(raw_box_tensor->shape().dims.size(), 3);
-    RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
    RET_CHECK_GT(num_boxes_, 0) << "Please set num_boxes in calculator options";
+    if (raw_box_tensor->shape().dims.size() == 3) {
+      // The tensors from CPU inference has dim 3.
+      RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
      RET_CHECK_EQ(raw_box_tensor->shape().dims[1], num_boxes_);
      RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_coords_);
+    } else if (raw_box_tensor->shape().dims.size() == 4) {
+      // The tensors from GPU inference has dim 4. For gpu-cpu fallback support,
+      // we allow tensors with 4 dims.
+      RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
+      RET_CHECK_EQ(raw_box_tensor->shape().dims[1], 1);
+      RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_boxes_);
+      RET_CHECK_EQ(raw_box_tensor->shape().dims[3], num_coords_);
+    } else {
+      return absl::InvalidArgumentError(
+          "The dimensions of box Tensor must be 3 or 4.");
+    }
    auto raw_score_tensor =
        &input_tensors[tensor_mapping_.scores_tensor_index()];
-    RET_CHECK_EQ(raw_score_tensor->shape().dims.size(), 3);
+    if (raw_score_tensor->shape().dims.size() == 3) {
+      // The tensors from CPU inference has dim 3.
      RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
      RET_CHECK_EQ(raw_score_tensor->shape().dims[1], num_boxes_);
      RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_classes_);
+    } else if (raw_score_tensor->shape().dims.size() == 4) {
+      // The tensors from GPU inference has dim 4. For gpu-cpu fallback support,
+      // we allow tensors with 4 dims.
+      RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
+      RET_CHECK_EQ(raw_score_tensor->shape().dims[1], 1);
+      RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_boxes_);
+      RET_CHECK_EQ(raw_score_tensor->shape().dims[3], num_classes_);
+    } else {
+      return absl::InvalidArgumentError(
+          "The dimensions of score Tensor must be 3 or 4.");
+    }
    auto raw_box_view = raw_box_tensor->GetCpuReadView();
    auto raw_boxes = raw_box_view.buffer<float>();
    auto raw_scores_view = raw_score_tensor->GetCpuReadView();
@ -1111,8 +1145,13 @@ void main() {
    int max_wg_size;  //  typically <= 1024
    glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1,
                    &max_wg_size);  // y-dim
-    CHECK_LT(num_classes_, max_wg_size)
-        << "# classes must be < " << max_wg_size;
+    gpu_has_enough_work_groups_ = num_classes_ < max_wg_size;
+    if (!gpu_has_enough_work_groups_) {
+      return absl::FailedPreconditionError(absl::StrFormat(
+          "Hardware limitation: Processing will be done on CPU, because "
+          "num_classes %d exceeds the max work_group size %d.",
+          num_classes_, max_wg_size));
+    }
    // TODO support better filtering.
    if (class_index_set_.is_allowlist) {
      CHECK_EQ(class_index_set_.values.size(),
@ -1370,7 +1409,13 @@ kernel void scoreKernel(
        Tensor::ElementType::kFloat32, Tensor::Shape{1, num_boxes_ * 2});
    // # filter classes supported is hardware dependent.
    int max_wg_size = score_program_.maxTotalThreadsPerThreadgroup;
-    CHECK_LT(num_classes_, max_wg_size) << "# classes must be <" << max_wg_size;
+    gpu_has_enough_work_groups_ = num_classes_ < max_wg_size;
+    if (!gpu_has_enough_work_groups_) {
+      return absl::FailedPreconditionError(absl::StrFormat(
+          "Hardware limitation: Processing will be done on CPU, because "
+          "num_classes %d exceeds the max work_group size %d.",
+          num_classes_, max_wg_size));
+    }
  }

 #endif  // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)