diff --git a/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc b/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc
index c8dd0e2a0..246269de1 100644
--- a/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc
+++ b/mediapipe/calculators/tensor/tensors_to_detections_calculator.cc
@@ -256,6 +256,7 @@ class TensorsToDetectionsCalculator : public Node {
   bool gpu_inited_ = false;
   bool gpu_input_ = false;
+  bool gpu_has_enough_work_groups_ = true;
   bool anchors_init_ = false;
 };
 MEDIAPIPE_REGISTER_NODE(TensorsToDetectionsCalculator);
@@ -291,7 +292,7 @@ absl::Status TensorsToDetectionsCalculator::Open(CalculatorContext* cc) {
 absl::Status TensorsToDetectionsCalculator::Process(CalculatorContext* cc) {
   auto output_detections = absl::make_unique<std::vector<Detection>>();
   bool gpu_processing = false;
-  if (CanUseGpu()) {
+  if (CanUseGpu() && gpu_has_enough_work_groups_) {
     // Use GPU processing only if at least one input tensor is already on GPU
     // (to avoid CPU->GPU overhead).
     for (const auto& tensor : *kInTensors(cc)) {
@@ -321,11 +322,20 @@ absl::Status TensorsToDetectionsCalculator::Process(CalculatorContext* cc) {
     RET_CHECK(!has_custom_box_indices_);
   }
 
-  if (gpu_processing) {
-    if (!gpu_inited_) {
-      MP_RETURN_IF_ERROR(GpuInit(cc));
+  if (gpu_processing && !gpu_inited_) {
+    auto status = GpuInit(cc);
+    if (status.ok()) {
       gpu_inited_ = true;
+    } else if (status.code() == absl::StatusCode::kFailedPrecondition) {
+      // For an initialization error due to a hardware limitation, fall back
+      // to CPU processing.
+      LOG(WARNING) << status.message();
+    } else {
+      // For other errors, let the error propagate.
+      return status;
     }
+  }
+  if (gpu_processing && gpu_inited_) {
     MP_RETURN_IF_ERROR(ProcessGPU(cc, output_detections.get()));
   } else {
     MP_RETURN_IF_ERROR(ProcessCPU(cc, output_detections.get()));
@@ -346,17 +356,41 @@ absl::Status TensorsToDetectionsCalculator::ProcessCPU(
   // TODO: Add flexible input tensor size handling.
   auto raw_box_tensor =
       &input_tensors[tensor_mapping_.detections_tensor_index()];
-  RET_CHECK_EQ(raw_box_tensor->shape().dims.size(), 3);
-  RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
   RET_CHECK_GT(num_boxes_, 0) << "Please set num_boxes in calculator options";
-  RET_CHECK_EQ(raw_box_tensor->shape().dims[1], num_boxes_);
-  RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_coords_);
+  if (raw_box_tensor->shape().dims.size() == 3) {
+    // The tensors from CPU inference have dim 3.
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[1], num_boxes_);
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_coords_);
+  } else if (raw_box_tensor->shape().dims.size() == 4) {
+    // The tensors from GPU inference have dim 4. For GPU-CPU fallback
+    // support, we allow tensors with 4 dims.
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[1], 1);
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_boxes_);
+    RET_CHECK_EQ(raw_box_tensor->shape().dims[3], num_coords_);
+  } else {
+    return absl::InvalidArgumentError(
+        "The dimensions of box Tensor must be 3 or 4.");
+  }
   auto raw_score_tensor =
       &input_tensors[tensor_mapping_.scores_tensor_index()];
-  RET_CHECK_EQ(raw_score_tensor->shape().dims.size(), 3);
-  RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
-  RET_CHECK_EQ(raw_score_tensor->shape().dims[1], num_boxes_);
-  RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_classes_);
+  if (raw_score_tensor->shape().dims.size() == 3) {
+    // The tensors from CPU inference have dim 3.
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[1], num_boxes_);
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_classes_);
+  } else if (raw_score_tensor->shape().dims.size() == 4) {
+    // The tensors from GPU inference have dim 4. For GPU-CPU fallback
+    // support, we allow tensors with 4 dims.
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[1], 1);
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_boxes_);
+    RET_CHECK_EQ(raw_score_tensor->shape().dims[3], num_classes_);
+  } else {
+    return absl::InvalidArgumentError(
+        "The dimensions of score Tensor must be 3 or 4.");
+  }
   auto raw_box_view = raw_box_tensor->GetCpuReadView();
   auto raw_boxes = raw_box_view.buffer();
   auto raw_scores_view = raw_score_tensor->GetCpuReadView();
@@ -1111,8 +1145,13 @@ void main() {
   int max_wg_size;  // typically <= 1024
   glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &max_wg_size);  // y-dim
-  CHECK_LT(num_classes_, max_wg_size)
-      << "# classes must be < " << max_wg_size;
+  gpu_has_enough_work_groups_ = num_classes_ < max_wg_size;
+  if (!gpu_has_enough_work_groups_) {
+    return absl::FailedPreconditionError(absl::StrFormat(
+        "Hardware limitation: Processing will be done on CPU, because "
+        "num_classes %d exceeds the max work_group size %d.",
+        num_classes_, max_wg_size));
+  }
   // TODO support better filtering.
   if (class_index_set_.is_allowlist) {
     CHECK_EQ(class_index_set_.values.size(),
@@ -1370,7 +1409,13 @@ kernel void scoreKernel(
         Tensor::ElementType::kFloat32, Tensor::Shape{1, num_boxes_ * 2});
     // # filter classes supported is hardware dependent.
     int max_wg_size = score_program_.maxTotalThreadsPerThreadgroup;
-    CHECK_LT(num_classes_, max_wg_size) << "# classes must be <" << max_wg_size;
+    gpu_has_enough_work_groups_ = num_classes_ < max_wg_size;
+    if (!gpu_has_enough_work_groups_) {
+      return absl::FailedPreconditionError(absl::StrFormat(
+          "Hardware limitation: Processing will be done on CPU, because "
+          "num_classes %d exceeds the max work_group size %d.",
+          num_classes_, max_wg_size));
+    }
   }
 #endif  // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
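
For reference, the fallback pattern the diff introduces can be summarized outside of MediaPipe as a small standalone sketch. The names below (InitGpu, RunOnGpu, RunOnCpu, kMaxWorkGroupSize) are hypothetical stand-ins for the calculator's GpuInit/ProcessGPU/ProcessCPU and the queried GL/Metal work-group limits; the sketch only requires Abseil and illustrates treating kFailedPrecondition as "fall back to CPU" while propagating every other error, under the assumption that the flag is set in the caller rather than inside GpuInit as in the real change.

#include <iostream>

#include "absl/status/status.h"
#include "absl/strings/str_format.h"

namespace {

constexpr int kMaxWorkGroupSize = 1024;  // Hypothetical hardware limit.

// Stand-in for GpuInit(): fails with kFailedPrecondition when num_classes
// exceeds what a single GPU work group could handle.
absl::Status InitGpu(int num_classes) {
  if (num_classes >= kMaxWorkGroupSize) {
    return absl::FailedPreconditionError(absl::StrFormat(
        "Hardware limitation: num_classes %d exceeds the max work_group "
        "size %d.",
        num_classes, kMaxWorkGroupSize));
  }
  return absl::OkStatus();
}

absl::Status RunOnGpu() { return absl::OkStatus(); }  // Stand-in for ProcessGPU().
absl::Status RunOnCpu() { return absl::OkStatus(); }  // Stand-in for ProcessCPU().

// Mirrors the Process() logic in the diff: try GPU init once, remember the
// outcome, and fall back to CPU only for the hardware-limitation case.
absl::Status ProcessOnce(int num_classes, bool& gpu_inited,
                         bool& gpu_has_enough_work_groups) {
  if (gpu_has_enough_work_groups && !gpu_inited) {
    absl::Status status = InitGpu(num_classes);
    if (status.ok()) {
      gpu_inited = true;
    } else if (status.code() == absl::StatusCode::kFailedPrecondition) {
      // Subsequent calls skip the GPU path entirely.
      gpu_has_enough_work_groups = false;
      std::cerr << "WARNING: " << status.message() << "\n";
    } else {
      return status;  // Any other error is propagated unchanged.
    }
  }
  return gpu_inited ? RunOnGpu() : RunOnCpu();
}

}  // namespace

int main() {
  bool gpu_inited = false;
  bool gpu_has_enough_work_groups = true;
  // 2048 classes trips the (hypothetical) limit, so processing stays on CPU.
  absl::Status s = ProcessOnce(/*num_classes=*/2048, gpu_inited,
                               gpu_has_enough_work_groups);
  if (!s.ok()) std::cerr << s.message() << "\n";
  return s.ok() ? 0 : 1;
}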