Add gpu to cpu fallback for tensors_to_detections_calculator.
PiperOrigin-RevId: 544480883
This commit is contained in:
		
							parent
							
								
									0ea54b1461
								
							
						
					
					
						commit
						687075e5b8
					
				| 
						 | 
				
			
			@ -256,6 +256,7 @@ class TensorsToDetectionsCalculator : public Node {
 | 
			
		|||
 | 
			
		||||
  bool gpu_inited_ = false;
 | 
			
		||||
  bool gpu_input_ = false;
 | 
			
		||||
  bool gpu_has_enough_work_groups_ = true;
 | 
			
		||||
  bool anchors_init_ = false;
 | 
			
		||||
};
 | 
			
		||||
MEDIAPIPE_REGISTER_NODE(TensorsToDetectionsCalculator);
 | 
			
		||||
| 
						 | 
				
			
			@ -291,7 +292,7 @@ absl::Status TensorsToDetectionsCalculator::Open(CalculatorContext* cc) {
 | 
			
		|||
absl::Status TensorsToDetectionsCalculator::Process(CalculatorContext* cc) {
 | 
			
		||||
  auto output_detections = absl::make_unique<std::vector<Detection>>();
 | 
			
		||||
  bool gpu_processing = false;
 | 
			
		||||
  if (CanUseGpu()) {
 | 
			
		||||
  if (CanUseGpu() && gpu_has_enough_work_groups_) {
 | 
			
		||||
    // Use GPU processing only if at least one input tensor is already on GPU
 | 
			
		||||
    // (to avoid CPU->GPU overhead).
 | 
			
		||||
    for (const auto& tensor : *kInTensors(cc)) {
 | 
			
		||||
| 
						 | 
				
			
			@ -321,11 +322,20 @@ absl::Status TensorsToDetectionsCalculator::Process(CalculatorContext* cc) {
 | 
			
		|||
    RET_CHECK(!has_custom_box_indices_);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (gpu_processing) {
 | 
			
		||||
    if (!gpu_inited_) {
 | 
			
		||||
      MP_RETURN_IF_ERROR(GpuInit(cc));
 | 
			
		||||
  if (gpu_processing && !gpu_inited_) {
 | 
			
		||||
    auto status = GpuInit(cc);
 | 
			
		||||
    if (status.ok()) {
 | 
			
		||||
      gpu_inited_ = true;
 | 
			
		||||
    } else if (status.code() == absl::StatusCode::kFailedPrecondition) {
 | 
			
		||||
      // For initialization error because of hardware limitation, fallback to
 | 
			
		||||
      // CPU processing.
 | 
			
		||||
      LOG(WARNING) << status.message();
 | 
			
		||||
    } else {
 | 
			
		||||
      // For other error, let the error propagates.
 | 
			
		||||
      return status;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  if (gpu_processing && gpu_inited_) {
 | 
			
		||||
    MP_RETURN_IF_ERROR(ProcessGPU(cc, output_detections.get()));
 | 
			
		||||
  } else {
 | 
			
		||||
    MP_RETURN_IF_ERROR(ProcessCPU(cc, output_detections.get()));
 | 
			
		||||
| 
						 | 
				
			
			@ -346,17 +356,41 @@ absl::Status TensorsToDetectionsCalculator::ProcessCPU(
 | 
			
		|||
    // TODO: Add flexible input tensor size handling.
 | 
			
		||||
    auto raw_box_tensor =
 | 
			
		||||
        &input_tensors[tensor_mapping_.detections_tensor_index()];
 | 
			
		||||
    RET_CHECK_EQ(raw_box_tensor->shape().dims.size(), 3);
 | 
			
		||||
    RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
 | 
			
		||||
    RET_CHECK_GT(num_boxes_, 0) << "Please set num_boxes in calculator options";
 | 
			
		||||
    RET_CHECK_EQ(raw_box_tensor->shape().dims[1], num_boxes_);
 | 
			
		||||
    RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_coords_);
 | 
			
		||||
    if (raw_box_tensor->shape().dims.size() == 3) {
 | 
			
		||||
      // The tensors from CPU inference has dim 3.
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[1], num_boxes_);
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_coords_);
 | 
			
		||||
    } else if (raw_box_tensor->shape().dims.size() == 4) {
 | 
			
		||||
      // The tensors from GPU inference has dim 4. For gpu-cpu fallback support,
 | 
			
		||||
      // we allow tensors with 4 dims.
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[0], 1);
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[1], 1);
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[2], num_boxes_);
 | 
			
		||||
      RET_CHECK_EQ(raw_box_tensor->shape().dims[3], num_coords_);
 | 
			
		||||
    } else {
 | 
			
		||||
      return absl::InvalidArgumentError(
 | 
			
		||||
          "The dimensions of box Tensor must be 3 or 4.");
 | 
			
		||||
    }
 | 
			
		||||
    auto raw_score_tensor =
 | 
			
		||||
        &input_tensors[tensor_mapping_.scores_tensor_index()];
 | 
			
		||||
    RET_CHECK_EQ(raw_score_tensor->shape().dims.size(), 3);
 | 
			
		||||
    RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
 | 
			
		||||
    RET_CHECK_EQ(raw_score_tensor->shape().dims[1], num_boxes_);
 | 
			
		||||
    RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_classes_);
 | 
			
		||||
    if (raw_score_tensor->shape().dims.size() == 3) {
 | 
			
		||||
      // The tensors from CPU inference has dim 3.
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[1], num_boxes_);
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_classes_);
 | 
			
		||||
    } else if (raw_score_tensor->shape().dims.size() == 4) {
 | 
			
		||||
      // The tensors from GPU inference has dim 4. For gpu-cpu fallback support,
 | 
			
		||||
      // we allow tensors with 4 dims.
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[0], 1);
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[1], 1);
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[2], num_boxes_);
 | 
			
		||||
      RET_CHECK_EQ(raw_score_tensor->shape().dims[3], num_classes_);
 | 
			
		||||
    } else {
 | 
			
		||||
      return absl::InvalidArgumentError(
 | 
			
		||||
          "The dimensions of score Tensor must be 3 or 4.");
 | 
			
		||||
    }
 | 
			
		||||
    auto raw_box_view = raw_box_tensor->GetCpuReadView();
 | 
			
		||||
    auto raw_boxes = raw_box_view.buffer<float>();
 | 
			
		||||
    auto raw_scores_view = raw_score_tensor->GetCpuReadView();
 | 
			
		||||
| 
						 | 
				
			
			@ -1111,8 +1145,13 @@ void main() {
 | 
			
		|||
    int max_wg_size;  //  typically <= 1024
 | 
			
		||||
    glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1,
 | 
			
		||||
                    &max_wg_size);  // y-dim
 | 
			
		||||
    CHECK_LT(num_classes_, max_wg_size)
 | 
			
		||||
        << "# classes must be < " << max_wg_size;
 | 
			
		||||
    gpu_has_enough_work_groups_ = num_classes_ < max_wg_size;
 | 
			
		||||
    if (!gpu_has_enough_work_groups_) {
 | 
			
		||||
      return absl::FailedPreconditionError(absl::StrFormat(
 | 
			
		||||
          "Hardware limitation: Processing will be done on CPU, because "
 | 
			
		||||
          "num_classes %d exceeds the max work_group size %d.",
 | 
			
		||||
          num_classes_, max_wg_size));
 | 
			
		||||
    }
 | 
			
		||||
    // TODO support better filtering.
 | 
			
		||||
    if (class_index_set_.is_allowlist) {
 | 
			
		||||
      CHECK_EQ(class_index_set_.values.size(),
 | 
			
		||||
| 
						 | 
				
			
			@ -1370,7 +1409,13 @@ kernel void scoreKernel(
 | 
			
		|||
        Tensor::ElementType::kFloat32, Tensor::Shape{1, num_boxes_ * 2});
 | 
			
		||||
    // # filter classes supported is hardware dependent.
 | 
			
		||||
    int max_wg_size = score_program_.maxTotalThreadsPerThreadgroup;
 | 
			
		||||
    CHECK_LT(num_classes_, max_wg_size) << "# classes must be <" << max_wg_size;
 | 
			
		||||
    gpu_has_enough_work_groups_ = num_classes_ < max_wg_size;
 | 
			
		||||
    if (!gpu_has_enough_work_groups_) {
 | 
			
		||||
      return absl::FailedPreconditionError(absl::StrFormat(
 | 
			
		||||
          "Hardware limitation: Processing will be done on CPU, because "
 | 
			
		||||
          "num_classes %d exceeds the max work_group size %d.",
 | 
			
		||||
          num_classes_, max_wg_size));
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#endif  // !defined(MEDIAPIPE_DISABLE_GL_COMPUTE)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user