Change the image label input from Classification to Detection.

PiperOrigin-RevId: 559828139
2023-08-24 12:05:36 -07:00 · 2023-08-24 12:05:36 -07:00 · c56f45bce5
commit c56f45bce5
parent f2e9a553d6
3 changed files with 38 additions and 22 deletions
--- a/mediapipe/calculators/tensorflow/BUILD
+++ b/mediapipe/calculators/tensorflow/BUILD
@ -370,7 +370,6 @@ cc_library(
        ":pack_media_sequence_calculator_cc_proto",
        "//mediapipe/calculators/image:opencv_image_encoder_calculator_cc_proto",
        "//mediapipe/framework:calculator_framework",
-        "//mediapipe/framework/formats:classification_cc_proto",
        "//mediapipe/framework/formats:detection_cc_proto",
        "//mediapipe/framework/formats:location",
        "//mediapipe/framework/formats:location_opencv",
@ -932,7 +931,6 @@ cc_test(
        "//mediapipe/framework:calculator_runner",
        "//mediapipe/framework:packet",
        "//mediapipe/framework:timestamp",
-        "//mediapipe/framework/formats:classification_cc_proto",
        "//mediapipe/framework/formats:detection_cc_proto",
        "//mediapipe/framework/formats:location",
        "//mediapipe/framework/formats:location_opencv",
--- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
+++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <cstdint>
 #include <optional>
 #include <string>
 #include <vector>
@ -22,7 +23,6 @@
 #include "mediapipe/calculators/image/opencv_image_encoder_calculator.pb.h"
 #include "mediapipe/calculators/tensorflow/pack_media_sequence_calculator.pb.h"
 #include "mediapipe/framework/calculator_framework.h"
-#include "mediapipe/framework/formats/classification.pb.h"
 #include "mediapipe/framework/formats/detection.pb.h"
 #include "mediapipe/framework/formats/location.h"
 #include "mediapipe/framework/formats/location_opencv.h"
@ -61,7 +61,7 @@ namespace mpms = mediapipe::mediasequence;
 // The supported input stream tags are:
 // * "IMAGE", which stores the encoded images from the
 //   OpenCVImageEncoderCalculator,
-// * "IMAGE_LABEL", which stores image labels from vector<Classification>,
+// * "IMAGE_LABEL", which stores whole image labels from Detection,
 // * "FORWARD_FLOW_ENCODED", which stores the encoded optical flow from the same
 //   calculator,
 // * "BBOX" which stores bounding boxes from vector<Detections>,
@ -124,7 +124,7 @@ class PackMediaSequenceCalculator : public CalculatorBase {
    for (const auto& tag : cc->Inputs().GetTags()) {
      if (absl::StartsWith(tag, kImageTag)) {
        if (absl::StartsWith(tag, kImageLabelPrefixTag)) {
-          cc->Inputs().Tag(tag).Set<std::vector<Classification>>();
+          cc->Inputs().Tag(tag).Set<Detection>();
          continue;
        }
        std::string key = "";
@ -377,19 +377,29 @@ class PackMediaSequenceCalculator : public CalculatorBase {
        if (absl::StartsWith(tag, kImageLabelPrefixTag)) {
          std::string key =
              std::string(absl::StripPrefix(tag, kImageLabelPrefixTag));
-          std::vector<std::string> labels;
-          std::vector<float> confidences;
-          for (const auto& classification :
-               cc->Inputs().Tag(tag).Get<std::vector<Classification>>()) {
-            labels.push_back(classification.label());
-            confidences.push_back(classification.score());
+          const auto& detection = cc->Inputs().Tag(tag).Get<Detection>();
+          if (detection.label().empty()) continue;
+          RET_CHECK(detection.label_size() == detection.score_size())
+              << "Wrong image label data format: " << detection.label_size()
+              << " vs " << detection.score_size();
+          if (!detection.label_id().empty()) {
+            RET_CHECK(detection.label_id_size() == detection.label_size())
+                << "Wrong image label ID format: " << detection.label_id_size()
+                << " vs " << detection.label_size();
          }
+          std::vector<std::string> labels(detection.label().begin(),
+                                          detection.label().end());
+          std::vector<float> confidences(detection.score().begin(),
+                                         detection.score().end());
+          std::vector<int32_t> ids(detection.label_id().begin(),
+                                   detection.label_id().end());
          if (!key.empty() || mpms::HasImageEncoded(*sequence_)) {
            mpms::AddImageTimestamp(key, cc->InputTimestamp().Value(),
                                    sequence_.get());
          }
          mpms::AddImageLabelString(key, labels, sequence_.get());
          mpms::AddImageLabelConfidence(key, confidences, sequence_.get());
+          if (!ids.empty()) mpms::AddImageLabelIndex(key, ids, sequence_.get());
          continue;
        }
        if (tag != kImageTag) {
--- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
+++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <cstdint>
 #include <string>
 #include <vector>

@ -21,7 +22,6 @@
 #include "mediapipe/calculators/tensorflow/pack_media_sequence_calculator.pb.h"
 #include "mediapipe/framework/calculator_framework.h"
 #include "mediapipe/framework/calculator_runner.h"
-#include "mediapipe/framework/formats/classification.pb.h"
 #include "mediapipe/framework/formats/detection.pb.h"
 #include "mediapipe/framework/formats/location.h"
 #include "mediapipe/framework/formats/location_opencv.h"
@ -329,21 +329,27 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoImageLabels) {

  int num_timesteps = 2;
  for (int i = 0; i < num_timesteps; ++i) {
-    Classification cls;
-    cls.set_label(absl::StrCat("foo", 2 << i));
-    cls.set_score(0.1 * i);
-    auto label_ptr = ::absl::make_unique<std::vector<Classification>>(2, cls);
+    Detection detection1;
+    detection1.add_label(absl::StrCat("foo", 2 << i));
+    detection1.add_label_id(i);
+    detection1.add_score(0.1 * i);
+    detection1.add_label(absl::StrCat("foo", 2 << i));
+    detection1.add_label_id(i);
+    detection1.add_score(0.1 * i);
+    auto label_ptr1 = ::absl::make_unique<Detection>(detection1);
    runner_->MutableInputs()
        ->Tag(kImageLabelTestTag)
-        .packets.push_back(Adopt(label_ptr.release()).At(Timestamp(i)));
-    cls.set_label(absl::StrCat("bar", 2 << i));
-    cls.set_score(0.2 * i);
-    label_ptr = ::absl::make_unique<std::vector<Classification>>(2, cls);
+        .packets.push_back(Adopt(label_ptr1.release()).At(Timestamp(i)));
+    Detection detection2;
+    detection2.add_label(absl::StrCat("bar", 2 << i));
+    detection2.add_score(0.2 * i);
+    detection2.add_label(absl::StrCat("bar", 2 << i));
+    detection2.add_score(0.2 * i);
+    auto label_ptr2 = ::absl::make_unique<Detection>(detection2);
    runner_->MutableInputs()
        ->Tag(kImageLabelOtherTag)
-        .packets.push_back(Adopt(label_ptr.release()).At(Timestamp(i)));
+        .packets.push_back(Adopt(label_ptr2.release()).At(Timestamp(i)));
  }
-
  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
      Adopt(input_sequence.release());

@ -372,6 +378,8 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoImageLabels) {
    ASSERT_THAT(mpms::GetImageLabelStringAt("TEST", output_sequence, i),
                ::testing::ElementsAreArray(
                    std::vector<std::string>(2, absl::StrCat("foo", 2 << i))));
+    ASSERT_THAT(mpms::GetImageLabelIndexAt("TEST", output_sequence, i),
+                ::testing::ElementsAreArray(std::vector<int32_t>(2, i)));
    ASSERT_THAT(mpms::GetImageLabelConfidenceAt("TEST", output_sequence, i),
                ::testing::ElementsAreArray(std::vector<float>(2, 0.1 * i)));
    ASSERT_EQ(i, mpms::GetImageTimestampAt("OTHER", output_sequence, i));