From e16be2e8fa1275e4833f3001f79a4732d7dfb1ad Mon Sep 17 00:00:00 2001
From: Jiuqiang Tang <jqtang@google.com>
Date: Fri, 28 Oct 2022 13:16:38 -0700
Subject: [PATCH] Implement the initial version of TensorsToAudioCalculator
 that supports ifft and inverse hann windowing.

PiperOrigin-RevId: 484605092
---
 mediapipe/calculators/tensor/BUILD            |  50 +++++
 .../tensor/audio_to_tensor_calculator.cc      |   2 +-
 .../tensor/tensors_to_audio_calculator.cc     | 197 ++++++++++++++++++
 .../tensor/tensors_to_audio_calculator.proto  |  29 +++
 .../tensors_to_audio_calculator_test.cc       | 149 +++++++++++++
 5 files changed, 426 insertions(+), 1 deletion(-)
 create mode 100644 mediapipe/calculators/tensor/tensors_to_audio_calculator.cc
 create mode 100644 mediapipe/calculators/tensor/tensors_to_audio_calculator.proto
 create mode 100644 mediapipe/calculators/tensor/tensors_to_audio_calculator_test.cc

diff --git a/mediapipe/calculators/tensor/BUILD b/mediapipe/calculators/tensor/BUILD
index 99b5b3e91..a966dd3fc 100644
--- a/mediapipe/calculators/tensor/BUILD
+++ b/mediapipe/calculators/tensor/BUILD
@@ -109,6 +109,56 @@ cc_test(
     ],
 )
 
+mediapipe_proto_library(
+    name = "tensors_to_audio_calculator_proto",
+    srcs = ["tensors_to_audio_calculator.proto"],
+    visibility = [
+        "//mediapipe/framework:mediapipe_internal",
+    ],
+    deps = [
+        "//mediapipe/framework:calculator_options_proto",
+        "//mediapipe/framework:calculator_proto",
+    ],
+)
+
+cc_library(
+    name = "tensors_to_audio_calculator",
+    srcs = ["tensors_to_audio_calculator.cc"],
+    visibility = [
+        "//mediapipe/framework:mediapipe_internal",
+    ],
+    deps = [
+        ":tensors_to_audio_calculator_cc_proto",
+        "//mediapipe/framework:calculator_framework",
+        "//mediapipe/framework/api2:node",
+        "//mediapipe/framework/formats:matrix",
+        "//mediapipe/framework/formats:tensor",
+        "//mediapipe/framework/port:ret_check",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/status",
+        "@com_google_audio_tools//audio/dsp:window_functions",
+        "@pffft",
+    ],
+    alwayslink = 1,
+)
+
+cc_test(
+    name = "tensors_to_audio_calculator_test",
+    srcs = ["tensors_to_audio_calculator_test.cc"],
+    deps = [
+        ":audio_to_tensor_calculator",
+        ":audio_to_tensor_calculator_cc_proto",
+        ":tensors_to_audio_calculator",
+        ":tensors_to_audio_calculator_cc_proto",
+        "//mediapipe/framework:calculator_framework",
+        "//mediapipe/framework/formats:matrix",
+        "//mediapipe/framework/port:gtest_main",
+        "//mediapipe/framework/port:parse_text_proto",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 mediapipe_proto_library(
     name = "feedback_tensors_calculator_proto",
     srcs = ["feedback_tensors_calculator.proto"],
diff --git a/mediapipe/calculators/tensor/audio_to_tensor_calculator.cc b/mediapipe/calculators/tensor/audio_to_tensor_calculator.cc
index 59c129191..d0513518a 100644
--- a/mediapipe/calculators/tensor/audio_to_tensor_calculator.cc
+++ b/mediapipe/calculators/tensor/audio_to_tensor_calculator.cc
@@ -133,7 +133,7 @@ bool IsValidFftSize(int size) {
 //     invocation. In the non-streaming mode, the vector contains all of the
 //     output timestamps for an input audio buffer.
 //   DC_AND_NYQUIST - std::pair<float, float> @Optional.
-//     A pair of dc component and nyquest component. Only can be connected when
+//     A pair of dc component and nyquist component. Only can be connected when
 //     the calculator performs fft (the fft_size is set in the calculator
 //     options).
 //
diff --git a/mediapipe/calculators/tensor/tensors_to_audio_calculator.cc b/mediapipe/calculators/tensor/tensors_to_audio_calculator.cc
new file mode 100644
index 000000000..8da29bb69
--- /dev/null
+++ b/mediapipe/calculators/tensor/tensors_to_audio_calculator.cc
@@ -0,0 +1,197 @@
+// Copyright 2022 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <new>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/status/status.h"
+#include "audio/dsp/window_functions.h"
+#include "mediapipe/calculators/tensor/tensors_to_audio_calculator.pb.h"
+#include "mediapipe/framework/api2/node.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/formats/matrix.h"
+#include "mediapipe/framework/formats/tensor.h"
+#include "mediapipe/framework/port/ret_check.h"
+#include "pffft.h"
+
+namespace mediapipe {
+namespace api2 {
+namespace {
+
+std::vector<float> HannWindow(int window_size, bool sqrt_hann) {
+  std::vector<float> hann_window(window_size);
+  audio_dsp::HannWindow().GetPeriodicSamples(window_size, &hann_window);
+  if (sqrt_hann) {
+    absl::c_transform(hann_window, hann_window.begin(),
+                      [](double x) { return std::sqrt(x); });
+  }
+  return hann_window;
+}
+
+// Note that the InvHannWindow function may only work for 50% overlapping case.
+std::vector<float> InvHannWindow(int window_size, bool sqrt_hann) {
+  std::vector<float> window = HannWindow(window_size, sqrt_hann);
+  std::vector<float> inv_window(window.size());
+  if (sqrt_hann) {
+    absl::c_copy(window, inv_window.begin());
+  } else {
+    const int kHalfWindowSize = window.size() / 2;
+    absl::c_transform(window, inv_window.begin(),
+                      [](double x) { return x * x; });
+    for (int i = 0; i < kHalfWindowSize; ++i) {
+      double sum = inv_window[i] + inv_window[kHalfWindowSize + i];
+      inv_window[i] = window[i] / sum;
+      inv_window[kHalfWindowSize + i] = window[kHalfWindowSize + i] / sum;
+    }
+  }
+  return inv_window;
+}
+
+// PFFFT only supports transforms for inputs of length N of the form
+// N = (2^a)*(3^b)*(5^c) where b >=0 and c >= 0 and a >= 5 for the real FFT.
+bool IsValidFftSize(int size) {
+  if (size <= 0) {
+    return false;
+  }
+  constexpr int kFactors[] = {2, 3, 5};
+  int factorization[] = {0, 0, 0};
+  int n = static_cast<int>(size);
+  for (int i = 0; i < 3; ++i) {
+    while (n % kFactors[i] == 0) {
+      n = n / kFactors[i];
+      ++factorization[i];
+    }
+  }
+  return factorization[0] >= 5 && n == 1;
+}
+
+}  // namespace
+
+// Converts 2D MediaPipe float Tensors to audio buffers.
+// The calculator will perform ifft on the complex DFT and apply the window
+// function (Inverse Hann) afterwards. The input 2D MediaPipe Tensor must
+// have the DFT real parts in its first row and the DFT imagery parts in its
+// second row. A valid "fft_size" must be set in the CalculatorOptions.
+//
+// Inputs:
+//   TENSORS - std::vector<Tensor>
+//     Vector containing a single Tensor that represents the audio's complex DFT
+//     results.
+//   DC_AND_NYQUIST - std::pair<float, float>
+//     A pair of dc component and nyquist component.
+//
+// Outputs:
+//   AUDIO - mediapipe::Matrix
+//     The audio data represented as mediapipe::Matrix.
+//
+// Example:
+// node {
+//   calculator: "TensorsToAudioCalculator"
+//   input_stream: "TENSORS:tensors"
+//   input_stream: "DC_AND_NYQUIST:dc_and_nyquist"
+//   output_stream: "AUDIO:audio"
+//   options {
+//     [mediapipe.AudioToTensorCalculatorOptions.ext] {
+//       fft_size: 256
+//     }
+//   }
+// }
+class TensorsToAudioCalculator : public Node {
+ public:
+  static constexpr Input<std::vector<Tensor>> kTensorsIn{"TENSORS"};
+  static constexpr Input<std::pair<float, float>> kDcAndNyquistIn{
+      "DC_AND_NYQUIST"};
+  static constexpr Output<Matrix> kAudioOut{"AUDIO"};
+  MEDIAPIPE_NODE_CONTRACT(kTensorsIn, kDcAndNyquistIn, kAudioOut);
+
+  absl::Status Open(CalculatorContext* cc) override;
+  absl::Status Process(CalculatorContext* cc) override;
+  absl::Status Close(CalculatorContext* cc) override;
+
+ private:
+  // The internal state of the FFT library.
+  PFFFT_Setup* fft_state_ = nullptr;
+  int fft_size_ = 0;
+  float inverse_fft_size_ = 0;
+  std::vector<float, Eigen::aligned_allocator<float>> input_dft_;
+  std::vector<float> inv_fft_window_;
+  std::vector<float, Eigen::aligned_allocator<float>> fft_input_buffer_;
+  // pffft requires memory to work with to avoid using the stack.
+  std::vector<float, Eigen::aligned_allocator<float>> fft_workplace_;
+  std::vector<float, Eigen::aligned_allocator<float>> fft_output_;
+};
+
+absl::Status TensorsToAudioCalculator::Open(CalculatorContext* cc) {
+  const auto& options =
+      cc->Options<mediapipe::TensorsToAudioCalculatorOptions>();
+  RET_CHECK(options.has_fft_size()) << "FFT size must be specified.";
+  RET_CHECK(IsValidFftSize(options.fft_size()))
+      << "FFT size must be of the form fft_size = (2^a)*(3^b)*(5^c) where b "
+         ">=0 and c >= 0 and a >= 5, the requested fft size is "
+      << options.fft_size();
+  fft_size_ = options.fft_size();
+  inverse_fft_size_ = 1.0f / fft_size_;
+  fft_state_ = pffft_new_setup(fft_size_, PFFFT_REAL);
+  input_dft_.resize(fft_size_);
+  inv_fft_window_ = InvHannWindow(fft_size_, /* sqrt_hann = */ false);
+  fft_input_buffer_.resize(fft_size_);
+  fft_workplace_.resize(fft_size_);
+  fft_output_.resize(fft_size_);
+  return absl::OkStatus();
+}
+
+absl::Status TensorsToAudioCalculator::Process(CalculatorContext* cc) {
+  if (kTensorsIn(cc).IsEmpty() || kDcAndNyquistIn(cc).IsEmpty()) {
+    return absl::OkStatus();
+  }
+  const auto& input_tensors = *kTensorsIn(cc);
+  RET_CHECK_EQ(input_tensors.size(), 1);
+  RET_CHECK(input_tensors[0].element_type() == Tensor::ElementType::kFloat32);
+  auto view = input_tensors[0].GetCpuReadView();
+  // DC's real part.
+  input_dft_[0] = kDcAndNyquistIn(cc)->first;
+  // Nyquist's real part is the penultimate element of the tensor buffer.
+  // pffft ignores the Nyquist's imagery part. No need to fetch the last value
+  // from the tensor buffer.
+  input_dft_[1] = *(view.buffer<float>() + (fft_size_ - 2));
+  std::memcpy(input_dft_.data() + 2, view.buffer<float>(),
+              (fft_size_ - 2) * sizeof(float));
+  pffft_transform_ordered(fft_state_, input_dft_.data(), fft_output_.data(),
+                          fft_workplace_.data(), PFFFT_BACKWARD);
+  // Applies the inverse window function.
+  std::transform(
+      fft_output_.begin(), fft_output_.end(), inv_fft_window_.begin(),
+      fft_output_.begin(),
+      [this](float a, float b) { return a * b * inverse_fft_size_; });
+  Matrix matrix = Eigen::Map<Matrix>(fft_output_.data(), 1, fft_output_.size());
+  kAudioOut(cc).Send(std::move(matrix));
+  return absl::OkStatus();
+}
+
+absl::Status TensorsToAudioCalculator::Close(CalculatorContext* cc) {
+  if (fft_state_) {
+    pffft_destroy_setup(fft_state_);
+  }
+  return absl::OkStatus();
+}
+
+MEDIAPIPE_REGISTER_NODE(TensorsToAudioCalculator);
+
+}  // namespace api2
+}  // namespace mediapipe
diff --git a/mediapipe/calculators/tensor/tensors_to_audio_calculator.proto b/mediapipe/calculators/tensor/tensors_to_audio_calculator.proto
new file mode 100644
index 000000000..907627125
--- /dev/null
+++ b/mediapipe/calculators/tensor/tensors_to_audio_calculator.proto
@@ -0,0 +1,29 @@
+// Copyright 2022 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package mediapipe;
+
+import "mediapipe/framework/calculator.proto";
+
+message TensorsToAudioCalculatorOptions {
+  extend mediapipe.CalculatorOptions {
+    optional TensorsToAudioCalculatorOptions ext = 484297136;
+  }
+
+  // Size of the fft in number of bins. If set, the calculator will do ifft
+  // on the input tensor.
+  optional int64 fft_size = 1;
+}
diff --git a/mediapipe/calculators/tensor/tensors_to_audio_calculator_test.cc b/mediapipe/calculators/tensor/tensors_to_audio_calculator_test.cc
new file mode 100644
index 000000000..b332381c6
--- /dev/null
+++ b/mediapipe/calculators/tensor/tensors_to_audio_calculator_test.cc
@@ -0,0 +1,149 @@
+// Copyright 2022 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "absl/status/status.h"
+#include "absl/strings/substitute.h"
+#include "mediapipe/calculators/tensor/audio_to_tensor_calculator.pb.h"
+#include "mediapipe/calculators/tensor/tensors_to_audio_calculator.pb.h"
+#include "mediapipe/framework/calculator_framework.h"
+#include "mediapipe/framework/formats/matrix.h"
+#include "mediapipe/framework/port/gtest.h"
+#include "mediapipe/framework/port/parse_text_proto.h"
+#include "mediapipe/framework/port/status_matchers.h"
+
+namespace mediapipe {
+namespace {
+
+class TensorsToAudioCalculatorFftTest : public ::testing::Test {
+ protected:
+  // Creates an audio matrix containing a single sample of 1.0 at a specified
+  // offset.
+  Matrix CreateImpulseSignalData(int64 num_samples, int impulse_offset_idx) {
+    Matrix impulse = Matrix::Zero(1, num_samples);
+    impulse(0, impulse_offset_idx) = 1.0;
+    return impulse;
+  }
+
+  void ConfigGraph(int num_samples, double sample_rate, int fft_size) {
+    graph_config_ = ParseTextProtoOrDie<CalculatorGraphConfig>(
+        absl::Substitute(R"(
+        input_stream: "audio_in"
+        input_stream: "sample_rate"
+        output_stream: "audio_out"
+        node {
+          calculator: "AudioToTensorCalculator"
+          input_stream: "AUDIO:audio_in"
+          input_stream: "SAMPLE_RATE:sample_rate"
+          output_stream: "TENSORS:tensors"
+          output_stream: "DC_AND_NYQUIST:dc_and_nyquist"
+          options {
+            [mediapipe.AudioToTensorCalculatorOptions.ext] {
+              num_channels: 1
+              num_samples: $0
+              num_overlapping_samples: 0
+              target_sample_rate: $1
+              fft_size: $2
+            }
+          }
+        }
+        node {
+          calculator: "TensorsToAudioCalculator"
+          input_stream: "TENSORS:tensors"
+          input_stream: "DC_AND_NYQUIST:dc_and_nyquist"
+          output_stream: "AUDIO:audio_out"
+          options {
+            [mediapipe.TensorsToAudioCalculatorOptions.ext] {
+              fft_size: $2
+            }
+          }
+        }
+        )",
+                         /*$0=*/num_samples,
+                         /*$1=*/sample_rate,
+                         /*$2=*/fft_size));
+    tool::AddVectorSink("audio_out", &graph_config_, &audio_out_packets_);
+  }
+
+  void RunGraph(const Matrix& input_data, double sample_rate) {
+    MP_ASSERT_OK(graph_.Initialize(graph_config_));
+    MP_ASSERT_OK(graph_.StartRun({}));
+    MP_ASSERT_OK(graph_.AddPacketToInputStream(
+        "sample_rate", MakePacket<double>(sample_rate).At(Timestamp(0))));
+    MP_ASSERT_OK(graph_.AddPacketToInputStream(
+        "audio_in", MakePacket<Matrix>(input_data).At(Timestamp(0))));
+    MP_ASSERT_OK(graph_.CloseAllInputStreams());
+    MP_ASSERT_OK(graph_.WaitUntilDone());
+  }
+
+  std::vector<Packet> audio_out_packets_;
+  CalculatorGraphConfig graph_config_;
+  CalculatorGraph graph_;
+};
+
+TEST_F(TensorsToAudioCalculatorFftTest, TestInvalidFftSize) {
+  ConfigGraph(320, 16000, 103);
+  MP_ASSERT_OK(graph_.Initialize(graph_config_));
+  MP_ASSERT_OK(graph_.StartRun({}));
+  auto status = graph_.WaitUntilIdle();
+  EXPECT_EQ(status.code(), absl::StatusCode::kInternal);
+  EXPECT_THAT(status.message(),
+              ::testing::HasSubstr("FFT size must be of the form"));
+}
+
+TEST_F(TensorsToAudioCalculatorFftTest, TestImpulseSignalAtTheCenter) {
+  constexpr int sample_size = 320;
+  constexpr double sample_rate = 16000;
+  ConfigGraph(sample_size, sample_rate, 320);
+
+  Matrix impulse_data = CreateImpulseSignalData(sample_size, sample_size / 2);
+  RunGraph(impulse_data, sample_rate);
+  ASSERT_EQ(1, audio_out_packets_.size());
+  MP_ASSERT_OK(audio_out_packets_[0].ValidateAsType<Matrix>());
+  // The impulse signal at the center is not affected by the window function.
+  EXPECT_EQ(audio_out_packets_[0].Get<Matrix>(), impulse_data);
+}
+
+TEST_F(TensorsToAudioCalculatorFftTest, TestWindowedImpulseSignal) {
+  constexpr int sample_size = 320;
+  constexpr double sample_rate = 16000;
+  ConfigGraph(sample_size, sample_rate, 320);
+  Matrix impulse_data = CreateImpulseSignalData(sample_size, sample_size / 4);
+  RunGraph(impulse_data, sample_rate);
+  ASSERT_EQ(1, audio_out_packets_.size());
+  MP_ASSERT_OK(audio_out_packets_[0].ValidateAsType<Matrix>());
+  // As the impulse signal sits at the 1/4 of the hann window, the inverse
+  // window function reduces it by half.
+  EXPECT_EQ(audio_out_packets_[0].Get<Matrix>(), impulse_data / 2);
+}
+
+TEST_F(TensorsToAudioCalculatorFftTest, TestImpulseSignalAtBeginning) {
+  constexpr int sample_size = 320;
+  constexpr double sample_rate = 16000;
+  ConfigGraph(sample_size, sample_rate, 320);
+  Matrix impulse_data = CreateImpulseSignalData(sample_size, 0);
+  RunGraph(impulse_data, sample_rate);
+  ASSERT_EQ(1, audio_out_packets_.size());
+  MP_ASSERT_OK(audio_out_packets_[0].ValidateAsType<Matrix>());
+  // As the impulse signal sits at the beginning of the hann window, the inverse
+  // window function completely removes it.
+  EXPECT_EQ(audio_out_packets_[0].Get<Matrix>(), Matrix::Zero(1, sample_size));
+}
+
+}  // namespace
+}  // namespace mediapipe