mediapipe/mediapipe/calculators/audio/spectrogram_calculator.cc

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Defines SpectrogramCalculator.
#include <math.h>

#include <complex>
#include <deque>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/string_view.h"
#include "audio/dsp/spectrogram/spectrogram.h"
#include "audio/dsp/window_functions.h"
#include "mediapipe/calculators/audio/spectrogram_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/matrix.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/status_builder.h"
#include "mediapipe/util/time_series_util.h"

namespace mediapipe {

namespace {
// Tag of the optional input side packet (a double) that, when present,
// replaces the frame_duration_seconds value taken from the options.
constexpr char kFrameDurationTag[] = "FRAME_DURATION";
// Tag of the optional input side packet (a double) that, when present,
// replaces the frame_overlap_seconds value taken from the options.
constexpr char kFrameOverlapTag[] = "FRAME_OVERLAP";
}  // namespace
// MediaPipe Calculator for computing the "spectrogram" (short-time Fourier
// transform squared-magnitude, by default) of a multichannel input
// time series, including optionally overlapping frames.  Options are
// specified in the SpectrogramCalculatorOptions proto (where names are chosen
// to mirror TimeSeriesFramerCalculator).
//
// Result is a MatrixData record (for single channel input and when the
// allow_multichannel_input flag is false), or a vector of MatrixData records,
// one for each channel (when the allow_multichannel_input flag is set). Each
// waveform frame is converted to frequency by a fast Fourier transform whose
// size, n_fft, is the smallest power of two large enough to enclose the frame
// length of round(frame_duration_seconds * sample_rate). The rows of each
// spectrogram matrix (result) correspond to the n_fft/2+1 unique complex values,
// or squared/linear/dB magnitudes, depending on the output_type option. Each
// input packet will result in zero or one output packets, each containing one
// Matrix for each channel of the input, where each Matrix has one or more
// columns of spectral values, one for each complete frame of input samples. If
// the input packet contains too few samples to trigger a new output frame, no
// output packet is generated (zero-length packets are not legal because they
// would result in timestamps that were equal, not strictly increasing).
//
// Output packet Timestamps are set to the beginning of each frame. This is to
// allow calculators downstream from SpectrogramCalculator to have aligned
// Timestamps regardless of a packet's signal length.
//
// Both frame_duration_seconds and frame_overlap_seconds will be
// rounded to the nearest integer number of samples.  Consequently, all output
// frames will be based on the same number of input samples, and each
// analysis frame will advance from its predecessor by the same time step.
class SpectrogramCalculator : public CalculatorBase {
 public:
  static absl::Status GetContract(CalculatorContract* cc) {
    cc->Inputs().Index(0).Set<Matrix>(
        // Input stream with TimeSeriesHeader.
    );

    if (cc->InputSidePackets().HasTag(kFrameDurationTag)) {
      cc->InputSidePackets()
          .Tag(kFrameDurationTag)
          .Set<double>(
              // Optional side packet for frame_duration_seconds if provided.
          );
    }

    if (cc->InputSidePackets().HasTag(kFrameOverlapTag)) {
      cc->InputSidePackets()
          .Tag(kFrameOverlapTag)
          .Set<double>(
              // Optional side packet for frame_overlap_seconds if provided.
          );
    }

    SpectrogramCalculatorOptions spectrogram_options =
        cc->Options<SpectrogramCalculatorOptions>();
    if (!spectrogram_options.allow_multichannel_input()) {
      if (spectrogram_options.output_type() ==
          SpectrogramCalculatorOptions::COMPLEX) {
        cc->Outputs().Index(0).Set<Eigen::MatrixXcf>(
            // Complex spectrogram frames with TimeSeriesHeader.
        );
      } else {
        cc->Outputs().Index(0).Set<Matrix>(
            // Spectrogram frames with TimeSeriesHeader.
        );
      }
    } else {
      if (spectrogram_options.output_type() ==
          SpectrogramCalculatorOptions::COMPLEX) {
        cc->Outputs().Index(0).Set<std::vector<Eigen::MatrixXcf>>(
            // Complex spectrogram frames with MultiStreamTimeSeriesHeader.
        );
      } else {
        cc->Outputs().Index(0).Set<std::vector<Matrix>>(
            // Spectrogram frames with MultiStreamTimeSeriesHeader.
        );
      }
    }
    return absl::OkStatus();
  }

  // Returns FAIL if the input stream header is invalid.
  absl::Status Open(CalculatorContext* cc) override;

  // Outputs at most one packet consisting of a single Matrix with one or
  // more columns containing the spectral values from as many input frames
  // as are completed by the input samples.  Always returns OK.
  absl::Status Process(CalculatorContext* cc) override;

  // Performs zero-padding and processing of any remaining samples
  // if pad_final_packet is set.
  // Returns OK.
  absl::Status Close(CalculatorContext* cc) override;

 private:
  Timestamp CurrentOutputTimestamp(CalculatorContext* cc) {
    if (use_local_timestamp_) {
      const Timestamp now = cc->InputTimestamp();
      if (now == Timestamp::Done()) {
        // During Close the timestamp is not available, send an estimate.
        return last_local_output_timestamp_ +
               round(last_completed_frames_ * frame_step_samples() *
                     Timestamp::kTimestampUnitsPerSecond / input_sample_rate_);
      }
      last_local_output_timestamp_ = now;
      return now;
    }
    return CumulativeOutputTimestamp();
  }

  Timestamp CumulativeOutputTimestamp() {
    // Cumulative output timestamp is the *center* of the next frame to be
    // emitted, hence delayed by half a window duration compared to relevant
    // input timestamp.
    return initial_input_timestamp_ +
           round(cumulative_completed_frames_ * frame_step_samples() *
                 Timestamp::kTimestampUnitsPerSecond / input_sample_rate_);
  }

  int frame_step_samples() const {
    return frame_duration_samples_ - frame_overlap_samples_;
  }

  // Take the next set of input samples, already translated into a
  // vector<float> and pass them to the spectrogram object.
  // Convert the output of the spectrogram object into a Matrix (or an
  // Eigen::MatrixXcf if complex-valued output is requested) and pass to
  // MediaPipe output.
  absl::Status ProcessVector(const Matrix& input_stream, CalculatorContext* cc);

  // Templated function to process either real- or complex-output spectrogram.
  template <class OutputMatrixType>
  absl::Status ProcessVectorToOutput(
      const Matrix& input_stream,
      const OutputMatrixType postprocess_output_fn(const OutputMatrixType&),
      CalculatorContext* cc);

  // Use the MediaPipe timestamp instead of the estimated one. Useful when the
  // data is intermittent.
  bool use_local_timestamp_;
  Timestamp last_local_output_timestamp_;

  double input_sample_rate_;
  bool pad_final_packet_;
  int frame_duration_samples_;
  int frame_overlap_samples_;
  // How many samples we've been passed, used for checking input time stamps.
  int64 cumulative_input_samples_;
  // How many frames we've emitted, used for calculating output time stamps.
  int64 cumulative_completed_frames_;
  // How many frames were emitted last, used for estimating the timestamp on
  // Close when use_local_timestamp_ is true;
  int64 last_completed_frames_;
  Timestamp initial_input_timestamp_;
  int num_input_channels_;
  // How many frequency bins we emit (=N_FFT/2 + 1).
  int num_output_channels_;
  // Which output type?
  int output_type_;
  // Output type: mono or multichannel.
  bool allow_multichannel_input_;
  // Vector of Spectrogram objects, one for each channel.
  std::vector<std::unique_ptr<audio_dsp::Spectrogram>> spectrogram_generators_;
  // Fixed scale factor applied to output values (regardless of type).
  double output_scale_;

  static const float kLnSquaredMagnitudeToDb;
};
// Registers SpectrogramCalculator with the framework (presumably so that
// graph configs can refer to it by name).
REGISTER_CALCULATOR(SpectrogramCalculator);

// DECIBELS = 20*log10(LINEAR_MAGNITUDE) = 10*log10(SQUARED_MAGNITUDE)
//          = 10/ln(10) * ln(SQUARED_MAGNITUDE).
// Factor to convert ln(SQUARED_MAGNITUDE) to decibels = 10.0/ln(10.0).
const float SpectrogramCalculator::kLnSquaredMagnitudeToDb = 4.342944819032518;

// Configures the calculator for a run.
//
// Reads frame_duration_seconds / frame_overlap_seconds (side packets take
// precedence over the options), validates them against the input stream's
// TimeSeriesHeader, creates one Spectrogram object per input channel, writes
// the output stream header and resets all running counters.
//
// Returns:
//   - the error from FillTimeSeriesHeaderIfValid if the input header is
//     invalid;
//   - kInvalidArgument if the framing parameters, channel count or window
//     type are unusable;
//   - kInternal if a Spectrogram object fails to initialize;
//   - OK otherwise.
absl::Status SpectrogramCalculator::Open(CalculatorContext* cc) {
  const SpectrogramCalculatorOptions& spectrogram_options =
      cc->Options<SpectrogramCalculatorOptions>();

  // Provide frame_duration_seconds and frame_overlap_seconds either from static
  // options, or dynamically from a side packet, the side packet one will
  // override the options one if provided.
  const double frame_duration_seconds =
      cc->InputSidePackets().HasTag(kFrameDurationTag)
          ? cc->InputSidePackets().Tag(kFrameDurationTag).Get<double>()
          : spectrogram_options.frame_duration_seconds();
  const double frame_overlap_seconds =
      cc->InputSidePackets().HasTag(kFrameOverlapTag)
          ? cc->InputSidePackets().Tag(kFrameOverlapTag).Get<double>()
          : spectrogram_options.frame_overlap_seconds();

  use_local_timestamp_ = spectrogram_options.use_local_timestamp();

  // The comparisons are written in negated form so that NaN is rejected too.
  if (!(frame_duration_seconds > 0.0)) {
    return absl::Status(
        absl::StatusCode::kInvalidArgument,
        "Invalid or missing frame_duration_seconds: " +
            std::to_string(frame_duration_seconds));
  }
  if (!(frame_overlap_seconds < frame_duration_seconds)) {
    return absl::Status(
        absl::StatusCode::kInvalidArgument,
        "frame_overlap_seconds (" + std::to_string(frame_overlap_seconds) +
            ") must be smaller than frame_duration_seconds (" +
            std::to_string(frame_duration_seconds) + ").");
  }
  if (!(frame_overlap_seconds >= 0.0)) {
    return absl::Status(
        absl::StatusCode::kInvalidArgument,
        "frame_overlap_seconds must not be negative: " +
            std::to_string(frame_overlap_seconds));
  }

  TimeSeriesHeader input_header;
  MP_RETURN_IF_ERROR(time_series_util::FillTimeSeriesHeaderIfValid(
      cc->Inputs().Index(0).Header(), &input_header));

  input_sample_rate_ = input_header.sample_rate();
  num_input_channels_ = input_header.num_channels();

  // At least one channel is required: the first Spectrogram object is used
  // below to determine the number of output frequency bins.
  if (num_input_channels_ < 1) {
    return absl::Status(absl::StatusCode::kInvalidArgument,
                        "Input stream must have at least one channel.");
  }
  if (!spectrogram_options.allow_multichannel_input() &&
      num_input_channels_ != 1) {
    return absl::Status(
        absl::StatusCode::kInvalidArgument,
        "The current setting only supports single-channel input. Please set "
        "allow_multichannel_input.");
  }

  frame_duration_samples_ = round(frame_duration_seconds * input_sample_rate_);
  frame_overlap_samples_ = round(frame_overlap_seconds * input_sample_rate_);

  // Rounding (or a zero sample rate) can collapse the frame or the step to
  // zero samples even when the values in seconds were valid; the step is used
  // as a divisor below and in the timestamp computations.
  if (frame_duration_samples_ < 1 || frame_step_samples() < 1) {
    return absl::Status(
        absl::StatusCode::kInvalidArgument,
        "Frame duration and frame step must each span at least one sample at "
        "the input sample rate.");
  }

  pad_final_packet_ = spectrogram_options.pad_final_packet();
  output_type_ = spectrogram_options.output_type();
  allow_multichannel_input_ = spectrogram_options.allow_multichannel_input();

  output_scale_ = spectrogram_options.output_scale();

  std::vector<double> window;
  switch (spectrogram_options.window_type()) {
    case SpectrogramCalculatorOptions::COSINE:
      audio_dsp::CosineWindow().GetPeriodicSamples(frame_duration_samples_,
                                                   &window);
      break;
    case SpectrogramCalculatorOptions::HANN:
      audio_dsp::HannWindow().GetPeriodicSamples(frame_duration_samples_,
                                                 &window);
      break;
    case SpectrogramCalculatorOptions::HAMMING:
      audio_dsp::HammingWindow().GetPeriodicSamples(frame_duration_samples_,
                                                    &window);
      break;
    default:
      // Without this, an unhandled value would silently leave `window` empty.
      return absl::Status(absl::StatusCode::kInvalidArgument,
                          "Unrecognized spectrogram window type.");
  }

  // Propagate settings down to the actual Spectrogram object.
  spectrogram_generators_.clear();
  for (int i = 0; i < num_input_channels_; i++) {
    spectrogram_generators_.push_back(
        std::unique_ptr<audio_dsp::Spectrogram>(new audio_dsp::Spectrogram()));
    if (!spectrogram_generators_.back()->Initialize(window,
                                                    frame_step_samples())) {
      return absl::Status(absl::StatusCode::kInternal,
                          "Spectrogram initialization failed.");
    }
  }

  num_output_channels_ =
      spectrogram_generators_[0]->output_frequency_channels();
  std::unique_ptr<TimeSeriesHeader> output_header(
      new TimeSeriesHeader(input_header));
  // Store the actual sample rate of the input audio in the TimeSeriesHeader
  // so that subsequent calculators can figure out the frequency scale of
  // our output.
  output_header->set_audio_sample_rate(input_sample_rate_);
  // Setup rest of output header.
  output_header->set_num_channels(num_output_channels_);
  output_header->set_sample_rate(input_sample_rate_ / frame_step_samples());
  // Although we usually generate one output packet for each input
  // packet, this might not be true for input packets whose size is smaller
  // than the analysis window length.  So we clear output_header.packet_rate
  // because we can't guarantee a constant packet rate.  Similarly, the number
  // of output frames per packet depends on the input packet, so we also clear
  // output_header.num_samples.
  output_header->clear_packet_rate();
  output_header->clear_num_samples();
  if (!spectrogram_options.allow_multichannel_input()) {
    cc->Outputs().Index(0).SetHeader(Adopt(output_header.release()));
  } else {
    std::unique_ptr<MultiStreamTimeSeriesHeader> multichannel_output_header(
        new MultiStreamTimeSeriesHeader());
    *multichannel_output_header->mutable_time_series_header() = *output_header;
    multichannel_output_header->set_num_streams(num_input_channels_);
    cc->Outputs().Index(0).SetHeader(
        Adopt(multichannel_output_header.release()));
  }

  // Reset all running counters; Process() accumulates into them.
  cumulative_input_samples_ = 0;
  cumulative_completed_frames_ = 0;
  last_completed_frames_ = 0;
  initial_input_timestamp_ = Timestamp::Unstarted();
  if (use_local_timestamp_) {
    // Inform the framework that the calculator will output packets at the same
    // timestamps as input packets to enable packet queueing optimizations. The
    // final packet (emitted from Close()) does not follow this rule but it's
    // sufficient that its timestamp is strictly greater than the timestamp of
    // the previous packet.
    cc->SetOffset(0);
  }
  return absl::OkStatus();
}

// Consumes one input packet (a Matrix with one row per channel and one column
// per sample) and forwards it to ProcessVector.
//
// Records the timestamp of the first packet as the origin for output
// timestamps and adds the packet's sample count to the running total.
// Returns kInvalidArgument if the number of rows does not match the channel
// count taken from the input header; otherwise returns ProcessVector's status.
absl::Status SpectrogramCalculator::Process(CalculatorContext* cc) {
  if (initial_input_timestamp_ == Timestamp::Unstarted()) {
    initial_input_timestamp_ = cc->InputTimestamp();
  }

  const Matrix& input_stream = cc->Inputs().Index(0).Get<Matrix>();
  if (input_stream.rows() != num_input_channels_) {
    // There is exactly one Spectrogram object per header-declared channel, so
    // a packet with a different number of rows cannot be processed.
    return absl::Status(
        absl::StatusCode::kInvalidArgument,
        "Number of rows in the input matrix (" +
            std::to_string(input_stream.rows()) +
            ") does not match the number of input channels (" +
            std::to_string(num_input_channels_) + ").");
  }

  cumulative_input_samples_ += input_stream.cols();

  return ProcessVector(input_stream, cc);
}

// Feeds each row (channel) of `input_stream` to that channel's Spectrogram
// object, converts the completed frames into one OutputMatrixType per channel
// (one column per frame), and emits at most one output packet.
//
// Args:
//   input_stream: new samples, one row per channel. May have zero columns.
//   postprocess_output_fn: applied to each frame (as a column matrix) before
//     it is scaled by output_scale_ and stored.
//   cc: calculator context used to emit the packet.
//
// No packet is emitted when the samples do not complete any frame. Returns
// kInternal if a Spectrogram object reports failure, a RET_CHECK failure if
// the channels are inconsistent, OK otherwise.
template <class OutputMatrixType>
absl::Status SpectrogramCalculator::ProcessVectorToOutput(
    const Matrix& input_stream,
    const OutputMatrixType postprocess_output_fn(const OutputMatrixType&),
    CalculatorContext* cc) {
  // Each row is dispatched to the generator with the same index, so the two
  // counts have to agree before anything is indexed.
  RET_CHECK_EQ(spectrogram_generators_.size(), input_stream.rows())
      << "Input rows do not match the number of spectrogram generators.";

  std::unique_ptr<std::vector<OutputMatrixType>> spectrogram_matrices(
      new std::vector<OutputMatrixType>());
  std::vector<std::vector<typename OutputMatrixType::Scalar>> output_vectors;

  // Compute a spectrogram for each channel.
  int num_output_time_frames = 0;
  for (int channel = 0; channel < input_stream.rows(); ++channel) {
    output_vectors.clear();

    // Copy one row (channel) of the input matrix into the std::vector.
    // data() (rather than &v[0]) keeps this well-defined for empty input.
    std::vector<float> input_vector(input_stream.cols());
    Eigen::Map<Matrix>(input_vector.data(), 1, input_vector.size()) =
        input_stream.row(channel);

    if (!spectrogram_generators_[channel]->ComputeSpectrogram(
            input_vector, &output_vectors)) {
      return absl::Status(absl::StatusCode::kInternal,
                          "Spectrogram returned failure");
    }
    if (channel == 0) {
      // Record the number of time frames we expect from each channel.
      num_output_time_frames = output_vectors.size();
    } else {
      RET_CHECK_EQ(output_vectors.size(), num_output_time_frames)
          << "Inconsistent spectrogram time frames for channel " << channel;
    }
    // Skip remaining processing if there are too few input samples to trigger
    // any output frames.
    if (output_vectors.empty()) {
      continue;
    }
    // Translate the returned values into a matrix of output frames.
    OutputMatrixType output_frames(num_output_channels_,
                                   output_vectors.size());
    for (int frame = 0; frame < num_output_time_frames; ++frame) {
      Eigen::Map<const OutputMatrixType> frame_map(
          output_vectors[frame].data(), output_vectors[frame].size(), 1);
      // The underlying dsp object returns squared magnitudes; here
      // we optionally translate to linear magnitude or dB.
      output_frames.col(frame) =
          output_scale_ * postprocess_output_fn(frame_map);
    }
    spectrogram_matrices->push_back(std::move(output_frames));
  }

  // If the input is very short, there may not be enough accumulated,
  // unprocessed samples to cause any new frames to be generated by
  // the spectrogram object.  If so, we don't want to emit
  // a packet at all.
  if (spectrogram_matrices->empty()) {
    return absl::OkStatus();
  }

  RET_CHECK_EQ(spectrogram_matrices->size(), input_stream.rows())
      << "Inconsistent number of spectrogram channels.";
  if (allow_multichannel_input_) {
    cc->Outputs().Index(0).Add(spectrogram_matrices.release(),
                               CurrentOutputTimestamp(cc));
  } else {
    // Single-channel output: hand over the only matrix without copying it.
    cc->Outputs().Index(0).Add(
        new OutputMatrixType(std::move(spectrogram_matrices->at(0))),
        CurrentOutputTimestamp(cc));
  }
  // The counters are advanced only after the packet's timestamp has been
  // computed, because that timestamp refers to the first frame in the packet.
  cumulative_completed_frames_ += num_output_time_frames;
  last_completed_frames_ = num_output_time_frames;
  if (!use_local_timestamp_) {
    // In non-local timestamp mode the timestamp of the next packet will be
    // equal to CumulativeOutputTimestamp(). Inform the framework about this
    // fact to enable packet queueing optimizations.
    cc->Outputs().Index(0).SetNextTimestampBound(CumulativeOutputTimestamp());
  }
  return absl::OkStatus();
}

// Selects the per-frame post-processing that matches the configured output
// type and hands the samples to ProcessVectorToOutput:
//   COMPLEX           -> complex frames, passed through unchanged.
//   SQUARED_MAGNITUDE -> frames passed through unchanged.
//   LINEAR_MAGNITUDE  -> element-wise square root.
//   DECIBELS          -> element-wise natural log scaled to decibels.
// Returns kInvalidArgument for any other output type.
absl::Status SpectrogramCalculator::ProcessVector(const Matrix& input_stream,
                                                  CalculatorContext* cc) {
  // Complex output uses a different matrix type, so it is dispatched apart
  // from the real-valued variants.
  if (output_type_ == SpectrogramCalculatorOptions::COMPLEX) {
    return ProcessVectorToOutput<Eigen::MatrixXcf>(
        input_stream,
        +[](const Eigen::MatrixXcf& frame) -> const Eigen::MatrixXcf {
          return frame;
        },
        cc);
  }

  // All real-valued variants share one call and differ only in the
  // post-processing function applied to each frame.
  const Matrix (*real_postprocess)(const Matrix&) = nullptr;
  switch (output_type_) {
    case SpectrogramCalculatorOptions::SQUARED_MAGNITUDE:
      real_postprocess = +[](const Matrix& frame) -> const Matrix {
        return frame;
      };
      break;
    case SpectrogramCalculatorOptions::LINEAR_MAGNITUDE:
      real_postprocess = +[](const Matrix& frame) -> const Matrix {
        return frame.array().sqrt().matrix();
      };
      break;
    case SpectrogramCalculatorOptions::DECIBELS:
      real_postprocess = +[](const Matrix& frame) -> const Matrix {
        return kLnSquaredMagnitudeToDb * frame.array().log().matrix();
      };
      break;
    default:
      return absl::Status(absl::StatusCode::kInvalidArgument,
                          "Unrecognized spectrogram output type.");
  }
  return ProcessVectorToOutput<Matrix>(input_stream, real_postprocess, cc);
}

// Flushes samples still buffered in the Spectrogram objects when
// pad_final_packet is set, by processing a block of zeros.
//
// Does nothing if padding is disabled, if no samples were ever received, or
// if no padding samples are needed. Otherwise returns ProcessVector's status.
absl::Status SpectrogramCalculator::Close(CalculatorContext* cc) {
  if (!pad_final_packet_ || cumulative_input_samples_ <= 0) {
    return absl::OkStatus();
  }

  // We can flush any remaining samples by sending frame_step_samples - 1
  // zeros to the Process method, and letting it do its thing,
  // UNLESS we have fewer than one window's worth of samples, in which case
  // we pad to exactly one frame_duration_samples.
  int required_padding_samples = frame_step_samples() - 1;
  if (cumulative_input_samples_ < frame_duration_samples_) {
    // The difference is bounded by frame_duration_samples_, so it fits in int.
    required_padding_samples =
        static_cast<int>(frame_duration_samples_ - cumulative_input_samples_);
  }

  // With a frame step of one sample there is nothing to pad; skip the call
  // instead of building a zero-column (or negatively sized) matrix.
  if (required_padding_samples <= 0) {
    return absl::OkStatus();
  }

  return ProcessVector(
      Matrix::Zero(num_input_channels_, required_padding_samples), cc);
}

}  // namespace mediapipe