mediapipe/mediapipe2/calculators/audio/spectrogram_calculator_test.cc
2021-06-10 23:01:19 +00:00

896 lines
35 KiB
C++

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <cmath>
#include <complex>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "Eigen/Core"
#include "audio/dsp/number_util.h"
#include "mediapipe/calculators/audio/spectrogram_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/calculator_runner.h"
#include "mediapipe/framework/formats/matrix.h"
#include "mediapipe/framework/formats/time_series_header.pb.h"
#include "mediapipe/framework/port/benchmark.h"
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/time_series_test_util.h"
namespace mediapipe {
namespace {
const int kInitialTimestampOffsetMicroseconds = 4;
class SpectrogramCalculatorTest
: public TimeSeriesCalculatorTest<SpectrogramCalculatorOptions> {
protected:
void SetUp() override {
calculator_name_ = "SpectrogramCalculator";
input_sample_rate_ = 4000.0;
num_input_channels_ = 1;
}
// Initializes and runs the test graph.
absl::Status Run() {
// Now that options are set, we can set up some internal constants.
frame_duration_samples_ =
round(options_.frame_duration_seconds() * input_sample_rate_);
frame_step_samples_ =
frame_duration_samples_ -
round(options_.frame_overlap_seconds() * input_sample_rate_);
// The magnitude of the 0th FFT bin (DC) should be sum(input.*window);
// for an input identically 1.0, this is just sum(window). The average
// value of our Hann window is 0.5, hence this is the expected squared-
// magnitude output value in the DC bin for constant input of 1.0.
expected_dc_squared_magnitude_ =
pow((static_cast<float>(frame_duration_samples_) * 0.5), 2.0);
return RunGraph();
}
// Creates test multichannel input with specified packet sizes and containing
// a constant-frequency sinusoid that maintains phase between adjacent
// packets.
void SetupCosineInputPackets(const std::vector<int>& packet_sizes_samples,
float cosine_frequency_hz) {
int total_num_input_samples = 0;
for (int packet_size_samples : packet_sizes_samples) {
double packet_start_time_seconds =
kInitialTimestampOffsetMicroseconds * 1e-6 +
total_num_input_samples / input_sample_rate_;
double packet_end_time_seconds =
packet_start_time_seconds + packet_size_samples / input_sample_rate_;
double angular_freq = 2 * M_PI * cosine_frequency_hz;
Matrix* packet_data =
new Matrix(num_input_channels_, packet_size_samples);
// Use Eigen's vectorized cos() function to fill the vector with a
// sinusoid of appropriate frequency & phase.
for (int i = 0; i < num_input_channels_; i++) {
packet_data->row(i) =
Eigen::ArrayXf::LinSpaced(packet_size_samples,
packet_start_time_seconds * angular_freq,
packet_end_time_seconds * angular_freq)
.cos()
.transpose();
}
int64 input_timestamp = round(packet_start_time_seconds *
Timestamp::kTimestampUnitsPerSecond);
AppendInputPacket(packet_data, input_timestamp);
total_num_input_samples += packet_size_samples;
}
}
// Setup a sequence of input packets of specified sizes, each filled
// with samples of 1.0.
void SetupConstantInputPackets(const std::vector<int>& packet_sizes_samples) {
// A 0 Hz cosine is identically 1.0 for all samples.
SetupCosineInputPackets(packet_sizes_samples, 0.0);
}
// Setup a sequence of input packets of specified sizes, each containing a
// single sample of 1.0 at a specified offset.
void SetupImpulseInputPackets(
const std::vector<int>& packet_sizes_samples,
const std::vector<int>& impulse_offsets_samples) {
int total_num_input_samples = 0;
for (int i = 0; i < packet_sizes_samples.size(); ++i) {
double packet_start_time_seconds =
kInitialTimestampOffsetMicroseconds * 1e-6 +
total_num_input_samples / input_sample_rate_;
int64 input_timestamp = round(packet_start_time_seconds *
Timestamp::kTimestampUnitsPerSecond);
std::unique_ptr<Matrix> impulse(
new Matrix(Matrix::Zero(1, packet_sizes_samples[i])));
(*impulse)(0, impulse_offsets_samples[i]) = 1.0;
AppendInputPacket(impulse.release(), input_timestamp);
total_num_input_samples += packet_sizes_samples[i];
}
}
// Creates test multichannel input with specified packet sizes and containing
// constant input packets for the even channels and constant-frequency
// sinusoid that maintains phase between adjacent packets for the odd
// channels.
void SetupMultichannelInputPackets(
const std::vector<int>& packet_sizes_samples, float cosine_frequency_hz) {
int total_num_input_samples = 0;
for (int packet_size_samples : packet_sizes_samples) {
double packet_start_time_seconds =
kInitialTimestampOffsetMicroseconds * 1e-6 +
total_num_input_samples / input_sample_rate_;
double packet_end_time_seconds =
packet_start_time_seconds + packet_size_samples / input_sample_rate_;
double angular_freq;
Matrix* packet_data =
new Matrix(num_input_channels_, packet_size_samples);
// Use Eigen's vectorized cos() function to fill the vector with a
// sinusoid of appropriate frequency & phase.
for (int i = 0; i < num_input_channels_; i++) {
if (i % 2 == 0) {
angular_freq = 0;
} else {
angular_freq = 2 * M_PI * cosine_frequency_hz;
}
packet_data->row(i) =
Eigen::ArrayXf::LinSpaced(packet_size_samples,
packet_start_time_seconds * angular_freq,
packet_end_time_seconds * angular_freq)
.cos()
.transpose();
}
int64 input_timestamp = round(packet_start_time_seconds *
Timestamp::kTimestampUnitsPerSecond);
AppendInputPacket(packet_data, input_timestamp);
total_num_input_samples += packet_size_samples;
}
}
// Return vector of the numbers of frames in each output packet.
std::vector<int> OutputFramesPerPacket() {
std::vector<int> frame_counts;
for (const Packet& packet : output().packets) {
const Matrix& matrix = packet.Get<Matrix>();
frame_counts.push_back(matrix.cols());
}
return frame_counts;
}
// Checks output headers and Timestamps.
void CheckOutputHeadersAndTimestamps() {
const int fft_size = audio_dsp::NextPowerOfTwo(frame_duration_samples_);
TimeSeriesHeader expected_header = input().header.Get<TimeSeriesHeader>();
expected_header.set_num_channels(fft_size / 2 + 1);
// The output header sample rate should depend on the output frame step.
expected_header.set_sample_rate(input_sample_rate_ / frame_step_samples_);
// SpectrogramCalculator stores the sample rate of the input in
// the TimeSeriesHeader.
expected_header.set_audio_sample_rate(input_sample_rate_);
// We expect the output header to have num_samples and packet_rate unset.
expected_header.clear_num_samples();
expected_header.clear_packet_rate();
if (!options_.allow_multichannel_input()) {
ExpectOutputHeaderEquals(expected_header);
} else {
EXPECT_THAT(output()
.header.template Get<MultiStreamTimeSeriesHeader>()
.time_series_header(),
mediapipe::EqualsProto(expected_header));
EXPECT_THAT(output()
.header.template Get<MultiStreamTimeSeriesHeader>()
.num_streams(),
num_input_channels_);
}
int cumulative_output_frames = 0;
// The timestamps coming out of the spectrogram correspond to the
// middle of the first frame's window, hence frame_duration_samples_/2
// term. We use frame_duration_samples_ because that is how it is
// actually quantized inside spectrogram.
const double packet_timestamp_offset_seconds =
kInitialTimestampOffsetMicroseconds * 1e-6;
const double frame_step_seconds = frame_step_samples_ / input_sample_rate_;
Timestamp initial_timestamp = Timestamp::Unstarted();
for (const Packet& packet : output().packets) {
// This is the timestamp we expect based on how the spectrogram should
// behave (advancing by one step's worth of input samples each frame).
const double expected_timestamp_seconds =
packet_timestamp_offset_seconds +
cumulative_output_frames * frame_step_seconds;
const int64 expected_timestamp_ticks =
expected_timestamp_seconds * Timestamp::kTimestampUnitsPerSecond;
EXPECT_EQ(expected_timestamp_ticks, packet.Timestamp().Value());
// Accept the timestamp of the first packet as the baseline for checking
// the remainder.
if (initial_timestamp == Timestamp::Unstarted()) {
initial_timestamp = packet.Timestamp();
}
// Also check that the timestamp is consistent with the sample_rate
// in the output stream's TimeSeriesHeader.
EXPECT_TRUE(time_series_util::LogWarningIfTimestampIsInconsistent(
packet.Timestamp(), initial_timestamp, cumulative_output_frames,
expected_header.sample_rate()));
if (!options_.allow_multichannel_input()) {
if (options_.output_type() == SpectrogramCalculatorOptions::COMPLEX) {
const Eigen::MatrixXcf& matrix = packet.Get<Eigen::MatrixXcf>();
cumulative_output_frames += matrix.cols();
} else {
const Matrix& matrix = packet.Get<Matrix>();
cumulative_output_frames += matrix.cols();
}
} else {
if (options_.output_type() == SpectrogramCalculatorOptions::COMPLEX) {
const Eigen::MatrixXcf& matrix =
packet.Get<std::vector<Eigen::MatrixXcf>>().at(0);
cumulative_output_frames += matrix.cols();
} else {
const Matrix& matrix = packet.Get<std::vector<Matrix>>().at(0);
cumulative_output_frames += matrix.cols();
}
}
}
}
// Verify that the bin corresponding to the specified frequency
// is the largest one in one particular frame of a single packet.
void CheckPeakFrequencyInPacketFrame(const Packet& packet, int frame,
float frequency) {
const int fft_size = audio_dsp::NextPowerOfTwo(frame_duration_samples_);
const int target_bin =
round((frequency / input_sample_rate_) * static_cast<float>(fft_size));
const Matrix& matrix = packet.Get<Matrix>();
// Stop here if the requested frame is not in this packet.
ASSERT_GT(matrix.cols(), frame);
int actual_largest_bin;
matrix.col(frame).maxCoeff(&actual_largest_bin);
EXPECT_EQ(actual_largest_bin, target_bin);
}
// Verify that the bin corresponding to the specified frequency
// is the largest one in one particular frame of a single spectrogram Matrix.
void CheckPeakFrequencyInMatrix(const Matrix& matrix, int frame,
float frequency) {
const int fft_size = audio_dsp::NextPowerOfTwo(frame_duration_samples_);
const int target_bin =
round((frequency / input_sample_rate_) * static_cast<float>(fft_size));
// Stop here if the requested frame is not in this packet.
ASSERT_GT(matrix.cols(), frame);
int actual_largest_bin;
matrix.col(frame).maxCoeff(&actual_largest_bin);
EXPECT_EQ(actual_largest_bin, target_bin);
}
int frame_duration_samples_;
int frame_step_samples_;
// Expected DC output for a window of pure 1.0, set when window length
// is set.
float expected_dc_squared_magnitude_;
};
TEST_F(SpectrogramCalculatorTest, IntegerFrameDurationNoOverlap) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(0.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
const std::vector<int> input_packet_sizes = {500, 200};
const std::vector<int> expected_output_packet_sizes = {5, 2};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, IntegerFrameDurationSomeOverlap) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
const std::vector<int> input_packet_sizes = {500, 200};
// complete_output_frames = 1 + floor((input_samples - window_length)/step)
// = 1 + floor((500 - 100)/40) = 1 + 10 = 11 for the first packet
// = 1 + floor((700 - 100)/40) = 1 + 15 = 16 for the whole stream
// so expect 16 - 11 = 5 in the second packet.
const std::vector<int> expected_output_packet_sizes = {11, 5};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, NonintegerFrameDurationAndOverlap) {
options_.set_frame_duration_seconds(98.5 / input_sample_rate_);
options_.set_frame_overlap_seconds(58.4 / input_sample_rate_);
options_.set_pad_final_packet(false);
const std::vector<int> input_packet_sizes = {500, 200};
// now frame_duration_samples will be 99 (rounded), and frame_step_samples
// will be (99-58) = 41, so the first packet of 500 samples will generate
// 1 + floor(500-99)/41 = 10 samples.
const std::vector<int> expected_output_packet_sizes = {10, 5};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, ShortInitialPacketNoOverlap) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(0.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
const std::vector<int> input_packet_sizes = {90, 100, 110};
// The first input packet is too small to generate any frames,
// but zero-length packets would result in a timestamp monotonicity
// violation, so they are suppressed. Thus, only the second and third
// input packets generate output packets.
const std::vector<int> expected_output_packet_sizes = {1, 2};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, TrailingSamplesNoPad) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
const std::vector<int> input_packet_sizes = {140, 90};
const std::vector<int> expected_output_packet_sizes = {2, 2};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, NoTrailingSamplesWithPad) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(true);
const std::vector<int> input_packet_sizes = {140, 80};
const std::vector<int> expected_output_packet_sizes = {2, 2};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, TrailingSamplesWithPad) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(true);
const std::vector<int> input_packet_sizes = {140, 90};
// In contrast to NoTrailingSamplesWithPad and TrailingSamplesNoPad,
// this time we get an extra frame in an extra final packet.
const std::vector<int> expected_output_packet_sizes = {2, 2, 1};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, VeryShortInputWillPad) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(true);
const std::vector<int> input_packet_sizes = {30};
const std::vector<int> expected_output_packet_sizes = {1};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, VeryShortInputZeroOutputFramesIfNoPad) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
const std::vector<int> input_packet_sizes = {90};
const std::vector<int> expected_output_packet_sizes = {};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, DCSignalIsPeakBin) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
const std::vector<int> input_packet_sizes = {140}; // Gives 2 output frames.
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
const float dc_frequency_hz = 0.0;
CheckPeakFrequencyInPacketFrame(output().packets[0], 0, dc_frequency_hz);
CheckPeakFrequencyInPacketFrame(output().packets[0], 1, dc_frequency_hz);
}
TEST_F(SpectrogramCalculatorTest, A440ToneIsPeakBin) {
const std::vector<int> input_packet_sizes = {
460}; // 100 + 9*40 for 10 frames.
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
InitializeGraph();
FillInputHeader();
const float tone_frequency_hz = 440.0;
SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
int num_output_frames = output().packets[0].Get<Matrix>().cols();
for (int frame = 0; frame < num_output_frames; ++frame) {
CheckPeakFrequencyInPacketFrame(output().packets[0], frame,
tone_frequency_hz);
}
}
TEST_F(SpectrogramCalculatorTest, SquaredMagnitudeOutputLooksRight) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_output_type(SpectrogramCalculatorOptions::SQUARED_MAGNITUDE);
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_FLOAT_EQ(output().packets[0].Get<Matrix>()(0, 0),
expected_dc_squared_magnitude_);
}
TEST_F(SpectrogramCalculatorTest, DefaultOutputIsSquaredMagnitude) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
// Let the output_type be its default
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_FLOAT_EQ(output().packets[0].Get<Matrix>()(0, 0),
expected_dc_squared_magnitude_);
}
TEST_F(SpectrogramCalculatorTest, LinearMagnitudeOutputLooksRight) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_output_type(SpectrogramCalculatorOptions::LINEAR_MAGNITUDE);
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_FLOAT_EQ(output().packets[0].Get<Matrix>()(0, 0),
std::sqrt(expected_dc_squared_magnitude_));
}
TEST_F(SpectrogramCalculatorTest, DbMagnitudeOutputLooksRight) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_output_type(SpectrogramCalculatorOptions::DECIBELS);
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_FLOAT_EQ(output().packets[0].Get<Matrix>()(0, 0),
10.0 * std::log10(expected_dc_squared_magnitude_));
}
TEST_F(SpectrogramCalculatorTest, OutputScalingLooksRight) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_output_type(SpectrogramCalculatorOptions::DECIBELS);
double output_scale = 2.5;
options_.set_output_scale(output_scale);
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_FLOAT_EQ(
output().packets[0].Get<Matrix>()(0, 0),
output_scale * 10.0 * std::log10(expected_dc_squared_magnitude_));
}
TEST_F(SpectrogramCalculatorTest, ComplexOutputLooksRight) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_output_type(SpectrogramCalculatorOptions::COMPLEX);
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Setup packets with DC input (non-zero constant value).
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_FLOAT_EQ(std::norm(output().packets[0].Get<Eigen::MatrixXcf>()(0, 0)),
expected_dc_squared_magnitude_);
}
TEST_F(SpectrogramCalculatorTest, ComplexOutputLooksRightForImpulses) {
const int frame_size_samples = 100;
options_.set_frame_duration_seconds(frame_size_samples / input_sample_rate_);
options_.set_frame_overlap_seconds(0.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
options_.set_output_type(SpectrogramCalculatorOptions::COMPLEX);
const std::vector<int> input_packet_sizes = {frame_size_samples,
frame_size_samples};
const std::vector<int> input_packet_impulse_offsets = {49, 50};
InitializeGraph();
FillInputHeader();
// Make two impulse packets offset one sample from each other
SetupImpulseInputPackets(input_packet_sizes, input_packet_impulse_offsets);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
const int num_buckets =
(audio_dsp::NextPowerOfTwo(frame_size_samples) / 2) + 1;
const float precision = 0.01f;
auto norm_fn = [](const std::complex<float>& cf) { return std::norm(cf); };
// Both impulses should have (approximately) constant power across all
// frequency bins
EXPECT_TRUE(output()
.packets[0]
.Get<Eigen::MatrixXcf>()
.unaryExpr(norm_fn)
.isApproxToConstant(1.0f, precision));
EXPECT_TRUE(output()
.packets[1]
.Get<Eigen::MatrixXcf>()
.unaryExpr(norm_fn)
.isApproxToConstant(1.0f, precision));
// Because the second Packet's impulse is delayed by exactly one sample with
// respect to the first Packet's impulse, the second impulse should have
// greater phase, and in the highest frequency bin, the real part should
// (approximately) flip sign from the first Packet to the second
EXPECT_LT(std::arg(output().packets[0].Get<Eigen::MatrixXcf>()(1, 0)),
std::arg(output().packets[1].Get<Eigen::MatrixXcf>()(1, 0)));
const float highest_bucket_real_ratio =
output().packets[0].Get<Eigen::MatrixXcf>()(num_buckets - 1, 0).real() /
output().packets[1].Get<Eigen::MatrixXcf>()(num_buckets - 1, 0).real();
EXPECT_NEAR(highest_bucket_real_ratio, -1.0f, precision);
}
TEST_F(SpectrogramCalculatorTest, SquaredMagnitudeOutputLooksRightForNonDC) {
const int frame_size_samples = 100;
options_.set_frame_duration_seconds(frame_size_samples / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_output_type(SpectrogramCalculatorOptions::SQUARED_MAGNITUDE);
const std::vector<int> input_packet_sizes = {140};
InitializeGraph();
FillInputHeader();
// Make the tone have an integral number of cycles within the window
const int target_bin = 16;
const int fft_size = audio_dsp::NextPowerOfTwo(frame_size_samples);
const float tone_frequency_hz = target_bin * (input_sample_rate_ / fft_size);
SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
// For a non-DC bin, the magnitude will be split between positive and
// negative frequency bins, so it should about be half-magnitude
// = quarter-power.
// It's not quite exact because of the interference from the hann(100)
// spread from the negative-frequency half.
EXPECT_GT(output().packets[0].Get<Matrix>()(target_bin, 0),
0.98 * expected_dc_squared_magnitude_ / 4.0);
EXPECT_LT(output().packets[0].Get<Matrix>()(target_bin, 0),
1.02 * expected_dc_squared_magnitude_ / 4.0);
}
TEST_F(SpectrogramCalculatorTest, ZeroOutputsForZeroInputsWithPaddingEnabled) {
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(true);
const std::vector<int> input_packet_sizes = {};
const std::vector<int> expected_output_packet_sizes = {};
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes);
}
TEST_F(SpectrogramCalculatorTest, NumChannelsIsRight) {
const std::vector<int> input_packet_sizes = {460};
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
options_.set_allow_multichannel_input(true);
num_input_channels_ = 3;
InitializeGraph();
FillInputHeader();
const float tone_frequency_hz = 440.0;
SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
EXPECT_EQ(output().packets[0].Get<std::vector<Matrix>>().size(),
num_input_channels_);
}
TEST_F(SpectrogramCalculatorTest, NumSamplesAndPacketRateAreCleared) {
num_input_samples_ = 500;
input_packet_rate_ = 1.0;
const std::vector<int> input_packet_sizes = {num_input_samples_};
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(0.0);
options_.set_pad_final_packet(false);
InitializeGraph();
FillInputHeader();
SetupConstantInputPackets(input_packet_sizes);
MP_ASSERT_OK(Run());
const TimeSeriesHeader& output_header =
output().header.Get<TimeSeriesHeader>();
EXPECT_FALSE(output_header.has_num_samples());
EXPECT_FALSE(output_header.has_packet_rate());
}
TEST_F(SpectrogramCalculatorTest, MultichannelSpectrogramSizesAreRight) {
const std::vector<int> input_packet_sizes = {420}; // less than 10 frames
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
options_.set_allow_multichannel_input(true);
num_input_channels_ = 10;
InitializeGraph();
FillInputHeader();
const float tone_frequency_hz = 440.0;
SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
auto spectrograms = output().packets[0].Get<std::vector<Matrix>>();
EXPECT_FLOAT_EQ(spectrograms.size(), num_input_channels_);
int spectrogram_num_rows = spectrograms[0].rows();
int spectrogram_num_cols = spectrograms[0].cols();
for (int i = 1; i < num_input_channels_; i++) {
EXPECT_EQ(spectrogram_num_rows, spectrograms[i].rows());
EXPECT_EQ(spectrogram_num_cols, spectrograms[i].cols());
}
}
TEST_F(SpectrogramCalculatorTest, MultichannelSpectrogramValuesAreRight) {
const std::vector<int> input_packet_sizes = {
460}; // 100 + 9*40 for 10 frames.
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_allow_multichannel_input(true);
num_input_channels_ = 10;
InitializeGraph();
FillInputHeader();
const float tone_frequency_hz = 440.0;
SetupMultichannelInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
auto spectrograms = output().packets[0].Get<std::vector<Matrix>>();
int num_output_frames = spectrograms[0].cols();
for (int i = 0; i < num_input_channels_; i++) {
for (int frame = 0; frame < num_output_frames; ++frame) {
if (i % 2 == 0) {
CheckPeakFrequencyInMatrix(spectrograms[i], frame, 0);
} else {
CheckPeakFrequencyInMatrix(spectrograms[i], frame, tone_frequency_hz);
}
}
}
}
TEST_F(SpectrogramCalculatorTest, MultichannelHandlesShortInitialPacket) {
// First packet is less than one frame, but second packet should trigger a
// complete frame from all channels.
const std::vector<int> input_packet_sizes = {50, 50};
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
options_.set_allow_multichannel_input(true);
num_input_channels_ = 2;
InitializeGraph();
FillInputHeader();
const float tone_frequency_hz = 440.0;
SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
auto spectrograms = output().packets[0].Get<std::vector<Matrix>>();
EXPECT_FLOAT_EQ(spectrograms.size(), num_input_channels_);
int spectrogram_num_rows = spectrograms[0].rows();
int spectrogram_num_cols = spectrograms[0].cols();
for (int i = 1; i < num_input_channels_; i++) {
EXPECT_EQ(spectrogram_num_rows, spectrograms[i].rows());
EXPECT_EQ(spectrogram_num_cols, spectrograms[i].cols());
}
}
TEST_F(SpectrogramCalculatorTest,
MultichannelComplexHandlesShortInitialPacket) {
// First packet is less than one frame, but second packet should trigger a
// complete frame from all channels, even for complex output.
const std::vector<int> input_packet_sizes = {50, 50};
options_.set_frame_duration_seconds(100.0 / input_sample_rate_);
options_.set_frame_overlap_seconds(60.0 / input_sample_rate_);
options_.set_pad_final_packet(false);
options_.set_allow_multichannel_input(true);
options_.set_output_type(SpectrogramCalculatorOptions::COMPLEX);
num_input_channels_ = 2;
InitializeGraph();
FillInputHeader();
const float tone_frequency_hz = 440.0;
SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz);
MP_ASSERT_OK(Run());
CheckOutputHeadersAndTimestamps();
auto spectrograms = output().packets[0].Get<std::vector<Eigen::MatrixXcf>>();
EXPECT_FLOAT_EQ(spectrograms.size(), num_input_channels_);
int spectrogram_num_rows = spectrograms[0].rows();
int spectrogram_num_cols = spectrograms[0].cols();
for (int i = 1; i < num_input_channels_; i++) {
EXPECT_EQ(spectrogram_num_rows, spectrograms[i].rows());
EXPECT_EQ(spectrogram_num_cols, spectrograms[i].cols());
}
}
void BM_ProcessDC(benchmark::State& state) {
CalculatorGraphConfig::Node node_config;
node_config.set_calculator("SpectrogramCalculator");
node_config.add_input_stream("input_audio");
node_config.add_output_stream("output_spectrogram");
SpectrogramCalculatorOptions* options =
node_config.mutable_options()->MutableExtension(
SpectrogramCalculatorOptions::ext);
options->set_frame_duration_seconds(0.010);
options->set_frame_overlap_seconds(0.0);
options->set_pad_final_packet(false);
*node_config.mutable_options()->MutableExtension(
SpectrogramCalculatorOptions::ext) = *options;
int num_input_channels = 1;
int packet_size_samples = 1600000;
TimeSeriesHeader* header = new TimeSeriesHeader();
header->set_sample_rate(16000.0);
header->set_num_channels(num_input_channels);
CalculatorRunner runner(node_config);
runner.MutableInputs()->Index(0).header = Adopt(header);
Matrix* payload = new Matrix(
Matrix::Constant(num_input_channels, packet_size_samples, 1.0));
Timestamp timestamp = Timestamp(0);
runner.MutableInputs()->Index(0).packets.push_back(
Adopt(payload).At(timestamp));
for (auto _ : state) {
ASSERT_TRUE(runner.Run().ok());
}
const CalculatorRunner::StreamContents& output = runner.Outputs().Index(0);
const Matrix& output_matrix = output.packets[0].Get<Matrix>();
LOG(INFO) << "Output matrix=" << output_matrix.rows() << "x"
<< output_matrix.cols();
LOG(INFO) << "First values=" << output_matrix(0, 0) << ", "
<< output_matrix(1, 0) << ", " << output_matrix(2, 0) << ", "
<< output_matrix(3, 0);
}
BENCHMARK(BM_ProcessDC);
} // anonymous namespace
} // namespace mediapipe