// Copyright 2019 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include #include #include #include #include #include "Eigen/Core" #include "audio/dsp/number_util.h" #include "mediapipe/calculators/audio/spectrogram_calculator.pb.h" #include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/calculator_runner.h" #include "mediapipe/framework/formats/matrix.h" #include "mediapipe/framework/formats/time_series_header.pb.h" #include "mediapipe/framework/port/benchmark.h" #include "mediapipe/framework/port/gmock.h" #include "mediapipe/framework/port/gtest.h" #include "mediapipe/framework/port/integral_types.h" #include "mediapipe/framework/port/status.h" #include "mediapipe/util/time_series_test_util.h" namespace mediapipe { namespace { const int kInitialTimestampOffsetMicroseconds = 4; class SpectrogramCalculatorTest : public TimeSeriesCalculatorTest { protected: void SetUp() override { calculator_name_ = "SpectrogramCalculator"; input_sample_rate_ = 4000.0; num_input_channels_ = 1; } // Initializes and runs the test graph. absl::Status Run() { // Now that options are set, we can set up some internal constants. frame_duration_samples_ = round(options_.frame_duration_seconds() * input_sample_rate_); frame_step_samples_ = frame_duration_samples_ - round(options_.frame_overlap_seconds() * input_sample_rate_); // The magnitude of the 0th FFT bin (DC) should be sum(input.*window); // for an input identically 1.0, this is just sum(window). The average // value of our Hann window is 0.5, hence this is the expected squared- // magnitude output value in the DC bin for constant input of 1.0. expected_dc_squared_magnitude_ = pow((static_cast(frame_duration_samples_) * 0.5), 2.0); return RunGraph(); } // Creates test multichannel input with specified packet sizes and containing // a constant-frequency sinusoid that maintains phase between adjacent // packets. void SetupCosineInputPackets(const std::vector& packet_sizes_samples, float cosine_frequency_hz) { int total_num_input_samples = 0; for (int packet_size_samples : packet_sizes_samples) { double packet_start_time_seconds = kInitialTimestampOffsetMicroseconds * 1e-6 + total_num_input_samples / input_sample_rate_; double packet_end_time_seconds = packet_start_time_seconds + packet_size_samples / input_sample_rate_; double angular_freq = 2 * M_PI * cosine_frequency_hz; Matrix* packet_data = new Matrix(num_input_channels_, packet_size_samples); // Use Eigen's vectorized cos() function to fill the vector with a // sinusoid of appropriate frequency & phase. for (int i = 0; i < num_input_channels_; i++) { packet_data->row(i) = Eigen::ArrayXf::LinSpaced(packet_size_samples, packet_start_time_seconds * angular_freq, packet_end_time_seconds * angular_freq) .cos() .transpose(); } int64 input_timestamp = round(packet_start_time_seconds * Timestamp::kTimestampUnitsPerSecond); AppendInputPacket(packet_data, input_timestamp); total_num_input_samples += packet_size_samples; } } // Setup a sequence of input packets of specified sizes, each filled // with samples of 1.0. void SetupConstantInputPackets(const std::vector& packet_sizes_samples) { // A 0 Hz cosine is identically 1.0 for all samples. SetupCosineInputPackets(packet_sizes_samples, 0.0); } // Setup a sequence of input packets of specified sizes, each containing a // single sample of 1.0 at a specified offset. void SetupImpulseInputPackets( const std::vector& packet_sizes_samples, const std::vector& impulse_offsets_samples) { int total_num_input_samples = 0; for (int i = 0; i < packet_sizes_samples.size(); ++i) { double packet_start_time_seconds = kInitialTimestampOffsetMicroseconds * 1e-6 + total_num_input_samples / input_sample_rate_; int64 input_timestamp = round(packet_start_time_seconds * Timestamp::kTimestampUnitsPerSecond); std::unique_ptr impulse( new Matrix(Matrix::Zero(1, packet_sizes_samples[i]))); (*impulse)(0, impulse_offsets_samples[i]) = 1.0; AppendInputPacket(impulse.release(), input_timestamp); total_num_input_samples += packet_sizes_samples[i]; } } // Creates test multichannel input with specified packet sizes and containing // constant input packets for the even channels and constant-frequency // sinusoid that maintains phase between adjacent packets for the odd // channels. void SetupMultichannelInputPackets( const std::vector& packet_sizes_samples, float cosine_frequency_hz) { int total_num_input_samples = 0; for (int packet_size_samples : packet_sizes_samples) { double packet_start_time_seconds = kInitialTimestampOffsetMicroseconds * 1e-6 + total_num_input_samples / input_sample_rate_; double packet_end_time_seconds = packet_start_time_seconds + packet_size_samples / input_sample_rate_; double angular_freq; Matrix* packet_data = new Matrix(num_input_channels_, packet_size_samples); // Use Eigen's vectorized cos() function to fill the vector with a // sinusoid of appropriate frequency & phase. for (int i = 0; i < num_input_channels_; i++) { if (i % 2 == 0) { angular_freq = 0; } else { angular_freq = 2 * M_PI * cosine_frequency_hz; } packet_data->row(i) = Eigen::ArrayXf::LinSpaced(packet_size_samples, packet_start_time_seconds * angular_freq, packet_end_time_seconds * angular_freq) .cos() .transpose(); } int64 input_timestamp = round(packet_start_time_seconds * Timestamp::kTimestampUnitsPerSecond); AppendInputPacket(packet_data, input_timestamp); total_num_input_samples += packet_size_samples; } } // Return vector of the numbers of frames in each output packet. std::vector OutputFramesPerPacket() { std::vector frame_counts; for (const Packet& packet : output().packets) { const Matrix& matrix = packet.Get(); frame_counts.push_back(matrix.cols()); } return frame_counts; } // Checks output headers and Timestamps. void CheckOutputHeadersAndTimestamps() { const int fft_size = audio_dsp::NextPowerOfTwo(frame_duration_samples_); TimeSeriesHeader expected_header = input().header.Get(); expected_header.set_num_channels(fft_size / 2 + 1); // The output header sample rate should depend on the output frame step. expected_header.set_sample_rate(input_sample_rate_ / frame_step_samples_); // SpectrogramCalculator stores the sample rate of the input in // the TimeSeriesHeader. expected_header.set_audio_sample_rate(input_sample_rate_); // We expect the output header to have num_samples and packet_rate unset. expected_header.clear_num_samples(); expected_header.clear_packet_rate(); if (!options_.allow_multichannel_input()) { ExpectOutputHeaderEquals(expected_header); } else { EXPECT_THAT(output() .header.template Get() .time_series_header(), mediapipe::EqualsProto(expected_header)); EXPECT_THAT(output() .header.template Get() .num_streams(), num_input_channels_); } int cumulative_output_frames = 0; // The timestamps coming out of the spectrogram correspond to the // middle of the first frame's window, hence frame_duration_samples_/2 // term. We use frame_duration_samples_ because that is how it is // actually quantized inside spectrogram. const double packet_timestamp_offset_seconds = kInitialTimestampOffsetMicroseconds * 1e-6; const double frame_step_seconds = frame_step_samples_ / input_sample_rate_; Timestamp initial_timestamp = Timestamp::Unstarted(); for (const Packet& packet : output().packets) { // This is the timestamp we expect based on how the spectrogram should // behave (advancing by one step's worth of input samples each frame). const double expected_timestamp_seconds = packet_timestamp_offset_seconds + cumulative_output_frames * frame_step_seconds; const int64 expected_timestamp_ticks = expected_timestamp_seconds * Timestamp::kTimestampUnitsPerSecond; EXPECT_EQ(expected_timestamp_ticks, packet.Timestamp().Value()); // Accept the timestamp of the first packet as the baseline for checking // the remainder. if (initial_timestamp == Timestamp::Unstarted()) { initial_timestamp = packet.Timestamp(); } // Also check that the timestamp is consistent with the sample_rate // in the output stream's TimeSeriesHeader. EXPECT_TRUE(time_series_util::LogWarningIfTimestampIsInconsistent( packet.Timestamp(), initial_timestamp, cumulative_output_frames, expected_header.sample_rate())); if (!options_.allow_multichannel_input()) { if (options_.output_type() == SpectrogramCalculatorOptions::COMPLEX) { const Eigen::MatrixXcf& matrix = packet.Get(); cumulative_output_frames += matrix.cols(); } else { const Matrix& matrix = packet.Get(); cumulative_output_frames += matrix.cols(); } } else { if (options_.output_type() == SpectrogramCalculatorOptions::COMPLEX) { const Eigen::MatrixXcf& matrix = packet.Get>().at(0); cumulative_output_frames += matrix.cols(); } else { const Matrix& matrix = packet.Get>().at(0); cumulative_output_frames += matrix.cols(); } } } } // Verify that the bin corresponding to the specified frequency // is the largest one in one particular frame of a single packet. void CheckPeakFrequencyInPacketFrame(const Packet& packet, int frame, float frequency) { const int fft_size = audio_dsp::NextPowerOfTwo(frame_duration_samples_); const int target_bin = round((frequency / input_sample_rate_) * static_cast(fft_size)); const Matrix& matrix = packet.Get(); // Stop here if the requested frame is not in this packet. ASSERT_GT(matrix.cols(), frame); int actual_largest_bin; matrix.col(frame).maxCoeff(&actual_largest_bin); EXPECT_EQ(actual_largest_bin, target_bin); } // Verify that the bin corresponding to the specified frequency // is the largest one in one particular frame of a single spectrogram Matrix. void CheckPeakFrequencyInMatrix(const Matrix& matrix, int frame, float frequency) { const int fft_size = audio_dsp::NextPowerOfTwo(frame_duration_samples_); const int target_bin = round((frequency / input_sample_rate_) * static_cast(fft_size)); // Stop here if the requested frame is not in this packet. ASSERT_GT(matrix.cols(), frame); int actual_largest_bin; matrix.col(frame).maxCoeff(&actual_largest_bin); EXPECT_EQ(actual_largest_bin, target_bin); } int frame_duration_samples_; int frame_step_samples_; // Expected DC output for a window of pure 1.0, set when window length // is set. float expected_dc_squared_magnitude_; }; TEST_F(SpectrogramCalculatorTest, IntegerFrameDurationNoOverlap) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(0.0 / input_sample_rate_); options_.set_pad_final_packet(false); const std::vector input_packet_sizes = {500, 200}; const std::vector expected_output_packet_sizes = {5, 2}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, IntegerFrameDurationSomeOverlap) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); const std::vector input_packet_sizes = {500, 200}; // complete_output_frames = 1 + floor((input_samples - window_length)/step) // = 1 + floor((500 - 100)/40) = 1 + 10 = 11 for the first packet // = 1 + floor((700 - 100)/40) = 1 + 15 = 16 for the whole stream // so expect 16 - 11 = 5 in the second packet. const std::vector expected_output_packet_sizes = {11, 5}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, NonintegerFrameDurationAndOverlap) { options_.set_frame_duration_seconds(98.5 / input_sample_rate_); options_.set_frame_overlap_seconds(58.4 / input_sample_rate_); options_.set_pad_final_packet(false); const std::vector input_packet_sizes = {500, 200}; // now frame_duration_samples will be 99 (rounded), and frame_step_samples // will be (99-58) = 41, so the first packet of 500 samples will generate // 1 + floor(500-99)/41 = 10 samples. const std::vector expected_output_packet_sizes = {10, 5}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, ShortInitialPacketNoOverlap) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(0.0 / input_sample_rate_); options_.set_pad_final_packet(false); const std::vector input_packet_sizes = {90, 100, 110}; // The first input packet is too small to generate any frames, // but zero-length packets would result in a timestamp monotonicity // violation, so they are suppressed. Thus, only the second and third // input packets generate output packets. const std::vector expected_output_packet_sizes = {1, 2}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, TrailingSamplesNoPad) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); const std::vector input_packet_sizes = {140, 90}; const std::vector expected_output_packet_sizes = {2, 2}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, NoTrailingSamplesWithPad) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(true); const std::vector input_packet_sizes = {140, 80}; const std::vector expected_output_packet_sizes = {2, 2}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, TrailingSamplesWithPad) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(true); const std::vector input_packet_sizes = {140, 90}; // In contrast to NoTrailingSamplesWithPad and TrailingSamplesNoPad, // this time we get an extra frame in an extra final packet. const std::vector expected_output_packet_sizes = {2, 2, 1}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, VeryShortInputWillPad) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(true); const std::vector input_packet_sizes = {30}; const std::vector expected_output_packet_sizes = {1}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, VeryShortInputZeroOutputFramesIfNoPad) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); const std::vector input_packet_sizes = {90}; const std::vector expected_output_packet_sizes = {}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, DCSignalIsPeakBin) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); const std::vector input_packet_sizes = {140}; // Gives 2 output frames. InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); const float dc_frequency_hz = 0.0; CheckPeakFrequencyInPacketFrame(output().packets[0], 0, dc_frequency_hz); CheckPeakFrequencyInPacketFrame(output().packets[0], 1, dc_frequency_hz); } TEST_F(SpectrogramCalculatorTest, A440ToneIsPeakBin) { const std::vector input_packet_sizes = { 460}; // 100 + 9*40 for 10 frames. options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); InitializeGraph(); FillInputHeader(); const float tone_frequency_hz = 440.0; SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); int num_output_frames = output().packets[0].Get().cols(); for (int frame = 0; frame < num_output_frames; ++frame) { CheckPeakFrequencyInPacketFrame(output().packets[0], frame, tone_frequency_hz); } } TEST_F(SpectrogramCalculatorTest, SquaredMagnitudeOutputLooksRight) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_output_type(SpectrogramCalculatorOptions::SQUARED_MAGNITUDE); const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_FLOAT_EQ(output().packets[0].Get()(0, 0), expected_dc_squared_magnitude_); } TEST_F(SpectrogramCalculatorTest, DefaultOutputIsSquaredMagnitude) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); // Let the output_type be its default const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_FLOAT_EQ(output().packets[0].Get()(0, 0), expected_dc_squared_magnitude_); } TEST_F(SpectrogramCalculatorTest, LinearMagnitudeOutputLooksRight) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_output_type(SpectrogramCalculatorOptions::LINEAR_MAGNITUDE); const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_FLOAT_EQ(output().packets[0].Get()(0, 0), std::sqrt(expected_dc_squared_magnitude_)); } TEST_F(SpectrogramCalculatorTest, DbMagnitudeOutputLooksRight) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_output_type(SpectrogramCalculatorOptions::DECIBELS); const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_FLOAT_EQ(output().packets[0].Get()(0, 0), 10.0 * std::log10(expected_dc_squared_magnitude_)); } TEST_F(SpectrogramCalculatorTest, OutputScalingLooksRight) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_output_type(SpectrogramCalculatorOptions::DECIBELS); double output_scale = 2.5; options_.set_output_scale(output_scale); const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_FLOAT_EQ( output().packets[0].Get()(0, 0), output_scale * 10.0 * std::log10(expected_dc_squared_magnitude_)); } TEST_F(SpectrogramCalculatorTest, ComplexOutputLooksRight) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_output_type(SpectrogramCalculatorOptions::COMPLEX); const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Setup packets with DC input (non-zero constant value). SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_FLOAT_EQ(std::norm(output().packets[0].Get()(0, 0)), expected_dc_squared_magnitude_); } TEST_F(SpectrogramCalculatorTest, ComplexOutputLooksRightForImpulses) { const int frame_size_samples = 100; options_.set_frame_duration_seconds(frame_size_samples / input_sample_rate_); options_.set_frame_overlap_seconds(0.0 / input_sample_rate_); options_.set_pad_final_packet(false); options_.set_output_type(SpectrogramCalculatorOptions::COMPLEX); const std::vector input_packet_sizes = {frame_size_samples, frame_size_samples}; const std::vector input_packet_impulse_offsets = {49, 50}; InitializeGraph(); FillInputHeader(); // Make two impulse packets offset one sample from each other SetupImpulseInputPackets(input_packet_sizes, input_packet_impulse_offsets); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); const int num_buckets = (audio_dsp::NextPowerOfTwo(frame_size_samples) / 2) + 1; const float precision = 0.01f; auto norm_fn = [](const std::complex& cf) { return std::norm(cf); }; // Both impulses should have (approximately) constant power across all // frequency bins EXPECT_TRUE(output() .packets[0] .Get() .unaryExpr(norm_fn) .isApproxToConstant(1.0f, precision)); EXPECT_TRUE(output() .packets[1] .Get() .unaryExpr(norm_fn) .isApproxToConstant(1.0f, precision)); // Because the second Packet's impulse is delayed by exactly one sample with // respect to the first Packet's impulse, the second impulse should have // greater phase, and in the highest frequency bin, the real part should // (approximately) flip sign from the first Packet to the second EXPECT_LT(std::arg(output().packets[0].Get()(1, 0)), std::arg(output().packets[1].Get()(1, 0))); const float highest_bucket_real_ratio = output().packets[0].Get()(num_buckets - 1, 0).real() / output().packets[1].Get()(num_buckets - 1, 0).real(); EXPECT_NEAR(highest_bucket_real_ratio, -1.0f, precision); } TEST_F(SpectrogramCalculatorTest, SquaredMagnitudeOutputLooksRightForNonDC) { const int frame_size_samples = 100; options_.set_frame_duration_seconds(frame_size_samples / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_output_type(SpectrogramCalculatorOptions::SQUARED_MAGNITUDE); const std::vector input_packet_sizes = {140}; InitializeGraph(); FillInputHeader(); // Make the tone have an integral number of cycles within the window const int target_bin = 16; const int fft_size = audio_dsp::NextPowerOfTwo(frame_size_samples); const float tone_frequency_hz = target_bin * (input_sample_rate_ / fft_size); SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); // For a non-DC bin, the magnitude will be split between positive and // negative frequency bins, so it should about be half-magnitude // = quarter-power. // It's not quite exact because of the interference from the hann(100) // spread from the negative-frequency half. EXPECT_GT(output().packets[0].Get()(target_bin, 0), 0.98 * expected_dc_squared_magnitude_ / 4.0); EXPECT_LT(output().packets[0].Get()(target_bin, 0), 1.02 * expected_dc_squared_magnitude_ / 4.0); } TEST_F(SpectrogramCalculatorTest, ZeroOutputsForZeroInputsWithPaddingEnabled) { options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(true); const std::vector input_packet_sizes = {}; const std::vector expected_output_packet_sizes = {}; InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(OutputFramesPerPacket(), expected_output_packet_sizes); } TEST_F(SpectrogramCalculatorTest, NumChannelsIsRight) { const std::vector input_packet_sizes = {460}; options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); options_.set_allow_multichannel_input(true); num_input_channels_ = 3; InitializeGraph(); FillInputHeader(); const float tone_frequency_hz = 440.0; SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); EXPECT_EQ(output().packets[0].Get>().size(), num_input_channels_); } TEST_F(SpectrogramCalculatorTest, NumSamplesAndPacketRateAreCleared) { num_input_samples_ = 500; input_packet_rate_ = 1.0; const std::vector input_packet_sizes = {num_input_samples_}; options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(0.0); options_.set_pad_final_packet(false); InitializeGraph(); FillInputHeader(); SetupConstantInputPackets(input_packet_sizes); MP_ASSERT_OK(Run()); const TimeSeriesHeader& output_header = output().header.Get(); EXPECT_FALSE(output_header.has_num_samples()); EXPECT_FALSE(output_header.has_packet_rate()); } TEST_F(SpectrogramCalculatorTest, MultichannelSpectrogramSizesAreRight) { const std::vector input_packet_sizes = {420}; // less than 10 frames options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); options_.set_allow_multichannel_input(true); num_input_channels_ = 10; InitializeGraph(); FillInputHeader(); const float tone_frequency_hz = 440.0; SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); auto spectrograms = output().packets[0].Get>(); EXPECT_FLOAT_EQ(spectrograms.size(), num_input_channels_); int spectrogram_num_rows = spectrograms[0].rows(); int spectrogram_num_cols = spectrograms[0].cols(); for (int i = 1; i < num_input_channels_; i++) { EXPECT_EQ(spectrogram_num_rows, spectrograms[i].rows()); EXPECT_EQ(spectrogram_num_cols, spectrograms[i].cols()); } } TEST_F(SpectrogramCalculatorTest, MultichannelSpectrogramValuesAreRight) { const std::vector input_packet_sizes = { 460}; // 100 + 9*40 for 10 frames. options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_allow_multichannel_input(true); num_input_channels_ = 10; InitializeGraph(); FillInputHeader(); const float tone_frequency_hz = 440.0; SetupMultichannelInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); auto spectrograms = output().packets[0].Get>(); int num_output_frames = spectrograms[0].cols(); for (int i = 0; i < num_input_channels_; i++) { for (int frame = 0; frame < num_output_frames; ++frame) { if (i % 2 == 0) { CheckPeakFrequencyInMatrix(spectrograms[i], frame, 0); } else { CheckPeakFrequencyInMatrix(spectrograms[i], frame, tone_frequency_hz); } } } } TEST_F(SpectrogramCalculatorTest, MultichannelHandlesShortInitialPacket) { // First packet is less than one frame, but second packet should trigger a // complete frame from all channels. const std::vector input_packet_sizes = {50, 50}; options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); options_.set_allow_multichannel_input(true); num_input_channels_ = 2; InitializeGraph(); FillInputHeader(); const float tone_frequency_hz = 440.0; SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); auto spectrograms = output().packets[0].Get>(); EXPECT_FLOAT_EQ(spectrograms.size(), num_input_channels_); int spectrogram_num_rows = spectrograms[0].rows(); int spectrogram_num_cols = spectrograms[0].cols(); for (int i = 1; i < num_input_channels_; i++) { EXPECT_EQ(spectrogram_num_rows, spectrograms[i].rows()); EXPECT_EQ(spectrogram_num_cols, spectrograms[i].cols()); } } TEST_F(SpectrogramCalculatorTest, MultichannelComplexHandlesShortInitialPacket) { // First packet is less than one frame, but second packet should trigger a // complete frame from all channels, even for complex output. const std::vector input_packet_sizes = {50, 50}; options_.set_frame_duration_seconds(100.0 / input_sample_rate_); options_.set_frame_overlap_seconds(60.0 / input_sample_rate_); options_.set_pad_final_packet(false); options_.set_allow_multichannel_input(true); options_.set_output_type(SpectrogramCalculatorOptions::COMPLEX); num_input_channels_ = 2; InitializeGraph(); FillInputHeader(); const float tone_frequency_hz = 440.0; SetupCosineInputPackets(input_packet_sizes, tone_frequency_hz); MP_ASSERT_OK(Run()); CheckOutputHeadersAndTimestamps(); auto spectrograms = output().packets[0].Get>(); EXPECT_FLOAT_EQ(spectrograms.size(), num_input_channels_); int spectrogram_num_rows = spectrograms[0].rows(); int spectrogram_num_cols = spectrograms[0].cols(); for (int i = 1; i < num_input_channels_; i++) { EXPECT_EQ(spectrogram_num_rows, spectrograms[i].rows()); EXPECT_EQ(spectrogram_num_cols, spectrograms[i].cols()); } } void BM_ProcessDC(benchmark::State& state) { CalculatorGraphConfig::Node node_config; node_config.set_calculator("SpectrogramCalculator"); node_config.add_input_stream("input_audio"); node_config.add_output_stream("output_spectrogram"); SpectrogramCalculatorOptions* options = node_config.mutable_options()->MutableExtension( SpectrogramCalculatorOptions::ext); options->set_frame_duration_seconds(0.010); options->set_frame_overlap_seconds(0.0); options->set_pad_final_packet(false); *node_config.mutable_options()->MutableExtension( SpectrogramCalculatorOptions::ext) = *options; int num_input_channels = 1; int packet_size_samples = 1600000; TimeSeriesHeader* header = new TimeSeriesHeader(); header->set_sample_rate(16000.0); header->set_num_channels(num_input_channels); CalculatorRunner runner(node_config); runner.MutableInputs()->Index(0).header = Adopt(header); Matrix* payload = new Matrix( Matrix::Constant(num_input_channels, packet_size_samples, 1.0)); Timestamp timestamp = Timestamp(0); runner.MutableInputs()->Index(0).packets.push_back( Adopt(payload).At(timestamp)); for (auto _ : state) { ASSERT_TRUE(runner.Run().ok()); } const CalculatorRunner::StreamContents& output = runner.Outputs().Index(0); const Matrix& output_matrix = output.packets[0].Get(); LOG(INFO) << "Output matrix=" << output_matrix.rows() << "x" << output_matrix.cols(); LOG(INFO) << "First values=" << output_matrix(0, 0) << ", " << output_matrix(1, 0) << ", " << output_matrix(2, 0) << ", " << output_matrix(3, 0); } BENCHMARK(BM_ProcessDC); } // anonymous namespace } // namespace mediapipe