diff --git a/mediapipe/objc/BUILD b/mediapipe/objc/BUILD
index c71c02b6d..a21677608 100644
--- a/mediapipe/objc/BUILD
+++ b/mediapipe/objc/BUILD
@@ -193,6 +193,20 @@ objc_library(
     ],
 )
 
+objc_library(
+    name = "mediapipe_audio_util",
+    srcs = ["MediaPipeAudioUtil.mm"],
+    hdrs = ["MediaPipeAudioUtil.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//mediapipe/framework/formats:matrix",
+        "//mediapipe/framework/port:statusor",
+        "//third_party/apple_frameworks:AVFoundation",
+        "//third_party/apple_frameworks:CoreAudio",
+        "//third_party/apple_frameworks:CoreMedia",
+    ],
+)
+
 objc_library(
     name = "MPPGraphTestBase",
     testonly = 1,
@@ -230,6 +244,7 @@ objc_library(
         "CFHolderTests.mm",
         "MPPDisplayLinkWeakTargetTests.mm",
         "MPPGraphTests.mm",
+        "MediaPipeAudioUtilTests.mm",
     ],
     copts = [
         "-Wno-shorten-64-to-32",
@@ -242,11 +257,13 @@ objc_library(
         ":CGImageRefUtils",
         ":MPPGraphTestBase",
         ":Weakify",
+        ":mediapipe_audio_util",
         ":mediapipe_framework_ios",
         ":mediapipe_input_sources_ios",
         "//mediapipe/calculators/core:pass_through_calculator",
         "//third_party/apple_frameworks:AVFoundation",
         "//third_party/apple_frameworks:Accelerate",
+        "//third_party/apple_frameworks:CoreAudio",
         "//third_party/apple_frameworks:CoreGraphics",
         "//third_party/apple_frameworks:CoreMedia",
         "//third_party/apple_frameworks:CoreVideo",
diff --git a/mediapipe/objc/MediaPipeAudioUtil.h b/mediapipe/objc/MediaPipeAudioUtil.h
new file mode 100644
index 000000000..40e6ded0d
--- /dev/null
+++ b/mediapipe/objc/MediaPipeAudioUtil.h
@@ -0,0 +1,36 @@
+// Copyright 2023 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MEDIAPIPE_OBJC_AUDIO_UTIL_H_
+#define MEDIAPIPE_OBJC_AUDIO_UTIL_H_
+
+#import <AVFoundation/AVFoundation.h>
+#import <CoreMedia/CoreMedia.h>
+
+#include <memory>
+
+#include "absl/status/statusor.h"
+#include "mediapipe/framework/formats/matrix.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+// Converts an audio sample buffer list into a `mediapipe::Matrix`.
+// Returns an error status on failure.
+absl::StatusOr<std::unique_ptr<mediapipe::Matrix>>
+MediaPipeConvertAudioBufferListToAudioMatrix(
+    const AudioBufferList* audioBufferList,
+    const AudioStreamBasicDescription* streamHeader, CMItemCount numFrames);
+
+NS_ASSUME_NONNULL_END
+
+#endif  // MEDIAPIPE_OBJC_AUDIO_UTIL_H_
diff --git a/mediapipe/objc/MediaPipeAudioUtil.mm b/mediapipe/objc/MediaPipeAudioUtil.mm
new file mode 100644
index 000000000..83c8bedab
--- /dev/null
+++ b/mediapipe/objc/MediaPipeAudioUtil.mm
@@ -0,0 +1,101 @@
+// Copyright 2023 The MediaPipe Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "mediapipe/objc/MediaPipeAudioUtil.h"
+
+#include <limits>
+
+namespace {
+// `float` is 32-bit.
+static_assert(std::numeric_limits<float>::is_iec559);
+using float32_t = float;
+
+template <typename SampleDataType>
+float GetSample(const void* data, int index);
+
+template <>
+float GetSample<float32_t>(const void* data, int index) {
+  return reinterpret_cast<const float32_t*>(data)[index];
+};
+
+template <>
+float GetSample<int16_t>(const void* data, int index) {
+  // Convert to the [-1, 1] range.
+  return static_cast<float>(reinterpret_cast<const int16_t*>(data)[index]) /
+         static_cast<float>(std::numeric_limits<int16_t>::max());
+};
+
+template <typename SampleDataType>
+std::unique_ptr<mediapipe::Matrix> MakeMatrix(const AudioBuffer* buffers, int channels,
+                                              CMItemCount frames, bool interleaved) {
+  // Create the matrix and fill it accordingly. Its dimensions are `channels x frames`.
+  auto matrix = std::make_unique<mediapipe::Matrix>(channels, frames);
+  // Split the case of interleaved and non-interleaved samples (see
+  // https://developer.apple.com/documentation/coremedia/1489723-cmsamplebuffercreate#discussion)
+  // - however, the resulting operations coincide when `channels == 1`.
+  if (interleaved) {
+    // A single buffer contains interleaved samples for all the channels {L, R, L, R, L, R, ...}.
+    const void* samples = buffers[0].mData;
+    for (int channel = 0; channel < channels; ++channel) {
+      for (int frame = 0; frame < frames; ++frame) {
+        (*matrix)(channel, frame) = GetSample<SampleDataType>(samples, channels * frame + channel);
+      }
+    }
+  } else {
+    // Non-interleaved audio: each channel's samples are stored in a separate buffer:
+    // {{L, L, L, L, ...}, {R, R, R, R, ...}}.
+    for (int channel = 0; channel < channels; ++channel) {
+      const void* samples = buffers[channel].mData;
+      for (int frame = 0; frame < frames; ++frame) {
+        (*matrix)(channel, frame) = GetSample<SampleDataType>(samples, frame);
+      }
+    }
+  }
+  return matrix;
+}
+}  // namespace
+
+absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> MediaPipeConvertAudioBufferListToAudioMatrix(
+    const AudioBufferList* audioBufferList, const AudioStreamBasicDescription* streamHeader,
+    CMItemCount numFrames) {
+  // Sort out the channel count and whether the data is interleaved.
+  // Note that we treat "interleaved" mono audio as non-interleaved.
+  CMItemCount numChannels = 1;
+  bool isAudioInterleaved = false;
+  if (streamHeader->mChannelsPerFrame > 1) {
+    if (streamHeader->mFormatFlags & kAudioFormatFlagIsNonInterleaved) {
+      numChannels = audioBufferList->mNumberBuffers;
+      isAudioInterleaved = false;
+    } else {
+      numChannels = audioBufferList->mBuffers[0].mNumberChannels;
+      isAudioInterleaved = true;
+    }
+    if (numChannels <= 1) {
+      return absl::InternalError("AudioStreamBasicDescription indicates more than 1 channel, "
+                                 "but the buffer data declares an incompatible number of channels");
+    }
+  }
+
+  if ((streamHeader->mFormatFlags & kAudioFormatFlagIsFloat) &&
+      streamHeader->mBitsPerChannel == 32) {
+    return MakeMatrix<float32_t>(audioBufferList->mBuffers, numChannels, numFrames,
+                                 isAudioInterleaved);
+  }
+  if ((streamHeader->mFormatFlags & kAudioFormatFlagIsSignedInteger) &&
+      streamHeader->mBitsPerChannel == 16) {
+    return MakeMatrix<int16_t>(audioBufferList->mBuffers, numChannels, numFrames,
+                               isAudioInterleaved);
+  }
+  return absl::InternalError("Incompatible audio sample storage format");
+}
diff --git a/mediapipe/objc/MediaPipeAudioUtilTests.mm b/mediapipe/objc/MediaPipeAudioUtilTests.mm
new file mode 100644
index 000000000..7663a70d6
--- /dev/null
+++ b/mediapipe/objc/MediaPipeAudioUtilTests.mm
@@ -0,0 +1,363 @@
+#import "mediapipe/objc/MediaPipeAudioUtil.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#import <XCTest/XCTest.h>
+
+static const float kMatrixComparisonPrecisionFloat = 1e-9;
+static const float kMatrixComparisonPrecisionInt16 = 1e-4;
+
+@interface MediaPipeAudioUtilTest : XCTestCase
+@end
+
+template <typename DataType>
+class AudioBufferListWrapper {
+ public:
+  AudioBufferListWrapper(int num_frames, int num_channels, bool interleaved)
+      : num_frames_(num_frames), num_channels_(num_channels), interleaved_(interleaved) {
+    int num_buffers = interleaved_ ? 1 : num_channels_;
+    int channels_per_buffer = interleaved_ ? num_channels_ : 1;
+    int buffer_size_samples = num_frames_ * channels_per_buffer;
+    int buffer_size_bytes = buffer_size_samples * static_cast<int>(BytesPerSample());
+
+    buffer_list_.reset(reinterpret_cast<AudioBufferList*>(
+        calloc(1, offsetof(AudioBufferList, mBuffers) +
+                      (sizeof(AudioBuffer) * num_buffers))));  // Var. length array.
+    assert(buffer_list_.get() != nullptr);
+
+    buffer_list_->mNumberBuffers = static_cast<UInt32>(num_buffers);
+    for (int buffer_index = 0; buffer_index < num_buffers; ++buffer_index) {
+      AudioBuffer& buffer = GetBuffer(buffer_index);
+      auto buffer_data = std::make_unique<DataType[]>(buffer_size_samples);
+      assert(buffer_data != nullptr);
+
+      buffer.mData = static_cast<void*>(buffer_data.get());
+      buffer.mDataByteSize = buffer_size_bytes;
+      buffer.mNumberChannels = channels_per_buffer;
+
+      buffers_.push_back(std::move(buffer_data));
+    }
+  }
+
+  UInt32 BytesPerSample() const { return static_cast<UInt32>(sizeof(DataType)); }
+  UInt32 BytesPerPacket() const {
+    return static_cast<UInt32>(BytesPerSample() * num_frames_ * num_channels_);
+  }
+
+  AudioBufferList* GetBufferList() { return buffer_list_.get(); };
+  const AudioBufferList* GetBufferList() const { return buffer_list_.get(); };
+
+  AudioBuffer& GetBuffer(int index) { return GetBufferList()->mBuffers[index]; }
+
+  DataType* GetBufferData(int index) { return reinterpret_cast<DataType*>(GetBuffer(index).mData); }
+
+  DataType& At(int channel, int frame) {
+    assert(frame >= 0 && frame < num_frames_);
+    assert(channel >= 0 && channel < num_channels_);
+    if (interleaved_) {
+      // [[L, R, L, R, ...]]
+      return GetBufferData(0)[frame * num_channels_ + channel];
+    } else {
+      // [[L, L, ...], [R, R, ...]]
+      return GetBufferData(channel)[frame];
+    }
+  }
+
+  DataType ToDataType(float value) const;
+
+  void InitFromMatrix(const mediapipe::Matrix& matrix) {
+    assert(matrix.rows() == num_channels_);
+    assert(matrix.cols() == num_frames_);
+    for (int channel = 0; channel < num_channels_; ++channel) {
+      for (int frame = 0; frame < num_frames_; ++frame) {
+        this->At(channel, frame) = ToDataType(matrix(channel, frame));
+        ;
+      }
+    }
+  }
+
+ private:
+  int num_frames_;
+  int num_channels_;
+  bool interleaved_;
+  std::unique_ptr<AudioBufferList, decltype(&free)> buffer_list_{nullptr, &free};
+  std::vector<std::unique_ptr<DataType[]>> buffers_;
+};
+
+template <>
+float AudioBufferListWrapper<float>::ToDataType(float value) const {
+  return value;
+}
+
+template <>
+int16_t AudioBufferListWrapper<int16_t>::ToDataType(float value) const {
+  return static_cast<int16_t>(value * std::numeric_limits<int16_t>::max());
+}
+
+@implementation MediaPipeAudioUtilTest
+
+- (void)testBufferListToMatrixStereoNonInterleavedFloat {
+  constexpr int kChannels = 2;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9;
+  AudioBufferListWrapper<float> bufferList(/*num_frames=*/kFrames,
+                                           /*num_channels=*/kChannels,
+                                           /*interleaved=*/false);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags =
+          kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked | kAudioFormatFlagIsNonInterleaved,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionFloat));
+}
+
+- (void)testBufferListToMatrixStereoInterleavedFloat {
+  constexpr int kChannels = 2;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9;
+  AudioBufferListWrapper<float> bufferList(/*num_frames=*/kFrames,
+                                           /*num_channels=*/kChannels,
+                                           /*interleaved=*/true);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags = kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionFloat));
+}
+
+- (void)testBufferListToMatrixMonoNonInterleavedFloat {
+  constexpr int kChannels = 1;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4;
+  AudioBufferListWrapper<float> bufferList(/*num_frames=*/kFrames,
+                                           /*num_channels=*/kChannels,
+                                           /*interleaved=*/false);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags =
+          kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked | kAudioFormatFlagIsNonInterleaved,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionFloat));
+}
+
+- (void)testBufferListToMatrixMonoInterleavedFloat {
+  constexpr int kChannels = 1;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4;
+  AudioBufferListWrapper<float> bufferList(/*num_frames=*/kFrames,
+                                           /*num_channels=*/kChannels,
+                                           /*interleaved=*/true);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags = kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionFloat));
+}
+
+- (void)testBufferListToMatrixStereoNonInterleavedInt16 {
+  constexpr int kChannels = 2;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9;
+  AudioBufferListWrapper<int16_t> bufferList(/*num_frames=*/kFrames,
+                                             /*num_channels=*/kChannels,
+                                             /*interleaved=*/false);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked |
+                      kAudioFormatFlagIsNonInterleaved,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionInt16));
+}
+
+- (void)testBufferListToMatrixStereoInterleavedInt16 {
+  constexpr int kChannels = 2;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9;
+  AudioBufferListWrapper<int16_t> bufferList(/*num_frames=*/kFrames,
+                                             /*num_channels=*/kChannels,
+                                             /*interleaved=*/true);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionInt16));
+}
+
+- (void)testBufferListToMatrixMonoNonInterleavedInt16 {
+  constexpr int kChannels = 1;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4;
+  AudioBufferListWrapper<int16_t> bufferList(/*num_frames=*/kFrames,
+                                             /*num_channels=*/kChannels,
+                                             /*interleaved=*/false);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked |
+                      kAudioFormatFlagIsNonInterleaved,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionInt16));
+}
+
+- (void)testBufferListToMatrixMonoInterleavedInt16 {
+  constexpr int kChannels = 1;
+  constexpr int kFrames = 5;
+  mediapipe::Matrix inputMatrix(kChannels, kFrames);
+  inputMatrix << 0, 0.1, 0.2, 0.3, 0.4;
+  AudioBufferListWrapper<int16_t> bufferList(/*num_frames=*/kFrames,
+                                             /*num_channels=*/kChannels,
+                                             /*interleaved=*/true);
+  bufferList.InitFromMatrix(inputMatrix);
+
+  static const AudioStreamBasicDescription kStreamDescription = {
+      .mSampleRate = 44100,
+      .mFormatID = kAudioFormatLinearPCM,
+      .mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked,
+      .mBytesPerPacket = bufferList.BytesPerPacket(),
+      .mFramesPerPacket = kFrames,
+      .mBytesPerFrame = bufferList.BytesPerSample() * kChannels,
+      .mChannelsPerFrame = kChannels,
+      .mBitsPerChannel = bufferList.BytesPerSample() * 8,
+  };
+
+  absl::StatusOr<std::unique_ptr<mediapipe::Matrix>> matrix =
+      MediaPipeConvertAudioBufferListToAudioMatrix(bufferList.GetBufferList(), &kStreamDescription,
+                                                   static_cast<CMItemCount>(kFrames));
+  if (!matrix.ok()) {
+    XCTFail(@"Unable to convert a sample buffer list to a matrix: %s",
+            matrix.status().ToString().c_str());
+  }
+
+  XCTAssertTrue(inputMatrix.isApprox(**matrix, kMatrixComparisonPrecisionInt16));
+}
+
+@end