// Copyright 2019 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef MEDIAPIPE_UTIL_AUDIO_DECODER_H_ #define MEDIAPIPE_UTIL_AUDIO_DECODER_H_ #include // required by avutil.h #include #include #include #include "absl/flags/flag.h" #include "absl/time/time.h" #include "mediapipe/framework/formats/time_series_header.pb.h" #include "mediapipe/framework/packet.h" #include "mediapipe/framework/port/integral_types.h" #include "mediapipe/framework/port/status.h" #include "mediapipe/framework/timestamp.h" #include "mediapipe/util/audio_decoder.pb.h" extern "C" { #include "libavcodec/avcodec.h" #include "libavformat/avformat.h" #include "libavutil/avutil.h" #include "libavutil/dict.h" #include "mediapipe/util/audio_decoder.pb.h" } namespace mediapipe { using mediapipe::AudioStreamOptions; using mediapipe::TimeSeriesHeader; // The base helper class for a processor which handles decoding of a single // stream. class BasePacketProcessor { public: BasePacketProcessor(); virtual ~BasePacketProcessor(); // Opens the codec. virtual absl::Status Open(int id, AVStream* stream) = 0; // Processes a packet of data. Caller retains ownership of packet. virtual absl::Status ProcessPacket(AVPacket* packet) = 0; // Returns true if the processor has data immediately available // (without providing more data with ProcessPacket()). bool HasData(); // Fills packet with the next frame of data. Returns an empty packet // if there is nothing to return. absl::Status GetData(Packet* packet); // Once no more AVPackets are available in the file, each stream must // be flushed to get any remaining frames which the codec is buffering. absl::Status Flush(); // Closes the Processor, this does not close the file. You may not // call ProcessPacket() after calling Close(). Close() may be called // repeatedly. void Close(); protected: // Decodes frames in a packet. virtual absl::Status Decode(const AVPacket& packet, bool ignore_decode_failures); // Processes a decoded frame. virtual absl::Status ProcessDecodedFrame(const AVPacket& packet) = 0; // Corrects the given PTS for MPEG PTS rollover. Assumed to be called with // the PTS of each frame in decode order. We detect a rollover whenever the // PTS timestamp changes by more than 2^33/2 (half the timestamp space). For // video this means every 26.5h with 1 PTS tick = 1/90000 of a second. // Example timeline: // CorrectPtsForRollover(0) -> 0 // CorrectPtsForRollover(42) -> 42 // CorrectPtsForRollover(2^33 - 1) -> 2^33 - 1 // CorrectPtsForRollover(0) -> 2^33 // PTS in media rolls over, corrected. // CorrectPtsForRollover(1) -> 2^33 + 1 int64 CorrectPtsForRollover(int64 media_pts); AVCodecContext* avcodec_ctx_ = nullptr; const AVCodec* avcodec_ = nullptr; AVDictionary* avcodec_opts_ = nullptr; AVFrame* decoded_frame_ = nullptr; // Stream ID this object processes. int id_ = -1; // Set to true if the stream has been flushed and no more AVPackets // will be processed with it. bool flushed_ = false; // The source time base. AVRational source_time_base_; // The output time base. const AVRational output_time_base_; // The source frame rate (estimated from header information). AVRational source_frame_rate_; // The number of frames that were successfully processed. int64 num_frames_processed_ = 0; int bytes_per_sample_ = 0; // boolean flag to show if time regression has been detected for last frame; bool last_frame_time_regression_detected_ = false; // The last rollover corrected PTS returned by CorrectPtsForRollover. int64 rollover_corrected_last_pts_ = AV_NOPTS_VALUE; // The buffer of current frames. std::deque buffer_; }; // Class which decodes packets from a single audio stream. class AudioPacketProcessor : public BasePacketProcessor { public: explicit AudioPacketProcessor(const AudioStreamOptions& options); absl::Status Open(int id, AVStream* stream) override; absl::Status ProcessPacket(AVPacket* packet) override; absl::Status FillHeader(TimeSeriesHeader* header) const; private: // Appends audio in buffer(s) to the output buffer (buffer_). absl::Status AddAudioDataToBuffer(const Timestamp output_timestamp, uint8* const* raw_audio, int buf_size_bytes); // Converts a number of samples into an approximate stream timestamp value. int64 SampleNumberToTimestamp(const int64 sample_number); int64 TimestampToSampleNumber(const int64 timestamp); // Converts a timestamp/sample number to microseconds. int64 TimestampToMicroseconds(const int64 timestamp); int64 SampleNumberToMicroseconds(const int64 sample_number); // Returns an error if the sample format in avformat_ctx_.sample_format // is not supported. absl::Status ValidateSampleFormat(); // Processes a decoded audio frame. audio_frame_ must have been filled // with the frame before calling this function. absl::Status ProcessDecodedFrame(const AVPacket& packet) override; // Corrects PTS for rollover if correction is enabled. int64 MaybeCorrectPtsForRollover(int64 media_pts); // Number of channels to output. This value might be different from // the actual number of channels for the current AVPacket, found in // avcodec_ctx_->channels. int num_channels_ = -1; // Sample rate of the data to output. This value might be different // from the actual sample rate for the current AVPacket, found in // avcodec_ctx_->sample_rate. int64 sample_rate_ = -1; // The time base of audio samples (i.e. the reciprocal of the sample rate). AVRational sample_time_base_; // The timestamp of the last packet added to the buffer. Timestamp last_timestamp_; // The expected sample number based on counting samples. int64 expected_sample_number_ = 0; // Options for the processor. AudioStreamOptions options_; }; // Decode the audio streams of a media file. The AudioDecoder is responsible // for demuxing the audio streams in the container format, whereas decoding of // the content is delegated to AudioPacketProcessor. class AudioDecoder { public: AudioDecoder(); ~AudioDecoder(); absl::Status Initialize(const std::string& input_file, const mediapipe::AudioDecoderOptions options); absl::Status GetData(int* options_index, Packet* data); absl::Status Close(); absl::Status FillAudioHeader(const AudioStreamOptions& stream_option, TimeSeriesHeader* header) const; private: absl::Status ProcessPacket(); absl::Status Flush(); std::map stream_id_to_audio_options_index_; std::map stream_index_to_stream_id_; std::map> audio_processor_; // Indexed by container stream index, true if the stream has not seen // a packet (whether returned or not), and false otherwise. std::vector is_first_packet_; bool flushed_ = false; Timestamp start_time_ = Timestamp::Unset(); Timestamp end_time_ = Timestamp::Unset(); AVFormatContext* avformat_ctx_ = nullptr; }; } // namespace mediapipe #endif // MEDIAPIPE_UTIL_AUDIO_DECODER_H_