7c331ad58b
GitOrigin-RevId: 6e4aff1cc351be3ae4537b677f36d139ee50ce09
229 lines
7.7 KiB
C++
229 lines
7.7 KiB
C++
// Copyright 2019 The MediaPipe Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef MEDIAPIPE_UTIL_AUDIO_DECODER_H_
|
|
#define MEDIAPIPE_UTIL_AUDIO_DECODER_H_
|
|
|
|
#include <cstdint> // required by avutil.h
|
|
#include <deque>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "absl/flags/flag.h"
|
|
#include "absl/time/time.h"
|
|
#include "mediapipe/framework/formats/time_series_header.pb.h"
|
|
#include "mediapipe/framework/packet.h"
|
|
#include "mediapipe/framework/port/integral_types.h"
|
|
#include "mediapipe/framework/port/status.h"
|
|
#include "mediapipe/framework/timestamp.h"
|
|
#include "mediapipe/util/audio_decoder.pb.h"
|
|
|
|
extern "C" {
|
|
#include "libavcodec/avcodec.h"
|
|
#include "libavformat/avformat.h"
|
|
#include "libavutil/avutil.h"
|
|
#include "libavutil/dict.h"
|
|
#include "mediapipe/util/audio_decoder.pb.h"
|
|
}
|
|
|
|
namespace mediapipe {
|
|
|
|
using mediapipe::AudioStreamOptions;
|
|
using mediapipe::TimeSeriesHeader;
|
|
|
|
// The base helper class for a processor which handles decoding of a single
|
|
// stream.
|
|
class BasePacketProcessor {
|
|
public:
|
|
BasePacketProcessor();
|
|
virtual ~BasePacketProcessor();
|
|
|
|
// Opens the codec.
|
|
virtual absl::Status Open(int id, AVStream* stream) = 0;
|
|
|
|
// Processes a packet of data. Caller retains ownership of packet.
|
|
virtual absl::Status ProcessPacket(AVPacket* packet) = 0;
|
|
|
|
// Returns true if the processor has data immediately available
|
|
// (without providing more data with ProcessPacket()).
|
|
bool HasData();
|
|
|
|
// Fills packet with the next frame of data. Returns an empty packet
|
|
// if there is nothing to return.
|
|
absl::Status GetData(Packet* packet);
|
|
|
|
// Once no more AVPackets are available in the file, each stream must
|
|
// be flushed to get any remaining frames which the codec is buffering.
|
|
absl::Status Flush();
|
|
|
|
// Closes the Processor, this does not close the file. You may not
|
|
// call ProcessPacket() after calling Close(). Close() may be called
|
|
// repeatedly.
|
|
void Close();
|
|
|
|
protected:
|
|
// Decodes frames in a packet.
|
|
virtual absl::Status Decode(const AVPacket& packet,
|
|
bool ignore_decode_failures);
|
|
|
|
// Processes a decoded frame.
|
|
virtual absl::Status ProcessDecodedFrame(const AVPacket& packet) = 0;
|
|
|
|
// Corrects the given PTS for MPEG PTS rollover. Assumed to be called with
|
|
// the PTS of each frame in decode order. We detect a rollover whenever the
|
|
// PTS timestamp changes by more than 2^33/2 (half the timestamp space). For
|
|
// video this means every 26.5h with 1 PTS tick = 1/90000 of a second.
|
|
// Example timeline:
|
|
// CorrectPtsForRollover(0) -> 0
|
|
// CorrectPtsForRollover(42) -> 42
|
|
// CorrectPtsForRollover(2^33 - 1) -> 2^33 - 1
|
|
// CorrectPtsForRollover(0) -> 2^33 // PTS in media rolls over, corrected.
|
|
// CorrectPtsForRollover(1) -> 2^33 + 1
|
|
int64 CorrectPtsForRollover(int64 media_pts);
|
|
|
|
AVCodecContext* avcodec_ctx_ = nullptr;
|
|
const AVCodec* avcodec_ = nullptr;
|
|
AVDictionary* avcodec_opts_ = nullptr;
|
|
AVFrame* decoded_frame_ = nullptr;
|
|
|
|
// Stream ID this object processes.
|
|
int id_ = -1;
|
|
|
|
// Set to true if the stream has been flushed and no more AVPackets
|
|
// will be processed with it.
|
|
bool flushed_ = false;
|
|
|
|
// The source time base.
|
|
AVRational source_time_base_;
|
|
// The output time base.
|
|
const AVRational output_time_base_;
|
|
|
|
// The source frame rate (estimated from header information).
|
|
AVRational source_frame_rate_;
|
|
|
|
// The number of frames that were successfully processed.
|
|
int64 num_frames_processed_ = 0;
|
|
|
|
int bytes_per_sample_ = 0;
|
|
|
|
// boolean flag to show if time regression has been detected for last frame;
|
|
bool last_frame_time_regression_detected_ = false;
|
|
|
|
// The last rollover corrected PTS returned by CorrectPtsForRollover.
|
|
int64 rollover_corrected_last_pts_ = AV_NOPTS_VALUE;
|
|
|
|
// The buffer of current frames.
|
|
std::deque<Packet> buffer_;
|
|
};
|
|
|
|
// Class which decodes packets from a single audio stream.
|
|
class AudioPacketProcessor : public BasePacketProcessor {
|
|
public:
|
|
explicit AudioPacketProcessor(const AudioStreamOptions& options);
|
|
|
|
absl::Status Open(int id, AVStream* stream) override;
|
|
|
|
absl::Status ProcessPacket(AVPacket* packet) override;
|
|
|
|
absl::Status FillHeader(TimeSeriesHeader* header) const;
|
|
|
|
private:
|
|
// Appends audio in buffer(s) to the output buffer (buffer_).
|
|
absl::Status AddAudioDataToBuffer(const Timestamp output_timestamp,
|
|
uint8* const* raw_audio,
|
|
int buf_size_bytes);
|
|
|
|
// Converts a number of samples into an approximate stream timestamp value.
|
|
int64 SampleNumberToTimestamp(const int64 sample_number);
|
|
int64 TimestampToSampleNumber(const int64 timestamp);
|
|
|
|
// Converts a timestamp/sample number to microseconds.
|
|
int64 TimestampToMicroseconds(const int64 timestamp);
|
|
int64 SampleNumberToMicroseconds(const int64 sample_number);
|
|
|
|
// Returns an error if the sample format in avformat_ctx_.sample_format
|
|
// is not supported.
|
|
absl::Status ValidateSampleFormat();
|
|
|
|
// Processes a decoded audio frame. audio_frame_ must have been filled
|
|
// with the frame before calling this function.
|
|
absl::Status ProcessDecodedFrame(const AVPacket& packet) override;
|
|
|
|
// Corrects PTS for rollover if correction is enabled.
|
|
int64 MaybeCorrectPtsForRollover(int64 media_pts);
|
|
|
|
// Number of channels to output. This value might be different from
|
|
// the actual number of channels for the current AVPacket, found in
|
|
// avcodec_ctx_->channels.
|
|
int num_channels_ = -1;
|
|
|
|
// Sample rate of the data to output. This value might be different
|
|
// from the actual sample rate for the current AVPacket, found in
|
|
// avcodec_ctx_->sample_rate.
|
|
int64 sample_rate_ = -1;
|
|
|
|
// The time base of audio samples (i.e. the reciprocal of the sample rate).
|
|
AVRational sample_time_base_;
|
|
|
|
// The timestamp of the last packet added to the buffer.
|
|
Timestamp last_timestamp_;
|
|
|
|
// The expected sample number based on counting samples.
|
|
int64 expected_sample_number_ = 0;
|
|
|
|
// Options for the processor.
|
|
AudioStreamOptions options_;
|
|
};
|
|
|
|
// Decode the audio streams of a media file. The AudioDecoder is responsible
|
|
// for demuxing the audio streams in the container format, whereas decoding of
|
|
// the content is delegated to AudioPacketProcessor.
|
|
class AudioDecoder {
|
|
public:
|
|
AudioDecoder();
|
|
~AudioDecoder();
|
|
|
|
absl::Status Initialize(const std::string& input_file,
|
|
const mediapipe::AudioDecoderOptions options);
|
|
|
|
absl::Status GetData(int* options_index, Packet* data);
|
|
|
|
absl::Status Close();
|
|
|
|
absl::Status FillAudioHeader(const AudioStreamOptions& stream_option,
|
|
TimeSeriesHeader* header) const;
|
|
|
|
private:
|
|
absl::Status ProcessPacket();
|
|
absl::Status Flush();
|
|
|
|
std::map<int, int> stream_id_to_audio_options_index_;
|
|
std::map<int, int> stream_index_to_stream_id_;
|
|
std::map<int, std::unique_ptr<AudioPacketProcessor>> audio_processor_;
|
|
|
|
// Indexed by container stream index, true if the stream has not seen
|
|
// a packet (whether returned or not), and false otherwise.
|
|
std::vector<bool> is_first_packet_;
|
|
bool flushed_ = false;
|
|
|
|
Timestamp start_time_ = Timestamp::Unset();
|
|
Timestamp end_time_ = Timestamp::Unset();
|
|
|
|
AVFormatContext* avformat_ctx_ = nullptr;
|
|
};
|
|
|
|
} // namespace mediapipe
|
|
|
|
#endif // MEDIAPIPE_UTIL_AUDIO_DECODER_H_
|