// Copyright 2019 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "mediapipe/util/audio_decoder.h" #include #include // required by avutil.h #include #include #include #include "Eigen/Core" #include "absl/base/internal/endian.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "absl/strings/substitute.h" #include "absl/time/time.h" #include "mediapipe/framework/deps/cleanup.h" #include "mediapipe/framework/formats/matrix.h" #include "mediapipe/framework/port/map_util.h" #include "mediapipe/framework/port/ret_check.h" #include "mediapipe/framework/port/status.h" #include "mediapipe/framework/tool/status_util.h" extern "C" { #include "libavcodec/avcodec.h" #include "libavformat/avformat.h" #include "libavutil/avutil.h" #include "libavutil/mem.h" #include "libavutil/samplefmt.h" } ABSL_FLAG(int64_t, media_decoder_allowed_audio_gap_merge, 5, "The time gap forwards or backwards in the audio to ignore. " "Timestamps in media files are restricted by the container format " "and stream codec and are invariably not accurate to exact sample " "numbers. If the discrepency between time based on counting " "samples and based on the container timestamps grows beyond this " "value it will be reset to the value in the audio stream and " "counting based on samples will resume."); namespace mediapipe { // MPEG PTS max value + 1, used to correct for PTS rollover. Unit is PTS ticks. const int64 kMpegPtsEpoch = 1LL << 33; // Maximum PTS change between frames. Larger changes are considered to indicate // the MPEG PTS has rolled over. Unit is PTS ticks. const int64 kMpegPtsMaxDelta = kMpegPtsEpoch / 2; // BasePacketProcessor namespace { inline std::string TimestampToString(int64 timestamp) { if (timestamp == AV_NOPTS_VALUE) { return "NOPTS"; } return absl::StrCat(timestamp); } float Uint32ToFloat(uint32 raw_value) { float value; memcpy(&value, &raw_value, 4); return value; } std::string AvErrorToString(int error) { if (error >= 0) { return absl::StrCat("Not an error (", error, ")"); } switch (error) { case AVERROR(EINVAL): return "AVERROR(EINVAL) - unknown error or invalid data"; case AVERROR(EIO): return "AVERROR(EIO) - I/O error"; case AVERROR(EDOM): return "AVERROR(EDOM) - Number syntax expected in filename."; case AVERROR(ENOMEM): return "AVERROR(ENOMEM) - not enough memory"; case AVERROR(EILSEQ): return "AVERROR(EILSEQ) - unknown format"; case AVERROR(ENOSYS): return "AVERROR(ENOSYS) - Operation not supported."; case AVERROR(ENOENT): return "AVERROR(ENOENT) - No such file or directory."; case AVERROR(EPIPE): return "AVERROR(EPIPE) - End of file."; case AVERROR_BSF_NOT_FOUND: return "AVERROR_BSF_NOT_FOUND - Bitstream filter not found."; case AVERROR_BUG: return "AVERROR_BUG - Internal bug, should not have happened."; case AVERROR_BUG2: return "AVERROR_BUG2 - Internal bug, should not have happened."; case AVERROR_BUFFER_TOO_SMALL: return "AVERROR_BUFFER_TOO_SMALL - Buffer too small."; case AVERROR_DECODER_NOT_FOUND: return "AVERROR_DECODER_NOT_FOUND - Decoder not found."; case AVERROR_DEMUXER_NOT_FOUND: return "AVERROR_DEMUXER_NOT_FOUND - Demuxer not found."; case AVERROR_ENCODER_NOT_FOUND: return "AVERROR_ENCODER_NOT_FOUND - Encoder not found."; case AVERROR_EOF: return "AVERROR_EOF - End of file."; case AVERROR_EXIT: return "AVERROR_EXIT - Immediate exit was requested."; case AVERROR_EXTERNAL: return "AVERROR_EXTERNAL - Generic error in an external library."; case AVERROR_FILTER_NOT_FOUND: return "AVERROR_FILTER_NOT_FOUND - Filter not found."; case AVERROR_INVALIDDATA: return "AVERROR_INVALIDDATA - Invalid data found when processing input."; case AVERROR_MUXER_NOT_FOUND: return "AVERROR_MUXER_NOT_FOUND - Muxer not found."; case AVERROR_OPTION_NOT_FOUND: return "AVERROR_OPTION_NOT_FOUND - Option not found."; case AVERROR_PATCHWELCOME: return "AVERROR_PATCHWELCOME - Not yet implemented in FFmpeg, " "patches welcome."; case AVERROR_PROTOCOL_NOT_FOUND: return "AVERROR_PROTOCOL_NOT_FOUND - Protocol not found."; case AVERROR_STREAM_NOT_FOUND: return "AVERROR_STREAM_NOT_FOUND - Stream not found."; case AVERROR_EXPERIMENTAL: return "AVERROR_EXPERIMENTAL - Requested feature is flagged " "experimental."; case AVERROR_INPUT_CHANGED: return "AVERROR_INPUT_CHANGED - Input changed between calls."; case AVERROR_OUTPUT_CHANGED: return "AVERROR_OUTPUT_CHANGED - Output changed between calls."; default: // FALLTHRU {} } char buf[AV_ERROR_MAX_STRING_SIZE]; if (av_strerror(error, buf, sizeof(buf)) == 0) { return absl::StrCat("AVERROR(", error, ") - ", buf); } return absl::StrCat("Unknown AVERROR number ", error); } // Send a packet to the decoder. absl::Status SendPacket(const AVPacket& packet, AVCodecContext* avcodec_ctx) { const int error = avcodec_send_packet(avcodec_ctx, &packet); if (error != 0 && error != AVERROR_EOF) { // Not consider AVERROR_EOF as an error because it can happen when more // than 1 flush packet is sent. return UnknownError(absl::StrCat("Failed to send packet: error=", error, " (", AvErrorToString(error), "). Packet size: ", packet.size)); } return absl::OkStatus(); } // Receive a decoded frame from the decoder. absl::Status ReceiveFrame(AVCodecContext* avcodec_ctx, AVFrame* frame, bool* received) { const int error = avcodec_receive_frame(avcodec_ctx, frame); *received = error == 0; if (error != 0 && error != AVERROR_EOF && error != AVERROR(EAGAIN)) { // Not consider AVERROR_EOF as an error because it can happen after a // flush, and AVERROR(EAGAIN) because it happens when there's no (more) // frame to be received from this packet. return UnknownError(absl::StrCat(" Failed to receive frame: error=", error, " (", AvErrorToString(error), ").")); } return absl::OkStatus(); } absl::Status LogStatus(const absl::Status& status, const AVCodecContext& avcodec_ctx, const AVPacket& packet, bool always_return_ok_status) { if (status.ok()) { return status; } VLOG(3) << "Failed to process packet:" << " media_type:" << (avcodec_ctx.codec_type == AVMEDIA_TYPE_VIDEO ? "video" : "audio") << " codec_id:" << avcodec_ctx.codec_id << " frame_number:" << avcodec_ctx.frame_number << " pts:" << TimestampToString(packet.pts) << " dts:" << TimestampToString(packet.dts) << " size:" << packet.size << (packet.flags & AV_PKT_FLAG_KEY ? " Key Frame." : ""); if (always_return_ok_status) { LOG(WARNING) << status.message(); return absl::OkStatus(); } else { return status; } } class AVPacketDeleter { public: void operator()(void* x) const { AVPacket* packet = static_cast(x); if (packet) { av_free_packet(packet); delete packet; } } }; } // namespace BasePacketProcessor::BasePacketProcessor() : decoded_frame_(av_frame_alloc()), source_time_base_{0, 0}, output_time_base_{1, 1000000}, source_frame_rate_{0, 0} {} BasePacketProcessor::~BasePacketProcessor() { Close(); } bool BasePacketProcessor::HasData() { return !buffer_.empty(); } absl::Status BasePacketProcessor::GetData(Packet* packet) { CHECK(packet); CHECK(!buffer_.empty()); *packet = buffer_.front(); buffer_.pop_front(); return absl::OkStatus(); } absl::Status BasePacketProcessor::Flush() { int64 last_num_frames_processed; do { std::unique_ptr av_packet(new AVPacket()); av_init_packet(av_packet.get()); av_packet->size = 0; av_packet->data = nullptr; av_packet->stream_index = id_; last_num_frames_processed = num_frames_processed_; // ProcessPacket increments num_frames_processed_ if it is able to // decode a frame. Not being able to decode a frame while being // flushed signals that the codec is completely done. MP_RETURN_IF_ERROR(ProcessPacket(av_packet.get())); } while (last_num_frames_processed != num_frames_processed_); flushed_ = true; return absl::OkStatus(); } void BasePacketProcessor::Close() { if (avcodec_ctx_) { if (avcodec_ctx_->codec) { avcodec_close(avcodec_ctx_); av_free(avcodec_ctx_); } avcodec_ctx_ = nullptr; } if (avcodec_opts_) { av_dict_free(&avcodec_opts_); } if (decoded_frame_) { av_frame_free(&decoded_frame_); } } absl::Status BasePacketProcessor::Decode(const AVPacket& packet, bool ignore_decode_failures) { MP_RETURN_IF_ERROR(LogStatus(SendPacket(packet, avcodec_ctx_), *avcodec_ctx_, packet, ignore_decode_failures)); while (true) { bool received; MP_RETURN_IF_ERROR( LogStatus(ReceiveFrame(avcodec_ctx_, decoded_frame_, &received), *avcodec_ctx_, packet, ignore_decode_failures)); if (received) { // Successfully decoded a frame (i.e., received it from the decoder). Now // further process it. MP_RETURN_IF_ERROR(ProcessDecodedFrame(packet)); } else { break; } } return absl::OkStatus(); } int64 BasePacketProcessor::CorrectPtsForRollover(int64 media_pts) { const int64 rollover_pts_media_bits = kMpegPtsEpoch - 1; // Ensure PTS in range 0 ... kMpegPtsEpoch. This avoids errors from post // decode PTS corrections that overflow the epoch range (while still yielding // the correct result as long as the corrections do not exceed // kMpegPtsMaxDelta). media_pts &= rollover_pts_media_bits; if (rollover_corrected_last_pts_ == AV_NOPTS_VALUE) { // First seen PTS. rollover_corrected_last_pts_ = media_pts; } else { int64 prev_media_pts = rollover_corrected_last_pts_ & rollover_pts_media_bits; int64 pts_step = media_pts - prev_media_pts; if (pts_step > kMpegPtsMaxDelta) { pts_step = pts_step - kMpegPtsEpoch; } else if (pts_step < -kMpegPtsMaxDelta) { pts_step = kMpegPtsEpoch + pts_step; } rollover_corrected_last_pts_ = std::max((int64)0, rollover_corrected_last_pts_ + pts_step); } return rollover_corrected_last_pts_; } // AudioPacketProcessor namespace { // Converts a PCM_S16LE-encoded input sample to float between -1 and 1. inline float PcmEncodedSampleToFloat(const char* data) { static const float kMultiplier = 1.f / (1 << 15); return absl::little_endian::Load16(data) * kMultiplier; } // Converts a PCM_S32LE-encoded input sample to float between -1 and 1. inline float PcmEncodedSampleInt32ToFloat(const char* data) { static constexpr float kMultiplier = 1.f / (1u << 31); return absl::little_endian::Load32(data) * kMultiplier; } } // namespace AudioPacketProcessor::AudioPacketProcessor(const AudioStreamOptions& options) : sample_time_base_{0, 0}, options_(options) { DCHECK(absl::little_endian::IsLittleEndian()); } absl::Status AudioPacketProcessor::Open(int id, AVStream* stream) { id_ = id; avcodec_ = avcodec_find_decoder(stream->codecpar->codec_id); if (!avcodec_) { return absl::InvalidArgumentError("Failed to find codec"); } avcodec_ctx_ = avcodec_alloc_context3(avcodec_); avcodec_parameters_to_context(avcodec_ctx_, stream->codecpar); if (avcodec_open2(avcodec_ctx_, avcodec_, &avcodec_opts_) < 0) { return UnknownError("avcodec_open() failed."); } CHECK(avcodec_ctx_->codec); source_time_base_ = stream->time_base; source_frame_rate_ = stream->r_frame_rate; last_frame_time_regression_detected_ = false; MP_RETURN_IF_ERROR(ValidateSampleFormat()); bytes_per_sample_ = av_get_bytes_per_sample(avcodec_ctx_->sample_fmt); num_channels_ = avcodec_ctx_->channels; sample_rate_ = avcodec_ctx_->sample_rate; if (num_channels_ <= 0) { return UnknownError("num_channels must be strictly positive."); } if (sample_rate_ <= 0) { return UnknownError("sample_rate must be strictly positive."); } sample_time_base_ = {1, static_cast(sample_rate_)}; VLOG(0) << absl::Substitute( "Opened audio stream (id: $0, channels: $1, sample rate: $2, time base: " "$3/$4).", id_, num_channels_, sample_rate_, source_time_base_.num, source_time_base_.den); return absl::OkStatus(); } absl::Status AudioPacketProcessor::ValidateSampleFormat() { switch (avcodec_ctx_->sample_fmt) { case AV_SAMPLE_FMT_S16: case AV_SAMPLE_FMT_S16P: case AV_SAMPLE_FMT_S32: case AV_SAMPLE_FMT_FLT: case AV_SAMPLE_FMT_FLTP: return absl::OkStatus(); default: return mediapipe::UnimplementedErrorBuilder(MEDIAPIPE_LOC) << "sample_fmt = " << avcodec_ctx_->sample_fmt; } } int64 AudioPacketProcessor::SampleNumberToTimestamp(const int64 sample_number) { return av_rescale_q(sample_number, sample_time_base_, source_time_base_); } int64 AudioPacketProcessor::TimestampToSampleNumber(const int64 timestamp) { return av_rescale_q(timestamp, source_time_base_, sample_time_base_); } int64 AudioPacketProcessor::TimestampToMicroseconds(const int64 timestamp) { return av_rescale_q(timestamp, source_time_base_, {1, 1000000}); } int64 AudioPacketProcessor::SampleNumberToMicroseconds( const int64 sample_number) { return av_rescale_q(sample_number, sample_time_base_, {1, 1000000}); } absl::Status AudioPacketProcessor::ProcessPacket(AVPacket* packet) { CHECK(packet); if (flushed_) { return UnknownError( "ProcessPacket was called, but AudioPacketProcessor is already " "finished."); } RET_CHECK_EQ(packet->stream_index, id_); decoded_frame_->nb_samples = 0; return Decode(*packet, options_.ignore_decode_failures()); } absl::Status AudioPacketProcessor::ProcessDecodedFrame(const AVPacket& packet) { RET_CHECK_EQ(decoded_frame_->channels, num_channels_); int buf_size_bytes = av_samples_get_buffer_size(nullptr, num_channels_, decoded_frame_->nb_samples, avcodec_ctx_->sample_fmt, 1); VLOG(3) << "Audio packet " << avcodec_ctx_->frame_number << " pts: " << TimestampToString(packet.pts) << " frame.pts:" << TimestampToString(decoded_frame_->pts) << " pkt_dts:" << TimestampToString(decoded_frame_->pkt_dts) << " dts:" << TimestampToString(packet.dts) << " size:" << packet.size << " decoded:" << buf_size_bytes; uint8* const* data_ptr = decoded_frame_->data; if (!data_ptr[0]) { return UnknownError("No data in audio frame."); } if (decoded_frame_->pts != AV_NOPTS_VALUE) { int64 pts = MaybeCorrectPtsForRollover(decoded_frame_->pts); if (num_frames_processed_ == 0) { expected_sample_number_ = TimestampToSampleNumber(pts); } const int64 expected_us = SampleNumberToMicroseconds(expected_sample_number_); const int64 actual_us = TimestampToMicroseconds(pts); if (absl::Microseconds(std::abs(expected_us - actual_us)) > absl::Seconds( absl::GetFlag(FLAGS_media_decoder_allowed_audio_gap_merge))) { LOG(ERROR) << "The expected time based on how many samples we have seen (" << expected_us << " microseconds) no longer matches the time based " "on what the audio stream is telling us (" << actual_us << " microseconds). The difference is more than " "--media_decoder_allowed_audio_gap_merge (" << absl::FormatDuration(absl::Seconds(absl::GetFlag( FLAGS_media_decoder_allowed_audio_gap_merge))) << " microseconds). Resetting the timestamps to track what " "the audio stream is telling us."; expected_sample_number_ = TimestampToSampleNumber(pts); } } MP_RETURN_IF_ERROR(AddAudioDataToBuffer( Timestamp(av_rescale_q(expected_sample_number_, sample_time_base_, output_time_base_)), data_ptr, buf_size_bytes)); ++num_frames_processed_; return absl::OkStatus(); } absl::Status AudioPacketProcessor::AddAudioDataToBuffer( const Timestamp output_timestamp, uint8* const* raw_audio, int buf_size_bytes) { if (buf_size_bytes == 0) { return absl::OkStatus(); } if (buf_size_bytes % (num_channels_ * bytes_per_sample_) != 0) { return UnknownError("Buffer is not an integral number of samples."); } const int64 num_samples = buf_size_bytes / bytes_per_sample_ / num_channels_; VLOG(3) << "Adding " << num_samples << " audio samples in " << num_channels_ << " channels to output."; auto current_frame = absl::make_unique(num_channels_, num_samples); const char* sample_ptr = nullptr; switch (avcodec_ctx_->sample_fmt) { case AV_SAMPLE_FMT_S16: sample_ptr = reinterpret_cast(raw_audio[0]); for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) { for (int channel = 0; channel < num_channels_; ++channel) { (*current_frame)(channel, sample_index) = PcmEncodedSampleToFloat(sample_ptr); sample_ptr += bytes_per_sample_; } } break; case AV_SAMPLE_FMT_S32: sample_ptr = reinterpret_cast(raw_audio[0]); for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) { for (int channel = 0; channel < num_channels_; ++channel) { (*current_frame)(channel, sample_index) = PcmEncodedSampleInt32ToFloat(sample_ptr); sample_ptr += bytes_per_sample_; } } break; case AV_SAMPLE_FMT_FLT: sample_ptr = reinterpret_cast(raw_audio[0]); for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) { for (int channel = 0; channel < num_channels_; ++channel) { (*current_frame)(channel, sample_index) = Uint32ToFloat(absl::little_endian::Load32(sample_ptr)); sample_ptr += bytes_per_sample_; } } break; case AV_SAMPLE_FMT_S16P: for (int channel = 0; channel < num_channels_; ++channel) { sample_ptr = reinterpret_cast(raw_audio[channel]); for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) { (*current_frame)(channel, sample_index) = PcmEncodedSampleToFloat(sample_ptr); sample_ptr += bytes_per_sample_; } } break; case AV_SAMPLE_FMT_FLTP: for (int channel = 0; channel < num_channels_; ++channel) { sample_ptr = reinterpret_cast(raw_audio[channel]); for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) { (*current_frame)(channel, sample_index) = Uint32ToFloat(absl::little_endian::Load32(sample_ptr)); sample_ptr += bytes_per_sample_; } } break; default: return mediapipe::UnimplementedErrorBuilder(MEDIAPIPE_LOC) << "sample_fmt = " << avcodec_ctx_->sample_fmt; } if (options_.output_regressing_timestamps() || last_timestamp_ == Timestamp::Unset() || output_timestamp > last_timestamp_) { buffer_.push_back(Adopt(current_frame.release()).At(output_timestamp)); last_timestamp_ = output_timestamp; if (last_frame_time_regression_detected_) { last_frame_time_regression_detected_ = false; LOG(INFO) << "Processor " << this << " resumed audio packet processing."; } } else if (!last_frame_time_regression_detected_) { last_frame_time_regression_detected_ = true; LOG(ERROR) << "Processor " << this << " is dropping an audio packet because the timestamps " "regressed. Was " << last_timestamp_ << " but got " << output_timestamp; } expected_sample_number_ += num_samples; return absl::OkStatus(); } absl::Status AudioPacketProcessor::FillHeader(TimeSeriesHeader* header) const { CHECK(header); header->set_sample_rate(sample_rate_); header->set_num_channels(num_channels_); return absl::OkStatus(); } int64 AudioPacketProcessor::MaybeCorrectPtsForRollover(int64 media_pts) { return options_.correct_pts_for_rollover() ? CorrectPtsForRollover(media_pts) : media_pts; } // AudioDecoder AudioDecoder::AudioDecoder() { av_register_all(); } AudioDecoder::~AudioDecoder() { absl::Status status = Close(); if (!status.ok()) { LOG(ERROR) << "Encountered error while closing media file: " << status.message(); } } absl::Status AudioDecoder::Initialize( const std::string& input_file, const mediapipe::AudioDecoderOptions options) { if (options.audio_stream().empty()) { return absl::InvalidArgumentError( "At least one audio_stream must be defined in AudioDecoderOptions"); } std::map stream_index_to_audio_options_index; int options_index = 0; for (const auto& audio_stream : options.audio_stream()) { InsertIfNotPresent(&stream_index_to_audio_options_index, audio_stream.stream_index(), options_index); ++options_index; } Cleanup> decoder_closer([this]() { absl::Status status = Close(); if (!status.ok()) { LOG(ERROR) << "Encountered error while closing media file: " << status.message(); } }); avformat_ctx_ = avformat_alloc_context(); if (avformat_open_input(&avformat_ctx_, input_file.c_str(), NULL, NULL) < 0) { return absl::InvalidArgumentError( absl::StrCat("Could not open file: ", input_file)); } if (avformat_find_stream_info(avformat_ctx_, NULL) < 0) { return absl::InvalidArgumentError(absl::StrCat( "Could not find stream information of file: ", input_file)); } std::map audio_options_index_to_stream_id; for (int current_audio_index = 0, stream_id = 0; stream_id < avformat_ctx_->nb_streams; ++stream_id) { AVStream* stream = avformat_ctx_->streams[stream_id]; AVCodecParameters* dec_param = stream->codecpar; switch (dec_param->codec_type) { case AVMEDIA_TYPE_AUDIO: { const int* options_index_ptr = FindOrNull( stream_index_to_audio_options_index, current_audio_index); if (options_index_ptr) { std::unique_ptr processor = absl::make_unique( options.audio_stream(*options_index_ptr)); if (!ContainsKey(audio_processor_, stream_id)) { LOG(INFO) << "Created audio processor " << processor.get() << " for file \"" << input_file << "\""; } else { LOG(ERROR) << "Stream " << stream_id << " already mapped to audio processor " << audio_processor_[stream_id].get(); } MP_RETURN_IF_ERROR(processor->Open(stream_id, stream)); audio_processor_.emplace(stream_id, std::move(processor)); CHECK(InsertIfNotPresent( &stream_index_to_stream_id_, options.audio_stream(*options_index_ptr).stream_index(), stream_id)); CHECK(InsertIfNotPresent(&stream_id_to_audio_options_index_, stream_id, *options_index_ptr)); CHECK(InsertIfNotPresent(&audio_options_index_to_stream_id, *options_index_ptr, stream_id)); } ++current_audio_index; break; } default: { // Ignore other stream types. } } } for (int i = 0; i < options.audio_stream_size(); ++i) { RET_CHECK(ContainsKey(audio_options_index_to_stream_id, i) || options.audio_stream(i).allow_missing()) << absl::StrCat("Could not find audio stream with index ", i, " in file ", input_file); } if (options.has_start_time()) { start_time_ = Timestamp::FromSeconds(options.start_time()); } if (options.has_end_time()) { end_time_ = Timestamp::FromSeconds(options.end_time()); } is_first_packet_.resize(avformat_ctx_->nb_streams, true); decoder_closer.release(); return absl::OkStatus(); } absl::Status AudioDecoder::GetData(int* options_index, Packet* data) { while (true) { for (auto& item : audio_processor_) { while (item.second && item.second->HasData()) { bool is_first_packet = is_first_packet_[item.first]; is_first_packet_[item.first] = false; *options_index = FindOrDie(stream_id_to_audio_options_index_, item.first); absl::Status status = item.second->GetData(data); // Ignore packets which are out of the requested timestamp range. if (start_time_ != Timestamp::Unset()) { if (is_first_packet && data->Timestamp() > start_time_) { LOG(ERROR) << "First packet in audio stream " << *options_index << " has timestamp " << data->Timestamp() << " which is after start time of " << start_time_ << "."; } if (data->Timestamp() < start_time_) { VLOG(1) << "Skipping audio frame with timestamp " << data->Timestamp() << " before start time " << start_time_; *data = Packet(); continue; } } if (end_time_ != Timestamp::Unset() && data->Timestamp() > end_time_) { VLOG(1) << "Skipping audio frame with timestamp " << data->Timestamp() << " after end time " << end_time_; // We are past the last timestamp we care about, close the // packet processor. We cannot remove the element from // audio_processor_ right now, because we need to continue // iterating through it. item.second->Close(); item.second.reset(nullptr); *data = Packet(); continue; } return status; } } if (flushed_) { MP_RETURN_IF_ERROR(Close()); return tool::StatusStop(); } MP_RETURN_IF_ERROR(ProcessPacket()); } return absl::OkStatus(); } absl::Status AudioDecoder::Close() { for (auto& item : audio_processor_) { if (item.second) { item.second->Close(); item.second.reset(nullptr); } } // Free the context. if (avformat_ctx_) { avformat_close_input(&avformat_ctx_); } return absl::OkStatus(); } absl::Status AudioDecoder::FillAudioHeader( const AudioStreamOptions& stream_option, TimeSeriesHeader* header) const { const std::unique_ptr* processor_ptr_ = FindOrNull( audio_processor_, FindOrDie(stream_index_to_stream_id_, stream_option.stream_index())); RET_CHECK(processor_ptr_ && *processor_ptr_) << "audio stream is not open."; MP_RETURN_IF_ERROR((*processor_ptr_)->FillHeader(header)); return absl::OkStatus(); } absl::Status AudioDecoder::ProcessPacket() { std::unique_ptr av_packet(new AVPacket()); av_init_packet(av_packet.get()); av_packet->size = 0; av_packet->data = nullptr; int ret = av_read_frame(avformat_ctx_, av_packet.get()); if (ret >= 0) { CHECK(av_packet->data) << "AVPacket does not include any data but " "av_read_frame was successful."; const int stream_id = av_packet->stream_index; auto audio_iterator = audio_processor_.find(stream_id); if (audio_iterator != audio_processor_.end()) { // This stream_id is belongs to an audio stream we care about. if (audio_iterator->second) { MP_RETURN_IF_ERROR( audio_iterator->second->ProcessPacket(av_packet.get())); } else { VLOG(3) << "processor for stream " << stream_id << " is nullptr."; } } else { VLOG(3) << "Ignoring packet for stream " << stream_id; } return absl::OkStatus(); } VLOG(1) << "Demuxing returned error (or EOF): " << AvErrorToString(ret); if (ret == AVERROR(EAGAIN)) { // EAGAIN is used to signify that the av_packet should be skipped // (maybe the demuxer is trying to re-sync). This definitely // occurs in the FLV and MpegT demuxers. return absl::OkStatus(); } // Unrecoverable demuxing error with details in avformat_ctx_->pb->error. int demuxing_error = avformat_ctx_->pb ? avformat_ctx_->pb->error : 0 /* no error */; if (ret == AVERROR_EOF && !demuxing_error) { VLOG(1) << "Reached EOF."; return Flush(); } RET_CHECK(!demuxing_error) << absl::Substitute( "Failed to read a frame: retval = $0 ($1), avformat_ctx_->pb->error = " "$2 ($3)", ret, AvErrorToString(ret), demuxing_error, AvErrorToString(demuxing_error)); if (is_first_packet_[av_packet->stream_index]) { RET_CHECK_FAIL() << "Couldn't even read the first frame; maybe a partial " "file with only metadata?"; } // Unrecoverable demuxing error without details. RET_CHECK_FAIL() << absl::Substitute( "Failed to read a frame: retval = $0 ($1)", ret, AvErrorToString(ret)); } absl::Status AudioDecoder::Flush() { std::vector statuses; for (auto& item : audio_processor_) { if (item.second) { statuses.push_back(item.second->Flush()); } } flushed_ = true; return tool::CombinedStatus("Error while flushing codecs: ", statuses); } } // namespace mediapipe