7c331ad58b
GitOrigin-RevId: 6e4aff1cc351be3ae4537b677f36d139ee50ce09
831 lines
30 KiB
C++
831 lines
30 KiB
C++
// Copyright 2019 The MediaPipe Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "mediapipe/util/audio_decoder.h"
|
|
|
|
#include <algorithm>
|
|
#include <cstdint> // required by avutil.h
|
|
#include <cstdlib>
|
|
#include <memory>
|
|
#include <string>
|
|
|
|
#include "Eigen/Core"
|
|
#include "absl/base/internal/endian.h"
|
|
#include "absl/strings/numbers.h"
|
|
#include "absl/strings/str_cat.h"
|
|
#include "absl/strings/substitute.h"
|
|
#include "absl/time/time.h"
|
|
#include "mediapipe/framework/deps/cleanup.h"
|
|
#include "mediapipe/framework/formats/matrix.h"
|
|
#include "mediapipe/framework/port/map_util.h"
|
|
#include "mediapipe/framework/port/ret_check.h"
|
|
#include "mediapipe/framework/port/status.h"
|
|
#include "mediapipe/framework/tool/status_util.h"
|
|
|
|
extern "C" {
|
|
#include "libavcodec/avcodec.h"
|
|
#include "libavformat/avformat.h"
|
|
#include "libavutil/avutil.h"
|
|
#include "libavutil/mem.h"
|
|
#include "libavutil/samplefmt.h"
|
|
}
|
|
|
|
ABSL_FLAG(int64_t, media_decoder_allowed_audio_gap_merge, 5,
|
|
"The time gap forwards or backwards in the audio to ignore. "
|
|
"Timestamps in media files are restricted by the container format "
|
|
"and stream codec and are invariably not accurate to exact sample "
|
|
"numbers. If the discrepency between time based on counting "
|
|
"samples and based on the container timestamps grows beyond this "
|
|
"value it will be reset to the value in the audio stream and "
|
|
"counting based on samples will resume.");
|
|
|
|
namespace mediapipe {
|
|
|
|
// MPEG PTS max value + 1, used to correct for PTS rollover. Unit is PTS ticks.
|
|
const int64 kMpegPtsEpoch = 1LL << 33;
|
|
// Maximum PTS change between frames. Larger changes are considered to indicate
|
|
// the MPEG PTS has rolled over. Unit is PTS ticks.
|
|
const int64 kMpegPtsMaxDelta = kMpegPtsEpoch / 2;
|
|
|
|
// BasePacketProcessor
|
|
namespace {
|
|
|
|
inline std::string TimestampToString(int64 timestamp) {
|
|
if (timestamp == AV_NOPTS_VALUE) {
|
|
return "NOPTS";
|
|
}
|
|
return absl::StrCat(timestamp);
|
|
}
|
|
|
|
float Uint32ToFloat(uint32 raw_value) {
|
|
float value;
|
|
memcpy(&value, &raw_value, 4);
|
|
return value;
|
|
}
|
|
|
|
std::string AvErrorToString(int error) {
|
|
if (error >= 0) {
|
|
return absl::StrCat("Not an error (", error, ")");
|
|
}
|
|
|
|
switch (error) {
|
|
case AVERROR(EINVAL):
|
|
return "AVERROR(EINVAL) - unknown error or invalid data";
|
|
case AVERROR(EIO):
|
|
return "AVERROR(EIO) - I/O error";
|
|
case AVERROR(EDOM):
|
|
return "AVERROR(EDOM) - Number syntax expected in filename.";
|
|
case AVERROR(ENOMEM):
|
|
return "AVERROR(ENOMEM) - not enough memory";
|
|
case AVERROR(EILSEQ):
|
|
return "AVERROR(EILSEQ) - unknown format";
|
|
case AVERROR(ENOSYS):
|
|
return "AVERROR(ENOSYS) - Operation not supported.";
|
|
case AVERROR(ENOENT):
|
|
return "AVERROR(ENOENT) - No such file or directory.";
|
|
case AVERROR(EPIPE):
|
|
return "AVERROR(EPIPE) - End of file.";
|
|
case AVERROR_BSF_NOT_FOUND:
|
|
return "AVERROR_BSF_NOT_FOUND - Bitstream filter not found.";
|
|
case AVERROR_BUG:
|
|
return "AVERROR_BUG - Internal bug, should not have happened.";
|
|
case AVERROR_BUG2:
|
|
return "AVERROR_BUG2 - Internal bug, should not have happened.";
|
|
case AVERROR_BUFFER_TOO_SMALL:
|
|
return "AVERROR_BUFFER_TOO_SMALL - Buffer too small.";
|
|
case AVERROR_DECODER_NOT_FOUND:
|
|
return "AVERROR_DECODER_NOT_FOUND - Decoder not found.";
|
|
case AVERROR_DEMUXER_NOT_FOUND:
|
|
return "AVERROR_DEMUXER_NOT_FOUND - Demuxer not found.";
|
|
case AVERROR_ENCODER_NOT_FOUND:
|
|
return "AVERROR_ENCODER_NOT_FOUND - Encoder not found.";
|
|
case AVERROR_EOF:
|
|
return "AVERROR_EOF - End of file.";
|
|
case AVERROR_EXIT:
|
|
return "AVERROR_EXIT - Immediate exit was requested.";
|
|
case AVERROR_EXTERNAL:
|
|
return "AVERROR_EXTERNAL - Generic error in an external library.";
|
|
case AVERROR_FILTER_NOT_FOUND:
|
|
return "AVERROR_FILTER_NOT_FOUND - Filter not found.";
|
|
case AVERROR_INVALIDDATA:
|
|
return "AVERROR_INVALIDDATA - Invalid data found when processing input.";
|
|
case AVERROR_MUXER_NOT_FOUND:
|
|
return "AVERROR_MUXER_NOT_FOUND - Muxer not found.";
|
|
case AVERROR_OPTION_NOT_FOUND:
|
|
return "AVERROR_OPTION_NOT_FOUND - Option not found.";
|
|
case AVERROR_PATCHWELCOME:
|
|
return "AVERROR_PATCHWELCOME - Not yet implemented in FFmpeg, "
|
|
"patches welcome.";
|
|
case AVERROR_PROTOCOL_NOT_FOUND:
|
|
return "AVERROR_PROTOCOL_NOT_FOUND - Protocol not found.";
|
|
case AVERROR_STREAM_NOT_FOUND:
|
|
return "AVERROR_STREAM_NOT_FOUND - Stream not found.";
|
|
case AVERROR_EXPERIMENTAL:
|
|
return "AVERROR_EXPERIMENTAL - Requested feature is flagged "
|
|
"experimental.";
|
|
case AVERROR_INPUT_CHANGED:
|
|
return "AVERROR_INPUT_CHANGED - Input changed between calls.";
|
|
case AVERROR_OUTPUT_CHANGED:
|
|
return "AVERROR_OUTPUT_CHANGED - Output changed between calls.";
|
|
default:
|
|
// FALLTHRU
|
|
{}
|
|
}
|
|
|
|
char buf[AV_ERROR_MAX_STRING_SIZE];
|
|
if (av_strerror(error, buf, sizeof(buf)) == 0) {
|
|
return absl::StrCat("AVERROR(", error, ") - ", buf);
|
|
}
|
|
|
|
return absl::StrCat("Unknown AVERROR number ", error);
|
|
}
|
|
|
|
// Send a packet to the decoder.
|
|
absl::Status SendPacket(const AVPacket& packet, AVCodecContext* avcodec_ctx) {
|
|
const int error = avcodec_send_packet(avcodec_ctx, &packet);
|
|
if (error != 0 && error != AVERROR_EOF) {
|
|
// Not consider AVERROR_EOF as an error because it can happen when more
|
|
// than 1 flush packet is sent.
|
|
return UnknownError(absl::StrCat("Failed to send packet: error=", error,
|
|
" (", AvErrorToString(error),
|
|
"). Packet size: ", packet.size));
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
// Receive a decoded frame from the decoder.
|
|
absl::Status ReceiveFrame(AVCodecContext* avcodec_ctx, AVFrame* frame,
|
|
bool* received) {
|
|
const int error = avcodec_receive_frame(avcodec_ctx, frame);
|
|
*received = error == 0;
|
|
if (error != 0 && error != AVERROR_EOF && error != AVERROR(EAGAIN)) {
|
|
// Not consider AVERROR_EOF as an error because it can happen after a
|
|
// flush, and AVERROR(EAGAIN) because it happens when there's no (more)
|
|
// frame to be received from this packet.
|
|
return UnknownError(absl::StrCat(" Failed to receive frame: error=", error,
|
|
" (", AvErrorToString(error), ")."));
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status LogStatus(const absl::Status& status,
|
|
const AVCodecContext& avcodec_ctx,
|
|
const AVPacket& packet, bool always_return_ok_status) {
|
|
if (status.ok()) {
|
|
return status;
|
|
}
|
|
|
|
VLOG(3) << "Failed to process packet:"
|
|
<< " media_type:"
|
|
<< (avcodec_ctx.codec_type == AVMEDIA_TYPE_VIDEO ? "video" : "audio")
|
|
<< " codec_id:" << avcodec_ctx.codec_id
|
|
<< " frame_number:" << avcodec_ctx.frame_number
|
|
<< " pts:" << TimestampToString(packet.pts)
|
|
<< " dts:" << TimestampToString(packet.dts) << " size:" << packet.size
|
|
<< (packet.flags & AV_PKT_FLAG_KEY ? " Key Frame." : "");
|
|
|
|
if (always_return_ok_status) {
|
|
LOG(WARNING) << status.message();
|
|
return absl::OkStatus();
|
|
} else {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
class AVPacketDeleter {
|
|
public:
|
|
void operator()(void* x) const {
|
|
AVPacket* packet = static_cast<AVPacket*>(x);
|
|
if (packet) {
|
|
av_free_packet(packet);
|
|
delete packet;
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
BasePacketProcessor::BasePacketProcessor()
|
|
: decoded_frame_(av_frame_alloc()),
|
|
source_time_base_{0, 0},
|
|
output_time_base_{1, 1000000},
|
|
source_frame_rate_{0, 0} {}
|
|
|
|
BasePacketProcessor::~BasePacketProcessor() { Close(); }
|
|
|
|
bool BasePacketProcessor::HasData() { return !buffer_.empty(); }
|
|
|
|
absl::Status BasePacketProcessor::GetData(Packet* packet) {
|
|
CHECK(packet);
|
|
CHECK(!buffer_.empty());
|
|
*packet = buffer_.front();
|
|
buffer_.pop_front();
|
|
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status BasePacketProcessor::Flush() {
|
|
int64 last_num_frames_processed;
|
|
do {
|
|
std::unique_ptr<AVPacket, AVPacketDeleter> av_packet(new AVPacket());
|
|
av_init_packet(av_packet.get());
|
|
av_packet->size = 0;
|
|
av_packet->data = nullptr;
|
|
av_packet->stream_index = id_;
|
|
|
|
last_num_frames_processed = num_frames_processed_;
|
|
// ProcessPacket increments num_frames_processed_ if it is able to
|
|
// decode a frame. Not being able to decode a frame while being
|
|
// flushed signals that the codec is completely done.
|
|
MP_RETURN_IF_ERROR(ProcessPacket(av_packet.get()));
|
|
} while (last_num_frames_processed != num_frames_processed_);
|
|
|
|
flushed_ = true;
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
void BasePacketProcessor::Close() {
|
|
if (avcodec_ctx_) {
|
|
if (avcodec_ctx_->codec) {
|
|
avcodec_close(avcodec_ctx_);
|
|
av_free(avcodec_ctx_);
|
|
}
|
|
avcodec_ctx_ = nullptr;
|
|
}
|
|
if (avcodec_opts_) {
|
|
av_dict_free(&avcodec_opts_);
|
|
}
|
|
if (decoded_frame_) {
|
|
av_frame_free(&decoded_frame_);
|
|
}
|
|
}
|
|
|
|
absl::Status BasePacketProcessor::Decode(const AVPacket& packet,
|
|
bool ignore_decode_failures) {
|
|
MP_RETURN_IF_ERROR(LogStatus(SendPacket(packet, avcodec_ctx_), *avcodec_ctx_,
|
|
packet, ignore_decode_failures));
|
|
while (true) {
|
|
bool received;
|
|
MP_RETURN_IF_ERROR(
|
|
LogStatus(ReceiveFrame(avcodec_ctx_, decoded_frame_, &received),
|
|
*avcodec_ctx_, packet, ignore_decode_failures));
|
|
if (received) {
|
|
// Successfully decoded a frame (i.e., received it from the decoder). Now
|
|
// further process it.
|
|
MP_RETURN_IF_ERROR(ProcessDecodedFrame(packet));
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
int64 BasePacketProcessor::CorrectPtsForRollover(int64 media_pts) {
|
|
const int64 rollover_pts_media_bits = kMpegPtsEpoch - 1;
|
|
// Ensure PTS in range 0 ... kMpegPtsEpoch. This avoids errors from post
|
|
// decode PTS corrections that overflow the epoch range (while still yielding
|
|
// the correct result as long as the corrections do not exceed
|
|
// kMpegPtsMaxDelta).
|
|
media_pts &= rollover_pts_media_bits;
|
|
if (rollover_corrected_last_pts_ == AV_NOPTS_VALUE) {
|
|
// First seen PTS.
|
|
rollover_corrected_last_pts_ = media_pts;
|
|
} else {
|
|
int64 prev_media_pts =
|
|
rollover_corrected_last_pts_ & rollover_pts_media_bits;
|
|
int64 pts_step = media_pts - prev_media_pts;
|
|
if (pts_step > kMpegPtsMaxDelta) {
|
|
pts_step = pts_step - kMpegPtsEpoch;
|
|
} else if (pts_step < -kMpegPtsMaxDelta) {
|
|
pts_step = kMpegPtsEpoch + pts_step;
|
|
}
|
|
rollover_corrected_last_pts_ =
|
|
std::max((int64)0, rollover_corrected_last_pts_ + pts_step);
|
|
}
|
|
return rollover_corrected_last_pts_;
|
|
}
|
|
|
|
// AudioPacketProcessor
|
|
namespace {
|
|
|
|
// Converts a PCM_S16LE-encoded input sample to float between -1 and 1.
|
|
inline float PcmEncodedSampleToFloat(const char* data) {
|
|
static const float kMultiplier = 1.f / (1 << 15);
|
|
return absl::little_endian::Load16(data) * kMultiplier;
|
|
}
|
|
|
|
// Converts a PCM_S32LE-encoded input sample to float between -1 and 1.
|
|
inline float PcmEncodedSampleInt32ToFloat(const char* data) {
|
|
static constexpr float kMultiplier = 1.f / (1u << 31);
|
|
return absl::little_endian::Load32(data) * kMultiplier;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
AudioPacketProcessor::AudioPacketProcessor(const AudioStreamOptions& options)
|
|
: sample_time_base_{0, 0}, options_(options) {
|
|
DCHECK(absl::little_endian::IsLittleEndian());
|
|
}
|
|
|
|
absl::Status AudioPacketProcessor::Open(int id, AVStream* stream) {
|
|
id_ = id;
|
|
avcodec_ = avcodec_find_decoder(stream->codecpar->codec_id);
|
|
if (!avcodec_) {
|
|
return absl::InvalidArgumentError("Failed to find codec");
|
|
}
|
|
avcodec_ctx_ = avcodec_alloc_context3(avcodec_);
|
|
avcodec_parameters_to_context(avcodec_ctx_, stream->codecpar);
|
|
if (avcodec_open2(avcodec_ctx_, avcodec_, &avcodec_opts_) < 0) {
|
|
return UnknownError("avcodec_open() failed.");
|
|
}
|
|
CHECK(avcodec_ctx_->codec);
|
|
|
|
source_time_base_ = stream->time_base;
|
|
source_frame_rate_ = stream->r_frame_rate;
|
|
last_frame_time_regression_detected_ = false;
|
|
|
|
MP_RETURN_IF_ERROR(ValidateSampleFormat());
|
|
bytes_per_sample_ = av_get_bytes_per_sample(avcodec_ctx_->sample_fmt);
|
|
num_channels_ = avcodec_ctx_->channels;
|
|
sample_rate_ = avcodec_ctx_->sample_rate;
|
|
|
|
if (num_channels_ <= 0) {
|
|
return UnknownError("num_channels must be strictly positive.");
|
|
}
|
|
if (sample_rate_ <= 0) {
|
|
return UnknownError("sample_rate must be strictly positive.");
|
|
}
|
|
|
|
sample_time_base_ = {1, static_cast<int>(sample_rate_)};
|
|
|
|
VLOG(0) << absl::Substitute(
|
|
"Opened audio stream (id: $0, channels: $1, sample rate: $2, time base: "
|
|
"$3/$4).",
|
|
id_, num_channels_, sample_rate_, source_time_base_.num,
|
|
source_time_base_.den);
|
|
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioPacketProcessor::ValidateSampleFormat() {
|
|
switch (avcodec_ctx_->sample_fmt) {
|
|
case AV_SAMPLE_FMT_S16:
|
|
case AV_SAMPLE_FMT_S16P:
|
|
case AV_SAMPLE_FMT_S32:
|
|
case AV_SAMPLE_FMT_FLT:
|
|
case AV_SAMPLE_FMT_FLTP:
|
|
return absl::OkStatus();
|
|
default:
|
|
return mediapipe::UnimplementedErrorBuilder(MEDIAPIPE_LOC)
|
|
<< "sample_fmt = " << avcodec_ctx_->sample_fmt;
|
|
}
|
|
}
|
|
|
|
int64 AudioPacketProcessor::SampleNumberToTimestamp(const int64 sample_number) {
|
|
return av_rescale_q(sample_number, sample_time_base_, source_time_base_);
|
|
}
|
|
|
|
int64 AudioPacketProcessor::TimestampToSampleNumber(const int64 timestamp) {
|
|
return av_rescale_q(timestamp, source_time_base_, sample_time_base_);
|
|
}
|
|
|
|
int64 AudioPacketProcessor::TimestampToMicroseconds(const int64 timestamp) {
|
|
return av_rescale_q(timestamp, source_time_base_, {1, 1000000});
|
|
}
|
|
|
|
int64 AudioPacketProcessor::SampleNumberToMicroseconds(
|
|
const int64 sample_number) {
|
|
return av_rescale_q(sample_number, sample_time_base_, {1, 1000000});
|
|
}
|
|
|
|
absl::Status AudioPacketProcessor::ProcessPacket(AVPacket* packet) {
|
|
CHECK(packet);
|
|
if (flushed_) {
|
|
return UnknownError(
|
|
"ProcessPacket was called, but AudioPacketProcessor is already "
|
|
"finished.");
|
|
}
|
|
RET_CHECK_EQ(packet->stream_index, id_);
|
|
|
|
decoded_frame_->nb_samples = 0;
|
|
return Decode(*packet, options_.ignore_decode_failures());
|
|
}
|
|
|
|
absl::Status AudioPacketProcessor::ProcessDecodedFrame(const AVPacket& packet) {
|
|
RET_CHECK_EQ(decoded_frame_->channels, num_channels_);
|
|
int buf_size_bytes = av_samples_get_buffer_size(nullptr, num_channels_,
|
|
decoded_frame_->nb_samples,
|
|
avcodec_ctx_->sample_fmt, 1);
|
|
VLOG(3) << "Audio packet " << avcodec_ctx_->frame_number
|
|
<< " pts: " << TimestampToString(packet.pts)
|
|
<< " frame.pts:" << TimestampToString(decoded_frame_->pts)
|
|
<< " pkt_dts:" << TimestampToString(decoded_frame_->pkt_dts)
|
|
<< " dts:" << TimestampToString(packet.dts) << " size:" << packet.size
|
|
<< " decoded:" << buf_size_bytes;
|
|
uint8* const* data_ptr = decoded_frame_->data;
|
|
if (!data_ptr[0]) {
|
|
return UnknownError("No data in audio frame.");
|
|
}
|
|
if (decoded_frame_->pts != AV_NOPTS_VALUE) {
|
|
int64 pts = MaybeCorrectPtsForRollover(decoded_frame_->pts);
|
|
if (num_frames_processed_ == 0) {
|
|
expected_sample_number_ = TimestampToSampleNumber(pts);
|
|
}
|
|
|
|
const int64 expected_us =
|
|
SampleNumberToMicroseconds(expected_sample_number_);
|
|
const int64 actual_us = TimestampToMicroseconds(pts);
|
|
if (absl::Microseconds(std::abs(expected_us - actual_us)) >
|
|
absl::Seconds(
|
|
absl::GetFlag(FLAGS_media_decoder_allowed_audio_gap_merge))) {
|
|
LOG(ERROR) << "The expected time based on how many samples we have seen ("
|
|
<< expected_us
|
|
<< " microseconds) no longer matches the time based "
|
|
"on what the audio stream is telling us ("
|
|
<< actual_us
|
|
<< " microseconds). The difference is more than "
|
|
"--media_decoder_allowed_audio_gap_merge ("
|
|
<< absl::FormatDuration(absl::Seconds(absl::GetFlag(
|
|
FLAGS_media_decoder_allowed_audio_gap_merge)))
|
|
<< " microseconds). Resetting the timestamps to track what "
|
|
"the audio stream is telling us.";
|
|
expected_sample_number_ = TimestampToSampleNumber(pts);
|
|
}
|
|
}
|
|
|
|
MP_RETURN_IF_ERROR(AddAudioDataToBuffer(
|
|
Timestamp(av_rescale_q(expected_sample_number_, sample_time_base_,
|
|
output_time_base_)),
|
|
data_ptr, buf_size_bytes));
|
|
|
|
++num_frames_processed_;
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioPacketProcessor::AddAudioDataToBuffer(
|
|
const Timestamp output_timestamp, uint8* const* raw_audio,
|
|
int buf_size_bytes) {
|
|
if (buf_size_bytes == 0) {
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
if (buf_size_bytes % (num_channels_ * bytes_per_sample_) != 0) {
|
|
return UnknownError("Buffer is not an integral number of samples.");
|
|
}
|
|
|
|
const int64 num_samples = buf_size_bytes / bytes_per_sample_ / num_channels_;
|
|
VLOG(3) << "Adding " << num_samples << " audio samples in " << num_channels_
|
|
<< " channels to output.";
|
|
auto current_frame = absl::make_unique<Matrix>(num_channels_, num_samples);
|
|
|
|
const char* sample_ptr = nullptr;
|
|
switch (avcodec_ctx_->sample_fmt) {
|
|
case AV_SAMPLE_FMT_S16:
|
|
sample_ptr = reinterpret_cast<const char*>(raw_audio[0]);
|
|
for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) {
|
|
for (int channel = 0; channel < num_channels_; ++channel) {
|
|
(*current_frame)(channel, sample_index) =
|
|
PcmEncodedSampleToFloat(sample_ptr);
|
|
sample_ptr += bytes_per_sample_;
|
|
}
|
|
}
|
|
break;
|
|
case AV_SAMPLE_FMT_S32:
|
|
sample_ptr = reinterpret_cast<const char*>(raw_audio[0]);
|
|
for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) {
|
|
for (int channel = 0; channel < num_channels_; ++channel) {
|
|
(*current_frame)(channel, sample_index) =
|
|
PcmEncodedSampleInt32ToFloat(sample_ptr);
|
|
sample_ptr += bytes_per_sample_;
|
|
}
|
|
}
|
|
break;
|
|
case AV_SAMPLE_FMT_FLT:
|
|
sample_ptr = reinterpret_cast<const char*>(raw_audio[0]);
|
|
for (int64 sample_index = 0; sample_index < num_samples; ++sample_index) {
|
|
for (int channel = 0; channel < num_channels_; ++channel) {
|
|
(*current_frame)(channel, sample_index) =
|
|
Uint32ToFloat(absl::little_endian::Load32(sample_ptr));
|
|
sample_ptr += bytes_per_sample_;
|
|
}
|
|
}
|
|
break;
|
|
case AV_SAMPLE_FMT_S16P:
|
|
for (int channel = 0; channel < num_channels_; ++channel) {
|
|
sample_ptr = reinterpret_cast<const char*>(raw_audio[channel]);
|
|
for (int64 sample_index = 0; sample_index < num_samples;
|
|
++sample_index) {
|
|
(*current_frame)(channel, sample_index) =
|
|
PcmEncodedSampleToFloat(sample_ptr);
|
|
sample_ptr += bytes_per_sample_;
|
|
}
|
|
}
|
|
break;
|
|
case AV_SAMPLE_FMT_FLTP:
|
|
for (int channel = 0; channel < num_channels_; ++channel) {
|
|
sample_ptr = reinterpret_cast<const char*>(raw_audio[channel]);
|
|
for (int64 sample_index = 0; sample_index < num_samples;
|
|
++sample_index) {
|
|
(*current_frame)(channel, sample_index) =
|
|
Uint32ToFloat(absl::little_endian::Load32(sample_ptr));
|
|
sample_ptr += bytes_per_sample_;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
return mediapipe::UnimplementedErrorBuilder(MEDIAPIPE_LOC)
|
|
<< "sample_fmt = " << avcodec_ctx_->sample_fmt;
|
|
}
|
|
|
|
if (options_.output_regressing_timestamps() ||
|
|
last_timestamp_ == Timestamp::Unset() ||
|
|
output_timestamp > last_timestamp_) {
|
|
buffer_.push_back(Adopt(current_frame.release()).At(output_timestamp));
|
|
last_timestamp_ = output_timestamp;
|
|
if (last_frame_time_regression_detected_) {
|
|
last_frame_time_regression_detected_ = false;
|
|
LOG(INFO) << "Processor " << this << " resumed audio packet processing.";
|
|
}
|
|
} else if (!last_frame_time_regression_detected_) {
|
|
last_frame_time_regression_detected_ = true;
|
|
LOG(ERROR) << "Processor " << this
|
|
<< " is dropping an audio packet because the timestamps "
|
|
"regressed. Was "
|
|
<< last_timestamp_ << " but got " << output_timestamp;
|
|
}
|
|
expected_sample_number_ += num_samples;
|
|
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioPacketProcessor::FillHeader(TimeSeriesHeader* header) const {
|
|
CHECK(header);
|
|
header->set_sample_rate(sample_rate_);
|
|
header->set_num_channels(num_channels_);
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
int64 AudioPacketProcessor::MaybeCorrectPtsForRollover(int64 media_pts) {
|
|
return options_.correct_pts_for_rollover() ? CorrectPtsForRollover(media_pts)
|
|
: media_pts;
|
|
}
|
|
|
|
// AudioDecoder
|
|
AudioDecoder::AudioDecoder() { av_register_all(); }
|
|
|
|
AudioDecoder::~AudioDecoder() {
|
|
absl::Status status = Close();
|
|
if (!status.ok()) {
|
|
LOG(ERROR) << "Encountered error while closing media file: "
|
|
<< status.message();
|
|
}
|
|
}
|
|
|
|
absl::Status AudioDecoder::Initialize(
|
|
const std::string& input_file,
|
|
const mediapipe::AudioDecoderOptions options) {
|
|
if (options.audio_stream().empty()) {
|
|
return absl::InvalidArgumentError(
|
|
"At least one audio_stream must be defined in AudioDecoderOptions");
|
|
}
|
|
std::map<int, int> stream_index_to_audio_options_index;
|
|
int options_index = 0;
|
|
for (const auto& audio_stream : options.audio_stream()) {
|
|
InsertIfNotPresent(&stream_index_to_audio_options_index,
|
|
audio_stream.stream_index(), options_index);
|
|
++options_index;
|
|
}
|
|
|
|
Cleanup<std::function<void()>> decoder_closer([this]() {
|
|
absl::Status status = Close();
|
|
if (!status.ok()) {
|
|
LOG(ERROR) << "Encountered error while closing media file: "
|
|
<< status.message();
|
|
}
|
|
});
|
|
|
|
avformat_ctx_ = avformat_alloc_context();
|
|
if (avformat_open_input(&avformat_ctx_, input_file.c_str(), NULL, NULL) < 0) {
|
|
return absl::InvalidArgumentError(
|
|
absl::StrCat("Could not open file: ", input_file));
|
|
}
|
|
|
|
if (avformat_find_stream_info(avformat_ctx_, NULL) < 0) {
|
|
return absl::InvalidArgumentError(absl::StrCat(
|
|
"Could not find stream information of file: ", input_file));
|
|
}
|
|
|
|
std::map<int, int> audio_options_index_to_stream_id;
|
|
for (int current_audio_index = 0, stream_id = 0;
|
|
stream_id < avformat_ctx_->nb_streams; ++stream_id) {
|
|
AVStream* stream = avformat_ctx_->streams[stream_id];
|
|
AVCodecParameters* dec_param = stream->codecpar;
|
|
switch (dec_param->codec_type) {
|
|
case AVMEDIA_TYPE_AUDIO: {
|
|
const int* options_index_ptr = FindOrNull(
|
|
stream_index_to_audio_options_index, current_audio_index);
|
|
if (options_index_ptr) {
|
|
std::unique_ptr<AudioPacketProcessor> processor =
|
|
absl::make_unique<AudioPacketProcessor>(
|
|
options.audio_stream(*options_index_ptr));
|
|
if (!ContainsKey(audio_processor_, stream_id)) {
|
|
LOG(INFO) << "Created audio processor " << processor.get()
|
|
<< " for file \"" << input_file << "\"";
|
|
} else {
|
|
LOG(ERROR) << "Stream " << stream_id
|
|
<< " already mapped to audio processor "
|
|
<< audio_processor_[stream_id].get();
|
|
}
|
|
|
|
MP_RETURN_IF_ERROR(processor->Open(stream_id, stream));
|
|
audio_processor_.emplace(stream_id, std::move(processor));
|
|
CHECK(InsertIfNotPresent(
|
|
&stream_index_to_stream_id_,
|
|
options.audio_stream(*options_index_ptr).stream_index(),
|
|
stream_id));
|
|
CHECK(InsertIfNotPresent(&stream_id_to_audio_options_index_,
|
|
stream_id, *options_index_ptr));
|
|
CHECK(InsertIfNotPresent(&audio_options_index_to_stream_id,
|
|
*options_index_ptr, stream_id));
|
|
}
|
|
++current_audio_index;
|
|
break;
|
|
}
|
|
default: {
|
|
// Ignore other stream types.
|
|
}
|
|
}
|
|
}
|
|
for (int i = 0; i < options.audio_stream_size(); ++i) {
|
|
RET_CHECK(ContainsKey(audio_options_index_to_stream_id, i) ||
|
|
options.audio_stream(i).allow_missing())
|
|
<< absl::StrCat("Could not find audio stream with index ", i,
|
|
" in file ", input_file);
|
|
}
|
|
|
|
if (options.has_start_time()) {
|
|
start_time_ = Timestamp::FromSeconds(options.start_time());
|
|
}
|
|
if (options.has_end_time()) {
|
|
end_time_ = Timestamp::FromSeconds(options.end_time());
|
|
}
|
|
is_first_packet_.resize(avformat_ctx_->nb_streams, true);
|
|
|
|
decoder_closer.release();
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioDecoder::GetData(int* options_index, Packet* data) {
|
|
while (true) {
|
|
for (auto& item : audio_processor_) {
|
|
while (item.second && item.second->HasData()) {
|
|
bool is_first_packet = is_first_packet_[item.first];
|
|
is_first_packet_[item.first] = false;
|
|
*options_index =
|
|
FindOrDie(stream_id_to_audio_options_index_, item.first);
|
|
absl::Status status = item.second->GetData(data);
|
|
// Ignore packets which are out of the requested timestamp range.
|
|
if (start_time_ != Timestamp::Unset()) {
|
|
if (is_first_packet && data->Timestamp() > start_time_) {
|
|
LOG(ERROR) << "First packet in audio stream " << *options_index
|
|
<< " has timestamp " << data->Timestamp()
|
|
<< " which is after start time of " << start_time_
|
|
<< ".";
|
|
}
|
|
if (data->Timestamp() < start_time_) {
|
|
VLOG(1) << "Skipping audio frame with timestamp "
|
|
<< data->Timestamp() << " before start time "
|
|
<< start_time_;
|
|
*data = Packet();
|
|
continue;
|
|
}
|
|
}
|
|
if (end_time_ != Timestamp::Unset() && data->Timestamp() > end_time_) {
|
|
VLOG(1) << "Skipping audio frame with timestamp " << data->Timestamp()
|
|
<< " after end time " << end_time_;
|
|
// We are past the last timestamp we care about, close the
|
|
// packet processor. We cannot remove the element from
|
|
// audio_processor_ right now, because we need to continue
|
|
// iterating through it.
|
|
item.second->Close();
|
|
item.second.reset(nullptr);
|
|
*data = Packet();
|
|
continue;
|
|
}
|
|
return status;
|
|
}
|
|
}
|
|
if (flushed_) {
|
|
MP_RETURN_IF_ERROR(Close());
|
|
return tool::StatusStop();
|
|
}
|
|
MP_RETURN_IF_ERROR(ProcessPacket());
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioDecoder::Close() {
|
|
for (auto& item : audio_processor_) {
|
|
if (item.second) {
|
|
item.second->Close();
|
|
item.second.reset(nullptr);
|
|
}
|
|
}
|
|
// Free the context.
|
|
if (avformat_ctx_) {
|
|
avformat_close_input(&avformat_ctx_);
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioDecoder::FillAudioHeader(
|
|
const AudioStreamOptions& stream_option, TimeSeriesHeader* header) const {
|
|
const std::unique_ptr<AudioPacketProcessor>* processor_ptr_ = FindOrNull(
|
|
audio_processor_,
|
|
FindOrDie(stream_index_to_stream_id_, stream_option.stream_index()));
|
|
|
|
RET_CHECK(processor_ptr_ && *processor_ptr_) << "audio stream is not open.";
|
|
MP_RETURN_IF_ERROR((*processor_ptr_)->FillHeader(header));
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
absl::Status AudioDecoder::ProcessPacket() {
|
|
std::unique_ptr<AVPacket, AVPacketDeleter> av_packet(new AVPacket());
|
|
av_init_packet(av_packet.get());
|
|
av_packet->size = 0;
|
|
av_packet->data = nullptr;
|
|
int ret = av_read_frame(avformat_ctx_, av_packet.get());
|
|
if (ret >= 0) {
|
|
CHECK(av_packet->data) << "AVPacket does not include any data but "
|
|
"av_read_frame was successful.";
|
|
const int stream_id = av_packet->stream_index;
|
|
auto audio_iterator = audio_processor_.find(stream_id);
|
|
if (audio_iterator != audio_processor_.end()) {
|
|
// This stream_id is belongs to an audio stream we care about.
|
|
if (audio_iterator->second) {
|
|
MP_RETURN_IF_ERROR(
|
|
audio_iterator->second->ProcessPacket(av_packet.get()));
|
|
} else {
|
|
VLOG(3) << "processor for stream " << stream_id << " is nullptr.";
|
|
}
|
|
} else {
|
|
VLOG(3) << "Ignoring packet for stream " << stream_id;
|
|
}
|
|
return absl::OkStatus();
|
|
}
|
|
VLOG(1) << "Demuxing returned error (or EOF): " << AvErrorToString(ret);
|
|
if (ret == AVERROR(EAGAIN)) {
|
|
// EAGAIN is used to signify that the av_packet should be skipped
|
|
// (maybe the demuxer is trying to re-sync). This definitely
|
|
// occurs in the FLV and MpegT demuxers.
|
|
return absl::OkStatus();
|
|
}
|
|
|
|
// Unrecoverable demuxing error with details in avformat_ctx_->pb->error.
|
|
int demuxing_error =
|
|
avformat_ctx_->pb ? avformat_ctx_->pb->error : 0 /* no error */;
|
|
if (ret == AVERROR_EOF && !demuxing_error) {
|
|
VLOG(1) << "Reached EOF.";
|
|
return Flush();
|
|
}
|
|
|
|
RET_CHECK(!demuxing_error) << absl::Substitute(
|
|
"Failed to read a frame: retval = $0 ($1), avformat_ctx_->pb->error = "
|
|
"$2 ($3)",
|
|
ret, AvErrorToString(ret), demuxing_error,
|
|
AvErrorToString(demuxing_error));
|
|
|
|
if (is_first_packet_[av_packet->stream_index]) {
|
|
RET_CHECK_FAIL() << "Couldn't even read the first frame; maybe a partial "
|
|
"file with only metadata?";
|
|
}
|
|
|
|
// Unrecoverable demuxing error without details.
|
|
RET_CHECK_FAIL() << absl::Substitute(
|
|
"Failed to read a frame: retval = $0 ($1)", ret, AvErrorToString(ret));
|
|
}
|
|
|
|
absl::Status AudioDecoder::Flush() {
|
|
std::vector<absl::Status> statuses;
|
|
for (auto& item : audio_processor_) {
|
|
if (item.second) {
|
|
statuses.push_back(item.second->Flush());
|
|
}
|
|
}
|
|
flushed_ = true;
|
|
return tool::CombinedStatus("Error while flushing codecs: ", statuses);
|
|
}
|
|
|
|
} // namespace mediapipe
|