mediapipe/mediapipe/util/audio_decoder.h

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MEDIAPIPE_UTIL_AUDIO_DECODER_H_
#define MEDIAPIPE_UTIL_AUDIO_DECODER_H_

#include <cstdint>  // required by avutil.h
#include <deque>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "absl/flags/flag.h"
#include "absl/time/time.h"
#include "mediapipe/framework/formats/time_series_header.pb.h"
#include "mediapipe/framework/packet.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/framework/timestamp.h"
#include "mediapipe/util/audio_decoder.pb.h"

extern "C" {
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavutil/avutil.h"
#include "libavutil/dict.h"
}

namespace mediapipe {

// Short names for the proto message types used throughout this header.
// NOTE(review): these using-declarations name mediapipe:: entities from
// inside namespace mediapipe, so they look redundant -- confirm before
// removing.
using mediapipe::AudioStreamOptions;
using mediapipe::TimeSeriesHeader;

// Abstract base class for a processor which handles decoding of a single
// stream. Subclasses must implement Open(), ProcessPacket() and
// ProcessDecodedFrame().
//
// NOTE(review): this class holds raw FFmpeg pointers and does not disable
// copy construction; ownership and release of those pointers is defined in
// the implementation file -- confirm that instances are never copied.
class BasePacketProcessor {
 public:
  BasePacketProcessor();
  virtual ~BasePacketProcessor();

  // Opens the codec for the given stream. `id` is the stream ID to process
  // (presumably recorded in id_ -- confirm in the implementation).
  virtual absl::Status Open(int id, AVStream* stream) = 0;

  // Processes a packet of data.  Caller retains ownership of packet.
  virtual absl::Status ProcessPacket(AVPacket* packet) = 0;

  // Returns true if the processor has data immediately available
  // (without providing more data with ProcessPacket()).
  bool HasData();

  // Fills packet with the next frame of data.  Returns an empty packet
  // if there is nothing to return.
  absl::Status GetData(Packet* packet);

  // Once no more AVPackets are available in the file, each stream must
  // be flushed to get any remaining frames which the codec is buffering.
  absl::Status Flush();

  // Closes the Processor, this does not close the file.  You may not
  // call ProcessPacket() after calling Close().  Close() may be called
  // repeatedly.
  void Close();

 protected:
  // Decodes frames in a packet.
  // NOTE(review): `ignore_decode_failures` presumably suppresses decoder
  // errors instead of returning them as a failed status -- confirm in the
  // implementation.
  virtual absl::Status Decode(const AVPacket& packet,
                              bool ignore_decode_failures);

  // Processes a decoded frame. `packet` is the source packet the frame was
  // decoded from.
  virtual absl::Status ProcessDecodedFrame(const AVPacket& packet) = 0;

  // Corrects the given PTS for MPEG PTS rollover. Assumed to be called with
  // the PTS of each frame in decode order. We detect a rollover whenever the
  // PTS timestamp changes by more than 2^33/2 (half the timestamp space). For
  // video this means every 26.5h with 1 PTS tick = 1/90000 of a second.
  // Example timeline:
  // CorrectPtsForRollover(0) -> 0
  // CorrectPtsForRollover(42) -> 42
  // CorrectPtsForRollover(2^33 - 1) -> 2^33 - 1
  // CorrectPtsForRollover(0) -> 2^33  // PTS in media rolls over, corrected.
  // CorrectPtsForRollover(1) -> 2^33 + 1
  int64 CorrectPtsForRollover(int64 media_pts);

  // FFmpeg decoding state. All pointers start out null.
  AVCodecContext* avcodec_ctx_ = nullptr;
  const AVCodec* avcodec_ = nullptr;
  AVDictionary* avcodec_opts_ = nullptr;
  AVFrame* decoded_frame_ = nullptr;

  // Stream ID this object processes. -1 until one has been assigned.
  int id_ = -1;

  // Set to true if the stream has been flushed and no more AVPackets
  // will be processed with it.
  bool flushed_ = false;

  // The source time base. No in-class initializer; it must be set by the
  // constructor or Open() before use.
  AVRational source_time_base_;
  // The output time base. Const, so it must be initialized by the
  // constructor.
  const AVRational output_time_base_;

  // The source frame rate (estimated from header information). No in-class
  // initializer.
  AVRational source_frame_rate_;

  // The number of frames that were successfully processed.
  int64 num_frames_processed_ = 0;

  // Size in bytes of a single sample (presumably per channel -- confirm).
  int bytes_per_sample_ = 0;

  // True if a time regression has been detected for the last frame.
  bool last_frame_time_regression_detected_ = false;

  // The last rollover corrected PTS returned by CorrectPtsForRollover.
  // AV_NOPTS_VALUE until a PTS has been recorded.
  int64 rollover_corrected_last_pts_ = AV_NOPTS_VALUE;

  // The buffer of current frames.
  std::deque<Packet> buffer_;
};

// Class which decodes packets from a single audio stream. Implements the
// BasePacketProcessor interface and is configured through an
// AudioStreamOptions message.
class AudioPacketProcessor : public BasePacketProcessor {
 public:
  // `options` configures this processor; a copy is kept in options_.
  explicit AudioPacketProcessor(const AudioStreamOptions& options);

  // Opens the codec for the audio stream `stream`, identified by `id`.
  absl::Status Open(int id, AVStream* stream) override;

  // Processes a packet of audio data.  Caller retains ownership of packet.
  absl::Status ProcessPacket(AVPacket* packet) override;

  // Fills `header` with the description of the audio this processor outputs
  // (presumably num_channels_ and sample_rate_ -- confirm in the
  // implementation).
  absl::Status FillHeader(TimeSeriesHeader* header) const;

 private:
  // Appends audio in buffer(s) to the output buffer (buffer_).
  absl::Status AddAudioDataToBuffer(const Timestamp output_timestamp,
                                    uint8* const* raw_audio,
                                    int buf_size_bytes);

  // Converts a number of samples into an approximate stream timestamp value,
  // and back.
  int64 SampleNumberToTimestamp(const int64 sample_number);
  int64 TimestampToSampleNumber(const int64 timestamp);

  // Converts a timestamp/sample number to microseconds.
  int64 TimestampToMicroseconds(const int64 timestamp);
  int64 SampleNumberToMicroseconds(const int64 sample_number);

  // Returns an error if the stream's sample format is not supported.
  // NOTE(review): the previous comment referred to
  // avformat_ctx_.sample_format, but this class has no avformat_ctx_ member;
  // presumably the format is read from avcodec_ctx_ -- confirm.
  absl::Status ValidateSampleFormat();

  // Processes a decoded audio frame.  The frame must have been filled in
  // before calling this function.
  // NOTE(review): the previous comment named the frame audio_frame_, which is
  // not a member of this class; presumably decoded_frame_ is meant -- confirm.
  absl::Status ProcessDecodedFrame(const AVPacket& packet) override;

  // Corrects PTS for rollover if correction is enabled.
  int64 MaybeCorrectPtsForRollover(int64 media_pts);

  // Number of channels to output. This value might be different from
  // the actual number of channels for the current AVPacket, found in
  // avcodec_ctx_->channels. -1 until set.
  int num_channels_ = -1;

  // Sample rate of the data to output. This value might be different
  // from the actual sample rate for the current AVPacket, found in
  // avcodec_ctx_->sample_rate. -1 until set.
  int64 sample_rate_ = -1;

  // The time base of audio samples (i.e. the reciprocal of the sample rate).
  // No in-class initializer; it must be set before use.
  AVRational sample_time_base_;

  // The timestamp of the last packet added to the buffer.
  Timestamp last_timestamp_;

  // The expected sample number based on counting samples.
  int64 expected_sample_number_ = 0;

  // Options for the processor.
  AudioStreamOptions options_;
};

// Decode the audio streams of a media file.  The AudioDecoder is responsible
// for demuxing the audio streams in the container format, whereas decoding of
// the content is delegated to AudioPacketProcessor.
//
// Instances are neither copyable nor movable: the decoder holds a raw
// AVFormatContext pointer and a map of uniquely-owned AudioPacketProcessors.
class AudioDecoder {
 public:
  AudioDecoder();
  ~AudioDecoder();

  // Copying could never compile (the processor map holds std::unique_ptr
  // values), but the implicitly declared copy operations made type traits
  // report this class as copyable and produced errors deep inside std::map.
  // Deleting them explicitly states the intent and yields a clear diagnostic.
  AudioDecoder(const AudioDecoder&) = delete;
  AudioDecoder& operator=(const AudioDecoder&) = delete;

  // Prepares the decoder to read `input_file` using `options`.
  // NOTE(review): `options` is taken by value; switching to a const reference
  // would avoid a copy but requires changing the out-of-line definition too.
  absl::Status Initialize(const std::string& input_file,
                          const mediapipe::AudioDecoderOptions options);

  // Retrieves the next piece of decoded data into `data`.
  // NOTE(review): `options_index` presumably receives the index of the audio
  // stream options entry the data belongs to (see
  // stream_id_to_audio_options_index_) -- confirm in the implementation.
  absl::Status GetData(int* options_index, Packet* data);

  // Closes the decoder.
  absl::Status Close();

  // Fills `header` for the audio stream selected by `stream_option`.
  absl::Status FillAudioHeader(const AudioStreamOptions& stream_option,
                               TimeSeriesHeader* header) const;

 private:
  // Internal demuxing steps; see the implementation file for details.
  absl::Status ProcessPacket();
  absl::Status Flush();

  // Lookup tables between stream IDs, container stream indices, audio options
  // indices and the per-stream processors. The integer key of
  // audio_processor_ is presumably the stream ID -- confirm.
  std::map<int, int> stream_id_to_audio_options_index_;
  std::map<int, int> stream_index_to_stream_id_;
  std::map<int, std::unique_ptr<AudioPacketProcessor>> audio_processor_;

  // Indexed by container stream index, true if the stream has not seen
  // a packet (whether returned or not), and false otherwise.
  std::vector<bool> is_first_packet_;
  // True once the decoder has been flushed.
  bool flushed_ = false;

  // Time range associated with decoding; both are unset by default.
  Timestamp start_time_ = Timestamp::Unset();
  Timestamp end_time_ = Timestamp::Unset();

  // Demuxer context for the opened input; null until one is assigned.
  AVFormatContext* avformat_ctx_ = nullptr;
};

}  // namespace mediapipe

#endif  // MEDIAPIPE_UTIL_AUDIO_DECODER_H_