mediapipe/yt8m_inference/video_inference.py

import tensorflow as tf
import numpy as np
import os


def read_pb_file(pb_file_path):
    sequence_example = open(pb_file_path, 'rb').read()
    sequence = tf.train.SequenceExample.FromString(sequence_example)
    return sequence

def parse_pb_sequence(pb_file_path):
    seq = read_pb_file(pb_file_path)

    rgb_features = seq.feature_lists.feature_list['RGB/feature/floats'].feature
    #print(len(rgb_features), len(rgb_features[0].float_list.value))
    rgb_features_array = np.array([rgb_feature.float_list.value for rgb_feature in rgb_features])

    audio_features = seq.feature_lists.feature_list['AUDIO/feature/floats'].feature
    #print(len(audio_features), len(audio_features[0].float_list.value))
    audio_features_array = np.array([audio_feature.float_list.value for audio_feature in audio_features])

    n = min(rgb_features_array.shape[0], audio_features_array.shape[0])

    rgb_features_array, audio_features_array = rgb_features_array[:n, :], audio_features_array[:n, :]

    concatenated_features = np.concatenate([rgb_features_array, audio_features_array], axis=1)

    return concatenated_features

def individual_frames_to_segments(frames_feature_matrix, segment_size=5):
    num_frames, descriptor_dimensionality = frames_feature_matrix.shape
    num_segments = int(num_frames / segment_size)
    num_frames_included =  num_segments*segment_size
    frames_feature_matrix = frames_feature_matrix[:num_frames_included, :]
    segmented_version = frames_feature_matrix.reshape([num_segments, segment_size, descriptor_dimensionality])
    return segmented_version

class VideoInference:
    def __init__(self) -> None:
        model_path = "/tmp/mediapipe/saved_model"
        self.sess = tf.Session()
        meta_graph = tf.saved_model.load(export_dir=model_path, sess=self.sess, tags=['serve'])
        sig_def = meta_graph.signature_def[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]

        self.num_frames_placeholder = sig_def.inputs['num_frames'].name
        self.rgb_and_audio_placeholder = sig_def.inputs['rgb_and_audio'].name
        self.output_name = sig_def.outputs['predictions'].name
        self.labels = self.get_label_map()

    def get_label_map(self):
        with open("/mediapipe/mediapipe/graphs/youtube8m/label_map.txt", 'rb') as file:
            lines = file.readlines()
            labels = [line.rstrip().decode("utf-8") for line in lines]
            return labels

    def extract_video_features(self, video_path):
        command = "/mediapipe/extract_video_features.sh " +  video_path
        print(command)
        os.system(command)


    def infer(self, features_pb_filepath):
        f = parse_pb_sequence(features_pb_filepath)
        #print(f.shape)
        rgb_and_audio_segments = individual_frames_to_segments(f, segment_size=6)
        #print(rgb_and_audio_segments.shape)
        num_frames_array = np.ones(shape=[rgb_and_audio_segments.shape[0], 1], dtype=np.int32) * rgb_and_audio_segments.shape[1]
        with tf.Graph().as_default() as g:
            predictions = self.sess.run(self.output_name, feed_dict= {self.num_frames_placeholder: num_frames_array, self.rgb_and_audio_placeholder:rgb_and_audio_segments})
        return predictions

    def aggregate_scores(self, predictions):
        top_prediction_per_frame = predictions.argmax(axis=-1)
        top_scores = predictions.max(axis=-1)
        top_prediction_per_frame[np.where(top_scores < 0.75)] = -1

        u, count = np.unique(top_prediction_per_frame, return_counts=True)
        count_sort_ind = np.argsort(-count)
        sorted_counts = count[count_sort_ind]
        sorted_predictions = u[count_sort_ind]
        print(sorted_predictions[:3], sorted_counts[:3])
        top_prediction = sorted_predictions[0]
        if top_prediction == -1:
            top_prediction = sorted_predictions[1]
        label = self.labels[top_prediction]

        return label

    def extract_features_and_infer(self, video_path):
        self.extract_video_features(video_path)
        print("Features Extracted, about to run inference")
        predictions = self.infer('/tmp/mediapipe/features.pb')
        video_label = v.aggregate_scores(predictions)
        return video_label


if __name__ == "__main__":
    import glob
    import os
    v = VideoInference()
    videos = glob.glob("/shared_volume/test_videos/*mp4")
    print(f"test_videos: {videos}")
    d = {}
    for video in videos:
        video_label = v.extract_features_and_infer(video)
        print(video_label)
        d[os.path.basename(video)] = video_label

    print(f"results: {d}")