From e9a8b458f35a0bd197c7e5c19afda9c72fe81046 Mon Sep 17 00:00:00 2001
From: Omar Sayed Mostafa
Date: Sun, 9 Jan 2022 16:43:30 +0200
Subject: [PATCH] Add video inference for the YouTube-8M model

Adds a shell helper that runs the MediaPipe YT8M feature-extraction graph
on a single video, plus a Python wrapper that feeds the extracted RGB/audio
features to the YT8M saved model and aggregates the per-segment predictions
into one label per video.
---
 yt8m_inference/extract_video_features.sh |  21 ++++
 yt8m_inference/video_inference.py        | 131 ++++++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 yt8m_inference/extract_video_features.sh
 create mode 100644 yt8m_inference/video_inference.py

diff --git a/yt8m_inference/extract_video_features.sh b/yt8m_inference/extract_video_features.sh
new file mode 100644
index 000000000..e14f98e4f
--- /dev/null
+++ b/yt8m_inference/extract_video_features.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -euo pipefail
+
+video_name="$1"
+echo "$video_name"
+
+# Probe the clip duration and truncate it to whole seconds.
+time_float=$(ffprobe -i "$video_name" -show_entries format=duration -v quiet -of csv="p=0")
+time_int=${time_float%.*}
+echo "$time_int"
+
+# Build the metadata SequenceExample consumed by the feature-extraction graph.
+python -m mediapipe.examples.desktop.youtube8m.generate_input_sequence_example \
+    --path_to_input_video="$video_name" \
+    --clip_end_time_sec="$time_int"
+
+# Run the MediaPipe YT8M feature extractor; features land in /tmp/mediapipe/features.pb.
+GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/youtube8m/extract_yt8m_features \
+    --calculator_graph_config_file=mediapipe/graphs/youtube8m/feature_extraction.pbtxt \
+    --input_side_packets=input_sequence_example=/tmp/mediapipe/metadata.pb \
+    --output_side_packets=output_sequence_example=/tmp/mediapipe/features.pb
diff --git a/yt8m_inference/video_inference.py b/yt8m_inference/video_inference.py
new file mode 100644
index 000000000..d9617d000
--- /dev/null
+++ b/yt8m_inference/video_inference.py
@@ -0,0 +1,131 @@
+import glob
+import os
+
+import numpy as np
+import tensorflow.compat.v1 as tf
+
+# The YT8M saved model is a TF1-style graph, so run it in a session.
+tf.disable_eager_execution()
+
+
+def read_pb_file(pb_file_path):
+    """Read a serialized tf.train.SequenceExample from disk."""
+    with open(pb_file_path, 'rb') as f:
+        return tf.train.SequenceExample.FromString(f.read())
+
+
+def parse_pb_sequence(pb_file_path):
+    """Return the per-frame RGB and audio features, concatenated per frame."""
+    seq = read_pb_file(pb_file_path)
+
+    rgb_features = seq.feature_lists.feature_list['RGB/feature/floats'].feature
+    rgb_features_array = np.array(
+        [rgb_feature.float_list.value for rgb_feature in rgb_features])
+
+    audio_features = seq.feature_lists.feature_list['AUDIO/feature/floats'].feature
+    audio_features_array = np.array(
+        [audio_feature.float_list.value for audio_feature in audio_features])
+
+    # The two streams can differ in length by a frame; truncate to the shorter.
+    n = min(rgb_features_array.shape[0], audio_features_array.shape[0])
+    rgb_features_array = rgb_features_array[:n, :]
+    audio_features_array = audio_features_array[:n, :]
+
+    return np.concatenate([rgb_features_array, audio_features_array], axis=1)
+
+
+def individual_frames_to_segments(frames_feature_matrix, segment_size=5):
+    """Reshape (num_frames, dim) features into (num_segments, segment_size, dim)."""
+    num_frames, descriptor_dimensionality = frames_feature_matrix.shape
+    num_segments = num_frames // segment_size
+    num_frames_included = num_segments * segment_size
+    frames_feature_matrix = frames_feature_matrix[:num_frames_included, :]
+    return frames_feature_matrix.reshape(
+        [num_segments, segment_size, descriptor_dimensionality])
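+
+# Shape sketch (assumes the standard YT8M frame descriptors: 1024-D RGB
+# concatenated with 128-D audio, i.e. 1152 floats per frame):
+#   >>> feats = np.zeros((32, 1152))
+#   >>> individual_frames_to_segments(feats, segment_size=5).shape
+#   (6, 5, 1152)
+# The trailing num_frames % segment_size frames (two here) are dropped.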
+
+
+class VideoInference:
+    def __init__(self):
+        model_path = "/tmp/mediapipe/saved_model"
+        self.sess = tf.Session()
+        meta_graph = tf.saved_model.load(
+            sess=self.sess, tags=['serve'], export_dir=model_path)
+        sig_def = meta_graph.signature_def[
+            tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+
+        self.num_frames_placeholder = sig_def.inputs['num_frames'].name
+        self.rgb_and_audio_placeholder = sig_def.inputs['rgb_and_audio'].name
+        self.output_name = sig_def.outputs['predictions'].name
+        self.labels = self.get_label_map()
+
+    def get_label_map(self):
+        with open("/mediapipe/mediapipe/graphs/youtube8m/label_map.txt", 'rb') as file:
+            return [line.rstrip().decode("utf-8") for line in file]
+
+    def extract_video_features(self, video_path):
+        # Quote the path so spaces survive the shell.
+        command = f'/mediapipe/extract_video_features.sh "{video_path}"'
+        print(command)
+        os.system(command)
+
+    def infer(self, features_pb_filepath):
+        features = parse_pb_sequence(features_pb_filepath)
+        rgb_and_audio_segments = individual_frames_to_segments(features, segment_size=6)
+        # Every segment contains the same number of frames.
+        num_frames_array = np.ones(
+            shape=[rgb_and_audio_segments.shape[0], 1],
+            dtype=np.int32) * rgb_and_audio_segments.shape[1]
+        return self.sess.run(
+            self.output_name,
+            feed_dict={self.num_frames_placeholder: num_frames_array,
+                       self.rgb_and_audio_placeholder: rgb_and_audio_segments})
+
+    def aggregate_scores(self, predictions):
+        """Majority-vote the per-segment top classes, ignoring low-confidence ones."""
+        top_prediction_per_segment = predictions.argmax(axis=-1)
+        top_scores = predictions.max(axis=-1)
+        # Segments whose best score is below 0.75 cast a "no prediction" (-1) vote.
+        top_prediction_per_segment[np.where(top_scores < 0.75)] = -1
+
+        u, count = np.unique(top_prediction_per_segment, return_counts=True)
+        count_sort_ind = np.argsort(-count)
+        sorted_counts = count[count_sort_ind]
+        sorted_predictions = u[count_sort_ind]
+        print(sorted_predictions[:3], sorted_counts[:3])
+        top_prediction = sorted_predictions[0]
+        if top_prediction == -1:
+            if len(sorted_predictions) == 1:
+                return "unknown"  # every segment fell below the threshold
+            top_prediction = sorted_predictions[1]
+        return self.labels[top_prediction]
+
+    def extract_features_and_infer(self, video_path):
+        self.extract_video_features(video_path)
+        print("Features extracted, about to run inference")
+        predictions = self.infer('/tmp/mediapipe/features.pb')
+        return self.aggregate_scores(predictions)
+
+
+if __name__ == "__main__":
+    v = VideoInference()
+    videos = glob.glob("/shared_volume/test_videos/*mp4")
+    print(f"test_videos: {videos}")
+    d = {}
+    for video in videos:
+        video_label = v.extract_features_and_infer(video)
+        print(video_label)
+        d[os.path.basename(video)] = video_label
+
+    print(f"results: {d}")
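+
+    # Optionally persist the per-video labels; the JSON path below is
+    # illustrative, not part of the original pipeline:
+    # import json
+    # with open("/shared_volume/results.json", "w") as fp:
+    #     json.dump(d, fp, indent=2)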