From c9ebc6fa606888542ad89b978c2658c127d4226f Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 6 Jan 2023 21:34:46 -0800 Subject: [PATCH] Use synthetic timestamps in Web when none provided PiperOrigin-RevId: 500327275 --- .../audio_classifier/audio_classifier.ts | 5 ++++- .../audio/audio_embedder/audio_embedder.ts | 18 ++++++++++++------ .../tasks/web/audio/core/audio_task_runner.ts | 5 ++++- mediapipe/tasks/web/core/task_runner.ts | 18 +++++++++++++++++- .../text/text_classifier/text_classifier.ts | 10 ++++++---- .../web/text/text_embedder/text_embedder.ts | 19 ++++++++++++------- .../web/vision/core/vision_task_runner.ts | 6 +++++- .../gesture_recognizer/gesture_recognizer.ts | 12 ++++++++---- .../vision/hand_landmarker/hand_landmarker.ts | 9 ++++++--- .../image_classifier/image_classifier.ts | 3 ++- .../vision/image_embedder/image_embedder.ts | 8 +++++--- .../vision/object_detector/object_detector.ts | 5 +++-- 12 files changed, 84 insertions(+), 34 deletions(-) diff --git a/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts b/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts index 92fca93ad..e26ead6a9 100644 --- a/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts +++ b/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts @@ -126,6 +126,8 @@ export class AudioClassifier extends AudioTaskRunner { return this.applyOptions(options); } + // TODO: Add a classifyStream() that takes a timestamp + /** * Performs audio classification on the provided audio clip and waits * synchronously for the response. 
@@ -194,8 +196,9 @@ export class AudioClassifier extends AudioTaskRunner { graphConfig.addNode(classifierNode); this.graphRunner.attachProtoVectorListener( - TIMESTAMPED_CLASSIFICATIONS_STREAM, binaryProtos => { + TIMESTAMPED_CLASSIFICATIONS_STREAM, (binaryProtos, timestamp) => { this.addJsAudioClassificationResults(binaryProtos); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts b/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts index 2e210f969..7411f95ef 100644 --- a/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts +++ b/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts @@ -128,6 +128,8 @@ export class AudioEmbedder extends AudioTaskRunner { return this.applyOptions(options); } + // TODO: Add an embedStream() that takes a timestamp + /** * Performs embeding extraction on the provided audio clip and waits * synchronously for the response. 
@@ -193,20 +195,24 @@ export class AudioEmbedder extends AudioTaskRunner { graphConfig.addNode(embedderNode); - this.graphRunner.attachProtoListener(EMBEDDINGS_STREAM, binaryProto => { - const embeddingResult = EmbeddingResult.deserializeBinary(binaryProto); - this.embeddingResults.push( - convertFromEmbeddingResultProto(embeddingResult)); - }); + this.graphRunner.attachProtoListener( + EMBEDDINGS_STREAM, (binaryProto, timestamp) => { + const embeddingResult = + EmbeddingResult.deserializeBinary(binaryProto); + this.embeddingResults.push( + convertFromEmbeddingResultProto(embeddingResult)); + this.setLatestOutputTimestamp(timestamp); + }); this.graphRunner.attachProtoVectorListener( - TIMESTAMPED_EMBEDDINGS_STREAM, data => { + TIMESTAMPED_EMBEDDINGS_STREAM, (data, timestamp) => { for (const binaryProto of data) { const embeddingResult = EmbeddingResult.deserializeBinary(binaryProto); this.embeddingResults.push( convertFromEmbeddingResultProto(embeddingResult)); } + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/audio/core/audio_task_runner.ts b/mediapipe/tasks/web/audio/core/audio_task_runner.ts index 24d78378d..ff39185f2 100644 --- a/mediapipe/tasks/web/audio/core/audio_task_runner.ts +++ b/mediapipe/tasks/web/audio/core/audio_task_runner.ts @@ -36,8 +36,11 @@ export abstract class AudioTaskRunner extends TaskRunner { /** Sends a single audio clip to the graph and awaits results. */ protected processAudioClip(audioData: Float32Array, sampleRate?: number): T { + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. + const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; return this.process( - audioData, sampleRate ?? this.defaultSampleRate, performance.now()); + audioData, sampleRate ?? 
this.defaultSampleRate, syntheticTimestamp); } } diff --git a/mediapipe/tasks/web/core/task_runner.ts b/mediapipe/tasks/web/core/task_runner.ts index c2679b773..8d483d9ff 100644 --- a/mediapipe/tasks/web/core/task_runner.ts +++ b/mediapipe/tasks/web/core/task_runner.ts @@ -50,7 +50,7 @@ export async function createTaskRunner( } }; - // Initialize a canvas if requested. If OffscreenCanvas is availble, we + // Initialize a canvas if requested. If OffscreenCanvas is available, we // let the graph runner initialize it by passing `undefined`. const canvas = initializeCanvas ? (typeof OffscreenCanvas === 'undefined' ? document.createElement('canvas') : @@ -66,6 +66,7 @@ export async function createTaskRunner( export abstract class TaskRunner { protected abstract baseOptions: BaseOptionsProto; private processingErrors: Error[] = []; + private latestOutputTimestamp = 0; /** * Creates a new instance of a Mediapipe Task. Determines if SIMD is @@ -162,6 +163,21 @@ export abstract class TaskRunner { this.handleErrors(); } + /** * Sets the latest output timestamp received from the graph (in ms). * Timestamps that are smaller than the currently latest output timestamp are * ignored. */ + protected setLatestOutputTimestamp(timestamp: number): void { + this.latestOutputTimestamp = + Math.max(this.latestOutputTimestamp, timestamp); + } + + /** Returns the latest output timestamp. */ + protected getLatestOutputTimestamp() { + return this.latestOutputTimestamp; + } + + /** Throws the error from the error listener if an error was raised. 
*/ private handleErrors() { try { diff --git a/mediapipe/tasks/web/text/text_classifier/text_classifier.ts b/mediapipe/tasks/web/text/text_classifier/text_classifier.ts index 6aef1b3e4..ff314cfc3 100644 --- a/mediapipe/tasks/web/text/text_classifier/text_classifier.ts +++ b/mediapipe/tasks/web/text/text_classifier/text_classifier.ts @@ -131,10 +131,11 @@ export class TextClassifier extends TaskRunner { * @return The classification result of the text */ classify(text: string): TextClassifierResult { - // Get classification result by running our MediaPipe graph. + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. + const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; this.classificationResult = {classifications: []}; - this.graphRunner.addStringToStream( - text, INPUT_STREAM, /* timestamp= */ performance.now()); + this.graphRunner.addStringToStream(text, INPUT_STREAM, syntheticTimestamp); this.finishProcessing(); return this.classificationResult; } @@ -158,9 +159,10 @@ export class TextClassifier extends TaskRunner { graphConfig.addNode(classifierNode); this.graphRunner.attachProtoListener( - CLASSIFICATIONS_STREAM, binaryProto => { + CLASSIFICATIONS_STREAM, (binaryProto, timestamp) => { this.classificationResult = convertFromClassificationResultProto( ClassificationResult.deserializeBinary(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/text/text_embedder/text_embedder.ts b/mediapipe/tasks/web/text/text_embedder/text_embedder.ts index db7986dec..daa1d24ed 100644 --- a/mediapipe/tasks/web/text/text_embedder/text_embedder.ts +++ b/mediapipe/tasks/web/text/text_embedder/text_embedder.ts @@ -135,9 +135,10 @@ export class TextEmbedder extends TaskRunner { * @return The embedding resuls of the text */ embed(text: string): TextEmbedderResult { - // Get text embeddings by running our 
MediaPipe graph. - this.graphRunner.addStringToStream( - text, INPUT_STREAM, /* timestamp= */ performance.now()); + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. + const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; + this.graphRunner.addStringToStream(text, INPUT_STREAM, syntheticTimestamp); this.finishProcessing(); return this.embeddingResult; } @@ -173,10 +174,14 @@ export class TextEmbedder extends TaskRunner { graphConfig.addNode(embedderNode); - this.graphRunner.attachProtoListener(EMBEDDINGS_STREAM, binaryProto => { - const embeddingResult = EmbeddingResult.deserializeBinary(binaryProto); - this.embeddingResult = convertFromEmbeddingResultProto(embeddingResult); - }); + this.graphRunner.attachProtoListener( + EMBEDDINGS_STREAM, (binaryProto, timestamp) => { + const embeddingResult = + EmbeddingResult.deserializeBinary(binaryProto); + this.embeddingResult = + convertFromEmbeddingResultProto(embeddingResult); + this.setLatestOutputTimestamp(timestamp); + }); const binaryGraph = graphConfig.serializeBinary(); this.setGraph(new Uint8Array(binaryGraph), /* isBinary= */ true); diff --git a/mediapipe/tasks/web/vision/core/vision_task_runner.ts b/mediapipe/tasks/web/vision/core/vision_task_runner.ts index 9adc810fc..9ed9ffdb2 100644 --- a/mediapipe/tasks/web/vision/core/vision_task_runner.ts +++ b/mediapipe/tasks/web/vision/core/vision_task_runner.ts @@ -71,7 +71,11 @@ export abstract class VisionTaskRunner extends TaskRunner { 'Task is not initialized with image mode. ' + '\'runningMode\' must be set to \'image\'.'); } - this.process(image, imageProcessingOptions, performance.now()); + + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. 
+ const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; + this.process(image, imageProcessingOptions, syntheticTimestamp); } /** Sends a single video frame to the graph and awaits results. */ diff --git a/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts b/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts index e0c6affcb..48efc4855 100644 --- a/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts +++ b/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts @@ -380,23 +380,27 @@ export class GestureRecognizer extends VisionTaskRunner { graphConfig.addNode(recognizerNode); this.graphRunner.attachProtoVectorListener( - LANDMARKS_STREAM, binaryProto => { + LANDMARKS_STREAM, (binaryProto, timestamp) => { this.addJsLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - WORLD_LANDMARKS_STREAM, binaryProto => { + WORLD_LANDMARKS_STREAM, (binaryProto, timestamp) => { this.adddJsWorldLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - HAND_GESTURES_STREAM, binaryProto => { + HAND_GESTURES_STREAM, (binaryProto, timestamp) => { // Gesture index is not used, because the final gesture result comes // from multiple classifiers. 
this.gestures.push( ...this.toJsCategories(binaryProto, /* populateIndex= */ false)); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - HANDEDNESS_STREAM, binaryProto => { + HANDEDNESS_STREAM, (binaryProto, timestamp) => { this.handednesses.push(...this.toJsCategories(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts b/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts index e238bc96f..b51fb6a52 100644 --- a/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts +++ b/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts @@ -313,16 +313,19 @@ export class HandLandmarker extends VisionTaskRunner { graphConfig.addNode(landmarkerNode); this.graphRunner.attachProtoVectorListener( - LANDMARKS_STREAM, binaryProto => { + LANDMARKS_STREAM, (binaryProto, timestamp) => { this.addJsLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - WORLD_LANDMARKS_STREAM, binaryProto => { + WORLD_LANDMARKS_STREAM, (binaryProto, timestamp) => { this.adddJsWorldLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - HANDEDNESS_STREAM, binaryProto => { + HANDEDNESS_STREAM, (binaryProto, timestamp) => { this.handednesses.push(...this.toJsCategories(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts b/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts index 2ad4a821d..cb2849cd8 100644 --- a/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts +++ b/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts @@ -187,9 +187,10 @@ export class ImageClassifier extends VisionTaskRunner { 
graphConfig.addNode(classifierNode); this.graphRunner.attachProtoListener( - CLASSIFICATIONS_STREAM, binaryProto => { + CLASSIFICATIONS_STREAM, (binaryProto, timestamp) => { this.classificationResult = convertFromClassificationResultProto( ClassificationResult.deserializeBinary(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts b/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts index 64a10f5f4..788646e6d 100644 --- a/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts +++ b/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts @@ -206,9 +206,11 @@ export class ImageEmbedder extends VisionTaskRunner { graphConfig.addNode(embedderNode); - this.graphRunner.attachProtoListener(EMBEDDINGS_STREAM, binaryProto => { - this.addJsImageEmdedding(binaryProto); - }); + this.graphRunner.attachProtoListener( + EMBEDDINGS_STREAM, (binaryProto, timestamp) => { + this.addJsImageEmdedding(binaryProto); + this.setLatestOutputTimestamp(timestamp); + }); const binaryGraph = graphConfig.serializeBinary(); this.setGraph(new Uint8Array(binaryGraph), /* isBinary= */ true); diff --git a/mediapipe/tasks/web/vision/object_detector/object_detector.ts b/mediapipe/tasks/web/vision/object_detector/object_detector.ts index 3a79c1b00..5741a3a0c 100644 --- a/mediapipe/tasks/web/vision/object_detector/object_detector.ts +++ b/mediapipe/tasks/web/vision/object_detector/object_detector.ts @@ -176,7 +176,7 @@ export class ObjectDetector extends VisionTaskRunner { } /** - * Performs object detection on the provided vidoe frame and waits + * Performs object detection on the provided video frame and waits * synchronously for the response. Only use this method when the * ObjectDetector is created with running mode `video`. 
* @@ -248,8 +248,9 @@ export class ObjectDetector extends VisionTaskRunner { graphConfig.addNode(detectorNode); this.graphRunner.attachProtoVectorListener( - DETECTIONS_STREAM, binaryProto => { + DETECTIONS_STREAM, (binaryProto, timestamp) => { this.addJsObjectDetections(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary();