From c9ebc6fa606888542ad89b978c2658c127d4226f Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 6 Jan 2023 21:34:46 -0800 Subject: [PATCH] Use synthetic timestamps in Web when none provided PiperOrigin-RevId: 500327275 --- .../audio_classifier/audio_classifier.ts | 5 ++++- .../audio/audio_embedder/audio_embedder.ts | 18 ++++++++++++------ .../tasks/web/audio/core/audio_task_runner.ts | 5 ++++- mediapipe/tasks/web/core/task_runner.ts | 18 +++++++++++++++++- .../text/text_classifier/text_classifier.ts | 10 ++++++---- .../web/text/text_embedder/text_embedder.ts | 19 ++++++++++++------- .../web/vision/core/vision_task_runner.ts | 6 +++++- .../gesture_recognizer/gesture_recognizer.ts | 12 ++++++++---- .../vision/hand_landmarker/hand_landmarker.ts | 9 ++++++--- .../image_classifier/image_classifier.ts | 3 ++- .../vision/image_embedder/image_embedder.ts | 8 +++++--- .../vision/object_detector/object_detector.ts | 5 +++-- 12 files changed, 84 insertions(+), 34 deletions(-) diff --git a/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts b/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts index 92fca93ad..e26ead6a9 100644 --- a/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts +++ b/mediapipe/tasks/web/audio/audio_classifier/audio_classifier.ts @@ -126,6 +126,8 @@ export class AudioClassifier extends AudioTaskRunner { return this.applyOptions(options); } + // TODO: Add a classifyStream() that takes a timestamp + /** * Performs audio classification on the provided audio clip and waits * synchronously for the response. 
@@ -194,8 +196,9 @@ export class AudioClassifier extends AudioTaskRunner { graphConfig.addNode(classifierNode); this.graphRunner.attachProtoVectorListener( - TIMESTAMPED_CLASSIFICATIONS_STREAM, binaryProtos => { + TIMESTAMPED_CLASSIFICATIONS_STREAM, (binaryProtos, timestamp) => { this.addJsAudioClassificationResults(binaryProtos); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts b/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts index 2e210f969..7411f95ef 100644 --- a/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts +++ b/mediapipe/tasks/web/audio/audio_embedder/audio_embedder.ts @@ -128,6 +128,8 @@ export class AudioEmbedder extends AudioTaskRunner { return this.applyOptions(options); } + // TODO: Add an embedStream() that takes a timestamp + /** * Performs embeding extraction on the provided audio clip and waits * synchronously for the response. 
@@ -193,20 +195,24 @@ export class AudioEmbedder extends AudioTaskRunner { graphConfig.addNode(embedderNode); - this.graphRunner.attachProtoListener(EMBEDDINGS_STREAM, binaryProto => { - const embeddingResult = EmbeddingResult.deserializeBinary(binaryProto); - this.embeddingResults.push( - convertFromEmbeddingResultProto(embeddingResult)); - }); + this.graphRunner.attachProtoListener( + EMBEDDINGS_STREAM, (binaryProto, timestamp) => { + const embeddingResult = + EmbeddingResult.deserializeBinary(binaryProto); + this.embeddingResults.push( + convertFromEmbeddingResultProto(embeddingResult)); + this.setLatestOutputTimestamp(timestamp); + }); this.graphRunner.attachProtoVectorListener( - TIMESTAMPED_EMBEDDINGS_STREAM, data => { + TIMESTAMPED_EMBEDDINGS_STREAM, (data, timestamp) => { for (const binaryProto of data) { const embeddingResult = EmbeddingResult.deserializeBinary(binaryProto); this.embeddingResults.push( convertFromEmbeddingResultProto(embeddingResult)); } + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/audio/core/audio_task_runner.ts b/mediapipe/tasks/web/audio/core/audio_task_runner.ts index 24d78378d..ff39185f2 100644 --- a/mediapipe/tasks/web/audio/core/audio_task_runner.ts +++ b/mediapipe/tasks/web/audio/core/audio_task_runner.ts @@ -36,8 +36,11 @@ export abstract class AudioTaskRunner extends TaskRunner { /** Sends a single audio clip to the graph and awaits results. */ protected processAudioClip(audioData: Float32Array, sampleRate?: number): T { + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. + const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; return this.process( - audioData, sampleRate ?? this.defaultSampleRate, performance.now()); + audioData, sampleRate ?? 
this.defaultSampleRate, syntheticTimestamp); } } diff --git a/mediapipe/tasks/web/core/task_runner.ts b/mediapipe/tasks/web/core/task_runner.ts index c2679b773..8d483d9ff 100644 --- a/mediapipe/tasks/web/core/task_runner.ts +++ b/mediapipe/tasks/web/core/task_runner.ts @@ -50,7 +50,7 @@ export async function createTaskRunner( } }; - // Initialize a canvas if requested. If OffscreenCanvas is availble, we + // Initialize a canvas if requested. If OffscreenCanvas is available, we // let the graph runner initialize it by passing `undefined`. const canvas = initializeCanvas ? (typeof OffscreenCanvas === 'undefined' ? document.createElement('canvas') : @@ -66,6 +66,7 @@ export async function createTaskRunner( export abstract class TaskRunner { protected abstract baseOptions: BaseOptionsProto; private processingErrors: Error[] = []; + private latestOutputTimestamp = 0; /** * Creates a new instance of a Mediapipe Task. Determines if SIMD is @@ -162,6 +163,21 @@ export abstract class TaskRunner { this.handleErrors(); } + /** * Sets the latest output timestamp received from the graph (in ms). * Timestamps that are smaller than the currently latest output timestamp are * ignored. */ + protected setLatestOutputTimestamp(timestamp: number): void { + this.latestOutputTimestamp = + Math.max(this.latestOutputTimestamp, timestamp); + } + + /** Returns the latest output timestamp. */ + protected getLatestOutputTimestamp() { + return this.latestOutputTimestamp; + } + + /** Throws the error from the error listener if an error was raised. 
*/ private handleErrors() { try { diff --git a/mediapipe/tasks/web/text/text_classifier/text_classifier.ts b/mediapipe/tasks/web/text/text_classifier/text_classifier.ts index 6aef1b3e4..ff314cfc3 100644 --- a/mediapipe/tasks/web/text/text_classifier/text_classifier.ts +++ b/mediapipe/tasks/web/text/text_classifier/text_classifier.ts @@ -131,10 +131,11 @@ export class TextClassifier extends TaskRunner { * @return The classification result of the text */ classify(text: string): TextClassifierResult { - // Get classification result by running our MediaPipe graph. + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. + const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; this.classificationResult = {classifications: []}; - this.graphRunner.addStringToStream( - text, INPUT_STREAM, /* timestamp= */ performance.now()); + this.graphRunner.addStringToStream(text, INPUT_STREAM, syntheticTimestamp); this.finishProcessing(); return this.classificationResult; } @@ -158,9 +159,10 @@ export class TextClassifier extends TaskRunner { graphConfig.addNode(classifierNode); this.graphRunner.attachProtoListener( - CLASSIFICATIONS_STREAM, binaryProto => { + CLASSIFICATIONS_STREAM, (binaryProto, timestamp) => { this.classificationResult = convertFromClassificationResultProto( ClassificationResult.deserializeBinary(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/text/text_embedder/text_embedder.ts b/mediapipe/tasks/web/text/text_embedder/text_embedder.ts index db7986dec..daa1d24ed 100644 --- a/mediapipe/tasks/web/text/text_embedder/text_embedder.ts +++ b/mediapipe/tasks/web/text/text_embedder/text_embedder.ts @@ -135,9 +135,10 @@ export class TextEmbedder extends TaskRunner { * @return The embedding resuls of the text */ embed(text: string): TextEmbedderResult { - // Get text embeddings by running our 
MediaPipe graph. - this.graphRunner.addStringToStream( - text, INPUT_STREAM, /* timestamp= */ performance.now()); + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. + const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; + this.graphRunner.addStringToStream(text, INPUT_STREAM, syntheticTimestamp); this.finishProcessing(); return this.embeddingResult; } @@ -173,10 +174,14 @@ export class TextEmbedder extends TaskRunner { graphConfig.addNode(embedderNode); - this.graphRunner.attachProtoListener(EMBEDDINGS_STREAM, binaryProto => { - const embeddingResult = EmbeddingResult.deserializeBinary(binaryProto); - this.embeddingResult = convertFromEmbeddingResultProto(embeddingResult); - }); + this.graphRunner.attachProtoListener( + EMBEDDINGS_STREAM, (binaryProto, timestamp) => { + const embeddingResult = + EmbeddingResult.deserializeBinary(binaryProto); + this.embeddingResult = + convertFromEmbeddingResultProto(embeddingResult); + this.setLatestOutputTimestamp(timestamp); + }); const binaryGraph = graphConfig.serializeBinary(); this.setGraph(new Uint8Array(binaryGraph), /* isBinary= */ true); diff --git a/mediapipe/tasks/web/vision/core/vision_task_runner.ts b/mediapipe/tasks/web/vision/core/vision_task_runner.ts index 9adc810fc..9ed9ffdb2 100644 --- a/mediapipe/tasks/web/vision/core/vision_task_runner.ts +++ b/mediapipe/tasks/web/vision/core/vision_task_runner.ts @@ -71,7 +71,11 @@ export abstract class VisionTaskRunner extends TaskRunner { 'Task is not initialized with image mode. ' + '\'runningMode\' must be set to \'image\'.'); } - this.process(image, imageProcessingOptions, performance.now()); + + // Increment the timestamp by 1 millisecond to guarantee that we send + // monotonically increasing timestamps to the graph. 
+ const syntheticTimestamp = this.getLatestOutputTimestamp() + 1; + this.process(image, imageProcessingOptions, syntheticTimestamp); } /** Sends a single video frame to the graph and awaits results. */ diff --git a/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts b/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts index e0c6affcb..48efc4855 100644 --- a/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts +++ b/mediapipe/tasks/web/vision/gesture_recognizer/gesture_recognizer.ts @@ -380,23 +380,27 @@ export class GestureRecognizer extends VisionTaskRunner { graphConfig.addNode(recognizerNode); this.graphRunner.attachProtoVectorListener( - LANDMARKS_STREAM, binaryProto => { + LANDMARKS_STREAM, (binaryProto, timestamp) => { this.addJsLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - WORLD_LANDMARKS_STREAM, binaryProto => { + WORLD_LANDMARKS_STREAM, (binaryProto, timestamp) => { this.adddJsWorldLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - HAND_GESTURES_STREAM, binaryProto => { + HAND_GESTURES_STREAM, (binaryProto, timestamp) => { // Gesture index is not used, because the final gesture result comes // from multiple classifiers. 
this.gestures.push( ...this.toJsCategories(binaryProto, /* populateIndex= */ false)); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - HANDEDNESS_STREAM, binaryProto => { + HANDEDNESS_STREAM, (binaryProto, timestamp) => { this.handednesses.push(...this.toJsCategories(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts b/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts index e238bc96f..b51fb6a52 100644 --- a/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts +++ b/mediapipe/tasks/web/vision/hand_landmarker/hand_landmarker.ts @@ -313,16 +313,19 @@ export class HandLandmarker extends VisionTaskRunner { graphConfig.addNode(landmarkerNode); this.graphRunner.attachProtoVectorListener( - LANDMARKS_STREAM, binaryProto => { + LANDMARKS_STREAM, (binaryProto, timestamp) => { this.addJsLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - WORLD_LANDMARKS_STREAM, binaryProto => { + WORLD_LANDMARKS_STREAM, (binaryProto, timestamp) => { this.adddJsWorldLandmarks(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); this.graphRunner.attachProtoVectorListener( - HANDEDNESS_STREAM, binaryProto => { + HANDEDNESS_STREAM, (binaryProto, timestamp) => { this.handednesses.push(...this.toJsCategories(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts b/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts index 2ad4a821d..cb2849cd8 100644 --- a/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts +++ b/mediapipe/tasks/web/vision/image_classifier/image_classifier.ts @@ -187,9 +187,10 @@ export class ImageClassifier extends VisionTaskRunner { 
graphConfig.addNode(classifierNode); this.graphRunner.attachProtoListener( - CLASSIFICATIONS_STREAM, binaryProto => { + CLASSIFICATIONS_STREAM, (binaryProto, timestamp) => { this.classificationResult = convertFromClassificationResultProto( ClassificationResult.deserializeBinary(binaryProto)); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary(); diff --git a/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts b/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts index 64a10f5f4..788646e6d 100644 --- a/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts +++ b/mediapipe/tasks/web/vision/image_embedder/image_embedder.ts @@ -206,9 +206,11 @@ export class ImageEmbedder extends VisionTaskRunner { graphConfig.addNode(embedderNode); - this.graphRunner.attachProtoListener(EMBEDDINGS_STREAM, binaryProto => { - this.addJsImageEmdedding(binaryProto); - }); + this.graphRunner.attachProtoListener( + EMBEDDINGS_STREAM, (binaryProto, timestamp) => { + this.addJsImageEmdedding(binaryProto); + this.setLatestOutputTimestamp(timestamp); + }); const binaryGraph = graphConfig.serializeBinary(); this.setGraph(new Uint8Array(binaryGraph), /* isBinary= */ true); diff --git a/mediapipe/tasks/web/vision/object_detector/object_detector.ts b/mediapipe/tasks/web/vision/object_detector/object_detector.ts index 3a79c1b00..5741a3a0c 100644 --- a/mediapipe/tasks/web/vision/object_detector/object_detector.ts +++ b/mediapipe/tasks/web/vision/object_detector/object_detector.ts @@ -176,7 +176,7 @@ export class ObjectDetector extends VisionTaskRunner { } /** - * Performs object detection on the provided vidoe frame and waits + * Performs object detection on the provided video frame and waits * synchronously for the response. Only use this method when the * ObjectDetector is created with running mode `video`. 
* @@ -248,8 +248,9 @@ export class ObjectDetector extends VisionTaskRunner { graphConfig.addNode(detectorNode); this.graphRunner.attachProtoVectorListener( - DETECTIONS_STREAM, binaryProto => { + DETECTIONS_STREAM, (binaryProto, timestamp) => { this.addJsObjectDetections(binaryProto); + this.setLatestOutputTimestamp(timestamp); }); const binaryGraph = graphConfig.serializeBinary();