Open source Model Maker text tasks.

PiperOrigin-RevId: 487706929
2022-11-10 19:51:08 -08:00 · 2022-11-10 19:51:08 -08:00 · 3e05871f98
commit 3e05871f98
parent d2284083b3
19 changed files with 1882 additions and 0 deletions
--- a/mediapipe/model_maker/python/text/core/BUILD
+++ b/mediapipe/model_maker/python/text/core/BUILD
@ -0,0 +1,35 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Placeholder for internal Python strict library and test compatibility macro.
+
+package(
+    default_visibility = ["//mediapipe:__subpackages__"],
+)
+
+licenses(["notice"])
+
+py_library(
+    name = "bert_model_options",
+    srcs = ["bert_model_options.py"],
+)
+
+py_library(
+    name = "bert_model_spec",
+    srcs = ["bert_model_spec.py"],
+    deps = [
+        ":bert_model_options",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+    ],
+)
--- a/mediapipe/model_maker/python/text/core/init.py
+++ b/mediapipe/model_maker/python/text/core/init.py
@ -0,0 +1,13 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/mediapipe/model_maker/python/text/core/bert_model_options.py
+++ b/mediapipe/model_maker/python/text/core/bert_model_options.py
@ -0,0 +1,33 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Configurable model options for a BERT model."""
+
+import dataclasses
+
+
+@dataclasses.dataclass
+class BertModelOptions:
+  """Configurable model options for a BERT model.
+
+  See https://arxiv.org/abs/1810.04805 (BERT: Pre-training of Deep Bidirectional
+  Transformers for Language Understanding) for more details.
+
+    Attributes:
+      seq_len: Length of the sequence to feed into the model.
+      do_fine_tuning: If true, then the BERT model is not frozen for training.
+      dropout_rate: The rate for dropout.
+  """
+  seq_len: int = 128
+  do_fine_tuning: bool = True
+  dropout_rate: float = 0.1
--- a/mediapipe/model_maker/python/text/core/bert_model_spec.py
+++ b/mediapipe/model_maker/python/text/core/bert_model_spec.py
@ -0,0 +1,58 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specification for a BERT model."""
+
+import dataclasses
+from typing import Dict
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.text.core import bert_model_options
+
+_DEFAULT_TFLITE_INPUT_NAME = {
+    'ids': 'serving_default_input_word_ids:0',
+    'mask': 'serving_default_input_mask:0',
+    'segment_ids': 'serving_default_input_type_ids:0'
+}
+
+
+@dataclasses.dataclass
+class BertModelSpec:
+  """Specification for a BERT model.
+
+  See https://arxiv.org/abs/1810.04805 (BERT: Pre-training of Deep Bidirectional
+  Transformers for Language Understanding) for more details.
+
+    Attributes:
+      hparams: Hyperparameters used for training.
+      model_options: Configurable options for a BERT model.
+      do_lower_case: boolean, whether to lower case the input text. Should be
+        True / False for uncased / cased models respectively, where the models
+        are specified by the `uri`.
+      tflite_input_name: Dict, input names for the TFLite model.
+      uri: URI for the BERT module.
+      name: The name of the object.
+  """
+
+  hparams: hp.BaseHParams = hp.BaseHParams(
+      epochs=3,
+      batch_size=32,
+      learning_rate=3e-5,
+      distribution_strategy='mirrored')
+  model_options: bert_model_options.BertModelOptions = (
+      bert_model_options.BertModelOptions())
+  do_lower_case: bool = True
+  tflite_input_name: Dict[str, str] = dataclasses.field(
+      default_factory=lambda: _DEFAULT_TFLITE_INPUT_NAME)
+  uri: str = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
+  name: str = 'Bert'
--- a/mediapipe/model_maker/python/text/text_classifier/BUILD
+++ b/mediapipe/model_maker/python/text/text_classifier/BUILD
@ -0,0 +1,146 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Placeholder for internal Python strict library and test compatibility macro.
+# Placeholder for internal Python strict test compatibility macro.
+
+package(
+    default_visibility = ["//mediapipe:__subpackages__"],
+)
+
+licenses(["notice"])
+
+py_library(
+    name = "model_options",
+    srcs = ["model_options.py"],
+    deps = ["//mediapipe/model_maker/python/text/core:bert_model_options"],
+)
+
+py_library(
+    name = "model_spec",
+    srcs = ["model_spec.py"],
+    deps = [
+        ":model_options",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+        "//mediapipe/model_maker/python/text/core:bert_model_spec",
+    ],
+)
+
+py_test(
+    name = "model_spec_test",
+    srcs = ["model_spec_test.py"],
+    deps = [
+        ":model_options",
+        ":model_spec",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+    ],
+)
+
+py_library(
+    name = "dataset",
+    srcs = ["dataset.py"],
+    deps = ["//mediapipe/model_maker/python/core/data:classification_dataset"],
+)
+
+py_test(
+    name = "dataset_test",
+    srcs = ["dataset_test.py"],
+    deps = [":dataset"],
+)
+
+py_library(
+    name = "preprocessor",
+    srcs = ["preprocessor.py"],
+    deps = [":dataset"],
+)
+
+py_test(
+    name = "preprocessor_test",
+    srcs = ["preprocessor_test.py"],
+    tags = ["requires-net:external"],
+    deps = [
+        ":dataset",
+        ":model_spec",
+        ":preprocessor",
+    ],
+)
+
+py_library(
+    name = "text_classifier_options",
+    srcs = ["text_classifier_options.py"],
+    deps = [
+        ":model_options",
+        ":model_spec",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+    ],
+)
+
+py_library(
+    name = "text_classifier",
+    srcs = ["text_classifier.py"],
+    deps = [
+        ":dataset",
+        ":model_options",
+        ":model_spec",
+        ":preprocessor",
+        ":text_classifier_options",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+        "//mediapipe/model_maker/python/core/data:dataset",
+        "//mediapipe/model_maker/python/core/tasks:classifier",
+        "//mediapipe/model_maker/python/core/utils:model_util",
+        "//mediapipe/model_maker/python/core/utils:quantization",
+        "//mediapipe/tasks/python/metadata/metadata_writers:metadata_writer",
+        "//mediapipe/tasks/python/metadata/metadata_writers:text_classifier",
+    ],
+)
+
+py_test(
+    name = "text_classifier_test",
+    size = "large",
+    srcs = ["text_classifier_test.py"],
+    data = [
+        "//mediapipe/model_maker/python/text/text_classifier/testdata",
+    ],
+    tags = ["requires-net:external"],
+    deps = [
+        ":dataset",
+        ":model_options",
+        ":model_spec",
+        ":text_classifier",
+        ":text_classifier_options",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+        "//mediapipe/tasks/python/test:test_utils",
+    ],
+)
+
+py_library(
+    name = "text_classifier_demo_lib",
+    srcs = ["text_classifier_demo.py"],
+    deps = [
+        ":dataset",
+        ":model_spec",
+        ":text_classifier",
+        ":text_classifier_options",
+        "//mediapipe/model_maker/python/core:hyperparameters",
+        "//mediapipe/model_maker/python/core/utils:quantization",
+    ],
+)
+
+py_binary(
+    name = "text_classifier_demo",
+    srcs = ["text_classifier_demo.py"],
+    deps = [
+        ":text_classifier_demo_lib",
+    ],
+)
--- a/mediapipe/model_maker/python/text/text_classifier/init.py
+++ b/mediapipe/model_maker/python/text/text_classifier/init.py
@ -0,0 +1,13 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/mediapipe/model_maker/python/text/text_classifier/dataset.py
+++ b/mediapipe/model_maker/python/text/text_classifier/dataset.py
@ -0,0 +1,88 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Text classifier dataset library."""
+
+import csv
+import dataclasses
+import random
+
+from typing import Optional, Sequence
+import tensorflow as tf
+
+from mediapipe.model_maker.python.core.data import classification_dataset
+
+
+@dataclasses.dataclass
+class CSVParameters:
+  """Parameters used when reading a CSV file.
+
+  Attributes:
+    text_column: Column name for the input text.
+    label_column: Column name for the labels.
+    fieldnames: Sequence of keys for the CSV columns. If None, the first row of
+      the CSV file is used as the keys.
+    delimiter: Character that separates fields.
+    quotechar: Character used to quote fields that contain special characters
+      like the `delimiter`.
+  """
+  text_column: str
+  label_column: str
+  fieldnames: Optional[Sequence[str]] = None
+  delimiter: str = ","
+  quotechar: str = '"'
+
+
+class Dataset(classification_dataset.ClassificationDataset):
+  """Dataset library for text classifier."""
+
+  @classmethod
+  def from_csv(cls,
+               filename: str,
+               csv_params: CSVParameters,
+               shuffle: bool = True) -> "Dataset":
+    """Loads text with labels from a CSV file.
+
+    Args:
+      filename: Name of the CSV file.
+      csv_params: Parameters used for reading the CSV file.
+      shuffle: If True, randomly shuffle the data.
+
+    Returns:
+      Dataset containing (text, label) pairs and other related info.
+    """
+    with tf.io.gfile.GFile(filename, "r") as f:
+      reader = csv.DictReader(
+          f,
+          fieldnames=csv_params.fieldnames,
+          delimiter=csv_params.delimiter,
+          quotechar=csv_params.quotechar)
+
+    lines = list(reader)
+    if shuffle:
+      random.shuffle(lines)
+
+    label_names = sorted(set([line[csv_params.label_column] for line in lines]))
+    index_by_label = {label: index for index, label in enumerate(label_names)}
+
+    texts = [line[csv_params.text_column] for line in lines]
+    text_ds = tf.data.Dataset.from_tensor_slices(tf.cast(texts, tf.string))
+    label_indices = [
+        index_by_label[line[csv_params.label_column]] for line in lines
+    ]
+    label_index_ds = tf.data.Dataset.from_tensor_slices(
+        tf.cast(label_indices, tf.int64))
+    text_label_ds = tf.data.Dataset.zip((text_ds, label_index_ds))
+
+    return Dataset(
+        dataset=text_label_ds, size=len(texts), label_names=label_names)
--- a/mediapipe/model_maker/python/text/text_classifier/dataset_test.py
+++ b/mediapipe/model_maker/python/text/text_classifier/dataset_test.py
@ -0,0 +1,75 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+import tensorflow as tf
+
+from mediapipe.model_maker.python.text.text_classifier import dataset
+
+
+class DatasetTest(tf.test.TestCase):
+
+  def _get_csv_file(self):
+    labels_and_text = (('neutral', 'indifferent'), ('pos', 'extremely great'),
+                       ('neg', 'totally awful'), ('pos', 'super good'),
+                       ('neg', 'really bad'))
+    csv_file = os.path.join(self.get_temp_dir(), 'data.csv')
+    if os.path.exists(csv_file):
+      return csv_file
+    fieldnames = ['text', 'label']
+    with open(csv_file, 'w') as f:
+      writer = csv.DictWriter(f, fieldnames=fieldnames)
+      writer.writeheader()
+      for label, text in labels_and_text:
+        writer.writerow({'text': text, 'label': label})
+    return csv_file
+
+  def test_from_csv(self):
+    csv_file = self._get_csv_file()
+    csv_params = dataset.CSVParameters(text_column='text', label_column='label')
+    data = dataset.Dataset.from_csv(filename=csv_file, csv_params=csv_params)
+    self.assertLen(data, 5)
+    self.assertEqual(data.num_classes, 3)
+    self.assertEqual(data.label_names, ['neg', 'neutral', 'pos'])
+    data_values = set([(text.numpy()[0], label.numpy()[0])
+                       for text, label in data.gen_tf_dataset()])
+    expected_data_values = set([(b'indifferent', 1), (b'extremely great', 2),
+                                (b'totally awful', 0), (b'super good', 2),
+                                (b'really bad', 0)])
+    self.assertEqual(data_values, expected_data_values)
+
+  def test_split(self):
+    ds = tf.data.Dataset.from_tensor_slices(['good', 'bad', 'neutral', 'odd'])
+    data = dataset.Dataset(ds, 4, ['pos', 'neg'])
+    train_data, test_data = data.split(0.5)
+    expected_train_data = [b'good', b'bad']
+    expected_test_data = [b'neutral', b'odd']
+
+    self.assertLen(train_data, 2)
+    train_data_values = [elem.numpy() for elem in train_data._dataset]
+    self.assertEqual(train_data_values, expected_train_data)
+    self.assertEqual(train_data.num_classes, 2)
+    self.assertEqual(train_data.label_names, ['pos', 'neg'])
+
+    self.assertLen(test_data, 2)
+    test_data_values = [elem.numpy() for elem in test_data._dataset]
+    self.assertEqual(test_data_values, expected_test_data)
+    self.assertEqual(test_data.num_classes, 2)
+    self.assertEqual(test_data.label_names, ['pos', 'neg'])
+
+
+if __name__ == '__main__':
+  tf.test.main()
--- a/mediapipe/model_maker/python/text/text_classifier/model_options.py
+++ b/mediapipe/model_maker/python/text/text_classifier/model_options.py
@ -0,0 +1,45 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Configurable model options for text classifier models."""
+
+import dataclasses
+from typing import Union
+
+from mediapipe.model_maker.python.text.core import bert_model_options
+
+# BERT text classifier options inherited from BertModelOptions.
+BertClassifierOptions = bert_model_options.BertModelOptions
+
+
+@dataclasses.dataclass
+class AverageWordEmbeddingClassifierOptions:
+  """Configurable model options for an Average Word Embedding classifier.
+
+  Attributes:
+    seq_len: Length of the sequence to feed into the model.
+    wordvec_dim: Dimension of the word embedding.
+    do_lower_case: Whether to convert all uppercase characters to lowercase
+      during preprocessing.
+    vocab_size: Number of words to generate the vocabulary from data.
+    dropout_rate: The rate for dropout.
+  """
+  seq_len: int = 256
+  wordvec_dim: int = 16
+  do_lower_case: bool = True
+  vocab_size: int = 10000
+  dropout_rate: float = 0.2
+
+
+TextClassifierModelOptions = Union[AverageWordEmbeddingClassifierOptions,
+                                   BertClassifierOptions]
--- a/mediapipe/model_maker/python/text/text_classifier/model_spec.py
+++ b/mediapipe/model_maker/python/text/text_classifier/model_spec.py
@ -0,0 +1,70 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Specifications for text classifier models."""
+
+import dataclasses
+import enum
+import functools
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.text.core import bert_model_spec
+from mediapipe.model_maker.python.text.text_classifier import model_options as mo
+
+# BERT-based text classifier spec inherited from BertModelSpec
+BertClassifierSpec = bert_model_spec.BertModelSpec
+
+
+@dataclasses.dataclass
+class AverageWordEmbeddingClassifierSpec:
+  """Specification for an average word embedding classifier model.
+
+  Attributes:
+    hparams: Configurable hyperparameters for training.
+    model_options: Configurable options for the average word embedding model.
+    name: The name of the object.
+  """
+
+  # `learning_rate` is unused for the average word embedding model
+  hparams: hp.BaseHParams = hp.BaseHParams(
+      epochs=10, batch_size=32, learning_rate=0)
+  model_options: mo.AverageWordEmbeddingClassifierOptions = (
+      mo.AverageWordEmbeddingClassifierOptions())
+  name: str = 'AverageWordEmbedding'
+
+
+average_word_embedding_classifier_spec = functools.partial(
+    AverageWordEmbeddingClassifierSpec)
+
+mobilebert_classifier_spec = functools.partial(
+    BertClassifierSpec,
+    hparams=hp.BaseHParams(
+        epochs=3,
+        batch_size=48,
+        learning_rate=3e-5,
+        distribution_strategy='off'),
+    name='MobileBert',
+    uri='https://tfhub.dev/tensorflow/mobilebert_en_uncased_L-24_H-128_B-512_A-4_F-4_OPT/1',
+    tflite_input_name={
+        'ids': 'serving_default_input_1:0',
+        'mask': 'serving_default_input_3:0',
+        'segment_ids': 'serving_default_input_2:0'
+    },
+)
+
+
+@enum.unique
+class SupportedModels(enum.Enum):
+  """Predefined text classifier model specs supported by Model Maker."""
+  AVERAGE_WORD_EMBEDDING_CLASSIFIER = average_word_embedding_classifier_spec
+  MOBILEBERT_CLASSIFIER = mobilebert_classifier_spec
--- a/mediapipe/model_maker/python/text/text_classifier/model_spec_test.py
+++ b/mediapipe/model_maker/python/text/text_classifier/model_spec_test.py
@ -0,0 +1,118 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for model_spec."""
+
+import os
+
+import tensorflow as tf
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.text.text_classifier import model_options as classifier_model_options
+from mediapipe.model_maker.python.text.text_classifier import model_spec as ms
+
+
+class ModelSpecTest(tf.test.TestCase):
+
+  def test_predefined_bert_spec(self):
+    model_spec_obj = ms.SupportedModels.MOBILEBERT_CLASSIFIER.value()
+    self.assertIsInstance(model_spec_obj, ms.BertClassifierSpec)
+    self.assertEqual(model_spec_obj.name, 'MobileBert')
+    self.assertEqual(
+        model_spec_obj.uri, 'https://tfhub.dev/tensorflow/'
+        'mobilebert_en_uncased_L-24_H-128_B-512_A-4_F-4_OPT/1')
+    self.assertTrue(model_spec_obj.do_lower_case)
+    self.assertEqual(
+        model_spec_obj.tflite_input_name, {
+            'ids': 'serving_default_input_1:0',
+            'mask': 'serving_default_input_3:0',
+            'segment_ids': 'serving_default_input_2:0'
+        })
+    self.assertEqual(
+        model_spec_obj.model_options,
+        classifier_model_options.BertClassifierOptions(
+            seq_len=128, do_fine_tuning=True, dropout_rate=0.1))
+    self.assertEqual(
+        model_spec_obj.hparams,
+        hp.BaseHParams(
+            epochs=3,
+            batch_size=48,
+            learning_rate=3e-5,
+            distribution_strategy='off'))
+
+  def test_predefined_average_word_embedding_spec(self):
+    model_spec_obj = (
+        ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER.value())
+    self.assertIsInstance(model_spec_obj, ms.AverageWordEmbeddingClassifierSpec)
+    self.assertEqual(model_spec_obj.name, 'AverageWordEmbedding')
+    self.assertEqual(
+        model_spec_obj.model_options,
+        classifier_model_options.AverageWordEmbeddingClassifierOptions(
+            seq_len=256,
+            wordvec_dim=16,
+            do_lower_case=True,
+            vocab_size=10000,
+            dropout_rate=0.2))
+    self.assertEqual(
+        model_spec_obj.hparams,
+        hp.BaseHParams(
+            epochs=10,
+            batch_size=32,
+            learning_rate=0,
+            steps_per_epoch=None,
+            shuffle=False,
+            distribution_strategy='off',
+            num_gpus=-1,
+            tpu=''))
+
+  def test_custom_bert_spec(self):
+    custom_bert_classifier_options = (
+        classifier_model_options.BertClassifierOptions(
+            seq_len=512, do_fine_tuning=False, dropout_rate=0.3))
+    model_spec_obj = (
+        ms.SupportedModels.MOBILEBERT_CLASSIFIER.value(
+            model_options=custom_bert_classifier_options))
+    self.assertEqual(model_spec_obj.model_options,
+                     custom_bert_classifier_options)
+
+  def test_custom_average_word_embedding_spec(self):
+    custom_hparams = hp.BaseHParams(
+        learning_rate=0.4,
+        batch_size=64,
+        epochs=10,
+        steps_per_epoch=10,
+        shuffle=True,
+        export_dir='foo/bar',
+        distribution_strategy='mirrored',
+        num_gpus=3,
+        tpu='tpu/address')
+    custom_average_word_embedding_model_options = (
+        classifier_model_options.AverageWordEmbeddingClassifierOptions(
+            seq_len=512,
+            wordvec_dim=32,
+            do_lower_case=False,
+            vocab_size=5000,
+            dropout_rate=0.5))
+    model_spec_obj = (
+        ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER.value(
+            model_options=custom_average_word_embedding_model_options,
+            hparams=custom_hparams))
+    self.assertEqual(model_spec_obj.model_options,
+                     custom_average_word_embedding_model_options)
+    self.assertEqual(model_spec_obj.hparams, custom_hparams)
+
+
+if __name__ == '__main__':
+  # Load compressed models from tensorflow_hub
+  os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
+  tf.test.main()
--- a/mediapipe/model_maker/python/text/text_classifier/preprocessor.py
+++ b/mediapipe/model_maker/python/text/text_classifier/preprocessor.py
@ -0,0 +1,285 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Preprocessors for text classification."""
+
+import collections
+import os
+import re
+import tempfile
+from typing import Mapping, Sequence, Tuple, Union
+
+import tensorflow as tf
+import tensorflow_hub
+
+from mediapipe.model_maker.python.text.text_classifier import dataset as text_classifier_ds
+from official.nlp.data import classifier_data_lib
+from official.nlp.tools import tokenization
+
+
+def _validate_text_and_label(text: tf.Tensor, label: tf.Tensor) -> None:
+  """Validates the shape and type of `text` and `label`.
+
+  Args:
+    text: Stores text data. Should have shape [1] and dtype tf.string.
+    label: Stores the label for the corresponding `text`. Should have shape [1]
+      and dtype tf.int64.
+
+  Raises:
+    ValueError: If either tensor has the wrong shape or type.
+  """
+  if text.shape != [1]:
+    raise ValueError(f"`text` should have shape [1], got {text.shape}")
+  if text.dtype != tf.string:
+    raise ValueError(f"Expected dtype string for `text`, got {text.dtype}")
+  if label.shape != [1]:
+    raise ValueError(f"`label` should have shape [1], got {text.shape}")
+  if label.dtype != tf.int64:
+    raise ValueError(f"Expected dtype int64 for `label`, got {label.dtype}")
+
+
+def _decode_record(
+    record: tf.Tensor, name_to_features: Mapping[str, tf.io.FixedLenFeature]
+) -> Tuple[Mapping[str, tf.Tensor], tf.Tensor]:
+  """Decodes a record into input for a BERT model.
+
+  Args:
+    record: Stores serialized example.
+    name_to_features: Maps record keys to feature types.
+
+  Returns:
+    BERT model input features and label for the record.
+  """
+  example = tf.io.parse_single_example(record, name_to_features)
+
+  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+  for name in list(example.keys()):
+    example[name] = tf.cast(example[name], tf.int32)
+
+  bert_features = {
+      "input_word_ids": example["input_ids"],
+      "input_mask": example["input_mask"],
+      "input_type_ids": example["segment_ids"]
+  }
+  return bert_features, example["label_ids"]
+
+
+def _single_file_dataset(
+    input_file: str, name_to_features: Mapping[str, tf.io.FixedLenFeature]
+) -> tf.data.TFRecordDataset:
+  """Creates a single-file dataset to be passed for BERT custom training.
+
+  Args:
+    input_file: Filepath for the dataset.
+    name_to_features: Maps record keys to feature types.
+
+  Returns:
+    Dataset containing BERT model input features and labels.
+  """
+  d = tf.data.TFRecordDataset(input_file)
+  d = d.map(
+      lambda record: _decode_record(record, name_to_features),
+      num_parallel_calls=tf.data.AUTOTUNE)
+  return d
+
+
+class AverageWordEmbeddingClassifierPreprocessor:
+  """Preprocessor for an Average Word Embedding model.
+
+  Takes (text, label) data and applies regex tokenization and padding to the
+  text to generate (token IDs, label) data.
+
+  Attributes:
+    seq_len: Length of the input sequence to the model.
+    do_lower_case: Whether text inputs should be converted to lower-case.
+    vocab: Vocabulary of tokens used by the model.
+  """
+
+  PAD: str = "<PAD>"  # Index: 0
+  START: str = "<START>"  # Index: 1
+  UNKNOWN: str = "<UNKNOWN>"  # Index: 2
+
+  def __init__(self, seq_len: int, do_lower_case: bool, texts: Sequence[str],
+               vocab_size: int):
+    self._seq_len = seq_len
+    self._do_lower_case = do_lower_case
+    self._vocab = self._gen_vocab(texts, vocab_size)
+
+  def _gen_vocab(self, texts: Sequence[str],
+                 vocab_size: int) -> Mapping[str, int]:
+    """Generates vocabulary list in `texts` with size `vocab_size`.
+
+    Args:
+      texts: All texts (across training and validation data) that will be
+        preprocessed by the model.
+      vocab_size: Size of the vocab.
+
+    Returns:
+      The vocab mapping tokens to IDs.
+    """
+    vocab_counter = collections.Counter()
+
+    for text in texts:
+      tokens = self._regex_tokenize(text)
+      for token in tokens:
+        vocab_counter[token] += 1
+
+    vocab_freq = vocab_counter.most_common(vocab_size)
+    vocab_list = [self.PAD, self.START, self.UNKNOWN
+                 ] + [word for word, _ in vocab_freq]
+    return collections.OrderedDict(((v, i) for i, v in enumerate(vocab_list)))
+
+  def get_vocab(self) -> Mapping[str, int]:
+    """Returns the vocab of the AverageWordEmbeddingClassifierPreprocessor."""
+    return self._vocab
+
+  # TODO: Align with MediaPipe's RegexTokenizer.
+  def _regex_tokenize(self, text: str) -> Sequence[str]:
+    """Splits `text` by words but does not split on single quotes.
+
+    Args:
+      text: Text to be tokenized.
+
+    Returns:
+      List of tokens.
+    """
+    text = tf.compat.as_text(text)
+    if self._do_lower_case:
+      text = text.lower()
+    tokens = re.compile(r"[^\w\']+").split(text.strip())
+    # Filters out any empty strings in `tokens`.
+    return list(filter(None, tokens))
+
+  def _tokenize_and_pad(self, text: str) -> Sequence[int]:
+    """Tokenizes `text` and pads the tokens to `seq_len`.
+
+    Args:
+      text: Text to be tokenized and padded.
+
+    Returns:
+      List of token IDs padded to have length `seq_len`.
+    """
+    tokens = self._regex_tokenize(text)
+
+    # Gets ids for START, PAD and UNKNOWN tokens.
+    start_id = self._vocab[self.START]
+    pad_id = self._vocab[self.PAD]
+    unknown_id = self._vocab[self.UNKNOWN]
+
+    token_ids = [self._vocab.get(token, unknown_id) for token in tokens]
+    token_ids = [start_id] + token_ids
+
+    if len(token_ids) < self._seq_len:
+      pad_length = self._seq_len - len(token_ids)
+      token_ids = token_ids + pad_length * [pad_id]
+    else:
+      token_ids = token_ids[:self._seq_len]
+    return token_ids
+
+  def preprocess(
+      self, dataset: text_classifier_ds.Dataset) -> text_classifier_ds.Dataset:
+    """Preprocesses data into input for an Average Word Embedding model.
+
+    Args:
+      dataset: Stores (text, label) data.
+
+    Returns:
+      Dataset containing (token IDs, label) data.
+    """
+    token_ids_list = []
+    labels_list = []
+    for text, label in dataset.gen_tf_dataset():
+      _validate_text_and_label(text, label)
+      token_ids = self._tokenize_and_pad(text.numpy()[0].decode("utf-8"))
+      token_ids_list.append(token_ids)
+      labels_list.append(label.numpy()[0])
+
+    token_ids_ds = tf.data.Dataset.from_tensor_slices(token_ids_list)
+    labels_ds = tf.data.Dataset.from_tensor_slices(labels_list)
+    preprocessed_ds = tf.data.Dataset.zip((token_ids_ds, labels_ds))
+    return text_classifier_ds.Dataset(
+        dataset=preprocessed_ds,
+        size=dataset.size,
+        label_names=dataset.label_names)
+
+
+class BertClassifierPreprocessor:
+  """Preprocessor for a BERT-based classifier.
+
+  Attributes:
+    seq_len: Length of the input sequence to the model.
+    vocab_file: File containing the BERT vocab.
+    tokenizer: BERT tokenizer.
+  """
+
+  def __init__(self, seq_len: int, do_lower_case: bool, uri: str):
+    self._seq_len = seq_len
+    # Vocab filepath is tied to the BERT module's URI.
+    self._vocab_file = os.path.join(
+        tensorflow_hub.resolve(uri), "assets", "vocab.txt")
+    self._tokenizer = tokenization.FullTokenizer(self._vocab_file,
+                                                 do_lower_case)
+
+  def _get_name_to_features(self):
+    """Gets the dictionary mapping record keys to feature types."""
+    return {
+        "input_ids": tf.io.FixedLenFeature([self._seq_len], tf.int64),
+        "input_mask": tf.io.FixedLenFeature([self._seq_len], tf.int64),
+        "segment_ids": tf.io.FixedLenFeature([self._seq_len], tf.int64),
+        "label_ids": tf.io.FixedLenFeature([], tf.int64),
+    }
+
+  def get_vocab_file(self) -> str:
+    """Returns the vocab file of the BertClassifierPreprocessor."""
+    return self._vocab_file
+
+  def preprocess(
+      self, dataset: text_classifier_ds.Dataset) -> text_classifier_ds.Dataset:
+    """Preprocesses data into input for a BERT-based classifier.
+
+    Args:
+      dataset: Stores (text, label) data.
+
+    Returns:
+      Dataset containing (bert_features, label) data.
+    """
+    examples = []
+    for index, (text, label) in enumerate(dataset.gen_tf_dataset()):
+      _validate_text_and_label(text, label)
+      examples.append(
+          classifier_data_lib.InputExample(
+              guid=str(index),
+              text_a=text.numpy()[0].decode("utf-8"),
+              text_b=None,
+              # InputExample expects the label name rather than the int ID
+              label=dataset.label_names[label.numpy()[0]]))
+
+    tfrecord_file = os.path.join(tempfile.mkdtemp(), "bert_features.tfrecord")
+    classifier_data_lib.file_based_convert_examples_to_features(
+        examples=examples,
+        label_list=dataset.label_names,
+        max_seq_length=self._seq_len,
+        tokenizer=self._tokenizer,
+        output_file=tfrecord_file)
+    preprocessed_ds = _single_file_dataset(tfrecord_file,
+                                           self._get_name_to_features())
+    return text_classifier_ds.Dataset(
+        dataset=preprocessed_ds,
+        size=dataset.size,
+        label_names=dataset.label_names)
+
+
+TextClassifierPreprocessor = (
+    Union[BertClassifierPreprocessor,
+          AverageWordEmbeddingClassifierPreprocessor])
--- a/mediapipe/model_maker/python/text/text_classifier/preprocessor_test.py
+++ b/mediapipe/model_maker/python/text/text_classifier/preprocessor_test.py
@ -0,0 +1,96 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import os
+
+import numpy as np
+import numpy.testing as npt
+import tensorflow as tf
+
+from mediapipe.model_maker.python.text.text_classifier import dataset as text_classifier_ds
+from mediapipe.model_maker.python.text.text_classifier import model_spec
+from mediapipe.model_maker.python.text.text_classifier import preprocessor
+
+
+class PreprocessorTest(tf.test.TestCase):
+  CSV_PARAMS_ = text_classifier_ds.CSVParameters(
+      text_column='text', label_column='label')
+
+  def _get_csv_file(self):
+    labels_and_text = (('pos', 'super super super super good'),
+                       (('neg', 'really bad')))
+    csv_file = os.path.join(self.get_temp_dir(), 'data.csv')
+    if os.path.exists(csv_file):
+      return csv_file
+    fieldnames = ['text', 'label']
+    with open(csv_file, 'w') as f:
+      writer = csv.DictWriter(f, fieldnames=fieldnames)
+      writer.writeheader()
+      for label, text in labels_and_text:
+        writer.writerow({'text': text, 'label': label})
+    return csv_file
+
+  def test_average_word_embedding_preprocessor(self):
+    csv_file = self._get_csv_file()
+    dataset = text_classifier_ds.Dataset.from_csv(
+        filename=csv_file, csv_params=self.CSV_PARAMS_)
+    average_word_embedding_preprocessor = (
+        preprocessor.AverageWordEmbeddingClassifierPreprocessor(
+            seq_len=5,
+            do_lower_case=True,
+            texts=['super super super super good', 'really bad'],
+            vocab_size=7))
+    preprocessed_dataset = (
+        average_word_embedding_preprocessor.preprocess(dataset))
+    labels = []
+    features_list = []
+    for features, label in preprocessed_dataset.gen_tf_dataset():
+      self.assertEqual(label.shape, [1])
+      labels.append(label.numpy()[0])
+      self.assertEqual(features.shape, [1, 5])
+      features_list.append(features.numpy()[0])
+    self.assertEqual(labels, [1, 0])
+    npt.assert_array_equal(
+        np.stack(features_list), np.array([[1, 3, 3, 3, 3], [1, 5, 6, 0, 0]]))
+
+  def test_bert_preprocessor(self):
+    csv_file = self._get_csv_file()
+    dataset = text_classifier_ds.Dataset.from_csv(
+        filename=csv_file, csv_params=self.CSV_PARAMS_)
+    bert_spec = model_spec.SupportedModels.MOBILEBERT_CLASSIFIER.value()
+    bert_preprocessor = preprocessor.BertClassifierPreprocessor(
+        seq_len=5, do_lower_case=bert_spec.do_lower_case, uri=bert_spec.uri)
+    preprocessed_dataset = bert_preprocessor.preprocess(dataset)
+    labels = []
+    input_masks = []
+    for features, label in preprocessed_dataset.gen_tf_dataset():
+      self.assertEqual(label.shape, [1])
+      labels.append(label.numpy()[0])
+      self.assertSameElements(
+          features.keys(), ['input_word_ids', 'input_mask', 'input_type_ids'])
+      for feature in features.values():
+        self.assertEqual(feature.shape, [1, 5])
+      input_masks.append(features['input_mask'].numpy()[0])
+      npt.assert_array_equal(features['input_type_ids'].numpy()[0],
+                             [0, 0, 0, 0, 0])
+    npt.assert_array_equal(
+        np.stack(input_masks), np.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]))
+    self.assertEqual(labels, [1, 0])
+
+
+if __name__ == '__main__':
+  # Load compressed models from tensorflow_hub
+  os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
+  tf.test.main()
--- a/mediapipe/model_maker/python/text/text_classifier/testdata/BUILD
+++ b/mediapipe/model_maker/python/text/text_classifier/testdata/BUILD
@ -0,0 +1,23 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(
+    default_visibility = ["//mediapipe/model_maker/python/text/text_classifier:__subpackages__"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+filegroup(
+    name = "testdata",
+    srcs = ["average_word_embedding_metadata.json"],
+)
--- a/mediapipe/model_maker/python/text/text_classifier/testdata/average_word_embedding_metadata.json
+++ b/mediapipe/model_maker/python/text/text_classifier/testdata/average_word_embedding_metadata.json
@ -0,0 +1,63 @@
+{
+  "name": "TextClassifier",
+  "description": "Classify the input text into a set of known categories.",
+  "subgraph_metadata": [
+    {
+      "input_tensor_metadata": [
+        {
+          "name": "input_text",
+          "description": "Embedding vectors representing the input text to be processed.",
+          "content": {
+            "content_properties_type": "FeatureProperties",
+            "content_properties": {
+            }
+          },
+          "process_units": [
+            {
+              "options_type": "RegexTokenizerOptions",
+              "options": {
+                "delim_regex_pattern": "[^\\w\\']+",
+                "vocab_file": [
+                  {
+                    "name": "vocab.txt",
+                    "description": "Vocabulary file to convert natural language words to embedding vectors.",
+                    "type": "VOCABULARY"
+                  }
+                ]
+              }
+            }
+          ],
+          "stats": {
+          }
+        }
+      ],
+      "output_tensor_metadata": [
+        {
+          "name": "score",
+          "description": "Score of the labels respectively.",
+          "content": {
+            "content_properties_type": "FeatureProperties",
+            "content_properties": {
+            }
+          },
+          "stats": {
+            "max": [
+              1.0
+            ],
+            "min": [
+              0.0
+            ]
+          },
+          "associated_files": [
+            {
+              "name": "labels.txt",
+              "description": "Labels for categories that the model can recognize.",
+              "type": "TENSOR_AXIS_LABELS"
+            }
+          ]
+        }
+      ]
+    }
+  ],
+  "min_parser_version": "1.2.1"
+}
--- a/mediapipe/model_maker/python/text/text_classifier/text_classifier.py
+++ b/mediapipe/model_maker/python/text/text_classifier/text_classifier.py
@ -0,0 +1,437 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""API for text classification."""
+
+import abc
+import os
+import tempfile
+from typing import Any, Optional, Sequence, Tuple
+
+import tensorflow as tf
+import tensorflow_hub as hub
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.core.data import dataset as ds
+from mediapipe.model_maker.python.core.tasks import classifier
+from mediapipe.model_maker.python.core.utils import model_util
+from mediapipe.model_maker.python.core.utils import quantization
+from mediapipe.model_maker.python.text.text_classifier import dataset as text_ds
+from mediapipe.model_maker.python.text.text_classifier import model_options as mo
+from mediapipe.model_maker.python.text.text_classifier import model_spec as ms
+from mediapipe.model_maker.python.text.text_classifier import preprocessor
+from mediapipe.model_maker.python.text.text_classifier import text_classifier_options
+from mediapipe.tasks.python.metadata.metadata_writers import metadata_writer
+from mediapipe.tasks.python.metadata.metadata_writers import text_classifier as text_classifier_writer
+from official.nlp import optimization
+
+
+def _validate(options: text_classifier_options.TextClassifierOptions):
+  """Validates that `model_options` and `supported_model` are compatible.
+
+  Args:
+    options: Options for creating and training a text classifier.
+
+  Raises:
+    ValueError if there is a mismatch between `model_options` and
+    `supported_model`.
+  """
+
+  if options.model_options is None:
+    return
+
+  if (isinstance(options.model_options,
+                 mo.AverageWordEmbeddingClassifierOptions) and
+      (options.supported_model !=
+       ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER)):
+    raise ValueError("Expected AVERAGE_WORD_EMBEDDING_CLASSIFIER,"
+                     f" got {options.supported_model}")
+  if (isinstance(options.model_options, mo.BertClassifierOptions) and
+      (options.supported_model != ms.SupportedModels.MOBILEBERT_CLASSIFIER)):
+    raise ValueError(
+        f"Expected MOBILEBERT_CLASSIFIER, got {options.supported_model}")
+
+
+class TextClassifier(classifier.Classifier):
+  """API for creating and training a text classification model."""
+
+  def __init__(self, model_spec: Any, hparams: hp.BaseHParams,
+               label_names: Sequence[str]):
+    super().__init__(
+        model_spec=model_spec, label_names=label_names, shuffle=hparams.shuffle)
+    self._model_spec = model_spec
+    self._hparams = hparams
+    self._callbacks = model_util.get_default_callbacks(self._hparams.export_dir)
+    self._text_preprocessor: preprocessor.TextClassifierPreprocessor = None
+
+  @classmethod
+  def create(
+      cls, train_data: text_ds.Dataset, validation_data: text_ds.Dataset,
+      options: text_classifier_options.TextClassifierOptions
+  ) -> "TextClassifier":
+    """Factory function that creates and trains a text classifier.
+
+    Note that `train_data` and `validation_data` are expected to share the same
+    `label_names` since they should be split from the same dataset.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+      options: Options for creating and training the text classifier.
+
+    Returns:
+      A text classifier.
+
+    Raises:
+      ValueError if `train_data` and `validation_data` do not have the
+      same label_names or `options` contains an unknown `supported_model`
+    """
+    if train_data.label_names != validation_data.label_names:
+      raise ValueError(
+          f"Training data label names {train_data.label_names} not equal to "
+          f"validation data label names {validation_data.label_names}")
+
+    _validate(options)
+    if options.model_options is None:
+      options.model_options = options.supported_model.value().model_options
+
+    if options.hparams is None:
+      options.hparams = options.supported_model.value().hparams
+
+    if options.supported_model == ms.SupportedModels.MOBILEBERT_CLASSIFIER:
+      text_classifier = (
+          _BertClassifier.create_bert_classifier(train_data, validation_data,
+                                                 options,
+                                                 train_data.label_names))
+    elif (options.supported_model ==
+          ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER):
+      text_classifier = (
+          _AverageWordEmbeddingClassifier
+          .create_average_word_embedding_classifier(train_data, validation_data,
+                                                    options,
+                                                    train_data.label_names))
+    else:
+      raise ValueError(f"Unknown model {options.supported_model}")
+
+    return text_classifier
+
+  def evaluate(self, data: ds.Dataset, batch_size: int = 32) -> Any:
+    """Overrides Classifier.evaluate().
+
+    Args:
+      data: Evaluation dataset. Must be a TextClassifier Dataset.
+      batch_size: Number of samples per evaluation step.
+
+    Returns:
+      The loss value and accuracy.
+
+    Raises:
+      ValueError if `data` is not a TextClassifier Dataset.
+    """
+    # This override is needed because TextClassifier preprocesses its data
+    # outside of the `gen_tf_dataset()` method. The preprocess call also
+    # requires a TextClassifier Dataset instead of a core Dataset.
+    if not isinstance(data, text_ds.Dataset):
+      raise ValueError("Need a TextClassifier Dataset.")
+
+    processed_data = self._text_preprocessor.preprocess(data)
+    dataset = processed_data.gen_tf_dataset(batch_size, is_training=False)
+    return self._model.evaluate(dataset)
+
+  def export_model(
+      self,
+      model_name: str = "model.tflite",
+      quantization_config: Optional[quantization.QuantizationConfig] = None):
+    """Converts and saves the model to a TFLite file with metadata included.
+
+    Note that only the TFLite file is needed for deployment. This function also
+    saves a metadata.json file to the same directory as the TFLite file which
+    can be used to interpret the metadata content in the TFLite file.
+
+    Args:
+      model_name: File name to save TFLite model with metadata. The full export
+        path is {self._hparams.export_dir}/{model_name}.
+      quantization_config: The configuration for model quantization.
+    """
+    if not tf.io.gfile.exists(self._hparams.export_dir):
+      tf.io.gfile.makedirs(self._hparams.export_dir)
+    tflite_file = os.path.join(self._hparams.export_dir, model_name)
+    metadata_file = os.path.join(self._hparams.export_dir, "metadata.json")
+
+    tflite_model = model_util.convert_to_tflite(
+        model=self._model, quantization_config=quantization_config)
+    vocab_filepath = os.path.join(tempfile.mkdtemp(), "vocab.txt")
+    self._save_vocab(vocab_filepath)
+
+    writer = self._get_metadata_writer(tflite_model, vocab_filepath)
+    tflite_model_with_metadata, metadata_json = writer.populate()
+    model_util.save_tflite(tflite_model_with_metadata, tflite_file)
+    with open(metadata_file, "w") as f:
+      f.write(metadata_json)
+
+  @abc.abstractmethod
+  def _save_vocab(self, vocab_filepath: str):
+    """Saves the preprocessor's vocab to `vocab_filepath`."""
+
+  @abc.abstractmethod
+  def _get_metadata_writer(self, tflite_model: bytearray, vocab_filepath: str):
+    """Gets the metadata writer for the text classifier TFLite model."""
+
+
+class _AverageWordEmbeddingClassifier(TextClassifier):
+  """APIs to help create and train an Average Word Embedding text classifier."""
+
+  _DELIM_REGEX_PATTERN = r"[^\w\']+"
+
+  def __init__(self, model_spec: ms.AverageWordEmbeddingClassifierSpec,
+               model_options: mo.AverageWordEmbeddingClassifierOptions,
+               hparams: hp.BaseHParams, label_names: Sequence[str]):
+    super().__init__(model_spec, hparams, label_names)
+    self._model_options = model_options
+    self._loss_function = "sparse_categorical_crossentropy"
+    self._metric_function = "accuracy"
+    self._text_preprocessor: (
+        preprocessor.AverageWordEmbeddingClassifierPreprocessor) = None
+
+  @classmethod
+  def create_average_word_embedding_classifier(
+      cls, train_data: text_ds.Dataset, validation_data: text_ds.Dataset,
+      options: text_classifier_options.TextClassifierOptions,
+      label_names: Sequence[str]) -> "_AverageWordEmbeddingClassifier":
+    """Creates, trains, and returns an Average Word Embedding classifier.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+      options: Options for creating and training the text classifier.
+      label_names: Label names used in the data.
+
+    Returns:
+      An Average Word Embedding classifier.
+    """
+    average_word_embedding_classifier = _AverageWordEmbeddingClassifier(
+        model_spec=options.supported_model.value(),
+        model_options=options.model_options,
+        hparams=options.hparams,
+        label_names=train_data.label_names)
+    average_word_embedding_classifier._create_and_train_model(
+        train_data, validation_data)
+    return average_word_embedding_classifier
+
+  def _create_and_train_model(self, train_data: text_ds.Dataset,
+                              validation_data: text_ds.Dataset):
+    """Creates the Average Word Embedding classifier keras model and trains it.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+    """
+    (processed_train_data, processed_validation_data) = (
+        self._load_and_run_preprocessor(train_data, validation_data))
+    self._create_model()
+    self._optimizer = "rmsprop"
+    self._train_model(processed_train_data, processed_validation_data)
+
+  def _load_and_run_preprocessor(
+      self, train_data: text_ds.Dataset, validation_data: text_ds.Dataset
+  ) -> Tuple[text_ds.Dataset, text_ds.Dataset]:
+    """Runs an AverageWordEmbeddingClassifierPreprocessor on the data.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+
+    Returns:
+      Preprocessed training data and preprocessed validation data.
+    """
+    train_texts = [text.numpy()[0] for text, _ in train_data.gen_tf_dataset()]
+    validation_texts = [
+        text.numpy()[0] for text, _ in validation_data.gen_tf_dataset()
+    ]
+    self._text_preprocessor = (
+        preprocessor.AverageWordEmbeddingClassifierPreprocessor(
+            seq_len=self._model_options.seq_len,
+            do_lower_case=self._model_options.do_lower_case,
+            texts=train_texts + validation_texts,
+            vocab_size=self._model_options.vocab_size))
+    return self._text_preprocessor.preprocess(
+        train_data), self._text_preprocessor.preprocess(validation_data)
+
+  def _create_model(self):
+    """Creates an Average Word Embedding model."""
+    self._model = tf.keras.Sequential([
+        tf.keras.layers.InputLayer(
+            input_shape=[self._model_options.seq_len], dtype=tf.int32),
+        tf.keras.layers.Embedding(
+            len(self._text_preprocessor.get_vocab()),
+            self._model_options.wordvec_dim,
+            input_length=self._model_options.seq_len),
+        tf.keras.layers.GlobalAveragePooling1D(),
+        tf.keras.layers.Dense(
+            self._model_options.wordvec_dim, activation=tf.nn.relu),
+        tf.keras.layers.Dropout(self._model_options.dropout_rate),
+        tf.keras.layers.Dense(self._num_classes, activation="softmax")
+    ])
+
+  def _save_vocab(self, vocab_filepath: str):
+    with tf.io.gfile.GFile(vocab_filepath, "w") as f:
+      for token, index in self._text_preprocessor.get_vocab().items():
+        f.write(f"{token} {index}\n")
+
+  def _get_metadata_writer(self, tflite_model: bytearray, vocab_filepath: str):
+    return text_classifier_writer.MetadataWriter.create_for_regex_model(
+        model_buffer=tflite_model,
+        regex_tokenizer=metadata_writer.RegexTokenizer(
+            # TODO: Align with MediaPipe's RegexTokenizer.
+            delim_regex_pattern=self._DELIM_REGEX_PATTERN,
+            vocab_file_path=vocab_filepath),
+        labels=metadata_writer.Labels().add(list(self._label_names)))
+
+
+class _BertClassifier(TextClassifier):
+  """APIs to help create and train a BERT-based text classifier."""
+
+  _INITIALIZER_RANGE = 0.02
+
+  def __init__(self, model_spec: ms.BertClassifierSpec,
+               model_options: mo.BertClassifierOptions, hparams: hp.BaseHParams,
+               label_names: Sequence[str]):
+    super().__init__(model_spec, hparams, label_names)
+    self._model_options = model_options
+    self._loss_function = tf.keras.losses.SparseCategoricalCrossentropy()
+    self._metric_function = tf.keras.metrics.SparseCategoricalAccuracy(
+        "test_accuracy", dtype=tf.float32)
+    self._text_preprocessor: preprocessor.BertClassifierPreprocessor = None
+
+  @classmethod
+  def create_bert_classifier(
+      cls, train_data: text_ds.Dataset, validation_data: text_ds.Dataset,
+      options: text_classifier_options.TextClassifierOptions,
+      label_names: Sequence[str]) -> "_BertClassifier":
+    """Creates, trains, and returns a BERT-based classifier.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+      options: Options for creating and training the text classifier.
+      label_names: Label names used in the data.
+
+    Returns:
+      A BERT-based classifier.
+    """
+    bert_classifier = _BertClassifier(
+        model_spec=options.supported_model.value(),
+        model_options=options.model_options,
+        hparams=options.hparams,
+        label_names=train_data.label_names)
+    bert_classifier._create_and_train_model(train_data, validation_data)
+    return bert_classifier
+
+  def _create_and_train_model(self, train_data: text_ds.Dataset,
+                              validation_data: text_ds.Dataset):
+    """Creates the BERT-based classifier keras model and trains it.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+    """
+    (processed_train_data, processed_validation_data) = (
+        self._load_and_run_preprocessor(train_data, validation_data))
+    self._create_model()
+    self._create_optimizer(processed_train_data)
+    self._train_model(processed_train_data, processed_validation_data)
+
+  def _load_and_run_preprocessor(
+      self, train_data: text_ds.Dataset, validation_data: text_ds.Dataset
+  ) -> Tuple[text_ds.Dataset, text_ds.Dataset]:
+    """Loads a BertClassifierPreprocessor and runs it on the data.
+
+    Args:
+      train_data: Training data.
+      validation_data: Validation data.
+
+    Returns:
+      Preprocessed training data and preprocessed validation data.
+    """
+    self._text_preprocessor = preprocessor.BertClassifierPreprocessor(
+        seq_len=self._model_options.seq_len,
+        do_lower_case=self._model_spec.do_lower_case,
+        uri=self._model_spec.uri)
+    return (self._text_preprocessor.preprocess(train_data),
+            self._text_preprocessor.preprocess(validation_data))
+
+  def _create_model(self):
+    """Creates a BERT-based classifier model.
+
+    The model architecture consists of stacking a dense classification layer and
+    dropout layer on top of the BERT encoder outputs.
+    """
+    encoder_inputs = dict(
+        input_word_ids=tf.keras.layers.Input(
+            shape=(self._model_options.seq_len,), dtype=tf.int32),
+        input_mask=tf.keras.layers.Input(
+            shape=(self._model_options.seq_len,), dtype=tf.int32),
+        input_type_ids=tf.keras.layers.Input(
+            shape=(self._model_options.seq_len,), dtype=tf.int32),
+    )
+    encoder = hub.KerasLayer(
+        self._model_spec.uri, trainable=self._model_options.do_fine_tuning)
+    encoder_outputs = encoder(encoder_inputs)
+    pooled_output = encoder_outputs["pooled_output"]
+
+    output = tf.keras.layers.Dropout(rate=self._model_options.dropout_rate)(
+        pooled_output)
+    initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=self._INITIALIZER_RANGE)
+    output = tf.keras.layers.Dense(
+        self._num_classes,
+        kernel_initializer=initializer,
+        name="output",
+        activation="softmax",
+        dtype=tf.float32)(
+            output)
+    self._model = tf.keras.Model(inputs=encoder_inputs, outputs=output)
+
+  def _create_optimizer(self, train_data: text_ds.Dataset):
+    """Loads an optimizer with a learning rate schedule.
+
+    The decay steps in the learning rate schedule depend on the
+    `steps_per_epoch` which may depend on the size of the training data.
+
+    Args:
+      train_data: Training data.
+    """
+    self._hparams.steps_per_epoch = model_util.get_steps_per_epoch(
+        steps_per_epoch=self._hparams.steps_per_epoch,
+        batch_size=self._hparams.batch_size,
+        train_data=train_data)
+    total_steps = self._hparams.steps_per_epoch * self._hparams.epochs
+    warmup_steps = int(total_steps * 0.1)
+    initial_lr = self._hparams.learning_rate
+    self._optimizer = optimization.create_optimizer(initial_lr, total_steps,
+                                                    warmup_steps)
+
+  def _save_vocab(self, vocab_filepath: str):
+    tf.io.gfile.copy(
+        self._text_preprocessor.get_vocab_file(),
+        vocab_filepath,
+        overwrite=True)
+
+  def _get_metadata_writer(self, tflite_model: bytearray, vocab_filepath: str):
+    return text_classifier_writer.MetadataWriter.create_for_bert_model(
+        model_buffer=tflite_model,
+        tokenizer=metadata_writer.BertTokenizer(vocab_filepath),
+        labels=metadata_writer.Labels().add(list(self._label_names)),
+        ids_name=self._model_spec.tflite_input_name["ids"],
+        mask_name=self._model_spec.tflite_input_name["mask"],
+        segment_name=self._model_spec.tflite_input_name["segment_ids"])
--- a/mediapipe/model_maker/python/text/text_classifier/text_classifier_demo.py
+++ b/mediapipe/model_maker/python/text/text_classifier/text_classifier_demo.py
@ -0,0 +1,108 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Demo for making a text classifier model by MediaPipe Model Maker."""
+
+import os
+import tempfile
+
+# Dependency imports
+
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.core.utils import quantization
+from mediapipe.model_maker.python.text.text_classifier import dataset as text_ds
+from mediapipe.model_maker.python.text.text_classifier import model_spec as ms
+from mediapipe.model_maker.python.text.text_classifier import text_classifier
+from mediapipe.model_maker.python.text.text_classifier import text_classifier_options
+
+FLAGS = flags.FLAGS
+
+
+def define_flags():
+  flags.DEFINE_string('export_dir', None,
+                      'The directory to save exported files.')
+  flags.DEFINE_enum('supported_model', 'average_word_embedding',
+                    ['average_word_embedding', 'bert'],
+                    'The text classifier to run.')
+  flags.mark_flag_as_required('export_dir')
+
+
+def download_demo_data():
+  """Downloads demo data, and returns directory path."""
+  data_path = tf.keras.utils.get_file(
+      fname='SST-2.zip',
+      origin='https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
+      extract=True)
+  return os.path.join(os.path.dirname(data_path), 'SST-2')  # folder name
+
+
+def run(data_dir,
+        export_dir=tempfile.mkdtemp(),
+        supported_model=ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER):
+  """Runs demo."""
+
+  # Gets training data and validation data.
+  csv_params = text_ds.CSVParameters(
+      text_column='sentence', label_column='label', delimiter='\t')
+  train_data = text_ds.Dataset.from_csv(
+      filename=os.path.join(os.path.join(data_dir, 'train.tsv')),
+      csv_params=csv_params)
+  validation_data = text_ds.Dataset.from_csv(
+      filename=os.path.join(os.path.join(data_dir, 'dev.tsv')),
+      csv_params=csv_params)
+
+  quantization_config = None
+  if supported_model == ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER:
+    hparams = hp.BaseHParams(
+        epochs=10, batch_size=32, learning_rate=0, export_dir=export_dir)
+  # Warning: This takes extremely long to run on CPU
+  elif supported_model == ms.SupportedModels.MOBILEBERT_CLASSIFIER:
+    quantization_config = quantization.QuantizationConfig.for_dynamic()
+    hparams = hp.BaseHParams(
+        epochs=3, batch_size=48, learning_rate=3e-5, export_dir=export_dir)
+
+  # Fine-tunes the model.
+  options = text_classifier_options.TextClassifierOptions(
+      supported_model=supported_model, hparams=hparams)
+  model = text_classifier.TextClassifier.create(train_data, validation_data,
+                                                options)
+
+  # Gets evaluation results.
+  _, acc = model.evaluate(validation_data)
+  print('Eval accuracy: %f' % acc)
+
+  model.export_model(quantization_config=quantization_config)
+  model.export_labels(export_dir=options.hparams.export_dir)
+
+
+def main(_):
+  logging.set_verbosity(logging.INFO)
+  data_dir = download_demo_data()
+  export_dir = os.path.expanduser(FLAGS.export_dir)
+
+  if FLAGS.supported_model == 'average_word_embedding':
+    supported_model = ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER
+  elif FLAGS.supported_model == 'bert':
+    supported_model = ms.SupportedModels.MOBILEBERT_CLASSIFIER
+
+  run(data_dir, export_dir, supported_model)
+
+
+if __name__ == '__main__':
+  define_flags()
+  app.run(main)
--- a/mediapipe/model_maker/python/text/text_classifier/text_classifier_options.py
+++ b/mediapipe/model_maker/python/text/text_classifier/text_classifier_options.py
@ -0,0 +1,38 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""User-facing customization options to create and train a text classifier."""
+
+import dataclasses
+from typing import Optional
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.text.text_classifier import model_options as mo
+from mediapipe.model_maker.python.text.text_classifier import model_spec as ms
+
+
+@dataclasses.dataclass
+class TextClassifierOptions:
+  """User-facing options for creating the text classifier.
+
+  Attributes:
+    supported_model: A preconfigured model spec.
+    hparams: Training hyperparameters the user can set to override the ones in
+      `supported_model`.
+    model_options: Model options the user can set to override the ones in
+      `supported_model`. The model options type should be consistent with the
+      architecture of the `supported_model`.
+  """
+  supported_model: ms.SupportedModels
+  hparams: Optional[hp.BaseHParams] = None
+  model_options: Optional[mo.TextClassifierModelOptions] = None
--- a/mediapipe/model_maker/python/text/text_classifier/text_classifier_test.py
+++ b/mediapipe/model_maker/python/text/text_classifier/text_classifier_test.py
@ -0,0 +1,138 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import filecmp
+import os
+
+import tensorflow as tf
+
+from mediapipe.model_maker.python.core import hyperparameters as hp
+from mediapipe.model_maker.python.text.text_classifier import dataset
+from mediapipe.model_maker.python.text.text_classifier import model_options as mo
+from mediapipe.model_maker.python.text.text_classifier import model_spec as ms
+from mediapipe.model_maker.python.text.text_classifier import text_classifier
+from mediapipe.model_maker.python.text.text_classifier import text_classifier_options
+from mediapipe.tasks.python.test import test_utils
+
+
+class TextClassifierTest(tf.test.TestCase):
+
+  _AVERAGE_WORD_EMBEDDING_JSON_FILE = (
+      test_utils.get_test_data_path('average_word_embedding_metadata.json'))
+
+  def _get_data(self):
+    labels_and_text = (('pos', 'super good'), (('neg', 'really bad')))
+    csv_file = os.path.join(self.get_temp_dir(), 'data.csv')
+    if os.path.exists(csv_file):
+      return csv_file
+    fieldnames = ['text', 'label']
+    with open(csv_file, 'w') as f:
+      writer = csv.DictWriter(f, fieldnames=fieldnames)
+      writer.writeheader()
+      for label, text in labels_and_text:
+        writer.writerow({'text': text, 'label': label})
+    csv_params = dataset.CSVParameters(text_column='text', label_column='label')
+    all_data = dataset.Dataset.from_csv(
+        filename=csv_file, csv_params=csv_params)
+    return all_data.split(0.5)
+
+  def test_create_and_train_average_word_embedding_model(self):
+    train_data, validation_data = self._get_data()
+    options = text_classifier_options.TextClassifierOptions(
+        supported_model=ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER,
+        hparams=hp.BaseHParams(epochs=1, batch_size=1, learning_rate=0))
+    average_word_embedding_classifier = text_classifier.TextClassifier.create(
+        train_data, validation_data, options)
+
+    _, accuracy = average_word_embedding_classifier.evaluate(validation_data)
+    self.assertGreaterEqual(accuracy, 0.0)
+
+    # Test export_model
+    average_word_embedding_classifier.export_model()
+    output_metadata_file = os.path.join(options.hparams.export_dir,
+                                        'metadata.json')
+    output_tflite_file = os.path.join(options.hparams.export_dir,
+                                      'model.tflite')
+
+    self.assertTrue(os.path.exists(output_tflite_file))
+    self.assertGreater(os.path.getsize(output_tflite_file), 0)
+
+    self.assertTrue(os.path.exists(output_metadata_file))
+    self.assertGreater(os.path.getsize(output_metadata_file), 0)
+    self.assertTrue(
+        filecmp.cmp(output_metadata_file,
+                    self._AVERAGE_WORD_EMBEDDING_JSON_FILE))
+
+  def test_create_and_train_bert(self):
+    train_data, validation_data = self._get_data()
+    options = text_classifier_options.TextClassifierOptions(
+        supported_model=ms.SupportedModels.MOBILEBERT_CLASSIFIER,
+        model_options=mo.BertClassifierOptions(do_fine_tuning=False, seq_len=2),
+        hparams=hp.BaseHParams(
+            epochs=1,
+            batch_size=1,
+            learning_rate=3e-5,
+            distribution_strategy='off'))
+    bert_classifier = text_classifier.TextClassifier.create(
+        train_data, validation_data, options)
+
+    _, accuracy = bert_classifier.evaluate(validation_data)
+    self.assertGreaterEqual(accuracy, 0.0)
+    # TODO: Add a unit test that does not run OOM.
+
+  def test_label_mismatch(self):
+    options = (
+        text_classifier_options.TextClassifierOptions(
+            supported_model=ms.SupportedModels.MOBILEBERT_CLASSIFIER))
+    train_tf_dataset = tf.data.Dataset.from_tensor_slices([[0]])
+    train_data = dataset.Dataset(train_tf_dataset, 1, ['foo'])
+    validation_tf_dataset = tf.data.Dataset.from_tensor_slices([[0]])
+    validation_data = dataset.Dataset(validation_tf_dataset, 1, ['bar'])
+    with self.assertRaisesRegex(
+        ValueError,
+        'Training data label names .* not equal to validation data label names'
+    ):
+      text_classifier.TextClassifier.create(train_data, validation_data,
+                                            options)
+
+  def test_options_mismatch(self):
+    train_data, validation_data = self._get_data()
+
+    avg_options = (
+        text_classifier_options.TextClassifierOptions(
+            supported_model=ms.SupportedModels.MOBILEBERT_CLASSIFIER,
+            model_options=mo.AverageWordEmbeddingClassifierOptions()))
+    with self.assertRaisesRegex(
+        ValueError, 'Expected AVERAGE_WORD_EMBEDDING_CLASSIFIER, got'
+        ' SupportedModels.MOBILEBERT_CLASSIFIER'):
+      text_classifier.TextClassifier.create(train_data, validation_data,
+                                            avg_options)
+
+    bert_options = (
+        text_classifier_options.TextClassifierOptions(
+            supported_model=(
+                ms.SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER),
+            model_options=mo.BertClassifierOptions()))
+    with self.assertRaisesRegex(
+        ValueError, 'Expected MOBILEBERT_CLASSIFIER, got'
+        ' SupportedModels.AVERAGE_WORD_EMBEDDING_CLASSIFIER'):
+      text_classifier.TextClassifier.create(train_data, validation_data,
+                                            bert_options)
+
+
+if __name__ == '__main__':
+  # Load compressed models from tensorflow_hub
+  os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
+  tf.test.main()