# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

package(default_visibility = ["//mediapipe/calculators/tensor:__subpackages__"])

licenses(["notice"])

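# Abstract Tokenizer interface implemented by the concrete tokenizers below.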
cc_library(
    name = "tokenizer",
    hdrs = [
        "tokenizer.h",
    ],
    deps = [
        "@com_google_absl//absl/strings",
    ],
)

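# WordPiece tokenizer for BERT-style models, built on the tensorflow_text
# regex_split and wordpiece_tokenizer kernels.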
cc_library(
    name = "bert_tokenizer",
    srcs = [
        "bert_tokenizer.cc",
    ],
    hdrs = [
        "bert_tokenizer.h",
    ],
    deps = [
        ":tokenizer",
        "//mediapipe/framework/port:integral_types",
        "//mediapipe/tasks/cc/text/utils:vocab_utils",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/strings",
        "@com_googlesource_code_re2//:re2",
        "@org_tensorflow_text//tensorflow_text/core/kernels:regex_split",
        "@org_tensorflow_text//tensorflow_text/core/kernels:wordpiece_tokenizer",
    ],
)

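# Unit tests for :bert_tokenizer, run against the test vocabulary files.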
cc_test(
    name = "bert_tokenizer_test",
    srcs = ["bert_tokenizer_test.cc"],
    data = [
        "//mediapipe/tasks/testdata/text:vocab_files",
    ],
    linkopts = ["-ldl"],
    deps = [
        ":bert_tokenizer",
        "//mediapipe/framework/port:gtest_main",
        "//mediapipe/tasks/cc/core:utils",
    ],
)

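# Tokenizer backed by the SentencePiece processor.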
cc_library(
    name = "sentencepiece_tokenizer",
    hdrs = [
        "sentencepiece_tokenizer.h",
    ],
    deps = [
        ":tokenizer",
        "//mediapipe/framework/port:logging",
        "@com_google_absl//absl/strings",
        "@com_google_sentencepiece//src:sentencepiece_processor",
    ],
)

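# Unit tests for :sentencepiece_tokenizer, using the ALBERT test model data.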
cc_test(
    name = "sentencepiece_tokenizer_test",
    srcs = ["sentencepiece_tokenizer_test.cc"],
    data = [
        "//mediapipe/tasks/testdata/text:albert_model",
    ],
    deps = [
        ":sentencepiece_tokenizer",
        "//mediapipe/framework/port:gtest_main",
        "//mediapipe/tasks/cc/core:utils",
        "@com_google_sentencepiece//src:sentencepiece_processor",
    ],
)

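# Utilities that construct the appropriate tokenizer (BERT, SentencePiece, or
# regex) from the tokenizer metadata embedded in a model.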
cc_library(
    name = "tokenizer_utils",
    srcs = ["tokenizer_utils.cc"],
    hdrs = [
        "tokenizer_utils.h",
    ],
    deps = [
        ":bert_tokenizer",
        ":regex_tokenizer",
        ":sentencepiece_tokenizer",
        ":tokenizer",
        "//mediapipe/framework/port:status",
        "//mediapipe/tasks/cc:common",
        "//mediapipe/tasks/cc/metadata:metadata_extractor",
        "//mediapipe/tasks/metadata:metadata_schema_cc",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings",
        "@flatbuffers//:runtime_cc",
    ],
)

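# Unit tests for :tokenizer_utils, exercising the BERT, SentencePiece, and
# regex code paths against the bundled test models.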
cc_test(
    name = "tokenizer_utils_test",
    srcs = ["tokenizer_utils_test.cc"],
    data = [
        "//mediapipe/tasks/testdata/text:albert_model",
        "//mediapipe/tasks/testdata/text:mobile_bert_model",
        "//mediapipe/tasks/testdata/text:text_classifier_models",
    ],
    linkopts = ["-ldl"],
    deps = [
        ":bert_tokenizer",
        ":regex_tokenizer",
        ":sentencepiece_tokenizer",
        ":tokenizer_utils",
        "//mediapipe/framework/port:gtest_main",
        "//mediapipe/framework/port:status",
        "//mediapipe/tasks/cc:common",
        "//mediapipe/tasks/cc/core:utils",
        "//mediapipe/tasks/cc/metadata:metadata_extractor",
        "//mediapipe/tasks/metadata:metadata_schema_cc",
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:cord",
        "@com_google_sentencepiece//src:sentencepiece_processor",
    ],
)

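# Regex-based tokenizer that splits text with RE2 and maps tokens through a
# vocabulary file.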
cc_library(
    name = "regex_tokenizer",
    srcs = [
        "regex_tokenizer.cc",
    ],
    hdrs = [
        "regex_tokenizer.h",
    ],
    deps = [
        ":tokenizer",
        "//mediapipe/tasks/cc/text/utils:vocab_utils",
        "@com_google_absl//absl/container:node_hash_map",
        "@com_google_absl//absl/strings",
        "@com_googlesource_code_re2//:re2",
    ],
)

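# Unit tests for :regex_tokenizer.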
cc_test(
    name = "regex_tokenizer_test",
    srcs = ["regex_tokenizer_test.cc"],
    data = [
        "//mediapipe/tasks/testdata/text:regex_tokenizer_files",
    ],
    deps = [
        ":regex_tokenizer",
        "//mediapipe/framework/port:gtest_main",
        "//mediapipe/tasks/cc/core:utils",
    ],
)