diff --git a/mediapipe/tasks/cc/text/tokenizers/sentencepiece_tokenizer_test.cc b/mediapipe/tasks/cc/text/tokenizers/sentencepiece_tokenizer_test.cc
index ed7decbd9..a42719446 100644
--- a/mediapipe/tasks/cc/text/tokenizers/sentencepiece_tokenizer_test.cc
+++ b/mediapipe/tasks/cc/text/tokenizers/sentencepiece_tokenizer_test.cc
@@ -32,24 +32,31 @@ constexpr char kTestSPModelPath[] =
     "mediapipe/tasks/testdata/text/30k-clean.model";
 }  // namespace
 
+std::unique_ptr<SentencePieceTokenizer> CreateSentencePieceTokenizer(
+    absl::string_view model_path) {
+  // We are using `LoadBinaryContent()` instead of loading the model directly
+  // via `SentencePieceTokenizer` so that the file can be located on Windows.
+  std::string buffer = LoadBinaryContent(kTestSPModelPath);
+  return absl::make_unique<SentencePieceTokenizer>(buffer.data(),
+                                                   buffer.size());
+}
+
 TEST(SentencePieceTokenizerTest, TestTokenize) {
-  auto tokenizer = absl::make_unique<SentencePieceTokenizer>(kTestSPModelPath);
+  auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
   auto results = tokenizer->Tokenize("good morning, i'm your teacher.\n");
   EXPECT_THAT(results.subwords, ElementsAre("▁good", "▁morning", ",", "▁i", "'",
                                             "m", "▁your", "▁teacher", "."));
 }
 
 TEST(SentencePieceTokenizerTest, TestTokenizeFromFileBuffer) {
-  std::string buffer = LoadBinaryContent(kTestSPModelPath);
-  auto tokenizer =
-      absl::make_unique<SentencePieceTokenizer>(buffer.data(), buffer.size());
+  auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
   EXPECT_THAT(tokenizer->Tokenize("good morning, i'm your teacher.\n").subwords,
               ElementsAre("▁good", "▁morning", ",", "▁i", "'", "m", "▁your",
                           "▁teacher", "."));
 }
 
 TEST(SentencePieceTokenizerTest, TestLookupId) {
-  auto tokenizer = absl::make_unique<SentencePieceTokenizer>(kTestSPModelPath);
+  auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
   std::vector<std::string> subwords = {"▁good", "▁morning", ",", "▁i", "'",
                                        "m",     "▁your",    "▁teacher", "."};
   std::vector<int> true_ids = {254, 959, 15, 31, 22, 79, 154, 2197, 9};
@@ -61,7 +68,7 @@ TEST(SentencePieceTokenizerTest, TestLookupId) {
 }
 
 TEST(SentencePieceTokenizerTest, TestLookupWord) {
-  auto tokenizer = absl::make_unique<SentencePieceTokenizer>(kTestSPModelPath);
+  auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
   std::vector<int> ids = {254, 959, 15, 31, 22, 79, 154, 2197, 9};
   std::vector<std::string> subwords = {"▁good", "▁morning", ",", "▁i", "'",
                                        "m",     "▁your",    "▁teacher", "."};
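
Side note (not part of the patch): the new helper relies on `LoadBinaryContent()` returning the serialized SentencePiece model as an in-memory `std::string`, whose bytes are then handed to the tokenizer's (data, size) constructor. The snippet below is a minimal, self-contained sketch of that file-to-buffer pattern in standard C++; `ReadFileToString` is a hypothetical stand-in for illustration, not MediaPipe's actual `LoadBinaryContent()` implementation.

#include <fstream>
#include <iterator>
#include <string>

// Hypothetical stand-in for LoadBinaryContent(): reads an entire file into a
// std::string so its bytes can be passed to a (data, size) constructor, e.g.
// absl::make_unique<SentencePieceTokenizer>(buffer.data(), buffer.size()).
std::string ReadFileToString(const std::string& path) {
  std::ifstream file(path, std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file),
                     std::istreambuf_iterator<char>());
}

Resolving the model bytes up front, rather than letting the tokenizer open the path itself, is what allows the model file to be located on Windows (per the comment in the new helper), and it lets all four tests share a single loading path.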