Make SentencePieceTokenizerTest work on Windows
PiperOrigin-RevId: 513638600
This commit is contained in:
parent
945b36766c
commit
7664e0ef64
|
@ -32,24 +32,31 @@ constexpr char kTestSPModelPath[] =
|
||||||
"mediapipe/tasks/testdata/text/30k-clean.model";
|
"mediapipe/tasks/testdata/text/30k-clean.model";
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
std::unique_ptr<SentencePieceTokenizer> CreateSentencePieceTokenizer(
|
||||||
|
absl::string_view model_path) {
|
||||||
|
// We are using `LoadBinaryContent()` instead of loading the model directly
|
||||||
|
// via `SentencePieceTokenizer` so that the file can be located on Windows
|
||||||
|
std::string buffer = LoadBinaryContent(kTestSPModelPath);
|
||||||
|
return absl::make_unique<SentencePieceTokenizer>(buffer.data(),
|
||||||
|
buffer.size());
|
||||||
|
}
|
||||||
|
|
||||||
TEST(SentencePieceTokenizerTest, TestTokenize) {
|
TEST(SentencePieceTokenizerTest, TestTokenize) {
|
||||||
auto tokenizer = absl::make_unique<SentencePieceTokenizer>(kTestSPModelPath);
|
auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
|
||||||
auto results = tokenizer->Tokenize("good morning, i'm your teacher.\n");
|
auto results = tokenizer->Tokenize("good morning, i'm your teacher.\n");
|
||||||
EXPECT_THAT(results.subwords, ElementsAre("▁good", "▁morning", ",", "▁i", "'",
|
EXPECT_THAT(results.subwords, ElementsAre("▁good", "▁morning", ",", "▁i", "'",
|
||||||
"m", "▁your", "▁teacher", "."));
|
"m", "▁your", "▁teacher", "."));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(SentencePieceTokenizerTest, TestTokenizeFromFileBuffer) {
|
TEST(SentencePieceTokenizerTest, TestTokenizeFromFileBuffer) {
|
||||||
std::string buffer = LoadBinaryContent(kTestSPModelPath);
|
auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
|
||||||
auto tokenizer =
|
|
||||||
absl::make_unique<SentencePieceTokenizer>(buffer.data(), buffer.size());
|
|
||||||
EXPECT_THAT(tokenizer->Tokenize("good morning, i'm your teacher.\n").subwords,
|
EXPECT_THAT(tokenizer->Tokenize("good morning, i'm your teacher.\n").subwords,
|
||||||
ElementsAre("▁good", "▁morning", ",", "▁i", "'", "m", "▁your",
|
ElementsAre("▁good", "▁morning", ",", "▁i", "'", "m", "▁your",
|
||||||
"▁teacher", "."));
|
"▁teacher", "."));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(SentencePieceTokenizerTest, TestLookupId) {
|
TEST(SentencePieceTokenizerTest, TestLookupId) {
|
||||||
auto tokenizer = absl::make_unique<SentencePieceTokenizer>(kTestSPModelPath);
|
auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
|
||||||
std::vector<std::string> subwords = {"▁good", "▁morning", ",", "▁i", "'", "m",
|
std::vector<std::string> subwords = {"▁good", "▁morning", ",", "▁i", "'", "m",
|
||||||
"▁your", "▁teacher", "."};
|
"▁your", "▁teacher", "."};
|
||||||
std::vector<int> true_ids = {254, 959, 15, 31, 22, 79, 154, 2197, 9};
|
std::vector<int> true_ids = {254, 959, 15, 31, 22, 79, 154, 2197, 9};
|
||||||
|
@ -61,7 +68,7 @@ TEST(SentencePieceTokenizerTest, TestLookupId) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(SentencePieceTokenizerTest, TestLookupWord) {
|
TEST(SentencePieceTokenizerTest, TestLookupWord) {
|
||||||
auto tokenizer = absl::make_unique<SentencePieceTokenizer>(kTestSPModelPath);
|
auto tokenizer = CreateSentencePieceTokenizer(kTestSPModelPath);
|
||||||
std::vector<int> ids = {254, 959, 15, 31, 22, 79, 154, 2197, 9};
|
std::vector<int> ids = {254, 959, 15, 31, 22, 79, 154, 2197, 9};
|
||||||
std::vector<std::string> subwords = {"▁good", "▁morning", ",", "▁i", "'", "m",
|
std::vector<std::string> subwords = {"▁good", "▁morning", ",", "▁i", "'", "m",
|
||||||
"▁your", "▁teacher", "."};
|
"▁your", "▁teacher", "."};
|
||||||
|
|
Loading…
Reference in New Issue
Block a user