Internal change
PiperOrigin-RevId: 516871638
This commit is contained in:
parent
04ffb8432e
commit
ce3cd94f45
17
LICENSE
17
LICENSE
|
@ -199,3 +199,20 @@
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
limitations under the License.
|
limitations under the License.
|
||||||
|
|
||||||
|
===========================================================================
|
||||||
|
For files under tasks/cc/text/language_detector/custom_ops/utils/utf/
|
||||||
|
===========================================================================
|
||||||
|
/*
|
||||||
|
* The authors of this software are Rob Pike and Ken Thompson.
|
||||||
|
* Copyright (c) 2002 by Lucent Technologies.
|
||||||
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
|
* purpose without fee is hereby granted, provided that this entire notice
|
||||||
|
* is included in all copies of any software which is or includes a copy
|
||||||
|
* or modification of this software and in all copies of the supporting
|
||||||
|
* documentation for such software.
|
||||||
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
|
*/
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
package(default_visibility = ["//mediapipe/tasks:internal"])
|
||||||
|
|
||||||
|
licenses(["notice"])
|
||||||
|
|
||||||
|
# Tokenization and lowercasing helpers; built on the forked UTF-8 library.
cc_library(
    name = "ngram_hash_ops_utils",
    srcs = ["ngram_hash_ops_utils.cc"],
    hdrs = ["ngram_hash_ops_utils.h"],
    deps = ["//mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf"],
)
|
||||||
|
|
||||||
|
# Unit tests for the helpers declared in ngram_hash_ops_utils.h.
cc_test(
    name = "ngram_hash_ops_utils_test",
    size = "small",
    srcs = ["ngram_hash_ops_utils_test.cc"],
    deps = [
        ":ngram_hash_ops_utils",
        "//mediapipe/framework/port:gtest_main",
    ],
)
|
|
@ -0,0 +1,96 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"
|
||||||
|
|
||||||
|
namespace mediapipe::tasks::text::language_detector::custom_ops {
|
||||||
|
|
||||||
|
// Splits the first `len` bytes of `input_str` into UTF-8 runes and returns
// them framed by a "^" prefix token and a "$" suffix token.
//
// The rune loop stops once only one slot (kept for the suffix) remains below
// `max_tokens`, or as soon as `utf_charntorune` reports that no further rune
// can be read. The prefix and the suffix are emitted unconditionally. A
// negative `max_tokens` converts to a very large unsigned value in the size
// comparison, which in practice lifts the limit.
//
// When `exclude_nonalphaspace_tokens` is true, every rune for which
// `utf_isalpharune` returns false is replaced by a single " " token.
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
                         bool exclude_nonalphaspace_tokens) {
  const std::string kPrefix = "^";
  const std::string kSuffix = "$";
  const std::string kReplacementToken = " ";

  // A negative length is handled as an empty input, so the reservations below
  // never receive a wrapped-around size.
  const size_t input_len = len > 0 ? static_cast<size_t>(len) : 0;
  // Spells out the unsigned conversion that comparing against
  // `output.tokens.size()` applies to `max_tokens` anyway.
  const size_t token_limit = static_cast<size_t>(max_tokens);

  TokenizedOutput output;

  // Every emitted token is at most as long as the input bytes it stands for,
  // so the input length plus prefix and suffix bounds the output string.
  output.str.reserve(input_len + kPrefix.size() + kSuffix.size());

  // At most one token per input byte plus prefix and suffix, and never more
  // than the token limit (which cannot go below the two framing tokens).
  size_t token_capacity = input_len + 2;
  if (token_limit < token_capacity) {
    token_capacity = token_limit < 2 ? 2 : token_limit;
  }
  output.tokens.reserve(token_capacity);

  // Records a token that starts at the current end of `output.str`, then
  // appends its bytes.
  const auto append_token = [&output](const char* bytes, size_t size) {
    output.tokens.emplace_back(output.str.size(), size);
    output.str.append(bytes, size);
  };

  append_token(kPrefix.data(), kPrefix.size());

  Rune token;
  for (int i = 0; i < len && output.tokens.size() + 1 < token_limit;) {
    // Use the standard UTF-8 library to find the next token.
    const size_t bytes_read = utf_charntorune(&token, input_str + i, len - i);

    // Stop processing if no further token can be read.
    if (bytes_read == 0) {
      break;
    }

    if (exclude_nonalphaspace_tokens && !utf_isalpharune(token)) {
      // Non-alphabetic rune: emit the replacement token instead.
      append_token(kReplacementToken.data(), kReplacementToken.size());
    } else {
      // Copy the rune's bytes verbatim.
      append_token(input_str + i, bytes_read);
    }
    i += static_cast<int>(bytes_read);
  }

  append_token(kSuffix.data(), kSuffix.size());

  return output;
}
|
||||||
|
|
||||||
|
void LowercaseUnicodeStr(const char* input_str, int len,
|
||||||
|
std::string* output_str) {
|
||||||
|
for (int i = 0; i < len;) {
|
||||||
|
Rune token;
|
||||||
|
|
||||||
|
// Tokenize the given string, and get the appropriate lowercase token.
|
||||||
|
size_t bytes_read = utf_charntorune(&token, input_str + i, len - i);
|
||||||
|
token = utf_isalpharune(token) ? utf_tolowerrune(token) : token;
|
||||||
|
|
||||||
|
// Write back the token to the output string.
|
||||||
|
char token_buf[UTFmax];
|
||||||
|
size_t bytes_to_write = utf_runetochar(token_buf, &token);
|
||||||
|
output_str->append(token_buf, bytes_to_write);
|
||||||
|
|
||||||
|
i += bytes_read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace mediapipe::tasks::text::language_detector::custom_ops
|
|
@ -0,0 +1,56 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
|
||||||
|
#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace mediapipe::tasks::text::language_detector::custom_ops {
|
||||||
|
|
||||||
|
// Result of `Tokenize`: the processed string together with the byte span that
// each token occupies inside it.
struct TokenizedOutput {
|
||||||
|
// The processed string (with the prefix, the suffix and any replaced tokens).
|
||||||
|
std::string str;
|
||||||
|
|
||||||
|
// One pair per token. The first member is the byte offset at which the
|
||||||
|
// token starts in the `str` string, and the second member is the length of
|
||||||
|
// that token in bytes.
|
||||||
|
std::vector<std::pair<const size_t, const size_t>> tokens;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Tokenizes the given input string on Unicode token boundaries, with a maximum
|
||||||
|
// of `max_tokens` tokens.
|
||||||
|
//
|
||||||
|
// If `exclude_nonalphaspace_tokens` is enabled, every token that is not an
|
||||||
|
// alphabetic rune is replaced with a single replacement token (" ").
|
||||||
|
//
|
||||||
|
// The method returns the output in the `TokenizedOutput` struct, which stores
|
||||||
|
// both, the processed input string, and the indices and sizes of each token
|
||||||
|
// within that string.
|
||||||
|
TokenizedOutput Tokenize(const char* input_str, int len, int max_tokens,
|
||||||
|
bool exclude_nonalphaspace_tokens);
|
||||||
|
|
||||||
|
// Converts the given unicode string (`input_str`) with the specified length
|
||||||
|
// (`len`) to a lowercase string.
|
||||||
|
//
|
||||||
|
// The method populates the lowercased string in `output_str`.
|
||||||
|
void LowercaseUnicodeStr(const char* input_str, int len,
|
||||||
|
std::string* output_str);
|
||||||
|
|
||||||
|
} // namespace mediapipe::tasks::text::language_detector::custom_ops
|
||||||
|
|
||||||
|
#endif // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_NGRAM_HASH_OPS_UTILS_H_
|
|
@ -0,0 +1,135 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/ngram_hash_ops_utils.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "mediapipe/framework/port/gmock.h"
|
||||||
|
#include "mediapipe/framework/port/gtest.h"
|
||||||
|
|
||||||
|
namespace mediapipe::tasks::text::language_detector::custom_ops {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using ::testing::Values;
|
||||||
|
|
||||||
|
std::string ReconstructStringFromTokens(TokenizedOutput output) {
|
||||||
|
std::string reconstructed_str;
|
||||||
|
for (int i = 0; i < output.tokens.size(); i++) {
|
||||||
|
reconstructed_str.append(
|
||||||
|
output.str.c_str() + output.tokens[i].first,
|
||||||
|
output.str.c_str() + output.tokens[i].first + output.tokens[i].second);
|
||||||
|
}
|
||||||
|
return reconstructed_str;
|
||||||
|
}
|
||||||
|
|
||||||
|
// One scenario for the parameterized `Tokenize` test.
struct TokenizeTestParams {
  // Text handed to `Tokenize`.
  std::string input_str;
  // Forwarded as the `max_tokens` argument. Declared as `int` to match the
  // `Tokenize` signature and avoid an implicit size_t -> int narrowing at the
  // call site.
  int max_tokens;
  // Forwarded as the `exclude_nonalphaspace_tokens` argument.
  bool exclude_nonalphaspace_tokens;
  // Expected value of `TokenizedOutput::str`.
  std::string expected_output_str;
};
|
||||||
|
|
||||||
|
class TokenizeParameterizedTest
|
||||||
|
: public ::testing::Test,
|
||||||
|
public testing::WithParamInterface<TokenizeTestParams> {};
|
||||||
|
|
||||||
|
TEST_P(TokenizeParameterizedTest, Tokenize) {
  // Run `Tokenize` on the scenario supplied by the test parameter.
  const TokenizeTestParams test_case = GetParam();
  const TokenizedOutput tokenized =
      Tokenize(/*input_str=*/test_case.input_str.c_str(),
               /*len=*/test_case.input_str.size(),
               /*max_tokens=*/test_case.max_tokens,
               /*exclude_nonalphaspace_tokens=*/
               test_case.exclude_nonalphaspace_tokens);

  // The processed string must equal the expectation, and stitching the
  // recorded token spans back together must yield that same string.
  EXPECT_EQ(tokenized.str, test_case.expected_output_str);
  EXPECT_EQ(ReconstructStringFromTokens(tokenized),
            test_case.expected_output_str);
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
|
TokenizeParameterizedTests, TokenizeParameterizedTest,
|
||||||
|
Values(
|
||||||
|
// Test including non-alphanumeric characters.
|
||||||
|
TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
|
||||||
|
/*exclude_alphanonspace=*/false,
|
||||||
|
/*expected_output_str=*/"^hi!$"}),
|
||||||
|
// Test not including non-alphanumeric characters.
|
||||||
|
TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/100,
|
||||||
|
/*exclude_alphanonspace=*/true,
|
||||||
|
/*expected_output_str=*/"^hi $"}),
|
||||||
|
// Test with a maximum of 3 tokens.
|
||||||
|
TokenizeTestParams({/*input_str=*/"hi!", /*max_tokens=*/3,
|
||||||
|
/*exclude_alphanonspace=*/true,
|
||||||
|
/*expected_output_str=*/"^h$"}),
|
||||||
|
// Test with non-latin characters.
|
||||||
|
TokenizeTestParams({/*input_str=*/"ありがと", /*max_tokens=*/100,
|
||||||
|
/*exclude_alphanonspace=*/true,
|
||||||
|
/*expected_output_str=*/"^ありがと$"})));
|
||||||
|
|
||||||
|
TEST(LowercaseUnicodeTest, TestLowercaseUnicode) {
  // Runs `LowercaseUnicodeStr` over `input` and returns what it produced.
  const auto lowercased = [](const std::string& input) {
    std::string result;
    LowercaseUnicodeStr(/*input_str=*/input.c_str(), /*len=*/input.size(),
                        /*output_str=*/&result);
    return result;
  };

  // An already lowercase string comes back unchanged.
  EXPECT_EQ(lowercased("hello"), "hello");

  // Uppercase Latin letters are converted to lowercase.
  EXPECT_EQ(lowercased("hElLo"), "hello");

  // Cyrillic has the concept of cases, so uppercase letters must change.
  EXPECT_EQ(lowercased("БЙп"), "бйп");

  // Japanese doesn't have the concept of cases, so nothing must change.
  EXPECT_EQ(lowercased("ありがと"), "ありがと");
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace mediapipe::tasks::text::language_detector::custom_ops
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
package(default_visibility = ["//mediapipe/tasks:internal"])
|
||||||
|
|
||||||
|
licenses(["notice"])
|
||||||
|
|
||||||
|
# Forked UTF-8 rune library; utf.h is the only public header.
cc_library(
    name = "utf",
    srcs = [
        "rune.c",
        "runetype.c",
        "runetypebody.h",
    ],
    hdrs = [
        "utf.h",
    ],
)
|
|
@ -0,0 +1,233 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
// Forked from a library written by Rob Pike and Ken Thompson. Original
|
||||||
|
// copyright message below.
|
||||||
|
/*
|
||||||
|
* The authors of this software are Rob Pike and Ken Thompson.
|
||||||
|
* Copyright (c) 2002 by Lucent Technologies.
|
||||||
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
|
* purpose without fee is hereby granted, provided that this entire notice
|
||||||
|
* is included in all copies of any software which is or includes a copy
|
||||||
|
* or modification of this software and in all copies of the supporting
|
||||||
|
* documentation for such software.
|
||||||
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
|
*/
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"
|
||||||
|
|
||||||
|
enum
|
||||||
|
{
|
||||||
|
Bit1 = 7,
|
||||||
|
Bitx = 6,
|
||||||
|
Bit2 = 5,
|
||||||
|
Bit3 = 4,
|
||||||
|
Bit4 = 3,
|
||||||
|
Bit5 = 2,
|
||||||
|
|
||||||
|
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||||
|
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||||
|
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||||
|
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||||
|
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||||
|
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||||
|
|
||||||
|
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
||||||
|
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
||||||
|
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
||||||
|
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
||||||
|
/* 0001 1111 1111 1111 1111 1111 */
|
||||||
|
|
||||||
|
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||||
|
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||||
|
|
||||||
|
Bad = Runeerror,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
|
||||||
|
* This is a slower but "safe" version of the old chartorune
|
||||||
|
* that works on strings that are not necessarily null-terminated.
|
||||||
|
*
|
||||||
|
* If you know for sure that your string is null-terminated,
|
||||||
|
* chartorune will be a bit faster.
|
||||||
|
*
|
||||||
|
* It is guaranteed not to attempt to access "length"
|
||||||
|
* past the incoming pointer. This is to avoid
|
||||||
|
* possible access violations. If the string appears to be
|
||||||
|
* well-formed but incomplete (i.e., to get the whole Rune
|
||||||
|
* we'd need to read past str+length) then we'll set the Rune
|
||||||
|
* to Bad and return 0.
|
||||||
|
*
|
||||||
|
* Note that if we have decoding problems for other
|
||||||
|
* reasons, we return 1 instead of 0.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
utf_charntorune(Rune *rune, const char *str, int length)
|
||||||
|
{
|
||||||
|
int c, c1, c2, c3;
|
||||||
|
long l;
|
||||||
|
|
||||||
|
/* When we're not allowed to read anything */
|
||||||
|
if(length <= 0) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* one character sequence (7-bit value)
|
||||||
|
* 00000-0007F => T1
|
||||||
|
*/
|
||||||
|
c = *(uchar*)str;
|
||||||
|
if(c < Tx) {
|
||||||
|
*rune = c;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we can't read more than one character we must stop
|
||||||
|
if(length <= 1) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* two character sequence (11-bit value)
|
||||||
|
* 0080-07FF => T2 Tx
|
||||||
|
*/
|
||||||
|
c1 = *(uchar*)(str+1) ^ Tx;
|
||||||
|
if(c1 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T3) {
|
||||||
|
if(c < T2)
|
||||||
|
goto bad;
|
||||||
|
l = ((c << Bitx) | c1) & Rune2;
|
||||||
|
if(l <= Rune1)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we can't read more than two characters we must stop
|
||||||
|
if(length <= 2) {
|
||||||
|
goto badlen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* three character sequence (16-bit value)
|
||||||
|
* 0800-FFFF => T3 Tx Tx
|
||||||
|
*/
|
||||||
|
c2 = *(uchar*)(str+2) ^ Tx;
|
||||||
|
if(c2 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if(c < T4) {
|
||||||
|
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||||
|
if(l <= Rune2)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (length <= 3)
|
||||||
|
goto badlen;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
c3 = *(uchar*)(str+3) ^ Tx;
|
||||||
|
if (c3 & Testx)
|
||||||
|
goto bad;
|
||||||
|
if (c < T5) {
|
||||||
|
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||||
|
if (l <= Rune3)
|
||||||
|
goto bad;
|
||||||
|
if (l > Runemax)
|
||||||
|
goto bad;
|
||||||
|
*rune = l;
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Support for 5-byte or longer UTF-8 would go here, but
|
||||||
|
// since we don't have that, we'll just fall through to bad.
|
||||||
|
|
||||||
|
/*
|
||||||
|
* bad decoding
|
||||||
|
*/
|
||||||
|
bad:
|
||||||
|
*rune = Bad;
|
||||||
|
return 1;
|
||||||
|
badlen:
|
||||||
|
*rune = Bad;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
utf_runetochar(char *str, const Rune *rune)
|
||||||
|
{
|
||||||
|
/* Runes are signed, so convert to unsigned for range check. */
|
||||||
|
unsigned long c;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* one character sequence
|
||||||
|
* 00000-0007F => 00-7F
|
||||||
|
*/
|
||||||
|
c = *rune;
|
||||||
|
if(c <= Rune1) {
|
||||||
|
str[0] = c;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* two character sequence
|
||||||
|
* 0080-07FF => T2 Tx
|
||||||
|
*/
|
||||||
|
if(c <= Rune2) {
|
||||||
|
str[0] = T2 | (c >> 1*Bitx);
|
||||||
|
str[1] = Tx | (c & Maskx);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the Rune is out of range, convert it to the error rune.
|
||||||
|
* Do this test here because the error rune encodes to three bytes.
|
||||||
|
* Doing it earlier would duplicate work, since an out of range
|
||||||
|
* Rune wouldn't have fit in one or two bytes.
|
||||||
|
*/
|
||||||
|
if (c > Runemax)
|
||||||
|
c = Runeerror;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* three character sequence
|
||||||
|
* 0800-FFFF => T3 Tx Tx
|
||||||
|
*/
|
||||||
|
if (c <= Rune3) {
|
||||||
|
str[0] = T3 | (c >> 2*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | (c & Maskx);
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* four character sequence (21-bit value)
|
||||||
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
|
*/
|
||||||
|
str[0] = T4 | (c >> 3*Bitx);
|
||||||
|
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||||
|
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
|
str[3] = Tx | (c & Maskx);
|
||||||
|
return 4;
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
// Forked from a library written by Rob Pike and Ken Thompson. Original
|
||||||
|
// copyright message below.
|
||||||
|
/*
|
||||||
|
* The authors of this software are Rob Pike and Ken Thompson.
|
||||||
|
* Copyright (c) 2002 by Lucent Technologies.
|
||||||
|
* Permission to use, copy, modify, and distribute this software for any
|
||||||
|
* purpose without fee is hereby granted, provided that this entire notice
|
||||||
|
* is included in all copies of any software which is or includes a copy
|
||||||
|
* or modification of this software and in all copies of the supporting
|
||||||
|
* documentation for such software.
|
||||||
|
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||||
|
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||||
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
|
*/
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/utf.h"
|
||||||
|
|
||||||
|
static
|
||||||
|
Rune*
|
||||||
|
rbsearch(Rune c, Rune *t, int n, int ne)
|
||||||
|
{
|
||||||
|
Rune *p;
|
||||||
|
int m;
|
||||||
|
|
||||||
|
while(n > 1) {
|
||||||
|
m = n >> 1;
|
||||||
|
p = t + m*ne;
|
||||||
|
if(c >= p[0]) {
|
||||||
|
t = p;
|
||||||
|
n = n-m;
|
||||||
|
} else
|
||||||
|
n = m;
|
||||||
|
}
|
||||||
|
if(n && c >= t[0])
|
||||||
|
return t;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define RUNETYPEBODY
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/utf/runetypebody.h"
|
|
@ -0,0 +1,212 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifdef RUNETYPEBODY
|
||||||
|
|
||||||
|
/*
 * Alphabetic code-point ranges, stored as consecutive [first, last] pairs
 * in ascending order. utf_isalpharune() binary-searches this table with a
 * record width of 2 and accepts a rune c when first <= c <= last.
 */
static Rune __isalphar[] = {
|
||||||
|
0x0041, 0x005a, 0x0061, 0x007a, 0x00c0, 0x00d6, 0x00d8, 0x00f6,
|
||||||
|
0x00f8, 0x02c1, 0x02c6, 0x02d1, 0x02e0, 0x02e4, 0x0370, 0x0374,
|
||||||
|
0x0376, 0x0377, 0x037a, 0x037d, 0x0388, 0x038a, 0x038e, 0x03a1,
|
||||||
|
0x03a3, 0x03f5, 0x03f7, 0x0481, 0x048a, 0x0527, 0x0531, 0x0556,
|
||||||
|
0x0561, 0x0587, 0x05d0, 0x05ea, 0x05f0, 0x05f2, 0x0620, 0x064a,
|
||||||
|
0x066e, 0x066f, 0x0671, 0x06d3, 0x06e5, 0x06e6, 0x06ee, 0x06ef,
|
||||||
|
0x06fa, 0x06fc, 0x0712, 0x072f, 0x074d, 0x07a5, 0x07ca, 0x07ea,
|
||||||
|
0x07f4, 0x07f5, 0x0800, 0x0815, 0x0840, 0x0858, 0x08a2, 0x08ac,
|
||||||
|
0x0904, 0x0939, 0x0958, 0x0961, 0x0971, 0x0977, 0x0979, 0x097f,
|
||||||
|
0x0985, 0x098c, 0x098f, 0x0990, 0x0993, 0x09a8, 0x09aa, 0x09b0,
|
||||||
|
0x09b6, 0x09b9, 0x09dc, 0x09dd, 0x09df, 0x09e1, 0x09f0, 0x09f1,
|
||||||
|
0x0a05, 0x0a0a, 0x0a0f, 0x0a10, 0x0a13, 0x0a28, 0x0a2a, 0x0a30,
|
||||||
|
0x0a32, 0x0a33, 0x0a35, 0x0a36, 0x0a38, 0x0a39, 0x0a59, 0x0a5c,
|
||||||
|
0x0a72, 0x0a74, 0x0a85, 0x0a8d, 0x0a8f, 0x0a91, 0x0a93, 0x0aa8,
|
||||||
|
0x0aaa, 0x0ab0, 0x0ab2, 0x0ab3, 0x0ab5, 0x0ab9, 0x0ae0, 0x0ae1,
|
||||||
|
0x0b05, 0x0b0c, 0x0b0f, 0x0b10, 0x0b13, 0x0b28, 0x0b2a, 0x0b30,
|
||||||
|
0x0b32, 0x0b33, 0x0b35, 0x0b39, 0x0b5c, 0x0b5d, 0x0b5f, 0x0b61,
|
||||||
|
0x0b85, 0x0b8a, 0x0b8e, 0x0b90, 0x0b92, 0x0b95, 0x0b99, 0x0b9a,
|
||||||
|
0x0b9e, 0x0b9f, 0x0ba3, 0x0ba4, 0x0ba8, 0x0baa, 0x0bae, 0x0bb9,
|
||||||
|
0x0c05, 0x0c0c, 0x0c0e, 0x0c10, 0x0c12, 0x0c28, 0x0c2a, 0x0c33,
|
||||||
|
0x0c35, 0x0c39, 0x0c58, 0x0c59, 0x0c60, 0x0c61, 0x0c85, 0x0c8c,
|
||||||
|
0x0c8e, 0x0c90, 0x0c92, 0x0ca8, 0x0caa, 0x0cb3, 0x0cb5, 0x0cb9,
|
||||||
|
0x0ce0, 0x0ce1, 0x0cf1, 0x0cf2, 0x0d05, 0x0d0c, 0x0d0e, 0x0d10,
|
||||||
|
0x0d12, 0x0d3a, 0x0d60, 0x0d61, 0x0d7a, 0x0d7f, 0x0d85, 0x0d96,
|
||||||
|
0x0d9a, 0x0db1, 0x0db3, 0x0dbb, 0x0dc0, 0x0dc6, 0x0e01, 0x0e30,
|
||||||
|
0x0e32, 0x0e33, 0x0e40, 0x0e46, 0x0e81, 0x0e82, 0x0e87, 0x0e88,
|
||||||
|
0x0e94, 0x0e97, 0x0e99, 0x0e9f, 0x0ea1, 0x0ea3, 0x0eaa, 0x0eab,
|
||||||
|
0x0ead, 0x0eb0, 0x0eb2, 0x0eb3, 0x0ec0, 0x0ec4, 0x0edc, 0x0edf,
|
||||||
|
0x0f40, 0x0f47, 0x0f49, 0x0f6c, 0x0f88, 0x0f8c, 0x1000, 0x102a,
|
||||||
|
0x1050, 0x1055, 0x105a, 0x105d, 0x1065, 0x1066, 0x106e, 0x1070,
|
||||||
|
0x1075, 0x1081, 0x10a0, 0x10c5, 0x10d0, 0x10fa, 0x10fc, 0x1248,
|
||||||
|
0x124a, 0x124d, 0x1250, 0x1256, 0x125a, 0x125d, 0x1260, 0x1288,
|
||||||
|
0x128a, 0x128d, 0x1290, 0x12b0, 0x12b2, 0x12b5, 0x12b8, 0x12be,
|
||||||
|
0x12c2, 0x12c5, 0x12c8, 0x12d6, 0x12d8, 0x1310, 0x1312, 0x1315,
|
||||||
|
0x1318, 0x135a, 0x1380, 0x138f, 0x13a0, 0x13f4, 0x1401, 0x166c,
|
||||||
|
0x166f, 0x167f, 0x1681, 0x169a, 0x16a0, 0x16ea, 0x1700, 0x170c,
|
||||||
|
0x170e, 0x1711, 0x1720, 0x1731, 0x1740, 0x1751, 0x1760, 0x176c,
|
||||||
|
0x176e, 0x1770, 0x1780, 0x17b3, 0x1820, 0x1877, 0x1880, 0x18a8,
|
||||||
|
0x18b0, 0x18f5, 0x1900, 0x191c, 0x1950, 0x196d, 0x1970, 0x1974,
|
||||||
|
0x1980, 0x19ab, 0x19c1, 0x19c7, 0x1a00, 0x1a16, 0x1a20, 0x1a54,
|
||||||
|
0x1b05, 0x1b33, 0x1b45, 0x1b4b, 0x1b83, 0x1ba0, 0x1bae, 0x1baf,
|
||||||
|
0x1bba, 0x1be5, 0x1c00, 0x1c23, 0x1c4d, 0x1c4f, 0x1c5a, 0x1c7d,
|
||||||
|
0x1ce9, 0x1cec, 0x1cee, 0x1cf1, 0x1cf5, 0x1cf6, 0x1d00, 0x1dbf,
|
||||||
|
0x1e00, 0x1f15, 0x1f18, 0x1f1d, 0x1f20, 0x1f45, 0x1f48, 0x1f4d,
|
||||||
|
0x1f50, 0x1f57, 0x1f5f, 0x1f7d, 0x1f80, 0x1fb4, 0x1fb6, 0x1fbc,
|
||||||
|
0x1fc2, 0x1fc4, 0x1fc6, 0x1fcc, 0x1fd0, 0x1fd3, 0x1fd6, 0x1fdb,
|
||||||
|
0x1fe0, 0x1fec, 0x1ff2, 0x1ff4, 0x1ff6, 0x1ffc, 0x2090, 0x209c,
|
||||||
|
0x210a, 0x2113, 0x2119, 0x211d, 0x212a, 0x212d, 0x212f, 0x2139,
|
||||||
|
0x213c, 0x213f, 0x2145, 0x2149, 0x2183, 0x2184, 0x2c00, 0x2c2e,
|
||||||
|
0x2c30, 0x2c5e, 0x2c60, 0x2ce4, 0x2ceb, 0x2cee, 0x2cf2, 0x2cf3,
|
||||||
|
0x2d00, 0x2d25, 0x2d30, 0x2d67, 0x2d80, 0x2d96, 0x2da0, 0x2da6,
|
||||||
|
0x2da8, 0x2dae, 0x2db0, 0x2db6, 0x2db8, 0x2dbe, 0x2dc0, 0x2dc6,
|
||||||
|
0x2dc8, 0x2dce, 0x2dd0, 0x2dd6, 0x2dd8, 0x2dde, 0x3005, 0x3006,
|
||||||
|
0x3031, 0x3035, 0x303b, 0x303c, 0x3041, 0x3096, 0x309d, 0x309f,
|
||||||
|
0x30a1, 0x30fa, 0x30fc, 0x30ff, 0x3105, 0x312d, 0x3131, 0x318e,
|
||||||
|
0x31a0, 0x31ba, 0x31f0, 0x31ff, 0x3400, 0x4db5, 0x4e00, 0x9fcc,
|
||||||
|
0xa000, 0xa48c, 0xa4d0, 0xa4fd, 0xa500, 0xa60c, 0xa610, 0xa61f,
|
||||||
|
0xa62a, 0xa62b, 0xa640, 0xa66e, 0xa67f, 0xa697, 0xa6a0, 0xa6e5,
|
||||||
|
0xa717, 0xa71f, 0xa722, 0xa788, 0xa78b, 0xa78e, 0xa790, 0xa793,
|
||||||
|
0xa7a0, 0xa7aa, 0xa7f8, 0xa801, 0xa803, 0xa805, 0xa807, 0xa80a,
|
||||||
|
0xa80c, 0xa822, 0xa840, 0xa873, 0xa882, 0xa8b3, 0xa8f2, 0xa8f7,
|
||||||
|
0xa90a, 0xa925, 0xa930, 0xa946, 0xa960, 0xa97c, 0xa984, 0xa9b2,
|
||||||
|
0xaa00, 0xaa28, 0xaa40, 0xaa42, 0xaa44, 0xaa4b, 0xaa60, 0xaa76,
|
||||||
|
0xaa80, 0xaaaf, 0xaab5, 0xaab6, 0xaab9, 0xaabd, 0xaadb, 0xaadd,
|
||||||
|
0xaae0, 0xaaea, 0xaaf2, 0xaaf4, 0xab01, 0xab06, 0xab09, 0xab0e,
|
||||||
|
0xab11, 0xab16, 0xab20, 0xab26, 0xab28, 0xab2e, 0xabc0, 0xabe2,
|
||||||
|
0xac00, 0xd7a3, 0xd7b0, 0xd7c6, 0xd7cb, 0xd7fb, 0xf900, 0xfa6d,
|
||||||
|
0xfa70, 0xfad9, 0xfb00, 0xfb06, 0xfb13, 0xfb17, 0xfb1f, 0xfb28,
|
||||||
|
0xfb2a, 0xfb36, 0xfb38, 0xfb3c, 0xfb40, 0xfb41, 0xfb43, 0xfb44,
|
||||||
|
0xfb46, 0xfbb1, 0xfbd3, 0xfd3d, 0xfd50, 0xfd8f, 0xfd92, 0xfdc7,
|
||||||
|
0xfdf0, 0xfdfb, 0xfe70, 0xfe74, 0xfe76, 0xfefc, 0xff21, 0xff3a,
|
||||||
|
0xff41, 0xff5a, 0xff66, 0xffbe, 0xffc2, 0xffc7, 0xffca, 0xffcf,
|
||||||
|
0xffd2, 0xffd7, 0xffda, 0xffdc, 0x10000, 0x1000b, 0x1000d, 0x10026,
|
||||||
|
0x10028, 0x1003a, 0x1003c, 0x1003d, 0x1003f, 0x1004d, 0x10050, 0x1005d,
|
||||||
|
0x10080, 0x100fa, 0x10280, 0x1029c, 0x102a0, 0x102d0, 0x10300, 0x1031e,
|
||||||
|
0x10330, 0x10340, 0x10342, 0x10349, 0x10380, 0x1039d, 0x103a0, 0x103c3,
|
||||||
|
0x103c8, 0x103cf, 0x10400, 0x1049d, 0x10800, 0x10805, 0x1080a, 0x10835,
|
||||||
|
0x10837, 0x10838, 0x1083f, 0x10855, 0x10900, 0x10915, 0x10920, 0x10939,
|
||||||
|
0x10980, 0x109b7, 0x109be, 0x109bf, 0x10a10, 0x10a13, 0x10a15, 0x10a17,
|
||||||
|
0x10a19, 0x10a33, 0x10a60, 0x10a7c, 0x10b00, 0x10b35, 0x10b40, 0x10b55,
|
||||||
|
0x10b60, 0x10b72, 0x10c00, 0x10c48, 0x11003, 0x11037, 0x11083, 0x110af,
|
||||||
|
0x110d0, 0x110e8, 0x11103, 0x11126, 0x11183, 0x111b2, 0x111c1, 0x111c4,
|
||||||
|
0x11680, 0x116aa, 0x12000, 0x1236e, 0x13000, 0x1342e, 0x16800, 0x16a38,
|
||||||
|
0x16f00, 0x16f44, 0x16f93, 0x16f9f, 0x1b000, 0x1b001, 0x1d400, 0x1d454,
|
||||||
|
0x1d456, 0x1d49c, 0x1d49e, 0x1d49f, 0x1d4a5, 0x1d4a6, 0x1d4a9, 0x1d4ac,
|
||||||
|
0x1d4ae, 0x1d4b9, 0x1d4bd, 0x1d4c3, 0x1d4c5, 0x1d505, 0x1d507, 0x1d50a,
|
||||||
|
0x1d50d, 0x1d514, 0x1d516, 0x1d51c, 0x1d51e, 0x1d539, 0x1d53b, 0x1d53e,
|
||||||
|
0x1d540, 0x1d544, 0x1d54a, 0x1d550, 0x1d552, 0x1d6a5, 0x1d6a8, 0x1d6c0,
|
||||||
|
0x1d6c2, 0x1d6da, 0x1d6dc, 0x1d6fa, 0x1d6fc, 0x1d714, 0x1d716, 0x1d734,
|
||||||
|
0x1d736, 0x1d74e, 0x1d750, 0x1d76e, 0x1d770, 0x1d788, 0x1d78a, 0x1d7a8,
|
||||||
|
0x1d7aa, 0x1d7c2, 0x1d7c4, 0x1d7cb, 0x1ee00, 0x1ee03, 0x1ee05, 0x1ee1f,
|
||||||
|
0x1ee21, 0x1ee22, 0x1ee29, 0x1ee32, 0x1ee34, 0x1ee37, 0x1ee4d, 0x1ee4f,
|
||||||
|
0x1ee51, 0x1ee52, 0x1ee61, 0x1ee62, 0x1ee67, 0x1ee6a, 0x1ee6c, 0x1ee72,
|
||||||
|
0x1ee74, 0x1ee77, 0x1ee79, 0x1ee7c, 0x1ee80, 0x1ee89, 0x1ee8b, 0x1ee9b,
|
||||||
|
0x1eea1, 0x1eea3, 0x1eea5, 0x1eea9, 0x1eeab, 0x1eebb, 0x20000, 0x2a6d6,
|
||||||
|
0x2a700, 0x2b734, 0x2b740, 0x2b81d, 0x2f800, 0x2fa1d,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Individual alphabetic code points in ascending order, one Rune per
 * record. utf_isalpharune() binary-searches this table with a record width
 * of 1 and accepts a rune only on an exact match.
 */
static Rune __isalphas[] = {
|
||||||
|
0x00aa, 0x00b5, 0x00ba, 0x02ec, 0x02ee, 0x0386, 0x038c, 0x0559,
|
||||||
|
0x06d5, 0x06ff, 0x0710, 0x07b1, 0x07fa, 0x081a, 0x0824, 0x0828,
|
||||||
|
0x08a0, 0x093d, 0x0950, 0x09b2, 0x09bd, 0x09ce, 0x0a5e, 0x0abd,
|
||||||
|
0x0ad0, 0x0b3d, 0x0b71, 0x0b83, 0x0b9c, 0x0bd0, 0x0c3d, 0x0cbd,
|
||||||
|
0x0cde, 0x0d3d, 0x0d4e, 0x0dbd, 0x0e84, 0x0e8a, 0x0e8d, 0x0ea5,
|
||||||
|
0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
|
||||||
|
0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59,
|
||||||
|
0x1f5b, 0x1f5d, 0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115,
|
||||||
|
0x2124, 0x2126, 0x2128, 0x214e, 0x2d27, 0x2d2d, 0x2d6f, 0x2e2f,
|
||||||
|
0xa8fb, 0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xfb1d, 0xfb3e,
|
||||||
|
0x10808, 0x1083c, 0x10a00, 0x16f50, 0x1d4a2, 0x1d4bb, 0x1d546, 0x1ee24,
|
||||||
|
0x1ee27, 0x1ee39, 0x1ee3b, 0x1ee42, 0x1ee47, 0x1ee49, 0x1ee4b, 0x1ee54,
|
||||||
|
0x1ee57, 0x1ee59, 0x1ee5b, 0x1ee5d, 0x1ee5f, 0x1ee64, 0x1ee7e,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Reports whether rune c is listed as alphabetic.
 *
 * The range table __isalphar ([first, last] pairs) is consulted first; if c
 * is not inside any range, the single-code-point table __isalphas is checked
 * for an exact match. Returns 1 when c is found, 0 otherwise.
 */
int utf_isalpharune(Rune c) {
  Rune *range = rbsearch(c, __isalphar, nelem(__isalphar) / 2, 2);
  if (range != 0 && range[0] <= c && c <= range[1]) {
    return 1;
  }

  Rune *single = rbsearch(c, __isalphas, nelem(__isalphas), 1);
  return (single != 0 && single[0] == c) ? 1 : 0;
}
|
||||||
|
|
||||||
|
/*
 * Lower-case mapping for contiguous ranges, stored as consecutive
 * [first, last, offset] triples in ascending order. utf_tolowerrune() maps
 * every rune c with first <= c <= last to c + offset - 1048576, i.e. the
 * third element is the delta to apply, stored with 1048576 added to it.
 */
static Rune __tolowerr[] = {
|
||||||
|
0x0041, 0x005a, 1048608, 0x00c0, 0x00d6, 1048608, 0x00d8, 0x00de, 1048608,
|
||||||
|
0x0189, 0x018a, 1048781, 0x01b1, 0x01b2, 1048793, 0x0388, 0x038a, 1048613,
|
||||||
|
0x038e, 0x038f, 1048639, 0x0391, 0x03a1, 1048608, 0x03a3, 0x03ab, 1048608,
|
||||||
|
0x03fd, 0x03ff, 1048446, 0x0400, 0x040f, 1048656, 0x0410, 0x042f, 1048608,
|
||||||
|
0x0531, 0x0556, 1048624, 0x10a0, 0x10c5, 1055840, 0x1f08, 0x1f0f, 1048568,
|
||||||
|
0x1f18, 0x1f1d, 1048568, 0x1f28, 0x1f2f, 1048568, 0x1f38, 0x1f3f, 1048568,
|
||||||
|
0x1f48, 0x1f4d, 1048568, 0x1f68, 0x1f6f, 1048568, 0x1f88, 0x1f8f, 1048568,
|
||||||
|
0x1f98, 0x1f9f, 1048568, 0x1fa8, 0x1faf, 1048568, 0x1fb8, 0x1fb9, 1048568,
|
||||||
|
0x1fba, 0x1fbb, 1048502, 0x1fc8, 0x1fcb, 1048490, 0x1fd8, 0x1fd9, 1048568,
|
||||||
|
0x1fda, 0x1fdb, 1048476, 0x1fe8, 0x1fe9, 1048568, 0x1fea, 0x1feb, 1048464,
|
||||||
|
0x1ff8, 0x1ff9, 1048448, 0x1ffa, 0x1ffb, 1048450, 0x2160, 0x216f, 1048592,
|
||||||
|
0x24b6, 0x24cf, 1048602, 0x2c00, 0x2c2e, 1048624, 0x2c7e, 0x2c7f, 1037761,
|
||||||
|
0xff21, 0xff3a, 1048608, 0x10400, 0x10427, 1048616,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Lower-case mapping for alternating ranges, stored as consecutive
 * [first, last, offset] triples in ascending order. utf_tolowerrune() only
 * maps a rune c in [first, last] when (c - first) is even, returning
 * c + offset - 1048576; runes at odd positions in the range are left alone
 * by this table (presumably they are already lower case).
 */
static Rune __tolowerp[] = {
|
||||||
|
0x0100, 0x012e, 1048577, 0x0132, 0x0136, 1048577, 0x0139, 0x0147, 1048577,
|
||||||
|
0x014a, 0x0176, 1048577, 0x017b, 0x017d, 1048577, 0x01a2, 0x01a4, 1048577,
|
||||||
|
0x01b3, 0x01b5, 1048577, 0x01cd, 0x01db, 1048577, 0x01de, 0x01ee, 1048577,
|
||||||
|
0x01f8, 0x021e, 1048577, 0x0222, 0x0232, 1048577, 0x0248, 0x024e, 1048577,
|
||||||
|
0x0370, 0x0372, 1048577, 0x03d8, 0x03ee, 1048577, 0x0460, 0x0480, 1048577,
|
||||||
|
0x048a, 0x04be, 1048577, 0x04c3, 0x04cd, 1048577, 0x04d0, 0x0526, 1048577,
|
||||||
|
0x1e00, 0x1e94, 1048577, 0x1ea0, 0x1efe, 1048577, 0x1f59, 0x1f5f, 1048568,
|
||||||
|
0x2c67, 0x2c6b, 1048577, 0x2c80, 0x2ce2, 1048577, 0x2ceb, 0x2ced, 1048577,
|
||||||
|
0xa640, 0xa66c, 1048577, 0xa680, 0xa696, 1048577, 0xa722, 0xa72e, 1048577,
|
||||||
|
0xa732, 0xa76e, 1048577, 0xa779, 0xa77b, 1048577, 0xa780, 0xa786, 1048577,
|
||||||
|
0xa790, 0xa792, 1048577, 0xa7a0, 0xa7a8, 1048577,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Lower-case mapping for individual code points, stored as consecutive
 * [code point, offset] pairs in ascending order. utf_tolowerrune() maps a
 * rune c that exactly matches the first element to c + offset - 1048576.
 */
static Rune __tolowers[] = {
|
||||||
|
0x0130, 1048377, 0x0178, 1048455, 0x0179, 1048577, 0x0181, 1048786,
|
||||||
|
0x0182, 1048577, 0x0184, 1048577, 0x0186, 1048782, 0x0187, 1048577,
|
||||||
|
0x018b, 1048577, 0x018e, 1048655, 0x018f, 1048778, 0x0190, 1048779,
|
||||||
|
0x0191, 1048577, 0x0193, 1048781, 0x0194, 1048783, 0x0196, 1048787,
|
||||||
|
0x0197, 1048785, 0x0198, 1048577, 0x019c, 1048787, 0x019d, 1048789,
|
||||||
|
0x019f, 1048790, 0x01a0, 1048577, 0x01a6, 1048794, 0x01a7, 1048577,
|
||||||
|
0x01a9, 1048794, 0x01ac, 1048577, 0x01ae, 1048794, 0x01af, 1048577,
|
||||||
|
0x01b7, 1048795, 0x01b8, 1048577, 0x01bc, 1048577, 0x01c4, 1048578,
|
||||||
|
0x01c5, 1048577, 0x01c7, 1048578, 0x01c8, 1048577, 0x01ca, 1048578,
|
||||||
|
0x01cb, 1048577, 0x01f1, 1048578, 0x01f2, 1048577, 0x01f4, 1048577,
|
||||||
|
0x01f6, 1048479, 0x01f7, 1048520, 0x0220, 1048446, 0x023a, 1059371,
|
||||||
|
0x023b, 1048577, 0x023d, 1048413, 0x023e, 1059368, 0x0241, 1048577,
|
||||||
|
0x0243, 1048381, 0x0244, 1048645, 0x0245, 1048647, 0x0246, 1048577,
|
||||||
|
0x0376, 1048577, 0x0386, 1048614, 0x038c, 1048640, 0x03cf, 1048584,
|
||||||
|
0x03f4, 1048516, 0x03f7, 1048577, 0x03f9, 1048569, 0x03fa, 1048577,
|
||||||
|
0x04c0, 1048591, 0x04c1, 1048577, 0x10c7, 1055840, 0x10cd, 1055840,
|
||||||
|
0x1e9e, 1040961, 0x1fbc, 1048567, 0x1fcc, 1048567, 0x1fec, 1048569,
|
||||||
|
0x1ffc, 1048567, 0x2126, 1041059, 0x212a, 1040193, 0x212b, 1040314,
|
||||||
|
0x2132, 1048604, 0x2183, 1048577, 0x2c60, 1048577, 0x2c62, 1037833,
|
||||||
|
0x2c63, 1044762, 0x2c64, 1037849, 0x2c6d, 1037796, 0x2c6e, 1037827,
|
||||||
|
0x2c6f, 1037793, 0x2c70, 1037794, 0x2c72, 1048577, 0x2c75, 1048577,
|
||||||
|
0x2cf2, 1048577, 0xa77d, 1013244, 0xa77e, 1048577, 0xa78b, 1048577,
|
||||||
|
0xa78d, 1006296, 0xa7aa, 1006268,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Maps rune c to its lower-case counterpart, or returns c unchanged when no
 * mapping table covers it.
 *
 * Three tables are tried in order, and the first one that covers c wins:
 *   1. __tolowerr: [first, last, offset] ranges where every rune is mapped.
 *   2. __tolowerp: [first, last, offset] ranges where only runes at an even
 *      distance from `first` are mapped.
 *   3. __tolowers: [code point, offset] entries matched exactly.
 */
Rune utf_tolowerrune(Rune c) {
  /* Table offsets are stored with this bias added; subtract it again. */
  const Rune bias = 0x100000; /* 1048576 */
  Rune *hit;

  hit = rbsearch(c, __tolowerr, nelem(__tolowerr) / 3, 3);
  if (hit != 0 && hit[0] <= c && c <= hit[1]) {
    return c + hit[2] - bias;
  }

  hit = rbsearch(c, __tolowerp, nelem(__tolowerp) / 3, 3);
  if (hit != 0 && hit[0] <= c && c <= hit[1] && ((c - hit[0]) & 1) == 0) {
    return c + hit[2] - bias;
  }

  hit = rbsearch(c, __tolowers, nelem(__tolowers) / 2, 2);
  if (hit != 0 && hit[0] == c) {
    return c + hit[1] - bias;
  }

  return c;
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,98 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
// Fork of several UTF utils originally written by Rob Pike and Ken Thompson.
|
||||||
|
#ifndef MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_UTF_UTF_H_
|
||||||
|
#define MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_UTF_UTF_H_ 1
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
// Code-point values in Unicode 4.0 are 21 bits wide.
|
||||||
|
typedef signed int Rune;
|
||||||
|
|
||||||
|
// Because of the macro below, every later `uchar` token is rewritten to
// `_utfuchar`, so the typedef that follows actually introduces the name
// `_utfuchar`. NOTE(review): presumably this avoids clashing with other
// `uchar` typedefs in the including translation unit -- confirm.
#define uchar _utfuchar
|
||||||
|
|
||||||
|
typedef unsigned char uchar;
|
||||||
|
|
||||||
|
// Number of elements in array `x`, computed from sizeof. `x` must be an
// actual array: for a pointer, sizeof(x) is the pointer size and the result
// is not an element count.
#define nelem(x) (sizeof(x) / sizeof((x)[0]))
|
||||||
|
|
||||||
|
// Encoding limits and the decoding-error sentinel used by the rune routines.
enum {
|
||||||
|
UTFmax = 4, // maximum bytes per rune
|
||||||
|
Runeerror = 0xFFFD, // decoding error in UTF
|
||||||
|
Runemax = 0x10FFFF, // maximum rune value
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* rune routines
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* These routines were written by Rob Pike and Ken Thompson
|
||||||
|
* and first appeared in Plan 9.
|
||||||
|
* SEE ALSO
|
||||||
|
* utf (7)
|
||||||
|
* tcs (1)
|
||||||
|
*/
|
||||||
|
|
||||||
|
// utf_runetochar copies (encodes) one rune, pointed to by r, to at most
|
||||||
|
// UTFmax bytes starting at s and returns the number of bytes generated.
|
||||||
|
|
||||||
|
int utf_runetochar(char* s, const Rune* r);
|
||||||
|
|
||||||
|
// utf_charntorune copies (decodes) at most UTFmax bytes starting at `str` to
|
||||||
|
// one rune, pointed to by `rune`, accesses at most `length` bytes of `str`, and
|
||||||
|
// returns the number of bytes consumed.
|
||||||
|
// If the UTF sequence is incomplete within `length` bytes,
|
||||||
|
// utf_charntorune will set *rune to Runeerror and return 0. If it is complete
|
||||||
|
// but not in UTF format, it will set *rune to Runeerror and return 1.
|
||||||
|
//
|
||||||
|
// Added 2004-09-24 by Wei-Hwa Huang
|
||||||
|
|
||||||
|
int utf_charntorune(Rune* rune, const char* str, int length);
|
||||||
|
|
||||||
|
// Unicode defines some characters as letters and
|
||||||
|
// specifies three cases: upper, lower, and title. Mappings among the
|
||||||
|
// cases are also defined, although they are not exhaustive: some
|
||||||
|
// upper case letters have no lower case mapping, and so on. Unicode
|
||||||
|
// also defines several character properties, a subset of which are
|
||||||
|
// checked by these routines. These routines are based on Unicode
|
||||||
|
// version 3.0.0.
|
||||||
|
//
|
||||||
|
// NOTE: The routines are implemented in C, so utf_isalpharune returns 0 for false
|
||||||
|
// and 1 for true.
|
||||||
|
//
|
||||||
|
// utf_tolowerrune is the Unicode case mapping. It returns the character
|
||||||
|
// unchanged if it has no defined mapping.
|
||||||
|
|
||||||
|
Rune utf_tolowerrune(Rune r);
|
||||||
|
|
||||||
|
// utf_isalpharune tests for Unicode letters; this includes ideographs in
|
||||||
|
// addition to alphabetic characters.
|
||||||
|
|
||||||
|
int utf_isalpharune(Rune r);
|
||||||
|
|
||||||
|
// (The comments in this file were copied from the manpage files rune.3,
|
||||||
|
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
|
||||||
|
// to conform to Google style. /JRM 11/11/05)
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // MEDIAPIPE_TASKS_CC_TEXT_LANGUAGE_DETECTOR_CUSTOM_OPS_UTILS_UTF_UTF_H_
|
Loading…
Reference in New Issue
Block a user