From c9bd4f5957ca100d320c8346fd430ba39b83a023 Mon Sep 17 00:00:00 2001
From: MediaPipe Team
Date: Fri, 10 Mar 2023 12:18:45 -0800
Subject: [PATCH] Internal change

PiperOrigin-RevId: 515706419
---
 .../custom_ops/utils/hash/BUILD                | 38 ++++++++
 .../custom_ops/utils/hash/murmur.cc            | 95 +++++++++++++++++++
 .../custom_ops/utils/hash/murmur.h             | 43 +++++++++
 .../custom_ops/utils/hash/murmur_test.cc       | 66 +++++++++++++
 4 files changed, 242 insertions(+)
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/BUILD
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.cc
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h
 create mode 100644 mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur_test.cc

diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/BUILD b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/BUILD
new file mode 100644
index 000000000..86b659245
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/BUILD
@@ -0,0 +1,38 @@
+# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(default_visibility = ["//mediapipe/tasks:internal"])
+
+licenses(["notice"])
+
+cc_library(
+    name = "murmur",
+    srcs = ["murmur.cc"],
+    hdrs = ["murmur.h"],
+    deps = [
+        "//mediapipe/framework/port:integral_types",
+        "@com_google_absl//absl/base:core_headers",  # absl/base/optimization.h (ABSL_ASSUME)
+        "@com_google_absl//absl/base:endian",  # absl/base/internal/endian.h (little_endian::Load64)
+    ],
+)
+
+cc_test(
+    name = "murmur_test",
+    srcs = ["murmur_test.cc"],
+    deps = [
+        ":murmur",
+        "//mediapipe/framework/port:gtest_main",
+        "//mediapipe/framework/port:integral_types",
+    ],
+)
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.cc b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.cc
new file mode 100644
index 000000000..75dd161bf
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.cc
@@ -0,0 +1,95 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Forked from a library written by Austin Appleby and Jyrki Alakuijala.
+// Original copyright message below.
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: aappleby@google.com (Austin Appleby)
+//         jyrki@google.com (Jyrki Alakuijala)
+
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
+
+#include <stdint.h>
+
+#include "absl/base/internal/endian.h"
+#include "absl/base/optimization.h"
+#include "mediapipe/framework/port/integral_types.h"
+
+namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
+
+namespace {
+
+using ::absl::little_endian::Load64;
+
+// Murmur 2.0 multiplication constant.
+constexpr uint64_t kMul = 0xc6a4a7935bd1e995ULL;
+
+// Multiplication propagates and mixes bits towards the high end of the word;
+// XOR-ing in the value shifted right by 47 folds the top 17 bits back into
+// the low bits so that they get mixed as well.
+inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); }
+
+// Accumulates one 8-byte word `data` into the running 64-bit Murmur `hash`.
+inline uint64_t MurmurStep(uint64_t hash, uint64_t data) {
+  hash ^= ShiftMix(data * kMul) * kMul;
+  hash *= kMul;
+  return hash;
+}
+
+// Builds a uint64_t from the first `len` bytes (at most 8) at `p`.
+// The 8 * len least significant bits are loaded from memory in
+// little-endian order. The 64 - 8 * len most significant bits are
+// all set to 0.
+// In latex-friendly words, this function returns:
+//     $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned.
+//
+// This function is equivalent to:
+//     uint64_t val = 0;
+//     memcpy(&val, p, len);
+//     return ToHost64(val);
+//
+// The caller needs to guarantee that 0 <= len <= 8.
+uint64_t Load64VariableLength(const void* const p, int len) {
+  ABSL_ASSUME(len >= 0 && len <= 8);
+  uint64_t val = 0;
+  const uint8_t* const src = static_cast<const uint8_t*>(p);
+  for (int i = 0; i < len; ++i) {
+    val |= static_cast<uint64_t>(src[i]) << (8 * i);
+  }
+  return val;
+}
+
+}  // namespace
+
+unsigned long long MurmurHash64WithSeed(const char* buf,  // NOLINT
+                                        const size_t len, const uint64_t seed) {
+  // Round the length down to a multiple of sizeof(uint64_t) so that the main
+  // loop can consume the input as whole 64-bit little-endian words.
+  const size_t len_aligned = len & ~0x7;
+  const char* const end = buf + len_aligned;
+  uint64_t hash = seed ^ (len * kMul);
+  for (const char* p = buf; p != end; p += 8) {
+    hash = MurmurStep(hash, Load64(p));
+  }
+  if ((len & 0x7) != 0) {
+    const uint64_t data = Load64VariableLength(end, len & 0x7);
+    hash ^= data;
+    hash *= kMul;
+  }
+  hash = ShiftMix(hash) * kMul;
+  hash = ShiftMix(hash);
+  return hash;
+}
+
+}  // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h
new file mode 100644
index 000000000..abcb41a6b
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h
@@ -0,0 +1,43 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Forked from a library written by Austin Appleby and Jyrki Alakuijala.
+// Original copyright message below.
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: aappleby@google.com (Austin Appleby)
+//         jyrki@google.com (Jyrki Alakuijala)
+//
+// MurmurHash is a fast multiplication and shifting based algorithm,
+// based on Austin Appleby's MurmurHash 2.0 algorithm.
+
+#ifndef UTIL_HASH_MURMUR_H_
+#define UTIL_HASH_MURMUR_H_
+
+#include <stddef.h>
+#include <stdlib.h>  // for size_t.
+
+#include <cstdint>
+
+#include "mediapipe/framework/port/integral_types.h"
+
+namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
+
+// Returns a 64-bit Murmur-style hash of the `len` bytes starting at `buf`.
+// `seed` selects one member of a family of parameterized hash functions,
+// as needed by algorithms such as Minhash.
+unsigned long long MurmurHash64WithSeed(const char* buf, size_t len,  // NOLINT
+                                        uint64_t seed);
+}  // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
+
+#endif  // UTIL_HASH_MURMUR_H_
diff --git a/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur_test.cc b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur_test.cc
new file mode 100644
index 000000000..6658965bf
--- /dev/null
+++ b/mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Forked from a test library written by Jyrki Alakuijala.
+// Original copyright message below.
+// Copyright 2009 Google Inc. All Rights Reserved.
+// Author: jyrki@google.com (Jyrki Alakuijala)
+//
+// Tests for the fast hashing algorithm based on Austin Appleby's
+// MurmurHash 2.0 algorithm. See http://murmurhash.googlepages.com/
+
+#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
+
+#include <string.h>
+
+#include <cstddef>
+#include <cstdint>
+
+#include "mediapipe/framework/port/gmock.h"
+#include "mediapipe/framework/port/gtest.h"
+#include "mediapipe/framework/port/integral_types.h"
+
+namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
+
+TEST(Murmur, EmptyData64) {
+  EXPECT_EQ(uint64_t{0}, MurmurHash64WithSeed(nullptr, uint64_t{0}, 0));
+}
+
+TEST(Murmur, VaryWithDifferentSeeds) {
+  // In theory two different seeds could produce the same hash for the
+  // same data, but this is unlikely.
+  char data1 = 'x';
+  EXPECT_NE(MurmurHash64WithSeed(&data1, 1, 100),
+            MurmurHash64WithSeed(&data1, 1, 101));
+}
+
+// Hashing the same bytes with the same seed twice yields the same value.
+TEST(Murmur, Idempotence) {
+  const char data[] = "deadbeef";
+  const size_t dlen = strlen(data);
+
+  for (int i = 0; i < 10; i++) {
+    EXPECT_EQ(MurmurHash64WithSeed(data, dlen, i),
+              MurmurHash64WithSeed(data, dlen, i));
+  }
+
+  const char next_data[] = "deadbeef000---";
+  const size_t next_dlen = strlen(next_data);
+
+  for (int i = 0; i < 10; i++) {
+    EXPECT_EQ(MurmurHash64WithSeed(next_data, next_dlen, i),
+              MurmurHash64WithSeed(next_data, next_dlen, i));
+  }
+}
+}  // namespace mediapipe::tasks::text::language_detector::custom_ops::hash