Internal change
PiperOrigin-RevId: 515706419
This commit is contained in:
parent
c3a32d76be
commit
c9bd4f5957
|
@ -0,0 +1,38 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# By default, targets in this package are visible to //mediapipe/tasks:internal.
package(default_visibility = ["//mediapipe/tasks:internal"])
|
||||
|
||||
# License type declared for the targets in this package.
licenses(["notice"])
|
||||
|
||||
# C++ library target built from murmur.cc and exposing murmur.h.
cc_library(
|
||||
name = "murmur",
|
||||
srcs = ["murmur.cc"],
|
||||
hdrs = ["murmur.h"],
|
||||
deps = [
|
||||
"//mediapipe/framework/port:integral_types",
|
||||
"@com_google_absl//absl/base:core_headers",
|
||||
"@com_google_absl//absl/base:endian",
|
||||
],
|
||||
)
|
||||
|
||||
# Unit test target for the ":murmur" library, built from murmur_test.cc.
cc_test(
|
||||
name = "murmur_test",
|
||||
srcs = ["murmur_test.cc"],
|
||||
deps = [
|
||||
":murmur",
|
||||
"//mediapipe/framework/port:gtest_main",
|
||||
"//mediapipe/framework/port:integral_types",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,95 @@
|
|||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
// Forked from a library written by Austin Appleby and Jyrki Alakuijala.
|
||||
// Original copyright message below.
|
||||
// Copyright 2009 Google Inc. All Rights Reserved.
|
||||
// Author: aappleby@google.com (Austin Appleby)
|
||||
// jyrki@google.com (Jyrki Alakuijala)
|
||||
|
||||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "absl/base/internal/endian.h"
|
||||
#include "absl/base/optimization.h"
|
||||
#include "mediapipe/framework/port/integral_types.h"
|
||||
|
||||
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
|
||||
|
||||
namespace {
|
||||
|
||||
using ::absl::little_endian::Load64;
|
||||
|
||||
// Murmur 2.0 multiplication constant.
|
||||
// constexpr makes this a true compile-time constant; `static` is dropped
// because the enclosing unnamed namespace already gives internal linkage.
constexpr uint64_t kMul = 0xc6a4a7935bd1e995ULL;
|
||||
|
||||
// We need to mix some of the bits that get propagated and mixed into the
|
||||
// high bits by multiplication back into the low bits. The 17 lowest bits
|
||||
// get mixed more efficiently this way.
|
||||
inline uint64_t ShiftMix(uint64_t val) {
  // 64 - 47 = 17: the top 17 bits are XORed into the bottom 17 bits.
  const uint64_t high_bits = val >> 47;
  return val ^ high_bits;
}
|
||||
|
||||
// Accumulate 8 bytes into 64-bit Murmur hash
|
||||
inline uint64_t MurmurStep(uint64_t hash, uint64_t data) {
|
||||
hash ^= ShiftMix(data * kMul) * kMul;
|
||||
hash *= kMul;
|
||||
return hash;
|
||||
}
|
||||
|
||||
// Build a uint64 from 1-8 bytes.
|
||||
// 8 * len least significant bits are loaded from the memory with
|
||||
// LittleEndian order. The 64 - 8 * len most significant bits are
|
||||
// set all to 0.
|
||||
// In latex-friendly words, this function returns:
|
||||
// $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned.
|
||||
//
|
||||
// This function is equivalent to:
|
||||
// uint64 val = 0;
|
||||
// memcpy(&val, p, len);
|
||||
// return ToHost64(val);
|
||||
//
|
||||
// The caller needs to guarantee that 0 <= len <= 8.
|
||||
uint64_t Load64VariableLength(const void* const p, int len) {
|
||||
ABSL_ASSUME(len >= 0 && len <= 8);
|
||||
uint64_t val = 0;
|
||||
const uint8_t* const src = static_cast<const uint8_t*>(p);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val |= static_cast<uint64_t>(src[i]) << (8 * i);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
unsigned long long MurmurHash64WithSeed(const char* buf, // NOLINT
|
||||
const size_t len, const uint64_t seed) {
|
||||
// Let's remove the bytes not divisible by the sizeof(uint64).
|
||||
// This allows the inner loop to process the data as 64 bit integers.
|
||||
const size_t len_aligned = len & ~0x7;
|
||||
const char* const end = buf + len_aligned;
|
||||
uint64_t hash = seed ^ (len * kMul);
|
||||
for (const char* p = buf; p != end; p += 8) {
|
||||
hash = MurmurStep(hash, Load64(p));
|
||||
}
|
||||
if ((len & 0x7) != 0) {
|
||||
const uint64_t data = Load64VariableLength(end, len & 0x7);
|
||||
hash ^= data;
|
||||
hash *= kMul;
|
||||
}
|
||||
hash = ShiftMix(hash) * kMul;
|
||||
hash = ShiftMix(hash);
|
||||
return hash;
|
||||
}
|
||||
|
||||
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
|
|
@ -0,0 +1,43 @@
|
|||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
// Forked from a library written by Austin Appleby and Jyrki Alakuijala.
|
||||
// Original copyright message below.
|
||||
// Copyright 2009 Google Inc. All Rights Reserved.
|
||||
// Author: aappleby@google.com (Austin Appleby)
|
||||
// jyrki@google.com (Jyrki Alakuijala)
|
||||
//
|
||||
// MurmurHash is a fast multiplication and shifting based algorithm,
|
||||
// based on Austin Appleby's MurmurHash 2.0 algorithm.
|
||||
|
||||
#ifndef UTIL_HASH_MURMUR_H_
|
||||
#define UTIL_HASH_MURMUR_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h> // for size_t.
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "mediapipe/framework/port/integral_types.h"
|
||||
|
||||
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
|
||||
|
||||
// Hash function for a byte array. The seed parameter lets this function be
|
||||
// used in algorithms that need a family of parameterized hash functions,
|
||||
// e.g. MinHash.
//
// buf:  pointer to the first byte to hash.
// len:  number of bytes to hash, starting at buf.
// seed: value mixed into the initial hash state.
// Returns the 64-bit hash value.
|
||||
unsigned long long MurmurHash64WithSeed(const char* buf, size_t len, // NOLINT
|
||||
uint64_t seed);
|
||||
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
|
||||
|
||||
#endif // UTIL_HASH_MURMUR_H_
|
|
@ -0,0 +1,66 @@
|
|||
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
// Forked from a test library written by Jyrki Alakuijala.
|
||||
// Original copyright message below.
|
||||
// Copyright 2009 Google Inc. All Rights Reserved.
|
||||
// Author: jyrki@google.com (Jyrki Alakuijala)
|
||||
//
|
||||
// Tests for the fast hashing algorithm based on Austin Appleby's
|
||||
// MurmurHash 2.0 algorithm. See http://murmurhash.googlepages.com/
|
||||
|
||||
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
#include "mediapipe/framework/port/gmock.h"
|
||||
#include "mediapipe/framework/port/gtest.h"
|
||||
#include "mediapipe/framework/port/integral_types.h"
|
||||
|
||||
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
|
||||
|
||||
// Hashing zero bytes with seed 0 must produce 0.
TEST(Murmur, EmptyData64) {
  const uint64_t expected = 0;
  const auto actual = MurmurHash64WithSeed(nullptr, uint64_t{0}, 0);
  EXPECT_EQ(expected, actual);
}
|
||||
|
||||
// Hashing the same byte under two different seeds should give two different
// values. A collision is possible in theory, but unlikely.
TEST(Murmur, VaryWithDifferentSeeds) {
  const char byte = 'x';
  const auto hash_with_seed_100 = MurmurHash64WithSeed(&byte, 1, 100);
  const auto hash_with_seed_101 = MurmurHash64WithSeed(&byte, 1, 101);
  EXPECT_NE(hash_with_seed_100, hash_with_seed_101);
}
|
||||
|
||||
// Hashes don't change.
|
||||
TEST(Murmur, Idempotence) {
|
||||
const char data[] = "deadbeef";
|
||||
const size_t dlen = strlen(data);
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
EXPECT_EQ(MurmurHash64WithSeed(data, dlen, i),
|
||||
MurmurHash64WithSeed(data, dlen, i));
|
||||
}
|
||||
|
||||
const char next_data[] = "deadbeef000---";
|
||||
const size_t next_dlen = strlen(next_data);
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
EXPECT_EQ(MurmurHash64WithSeed(next_data, next_dlen, i),
|
||||
MurmurHash64WithSeed(next_data, next_dlen, i));
|
||||
}
|
||||
}
|
||||
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
|
Loading…
Reference in New Issue
Block a user