Internal change
PiperOrigin-RevId: 515706419
This commit is contained in:
parent
c3a32d76be
commit
c9bd4f5957
|
@ -0,0 +1,38 @@
|
||||||
|
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
package(default_visibility = ["//mediapipe/tasks:internal"])
|
||||||
|
|
||||||
|
licenses(["notice"])
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "murmur",
|
||||||
|
srcs = ["murmur.cc"],
|
||||||
|
hdrs = ["murmur.h"],
|
||||||
|
deps = [
|
||||||
|
"//mediapipe/framework/port:integral_types",
|
||||||
|
"@com_google_absl//absl/base:core_headers",
|
||||||
|
"@com_google_absl//absl/base:endian",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_test(
|
||||||
|
name = "murmur_test",
|
||||||
|
srcs = ["murmur_test.cc"],
|
||||||
|
deps = [
|
||||||
|
":murmur",
|
||||||
|
"//mediapipe/framework/port:gtest_main",
|
||||||
|
"//mediapipe/framework/port:integral_types",
|
||||||
|
],
|
||||||
|
)
|
|
@ -0,0 +1,95 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
// Forked from a library written by Austin Appelby and Jyrki Alakuijala.
|
||||||
|
// Original copyright message below.
|
||||||
|
// Copyright 2009 Google Inc. All Rights Reserved.
|
||||||
|
// Author: aappleby@google.com (Austin Appleby)
|
||||||
|
// jyrki@google.com (Jyrki Alakuijala)
|
||||||
|
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
#include "absl/base/internal/endian.h"
|
||||||
|
#include "absl/base/optimization.h"
|
||||||
|
#include "mediapipe/framework/port/integral_types.h"
|
||||||
|
|
||||||
|
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using ::absl::little_endian::Load64;
|
||||||
|
|
||||||
|
// Murmur 2.0 multiplication constant.
|
||||||
|
static const uint64_t kMul = 0xc6a4a7935bd1e995ULL;
|
||||||
|
|
||||||
|
// We need to mix some of the bits that get propagated and mixed into the
|
||||||
|
// high bits by multiplication back into the low bits. 17 last bits get
|
||||||
|
// a more efficiently mixed with this.
|
||||||
|
inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); }
|
||||||
|
|
||||||
|
// Accumulate 8 bytes into 64-bit Murmur hash
|
||||||
|
inline uint64_t MurmurStep(uint64_t hash, uint64_t data) {
|
||||||
|
hash ^= ShiftMix(data * kMul) * kMul;
|
||||||
|
hash *= kMul;
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build a uint64 from 1-8 bytes.
|
||||||
|
// 8 * len least significant bits are loaded from the memory with
|
||||||
|
// LittleEndian order. The 64 - 8 * len most significant bits are
|
||||||
|
// set all to 0.
|
||||||
|
// In latex-friendly words, this function returns:
|
||||||
|
// $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned.
|
||||||
|
//
|
||||||
|
// This function is equivalent to:
|
||||||
|
// uint64 val = 0;
|
||||||
|
// memcpy(&val, p, len);
|
||||||
|
// return ToHost64(val);
|
||||||
|
//
|
||||||
|
// The caller needs to guarantee that 0 <= len <= 8.
|
||||||
|
uint64_t Load64VariableLength(const void* const p, int len) {
|
||||||
|
ABSL_ASSUME(len >= 0 && len <= 8);
|
||||||
|
uint64_t val = 0;
|
||||||
|
const uint8_t* const src = static_cast<const uint8_t*>(p);
|
||||||
|
for (int i = 0; i < len; ++i) {
|
||||||
|
val |= static_cast<uint64_t>(src[i]) << (8 * i);
|
||||||
|
}
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
unsigned long long MurmurHash64WithSeed(const char* buf, // NOLINT
|
||||||
|
const size_t len, const uint64_t seed) {
|
||||||
|
// Let's remove the bytes not divisible by the sizeof(uint64).
|
||||||
|
// This allows the inner loop to process the data as 64 bit integers.
|
||||||
|
const size_t len_aligned = len & ~0x7;
|
||||||
|
const char* const end = buf + len_aligned;
|
||||||
|
uint64_t hash = seed ^ (len * kMul);
|
||||||
|
for (const char* p = buf; p != end; p += 8) {
|
||||||
|
hash = MurmurStep(hash, Load64(p));
|
||||||
|
}
|
||||||
|
if ((len & 0x7) != 0) {
|
||||||
|
const uint64_t data = Load64VariableLength(end, len & 0x7);
|
||||||
|
hash ^= data;
|
||||||
|
hash *= kMul;
|
||||||
|
}
|
||||||
|
hash = ShiftMix(hash) * kMul;
|
||||||
|
hash = ShiftMix(hash);
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
|
|
@ -0,0 +1,43 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
// Forked from a library written by Austin Appelby and Jyrki Alakuijala.
|
||||||
|
// Original copyright message below.
|
||||||
|
// Copyright 2009 Google Inc. All Rights Reserved.
|
||||||
|
// Author: aappleby@google.com (Austin Appelby)
|
||||||
|
// jyrki@google.com (Jyrki Alakuijala)
|
||||||
|
//
|
||||||
|
// MurmurHash is a fast multiplication and shifting based algorithm,
|
||||||
|
// based on Austin Appleby's MurmurHash 2.0 algorithm.
|
||||||
|
|
||||||
|
#ifndef UTIL_HASH_MURMUR_H_
|
||||||
|
#define UTIL_HASH_MURMUR_H_
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdlib.h> // for size_t.
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
#include "mediapipe/framework/port/integral_types.h"
|
||||||
|
|
||||||
|
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
|
||||||
|
|
||||||
|
// Hash function for a byte array. Has a seed which allows this hash function to
|
||||||
|
// be used in algorithms that need a family of parameterized hash functions.
|
||||||
|
// e.g. Minhash.
|
||||||
|
unsigned long long MurmurHash64WithSeed(const char* buf, size_t len, // NOLINT
|
||||||
|
uint64_t seed);
|
||||||
|
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
|
||||||
|
|
||||||
|
#endif // UTIL_HASH_MURMUR_H_
|
|
@ -0,0 +1,66 @@
|
||||||
|
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
// Forked from a test library written by Jyrki Alakuijala.
|
||||||
|
// Original copyright message below.
|
||||||
|
// Copyright 2009 Google Inc. All Rights Reserved.
|
||||||
|
// Author: jyrki@google.com (Jyrki Alakuijala)
|
||||||
|
//
|
||||||
|
// Tests for the fast hashing algorithm based on Austin Appleby's
|
||||||
|
// MurmurHash 2.0 algorithm. See http://murmurhash.googlepages.com/
|
||||||
|
|
||||||
|
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "mediapipe/framework/port/gmock.h"
|
||||||
|
#include "mediapipe/framework/port/gtest.h"
|
||||||
|
#include "mediapipe/framework/port/integral_types.h"
|
||||||
|
|
||||||
|
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
|
||||||
|
|
||||||
|
TEST(Murmur, EmptyData64) {
|
||||||
|
EXPECT_EQ(uint64_t{0}, MurmurHash64WithSeed(nullptr, uint64_t{0}, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Murmur, VaryWithDifferentSeeds) {
|
||||||
|
// While in theory different seeds could return the same
|
||||||
|
// hash for the same data this is unlikely.
|
||||||
|
char data1 = 'x';
|
||||||
|
EXPECT_NE(MurmurHash64WithSeed(&data1, 1, 100),
|
||||||
|
MurmurHash64WithSeed(&data1, 1, 101));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hashes don't change.
|
||||||
|
TEST(Murmur, Idempotence) {
|
||||||
|
const char data[] = "deadbeef";
|
||||||
|
const size_t dlen = strlen(data);
|
||||||
|
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
EXPECT_EQ(MurmurHash64WithSeed(data, dlen, i),
|
||||||
|
MurmurHash64WithSeed(data, dlen, i));
|
||||||
|
}
|
||||||
|
|
||||||
|
const char next_data[] = "deadbeef000---";
|
||||||
|
const size_t next_dlen = strlen(next_data);
|
||||||
|
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
EXPECT_EQ(MurmurHash64WithSeed(next_data, next_dlen, i),
|
||||||
|
MurmurHash64WithSeed(next_data, next_dlen, i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
|
Loading…
Reference in New Issue
Block a user