Internal change

PiperOrigin-RevId: 515706419
This commit is contained in:
MediaPipe Team 2023-03-10 12:18:45 -08:00 committed by Copybara-Service
parent c3a32d76be
commit c9bd4f5957
4 changed files with 242 additions and 0 deletions

View File

@ -0,0 +1,38 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
package(default_visibility = ["//mediapipe/tasks:internal"])
licenses(["notice"])
cc_library(
name = "murmur",
srcs = ["murmur.cc"],
hdrs = ["murmur.h"],
deps = [
"//mediapipe/framework/port:integral_types",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/base:endian",
],
)
cc_test(
name = "murmur_test",
srcs = ["murmur_test.cc"],
deps = [
":murmur",
"//mediapipe/framework/port:gtest_main",
"//mediapipe/framework/port:integral_types",
],
)

View File

@ -0,0 +1,95 @@
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Forked from a library written by Austin Appelby and Jyrki Alakuijala.
// Original copyright message below.
// Copyright 2009 Google Inc. All Rights Reserved.
// Author: aappleby@google.com (Austin Appleby)
// jyrki@google.com (Jyrki Alakuijala)
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
#include <cstdint>
#include "absl/base/internal/endian.h"
#include "absl/base/optimization.h"
#include "mediapipe/framework/port/integral_types.h"
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
namespace {
using ::absl::little_endian::Load64;
// Murmur 2.0 multiplication constant.
static const uint64_t kMul = 0xc6a4a7935bd1e995ULL;
// We need to mix some of the bits that get propagated and mixed into the
// high bits by multiplication back into the low bits. 17 last bits get
// a more efficiently mixed with this.
inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); }
// Accumulate 8 bytes into 64-bit Murmur hash
inline uint64_t MurmurStep(uint64_t hash, uint64_t data) {
hash ^= ShiftMix(data * kMul) * kMul;
hash *= kMul;
return hash;
}
// Build a uint64 from 1-8 bytes.
// 8 * len least significant bits are loaded from the memory with
// LittleEndian order. The 64 - 8 * len most significant bits are
// set all to 0.
// In latex-friendly words, this function returns:
// $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned.
//
// This function is equivalent to:
// uint64 val = 0;
// memcpy(&val, p, len);
// return ToHost64(val);
//
// The caller needs to guarantee that 0 <= len <= 8.
uint64_t Load64VariableLength(const void* const p, int len) {
ABSL_ASSUME(len >= 0 && len <= 8);
uint64_t val = 0;
const uint8_t* const src = static_cast<const uint8_t*>(p);
for (int i = 0; i < len; ++i) {
val |= static_cast<uint64_t>(src[i]) << (8 * i);
}
return val;
}
} // namespace
unsigned long long MurmurHash64WithSeed(const char* buf, // NOLINT
const size_t len, const uint64_t seed) {
// Let's remove the bytes not divisible by the sizeof(uint64).
// This allows the inner loop to process the data as 64 bit integers.
const size_t len_aligned = len & ~0x7;
const char* const end = buf + len_aligned;
uint64_t hash = seed ^ (len * kMul);
for (const char* p = buf; p != end; p += 8) {
hash = MurmurStep(hash, Load64(p));
}
if ((len & 0x7) != 0) {
const uint64_t data = Load64VariableLength(end, len & 0x7);
hash ^= data;
hash *= kMul;
}
hash = ShiftMix(hash) * kMul;
hash = ShiftMix(hash);
return hash;
}
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash

View File

@ -0,0 +1,43 @@
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Forked from a library written by Austin Appelby and Jyrki Alakuijala.
// Original copyright message below.
// Copyright 2009 Google Inc. All Rights Reserved.
// Author: aappleby@google.com (Austin Appelby)
// jyrki@google.com (Jyrki Alakuijala)
//
// MurmurHash is a fast multiplication and shifting based algorithm,
// based on Austin Appleby's MurmurHash 2.0 algorithm.
#ifndef UTIL_HASH_MURMUR_H_
#define UTIL_HASH_MURMUR_H_
#include <stddef.h>
#include <stdlib.h> // for size_t.
#include <cstdint>
#include "mediapipe/framework/port/integral_types.h"
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
// Hash function for a byte array. Has a seed which allows this hash function to
// be used in algorithms that need a family of parameterized hash functions.
// e.g. Minhash.
unsigned long long MurmurHash64WithSeed(const char* buf, size_t len, // NOLINT
uint64_t seed);
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash
#endif // UTIL_HASH_MURMUR_H_

View File

@ -0,0 +1,66 @@
/* Copyright 2023 The MediaPipe Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Forked from a test library written by Jyrki Alakuijala.
// Original copyright message below.
// Copyright 2009 Google Inc. All Rights Reserved.
// Author: jyrki@google.com (Jyrki Alakuijala)
//
// Tests for the fast hashing algorithm based on Austin Appleby's
// MurmurHash 2.0 algorithm. See http://murmurhash.googlepages.com/
#include "mediapipe/tasks/cc/text/language_detector/custom_ops/utils/hash/murmur.h"
#include <string.h>
#include <cstdint>
#include <string>
#include "mediapipe/framework/port/gmock.h"
#include "mediapipe/framework/port/gtest.h"
#include "mediapipe/framework/port/integral_types.h"
namespace mediapipe::tasks::text::language_detector::custom_ops::hash {
TEST(Murmur, EmptyData64) {
EXPECT_EQ(uint64_t{0}, MurmurHash64WithSeed(nullptr, uint64_t{0}, 0));
}
TEST(Murmur, VaryWithDifferentSeeds) {
// While in theory different seeds could return the same
// hash for the same data this is unlikely.
char data1 = 'x';
EXPECT_NE(MurmurHash64WithSeed(&data1, 1, 100),
MurmurHash64WithSeed(&data1, 1, 101));
}
// Hashes don't change.
TEST(Murmur, Idempotence) {
const char data[] = "deadbeef";
const size_t dlen = strlen(data);
for (int i = 0; i < 10; i++) {
EXPECT_EQ(MurmurHash64WithSeed(data, dlen, i),
MurmurHash64WithSeed(data, dlen, i));
}
const char next_data[] = "deadbeef000---";
const size_t next_dlen = strlen(next_data);
for (int i = 0; i < 10; i++) {
EXPECT_EQ(MurmurHash64WithSeed(next_data, next_dlen, i),
MurmurHash64WithSeed(next_data, next_dlen, i));
}
}
} // namespace mediapipe::tasks::text::language_detector::custom_ops::hash