From a43d5028b17a85ed17d5a05924b060c39a2188a1 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Wed, 10 Aug 2022 23:11:09 +0200 Subject: [PATCH] Use wyhash for <til/hash.h> (#13686) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wyhash was chosen based on the results found in `smhasher`, where it proved itself as an algorithm with few flaws and fairly high output quality. While I have a personal preference for xxhash (XXH3 specifically), wyhash is a better fit for this project as its source code is multiple orders of magnitude smaller, simplifying the review and integration into the header-only `hash.h` file. For use with hashmaps, the hash quality doesn't actually matter much for optimal performance; instead, the binary size usually matters more. But even in that scenario, wyhash is fairly close to FNV1a (aka "FNV64"). The result is that this new hash algorithm will have only a small impact on hashmap performance when used in place of the standard FNV1a used in the STL, while simultaneously offering vastly better hash quality. This partially solves #13124. 
## Validation Steps Performed * Added test cases ✅ --- .github/actions/spelling/expect/expect.txt | 15 +- NOTICE.md | 33 +++ oss/wyhash/LICENSE | 25 ++ oss/wyhash/MAINTAINER_README.md | 4 + oss/wyhash/cgmanifest.json | 14 + src/buffer/out/UnicodeStorage.hpp | 5 +- src/inc/til/hash.h | 266 +++++++++++++----- src/til/ut_til/HashTests.cpp | 46 +++ src/til/ut_til/til.unit.tests.vcxproj | 27 ++ src/til/ut_til/til.unit.tests.vcxproj.filters | 84 ++++++ 10 files changed, 441 insertions(+), 78 deletions(-) create mode 100644 oss/wyhash/LICENSE create mode 100644 oss/wyhash/MAINTAINER_README.md create mode 100644 oss/wyhash/cgmanifest.json create mode 100644 src/til/ut_til/HashTests.cpp diff --git a/.github/actions/spelling/expect/expect.txt b/.github/actions/spelling/expect/expect.txt index 065cfd84a6..46e864af93 100644 --- a/.github/actions/spelling/expect/expect.txt +++ b/.github/actions/spelling/expect/expect.txt @@ -1,4 +1,5 @@ ABANDONFONT +ABCDEFGHIJKLMNOPQRSTUVWXY abgr abi ACCESSTOKEN @@ -42,7 +43,6 @@ antialias antialiasing ANull anycpu -AOn APARTMENTTHREADED APCs api @@ -80,7 +80,6 @@ ASingle asm asmv asmx -AStomps ASYNCWINDOWPOS atch ATest @@ -232,7 +231,6 @@ chcp checkbox checkboxes chh -Childitem chk chrono CHT @@ -669,6 +667,7 @@ ECH echokey ecount ECpp +ect Edgium EDITKEYS EDITTEXT @@ -702,6 +701,7 @@ ENUMLOGFONTEX enumranges envvar eol +eplace EPres EQU ERASEBKGND @@ -779,7 +779,6 @@ FIXEDCONVERTED FIXEDFILEINFO Flg flyout -fmix fmodern fmtarg fmtid @@ -996,6 +995,7 @@ HPR HProvider HREDRAW hresult +hrottled HRSRC hscroll hsl @@ -1030,6 +1030,7 @@ ICache icacls iccex IChar +icket ico IComponent ICONERROR @@ -2431,6 +2432,8 @@ uint uintptr ulcch ulong +umul +umulh Unadvise unattend uncomment @@ -2735,6 +2738,9 @@ WUX WVerify WWith wxh +wyhash +wymix +wyr xact xaml Xamlmeta @@ -2795,6 +2801,7 @@ YSize YSubstantial YVIRTUALSCREEN YWalk +Zabcdefghijklmnopqrstuvwxyz ZCmd ZCtrl zsh diff --git a/NOTICE.md b/NOTICE.md index 5c153f6ab6..e4a1a694a2 100644 --- 
a/NOTICE.md +++ b/NOTICE.md @@ -276,6 +276,39 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` +## wyhash + +**Source**: [https://github.com/wangyi-fudan/wyhash](https://github.com/wangyi-fudan/wyhash) + +### License + +``` +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to +``` + ## ConEmu **Source**: [https://github.com/Maximus5/ConEmu](https://github.com/Maximus5/ConEmu) diff --git a/oss/wyhash/LICENSE b/oss/wyhash/LICENSE new file mode 100644 index 0000000000..f223c03afe --- /dev/null +++ b/oss/wyhash/LICENSE @@ -0,0 +1,25 @@ +This is free and unencumbered software released into the public domain. 
+ +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + diff --git a/oss/wyhash/MAINTAINER_README.md b/oss/wyhash/MAINTAINER_README.md new file mode 100644 index 0000000000..a4f827393b --- /dev/null +++ b/oss/wyhash/MAINTAINER_README.md @@ -0,0 +1,4 @@ +### Notes for Future Maintainers + +[wyhash](https://github.com/wangyi-fudan/wyhash) is used as the hash algorithm for `<til/hash.h>` and its `til::hasher`. +The source code was directly integrated into that header file and can be found in `/src/inc/til/hash.h`. 
diff --git a/oss/wyhash/cgmanifest.json b/oss/wyhash/cgmanifest.json new file mode 100644 index 0000000000..872322ff16 --- /dev/null +++ b/oss/wyhash/cgmanifest.json @@ -0,0 +1,14 @@ +{ + "Registrations": [ + { + "component": { + "type": "git", + "git": { + "repositoryUrl": "https://github.com/wangyi-fudan/wyhash", + "commitHash": "e77036ac1943369dc03e611cde52a8570f8ceefe" + } + } + } + ], + "Version": 1 +} \ No newline at end of file diff --git a/src/buffer/out/UnicodeStorage.hpp b/src/buffer/out/UnicodeStorage.hpp index e733013197..609338dca5 100644 --- a/src/buffer/out/UnicodeStorage.hpp +++ b/src/buffer/out/UnicodeStorage.hpp @@ -17,7 +17,6 @@ Author(s): #include #include -#include #include // std::unordered_map needs help to know how to hash a til::point @@ -33,9 +32,9 @@ namespace std // - coord - the coord to hash // Return Value: // - the hashed coord - constexpr size_t operator()(const til::point coord) const noexcept + size_t operator()(const til::point coord) const noexcept { - return til::hash(til::bit_cast(coord)); + return til::hash(coord); } }; } diff --git a/src/inc/til/hash.h b/src/inc/til/hash.h index 7b7c9c4b53..49e9e57795 100644 --- a/src/inc/til/hash.h +++ b/src/inc/til/hash.h @@ -3,7 +3,25 @@ #pragma once -#include "bit.h" +#pragma warning(push) +// std::hash() doesn't test for `nullptr`, nor do we want to. +#pragma warning(disable : 26429) // Symbol '...' is never tested for nullness, it can be marked as not_null (f.23). +// Misdiagnosis: static_cast is used to differentiate between 2 overloads of til::hasher::write. +#pragma warning(disable : 26474) // Don't cast between pointer types when the conversion could be implicit (type.1). +// We don't want to unnecessarily modify wyhash from its original. +#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). +#pragma warning(disable : 26494) // Variable '...' is uninitialized. Always initialize an object (type.5). 
+#pragma warning(disable : 26496) // The variable '...' does not change after construction, mark it as const (con.4). + +#if defined(_M_X64) && !defined(_M_ARM64EC) +#define TIL_HASH_X64 +#elif defined(_M_ARM64) || defined(_M_ARM64EC) +#define TIL_HASH_ARM64 +#elif defined(_M_IX86) +#define TIL_HASH_X86 +#else +#error "Unsupported architecture for til::hash" +#endif namespace til { @@ -12,31 +30,27 @@ namespace til struct hasher { - explicit constexpr hasher(size_t state = FNV_offset_basis) noexcept : + constexpr hasher() = default; + explicit constexpr hasher(size_t state) noexcept : _hash{ state } {} template - constexpr void write(const T& v) noexcept + hasher& write(const T& v) noexcept { hash_trait{}(*this, v); + return *this; } template>> - constexpr void write(const T* data, size_t count) noexcept + hasher& write(const T* data, size_t count) noexcept { -#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1). - write(reinterpret_cast(data), sizeof(T) * count); + return write(static_cast(data), sizeof(T) * count); } -#pragma warning(suppress : 26429) // Symbol 'data' is never tested for nullness, it can be marked as not_null (f.23). - constexpr void write(const uint8_t* data, size_t count) noexcept + hasher& write(const void* data, size_t len) noexcept { - for (size_t i = 0; i < count; ++i) - { -#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). 
- _hash ^= static_cast(data[i]); - _hash *= FNV_prime; - } + _hash = _wyhash(data, len, _hash); + return *this; } constexpr size_t finalize() const noexcept @@ -45,15 +59,151 @@ namespace til } private: -#if defined(_WIN64) - static constexpr size_t FNV_offset_basis = 14695981039346656037ULL; - static constexpr size_t FNV_prime = 1099511628211ULL; -#else - static constexpr size_t FNV_offset_basis = 2166136261U; - static constexpr size_t FNV_prime = 16777619U; -#endif +#if defined(TIL_HASH_X86) - size_t _hash = FNV_offset_basis; + static uint32_t _wyr24(const uint8_t* p, uint32_t k) noexcept + { + return static_cast(p[0]) << 16 | static_cast(p[k >> 1]) << 8 | p[k - 1]; + } + + static uint32_t _wyr32(const uint8_t* p) noexcept + { + uint32_t v; + memcpy(&v, p, 4); + return v; + } + + static void _wymix32(uint32_t* a, uint32_t* b) noexcept + { + uint64_t c = *a ^ UINT32_C(0x53c5ca59); + c *= *b ^ UINT32_C(0x74743c1b); + *a = static_cast(c); + *b = static_cast(c >> 32); + } + + static uint32_t _wyhash(const void* data, uint32_t len, uint32_t seed) noexcept + { + auto p = static_cast(data); + auto i = len; + auto see1 = len; + _wymix32(&seed, &see1); + + for (; i > 8; i -= 8, p += 8) + { + seed ^= _wyr32(p); + see1 ^= _wyr32(p + 4); + _wymix32(&seed, &see1); + } + if (i >= 4) + { + seed ^= _wyr32(p); + see1 ^= _wyr32(p + i - 4); + } + else if (i) + { + seed ^= _wyr24(p, i); + } + + _wymix32(&seed, &see1); + _wymix32(&seed, &see1); + return seed ^ see1; + } + +#else // defined(TIL_HASH_X86) + + static uint64_t _wyr3(const uint8_t* p, size_t k) noexcept + { + return (static_cast(p[0]) << 16) | (static_cast(p[k >> 1]) << 8) | p[k - 1]; + } + + static uint64_t _wyr4(const uint8_t* p) noexcept + { + uint32_t v; + memcpy(&v, p, 4); + return v; + } + + static uint64_t _wyr8(const uint8_t* p) noexcept + { + uint64_t v; + memcpy(&v, p, 8); + return v; + } + + static uint64_t _wymix(uint64_t lhs, uint64_t rhs) noexcept + { +#if defined(TIL_HASH_X64) + uint64_t hi; + uint64_t lo = 
_umul128(lhs, rhs, &hi); +#elif defined(TIL_HASH_ARM64) + const uint64_t lo = lhs * rhs; + const uint64_t hi = __umulh(lhs, rhs); +#endif + return lo ^ hi; + } + + static uint64_t _wyhash(const void* data, uint64_t len, uint64_t seed) noexcept + { + static constexpr auto s0 = UINT64_C(0xa0761d6478bd642f); + static constexpr auto s1 = UINT64_C(0xe7037ed1a0b428db); + static constexpr auto s2 = UINT64_C(0x8ebc6af09c88c6e3); + static constexpr auto s3 = UINT64_C(0x589965cc75374cc3); + + auto p = static_cast(data); + seed ^= s0; + uint64_t a; + uint64_t b; + + if (len <= 16) + { + if (len >= 4) + { + a = (_wyr4(p) << 32) | _wyr4(p + ((len >> 3) << 2)); + b = (_wyr4(p + len - 4) << 32) | _wyr4(p + len - 4 - ((len >> 3) << 2)); + } + else if (len > 0) + { + a = _wyr3(p, len); + b = 0; + } + else + { + a = b = 0; + } + } + else + { + auto i = len; + if (i > 48) + { + auto seed1 = seed; + auto seed2 = seed; + do + { + seed = _wymix(_wyr8(p) ^ s1, _wyr8(p + 8) ^ seed); + seed1 = _wymix(_wyr8(p + 16) ^ s2, _wyr8(p + 24) ^ seed1); + seed2 = _wymix(_wyr8(p + 32) ^ s3, _wyr8(p + 40) ^ seed2); + p += 48; + i -= 48; + } while (i > 48); + seed ^= seed1 ^ seed2; + } + while (i > 16) + { + seed = _wymix(_wyr8(p) ^ s1, _wyr8(p + 8) ^ seed); + i -= 16; + p += 16; + } + a = _wyr8(p + i - 16); + b = _wyr8(p + i - 8); + } + + return _wymix(s1 ^ len, _wymix(a ^ s1, b ^ seed)); + } + +#endif // defined(TIL_HASH_X86) + + size_t _hash = 0; }; namespace details @@ -61,10 +211,9 @@ namespace til template struct conditionally_enabled_hash_trait { - constexpr void operator()(hasher& h, const T& v) const noexcept + void operator()(hasher& h, const T& v) const noexcept { -#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1). 
- h.write(reinterpret_cast(&v), sizeof(T)); + h.write(static_cast(&v), sizeof(T)); } }; @@ -87,80 +236,55 @@ namespace til template<> struct hash_trait { - constexpr void operator()(hasher& h, float v) const noexcept + void operator()(hasher& h, float v) const noexcept { v = v == 0.0f ? 0.0f : v; // map -0 to 0 -#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1). - h.write(reinterpret_cast(&v), sizeof(v)); + h.write(static_cast(&v), sizeof(v)); } }; template<> struct hash_trait { - constexpr void operator()(hasher& h, double v) const noexcept + void operator()(hasher& h, double v) const noexcept { v = v == 0.0 ? 0.0 : v; // map -0 to 0 -#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1). - h.write(reinterpret_cast(&v), sizeof(v)); + h.write(static_cast(&v), sizeof(v)); } }; template struct hash_trait> { - constexpr void operator()(hasher& h, const std::basic_string& v) const noexcept + void operator()(hasher& h, const std::basic_string& v) const noexcept { -#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1). - h.write(reinterpret_cast(v.data()), sizeof(T) * v.size()); + h.write(v.data(), v.size()); } }; template struct hash_trait> { - constexpr void operator()(hasher& h, const std::basic_string_view& v) const noexcept + void operator()(hasher& h, const std::basic_string_view& v) const noexcept { -#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1). - h.write(reinterpret_cast(v.data()), sizeof(T) * v.size()); + h.write(v.data(), v.size()); } }; template - constexpr size_t hash(const T& v) noexcept + size_t hash(const T& v) noexcept { - if constexpr (sizeof(T) <= sizeof(size_t) && (std::is_integral_v || std::is_enum_v)) - { - // This runs murmurhash3's finalizer (fmix32/fmix64) on a single integer. - // It's fast, public domain and produces good results. - // - // Using til::as_unsigned here allows the compiler to drop the first - // `>> 33` mix for all Ts which are >= 32 bits. 
- // The existence of sign extension shouldn't change hash quality. - size_t h = til::as_unsigned(v); - if constexpr (sizeof(size_t) == 4) - { - h ^= h >> 16; - h *= UINT32_C(0x85ebca6b); - h ^= h >> 13; - h *= UINT32_C(0xc2b2ae35); - h ^= h >> 16; - } - else - { - h ^= h >> 33; - h *= UINT64_C(0xff51afd7ed558ccd); - h ^= h >> 33; - h *= UINT64_C(0xc4ceb9fe1a85ec53); - h ^= h >> 33; - } - return h; - } - else - { - hasher h; - h.write(v); - return h.finalize(); - } + hasher h; + h.write(v); + return h.finalize(); + } + + inline size_t hash(const void* data, size_t len) noexcept + { + hasher h; + h.write(data, len); + return h.finalize(); } } + +#pragma warning(pop) diff --git a/src/til/ut_til/HashTests.cpp b/src/til/ut_til/HashTests.cpp new file mode 100644 index 0000000000..4b8ba38074 --- /dev/null +++ b/src/til/ut_til/HashTests.cpp @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "precomp.h" + +#include + +using namespace WEX::Common; +using namespace WEX::Logging; +using namespace WEX::TestExecution; + +class HashTests +{ + TEST_CLASS(HashTests); + + TEST_METHOD(TestVectors) + { + struct Test + { + std::string_view input; + size_t seed; + uint64_t expected64; + uint32_t expected32; + }; + + static constexpr std::array tests{ + Test{ "", 0, 0x42bc986dc5eec4d3, 0xa45f982f }, + Test{ "a", 1, 0x84508dc903c31551, 0x09021114 }, + Test{ "abc", 2, 0x0bc54887cfc9ecb1, 0xfe40215d }, + Test{ "message digest", 3, 0x6e2ff3298208a67c, 0x6e0fb730 }, + Test{ "abcdefghijklmnopqrstuvwxyz", 4, 0x9a64e42e897195b9, 0x9435b8c2 }, + Test{ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 5, 0x9199383239c32554, 0xccf9734c }, + Test{ "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 6, 0x7c1ccf6bba30f5a5, 0x9fa5ef6e }, + }; + + for (const auto& t : tests) + { + const auto actual = til::hasher{ t.seed }.write(t.input).finalize(); +#if defined(TIL_HASH_X86) + 
VERIFY_ARE_EQUAL(t.expected32, actual); +#else + VERIFY_ARE_EQUAL(t.expected64, actual); +#endif + } + } +}; diff --git a/src/til/ut_til/til.unit.tests.vcxproj b/src/til/ut_til/til.unit.tests.vcxproj index 444b0e4cae..0ff3ba29c7 100644 --- a/src/til/ut_til/til.unit.tests.vcxproj +++ b/src/til/ut_til/til.unit.tests.vcxproj @@ -19,6 +19,7 @@ + @@ -35,6 +36,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/til/ut_til/til.unit.tests.vcxproj.filters b/src/til/ut_til/til.unit.tests.vcxproj.filters index 3db31b332e..5fb66a2af6 100644 --- a/src/til/ut_til/til.unit.tests.vcxproj.filters +++ b/src/til/ut_til/til.unit.tests.vcxproj.filters @@ -24,8 +24,92 @@ + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + inc + + + + + {7cf29ba4-d33d-4c3b-82e3-ab73e5a79685} + \ No newline at end of file