mirror of
https://github.com/git-for-windows/git.git
synced 2026-03-02 06:23:48 -06:00
test-tool: add helper for name-hash values
Add a new test-tool helper, name-hash, to output the value of the name-hash algorithms for the input list of strings, one per line. Since the name-hash values can be stored in the .bitmap files, it is important that these hash functions do not change across Git versions. Add a simple test to t5310-pack-bitmaps.sh to provide some testing of the current values. Due to how these functions are implemented, it would be difficult to change them without disturbing these values. Create a performance test that uses test_size to demonstrate how collisions occur for these hash algorithms. This test helps inform someone as to the behavior of the name-hash algorithms for their repo based on the paths at HEAD. My copy of the Git repository shows modest statistics around the collisions of the default name-hash algorithm: Test this tree ----------------------------------------------------------------- 5314.1: paths at head 4.5K 5314.2: number of distinct name-hashes 4.1K 5314.3: number of distinct full-name-hashes 4.5K 5314.4: maximum multiplicity of name-hashes 13 5314.5: maximum multiplicity of fullname-hashes 1 Here, the maximum collision multiplicity is 13, but around 10% of paths have a collision with another path. In a more interesting example, the microsoft/fluentui [1] repo had these statistics at time of committing: Test this tree ----------------------------------------------------------------- 5314.1: paths at head 19.6K 5314.2: number of distinct name-hashes 8.2K 5314.3: number of distinct full-name-hashes 19.6K 5314.4: maximum multiplicity of name-hashes 279 5314.5: maximum multiplicity of fullname-hashes 1 [1] https://github.com/microsoft/fluentui That demonstrates that of the nearly twenty thousand path names, they are assigned around eight thousand distinct values. 279 paths are assigned to a single value, leading the packing algorithm to sort objects from those paths together, by size. In this repository, no collisions occur for the full-name-hash algorithm. In a more extreme example, an internal monorepo had a much worse collision rate: Test this tree ----------------------------------------------------------------- 5314.1: paths at head 221.6K 5314.2: number of distinct name-hashes 72.0K 5314.3: number of distinct full-name-hashes 221.6K 5314.4: maximum multiplicity of name-hashes 14.4K 5314.5: maximum multiplicity of fullname-hashes 2 Even in this repository with many more paths at HEAD, the collision rate was low and the maximum number of paths being grouped into a single bucket by the full-path-name algorithm was two. Signed-off-by: Derrick Stolee <stolee@gmail.com>
This commit is contained in:
parent
871f936841
commit
97829b5bdd
1
Makefile
1
Makefile
@ -816,6 +816,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
|
||||
TEST_BUILTINS_OBJS += test-match-trees.o
|
||||
TEST_BUILTINS_OBJS += test-mergesort.o
|
||||
TEST_BUILTINS_OBJS += test-mktemp.o
|
||||
TEST_BUILTINS_OBJS += test-name-hash.o
|
||||
TEST_BUILTINS_OBJS += test-online-cpus.o
|
||||
TEST_BUILTINS_OBJS += test-pack-mtimes.o
|
||||
TEST_BUILTINS_OBJS += test-parse-options.o
|
||||
|
||||
24
t/helper/test-name-hash.c
Normal file
24
t/helper/test-name-hash.c
Normal file
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* test-name-hash.c: Read a list of paths over stdin and report on their
|
||||
* name-hash and full name-hash.
|
||||
*/
|
||||
|
||||
#include "test-tool.h"
|
||||
#include "git-compat-util.h"
|
||||
#include "pack-objects.h"
|
||||
#include "strbuf.h"
|
||||
|
||||
int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
|
||||
{
|
||||
struct strbuf line = STRBUF_INIT;
|
||||
|
||||
while (!strbuf_getline(&line, stdin)) {
|
||||
uint32_t name_hash = pack_name_hash(line.buf);
|
||||
uint32_t full_hash = pack_full_name_hash(line.buf);
|
||||
|
||||
printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
|
||||
}
|
||||
|
||||
strbuf_release(&line);
|
||||
return 0;
|
||||
}
|
||||
@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
|
||||
{ "match-trees", cmd__match_trees },
|
||||
{ "mergesort", cmd__mergesort },
|
||||
{ "mktemp", cmd__mktemp },
|
||||
{ "name-hash", cmd__name_hash },
|
||||
{ "online-cpus", cmd__online_cpus },
|
||||
{ "pack-mtimes", cmd__pack_mtimes },
|
||||
{ "parse-options", cmd__parse_options },
|
||||
|
||||
@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
|
||||
int cmd__match_trees(int argc, const char **argv);
|
||||
int cmd__mergesort(int argc, const char **argv);
|
||||
int cmd__mktemp(int argc, const char **argv);
|
||||
int cmd__name_hash(int argc, const char **argv);
|
||||
int cmd__online_cpus(int argc, const char **argv);
|
||||
int cmd__pack_mtimes(int argc, const char **argv);
|
||||
int cmd__parse_options(int argc, const char **argv);
|
||||
|
||||
41
t/perf/p5314-name-hash.sh
Executable file
41
t/perf/p5314-name-hash.sh
Executable file
@ -0,0 +1,41 @@
|
||||
#!/bin/sh
|
||||
|
||||
test_description='Tests pack performance using bitmaps'
|
||||
. ./perf-lib.sh
|
||||
|
||||
GIT_TEST_PASSING_SANITIZE_LEAK=0
|
||||
export GIT_TEST_PASSING_SANITIZE_LEAK
|
||||
|
||||
test_perf_large_repo
|
||||
|
||||
test_size 'paths at head' '
|
||||
git ls-tree -r --name-only HEAD >path-list &&
|
||||
wc -l <path-list
|
||||
'
|
||||
|
||||
test_size 'number of distinct name-hashes' '
|
||||
cat path-list | test-tool name-hash >name-hashes &&
|
||||
cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count &&
|
||||
wc -l <name-hash-count
|
||||
'
|
||||
|
||||
test_size 'number of distinct full-name-hashes' '
|
||||
cat name-hashes | awk "{ print \$2; }" | sort -n | uniq -c >full-name-hash-count &&
|
||||
wc -l <full-name-hash-count
|
||||
'
|
||||
|
||||
test_size 'maximum multiplicity of name-hashes' '
|
||||
cat name-hash-count | \
|
||||
sort -nr | \
|
||||
head -n 1 | \
|
||||
awk "{ print \$1; }"
|
||||
'
|
||||
|
||||
test_size 'maximum multiplicity of fullname-hashes' '
|
||||
cat full-name-hash-count | \
|
||||
sort -nr | \
|
||||
head -n 1 | \
|
||||
awk "{ print \$1; }"
|
||||
'
|
||||
|
||||
test_done
|
||||
@ -27,6 +27,32 @@ has_any () {
|
||||
grep -Ff "$1" "$2"
|
||||
}
|
||||
|
||||
# Since name-hash values are stored in the .bitmap files, add a test
|
||||
# that checks that the name-hash calculations are stable across versions.
|
||||
# Not exhaustive, but these hashing algorithms would be hard to change
|
||||
# without causing deviations here.
|
||||
test_expect_success 'name-hash value stability' '
|
||||
cat >names <<-\EOF &&
|
||||
first
|
||||
second
|
||||
third
|
||||
one-long-enough-for-collisions
|
||||
two-long-enough-for-collisions
|
||||
EOF
|
||||
|
||||
test-tool name-hash <names >out &&
|
||||
|
||||
cat >expect <<-\EOF &&
|
||||
2582249472 3109209818 first
|
||||
2289942528 3781118409 second
|
||||
2300837888 3028707182 third
|
||||
2544516325 3241327563 one-long-enough-for-collisions
|
||||
2544516325 4207880830 two-long-enough-for-collisions
|
||||
EOF
|
||||
|
||||
test_cmp expect out
|
||||
'
|
||||
|
||||
test_bitmap_cases () {
|
||||
writeLookupTable=false
|
||||
for i in "$@"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user