From da80feb5be9076bb56af0681d684c36028f92ae6 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Jun 2026 06:37:22 +0000 Subject: [PATCH 1/5] merge-ort: propagate callback errors from traverse_trees_wrapper() traverse_trees_wrapper() saves entries from a first pass through traverse_trees() and then replays them through the real callback (collect_merge_info_callback). However, the replay loop silently discards the callback return value. This is not a deferred error; it is an ignored error. Today the only originator of a negative return in this entire call graph is traverse_trees()'s "exceeded maximum allowed tree depth" check; everything else (collect_merge_info_callback, traverse_trees_wrapper, the inner traverse_trees recursion) only relays that. So in current Git, the visible effect of dropping the replay callback's return value is narrow but bad: a tree nested past core.maxTreeDepth has its -1 swallowed, the subtree below the limit is silently pruned, and the merge completes as if that were the correct result. A later patch in this series will teach collect_merge_info_callback() to return -1 on an additional path -- detecting duplicate entries in malformed trees -- which is similarly handled today by just ignoring the problem (resulting in mostly a "last one wins" rule, though the non-last entry can mutate various state flags). Capture the return value, stop the loop on negative returns, and propagate the error to the caller. The callback returns a positive mask value on success, so normalize non-negative returns to 0 for the caller. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- merge-ort.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/merge-ort.c b/merge-ort.c index 00923ce3cd..4b8e32209d 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1008,18 +1008,20 @@ static int traverse_trees_wrapper(struct index_state *istate, info->traverse_path = renames->callback_data_traverse_path; info->fn = old_fn; for (i = old_offset; i < renames->callback_data_nr; ++i) { - info->fn(n, - renames->callback_data[i].mask, - renames->callback_data[i].dirmask, - renames->callback_data[i].names, - info); + ret = info->fn(n, + renames->callback_data[i].mask, + renames->callback_data[i].dirmask, + renames->callback_data[i].names, + info); + if (ret < 0) + break; } renames->callback_data_nr = old_offset; free(renames->callback_data_traverse_path); renames->callback_data_traverse_path = old_callback_data_traverse_path; info->traverse_path = NULL; - return 0; + return ret < 0 ? ret : 0; } static void setup_path_info(struct merge_options *opt, From 159e4d903458ac3ec0aa944aefeadbaf9e83b73c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Jun 2026 06:37:23 +0000 Subject: [PATCH 2/5] merge-ort: drop unnecessary show_all_errors from collect_merge_info() collect_merge_info() has set info.show_all_errors = 1 since d2bc1994f363 (merge-ort: implement a very basic collect_merge_info(), 2020-12-13). This setting was copied from unpack-trees.c where it controls batching of error messages for porcelain display, but merge-ort has no such error-batching logic and never needed it. With show_all_errors set, traverse_trees() captures a negative callback return but continues processing remaining entries rather than stopping immediately. Removing the setting restores the default behavior where a negative return from collect_merge_info_callback() breaks out of the traversal loop right away, allowing a future commit to exit early when a corrupt tree is detected. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- merge-ort.c | 1 - 1 file changed, 1 deletion(-) diff --git a/merge-ort.c b/merge-ort.c index 4b8e32209d..74e9636020 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1740,7 +1740,6 @@ static int collect_merge_info(struct merge_options *opt, setup_traverse_info(&info, opt->priv->toplevel_dir); info.fn = collect_merge_info_callback; info.data = opt; - info.show_all_errors = 1; if (repo_parse_tree(opt->repo, merge_base) < 0 || repo_parse_tree(opt->repo, side1) < 0 || From 83ae606c9838b06bde36479756de2ab75b6fbb96 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Jun 2026 06:37:24 +0000 Subject: [PATCH 3/5] merge-ort: free diff pairs queue in clear_or_reinit_internal_opts() clear_or_reinit_internal_opts() is responsible for cleaning up the various data structures in merge_options_internal. It already handles many renames-related structures (dirs_removed, dir_renames, relevant_sources, cached_pairs, deferred, etc.) but does not free renames->pairs[].queue. In the normal code path, resolve_and_process_renames() frees pairs[s].queue and reinitializes it with diff_queue_init() before clear_or_reinit_internal_opts() runs, so the omission is harmless. However, if collect_merge_info() encounters an error and returns early (before resolve_and_process_renames() is ever called), any diff pairs already queued by collect_rename_info()/add_pair() will have their backing array leaked. Fix this by freeing renames->pairs[].queue in the cleanup function. In the normal path the pointer is already NULL (from the earlier diff_queue_init() in resolve_and_process_renames()), so free(NULL) is a safe no-op. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- merge-ort.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/merge-ort.c b/merge-ort.c index 74e9636020..8f911cb639 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -728,6 +728,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti, strintmap_clear_func(&renames->deferred[i].possible_trivial_merges); strset_clear_func(&renames->deferred[i].target_dirs); renames->deferred[i].trivial_merges_okay = 1; /* 1 == maybe */ + free(renames->pairs[i].queue); + diff_queue_init(&renames->pairs[i]); } renames->cached_pairs_valid_side = 0; renames->dir_rename_mask = 0; From 43a5fa7f5a9b7c44dd958a21368d690fa55d4f50 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Jun 2026 06:37:25 +0000 Subject: [PATCH 4/5] merge-ort: abort merge when trees have duplicate entries Trees with duplicate entries are malformed; fsck reports "contains duplicate file entries" for them. merge-ort has from the beginning assumed that we would never hit such trees. It was written with the assumption that traverse_trees() calls collect_merge_info_callback() at most once per path. The "sanity checks" in that callback (added in d2bc1994f363 (merge-ort: implement a very basic collect_merge_info(), 2020-12-13)) verify properties of each individual call but not that invariant. The strmap_put() in setup_path_info() silently overwrites the entry from any prior call for the same path, because it assumed there would be no other path. Unfortunately, supplemental data structures for various optimizations could still be tweaked before the extra paths were overwritten, and those data structures not matching expected state could trip various assertions. Change the return type of setup_path_info() from void to int to allow us to detect this case, and abort the merge with a clear error message when it occurs. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- merge-ort.c | 61 ++++++++++++++++------------ t/t6422-merge-rename-corner-cases.sh | 54 ++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 27 deletions(-) diff --git a/merge-ort.c b/merge-ort.c index 8f911cb639..be0829bbb7 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -1026,18 +1026,18 @@ static int traverse_trees_wrapper(struct index_state *istate, return ret < 0 ? ret : 0; } -static void setup_path_info(struct merge_options *opt, - struct string_list_item *result, - const char *current_dir_name, - int current_dir_name_len, - char *fullpath, /* we'll take over ownership */ - struct name_entry *names, - struct name_entry *merged_version, - unsigned is_null, /* boolean */ - unsigned df_conflict, /* boolean */ - unsigned filemask, - unsigned dirmask, - int resolved /* boolean */) +static int setup_path_info(struct merge_options *opt, + struct string_list_item *result, + const char *current_dir_name, + int current_dir_name_len, + char *fullpath, /* we'll take over ownership */ + struct name_entry *names, + struct name_entry *merged_version, + unsigned is_null, /* boolean */ + unsigned df_conflict, /* boolean */ + unsigned filemask, + unsigned dirmask, + int resolved /* boolean */) { /* result->util is void*, so mi is a convenience typed variable */ struct merged_info *mi; @@ -1081,9 +1081,11 @@ static void setup_path_info(struct merge_options *opt, */ mi->is_null = 1; } - strmap_put(&opt->priv->paths, fullpath, mi); + if (strmap_put(&opt->priv->paths, fullpath, mi)) + return error(_("tree has duplicate entries for '%s'"), fullpath); result->string = fullpath; result->util = mi; + return 0; } static void add_pair(struct merge_options *opt, @@ -1350,9 +1352,10 @@ static int collect_merge_info_callback(int n, */ if (side1_matches_mbase && side2_matches_mbase) { /* mbase, side1, & side2 all match; use mbase as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+0, mbase_null, 0 /* df_conflict */, - filemask, dirmask, 1 /* resolved */); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+0, mbase_null, 0 /* df_conflict */, + filemask, dirmask, 1 /* resolved */)) + return -1; /* Quit traversing */ return mask; } @@ -1364,9 +1367,10 @@ static int collect_merge_info_callback(int n, */ if (sides_match && filemask == 0x07) { /* use side1 (== side2) version as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+1, side1_null, 0, - filemask, dirmask, 1); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+1, side1_null, 0, + filemask, dirmask, 1)) + return -1; /* Quit traversing */ return mask; } @@ -1378,18 +1382,20 @@ static int collect_merge_info_callback(int n, */ if (side1_matches_mbase && filemask == 0x07) { /* use side2 version as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+2, side2_null, 0, - filemask, dirmask, 1); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+2, side2_null, 0, + filemask, dirmask, 1)) + return -1; /* Quit traversing */ return mask; } /* Similar to above but swapping sides 1 and 2 */ if (side2_matches_mbase && filemask == 0x07) { /* use side1 version as resolution */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, names+1, side1_null, 0, - filemask, dirmask, 1); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, names+1, side1_null, 0, + filemask, dirmask, 1)) + return -1; /* Quit traversing */ return mask; } @@ -1413,8 +1419,9 @@ static int collect_merge_info_callback(int n, * unconflict some more cases, but that comes later so all we can * do now is record the different non-null file hashes.) */ - setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, - names, NULL, 0, df_conflict, filemask, dirmask, 0); + if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath, + names, NULL, 0, df_conflict, filemask, dirmask, 0)) + return -1; /* Quit traversing */ ci = pi.util; VERIFY_CI(ci); diff --git a/t/t6422-merge-rename-corner-cases.sh b/t/t6422-merge-rename-corner-cases.sh index e18d5a227d..81b645bb3b 100755 --- a/t/t6422-merge-rename-corner-cases.sh +++ b/t/t6422-merge-rename-corner-cases.sh @@ -1525,4 +1525,58 @@ test_expect_success 'submodule/directory preliminary conflict' ' ) ' +# Testcase: submodule/directory conflict with duplicate tree entries +# One side has a path as a gitlink (submodule). The other side replaces +# the gitlink with a directory. A third-party tool creates a tree on the +# submodule side that has *both* a gitlink and a tree entry for the same +# path (adding a file inside the submodule path ignoring that there's a +# gitlink there). collect_merge_info_callback() should detect the +# duplicate and abort rather than silently corrupting its bookkeeping. + +test_expect_success 'duplicate tree entries trigger an error' ' + test_when_finished "rm -rf duplicate-entry" && + git init duplicate-entry && + ( + cd duplicate-entry && + + # Base commit: "docs" is a gitlink (submodule) + empty_tree=$(git mktree file.txt && + git add file.txt && + git commit -m base && + + # side1: remove the gitlink, replace with a directory + git checkout -b side1 && + git rm --cached docs && + mkdir -p docs && + echo hello >docs/requirements.txt && + git add docs/requirements.txt && + git commit -m "side1: submodule to directory" && + + # side2: keep the gitlink but craft a tree that also + # contains a tree entry for "docs" (simulating a tool + # that adds files inside a submodule path without + # removing the gitlink first). + git checkout main && + git checkout -b side2 && + blob_oid=$(echo world | git hash-object -w --stdin) && + docs_tree=$(printf "100644 blob %s\trequirements.txt\n" \ + "$blob_oid" | git mktree) && + cur_tree=$(git rev-parse HEAD^{tree}) && + git cat-file -p $cur_tree >tree-listing && + printf "040000 tree %s\tdocs\n" "$docs_tree" >>tree-listing && + new_tree=$(git mktree err && + test_grep "duplicate entries" err + ) +' + test_done From 0c8eb46fbbfc48065bc93dcc4c9901031615516c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Jun 2026 06:37:26 +0000 Subject: [PATCH 5/5] cache-tree: fix verify_cache() to catch non-adjacent D/F conflicts verify_cache() checks that the index does not contain both "path" and "path/file" before writing a tree. It does this by comparing only adjacent entries, relying on the assumption that "path/file" would immediately follow "path" in sorted order. Unfortunately, this assumption does not always hold. For example: docs <-- submodule entry docs-internal/README.md <-- intervening entry docs/requirements.txt <-- D/F conflict, NOT adjacent to "docs" When this happens, verify_cache() silently misses the D/F conflict and write-tree produces a corrupt tree object containing duplicate entries (one for the submodule "docs" and one for the tree "docs"). I could not find any caller in current git that both allows the index to get into this state and then tries to write it out without doing other checks beyond the verify_cache() call in cache_tree_update(), but verify_cache() is documented as a safety net for preventing corrupt trees and should actually provide that guarantee. A downstream consumer that relied solely on cache_tree_update()'s internal checking via verify_cache() to prevent duplicate tree entries was bitten by the gap. Add a test that constructs a corrupt index directly (bypassing the D/F checks in add_index_entry) and verifies that write-tree now rejects it. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- cache-tree.c | 64 ++++++++++++++++++++++++++++------ t/meson.build | 1 + t/t0093-direct-index-write.pl | 38 ++++++++++++++++++++ t/t0093-verify-cache-df-gap.sh | 59 +++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 11 deletions(-) create mode 100644 t/t0093-direct-index-write.pl create mode 100755 t/t0093-verify-cache-df-gap.sh diff --git a/cache-tree.c b/cache-tree.c index 7881b42aa2..4d2669b312 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -161,6 +161,54 @@ void cache_tree_invalidate_path(struct index_state *istate, const char *path) istate->cache_changed |= CACHE_TREE_CHANGED; } +/* + * Check whether this_ce and the next entry in the index form a D/F + * conflict ("path" vs "path/file"). Returns the conflicting "path/..." + * name when one is found, or NULL otherwise. + * + * The cache is sorted, so "path/file" sorts after "path" and the + * conflict is usually visible as adjacent entries. But other entries + * can sort between them -- e.g. "path-internal" sits between "path" + * and "path/file" because '-' (0x2D) precedes '/' (0x2F) -- so when + * the immediately following entry shares our prefix but starts with a + * character that sorts before '/', binary search for "path/" instead. + */ +static const char *find_df_conflict(struct index_state *istate, + const struct cache_entry *this_ce, + const struct cache_entry *next_ce) +{ + const char *this_name = this_ce->name; + const char *next_name = next_ce->name; + int this_len = ce_namelen(this_ce); + const struct cache_entry *other; + struct strbuf probe = STRBUF_INIT; + int pos; + + if (this_len >= ce_namelen(next_ce) || + next_name[this_len] > '/' || + strncmp(this_name, next_name, this_len)) + return NULL; + + if (next_name[this_len] == '/') + return next_name; + + strbuf_add(&probe, this_name, this_len); + strbuf_addch(&probe, '/'); + pos = index_name_pos_sparse(istate, probe.buf, probe.len); + strbuf_release(&probe); + + if (pos < 0) + pos = -pos - 1; + if (pos >= (int)istate->cache_nr) + return NULL; + other = istate->cache[pos]; + if (ce_namelen(other) > this_len && + other->name[this_len] == '/' && + !strncmp(this_name, other->name, this_len)) + return other->name; + return NULL; +} + static int verify_cache(struct index_state *istate, int flags) { unsigned i, funny; @@ -190,24 +238,18 @@ static int verify_cache(struct index_state *istate, int flags) */ funny = 0; for (i = 0; i + 1 < istate->cache_nr; i++) { - /* path/file always comes after path because of the way - * the cache is sorted. Also path can appear only once, - * which means conflicting one would immediately follow. - */ const struct cache_entry *this_ce = istate->cache[i]; const struct cache_entry *next_ce = istate->cache[i + 1]; - const char *this_name = this_ce->name; - const char *next_name = next_ce->name; - int this_len = ce_namelen(this_ce); - if (this_len < ce_namelen(next_ce) && - next_name[this_len] == '/' && - strncmp(this_name, next_name, this_len) == 0) { + const char *conflict_name; + + conflict_name = find_df_conflict(istate, this_ce, next_ce); + if (conflict_name) { if (10 < ++funny) { fprintf(stderr, "...\n"); break; } fprintf(stderr, "You have both %s and %s\n", - this_name, next_name); + this_ce->name, conflict_name); } } if (funny) diff --git a/t/meson.build b/t/meson.build index 7528e5cda5..362177999b 100644 --- a/t/meson.build +++ b/t/meson.build @@ -124,6 +124,7 @@ integration_tests = [ 't0090-cache-tree.sh', 't0091-bugreport.sh', 't0092-diagnose.sh', + 't0093-verify-cache-df-gap.sh', 't0095-bloom.sh', 't0100-previous.sh', 't0101-at-syntax.sh', diff --git a/t/t0093-direct-index-write.pl b/t/t0093-direct-index-write.pl new file mode 100644 index 0000000000..2881a3ebb2 --- /dev/null +++ b/t/t0093-direct-index-write.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl +# +# Build a v2 index file from entries listed on stdin. +# Each line: "octalmode hex-oid name" +# Output: binary index written to stdout. +# +# This bypasses all D/F safety checks in add_index_entry(), simulating +# what happens when code uses ADD_CACHE_JUST_APPEND to bulk-load entries. +use strict; +use warnings; +use Digest::SHA qw(sha1 sha256); + +my $hash_algo = $ENV{'GIT_DEFAULT_HASH'} || 'sha1'; +my $hash_func = $hash_algo eq 'sha256' ? \&sha256 : \&sha1; + +my @entries; +while (my $line = ) { + chomp $line; + my ($mode, $oid_hex, $name) = split(/ /, $line, 3); + push @entries, [$mode, $oid_hex, $name]; +} + +my $body = "DIRC" . pack("NN", 2, scalar @entries); + +for my $ent (@entries) { + my ($mode, $oid_hex, $name) = @{$ent}; + # 10 x 32-bit stat fields (zeroed), with mode in position 7 + my $stat = pack("N10", 0, 0, 0, 0, 0, 0, oct($mode), 0, 0, 0); + my $oid = pack("H*", $oid_hex); + my $flags = pack("n", length($name) & 0xFFF); + my $entry = $stat . $oid . $flags . $name . "\0"; + # Pad to 8-byte boundary + while (length($entry) % 8) { $entry .= "\0"; } + $body .= $entry; +} + +binmode STDOUT; +print $body . $hash_func->($body); diff --git a/t/t0093-verify-cache-df-gap.sh b/t/t0093-verify-cache-df-gap.sh new file mode 100755 index 0000000000..0b6829d805 --- /dev/null +++ b/t/t0093-verify-cache-df-gap.sh @@ -0,0 +1,59 @@ +#!/bin/sh + +test_description='verify_cache() must catch non-adjacent D/F conflicts + +Ensure that verify_cache() can complain about bad entries like: + + docs <-- submodule + docs-internal/... <-- sorts here because "-" < "/" + docs/... <-- D/F conflict with "docs" above, not adjacent + +In order to test verify_cache, we directly construct a corrupt index +(bypassing the D/F safety checks in add_index_entry) and verify that +write-tree rejects it. +' + +. ./test-lib.sh + +if ! test_have_prereq PERL +then + skip_all='skipping verify_cache D/F tests; Perl not available' + test_done +fi + +# Build a v2 index from entries on stdin, bypassing D/F checks. +# Each line: "octalmode hex-oid name" (entries must be pre-sorted). +build_corrupt_index () { + perl "$TEST_DIRECTORY/t0093-direct-index-write.pl" >"$1" +} + +test_expect_success 'setup objects' ' + test_commit base && + BLOB=$(git rev-parse HEAD:base.t) && + SUB_COMMIT=$(git rev-parse HEAD) +' + +test_expect_success 'adjacent D/F conflict is caught by verify_cache' ' + cat >index-entries <<-EOF && + 0160000 $SUB_COMMIT docs + 0100644 $BLOB docs/requirements.txt + EOF + build_corrupt_index .git/index err && + test_grep "You have both docs and docs/requirements.txt" err +' + +test_expect_success 'non-adjacent D/F conflict is caught by verify_cache' ' + cat >index-entries <<-EOF && + 0160000 $SUB_COMMIT docs + 0100644 $BLOB docs-internal/README.md + 0100644 $BLOB docs/requirements.txt + EOF + build_corrupt_index .git/index err && + test_grep "You have both docs and docs/requirements.txt" err +' + +test_done