pack-objects: add --path-walk option

In order to more easily compute delta bases among objects that appear at the
exact same path, add a --path-walk option to 'git pack-objects'.

This option will use the path-walk API instead of the object walk given by
the revision machinery. Since objects will be provided in batches
representing a common path, those objects can be tested for delta bases
immediately instead of waiting for a sort of the full object list by
name-hash. This has multiple benefits, including avoiding collisions by
name-hash.

The objects marked as UNINTERESTING are included in these batches, so we
are guaranteeing some locality to find good delta bases.

After the individual passes are done on a per-path basis, the default
name-hash is used to find other opportunistic delta bases that did not
match exactly by the full path name.

RFC TODO: It is important to note that this option is inherently
incompatible with using a bitmap index. This walk probably also does not
work with other advanced features, such as delta islands.

Getting ahead of myself, this option compares well with --full-name-hash
when the packfile is large enough, but also performs at least as well as
the default in all cases that I've seen.

RFC TODO: this should probably be recording the batch locations to another
list so they could be processed in a second phase using threads.

RFC TODO: list some examples of how this outperforms previous pack-objects
strategies. (This is coming in later commits that include performance
test changes.)

Signed-off-by: Derrick Stolee <stolee@gmail.com>
This commit is contained in:
Derrick Stolee 2024-09-05 10:04:51 -04:00 committed by Johannes Schindelin
parent 812b057cd3
commit 6b5c2b8b7b
4 changed files with 168 additions and 11 deletions

View File

@ -16,7 +16,7 @@ SYNOPSIS
[--cruft] [--cruft-expiration=<time>]
[--stdout [--filter=<filter-spec>] | <base-name>]
[--shallow] [--keep-true-parents] [--[no-]sparse]
[--full-name-hash] < <object-list>
[--full-name-hash] [--path-walk] < <object-list>
DESCRIPTION
@ -346,6 +346,16 @@ raise an error.
Restrict delta matches based on "islands". See DELTA ISLANDS
below.
--path-walk::
By default, `git pack-objects` walks objects in an order that
presents trees and blobs in an order unrelated to the path they
appear relative to a commit's root tree. The `--path-walk` option
enables a different walking algorithm that organizes trees and
blobs by path. This has the potential to improve delta compression
especially in the presence of filenames that cause collisions in
Git's default name-hash algorithm. Due to changing how the objects
are walked, this option is not compatible with `--delta-islands`,
`--shallow`, or `--filter`.
DELTA ISLANDS
-------------

View File

@ -69,4 +69,5 @@ Examples
--------
See example usages in:
`t/helper/test-path-walk.c`
`t/helper/test-path-walk.c`,
`builtin/pack-objects.c`

View File

@ -41,6 +41,9 @@
#include "promisor-remote.h"
#include "pack-mtimes.h"
#include "parse-options.h"
#include "blob.h"
#include "tree.h"
#include "path-walk.h"
/*
* Objects we are going to pack are collected in the `to_pack` structure.
@ -217,6 +220,7 @@ static int delta_search_threads;
static int pack_to_stdout;
static int sparse;
static int thin;
static int path_walk;
static int num_preferred_base;
static struct progress *progress_state;
@ -4142,6 +4146,105 @@ static void mark_bitmap_preferred_tips(void)
}
}
static inline int is_oid_interesting(struct repository *repo,
struct object_id *oid)
{
struct object *o = lookup_object(repo, oid);
return o && !(o->flags & UNINTERESTING);
}
static int add_objects_by_path(const char *path,
struct oid_array *oids,
enum object_type type,
void *data)
{
struct object_entry **delta_list;
size_t oe_start = to_pack.nr_objects;
size_t oe_end;
unsigned int sub_list_size;
unsigned int *processed = data;
/*
* First, add all objects to the packing data, including the ones
* marked UNINTERESTING (translated to 'exclude') as they can be
* used as delta bases.
*/
for (size_t i = 0; i < oids->nr; i++) {
int exclude;
struct object_info oi = OBJECT_INFO_INIT;
struct object_id *oid = &oids->oid[i];
/* Skip objects that do not exist locally. */
if (exclude_promisor_objects &&
oid_object_info_extended(the_repository, oid, &oi,
OBJECT_INFO_FOR_PREFETCH) < 0)
continue;
exclude = !is_oid_interesting(the_repository, oid);
if (exclude && !thin)
continue;
add_object_entry(oid, type, path, exclude);
}
oe_end = to_pack.nr_objects;
/* We can skip delta calculations if it is a no-op. */
if (oe_end == oe_start || !window)
return 0;
sub_list_size = 0;
ALLOC_ARRAY(delta_list, oe_end - oe_start);
for (size_t i = 0; i < oe_end - oe_start; i++) {
struct object_entry *entry = to_pack.objects + oe_start + i;
if (!should_attempt_deltas(entry))
continue;
delta_list[sub_list_size++] = entry;
}
/*
* Find delta bases among this list of objects that all match the same
* path. This causes the delta compression to be interleaved in the
* object walk, which can lead to confusing progress indicators. This is
* also incompatible with threaded delta calculations. In the future,
* consider creating a list of regions in the full to_pack.objects array
* that could be picked up by the threaded delta computation.
*/
if (sub_list_size && window) {
QSORT(delta_list, sub_list_size, type_size_sort);
find_deltas(delta_list, &sub_list_size, window, depth, processed);
}
free(delta_list);
return 0;
}
static void get_object_list_path_walk(struct rev_info *revs)
{
struct path_walk_info info = PATH_WALK_INFO_INIT;
unsigned int processed = 0;
info.revs = revs;
info.path_fn = add_objects_by_path;
info.path_fn_data = &processed;
revs->tag_objects = 1;
/*
* Allow the --[no-]sparse option to be interesting here, if only
* for testing purposes. Paths with no interesting objects will not
* contribute to the resulting pack, but only create noisy preferred
* base objects.
*/
info.prune_all_uninteresting = sparse;
if (walk_objects_by_path(&info))
die(_("failed to pack objects via path-walk"));
}
static void get_object_list(struct rev_info *revs, int ac, const char **av)
{
struct setup_revision_opt s_r_opt = {
@ -4188,7 +4291,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
warn_on_object_refname_ambiguity = save_warning;
if (use_bitmap_index && !get_object_list_from_bitmap(revs))
if (use_bitmap_index && !path_walk && !get_object_list_from_bitmap(revs))
return;
if (use_delta_islands)
@ -4197,15 +4300,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
if (write_bitmap_index)
mark_bitmap_preferred_tips();
if (prepare_revision_walk(revs))
die(_("revision walk setup failed"));
mark_edges_uninteresting(revs, show_edge, sparse);
if (!fn_show_object)
fn_show_object = show_object;
traverse_commit_list(revs,
show_commit, fn_show_object,
NULL);
if (path_walk) {
get_object_list_path_walk(revs);
} else {
if (prepare_revision_walk(revs))
die(_("revision walk setup failed"));
mark_edges_uninteresting(revs, show_edge, sparse);
traverse_commit_list(revs,
show_commit, fn_show_object,
NULL);
}
if (unpack_unreachable_expiration) {
revs->ignore_missing_links = 1;
@ -4415,6 +4522,8 @@ int cmd_pack_objects(int argc,
N_("use the sparse reachability algorithm")),
OPT_BOOL(0, "thin", &thin,
N_("create thin packs")),
OPT_BOOL(0, "path-walk", &path_walk,
N_("use the path-walk API to walk objects when possible")),
OPT_BOOL(0, "shallow", &shallow,
N_("create packs suitable for shallow fetches")),
OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
@ -4500,7 +4609,27 @@ int cmd_pack_objects(int argc,
window = 0;
strvec_push(&rp, "pack-objects");
if (thin) {
if (path_walk && filter_options.choice) {
warning(_("cannot use --filter with --path-walk"));
path_walk = 0;
}
if (path_walk && use_delta_islands) {
warning(_("cannot use delta islands with --path-walk"));
path_walk = 0;
}
if (path_walk && shallow) {
warning(_("cannot use --shallow with --path-walk"));
path_walk = 0;
}
if (path_walk) {
strvec_push(&rp, "--boundary");
/*
* We must disable the bitmaps because we are removing
* the --objects / --objects-edge[-aggressive] options.
*/
use_bitmap_index = 0;
} else if (thin) {
use_internal_rev_list = 1;
strvec_push(&rp, shallow
? "--objects-edge-aggressive"

View File

@ -704,4 +704,21 @@ test_expect_success '--full-name-hash and --write-bitmap-index are incompatible'
git pack-objects --stdout --all --full-name-hash --write-bitmap-index >out
'
# Basic "repack everything" test
test_expect_success '--path-walk pack everything' '
git -C server rev-parse HEAD >in &&
git -C server pack-objects --stdout --revs --path-walk <in >out.pack &&
git -C server index-pack --stdin <out.pack
'
# Basic "thin pack" test
test_expect_success '--path-walk thin pack' '
cat >in <<-EOF &&
$(git -C server rev-parse HEAD)
^$(git -C server rev-parse HEAD~2)
EOF
git -C server pack-objects --thin --stdout --revs --path-walk <in >out.pack &&
git -C server index-pack --fix-thin --stdin <out.pack
'
test_done