pack-objects: add --path-walk option

In order to more easily compute delta bases among objects that appear at the
exact same path, add a --path-walk option to 'git pack-objects'.

This option will use the path-walk API instead of the object walk given by
the revision machinery. Since objects will be provided in batches
representing a common path, those objects can be tested for delta bases
immediately instead of waiting for a sort of the full object list by
name-hash. This has multiple benefits, including avoiding collisions by
name-hash.

The objects marked as UNINTERESTING are included in these batches, so we
are guaranteeing some locality to find good delta bases.

After the individual passes are done on a per-path basis, the default
name-hash is used to find other opportunistic delta bases that did not
match exactly by the full path name.

RFC TODO: It is important to note that this option is inherently
incompatible with using a bitmap index. This walk probably also does not
work with other advanced features, such as delta islands.

Getting ahead of myself, this option compares well with --full-name-hash
when the packfile is large enough, but also performs at least as well as
the default in all cases that I've seen.

RFC TODO: this should probably be recording the batch locations to another
list so they could be processed in a second phase using threads.

RFC TODO: list some examples of how this outperforms previous pack-objects
strategies. (This is coming in later commits that include performance
test changes.)

Signed-off-by: Derrick Stolee <stolee@gmail.com>
This commit is contained in:
Derrick Stolee 2024-09-05 10:04:51 -04:00 committed by Johannes Schindelin
parent 2ebf07375b
commit 7f4a2a2d9e
4 changed files with 167 additions and 10 deletions

View File

@ -16,7 +16,7 @@ SYNOPSIS
[--cruft] [--cruft-expiration=<time>]
[--stdout [--filter=<filter-spec>] | <base-name>]
[--shallow] [--keep-true-parents] [--[no-]sparse]
[--name-hash-version=<n>] < <object-list>
[--name-hash-version=<n>] [--path-walk] < <object-list>
DESCRIPTION
@ -375,6 +375,16 @@ many different directories. At the moment, this version is not allowed
when writing reachability bitmap files with `--write-bitmap-index` and it
will be automatically changed to version `1`.
--path-walk::
By default, `git pack-objects` walks objects in an order that
presents trees and blobs in an order unrelated to the path they
appear relative to a commit's root tree. The `--path-walk` option
enables a different walking algorithm that organizes trees and
blobs by path. This has the potential to improve delta compression
especially in the presence of filenames that cause collisions in
Git's default name-hash algorithm. Due to changing how the objects
are walked, this option is not compatible with `--delta-islands`,
`--shallow`, or `--filter`.
DELTA ISLANDS
-------------

View File

@ -70,3 +70,4 @@ Examples
See example usages in:
`t/helper/test-path-walk.c`,
`builtin/backfill.c`
`builtin/pack-objects.c`

View File

@ -41,6 +41,9 @@
#include "promisor-remote.h"
#include "pack-mtimes.h"
#include "parse-options.h"
#include "blob.h"
#include "tree.h"
#include "path-walk.h"
/*
* Objects we are going to pack are collected in the `to_pack` structure.
@ -217,6 +220,7 @@ static int delta_search_threads;
static int pack_to_stdout;
static int sparse;
static int thin;
static int path_walk;
static int num_preferred_base;
static struct progress *progress_state;
@ -4188,6 +4192,105 @@ static void mark_bitmap_preferred_tips(void)
}
}
static inline int is_oid_interesting(struct repository *repo,
struct object_id *oid)
{
struct object *o = lookup_object(repo, oid);
return o && !(o->flags & UNINTERESTING);
}
static int add_objects_by_path(const char *path,
struct oid_array *oids,
enum object_type type,
void *data)
{
struct object_entry **delta_list;
size_t oe_start = to_pack.nr_objects;
size_t oe_end;
unsigned int sub_list_size;
unsigned int *processed = data;
/*
* First, add all objects to the packing data, including the ones
* marked UNINTERESTING (translated to 'exclude') as they can be
* used as delta bases.
*/
for (size_t i = 0; i < oids->nr; i++) {
int exclude;
struct object_info oi = OBJECT_INFO_INIT;
struct object_id *oid = &oids->oid[i];
/* Skip objects that do not exist locally. */
if (exclude_promisor_objects &&
oid_object_info_extended(the_repository, oid, &oi,
OBJECT_INFO_FOR_PREFETCH) < 0)
continue;
exclude = !is_oid_interesting(the_repository, oid);
if (exclude && !thin)
continue;
add_object_entry(oid, type, path, exclude);
}
oe_end = to_pack.nr_objects;
/* We can skip delta calculations if it is a no-op. */
if (oe_end == oe_start || !window)
return 0;
sub_list_size = 0;
ALLOC_ARRAY(delta_list, oe_end - oe_start);
for (size_t i = 0; i < oe_end - oe_start; i++) {
struct object_entry *entry = to_pack.objects + oe_start + i;
if (!should_attempt_deltas(entry))
continue;
delta_list[sub_list_size++] = entry;
}
/*
* Find delta bases among this list of objects that all match the same
* path. This causes the delta compression to be interleaved in the
* object walk, which can lead to confusing progress indicators. This is
* also incompatible with threaded delta calculations. In the future,
* consider creating a list of regions in the full to_pack.objects array
* that could be picked up by the threaded delta computation.
*/
if (sub_list_size && window) {
QSORT(delta_list, sub_list_size, type_size_sort);
find_deltas(delta_list, &sub_list_size, window, depth, processed);
}
free(delta_list);
return 0;
}
static void get_object_list_path_walk(struct rev_info *revs)
{
struct path_walk_info info = PATH_WALK_INFO_INIT;
unsigned int processed = 0;
info.revs = revs;
info.path_fn = add_objects_by_path;
info.path_fn_data = &processed;
revs->tag_objects = 1;
/*
* Allow the --[no-]sparse option to be interesting here, if only
* for testing purposes. Paths with no interesting objects will not
* contribute to the resulting pack, but only create noisy preferred
* base objects.
*/
info.prune_all_uninteresting = sparse;
if (walk_objects_by_path(&info))
die(_("failed to pack objects via path-walk"));
}
static void get_object_list(struct rev_info *revs, int ac, const char **av)
{
struct setup_revision_opt s_r_opt = {
@ -4234,7 +4337,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
warn_on_object_refname_ambiguity = save_warning;
if (use_bitmap_index && !get_object_list_from_bitmap(revs))
if (use_bitmap_index && !path_walk && !get_object_list_from_bitmap(revs))
return;
if (use_delta_islands)
@ -4243,15 +4346,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
if (write_bitmap_index)
mark_bitmap_preferred_tips();
if (prepare_revision_walk(revs))
die(_("revision walk setup failed"));
mark_edges_uninteresting(revs, show_edge, sparse);
if (!fn_show_object)
fn_show_object = show_object;
traverse_commit_list(revs,
show_commit, fn_show_object,
NULL);
if (path_walk) {
get_object_list_path_walk(revs);
} else {
if (prepare_revision_walk(revs))
die(_("revision walk setup failed"));
mark_edges_uninteresting(revs, show_edge, sparse);
traverse_commit_list(revs,
show_commit, fn_show_object,
NULL);
}
if (unpack_unreachable_expiration) {
revs->ignore_missing_links = 1;
@ -4461,6 +4568,8 @@ int cmd_pack_objects(int argc,
N_("use the sparse reachability algorithm")),
OPT_BOOL(0, "thin", &thin,
N_("create thin packs")),
OPT_BOOL(0, "path-walk", &path_walk,
N_("use the path-walk API to walk objects when possible")),
OPT_BOOL(0, "shallow", &shallow,
N_("create packs suitable for shallow fetches")),
OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
@ -4546,7 +4655,27 @@ int cmd_pack_objects(int argc,
window = 0;
strvec_push(&rp, "pack-objects");
if (thin) {
if (path_walk && filter_options.choice) {
warning(_("cannot use --filter with --path-walk"));
path_walk = 0;
}
if (path_walk && use_delta_islands) {
warning(_("cannot use delta islands with --path-walk"));
path_walk = 0;
}
if (path_walk && shallow) {
warning(_("cannot use --shallow with --path-walk"));
path_walk = 0;
}
if (path_walk) {
strvec_push(&rp, "--boundary");
/*
* We must disable the bitmaps because we are removing
* the --objects / --objects-edge[-aggressive] options.
*/
use_bitmap_index = 0;
} else if (thin) {
use_internal_rev_list = 1;
strvec_push(&rp, shallow
? "--objects-edge-aggressive"

View File

@ -723,4 +723,21 @@ test_expect_success '--name-hash-version=2 and --write-bitmap-index are incompat
! test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err
'
# Basic "repack everything" test
test_expect_success '--path-walk pack everything' '
git -C server rev-parse HEAD >in &&
git -C server pack-objects --stdout --revs --path-walk <in >out.pack &&
git -C server index-pack --stdin <out.pack
'
# Basic "thin pack" test
test_expect_success '--path-walk thin pack' '
cat >in <<-EOF &&
$(git -C server rev-parse HEAD)
^$(git -C server rev-parse HEAD~2)
EOF
git -C server pack-objects --thin --stdout --revs --path-walk <in >out.pack &&
git -C server index-pack --fix-thin --stdin <out.pack
'
test_done