mirror of
https://github.com/git-for-windows/git.git
synced 2026-05-31 02:17:14 -05:00
backfill: basic functionality and tests
The default behavior of 'git backfill' is to fetch all missing blobs that are reachable from HEAD. Document and test this behavior. The implementation is a very simple use of the path-walk API, initializing the revision walk at HEAD to start the path-walk from all commits reachable from HEAD. Ignore the object arrays that correspond to tree entries, assuming that they are all present already. Signed-off-by: Derrick Stolee <stolee@gmail.com>
This commit is contained in:
committed by
Johannes Schindelin
parent
58eb7f1388
commit
ebd1692609
@@ -14,6 +14,30 @@ SYNOPSIS
|
|||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
|
Blobless partial clones are created using `git clone --filter=blob:none`,
|
||||||
|
which configures the local repository such that the Git client avoids
|
||||||
|
downloading blob objects unless they are required for a local operation.
|
||||||
|
This initially means that the clone and later fetches download reachable
|
||||||
|
commits and trees but no blobs. Later operations that change the `HEAD`
|
||||||
|
pointer, such as `git checkout` or `git merge`, may need to download
|
||||||
|
missing blobs in order to complete their operation.
|
||||||
|
|
||||||
|
In the worst cases, commands that compute blob diffs, such as `git blame`,
|
||||||
|
become very slow as they download the missing blobs in single-blob
|
||||||
|
requests, fetching each missing object only when the Git command needs it. This
|
||||||
|
leads to multiple download requests and no ability for the Git server to
|
||||||
|
provide delta compression across those objects.
|
||||||
|
|
||||||
|
The `git backfill` command provides a way for the user to request that
|
||||||
|
Git downloads the missing blobs (with optional filters) such that the
|
||||||
|
missing blobs representing historical versions of files can be downloaded
|
||||||
|
in batches. The `backfill` command attempts to optimize the request by
|
||||||
|
grouping blobs that appear at the same path, hopefully leading to good
|
||||||
|
delta compression in the packfile sent by the server.
|
||||||
|
|
||||||
|
By default, `git backfill` downloads all blobs reachable from the `HEAD`
|
||||||
|
commit. This set can be restricted or expanded using various options.
|
||||||
|
|
||||||
SEE ALSO
|
SEE ALSO
|
||||||
--------
|
--------
|
||||||
linkgit:git-clone[1].
|
linkgit:git-clone[1].
|
||||||
|
|||||||
@@ -70,4 +70,5 @@ Examples
|
|||||||
|
|
||||||
See example usages in:
|
See example usages in:
|
||||||
`t/helper/test-path-walk.c`,
|
`t/helper/test-path-walk.c`,
|
||||||
|
`builtin/backfill.c`,
|
||||||
`builtin/pack-objects.c`
|
`builtin/pack-objects.c`
|
||||||
|
|||||||
@@ -1,16 +1,117 @@
|
|||||||
#include "builtin.h"
|
#include "builtin.h"
|
||||||
|
#include "git-compat-util.h"
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include "parse-options.h"
|
#include "parse-options.h"
|
||||||
#include "repository.h"
|
#include "repository.h"
|
||||||
|
#include "commit.h"
|
||||||
|
#include "hex.h"
|
||||||
|
#include "tree.h"
|
||||||
|
#include "tree-walk.h"
|
||||||
#include "object.h"
|
#include "object.h"
|
||||||
|
#include "object-store-ll.h"
|
||||||
|
#include "oid-array.h"
|
||||||
|
#include "oidset.h"
|
||||||
|
#include "promisor-remote.h"
|
||||||
|
#include "strmap.h"
|
||||||
|
#include "string-list.h"
|
||||||
|
#include "revision.h"
|
||||||
|
#include "trace2.h"
|
||||||
|
#include "progress.h"
|
||||||
|
#include "packfile.h"
|
||||||
|
#include "path-walk.h"
|
||||||
|
|
||||||
static const char * const builtin_backfill_usage[] = {
|
static const char * const builtin_backfill_usage[] = {
|
||||||
N_("git backfill [<options>]"),
|
N_("git backfill [<options>]"),
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct backfill_context {
|
||||||
|
struct repository *repo;
|
||||||
|
struct oid_array current_batch;
|
||||||
|
size_t batch_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void clear_backfill_context(struct backfill_context *ctx)
|
||||||
|
{
|
||||||
|
oid_array_clear(&ctx->current_batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void download_batch(struct backfill_context *ctx)
|
||||||
|
{
|
||||||
|
promisor_remote_get_direct(ctx->repo,
|
||||||
|
ctx->current_batch.oid,
|
||||||
|
ctx->current_batch.nr);
|
||||||
|
oid_array_clear(&ctx->current_batch);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We likely have a new packfile. Add it to the packed list to
|
||||||
|
* avoid possible duplicate downloads of the same objects.
|
||||||
|
*/
|
||||||
|
reprepare_packed_git(ctx->repo);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Path-walk callback: queue the blobs from 'list' that need downloading.
 *
 * Lists of any type other than OBJ_BLOB are ignored. A blob is queued
 * when the object lookup fails or reports an on-disk size of zero.
 * After the whole list has been examined, the queue is downloaded if it
 * has reached ctx->batch_size entries.
 *
 * 'data' is the struct backfill_context for this run. Always returns 0.
 */
static int fill_missing_blobs(const char *path UNUSED,
			      struct oid_array *list,
			      enum object_type type,
			      void *data)
{
	struct backfill_context *ctx = data;

	if (type != OBJ_BLOB)
		return 0;

	for (size_t idx = 0; idx < list->nr; idx++) {
		const struct object_id *oid = &list->oid[idx];
		struct object_info oi = OBJECT_INFO_INIT;
		off_t disk_size = 0;

		oi.disk_sizep = &disk_size;

		/*
		 * Lookup succeeded and the object occupies space on disk:
		 * nothing to download for this one.
		 * NOTE(review): OBJECT_INFO_FOR_PREFETCH presumably keeps
		 * this lookup from fetching the object itself -- confirm.
		 */
		if (!oid_object_info_extended(ctx->repo, oid, &oi,
					      OBJECT_INFO_FOR_PREFETCH) &&
		    disk_size)
			continue;

		oid_array_append(&ctx->current_batch, oid);
	}

	/* The threshold is checked once per list, so a batch may overshoot it. */
	if (ctx->current_batch.nr >= ctx->batch_size)
		download_batch(ctx);

	return 0;
}
|
||||||
|
|
||||||
|
static int do_backfill(struct backfill_context *ctx)
|
||||||
|
{
|
||||||
|
struct rev_info revs;
|
||||||
|
struct path_walk_info info = PATH_WALK_INFO_INIT;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
repo_init_revisions(ctx->repo, &revs, "");
|
||||||
|
handle_revision_arg("HEAD", &revs, 0, 0);
|
||||||
|
|
||||||
|
info.blobs = 1;
|
||||||
|
info.tags = info.commits = info.trees = 0;
|
||||||
|
|
||||||
|
info.revs = &revs;
|
||||||
|
info.path_fn = fill_missing_blobs;
|
||||||
|
info.path_fn_data = ctx;
|
||||||
|
|
||||||
|
ret = walk_objects_by_path(&info);
|
||||||
|
|
||||||
|
/* Download the objects that did not fill a batch. */
|
||||||
|
if (!ret)
|
||||||
|
download_batch(ctx);
|
||||||
|
|
||||||
|
clear_backfill_context(ctx);
|
||||||
|
release_revisions(&revs);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
|
int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
|
||||||
{
|
{
|
||||||
|
struct backfill_context ctx = {
|
||||||
|
.repo = repo,
|
||||||
|
.current_batch = OID_ARRAY_INIT,
|
||||||
|
.batch_size = 50000,
|
||||||
|
};
|
||||||
struct option options[] = {
|
struct option options[] = {
|
||||||
OPT_END(),
|
OPT_END(),
|
||||||
};
|
};
|
||||||
@@ -23,7 +124,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
|
|||||||
|
|
||||||
repo_config(repo, git_default_config, NULL);
|
repo_config(repo, git_default_config, NULL);
|
||||||
|
|
||||||
die(_("not implemented"));
|
return do_backfill(&ctx);
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -721,6 +721,7 @@ integration_tests = [
|
|||||||
't5617-clone-submodules-remote.sh',
|
't5617-clone-submodules-remote.sh',
|
||||||
't5618-alternate-refs.sh',
|
't5618-alternate-refs.sh',
|
||||||
't5619-clone-local-ambiguous-transport.sh',
|
't5619-clone-local-ambiguous-transport.sh',
|
||||||
|
't5620-backfill.sh',
|
||||||
't5700-protocol-v1.sh',
|
't5700-protocol-v1.sh',
|
||||||
't5701-git-serve.sh',
|
't5701-git-serve.sh',
|
||||||
't5702-protocol-v2.sh',
|
't5702-protocol-v2.sh',
|
||||||
|
|||||||
94
t/t5620-backfill.sh
Executable file
94
t/t5620-backfill.sh
Executable file
@@ -0,0 +1,94 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
test_description='git backfill on partial clones'
|
||||||
|
|
||||||
|
GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
|
||||||
|
export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
|
||||||
|
|
||||||
|
. ./test-lib.sh
|
||||||
|
|
||||||
|
# We create objects in the 'src' repo.
|
||||||
|
test_expect_success 'setup repo for object creation' '
|
||||||
|
echo "{print \$1}" >print_1.awk &&
|
||||||
|
echo "{print \$2}" >print_2.awk &&
|
||||||
|
|
||||||
|
git init src &&
|
||||||
|
|
||||||
|
mkdir -p src/a/b/c &&
|
||||||
|
mkdir -p src/d/e &&
|
||||||
|
|
||||||
|
for i in 1 2
|
||||||
|
do
|
||||||
|
for n in 1 2 3 4
|
||||||
|
do
|
||||||
|
echo "Version $i of file $n" > src/file.$n.txt &&
|
||||||
|
echo "Version $i of file a/$n" > src/a/file.$n.txt &&
|
||||||
|
echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt &&
|
||||||
|
echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt &&
|
||||||
|
echo "Version $i of file d/$n" > src/d/file.$n.txt &&
|
||||||
|
echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt &&
|
||||||
|
git -C src add . &&
|
||||||
|
git -C src commit -m "Iteration $n" || return 1
|
||||||
|
done
|
||||||
|
done
|
||||||
|
'
|
||||||
|
|
||||||
|
# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
|
||||||
|
# server for the partial clone.
|
||||||
|
test_expect_success 'setup bare clone for server' '
|
||||||
|
git clone --bare "file://$(pwd)/src" srv.bare &&
|
||||||
|
git -C srv.bare config --local uploadpack.allowfilter 1 &&
|
||||||
|
git -C srv.bare config --local uploadpack.allowanysha1inwant 1
|
||||||
|
'
|
||||||
|
|
||||||
|
# do basic partial clone from "srv.bare"
|
||||||
|
test_expect_success 'do partial clone 1, backfill gets all objects' '
|
||||||
|
git clone --no-checkout --filter=blob:none \
|
||||||
|
--single-branch --branch=main \
|
||||||
|
"file://$(pwd)/srv.bare" backfill1 &&
|
||||||
|
|
||||||
|
# Backfill with no options gets everything reachable from HEAD.
|
||||||
|
GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \
|
||||||
|
-C backfill1 backfill &&
|
||||||
|
|
||||||
|
# We should have engaged the partial clone machinery
|
||||||
|
test_trace2_data promisor fetch_count 48 <backfill-file-trace &&
|
||||||
|
|
||||||
|
# No more missing objects!
|
||||||
|
git -C backfill1 rev-list --quiet --objects --missing=print HEAD >revs2 &&
|
||||||
|
test_line_count = 0 revs2
|
||||||
|
'
|
||||||
|
|
||||||
|
. "$TEST_DIRECTORY"/lib-httpd.sh
|
||||||
|
start_httpd
|
||||||
|
|
||||||
|
test_expect_success 'create a partial clone over HTTP' '
|
||||||
|
SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
|
||||||
|
rm -rf "$SERVER" repo &&
|
||||||
|
git clone --bare "file://$(pwd)/src" "$SERVER" &&
|
||||||
|
test_config -C "$SERVER" uploadpack.allowfilter 1 &&
|
||||||
|
test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 &&
|
||||||
|
|
||||||
|
git clone --no-checkout --filter=blob:none \
|
||||||
|
"$HTTPD_URL/smart/server" backfill-http
|
||||||
|
'
|
||||||
|
|
||||||
|
test_expect_success 'backfilling over HTTP succeeds' '
|
||||||
|
GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \
|
||||||
|
-C backfill-http backfill &&
|
||||||
|
|
||||||
|
# We should have engaged the partial clone machinery
|
||||||
|
test_trace2_data promisor fetch_count 48 <backfill-http-trace &&
|
||||||
|
|
||||||
|
# Confirm all objects are present, none missing.
|
||||||
|
git -C backfill-http rev-list --objects --all >rev-list-out &&
|
||||||
|
awk "{print \$1;}" <rev-list-out >oids &&
|
||||||
|
GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \
|
||||||
|
cat-file --batch-check <oids >batch-out &&
|
||||||
|
! grep missing batch-out
|
||||||
|
'
|
||||||
|
|
||||||
|
# DO NOT add non-httpd-specific tests here, because the last part of this
|
||||||
|
# test script is only executed when httpd is available and enabled.
|
||||||
|
|
||||||
|
test_done
|
||||||
Reference in New Issue
Block a user