survey: add report of "largest" paths

Since we are already walking our reachable objects using the path-walk API,
let's now collect lists of the paths that contribute most to different
metrics. Specifically, we care about

 * Number of versions.
 * Total size on disk.
 * Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the
repository are causing the most growth, especially on-disk size. Different
packing strategies might help compress data more efficiently, but the toal
inflated size is a representation of the raw size of all snapshots of those
paths. Even when stored efficiently on disk, that size represents how much
information must be processed to complete a command such as 'git blame'.

The exact disk size seems to be not quite robust enough for testing, as
could be seen by the `linux-musl-meson` job consistently failing, possibly
because of zlib-ng deflates differently: t8100.4(git survey
(default)) was failing with a symptom like this:

   TOTAL OBJECT SIZES BY TYPE
   ===============================================
   Object Type | Count | Disk Size | Inflated Size
   ------------+-------+-----------+--------------
  -    Commits |    10 |      1523 |          2153
  +    Commits |    10 |      1528 |          2153
         Trees |    10 |       495 |          1706
         Blobs |    10 |       191 |           101
  -       Tags |     4 |       510 |           528
  +       Tags |     4 |       547 |           528

This means: the disk size is unlikely something we can verify robustly.
Since zlib-ng seems to increase the disk size of the tags from 528 to
547, we cannot even assume that the disk size is always smaller than the
inflated size. We will most likely want to either skip verifying the
disk size altogether, or go for some kind of fuzzy matching, say, by
replacing `s/ 1[45][0-9][0-9] / ~1.5k /` and `s/ [45][0-9][0-9] / ~½k /`
or something like that.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
This commit is contained in:
Derrick Stolee 2024-09-01 22:35:40 -04:00 committed by Johannes Schindelin
parent c596bba980
commit 65fef5758a
2 changed files with 82 additions and 8 deletions

View File

@ -75,7 +75,6 @@ struct survey_report_object_size_summary {
typedef int (*survey_top_cmp)(void *v1, void *v2);
MAYBE_UNUSED
static int cmp_by_nr(void *v1, void *v2)
{
struct survey_report_object_size_summary *s1 = v1;
@ -88,7 +87,6 @@ static int cmp_by_nr(void *v1, void *v2)
return 0;
}
MAYBE_UNUSED
static int cmp_by_disk_size(void *v1, void *v2)
{
struct survey_report_object_size_summary *s1 = v1;
@ -101,7 +99,6 @@ static int cmp_by_disk_size(void *v1, void *v2)
return 0;
}
MAYBE_UNUSED
static int cmp_by_inflated_size(void *v1, void *v2)
{
struct survey_report_object_size_summary *s1 = v1;
@ -132,7 +129,6 @@ struct survey_report_top_table {
void *data;
};
MAYBE_UNUSED
static void init_top_sizes(struct survey_report_top_table *top,
size_t limit, const char *name,
survey_top_cmp cmp)
@ -158,7 +154,6 @@ static void clear_top_sizes(struct survey_report_top_table *top)
free(sz_array);
}
MAYBE_UNUSED
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
struct survey_report_object_size_summary *summary)
{
@ -195,6 +190,10 @@ struct survey_report {
struct survey_report_object_summary reachable_objects;
struct survey_report_object_size_summary *by_type;
struct survey_report_top_table *top_paths_by_count;
struct survey_report_top_table *top_paths_by_disk;
struct survey_report_top_table *top_paths_by_inflate;
};
#define REPORT_TYPE_COMMIT 0
@ -446,6 +445,13 @@ static void survey_report_object_sizes(const char *title,
clear_table(&table);
}
static void survey_report_plaintext_sorted_size(
struct survey_report_top_table *top)
{
survey_report_object_sizes(top->name, _("Path"),
top->data, top->nr);
}
static void survey_report_plaintext(struct survey_context *ctx)
{
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@ -456,6 +462,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
_("Object Type"),
ctx->report.by_type,
REPORT_TYPE_COUNT);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
}
/*
@ -697,7 +718,8 @@ static void increment_totals(struct survey_context *ctx,
static void increment_object_totals(struct survey_context *ctx,
struct oid_array *oids,
enum object_type type)
enum object_type type,
const char *path)
{
struct survey_report_object_size_summary *total;
struct survey_report_object_size_summary summary = { 0 };
@ -729,6 +751,27 @@ static void increment_object_totals(struct survey_context *ctx,
total->disk_size += summary.disk_size;
total->inflated_size += summary.inflated_size;
total->num_missing += summary.num_missing;
if (type == OBJ_TREE || type == OBJ_BLOB) {
int index = type == OBJ_TREE ?
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
struct survey_report_top_table *top;
/*
* Temporarily store (const char *) here, but it will
* be duped if inserted and will not be freed.
*/
summary.label = (char *)path;
top = ctx->report.top_paths_by_count;
maybe_insert_into_top_size(&top[index], &summary);
top = ctx->report.top_paths_by_disk;
maybe_insert_into_top_size(&top[index], &summary);
top = ctx->report.top_paths_by_inflate;
maybe_insert_into_top_size(&top[index], &summary);
}
}
static int survey_objects_path_walk_fn(const char *path,
@ -740,7 +783,7 @@ static int survey_objects_path_walk_fn(const char *path,
increment_object_counts(&ctx->report.reachable_objects,
type, oids->nr);
increment_object_totals(ctx, oids, type);
increment_object_totals(ctx, oids, type, path);
ctx->progress_nr += oids->nr;
display_progress(ctx->progress, ctx->progress_nr);
@ -750,11 +793,31 @@ static int survey_objects_path_walk_fn(const char *path,
static void initialize_report(struct survey_context *ctx)
{
const int top_limit = 100;
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
}
static void survey_phase_objects(struct survey_context *ctx)

View File

@ -86,7 +86,18 @@ test_expect_success 'git survey (default)' '
Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size)
EOF
test_cmp expect out
lines=$(wc -l <expect) &&
head -n $lines out >out-trimmed &&
sed -e "s/ 1528 / 1523 /" -e "s/ 547 / 510 /" out-trimmed >out-edited &&
test_cmp expect out-edited &&
for type in "DIRECTORIES" "FILES"
do
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
do
grep "TOP $type BY $metric" out || return 1
done || return 1
done
'
test_done