survey: summarize total sizes by object type

Now that we have explored objects by count, we can expand that a bit more to summarize the data for the on-disk and inflated size of those objects. This information is helpful for diagnosing both why disk space (and perhaps clone or fetch times) is growing but also why certain operations are slow because the inflated size of the abstract objects that must be processed is so large. Note: zlib-ng is slightly more efficient even at those small sizes. Even between zlib versions, there are slight differences in compression. To accommodate for that in the tests, not the exact numbers but some rough approximations are validated (the test should validate `git survey`, after all, not zlib). Signed-off-by: Derrick Stolee <stolee@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
2026-04-10 16:54:08 -05:00 · 2024-09-01 20:58:35 -04:00
parent 701b9e5622
commit 5f080b35ab
2 changed files with 169 additions and 1 deletions
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -60,6 +60,19 @@ struct survey_report_object_summary {
 	size_t blobs_nr;
 };

+/**
+ * For some category given by 'label', count the number of objects
+ * that match that label along with the on-disk size and the size
+ * after decompressing (both with delta bases and zlib).
+ */
+struct survey_report_object_size_summary {
+	char *label;
+	size_t nr;
+	size_t disk_size;
+	size_t inflated_size;
+	size_t num_missing;
+};
+
 /**
 * This struct contains all of the information that needs to be printed
 * at the end of the exploration of the repository and its references.
@@ -67,8 +80,16 @@ struct survey_report_object_summary {
 struct survey_report {
 	struct survey_report_ref_summary refs;
 	struct survey_report_object_summary reachable_objects;
+
+	struct survey_report_object_size_summary *by_type;
 };

+#define REPORT_TYPE_COMMIT 0
+#define REPORT_TYPE_TREE 1
+#define REPORT_TYPE_BLOB 2
+#define REPORT_TYPE_TAG 3
+#define REPORT_TYPE_COUNT 4
+
 struct survey_context {
 	struct repository *repo;

@@ -280,12 +301,48 @@ static void survey_report_plaintext_reachable_object_summary(struct survey_conte
 	clear_table(&table);
 }

+static void survey_report_object_sizes(const char *title,
+				       const char *categories,
+				       struct survey_report_object_size_summary *summary,
+				       size_t summary_nr)
+{
+	struct survey_table table = SURVEY_TABLE_INIT;
+	table.table_name = title;
+
+	strvec_push(&table.header, categories);
+	strvec_push(&table.header, _("Count"));
+	strvec_push(&table.header, _("Disk Size"));
+	strvec_push(&table.header, _("Inflated Size"));
+
+	for (size_t i = 0; i < summary_nr; i++) {
+		char *label_str =  xstrdup(summary[i].label);
+		char *nr_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].nr);
+		char *disk_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].disk_size);
+		char *inflate_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].inflated_size);
+
+		insert_table_rowv(&table, label_str, nr_str,
+				  disk_str, inflate_str, NULL);
+
+		free(label_str);
+		free(nr_str);
+		free(disk_str);
+		free(inflate_str);
+	}
+
+	print_table_plaintext(&table);
+	clear_table(&table);
+}
+
 static void survey_report_plaintext(struct survey_context *ctx)
 {
 	printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
 	printf("-----------------------------------------------------\n");
 	survey_report_plaintext_refs(ctx);
 	survey_report_plaintext_reachable_object_summary(ctx);
+	survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"),
+				   _("Object Type"),
+				   ctx->report.by_type,
+				   REPORT_TYPE_COUNT);
 }

 /*
@@ -499,6 +556,69 @@ static void increment_object_counts(
 	}
 }

+static void increment_totals(struct survey_context *ctx,
+			     struct oid_array *oids,
+			     struct survey_report_object_size_summary *summary)
+{
+	for (size_t i = 0; i < oids->nr; i++) {
+		struct object_info oi = OBJECT_INFO_INIT;
+		unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH;
+		unsigned long object_length = 0;
+		off_t disk_sizep = 0;
+		enum object_type type;
+
+		oi.typep = &type;
+		oi.sizep = &object_length;
+		oi.disk_sizep = &disk_sizep;
+
+		if (odb_read_object_info_extended(ctx->repo->objects,
+						  &oids->oid[i],
+						  &oi, oi_flags) < 0) {
+			summary->num_missing++;
+		} else {
+			summary->nr++;
+			summary->disk_size += disk_sizep;
+			summary->inflated_size += object_length;
+		}
+	}
+}
+
+static void increment_object_totals(struct survey_context *ctx,
+				    struct oid_array *oids,
+				    enum object_type type)
+{
+	struct survey_report_object_size_summary *total;
+	struct survey_report_object_size_summary summary = { 0 };
+
+	increment_totals(ctx, oids, &summary);
+
+	switch (type) {
+	case OBJ_COMMIT:
+		total = &ctx->report.by_type[REPORT_TYPE_COMMIT];
+		break;
+
+	case OBJ_TREE:
+		total = &ctx->report.by_type[REPORT_TYPE_TREE];
+		break;
+
+	case OBJ_BLOB:
+		total = &ctx->report.by_type[REPORT_TYPE_BLOB];
+		break;
+
+	case OBJ_TAG:
+		total = &ctx->report.by_type[REPORT_TYPE_TAG];
+		break;
+
+	default:
+		BUG("No other type allowed");
+	}
+
+	total->nr += summary.nr;
+	total->disk_size += summary.disk_size;
+	total->inflated_size += summary.inflated_size;
+	total->num_missing += summary.num_missing;
+}
+
 static int survey_objects_path_walk_fn(const char *path UNUSED,
 				       struct oid_array *oids,
 				       enum object_type type,
@@ -508,10 +628,20 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,

 	increment_object_counts(&ctx->report.reachable_objects,
 				type, oids->nr);
+	increment_object_totals(ctx, oids, type);

 	return 0;
 }

+static void initialize_report(struct survey_context *ctx)
+{
+	CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
+	ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
+	ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
+	ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
+	ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
+}
+
 static void survey_phase_objects(struct survey_context *ctx)
 {
 	struct rev_info revs = REV_INFO_INIT;
@@ -524,12 +654,15 @@ static void survey_phase_objects(struct survey_context *ctx)
 	info.path_fn = survey_objects_path_walk_fn;
 	info.path_fn_data = ctx;

+	initialize_report(ctx);
+
 	repo_init_revisions(ctx->repo, &revs, "");
 	revs.tag_objects = 1;

 	for (int i = 0; i < ctx->ref_array.nr; i++) {
 		struct ref_array_item *item = ctx->ref_array.items[i];
 		add_pending_oid(&revs, NULL, &item->objectname, add_flags);
+		display_progress(ctx->progress, ++(ctx->progress_nr));
 	}

 	walk_objects_by_path(&info);
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -25,10 +25,35 @@ test_expect_success 'create a semi-interesting repo' '
 	git update-ref -d refs/tags/two
 '

+approximate_sizes() {
+	# very simplistic approximate rounding
+	sed -Ee "s/  *(1[0-9][0-9])( |$)/ ~0.1kB\2/g" \
+	  -e "s/  *(4[6-9][0-9]|5[0-6][0-9])( |$)/ ~0.5kB\2/g" \
+	  -e "s/  *(5[6-9][0-9]|6[0-6][0-9])( |$)/ ~0.6kB\2/g" \
+	  -e "s/  *1(4[89][0-9]|5[0-8][0-9])( |$)/ ~1.5kB\2/g" \
+	  -e "s/  *1(69[0-9]|7[0-9][0-9])( |$)/ ~1.7kB\2/g" \
+	  -e "s/  *1(79[0-9]|8[0-9][0-9])( |$)/ ~1.8kB\2/g" \
+	  -e "s/  *2(1[0-9][0-9]|20[0-1])( |$)/ ~2.1kB\2/g" \
+	  -e "s/  *2(3[0-9][0-9]|4[0-1][0-9])( |$)/ ~2.3kB\2/g" \
+	  -e "s/  *2(5[0-9][0-9]|6[0-1][0-9])( |$)/ ~2.5kB\2/g" \
+	 "$@"
+}
+
 test_expect_success 'git survey (default)' '
 	git survey --all-refs >out 2>err &&
 	test_line_count = 0 err &&

+	test_oid_cache <<-EOF &&
+	commits_sizes sha1:~1.5kB | ~2.1kB
+	commits_sizes sha256:~1.8kB | ~2.5kB
+	trees_sizes sha1:~0.5kB | ~1.7kB
+	trees_sizes sha256:~0.6kB | ~2.3kB
+	blobs_sizes sha1:~0.1kB | ~0.1kB
+	blobs_sizes sha256:~0.1kB | ~0.1kB
+	tags_sizes sha1:~0.5kB | ~0.5kB
+	tags_sizes sha256:~0.5kB | ~0.6kB
+	EOF
+
 	tr , " " >expect <<-EOF &&
 	GIT SURVEY for "$(pwd)"
 	-----------------------------------------------------
@@ -50,9 +75,19 @@ test_expect_success 'git survey (default)' '
 	    Commits |    10
 	      Trees |    10
 	      Blobs |    10
+
+	TOTAL OBJECT SIZES BY TYPE
+	===============================================
+	Object Type | Count | Disk Size | Inflated Size
+	------------+-------+-----------+--------------
+	    Commits |    10 | $(test_oid commits_sizes)
+	      Trees |    10 | $(test_oid trees_sizes)
+	      Blobs |    10 | $(test_oid blobs_sizes)
+	       Tags |     4 | $(test_oid tags_sizes)
 	EOF

-	test_cmp expect out
+	approximate_sizes out >out-edited &&
+	test_cmp expect out-edited
 '

 test_done