From 5f6744d3eba6264ce78c8b507b1a1d3c0e540c37 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:34 -0500 Subject: [PATCH 01/24] odb: split `struct odb_transaction` into separate header The current ODB transaction interface is colocated with other ODB interfaces in "odb.{c,h}". Subsequent commits will expand `struct odb_transaction` to support write operations on the transaction directly. To keep things organized and prevent "odb.{c,h}" from becoming more unwieldy, split out `struct odb_transaction` into a separate header. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- Makefile | 1 + builtin/add.c | 1 + builtin/unpack-objects.c | 1 + builtin/update-index.c | 1 + cache-tree.c | 1 + meson.build | 1 + object-file.c | 1 + odb.c | 25 ------------------------- odb.h | 31 ------------------------------- odb/transaction.c | 28 ++++++++++++++++++++++++++++ odb/transaction.h | 38 ++++++++++++++++++++++++++++++++++++++ read-cache.c | 1 + 12 files changed, 74 insertions(+), 56 deletions(-) create mode 100644 odb/transaction.c create mode 100644 odb/transaction.h diff --git a/Makefile b/Makefile index dbf0022054..6342db13e5 100644 --- a/Makefile +++ b/Makefile @@ -1219,6 +1219,7 @@ LIB_OBJS += odb.o LIB_OBJS += odb/source.o LIB_OBJS += odb/source-files.o LIB_OBJS += odb/streaming.o +LIB_OBJS += odb/transaction.o LIB_OBJS += oid-array.o LIB_OBJS += oidmap.o LIB_OBJS += oidset.o diff --git a/builtin/add.c b/builtin/add.c index 7737ab878b..c859f66519 100644 --- a/builtin/add.c +++ b/builtin/add.c @@ -16,6 +16,7 @@ #include "run-command.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "parse-options.h" #include "path.h" #include "preload-index.h" diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 6fc64e9e4b..bc9b1e047e 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -9,6 +9,7 @@ #include "hex.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "object.h" #include "delta.h" #include "pack.h" diff --git a/builtin/update-index.c b/builtin/update-index.c index 8a5907767b..bcc43852ef 100644 --- a/builtin/update-index.c +++ b/builtin/update-index.c @@ -19,6 +19,7 @@ #include "tree-walk.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "refs.h" #include "resolve-undo.h" #include "parse-options.h" diff --git a/cache-tree.c b/cache-tree.c index 60bcc07c3b..f056869cfd 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -10,6 +10,7 @@ #include "cache-tree.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "read-cache-ll.h" #include "replace-object.h" #include "repository.h" diff --git a/meson.build b/meson.build index 8309942d18..6dc23b3af2 100644 --- a/meson.build +++ b/meson.build @@ -405,6 +405,7 @@ libgit_sources = [ 'odb/source.c', 'odb/source-files.c', 'odb/streaming.c', + 'odb/transaction.c', 'oid-array.c', 'oidmap.c', 'oidset.c', diff --git a/object-file.c b/object-file.c index f0b029ff0b..bfbb632cf8 100644 --- a/object-file.c +++ b/object-file.c @@ -21,6 +21,7 @@ #include "object-file.h" #include "odb.h" #include "odb/streaming.h" +#include "odb/transaction.h" #include "oidtree.h" #include "pack.h" #include "packfile.h" diff --git a/odb.c b/odb.c index 350e23f3c0..8c3cbc1b53 100644 --- a/odb.c +++ b/odb.c @@ -1069,28 +1069,3 @@ void odb_reprepare(struct object_database *o) obj_read_unlock(); } - -struct odb_transaction *odb_transaction_begin(struct object_database *odb) -{ - if (odb->transaction) - return NULL; - - odb->transaction = odb_transaction_files_begin(odb->sources); - - return odb->transaction; -} - -void odb_transaction_commit(struct odb_transaction *transaction) -{ - if (!transaction) - return; - - /* - * Ensure the transaction ending matches the pending transaction. - */ - ASSERT(transaction == transaction->source->odb->transaction); - - transaction->commit(transaction); - transaction->source->odb->transaction = NULL; - free(transaction); -} diff --git a/odb.h b/odb.h index 9aee260105..ec5367b13e 100644 --- a/odb.h +++ b/odb.h @@ -35,24 +35,6 @@ struct packed_git; struct packfile_store; struct cached_object_entry; -/* - * A transaction may be started for an object database prior to writing new - * objects via odb_transaction_begin(). These objects are not committed until - * odb_transaction_commit() is invoked. Only a single transaction may be pending - * at a time. - * - * Each ODB source is expected to implement its own transaction handling. - */ -struct odb_transaction; -typedef void (*odb_transaction_commit_fn)(struct odb_transaction *transaction); -struct odb_transaction { - /* The ODB source the transaction is opened against. */ - struct odb_source *source; - - /* The ODB source specific callback invoked to commit a transaction. */ - odb_transaction_commit_fn commit; -}; - /* * The object database encapsulates access to objects in a repository. It * manages one or more sources that store the actual objects which are @@ -154,19 +136,6 @@ void odb_close(struct object_database *o); */ void odb_reprepare(struct object_database *o); -/* - * Starts an ODB transaction. Subsequent objects are written to the transaction - * and not committed until odb_transaction_commit() is invoked on the - * transaction. If the ODB already has a pending transaction, NULL is returned. - */ -struct odb_transaction *odb_transaction_begin(struct object_database *odb); - -/* - * Commits an ODB transaction making the written objects visible. If the - * specified transaction is NULL, the function is a no-op. - */ -void odb_transaction_commit(struct odb_transaction *transaction); - /* * Find source by its object directory path. Returns a `NULL` pointer in case * the source could not be found. diff --git a/odb/transaction.c b/odb/transaction.c new file mode 100644 index 0000000000..9bf3f347dc --- /dev/null +++ b/odb/transaction.c @@ -0,0 +1,28 @@ +#include "git-compat-util.h" +#include "object-file.h" +#include "odb/transaction.h" + +struct odb_transaction *odb_transaction_begin(struct object_database *odb) +{ + if (odb->transaction) + return NULL; + + odb->transaction = odb_transaction_files_begin(odb->sources); + + return odb->transaction; +} + +void odb_transaction_commit(struct odb_transaction *transaction) +{ + if (!transaction) + return; + + /* + * Ensure the transaction ending matches the pending transaction. + */ + ASSERT(transaction == transaction->source->odb->transaction); + + transaction->commit(transaction); + transaction->source->odb->transaction = NULL; + free(transaction); +} diff --git a/odb/transaction.h b/odb/transaction.h new file mode 100644 index 0000000000..a56e392f21 --- /dev/null +++ b/odb/transaction.h @@ -0,0 +1,38 @@ +#ifndef ODB_TRANSACTION_H +#define ODB_TRANSACTION_H + +#include "odb.h" +#include "odb/source.h" + +/* + * A transaction may be started for an object database prior to writing new + * objects via odb_transaction_begin(). These objects are not committed until + * odb_transaction_commit() is invoked. Only a single transaction may be pending + * at a time. + * + * Each ODB source is expected to implement its own transaction handling. + */ +struct odb_transaction; +typedef void (*odb_transaction_commit_fn)(struct odb_transaction *transaction); +struct odb_transaction { + /* The ODB source the transaction is opened against. */ + struct odb_source *source; + + /* The ODB source specific callback invoked to commit a transaction. */ + odb_transaction_commit_fn commit; +}; + +/* + * Starts an ODB transaction. Subsequent objects are written to the transaction + * and not committed until odb_transaction_commit() is invoked on the + * transaction. If the ODB already has a pending transaction, NULL is returned. + */ +struct odb_transaction *odb_transaction_begin(struct object_database *odb); + +/* + * Commits an ODB transaction making the written objects visible. If the + * specified transaction is NULL, the function is a no-op. + */ +void odb_transaction_commit(struct odb_transaction *transaction); + +#endif diff --git a/read-cache.c b/read-cache.c index 5049f9baca..8147c7e94a 100644 --- a/read-cache.c +++ b/read-cache.c @@ -20,6 +20,7 @@ #include "dir.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "oid-array.h" #include "tree.h" #include "commit.h" From 10cdbc7423b2f5f9c666c48fe8a6e6e044595799 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:35 -0500 Subject: [PATCH 02/24] odb/transaction: use pluggable `begin_transaction()` Each ODB source is expected to provide an ODB transaction implementation that should be used when starting a transaction. With d6fc6fe6f8 (odb/source: make `begin_transaction()` function pluggable, 2026-03-05), the `struct odb_source` now provides a pluggable callback for beginning transactions. Use the callback provided by the ODB source accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- odb/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/odb/transaction.c b/odb/transaction.c index 9bf3f347dc..592ac84075 100644 --- a/odb/transaction.c +++ b/odb/transaction.c @@ -1,5 +1,5 @@ #include "git-compat-util.h" -#include "object-file.h" +#include "odb/source.h" #include "odb/transaction.h" struct odb_transaction *odb_transaction_begin(struct object_database *odb) @@ -7,7 +7,7 @@ struct odb_transaction *odb_transaction_begin(struct object_database *odb) if (odb->transaction) return NULL; - odb->transaction = odb_transaction_files_begin(odb->sources); + odb_source_begin_transaction(odb->sources, &odb->transaction); return odb->transaction; } From 970f63519e494b590c807972af6c40477e14bc61 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:36 -0500 Subject: [PATCH 03/24] odb: update `struct odb_write_stream` read() callback The `read()` callback used by `struct odb_write_stream` currently returns a pointer to an internal buffer along with the number of bytes read. This makes buffer ownership unclear and provides no way to report errors. Update the interface to instead require the caller to provide a buffer, and have the callback return the number of bytes written to it or a negative value on error. While at it, also move the `struct odb_write_stream` definition to "odb/streaming.h". Call sites are updated accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- builtin/unpack-objects.c | 20 ++++++++------------ object-file.c | 15 ++++++++++++--- odb.h | 6 +----- odb/streaming.c | 5 +++++ odb/streaming.h | 18 ++++++++++++++++++ 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index bc9b1e047e..64e58e79fd 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -9,6 +9,7 @@ #include "hex.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" #include "odb/transaction.h" #include "object.h" #include "delta.h" @@ -360,24 +361,21 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size, struct input_zstream_data { git_zstream *zstream; - unsigned char buf[8192]; int status; }; -static const void *feed_input_zstream(struct odb_write_stream *in_stream, - unsigned long *readlen) +static ssize_t feed_input_zstream(struct odb_write_stream *in_stream, + unsigned char *buf, size_t buf_len) { struct input_zstream_data *data = in_stream->data; git_zstream *zstream = data->zstream; void *in = fill(1); - if (in_stream->is_finished) { - *readlen = 0; - return NULL; - } + if (in_stream->is_finished) + return 0; - zstream->next_out = data->buf; - zstream->avail_out = sizeof(data->buf); + zstream->next_out = buf; + zstream->avail_out = buf_len; zstream->next_in = in; zstream->avail_in = len; @@ -385,9 +383,7 @@ static const void *feed_input_zstream(struct odb_write_stream *in_stream, in_stream->is_finished = data->status != Z_OK; use(len - zstream->avail_in); - *readlen = sizeof(data->buf) - zstream->avail_out; - - return data->buf; + return buf_len - zstream->avail_out; } static void stream_blob(unsigned long size, unsigned nr) diff --git a/object-file.c b/object-file.c index bfbb632cf8..a1afca23c5 100644 --- a/object-file.c +++ b/object-file.c @@ -1066,6 +1066,7 @@ int odb_source_loose_write_stream(struct odb_source *source, struct git_hash_ctx c, compat_c; struct strbuf tmp_file = STRBUF_INIT; struct strbuf filename = STRBUF_INIT; + unsigned char buf[8192]; int dirlen; char hdr[MAX_HEADER_LEN]; int hdrlen; @@ -1098,9 +1099,17 @@ int odb_source_loose_write_stream(struct odb_source *source, unsigned char *in0 = stream.next_in; if (!stream.avail_in && !in_stream->is_finished) { - const void *in = in_stream->read(in_stream, &stream.avail_in); - stream.next_in = (void *)in; - in0 = (unsigned char *)in; + ssize_t read_len = odb_write_stream_read(in_stream, buf, + sizeof(buf)); + if (read_len < 0) { + close(fd); + err = -1; + goto cleanup; + } + + stream.avail_in = read_len; + stream.next_in = buf; + in0 = buf; /* All data has been read. */ if (in_stream->is_finished) flush = 1; diff --git a/odb.h b/odb.h index ec5367b13e..6faeaa0589 100644 --- a/odb.h +++ b/odb.h @@ -529,11 +529,7 @@ static inline int odb_write_object(struct object_database *odb, return odb_write_object_ext(odb, buf, len, type, oid, NULL, 0); } -struct odb_write_stream { - const void *(*read)(struct odb_write_stream *, unsigned long *len); - void *data; - int is_finished; -}; +struct odb_write_stream; int odb_write_object_stream(struct object_database *odb, struct odb_write_stream *stream, size_t len, diff --git a/odb/streaming.c b/odb/streaming.c index 5927a12954..a68dd2cbe3 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -232,6 +232,11 @@ struct odb_read_stream *odb_read_stream_open(struct object_database *odb, return st; } +ssize_t odb_write_stream_read(struct odb_write_stream *st, void *buf, size_t sz) +{ + return st->read(st, buf, sz); +} + int odb_stream_blob_to_fd(struct object_database *odb, int fd, const struct object_id *oid, diff --git a/odb/streaming.h b/odb/streaming.h index c7861f7e13..65ced911fe 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -47,6 +47,24 @@ int odb_read_stream_close(struct odb_read_stream *stream); */ ssize_t odb_read_stream_read(struct odb_read_stream *stream, void *buf, size_t len); +/* + * A stream that provides an object to be written to the object database without + * loading all of it into memory. + */ +struct odb_write_stream { + ssize_t (*read)(struct odb_write_stream *, unsigned char *, size_t); + void *data; + int is_finished; +}; + +/* + * Read data from the stream into the buffer. Returns 0 when finished and the + * number of bytes read on success. Returns a negative error code in case + * reading from the stream fails. + */ +ssize_t odb_write_stream_read(struct odb_write_stream *stream, void *buf, + size_t len); + /* * Look up the object by its ID and write the full contents to the file * descriptor. The object must be a blob, or the function will fail. When From 8a1f5ecf287bf73c2dee102d726709f444b77c44 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:37 -0500 Subject: [PATCH 04/24] object-file: remove flags from transaction packfile writes The `index_blob_packfile_transaction()` function handles streaming a blob from an fd to compute its object ID and conditionally writes the object directly to a packfile if the INDEX_WRITE_OBJECT flag is set. A subsequent commit will make these packfile object writes part of the transaction interface. Consequently, having the object write be conditional on this flag is a bit awkward. In preparation for this change, introduce a dedicated `hash_blob_stream()` helper that only computes the OID from a `struct odb_write_stream`. This is invoked by `index_fd()` instead when the INDEX_WRITE_OBJECT is not set. The object write performed via `index_blob_packfile_transaction()` is made unconditional accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 128 +++++++++++++++++++++++++++++------------------- odb/streaming.c | 46 +++++++++++++++++ odb/streaming.h | 12 +++++ 3 files changed, 136 insertions(+), 50 deletions(-) diff --git a/object-file.c b/object-file.c index a1afca23c5..a59030911f 100644 --- a/object-file.c +++ b/object-file.c @@ -1397,11 +1397,10 @@ static int already_written(struct odb_transaction_files *transaction, } /* Lazily create backing packfile for the state */ -static void prepare_packfile_transaction(struct odb_transaction_files *transaction, - unsigned flags) +static void prepare_packfile_transaction(struct odb_transaction_files *transaction) { struct transaction_packfile *state = &transaction->packfile; - if (!(flags & INDEX_WRITE_OBJECT) || state->f) + if (state->f) return; state->f = create_tmp_packfile(transaction->base.source->odb->repo, @@ -1414,6 +1413,39 @@ static void prepare_packfile_transaction(struct odb_transaction_files *transacti die_errno("unable to write pack header"); } +static int hash_blob_stream(struct odb_write_stream *stream, + const struct git_hash_algo *hash_algo, + struct object_id *result_oid, size_t size) +{ + unsigned char buf[16384]; + struct git_hash_ctx ctx; + unsigned header_len; + size_t bytes_hashed = 0; + + header_len = format_object_header((char *)buf, sizeof(buf), + OBJ_BLOB, size); + hash_algo->init_fn(&ctx); + git_hash_update(&ctx, buf, header_len); + + while (!stream->is_finished) { + ssize_t read_result = odb_write_stream_read(stream, buf, + sizeof(buf)); + + if (read_result < 0) + return -1; + + git_hash_update(&ctx, buf, read_result); + bytes_hashed += read_result; + } + + if (bytes_hashed != size) + return -1; + + git_hash_final_oid(result_oid, &ctx); + + return 0; +} + /* * Read the contents from fd for size bytes, streaming it to the * packfile in state while updating the hash in ctx. Signal a failure @@ -1431,15 +1463,13 @@ static void prepare_packfile_transaction(struct odb_transaction_files *transacti */ static int stream_blob_to_pack(struct transaction_packfile *state, struct git_hash_ctx *ctx, off_t *already_hashed_to, - int fd, size_t size, const char *path, - unsigned flags) + int fd, size_t size, const char *path) { git_zstream s; unsigned char ibuf[16384]; unsigned char obuf[16384]; unsigned hdrlen; int status = Z_OK; - int write_object = (flags & INDEX_WRITE_OBJECT); off_t offset = 0; git_deflate_init(&s, pack_compression_level); @@ -1474,20 +1504,18 @@ static int stream_blob_to_pack(struct transaction_packfile *state, status = git_deflate(&s, size ? 0 : Z_FINISH); if (!s.avail_out || status == Z_STREAM_END) { - if (write_object) { - size_t written = s.next_out - obuf; + size_t written = s.next_out - obuf; - /* would we bust the size limit? */ - if (state->nr_written && - pack_size_limit_cfg && - pack_size_limit_cfg < state->offset + written) { - git_deflate_abort(&s); - return -1; - } - - hashwrite(state->f, obuf, written); - state->offset += written; + /* would we bust the size limit? */ + if (state->nr_written && + pack_size_limit_cfg && + pack_size_limit_cfg < state->offset + written) { + git_deflate_abort(&s); + return -1; } + + hashwrite(state->f, obuf, written); + state->offset += written; s.next_out = obuf; s.avail_out = sizeof(obuf); } @@ -1575,8 +1603,7 @@ clear_exit: */ static int index_blob_packfile_transaction(struct odb_transaction_files *transaction, struct object_id *result_oid, int fd, - size_t size, const char *path, - unsigned flags) + size_t size, const char *path) { struct transaction_packfile *state = &transaction->packfile; off_t seekback, already_hashed_to; @@ -1584,7 +1611,7 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac unsigned char obuf[16384]; unsigned header_len; struct hashfile_checkpoint checkpoint; - struct pack_idx_entry *idx = NULL; + struct pack_idx_entry *idx; seekback = lseek(fd, 0, SEEK_CUR); if (seekback == (off_t)-1) @@ -1595,33 +1622,26 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac transaction->base.source->odb->repo->hash_algo->init_fn(&ctx); git_hash_update(&ctx, obuf, header_len); - /* Note: idx is non-NULL when we are writing */ - if ((flags & INDEX_WRITE_OBJECT) != 0) { - CALLOC_ARRAY(idx, 1); - - prepare_packfile_transaction(transaction, flags); - hashfile_checkpoint_init(state->f, &checkpoint); - } + CALLOC_ARRAY(idx, 1); + prepare_packfile_transaction(transaction); + hashfile_checkpoint_init(state->f, &checkpoint); already_hashed_to = 0; while (1) { - prepare_packfile_transaction(transaction, flags); - if (idx) { - hashfile_checkpoint(state->f, &checkpoint); - idx->offset = state->offset; - crc32_begin(state->f); - } + prepare_packfile_transaction(transaction); + hashfile_checkpoint(state->f, &checkpoint); + idx->offset = state->offset; + crc32_begin(state->f); + if (!stream_blob_to_pack(state, &ctx, &already_hashed_to, - fd, size, path, flags)) + fd, size, path)) break; /* * Writing this object to the current pack will make * it too big; we need to truncate it, start a new * pack, and write into it. */ - if (!idx) - BUG("should not happen"); hashfile_truncate(state->f, &checkpoint); state->offset = checkpoint.offset; flush_packfile_transaction(transaction); @@ -1629,8 +1649,6 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac return error("cannot seek back"); } git_hash_final_oid(result_oid, &ctx); - if (!idx) - return 0; idx->crc32 = crc32_end(state->f); if (already_written(transaction, result_oid)) { @@ -1668,18 +1686,28 @@ int index_fd(struct index_state *istate, struct object_id *oid, ret = index_core(istate, oid, fd, xsize_t(st->st_size), type, path, flags); } else { - struct object_database *odb = the_repository->objects; - struct odb_transaction_files *files_transaction; - struct odb_transaction *transaction; + struct odb_write_stream stream; + odb_write_stream_from_fd(&stream, fd, xsize_t(st->st_size)); - transaction = odb_transaction_begin(odb); - files_transaction = container_of(odb->transaction, - struct odb_transaction_files, - base); - ret = index_blob_packfile_transaction(files_transaction, oid, fd, - xsize_t(st->st_size), - path, flags); - odb_transaction_commit(transaction); + if (flags & INDEX_WRITE_OBJECT) { + struct object_database *odb = the_repository->objects; + struct odb_transaction_files *files_transaction; + struct odb_transaction *transaction; + + transaction = odb_transaction_begin(odb); + files_transaction = container_of(odb->transaction, + struct odb_transaction_files, + base); + ret = index_blob_packfile_transaction(files_transaction, oid, fd, + xsize_t(st->st_size), path); + odb_transaction_commit(transaction); + } else { + ret = hash_blob_stream(&stream, + the_repository->hash_algo, oid, + xsize_t(st->st_size)); + } + + odb_write_stream_release(&stream); } close(fd); diff --git a/odb/streaming.c b/odb/streaming.c index a68dd2cbe3..20531e864c 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -237,6 +237,11 @@ ssize_t odb_write_stream_read(struct odb_write_stream *st, void *buf, size_t sz) return st->read(st, buf, sz); } +void odb_write_stream_release(struct odb_write_stream *st) +{ + free(st->data); +} + int odb_stream_blob_to_fd(struct object_database *odb, int fd, const struct object_id *oid, @@ -292,3 +297,44 @@ int odb_stream_blob_to_fd(struct object_database *odb, odb_read_stream_close(st); return result; } + +struct read_object_fd_data { + int fd; + size_t remaining; +}; + +static ssize_t read_object_fd(struct odb_write_stream *stream, + unsigned char *buf, size_t len) +{ + struct read_object_fd_data *data = stream->data; + ssize_t read_result; + size_t count; + + if (stream->is_finished) + return 0; + + count = data->remaining < len ? data->remaining : len; + read_result = read_in_full(data->fd, buf, count); + if (read_result < 0 || (size_t)read_result != count) + return -1; + + data->remaining -= count; + if (!data->remaining) + stream->is_finished = 1; + + return read_result; +} + +void odb_write_stream_from_fd(struct odb_write_stream *stream, int fd, + size_t size) +{ + struct read_object_fd_data *data; + + CALLOC_ARRAY(data, 1); + data->fd = fd; + data->remaining = size; + + stream->data = data; + stream->read = read_object_fd; + stream->is_finished = 0; +} diff --git a/odb/streaming.h b/odb/streaming.h index 65ced911fe..2a8cac19a4 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -5,6 +5,7 @@ #define STREAMING_H 1 #include "object.h" +#include "odb.h" struct object_database; struct odb_read_stream; @@ -65,6 +66,11 @@ struct odb_write_stream { ssize_t odb_write_stream_read(struct odb_write_stream *stream, void *buf, size_t len); +/* + * Releases memory allocated for underlying stream data. + */ +void odb_write_stream_release(struct odb_write_stream *stream); + /* * Look up the object by its ID and write the full contents to the file * descriptor. The object must be a blob, or the function will fail. When @@ -82,4 +88,10 @@ int odb_stream_blob_to_fd(struct object_database *odb, struct stream_filter *filter, int can_seek); +/* + * Sets up an ODB write stream that reads from an fd. + */ +void odb_write_stream_from_fd(struct odb_write_stream *stream, int fd, + size_t size); + #endif /* STREAMING_H */ From d4c92e2ac975f256ccc207c65bf46e3be75a2115 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:38 -0500 Subject: [PATCH 05/24] object-file: avoid fd seekback by checking object size upfront In certain scenarios, Git handles writing blobs that exceed "core.bigFileThreshold" differently by streaming the object directly into a packfile. When there is an active ODB transaction, these blobs are streamed to the same packfile instead of using a separate packfile for each. If "pack.packSizeLimit" is configured and streaming another object causes the packfile to exceed the configured limit, the packfile is truncated back to the previous object and the object write is restarted in a new packfile. This works fine, but requires the fd being read from to save a checkpoint so it becomes possible to rewind the input source via seeking back to a known offset at the beginning. In a subsequent commit, blob streaming is converted to use `struct odb_write_stream` as a more generic input source instead of an fd which doesn't provide a mechanism for rewinding. For this use case though, rewinding the fd is not strictly necessary because the inflated size of the object is known and can be used to approximate whether writing the object would cause the packfile to exceed the configured limit prior to writing anything. These blobs written to the packfile are never deltified thus the size difference between what is written versus the inflated size is due to zlib compression. While this does prevent packfiles from being filled to the potential maximum is some cases, it should be good enough and still prevents the packfile from exceeding any configured limit. Use the inflated blob size to determine whether writing an object to a packfile will exceed the configured "pack.packSizeLimit". Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 86 +++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 61 deletions(-) diff --git a/object-file.c b/object-file.c index a59030911f..6d7afdb723 100644 --- a/object-file.c +++ b/object-file.c @@ -1448,29 +1448,17 @@ static int hash_blob_stream(struct odb_write_stream *stream, /* * Read the contents from fd for size bytes, streaming it to the - * packfile in state while updating the hash in ctx. Signal a failure - * by returning a negative value when the resulting pack would exceed - * the pack size limit and this is not the first object in the pack, - * so that the caller can discard what we wrote from the current pack - * by truncating it and opening a new one. The caller will then call - * us again after rewinding the input fd. - * - * The already_hashed_to pointer is kept untouched by the caller to - * make sure we do not hash the same byte when we are called - * again. This way, the caller does not have to checkpoint its hash - * status before calling us just in case we ask it to call us again - * with a new pack. + * packfile in state while updating the hash in ctx. */ -static int stream_blob_to_pack(struct transaction_packfile *state, - struct git_hash_ctx *ctx, off_t *already_hashed_to, - int fd, size_t size, const char *path) +static void stream_blob_to_pack(struct transaction_packfile *state, + struct git_hash_ctx *ctx, int fd, size_t size, + const char *path) { git_zstream s; unsigned char ibuf[16384]; unsigned char obuf[16384]; unsigned hdrlen; int status = Z_OK; - off_t offset = 0; git_deflate_init(&s, pack_compression_level); @@ -1487,15 +1475,9 @@ static int stream_blob_to_pack(struct transaction_packfile *state, if ((size_t)read_result != rsize) die("failed to read %u bytes from '%s'", (unsigned)rsize, path); - offset += rsize; - if (*already_hashed_to < offset) { - size_t hsize = offset - *already_hashed_to; - if (rsize < hsize) - hsize = rsize; - if (hsize) - git_hash_update(ctx, ibuf, hsize); - *already_hashed_to = offset; - } + + git_hash_update(ctx, ibuf, rsize); + s.next_in = ibuf; s.avail_in = rsize; size -= rsize; @@ -1506,14 +1488,6 @@ static int stream_blob_to_pack(struct transaction_packfile *state, if (!s.avail_out || status == Z_STREAM_END) { size_t written = s.next_out - obuf; - /* would we bust the size limit? */ - if (state->nr_written && - pack_size_limit_cfg && - pack_size_limit_cfg < state->offset + written) { - git_deflate_abort(&s); - return -1; - } - hashwrite(state->f, obuf, written); state->offset += written; s.next_out = obuf; @@ -1530,7 +1504,6 @@ static int stream_blob_to_pack(struct transaction_packfile *state, } } git_deflate_end(&s); - return 0; } static void flush_packfile_transaction(struct odb_transaction_files *transaction) @@ -1606,48 +1579,39 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac size_t size, const char *path) { struct transaction_packfile *state = &transaction->packfile; - off_t seekback, already_hashed_to; struct git_hash_ctx ctx; unsigned char obuf[16384]; unsigned header_len; struct hashfile_checkpoint checkpoint; struct pack_idx_entry *idx; - seekback = lseek(fd, 0, SEEK_CUR); - if (seekback == (off_t)-1) - return error("cannot find the current offset"); - header_len = format_object_header((char *)obuf, sizeof(obuf), OBJ_BLOB, size); transaction->base.source->odb->repo->hash_algo->init_fn(&ctx); git_hash_update(&ctx, obuf, header_len); + /* + * If writing another object to the packfile could result in it + * exceeding the configured size limit, flush the current packfile + * transaction. + * + * Note that this uses the inflated object size as an approximation. + * Blob objects written in this manner are not delta-compressed, so + * the difference between the inflated and on-disk size is limited + * to zlib compression and is sufficient for this check. + */ + if (state->nr_written && pack_size_limit_cfg && + pack_size_limit_cfg < state->offset + size) + flush_packfile_transaction(transaction); + CALLOC_ARRAY(idx, 1); prepare_packfile_transaction(transaction); hashfile_checkpoint_init(state->f, &checkpoint); - already_hashed_to = 0; - - while (1) { - prepare_packfile_transaction(transaction); - hashfile_checkpoint(state->f, &checkpoint); - idx->offset = state->offset; - crc32_begin(state->f); - - if (!stream_blob_to_pack(state, &ctx, &already_hashed_to, - fd, size, path)) - break; - /* - * Writing this object to the current pack will make - * it too big; we need to truncate it, start a new - * pack, and write into it. - */ - hashfile_truncate(state->f, &checkpoint); - state->offset = checkpoint.offset; - flush_packfile_transaction(transaction); - if (lseek(fd, seekback, SEEK_SET) == (off_t)-1) - return error("cannot seek back"); - } + hashfile_checkpoint(state->f, &checkpoint); + idx->offset = state->offset; + crc32_begin(state->f); + stream_blob_to_pack(state, &ctx, fd, size, path); git_hash_final_oid(result_oid, &ctx); idx->crc32 = crc32_end(state->f); From 45a75d6187dd8e85470e70aaada6346576333370 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:39 -0500 Subject: [PATCH 06/24] object-file: generalize packfile writes to use odb_write_stream The `index_blob_packfile_transaction()` function streams blob data directly from an fd. This makes it difficult to reuse as part of a generic transactional object writing interface. Refactor the packfile write path to operate on a `struct odb_write_stream`, allowing callers to supply data from arbitrary sources. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 54 +++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/object-file.c b/object-file.c index 6d7afdb723..0d492e6962 100644 --- a/object-file.c +++ b/object-file.c @@ -1447,18 +1447,19 @@ static int hash_blob_stream(struct odb_write_stream *stream, } /* - * Read the contents from fd for size bytes, streaming it to the + * Read the contents from the stream provided, streaming it to the * packfile in state while updating the hash in ctx. */ static void stream_blob_to_pack(struct transaction_packfile *state, - struct git_hash_ctx *ctx, int fd, size_t size, - const char *path) + struct git_hash_ctx *ctx, size_t size, + struct odb_write_stream *stream) { git_zstream s; unsigned char ibuf[16384]; unsigned char obuf[16384]; unsigned hdrlen; int status = Z_OK; + size_t bytes_read = 0; git_deflate_init(&s, pack_compression_level); @@ -1467,23 +1468,21 @@ static void stream_blob_to_pack(struct transaction_packfile *state, s.avail_out = sizeof(obuf) - hdrlen; while (status != Z_STREAM_END) { - if (size && !s.avail_in) { - size_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf); - ssize_t read_result = read_in_full(fd, ibuf, rsize); - if (read_result < 0) - die_errno("failed to read from '%s'", path); - if ((size_t)read_result != rsize) - die("failed to read %u bytes from '%s'", - (unsigned)rsize, path); + if (!stream->is_finished && !s.avail_in) { + ssize_t rsize = odb_write_stream_read(stream, ibuf, + sizeof(ibuf)); + + if (rsize < 0) + die("failed to read blob data"); git_hash_update(ctx, ibuf, rsize); s.next_in = ibuf; s.avail_in = rsize; - size -= rsize; + bytes_read += rsize; } - status = git_deflate(&s, size ? 0 : Z_FINISH); + status = git_deflate(&s, stream->is_finished ? Z_FINISH : 0); if (!s.avail_out || status == Z_STREAM_END) { size_t written = s.next_out - obuf; @@ -1503,6 +1502,11 @@ static void stream_blob_to_pack(struct transaction_packfile *state, die("unexpected deflate failure: %d", status); } } + + if (bytes_read != size) + die("read %" PRIuMAX " bytes of blob data, but expected %" PRIuMAX " bytes", + (uintmax_t)bytes_read, (uintmax_t)size); + git_deflate_end(&s); } @@ -1574,10 +1578,13 @@ clear_exit: * binary blobs, they generally do not want to get any conversion, and * callers should avoid this code path when filters are requested. */ -static int index_blob_packfile_transaction(struct odb_transaction_files *transaction, - struct object_id *result_oid, int fd, - size_t size, const char *path) +static int index_blob_packfile_transaction(struct odb_transaction *base, + struct odb_write_stream *stream, + size_t size, struct object_id *result_oid) { + struct odb_transaction_files *transaction = container_of(base, + struct odb_transaction_files, + base); struct transaction_packfile *state = &transaction->packfile; struct git_hash_ctx ctx; unsigned char obuf[16384]; @@ -1611,7 +1618,7 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac hashfile_checkpoint(state->f, &checkpoint); idx->offset = state->offset; crc32_begin(state->f); - stream_blob_to_pack(state, &ctx, fd, size, path); + stream_blob_to_pack(state, &ctx, size, stream); git_hash_final_oid(result_oid, &ctx); idx->crc32 = crc32_end(state->f); @@ -1655,15 +1662,12 @@ int index_fd(struct index_state *istate, struct object_id *oid, if (flags & INDEX_WRITE_OBJECT) { struct object_database *odb = the_repository->objects; - struct odb_transaction_files *files_transaction; - struct odb_transaction *transaction; + struct odb_transaction *transaction = odb_transaction_begin(odb); - transaction = odb_transaction_begin(odb); - files_transaction = container_of(odb->transaction, - struct odb_transaction_files, - base); - ret = index_blob_packfile_transaction(files_transaction, oid, fd, - xsize_t(st->st_size), path); + ret = index_blob_packfile_transaction(odb->transaction, + &stream, + xsize_t(st->st_size), + oid); odb_transaction_commit(transaction); } else { ret = hash_blob_stream(&stream, From 08b6afb2a2dbf762e3d9fa7abd78090b9afbd1a8 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 14 May 2026 13:37:40 -0500 Subject: [PATCH 07/24] odb/transaction: make `write_object_stream()` pluggable How an ODB transaction handles writing objects is expected to vary between implementations. Introduce a new `write_object_stream()` callback in `struct odb_transaction` to make this function pluggable. Rename `index_blob_packfile_transaction()` to `odb_transaction_files_write_object_stream()` and wire it up for use with `struct odb_transaction_files` accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 16 +++++++++------- odb/transaction.c | 7 +++++++ odb/transaction.h | 25 ++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/object-file.c b/object-file.c index 0d492e6962..23f665df90 100644 --- a/object-file.c +++ b/object-file.c @@ -1578,9 +1578,10 @@ clear_exit: * binary blobs, they generally do not want to get any conversion, and * callers should avoid this code path when filters are requested. */ -static int index_blob_packfile_transaction(struct odb_transaction *base, - struct odb_write_stream *stream, - size_t size, struct object_id *result_oid) +static int odb_transaction_files_write_object_stream(struct odb_transaction *base, + struct odb_write_stream *stream, + size_t size, + struct object_id *result_oid) { struct odb_transaction_files *transaction = container_of(base, struct odb_transaction_files, @@ -1664,10 +1665,10 @@ int index_fd(struct index_state *istate, struct object_id *oid, struct object_database *odb = the_repository->objects; struct odb_transaction *transaction = odb_transaction_begin(odb); - ret = index_blob_packfile_transaction(odb->transaction, - &stream, - xsize_t(st->st_size), - oid); + ret = odb_transaction_write_object_stream(odb->transaction, + &stream, + xsize_t(st->st_size), + oid); odb_transaction_commit(transaction); } else { ret = hash_blob_stream(&stream, @@ -2132,6 +2133,7 @@ struct odb_transaction *odb_transaction_files_begin(struct odb_source *source) transaction = xcalloc(1, sizeof(*transaction)); transaction->base.source = source; transaction->base.commit = odb_transaction_files_commit; + transaction->base.write_object_stream = odb_transaction_files_write_object_stream; return &transaction->base; } diff --git a/odb/transaction.c b/odb/transaction.c index 592ac84075..b16e07aebf 100644 --- a/odb/transaction.c +++ b/odb/transaction.c @@ -26,3 +26,10 @@ void odb_transaction_commit(struct odb_transaction *transaction) transaction->source->odb->transaction = NULL; free(transaction); } + +int odb_transaction_write_object_stream(struct odb_transaction *transaction, + struct odb_write_stream *stream, + size_t len, struct object_id *oid) +{ + return transaction->write_object_stream(transaction, stream, len, oid); +} diff --git a/odb/transaction.h b/odb/transaction.h index a56e392f21..854fda06f5 100644 --- a/odb/transaction.h +++ b/odb/transaction.h @@ -12,14 +12,24 @@ * * Each ODB source is expected to implement its own transaction handling. */ -struct odb_transaction; -typedef void (*odb_transaction_commit_fn)(struct odb_transaction *transaction); struct odb_transaction { /* The ODB source the transaction is opened against. */ struct odb_source *source; /* The ODB source specific callback invoked to commit a transaction. */ - odb_transaction_commit_fn commit; + void (*commit)(struct odb_transaction *transaction); + + /* + * This callback is expected to write the given object stream into + * the ODB transaction. Note that for now, only blobs support streaming. + * + * The resulting object ID shall be written into the out pointer. The + * callback is expected to return 0 on success, a negative error code + * otherwise. + */ + int (*write_object_stream)(struct odb_transaction *transaction, + struct odb_write_stream *stream, size_t len, + struct object_id *oid); }; /* @@ -35,4 +45,13 @@ struct odb_transaction *odb_transaction_begin(struct object_database *odb); */ void odb_transaction_commit(struct odb_transaction *transaction); +/* + * Writes the object in the provided stream into the transaction. The resulting + * object ID is written into the out pointer. Returns 0 on success, a negative + * error code otherwise. + */ +int odb_transaction_write_object_stream(struct odb_transaction *transaction, + struct odb_write_stream *stream, + size_t len, struct object_id *oid); + #endif From 822d403651aa6baff064095f98d8d8349d876eb8 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:31 +0200 Subject: [PATCH 08/24] odb: introduce "in-memory" source Next to our typical object database sources, each object database also has an implicit source of "cached" objects. These cached objects only exist in memory and some use cases: - They contain evergreen objects that we expect to always exist, like for example the empty tree. - They can be used to store temporary objects that we don't want to persist to disk, which is used by git-blame(1) to create a fake worktree commit. Overall, their use is somewhat restricted though. For example, we don't provide the ability to use it as a temporary object database source that allows the user to write objects, but discard them after Git exists. So while these cached objects behave almost like a source, they aren't used as one. This is about to change over the following commits, where we will turn cached objects into a new "in-memory" source. This will allow us to use it exactly the same as any other source by providing the same common interface as the "files" source. For now, the in-memory source only hosts the cached objects and doesn't provide any logic yet. This will change with subsequent commits, where we move respective functionality into the source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- Makefile | 1 + meson.build | 1 + odb.c | 21 +++++++++++++-------- odb.h | 4 ++-- odb/source-inmemory.c | 12 ++++++++++++ odb/source-inmemory.h | 35 +++++++++++++++++++++++++++++++++++ odb/source.h | 3 +++ 7 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 odb/source-inmemory.c create mode 100644 odb/source-inmemory.h diff --git a/Makefile b/Makefile index 22a8993482..3cda12c455 100644 --- a/Makefile +++ b/Makefile @@ -1218,6 +1218,7 @@ LIB_OBJS += object.o LIB_OBJS += odb.o LIB_OBJS += odb/source.o LIB_OBJS += odb/source-files.o +LIB_OBJS += odb/source-inmemory.o LIB_OBJS += odb/streaming.o LIB_OBJS += odb/transaction.o LIB_OBJS += oid-array.o diff --git a/meson.build b/meson.build index 6dc23b3af2..ffa73ce7ce 100644 --- a/meson.build +++ b/meson.build @@ -404,6 +404,7 @@ libgit_sources = [ 'odb.c', 'odb/source.c', 'odb/source-files.c', + 'odb/source-inmemory.c', 'odb/streaming.c', 'odb/transaction.c', 'oid-array.c', diff --git a/odb.c b/odb.c index 40a5e9c4e0..60e1eead25 100644 --- a/odb.c +++ b/odb.c @@ -14,6 +14,7 @@ #include "object-file.h" #include "object-name.h" #include "odb.h" +#include "odb/source-inmemory.h" #include "packfile.h" #include "path.h" #include "promisor-remote.h" @@ -53,9 +54,9 @@ static const struct cached_object *find_cached_object(struct object_database *ob .type = OBJ_TREE, .buf = "", }; - const struct cached_object_entry *co = object_store->cached_objects; + const struct cached_object_entry *co = object_store->inmemory_objects->objects; - for (size_t i = 0; i < object_store->cached_object_nr; i++, co++) + for (size_t i = 0; i < object_store->inmemory_objects->objects_nr; i++, co++) if (oideq(&co->oid, oid)) return &co->value; @@ -792,9 +793,10 @@ int odb_pretend_object(struct object_database *odb, find_cached_object(odb, oid)) return 0; - ALLOC_GROW(odb->cached_objects, - odb->cached_object_nr + 1, odb->cached_object_alloc); - co = &odb->cached_objects[odb->cached_object_nr++]; + ALLOC_GROW(odb->inmemory_objects->objects, + odb->inmemory_objects->objects_nr + 1, + odb->inmemory_objects->objects_alloc); + co = &odb->inmemory_objects->objects[odb->inmemory_objects->objects_nr++]; co->value.size = len; co->value.type = type; co_buf = xmalloc(len); @@ -1083,6 +1085,7 @@ struct object_database *odb_new(struct repository *repo, o->sources = odb_source_new(o, primary_source, true); o->sources_tail = &o->sources->next; o->alternate_db = xstrdup_or_null(secondary_sources); + o->inmemory_objects = odb_source_inmemory_new(o); free(to_free); @@ -1123,9 +1126,11 @@ void odb_free(struct object_database *o) odb_close(o); odb_free_sources(o); - for (size_t i = 0; i < o->cached_object_nr; i++) - free((char *) o->cached_objects[i].value.buf); - free(o->cached_objects); + for (size_t i = 0; i < o->inmemory_objects->objects_nr; i++) + free((char *) o->inmemory_objects->objects[i].value.buf); + free(o->inmemory_objects->objects); + free(o->inmemory_objects->base.path); + free(o->inmemory_objects); string_list_clear(&o->submodule_source_paths, 0); diff --git a/odb.h b/odb.h index 9eb8355aca..c3a7edf9c8 100644 --- a/odb.h +++ b/odb.h @@ -8,6 +8,7 @@ #include "thread-utils.h" struct cached_object_entry; +struct odb_source_inmemory; struct packed_git; struct repository; struct strbuf; @@ -80,8 +81,7 @@ struct object_database { * to write them into the object store (e.g. a browse-only * application). */ - struct cached_object_entry *cached_objects; - size_t cached_object_nr, cached_object_alloc; + struct odb_source_inmemory *inmemory_objects; /* * A fast, rough count of the number of objects in the repository. diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c new file mode 100644 index 0000000000..c7ac5c24f0 --- /dev/null +++ b/odb/source-inmemory.c @@ -0,0 +1,12 @@ +#include "git-compat-util.h" +#include "odb/source-inmemory.h" + +struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) +{ + struct odb_source_inmemory *source; + + CALLOC_ARRAY(source, 1); + odb_source_init(&source->base, odb, ODB_SOURCE_INMEMORY, "source", false); + + return source; +} diff --git a/odb/source-inmemory.h b/odb/source-inmemory.h new file mode 100644 index 0000000000..15db068ef7 --- /dev/null +++ b/odb/source-inmemory.h @@ -0,0 +1,35 @@ +#ifndef ODB_SOURCE_INMEMORY_H +#define ODB_SOURCE_INMEMORY_H + +#include "odb/source.h" + +struct cached_object_entry; + +/* + * An in-memory source that you can write objects to that shall be made + * available for reading, but that shouldn't ever be persisted to disk. Note + * that any objects written to this source will be stored in memory, so the + * number of objects you can store is limited by available system memory. + */ +struct odb_source_inmemory { + struct odb_source base; + + struct cached_object_entry *objects; + size_t objects_nr, objects_alloc; +}; + +/* Create a new in-memory object database source. */ +struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb); + +/* + * Cast the given object database source to the in-memory backend. This will + * cause a BUG in case the source doesn't use this backend. + */ +static inline struct odb_source_inmemory *odb_source_inmemory_downcast(struct odb_source *source) +{ + if (source->type != ODB_SOURCE_INMEMORY) + BUG("trying to downcast source of type '%d' to in-memory", source->type); + return container_of(source, struct odb_source_inmemory, base); +} + +#endif diff --git a/odb/source.h b/odb/source.h index f706e0608a..0a440884e4 100644 --- a/odb/source.h +++ b/odb/source.h @@ -13,6 +13,9 @@ enum odb_source_type { /* The "files" backend that uses loose objects and packfiles. */ ODB_SOURCE_FILES, + + /* The "in-memory" backend that stores objects in memory. */ + ODB_SOURCE_INMEMORY, }; struct object_id; From 8caa2e090f1b83df7c0fc82ed7f7c8772f3ec5f4 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:32 +0200 Subject: [PATCH 09/24] odb/source-inmemory: implement `free()` callback Implement the `free()` callback function for the "in-memory" source. Note that this requires us to define `struct cached_object_entry` in "odb/source-inmemory.h", as it is accessed in both "odb.c" and "odb/source-inmemory.c" now. This will be fixed in subsequent commits though. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 25 ++++--------------------- odb/source-inmemory.c | 12 ++++++++++++ odb/source-inmemory.h | 9 ++++++++- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/odb.c b/odb.c index 60e1eead25..1d65825ed3 100644 --- a/odb.c +++ b/odb.c @@ -32,21 +32,6 @@ KHASH_INIT(odb_path_map, const char * /* key: odb_path */, struct odb_source *, 1, fspathhash, fspatheq) -/* - * This is meant to hold a *small* number of objects that you would - * want odb_read_object() to be able to return, but yet you do not want - * to write them into the object store (e.g. a browse-only - * application). - */ -struct cached_object_entry { - struct object_id oid; - struct cached_object { - enum object_type type; - const void *buf; - unsigned long size; - } value; -}; - static const struct cached_object *find_cached_object(struct object_database *object_store, const struct object_id *oid) { @@ -1109,6 +1094,10 @@ static void odb_free_sources(struct object_database *o) odb_source_free(o->sources); o->sources = next; } + + odb_source_free(&o->inmemory_objects->base); + o->inmemory_objects = NULL; + kh_destroy_odb_path_map(o->source_by_path); o->source_by_path = NULL; } @@ -1126,12 +1115,6 @@ void odb_free(struct object_database *o) odb_close(o); odb_free_sources(o); - for (size_t i = 0; i < o->inmemory_objects->objects_nr; i++) - free((char *) o->inmemory_objects->objects[i].value.buf); - free(o->inmemory_objects->objects); - free(o->inmemory_objects->base.path); - free(o->inmemory_objects); - string_list_clear(&o->submodule_source_paths, 0); free(o); diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index c7ac5c24f0..ccbb622eae 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -1,6 +1,16 @@ #include "git-compat-util.h" #include "odb/source-inmemory.h" +static void odb_source_inmemory_free(struct odb_source *source) +{ + struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); + for (size_t i = 0; i < inmemory->objects_nr; i++) + free((char *) inmemory->objects[i].value.buf); + free(inmemory->objects); + free(inmemory->base.path); + free(inmemory); +} + struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) { struct odb_source_inmemory *source; @@ -8,5 +18,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) CALLOC_ARRAY(source, 1); odb_source_init(&source->base, odb, ODB_SOURCE_INMEMORY, "source", false); + source->base.free = odb_source_inmemory_free; + return source; } diff --git a/odb/source-inmemory.h b/odb/source-inmemory.h index 15db068ef7..d1b05a3996 100644 --- a/odb/source-inmemory.h +++ b/odb/source-inmemory.h @@ -3,7 +3,14 @@ #include "odb/source.h" -struct cached_object_entry; +struct cached_object_entry { + struct object_id oid; + struct cached_object { + enum object_type type; + const void *buf; + unsigned long size; + } value; +}; /* * An in-memory source that you can write objects to that shall be made From 87de1b31e04fc5ce4f47c2a8dbfdc90b25e5bdbe Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:33 +0200 Subject: [PATCH 10/24] odb: fix unnecessary call to `find_cached_object()` The function `odb_pretend_object()` writes an object into the in-memory object database source. The effect of this is that the object will now become readable, but it won't ever be persisted to disk. Before storing the object, we first verify whether the object already exists. This is done by calling `odb_has_object()` to check all sources, followed by `find_cached_object()` to check whether we have already stored the object in our in-memory source. This is unnecessary though, as `odb_has_object()` already checks the in-memory source transitively via: - `odb_has_object()` - `odb_read_object_info_extended()` - `do_oid_object_info_extended()` - `find_cached_object()` Drop the explicit call to `find_cached_object()`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/odb.c b/odb.c index 1d65825ed3..ea3fcf5e11 100644 --- a/odb.c +++ b/odb.c @@ -774,8 +774,7 @@ int odb_pretend_object(struct object_database *odb, char *co_buf; hash_object_file(odb->repo->hash_algo, buf, len, type, oid); - if (odb_has_object(odb, oid, 0) || - find_cached_object(odb, oid)) + if (odb_has_object(odb, oid, 0)) return 0; ALLOC_GROW(odb->inmemory_objects->objects, From ec45c1e8bf8958bdcca2b324573d02ac934c51ea Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:34 +0200 Subject: [PATCH 11/24] odb/source-inmemory: implement `read_object_info()` callback Implement the `read_object_info()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 39 +------------------------------ odb/source-inmemory.c | 53 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/odb.c b/odb.c index ea3fcf5e11..6a3912adac 100644 --- a/odb.c +++ b/odb.c @@ -32,25 +32,6 @@ KHASH_INIT(odb_path_map, const char * /* key: odb_path */, struct odb_source *, 1, fspathhash, fspatheq) -static const struct cached_object *find_cached_object(struct object_database *object_store, - const struct object_id *oid) -{ - static const struct cached_object empty_tree = { - .type = OBJ_TREE, - .buf = "", - }; - const struct cached_object_entry *co = object_store->inmemory_objects->objects; - - for (size_t i = 0; i < object_store->inmemory_objects->objects_nr; i++, co++) - if (oideq(&co->oid, oid)) - return &co->value; - - if (oid->algo && oideq(oid, hash_algos[oid->algo].empty_tree)) - return &empty_tree; - - return NULL; -} - int odb_mkstemp(struct object_database *odb, struct strbuf *temp_filename, const char *pattern) { @@ -570,7 +551,6 @@ static int do_oid_object_info_extended(struct object_database *odb, const struct object_id *oid, struct object_info *oi, unsigned flags) { - const struct cached_object *co; const struct object_id *real = oid; int already_retried = 0; @@ -580,25 +560,8 @@ static int do_oid_object_info_extended(struct object_database *odb, if (is_null_oid(real)) return -1; - co = find_cached_object(odb, real); - if (co) { - if (oi) { - if (oi->typep) - *(oi->typep) = co->type; - if (oi->sizep) - *(oi->sizep) = co->size; - if (oi->disk_sizep) - *(oi->disk_sizep) = 0; - if (oi->delta_base_oid) - oidclr(oi->delta_base_oid, odb->repo->hash_algo); - if (oi->contentp) - *oi->contentp = xmemdupz(co->buf, co->size); - if (oi->mtimep) - *oi->mtimep = 0; - oi->whence = OI_CACHED; - } + if (!odb_source_read_object_info(&odb->inmemory_objects->base, oid, oi, flags)) return 0; - } odb_prepare_alternates(odb); diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index ccbb622eae..12c80f9b34 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -1,5 +1,57 @@ #include "git-compat-util.h" +#include "odb.h" #include "odb/source-inmemory.h" +#include "repository.h" + +static const struct cached_object *find_cached_object(struct odb_source_inmemory *source, + const struct object_id *oid) +{ + static const struct cached_object empty_tree = { + .type = OBJ_TREE, + .buf = "", + }; + const struct cached_object_entry *co = source->objects; + + for (size_t i = 0; i < source->objects_nr; i++, co++) + if (oideq(&co->oid, oid)) + return &co->value; + + if (oid->algo && oideq(oid, hash_algos[oid->algo].empty_tree)) + return &empty_tree; + + return NULL; +} + +static int odb_source_inmemory_read_object_info(struct odb_source *source, + const struct object_id *oid, + struct object_info *oi, + enum object_info_flags flags UNUSED) +{ + struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); + const struct cached_object *object; + + object = find_cached_object(inmemory, oid); + if (!object) + return -1; + + if (oi) { + if (oi->typep) + *(oi->typep) = object->type; + if (oi->sizep) + *(oi->sizep) = object->size; + if (oi->disk_sizep) + *(oi->disk_sizep) = 0; + if (oi->delta_base_oid) + oidclr(oi->delta_base_oid, source->odb->repo->hash_algo); + if (oi->contentp) + *oi->contentp = xmemdupz(object->buf, object->size); + if (oi->mtimep) + *oi->mtimep = 0; + oi->whence = OI_CACHED; + } + + return 0; +} static void odb_source_inmemory_free(struct odb_source *source) { @@ -19,6 +71,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) odb_source_init(&source->base, odb, ODB_SOURCE_INMEMORY, "source", false); source->base.free = odb_source_inmemory_free; + source->base.read_object_info = odb_source_inmemory_read_object_info; return source; } From 8d9c1e421ce36be06ff304ce166593cf2e4ef66f Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:35 +0200 Subject: [PATCH 12/24] odb/source-inmemory: implement `read_object_stream()` callback Implement the `read_object_stream()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 52 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 12c80f9b34..39f0e799c7 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -1,6 +1,7 @@ #include "git-compat-util.h" #include "odb.h" #include "odb/source-inmemory.h" +#include "odb/streaming.h" #include "repository.h" static const struct cached_object *find_cached_object(struct odb_source_inmemory *source, @@ -53,6 +54,56 @@ static int odb_source_inmemory_read_object_info(struct odb_source *source, return 0; } +struct odb_read_stream_inmemory { + struct odb_read_stream base; + const unsigned char *buf; + size_t offset; +}; + +static ssize_t odb_read_stream_inmemory_read(struct odb_read_stream *stream, + char *buf, size_t buf_len) +{ + struct odb_read_stream_inmemory *inmemory = + container_of(stream, struct odb_read_stream_inmemory, base); + size_t bytes = buf_len; + + if (buf_len > inmemory->base.size - inmemory->offset) + bytes = inmemory->base.size - inmemory->offset; + + memcpy(buf, inmemory->buf + inmemory->offset, bytes); + inmemory->offset += bytes; + + return bytes; +} + +static int odb_read_stream_inmemory_close(struct odb_read_stream *stream UNUSED) +{ + return 0; +} + +static int odb_source_inmemory_read_object_stream(struct odb_read_stream **out, + struct odb_source *source, + const struct object_id *oid) +{ + struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); + struct odb_read_stream_inmemory *stream; + const struct cached_object *object; + + object = find_cached_object(inmemory, oid); + if (!object) + return -1; + + CALLOC_ARRAY(stream, 1); + stream->base.read = odb_read_stream_inmemory_read; + stream->base.close = odb_read_stream_inmemory_close; + stream->base.size = object->size; + stream->base.type = object->type; + stream->buf = object->buf; + + *out = &stream->base; + return 0; +} + static void odb_source_inmemory_free(struct odb_source *source) { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); @@ -72,6 +123,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.free = odb_source_inmemory_free; source->base.read_object_info = odb_source_inmemory_read_object_info; + source->base.read_object_stream = odb_source_inmemory_read_object_stream; return source; } From f611f4ba41de07a89649c74c01477cf55b20bc31 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:36 +0200 Subject: [PATCH 13/24] odb/source-inmemory: implement `write_object()` callback Implement the `write_object()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 16 ++-------------- odb/source-inmemory.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/odb.c b/odb.c index 6a3912adac..24e929f03c 100644 --- a/odb.c +++ b/odb.c @@ -733,24 +733,12 @@ int odb_pretend_object(struct object_database *odb, void *buf, unsigned long len, enum object_type type, struct object_id *oid) { - struct cached_object_entry *co; - char *co_buf; - hash_object_file(odb->repo->hash_algo, buf, len, type, oid); if (odb_has_object(odb, oid, 0)) return 0; - ALLOC_GROW(odb->inmemory_objects->objects, - odb->inmemory_objects->objects_nr + 1, - odb->inmemory_objects->objects_alloc); - co = &odb->inmemory_objects->objects[odb->inmemory_objects->objects_nr++]; - co->value.size = len; - co->value.type = type; - co_buf = xmalloc(len); - memcpy(co_buf, buf, len); - co->value.buf = co_buf; - oidcpy(&co->oid, oid); - return 0; + return odb_source_write_object(&odb->inmemory_objects->base, + buf, len, type, oid, NULL, 0); } void *odb_read_object(struct object_database *odb, diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 39f0e799c7..4848011df5 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -1,4 +1,5 @@ #include "git-compat-util.h" +#include "object-file.h" #include "odb.h" #include "odb/source-inmemory.h" #include "odb/streaming.h" @@ -104,6 +105,29 @@ static int odb_source_inmemory_read_object_stream(struct odb_read_stream **out, return 0; } +static int odb_source_inmemory_write_object(struct odb_source *source, + const void *buf, unsigned long len, + enum object_type type, + struct object_id *oid, + struct object_id *compat_oid UNUSED, + enum odb_write_object_flags flags UNUSED) +{ + struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); + struct cached_object_entry *object; + + hash_object_file(source->odb->repo->hash_algo, buf, len, type, oid); + + ALLOC_GROW(inmemory->objects, inmemory->objects_nr + 1, + inmemory->objects_alloc); + object = &inmemory->objects[inmemory->objects_nr++]; + object->value.size = len; + object->value.type = type; + object->value.buf = xmemdupz(buf, len); + oidcpy(&object->oid, oid); + + return 0; +} + static void odb_source_inmemory_free(struct odb_source *source) { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); @@ -124,6 +148,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.free = odb_source_inmemory_free; source->base.read_object_info = odb_source_inmemory_read_object_info; source->base.read_object_stream = odb_source_inmemory_read_object_stream; + source->base.write_object = odb_source_inmemory_write_object; return source; } From 197c8a85e37720e54afda1ed92bf8b393cca92f1 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:37 +0200 Subject: [PATCH 14/24] odb/source-inmemory: implement `write_object_stream()` callback Implement the `write_object_stream()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 4848011df5..d05a13df45 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -128,6 +128,45 @@ static int odb_source_inmemory_write_object(struct odb_source *source, return 0; } +static int odb_source_inmemory_write_object_stream(struct odb_source *source, + struct odb_write_stream *stream, + size_t len, + struct object_id *oid) +{ + char buf[16384]; + size_t total_read = 0; + char *data; + int ret; + + CALLOC_ARRAY(data, len); + while (!stream->is_finished) { + ssize_t bytes_read; + + bytes_read = odb_write_stream_read(stream, buf, sizeof(buf)); + if (total_read + bytes_read > len) { + ret = error("object stream yielded more bytes than expected"); + goto out; + } + + memcpy(data + total_read, buf, bytes_read); + total_read += bytes_read; + } + + if (total_read != len) { + ret = error("object stream yielded less bytes than expected"); + goto out; + } + + ret = odb_source_inmemory_write_object(source, data, len, OBJ_BLOB, oid, + NULL, 0); + if (ret < 0) + goto out; + +out: + free(data); + return ret; +} + static void odb_source_inmemory_free(struct odb_source *source) { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); @@ -149,6 +188,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.read_object_info = odb_source_inmemory_read_object_info; source->base.read_object_stream = odb_source_inmemory_read_object_stream; source->base.write_object = odb_source_inmemory_write_object; + source->base.write_object_stream = odb_source_inmemory_write_object_stream; return source; } From 550d7b7c89a9cf80794c72e8c7d036164a5b1927 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:38 +0200 Subject: [PATCH 15/24] cbtree: allow using arbitrary wrapper structures for nodes The cbtree subsystem allows the user to store arbitrary data in a prefix-free set of strings. This is used by us to store object IDs in a way that we can easily iterate through them in lexicograph order, and so that we can easily perform lookups with shortened object IDs. In its current form, it is not easily possible to store arbitrary data with the tree nodes. There are a couple of approaches such a caller could try to use, but none of them really work: - One may embed the `struct cb_node` in a custom structure. This does not work though as `struct cb_node` contains a flex array, and embedding such a struct in another struct is forbidden. - One may use a `union` over `struct cb_node` and ones own data type, which _is_ allowed even if the struct contains a flex array. This does not work though, as the compiler may align members of the struct so that the node key would not immediately start where the flex array starts. - One may allocate `struct cb_node` such that it has room for both its key and the custom data. This has the downside though that if the custom data is itself a pointer to allocated memory, then the leak checker will not consider the pointer to be alive anymore. Refactor the cbtree to drop the flex array and instead take in an explicit offset for where to find the key, which allows the caller to embed `struct cb_node` is a wrapper struct. Note that this change has the downside that we now have a bit of padding in our structure, which grows the size from 60 to 64 bytes on a 64 bit system. On the other hand though, it allows us to get rid of the memory copies that we previously had to do to ensure proper alignment. This seems like a reasonable tradeoff. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- cbtree.c | 25 ++++++++++++++++++------- cbtree.h | 17 +++++++++-------- oidtree.c | 33 ++++++++++++++------------------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/cbtree.c b/cbtree.c index 4ab794bddc..8f5edbb80a 100644 --- a/cbtree.c +++ b/cbtree.c @@ -7,6 +7,11 @@ #include "git-compat-util.h" #include "cbtree.h" +static inline uint8_t *cb_node_key(struct cb_tree *t, struct cb_node *node) +{ + return (uint8_t *) node + t->key_offset; +} + static struct cb_node *cb_node_of(const void *p) { return (struct cb_node *)((uintptr_t)p - 1); @@ -33,6 +38,7 @@ struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen) uint8_t c; int newdirection; struct cb_node **wherep, *p; + uint8_t *node_key, *p_key; assert(!((uintptr_t)node & 1)); /* allocations must be aligned */ @@ -41,23 +47,26 @@ struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen) return NULL; /* success */ } + node_key = cb_node_key(t, node); + /* see if a node already exists */ - p = cb_internal_best_match(t->root, node->k, klen); + p = cb_internal_best_match(t->root, node_key, klen); + p_key = cb_node_key(t, p); /* find first differing byte */ for (newbyte = 0; newbyte < klen; newbyte++) { - if (p->k[newbyte] != node->k[newbyte]) + if (p_key[newbyte] != node_key[newbyte]) goto different_byte_found; } return p; /* element exists, let user deal with it */ different_byte_found: - newotherbits = p->k[newbyte] ^ node->k[newbyte]; + newotherbits = p_key[newbyte] ^ node_key[newbyte]; newotherbits |= newotherbits >> 1; newotherbits |= newotherbits >> 2; newotherbits |= newotherbits >> 4; newotherbits = (newotherbits & ~(newotherbits >> 1)) ^ 255; - c = p->k[newbyte]; + c = p_key[newbyte]; newdirection = (1 + (newotherbits | c)) >> 8; node->byte = newbyte; @@ -78,7 +87,7 @@ different_byte_found: break; if (q->byte == newbyte && q->otherbits > newotherbits) break; - c = q->byte < klen ? node->k[q->byte] : 0; + c = q->byte < klen ? node_key[q->byte] : 0; direction = (1 + (q->otherbits | c)) >> 8; wherep = q->child + direction; } @@ -93,7 +102,7 @@ struct cb_node *cb_lookup(struct cb_tree *t, const uint8_t *k, size_t klen) { struct cb_node *p = cb_internal_best_match(t->root, k, klen); - return p && !memcmp(p->k, k, klen) ? p : NULL; + return p && !memcmp(cb_node_key(t, p), k, klen) ? p : NULL; } static int cb_descend(struct cb_node *p, cb_iter fn, void *arg) @@ -115,6 +124,7 @@ int cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen, struct cb_node *p = t->root; struct cb_node *top = p; size_t i = 0; + uint8_t *p_key; if (!p) return 0; /* empty tree */ @@ -130,8 +140,9 @@ int cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen, top = p; } + p_key = cb_node_key(t, p); for (i = 0; i < klen; i++) { - if (p->k[i] != kpfx[i]) + if (p_key[i] != kpfx[i]) return 0; /* "best" match failed */ } diff --git a/cbtree.h b/cbtree.h index c374b1b3db..4647d4a32f 100644 --- a/cbtree.h +++ b/cbtree.h @@ -6,9 +6,9 @@ * * This is adapted to store arbitrary data (not just NUL-terminated C strings * and allocates no memory internally. The user needs to allocate - * "struct cb_node" and fill cb_node.k[] with arbitrary match data - * for memcmp. - * If "klen" is variable, then it should be embedded into "c_node.k[]" + * "struct cb_node" and provide `key_offset` to indicate where the key can be + * found relative to the `struct cb_node` for memcmp. + * If "klen" is variable, then it should be embedded into the key. * Recursion is bound by the maximum value of "klen" used. */ #ifndef CBTREE_H @@ -23,18 +23,19 @@ struct cb_node { */ uint32_t byte; uint8_t otherbits; - uint8_t k[FLEX_ARRAY]; /* arbitrary data, unaligned */ }; struct cb_tree { struct cb_node *root; + ptrdiff_t key_offset; }; -#define CBTREE_INIT { 0 } - -static inline void cb_init(struct cb_tree *t) +static inline void cb_init(struct cb_tree *t, + ptrdiff_t key_offset) { - struct cb_tree blank = CBTREE_INIT; + struct cb_tree blank = { + .key_offset = key_offset, + }; memcpy(t, &blank, sizeof(*t)); } diff --git a/oidtree.c b/oidtree.c index ab9fe7ec7a..117649753f 100644 --- a/oidtree.c +++ b/oidtree.c @@ -6,9 +6,14 @@ #include "oidtree.h" #include "hash.h" +struct oidtree_node { + struct cb_node base; + struct object_id key; +}; + void oidtree_init(struct oidtree *ot) { - cb_init(&ot->tree); + cb_init(&ot->tree, offsetof(struct oidtree_node, key)); mem_pool_init(&ot->mem_pool, 0); } @@ -22,20 +27,13 @@ void oidtree_clear(struct oidtree *ot) void oidtree_insert(struct oidtree *ot, const struct object_id *oid) { - struct cb_node *on; - struct object_id k; + struct oidtree_node *on; if (!oid->algo) BUG("oidtree_insert requires oid->algo"); - on = mem_pool_alloc(&ot->mem_pool, sizeof(*on) + sizeof(*oid)); - - /* - * Clear the padding and copy the result in separate steps to - * respect the 4-byte alignment needed by struct object_id. - */ - oidcpy(&k, oid); - memcpy(on->k, &k, sizeof(k)); + on = mem_pool_alloc(&ot->mem_pool, sizeof(*on)); + oidcpy(&on->key, oid); /* * n.b. Current callers won't get us duplicates, here. If a @@ -43,7 +41,7 @@ void oidtree_insert(struct oidtree *ot, const struct object_id *oid) * that won't be freed until oidtree_clear. Currently it's not * worth maintaining a free list */ - cb_insert(&ot->tree, on, sizeof(*oid)); + cb_insert(&ot->tree, &on->base, sizeof(*oid)); } bool oidtree_contains(struct oidtree *ot, const struct object_id *oid) @@ -73,21 +71,18 @@ struct oidtree_each_data { static int iter(struct cb_node *n, void *cb_data) { + struct oidtree_node *node = container_of(n, struct oidtree_node, base); struct oidtree_each_data *data = cb_data; - struct object_id k; - /* Copy to provide 4-byte alignment needed by struct object_id. */ - memcpy(&k, n->k, sizeof(k)); - - if (data->algo != GIT_HASH_UNKNOWN && data->algo != k.algo) + if (data->algo != GIT_HASH_UNKNOWN && data->algo != node->key.algo) return 0; if (data->last_nibble_at) { - if ((k.hash[*data->last_nibble_at] ^ data->last_byte) & 0xf0) + if ((node->key.hash[*data->last_nibble_at] ^ data->last_byte) & 0xf0) return 0; } - return data->cb(&k, data->cb_data); + return data->cb(&node->key, data->cb_data); } int oidtree_each(struct oidtree *ot, const struct object_id *prefix, From 449650decf49b1fe5b1dac1c48dfb919e9b57b0d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:39 +0200 Subject: [PATCH 16/24] oidtree: add ability to store data The oidtree data structure is currently only used to store object IDs, without any associated data. So consequently, it can only really be used to track which object IDs exist, and we can use the tree structure to efficiently operate on OID prefixes. But there are valid use cases where we want to both: - Store object IDs in a sorted order. - Associated arbitrary data with them. Refactor the oidtree interface so that it allows us to store arbitrary payloads within the respective nodes. This will be used in the next commit. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- loose.c | 2 +- object-file.c | 3 ++- oidtree.c | 37 ++++++++++++++++++++++++++++++++----- oidtree.h | 12 ++++++++++-- t/unit-tests/u-oidtree.c | 26 +++++++++++++++++++++++--- 5 files changed, 68 insertions(+), 12 deletions(-) diff --git a/loose.c b/loose.c index 07333be696..f7a3dd1a72 100644 --- a/loose.c +++ b/loose.c @@ -57,7 +57,7 @@ static int insert_loose_map(struct odb_source *source, inserted |= insert_oid_pair(map->to_compat, oid, compat_oid); inserted |= insert_oid_pair(map->to_storage, compat_oid, oid); if (inserted) - oidtree_insert(files->loose->cache, compat_oid); + oidtree_insert(files->loose->cache, compat_oid, NULL); return inserted; } diff --git a/object-file.c b/object-file.c index 7b1a12f8eb..8705251e4d 100644 --- a/object-file.c +++ b/object-file.c @@ -1858,6 +1858,7 @@ static int for_each_object_wrapper_cb(const struct object_id *oid, } static int for_each_prefixed_object_wrapper_cb(const struct object_id *oid, + void *node_data UNUSED, void *cb_data) { struct for_each_object_wrapper_data *data = cb_data; @@ -2003,7 +2004,7 @@ static int append_loose_object(const struct object_id *oid, const char *path UNUSED, void *data) { - oidtree_insert(data, oid); + oidtree_insert(data, oid, NULL); return 0; } diff --git a/oidtree.c b/oidtree.c index 117649753f..e43f18026e 100644 --- a/oidtree.c +++ b/oidtree.c @@ -9,6 +9,7 @@ struct oidtree_node { struct cb_node base; struct object_id key; + void *data; }; void oidtree_init(struct oidtree *ot) @@ -25,15 +26,22 @@ void oidtree_clear(struct oidtree *ot) } } -void oidtree_insert(struct oidtree *ot, const struct object_id *oid) +struct oidtree_data { + struct object_id oid; +}; + +void oidtree_insert(struct oidtree *ot, const struct object_id *oid, + void *data) { struct oidtree_node *on; + struct cb_node *node; if (!oid->algo) BUG("oidtree_insert requires oid->algo"); on = mem_pool_alloc(&ot->mem_pool, sizeof(*on)); oidcpy(&on->key, oid); + on->data = data; /* * n.b. Current callers won't get us duplicates, here. If a @@ -41,13 +49,19 @@ void oidtree_insert(struct oidtree *ot, const struct object_id *oid) * that won't be freed until oidtree_clear. Currently it's not * worth maintaining a free list */ - cb_insert(&ot->tree, &on->base, sizeof(*oid)); + node = cb_insert(&ot->tree, &on->base, sizeof(*oid)); + if (node) { + struct oidtree_node *preexisting = container_of(node, struct oidtree_node, base); + preexisting->data = data; + } } -bool oidtree_contains(struct oidtree *ot, const struct object_id *oid) +static struct oidtree_node *oidtree_lookup(struct oidtree *ot, + const struct object_id *oid) { struct object_id k; size_t klen = sizeof(k); + struct cb_node *node; oidcpy(&k, oid); @@ -58,7 +72,20 @@ bool oidtree_contains(struct oidtree *ot, const struct object_id *oid) klen += BUILD_ASSERT_OR_ZERO(offsetof(struct object_id, hash) < offsetof(struct object_id, algo)); - return !!cb_lookup(&ot->tree, (const uint8_t *)&k, klen); + node = cb_lookup(&ot->tree, (const uint8_t *)&k, klen); + return node ? container_of(node, struct oidtree_node, base) : NULL; +} + +bool oidtree_contains(struct oidtree *ot, const struct object_id *oid) +{ + struct oidtree_node *node = oidtree_lookup(ot, oid); + return node ? 1 : 0; +} + +void *oidtree_get(struct oidtree *ot, const struct object_id *oid) +{ + struct oidtree_node *node = oidtree_lookup(ot, oid); + return node ? node->data : NULL; } struct oidtree_each_data { @@ -82,7 +109,7 @@ static int iter(struct cb_node *n, void *cb_data) return 0; } - return data->cb(&node->key, data->cb_data); + return data->cb(&node->key, node->data, data->cb_data); } int oidtree_each(struct oidtree *ot, const struct object_id *prefix, diff --git a/oidtree.h b/oidtree.h index 2b7bad2e60..baa5a436ea 100644 --- a/oidtree.h +++ b/oidtree.h @@ -29,18 +29,26 @@ void oidtree_init(struct oidtree *ot); */ void oidtree_clear(struct oidtree *ot); -/* Insert the object ID into the tree. */ -void oidtree_insert(struct oidtree *ot, const struct object_id *oid); +/* + * Insert the object ID into the tree and store the given pointer alongside + * with it. The data pointer of any preexisting entry will be overwritten. + */ +void oidtree_insert(struct oidtree *ot, const struct object_id *oid, + void *data); /* Check whether the tree contains the given object ID. */ bool oidtree_contains(struct oidtree *ot, const struct object_id *oid); +/* Get the payload stored with the given object ID. */ +void *oidtree_get(struct oidtree *ot, const struct object_id *oid); + /* * Callback function used for `oidtree_each()`. Returning a non-zero exit code * will cause iteration to stop. The exit code will be propagated to the caller * of `oidtree_each()`. */ typedef int (*oidtree_each_cb)(const struct object_id *oid, + void *node_data, void *cb_data); /* diff --git a/t/unit-tests/u-oidtree.c b/t/unit-tests/u-oidtree.c index d4d05c7dc3..f0d5ebb733 100644 --- a/t/unit-tests/u-oidtree.c +++ b/t/unit-tests/u-oidtree.c @@ -19,7 +19,7 @@ static int fill_tree_loc(struct oidtree *ot, const char *hexes[], size_t n) for (size_t i = 0; i < n; i++) { struct object_id oid; cl_parse_any_oid(hexes[i], &oid); - oidtree_insert(ot, &oid); + oidtree_insert(ot, &oid, NULL); } return 0; } @@ -38,9 +38,9 @@ struct expected_hex_iter { const char *query; }; -static int check_each_cb(const struct object_id *oid, void *data) +static int check_each_cb(const struct object_id *oid, void *node_data UNUSED, void *cb_data) { - struct expected_hex_iter *hex_iter = data; + struct expected_hex_iter *hex_iter = cb_data; struct object_id expected; cl_assert(hex_iter->i < hex_iter->expected_hexes.nr); @@ -105,3 +105,23 @@ void test_oidtree__each(void) check_each(&ot, "32100", "321", NULL); check_each(&ot, "32", "320", "321", NULL); } + +void test_oidtree__insert_overwrites_data(void) +{ + struct object_id oid; + struct oidtree ot; + int a, b; + + cl_parse_any_oid("1", &oid); + + oidtree_init(&ot); + + oidtree_insert(&ot, &oid, NULL); + cl_assert_equal_p(oidtree_get(&ot, &oid), NULL); + oidtree_insert(&ot, &oid, &a); + cl_assert_equal_p(oidtree_get(&ot, &oid), &a); + oidtree_insert(&ot, &oid, &b); + cl_assert_equal_p(oidtree_get(&ot, &oid), &b); + + oidtree_clear(&ot); +} From c04907694601556de0ce862ad4f80fc55ec38c62 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:40 +0200 Subject: [PATCH 17/24] odb/source-inmemory: convert to use oidtree The in-memory source stores its objects in a simple array that we grow as needed. This has a couple of downsides: - The object lookup is O(n). This doesn't matter in practice because we only store a small number of objects. - We don't have an easy way to iterate over all objects in lexicographic order. - We don't have an easy way to compute unique object ID prefixes. Refactor the code to use an oidtree instead. This is the same data structure used by our loose object source, and thus it means we get a bunch of functionality for free. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 72 +++++++++++++++++++++++++++++++------------ odb/source-inmemory.h | 13 ++------ 2 files changed, 54 insertions(+), 31 deletions(-) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index d05a13df45..3b51cc7fef 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -3,20 +3,29 @@ #include "odb.h" #include "odb/source-inmemory.h" #include "odb/streaming.h" +#include "oidtree.h" #include "repository.h" -static const struct cached_object *find_cached_object(struct odb_source_inmemory *source, - const struct object_id *oid) +struct inmemory_object { + enum object_type type; + const void *buf; + unsigned long size; +}; + +static const struct inmemory_object *find_cached_object(struct odb_source_inmemory *source, + const struct object_id *oid) { - static const struct cached_object empty_tree = { + static const struct inmemory_object empty_tree = { .type = OBJ_TREE, .buf = "", }; - const struct cached_object_entry *co = source->objects; + const struct inmemory_object *object; - for (size_t i = 0; i < source->objects_nr; i++, co++) - if (oideq(&co->oid, oid)) - return &co->value; + if (source->objects) { + object = oidtree_get(source->objects, oid); + if (object) + return object; + } if (oid->algo && oideq(oid, hash_algos[oid->algo].empty_tree)) return &empty_tree; @@ -30,7 +39,7 @@ static int odb_source_inmemory_read_object_info(struct odb_source *source, enum object_info_flags flags UNUSED) { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); - const struct cached_object *object; + const struct inmemory_object *object; object = find_cached_object(inmemory, oid); if (!object) @@ -88,7 +97,7 @@ static int odb_source_inmemory_read_object_stream(struct odb_read_stream **out, { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); struct odb_read_stream_inmemory *stream; - const struct cached_object *object; + const struct inmemory_object *object; object = find_cached_object(inmemory, oid); if (!object) @@ -113,17 +122,23 @@ static int odb_source_inmemory_write_object(struct odb_source *source, enum odb_write_object_flags flags UNUSED) { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); - struct cached_object_entry *object; + struct inmemory_object *object; hash_object_file(source->odb->repo->hash_algo, buf, len, type, oid); - ALLOC_GROW(inmemory->objects, inmemory->objects_nr + 1, - inmemory->objects_alloc); - object = &inmemory->objects[inmemory->objects_nr++]; - object->value.size = len; - object->value.type = type; - object->value.buf = xmemdupz(buf, len); - oidcpy(&object->oid, oid); + if (!inmemory->objects) { + CALLOC_ARRAY(inmemory->objects, 1); + oidtree_init(inmemory->objects); + } else if (oidtree_contains(inmemory->objects, oid)) { + return 0; + } + + CALLOC_ARRAY(object, 1); + object->size = len; + object->type = type; + object->buf = xmemdupz(buf, len); + + oidtree_insert(inmemory->objects, oid, object); return 0; } @@ -167,12 +182,29 @@ out: return ret; } +static int inmemory_object_free(const struct object_id *oid UNUSED, + void *node_data, + void *cb_data UNUSED) +{ + struct inmemory_object *object = node_data; + free((void *) object->buf); + free(object); + return 0; +} + static void odb_source_inmemory_free(struct odb_source *source) { struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); - for (size_t i = 0; i < inmemory->objects_nr; i++) - free((char *) inmemory->objects[i].value.buf); - free(inmemory->objects); + + if (inmemory->objects) { + struct object_id null_oid = { 0 }; + + oidtree_each(inmemory->objects, &null_oid, 0, + inmemory_object_free, NULL); + oidtree_clear(inmemory->objects); + free(inmemory->objects); + } + free(inmemory->base.path); free(inmemory); } diff --git a/odb/source-inmemory.h b/odb/source-inmemory.h index d1b05a3996..a88fc2e320 100644 --- a/odb/source-inmemory.h +++ b/odb/source-inmemory.h @@ -3,14 +3,7 @@ #include "odb/source.h" -struct cached_object_entry { - struct object_id oid; - struct cached_object { - enum object_type type; - const void *buf; - unsigned long size; - } value; -}; +struct oidtree; /* * An in-memory source that you can write objects to that shall be made @@ -20,9 +13,7 @@ struct cached_object_entry { */ struct odb_source_inmemory { struct odb_source base; - - struct cached_object_entry *objects; - size_t objects_nr, objects_alloc; + struct oidtree *objects; }; /* Create a new in-memory object database source. */ From 4babe3b673882adf853526475192ae7e3007877c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:41 +0200 Subject: [PATCH 18/24] odb/source-inmemory: implement `for_each_object()` callback Implement the `for_each_object()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 88 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 3b51cc7fef..f60eecbdbb 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -33,6 +33,28 @@ static const struct inmemory_object *find_cached_object(struct odb_source_inmemo return NULL; } +static void populate_object_info(struct odb_source_inmemory *source, + struct object_info *oi, + const struct inmemory_object *object) +{ + if (!oi) + return; + + if (oi->typep) + *(oi->typep) = object->type; + if (oi->sizep) + *(oi->sizep) = object->size; + if (oi->disk_sizep) + *(oi->disk_sizep) = 0; + if (oi->delta_base_oid) + oidclr(oi->delta_base_oid, source->base.odb->repo->hash_algo); + if (oi->contentp) + *oi->contentp = xmemdupz(object->buf, object->size); + if (oi->mtimep) + *oi->mtimep = 0; + oi->whence = OI_CACHED; +} + static int odb_source_inmemory_read_object_info(struct odb_source *source, const struct object_id *oid, struct object_info *oi, @@ -45,22 +67,7 @@ static int odb_source_inmemory_read_object_info(struct odb_source *source, if (!object) return -1; - if (oi) { - if (oi->typep) - *(oi->typep) = object->type; - if (oi->sizep) - *(oi->sizep) = object->size; - if (oi->disk_sizep) - *(oi->disk_sizep) = 0; - if (oi->delta_base_oid) - oidclr(oi->delta_base_oid, source->odb->repo->hash_algo); - if (oi->contentp) - *oi->contentp = xmemdupz(object->buf, object->size); - if (oi->mtimep) - *oi->mtimep = 0; - oi->whence = OI_CACHED; - } - + populate_object_info(inmemory, oi, object); return 0; } @@ -114,6 +121,54 @@ static int odb_source_inmemory_read_object_stream(struct odb_read_stream **out, return 0; } +struct odb_source_inmemory_for_each_object_data { + struct odb_source_inmemory *inmemory; + const struct object_info *request; + odb_for_each_object_cb cb; + void *cb_data; +}; + +static int odb_source_inmemory_for_each_object_cb(const struct object_id *oid, + void *node_data, void *cb_data) +{ + struct odb_source_inmemory_for_each_object_data *data = cb_data; + struct inmemory_object *object = node_data; + + if (data->request) { + struct object_info oi = *data->request; + populate_object_info(data->inmemory, &oi, object); + return data->cb(oid, &oi, data->cb_data); + } else { + return data->cb(oid, NULL, data->cb_data); + } +} + +static int odb_source_inmemory_for_each_object(struct odb_source *source, + const struct object_info *request, + odb_for_each_object_cb cb, + void *cb_data, + const struct odb_for_each_object_options *opts) +{ + struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); + struct odb_source_inmemory_for_each_object_data payload = { + .inmemory = inmemory, + .request = request, + .cb = cb, + .cb_data = cb_data, + }; + struct object_id null_oid = { 0 }; + + if ((opts->flags & ODB_FOR_EACH_OBJECT_PROMISOR_ONLY) || + (opts->flags & ODB_FOR_EACH_OBJECT_LOCAL_ONLY && !source->local)) + return 0; + if (!inmemory->objects) + return 0; + + return oidtree_each(inmemory->objects, + opts->prefix ? opts->prefix : &null_oid, opts->prefix_hex_len, + odb_source_inmemory_for_each_object_cb, &payload); +} + static int odb_source_inmemory_write_object(struct odb_source *source, const void *buf, unsigned long len, enum object_type type, @@ -219,6 +274,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.free = odb_source_inmemory_free; source->base.read_object_info = odb_source_inmemory_read_object_info; source->base.read_object_stream = odb_source_inmemory_read_object_stream; + source->base.for_each_object = odb_source_inmemory_for_each_object; source->base.write_object = odb_source_inmemory_write_object; source->base.write_object_stream = odb_source_inmemory_write_object_stream; From 3bd2856d3448943c4037d454f3e9cc0135330e73 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:42 +0200 Subject: [PATCH 19/24] odb/source-inmemory: implement `find_abbrev_len()` callback Implement the `find_abbrev_len()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index f60eecbdbb..44d9bbedec 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -169,6 +169,44 @@ static int odb_source_inmemory_for_each_object(struct odb_source *source, odb_source_inmemory_for_each_object_cb, &payload); } +struct find_abbrev_len_data { + const struct object_id *oid; + unsigned len; +}; + +static int find_abbrev_len_cb(const struct object_id *oid, + struct object_info *oi UNUSED, + void *cb_data) +{ + struct find_abbrev_len_data *data = cb_data; + unsigned len = oid_common_prefix_hexlen(oid, data->oid); + if (len != hash_algos[oid->algo].hexsz && len >= data->len) + data->len = len + 1; + return 0; +} + +static int odb_source_inmemory_find_abbrev_len(struct odb_source *source, + const struct object_id *oid, + unsigned min_len, + unsigned *out) +{ + struct odb_for_each_object_options opts = { + .prefix = oid, + .prefix_hex_len = min_len, + }; + struct find_abbrev_len_data data = { + .oid = oid, + .len = min_len, + }; + int ret; + + ret = odb_source_inmemory_for_each_object(source, NULL, find_abbrev_len_cb, + &data, &opts); + *out = data.len; + + return ret; +} + static int odb_source_inmemory_write_object(struct odb_source *source, const void *buf, unsigned long len, enum object_type type, @@ -275,6 +313,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.read_object_info = odb_source_inmemory_read_object_info; source->base.read_object_stream = odb_source_inmemory_read_object_stream; source->base.for_each_object = odb_source_inmemory_for_each_object; + source->base.find_abbrev_len = odb_source_inmemory_find_abbrev_len; source->base.write_object = odb_source_inmemory_write_object; source->base.write_object_stream = odb_source_inmemory_write_object_stream; From 27d219132afe13db43d9732caeb37a14c026e717 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:43 +0200 Subject: [PATCH 20/24] odb/source-inmemory: implement `count_objects()` callback Implement the `count_objects()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 44d9bbedec..674dbcad30 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -207,6 +207,25 @@ static int odb_source_inmemory_find_abbrev_len(struct odb_source *source, return ret; } +static int count_objects_cb(const struct object_id *oid UNUSED, + struct object_info *oi UNUSED, + void *cb_data) +{ + unsigned long *counter = cb_data; + (*counter)++; + return 0; +} + +static int odb_source_inmemory_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags UNUSED, + unsigned long *out) +{ + struct odb_for_each_object_options opts = { 0 }; + *out = 0; + return odb_source_inmemory_for_each_object(source, NULL, count_objects_cb, + out, &opts); +} + static int odb_source_inmemory_write_object(struct odb_source *source, const void *buf, unsigned long len, enum object_type type, @@ -314,6 +333,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.read_object_stream = odb_source_inmemory_read_object_stream; source->base.for_each_object = odb_source_inmemory_for_each_object; source->base.find_abbrev_len = odb_source_inmemory_find_abbrev_len; + source->base.count_objects = odb_source_inmemory_count_objects; source->base.write_object = odb_source_inmemory_write_object; source->base.write_object_stream = odb_source_inmemory_write_object_stream; From 7357196c49d537588d6c450fa3a902fac13cfbb9 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:44 +0200 Subject: [PATCH 21/24] odb/source-inmemory: implement `freshen_object()` callback Implement the `freshen_object()` callback function for the in-memory source. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 674dbcad30..8934e0f547 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -294,6 +294,15 @@ out: return ret; } +static int odb_source_inmemory_freshen_object(struct odb_source *source, + const struct object_id *oid) +{ + struct odb_source_inmemory *inmemory = odb_source_inmemory_downcast(source); + if (find_cached_object(inmemory, oid)) + return 1; + return 0; +} + static int inmemory_object_free(const struct object_id *oid UNUSED, void *node_data, void *cb_data UNUSED) @@ -336,6 +345,7 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.count_objects = odb_source_inmemory_count_objects; source->base.write_object = odb_source_inmemory_write_object; source->base.write_object_stream = odb_source_inmemory_write_object_stream; + source->base.freshen_object = odb_source_inmemory_freshen_object; return source; } From 314fa0199ddc1a37069ab7c006a5b0bb8e72f45d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:45 +0200 Subject: [PATCH 22/24] odb/source-inmemory: stub out remaining functions Stub out remaining functions that we either don't need or that are basically no-ops. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-inmemory.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/odb/source-inmemory.c b/odb/source-inmemory.c index 8934e0f547..e004566d76 100644 --- a/odb/source-inmemory.c +++ b/odb/source-inmemory.c @@ -303,6 +303,32 @@ static int odb_source_inmemory_freshen_object(struct odb_source *source, return 0; } +static int odb_source_inmemory_begin_transaction(struct odb_source *source UNUSED, + struct odb_transaction **out UNUSED) +{ + return error("in-memory source does not support transactions"); +} + +static int odb_source_inmemory_read_alternates(struct odb_source *source UNUSED, + struct strvec *out UNUSED) +{ + return 0; +} + +static int odb_source_inmemory_write_alternate(struct odb_source *source UNUSED, + const char *alternate UNUSED) +{ + return error("in-memory source does not support alternates"); +} + +static void odb_source_inmemory_close(struct odb_source *source UNUSED) +{ +} + +static void odb_source_inmemory_reprepare(struct odb_source *source UNUSED) +{ +} + static int inmemory_object_free(const struct object_id *oid UNUSED, void *node_data, void *cb_data UNUSED) @@ -338,6 +364,8 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) odb_source_init(&source->base, odb, ODB_SOURCE_INMEMORY, "source", false); source->base.free = odb_source_inmemory_free; + source->base.close = odb_source_inmemory_close; + source->base.reprepare = odb_source_inmemory_reprepare; source->base.read_object_info = odb_source_inmemory_read_object_info; source->base.read_object_stream = odb_source_inmemory_read_object_stream; source->base.for_each_object = odb_source_inmemory_for_each_object; @@ -346,6 +374,9 @@ struct odb_source_inmemory *odb_source_inmemory_new(struct object_database *odb) source->base.write_object = odb_source_inmemory_write_object; source->base.write_object_stream = odb_source_inmemory_write_object_stream; source->base.freshen_object = odb_source_inmemory_freshen_object; + source->base.begin_transaction = odb_source_inmemory_begin_transaction; + source->base.read_alternates = odb_source_inmemory_read_alternates; + source->base.write_alternate = odb_source_inmemory_write_alternate; return source; } From fdf74cb2cab6a4a95fd6e7e589ac6a4508bf358f Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:46 +0200 Subject: [PATCH 23/24] odb: generic in-memory source Make the in-memory source generic. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 8 ++++---- odb.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/odb.c b/odb.c index 24e929f03c..965ef68e4e 100644 --- a/odb.c +++ b/odb.c @@ -560,7 +560,7 @@ static int do_oid_object_info_extended(struct object_database *odb, if (is_null_oid(real)) return -1; - if (!odb_source_read_object_info(&odb->inmemory_objects->base, oid, oi, flags)) + if (!odb_source_read_object_info(odb->inmemory_objects, oid, oi, flags)) return 0; odb_prepare_alternates(odb); @@ -737,7 +737,7 @@ int odb_pretend_object(struct object_database *odb, if (odb_has_object(odb, oid, 0)) return 0; - return odb_source_write_object(&odb->inmemory_objects->base, + return odb_source_write_object(odb->inmemory_objects, buf, len, type, oid, NULL, 0); } @@ -1020,7 +1020,7 @@ struct object_database *odb_new(struct repository *repo, o->sources = odb_source_new(o, primary_source, true); o->sources_tail = &o->sources->next; o->alternate_db = xstrdup_or_null(secondary_sources); - o->inmemory_objects = odb_source_inmemory_new(o); + o->inmemory_objects = &odb_source_inmemory_new(o)->base; free(to_free); @@ -1045,7 +1045,7 @@ static void odb_free_sources(struct object_database *o) o->sources = next; } - odb_source_free(&o->inmemory_objects->base); + odb_source_free(o->inmemory_objects); o->inmemory_objects = NULL; kh_destroy_odb_path_map(o->source_by_path); diff --git a/odb.h b/odb.h index c3a7edf9c8..73553ed5a7 100644 --- a/odb.h +++ b/odb.h @@ -81,7 +81,7 @@ struct object_database { * to write them into the object store (e.g. a browse-only * application). */ - struct odb_source_inmemory *inmemory_objects; + struct odb_source *inmemory_objects; /* * A fast, rough count of the number of objects in the repository. From d2902a45498793f8dc69abc6448f517b69437eec Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:47 +0200 Subject: [PATCH 24/24] t/unit-tests: add tests for the in-memory object source While the in-memory object source is a full-fledged source, our code base only exercises parts of its functionality because we only use it in git-blame(1). Implement unit tests to verify that the yet-unused functionality of the backend works as expected. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- Makefile | 1 + t/meson.build | 1 + t/unit-tests/u-odb-inmemory.c | 313 ++++++++++++++++++++++++++++++++++ 3 files changed, 315 insertions(+) create mode 100644 t/unit-tests/u-odb-inmemory.c diff --git a/Makefile b/Makefile index 3cda12c455..68b4daa1ad 100644 --- a/Makefile +++ b/Makefile @@ -1529,6 +1529,7 @@ CLAR_TEST_SUITES += u-hash CLAR_TEST_SUITES += u-hashmap CLAR_TEST_SUITES += u-list-objects-filter-options CLAR_TEST_SUITES += u-mem-pool +CLAR_TEST_SUITES += u-odb-inmemory CLAR_TEST_SUITES += u-oid-array CLAR_TEST_SUITES += u-oidmap CLAR_TEST_SUITES += u-oidtree diff --git a/t/meson.build b/t/meson.build index 7528e5cda5..db5e01c49b 100644 --- a/t/meson.build +++ b/t/meson.build @@ -6,6 +6,7 @@ clar_test_suites = [ 'unit-tests/u-hashmap.c', 'unit-tests/u-list-objects-filter-options.c', 'unit-tests/u-mem-pool.c', + 'unit-tests/u-odb-inmemory.c', 'unit-tests/u-oid-array.c', 'unit-tests/u-oidmap.c', 'unit-tests/u-oidtree.c', diff --git a/t/unit-tests/u-odb-inmemory.c b/t/unit-tests/u-odb-inmemory.c new file mode 100644 index 0000000000..482502ef4b --- /dev/null +++ b/t/unit-tests/u-odb-inmemory.c @@ -0,0 +1,313 @@ +#include "unit-test.h" +#include "hex.h" +#include "odb/source-inmemory.h" +#include "odb/streaming.h" +#include "oidset.h" +#include "repository.h" +#include "strbuf.h" + +#define RANDOM_OID "da39a3ee5e6b4b0d3255bfef95601890afd80709" +#define FOOBAR_OID "f6ea0495187600e7b2288c8ac19c5886383a4632" + +static struct repository repo = { + .hash_algo = &hash_algos[GIT_HASH_SHA1], +}; +static struct object_database *odb; + +static void cl_assert_object_info(struct odb_source_inmemory *source, + const struct object_id *oid, + enum object_type expected_type, + const char *expected_content) +{ + enum object_type actual_type; + unsigned long actual_size; + void *actual_content; + struct object_info oi = { + .typep = &actual_type, + .sizep = &actual_size, + .contentp = &actual_content, + }; + + cl_must_pass(odb_source_read_object_info(&source->base, oid, &oi, 0)); + cl_assert_equal_u(actual_size, strlen(expected_content)); + cl_assert_equal_u(actual_type, expected_type); + cl_assert_equal_s((char *) actual_content, expected_content); + + free(actual_content); +} + +void test_odb_inmemory__initialize(void) +{ + odb = odb_new(&repo, "", ""); +} + +void test_odb_inmemory__cleanup(void) +{ + odb_free(odb); +} + +void test_odb_inmemory__new(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + cl_assert_equal_i(source->base.type, ODB_SOURCE_INMEMORY); + odb_source_free(&source->base); +} + +void test_odb_inmemory__read_missing_object(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct object_id oid; + const char *end; + + cl_must_pass(parse_oid_hex_algop(RANDOM_OID, &oid, &end, repo.hash_algo)); + cl_must_fail(odb_source_read_object_info(&source->base, &oid, NULL, 0)); + + odb_source_free(&source->base); +} + +void test_odb_inmemory__read_empty_tree(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + cl_assert_object_info(source, repo.hash_algo->empty_tree, OBJ_TREE, ""); + odb_source_free(&source->base); +} + +void test_odb_inmemory__read_written_object(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + const char data[] = "foobar"; + struct object_id written_oid; + + cl_must_pass(odb_source_write_object(&source->base, data, strlen(data), + OBJ_BLOB, &written_oid, NULL, 0)); + cl_assert_equal_s(oid_to_hex(&written_oid), FOOBAR_OID); + cl_assert_object_info(source, &written_oid, OBJ_BLOB, "foobar"); + + odb_source_free(&source->base); +} + +void test_odb_inmemory__read_stream_object(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct odb_read_stream *stream; + struct object_id written_oid; + const char data[] = "foobar"; + char buf[3] = { 0 }; + + cl_must_pass(odb_source_write_object(&source->base, data, strlen(data), + OBJ_BLOB, &written_oid, NULL, 0)); + + cl_must_pass(odb_source_read_object_stream(&stream, &source->base, + &written_oid)); + cl_assert_equal_i(stream->type, OBJ_BLOB); + cl_assert_equal_u(stream->size, 6); + + cl_assert_equal_i(odb_read_stream_read(stream, buf, 2), 2); + cl_assert_equal_s(buf, "fo"); + cl_assert_equal_i(odb_read_stream_read(stream, buf, 2), 2); + cl_assert_equal_s(buf, "ob"); + cl_assert_equal_i(odb_read_stream_read(stream, buf, 2), 2); + cl_assert_equal_s(buf, "ar"); + cl_assert_equal_i(odb_read_stream_read(stream, buf, 2), 0); + + odb_read_stream_close(stream); + odb_source_free(&source->base); +} + +static int add_one_object(const struct object_id *oid, + struct object_info *oi UNUSED, + void *payload) +{ + struct oidset *actual_oids = payload; + cl_must_pass(oidset_insert(actual_oids, oid)); + return 0; +} + +void test_odb_inmemory__for_each_object(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct odb_for_each_object_options opts = { 0 }; + struct oidset expected_oids = OIDSET_INIT; + struct oidset actual_oids = OIDSET_INIT; + struct strbuf buf = STRBUF_INIT; + + cl_must_pass(odb_source_for_each_object(&source->base, NULL, + add_one_object, &actual_oids, &opts)); + cl_assert_equal_u(oidset_size(&actual_oids), 0); + + for (int i = 0; i < 10; i++) { + struct object_id written_oid; + + strbuf_reset(&buf); + strbuf_addf(&buf, "%d", i); + + cl_must_pass(odb_source_write_object(&source->base, buf.buf, buf.len, + OBJ_BLOB, &written_oid, NULL, 0)); + cl_must_pass(oidset_insert(&expected_oids, &written_oid)); + } + + cl_must_pass(odb_source_for_each_object(&source->base, NULL, + add_one_object, &actual_oids, &opts)); + cl_assert_equal_b(oidset_equal(&expected_oids, &actual_oids), true); + + odb_source_free(&source->base); + oidset_clear(&expected_oids); + oidset_clear(&actual_oids); + strbuf_release(&buf); +} + +static int abort_after_two_objects(const struct object_id *oid UNUSED, + struct object_info *oi UNUSED, + void *payload) +{ + unsigned *counter = payload; + (*counter)++; + if (*counter == 2) + return 123; + return 0; +} + +void test_odb_inmemory__for_each_object_can_abort_iteration(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct odb_for_each_object_options opts = { 0 }; + struct object_id written_oid; + unsigned counter = 0; + + cl_must_pass(odb_source_write_object(&source->base, "1", 1, + OBJ_BLOB, &written_oid, NULL, 0)); + cl_must_pass(odb_source_write_object(&source->base, "2", 1, + OBJ_BLOB, &written_oid, NULL, 0)); + cl_must_pass(odb_source_write_object(&source->base, "3", 1, + OBJ_BLOB, &written_oid, NULL, 0)); + + cl_assert_equal_i(odb_source_for_each_object(&source->base, NULL, + abort_after_two_objects, + &counter, &opts), + 123); + cl_assert_equal_u(counter, 2); + + odb_source_free(&source->base); +} + +void test_odb_inmemory__count_objects(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct object_id written_oid; + unsigned long count; + + cl_must_pass(odb_source_count_objects(&source->base, 0, &count)); + cl_assert_equal_u(count, 0); + + cl_must_pass(odb_source_write_object(&source->base, "1", 1, + OBJ_BLOB, &written_oid, NULL, 0)); + cl_must_pass(odb_source_write_object(&source->base, "2", 1, + OBJ_BLOB, &written_oid, NULL, 0)); + cl_must_pass(odb_source_write_object(&source->base, "3", 1, + OBJ_BLOB, &written_oid, NULL, 0)); + + cl_must_pass(odb_source_count_objects(&source->base, 0, &count)); + cl_assert_equal_u(count, 3); + + odb_source_free(&source->base); +} + +void test_odb_inmemory__find_abbrev_len(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct object_id oid1, oid2; + unsigned abbrev_len; + + /* + * The two blobs we're about to write share the first 10 hex characters + * of their object IDs ("a09f43dc45"), so at least 11 characters are + * needed to tell them apart: + * + * "368317" -> a09f43dc4562d45115583f5094640ae237df55f7 + * "514796" -> a09f43dc45fef837235eb7e6b1a6ca5e169a3981 + * + * With only one blob written we expect a length of 4. + */ + cl_must_pass(odb_source_write_object(&source->base, "368317", strlen("368317"), + OBJ_BLOB, &oid1, NULL, 0)); + cl_must_pass(odb_source_find_abbrev_len(&source->base, &oid1, 4, + &abbrev_len)); + cl_assert_equal_u(abbrev_len, 4); + + /* + * With both objects present, the shared 10-character prefix means we + * need at least 11 characters to uniquely identify either object. + */ + cl_must_pass(odb_source_write_object(&source->base, "514796", strlen("514796"), + OBJ_BLOB, &oid2, NULL, 0)); + cl_must_pass(odb_source_find_abbrev_len(&source->base, &oid1, 4, + &abbrev_len)); + cl_assert_equal_u(abbrev_len, 11); + + odb_source_free(&source->base); +} + +void test_odb_inmemory__freshen_object(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + struct object_id written_oid; + struct object_id oid; + const char *end; + + cl_must_pass(parse_oid_hex_algop(RANDOM_OID, &oid, &end, repo.hash_algo)); + cl_assert_equal_i(odb_source_freshen_object(&source->base, &oid), 0); + + cl_must_pass(odb_source_write_object(&source->base, "foobar", + strlen("foobar"), OBJ_BLOB, + &written_oid, NULL, 0)); + cl_assert_equal_i(odb_source_freshen_object(&source->base, + &written_oid), 1); + + odb_source_free(&source->base); +} + +struct membuf_write_stream { + struct odb_write_stream base; + const char *buf; + size_t offset; + size_t size; +}; + +static ssize_t membuf_write_stream_read(struct odb_write_stream *stream, + unsigned char *buf, size_t len) +{ + struct membuf_write_stream *s = container_of(stream, struct membuf_write_stream, base); + size_t chunk_size = 2; + + if (chunk_size > len) + chunk_size = len; + if (chunk_size > s->size - s->offset) + chunk_size = s->size - s->offset; + + memcpy(buf, s->buf + s->offset, chunk_size); + + s->offset += chunk_size; + if (s->offset == s->size) + s->base.is_finished = 1; + + return chunk_size; +} + +void test_odb_inmemory__write_object_stream(void) +{ + struct odb_source_inmemory *source = odb_source_inmemory_new(odb); + const char data[] = "foobar"; + struct membuf_write_stream stream = { + .base.read = membuf_write_stream_read, + .buf = data, + .size = strlen(data), + }; + struct object_id written_oid; + + cl_must_pass(odb_source_write_object_stream(&source->base, &stream.base, + strlen(data), &written_oid)); + cl_assert_equal_s(oid_to_hex(&written_oid), FOOBAR_OID); + cl_assert_object_info(source, &written_oid, OBJ_BLOB, "foobar"); + + odb_source_free(&source->base); +}