From d429ab497942f8d20953c00d6db9741d29572b62 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:14 -0500 Subject: [PATCH 1/7] odb: split `struct odb_transaction` into separate header The current ODB transaction interface is colocated with other ODB interfaces in "odb.{c,h}". Subsequent commits will expand `struct odb_transaction` to support write operations on the transaction directly. To keep things organized and prevent "odb.{c,h}" from becoming more unwieldy, split out `struct odb_transaction` into a separate header. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- Makefile | 1 + builtin/add.c | 1 + builtin/unpack-objects.c | 1 + builtin/update-index.c | 1 + cache-tree.c | 1 + meson.build | 1 + object-file.c | 1 + odb.c | 25 ------------------------- odb.h | 31 ------------------------------- odb/transaction.c | 28 ++++++++++++++++++++++++++++ odb/transaction.h | 38 ++++++++++++++++++++++++++++++++++++++ read-cache.c | 1 + 12 files changed, 74 insertions(+), 56 deletions(-) create mode 100644 odb/transaction.c create mode 100644 odb/transaction.h diff --git a/Makefile b/Makefile index dbf0022054..6342db13e5 100644 --- a/Makefile +++ b/Makefile @@ -1219,6 +1219,7 @@ LIB_OBJS += odb.o LIB_OBJS += odb/source.o LIB_OBJS += odb/source-files.o LIB_OBJS += odb/streaming.o +LIB_OBJS += odb/transaction.o LIB_OBJS += oid-array.o LIB_OBJS += oidmap.o LIB_OBJS += oidset.o diff --git a/builtin/add.c b/builtin/add.c index 7737ab878b..c859f66519 100644 --- a/builtin/add.c +++ b/builtin/add.c @@ -16,6 +16,7 @@ #include "run-command.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "parse-options.h" #include "path.h" #include "preload-index.h" diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 6fc64e9e4b..bc9b1e047e 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -9,6 +9,7 @@ #include "hex.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "object.h" #include "delta.h" #include "pack.h" diff --git a/builtin/update-index.c b/builtin/update-index.c index 8a5907767b..bcc43852ef 100644 --- a/builtin/update-index.c +++ b/builtin/update-index.c @@ -19,6 +19,7 @@ #include "tree-walk.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "refs.h" #include "resolve-undo.h" #include "parse-options.h" diff --git a/cache-tree.c b/cache-tree.c index 60bcc07c3b..f056869cfd 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -10,6 +10,7 @@ #include "cache-tree.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "read-cache-ll.h" #include "replace-object.h" #include "repository.h" diff --git a/meson.build b/meson.build index 8309942d18..6dc23b3af2 100644 --- a/meson.build +++ b/meson.build @@ -405,6 +405,7 @@ libgit_sources = [ 'odb/source.c', 'odb/source-files.c', 'odb/streaming.c', + 'odb/transaction.c', 'oid-array.c', 'oidmap.c', 'oidset.c', diff --git a/object-file.c b/object-file.c index f0b029ff0b..bfbb632cf8 100644 --- a/object-file.c +++ b/object-file.c @@ -21,6 +21,7 @@ #include "object-file.h" #include "odb.h" #include "odb/streaming.h" +#include "odb/transaction.h" #include "oidtree.h" #include "pack.h" #include "packfile.h" diff --git a/odb.c b/odb.c index 350e23f3c0..8c3cbc1b53 100644 --- a/odb.c +++ b/odb.c @@ -1069,28 +1069,3 @@ void odb_reprepare(struct object_database *o) obj_read_unlock(); } - -struct odb_transaction *odb_transaction_begin(struct object_database *odb) -{ - if (odb->transaction) - return NULL; - - odb->transaction = odb_transaction_files_begin(odb->sources); - - return odb->transaction; -} - -void odb_transaction_commit(struct odb_transaction *transaction) -{ - if (!transaction) - return; - - /* - * Ensure the transaction ending matches the pending transaction. - */ - ASSERT(transaction == transaction->source->odb->transaction); - - transaction->commit(transaction); - transaction->source->odb->transaction = NULL; - free(transaction); -} diff --git a/odb.h b/odb.h index 9aee260105..ec5367b13e 100644 --- a/odb.h +++ b/odb.h @@ -35,24 +35,6 @@ struct packed_git; struct packfile_store; struct cached_object_entry; -/* - * A transaction may be started for an object database prior to writing new - * objects via odb_transaction_begin(). These objects are not committed until - * odb_transaction_commit() is invoked. Only a single transaction may be pending - * at a time. - * - * Each ODB source is expected to implement its own transaction handling. - */ -struct odb_transaction; -typedef void (*odb_transaction_commit_fn)(struct odb_transaction *transaction); -struct odb_transaction { - /* The ODB source the transaction is opened against. */ - struct odb_source *source; - - /* The ODB source specific callback invoked to commit a transaction. */ - odb_transaction_commit_fn commit; -}; - /* * The object database encapsulates access to objects in a repository. It * manages one or more sources that store the actual objects which are @@ -154,19 +136,6 @@ void odb_close(struct object_database *o); */ void odb_reprepare(struct object_database *o); -/* - * Starts an ODB transaction. Subsequent objects are written to the transaction - * and not committed until odb_transaction_commit() is invoked on the - * transaction. If the ODB already has a pending transaction, NULL is returned. - */ -struct odb_transaction *odb_transaction_begin(struct object_database *odb); - -/* - * Commits an ODB transaction making the written objects visible. If the - * specified transaction is NULL, the function is a no-op. - */ -void odb_transaction_commit(struct odb_transaction *transaction); - /* * Find source by its object directory path. Returns a `NULL` pointer in case * the source could not be found. diff --git a/odb/transaction.c b/odb/transaction.c new file mode 100644 index 0000000000..9bf3f347dc --- /dev/null +++ b/odb/transaction.c @@ -0,0 +1,28 @@ +#include "git-compat-util.h" +#include "object-file.h" +#include "odb/transaction.h" + +struct odb_transaction *odb_transaction_begin(struct object_database *odb) +{ + if (odb->transaction) + return NULL; + + odb->transaction = odb_transaction_files_begin(odb->sources); + + return odb->transaction; +} + +void odb_transaction_commit(struct odb_transaction *transaction) +{ + if (!transaction) + return; + + /* + * Ensure the transaction ending matches the pending transaction. + */ + ASSERT(transaction == transaction->source->odb->transaction); + + transaction->commit(transaction); + transaction->source->odb->transaction = NULL; + free(transaction); +} diff --git a/odb/transaction.h b/odb/transaction.h new file mode 100644 index 0000000000..a56e392f21 --- /dev/null +++ b/odb/transaction.h @@ -0,0 +1,38 @@ +#ifndef ODB_TRANSACTION_H +#define ODB_TRANSACTION_H + +#include "odb.h" +#include "odb/source.h" + +/* + * A transaction may be started for an object database prior to writing new + * objects via odb_transaction_begin(). These objects are not committed until + * odb_transaction_commit() is invoked. Only a single transaction may be pending + * at a time. + * + * Each ODB source is expected to implement its own transaction handling. + */ +struct odb_transaction; +typedef void (*odb_transaction_commit_fn)(struct odb_transaction *transaction); +struct odb_transaction { + /* The ODB source the transaction is opened against. */ + struct odb_source *source; + + /* The ODB source specific callback invoked to commit a transaction. */ + odb_transaction_commit_fn commit; +}; + +/* + * Starts an ODB transaction. Subsequent objects are written to the transaction + * and not committed until odb_transaction_commit() is invoked on the + * transaction. If the ODB already has a pending transaction, NULL is returned. + */ +struct odb_transaction *odb_transaction_begin(struct object_database *odb); + +/* + * Commits an ODB transaction making the written objects visible. If the + * specified transaction is NULL, the function is a no-op. + */ +void odb_transaction_commit(struct odb_transaction *transaction); + +#endif diff --git a/read-cache.c b/read-cache.c index 5049f9baca..8147c7e94a 100644 --- a/read-cache.c +++ b/read-cache.c @@ -20,6 +20,7 @@ #include "dir.h" #include "object-file.h" #include "odb.h" +#include "odb/transaction.h" #include "oid-array.h" #include "tree.h" #include "commit.h" From ec562fe26b305d7cff3e324f6b2f50a8b7469015 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:15 -0500 Subject: [PATCH 2/7] odb/transaction: use pluggable `begin_transaction()` Each ODB source is expected to provide an ODB transaction implementation that should be used when starting a transaction. With d6fc6fe6f8 (odb/source: make `begin_transaction()` function pluggable, 2026-03-05), the `struct odb_source` now provides a pluggable callback for beginning transactions. Use the callback provided by the ODB source accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- odb/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/odb/transaction.c b/odb/transaction.c index 9bf3f347dc..592ac84075 100644 --- a/odb/transaction.c +++ b/odb/transaction.c @@ -1,5 +1,5 @@ #include "git-compat-util.h" -#include "object-file.h" +#include "odb/source.h" #include "odb/transaction.h" struct odb_transaction *odb_transaction_begin(struct object_database *odb) @@ -7,7 +7,7 @@ struct odb_transaction *odb_transaction_begin(struct object_database *odb) if (odb->transaction) return NULL; - odb->transaction = odb_transaction_files_begin(odb->sources); + odb_source_begin_transaction(odb->sources, &odb->transaction); return odb->transaction; } From 61effed8751a65e15e12cf049043b961211fc69a Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:16 -0500 Subject: [PATCH 3/7] odb: update `struct odb_write_stream` read() callback The `read()` callback used by `struct odb_write_stream` currently returns a pointer to an internal buffer along with the number of bytes read. This makes buffer ownership unclear and provides no way to report errors. Update the interface to instead require the caller to provide a buffer, and have the callback return the number of bytes written to it or a negative value on error. While at it, also move the `struct odb_write_stream` definition to "odb/streaming.h". Call sites are updated accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- builtin/unpack-objects.c | 20 ++++++++------------ object-file.c | 14 +++++++++++--- odb.h | 6 +----- odb/streaming.c | 5 +++++ odb/streaming.h | 18 ++++++++++++++++++ 5 files changed, 43 insertions(+), 20 deletions(-) diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index bc9b1e047e..64e58e79fd 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -9,6 +9,7 @@ #include "hex.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" #include "odb/transaction.h" #include "object.h" #include "delta.h" @@ -360,24 +361,21 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size, struct input_zstream_data { git_zstream *zstream; - unsigned char buf[8192]; int status; }; -static const void *feed_input_zstream(struct odb_write_stream *in_stream, - unsigned long *readlen) +static ssize_t feed_input_zstream(struct odb_write_stream *in_stream, + unsigned char *buf, size_t buf_len) { struct input_zstream_data *data = in_stream->data; git_zstream *zstream = data->zstream; void *in = fill(1); - if (in_stream->is_finished) { - *readlen = 0; - return NULL; - } + if (in_stream->is_finished) + return 0; - zstream->next_out = data->buf; - zstream->avail_out = sizeof(data->buf); + zstream->next_out = buf; + zstream->avail_out = buf_len; zstream->next_in = in; zstream->avail_in = len; @@ -385,9 +383,7 @@ static const void *feed_input_zstream(struct odb_write_stream *in_stream, in_stream->is_finished = data->status != Z_OK; use(len - zstream->avail_in); - *readlen = sizeof(data->buf) - zstream->avail_out; - - return data->buf; + return buf_len - zstream->avail_out; } static void stream_blob(unsigned long size, unsigned nr) diff --git a/object-file.c b/object-file.c index bfbb632cf8..0ae36314aa 100644 --- a/object-file.c +++ b/object-file.c @@ -1066,6 +1066,7 @@ int odb_source_loose_write_stream(struct odb_source *source, struct git_hash_ctx c, compat_c; struct strbuf tmp_file = STRBUF_INIT; struct strbuf filename = STRBUF_INIT; + unsigned char buf[8192]; int dirlen; char hdr[MAX_HEADER_LEN]; int hdrlen; @@ -1098,9 +1099,16 @@ int odb_source_loose_write_stream(struct odb_source *source, unsigned char *in0 = stream.next_in; if (!stream.avail_in && !in_stream->is_finished) { - const void *in = in_stream->read(in_stream, &stream.avail_in); - stream.next_in = (void *)in; - in0 = (unsigned char *)in; + ssize_t read_len = odb_write_stream_read(in_stream, buf, + sizeof(buf)); + if (read_len < 0) { + err = -1; + goto cleanup; + } + + stream.avail_in = read_len; + stream.next_in = buf; + in0 = buf; /* All data has been read. */ if (in_stream->is_finished) flush = 1; diff --git a/odb.h b/odb.h index ec5367b13e..6faeaa0589 100644 --- a/odb.h +++ b/odb.h @@ -529,11 +529,7 @@ static inline int odb_write_object(struct object_database *odb, return odb_write_object_ext(odb, buf, len, type, oid, NULL, 0); } -struct odb_write_stream { - const void *(*read)(struct odb_write_stream *, unsigned long *len); - void *data; - int is_finished; -}; +struct odb_write_stream; int odb_write_object_stream(struct object_database *odb, struct odb_write_stream *stream, size_t len, diff --git a/odb/streaming.c b/odb/streaming.c index 5927a12954..a68dd2cbe3 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -232,6 +232,11 @@ struct odb_read_stream *odb_read_stream_open(struct object_database *odb, return st; } +ssize_t odb_write_stream_read(struct odb_write_stream *st, void *buf, size_t sz) +{ + return st->read(st, buf, sz); +} + int odb_stream_blob_to_fd(struct object_database *odb, int fd, const struct object_id *oid, diff --git a/odb/streaming.h b/odb/streaming.h index c7861f7e13..65ced911fe 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -47,6 +47,24 @@ int odb_read_stream_close(struct odb_read_stream *stream); */ ssize_t odb_read_stream_read(struct odb_read_stream *stream, void *buf, size_t len); +/* + * A stream that provides an object to be written to the object database without + * loading all of it into memory. + */ +struct odb_write_stream { + ssize_t (*read)(struct odb_write_stream *, unsigned char *, size_t); + void *data; + int is_finished; +}; + +/* + * Read data from the stream into the buffer. Returns 0 when finished and the + * number of bytes read on success. Returns a negative error code in case + * reading from the stream fails. + */ +ssize_t odb_write_stream_read(struct odb_write_stream *stream, void *buf, + size_t len); + /* * Look up the object by its ID and write the full contents to the file * descriptor. The object must be a blob, or the function will fail. When From 43eadce3407aa63829633b5dc903db79f4cdf29e Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:17 -0500 Subject: [PATCH 4/7] object-file: remove flags from transaction packfile writes The `index_blob_packfile_transaction()` function handles streaming a blob from an fd to compute its object ID and conditionally writes the object directly to a packfile if the INDEX_WRITE_OBJECT flag is set. A subsequent commit will make these packfile object writes part of the transaction interface. Consequently, having the object write be conditional on this flag is a bit awkward. In preparation for this change, introduce a dedicated `hash_blob_stream()` helper that only computes the OID from a `struct odb_write_stream`. This is invoked by `index_fd()` instead when the INDEX_WRITE_OBJECT is not set. The object write performed via `index_blob_packfile_transaction()` is made unconditional accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 128 +++++++++++++++++++++++++++++------------------- odb/streaming.c | 46 +++++++++++++++++ odb/streaming.h | 12 +++++ 3 files changed, 136 insertions(+), 50 deletions(-) diff --git a/object-file.c b/object-file.c index 0ae36314aa..382d14c8c0 100644 --- a/object-file.c +++ b/object-file.c @@ -1396,11 +1396,10 @@ static int already_written(struct odb_transaction_files *transaction, } /* Lazily create backing packfile for the state */ -static void prepare_packfile_transaction(struct odb_transaction_files *transaction, - unsigned flags) +static void prepare_packfile_transaction(struct odb_transaction_files *transaction) { struct transaction_packfile *state = &transaction->packfile; - if (!(flags & INDEX_WRITE_OBJECT) || state->f) + if (state->f) return; state->f = create_tmp_packfile(transaction->base.source->odb->repo, @@ -1413,6 +1412,39 @@ static void prepare_packfile_transaction(struct odb_transaction_files *transacti die_errno("unable to write pack header"); } +static int hash_blob_stream(struct odb_write_stream *stream, + const struct git_hash_algo *hash_algo, + struct object_id *result_oid, size_t size) +{ + unsigned char buf[16384]; + struct git_hash_ctx ctx; + unsigned header_len; + size_t bytes_hashed = 0; + + header_len = format_object_header((char *)buf, sizeof(buf), + OBJ_BLOB, size); + hash_algo->init_fn(&ctx); + git_hash_update(&ctx, buf, header_len); + + while (!stream->is_finished) { + ssize_t read_result = odb_write_stream_read(stream, buf, + sizeof(buf)); + + if (read_result < 0) + return -1; + + git_hash_update(&ctx, buf, read_result); + bytes_hashed += read_result; + } + + if (bytes_hashed != size) + return -1; + + git_hash_final_oid(result_oid, &ctx); + + return 0; +} + /* * Read the contents from fd for size bytes, streaming it to the * packfile in state while updating the hash in ctx. Signal a failure @@ -1430,15 +1462,13 @@ static void prepare_packfile_transaction(struct odb_transaction_files *transacti */ static int stream_blob_to_pack(struct transaction_packfile *state, struct git_hash_ctx *ctx, off_t *already_hashed_to, - int fd, size_t size, const char *path, - unsigned flags) + int fd, size_t size, const char *path) { git_zstream s; unsigned char ibuf[16384]; unsigned char obuf[16384]; unsigned hdrlen; int status = Z_OK; - int write_object = (flags & INDEX_WRITE_OBJECT); off_t offset = 0; git_deflate_init(&s, pack_compression_level); @@ -1473,20 +1503,18 @@ static int stream_blob_to_pack(struct transaction_packfile *state, status = git_deflate(&s, size ? 0 : Z_FINISH); if (!s.avail_out || status == Z_STREAM_END) { - if (write_object) { - size_t written = s.next_out - obuf; + size_t written = s.next_out - obuf; - /* would we bust the size limit? */ - if (state->nr_written && - pack_size_limit_cfg && - pack_size_limit_cfg < state->offset + written) { - git_deflate_abort(&s); - return -1; - } - - hashwrite(state->f, obuf, written); - state->offset += written; + /* would we bust the size limit? */ + if (state->nr_written && + pack_size_limit_cfg && + pack_size_limit_cfg < state->offset + written) { + git_deflate_abort(&s); + return -1; } + + hashwrite(state->f, obuf, written); + state->offset += written; s.next_out = obuf; s.avail_out = sizeof(obuf); } @@ -1574,8 +1602,7 @@ clear_exit: */ static int index_blob_packfile_transaction(struct odb_transaction_files *transaction, struct object_id *result_oid, int fd, - size_t size, const char *path, - unsigned flags) + size_t size, const char *path) { struct transaction_packfile *state = &transaction->packfile; off_t seekback, already_hashed_to; @@ -1583,7 +1610,7 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac unsigned char obuf[16384]; unsigned header_len; struct hashfile_checkpoint checkpoint; - struct pack_idx_entry *idx = NULL; + struct pack_idx_entry *idx; seekback = lseek(fd, 0, SEEK_CUR); if (seekback == (off_t)-1) @@ -1594,33 +1621,26 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac transaction->base.source->odb->repo->hash_algo->init_fn(&ctx); git_hash_update(&ctx, obuf, header_len); - /* Note: idx is non-NULL when we are writing */ - if ((flags & INDEX_WRITE_OBJECT) != 0) { - CALLOC_ARRAY(idx, 1); - - prepare_packfile_transaction(transaction, flags); - hashfile_checkpoint_init(state->f, &checkpoint); - } + CALLOC_ARRAY(idx, 1); + prepare_packfile_transaction(transaction); + hashfile_checkpoint_init(state->f, &checkpoint); already_hashed_to = 0; while (1) { - prepare_packfile_transaction(transaction, flags); - if (idx) { - hashfile_checkpoint(state->f, &checkpoint); - idx->offset = state->offset; - crc32_begin(state->f); - } + prepare_packfile_transaction(transaction); + hashfile_checkpoint(state->f, &checkpoint); + idx->offset = state->offset; + crc32_begin(state->f); + if (!stream_blob_to_pack(state, &ctx, &already_hashed_to, - fd, size, path, flags)) + fd, size, path)) break; /* * Writing this object to the current pack will make * it too big; we need to truncate it, start a new * pack, and write into it. */ - if (!idx) - BUG("should not happen"); hashfile_truncate(state->f, &checkpoint); state->offset = checkpoint.offset; flush_packfile_transaction(transaction); @@ -1628,8 +1648,6 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac return error("cannot seek back"); } git_hash_final_oid(result_oid, &ctx); - if (!idx) - return 0; idx->crc32 = crc32_end(state->f); if (already_written(transaction, result_oid)) { @@ -1667,18 +1685,28 @@ int index_fd(struct index_state *istate, struct object_id *oid, ret = index_core(istate, oid, fd, xsize_t(st->st_size), type, path, flags); } else { - struct object_database *odb = the_repository->objects; - struct odb_transaction_files *files_transaction; - struct odb_transaction *transaction; + struct odb_write_stream stream; + odb_write_stream_from_fd(&stream, fd, xsize_t(st->st_size)); - transaction = odb_transaction_begin(odb); - files_transaction = container_of(odb->transaction, - struct odb_transaction_files, - base); - ret = index_blob_packfile_transaction(files_transaction, oid, fd, - xsize_t(st->st_size), - path, flags); - odb_transaction_commit(transaction); + if (flags & INDEX_WRITE_OBJECT) { + struct object_database *odb = the_repository->objects; + struct odb_transaction_files *files_transaction; + struct odb_transaction *transaction; + + transaction = odb_transaction_begin(odb); + files_transaction = container_of(odb->transaction, + struct odb_transaction_files, + base); + ret = index_blob_packfile_transaction(files_transaction, oid, fd, + xsize_t(st->st_size), path); + odb_transaction_commit(transaction); + } else { + ret = hash_blob_stream(&stream, + the_repository->hash_algo, oid, + xsize_t(st->st_size)); + } + + odb_write_stream_release(&stream); } close(fd); diff --git a/odb/streaming.c b/odb/streaming.c index a68dd2cbe3..20531e864c 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -237,6 +237,11 @@ ssize_t odb_write_stream_read(struct odb_write_stream *st, void *buf, size_t sz) return st->read(st, buf, sz); } +void odb_write_stream_release(struct odb_write_stream *st) +{ + free(st->data); +} + int odb_stream_blob_to_fd(struct object_database *odb, int fd, const struct object_id *oid, @@ -292,3 +297,44 @@ int odb_stream_blob_to_fd(struct object_database *odb, odb_read_stream_close(st); return result; } + +struct read_object_fd_data { + int fd; + size_t remaining; +}; + +static ssize_t read_object_fd(struct odb_write_stream *stream, + unsigned char *buf, size_t len) +{ + struct read_object_fd_data *data = stream->data; + ssize_t read_result; + size_t count; + + if (stream->is_finished) + return 0; + + count = data->remaining < len ? data->remaining : len; + read_result = read_in_full(data->fd, buf, count); + if (read_result < 0 || (size_t)read_result != count) + return -1; + + data->remaining -= count; + if (!data->remaining) + stream->is_finished = 1; + + return read_result; +} + +void odb_write_stream_from_fd(struct odb_write_stream *stream, int fd, + size_t size) +{ + struct read_object_fd_data *data; + + CALLOC_ARRAY(data, 1); + data->fd = fd; + data->remaining = size; + + stream->data = data; + stream->read = read_object_fd; + stream->is_finished = 0; +} diff --git a/odb/streaming.h b/odb/streaming.h index 65ced911fe..2a8cac19a4 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -5,6 +5,7 @@ #define STREAMING_H 1 #include "object.h" +#include "odb.h" struct object_database; struct odb_read_stream; @@ -65,6 +66,11 @@ struct odb_write_stream { ssize_t odb_write_stream_read(struct odb_write_stream *stream, void *buf, size_t len); +/* + * Releases memory allocated for underlying stream data. + */ +void odb_write_stream_release(struct odb_write_stream *stream); + /* * Look up the object by its ID and write the full contents to the file * descriptor. The object must be a blob, or the function will fail. When @@ -82,4 +88,10 @@ int odb_stream_blob_to_fd(struct object_database *odb, struct stream_filter *filter, int can_seek); +/* + * Sets up an ODB write stream that reads from an fd. + */ +void odb_write_stream_from_fd(struct odb_write_stream *stream, int fd, + size_t size); + #endif /* STREAMING_H */ From b247256a17fdf52777aab4ef412bd9132a3a4d9f Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:18 -0500 Subject: [PATCH 5/7] object-file: avoid fd seekback by checking object size upfront In certain scenarios, Git handles writing blobs that exceed "core.bigFileThreshold" differently by streaming the object directly into a packfile. When there is an active ODB transaction, these blobs are streamed to the same packfile instead of using a separate packfile for each. If "pack.packSizeLimit" is configured and streaming another object causes the packfile to exceed the configured limit, the packfile is truncated back to the previous object and the object write is restarted in a new packfile. This works fine, but requires the fd being read from to save a checkpoint so it becomes possible to rewind the input source via seeking back to a known offset at the beginning. In a subsequent commit, blob streaming is converted to use `struct odb_write_stream` as a more generic input source instead of an fd which doesn't provide a mechanism for rewinding. For this use case though, rewinding the fd is not strictly necessary because the inflated size of the object is known and can be used to approximate whether writing the object would cause the packfile to exceed the configured limit prior to writing anything. These blobs written to the packfile are never deltified thus the size difference between what is written versus the inflated size is due to zlib compression. While this does prevent packfiles from being filled to the potential maximum is some cases, it should be good enough and still prevents the packfile from exceeding any configured limit. Use the inflated blob size to determine whether writing an object to a packfile will exceed the configured "pack.packSizeLimit". Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 86 +++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 61 deletions(-) diff --git a/object-file.c b/object-file.c index 382d14c8c0..0284d5434b 100644 --- a/object-file.c +++ b/object-file.c @@ -1447,29 +1447,17 @@ static int hash_blob_stream(struct odb_write_stream *stream, /* * Read the contents from fd for size bytes, streaming it to the - * packfile in state while updating the hash in ctx. Signal a failure - * by returning a negative value when the resulting pack would exceed - * the pack size limit and this is not the first object in the pack, - * so that the caller can discard what we wrote from the current pack - * by truncating it and opening a new one. The caller will then call - * us again after rewinding the input fd. - * - * The already_hashed_to pointer is kept untouched by the caller to - * make sure we do not hash the same byte when we are called - * again. This way, the caller does not have to checkpoint its hash - * status before calling us just in case we ask it to call us again - * with a new pack. + * packfile in state while updating the hash in ctx. */ -static int stream_blob_to_pack(struct transaction_packfile *state, - struct git_hash_ctx *ctx, off_t *already_hashed_to, - int fd, size_t size, const char *path) +static void stream_blob_to_pack(struct transaction_packfile *state, + struct git_hash_ctx *ctx, int fd, size_t size, + const char *path) { git_zstream s; unsigned char ibuf[16384]; unsigned char obuf[16384]; unsigned hdrlen; int status = Z_OK; - off_t offset = 0; git_deflate_init(&s, pack_compression_level); @@ -1486,15 +1474,9 @@ static int stream_blob_to_pack(struct transaction_packfile *state, if ((size_t)read_result != rsize) die("failed to read %u bytes from '%s'", (unsigned)rsize, path); - offset += rsize; - if (*already_hashed_to < offset) { - size_t hsize = offset - *already_hashed_to; - if (rsize < hsize) - hsize = rsize; - if (hsize) - git_hash_update(ctx, ibuf, hsize); - *already_hashed_to = offset; - } + + git_hash_update(ctx, ibuf, rsize); + s.next_in = ibuf; s.avail_in = rsize; size -= rsize; @@ -1505,14 +1487,6 @@ static int stream_blob_to_pack(struct transaction_packfile *state, if (!s.avail_out || status == Z_STREAM_END) { size_t written = s.next_out - obuf; - /* would we bust the size limit? */ - if (state->nr_written && - pack_size_limit_cfg && - pack_size_limit_cfg < state->offset + written) { - git_deflate_abort(&s); - return -1; - } - hashwrite(state->f, obuf, written); state->offset += written; s.next_out = obuf; @@ -1529,7 +1503,6 @@ static int stream_blob_to_pack(struct transaction_packfile *state, } } git_deflate_end(&s); - return 0; } static void flush_packfile_transaction(struct odb_transaction_files *transaction) @@ -1605,48 +1578,39 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac size_t size, const char *path) { struct transaction_packfile *state = &transaction->packfile; - off_t seekback, already_hashed_to; struct git_hash_ctx ctx; unsigned char obuf[16384]; unsigned header_len; struct hashfile_checkpoint checkpoint; struct pack_idx_entry *idx; - seekback = lseek(fd, 0, SEEK_CUR); - if (seekback == (off_t)-1) - return error("cannot find the current offset"); - header_len = format_object_header((char *)obuf, sizeof(obuf), OBJ_BLOB, size); transaction->base.source->odb->repo->hash_algo->init_fn(&ctx); git_hash_update(&ctx, obuf, header_len); + /* + * If writing another object to the packfile could result in it + * exceeding the configured size limit, flush the current packfile + * transaction. + * + * Note that this uses the inflated object size as an approximation. + * Blob objects written in this manner are not delta-compressed, so + * the difference between the inflated and on-disk size is limited + * to zlib compression and is sufficient for this check. + */ + if (state->nr_written && pack_size_limit_cfg && + pack_size_limit_cfg < state->offset + size) + flush_packfile_transaction(transaction); + CALLOC_ARRAY(idx, 1); prepare_packfile_transaction(transaction); hashfile_checkpoint_init(state->f, &checkpoint); - already_hashed_to = 0; - - while (1) { - prepare_packfile_transaction(transaction); - hashfile_checkpoint(state->f, &checkpoint); - idx->offset = state->offset; - crc32_begin(state->f); - - if (!stream_blob_to_pack(state, &ctx, &already_hashed_to, - fd, size, path)) - break; - /* - * Writing this object to the current pack will make - * it too big; we need to truncate it, start a new - * pack, and write into it. - */ - hashfile_truncate(state->f, &checkpoint); - state->offset = checkpoint.offset; - flush_packfile_transaction(transaction); - if (lseek(fd, seekback, SEEK_SET) == (off_t)-1) - return error("cannot seek back"); - } + hashfile_checkpoint(state->f, &checkpoint); + idx->offset = state->offset; + crc32_begin(state->f); + stream_blob_to_pack(state, &ctx, fd, size, path); git_hash_final_oid(result_oid, &ctx); idx->crc32 = crc32_end(state->f); From a4e0a8bbb7102adeb16bd66074a5bc9102a3bb46 Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:19 -0500 Subject: [PATCH 6/7] object-file: generalize packfile writes to use odb_write_stream The `index_blob_packfile_transaction()` function streams blob data directly from an fd. This makes it difficult to reuse as part of a generic transactional object writing interface. Refactor the packfile write path to operate on a `struct odb_write_stream`, allowing callers to supply data from arbitrary sources. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 54 +++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/object-file.c b/object-file.c index 0284d5434b..7fa2b9239f 100644 --- a/object-file.c +++ b/object-file.c @@ -1446,18 +1446,19 @@ static int hash_blob_stream(struct odb_write_stream *stream, } /* - * Read the contents from fd for size bytes, streaming it to the + * Read the contents from the stream provided, streaming it to the * packfile in state while updating the hash in ctx. */ static void stream_blob_to_pack(struct transaction_packfile *state, - struct git_hash_ctx *ctx, int fd, size_t size, - const char *path) + struct git_hash_ctx *ctx, size_t size, + struct odb_write_stream *stream) { git_zstream s; unsigned char ibuf[16384]; unsigned char obuf[16384]; unsigned hdrlen; int status = Z_OK; + size_t bytes_read = 0; git_deflate_init(&s, pack_compression_level); @@ -1466,23 +1467,21 @@ static void stream_blob_to_pack(struct transaction_packfile *state, s.avail_out = sizeof(obuf) - hdrlen; while (status != Z_STREAM_END) { - if (size && !s.avail_in) { - size_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf); - ssize_t read_result = read_in_full(fd, ibuf, rsize); - if (read_result < 0) - die_errno("failed to read from '%s'", path); - if ((size_t)read_result != rsize) - die("failed to read %u bytes from '%s'", - (unsigned)rsize, path); + if (!stream->is_finished && !s.avail_in) { + ssize_t rsize = odb_write_stream_read(stream, ibuf, + sizeof(ibuf)); + + if (rsize < 0) + die("failed to read blob data"); git_hash_update(ctx, ibuf, rsize); s.next_in = ibuf; s.avail_in = rsize; - size -= rsize; + bytes_read += rsize; } - status = git_deflate(&s, size ? 0 : Z_FINISH); + status = git_deflate(&s, stream->is_finished ? Z_FINISH : 0); if (!s.avail_out || status == Z_STREAM_END) { size_t written = s.next_out - obuf; @@ -1502,6 +1501,11 @@ static void stream_blob_to_pack(struct transaction_packfile *state, die("unexpected deflate failure: %d", status); } } + + if (bytes_read != size) + die("read %" PRIuMAX " bytes of blob data, but expected %" PRIuMAX " bytes", + (uintmax_t)bytes_read, (uintmax_t)size); + git_deflate_end(&s); } @@ -1573,10 +1577,13 @@ clear_exit: * binary blobs, they generally do not want to get any conversion, and * callers should avoid this code path when filters are requested. */ -static int index_blob_packfile_transaction(struct odb_transaction_files *transaction, - struct object_id *result_oid, int fd, - size_t size, const char *path) +static int index_blob_packfile_transaction(struct odb_transaction *base, + struct odb_write_stream *stream, + size_t size, struct object_id *result_oid) { + struct odb_transaction_files *transaction = container_of(base, + struct odb_transaction_files, + base); struct transaction_packfile *state = &transaction->packfile; struct git_hash_ctx ctx; unsigned char obuf[16384]; @@ -1610,7 +1617,7 @@ static int index_blob_packfile_transaction(struct odb_transaction_files *transac hashfile_checkpoint(state->f, &checkpoint); idx->offset = state->offset; crc32_begin(state->f); - stream_blob_to_pack(state, &ctx, fd, size, path); + stream_blob_to_pack(state, &ctx, size, stream); git_hash_final_oid(result_oid, &ctx); idx->crc32 = crc32_end(state->f); @@ -1654,15 +1661,12 @@ int index_fd(struct index_state *istate, struct object_id *oid, if (flags & INDEX_WRITE_OBJECT) { struct object_database *odb = the_repository->objects; - struct odb_transaction_files *files_transaction; - struct odb_transaction *transaction; + struct odb_transaction *transaction = odb_transaction_begin(odb); - transaction = odb_transaction_begin(odb); - files_transaction = container_of(odb->transaction, - struct odb_transaction_files, - base); - ret = index_blob_packfile_transaction(files_transaction, oid, fd, - xsize_t(st->st_size), path); + ret = index_blob_packfile_transaction(odb->transaction, + &stream, + xsize_t(st->st_size), + oid); odb_transaction_commit(transaction); } else { ret = hash_blob_stream(&stream, From ddf6aee9c68e7cfb3e5e21421b1d7798a67534bb Mon Sep 17 00:00:00 2001 From: Justin Tobler Date: Thu, 2 Apr 2026 16:32:20 -0500 Subject: [PATCH 7/7] odb/transaction: make `write_object_stream()` pluggable How an ODB transaction handles writing objects is expected to vary between implementations. Introduce a new `write_object_stream()` callback in `struct odb_transaction` to make this function pluggable. Rename `index_blob_packfile_transaction()` to `odb_transaction_files_write_object_stream()` and wire it up for use with `struct odb_transaction_files` accordingly. Signed-off-by: Justin Tobler Signed-off-by: Junio C Hamano --- object-file.c | 16 +++++++++------- odb/transaction.c | 7 +++++++ odb/transaction.h | 25 ++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/object-file.c b/object-file.c index 7fa2b9239f..65356998f3 100644 --- a/object-file.c +++ b/object-file.c @@ -1577,9 +1577,10 @@ clear_exit: * binary blobs, they generally do not want to get any conversion, and * callers should avoid this code path when filters are requested. */ -static int index_blob_packfile_transaction(struct odb_transaction *base, - struct odb_write_stream *stream, - size_t size, struct object_id *result_oid) +static int odb_transaction_files_write_object_stream(struct odb_transaction *base, + struct odb_write_stream *stream, + size_t size, + struct object_id *result_oid) { struct odb_transaction_files *transaction = container_of(base, struct odb_transaction_files, @@ -1663,10 +1664,10 @@ int index_fd(struct index_state *istate, struct object_id *oid, struct object_database *odb = the_repository->objects; struct odb_transaction *transaction = odb_transaction_begin(odb); - ret = index_blob_packfile_transaction(odb->transaction, - &stream, - xsize_t(st->st_size), - oid); + ret = odb_transaction_write_object_stream(odb->transaction, + &stream, + xsize_t(st->st_size), + oid); odb_transaction_commit(transaction); } else { ret = hash_blob_stream(&stream, @@ -2131,6 +2132,7 @@ struct odb_transaction *odb_transaction_files_begin(struct odb_source *source) transaction = xcalloc(1, sizeof(*transaction)); transaction->base.source = source; transaction->base.commit = odb_transaction_files_commit; + transaction->base.write_object_stream = odb_transaction_files_write_object_stream; return &transaction->base; } diff --git a/odb/transaction.c b/odb/transaction.c index 592ac84075..b16e07aebf 100644 --- a/odb/transaction.c +++ b/odb/transaction.c @@ -26,3 +26,10 @@ void odb_transaction_commit(struct odb_transaction *transaction) transaction->source->odb->transaction = NULL; free(transaction); } + +int odb_transaction_write_object_stream(struct odb_transaction *transaction, + struct odb_write_stream *stream, + size_t len, struct object_id *oid) +{ + return transaction->write_object_stream(transaction, stream, len, oid); +} diff --git a/odb/transaction.h b/odb/transaction.h index a56e392f21..854fda06f5 100644 --- a/odb/transaction.h +++ b/odb/transaction.h @@ -12,14 +12,24 @@ * * Each ODB source is expected to implement its own transaction handling. */ -struct odb_transaction; -typedef void (*odb_transaction_commit_fn)(struct odb_transaction *transaction); struct odb_transaction { /* The ODB source the transaction is opened against. */ struct odb_source *source; /* The ODB source specific callback invoked to commit a transaction. */ - odb_transaction_commit_fn commit; + void (*commit)(struct odb_transaction *transaction); + + /* + * This callback is expected to write the given object stream into + * the ODB transaction. Note that for now, only blobs support streaming. + * + * The resulting object ID shall be written into the out pointer. The + * callback is expected to return 0 on success, a negative error code + * otherwise. + */ + int (*write_object_stream)(struct odb_transaction *transaction, + struct odb_write_stream *stream, size_t len, + struct object_id *oid); }; /* @@ -35,4 +45,13 @@ struct odb_transaction *odb_transaction_begin(struct object_database *odb); */ void odb_transaction_commit(struct odb_transaction *transaction); +/* + * Writes the object in the provided stream into the transaction. The resulting + * object ID is written into the out pointer. Returns 0 on success, a negative + * error code otherwise. + */ +int odb_transaction_write_object_stream(struct odb_transaction *transaction, + struct odb_write_stream *stream, + size_t len, struct object_id *oid); + #endif