Skip to content

Commit 252cfb7

Browse files
derrickstolee authored and gitster committed
maintenance: add loose-objects task
One goal of background maintenance jobs is to allow a user to disable auto-gc (gc.auto=0) but keep their repository in a clean state. Without any cleanup, loose objects will clutter the object database and slow operations. In addition, the loose objects will take up extra space because they are not stored with deltas against similar objects.

Create a 'loose-objects' task for the 'git maintenance run' command. This helps clean up loose objects without disrupting concurrent Git commands using the following sequence of events:

1. Run 'git prune-packed' to delete any loose objects that exist in a pack-file. Concurrent commands will prefer the packed version of the object to the loose version. (Of course, there are exceptions for commands that specifically care about the location of an object. These are rare for a user to run on purpose, and we hope a user that has selected background maintenance will not be trying to do foreground maintenance.)

2. Run 'git pack-objects' on a batch of loose objects. These objects are grouped by scanning the loose object directories in lexicographic order until listing all loose objects -or- reaching 50,000 objects. This is more than enough if the loose objects are created only by a user doing normal development. We noticed users with _millions_ of loose objects because VFS for Git downloads blobs on-demand when a file read operation requires populating a virtual file.

This step is based on a similar step in Scalar [1] and VFS for Git.

[1] https://github.com/microsoft/scalar/blob/master/Scalar.Common/Maintenance/LooseObjectsStep.cs

Signed-off-by: Derrick Stolee <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 28cb5e6 commit 252cfb7

File tree

3 files changed

+151
-0
lines changed

3 files changed

+151
-0
lines changed

Documentation/git-maintenance.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,21 @@ gc::
7070
be disruptive in some situations, as it deletes stale data. See
7171
linkgit:git-gc[1] for more details on garbage collection in Git.
7272

73+
loose-objects::
74+
The `loose-objects` job cleans up loose objects and places them into
75+
pack-files. In order to prevent race conditions with concurrent Git
76+
commands, it follows a two-step process. First, it deletes any loose
77+
objects that already exist in a pack-file; concurrent Git processes
78+
will examine the pack-file for the object data instead of the loose
79+
object. Second, it creates a new pack-file (starting with "loose-")
80+
containing a batch of loose objects. The batch size is limited to 50
81+
thousand objects to prevent the job from taking too long on a
82+
repository with many loose objects. The `gc` task writes unreachable
83+
objects as loose objects to be cleaned up by a later step only if
84+
they are not re-added to a pack-file; for this reason it is not
85+
advisable to enable both the `loose-objects` and `gc` tasks at the
86+
same time.
87+
7388
OPTIONS
7489
-------
7590
--auto::

builtin/gc.c

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -880,6 +880,98 @@ static int maintenance_task_gc(struct maintenance_run_opts *opts)
880880
return run_command(&child);
881881
}
882882

883+
static int prune_packed(struct maintenance_run_opts *opts)
884+
{
885+
struct child_process child = CHILD_PROCESS_INIT;
886+
887+
child.git_cmd = 1;
888+
strvec_push(&child.args, "prune-packed");
889+
890+
if (opts->quiet)
891+
strvec_push(&child.args, "--quiet");
892+
893+
return !!run_command(&child);
894+
}
895+
896+
/*
 * State handed to write_loose_object_to_stdin() while object IDs are
 * being streamed to a 'git pack-objects' child process.
 */
struct write_loose_object_data {
	FILE *in;	/* stream the object IDs are written to */
	int count;	/* number of object IDs written so far */
	int batch_size;	/* limit on the number of IDs written in one run */
};
901+
902+
/*
 * Loose-object callback that always returns 1 without looking at its
 * arguments. pack_loose() uses it to find out whether at least one
 * loose object exists.
 */
static int bail_on_loose(const struct object_id *oid,
			 const char *path,
			 void *data)
{
	return 1;
}
908+
909+
static int write_loose_object_to_stdin(const struct object_id *oid,
910+
const char *path,
911+
void *data)
912+
{
913+
struct write_loose_object_data *d = (struct write_loose_object_data *)data;
914+
915+
fprintf(d->in, "%s\n", oid_to_hex(oid));
916+
917+
return ++(d->count) > d->batch_size;
918+
}
919+
920+
static int pack_loose(struct maintenance_run_opts *opts)
921+
{
922+
struct repository *r = the_repository;
923+
int result = 0;
924+
struct write_loose_object_data data;
925+
struct child_process pack_proc = CHILD_PROCESS_INIT;
926+
927+
/*
928+
* Do not start pack-objects process
929+
* if there are no loose objects.
930+
*/
931+
if (!for_each_loose_file_in_objdir(r->objects->odb->path,
932+
bail_on_loose,
933+
NULL, NULL, NULL))
934+
return 0;
935+
936+
pack_proc.git_cmd = 1;
937+
938+
strvec_push(&pack_proc.args, "pack-objects");
939+
if (opts->quiet)
940+
strvec_push(&pack_proc.args, "--quiet");
941+
strvec_pushf(&pack_proc.args, "%s/pack/loose", r->objects->odb->path);
942+
943+
pack_proc.in = -1;
944+
945+
if (start_command(&pack_proc)) {
946+
error(_("failed to start 'git pack-objects' process"));
947+
return 1;
948+
}
949+
950+
data.in = xfdopen(pack_proc.in, "w");
951+
data.count = 0;
952+
data.batch_size = 50000;
953+
954+
for_each_loose_file_in_objdir(r->objects->odb->path,
955+
write_loose_object_to_stdin,
956+
NULL,
957+
NULL,
958+
&data);
959+
960+
fclose(data.in);
961+
962+
if (finish_command(&pack_proc)) {
963+
error(_("failed to finish 'git pack-objects' process"));
964+
result = 1;
965+
}
966+
967+
return result;
968+
}
969+
970+
/*
 * The 'loose-objects' maintenance task: run prune_packed() and then
 * pack_loose(). The packing step is skipped when pruning fails.
 *
 * Returns 0 if both steps succeeded and 1 otherwise.
 */
static int maintenance_task_loose_objects(struct maintenance_run_opts *opts)
{
	if (prune_packed(opts))
		return 1;

	return pack_loose(opts) ? 1 : 0;
}
974+
883975
/*
 * Function type matching the maintenance task implementations such as
 * maintenance_task_loose_objects(): takes the run options and returns
 * an int status (that task returns 0 on success, non-zero on failure).
 */
typedef int maintenance_task_fn(struct maintenance_run_opts *opts);
884976

885977
/*
@@ -901,6 +993,7 @@ struct maintenance_task {
901993

902994
enum maintenance_task_label {
903995
TASK_PREFETCH,
996+
TASK_LOOSE_OBJECTS,
904997
TASK_GC,
905998
TASK_COMMIT_GRAPH,
906999

@@ -913,6 +1006,10 @@ static struct maintenance_task tasks[] = {
9131006
"prefetch",
9141007
maintenance_task_prefetch,
9151008
},
1009+
[TASK_LOOSE_OBJECTS] = {
1010+
"loose-objects",
1011+
maintenance_task_loose_objects,
1012+
},
9161013
[TASK_GC] = {
9171014
"gc",
9181015
maintenance_task_gc,

t/t7900-maintenance.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,43 @@ test_expect_success 'prefetch multiple remotes' '
8888
test_cmp_rev refs/remotes/remote2/two refs/prefetch/remote2/two
8989
'
9090

91+
test_expect_success 'loose-objects task' '
	# Start from a fully packed repository so that the state of the
	# object directory is known.
	git repack -adk &&

	# Hold the maintenance lock so that "git commit" cannot kick off
	# maintenance on its own.
	echo in use >.git/objects/maintenance.lock &&

	# "git commit" is expected to leave at least one loose object behind.
	test_commit create-loose-object &&
	rm .git/objects/maintenance.lock &&

	ls .git/objects >obj-dir-before &&
	test_file_not_empty obj-dir-before &&
	ls .git/objects/pack/*.pack >packs-before &&
	test_line_count = 1 packs-before &&

	# First run: a new "loose-" pack-file appears, while the loose
	# objects themselves stay where they are.
	git maintenance run --task=loose-objects &&
	ls .git/objects >obj-dir-between &&
	test_cmp obj-dir-before obj-dir-between &&
	ls .git/objects/pack/*.pack >packs-between &&
	test_line_count = 2 packs-between &&
	ls .git/objects/pack/loose-*.pack >loose-packs &&
	test_line_count = 1 loose-packs &&

	# Second run: the now-packed loose objects are deleted and no
	# further pack-file is written.
	git maintenance run --task=loose-objects &&
	ls .git/objects >obj-dir-after &&
	cat >expect <<-\EOF &&
	info
	pack
	EOF
	test_cmp expect obj-dir-after &&
	ls .git/objects/pack/*.pack >packs-after &&
	test_cmp packs-between packs-after
'
129+
91130
test_done

0 commit comments

Comments
 (0)