Skip to content

Commit 321b65d

Browse files
derrickstolee and Git for Windows Build Agent
authored and committed
survey: summarize total sizes by object type
Now that we have explored objects by count, we can expand on that to summarize the on-disk and inflated sizes of those objects. This information is helpful for diagnosing both why disk space (and perhaps clone or fetch times) is growing and why certain operations are slow because the inflated size of the abstract objects that must be processed is so large. Note: zlib-ng is slightly more efficient even at those small sizes, and even between zlib versions there are slight differences in compression. To accommodate that, the tests validate rough approximations rather than the exact numbers (the test should validate `git survey`, after all, not zlib). Signed-off-by: Derrick Stolee <[email protected]> Signed-off-by: Johannes Schindelin <[email protected]>
1 parent df6f9d3 commit 321b65d

File tree

2 files changed

+169
-1
lines changed

2 files changed

+169
-1
lines changed

builtin/survey.c

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,36 @@ struct survey_report_object_summary {
6060
size_t blobs_nr;
6161
};
6262

63+
/**
64+
* For some category given by 'label', count the number of objects
65+
* that match that label along with the on-disk size and the size
66+
* after decompressing (both with delta bases and zlib).
67+
*/
68+
struct survey_report_object_size_summary {
69+
/* Name of the category this entry summarizes; printed in the first table column. */
char *label;
70+
/* Number of objects whose info was read successfully. */
size_t nr;
71+
/* Sum of the on-disk sizes reported for those objects. */
size_t disk_size;
72+
/* Sum of the inflated object lengths reported for those objects. */
size_t inflated_size;
73+
/* Number of objects whose info lookup failed; not part of the other totals. */
size_t num_missing;
74+
};
75+
6376
/**
6477
* This struct contains all of the information that needs to be printed
6578
* at the end of the exploration of the repository and its references.
6679
*/
6780
struct survey_report {
6881
struct survey_report_ref_summary refs;
6982
struct survey_report_object_summary reachable_objects;
83+
84+
/*
 * Size totals per object type: an array of REPORT_TYPE_COUNT entries
 * indexed by the REPORT_TYPE_* constants, allocated and labeled in
 * initialize_report().
 */
struct survey_report_object_size_summary *by_type;
7085
};
7186

87+
/*
 * Indices into survey_report.by_type, one per object type that is
 * summarized. REPORT_TYPE_COUNT is the number of entries and is used
 * both to size the array and as the row count when printing it.
 */
#define REPORT_TYPE_COMMIT 0
88+
#define REPORT_TYPE_TREE 1
89+
#define REPORT_TYPE_BLOB 2
90+
#define REPORT_TYPE_TAG 3
91+
#define REPORT_TYPE_COUNT 4
92+
7293
struct survey_context {
7394
struct repository *repo;
7495

@@ -280,12 +301,48 @@ static void survey_report_plaintext_reachable_object_summary(struct survey_conte
280301
clear_table(&table);
281302
}
282303

304+
/*
 * Print a plaintext table named 'title' with one row for each of the
 * 'summary_nr' entries in 'summary'.
 *
 * The first column is headed by 'categories' and holds each entry's
 * label; the remaining columns ("Count", "Disk Size", "Inflated Size")
 * hold the entry's nr, disk_size and inflated_size as decimal numbers.
 * The num_missing field is not printed.
 */
static void survey_report_object_sizes(const char *title,
305+
const char *categories,
306+
struct survey_report_object_size_summary *summary,
307+
size_t summary_nr)
308+
{
309+
struct survey_table table = SURVEY_TABLE_INIT;
310+
table.table_name = title;
311+
312+
strvec_push(&table.header, categories);
313+
strvec_push(&table.header, _("Count"));
314+
strvec_push(&table.header, _("Disk Size"));
315+
strvec_push(&table.header, _("Inflated Size"));
316+
317+
for (size_t i = 0; i < summary_nr; i++) {
318+
char *label_str = xstrdup(summary[i].label);
319+
/* size_t values are widened to uintmax_t to match PRIuMAX. */
char *nr_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].nr);
320+
char *disk_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].disk_size);
321+
char *inflate_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].inflated_size);
322+
323+
/* The trailing NULL terminates the variadic list of cell strings. */
insert_table_rowv(&table, label_str, nr_str,
324+
disk_str, inflate_str, NULL);
325+
326+
/*
 * NOTE(review): the cell strings are released right after the row
 * is inserted, so insert_table_rowv() presumably copies them --
 * confirm against its definition.
 */
free(label_str);
327+
free(nr_str);
328+
free(disk_str);
329+
free(inflate_str);
330+
}
331+
332+
print_table_plaintext(&table);
333+
clear_table(&table);
334+
}
335+
283336
/*
 * Write the plaintext report to stdout: a banner naming the
 * repository's worktree, then the refs summary, the reachable object
 * summary, and finally the per-type object size table built from
 * ctx->report.by_type (REPORT_TYPE_COUNT rows).
 */
static void survey_report_plaintext(struct survey_context *ctx)
284337
{
285338
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
286339
printf("-----------------------------------------------------\n");
287340
survey_report_plaintext_refs(ctx);
288341
survey_report_plaintext_reachable_object_summary(ctx);
342+
survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"),
343+
_("Object Type"),
344+
ctx->report.by_type,
345+
REPORT_TYPE_COUNT);
289346
}
290347

291348
/*
@@ -499,6 +556,69 @@ static void increment_object_counts(
499556
}
500557
}
501558

559+
/*
 * For every object ID in 'oids', ask the object database of ctx->repo
 * for the object's inflated length and on-disk size, and accumulate
 * them into 'summary'.
 *
 * A successful lookup bumps summary->nr and adds to disk_size and
 * inflated_size. A failed lookup (negative return value) only bumps
 * summary->num_missing. 'summary' is added to, never reset, so the
 * caller decides what starting values it holds.
 */
static void increment_totals(struct survey_context *ctx,
560+
struct oid_array *oids,
561+
struct survey_report_object_size_summary *summary)
562+
{
563+
for (size_t i = 0; i < oids->nr; i++) {
564+
struct object_info oi = OBJECT_INFO_INIT;
565+
/*
 * NOTE(review): presumably this flag keeps the lookup from lazily
 * fetching missing objects -- confirm against the flag's definition.
 */
unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH;
566+
unsigned long object_length = 0;
567+
/* Despite the 'p' suffix this is the value itself, not a pointer. */
off_t disk_sizep = 0;
568+
/* NOTE(review): filled in via oi.typep but never read in this function. */
enum object_type type;
569+
570+
/* Point the request at the locals that should receive the results. */
oi.typep = &type;
571+
oi.sizep = &object_length;
572+
oi.disk_sizep = &disk_sizep;
573+
574+
if (odb_read_object_info_extended(ctx->repo->objects,
575+
&oids->oid[i],
576+
&oi, oi_flags) < 0) {
577+
summary->num_missing++;
578+
} else {
579+
summary->nr++;
580+
summary->disk_size += disk_sizep;
581+
summary->inflated_size += object_length;
582+
}
583+
}
584+
}
585+
586+
/*
 * Compute size totals for the batch of object IDs in 'oids' and add
 * them to the entry of ctx->report.by_type that corresponds to 'type'.
 *
 * Only OBJ_COMMIT, OBJ_TREE, OBJ_BLOB and OBJ_TAG are accepted; any
 * other value reaches BUG(). The whole batch is attributed to the
 * single 'type' passed in.
 */
static void increment_object_totals(struct survey_context *ctx,
587+
struct oid_array *oids,
588+
enum object_type type)
589+
{
590+
struct survey_report_object_size_summary *total;
591+
/* Zeroed scratch summary that collects this batch before merging. */
struct survey_report_object_size_summary summary = { 0 };
592+
593+
/* The lookups run before 'type' is validated by the switch below. */
increment_totals(ctx, oids, &summary);
594+
595+
switch (type) {
596+
case OBJ_COMMIT:
597+
total = &ctx->report.by_type[REPORT_TYPE_COMMIT];
598+
break;
599+
600+
case OBJ_TREE:
601+
total = &ctx->report.by_type[REPORT_TYPE_TREE];
602+
break;
603+
604+
case OBJ_BLOB:
605+
total = &ctx->report.by_type[REPORT_TYPE_BLOB];
606+
break;
607+
608+
case OBJ_TAG:
609+
total = &ctx->report.by_type[REPORT_TYPE_TAG];
610+
break;
611+
612+
default:
613+
/*
 * NOTE(review): 'total' is left unset on this path, so this relies
 * on BUG() not returning -- confirm.
 */
BUG("No other type allowed");
614+
}
615+
616+
/* Merge the batch into the running per-type totals. */
total->nr += summary.nr;
617+
total->disk_size += summary.disk_size;
618+
total->inflated_size += summary.inflated_size;
619+
total->num_missing += summary.num_missing;
620+
}
621+
502622
static int survey_objects_path_walk_fn(const char *path UNUSED,
503623
struct oid_array *oids,
504624
enum object_type type,
@@ -508,10 +628,20 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
508628

509629
increment_object_counts(&ctx->report.reachable_objects,
510630
type, oids->nr);
631+
increment_object_totals(ctx, oids, type);
511632

512633
return 0;
513634
}
514635

636+
/*
 * Allocate ctx->report.by_type with REPORT_TYPE_COUNT entries and give
 * each entry an xstrdup()'d copy of its translated label ("Commits",
 * "Trees", "Blobs", "Tags").
 *
 * NOTE(review): the counters are never set explicitly here, so
 * CALLOC_ARRAY presumably zero-fills the array -- confirm. No matching
 * release of the array or its labels is visible in this excerpt.
 */
static void initialize_report(struct survey_context *ctx)
637+
{
638+
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
639+
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
640+
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
641+
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
642+
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
643+
}
644+
515645
static void survey_phase_objects(struct survey_context *ctx)
516646
{
517647
struct rev_info revs = REV_INFO_INIT;
@@ -524,12 +654,15 @@ static void survey_phase_objects(struct survey_context *ctx)
524654
info.path_fn = survey_objects_path_walk_fn;
525655
info.path_fn_data = ctx;
526656

657+
initialize_report(ctx);
658+
527659
repo_init_revisions(ctx->repo, &revs, "");
528660
revs.tag_objects = 1;
529661

530662
for (int i = 0; i < ctx->ref_array.nr; i++) {
531663
struct ref_array_item *item = ctx->ref_array.items[i];
532664
add_pending_oid(&revs, NULL, &item->objectname, add_flags);
665+
display_progress(ctx->progress, ++(ctx->progress_nr));
533666
}
534667

535668
walk_objects_by_path(&info);

t/t8100-git-survey.sh

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,35 @@ test_expect_success 'create a semi-interesting repo' '
2525
git update-ref -d refs/tags/two
2626
'
2727

28+
# approximate_sizes [<file>...]
#
# Filter that runs sed (extended regex mode) over the given files and
# replaces numbers falling in a handful of hard-coded ranges with a
# coarse "~X.YkB" label, so that expected output does not depend on
# exact byte counts. A number is only rewritten when it is followed by
# a space or the end of the line; any spaces directly before it are
# collapsed into a single space. The expressions are applied in order,
# so where two ranges overlap the earlier one wins.
#
# NOTE(review): the patterns are not anchored on the left, so they
# could also match the tail of a longer digit run -- presumably fine
# for the small repositories used in this test; confirm if reused.
approximate_sizes() {
29+
# very simplistic approximate rounding
30+
sed -Ee "s/ *(1[0-9][0-9])( |$)/ ~0.1kB\2/g" \
31+
-e "s/ *(4[6-9][0-9]|5[0-6][0-9])( |$)/ ~0.5kB\2/g" \
32+
-e "s/ *(5[6-9][0-9]|6[0-6][0-9])( |$)/ ~0.6kB\2/g" \
33+
-e "s/ *1(4[89][0-9]|5[0-8][0-9])( |$)/ ~1.5kB\2/g" \
34+
-e "s/ *1(69[0-9]|7[0-9][0-9])( |$)/ ~1.7kB\2/g" \
35+
-e "s/ *1(79[0-9]|8[0-9][0-9])( |$)/ ~1.8kB\2/g" \
36+
-e "s/ *2(1[0-9][0-9]|20[0-1])( |$)/ ~2.1kB\2/g" \
37+
-e "s/ *2(3[0-9][0-9]|4[0-1][0-9])( |$)/ ~2.3kB\2/g" \
38+
-e "s/ *2(5[0-9][0-9]|6[0-1][0-9])( |$)/ ~2.5kB\2/g" \
39+
"$@"
40+
}
41+
2842
test_expect_success 'git survey (default)' '
2943
git survey --all-refs >out 2>err &&
3044
test_line_count = 0 err &&
3145
46+
test_oid_cache <<-EOF &&
47+
commits_sizes sha1:~1.5kB | ~2.1kB
48+
commits_sizes sha256:~1.8kB | ~2.5kB
49+
trees_sizes sha1:~0.5kB | ~1.7kB
50+
trees_sizes sha256:~0.6kB | ~2.3kB
51+
blobs_sizes sha1:~0.1kB | ~0.1kB
52+
blobs_sizes sha256:~0.1kB | ~0.1kB
53+
tags_sizes sha1:~0.5kB | ~0.5kB
54+
tags_sizes sha256:~0.5kB | ~0.6kB
55+
EOF
56+
3257
tr , " " >expect <<-EOF &&
3358
GIT SURVEY for "$(pwd)"
3459
-----------------------------------------------------
@@ -50,9 +75,19 @@ test_expect_success 'git survey (default)' '
5075
Commits | 10
5176
Trees | 10
5277
Blobs | 10
78+
79+
TOTAL OBJECT SIZES BY TYPE
80+
===============================================
81+
Object Type | Count | Disk Size | Inflated Size
82+
------------+-------+-----------+--------------
83+
Commits | 10 | $(test_oid commits_sizes)
84+
Trees | 10 | $(test_oid trees_sizes)
85+
Blobs | 10 | $(test_oid blobs_sizes)
86+
Tags | 4 | $(test_oid tags_sizes)
5387
EOF
5488
55-
test_cmp expect out
89+
approximate_sizes out >out-edited &&
90+
test_cmp expect out-edited
5691
'
5792

5893
test_done

0 commit comments

Comments
 (0)