Skip to content

Commit fbd0935

Browse files
jeffhostetlerdscho
authored andcommitted
survey: add vector of largest objects for various scaling dimensions
Create `struct large_item` and `struct large_item_vec` to capture the n largest commits, trees, and blobs under various scaling dimensions, such as size in bytes, number of commit parents, or number of entries in a tree. Each of these have a command line option to set them independently. Signed-off-by: Jeff Hostetler <[email protected]>
1 parent 549e0b2 commit fbd0935

File tree

3 files changed

+276
-6
lines changed

3 files changed

+276
-6
lines changed

Documentation/config/survey.adoc

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,33 @@ survey.*::
1111
top::
1212
This integer value implies `--top=<N>`, specifying the
1313
number of entries in the detail tables.
14+
showBlobSizes::
15+
A non-negative integer value. Requests details on the
16+
<n> largest file blobs by size in bytes. Provides a
17+
default value for `--blob-sizes=<n>` in
18+
linkgit:git-survey[1].
19+
showCommitParents::
20+
A non-negative integer value. Requests details on the
21+
<n> commits with the most number of parents. Provides a
22+
default value for `--commit-parents=<n>` in
23+
linkgit:git-survey[1].
24+
showCommitSizes::
25+
A non-negative integer value. Requests details on the
26+
<n> largest commits by size in bytes. Generally, these
27+
are the commits with the largest commit messages.
28+
Provides a default value for `--commit-sizes=<n>` in
29+
linkgit:git-survey[1].
30+
showTreeEntries::
31+
A non-negative integer value. Requests details on the
32+
<n> trees (directories) with the most number of entries
33+
(files and subdirectories). Provides a default value
34+
for `--tree-entries=<n>` in linkgit:git-survey[1].
35+
showTreeSizes::
36+
A non-negative integer value. Requests details on the
37+
<n> largest trees (directories) by size in bytes. This
38+
set will usually be equal to the
39+
`survey.showTreeEntries` set, but may be skewed by very
40+
long file or subdirectory entry names. Provides a
41+
default value for `--tree-sizes=<n>` in
42+
linkgit:git-survey[1].
1443
--

Documentation/git-survey.adoc

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,32 @@ only refs for the given options are added.
5959
--other::
6060
Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set.
6161

62+
Large Item Selection
63+
~~~~~~~~~~~~~~~~~~~~
64+
65+
The following options control the optional display of large items under
66+
various dimensions of scale. The OIDs of the largest `n` objects will be
67+
displayed in reverse sorted order. For each, `n` defaults to 10.
68+
69+
--commit-parents=<n>::
70+
Shows the OIDs of the commits with the most parent commits.
71+
72+
--commit-sizes=<n>::
73+
Shows the OIDs of the largest commits by size in bytes. These are
74+
usually the ones with the largest commit messages.
75+
76+
--tree-entries=<n>::
77+
Shows the OIDs of the trees with the most number of entries. These
78+
are the directories with the most number of files or subdirectories.
79+
80+
--tree-sizes=<n>::
81+
Shows the OIDs of the largest trees by size in bytes. This set
82+
will usually be the same as the vector of number of entries unless
83+
skewed by very long entry names.
84+
85+
--blob-sizes=<n>::
86+
Shows the OIDs of the largest blobs by size in bytes.
87+
6288
OUTPUT
6389
------
6490

@@ -78,6 +104,11 @@ Reachable Object Summary
78104
The reachable object summary shows the total number of each kind of Git
79105
object, including tags, commits, trees, and blobs.
80106

107+
CONFIGURATION
108+
-------------
109+
110+
include::config/survey.adoc[]
111+
81112
GIT
82113
---
83114
Part of the linkgit:git[1] suite

builtin/survey.c

Lines changed: 216 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,15 @@ static struct survey_refs_wanted default_ref_options = {
4141
struct survey_opts {
4242
int verbose;
4343
int show_progress;
44+
45+
int show_largest_commits_by_nr_parents;
46+
int show_largest_commits_by_size_bytes;
47+
48+
int show_largest_trees_by_nr_entries;
49+
int show_largest_trees_by_size_bytes;
50+
51+
int show_largest_blobs_by_size_bytes;
52+
4453
int top_nr;
4554
struct survey_refs_wanted refs;
4655
};
@@ -138,6 +147,87 @@ static void incr_obj_hist_bin(struct obj_hist_bin *pbin,
138147
pbin->cnt_seen++;
139148
}
140149

150+
/*
151+
* Remember the largest n objects for some scaling dimension. This
152+
* could be the observed object size or number of entries in a tree.
153+
* We'll use this to generate a sorted vector in the output for that
154+
* dimension.
155+
*/
156+
struct large_item {
157+
uint64_t size;
158+
struct object_id oid;
159+
};
160+
161+
struct large_item_vec {
162+
char *dimension_label;
163+
char *item_label;
164+
uint64_t nr_items;
165+
struct large_item items[FLEX_ARRAY]; /* nr_items */
166+
};
167+
168+
static struct large_item_vec *alloc_large_item_vec(const char *dimension_label,
169+
const char *item_label,
170+
uint64_t nr_items)
171+
{
172+
struct large_item_vec *vec;
173+
size_t flex_len = nr_items * sizeof(struct large_item);
174+
175+
if (!nr_items)
176+
return NULL;
177+
178+
vec = xcalloc(1, (sizeof(struct large_item_vec) + flex_len));
179+
vec->dimension_label = strdup(dimension_label);
180+
vec->item_label = strdup(item_label);
181+
vec->nr_items = nr_items;
182+
183+
return vec;
184+
}
185+
186+
static void free_large_item_vec(struct large_item_vec *vec)
187+
{
188+
if (!vec)
189+
return;
190+
191+
free(vec->dimension_label);
192+
free(vec->item_label);
193+
free(vec);
194+
}
195+
196+
static void maybe_insert_large_item(struct large_item_vec *vec,
197+
uint64_t size,
198+
struct object_id *oid)
199+
{
200+
size_t rest_len;
201+
size_t k;
202+
203+
if (!vec || !vec->nr_items)
204+
return;
205+
206+
/*
207+
* Since the odds an object being among the largest n
208+
* is small, shortcut and see if it is smaller than
209+
* the smallest one in our set and quickly reject it.
210+
*/
211+
if (size < vec->items[vec->nr_items - 1].size)
212+
return;
213+
214+
for (k = 0; k < vec->nr_items; k++) {
215+
if (size < vec->items[k].size)
216+
continue;
217+
218+
/* push items[k..] down one and insert it here */
219+
220+
rest_len = (vec->nr_items - k - 1) * sizeof(struct large_item);
221+
if (rest_len)
222+
memmove(&vec->items[k + 1], &vec->items[k], rest_len);
223+
224+
memset(&vec->items[k], 0, sizeof(struct large_item));
225+
vec->items[k].size = size;
226+
oidcpy(&vec->items[k].oid, oid);
227+
return;
228+
}
229+
}
230+
141231
/*
142232
* Common fields for any type of object.
143233
*/
@@ -183,6 +273,9 @@ struct survey_stats_commits {
183273
* Count of commits with k parents.
184274
*/
185275
uint32_t parent_cnt_pbin[PBIN_VEC_LEN];
276+
277+
struct large_item_vec *vec_largest_by_nr_parents;
278+
struct large_item_vec *vec_largest_by_size_bytes;
186279
};
187280

188281
/*
@@ -192,11 +285,18 @@ struct survey_stats_trees {
192285
struct survey_stats_base_object base;
193286

194287
/*
195-
* In the following, nr_entries refers to the number of files or
196-
* subdirectories in a tree. We are interested in how wide the
197-
* tree is and if the repo has gigantic directories.
288+
* Keep a vector of the trees with the most number of entries.
289+
* This gives us a feel for the width of a tree when there are
290+
* gigantic directories.
198291
*/
199-
uint64_t max_entries; /* max(nr_entries) -- the width of the largest tree */
292+
struct large_item_vec *vec_largest_by_nr_entries;
293+
294+
/*
295+
* Keep a vector of the trees with the largest size in bytes.
296+
* The contents of this may or may not match items in the other
297+
* vector, since entryname length can alter the results.
298+
*/
299+
struct large_item_vec *vec_largest_by_size_bytes;
200300

201301
/*
202302
* Computing the sum of the number of entries across all trees
@@ -216,6 +316,11 @@ struct survey_stats_trees {
216316
*/
217317
struct survey_stats_blobs {
218318
struct survey_stats_base_object base;
319+
320+
/*
321+
* Remember the OIDs of the largest n blobs.
322+
*/
323+
struct large_item_vec *vec_largest_by_size_bytes;
219324
};
220325

221326
struct survey_report_object_summary {
@@ -396,6 +501,12 @@ struct survey_context {
396501

397502
static void clear_survey_context(struct survey_context *ctx)
398503
{
504+
free_large_item_vec(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents);
505+
free_large_item_vec(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes);
506+
free_large_item_vec(ctx->report.reachable_objects.trees.vec_largest_by_nr_entries);
507+
free_large_item_vec(ctx->report.reachable_objects.trees.vec_largest_by_size_bytes);
508+
free_large_item_vec(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes);
509+
399510
ref_array_clear(&ctx->ref_array);
400511
strvec_clear(&ctx->refs);
401512
}
@@ -608,6 +719,32 @@ static void survey_report_commit_parents(struct survey_context *ctx)
608719
clear_table(&table);
609720
}
610721

722+
static void survey_report_largest_vec(struct large_item_vec *vec)
723+
{
724+
struct survey_table table = SURVEY_TABLE_INIT;
725+
struct strbuf size = STRBUF_INIT;
726+
727+
if (!vec || !vec->nr_items)
728+
return;
729+
730+
table.table_name = vec->dimension_label;
731+
strvec_pushl(&table.header, "Size", "OID", NULL);
732+
733+
for (size_t k = 0; k < vec->nr_items; k++) {
734+
struct large_item *pk = &vec->items[k];
735+
if (!is_null_oid(&pk->oid)) {
736+
strbuf_reset(&size);
737+
strbuf_addf(&size, "%"PRIuMAX, (uintmax_t)pk->size);
738+
739+
insert_table_rowv(&table, size.buf, oid_to_hex(&pk->oid), NULL);
740+
}
741+
}
742+
strbuf_release(&size);
743+
744+
print_table_plaintext(&table);
745+
clear_table(&table);
746+
}
747+
611748
static void survey_report_plaintext_refs(struct survey_context *ctx)
612749
{
613750
struct survey_report_ref_summary *refs = &ctx->report.refs;
@@ -787,6 +924,12 @@ static void survey_report_plaintext(struct survey_context *ctx)
787924
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
788925
survey_report_plaintext_sorted_size(
789926
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
927+
928+
survey_report_largest_vec(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents);
929+
survey_report_largest_vec(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes);
930+
survey_report_largest_vec(ctx->report.reachable_objects.trees.vec_largest_by_nr_entries);
931+
survey_report_largest_vec(ctx->report.reachable_objects.trees.vec_largest_by_size_bytes);
932+
survey_report_largest_vec(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes);
790933
}
791934

792935
/*
@@ -858,6 +1001,27 @@ static int survey_load_config_cb(const char *var, const char *value,
8581001
ctx->opts.show_progress = git_config_bool(var, value);
8591002
return 0;
8601003
}
1004+
if (!strcmp(var, "survey.showcommitparents")) {
1005+
ctx->opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, cctx->kvi);
1006+
return 0;
1007+
}
1008+
if (!strcmp(var, "survey.showcommitsizes")) {
1009+
ctx->opts.show_largest_commits_by_size_bytes = git_config_ulong(var, value, cctx->kvi);
1010+
return 0;
1011+
}
1012+
1013+
if (!strcmp(var, "survey.showtreeentries")) {
1014+
ctx->opts.show_largest_trees_by_nr_entries = git_config_ulong(var, value, cctx->kvi);
1015+
return 0;
1016+
}
1017+
if (!strcmp(var, "survey.showtreesizes")) {
1018+
ctx->opts.show_largest_trees_by_size_bytes = git_config_ulong(var, value, cctx->kvi);
1019+
return 0;
1020+
}
1021+
if (!strcmp(var, "survey.showblobsizes")) {
1022+
ctx->opts.show_largest_blobs_by_size_bytes = git_config_ulong(var, value, cctx->kvi);
1023+
return 0;
1024+
}
8611025
if (!strcmp(var, "survey.top")) {
8621026
ctx->opts.top_nr = git_config_bool(var, value);
8631027
return 0;
@@ -1069,6 +1233,9 @@ static void increment_totals(struct survey_context *ctx,
10691233

10701234
ctx->report.reachable_objects.commits.parent_cnt_pbin[k]++;
10711235
base = &ctx->report.reachable_objects.commits.base;
1236+
1237+
maybe_insert_large_item(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents, k, &commit->object.oid);
1238+
maybe_insert_large_item(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes, object_length, &commit->object.oid);
10721239
break;
10731240
}
10741241
case OBJ_TREE: {
@@ -1088,8 +1255,8 @@ static void increment_totals(struct survey_context *ctx,
10881255

10891256
pst->sum_entries += nr_entries;
10901257

1091-
if (nr_entries > pst->max_entries)
1092-
pst->max_entries = nr_entries;
1258+
maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &tree->object.oid);
1259+
maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &tree->object.oid);
10931260

10941261
qb = qbin(nr_entries);
10951262
incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep);
@@ -1099,6 +1266,8 @@ static void increment_totals(struct survey_context *ctx,
10991266
}
11001267
case OBJ_BLOB:
11011268
base = &ctx->report.reachable_objects.blobs.base;
1269+
1270+
maybe_insert_large_item(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes, object_length, &oids->oid[i]);
11021271
break;
11031272
default:
11041273
continue;
@@ -1307,6 +1476,14 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor
13071476
OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG),
13081477
OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG),
13091478

1479+
OPT_INTEGER_F(0, "commit-parents", &ctx.opts.show_largest_commits_by_nr_parents, N_("show N largest commits by parent count"), PARSE_OPT_NONEG),
1480+
OPT_INTEGER_F(0, "commit-sizes", &ctx.opts.show_largest_commits_by_size_bytes, N_("show N largest commits by size in bytes"), PARSE_OPT_NONEG),
1481+
1482+
OPT_INTEGER_F(0, "tree-entries", &ctx.opts.show_largest_trees_by_nr_entries, N_("show N largest trees by entry count"), PARSE_OPT_NONEG),
1483+
OPT_INTEGER_F(0, "tree-sizes", &ctx.opts.show_largest_trees_by_size_bytes, N_("show N largest trees by size in bytes"), PARSE_OPT_NONEG),
1484+
1485+
OPT_INTEGER_F(0, "blob-sizes", &ctx.opts.show_largest_blobs_by_size_bytes, N_("show N largest blobs by size in bytes"), PARSE_OPT_NONEG),
1486+
13101487
OPT_END(),
13111488
};
13121489

@@ -1330,6 +1507,39 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor
13301507

13311508
fixup_refs_wanted(&ctx);
13321509

1510+
if (ctx.opts.show_largest_commits_by_nr_parents)
1511+
ctx.report.reachable_objects.commits.vec_largest_by_nr_parents =
1512+
alloc_large_item_vec(
1513+
"largest_commits_by_nr_parents",
1514+
"nr_parents",
1515+
ctx.opts.show_largest_commits_by_nr_parents);
1516+
if (ctx.opts.show_largest_commits_by_size_bytes)
1517+
ctx.report.reachable_objects.commits.vec_largest_by_size_bytes =
1518+
alloc_large_item_vec(
1519+
"largest_commits_by_size_bytes",
1520+
"size",
1521+
ctx.opts.show_largest_commits_by_size_bytes);
1522+
1523+
if (ctx.opts.show_largest_trees_by_nr_entries)
1524+
ctx.report.reachable_objects.trees.vec_largest_by_nr_entries =
1525+
alloc_large_item_vec(
1526+
"largest_trees_by_nr_entries",
1527+
"nr_entries",
1528+
ctx.opts.show_largest_trees_by_nr_entries);
1529+
if (ctx.opts.show_largest_trees_by_size_bytes)
1530+
ctx.report.reachable_objects.trees.vec_largest_by_size_bytes =
1531+
alloc_large_item_vec(
1532+
"largest_trees_by_size_bytes",
1533+
"size",
1534+
ctx.opts.show_largest_trees_by_size_bytes);
1535+
1536+
if (ctx.opts.show_largest_blobs_by_size_bytes)
1537+
ctx.report.reachable_objects.blobs.vec_largest_by_size_bytes =
1538+
alloc_large_item_vec(
1539+
"largest_blobs_by_size_bytes",
1540+
"size",
1541+
ctx.opts.show_largest_blobs_by_size_bytes);
1542+
13331543
survey_phase_refs(&ctx);
13341544

13351545
survey_phase_objects(&ctx);

0 commit comments

Comments
 (0)