Skip to content

Commit 662148c

Browse files
derrickstolee authored and gitster committed
midx: write object offsets
The final pair of chunks for the multi-pack-index file stores the object offsets. We default to using 32-bit offsets as in the pack-index version 1 format, but if there exists an offset larger than 32-bits, we use a trick similar to the pack-index version 2 format by storing all offsets at least 2^31 in a 64-bit table; we use the 32-bit table to point into that 64-bit table as necessary.

We only store these 64-bit offsets if necessary, so create a test that manipulates a version 2 pack-index to fake a large offset. This allows us to test that the large offset table is created, but the data does not match the actual packfile offsets. The multi-pack-index offset does match the (corrupted) pack-index offset, so a future feature will compare these offsets during a 'verify' step.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent d7cacf2 commit 662148c

File tree

5 files changed

+155
-15
lines changed

5 files changed

+155
-15
lines changed

Documentation/technical/pack-format.txt

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,20 @@ CHUNK DATA:
311311
The OIDs for all objects in the MIDX are stored in lexicographic
312312
order in this chunk.
313313

314-
(This section intentionally left incomplete.)
314+
Object Offsets (ID: {'O', 'O', 'F', 'F'})
315+
Stores two 4-byte values for every object.
316+
1: The pack-int-id for the pack storing this object.
317+
2: The offset within the pack.
318+
If all offsets are less than 2^32, then the large offset chunk
319+
will not exist and offsets are stored as in IDX v1.
320+
If there is at least one offset value larger than 2^32-1, then
321+
the large offset chunk must exist. If the large offset chunk
322+
exists and the 31st bit is on, then removing that bit reveals
323+
the row in the large offsets containing the 8-byte offset of
324+
this object.
325+
326+
[Optional] Object Large Offsets (ID: {'L', 'O', 'F', 'F'})
327+
8-byte offsets into large packfiles.
315328

316329
TRAILER:
317330

midx.c

Lines changed: 96 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,18 @@
1818
#define MIDX_HASH_LEN 20
1919
#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
2020

21-
#define MIDX_MAX_CHUNKS 3
21+
#define MIDX_MAX_CHUNKS 5
2222
#define MIDX_CHUNK_ALIGNMENT 4
2323
#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
2424
#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
2525
#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
26+
#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
27+
#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
2628
#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
2729
#define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
30+
#define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
31+
#define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
32+
#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
2833

2934
static char *get_midx_filename(const char *object_dir)
3035
{
@@ -112,6 +117,14 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir)
112117
m->chunk_oid_lookup = m->data + chunk_offset;
113118
break;
114119

120+
case MIDX_CHUNKID_OBJECTOFFSETS:
121+
m->chunk_object_offsets = m->data + chunk_offset;
122+
break;
123+
124+
case MIDX_CHUNKID_LARGEOFFSETS:
125+
m->chunk_large_offsets = m->data + chunk_offset;
126+
break;
127+
115128
case 0:
116129
die(_("terminating multi-pack-index chunk id appears earlier than expected"));
117130
break;
@@ -131,6 +144,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir)
131144
die(_("multi-pack-index missing required OID fanout chunk"));
132145
if (!m->chunk_oid_lookup)
133146
die(_("multi-pack-index missing required OID lookup chunk"));
147+
if (!m->chunk_object_offsets)
148+
die(_("multi-pack-index missing required object offsets chunk"));
134149

135150
m->num_objects = ntohl(m->chunk_oid_fanout[255]);
136151

@@ -454,6 +469,56 @@ static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len,
454469
return written;
455470
}
456471

472+
static size_t write_midx_object_offsets(struct hashfile *f, int large_offset_needed,
473+
struct pack_midx_entry *objects, uint32_t nr_objects)
474+
{
475+
struct pack_midx_entry *list = objects;
476+
uint32_t i, nr_large_offset = 0;
477+
size_t written = 0;
478+
479+
for (i = 0; i < nr_objects; i++) {
480+
struct pack_midx_entry *obj = list++;
481+
482+
hashwrite_be32(f, obj->pack_int_id);
483+
484+
if (large_offset_needed && obj->offset >> 31)
485+
hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++);
486+
else if (!large_offset_needed && obj->offset >> 32)
487+
BUG("object %s requires a large offset (%"PRIx64") but the MIDX is not writing large offsets!",
488+
oid_to_hex(&obj->oid),
489+
obj->offset);
490+
else
491+
hashwrite_be32(f, (uint32_t)obj->offset);
492+
493+
written += MIDX_CHUNK_OFFSET_WIDTH;
494+
}
495+
496+
return written;
497+
}
498+
499+
static size_t write_midx_large_offsets(struct hashfile *f, uint32_t nr_large_offset,
500+
struct pack_midx_entry *objects, uint32_t nr_objects)
501+
{
502+
struct pack_midx_entry *list = objects;
503+
size_t written = 0;
504+
505+
while (nr_large_offset) {
506+
struct pack_midx_entry *obj = list++;
507+
uint64_t offset = obj->offset;
508+
509+
if (!(offset >> 31))
510+
continue;
511+
512+
hashwrite_be32(f, offset >> 32);
513+
hashwrite_be32(f, offset & 0xffffffffUL);
514+
written += 2 * sizeof(uint32_t);
515+
516+
nr_large_offset--;
517+
}
518+
519+
return written;
520+
}
521+
457522
int write_midx_file(const char *object_dir)
458523
{
459524
unsigned char cur_chunk, num_chunks = 0;
@@ -466,8 +531,9 @@ int write_midx_file(const char *object_dir)
466531
uint64_t written = 0;
467532
uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
468533
uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
469-
uint32_t nr_entries;
534+
uint32_t nr_entries, num_large_offsets = 0;
470535
struct pack_midx_entry *entries = NULL;
536+
int large_offsets_needed = 0;
471537

472538
midx_name = get_midx_filename(object_dir);
473539
if (safe_create_leading_directories(midx_name)) {
@@ -494,13 +560,19 @@ int write_midx_file(const char *object_dir)
494560
sort_packs_by_name(packs.names, packs.nr, pack_perm);
495561

496562
entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries);
563+
for (i = 0; i < nr_entries; i++) {
564+
if (entries[i].offset > 0x7fffffff)
565+
num_large_offsets++;
566+
if (entries[i].offset > 0xffffffff)
567+
large_offsets_needed = 1;
568+
}
497569

498570
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
499571
f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
500572
FREE_AND_NULL(midx_name);
501573

502574
cur_chunk = 0;
503-
num_chunks = 3;
575+
num_chunks = large_offsets_needed ? 5 : 4;
504576

505577
written = write_midx_header(f, num_chunks, packs.nr);
506578

@@ -516,9 +588,21 @@ int write_midx_file(const char *object_dir)
516588
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + MIDX_CHUNK_FANOUT_SIZE;
517589

518590
cur_chunk++;
519-
chunk_ids[cur_chunk] = 0;
591+
chunk_ids[cur_chunk] = MIDX_CHUNKID_OBJECTOFFSETS;
520592
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_HASH_LEN;
521593

594+
cur_chunk++;
595+
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_CHUNK_OFFSET_WIDTH;
596+
if (large_offsets_needed) {
597+
chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS;
598+
599+
cur_chunk++;
600+
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] +
601+
num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH;
602+
}
603+
604+
chunk_ids[cur_chunk] = 0;
605+
522606
for (i = 0; i <= num_chunks; i++) {
523607
if (i && chunk_offsets[i] < chunk_offsets[i - 1])
524608
BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64,
@@ -556,6 +640,14 @@ int write_midx_file(const char *object_dir)
556640
written += write_midx_oid_lookup(f, MIDX_HASH_LEN, entries, nr_entries);
557641
break;
558642

643+
case MIDX_CHUNKID_OBJECTOFFSETS:
644+
written += write_midx_object_offsets(f, large_offsets_needed, entries, nr_entries);
645+
break;
646+
647+
case MIDX_CHUNKID_LARGEOFFSETS:
648+
written += write_midx_large_offsets(f, num_large_offsets, entries, nr_entries);
649+
break;
650+
559651
default:
560652
BUG("trying to write unknown chunk id %"PRIx32,
561653
chunk_ids[i]);

midx.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ struct multi_pack_index {
1717
const unsigned char *chunk_pack_names;
1818
const uint32_t *chunk_oid_fanout;
1919
const unsigned char *chunk_oid_lookup;
20+
const unsigned char *chunk_object_offsets;
21+
const unsigned char *chunk_large_offsets;
2022

2123
const char **pack_names;
2224
char object_dir[FLEX_ARRAY];

t/helper/test-read-midx.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ static int read_midx_file(const char *object_dir)
2626
printf(" oid-fanout");
2727
if (m->chunk_oid_lookup)
2828
printf(" oid-lookup");
29+
if (m->chunk_object_offsets)
30+
printf(" object-offsets");
31+
if (m->chunk_large_offsets)
32+
printf(" large-offsets");
2933

3034
printf("\nnum_objects: %d\n", m->num_objects);
3135

t/t5319-multi-pack-index.sh

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,30 @@ test_description='multi-pack-indexes'
66
midx_read_expect () {
77
NUM_PACKS=$1
88
NUM_OBJECTS=$2
9+
NUM_CHUNKS=$3
10+
OBJECT_DIR=$4
11+
EXTRA_CHUNKS="$5"
912
{
1013
cat <<-EOF &&
11-
header: 4d494458 1 3 $NUM_PACKS
12-
chunks: pack-names oid-fanout oid-lookup
14+
header: 4d494458 1 $NUM_CHUNKS $NUM_PACKS
15+
chunks: pack-names oid-fanout oid-lookup object-offsets$EXTRA_CHUNKS
1316
num_objects: $NUM_OBJECTS
1417
packs:
1518
EOF
1619
if test $NUM_PACKS -ge 1
1720
then
18-
ls pack/ | grep idx | sort
21+
ls $OBJECT_DIR/pack/ | grep idx | sort
1922
fi &&
20-
printf "object-dir: .\n"
23+
printf "object-dir: $OBJECT_DIR\n"
2124
} >expect &&
22-
test-tool read-midx . >actual &&
25+
test-tool read-midx $OBJECT_DIR >actual &&
2326
test_cmp expect actual
2427
}
2528

2629
test_expect_success 'write midx with no packs' '
2730
test_when_finished rm -f pack/multi-pack-index &&
2831
git multi-pack-index --object-dir=. write &&
29-
midx_read_expect 0 0
32+
midx_read_expect 0 0 4 .
3033
'
3134

3235
generate_objects () {
@@ -76,13 +79,13 @@ test_expect_success 'write midx with one v1 pack' '
7679
pack=$(git pack-objects --index-version=1 pack/test <obj-list) &&
7780
test_when_finished rm pack/test-$pack.pack pack/test-$pack.idx pack/multi-pack-index &&
7881
git multi-pack-index --object-dir=. write &&
79-
midx_read_expect 1 18
82+
midx_read_expect 1 18 4 .
8083
'
8184

8285
test_expect_success 'write midx with one v2 pack' '
8386
git pack-objects --index-version=2,0x40 pack/test <obj-list &&
8487
git multi-pack-index --object-dir=. write &&
85-
midx_read_expect 1 18
88+
midx_read_expect 1 18 4 .
8689
'
8790

8891
test_expect_success 'add more objects' '
@@ -96,7 +99,7 @@ test_expect_success 'add more objects' '
9699
test_expect_success 'write midx with two packs' '
97100
git pack-objects --index-version=1 pack/test-2 <obj-list &&
98101
git multi-pack-index --object-dir=. write &&
99-
midx_read_expect 2 34
102+
midx_read_expect 2 34 4 .
100103
'
101104

102105
test_expect_success 'add more packs' '
@@ -110,7 +113,33 @@ test_expect_success 'add more packs' '
110113

111114
test_expect_success 'write midx with twelve packs' '
112115
git multi-pack-index --object-dir=. write &&
113-
midx_read_expect 12 74
116+
midx_read_expect 12 74 4 .
117+
'
118+
119+
# usage: corrupt_data <file> <pos> [<data>]
120+
# Overwrite bytes of <file> in place, starting at byte offset <pos>.
# <data> is passed to printf as its format string, so backslash escapes
# such as "\02" are expanded; when no third argument is given it defaults
# to "\0".
# Note: file, pos and data are assigned without "local", so they remain set
# in the calling shell afterwards.
corrupt_data () {
121+
file=$1
122+
pos=$2
123+
data="${3:-\0}"
124+
# bs=1 makes seek= count single bytes; conv=notrunc asks dd not to
# truncate the output file, so bytes outside the written range are kept.
printf "$data" | dd of="$file" bs=1 seek="$pos" conv=notrunc
125+
}
126+
127+
# Force 64-bit offsets by manipulating the idx file.
128+
# This makes the IDX file _incorrect_ so be careful to clean up after!
129+
test_expect_success 'force some 64-bit offsets with pack-objects' '
130+
mkdir objects64 &&
131+
mkdir objects64/pack &&
132+
for i in $(test_seq 1 11)
133+
do
134+
generate_objects 11
135+
done &&
136+
commit_and_list_objects &&
137+
pack64=$(git pack-objects --index-version=2,0x40 objects64/pack/test-64 <obj-list) &&
138+
idx64=objects64/pack/test-64-$pack64.idx &&
139+
chmod u+w $idx64 &&
140+
corrupt_data $idx64 2999 "\02" &&
141+
midx64=$(git multi-pack-index --object-dir=objects64 write) &&
142+
midx_read_expect 1 63 5 objects64 " large-offsets"
114143
'
115144

116145
test_done

0 commit comments

Comments
 (0)