Skip to content

Commit 2e0aabe

Browse files
committed
Merge branch 'jk/pack-corruption-post-mortem' into maint
Documentation update. * jk/pack-corruption-post-mortem: howto: document more tools for recovery corruption
2 parents e9ab76d + 2b8bd44 commit 2e0aabe

File tree

1 file changed

+237
-0
lines changed

1 file changed

+237
-0
lines changed

Documentation/howto/recover-corrupted-object-harder.txt

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,3 +240,240 @@ But more importantly, git's hashing and checksumming noticed a problem
240240
that easily could have gone undetected in another system. The result
241241
still compiled, but would have caused an interesting bug (that would
242242
have been blamed on some random commit).
243+
244+
245+
The adventure continues...
246+
--------------------------
247+
248+
I ended up doing this again! Same entity, new hardware. The assumption
249+
at this point is that the old disk corrupted the packfile, and then the
250+
corruption was migrated to the new hardware (because it was done by
251+
rsync or similar, and no fsck was done at the time of migration).
252+
253+
This time, the affected blob was over 20 megabytes, which was far too
254+
large to do a brute-force on. I followed the instructions above to
255+
create the `zlib` file. I then used the `inflate` program below to pull
256+
the corrupted data from that. Examining that output gave me a hint about
257+
where in the file the corruption was. But now I was working with the
258+
file itself, not the zlib contents. So knowing the sha1 of the object
259+
and the approximate area of the corruption, I used the `sha1-munge`
260+
program below to brute-force the correct byte.
261+
262+
Here's the inflate program (it's essentially `gunzip` but without the
263+
`.gz` header processing):
264+
265+
--------------------------
266+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <zlib.h>
270+
271+
int main(int argc, char **argv)
272+
{
273+
/*
274+
* oversized so we can read the whole buffer in;
275+
* this could actually be switched to streaming
276+
* to avoid any memory limitations
277+
*/
278+
static unsigned char buf[25 * 1024 * 1024];
279+
static unsigned char out[25 * 1024 * 1024];
280+
int len;
281+
z_stream z;
282+
int ret;
283+
284+
len = read(0, buf, sizeof(buf));
285+
memset(&z, 0, sizeof(z));
286+
inflateInit(&z);
287+
288+
z.next_in = buf;
289+
z.avail_in = len;
290+
z.next_out = out;
291+
z.avail_out = sizeof(out);
292+
293+
ret = inflate(&z, 0);
294+
if (ret != Z_OK && ret != Z_STREAM_END)
295+
fprintf(stderr, "initial inflate failed (%d)\n", ret);
296+
297+
fprintf(stderr, "outputting %lu bytes", z.total_out);
298+
fwrite(out, 1, z.total_out, stdout);
299+
return 0;
300+
}
301+
--------------------------
302+
303+
And here is the `sha1-munge` program:
304+
305+
--------------------------
306+
#include <stdio.h>
307+
#include <unistd.h>
308+
#include <string.h>
309+
#include <signal.h>
310+
#include <openssl/sha.h>
311+
#include <stdlib.h>
312+
313+
/* eye candy */
static volatile sig_atomic_t counter = 0;

/*
 * SIGALRM handler: write "\r<counter>" to stderr and re-arm the
 * one-second alarm.
 *
 * Only write() and alarm() are called here because stdio functions
 * are not async-signal-safe, and this handler can interrupt a
 * printf() in the main loop. The number is therefore formatted by
 * hand, from the least significant digit backwards.
 */
static void progress(int sig)
{
	char line[32];
	char *end = line + sizeof(line);
	char *p = end;
	unsigned long n = (unsigned long)counter;
	ssize_t ignored;

	(void)sig;

	do {
		*--p = (char)('0' + n % 10);
		n /= 10;
	} while (n);
	*--p = '\r';

	/* best effort: there is nothing useful to do if stderr is gone */
	ignored = write(2, p, (size_t)(end - p));
	(void)ignored;

	alarm(1);
}
320+
321+
/*
 * Lookup table mapping a byte value to the hex digit it encodes
 * (0-15), or -1 when the byte is not one of [0-9A-Fa-f].
 * Each row below covers the byte range named in its comment.
 */
#define HEXVAL_BAD8 -1, -1, -1, -1, -1, -1, -1, -1
static const signed char hexval_table[256] = {
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 00-0f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 10-1f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 20-2f */
	0, 1, 2, 3, 4, 5, 6, 7,			/* 30-37 */
	8, 9, -1, -1, -1, -1, -1, -1,		/* 38-3f */
	-1, 10, 11, 12, 13, 14, 15, -1,		/* 40-47 */
	HEXVAL_BAD8,				/* 48-4f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 50-5f */
	-1, 10, 11, 12, 13, 14, 15, -1,		/* 60-67 */
	HEXVAL_BAD8,				/* 68-6f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 70-7f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 80-8f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* 90-9f */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* a0-af */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* b0-bf */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* c0-cf */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* d0-df */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* e0-ef */
	HEXVAL_BAD8, HEXVAL_BAD8,		/* f0-ff */
};
#undef HEXVAL_BAD8

/*
 * Return the value of hex digit c. For a non-hex byte the table's -1
 * is converted to unsigned, so every bit above the low byte is set.
 */
static inline unsigned int hexval(unsigned char c)
{
	return (unsigned int)hexval_table[c];
}

/*
 * Parse the first 40 hex digits of `hex` into the 20 bytes at `sha1`.
 * Returns 0 on success, or -1 if the string ends early or contains a
 * non-hex character. Anything after the 40th digit is not examined.
 */
static int get_sha1_hex(const char *hex, unsigned char *sha1)
{
	unsigned char *const end = sha1 + 20;

	while (sha1 < end) {
		unsigned int hi, lo, byte;

		/*
		 * Stop on an empty remainder before touching hex[1];
		 * a NUL in hex[1] itself is rejected by the range
		 * check below, since hexval('\0') is invalid.
		 */
		if (hex[0] == '\0')
			return -1;

		hi = hexval(hex[0]);
		lo = hexval(hex[1]);
		byte = (hi << 4) | lo;

		/* an invalid digit sets bits above the low byte */
		if (byte > 0xff)
			return -1;

		*sha1++ = byte;
		hex += 2;
	}
	return 0;
}
381+
382+
int main(int argc, char **argv)
383+
{
384+
/* oversized so we can read the whole buffer in */
385+
static unsigned char buf[25 * 1024 * 1024];
386+
char header[32];
387+
int header_len;
388+
unsigned char have[20], want[20];
389+
int start, len;
390+
SHA_CTX orig;
391+
unsigned i, j;
392+
393+
if (!argv[1] || get_sha1_hex(argv[1], want)) {
394+
fprintf(stderr, "usage: sha1-munge <sha1> [start] <file.in\n");
395+
return 1;
396+
}
397+
398+
if (argv[2])
399+
start = atoi(argv[2]);
400+
else
401+
start = 0;
402+
403+
len = read(0, buf, sizeof(buf));
404+
header_len = sprintf(header, "blob %d", len) + 1;
405+
fprintf(stderr, "using header: %s\n", header);
406+
407+
/*
408+
* We keep a running sha1 so that if you are munging
409+
* near the end of the file, we do not have to re-sha1
410+
* the unchanged earlier bytes
411+
*/
412+
SHA1_Init(&orig);
413+
SHA1_Update(&orig, header, header_len);
414+
if (start)
415+
SHA1_Update(&orig, buf, start);
416+
417+
signal(SIGALRM, progress);
418+
alarm(1);
419+
420+
for (i = start; i < len; i++) {
421+
unsigned char c;
422+
SHA_CTX x;
423+
424+
#if 0
425+
/*
426+
* deletion -- this would not actually work in practice,
427+
* I think, because we've already committed to a
428+
* particular size in the header. Ditto for addition
429+
* below. In those cases, you'd have to do the whole
430+
* sha1 from scratch, or possibly keep three running
431+
* "orig" sha1 computations going.
432+
*/
433+
memcpy(&x, &orig, sizeof(x));
434+
SHA1_Update(&x, buf + i + 1, len - i - 1);
435+
SHA1_Final(have, &x);
436+
if (!memcmp(have, want, 20))
437+
printf("i=%d, deletion\n", i);
438+
#endif
439+
440+
/*
441+
* replacement -- note that this tries each of the 256
442+
* possible bytes. If you suspect a single-bit flip,
443+
* it would be much shorter to just try the 8
444+
* bit-flipped variants.
445+
*/
446+
c = buf[i];
447+
for (j = 0; j <= 0xff; j++) {
448+
buf[i] = j;
449+
450+
memcpy(&x, &orig, sizeof(x));
451+
SHA1_Update(&x, buf + i, len - i);
452+
SHA1_Final(have, &x);
453+
if (!memcmp(have, want, 20))
454+
printf("i=%d, j=%02x\n", i, j);
455+
}
456+
buf[i] = c;
457+
458+
#if 0
459+
/* addition */
460+
for (j = 0; j <= 0xff; j++) {
461+
unsigned char extra = j;
462+
memcpy(&x, &orig, sizeof(x));
463+
SHA1_Update(&x, &extra, 1);
464+
SHA1_Update(&x, buf + i, len - i);
465+
SHA1_Final(have, &x);
466+
if (!memcmp(have, want, 20))
467+
printf("i=%d, addition=%02x", i, j);
468+
}
469+
#endif
470+
471+
SHA1_Update(&orig, buf + i, 1);
472+
counter++;
473+
}
474+
475+
alarm(0);
476+
fprintf(stderr, "\r%d\n", counter);
477+
return 0;
478+
}
479+
--------------------------

0 commit comments

Comments
 (0)