Skip to content

Commit 73db3a2

Browse files
cfsmp3 and claude committed
fix(avc): Handle streams that don't start with NAL start codes (CCExtractor#1626)
The AVC parser would fail with a "Leading bytes are non-zero" error when processing HLS/Twitch stream segments that start mid-stream without proper NAL unit headers at the beginning.

Root cause: When process_avc encountered non-zero leading bytes, it returned an error with 0 bytes processed. The C code would not remove any bytes from the buffer, causing subsequent data to accumulate behind the corrupt beginning, leading to infinite errors.

Fix:
- Add find_nal_start_code() to search for valid NAL start codes
- If buffer doesn't start with 0x00 0x00, search for first NAL start
- Skip garbage data before first valid NAL unit
- Return full buffer length when no NAL found (clears the buffer)
- Change forbidden_zero_bit error from fatal to skip-and-continue

Tested with 6 Twitch HLS sample files - all now process correctly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent c3f637a commit 73db3a2

File tree

1 file changed

+161
-46
lines changed

1 file changed

+161
-46
lines changed

src/rust/src/avc/core.rs

Lines changed: 161 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,31 @@ fn find_next_zero(slice: &[u8]) -> Option<usize> {
367367
/// Returns the index of the first `0x00` byte in `slice`.
///
/// Yields `None` when the slice contains no zero byte, which includes the
/// empty slice.
fn find_next_zero(slice: &[u8]) -> Option<usize> {
    for (index, &byte) in slice.iter().enumerate() {
        if byte == 0x00 {
            return Some(index);
        }
    }
    None
}
370+
/// Find the first NAL start code (`0x00 0x00 0x01` or `0x00 0x00 0x00 0x01`) in a buffer.
///
/// Every 4-byte start code ends with the 3-byte one, so scanning for the 3-byte
/// pattern alone locates both forms and yields the same index for the `0x01` byte.
///
/// Returns the position of the `0x01` byte of the first start code, or `None` if
/// the buffer contains no start code (this includes any buffer shorter than 3 bytes,
/// for which `windows` produces no items).
fn find_nal_start_code(buf: &[u8]) -> Option<usize> {
    const START_CODE: [u8; 3] = [0x00, 0x00, 0x01];

    buf.windows(START_CODE.len())
        .position(|window| window == START_CODE)
        // `position` gives the index of the first zero; callers want the 0x01 byte.
        .map(|first_zero| first_zero + START_CODE.len() - 1)
}
394+
370395
/// # Safety
371396
/// This function is unsafe because it dereferences raw pointers and calls `dump` and `do_nal`.
372397
pub unsafe fn process_avc(
@@ -384,118 +409,155 @@ pub unsafe fn process_avc(
384409
));
385410
}
386411

387-
// Warning there should be only leading zeros, nothing else
388-
if !(avcbuf[0] == 0x00 && avcbuf[1] == 0x00) {
389-
return Err(AvcError::BrokenStream(
390-
"Leading bytes are non-zero".to_string(),
391-
));
412+
// If the buffer doesn't start with leading zeros, try to find the first NAL start code.
413+
// This can happen with:
414+
// - HLS/Twitch stream segments that start mid-stream
415+
// - Streams with garbage data at the beginning
416+
// - Buffer accumulation issues after previous errors
417+
let start_offset = if avcbuf[0] == 0x00 && avcbuf[1] == 0x00 {
418+
// Normal case: buffer starts with zeros
419+
0
420+
} else {
421+
// Try to find the first NAL start code
422+
if let Some(nal_pos) = find_nal_start_code(avcbuf) {
423+
// Found a NAL start code, skip to the position before it (the zeros)
424+
// The position returned is the 0x01, so we need to go back to find the zeros
425+
let zeros_start = if nal_pos >= 3 && avcbuf[nal_pos - 3] == 0x00 {
426+
nal_pos - 3 // 4-byte start code
427+
} else {
428+
nal_pos - 2 // 3-byte start code
429+
};
430+
debug!(msg_type = DebugMessageFlag::VERBOSE;
431+
"Skipped {} bytes of garbage before first NAL start code", zeros_start);
432+
zeros_start
433+
} else {
434+
// No NAL start code found - return full buffer length to clear it
435+
debug!(msg_type = DebugMessageFlag::VERBOSE;
436+
"No NAL start code found in buffer of {} bytes, clearing", avcbuflen);
437+
return Ok(avcbuflen);
438+
}
439+
};
440+
441+
// Work with the buffer starting from start_offset
442+
let working_buf = &avcbuf[start_offset..];
443+
let working_len = working_buf.len();
444+
445+
if working_len <= 5 {
446+
// Not enough data after skipping garbage
447+
return Ok(avcbuflen);
392448
}
393449

394450
let mut buffer_position = 2usize;
395-
let mut firstloop = true;
396451

397452
// Loop over NAL units
398-
while buffer_position < avcbuflen.saturating_sub(2) {
453+
while buffer_position < working_len.saturating_sub(2) {
399454
let mut zeropad = 0;
400455

401456
// Find next NAL_start
402-
while buffer_position < avcbuflen {
403-
if avcbuf[buffer_position] == 0x01 {
457+
while buffer_position < working_len {
458+
if working_buf[buffer_position] == 0x01 {
404459
break;
405-
} else if firstloop && avcbuf[buffer_position] != 0x00 {
406-
return Err(AvcError::BrokenStream(
407-
"Leading bytes are non-zero".to_string(),
408-
));
460+
} else if working_buf[buffer_position] != 0x00 {
461+
// Non-zero byte found where we expected zeros - skip to next potential start code
462+
if let Some(next_nal) = find_nal_start_code(&working_buf[buffer_position..]) {
463+
buffer_position += next_nal - 1; // -1 because we'll increment at end of loop
464+
zeropad = 0;
465+
} else {
466+
// No more NAL units found
467+
return Ok(avcbuflen);
468+
}
409469
}
410470
buffer_position += 1;
411471
zeropad += 1;
412472
}
413473

414-
firstloop = false;
415-
416-
if buffer_position >= avcbuflen {
474+
if buffer_position >= working_len {
417475
break;
418476
}
419477

420478
let nal_start_pos = buffer_position + 1;
421-
let mut nal_stop_pos = avcbuflen;
479+
let mut nal_stop_pos = working_len;
422480

423481
buffer_position += 1;
424-
let restlen = avcbuflen.saturating_sub(buffer_position + 2);
482+
let restlen = working_len.saturating_sub(buffer_position + 2);
425483

426484
// Use optimized zero search
427485
if restlen > 0 {
428486
if let Some(zero_offset) =
429-
find_next_zero(&avcbuf[buffer_position..buffer_position + restlen])
487+
find_next_zero(&working_buf[buffer_position..buffer_position + restlen])
430488
{
431489
let zero_pos = buffer_position + zero_offset;
432490

433-
if zero_pos + 2 < avcbuflen {
434-
if avcbuf[zero_pos + 1] == 0x00 && (avcbuf[zero_pos + 2] | 0x01) == 0x01 {
491+
if zero_pos + 2 < working_len {
492+
if working_buf[zero_pos + 1] == 0x00
493+
&& (working_buf[zero_pos + 2] | 0x01) == 0x01
494+
{
435495
nal_stop_pos = zero_pos;
436496
buffer_position = zero_pos + 2;
437497
} else {
438498
// Continue searching from after this zero
439499
buffer_position = zero_pos + 1;
440500
// Recursive search for next start code
441-
while buffer_position < avcbuflen.saturating_sub(2) {
501+
while buffer_position < working_len.saturating_sub(2) {
442502
if let Some(next_zero_offset) = find_next_zero(
443-
&avcbuf[buffer_position..avcbuflen.saturating_sub(2)],
503+
&working_buf[buffer_position..working_len.saturating_sub(2)],
444504
) {
445505
let next_zero_pos = buffer_position + next_zero_offset;
446-
if next_zero_pos + 2 < avcbuflen {
447-
if avcbuf[next_zero_pos + 1] == 0x00
448-
&& (avcbuf[next_zero_pos + 2] | 0x01) == 0x01
506+
if next_zero_pos + 2 < working_len {
507+
if working_buf[next_zero_pos + 1] == 0x00
508+
&& (working_buf[next_zero_pos + 2] | 0x01) == 0x01
449509
{
450510
nal_stop_pos = next_zero_pos;
451511
buffer_position = next_zero_pos + 2;
452512
break;
453513
}
454514
} else {
455-
nal_stop_pos = avcbuflen;
456-
buffer_position = avcbuflen;
515+
nal_stop_pos = working_len;
516+
buffer_position = working_len;
457517
break;
458518
}
459519
buffer_position = next_zero_pos + 1;
460520
} else {
461-
nal_stop_pos = avcbuflen;
462-
buffer_position = avcbuflen;
521+
nal_stop_pos = working_len;
522+
buffer_position = working_len;
463523
break;
464524
}
465525
}
466526
}
467527
} else {
468-
nal_stop_pos = avcbuflen;
469-
buffer_position = avcbuflen;
528+
nal_stop_pos = working_len;
529+
buffer_position = working_len;
470530
}
471531
} else {
472-
nal_stop_pos = avcbuflen;
473-
buffer_position = avcbuflen;
532+
nal_stop_pos = working_len;
533+
buffer_position = working_len;
474534
}
475535
} else {
476-
nal_stop_pos = avcbuflen;
477-
buffer_position = avcbuflen;
536+
nal_stop_pos = working_len;
537+
buffer_position = working_len;
478538
}
479539

480-
if nal_start_pos >= avcbuflen {
540+
if nal_start_pos >= working_len {
481541
break;
482542
}
483543

484-
if (avcbuf[nal_start_pos] & 0x80) != 0 {
544+
if (working_buf[nal_start_pos] & 0x80) != 0 {
485545
let dump_start = nal_start_pos.saturating_sub(4);
486-
let dump_len = std::cmp::min(10, avcbuflen - dump_start);
487-
dump(avcbuf[dump_start..].as_ptr(), dump_len as i32, 0, 0);
488-
489-
return Err(AvcError::ForbiddenZeroBit(
490-
"forbidden_zero_bit not zero".to_string(),
491-
));
546+
let dump_len = std::cmp::min(10, working_len - dump_start);
547+
dump(working_buf[dump_start..].as_ptr(), dump_len as i32, 0, 0);
548+
549+
// Don't return an error - just skip this NAL and continue
550+
// This allows processing to continue even with some corrupt data
551+
debug!(msg_type = DebugMessageFlag::VERBOSE;
552+
"Skipping NAL with forbidden_zero_bit set");
553+
continue;
492554
}
493555

494-
(*dec_ctx.avc_ctx).nal_ref_idc = (avcbuf[nal_start_pos] >> 5) as u32;
556+
(*dec_ctx.avc_ctx).nal_ref_idc = (working_buf[nal_start_pos] >> 5) as u32;
495557

496558
debug!(msg_type = DebugMessageFlag::VIDEO_STREAM; "process_avc: zeropad {}", zeropad);
497559
let nal_length = (nal_stop_pos - nal_start_pos) as i64;
498-
let mut nal_slice = avcbuf[nal_start_pos..nal_stop_pos].to_vec();
560+
let mut nal_slice = working_buf[nal_start_pos..nal_stop_pos].to_vec();
499561

500562
if let Err(e) = do_nal(enc_ctx, dec_ctx, &mut nal_slice, nal_length, sub) {
501563
info!("Error processing NAL unit: {}", e);
@@ -504,3 +566,56 @@ pub unsafe fn process_avc(
504566

505567
Ok(avcbuflen)
506568
}
569+
570+
#[cfg(test)]
mod tests {
    use super::*;

    /// A 3-byte start code at offset 0 puts the `0x01` byte at index 2.
    #[test]
    fn test_find_nal_start_code_3byte() {
        let data: &[u8] = &[0x00, 0x00, 0x01, 0x65, 0x88];
        let found = find_nal_start_code(data);
        assert_eq!(found, Some(2));
    }

    /// A 4-byte start code at offset 0 puts the `0x01` byte at index 3.
    #[test]
    fn test_find_nal_start_code_4byte() {
        let data: &[u8] = &[0x00, 0x00, 0x00, 0x01, 0x67, 0x64];
        let found = find_nal_start_code(data);
        assert_eq!(found, Some(3));
    }

    /// Leading garbage bytes are scanned past until the 3-byte start code.
    #[test]
    fn test_find_nal_start_code_with_garbage() {
        let data: &[u8] = &[0xFF, 0xAB, 0xCD, 0x00, 0x00, 0x01, 0x09, 0xF0];
        let found = find_nal_start_code(data);
        assert_eq!(found, Some(5));
    }

    /// A buffer without any start code yields `None`.
    #[test]
    fn test_find_nal_start_code_no_start_code() {
        let data: &[u8] = &[0xFF, 0xAB, 0xCD, 0xEF];
        let found = find_nal_start_code(data);
        assert_eq!(found, None);
    }

    /// Two bytes cannot hold a start code.
    #[test]
    fn test_find_nal_start_code_too_short() {
        let data: &[u8] = &[0x00, 0x00];
        let found = find_nal_start_code(data);
        assert_eq!(found, None);
    }

    /// The empty buffer yields `None`.
    #[test]
    fn test_find_nal_start_code_empty() {
        let data: &[u8] = &[];
        let found = find_nal_start_code(data);
        assert_eq!(found, None);
    }

    /// `0x00 0x00` followed by something other than `0x01` is not a start code;
    /// the search continues to the real one further on.
    #[test]
    fn test_find_nal_start_code_partial_match() {
        let data: &[u8] = &[0x00, 0x00, 0x02, 0x00, 0x00, 0x01, 0x65];
        let found = find_nal_start_code(data);
        assert_eq!(found, Some(5));
    }
}

0 commit comments

Comments
 (0)