@@ -409,15 +409,28 @@ fn _indirect_object<'a>(
409409 Ok ( ( object_id, object) )
410410}
411411
412- pub fn header ( input : ParserInput ) -> Option < String > {
413- strip_nom ( map_res (
414- delimited (
415- tag ( & b"%PDF-" [ ..] ) ,
412+ pub fn header ( input : ParserInput , strict : bool ) -> Option < String > {
413+ // Parse version digits (e.g. "1.7") separately from any trailing bytes
414+ // before the newline. Some PDF generators (e.g. ImageMill) place binary
415+ // marker bytes on the header line which would fail UTF-8 validation.
416+ // In strict mode we reject such trailing bytes; in lenient mode we skip them.
417+ let ( _, ( version_raw, trailing) ) = delimited (
418+ tag ( & b"%PDF-" [ ..] ) ,
419+ pair (
420+ take_while ( |c : u8 | c. is_ascii_digit ( ) || c == b'.' ) ,
416421 take_while ( |c : u8 | !b"\r \n " . contains ( & c) ) ,
417- pair ( eol, many0_count ( comment) ) ,
418422 ) ,
419- |v : ParserInput | str:: from_utf8 ( & v) . map ( Into :: into) ,
420- ) . parse ( input) )
423+ pair ( eol, many0_count ( comment) ) ,
424+ )
425+ . parse ( input)
426+ . ok ( ) ?;
427+
428+ if strict && !trailing. is_empty ( ) {
429+ return None ;
430+ }
431+
432+ let version = str:: from_utf8 ( & version_raw) . ok ( ) ?. to_string ( ) ;
433+ Some ( version)
421434}
422435
423436pub fn binary_mark ( input : ParserInput ) -> Option < Vec < u8 > > {
@@ -802,6 +815,49 @@ startxref
802815 }
803816 }
804817
818+ #[ test]
819+ fn header_standard ( ) {
820+ // Standard header with proper EOL: the version line is followed by a
 // second line that starts with '%' and carries high-bit bytes.
 // Lenient mode (strict = false) must return just the version text.
821+ let input = b"%PDF-1.7\n %\xe2 \xe3 \xcf \xd3 \n " ;
822+ assert_eq ! ( header( test_span( input) , false ) , Some ( "1.7" . to_string( ) ) ) ;
823+ }
824+
825+ #[ test]
826+ fn header_with_binary_bytes_on_same_line ( ) {
827+ // Some generators (e.g. ImageMill) place binary marker bytes on the
828+ // header line without a separating newline or '%' prefix.
 // Lenient mode (strict = false) must skip those bytes and still
 // return the version text.
829+ let input = b"%PDF-1.3 \xb0 \x9f \x92 \x9c \x9f \xd4 \xe0 \xce \xd0 \xd0 \xd0 \r 1 0 obj\r " ;
830+ assert_eq ! ( header( test_span( input) , false ) , Some ( "1.3" . to_string( ) ) ) ;
831+ }
832+
833+ #[ test]
834+ fn header_with_binary_bytes_strict_rejects ( ) {
835+ // In strict mode, any bytes left on the header line after the version
836+ // digits cause a parse failure, whether or not they are valid UTF-8.
 // Same input as the lenient same-line test, but with strict = true.
837+ let input = b"%PDF-1.3 \xb0 \x9f \x92 \x9c \x9f \xd4 \xe0 \xce \xd0 \xd0 \xd0 \r 1 0 obj\r " ;
838+ assert_eq ! ( header( test_span( input) , true ) , None ) ;
839+ }
840+
841+ #[ test]
842+ fn header_cr_line_ending ( ) {
843+ // CR-only line ending (common in older PDFs)
 // Both the version line and the following '%' line end in a bare CR;
 // lenient mode must still return the version text.
844+ let input = b"%PDF-1.3\r %\xe2 \xe3 \xcf \xd3 \r " ;
845+ assert_eq ! ( header( test_span( input) , false ) , Some ( "1.3" . to_string( ) ) ) ;
846+ }
847+
848+ #[ test]
849+ fn header_crlf_line_ending ( ) {
850+ // CRLF line ending (common on Windows-generated PDFs)
 // The CR LF pair must be treated as the line terminator, so neither
 // byte ends up in the returned version text.
851+ let input = b"%PDF-1.7\r \n %\xe2 \xe3 \xcf \xd3 \r \n " ;
852+ assert_eq ! ( header( test_span( input) , false ) , Some ( "1.7" . to_string( ) ) ) ;
853+ }
854+
855+ #[ test]
856+ fn header_pdf_2_0 ( ) {
 // A "2.0" version string must be returned unchanged in lenient mode.
857+ let input = b"%PDF-2.0\n %\xe2 \xe3 \xcf \xd3 \n " ;
858+ assert_eq ! ( header( test_span( input) , false ) , Some ( "2.0" . to_string( ) ) ) ;
859+ }
860+
805861 #[ test]
806862 fn content_with_comments ( ) {
807863 // It should be processed as usual but ignoring the comments
0 commit comments