@@ -416,7 +416,9 @@ void ParseEngine<EventHandler>::_reset()
416416 m_doc_empty = true ;
417417 m_was_inside_qmrk = false ;
418418 m_prev_colon = npos;
419+ m_bom_len = 0 ;
419420 m_encoding = NOBOM;
421+ m_bom_line = 0 ;
420422 if (m_options.locations ())
421423 {
422424 _prepare_locations ();
@@ -523,7 +525,7 @@ void ParseEngine<EventHandler>::_dbg(csubstr fmt, Args const& C4_RESTRICT ...arg
523525{
524526 if (_dbg_enabled ())
525527 {
526- auto dumpfn = [](csubstr s){ if (s.str ) fwrite (s.str , 1 , s.len , stdout); };
528+ auto dumpfn = [](csubstr s){ if (s.len ) fwrite (s.str , 1 , s.len , stdout); };
527529 detail::_dump (dumpfn, fmt, args...);
528530 dumpfn (" \n " );
529531 _fmt_msg (dumpfn);
@@ -1603,6 +1605,7 @@ void ParseEngine<EventHandler>::_end2_seq()
16031605template <class EventHandler >
16041606void ParseEngine<EventHandler>::_begin2_doc()
16051607{
1608+ _c4dbgp (" begin_doc" );
16061609 m_doc_empty = true ;
16071610 add_flags (RDOC);
16081611 m_evt_handler->begin_doc ();
@@ -1612,6 +1615,7 @@ void ParseEngine<EventHandler>::_begin2_doc()
16121615template <class EventHandler >
16131616void ParseEngine<EventHandler>::_begin2_doc_expl()
16141617{
1618+ _c4dbgp (" begin_doc_expl" );
16151619 m_doc_empty = true ;
16161620 add_flags (RDOC);
16171621 m_evt_handler->begin_doc_expl ();
@@ -1630,6 +1634,7 @@ void ParseEngine<EventHandler>::_end2_doc()
16301634 m_evt_handler->set_val_scalar_plain_empty ();
16311635 }
16321636 m_evt_handler->end_doc ();
1637+ m_bom_len = 0 ;
16331638}
16341639
16351640template <class EventHandler >
@@ -1643,6 +1648,7 @@ void ParseEngine<EventHandler>::_end2_doc_expl()
16431648 m_evt_handler->set_val_scalar_plain_empty ();
16441649 }
16451650 m_evt_handler->end_doc_expl ();
1651+ m_bom_len = 0 ;
16461652}
16471653
16481654template <class EventHandler >
@@ -4354,39 +4360,44 @@ bool ParseEngine<EventHandler>::_handle_bom()
43544360 const csubstr rest = rem.sub (1 );
43554361 // https://yaml.org/spec/1.2.2/#52-character-encodings
43564362 #define _rymlisascii (c ) ((c) > ' \0 ' && (c) <= ' \x7f ' ) // is the character ASCII?
4357- if (rem.begins_with ({" \x00\x00\xfe\xff " , 4 }) || (rem.begins_with ({" \x00\x00\x00 " , 3 }) && rem.len >= 4u && _rymlisascii (rem.str [3 ])))
4363+ if (rem.begins_with (csubstr {" \x00\x00\xfe\xff " , 4 }) || (rem.begins_with (csubstr {" \x00\x00\x00 " , 3 }) && rem.len >= 4u && _rymlisascii (rem.str [3 ])))
43584364 {
43594365 _c4dbgp (" byte order mark: UTF32BE" );
43604366 _handle_bom (UTF32BE);
43614367 _line_progressed (4 );
4368+ m_bom_len = 4 ;
43624369 return true ;
43634370 }
4364- else if (rem.begins_with (" \xff\xfe\x00\x00 " ) || (rest.begins_with ({" \x00\x00\x00 " , 3 }) && rem.len >= 4u && _rymlisascii (rem.str [0 ])))
4371+ else if (rem.begins_with (csubstr{ " \xff\xfe\x00\x00 " , 4 } ) || (rest.begins_with (csubstr {" \x00\x00\x00 " , 3 }) && rem.len >= 4u && _rymlisascii (rem.str [0 ])))
43654372 {
43664373 _c4dbgp (" byte order mark: UTF32LE" );
43674374 _handle_bom (UTF32LE);
43684375 _line_progressed (4 );
4376+ m_bom_len = 4 ;
43694377 return true ;
43704378 }
43714379 else if (rem.begins_with (" \xfe\xff " ) || (rem.begins_with (' \x00 ' ) && rem.len >= 2u && _rymlisascii (rem.str [1 ])))
43724380 {
43734381 _c4dbgp (" byte order mark: UTF16BE" );
43744382 _handle_bom (UTF16BE);
43754383 _line_progressed (2 );
4384+ m_bom_len = 2 ;
43764385 return true ;
43774386 }
43784387 else if (rem.begins_with (" \xff\xfe " ) || (rest.begins_with (' \x00 ' ) && rem.len >= 2u && _rymlisascii (rem.str [0 ])))
43794388 {
43804389 _c4dbgp (" byte order mark: UTF16LE" );
43814390 _handle_bom (UTF16LE);
43824391 _line_progressed (2 );
4392+ m_bom_len = 2 ;
43834393 return true ;
43844394 }
43854395 else if (rem.begins_with (" \xef\xbb\xbf " ))
43864396 {
43874397 _c4dbgp (" byte order mark: UTF8" );
43884398 _handle_bom (UTF8);
43894399 _line_progressed (3 );
4400+ m_bom_len = 3 ;
43904401 return true ;
43914402 }
43924403 #undef _rymlisascii
@@ -4399,8 +4410,7 @@ void ParseEngine<EventHandler>::_handle_bom(Encoding_e enc)
43994410{
44004411 if (m_encoding == NOBOM)
44014412 {
4402- const bool is_beginning_of_file = m_evt_handler->m_curr ->line_contents .rem .str == m_buf.str ;
4403- if (enc == UTF8 || is_beginning_of_file)
4413+ if (enc == UTF8 || /* beginning of file*/ (m_evt_handler->m_curr ->line_contents .rem .str == m_buf.str ))
44044414 m_encoding = enc;
44054415 else
44064416 _c4err (" non-UTF8 byte order mark can appear only at the beginning of the file" );
@@ -5651,7 +5661,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
56515661 const size_t startline = m_evt_handler->m_curr ->pos .line ;
56525662 // warning: the gcc optimizer on x86 builds is brittle with
56535663 // this function:
5654- const size_t startindent = m_evt_handler->m_curr ->line_contents .current_col ();
5664+ const size_t startindent = m_evt_handler->m_curr ->line_contents .current_col () - m_bom_len ;
56555665 ScannedScalar sc;
56565666 if (first == ' \' ' )
56575667 {
@@ -5815,7 +5825,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
58155825 _handle_annotations_before_blck_val_scalar ();
58165826 m_evt_handler->begin_seq_val_block ();
58175827 addrem_flags (RVAL, RNXT);
5818- _save_indentation ( );
5828+ _set_indentation (startindent );
58195829 // keep going on inside this function
58205830 }
58215831 _line_progressed (1 );
@@ -5883,7 +5893,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
58835893 m_was_inside_qmrk = true ;
58845894 m_evt_handler->begin_map_val_block ();
58855895 addrem_flags (RMAP|QMRK, RSEQ|RNXT);
5886- _save_indentation ( );
5896+ _set_indentation (startindent );
58875897 _line_progressed (1 );
58885898 _maybe_skip_whitespace_tokens ();
58895899 goto seqblck_finish;
@@ -6050,6 +6060,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
60506060 _c4dbgt (" seqblck: go again" , 0 );
60516061 if (_finished_line ())
60526062 {
6063+ m_bom_len = 0 ;
60536064 _line_ended ();
60546065 _scan_line ();
60556066 if (_finished_file ())
@@ -7368,15 +7379,15 @@ void ParseEngine<EventHandler>::_handle_unk()
73687379 _c4dbgpf (" rem is now [{}]~~~{}~~~" , rem.len , rem);
73697380 }
73707381
7371- if (m_evt_handler->m_curr ->line_contents .indentation == 0u && _at_line_begin ())
7382+ if (m_evt_handler->m_curr ->line_contents .indentation == 0u && ( _at_line_begin () || (m_bom_len && (m_evt_handler-> m_curr -> pos . line == m_bom_line)) ))
73727383 {
7373- _c4dbgp (" rtop: zero indent + at line begin" );
7384+ _c4dbgpf (" rtop: zero indent + at line begin. offset={}" , m_evt_handler->m_curr ->pos .offset );
7385+ _c4dbgp (" check BOM" );
73747386 if (_handle_bom ())
73757387 {
7376- _c4dbgp (" byte order mark!" );
7377- rem = m_evt_handler->m_curr ->line_contents .rem ;
7378- if (!rem.len )
7379- return ;
7388+ m_bom_line = m_evt_handler->m_curr ->pos .line ;
7389+ _c4dbgpf (" byte order mark! line={} offset={}" , m_bom_line, m_evt_handler->m_curr ->pos .offset );
7390+ return ;
73807391 }
73817392 const char first = rem.str [0 ];
73827393 if (first == ' -' )
@@ -7427,30 +7438,46 @@ void ParseEngine<EventHandler>::_handle_unk()
74277438 /* no else-if! */
74287439 char first = rem.str [0 ];
74297440
7441+ const size_t startindent = m_evt_handler->m_curr ->line_contents .indentation ;
7442+ size_t remindent = m_evt_handler->m_curr ->line_contents .current_col (rem);
7443+ if (m_bom_len)
7444+ {
7445+ _c4dbgpf (" prev BOMlen={}" , m_bom_len);
7446+ if (m_evt_handler->m_curr ->pos .line == m_bom_line)
7447+ {
7448+ _c4dbgpf (" BOM remindent={} offset={}" , remindent, m_evt_handler->m_curr ->pos .offset );
7449+ _RYML_CB_ASSERT (m_evt_handler->m_stack .m_callbacks , remindent >= m_bom_len);
7450+ remindent -= m_bom_len;
7451+ }
7452+ else
7453+ {
7454+ m_bom_len = 0 ;
7455+ }
7456+ }
7457+
74307458 if (first == ' [' )
74317459 {
74327460 m_evt_handler->check_trailing_doc_token ();
74337461 _maybe_begin_doc ();
74347462 m_doc_empty = false ;
7435- const size_t startindent = m_evt_handler->m_curr ->line_contents .current_col (rem);
74367463 if (C4_LIKELY ( ! _annotations_require_key_container ()))
74377464 {
74387465 _c4dbgp (" it's a seq, flow" );
74397466 _handle_annotations_before_blck_val_scalar ();
74407467 m_evt_handler->begin_seq_val_flow ();
74417468 addrem_flags (RSEQ|FLOW|RVAL, RUNK|RTOP|RDOC);
7442- _set_indentation (startindent );
7469+ _set_indentation (remindent );
74437470 }
74447471 else
74457472 {
74467473 _c4dbgp (" start new block map, set flow seq as key (!)" );
74477474 _handle_annotations_before_start_mapblck (m_evt_handler->m_curr ->pos .line );
74487475 m_evt_handler->begin_map_val_block ();
74497476 addrem_flags (RMAP|BLCK|RKCL, RUNK|RTOP|RDOC);
7450- _handle_annotations_and_indentation_after_start_mapblck (startindent , m_evt_handler->m_curr ->pos .line );
7477+ _handle_annotations_and_indentation_after_start_mapblck (remindent , m_evt_handler->m_curr ->pos .line );
74517478 m_evt_handler->begin_seq_key_flow ();
74527479 addrem_flags (RSEQ|FLOW|RVAL, RMAP|BLCK|RKCL);
7453- _set_indentation (startindent );
7480+ _set_indentation (remindent );
74547481 }
74557482 _line_progressed (1 );
74567483 }
@@ -7459,25 +7486,24 @@ void ParseEngine<EventHandler>::_handle_unk()
74597486 m_evt_handler->check_trailing_doc_token ();
74607487 _maybe_begin_doc ();
74617488 m_doc_empty = false ;
7462- const size_t startindent = m_evt_handler->m_curr ->line_contents .current_col (rem);
74637489 if (C4_LIKELY ( ! _annotations_require_key_container ()))
74647490 {
74657491 _c4dbgp (" it's a map, flow" );
74667492 _handle_annotations_before_blck_val_scalar ();
74677493 m_evt_handler->begin_map_val_flow ();
74687494 addrem_flags (RMAP|FLOW|RKEY, RVAL|RTOP|RUNK|RDOC);
7469- _set_indentation (startindent );
7495+ _set_indentation (remindent );
74707496 }
74717497 else
74727498 {
74737499 _c4dbgp (" start new block map, set flow map as key (!)" );
74747500 _handle_annotations_before_start_mapblck (m_evt_handler->m_curr ->pos .line );
74757501 m_evt_handler->begin_map_val_block ();
74767502 addrem_flags (RMAP|BLCK|RKCL, RUNK|RTOP|RDOC);
7477- _handle_annotations_and_indentation_after_start_mapblck (startindent , m_evt_handler->m_curr ->pos .line );
7503+ _handle_annotations_and_indentation_after_start_mapblck (remindent , m_evt_handler->m_curr ->pos .line );
74787504 m_evt_handler->begin_map_key_flow ();
74797505 addrem_flags (RMAP|FLOW|RKEY, BLCK|RKCL);
7480- _set_indentation (startindent );
7506+ _set_indentation (remindent );
74817507 }
74827508 _line_progressed (1 );
74837509 }
@@ -7490,7 +7516,7 @@ void ParseEngine<EventHandler>::_handle_unk()
74907516 m_evt_handler->begin_seq_val_block ();
74917517 addrem_flags (RSEQ|BLCK|RVAL, RNXT|RTOP|RUNK|RDOC);
74927518 m_doc_empty = false ;
7493- _set_indentation (m_evt_handler-> m_curr -> line_contents . current_col (rem) );
7519+ _set_indentation (remindent );
74947520 _line_progressed (1 );
74957521 _maybe_skip_whitespace_tokens ();
74967522 }
@@ -7504,7 +7530,7 @@ void ParseEngine<EventHandler>::_handle_unk()
75047530 addrem_flags (RMAP|BLCK|QMRK, RKEY|RVAL|RTOP|RUNK);
75057531 m_doc_empty = false ;
75067532 m_was_inside_qmrk = true ;
7507- _save_indentation ();
7533+ _set_indentation (remindent); // _save_indentation();
75087534 _line_progressed (1 );
75097535 _maybe_skip_whitespace_tokens ();
75107536 }
@@ -7513,7 +7539,6 @@ void ParseEngine<EventHandler>::_handle_unk()
75137539 if (m_doc_empty)
75147540 {
75157541 _c4dbgp (" it's a map with an empty key" );
7516- const size_t startindent = m_evt_handler->m_curr ->line_contents .indentation ; // save
75177542 const size_t startline = m_evt_handler->m_curr ->pos .line ; // save
75187543 m_evt_handler->check_trailing_doc_token ();
75197544 _maybe_begin_doc ();
@@ -7542,9 +7567,8 @@ void ParseEngine<EventHandler>::_handle_unk()
75427567 _c4dbgpf (" anchor! [{}]~~~{}~~~" , anchor.len , anchor);
75437568 m_evt_handler->check_trailing_doc_token ();
75447569 _maybe_begin_doc ();
7545- const size_t indentation = m_evt_handler->m_curr ->line_contents .current_col (rem);
75467570 const size_t line = m_evt_handler->m_curr ->pos .line ;
7547- _add_annotation (&m_pending_anchors, anchor, indentation , line);
7571+ _add_annotation (&m_pending_anchors, anchor, remindent , line);
75487572 _set_indentation (m_evt_handler->m_curr ->line_contents .current_col (rem));
75497573 m_doc_empty = false ;
75507574 }
@@ -7564,7 +7588,6 @@ void ParseEngine<EventHandler>::_handle_unk()
75647588 else
75657589 {
75667590 _c4dbgp (" runk: start new block map, set ref as key" );
7567- const size_t startindent = m_evt_handler->m_curr ->line_contents .indentation ; // save
75687591 const size_t startline = m_evt_handler->m_curr ->pos .line ; // save
75697592 _handle_annotations_before_start_mapblck (startline);
75707593 m_evt_handler->begin_map_val_block ();
@@ -7592,7 +7615,6 @@ void ParseEngine<EventHandler>::_handle_unk()
75927615 csubstr s = m_evt_handler->m_curr ->line_contents .rem ;
75937616 if (!s.len )
75947617 return ;
7595- const size_t startindent = m_evt_handler->m_curr ->line_contents .indentation ; // save
75967618 const size_t startline = m_evt_handler->m_curr ->pos .line ; // save
75977619 first = s.str [0 ];
75987620 ScannedScalar sc;
0 commit comments