Skip to content

Commit 836180f

Browse files
committed
Byte order mark: account for indentation
1 parent 7bbce2a commit 836180f

File tree

6 files changed

+761
-352
lines changed

6 files changed

+761
-352
lines changed

changelog/current.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
### Changes
22

3+
- [PR#561](https://github.com/biojppm/rapidyaml/pull/561) (fixes [#559](https://github.com/biojppm/rapidyaml/issues/559)) - Byte Order Mark: account for BOM when determining block indentation
34
- [PR#563](https://github.com/biojppm/rapidyaml/pull/563) (fixes [#562](https://github.com/biojppm/rapidyaml/issues/562)) - Fix bug in `NodeRef::cend()`
45
- [PR#547](https://github.com/biojppm/rapidyaml/pull/547) - Fix parsing of implicit first documents with empty sequences, caused by a problem in `Tree::set_root_as_stream()`:
56
```yaml
67
[] # this container was lost during parsing
78
---
89
more data here
910
```
10-
- [PR#557](https://github.com/biojppm/rapidyaml/pull/557) - `Tree` is now non-empty by default, and `Tree::root_id()` will no longer modify the tree when it is empty. To create an empty tree now it is necessary to use the capacity constructor with a capacity of zero:
11+
- [PR#557](https://github.com/biojppm/rapidyaml/pull/557) - `Tree` is now non-empty by default, and `Tree::root_id()` will no longer modify the tree when it is empty. To create an empty tree now, it is necessary to use the capacity constructor with a capacity of zero:
1112
```c++
12-
// default-constructed tree is now non-empty
13+
// breaking change: default-constructed tree is now non-empty
1314
Tree tree;
1415
assert(!tree.empty()); // MODIFIED! was empty in the previous version
1516
id_type root = tree.root_id(); // OK. default-constructed tree is now non-empty
1617

17-
// to create an empty tree:
18+
// to create an empty tree (as happened before):
1819
Tree tree(0); // pass capacity of zero
1920
assert(tree.empty()); // as expected
2021
// but watch out, this is no longer possible:

src/c4/yml/parse_engine.def.hpp

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,9 @@ void ParseEngine<EventHandler>::_reset()
416416
m_doc_empty = true;
417417
m_was_inside_qmrk = false;
418418
m_prev_colon = npos;
419+
m_bom_len = 0;
419420
m_encoding = NOBOM;
421+
m_bom_line = 0;
420422
if(m_options.locations())
421423
{
422424
_prepare_locations();
@@ -523,7 +525,7 @@ void ParseEngine<EventHandler>::_dbg(csubstr fmt, Args const& C4_RESTRICT ...arg
523525
{
524526
if(_dbg_enabled())
525527
{
526-
auto dumpfn = [](csubstr s){ if(s.str) fwrite(s.str, 1, s.len, stdout); };
528+
auto dumpfn = [](csubstr s){ if(s.len) fwrite(s.str, 1, s.len, stdout); };
527529
detail::_dump(dumpfn, fmt, args...);
528530
dumpfn("\n");
529531
_fmt_msg(dumpfn);
@@ -1603,6 +1605,7 @@ void ParseEngine<EventHandler>::_end2_seq()
16031605
template<class EventHandler>
16041606
void ParseEngine<EventHandler>::_begin2_doc()
16051607
{
1608+
_c4dbgp("begin_doc");
16061609
m_doc_empty = true;
16071610
add_flags(RDOC);
16081611
m_evt_handler->begin_doc();
@@ -1612,6 +1615,7 @@ void ParseEngine<EventHandler>::_begin2_doc()
16121615
template<class EventHandler>
16131616
void ParseEngine<EventHandler>::_begin2_doc_expl()
16141617
{
1618+
_c4dbgp("begin_doc_expl");
16151619
m_doc_empty = true;
16161620
add_flags(RDOC);
16171621
m_evt_handler->begin_doc_expl();
@@ -1630,6 +1634,7 @@ void ParseEngine<EventHandler>::_end2_doc()
16301634
m_evt_handler->set_val_scalar_plain_empty();
16311635
}
16321636
m_evt_handler->end_doc();
1637+
m_bom_len = 0;
16331638
}
16341639

16351640
template<class EventHandler>
@@ -1643,6 +1648,7 @@ void ParseEngine<EventHandler>::_end2_doc_expl()
16431648
m_evt_handler->set_val_scalar_plain_empty();
16441649
}
16451650
m_evt_handler->end_doc_expl();
1651+
m_bom_len = 0;
16461652
}
16471653

16481654
template<class EventHandler>
@@ -4354,39 +4360,44 @@ bool ParseEngine<EventHandler>::_handle_bom()
43544360
const csubstr rest = rem.sub(1);
43554361
// https://yaml.org/spec/1.2.2/#52-character-encodings
43564362
#define _rymlisascii(c) ((c) > '\0' && (c) <= '\x7f') // is the character ASCII?
4357-
if(rem.begins_with({"\x00\x00\xfe\xff", 4}) || (rem.begins_with({"\x00\x00\x00", 3}) && rem.len >= 4u && _rymlisascii(rem.str[3])))
4363+
if(rem.begins_with(csubstr{"\x00\x00\xfe\xff", 4}) || (rem.begins_with(csubstr{"\x00\x00\x00", 3}) && rem.len >= 4u && _rymlisascii(rem.str[3])))
43584364
{
43594365
_c4dbgp("byte order mark: UTF32BE");
43604366
_handle_bom(UTF32BE);
43614367
_line_progressed(4);
4368+
m_bom_len = 4;
43624369
return true;
43634370
}
4364-
else if(rem.begins_with("\xff\xfe\x00\x00") || (rest.begins_with({"\x00\x00\x00", 3}) && rem.len >= 4u && _rymlisascii(rem.str[0])))
4371+
else if(rem.begins_with(csubstr{"\xff\xfe\x00\x00", 4}) || (rest.begins_with(csubstr{"\x00\x00\x00", 3}) && rem.len >= 4u && _rymlisascii(rem.str[0])))
43654372
{
43664373
_c4dbgp("byte order mark: UTF32LE");
43674374
_handle_bom(UTF32LE);
43684375
_line_progressed(4);
4376+
m_bom_len = 4;
43694377
return true;
43704378
}
43714379
else if(rem.begins_with("\xfe\xff") || (rem.begins_with('\x00') && rem.len >= 2u && _rymlisascii(rem.str[1])))
43724380
{
43734381
_c4dbgp("byte order mark: UTF16BE");
43744382
_handle_bom(UTF16BE);
43754383
_line_progressed(2);
4384+
m_bom_len = 2;
43764385
return true;
43774386
}
43784387
else if(rem.begins_with("\xff\xfe") || (rest.begins_with('\x00') && rem.len >= 2u && _rymlisascii(rem.str[0])))
43794388
{
43804389
_c4dbgp("byte order mark: UTF16LE");
43814390
_handle_bom(UTF16LE);
43824391
_line_progressed(2);
4392+
m_bom_len = 2;
43834393
return true;
43844394
}
43854395
else if(rem.begins_with("\xef\xbb\xbf"))
43864396
{
43874397
_c4dbgp("byte order mark: UTF8");
43884398
_handle_bom(UTF8);
43894399
_line_progressed(3);
4400+
m_bom_len = 3;
43904401
return true;
43914402
}
43924403
#undef _rymlisascii
@@ -4399,8 +4410,7 @@ void ParseEngine<EventHandler>::_handle_bom(Encoding_e enc)
43994410
{
44004411
if(m_encoding == NOBOM)
44014412
{
4402-
const bool is_beginning_of_file = m_evt_handler->m_curr->line_contents.rem.str == m_buf.str;
4403-
if(enc == UTF8 || is_beginning_of_file)
4413+
if(enc == UTF8 || /*beginning of file*/(m_evt_handler->m_curr->line_contents.rem.str == m_buf.str))
44044414
m_encoding = enc;
44054415
else
44064416
_c4err("non-UTF8 byte order mark can appear only at the beginning of the file");
@@ -5651,7 +5661,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
56515661
const size_t startline = m_evt_handler->m_curr->pos.line;
56525662
// warning: the gcc optimizer on x86 builds is brittle with
56535663
// this function:
5654-
const size_t startindent = m_evt_handler->m_curr->line_contents.current_col();
5664+
const size_t startindent = m_evt_handler->m_curr->line_contents.current_col() - m_bom_len;
56555665
ScannedScalar sc;
56565666
if(first == '\'')
56575667
{
@@ -5815,7 +5825,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
58155825
_handle_annotations_before_blck_val_scalar();
58165826
m_evt_handler->begin_seq_val_block();
58175827
addrem_flags(RVAL, RNXT);
5818-
_save_indentation();
5828+
_set_indentation(startindent);
58195829
// keep going on inside this function
58205830
}
58215831
_line_progressed(1);
@@ -5883,7 +5893,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
58835893
m_was_inside_qmrk = true;
58845894
m_evt_handler->begin_map_val_block();
58855895
addrem_flags(RMAP|QMRK, RSEQ|RNXT);
5886-
_save_indentation();
5896+
_set_indentation(startindent);
58875897
_line_progressed(1);
58885898
_maybe_skip_whitespace_tokens();
58895899
goto seqblck_finish;
@@ -6050,6 +6060,7 @@ void ParseEngine<EventHandler>::_handle_seq_block()
60506060
_c4dbgt("seqblck: go again", 0);
60516061
if(_finished_line())
60526062
{
6063+
m_bom_len = 0;
60536064
_line_ended();
60546065
_scan_line();
60556066
if(_finished_file())
@@ -7368,15 +7379,15 @@ void ParseEngine<EventHandler>::_handle_unk()
73687379
_c4dbgpf("rem is now [{}]~~~{}~~~", rem.len, rem);
73697380
}
73707381

7371-
if(m_evt_handler->m_curr->line_contents.indentation == 0u && _at_line_begin())
7382+
if(m_evt_handler->m_curr->line_contents.indentation == 0u && (_at_line_begin() || (m_bom_len && (m_evt_handler->m_curr->pos.line == m_bom_line))))
73727383
{
7373-
_c4dbgp("rtop: zero indent + at line begin");
7384+
_c4dbgpf("rtop: zero indent + at line begin. offset={}", m_evt_handler->m_curr->pos.offset);
7385+
_c4dbgp("check BOM");
73747386
if(_handle_bom())
73757387
{
7376-
_c4dbgp("byte order mark!");
7377-
rem = m_evt_handler->m_curr->line_contents.rem;
7378-
if(!rem.len)
7379-
return;
7388+
m_bom_line = m_evt_handler->m_curr->pos.line;
7389+
_c4dbgpf("byte order mark! line={} offset={}", m_bom_line, m_evt_handler->m_curr->pos.offset);
7390+
return;
73807391
}
73817392
const char first = rem.str[0];
73827393
if(first == '-')
@@ -7427,30 +7438,46 @@ void ParseEngine<EventHandler>::_handle_unk()
74277438
/* no else-if! */
74287439
char first = rem.str[0];
74297440

7441+
const size_t startindent = m_evt_handler->m_curr->line_contents.indentation;
7442+
size_t remindent = m_evt_handler->m_curr->line_contents.current_col(rem);
7443+
if(m_bom_len)
7444+
{
7445+
_c4dbgpf("prev BOMlen={}", m_bom_len);
7446+
if(m_evt_handler->m_curr->pos.line == m_bom_line)
7447+
{
7448+
_c4dbgpf("BOM remindent={} offset={}", remindent, m_evt_handler->m_curr->pos.offset);
7449+
_RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, remindent >= m_bom_len);
7450+
remindent -= m_bom_len;
7451+
}
7452+
else
7453+
{
7454+
m_bom_len = 0;
7455+
}
7456+
}
7457+
74307458
if(first == '[')
74317459
{
74327460
m_evt_handler->check_trailing_doc_token();
74337461
_maybe_begin_doc();
74347462
m_doc_empty = false;
7435-
const size_t startindent = m_evt_handler->m_curr->line_contents.current_col(rem);
74367463
if(C4_LIKELY( ! _annotations_require_key_container()))
74377464
{
74387465
_c4dbgp("it's a seq, flow");
74397466
_handle_annotations_before_blck_val_scalar();
74407467
m_evt_handler->begin_seq_val_flow();
74417468
addrem_flags(RSEQ|FLOW|RVAL, RUNK|RTOP|RDOC);
7442-
_set_indentation(startindent);
7469+
_set_indentation(remindent);
74437470
}
74447471
else
74457472
{
74467473
_c4dbgp("start new block map, set flow seq as key (!)");
74477474
_handle_annotations_before_start_mapblck(m_evt_handler->m_curr->pos.line);
74487475
m_evt_handler->begin_map_val_block();
74497476
addrem_flags(RMAP|BLCK|RKCL, RUNK|RTOP|RDOC);
7450-
_handle_annotations_and_indentation_after_start_mapblck(startindent, m_evt_handler->m_curr->pos.line);
7477+
_handle_annotations_and_indentation_after_start_mapblck(remindent, m_evt_handler->m_curr->pos.line);
74517478
m_evt_handler->begin_seq_key_flow();
74527479
addrem_flags(RSEQ|FLOW|RVAL, RMAP|BLCK|RKCL);
7453-
_set_indentation(startindent);
7480+
_set_indentation(remindent);
74547481
}
74557482
_line_progressed(1);
74567483
}
@@ -7459,25 +7486,24 @@ void ParseEngine<EventHandler>::_handle_unk()
74597486
m_evt_handler->check_trailing_doc_token();
74607487
_maybe_begin_doc();
74617488
m_doc_empty = false;
7462-
const size_t startindent = m_evt_handler->m_curr->line_contents.current_col(rem);
74637489
if(C4_LIKELY( ! _annotations_require_key_container()))
74647490
{
74657491
_c4dbgp("it's a map, flow");
74667492
_handle_annotations_before_blck_val_scalar();
74677493
m_evt_handler->begin_map_val_flow();
74687494
addrem_flags(RMAP|FLOW|RKEY, RVAL|RTOP|RUNK|RDOC);
7469-
_set_indentation(startindent);
7495+
_set_indentation(remindent);
74707496
}
74717497
else
74727498
{
74737499
_c4dbgp("start new block map, set flow map as key (!)");
74747500
_handle_annotations_before_start_mapblck(m_evt_handler->m_curr->pos.line);
74757501
m_evt_handler->begin_map_val_block();
74767502
addrem_flags(RMAP|BLCK|RKCL, RUNK|RTOP|RDOC);
7477-
_handle_annotations_and_indentation_after_start_mapblck(startindent, m_evt_handler->m_curr->pos.line);
7503+
_handle_annotations_and_indentation_after_start_mapblck(remindent, m_evt_handler->m_curr->pos.line);
74787504
m_evt_handler->begin_map_key_flow();
74797505
addrem_flags(RMAP|FLOW|RKEY, BLCK|RKCL);
7480-
_set_indentation(startindent);
7506+
_set_indentation(remindent);
74817507
}
74827508
_line_progressed(1);
74837509
}
@@ -7490,7 +7516,7 @@ void ParseEngine<EventHandler>::_handle_unk()
74907516
m_evt_handler->begin_seq_val_block();
74917517
addrem_flags(RSEQ|BLCK|RVAL, RNXT|RTOP|RUNK|RDOC);
74927518
m_doc_empty = false;
7493-
_set_indentation(m_evt_handler->m_curr->line_contents.current_col(rem));
7519+
_set_indentation(remindent);
74947520
_line_progressed(1);
74957521
_maybe_skip_whitespace_tokens();
74967522
}
@@ -7504,7 +7530,7 @@ void ParseEngine<EventHandler>::_handle_unk()
75047530
addrem_flags(RMAP|BLCK|QMRK, RKEY|RVAL|RTOP|RUNK);
75057531
m_doc_empty = false;
75067532
m_was_inside_qmrk = true;
7507-
_save_indentation();
7533+
_set_indentation(remindent); //_save_indentation();
75087534
_line_progressed(1);
75097535
_maybe_skip_whitespace_tokens();
75107536
}
@@ -7513,7 +7539,6 @@ void ParseEngine<EventHandler>::_handle_unk()
75137539
if(m_doc_empty)
75147540
{
75157541
_c4dbgp("it's a map with an empty key");
7516-
const size_t startindent = m_evt_handler->m_curr->line_contents.indentation; // save
75177542
const size_t startline = m_evt_handler->m_curr->pos.line; // save
75187543
m_evt_handler->check_trailing_doc_token();
75197544
_maybe_begin_doc();
@@ -7542,9 +7567,8 @@ void ParseEngine<EventHandler>::_handle_unk()
75427567
_c4dbgpf("anchor! [{}]~~~{}~~~", anchor.len, anchor);
75437568
m_evt_handler->check_trailing_doc_token();
75447569
_maybe_begin_doc();
7545-
const size_t indentation = m_evt_handler->m_curr->line_contents.current_col(rem);
75467570
const size_t line = m_evt_handler->m_curr->pos.line;
7547-
_add_annotation(&m_pending_anchors, anchor, indentation, line);
7571+
_add_annotation(&m_pending_anchors, anchor, remindent, line);
75487572
_set_indentation(m_evt_handler->m_curr->line_contents.current_col(rem));
75497573
m_doc_empty = false;
75507574
}
@@ -7564,7 +7588,6 @@ void ParseEngine<EventHandler>::_handle_unk()
75647588
else
75657589
{
75667590
_c4dbgp("runk: start new block map, set ref as key");
7567-
const size_t startindent = m_evt_handler->m_curr->line_contents.indentation; // save
75687591
const size_t startline = m_evt_handler->m_curr->pos.line; // save
75697592
_handle_annotations_before_start_mapblck(startline);
75707593
m_evt_handler->begin_map_val_block();
@@ -7592,7 +7615,6 @@ void ParseEngine<EventHandler>::_handle_unk()
75927615
csubstr s = m_evt_handler->m_curr->line_contents.rem;
75937616
if(!s.len)
75947617
return;
7595-
const size_t startindent = m_evt_handler->m_curr->line_contents.indentation; // save
75967618
const size_t startline = m_evt_handler->m_curr->pos.line; // save
75977619
first = s.str[0];
75987620
ScannedScalar sc;

src/c4/yml/parse_engine.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,10 @@ class ParseEngine
758758
bool m_doc_empty = true;
759759
size_t m_prev_colon = npos;
760760

761+
private:
762+
763+
size_t m_bom_len = 0;
764+
size_t m_bom_line = 0;
761765
Encoding_e m_encoding = UTF8;
762766

763767
private:

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ ryml_add_test(json)
107107
ryml_add_test(preprocess)
108108
ryml_add_test(merge)
109109
ryml_add_test(location)
110+
ryml_add_test(bom)
110111
ryml_add_test_case_group(empty_file)
111112
ryml_add_test_case_group(doc)
112113
ryml_add_test_case_group(seq)

0 commit comments

Comments
 (0)