Skip to content

Commit 63e6d3e

Browse files
committed
Add option to discard whitespace before xml tag
Add an option to discard whitespace before the xml tag when reading from a stream. Also updated a bad error message.
1 parent 7b8942e commit 63e6d3e

File tree

5 files changed

+97
-10
lines changed

5 files changed

+97
-10
lines changed

lib/xmerl/src/xmerl_sax_parser.erl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ Possible options are:
9292
- **`{fail_undeclared_ref, Boolean}`** - Decides how the parser should behave
9393
when an undeclared reference is found. Can be useful if one has turned of
9494
external entities so that an external DTD is not parsed. Default is `true`.
95+
96+
- **`{discard_ws_before_xml_document, Boolean}`** - Discard whitespace before
97+
`xml` tag instead of returning a fatal error. Default is `false`.
9598
""".
9699
-type options() :: [{continuation_fun, continuation_fun()} |
97100
{continuation_state, continuation_state()} |
@@ -102,7 +105,8 @@ Possible options are:
102105
skip_external_dtd | disallow_entities |
103106
{entity_recurse_limit, non_neg_integer()} |
104107
{external_entities, all | file | none} |
105-
{fail_undeclared_ref, boolean()}].
108+
{fail_undeclared_ref, boolean()} |
109+
{discard_ws_before_xml_document, boolean()}].
106110

107111
-type continuation_state() :: term().
108112

@@ -472,6 +476,8 @@ parse_options([{external_entities, Type} |Options], State) when Type =:= all;
472476
parse_options(Options, State#xmerl_sax_parser_state{external_entities = Type});
473477
parse_options([{fail_undeclared_ref, Bool} |Options], State) when is_boolean(Bool) ->
474478
parse_options(Options, State#xmerl_sax_parser_state{fail_undeclared_ref = Bool});
479+
parse_options([{discard_ws_before_xml_document, Bool} |Options], State) when is_boolean(Bool) ->
480+
parse_options(Options, State#xmerl_sax_parser_state{discard_ws_before_xml_document = Bool});
475481
parse_options([O |_], _State) ->
476482
{error, lists:flatten(io_lib:format("Option: ~p not supported", [O]))}.
477483

lib/xmerl/src/xmerl_sax_parser.hrl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,7 @@
9393
attribute_values = [], % default attribute values
9494
allow_entities = true, % If true entities are allowed in the document
9595
entity_recurse_limit = 3, % How many levels of recursion is allowed for entities
96-
external_entities = none, % Which types of external entities are allowed: all, file or none(default)
97-
fail_undeclared_ref = true % If false the reference will be left unresolved in the document, true is default
96+
external_entities = none, % Which types of external entities are allowed: all, file or none(default)
97+
fail_undeclared_ref = true, % If false the reference will be left unresolved in the document, true is default
98+
discard_ws_before_xml_document = false % If true allow whitespace fefore the xml tag
9899
}).
99-
100-
101-

lib/xmerl/src/xmerl_sax_parser_base.erlsrc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ handle_end_document({other, Error, State}) ->
187187
%% Description: Parsing an XML document
188188
%% [1] document ::= prolog element Misc*
189189
%%----------------------------------------------------------------------
190+
parse_document(Rest, #xmerl_sax_parser_state{discard_ws_before_xml_document = true} = State) ->
191+
{_WS, Rest1, State1} = whitespace(Rest, State, []),
192+
{Rest2, State2} = parse_byte_order_mark(Rest1, State1),
193+
{Rest3, State3} = parse_misc(Rest2, State2, true),
194+
{ok, Rest3, State3};
190195
parse_document(Rest, State) when is_record(State, xmerl_sax_parser_state) ->
191196
{Rest1, State1} = parse_byte_order_mark(Rest, State),
192197
{Rest2, State2} = parse_misc(Rest1, State1, true),
@@ -336,8 +341,9 @@ parse_prolog(?STRING_REST("<?", Rest), State) ->
336341
case parse_pi(Rest, State) of
337342
{Rest1, State1} ->
338343
parse_prolog(Rest1, State1);
339-
{endDocument, Rest1, State1} ->
340-
parse_prolog(Rest1, State1)
344+
{endDocument, _Rest1, State1} ->
345+
?fatal_error(State1, "<?xml ...?> not first in document")
346+
%% parse_prolog(Rest1, State1)
341347
end;
342348
parse_prolog(?STRING_REST("<!", Rest), State) ->
343349
parse_prolog_1(Rest, State);

lib/xmerl/test/xmerl_sax_SUITE.erl

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,11 @@
3838
%%----------------------------------------------------------------------
3939

4040
all() ->
41-
[{group, bugs}].
41+
[{group, basic}, {group, bugs}].
4242

4343
groups() ->
44-
[{bugs, [], [ticket_8213, ticket_8214, ticket_11551,
44+
[{basic, [], [discard_ws_before_xml_tag_test]},
45+
{bugs, [], [ticket_8213, ticket_8214, ticket_11551,
4546
fragmented_xml_directive,
4647
old_dom_event_fun_endDocument_bug,
4748
event_fun_endDocument_error_test,
@@ -212,6 +213,23 @@ fail_undeclared_ref_test(Config) ->
212213
{ok, undefined, <<>>} = xmerl_sax_parser:file(File, [{external_entities, none}, {fail_undeclared_ref, false}]),
213214
ok.
214215

216+
%%----------------------------------------------------------------------
217+
%% Test Case
218+
%% ID: Test option that allows whitespace before xml tag
219+
discard_ws_before_xml_tag_test(Config) ->
220+
DataDir = proplists:get_value(data_dir, Config),
221+
File = filename:join(DataDir, "two_messages_with_ws_between.xml"),
222+
{ok, Bin} = file:read_file(File),
223+
%% Use Bin as stream
224+
%% Parse first
225+
{ok, undefined, RestBin} = xmerl_sax_parser:stream(Bin, []),
226+
%% Parse second that has a number of whitespaces first
227+
%% Whithout option
228+
{fatal_error, _, _, _, _} = xmerl_sax_parser:stream(RestBin, []),
229+
%% Whit option
230+
{ok, undefined, _} = xmerl_sax_parser:stream(RestBin, [{discard_ws_before_xml_document, true}]),
231+
ok.
232+
215233
%%======================================================================
216234
%% Internal functions
217235
%%======================================================================
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
<?xml version="1.0"?>
2+
<!--
3+
%CopyrightBegin%
4+
5+
SPDX-License-Identifier: Apache-2.0
6+
7+
Copyright Ericsson AB 2025. All Rights Reserved.
8+
9+
Licensed under the Apache License, Version 2.0 (the "License");
10+
you may not use this file except in compliance with the License.
11+
You may obtain a copy of the License at
12+
13+
http://www.apache.org/licenses/LICENSE-2.0
14+
15+
Unless required by applicable law or agreed to in writing, software
16+
distributed under the License is distributed on an "AS IS" BASIS,
17+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18+
See the License for the specific language governing permissions and
19+
limitations under the License.
20+
21+
%CopyrightEnd%
22+
-->
23+
<document>
24+
<batch>
25+
<group>
26+
<rec>
27+
<aaaaaa>123456</aaaaaa>
28+
<bbbbbb>hej hopp</bbbbbb>
29+
<cccccc>
30+
This is a test string
31+
</cccccc>
32+
</rec>
33+
<rec>
34+
<aaaaaa>123456</aaaaaa>
35+
<bbbbbb>hej hopp</bbbbbb>
36+
<cccccc>
37+
This is a test string
38+
</cccccc>
39+
</rec>
40+
</group>
41+
</batch>
42+
</document>
43+
44+
45+
46+
<?xml version="1.0"?>
47+
<document>
48+
<batch>
49+
<group>
50+
<rec>
51+
<aaaaaa>123456</aaaaaa>
52+
<bbbbbb>hej hopp</bbbbbb>
53+
<cccccc>
54+
This is a test string
55+
</cccccc>
56+
</rec>
57+
</group>
58+
</batch>
59+
</document>

0 commit comments

Comments
 (0)