Skip to content

Commit 458c125

Browse files
authored
Make custom yaml tag loader work with large files (#817)
* Make custom yaml tag loader work with large files * Fix checking stream change in custom yaml tag loader * Don't clear custom yaml anchor tags for new files
1 parent 549a60c commit 458c125

File tree

2 files changed

+161
-155
lines changed

2 files changed

+161
-155
lines changed

logprep/util/tag_yaml_loader.py

Lines changed: 134 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@
2626
Anchor tags
2727
^^^^^^^^^^^
2828
29-
Anchor tags work similar to regular YAML anchors, but are valid for all documents inside a file or
30-
stream.
29+
Anchor tags work similar to regular YAML anchors, but are valid across different documents.
3130
Tags are set with :code:`!set_anchor(:[0-9])?` and loaded with :code:`!load_anchor(:[0-9])?`.
32-
Ten anchors can be active inside a single file or stream.
31+
Ten anchors can be active at the same time. If more than ten are set then the first will be removed
32+
until there are ten.
3333
:code:`!set_anchor` and :code:`!load_anchor` are shorthands for :code:`!set_anchor:0` and
3434
:code:`!load_anchor:0`.
3535
36-
`!include` and :code:`!set_anchor` can't be nested inside :code:`!set_anchor`.
36+
:code:`!set_anchor` can't be nested inside another :code:`!set_anchor`.
3737
3838
Examples:
3939
@@ -75,157 +75,157 @@
7575
"""
7676

7777
import os.path
78-
from typing import Set, Callable, Any
78+
from typing import Callable, Any
7979

80-
from ruamel.yaml import YAML, Node, BaseConstructor
80+
from ruamel.yaml import (
81+
YAML,
82+
Node,
83+
BaseConstructor,
84+
ScalarNode,
85+
SequenceNode,
86+
MappingNode,
87+
RoundTripConstructor,
88+
)
8189

8290

83-
def init_yaml_loader_tags(*loader_types: str) -> None:
84-
"""Add custom tags !include, !set_anchor and !load_anchor to the specified loader types.
85-
86-
Must be initialized before yaml files have been loaded to take effect.
91+
def _include(_yaml: YAML) -> Callable[[BaseConstructor, Node], Any]:
92+
"""Includes the contents of a yaml file specified by the !include tag.
8793
8894
Parameters
8995
----------
90-
*loader_types : str
91-
Types of loaders for which tags will be initialized (i.e. "safe" or "rt").
92-
"""
96+
_yaml : YAML
97+
Used to load the yaml file that will be included.
9398
94-
def include(_yaml: YAML) -> Callable[[BaseConstructor, Node], Any]:
95-
"""Includes the contents of a yaml file specified by the !include tag.
96-
97-
Parameters
98-
----------
99-
_yaml : YAML
100-
Used to load the yaml file that will be included.
101-
102-
Returns
103-
-------
104-
Yaml data where the !include tag has been replaced by the content of the include file.
105-
"""
106-
107-
def _include(_: BaseConstructor, node: Node) -> Any:
108-
if not isinstance(node.value, (str, os.PathLike)):
109-
raise ValueError(f"'{node.value}' is not a file path")
110-
if not os.path.isfile(node.value):
111-
raise FileNotFoundError(node.value)
112-
with open(node.value, "r", encoding="utf-8") as yaml_file:
113-
try:
114-
data = _yaml.load(yaml_file)
115-
except AttributeError as error:
116-
raise ValueError(f"'{node.tag} {node.value}' could not be loaded") from error
117-
if data is None:
118-
raise ValueError(f"'{node.value}' is empty")
119-
return data
120-
121-
return _include
122-
123-
def set_anchor(
124-
_yaml: YAML, _anchors: dict[str, Any], _last_buffer: Set[str]
125-
) -> Callable[[BaseConstructor, Node], Any]:
126-
"""Sets a global anchor if the '!set_anchor'tag is used, which is valid within a file.
127-
128-
Setting it for a node with children stores the children inside the anchor.
129-
Setting it for a scalar node stores the value of that node inside the anchor.
130-
131-
Parameters
132-
----------
133-
_yaml : YAML
134-
Used to load the yaml file that will be included.
135-
_anchors : dict[str, Any]
136-
The dict where all anchors are stored.
137-
_last_buffer : Set[str]
138-
Used to check if a different file/stream has been loaded.
139-
140-
Returns
141-
-------
142-
The loaded yaml data without any modifications.
143-
"""
144-
145-
def _set_anchor(constructor: BaseConstructor, node: Node) -> Any:
146-
clear_anchors_if_buffer_changed(constructor, _anchors, _last_buffer)
147-
148-
anchor_name = get_anchor_name(node)
149-
_anchors[anchor_name] = _extract_anchor_value(constructor, node)
150-
return _anchors[anchor_name]
99+
Returns
100+
-------
101+
Yaml data where the !include tag has been replaced by the content of the include file.
102+
"""
151103

152-
def _extract_anchor_value(constructor: BaseConstructor, node: Node) -> Any:
153-
lines = constructor.loader.reader.buffer.splitlines()
154-
anchor_value_lines = lines[node.start_mark.line : node.end_mark.line + 1]
155-
anchor_value_lines[0] = anchor_value_lines[0][node.start_mark.column + len(node.tag) :]
156-
anchor_value_lines[-1] = anchor_value_lines[-1][: node.end_mark.column]
157-
anchor_value = "\n".join(anchor_value_lines)
104+
def _include_inner(_: BaseConstructor, node: Node) -> Any:
105+
if not isinstance(node.value, (str, os.PathLike)):
106+
raise ValueError(f"'{node.value}' is not a file path")
107+
if not os.path.isfile(node.value):
108+
raise FileNotFoundError(node.value)
109+
with open(node.value, "r", encoding="utf-8") as yaml_file:
158110
try:
159-
data = _yaml.load(anchor_value)
111+
data = _yaml.load(yaml_file)
160112
except AttributeError as error:
161-
_, _, value = "\\n".join(lines).partition(node.tag)
162-
raise ValueError(f"'{node.tag}{value}' could not be loaded") from error
113+
raise ValueError(f"'{node.tag} {node.value}' could not be loaded") from error
163114
if data is None:
164-
raise ValueError(f"'{lines[node.start_mark.line]}' is en empty anchor")
115+
raise ValueError(f"'{node.value}' is empty")
165116
return data
166117

167-
return _set_anchor
118+
return _include_inner
168119

169-
def load_anchor(
170-
_anchors: dict[str, Any], _last_buffer: Set[str]
171-
) -> Callable[[BaseConstructor, Node], Any]:
172-
"""Loads a global anchor if the '!load_anchor'tag is used, which is valid within a file.
173120

174-
Parameters
175-
----------
176-
_anchors : dict[str, Any]
177-
The dict where all anchors are stored.
178-
_last_buffer : Set[str]
179-
Used to check if a different file/stream has been loaded.
121+
def _set_anchor(_yaml: YAML, _anchors: dict[str, Any]) -> Callable[[BaseConstructor, Node], Any]:
122+
"""Sets a global anchor if the '!set_anchor'tag is used, which is valid within a file.
180123
181-
Returns
182-
-------
183-
Yaml data where the !load_anchor tag has been replaced by the content of the anchor.
184-
"""
124+
Setting it for a node with children stores the children inside the anchor.
125+
Setting it for a scalar node stores the value of that node inside the anchor.
185126
186-
def _load_anchor(constructor: BaseConstructor, node: Node) -> Any:
187-
clear_anchors_if_buffer_changed(constructor, _anchors, _last_buffer)
127+
Parameters
128+
----------
129+
_yaml : YAML
130+
Used to load the yaml file that will be included.
131+
_anchors : dict[str, Any]
132+
The dict where all anchors are stored.
133+
134+
Returns
135+
-------
136+
The loaded yaml data without any modifications.
137+
"""
188138

189-
anchor_name = get_anchor_name(node)
190-
try:
191-
return _anchors[anchor_name]
192-
except KeyError as error:
193-
raise ValueError(
194-
f"'{node.value}' is not a defined anchor within this yaml stream"
195-
) from error
196-
197-
return _load_anchor
198-
199-
def clear_anchors_if_buffer_changed(
200-
constructor: BaseConstructor, _anchors: dict[str, Any], _last_buffer: Set[str]
201-
) -> None:
202-
if constructor.loader.reader.buffer not in _last_buffer:
203-
_last_buffer.clear()
204-
_anchors.clear()
205-
_last_buffer.add(constructor.loader.reader.buffer)
206-
207-
def get_anchor_name(node: Node) -> str:
208-
anchor_name: str
209-
_, _, anchor_name = node.tag.partition(":")
210-
if anchor_name == "":
211-
anchor_name = "0"
212-
anchor_name = anchor_name.strip()
213-
return anchor_name
139+
def _set_anchor_inner(constructor: BaseConstructor, node: Node) -> Any:
140+
anchor_name = _get_anchor_name(node)
141+
_anchors[anchor_name] = _extract_anchor_value(constructor, node)
142+
return _anchors[anchor_name]
143+
144+
def _parse_node(constructor: BaseConstructor, node: Node) -> Any:
145+
if node.value == "":
146+
raise ValueError(f"{node.tag} is an empty anchor")
147+
148+
if isinstance(node, ScalarNode):
149+
return constructor.construct_scalar(node)
150+
151+
if isinstance(constructor, RoundTripConstructor):
152+
if isinstance(node, SequenceNode):
153+
return list(constructor.construct_yaml_seq(node))[0]
154+
if isinstance(node, MappingNode):
155+
return list(constructor.construct_yaml_map(node))[0]
156+
157+
if isinstance(node, SequenceNode):
158+
return constructor.construct_sequence(node)
159+
if isinstance(node, MappingNode):
160+
return constructor.construct_mapping(node)
161+
return {}
162+
163+
def _extract_anchor_value(constructor: BaseConstructor, node: Node) -> Any:
164+
try:
165+
data = _parse_node(constructor, node)
166+
except AttributeError as error:
167+
raise ValueError(f"'{node.tag} {node.value}' could not be loaded") from error
168+
if data is None:
169+
raise ValueError(f"{node.tag} is en empty anchor")
170+
return data
171+
172+
return _set_anchor_inner
173+
174+
175+
def _load_anchor(_anchors: dict[str, Any]) -> Callable[[BaseConstructor, Node], Any]:
176+
"""Loads a global anchor if the '!load_anchor'tag is used, which is valid within a file.
177+
178+
Parameters
179+
----------
180+
_anchors : dict[str, Any]
181+
The dict where all anchors are stored.
182+
183+
Returns
184+
-------
185+
Yaml data where the !load_anchor tag has been replaced by the content of the anchor.
186+
"""
187+
188+
def _load_anchor_inner(_: BaseConstructor, node: Node) -> Any:
189+
anchor_name = _get_anchor_name(node)
190+
try:
191+
return _anchors[anchor_name]
192+
except KeyError as error:
193+
raise ValueError(
194+
f"Global anchor '{anchor_name}' is not defined within this YAML stream"
195+
) from error
196+
197+
return _load_anchor_inner
198+
199+
200+
def _get_anchor_name(node: Node) -> str:
201+
anchor_name: str
202+
_, _, anchor_name = node.tag.partition(":")
203+
if anchor_name == "":
204+
anchor_name = "0"
205+
anchor_name = anchor_name.strip()
206+
return anchor_name
207+
208+
209+
def init_yaml_loader_tags(*loader_types: str) -> None:
210+
"""Add custom tags !include, !set_anchor and !load_anchor to the specified loader types.
211+
212+
Must be initialized before yaml files have been loaded to take effect.
213+
214+
Parameters
215+
----------
216+
*loader_types : str
217+
Types of loaders for which tags will be initialized (i.e. "safe" or "rt").
218+
"""
219+
220+
anchors: dict[str, Any] = {}
214221

215222
for loader_type in loader_types:
216223
yaml = YAML(pure=True, typ=loader_type)
224+
yaml.constructor.add_constructor("!include", _include(yaml))
217225

218-
yaml.constructor.add_constructor("!include", include(yaml))
219-
220-
last_buffer: Set[str] = set()
221-
anchors: dict[str, Any] = {}
222-
yaml.constructor.add_constructor("!set_anchor", set_anchor(yaml, anchors, last_buffer))
223-
yaml.constructor.add_constructor("!load_anchor", load_anchor(anchors, last_buffer))
226+
yaml.constructor.add_constructor("!set_anchor", _set_anchor(yaml, anchors))
227+
yaml.constructor.add_constructor("!load_anchor", _load_anchor(anchors))
224228

225229
for num in range(10):
226-
yaml.constructor.add_constructor(
227-
f"!set_anchor:{num}", set_anchor(yaml, anchors, last_buffer)
228-
)
229-
yaml.constructor.add_constructor(
230-
f"!load_anchor:{num}", load_anchor(anchors, last_buffer)
231-
)
230+
yaml.constructor.add_constructor(f"!set_anchor:{num}", _set_anchor(yaml, anchors))
231+
yaml.constructor.add_constructor(f"!load_anchor:{num}", _load_anchor(anchors))

0 commit comments

Comments
 (0)