1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414import codecs
15- import itertools
1615import logging
1716import re
18- from typing import TYPE_CHECKING , Dict , Generator , Iterable , Optional , Set , Union
17+ from typing import TYPE_CHECKING , Dict , Generator , Iterable , List , Optional , Set , Union
1918
2019if TYPE_CHECKING :
2120 from lxml import etree
@@ -276,7 +275,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
276275
277276 from lxml import etree
278277
279- TAGS_TO_REMOVE = (
278+ TAGS_TO_REMOVE = {
280279 "header" ,
281280 "nav" ,
282281 "aside" ,
@@ -291,31 +290,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
291290 "img" ,
292291 "picture" ,
293292 etree .Comment ,
294- )
293+ }
295294
296295 # Split all the text nodes into paragraphs (by splitting on new
297296 # lines)
298297 text_nodes = (
299298 re .sub (r"\s+" , "\n " , el ).strip ()
300- for el in _iterate_over_text (tree .find ("body" ), * TAGS_TO_REMOVE )
299+ for el in _iterate_over_text (tree .find ("body" ), TAGS_TO_REMOVE )
301300 )
302301 return summarize_paragraphs (text_nodes )
303302
304303
305304def _iterate_over_text (
306- tree : "etree.Element" , * tags_to_ignore : Union [str , "etree.Comment" ]
305+ tree : Optional ["etree.Element" ],
306+ tags_to_ignore : Set [Union [str , "etree.Comment" ]],
307+ stack_limit : int = 1024 ,
307308) -> Generator [str , None , None ]:
308309 """Iterate over the tree returning text nodes in a depth first fashion,
309310 skipping text nodes inside certain tags.
311+
312+ Args:
313+ tree: The parent element to iterate. Can be None if there isn't one.
314+ tags_to_ignore: Set of tags to ignore
315+ stack_limit: Maximum stack size limit for depth-first traversal.
316+ Nodes will be dropped if this limit is hit, which may truncate the
317+ textual result.
318+ Intended to limit the maximum working memory when generating a preview.
310319 """
311- # This is basically a stack that we extend using itertools.chain.
312- # This will either consist of an element to iterate over *or* a string
320+
321+ if tree is None :
322+ return
323+
324+ # This is a stack whose items are elements to iterate over *or* strings
313325 # to be returned.
314- elements = iter ([tree ])
315- while True :
316- el = next (elements , None )
317- if el is None :
318- return
326+ elements : List [Union [str , "etree.Element" ]] = [tree ]
327+ while elements :
328+ el = elements .pop ()
319329
320330 if isinstance (el , str ):
321331 yield el
@@ -329,17 +339,22 @@ def _iterate_over_text(
329339 if el .text :
330340 yield el .text
331341
332- # We add to the stack all the elements children, interspersed with
333- # each child's tail text (if it exists). The tail text of a node
334- # is text that comes *after* the node, so we always include it even
335- # if we ignore the child node.
336- elements = itertools .chain (
337- itertools .chain .from_iterable ( # Basically a flatmap
338- [child , child .tail ] if child .tail else [child ]
339- for child in el .iterchildren ()
340- ),
341- elements ,
342- )
342+ # We add to the stack all the element's children, interspersed with
343+ # each child's tail text (if it exists).
344+ #
345+ # We iterate in reverse order so that earlier pieces of text appear
346+ # closer to the top of the stack.
347+ for child in el .iterchildren (reversed = True ):
348+ if len (elements ) > stack_limit :
349+ # We've hit our limit for working memory
350+ break
351+
352+ if child .tail :
353+ # The tail text of a node is text that comes *after* the node,
354+ # so we always include it even if we ignore the child node.
355+ elements .append (child .tail )
356+
357+ elements .append (child )
343358
344359
345360def summarize_paragraphs (
0 commit comments