Skip to content

Commit d9b3d0c

Browse files
authored
Merge pull request #129 from victor-torres/remove-selector
Remove Selectors or SelectorLists from their parent elements
2 parents c9901d2 + 121dd1f commit d9b3d0c

File tree

4 files changed

+130
-0
lines changed

4 files changed

+130
-0
lines changed

README.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Features
2424
--------
2525

2626
* Extract text using CSS or XPath selectors
27+
* Remove elements using CSS or XPath selectors
2728
* Regular expression helper methods
2829

2930
Example::

docs/usage.rst

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,41 @@ XPath specification.
385385
.. _Location Paths: https://www.w3.org/TR/xpath#location-paths
386386

387387

388+
Removing elements
389+
-----------------
390+
391+
If for any reason you need to remove elements based on a Selector or
392+
a SelectorList, you can do it with the ``remove()`` method, available for both
393+
classes.
394+
395+
.. warning:: This is a destructive action and cannot be undone. The original
396+
content of the selector is removed from the element tree. This could be useful
397+
when trying to reduce the memory footprint of Responses.
398+
399+
Example removing an ad from a blog post:
400+
401+
>>> from parsel import Selector
402+
>>> doc = u"""
403+
... <article>
404+
... <div class="row">Content paragraph...</div>
405+
... <div class="row">
406+
... <div class="ad">
407+
... Ad content...
408+
... <a href="http://...">Link</a>
409+
... </div>
410+
... </div>
411+
... <div class="row">More content...</div>
412+
... </article>
413+
... """
414+
>>> sel = Selector(text=doc)
415+
>>> sel.xpath('//div//text()').getall()
416+
['Content paragraph...', 'Ad content...', 'Link', 'More content...']
417+
>>> sel.xpath('//div[@class="ad"]').remove()
418+
>>> sel.xpath('//div//text()').getall()
419+
['Content paragraph...', 'More content...']
420+
>>>
421+
422+
388423
Using EXSLT extensions
389424
----------------------
390425

parsel/selector.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111
from .csstranslator import HTMLTranslator, GenericTranslator
1212

1313

14+
class CannotRemoveElementWithoutRoot(Exception):
    """Raised by ``Selector.remove()`` when the selector's root is not a
    tree node (it has no ``getparent``), e.g. a plain string produced by
    a text pseudo-element selector.
    """
16+
17+
18+
class CannotRemoveElementWithoutParent(Exception):
    """Raised by ``Selector.remove()`` when the selector's root node has
    no parent to be removed from, e.g. the document's root element.
    """
20+
21+
1422
class SafeXMLParser(etree.XMLParser):
1523
def __init__(self, *args, **kwargs):
1624
kwargs.setdefault('resolve_entities', False)
@@ -150,6 +158,13 @@ def attrib(self):
150158
else:
151159
return {}
152160

161+
def remove(self):
    """
    Call ``remove()`` on every selector held by this list, in order,
    so that each matched node is detached from its own parent.
    """
    for selector in self:
        selector.remove()
167+
153168

154169
class Selector(object):
155170
"""
@@ -342,6 +357,30 @@ def remove_namespaces(self):
342357
# remove namespace declarations
343358
etree.cleanup_namespaces(self.root)
344359

360+
def remove(self):
    """
    Remove matched nodes from the parent element.

    The text that follows the removed node (its *tail*) is not part of
    the match, so it is kept: it is appended to the previous sibling's
    tail or, when there is no previous sibling, to the parent's text.

    Raises ``CannotRemoveElementWithoutRoot`` when the selector's root is
    not a tree node (e.g. the plain string behind ``li::text`` or
    ``//li/text()``), and ``CannotRemoveElementWithoutParent`` when the
    node has no parent (e.g. the document's root element).
    """
    node = self.root

    try:
        parent = node.getparent()
    except AttributeError:
        # 'str' object has no attribute 'getparent'
        raise CannotRemoveElementWithoutRoot(
            "The node you're trying to remove has no root, "
            "are you trying to remove a pseudo-element? "
            "Try to use 'li' as a selector instead of 'li::text' or "
            "'//li' instead of '//li/text()', for example."
        )

    # Check explicitly instead of catching AttributeError around the
    # removal itself, which could hide unrelated attribute errors.
    if parent is None:
        raise CannotRemoveElementWithoutParent(
            "The node you're trying to remove has no parent, "
            "are you trying to remove a root element?"
        )

    # parent.remove() would discard the node's tail text along with the
    # node, so re-attach the tail to the preceding content first.
    tail = node.tail
    if tail:
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail

    parent.remove(node)
383+
345384
@property
346385
def attrib(self):
347386
"""Return the attributes dictionary for underlying element.

tests/test_selector.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
import pickle
77

88
from parsel import Selector
9+
from parsel.selector import (
10+
CannotRemoveElementWithoutRoot,
11+
CannotRemoveElementWithoutParent,
12+
)
913

1014

1115
class SelectorTestCase(unittest.TestCase):
@@ -745,6 +749,57 @@ def test_replacement_null_char_from_body(self):
745749
self.assertEqual(u'<html><body><p>Grainy</p></body></html>',
746750
self.sscls(text).extract())
747751

752+
def test_remove_selector_list(self):
    """Removing a whole SelectorList leaves no matching nodes behind."""
    document = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
    items = document.css('li')
    items.remove()
    # Querying again must still yield a (now empty) SelectorList.
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li'), [])
758+
759+
def test_remove_selector(self):
    """Removing a single Selector only drops that one node."""
    document = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
    items = document.css('li')
    first_item = items[0]
    first_item.remove()
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    # Only the first <li> is gone; the other two remain in order.
    self.assertEqual(document.css('li::text').getall(), ['2', '3'])
765+
766+
def test_remove_pseudo_element_selector_list(self):
    """Removing a list of text pseudo-elements fails and changes nothing."""
    document = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
    text_nodes = document.css('li::text')
    self.assertEqual(text_nodes.getall(), ['1', '2', '3'])
    self.assertRaises(CannotRemoveElementWithoutRoot, text_nodes.remove)

    # The document must be left untouched by the failed removal.
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li::text').getall(), ['1', '2', '3'])
775+
776+
def test_remove_pseudo_element_selector(self):
    """Removing a single text pseudo-element fails and changes nothing."""
    document = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
    text_nodes = document.css('li::text')
    self.assertEqual(text_nodes.getall(), ['1', '2', '3'])
    first_text = text_nodes[0]
    self.assertRaises(CannotRemoveElementWithoutRoot, first_text.remove)

    # The document must be left untouched by the failed removal.
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li::text').getall(), ['1', '2', '3'])
785+
786+
def test_remove_root_element_selector(self):
    """The root element cannot be removed, but its children can."""
    document = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
    text_nodes = document.css('li::text')
    self.assertEqual(text_nodes.getall(), ['1', '2', '3'])

    # Both the top-level selector and an explicit 'html' match point at
    # a node without a parent.
    self.assertRaises(CannotRemoveElementWithoutParent, document.remove)
    html_matches = document.css('html')
    self.assertRaises(CannotRemoveElementWithoutParent, html_matches.remove)

    # The failed removals must leave the document untouched.
    self.assertIsInstance(document.css('li'), self.sscls.selectorlist_cls)
    self.assertEqual(document.css('li::text').getall(), ['1', '2', '3'])

    # A direct child of the root can be removed.
    document.css('body').remove()
    self.assertEqual(document.get(), '<html></html>')
801+
802+
748803
class ExsltTestCase(unittest.TestCase):
749804

750805
sscls = Selector

0 commit comments

Comments
 (0)