@@ -74,13 +74,21 @@ For the sake of completeness, here's its full HTML code:
7474
7575.. highlight:: python
7676
77- So, let's download that page and create a selector for it::
77+ So, let's download that page and create a selector for it:
7878
79- >>> import requests
80- >>> from parsel import Selector
81- >>> url = 'http://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html'
82- >>> text = requests.get(url).text
83- >>> selector = Selector(text=text)
79+ .. skip: start
80+
81+ >>> import requests
82+ >>> from parsel import Selector
83+ >>> url = 'http://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html'
84+ >>> text = requests.get(url).text
85+ >>> selector = Selector(text=text)
86+
87+ .. skip: end
88+
89+ .. invisible-code-block: python
90+
91+ selector = load_selector('selectors-sample1.html')
8492
8593 Since we're dealing with HTML, the default type for Selector, we don't need
8694to specify the `type` argument.
@@ -279,7 +287,6 @@ too. Here's an example::
279287 >>> for index, link in enumerate(links):
280288 ... args = (index, link.xpath('@href').get(), link.xpath('img/@src').get())
281289 ... print('Link number %d points to url %r and image %r' % args)
282-
283290 Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg'
284291 Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg'
285292 Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg'
@@ -341,18 +348,18 @@ Here's an example used to extract image names from the :ref:`HTML code
341348<topics-selectors-htmlcode>` above::
342349
343350 >>> selector.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
344- ['My image 1',
345- 'My image 2',
346- 'My image 3',
347- 'My image 4',
348- 'My image 5']
351+ ['My image 1 ',
352+ 'My image 2 ',
353+ 'My image 3 ',
354+ 'My image 4 ',
355+ 'My image 5 ']
349356
350357There's an additional helper reciprocating ``.get()`` (and its
351358alias ``.extract_first()``) for ``.re()``, named ``.re_first()``.
352359Use it to extract just the first matching string::
353360
354361 >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
355- 'My image 1'
362+ 'My image 1 '
356363
357364.. _topics-selectors-relative-xpaths:
358365
@@ -419,11 +426,10 @@ Example removing an ad from a blog post:
419426 ... """
420427 >>> sel = Selector(text=doc)
421428 >>> sel.xpath('//div/text()').getall()
422- ['Content paragraph...', 'Ad content...', 'Link ', 'More content...']
429+ ['Content paragraph...', '\n ', '\n Ad content...\n ', '\n ', '\n ', 'More content...']
423430 >>> sel.xpath('//div[@class="ad"]').remove()
424431 >>> sel.xpath('//div//text()').getall()
425432 ['Content paragraph...', 'More content...']
426- >>>
427433
428434
429435Using EXSLT extensions
@@ -463,7 +469,7 @@ Example selecting links in list item with a "class" attribute ending with a digi
463469 >>> sel = Selector(text=doc)
464470 >>> sel.xpath('//li//@href').getall()
465471 ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
466- >>> sel.xpath('//li[re:test(@class, "item-\d$")]//@href').getall()
472+ >>> sel.xpath(r'//li[re:test(@class, "item-\d$")]//@href').getall()
467473 ['link1.html', 'link2.html', 'link4.html', 'link5.html']
468474 >>>
469475
@@ -539,29 +545,27 @@ with groups of itemscopes and corresponding itemprops::
539545 ... .//*[@itemscope]/*/@itemprop)''')
540546 ... print(" properties: %s" % (props.getall()))
541547 ... print("")
542-
543548 current scope: ['http://schema.org/Product']
544549 properties: ['name', 'aggregateRating', 'offers', 'description', 'review', 'review']
545-
550+ <BLANKLINE>
546551 current scope: ['http://schema.org/AggregateRating']
547552 properties: ['ratingValue', 'reviewCount']
548-
553+ <BLANKLINE>
549554 current scope: ['http://schema.org/Offer']
550555 properties: ['price', 'availability']
551-
556+ <BLANKLINE>
552557 current scope: ['http://schema.org/Review']
553558 properties: ['name', 'author', 'datePublished', 'reviewRating', 'description']
554-
559+ <BLANKLINE>
555560 current scope: ['http://schema.org/Rating']
556561 properties: ['worstRating', 'ratingValue', 'bestRating']
557-
562+ <BLANKLINE>
558563 current scope: ['http://schema.org/Review']
559564 properties: ['name', 'author', 'datePublished', 'reviewRating', 'description']
560-
565+ <BLANKLINE>
561566 current scope: ['http://schema.org/Rating']
562567 properties: ['worstRating', 'ratingValue', 'bestRating']
563568
564- >>>
565569
566570Here we first iterate over ``itemscope`` elements, and for each one,
567571we look for all ``itemprops`` elements and exclude those that are themselves
@@ -680,16 +684,16 @@ Example::
680684
681685 >>> from parsel import Selector
682686 >>> sel = Selector(text="""
683- ....: <ul class="list">
684- ....: <li>1</li>
685- ....: <li>2</li>
686- ....: <li>3</li>
687- ....: </ul>
688- ....: <ul class="list">
689- ....: <li>4</li>
690- ....: <li>5</li>
691- ....: <li>6</li>
692- ....: </ul>""")
687+ ... <ul class="list">
688+ ... <li>1</li>
689+ ... <li>2</li>
690+ ... <li>3</li>
691+ ... </ul>
692+ ... <ul class="list">
693+ ... <li>4</li>
694+ ... <li>5</li>
695+ ... <li>6</li>
696+ ... </ul>""")
693697 >>> xp = lambda x: sel.xpath(x).getall()
694698
695699This gets all first ``<li>`` elements under whatever its parent is::
@@ -752,21 +756,21 @@ For example::
752756
753757 >>> from parsel import Selector
754758 >>> selector = Selector(text="""
755- .... <script>
756- .... <!-- comment -->
757- .... text
758- .... <br/>
759- .... </script>
760- .... <style>
761- .... <!-- comment -->
762- .... text
763- .... <br/>
764- .... </style>
765- .... <div>
766- .... <!-- comment -->
767- .... text
768- .... <br/>
769- .... </div>""")
759+ ... <script>
760+ ... text
761+ ... <!-- comment -->
762+ ... <br/>
763+ ... </script>
764+ ... <style>
765+ ... text
766+ ... <!-- comment -->
767+ ... <br/>
768+ ... </style>
769+ ... <div>
770+ ... text
771+ ... <!-- comment -->
772+ ... <br/>
773+ ... </div>""")
770774 >>> for tag in selector.xpath('//*[contains(text(), "text")]'):
771775 ... print(tag.xpath('name()').get())
772776 ... print(' Text: ' + (tag.xpath('text()').get() or ''))
@@ -778,21 +782,21 @@ For example::
778782 text
779783 <!-- comment -->
780784 <br/>
781-
785+ <BLANKLINE>
782786 Comment:
783787 Children:
784788 style
785789 Text:
786790 text
787791 <!-- comment -->
788792 <br/>
789-
793+ <BLANKLINE>
790794 Comment:
791795 Children:
792796 div
793797 Text:
794798 text
795-
799+ <BLANKLINE>
796800 Comment: <!-- comment -->
797801 Children: <br>
798802
@@ -811,6 +815,10 @@ and readable code.
811815
812816The following examples show how these methods map to each other.
813817
818+ .. invisible-code-block: python
819+
820+ selector = load_selector('selectors-sample1.html')
821+
814822 1. ``SelectorList.get()`` is the same as ``SelectorList.extract_first()``::
815823
816824 >>> selector.css('a::attr(href)').get()
@@ -852,10 +860,14 @@ Using CSS selectors in multi-root documents
852860Some webpages may have multiple root elements. It can happen, for example, when
853861a webpage has broken code, such as missing closing tags.
854862
855- You can use XPath to determine if a page has multiple root elements::
863+ .. invisible-code-block: python
856864
857- >>> len(selector.xpath('/*')) > 1
858- True
865+ selector = load_selector('multiroot.html')
866+
867+ You can use XPath to determine if a page has multiple root elements:
868+
869+ >>> len(selector.xpath('/*')) > 1
870+ True
859871
860872CSS selectors only work on the first root element, because the first root
861873element is always used as the starting current element, and CSS selectors do
@@ -954,12 +966,20 @@ method for that.
954966
955967Let's show an example that illustrates this with the Python Insider blog atom feed.
956968
957- Let's download the atom feed using `requests`_ and create a selector::
969+ Let's download the atom feed using `requests`_ and create a selector:
958970
959- >>> import requests
960- >>> from parsel import Selector
961- >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
962- >>> sel = Selector(text=text, type='xml')
971+ .. skip: start
972+
973+ >>> import requests
974+ >>> from parsel import Selector
975+ >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
976+ >>> sel = Selector(text=text, type='xml')
977+
978+ .. skip: end
979+
980+ .. invisible-code-block: python
981+
982+ sel = load_selector('python-insider.xml', type='xml')
963983
964984 This is how the file starts:
965985
@@ -993,9 +1013,9 @@ directly by their names::
9931013
9941014 >>> sel.remove_namespaces()
9951015 >>> sel.xpath("//link")
996- [<Selector xpath='//link' data='<link rel="alternate" type="text/html" h '>,
997- <Selector xpath='//link' data='<link rel="next" type="application/atom+ '>,
998- ...
1016+ [<Selector xpath='//link' data='<link rel="alternate" type="text/html...'>,
1017+ <Selector xpath='//link' data='<link rel="next" type="application/at...'>,
1018+ ...]
9991019
10001020If you wonder why the namespace removal procedure isn't called always by default
10011021instead of having to call it manually, this is because of two reasons, which, in order
@@ -1020,29 +1040,37 @@ Ad-hoc namespaces references
10201040references along with the query, through a ``namespaces`` argument,
10211041with the prefixes you declare being used in your XPath or CSS query.
10221042
1023- Let's use the same Python Insider Atom feed::
1043+ Let's use the same Python Insider Atom feed:
10241044
1025- >>> import requests
1026- >>> from parsel import Selector
1027- >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
1028- >>> sel = Selector(text=text, type='xml')
1045+ .. skip: start
1046+
1047+ >>> import requests
1048+ >>> from parsel import Selector
1049+ >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
1050+ >>> sel = Selector(text=text, type='xml')
1051+
1052+ .. skip: end
1053+
1054+ .. invisible-code-block: python
1055+
1056+ sel = load_selector('python-insider.xml', type='xml')
10291057
10301058 And try to select the links again, now using an "atom:" prefix
10311059for the "link" node test::
10321060
10331061 >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
1034- [<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom '>,
1035- <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/Atom '>,
1036- ...
1062+ [<Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
1063+ <Selector xpath='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
1064+ ...]
10371065
10381066You can pass several namespaces (here we're using shorter 1-letter prefixes)::
10391067
10401068 >>> sel.xpath("//a:entry/a:author/g:image/@src",
10411069 ... namespaces={"a": "http://www.w3.org/2005/Atom",
10421070 ... "g": "http://schemas.google.com/g/2005"}).getall()
1043- ['http ://photos1.blogger .com/blogger/4554/1119/400/beethoven_10.jpg ',
1044- '//lh3.googleusercontent .com/-7xisiK0EArc/AAAAAAAAAAI/AAAAAAAAAuM/-r6o6A8RKCM/s512-c/photo.jpg ',
1045- ...
1071+ ['https://img1.blogblog.com/img/b16-rounded.gif',
1072+ 'https://img1.blogblog.com/img/b16-rounded.gif',
1073+ ...]
10461074
10471075.. _topics-xpath-variables:
10481076
@@ -1055,6 +1083,10 @@ queries or prepared statements in the SQL world where you replace
10551083some arguments in your queries with placeholders like ``?``,
10561084which are then substituted with values passed with the query.
10571085
1086+ .. invisible-code-block: python
1087+
1088+ selector = load_selector('selectors-sample1.html')
1089+
10581090 Here's an example to match an element based on its normalized string-value::
10591091
10601092 >>> str_to_match = "Name: My image 3"
0 commit comments