Skip to content

Commit 6343ea2

Browse files
authored
Merge pull request #3329 from boegel/more_robust_pypi_source_urls
make pypi_source_urls more robust by using HTMLParser rather than xml.etree.ElementTree
2 parents f7c20ed + 2d3155b commit 6343ea2

File tree

3 files changed

+17
-10
lines changed

3 files changed

+17
-10
lines changed

easybuild/tools/filetools.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,13 @@
5454
import tempfile
5555
import time
5656
import zlib
57-
from xml.etree import ElementTree
5857

5958
from easybuild.base import fancylogger
6059
from easybuild.tools import run
6160
# import build_log must stay, to make use of EasyBuildLog
6261
from easybuild.tools.build_log import EasyBuildError, dry_run_msg, print_msg, print_warning
6362
from easybuild.tools.config import DEFAULT_WAIT_ON_LOCK_INTERVAL, GENERIC_EASYBLOCK_PKG, build_option, install_path
64-
from easybuild.tools.py2vs3 import std_urllib, string_type
63+
from easybuild.tools.py2vs3 import HTMLParser, std_urllib, string_type
6564
from easybuild.tools.utilities import nub, remove_unwanted_chars
6665

6766
try:
@@ -519,15 +518,21 @@ def pypi_source_urls(pkg_name):
519518
else:
520519
urls_txt = read_file(urls_html)
521520

522-
# ignore yanked releases (see https://pypi.org/help/#yanked)
523-
# see https://github.com/easybuilders/easybuild-framework/issues/3301
524-
urls_txt = re.sub(r'<a.*?data-yanked.*?</a>', '', urls_txt)
521+
res = []
525522

526-
parsed_html = ElementTree.ElementTree(ElementTree.fromstring(urls_txt))
527-
if hasattr(parsed_html, 'iter'):
528-
res = [a.attrib['href'] for a in parsed_html.iter('a')]
529-
else:
530-
res = [a.attrib['href'] for a in parsed_html.getiterator('a')]
523+
# note: don't use xml.etree.ElementTree to parse the HTML page served by PyPI's simple API
524+
# see https://github.com/pypa/warehouse/issues/7886
525+
class HrefHTMLParser(HTMLParser):
526+
"""HTML parser to extract 'href' attribute values from anchor tags (<a href='...'>)."""
527+
528+
def handle_starttag(self, tag, attrs):
529+
if tag == 'a':
530+
attrs = dict(attrs)
531+
if 'href' in attrs:
532+
res.append(attrs['href'])
533+
534+
parser = HrefHTMLParser()
535+
parser.feed(urls_txt)
531536

532537
# links are relative, transform them into full URLs; for example:
533538
# from: ../../packages/<dir1>/<dir2>/<hash>/easybuild-<version>.tar.gz#md5=<md5>

easybuild/tools/py2vs3/py2.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import json
3535
import subprocess
3636
import urllib2 as std_urllib # noqa
37+
from HTMLParser import HTMLParser # noqa
3738
from string import letters as ascii_letters # noqa
3839
from string import lowercase as ascii_lowercase # noqa
3940
from StringIO import StringIO # noqa

easybuild/tools/py2vs3/py3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from collections import OrderedDict # noqa
3939
from distutils.version import LooseVersion
4040
from functools import cmp_to_key
41+
from html.parser import HTMLParser # noqa
4142
from itertools import zip_longest
4243
from io import StringIO # noqa
4344
from string import ascii_letters, ascii_lowercase # noqa

0 commit comments

Comments
 (0)