diff --git a/rdflib_jsonld/util.py b/rdflib_jsonld/util.py index b2bcea1..2eb06c6 100644 --- a/rdflib_jsonld/util.py +++ b/rdflib_jsonld/util.py @@ -4,21 +4,48 @@ assert json # workaround for pyflakes issue #13 except ImportError: import simplejson as json - -from ._compat import IS_PY3 as PY3 +from html.parser import HTMLParser from os import sep from os.path import normpath +from urllib.parse import urljoin, urlsplit, urlunsplit +from rdflib.parser import create_input_source +from io import StringIO -if PY3: - from urllib.parse import urljoin, urlsplit, urlunsplit -else: - from urlparse import urljoin, urlsplit, urlunsplit -from rdflib.parser import create_input_source +class HTMLJSONParser(HTMLParser): + def __init__(self): + super().__init__() + self.json = [] + self.contains_json = False + + def handle_starttag(self, tag, attrs): + self.contains_json = False + + # Only set self. contains_json to True if the + # type is 'application/ld+json' + if tag == "script": + for (attr, value) in attrs: + if attr == 'type' and value == 'application/ld+json': + self.contains_json = True + else: + # Nothing to do + continue -if PY3: - from io import StringIO + def handle_data(self, data): + # Only do something when we know the context is a + # script element containing application/ld+json + + if self.contains_json is True: + if data.strip() == "": + # skip empty data elements + return + + # Try to parse the json + self.json.append(json.loads(data)) + + def get_json(self): + return self.json def source_to_json(source): @@ -27,10 +54,17 @@ def source_to_json(source): stream = source.getByteStream() try: - if PY3: - return json.load(StringIO(stream.read().decode("utf-8"))) - else: - return json.load(stream) + return json.load(StringIO(stream.read().decode('utf-8'))) + except json.JSONDecodeError as e: + # The document is not a JSON document, let's see whether we can parse + # it as HTML + + # Reset stream pointer to 0 + stream.seek(0) + parser = HTMLJSONParser() + parser.feed(stream.read().decode('utf-8')) + + return parser.get_json() finally: stream.close()