class PxSyntaxError(Exception):
    """Raised when PX metadata cannot be split into ';'-terminated entries."""


def _iterate_px_entries(data):
    """
    Yield the ';'-terminated entries found in ``data``, one at a time.

    A ';' located between double quotes belongs to the entry and does not
    terminate it.  Every yielded entry has its surrounding whitespace
    stripped; an entry holding nothing but whitespace is yielded as an
    empty string, so callers that do not want those must skip them.

    Raises PxSyntaxError after the last entry has been yielded when a
    double quote was left open, or when non-whitespace text follows the
    final ';'.
    """
    # Definitely not the fastest way in Python!
    start = 0
    in_quote = False
    for i, c in enumerate(data):
        if c == '"':
            # Quotes only toggle the state; they stay part of the entry.
            in_quote = not in_quote
            continue
        if c == ';' and not in_quote:
            yield data[start:i].strip()
            start = i + 1

    if in_quote:
        raise PxSyntaxError("Unclosed quote")
    if data[start:].strip():
        raise PxSyntaxError("Data in the end without ending ';'")


class Px(object):
    """
    PC Axis document structure as a object interface
    """
    # NOTE(review): only _split_px is visible in this view of the class;
    # the helpers it relies on (_subfield_re, _get_subfield, _clean_value)
    # and the module-level name OD are defined outside of it.

    def _split_px(self, px_doc):
        """
        Parses metadata keywords from px_doc and inserts those into self object
        Returns the data part

        px_doc is either a path (any basestring), which is opened here, or
        an object providing read().  The content is split at "DATA=" and
        both halves are decoded as iso-8859-1.

        Raises PxSyntaxError when the metadata has an unclosed quote or
        text after its last ';'.  A ValueError propagates when the content
        does not split into exactly two parts at "DATA=", or when a plain
        entry contains no '='.
        """
        opened_here = isinstance(px_doc, basestring)
        if opened_here:
            px_doc = open(px_doc, 'U')
        try:
            meta, data = px_doc.read().split("DATA=")
        finally:
            # Close only what was opened here; a file object handed in by
            # the caller stays under the caller's control.
            if opened_here:
                px_doc.close()
        meta = unicode(meta, 'iso-8859-1')
        data = unicode(data, 'iso-8859-1')
        for line in _iterate_px_entries(meta.strip()):
            if not line:
                continue
            m = self._subfield_re.match(line)
            if m:
                # Keyword with a subkey: collect the values per field in a
                # mapping stored as an attribute of self.
                field, subkey, value = self._get_subfield(m, line)
                if hasattr(self, field):
                    getattr(self, field)[subkey] = value
                else:
                    setattr(self, field, OD(
                        [(subkey, value)]
                    ))
            else:
                field, value = line.split('=', 1)
                if not field.startswith('NOTE'):
                    setattr(self, field.strip().lower(), self._clean_value(value))
                #TODO: NOTE keywords can be standalone or have subfields...
        # Drops the last character of the stripped data part
        # (presumably the terminating ';' -- confirm against the format).
        return data.strip()[:-1]