Skip to content

Commit 8d455d1

Browse files
authored
Merge pull request #297 from mpsonntag/fixConversionEnc
Conversion encoding LGTM
2 parents 04b408f + 7e60996 commit 8d455d1

File tree

6 files changed

+51
-16
lines changed

6 files changed

+51
-16
lines changed

odml/doc.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ def __init__(self, author=None, date=None, version=None, repository=None, oid=No
3737
self._date = None
3838
self.date = date
3939

40+
# Allow recording the name of the file this document was loaded from.
41+
# It is only used for reference during processing and will not be serialized to a file.
42+
self._origin_file_name = None
43+
4044
def __repr__(self):
4145
return "<Doc %s by %s (%d sections)>" % (self._version, self._author,
4246
len(self._sections))

odml/tools/odmlparser.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@
77

88
import datetime
99
import json
10+
import sys
1011
import yaml
1112

13+
from os.path import basename
14+
1215
from . import xmlparser
1316
from .dict_parser import DictWriter, DictReader
1417
from ..info import FORMAT_VERSION
@@ -17,6 +20,11 @@
1720
from .rdf_converter import RDFReader, RDFWriter
1821
from ..validation import Validation
1922

23+
try:
24+
unicode = unicode
25+
except NameError:
26+
unicode = str
27+
2028

2129
class ODMLWriter:
2230
"""
@@ -58,10 +66,10 @@ def to_string(self, odml_document):
5866
string_doc = ''
5967

6068
if self.parser == 'XML':
61-
string_doc = str(xmlparser.XMLWriter(odml_document))
69+
string_doc = unicode(xmlparser.XMLWriter(odml_document))
6270
elif self.parser == "RDF":
6371
# Use XML as default output format for now.
64-
string_doc = RDFWriter(odml_document).get_rdf_str("turtle")
72+
string_doc = RDFWriter(odml_document).get_rdf_str("xml")
6573
else:
6674
self.parsed_doc = DictWriter().to_dict(odml_document)
6775

@@ -74,6 +82,9 @@ def to_string(self, odml_document):
7482
string_doc = json.dumps(odml_output, indent=4,
7583
cls=JSONDateTimeSerializer)
7684

85+
if sys.version_info.major < 3:
86+
string_doc = string_doc.encode("utf-8")
87+
7788
return string_doc
7889

7990

@@ -122,6 +133,8 @@ def from_file(self, file, doc_format=None):
122133
return
123134

124135
self.doc = DictReader().to_odml(self.parsed_doc)
136+
# Provide the original file name via the in-memory document
137+
self.doc._origin_file_name = basename(file)
125138
return self.doc
126139

127140
elif self.parser == 'JSON':
@@ -133,6 +146,8 @@ def from_file(self, file, doc_format=None):
133146
return
134147

135148
self.doc = DictReader().to_odml(self.parsed_doc)
149+
# Provide the original file name via the in-memory document
150+
self.doc._origin_file_name = basename(file)
136151
return self.doc
137152

138153
elif self.parser == 'RDF':

odml/tools/rdf_converter.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def save_element(self, e, node=None):
7373
fmt = e.format()
7474

7575
if not node:
76-
curr_node = URIRef(odmlns + str(e.id))
76+
curr_node = URIRef(odmlns + unicode(e.id))
7777
else:
7878
curr_node = node
7979

@@ -88,6 +88,11 @@ def save_element(self, e, node=None):
8888
if isinstance(fmt, Document.__class__):
8989
self.g.add((self.hub_root, odmlns.hasDocument, curr_node))
9090

91+
# If available, add the document's file name to the document node
92+
# so we can identify where the data came from.
93+
if hasattr(e, "_origin_file_name"):
94+
self.g.add((curr_node, odmlns.hasFileName, Literal(e._origin_file_name)))
95+
9196
for k in fmt.rdf_map_keys:
9297
if k == 'id':
9398
continue
@@ -101,7 +106,7 @@ def save_element(self, e, node=None):
101106
self.g.add((curr_node, fmt.rdf_map(k), terminology_node))
102107
else:
103108
# adding terminology to the hub and to link with the doc
104-
node = URIRef(odmlns + str(uuid.uuid4()))
109+
node = URIRef(odmlns + unicode(uuid.uuid4()))
105110
self.g.add((node, RDF.type, URIRef(terminology_url)))
106111
self.g.add((self.hub_root, odmlns.hasTerminology, node))
107112
self.g.add((curr_node, fmt.rdf_map(k), node))
@@ -111,20 +116,20 @@ def save_element(self, e, node=None):
111116
k == 'sections' and len(getattr(e, k)) > 0:
112117
sections = getattr(e, k)
113118
for s in sections:
114-
node = URIRef(odmlns + str(s.id))
119+
node = URIRef(odmlns + unicode(s.id))
115120
self.g.add((curr_node, fmt.rdf_map(k), node))
116121
self.save_element(s, node)
117122
elif isinstance(fmt, Section.__class__) and \
118123
k == 'properties' and len(getattr(e, k)) > 0:
119124
properties = getattr(e, k)
120125
for p in properties:
121-
node = URIRef(odmlns + str(p.id))
126+
node = URIRef(odmlns + unicode(p.id))
122127
self.g.add((curr_node, fmt.rdf_map(k), node))
123128
self.save_element(p, node)
124129
elif isinstance(fmt, Property.__class__) and \
125130
k == 'value' and len(getattr(e, k)) > 0:
126131
values = getattr(e, k)
127-
seq = URIRef(odmlns + str(uuid.uuid4()))
132+
seq = URIRef(odmlns + unicode(uuid.uuid4()))
128133
self.g.add((seq, RDF.type, RDF.Seq))
129134
self.g.add((curr_node, fmt.rdf_map(k), seq))
130135
# rdflib so far does not respect RDF:li item order
@@ -133,15 +138,15 @@ def save_element(self, e, node=None):
133138
# this should be reversed to RDF:li again!
134139
# see https://github.com/RDFLib/rdflib/issues/280
135140
# -- keep until supported
136-
# bag = URIRef(odmlns + str(uuid.uuid4()))
141+
# bag = URIRef(odmlns + unicode(uuid.uuid4()))
137142
# self.g.add((bag, RDF.type, RDF.Bag))
138143
# self.g.add((curr_node, fmt.rdf_map(k), bag))
139144
# for v in values:
140145
# self.g.add((bag, RDF.li, Literal(v)))
141146

142147
counter = 1
143148
for v in values:
144-
pred = "%s_%s" % (str(RDF), counter)
149+
pred = "%s_%s" % (unicode(RDF), counter)
145150
self.g.add((seq, URIRef(pred), Literal(v)))
146151
counter = counter + 1
147152

@@ -222,7 +227,11 @@ def to_odml(self):
222227

223228
def from_file(self, filename, doc_format):
224229
self.g = Graph().parse(source=filename, format=doc_format)
225-
return self.to_odml()
230+
docs = self.to_odml()
231+
for d in docs:
232+
# Provide original file name via the document
233+
d._origin_file_name = os.path.basename(filename)
234+
return docs
226235

227236
def from_string(self, file, doc_format):
228237
self.g = Graph().parse(source=StringIO(file), format=doc_format)
@@ -242,7 +251,7 @@ def parse_document(self, doc_uri):
242251
doc_attrs[attr[0]] = doc_uri.split("#", 1)[1]
243252
else:
244253
if len(elems) > 0:
245-
doc_attrs[attr[0]] = str(elems[0].toPython())
254+
doc_attrs[attr[0]] = unicode(elems[0].toPython())
246255

247256
return {'Document': doc_attrs, 'odml-version': FORMAT_VERSION}
248257

@@ -264,7 +273,7 @@ def parse_section(self, sec_uri):
264273
sec_attrs[attr[0]] = sec_uri.split("#", 1)[1]
265274
else:
266275
if len(elems) > 0:
267-
sec_attrs[attr[0]] = str(elems[0].toPython())
276+
sec_attrs[attr[0]] = unicode(elems[0].toPython())
268277
self._check_mandatory_attrs(sec_attrs)
269278
return sec_attrs
270279

@@ -293,7 +302,7 @@ def parse_property(self, prop_uri):
293302
prop_attrs[attr[0]] = prop_uri.split("#", 1)[1]
294303
else:
295304
if len(elems) > 0:
296-
prop_attrs[attr[0]] = str(elems[0].toPython())
305+
prop_attrs[attr[0]] = unicode(elems[0].toPython())
297306
self._check_mandatory_attrs(prop_attrs)
298307
return prop_attrs
299308

odml/tools/version_converter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def _parse_xml(self):
5353
doc = doc.replace(elem, val)
5454

5555
# Make sure encoding is present for the xml parser
56-
doc = doc.encode('utf-8')
56+
if sys.version_info.major > 2:
57+
doc = doc.encode('utf-8')
5758

5859
# Make pretty print available by resetting format
5960
parser = ET.XMLParser(remove_blank_text=True)

odml/tools/xmlparser.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from lxml.builder import E
1111
# this is needed for py2exe to include lxml completely
1212
from lxml import _elementpath as _dummy
13+
from os.path import basename
1314

1415
try:
1516
from StringIO import StringIO
@@ -187,7 +188,12 @@ def from_file(self, xml_file):
187188
raise ParserException(e.msg)
188189

189190
self._handle_version(root)
190-
return self.parse_element(root)
191+
doc = self.parse_element(root)
192+
193+
# Provide the original file name via the in-memory document
194+
if isinstance(xml_file, unicode):
195+
doc._origin_file_name = basename(xml_file)
196+
return doc
191197

192198
def from_string(self, string):
193199
try:

test/test_parser_odml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def test_json_file(self):
8383

8484
def test_rdf_file(self):
8585
self.rdf_writer.write_file(self.odml_doc, self.rdf_file)
86-
rdf_doc = self.rdf_reader.from_file(self.rdf_file, "turtle")
86+
rdf_doc = self.rdf_reader.from_file(self.rdf_file, "xml")
8787

8888
self.assertEqual(self.odml_doc, rdf_doc[0])
8989

0 commit comments

Comments
 (0)