1919# See the License for the specific language governing permissions and
2020# limitations under the License.
2121"""
22- In the DataONE Python stack, XML docs are represented in a few different ways.
22+ - Handle conversions between XML representations used in the D1 Python stack
23+ - Handle conversions between v1 and v2 DataONE XML types
2324
24- - Received and transmitted as utf-8 text documents.
25- - On the borders of the Python domain, handled as utf-8 or Unicode strings.
26- - Schema validation and manipulation in Python code as PyXB binding objects.
27- - General processing as ElementTrees.
25+ In the stack, XML docs are represented as follows:
26+
27+ - As native Unicode str, typically "pretty printed" with indentations, when
28+ formatted for display
29+ - As UTF-8 encoded byte strings when sending or receiving over the network,
30+ or loading or saving as files
31+ - Schema validation and manipulation in Python code as PyXB binding objects
32+ - General processing as ElementTrees
33+
34+ In order to allow conversions between all representations without having to
35+ implement separate conversions for each combination of input and output
36+ representation, a "hub and spokes" model is used. Native Unicode str was
37+ selected as the "hub" representation due to:
2838
2939- PyXB provides translation to/from string and DOM.
3040- ElementTree provides translation to/from string.
31-
32- We select string as the "hub" representation for XML.
3341"""
3442
3543import re
44+ import xml .etree
3645import xml .etree .ElementTree
3746
3847import pyxb
4857# PyXB shares information about all known types between all imported bindings.
4958PYXB_BINDING = d1_common .types .dataoneTypes_v1
5059
# Common namespace prefixes mapped to their namespace URIs. The DataONE
# entries are taken from the imported type bindings; the remaining entries are
# fixed URIs.
NS_DICT = {
    'v1': str(v1_0.Namespace),
    'v1_1': str(v1_1.Namespace),
    'v1_2': str(v1_2.Namespace),
    'v2': str(v2_0.Namespace),
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'ore': 'http://www.openarchives.org/ore/terms/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dcterms': 'http://purl.org/dc/terms/',
    'cito': 'http://purl.org/spar/cito/',
}


# Inverse of NS_DICT: namespace URI -> prefix. If two prefixes share a
# namespace URI, the prefix listed last in NS_DICT wins.
NS_REVERSE_DICT = dict(zip(NS_DICT.values(), NS_DICT.keys()))
76+
5877BINDING_TO_VERSION_TAG_DICT = {
5978 v1_0 : 'v1' ,
6079 v1_1 : 'v1' ,
@@ -271,9 +290,19 @@ def str_to_pyxb(xml_str):
271290 return PYXB_BINDING .CreateFromDocument (xml_str )
272291
273292
274- def str_to_etree (xml_str ):
275- return xml .etree .ElementTree .fromstring (xml_str )
def str_to_etree(xml_str, encoding='utf-8'):
    """Parse an XML document and return its root element.

    Args:
      xml_str: The XML document to parse.
      encoding: Encoding handed to the XMLParser that performs the parse.

    Returns:
      The root xml.etree.ElementTree.Element of the parsed document (the value
      returned by xml.etree.ElementTree.fromstring()).
    """
    return xml.etree.ElementTree.fromstring(
        xml_str,
        parser=xml.etree.ElementTree.XMLParser(encoding=encoding),
    )
277306
def pyxb_to_str(pyxb_obj):
    """Serialize a PyXB binding object to an XML document.

    Args:
      pyxb_obj: Object exposing a ``toxml(encoding)`` method.

    Returns:
      Whatever ``pyxb_obj.toxml('utf-8')`` returns.
      NOTE(review): presumably UTF-8 encoded XML rather than a native str;
      confirm against the PyXB version in use.
    """
    xml_doc = pyxb_obj.toxml('utf-8')
    return xml_doc
@@ -292,21 +321,30 @@ def etree_to_pyxb(etree_obj):
292321
293322
294323# ElementTree
295- # https://docs.python.org/2/library/xml.etree.elementtree.html
324+
def replace_namespace_with_prefix(tag_str, ns_reverse_dict=None):
    """Convert a tag in the form "{namespace}name" to "prefix:name".

    E.g.: {http://www.openarchives.org/ore/terms/}ResourceMap -> ore:ResourceMap

    Args:
      tag_str: Tag in which to replace namespaces.
      ns_reverse_dict: Map of namespace -> prefix. When omitted or empty, the
        module level NS_REVERSE_DICT is used.

    Returns:
      tag_str with every "{namespace}" found in the map replaced by "prefix:".
      Namespaces not present in the map are left as they are.
    """
    if not ns_reverse_dict:
        ns_reverse_dict = NS_REVERSE_DICT
    result_str = tag_str
    for ns, prefix in ns_reverse_dict.items():
        braced_ns = '{{{}}}'.format(ns)
        prefix_with_colon = '{}:'.format(prefix)
        result_str = result_str.replace(braced_ns, prefix_with_colon)
    return result_str
296333
297334
def etree_replace_namespace(etree_obj, ns_str):
    """Replace the namespace of an element and all of its descendants, in place.

    In addition to rewriting the tags, leading and trailing whitespace is
    stripped from the text and tail of every visited element.

    Args:
      etree_obj: Root element of the (sub)tree to modify.
      ns_str: Namespace to write into the "{namespace}" part of each tag.

    Returns:
      None. The tree is modified in place.
    """
    _replace_namespace_recursive(etree_obj, ns_str)


def _replace_namespace_recursive(el, ns_str):
    """Apply the namespace replacement to ``el``, then recurse into its children.

    Tags that have no "{namespace}" part are left unchanged.
    """
    new_ns_str = '{{{}}}'.format(ns_str)
    # Pass the replacement as a callable so that it is inserted literally.
    # When given as a plain string, re.sub() treats it as a template and
    # interprets backslashes in the namespace as escapes or group references.
    el.tag = re.sub(r'{.*\}', lambda _match: new_ns_str, el.tag)
    el.text = el.text.strip() if el.text else None
    el.tail = el.tail.strip() if el.tail else None
    for child_el in el:
        _replace_namespace_recursive(child_el, ns_str)
308345
309346
347+
310348def strip_v2_elements (etree_obj ):
311349 """Remove elements and attributes that are only valid in v2 types"""
312350 if etree_obj .tag == v2_0_tag ('logEntry' ):
@@ -358,3 +396,5 @@ def strip_node_list(etree_obj):
358396
def v2_0_tag(element_name):
    """Return ``element_name`` qualified with the 'v2' namespace from NS_DICT.

    The result is in the "{namespace}name" form.
    """
    v2_ns_str = NS_DICT['v2']
    return '{{{}}}{}'.format(v2_ns_str, element_name)
399+
400+
0 commit comments