2020from .versions import get_local_versions
2121from .char_handler import escape
2222
23-
# Sentinel stored in place of a regex when a PRONOM byte sequence cannot be
# converted; signatures carrying this value are reported and skipped.
FLG_INCOMPATIBLE = '__INCOMPATIBLE_SIG__'
2424class NS :
2525 """
2626 Helper class for XML name spaces in ElementTree.
@@ -89,24 +89,29 @@ def save(self, dst=sys.stdout):
8989 root .append (f )
9090 self .indent (root )
9191 with open (dst , 'wb' ) as file_ :
92- # print >>out, ET.tostring(root,encoding='utf-8')
9392 file_ .write (ET .tostring (root ))
9493
9594 def indent (self , elem , level = 0 ):
9695 """Indent output."""
97- i = "\n " + level * " "
9896 if len (elem ):
99- if not elem .text or not elem .text .strip ():
100- elem .text = i + " "
101- if not elem .tail or not elem .tail .strip ():
102- elem .tail = i
103- for elem in elem :
104- self .indent (elem , level + 1 )
105- if not elem .tail or not elem .tail .strip ():
106- elem .tail = i
97+ self ._indent_ele (elem , level )
10798 else :
10899 if level and (not elem .tail or not elem .tail .strip ()):
109- elem .tail = i
100+ elem .tail = self ._indent_text (level )
101+
102+ def _indent_ele (self , elem , level ):
103+ """Indent the element."""
104+ if not elem .text or not elem .text .strip ():
105+ elem .text = self ._indent_text (level ) + " "
106+ if not elem .tail or not elem .tail .strip ():
107+ elem .tail = self ._indent_text (level )
108+ for elem in elem :
109+ self .indent (elem , level + 1 )
110+ if not elem .tail or not elem .tail .strip ():
111+ elem .tail = self ._indent_text (level )
112+
113+ def _indent_text (self , level ):
114+ return "\n " + level * " "
110115
111116 def load_pronom_xml (self , puid_filter = None ):
112117 """
@@ -116,18 +121,12 @@ def load_pronom_xml(self, puid_filter=None):
116121 If a @param puid is specified, only that one will be loaded.
117122 """
118123 formats = []
119- # for p in self.pronom_files:
120- # print p
121- # print self.pronom_files
122- # exit()
123124 try :
124125 zip = zipfile .ZipFile (self .pronom_files , 'r' )
125126 for item in zip .infolist ():
126- # print item.filename
127127 try :
128128 stream = zip .open (item )
129129 # Work is done here!
130- # if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
131130 format_ = self .parse_pronom_xml (stream , puid_filter )
132131 if format_ is not None :
133132 formats .append (format_ )
@@ -144,7 +143,7 @@ def load_pronom_xml(self, puid_filter=None):
144143 id_map = {}
145144 for element in formats :
146145 puid = element .find ('puid' ).text
147- # print " working on puid:", puid
146+ # print(' working on puid:{}'.format( puid))
148147 pronom_id = element .find ('pronom_id' ).text
149148 id_map [pronom_id ] = puid
150149 for element in formats :
@@ -207,17 +206,23 @@ def parse_pronom_xml(self, source, puid_filter=None):
207206 # There are some funny chars in the notes, which caused me trouble and it is a unicode string,
208207 ET .SubElement (fido_sig , 'note' ).text = get_text_tna (pronom_sig , 'SignatureNote' )
209208 for pronom_pat in pronom_sig .findall (TNA ('ByteSequence' )):
209+ # print('Parsing ID:{}'.format(puid))
210210 fido_pat = ET .SubElement (fido_sig , 'pattern' )
211211 pos = fido_position (get_text_tna (pronom_pat , 'PositionType' ))
212- bytes = get_text_tna (pronom_pat , 'ByteSequenceValue' )
212+ byte_seq = get_text_tna (pronom_pat , 'ByteSequenceValue' )
213213 offset = get_text_tna (pronom_pat , 'Offset' )
214214 max_offset = get_text_tna (pronom_pat , 'MaxOffset' )
215215 if not max_offset :
216216 pass
217217 # print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
218- regex = convert_to_regex (bytes , 'Little' , pos , offset , max_offset )
218+ try :
219+ regex = convert_to_regex (byte_seq , 'Little' , pos , offset , max_offset )
220+ except ValueError as ve :
221+ print ('ValueError converting PUID {} signature to regex: {}' .format (puid , ve ), file = sys .stderr )
222+ regex = FLG_INCOMPATIBLE
223+
219224 # print "done puid", puid
220- if regex == "__INCOMPATIBLE_SIG__" :
225+ if regex == FLG_INCOMPATIBLE :
221226 print ("Error: incompatible PRONOM signature found for puid {} skipping..." .format (puid ), file = sys .stderr )
222227 # remove the empty 'signature' nodes
223228 # now that the signature is not compatible and thus "regex" is empty
@@ -226,7 +231,7 @@ def parse_pronom_xml(self, source, puid_filter=None):
226231 fido_format .remove (r )
227232 continue
228233 ET .SubElement (fido_pat , 'position' ).text = pos
229- ET .SubElement (fido_pat , 'pronom_pattern' ).text = bytes
234+ ET .SubElement (fido_pat , 'pronom_pattern' ).text = byte_seq
230235 ET .SubElement (fido_pat , 'regex' ).text = regex
231236 # Get the format details
232237 fido_details = ET .SubElement (fido_format , 'details' )
@@ -372,7 +377,7 @@ def _convert_err_msg(msg, c, i, chars, buf):
372377 return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\n Buffer = {5}" .format (msg , c , i , chars , i * ' ' , buf .getvalue ())
373378
374379
375- def doByte (chars , i , littleendian , esc = True ):
380+ def do_byte (chars , i , littleendian , esc = True ):
376381 """
377382 Convert two chars[i] and chars[i+1] into a byte.
378383
@@ -473,7 +478,7 @@ def do_any_all_bitmasks(chars, i, predicate, littleendian):
473478 See https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#all-bitmasks
474479 and https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#any-bitmasks
475480 """
476- byt , inc = doByte (chars , i + 1 , littleendian , esc = False )
481+ byt , inc = do_byte (chars , i + 1 , littleendian , esc = False )
477482 bitmask = ord (byt )
478483 regex = '({})' .format (
479484 '|' .join (['\\ x' + hex (byte )[2 :].zfill (2 ) for byte in range (0x100 )
@@ -534,9 +539,9 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
534539 elif chars [i ] in '*+?' :
535540 state = 'specials'
536541 else :
537- raise Exception (_convert_err_msg ('Illegal character in start' , chars [i ], i , chars , buf ))
542+ raise ValueError (_convert_err_msg ('Illegal character in start' , chars [i ], i , chars , buf ))
538543 elif state == 'bytes' :
539- (byt , inc ) = doByte (chars , i , littleendian )
544+ (byt , inc ) = do_byte (chars , i , littleendian )
540545 buf .write (byt )
541546 i += inc
542547 state = 'start'
@@ -555,7 +560,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
555560 i += 2
556561 while True :
557562 if chars [i ].isalnum ():
558- (byt , inc ) = doByte (chars , i , littleendian )
563+ (byt , inc ) = do_byte (chars , i , littleendian )
559564 buf .write (byt )
560565 i += inc
561566 elif chars [i ] == '&' :
@@ -578,15 +583,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
578583 try :
579584 buf .write ('[' )
580585 i += 1
581- (byt , inc ) = doByte (chars , i , littleendian )
586+ (byt , inc ) = do_byte (chars , i , littleendian )
582587 buf .write (byt )
583588 i += inc
584589 # assert(chars[i] == ':')
585590 if chars [i ] != ':' :
586591 return "__INCOMPATIBLE_SIG__"
587592 buf .write ('-' )
588593 i += 1
589- (byt , inc ) = doByte (chars , i , littleendian )
594+ (byt , inc ) = do_byte (chars , i , littleendian )
590595 buf .write (byt )
591596 i += inc
592597 # assert(chars[i] == ']')
@@ -606,7 +611,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
606611 i += 1
607612 while True :
608613 if chars [i ].isalnum ():
609- (byt , inc ) = doByte (chars , i , littleendian )
614+ (byt , inc ) = do_byte (chars , i , littleendian )
610615 buf .write (byt )
611616 i += inc
612617 elif chars [i ] == '|' :
@@ -618,15 +623,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
618623 elif chars [i ] == '[' :
619624 buf .write ('[' )
620625 i += 1
621- (byt , inc ) = doByte (chars , i , littleendian )
626+ (byt , inc ) = do_byte (chars , i , littleendian )
622627 buf .write (byt )
623628 i += inc
624629 # assert(chars[i] == ':')
625630 if chars [i ] != ':' :
626631 return "__INCOMPATIBLE_SIG__"
627632 buf .write ('-' )
628633 i += 1
629- (byt , inc ) = doByte (chars , i , littleendian )
634+ (byt , inc ) = do_byte (chars , i , littleendian )
630635 buf .write (byt )
631636 i += inc
632637
0 commit comments