Skip to content

Commit 701fde3

Browse files
committed
REL: FIDO Release Candidate 1.6 RC5
- added update signature parameter to control signature download version: - `-version` parameter that defaults to "latest", behaviour remains identical; - if `-version v104` is passed then v104 signatures will be created; - trapped regex creation exception so that sig file creation is not derailed; - PRONOM/DROID signature file now downloaded from URL rather than via SOAP service; - moved sleep between SOAP downloads so that it's only applied between actual downloads, not when processing cached results; - bumped version number to 1.6.0rc5 plus updated release date; - code style warnings: - some minor refactoring for complex methods; - factoring out string constants; - renamed some variables and methods; - removed some commented code; - tidied exit conditions; and - removed some unreachable code.
1 parent e0d8fd0 commit 701fde3

File tree

12 files changed

+61745
-705
lines changed

12 files changed

+61745
-705
lines changed

RELEASENOTES.md

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
RELEASE NOTES
22
=============
3+
34
Format Identification for Digital Objects (fido).
45
Copyright 2010 by Open Preservation Foundation.
56

@@ -8,12 +9,29 @@ Copyright 2010 The Open Preservation Foundation
89
Fido is made available under the Apache License, Version 2.0; see the file
910
LICENSE.txt for details.
1011

11-
Fido 1.6.0rc1
12+
Fido 1.6.0rc5
13+
-------------
14+
15+
2022-08-03
16+
17+
- added update signature parameter to control signature download version:
18+
- trapped regex creation exception so that sig file creation is not derailed;
19+
- PRONOM/DROID signature file now downloaded from URL rather than via SOAP service;
20+
- moved sleep between SOAP downloads so that it's only applied between actual downloads, not when processing cached results;
21+
- code style warnings:
22+
- some minor refactoring for complex methods;
23+
- factoring out string constants;
24+
- renamed some variables and methods;
25+
- removed some commented code;
26+
- tidied exit conditions; and
27+
- removed some unreachable code.
28+
29+
Fido 1.6.0rc4
1230
-------------
1331

14-
2022-03-29
32+
2022-06-22
1533

16-
New command line options for updating signatures, see
34+
New command line options for updating signatures
1735

1836
- PRONOM signatures can now be updated from a web service [[#202][]].
1937
- PRONOM v104 support with successful signature compilation (see issue [#203][]) [[#204][]].

fido/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from six.moves import input as rinput
1515

1616

17-
__version__ = '1.6.0rc1'
17+
__version__ = '1.6.0rc5'
1818

1919

2020
CONFIG_DIR = join(abspath(dirname(__file__)), 'conf')

fido/conf/DROID_SignatureFile-v104.xml

Lines changed: 0 additions & 2 deletions
This file was deleted.

fido/conf/DROID_SignatureFile-v107.xml

Lines changed: 58198 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 3348 additions & 558 deletions
Large diffs are not rendered by default.

fido/conf/versions.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
<?xml version='1.0' encoding='utf-8'?>
22
<versions>
3-
<pronomVersion>104</pronomVersion>
4-
<pronomSignature>formats-v104.xml</pronomSignature>
3+
<pronomVersion>107</pronomVersion>
4+
<pronomSignature>formats-v107.xml</pronomSignature>
55
<pronomContainerSignature>container-signature-20200121.xml</pronomContainerSignature>
66
<fidoExtensionSignature>format_extensions.xml</fidoExtensionSignature>
7-
<updateScript>1.6.0rc1</updateScript>
7+
<updateScript>1.6.0rc5</updateScript>
88
<updateSite>https://fidosigs.openpreservation.org</updateSite>
99
</versions>

fido/fido.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -886,17 +886,16 @@ def main(args=None):
886886
if (not args.input) and len(args.files) == 1 and args.files[0] == '-':
887887
if fido.zip:
888888
raise RuntimeError("Multiple content read from stdin not yet supported.")
889-
sys.exit(1)
890889
fido.identify_multi_object_stream(sys.stdin, extension=not args.noextension)
891890
else:
892891
fido.identify_stream(sys.stdin, args.filename, extension=not args.noextension)
893892
else:
894893
for file in list_files(args.files, args.recurse):
895894
fido.identify_file(file, extension=not args.noextension)
896895
except KeyboardInterrupt:
897-
msg = "FIDO: Interrupt while identifying file {0}"
898-
sys.stderr.write(msg.format(fido.current_file))
899-
sys.exit(1)
896+
sys.stdout.flush()
897+
sys.stderr.flush()
898+
sys.exit('FIDO: Interrupt while identifying file {0}'.format(fido.current_file))
900899

901900
if not args.q:
902901
sys.stdout.flush()

fido/prepare.py

Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from .versions import get_local_versions
2121
from .char_handler import escape
2222

23-
23+
FLG_INCOMPATIBLE = '__INCOMPATIBLE_SIG__'
2424
class NS:
2525
"""
2626
Helper class for XML name spaces in ElementTree.
@@ -89,24 +89,29 @@ def save(self, dst=sys.stdout):
8989
root.append(f)
9090
self.indent(root)
9191
with open(dst, 'wb') as file_:
92-
# print >>out, ET.tostring(root,encoding='utf-8')
9392
file_.write(ET.tostring(root))
9493

9594
def indent(self, elem, level=0):
9695
"""Indent output."""
97-
i = "\n" + level * " "
9896
if len(elem):
99-
if not elem.text or not elem.text.strip():
100-
elem.text = i + " "
101-
if not elem.tail or not elem.tail.strip():
102-
elem.tail = i
103-
for elem in elem:
104-
self.indent(elem, level + 1)
105-
if not elem.tail or not elem.tail.strip():
106-
elem.tail = i
97+
self._indent_ele(elem, level)
10798
else:
10899
if level and (not elem.tail or not elem.tail.strip()):
109-
elem.tail = i
100+
elem.tail = self._indent_text(level)
101+
102+
def _indent_ele(self, elem, level):
103+
"""Indent the element."""
104+
if not elem.text or not elem.text.strip():
105+
elem.text = self._indent_text(level) + " "
106+
if not elem.tail or not elem.tail.strip():
107+
elem.tail = self._indent_text(level)
108+
for elem in elem:
109+
self.indent(elem, level + 1)
110+
if not elem.tail or not elem.tail.strip():
111+
elem.tail = self._indent_text(level)
112+
113+
def _indent_text(self, level):
114+
return "\n" + level * " "
110115

111116
def load_pronom_xml(self, puid_filter=None):
112117
"""
@@ -116,18 +121,12 @@ def load_pronom_xml(self, puid_filter=None):
116121
If a @param puid is specified, only that one will be loaded.
117122
"""
118123
formats = []
119-
# for p in self.pronom_files:
120-
# print p
121-
# print self.pronom_files
122-
# exit()
123124
try:
124125
zip = zipfile.ZipFile(self.pronom_files, 'r')
125126
for item in zip.infolist():
126-
# print item.filename
127127
try:
128128
stream = zip.open(item)
129129
# Work is done here!
130-
# if item.filename != 'github/fido/fido/conf/pronom-xml/puid.fmt.11.xml':
131130
format_ = self.parse_pronom_xml(stream, puid_filter)
132131
if format_ is not None:
133132
formats.append(format_)
@@ -144,7 +143,7 @@ def load_pronom_xml(self, puid_filter=None):
144143
id_map = {}
145144
for element in formats:
146145
puid = element.find('puid').text
147-
# print "working on puid:",puid
146+
# print('working on puid:{}'.format(puid))
148147
pronom_id = element.find('pronom_id').text
149148
id_map[pronom_id] = puid
150149
for element in formats:
@@ -207,17 +206,23 @@ def parse_pronom_xml(self, source, puid_filter=None):
207206
# There are some funny chars in the notes, which caused me trouble and it is a unicode string,
208207
ET.SubElement(fido_sig, 'note').text = get_text_tna(pronom_sig, 'SignatureNote')
209208
for pronom_pat in pronom_sig.findall(TNA('ByteSequence')):
209+
# print('Parsing ID:{}'.format(puid))
210210
fido_pat = ET.SubElement(fido_sig, 'pattern')
211211
pos = fido_position(get_text_tna(pronom_pat, 'PositionType'))
212-
bytes = get_text_tna(pronom_pat, 'ByteSequenceValue')
212+
byte_seq = get_text_tna(pronom_pat, 'ByteSequenceValue')
213213
offset = get_text_tna(pronom_pat, 'Offset')
214214
max_offset = get_text_tna(pronom_pat, 'MaxOffset')
215215
if not max_offset:
216216
pass
217217
# print "working on puid:", puid, ", position: ", pos, "with offset, maxoffset: ", offset, ",", max_offset
218-
regex = convert_to_regex(bytes, 'Little', pos, offset, max_offset)
218+
try:
219+
regex = convert_to_regex(byte_seq, 'Little', pos, offset, max_offset)
220+
except ValueError as ve:
221+
print('ValueError converting PUID {} signature to regex: {}'.format(puid, ve), file=sys.stderr)
222+
regex = FLG_INCOMPATIBLE
223+
219224
# print "done puid", puid
220-
if regex == "__INCOMPATIBLE_SIG__":
225+
if regex == FLG_INCOMPATIBLE:
221226
print("Error: incompatible PRONOM signature found for puid {} skipping...".format(puid), file=sys.stderr)
222227
# remove the empty 'signature' nodes
223228
# now that the signature is not compatible and thus "regex" is empty
@@ -226,7 +231,7 @@ def parse_pronom_xml(self, source, puid_filter=None):
226231
fido_format.remove(r)
227232
continue
228233
ET.SubElement(fido_pat, 'position').text = pos
229-
ET.SubElement(fido_pat, 'pronom_pattern').text = bytes
234+
ET.SubElement(fido_pat, 'pronom_pattern').text = byte_seq
230235
ET.SubElement(fido_pat, 'regex').text = regex
231236
# Get the format details
232237
fido_details = ET.SubElement(fido_format, 'details')
@@ -372,7 +377,7 @@ def _convert_err_msg(msg, c, i, chars, buf):
372377
return "Conversion: {0}: char='{1}', at pos {2} in \n {3}\n {4}^\nBuffer = {5}".format(msg, c, i, chars, i * ' ', buf.getvalue())
373378

374379

375-
def doByte(chars, i, littleendian, esc=True):
380+
def do_byte(chars, i, littleendian, esc=True):
376381
"""
377382
Convert two chars[i] and chars[i+1] into a byte.
378383
@@ -473,7 +478,7 @@ def do_any_all_bitmasks(chars, i, predicate, littleendian):
473478
See https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#all-bitmasks
474479
and https://github.com/nishihatapalmer/byteseek/wiki/Regular-Expression-Syntax#any-bitmasks
475480
"""
476-
byt, inc = doByte(chars, i + 1, littleendian, esc=False)
481+
byt, inc = do_byte(chars, i + 1, littleendian, esc=False)
477482
bitmask = ord(byt)
478483
regex = '({})'.format(
479484
'|'.join(['\\x' + hex(byte)[2:].zfill(2) for byte in range(0x100)
@@ -534,9 +539,9 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
534539
elif chars[i] in '*+?':
535540
state = 'specials'
536541
else:
537-
raise Exception(_convert_err_msg('Illegal character in start', chars[i], i, chars, buf))
542+
raise ValueError(_convert_err_msg('Illegal character in start', chars[i], i, chars, buf))
538543
elif state == 'bytes':
539-
(byt, inc) = doByte(chars, i, littleendian)
544+
(byt, inc) = do_byte(chars, i, littleendian)
540545
buf.write(byt)
541546
i += inc
542547
state = 'start'
@@ -555,7 +560,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
555560
i += 2
556561
while True:
557562
if chars[i].isalnum():
558-
(byt, inc) = doByte(chars, i, littleendian)
563+
(byt, inc) = do_byte(chars, i, littleendian)
559564
buf.write(byt)
560565
i += inc
561566
elif chars[i] == '&':
@@ -578,15 +583,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
578583
try:
579584
buf.write('[')
580585
i += 1
581-
(byt, inc) = doByte(chars, i, littleendian)
586+
(byt, inc) = do_byte(chars, i, littleendian)
582587
buf.write(byt)
583588
i += inc
584589
# assert(chars[i] == ':')
585590
if chars[i] != ':':
586591
return "__INCOMPATIBLE_SIG__"
587592
buf.write('-')
588593
i += 1
589-
(byt, inc) = doByte(chars, i, littleendian)
594+
(byt, inc) = do_byte(chars, i, littleendian)
590595
buf.write(byt)
591596
i += inc
592597
# assert(chars[i] == ']')
@@ -606,7 +611,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
606611
i += 1
607612
while True:
608613
if chars[i].isalnum():
609-
(byt, inc) = doByte(chars, i, littleendian)
614+
(byt, inc) = do_byte(chars, i, littleendian)
610615
buf.write(byt)
611616
i += inc
612617
elif chars[i] == '|':
@@ -618,15 +623,15 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
618623
elif chars[i] == '[':
619624
buf.write('[')
620625
i += 1
621-
(byt, inc) = doByte(chars, i, littleendian)
626+
(byt, inc) = do_byte(chars, i, littleendian)
622627
buf.write(byt)
623628
i += inc
624629
# assert(chars[i] == ':')
625630
if chars[i] != ':':
626631
return "__INCOMPATIBLE_SIG__"
627632
buf.write('-')
628633
i += 1
629-
(byt, inc) = doByte(chars, i, littleendian)
634+
(byt, inc) = do_byte(chars, i, littleendian)
630635
buf.write(byt)
631636
i += inc
632637

fido/pronom/soap.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"""
2222
import sys
2323
import tempfile
24+
from urllib.error import HTTPError, URLError
2425
import xml.etree.ElementTree as ET
2526
from six.moves import urllib
2627

@@ -58,30 +59,24 @@ def get_pronom_sig_version():
5859
ver_ele = tree.find('.//pronom:Version/pronom:Version', NS)
5960
return int(ver_ele.text)
6061

61-
62-
def get_pronom_signature():
62+
def get_droid_signatures(version):
6363
"""
64-
Get PRONOM signature.
64+
Get a DROID signature file by version.
6565
66-
Return a tuple comprising the latest signature XML file as string and a count
67-
of the FileFormat elements contained as an integer.
66+
Return a tuple comprising the requested signature XML file as string
67+
and a count of the FileFormat elements contained as an integer.
6868
Upon error, write to `stderr` and return the tuple [], False.
6969
"""
70-
tree = _get_soap_ele_tree('getSignatureFileV1')
71-
for prefix, uri in NS.items():
72-
ET.register_namespace(prefix, uri)
73-
sigfile_ele = ET.ElementTree(tree.find('.//pronom:SignatureFile', NS))
74-
format_ele_len = len(sigfile_ele.findall('.//sig:FileFormat', NS))
75-
if format_ele_len < 1:
76-
sys.stderr.write("get_pronom_signature(): could not parse XML from SOAP response: file")
77-
return [], False
78-
# proc_inst = ET.ProcessingInstruction('xml', 'version="1.0" encoding="UTF-8"')
79-
with tempfile.TemporaryFile() as fp:
80-
sigfile_ele.write(fp, encoding='utf-8', xml_declaration=True)
81-
fp.seek(0)
82-
xml = fp.read()
83-
return xml, format_ele_len
84-
70+
xml = []
71+
format_count = False
72+
try:
73+
with urllib.request.urlopen('https://www.nationalarchives.gov.uk/documents/DROID_SignatureFile_V{}.xml'.format(version)) as f:
74+
xml = f.read().decode('utf-8')
75+
root_ele = ET.fromstring(xml)
76+
format_count = len(root_ele.findall('FileFormat'))
77+
except HTTPError as httpe:
78+
sys.stderr.write("get_droid_signatures(): could not download signature file v{} due to exception: {}\n".format(version, httpe))
79+
return xml, format_count
8580

8681
def _get_soap_ele_tree(soap_action):
8782
soap_string = '{}<soap:Envelope xmlns:xsi="{}" xmlns:xsd="{}" xmlns:soap="{}"><soap:Body><{} xmlns="{}" /></soap:Body></soap:Envelope>'.format(XML_PROC, NS.get('xsi'), NS.get('xsd'), NS.get('soap'), soap_action, PRONOM_NS).encode(ENCODING)
@@ -93,7 +88,12 @@ def _get_soap_ele_tree(soap_action):
9388

9489

9590
def _get_soap_response(soap_action, soap_string):
96-
req = urllib.request.Request('http://{}/pronom/service.asmx'.format(PRONOM_HOST), data=soap_string)
91+
try:
92+
req = urllib.request.Request('http://{}/pronom/service.asmx'.format(PRONOM_HOST), data=soap_string)
93+
except URLError:
94+
print('There was a problem contacting the PRONOM service at http://{}/pronom/service.asmx.'.format(PRONOM_HOST))
95+
print('Please check your network connection and try again.')
96+
sys.exit(1)
9797
for key, value in HEADERS.items():
9898
req.add_header(key, value)
9999
req.add_header('Content-length', '%d' % len(soap_string))

0 commit comments

Comments
 (0)