Skip to content

Commit f5941d7

Browse files
authored
Merge pull request #197 from openpreserve/fix/prep-niggles
FIX: Signature preparation niggles and bugs
2 parents 0e1ede0 + 3f03a2d commit f5941d7

File tree

7 files changed

+159
-97
lines changed

7 files changed

+159
-97
lines changed

fido/conf/fido-formats.xsd

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ Usage of DC has been based on these references:
55
http://dublincore.org/documents/usageguide/qualifiers.shtml
66
http://www.dublincore.org/documents/dc-xml-guidelines/
77
-->
8-
<xs:schema elementFormDefault="qualified"
9-
xmlns:xs="http://www.w3.org/2001/XMLSchema"
8+
<xs:schema elementFormDefault="qualified"
9+
xmlns:xs="http://www.w3.org/2001/XMLSchema"
1010
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
1111
xmlns:dc="http://purl.org/dc/elements/1.1/"
1212
xmlns:dcterms="http://purl.org/dc/terms/">
@@ -34,7 +34,7 @@ Usage of DC has been based on these references:
3434
<xs:element maxOccurs="unbounded" minOccurs="0" ref="extension"/>
3535
<xs:element maxOccurs="1" minOccurs="0" name="apple_uti" type="xs:string"/>
3636
<xs:element maxOccurs="unbounded" minOccurs="0" ref="has_priority_over"/>
37-
<xs:element maxOccurs="unbounded" ref="signature"/>
37+
<xs:element maxOccurs="unbounded" minOccurs="0" ref="signature"/>
3838
<xs:element minOccurs="0" ref="note"/>
3939
<xs:element maxOccurs="1" ref="details"/>
4040
</xs:sequence>

fido/fido.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
from fido import __version__, CONFIG_DIR
3333
from fido.package import OlePackage, ZipPackage
34-
from fido.pronomutils import get_local_pronom_versions
34+
from fido.versions import get_local_versions
3535
from fido.char_handler import escape
3636

3737

@@ -796,7 +796,7 @@ def main(args=None):
796796

797797
timer = PerfTimer()
798798

799-
versions = get_local_pronom_versions(args.confdir)
799+
versions = get_local_versions(args.confdir)
800800

801801
defaults['xml_pronomSignature'] = versions.pronom_signature
802802
defaults['containersignature_file'] = versions.pronom_container_signature

fido/prepare.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
from six.moves import cStringIO
1616
from six.moves.urllib.request import urlopen
1717
from six.moves.urllib.parse import urlparse
18+
from six.moves.urllib.error import HTTPError
1819

19-
from .pronomutils import get_local_pronom_versions
20+
from .versions import get_local_versions
2021
from .char_handler import escape
2122

2223

@@ -62,8 +63,9 @@ def prettify(elem):
6263
class FormatInfo:
6364
"""Convert PRONOM formats into FIDO signatures."""
6465

65-
def __init__(self, pronom_files, format_list=[]):
66+
def __init__(self, pronom_files, format_list=None):
6667
"""Instantiate class, take a list of PRONOM files and an optional list of formats."""
68+
format_list = format_list if format_list else []
6769
self.info = {}
6870
self.formats = []
6971
self.pronom_files = pronom_files
@@ -192,7 +194,7 @@ def parse_pronom_xml(self, source, puid_filter=None):
192194
for id in pronom_format.findall(TNA('FileFormatIdentifier')):
193195
type = get_text_tna(id, 'IdentifierType')
194196
if type == 'Apple Uniform Type Identifier':
195-
ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
197+
ET.SubElement(fido_format, 'apple_uti').text = get_text_tna(id, 'Identifier')
196198
# Handle the relationships
197199
for x in pronom_format.findall(TNA('RelatedFormat')):
198200
rel = get_text_tna(x, 'RelationshipType')
@@ -275,9 +277,15 @@ def parse_pronom_xml(self, source, puid_filter=None):
275277
ET.SubElement(rf, 'dc:identifier').text = url
276278
# And calculate the checksum of this resource:
277279
m = hashlib.md5()
278-
sock = urlopen(url)
279-
m.update(sock.read())
280-
sock.close()
280+
try:
281+
sock = urlopen(url)
282+
m.update(sock.read())
283+
sock.close()
284+
except HTTPError as http_excep:
285+
sys.stderr.write('HTTP {} error loading resource {}\n'.format(http_excep.code, url))
286+
if http_excep.code == 404:
287+
continue
288+
281289
checksum = m.hexdigest()
282290
else:
283291
ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
@@ -686,7 +694,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):
686694

687695
def run(input=None, output=None, puid=None):
688696
"""Convert PRONOM formats into FIDO signatures."""
689-
versions = get_local_pronom_versions()
697+
versions = get_local_versions()
690698

691699
if input is None:
692700
input = versions.get_zip_file()

fido/pronom/http.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
"""
4+
FIDO: Format Identifier for Digital Objects.
5+
6+
Copyright 2010 The Open Preservation Foundation
7+
8+
Licensed under the Apache License, Version 2.0 (the "License");
9+
you may not use this file except in compliance with the License.
10+
You may obtain a copy of the License at
11+
12+
http://www.apache.org/licenses/LICENSE-2.0
13+
14+
Unless required by applicable law or agreed to in writing, software
15+
distributed under the License is distributed on an "AS IS" BASIS,
16+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
See the License for the specific language governing permissions and
18+
limitations under the License.
19+
20+
PRONOM format signatures HTTP calls.
21+
"""
22+
from six.moves import urllib
23+
24+
25+
def get_sig_xml_for_puid(puid):
26+
"""Return the full PRONOM signature XML for the passed PUID."""
27+
req = urllib.request.Request("http://www.nationalarchives.gov.uk/pronom/{}.xml".format(puid))
28+
response = urllib.request.urlopen(req)
29+
xml = response.read()
30+
return xml

fido/toxml.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import sys
2626

2727
from . import __version__
28-
from .pronomutils import get_local_pronom_versions
28+
from .versions import get_local_versions
2929

3030

3131
def main():
@@ -35,7 +35,7 @@ def main():
3535
<versions>
3636
<fido_version>{0}</fido_version>
3737
<signature_version>{1}</signature_version>
38-
</versions>""".format(__version__, get_local_pronom_versions().pronom_version))
38+
</versions>""".format(__version__, get_local_versions().pronom_version))
3939

4040
reader = csv.reader(sys.stdin)
4141

fido/update_signatures.py

Lines changed: 101 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,11 @@
2323
from xml.etree import ElementTree as CET
2424
import zipfile
2525

26-
from six.moves.urllib.request import urlopen
27-
from six.moves.urllib.error import URLError
28-
2926
from . import __version__, CONFIG_DIR, query_yes_no
3027
from .prepare import run as prepare_pronom_to_fido
31-
from .pronomutils import get_local_pronom_versions
32-
from .pronom.soap import get_pronom_sig_version, get_pronom_signature
28+
from .versions import get_local_versions
29+
from .pronom.soap import get_pronom_sig_version, get_pronom_signature, NS
30+
from .pronom.http import get_sig_xml_for_puid
3331

3432

3533
DEFAULTS = {
@@ -56,57 +54,18 @@ def run(defaults=None):
5654
defaults = defaults or DEFAULTS
5755
try:
5856
print("Contacting PRONOM...")
59-
currentVersion = get_pronom_sig_version()
60-
if not currentVersion:
61-
sys.exit('Failed to obtain PRONOM signature file version number, please try again.')
62-
63-
print("Querying latest signaturefile version...")
64-
signatureFile = os.path.join(CONFIG_DIR, defaults['signatureFileName'].format(currentVersion))
65-
if os.path.isfile(signatureFile):
66-
print("You already have the latest PRONOM signature file, version", currentVersion)
67-
if not query_yes_no("Update anyway?"):
68-
sys.exit('Aborting update...')
69-
70-
print("Downloading signature file version {}...".format(currentVersion))
71-
currentFile, _ = get_pronom_signature()
72-
if not currentFile:
73-
sys.exit('Failed to obtain PRONOM signature file, please try again.')
74-
print("Writing {0}...".format(defaults['signatureFileName'].format(currentVersion)))
75-
with open(signatureFile, 'w') as file_:
76-
file_.write(currentFile)
77-
57+
currentVersion, signatureFile = sig_version_check(defaults)
58+
download_sig_file(defaults, currentVersion, signatureFile)
7859
print("Extracting PRONOM PUID's from signature file...")
7960
tree = CET.parse(signatureFile)
80-
puids = []
81-
for node in tree.iter("{http://www.nationalarchives.gov.uk/pronom/SignatureFile}FileFormat"):
82-
puids.append(node.get("PUID"))
83-
print("Found {} PRONOM PUID's".format(len(puids)))
84-
85-
print("Downloading signatures can take a while")
86-
if not query_yes_no("Continue and download signatures?"):
87-
sys.exit('Aborting update...')
88-
tmpdir = defaults['tmp_dir']
89-
resume_download = False
90-
if os.path.isdir(tmpdir):
91-
print("Found previously created temporary folder for download:", tmpdir)
92-
resume_download = query_yes_no('Do you want to resume download (yes) or start over (no)?')
93-
if resume_download:
94-
print("Resuming download...")
95-
else:
96-
print("Creating temporary folder for download:", tmpdir)
97-
try:
98-
os.mkdir(tmpdir)
99-
except OSError:
100-
pass
101-
if not os.path.isdir(tmpdir):
102-
sys.stderr.write("Failed to create temporary folder for PUID's, using: " + tmpdir)
103-
104-
download_signatures(defaults, puids, resume_download, tmpdir)
105-
create_zip_file(defaults, puids, currentVersion, tmpdir)
61+
format_eles = tree.findall('.//sig:FileFormat', NS)
62+
print("Found {} PRONOM FileFormat elements".format(len(format_eles)))
63+
tmpdir, resume = init_sig_download(defaults)
64+
download_signatures(defaults, format_eles, resume, tmpdir)
65+
create_zip_file(defaults, format_eles, currentVersion, tmpdir)
10666
if defaults['deleteTempDirectory']:
10767
print("Deleting temporary folder and files...")
10868
rmtree(tmpdir, ignore_errors=True)
109-
11069
update_versions_xml(defaults, currentVersion)
11170

11271
# TODO: there should be a check here to handle prepare.main exit() signal (-1/0/1/...)
@@ -118,47 +77,106 @@ def run(defaults=None):
11877
sys.exit('Aborting update...')
11978

12079

121-
def download_signatures(defaults, puids, resume_download, tmpdir):
80+
def sig_version_check(defaults):
81+
"""Return a tuple consisting of current sig file version and the derived file name."""
82+
print("Contacting PRONOM...")
83+
currentVersion = get_pronom_sig_version()
84+
if not currentVersion:
85+
sys.exit('Failed to obtain PRONOM signature file version number, please try again.')
86+
87+
print("Querying latest signaturefile version...")
88+
signatureFile = os.path.join(CONFIG_DIR, defaults['signatureFileName'].format(currentVersion))
89+
if os.path.isfile(signatureFile):
90+
print("You already have the latest PRONOM signature file, version", currentVersion)
91+
if not query_yes_no("Update anyway?"):
92+
sys.exit('Aborting update...')
93+
return currentVersion, signatureFile
94+
95+
96+
def download_sig_file(defaults, version, signatureFile):
97+
"""Download the latest version of the PRONOM sigs to signatureFile."""
98+
print("Downloading signature file version {}...".format(version))
99+
currentFile, _ = get_pronom_signature()
100+
if not currentFile:
101+
sys.exit('Failed to obtain PRONOM signature file, please try again.')
102+
print("Writing {0}...".format(defaults['signatureFileName'].format(version)))
103+
with open(signatureFile, 'w') as file_:
104+
file_.write(currentFile)
105+
106+
107+
def init_sig_download(defaults):
108+
"""
109+
Initialise the download of individual PRONOM signatures.
110+
111+
Handles user input and resumption of interrupted downloads.
112+
Return a tuple of the temp directory for writing and a boolean resume flag.
113+
"""
114+
print("Downloading signatures can take a while")
115+
if not query_yes_no("Continue and download signatures?"):
116+
sys.exit('Aborting update...')
117+
tmpdir = defaults['tmp_dir']
118+
resume = False
119+
if os.path.isdir(tmpdir):
120+
print("Found previously created temporary folder for download:", tmpdir)
121+
resume = query_yes_no('Do you want to resume download (yes) or start over (no)?')
122+
if resume:
123+
print("Resuming download...")
124+
else:
125+
print("Creating temporary folder for download:", tmpdir)
126+
try:
127+
os.mkdir(tmpdir)
128+
except OSError:
129+
pass
130+
if not os.path.isdir(tmpdir):
131+
sys.stderr.write("Failed to create temporary folder for PUID's, using: " + tmpdir)
132+
return tmpdir, resume
133+
134+
135+
def download_signatures(defaults, format_eles, resume, tmpdir):
122136
"""Download PRONOM signatures and write to individual files."""
123137
print("Downloading signatures, one moment please...")
124-
numberPuids = len(puids)
138+
numberPuids = len(format_eles)
125139
one_percent = (float(numberPuids) / 100)
126140
numfiles = 0
127-
for puid in puids:
128-
puidType, puidNum = puid.split("/")
129-
puidFileName = "puid." + puidType + "." + puidNum + ".xml"
130-
filename = os.path.join(tmpdir, puidFileName)
131-
if os.path.isfile(filename) and resume_download:
132-
numfiles += 1
133-
continue
134-
puid_url = "http://www.nationalarchives.gov.uk/pronom/{}.xml".format(puid)
135-
try:
136-
filehandle = urlopen(puid_url)
137-
except URLError as e:
138-
sys.stderr.write("Failed to download signature file:" + puid_url)
139-
sys.stderr.write("Error:" + str(e))
140-
sys.exit('Please restart and resume download.')
141-
with open(filename, 'wb') as file_:
142-
for lines in filehandle.readlines():
143-
file_.write(lines)
144-
filehandle.close()
141+
for format_ele in format_eles:
142+
download_sig(format_ele, tmpdir, resume)
145143
numfiles += 1
146144
percent = int(float(numfiles) / one_percent)
147145
print(r"{}/{} files [{}%]".format(numfiles, numberPuids, percent))
148146
time.sleep(defaults['http_throttle'])
149147
print("100%")
150148

151149

152-
def create_zip_file(defaults, puids, currentVersion, tmpdir):
150+
def download_sig(format_ele, tmpdir, resume):
151+
"""
152+
Download an individual PRONOM signature.
153+
154+
The signature to be downloaded is identified by the FileFormat element
155+
parameter format_ele. The downloaded signature is written to tmpdir.
156+
"""
157+
puid, puidFileName = get_puid_file_name(format_ele)
158+
filename = os.path.join(tmpdir, puidFileName)
159+
if os.path.isfile(filename) and resume:
160+
return
161+
try:
162+
xml = get_sig_xml_for_puid(puid)
163+
except Exception as e:
164+
sys.stderr.write("Failed to download signature file:" + puid)
165+
sys.stderr.write("Error:" + str(e))
166+
sys.exit('Please restart and resume download.')
167+
with open(filename, 'wb') as file_:
168+
file_.write(xml)
169+
170+
171+
def create_zip_file(defaults, format_eles, currentVersion, tmpdir):
153172
"""Create zip file of signatures."""
154173
print("Creating PRONOM zip...")
155174
compression = zipfile.ZIP_DEFLATED if 'zlib' in sys.modules else zipfile.ZIP_STORED
156175
modes = {zipfile.ZIP_DEFLATED: 'deflated', zipfile.ZIP_STORED: 'stored'}
157176
zf = zipfile.ZipFile(os.path.join(CONFIG_DIR, defaults['pronomZipFileName'].format(currentVersion)), mode='w')
158177
print("Adding files with compression mode", modes[compression])
159-
for puid in puids:
160-
puidType, puidNum = puid.split("/")
161-
puidFileName = "puid.{}.{}.xml".format(puidType, puidNum)
178+
for format_ele in format_eles:
179+
_, puidFileName = get_puid_file_name(format_ele)
162180
filename = os.path.join(tmpdir, puidFileName)
163181
if os.path.isfile(filename):
164182
zf.write(filename, arcname=puidFileName, compress_type=compression)
@@ -167,10 +185,17 @@ def create_zip_file(defaults, puids, currentVersion, tmpdir):
167185
zf.close()
168186

169187

188+
def get_puid_file_name(format_ele):
189+
"""Return a tuple of PUID and PUID file name derived from format_ele."""
190+
puid = format_ele.get('PUID')
191+
puidType, puidNum = puid.split("/")
192+
return puid, 'puid.{}.{}.xml'.format(puidType, puidNum)
193+
194+
170195
def update_versions_xml(defaults, currentVersion):
171196
"""Create new versions identified sig XML file."""
172197
print('Updating versions.xml...')
173-
versions = get_local_pronom_versions()
198+
versions = get_local_versions()
174199
versions.pronom_version = str(currentVersion)
175200
versions.pronom_signature = "formats-v" + str(currentVersion) + ".xml"
176201
versions.pronom_container_signature = defaults['containerVersion']
@@ -188,7 +213,6 @@ def main():
188213
args = parser.parse_args()
189214
opts = DEFAULTS.copy()
190215
opts.update(vars(args))
191-
192216
run(opts)
193217

194218

0 commit comments

Comments
 (0)