-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathauthoritymix
More file actions
executable file
·68 lines (58 loc) · 2.85 KB
/
authoritymix
File metadata and controls
executable file
·68 lines (58 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python
# -*- encoding: utf-8 -*-
import itertools, logging
from metadata import JSONMetadataWrapper, CF
from copy import deepcopy
import db, dbutil, nameutil, util
# Configure the root logger at INFO level and create the logger used by this
# script for its progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('authoritymix')
def distAuthorityTuples(alreadyPresent, revisions, k, name):
    """Yield ``(distance, authority)`` pairs for authors found in ``revisions``.

    ``name`` is normalized with ``util.NormLevel.ONLY_LETTERS`` and compared,
    via ``nameutil.levenshtein``, against field ``k`` of every
    ``dc.contributor.author`` entry of every revision.

    Args:
        alreadyPresent: container of authority IDs to skip; entries whose
            authority is in it produce no pair.
        revisions: iterable of objects exposing a ``meta`` attribute.
        k: key of the author entry holding the name to compare
            (the caller uses ``'value'`` and ``'_nomecompleto'``).
        name: reference name; when None the generator yields nothing.

    Yields:
        Tuples ``(distance, authority)``.  ``authority`` is whatever
        ``util.noneIfEmpty`` returns for the entry's ``'authority'`` field,
        so it may be None.
    """
    if name is None:
        return

    target = util.norm(name, util.NormLevel.ONLY_LETTERS)
    for revision in revisions:
        wrapper = JSONMetadataWrapper(revision.meta)
        for candidate in wrapper.get('dc.contributor.author', what=None):
            authority = util.noneIfEmpty(candidate.get('authority'))
            if authority in alreadyPresent:
                continue

            candidateName = util.noneIfEmpty(candidate.get(k))
            if not candidateName:
                # Nothing to compare against in this entry.
                continue

            candidateName = util.norm(candidateName, util.NormLevel.ONLY_LETTERS)
            yield (nameutil.levenshtein(target, candidateName), authority)
def _authoritySortKey(scored):
    """Return a sort key for a ``(distance, authority)`` pair.

    Pairs are ordered by distance first.  The extra boolean keeps pairs whose
    authority is None ahead of pairs that have an authority at the same
    distance (the order Python 2 produced when comparing the raw tuples),
    while guaranteeing that None is never compared against a string, which
    raises TypeError on Python 3.
    """
    distance, authority = scored
    return (distance, authority is not None, authority)


def main():
    """Fill in missing author authority IDs using duplicate revisions.

    For every ``(mainRev, otherRevs)`` group yielded by
    ``dbutil.yieldRevGroups()``, each ``dc.contributor.author`` entry of the
    main revision that has no authority ID receives the authority of the
    closest-named author found in the other revisions, along with a
    confidence value.  When the metadata changed, a new revision with source
    ``'authoritymix'`` is added to the session and the other revisions are
    re-pointed to it.  At the end the session is committed and the
    ``db.LastRevision`` materialized view is refreshed.
    """
    for mainRev, otherRevs in dbutil.yieldRevGroups():
        # Wrap a deep copy; presumably so that edits made through the wrapper
        # leave mainRev.meta untouched for the comparison below.
        meta = JSONMetadataWrapper(deepcopy(mainRev.meta))
        alreadyPresent = set(meta.get('dc.contributor.author', what='authority')) - {None, ''}
        authors = meta.get('dc.contributor.author', what=None)

        for author in authors:
            if util.noneIfEmpty(author.get('authority')) is not None:
                continue

            # No authority ID is set for this author: look for the ID of the
            # closest author name present in the duplicates, matching on both
            # the 'value' and the '_nomecompleto' fields.
            scoredTuples = sorted(
                itertools.chain(
                    distAuthorityTuples(alreadyPresent, otherRevs, 'value',
                                        util.noneIfEmpty(author.get('value'))),
                    distAuthorityTuples(alreadyPresent, otherRevs, '_nomecompleto',
                                        util.noneIfEmpty(author.get('_nomecompleto')))
                ),
                key=_authoritySortKey
            )

            # NOTE(review): when the closest candidate has no authority,
            # nothing is assigned even if a farther candidate has one; this
            # mirrors the previous behavior.
            if not scoredTuples or scoredTuples[0][1] is None:
                continue

            # Distance tie between the two best candidates.
            tied = len(scoredTuples) > 1 and scoredTuples[0][0] == scoredTuples[1][0]
            author['authority'] = scoredTuples[0][1]
            author['confidence'] = CF.AMBIGUOUS if tied else CF.UNCERTAIN

        # Create a new, updated revision if the metadata has changed.
        if meta.json != mainRev.meta:
            logger.info('Atualizado metadado do item %r: dc.contributor.author=%r',
                        mainRev.item_id, authors)
            newRev = db.Revision(item_id=mainRev.item_id, source='authoritymix', meta=meta.json)
            db.session.add(newRev)
            # Update the main revision of the revisions that have this one
            # marked as their duplicate target.
            for rev in otherRevs:
                assert rev in db.session
                rev.duplicate_of = newRev

    db.session.commit()
    db.session.refresh_materialized_view(db.LastRevision)
# Run the authority merge only when this file is executed as a script.
if __name__ == '__main__':
    main()