-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmkindex.py
More file actions
executable file
·60 lines (48 loc) · 2.04 KB
/
mkindex.py
File metadata and controls
executable file
·60 lines (48 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python3
import gzip
import json
import re
import sys
import urllib.parse
import urllib.request
import xml.sax
from datetime import datetime, timezone
from xml.sax.handler import ContentHandler
import argparse
def _parse_bool(value):
    """Convert a command-line string such as ``false`` or ``1`` to a bool.

    The empty string and the usual "off" spellings (``0``, ``false``, ``no``,
    ``off``, ``n``, ``f``; case-insensitive) become ``False``. Every other
    value becomes ``True``, so any value the option accepted before this
    converter existed is still accepted.
    """
    return value.strip().lower() not in {'', '0', 'false', 'no', 'off', 'n', 'f'}


def parse_args(argv=None):
    """Parse the command-line options of the index merger.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point relies
            on.

    Returns:
        An ``argparse.Namespace`` with ``gzip`` (bool), ``language`` (str),
        ``verbose`` (bool), ``output`` (a writable text file, ``sys.stdout``
        by default) and ``files`` (a list with at least one path).
    """
    parser = argparse.ArgumentParser(
        description='Merge extracted logo files into a LogoSearch index.')
    # Without a converter the option stored the raw string, so "--gzip false"
    # produced the truthy value 'false' and the flag could never be disabled.
    parser.add_argument('--gzip', action='store', type=_parse_bool,
                        default=True, metavar='BOOL',
                        help='compress output with gzip')
    parser.add_argument('--language', action='store', default='en',
                        help='which Wikipedia language we are processing')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        default=sys.stdout, help='output file')
    parser.add_argument('files', nargs='+', help='list of files to process')
    return parser.parse_args(argv)
def _load_images(paths):
    """Read every JSON file in *paths* and return their entries as one list.

    Raises:
        ValueError: if a file's top-level JSON value is not an array. Extending
            a list with a dict or a string would silently add its keys or
            characters instead of image entries.
    """
    images = []
    for path in paths:
        sys.stderr.write(f'INFO: Processing {path}...\n')
        # The context manager closes the handle even when json.load() raises.
        with open(path, encoding='utf-8') as fin:
            entries = json.load(fin)
        if not isinstance(entries, list):
            raise ValueError(
                f'{path}: expected a JSON array of images, '
                f'got {type(entries).__name__}')
        images.extend(entries)
    return images


def _build_index(language, images):
    """Return the index document for *language* that wraps *images*."""
    return {
        'handle': f'wikipedia-{language}-infobox',
        'images': images,
        'lastmodified': datetime.now(timezone.utc).isoformat(),
        'logo': 'https://www.vectorlogo.zone/logos/wikipedia/wikipedia-icon.svg',
        'name': f'Wikipedia {language} Infobox Logos',
        'provider': 'remote',
        'provider_icon': 'https://logosear.ch/images/remote.svg',
        'url': f'https://{language}.wikipedia.org/wiki/Main_Page',
        'website': f'https://{language}.wikipedia.org/wiki/Main_Page',
    }


def main():
    """Merge the input JSON files into one index and write it as JSON.

    Reads the command line via ``parse_args()``, concatenates the arrays found
    in ``args.files``, dumps the resulting index document to ``args.output``
    with two-space indentation, and reports progress and elapsed time on
    stderr.
    """
    # Still published as a module-level name, exactly as before, so the
    # observable side effect of calling main() is unchanged.
    global startTime
    startTime = datetime.now(timezone.utc)
    args = parse_args()

    images = _load_images(args.files)

    sys.stderr.write(f'INFO: Writing to {args.output.name} (compression={args.gzip})...\n')
    # NOTE: compression is not implemented; args.gzip only appears in the log
    # line above. gzip.compress() takes a bytes object, not a file, so the
    # output stream cannot simply be wrapped with it.
    fout = args.output
    json.dump(_build_index(args.language, images), fout, indent=2)
    # Flush so that write errors surface before success is reported.
    fout.flush()

    elapsed = datetime.now(timezone.utc) - startTime
    sys.stderr.write(f'INFO: complete! elapsed time={elapsed}\n')
# Run the merge only when this file is executed as a script; importing it as a
# module must not trigger any work.
if __name__ == '__main__':
    main()