-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrankdocuments.py
More file actions
executable file
·53 lines (44 loc) · 1.77 KB
/
rankdocuments.py
File metadata and controls
executable file
·53 lines (44 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
import argparse
import sys
import codecs
from itertools import cycle
from collections import defaultdict as dd
import re
import os.path
from heapq import heappush, heappop
# Absolute path of the directory that contains this script file.
scriptdir = os.path.dirname(os.path.abspath(__file__))
def getoverlap(terms, words):
    '''Return the fraction of ``words`` that are also in ``terms``.

    Args:
        terms: set of terms; must provide ``intersection``.
        words: sized collection of words belonging to one document.

    Returns:
        ``len(terms.intersection(words)) / len(words)`` as a float, or 0.0
        when ``words`` is empty.
    '''
    if not words:
        # A document with no words overlaps with nothing; returning 0.0 here
        # avoids a ZeroDivisionError on the division below.
        return 0.0
    shared = terms.intersection(words)
    # float() keeps this a true division regardless of interpreter version.
    return float(len(shared)) / len(words)
def main():
    '''Rank documents by how much of their vocabulary appears in a term list.

    Reads one docid per line from ``--idfile`` and one text segment per line
    from ``--infile`` in parallel, pools the lowercased whitespace-separated
    tokens of every segment under its docid, scores each document with
    ``getoverlap`` and writes ``docid<TAB>score`` lines to ``--outfile``,
    highest score first (ties ordered by docid).

    Exits through ``parser.error`` when ``--idfile`` or ``--termfile`` is
    not supplied.
    '''
    parser = argparse.ArgumentParser(description="Rank documents by bag-of-words type overlap with triggers",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input data file used for making ranking decisions (original english)")
    parser.add_argument("--idfile", "-d", nargs='?', type=argparse.FileType('r'), help="id file (docid per line)")
    parser.add_argument("--termfile", "-t", nargs='?', type=argparse.FileType('r'), help="term file, presumed to be one term per line")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # These two options have no default, so they are None when omitted (or
    # given without a value). Report that as a usage error instead of failing
    # later with a TypeError while iterating over None.
    missing = [flag for flag, handle in (("--idfile", args.idfile),
                                         ("--termfile", args.termfile))
               if handle is None]
    if missing:
        parser.error("missing required argument(s): %s" % ", ".join(missing))

    infile = args.infile
    idfile = args.idfile
    termfile = args.termfile
    outfile = args.outfile

    # One term per line, compared case-insensitively.
    terms = {line.strip().lower() for line in termfile}

    # docid -> set of lowercased word types seen in any of its segments.
    # NOTE: zip stops at the shorter input, so surplus lines in either file
    # are silently ignored.
    docs = dd(set)
    for doc, seg in zip(idfile, infile):
        docs[doc.strip()].update(word.lower() for word in seg.strip().split())

    # Sorting on (-score, docid) puts the best-scoring documents first and
    # breaks ties by docid, the same order a min-heap of these tuples yields.
    ranked = sorted((-getoverlap(terms, words), doc) for doc, words in docs.items())
    for negscore, doc in ranked:
        outfile.write("%s\t%f\n" % (doc, -negscore))
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()