-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathaddWorkingsetdocs.py
More file actions
73 lines (59 loc) · 2.79 KB
/
addWorkingsetdocs.py
File metadata and controls
73 lines (59 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
import argparse
# import xml.etree.ElementTree as etree
from collections import defaultdict
import pandas as pd
from lxml import etree as eT
# Command-line interface: two positional file paths plus an optional
# cap on how many documents are attached to each query.
parser = argparse.ArgumentParser(
    epilog='Adds a list of documents to each query as WorkingSet',
    usage='Receives a query xml file and trec results file',
    description='Adding WorkingSetDocs to queries xml file',
)

# Positional 1: results file whose documents become the working set.
parser.add_argument(
    'list',
    help='The results file for the WorkingSet',
    metavar='results_file',
)

# Positional 2: the queries xml file that gets annotated.
parser.add_argument(
    'queries',
    help='The queries xml file',
    metavar='queries_xml_file',
)

# Optional: number of documents taken per query (integer, defaults to 5).
parser.add_argument(
    '-d', '--docs',
    type=int,
    default=5,
    metavar='fbDocs',
    help='Number of Feedback documents to add',
)
class QueriesParser:
    """Attach working-set documents from a results table to a queries xml tree.

    The queries file is parsed with lxml on construction. Every ``<query>``
    element is expected to carry a ``<number>`` and a ``<text>`` child; the
    number is used as the key to look documents up in the results table.
    """

    def __init__(self, query_file, results_df):
        """
        Parses the queries file and stores the results table
        :parameter: query_file: the queries xml file handed to lxml's parse()
        :parameter: results_df: DataFrame indexed by query number that has a 'docID' column
        """
        self.file = query_file
        self.res = results_df
        # Blank text is dropped at parse time; the tree is pretty-printed on output
        _parser = eT.XMLParser(remove_blank_text=True)
        self.tree = eT.parse(self.file, _parser)
        self.root = self.tree.getroot()
        # query number: "Full command"
        self.full_queries = defaultdict(str)
        # query number: list of document ids chosen as working set
        self.fb_docs = defaultdict(list)
        self.__parse_queries()

    def __parse_queries(self):
        """Fills full_queries with the text of every <query>, keyed by its number"""
        for query in self.root.iter('query'):
            self.full_queries[query.find('number').text] = query.find('text').text

    def add_working_set_docs(self, number_of_docs):
        """
        Adds the workingSetDocs from results file to the original queries
        Documents are taken in the order they appear in the results table.
        A query that has no rows in the results table gets an empty list.
        :parameter: number_of_docs: number of docs to add to each query
        """
        doc_ids = self.res['docID']
        for qid in self.full_queries:
            if qid not in doc_ids.index:
                # No results for this query: nothing to add
                self.fb_docs[qid] = []
                continue
            # The list indexer always yields a Series, even when the query has
            # a single result row (a scalar label would yield a bare string)
            docs = doc_ids.loc[[qid]].head(number_of_docs)
            self.fb_docs[qid] = list(docs)

    def print_output(self):
        """
        Appends one <workingSetDocno> child per selected document to every
        <query> element and prints the resulting xml to stdout
        """
        for query in self.root.iter('query'):
            qid = query.find('number').text
            for doc in self.fb_docs[qid]:
                temp = eT.SubElement(query, 'workingSetDocno')
                temp.text = doc
        print(eT.tostring(self.tree, pretty_print=True, encoding='unicode'))
def main(args):
    """
    Reads the results file, attaches the top documents to every query and
    prints the augmented queries xml to stdout
    :parameter: args: parsed command line namespace with the attributes
                'list' (results file), 'queries' (queries xml file) and
                'docs' (number of documents to add to each query)
    """
    results_file = args.list
    query_file = args.queries
    number_of_docs = args.docs
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in pandas 2.2 and removed in pandas 3.0
    results_df = pd.read_table(results_file, sep=r'\s+', header=None, index_col=0,
                               names=['qid', 'Q0', 'docID', 'docRank', 'docScore', 'ind'],
                               dtype={'qid': str, 'Q0': str, 'docID': str, 'docRank': int,
                                      'docScore': float, 'ind': str})
    # The query numbers read from the xml are strings, so the index must hold strings as well
    results_df.index = results_df.index.map(str)
    qdb = QueriesParser(query_file, results_df)
    qdb.add_working_set_docs(number_of_docs)
    qdb.print_output()
# Script entry point: parse the command line and hand the namespace to main().
if __name__ == '__main__':
    main(parser.parse_args())