-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathQueryVectorizer.java
More file actions
72 lines (55 loc) · 2.37 KB
/
QueryVectorizer.java
File metadata and controls
72 lines (55 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
package utils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import java.io.*;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import java.util.*;
/**
 * Turns a free-text search query into a sparse word-weight vector serialised as JSON.
 */
public class QueryVectorizer {

    /** Characters that separate words in the query text. */
    private static final String QUERY_DELIMITERS = " \'\n.,!?:()[]{};\\/\"*";

    /**
     * Return json in format {word:tf/idf} for each word in query.
     *
     * <p>The query is the last element of {@code args}. It is lower-cased and split on
     * punctuation/whitespace, each word's number of occurrences is counted, and every word that
     * also appears in the vocabulary file at {@code Paths.IND_IN2} has its count divided by the
     * value stored for it there. Words missing from the vocabulary keep their raw count.
     *
     * @param args - args contain query string (taken from the last element)
     * @param configuration - conf to open hdfs
     * @return json formatted string, e.g. {@code {"word":"0.5"}}; {@code {}} when the query
     *         contains no words
     * @throws IllegalArgumentException if {@code args} is null or empty
     * @throws Exception if the vocabulary file cannot be opened or read, or a stored value for
     *         a query word is not a parsable number
     */
    public static String queryToVector(String[] args, Configuration configuration) throws Exception {
        if (args == null || args.length == 0) {
            throw new IllegalArgumentException("args must contain the query string as its last element");
        }

        Map<String, Double> queryVector = countQueryTerms(args[args.length - 1]);
        if (queryVector.isEmpty()) {
            // Nothing to weight: skip the HDFS read and return an empty JSON object.
            return "{}";
        }

        // The FileSystem handle itself is deliberately not closed here; only the stream
        // opened by this call is released.
        FileSystem fs = FileSystem.get(configuration);
        FSDataInputStream idfStream = fs.open(new Path(Paths.IND_IN2));
        try {
            // Decode explicitly as UTF-8 instead of relying on the platform default charset.
            BufferedReader reader = new BufferedReader(new InputStreamReader(idfStream, "UTF-8"));
            applyIdf(queryVector, reader);
        } finally {
            // Closing the underlying stream releases everything the readers wrap.
            idfStream.close();
        }

        return toJson(queryVector);
    }

    /** Splits the lower-cased query into words and counts how often each one occurs. */
    private static Map<String, Double> countQueryTerms(String query) {
        Map<String, Double> counts = new HashMap<String, Double>();
        // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
        StringTokenizer words = new StringTokenizer(query.toLowerCase(Locale.ROOT), QUERY_DELIMITERS);
        while (words.hasMoreTokens()) {
            String word = words.nextToken();
            Double seen = counts.get(word);
            counts.put(word, seen == null ? 1.0 : seen + 1.0);
        }
        return counts;
    }

    /** Divides the count of every query word found in the tab-separated vocabulary by its stored value. */
    private static void applyIdf(Map<String, Double> queryVector, BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            StringTokenizer fields = new StringTokenizer(line, "\t");
            if (fields.countTokens() < 2) {
                // Blank or truncated line: ignore it rather than abort the whole lookup.
                continue;
            }
            String word = fields.nextToken();
            String idf = fields.nextToken();
            Double frequency = queryVector.get(word);
            if (frequency != null) {
                // The first character of the stored value is dropped before parsing --
                // presumably a one-character prefix written by the indexing job; TODO confirm.
                queryVector.put(word, frequency / Double.parseDouble(idf.substring(1)));
            }
        }
    }

    /** Serialises the vector as a flat JSON object whose values are quoted numbers. */
    private static String toJson(Map<String, Double> queryVector) {
        StringBuilder json = new StringBuilder("{");
        boolean first = true;
        for (Map.Entry<String, Double> entry : queryVector.entrySet()) {
            if (!first) {
                json.append(',');
            }
            first = false;
            json.append('"').append(entry.getKey()).append("\":\"")
                .append(entry.getValue().toString()).append('"');
        }
        return json.append('}').toString();
    }
}