-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathContentExtractor.java
More file actions
90 lines (80 loc) · 3.01 KB
/
ContentExtractor.java
File metadata and controls
90 lines (80 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
package utils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Prints a table of ranked documents together with their URLs.
 *
 * <p>Two files are read through the Hadoop {@link FileSystem} API: the ranking
 * file at {@code Paths.CE_IDS} (one space-separated {@code id title rank} record
 * per line) and the URL file at {@code Paths.CE_URLS} (one JSON object per line
 * carrying an {@code id} and a {@code url} field).
 */
public class ContentExtractor {
    /** Descriptive name of this job; not referenced anywhere inside this class. */
    private static final String JOB_NAME = "content extractor";

    /** Shared JSON parser, created once instead of on every call to {@code readUrls}. */
    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

    /**
     * Reads at most {@code N} non-blank lines from the ranking file at
     * {@code Paths.CE_IDS}.
     *
     * <p>Blank lines are skipped and do not count towards {@code N}; they carry no
     * record and would otherwise fail when split into fields by {@link #run}.
     *
     * @param N    maximum number of lines to return; zero or a negative value
     *             yields an empty list
     * @param conf Hadoop configuration used to obtain the file system
     * @return the lines read, in file order
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList<String> readNRelevantIds(int N, Configuration conf) throws IOException {
        ArrayList<String> result = new ArrayList<>();
        FileSystem fs = FileSystem.get(conf);
        // Decode explicitly as UTF-8 so the result does not depend on the platform
        // default charset. Closing the reader also closes the wrapped stream.
        try (FSDataInputStream fileWithIDRank = fs.open(new Path(Paths.CE_IDS));
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(fileWithIDRank, StandardCharsets.UTF_8))) {
            String line;
            while (result.size() < N && (line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                result.add(line);
            }
        }
        return result;
    }

    /**
     * Reads the URL file at {@code Paths.CE_URLS} and maps each document id to its URL.
     *
     * <p>Every non-blank line is parsed as a JSON object. Blank lines are skipped;
     * objects lacking an {@code id} or {@code url} field are skipped with a message
     * on standard error. If an id occurs more than once, the last occurrence wins.
     *
     * @param conf Hadoop configuration used to obtain the file system
     * @return mapping of ids to URLs
     * @throws IOException if the file cannot be opened or read, or a line is not
     *                     valid JSON
     */
    private static HashMap<String, String> readUrls(Configuration conf) throws IOException {
        HashMap<String, String> idUrlMap = new HashMap<>();
        FileSystem fs = FileSystem.get(conf);
        try (FSDataInputStream fileWithUrls = fs.open(new Path(Paths.CE_URLS));
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(fileWithUrls, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                JsonNode node = JSON_MAPPER.readTree(line);
                JsonNode idNode = (node == null) ? null : node.get("id");
                JsonNode urlNode = (node == null) ? null : node.get("url");
                if (idNode == null || urlNode == null) {
                    // get() returns null for an absent field; skip instead of
                    // failing with a NullPointerException on asText().
                    System.err.println("Skipping URL record without id/url: " + line);
                    continue;
                }
                idUrlMap.put(idNode.asText(), urlNode.asText());
            }
        }
        return idUrlMap;
    }

    /**
     * Prints the first N ranked documents as {@code ID | Title | Rank | URL} rows
     * on standard output, preceded by a header row.
     *
     * <p>Each ranking line is split on single spaces and only the first three
     * tokens are used (id, title, rank). Lines with fewer than three tokens are
     * skipped with a message on standard error. Ids that have no entry in the URL
     * file are printed with {@code "URL not found"}.
     *
     * @param args command-line arguments; {@code args[1]} is parsed as N, the
     *             number of documents to show ({@code args[0]} is not read here)
     * @param conf Hadoop configuration used to obtain the file system
     * @return always 0
     * @throws IllegalArgumentException if {@code args} has fewer than two elements
     *                                  or {@code args[1]} is not an integer
     * @throws Exception                if reading either input file fails
     */
    public static int run(String[] args, Configuration conf) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Expected the number of documents to show as args[1]");
        }
        int N = Integer.parseInt(args[1]);
        ArrayList<String> relevantIds = readNRelevantIds(N, conf);
        HashMap<String, String> idUrlMap = readUrls(conf);
        System.out.println("ID | Title | Rank | URL");
        for (String idRank : relevantIds) {
            String[] parts = idRank.split(" ");
            if (parts.length < 3) {
                System.err.println("Skipping malformed ranking line: " + idRank);
                continue;
            }
            String id = parts[0];
            String title = parts[1];
            String rank = parts[2];
            String url = idUrlMap.getOrDefault(id, "URL not found");
            System.out.println(id + " | " + title + " | " + rank + " | " + url);
        }
        return 0;
    }
}