Skip to content

Commit 5aa1762

Browse files
committed
Add SruOpener (#510)
Every single output is a valid XML by itself.
1 parent b9eebb4 commit 5aa1762

File tree

3 files changed

+237
-0
lines changed

3 files changed

+237
-0
lines changed

metafacture-io/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ description = 'Modules for reading and writing data streams'
1919

2020
dependencies {
2121
api project(':metafacture-framework')
22+
api project(':metafacture-formatting')
23+
api project(':metafacture-xml')
2224
implementation project(':metafacture-commons')
2325
implementation "commons-io:commons-io:${versions.commons_io}"
2426
implementation "org.apache.commons:commons-compress:${versions.commons_compress}"
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
/* Copyright 2013 Pascal Christoph.
2+
* Licensed under the Eclipse Public License 1.0 */
3+
4+
package org.metafacture.io;
5+
6+
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.MetafactureException;
import org.metafacture.framework.ObjectReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
33+
34+
/**
35+
* Opens an SRU (Search Retrieval by URL) stream and passes a reader to the receiver. Pages through the SRU.
36+
*
37+
* @author Pascal Christoph (dr0i)
38+
*/
39+
@Description(
40+
"Opens a SRU stream and passes a reader to the receiver. The input is the base URL of the SRU service " +
41+
"to be retrieved from. Mandatory argument is: QUERY.\n" +
42+
"The output is an XML document holding the user defined \"maximumRecords\" as documents. If there are" +
43+
"more documents than defined by MAXIMUM_RECORDS and there are more documents wanted (defined by " +
44+
"\"totalRecords\") there will be consecutive XML documents output as it pages through the SRU.")
45+
@In(String.class)
46+
@Out(java.io.Reader.class)
47+
@FluxCommand("open-sru")
48+
public final class SruOpener extends DefaultObjectPipe<String, ObjectReceiver<Reader>> {
49+
50+
private static final String OPERATION = "searchRetrieve";
51+
private static final String RECORD_SCHEMA = "MARC21-xml";
52+
private static final String USER_AGENT = "metafacture-core";
53+
private static final String VERSION = "2.0";
54+
55+
private static final int CONNECTION_TIMEOUT = 11000;
56+
private static final int MAXIMUM_RECORDS = 10;
57+
private static final int START_RECORD = 1;
58+
private String operation = OPERATION;
59+
private String query;
60+
private String recordSchema = RECORD_SCHEMA;
61+
private String userAgent = USER_AGENT;
62+
private String version = VERSION;
63+
64+
private int maximumRecords = MAXIMUM_RECORDS;
65+
private int startRecord = START_RECORD;
66+
private int totalRecords = Integer.MAX_VALUE;
67+
int numberOfRecords = Integer.MAX_VALUE;
68+
69+
private boolean stopRetrieving;
70+
private int recordsRetrieved;
71+
72+
private String xmlDeclarationTemplate = "<?xml version=\"%s\" encoding=\"%s\"?>";
73+
private String xmlDeclaration;
74+
75+
/**
76+
* Default constructor
77+
*/
78+
public SruOpener() {
79+
}
80+
81+
/**
82+
* Sets the User Agent to use. <strong>Default value: {@value USER_AGENT}</strong>.
83+
*
84+
* @param userAgent a user agent to be used when opening a URL
85+
*/
86+
public void setUserAgent(final String userAgent) {
87+
this.userAgent = userAgent;
88+
}
89+
90+
/**
91+
* Sets the query of the search.
92+
* <strong>Setting a query is mandatory.</strong>
93+
*
94+
* @param query the query
95+
*/
96+
97+
public void setQuery(final String query) {
98+
this.query = query;
99+
}
100+
101+
/**
102+
* Sets total number of records to be retrieved. <strong>Default value: indefinite (as in "all")
103+
* </strong>.
104+
*
105+
* @param totalRecords total number of records to be retrieved
106+
*/
107+
public void setTotal(final String totalRecords) {
108+
this.totalRecords = Integer.parseInt(totalRecords);
109+
}
110+
111+
/**
112+
* Sets the maximum of records returned in one lookup. <strong>Default value: {@value MAXIMUM_RECORDS}</strong>.
113+
* The lookup is repeated as long as {@link #maximumRecords} is lesser than {@link #totalRecords}.
114+
*
115+
* @param maximumRecords maximum of records returned in one lookup
116+
*/
117+
public void setMaximumRecords(final String maximumRecords) {
118+
this.maximumRecords = Integer.parseInt(maximumRecords);
119+
}
120+
121+
/**
122+
* Sets where to start when retrieving records. <strong>Default value: {@value START_RECORD}</strong>.
123+
*
124+
* @param startRecord where to start when retrieving records
125+
*/
126+
public void setStartRecord(final String startRecord) {
127+
this.startRecord = Integer.parseInt(startRecord);
128+
}
129+
130+
/**
131+
* Sets the format of the retrieved record data. <strong>Default value: {@value RECORD_SCHEMA}</strong>.
132+
*
133+
* @param recordSchema the format of the data of the records
134+
*/
135+
public void setRecordSchema(final String recordSchema) {
136+
this.recordSchema = recordSchema;
137+
}
138+
139+
/**
140+
* Sets the kind of operation of the lookup. <strong>Default value: {@value OPERATION}</strong>.
141+
*
142+
* @param operation the kind of operation of the lookup
143+
*/
144+
public void setOperation(final String operation) {
145+
this.operation = operation;
146+
}
147+
148+
/**
149+
* Sets the version of the lookup. <strong>Default value: {@value VERSION}</strong>.
150+
*
151+
* @param version the version of the lookup
152+
*/
153+
public void setVersion(final String version) {
154+
this.version = version;
155+
}
156+
157+
@Override
158+
public void process(final String baseUrl) {
159+
160+
StringBuilder srUrl = new StringBuilder(baseUrl);
161+
if (query != null) {
162+
srUrl.append("?query=").append(query).append("&operation=").append(operation).append("&recordSchema=")
163+
.append(recordSchema).append("&version=").append(version);
164+
}
165+
else {
166+
throw new IllegalArgumentException("Missing mandatory parameter 'query'");
167+
}
168+
169+
while (!stopRetrieving && recordsRetrieved < totalRecords && (startRecord < numberOfRecords)) {
170+
InputStream inputStream = getXmlDocsViaSru(srUrl);
171+
getReceiver().process(new InputStreamReader(inputStream));
172+
}
173+
174+
}
175+
176+
private InputStream getXmlDocsViaSru(final StringBuilder srUrl) {
177+
try {
178+
InputStream inputStreamOfURl = retrieveUrl(srUrl, startRecord, maximumRecords);
179+
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
180+
DocumentBuilder docBuilder = factory.newDocumentBuilder();
181+
Document xmldoc = docBuilder.parse(inputStreamOfURl);
182+
183+
Transformer t = TransformerFactory.newInstance().newTransformer();
184+
StringWriter stringWriter = new StringWriter();
185+
t.transform(new DOMSource(xmldoc), new StreamResult(stringWriter));
186+
187+
numberOfRecords = getIntegerValueFromElement(xmldoc,"numberOfRecords", 0);
188+
int recordPosition = getIntegerValueFromElement(xmldoc,"recordPosition", 0);
189+
int nextRecordPosition = getIntegerValueFromElement(xmldoc,"nextRecordPosition", totalRecords);
190+
191+
recordsRetrieved = recordsRetrieved + nextRecordPosition - recordPosition;
192+
startRecord = nextRecordPosition; // grenzwert : wenn maximumRcords > als in echt
193+
194+
return new ByteArrayInputStream(stringWriter.toString().getBytes());
195+
196+
}
197+
catch (final IOException | TransformerException | SAXException | ParserConfigurationException e) {
198+
throw new MetafactureException(e);
199+
}
200+
}
201+
202+
private int getIntegerValueFromElement(final Document xmlDoc, final String tagName, final int fallback) {
203+
Node node = xmlDoc.getElementsByTagName(tagName).item(0);
204+
if (node != null) {
205+
return Integer.parseInt(node.getTextContent());
206+
}
207+
return fallback;
208+
}
209+
210+
private InputStream retrieveUrl(StringBuilder srUrl, int startRecord, int maximumRecords) throws IOException {
211+
final URL urlToOpen =
212+
new URL(srUrl.toString() + "&maximumRecords=" + maximumRecords + "&startRecord=" + startRecord);
213+
final HttpURLConnection connection = (HttpURLConnection) urlToOpen.openConnection();
214+
215+
connection.setConnectTimeout(CONNECTION_TIMEOUT);
216+
if (!userAgent.isEmpty()) {
217+
connection.setRequestProperty("User-Agent", userAgent);
218+
}
219+
InputStream inputStream = getInputStream(connection);
220+
221+
return inputStream;
222+
}
223+
224+
private InputStream getInputStream(final HttpURLConnection connection) {
225+
try {
226+
return connection.getInputStream();
227+
}
228+
catch (final IOException e) {
229+
stopRetrieving = true;
230+
return connection.getErrorStream();
231+
}
232+
}
233+
234+
}

metafacture-io/src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ write org.metafacture.io.ObjectWriter
2222
as-records org.metafacture.io.RecordReader
2323
open-resource org.metafacture.io.ResourceOpener
2424
open-tar org.metafacture.io.TarReader
25+
open-sru org.metafacture.io.SruOpener

0 commit comments

Comments
 (0)