Skip to content

Commit 48cf697

Browse files
authored
Merge pull request #4 from scholarsportal/language_support
Language support
2 parents e23af3f + ec6374d commit 48cf697

File tree

12 files changed

+153
-18
lines changed

12 files changed

+153
-18
lines changed

pom.xml

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>io.gdcc.export</groupId>
88
<artifactId>ddi-pdf</artifactId>
9-
<version>1.0.0-SNAPSHOT</version>
9+
<version>1.1.0-SNAPSHOT</version>
1010
<packaging>jar</packaging>
1111

1212
<name>DDI PDF Exporter</name>
@@ -44,6 +44,31 @@
4444
<artifactId>fop</artifactId>
4545
<version>2.9</version>
4646
</dependency>
47+
<!--
    maven-compiler-plugin is a build plugin, not a library. Declaring it under
    <dependencies> puts plugin artifacts on the project's classpath.
    Compiler configuration belongs in a <build> section, which this project
    leaves to the parent POM, so no entry is needed here.
-->
52+
53+
54+
<dependency>
55+
<groupId>org.apache.tika</groupId>
56+
<artifactId>tika-core</artifactId>
57+
<version>2.9.2</version>
58+
</dependency>
59+
<dependency>
60+
<groupId>org.apache.tika</groupId>
61+
<artifactId>tika-langdetect</artifactId>
62+
<version>2.9.2</version>
63+
<type>pom</type>
64+
</dependency>
65+
<dependency>
66+
<groupId>org.apache.tika</groupId>
67+
<artifactId>tika-langdetect-optimaize</artifactId>
68+
<version>2.9.2</version>
69+
</dependency>
70+
71+
4772
</dependencies>
4873

4974
<!-- There is no <build> section here because the Parent POM takes care of it -->

src/main/java/io/gdcc/export/ddipdf/DdiPdfExportUtil.java

Lines changed: 111 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,139 @@
44
import org.apache.fop.apps.Fop;
55
import org.apache.fop.apps.FopFactory;
66
import org.apache.fop.apps.MimeConstants;
7+
import org.apache.tika.language.detect.LanguageDetector;
8+
import org.apache.tika.language.detect.LanguageResult;
9+
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
10+
711
import org.slf4j.Logger;
812
import org.slf4j.LoggerFactory;
13+
import org.w3c.dom.DOMException;
14+
import org.w3c.dom.Document;
15+
16+
import org.xml.sax.SAXException;
917

18+
import javax.xml.parsers.DocumentBuilder;
19+
import javax.xml.parsers.DocumentBuilderFactory;
20+
import javax.xml.parsers.ParserConfigurationException;
1021
import javax.xml.stream.XMLStreamException;
1122
import javax.xml.transform.Result;
1223
import javax.xml.transform.Source;
1324
import javax.xml.transform.Transformer;
1425
import javax.xml.transform.TransformerFactory;
1526
import javax.xml.transform.sax.SAXResult;
1627
import javax.xml.transform.stream.StreamSource;
17-
import java.io.File;
28+
import javax.xml.xpath.XPathConstants;
29+
import javax.xml.xpath.XPathExpressionException;
30+
import javax.xml.xpath.XPath;
31+
import javax.xml.xpath.XPathExpression;
32+
import javax.xml.xpath.XPathFactory;
33+
34+
import java.io.IOException;
1835
import java.io.InputStream;
1936
import java.io.OutputStream;
37+
import java.io.ByteArrayOutputStream;
38+
import java.io.ByteArrayInputStream;
39+
import java.io.File;
40+
41+
import java.net.URL;
2042

2143

2244
public class DdiPdfExportUtil {
2345

2446
private static final Logger logger = LoggerFactory.getLogger(DdiPdfExportUtil.class);
47+
public static class TitleAndDescription {
48+
public String Title;
49+
public String Description;
50+
public String Language;
51+
}
2552

2653
/** Not instantiable: this class only exposes static helpers. */
private DdiPdfExportUtil() {
    // Intentionally empty; the private constructor prevents instances of this utility class.
}
56+
57+
private static String detectLanguage(TitleAndDescription td) {
58+
String lang = "en"; //default language
59+
LanguageDetector detector = new OptimaizeLangDetector().loadModels();
60+
LanguageResult result1 = detector.detect(td.Title );
61+
String lang1 = result1.getLanguage();
62+
if (result1.isReasonablyCertain()) {
63+
lang = lang1;
64+
} else {
65+
LanguageResult result2 = detector.detect(td.Description);
66+
if (result2.isReasonablyCertain()) {
67+
lang = result2.getLanguage();
68+
}
69+
}
70+
71+
URL found = DdiPdfExportUtil.class.getResource("messages_" + lang + ".properties.xml");
72+
73+
if (found != null) {
74+
return lang;
75+
} else {
76+
return null;
77+
}
78+
}
79+
80+
private static TitleAndDescription getTitleAndDescription(InputStream datafile) {
81+
82+
TitleAndDescription titleAndDescription = new TitleAndDescription();
83+
String lang = null;
84+
try {
85+
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
86+
DocumentBuilder builder = dbf.newDocumentBuilder();
87+
Document doc = builder.parse(datafile);
88+
try {
89+
lang = doc.getDocumentElement().getAttribute("xml:lang");
90+
} catch (DOMException e) {
91+
lang = null;
92+
logger.warn("No language attribute");
93+
}
94+
if (lang != null && !lang.equals("") ) {
95+
titleAndDescription.Language = lang;
96+
} else {
97+
XPathFactory xPathfactory = XPathFactory.newInstance();
98+
XPath xpath = xPathfactory.newXPath();
99+
try {
100+
XPathExpression expr = xpath.compile("/codeBook/stdyDscr/citation/titlStmt/titl/text()");
101+
titleAndDescription.Title = (String) expr.evaluate(doc, XPathConstants.STRING);
102+
expr = xpath.compile("/codeBook/stdyDscr/stdyInfo/abstract/text()");
103+
titleAndDescription.Description = (String) expr.evaluate(doc, XPathConstants.STRING);
104+
} catch (XPathExpressionException e) {
105+
logger.error("Error finding title and description");
106+
logger.error(e.getMessage());
107+
}
108+
}
109+
110+
return titleAndDescription;
111+
} catch (ParserConfigurationException | SAXException | IOException e) {
112+
logger.warn(e.getMessage());
113+
return null;
114+
}
115+
116+
}
29117

30118
public static void datasetPdfDDI(InputStream datafile, OutputStream outputStream) throws XMLStreamException {
31119
try {
120+
String localeEnvVar = "en"; //default language
121+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
122+
datafile.transferTo(baos);
123+
124+
byte[] buffer = baos.toByteArray();
125+
InputStream clone1 = new ByteArrayInputStream(buffer);
126+
InputStream clone2 = new ByteArrayInputStream(buffer);
127+
128+
TitleAndDescription td = getTitleAndDescription(clone1);
129+
if (td != null) {
130+
if (td.Language != null) {
131+
localeEnvVar = td.Language;
132+
} else {
133+
String lang = detectLanguage(td);
134+
if (lang != null && !lang.equals("")) {
135+
localeEnvVar = lang;
136+
}
137+
}
138+
}
139+
32140
InputStream styleSheetInput = DdiPdfExportUtil.class.getResourceAsStream("ddi-to-fo.xsl");
33141

34142
final FopFactory fopFactory = FopFactory.newInstance(new File(".").toURI());
@@ -41,20 +149,11 @@ public static void datasetPdfDDI(InputStream datafile, OutputStream outputStream
41149
Source mySrc = new StreamSource(styleSheetInput);
42150
factory.setURIResolver(new FileResolver());
43151
Transformer transformer = factory.newTransformer(mySrc);
44-
45-
// Set the value of a <param> in the stylesheet
46-
String localeEnvVar = System.getenv().get("LANG");
47-
if (localeEnvVar != null) {
48-
if (localeEnvVar.indexOf('.') > 0) {
49-
localeEnvVar = localeEnvVar.substring(0, localeEnvVar.indexOf('.'));
50-
}
51-
} else {
52-
localeEnvVar = "en";
53-
}
152+
54153
transformer.setParameter("language-code", localeEnvVar);
55154

56155
// Setup input for XSLT transformation
57-
Source src = new StreamSource(datafile);
156+
Source src = new StreamSource(clone2);
58157

59158
// Resulting SAX events (the generated FO) must be piped through to FOP
60159
Result res = new SAXResult(fop.getDefaultHandler());

src/main/java/io/gdcc/export/ddipdf/FileResolver.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@ public class FileResolver implements URIResolver {
1717
public Source resolve(String href, String base) throws TransformerException {
1818
logger.info("In File Resolver: {} {}", href, base);
1919
if (href.startsWith("file:")) {
20-
String url =href.substring("file:".length()); // some calculation from its parameters
21-
InputStream is = this.getClass().getResourceAsStream(url);
22-
return new StreamSource(is);
20+
int index = href.lastIndexOf("/");
21+
String url =href.substring(index + 1); // some calculation from its parameters
22+
InputStream is = this.getClass().getResourceAsStream(url);
23+
return new StreamSource(is);
24+
2325
} else {
2426
return null;
2527
}

src/main/java/io/gdcc/export/ddipdf/PdfCodeBookExporter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public String getFormatName() {
2525
public String getDisplayName(Locale locale) {
2626
//String displayName = BundleUtil.getStringFromBundle("dataset.exportBtn.itemLabel.pdf", locale);
2727
String displayName = null;
28-
return Optional.ofNullable(displayName).orElse("DDI pdf codebook");
28+
return Optional.ofNullable(displayName).orElse("DDI PDF Codebook");
2929
}
3030

3131
@Override

src/main/resources/io/gdcc/export/ddipdf/ddi-to-fo.xsl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,9 @@
128128
<!--
129129
Report title
130130
-->
131-
<xsl:param name="report-title" select=" 'Study Documentation' "/>
131+
<xsl:param name="report-title" select="$msg/*/entry[@key='Study Documentation']"/>
132+
133+
132134
<!--
133135
STYLES
134136
-->

src/main/resources/io/gdcc/export/ddipdf/messages_en.properties.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?xml version="1.0" encoding="utf-8" standalone="no"?>
22
<properties>
33
<comment>Generated by Properties2Xml on Fri Apr 11 09:45:39 EDT 2008</comment>
4+
<entry key="Study Documentation">Study Documentation</entry>
45
<entry key="Valid">Valid</entry>
56
<entry key="Frequency_table_not_shown">Frequency table not shown</entry>
67
<entry key="Derivation">Derivation</entry>

src/main/resources/io/gdcc/export/ddipdf/messages_es.properties.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?xml version="1.0" encoding="utf-8" standalone="no"?>
22
<properties>
33
<comment>Generated by Properties2Xml on Fri Apr 11 09:45:40 EDT 2008</comment>
4+
<entry key="Study Documentation">Documentación de estudio</entry>
45
<entry key="Valid">Válido</entry>
56
<entry key="Frequency_table_not_shown">No se presentan las tablas de frecuencias</entry>
67
<entry key="Derivation">Derivación</entry>

src/main/resources/io/gdcc/export/ddipdf/messages_fr.properties.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?xml version="1.0" encoding="utf-8" standalone="no"?>
22
<properties>
33
<comment>Generated by Properties2Xml on Fri Apr 11 09:45:40 EDT 2008</comment>
4+
<entry key="Study Documentation">Documentation d'étude</entry>
45
<entry key="Valid">Valide</entry>
56
<entry key="Frequency_table_not_shown">Tableau de fréquences non-affiché</entry>
67
<entry key="Derivation">Mode de calcul</entry>

src/main/resources/io/gdcc/export/ddipdf/messages_ja.properties.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?xml version="1.0" encoding="utf-8"?>
22
<properties>
33
<comment>Generated by Properties2Xml on Tue Feb 13 13:55:43 EST 2007</comment>
4+
<entry key="Study Documentation">研究文書</entry>
45
<entry key="Valid">有効な</entry>
56
<entry key="Frequency_table_not_shown">度数表(Frequency table)は表示されません</entry>
67
<entry key="Derivation">由来</entry>

src/main/resources/io/gdcc/export/ddipdf/messages_nn.properties.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?xml version="1.0" encoding="utf-8" standalone="no"?>
22
<properties>
33
<comment>Generated by Properties2Xml on Fri Apr 11 09:45:39 EDT 2008</comment>
4+
<entry key="Study Documentation">Studiedokumentasjon</entry>
45
<entry key="Valid">Gyldige</entry>
56
<entry key="Frequency_table_not_shown">Frekvenstabell ikke vist</entry>
67
<entry key="Derivation">Avledning</entry>

0 commit comments

Comments
 (0)