Skip to content

Commit 84aa683

Browse files
authored
LUCENE-9723: Hunspell: update sanity tests that load all dictionaries (#2290)
1 parent d0ae2bd commit 84aa683

File tree

4 files changed

+69
-541
lines changed

4 files changed

+69
-541
lines changed

gradle/testing/randomization/policies/tests.policy

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ grant {
9393

9494
// Some Hunspell tests may read from external files specified in system properties
9595
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
96+
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
9697
};
9798

9899
// Permissions to support ant build

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/package-info.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@
1616
*/
1717

1818
/**
19-
* Stemming TokenFilter using a Java implementation of the <a
20-
* href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">Hunspell stemming
21-
* algorithm.</a>
19+
* A Java implementation of <a href="http://hunspell.github.io/">Hunspell</a> stemming and
20+
* spell-checking algorithms, and a stemming TokenFilter based on it.
2221
*
23-
* <p>Dictionaries can be found on <a
24-
* href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice's wiki</a>
22+
* <p>For dictionaries, see e.g. <a href="https://github.com/LibreOffice/dictionaries">LibreOffice
23+
* repository</a> or <a href="https://github.com/wooorm/dictionaries">Titus Wormer's collection
24+
* (UTF)</a>
25+
*
26+
* @see org.apache.lucene.analysis.hunspell.HunspellStemFilter
27+
* @see org.apache.lucene.analysis.hunspell.SpellChecker
2528
*/
2629
package org.apache.lucene.analysis.hunspell;

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java

Lines changed: 60 additions & 197 deletions
Original file line numberDiff line numberDiff line change
@@ -16,224 +16,87 @@
1616
*/
1717
package org.apache.lucene.analysis.hunspell;
1818

19+
import java.io.IOException;
1920
import java.io.InputStream;
2021
import java.nio.file.Files;
2122
import java.nio.file.Path;
22-
import java.nio.file.Paths;
23-
import org.apache.lucene.store.Directory;
24-
import org.apache.lucene.util.IOUtils;
23+
import java.text.ParseException;
24+
import java.util.List;
25+
import java.util.stream.Collectors;
26+
import org.apache.lucene.store.BaseDirectoryWrapper;
2527
import org.apache.lucene.util.LuceneTestCase;
2628
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
2729
import org.apache.lucene.util.RamUsageTester;
28-
import org.apache.lucene.util.TestUtil;
30+
import org.junit.Assume;
2931
import org.junit.Ignore;
3032

3133
/**
32-
* Can be retrieved via: wget --mirror -np
33-
* http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ Note some
34-
* of the files differ only in case. This may be a problem on your operating system!
34+
* Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and
35+
* prints their memory usage. All *.aff files are traversed directly inside the given directory or
36+
* in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For
37+
* examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package
38+
* documentation}
3539
*/
3640
@Ignore("enable manually")
3741
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
3842
public class TestAllDictionaries extends LuceneTestCase {
3943

40-
// set this to the location of where you downloaded all the files
41-
static final Path DICTIONARY_HOME =
42-
Paths.get(
43-
"/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
44-
45-
final String tests[] = {
46-
/* zip file */
47-
/* dictionary */
48-
/* affix */
49-
"af_ZA.zip", "af_ZA.dic", "af_ZA.aff",
50-
"ak_GH.zip", "ak_GH.dic", "ak_GH.aff",
51-
"bg_BG.zip", "bg_BG.dic", "bg_BG.aff",
52-
"ca_ANY.zip", "catalan.dic", "catalan.aff",
53-
"ca_ES.zip", "ca_ES.dic", "ca_ES.aff",
54-
// BUG: broken flag "cop_EG.zip", "cop_EG.dic", "cop_EG.aff",
55-
"cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff",
56-
"cy_GB.zip", "cy_GB.dic", "cy_GB.aff",
57-
"da_DK.zip", "da_DK.dic", "da_DK.aff",
58-
"de_AT.zip", "de_AT.dic", "de_AT.aff",
59-
"de_CH.zip", "de_CH.dic", "de_CH.aff",
60-
"de_DE.zip", "de_DE.dic", "de_DE.aff",
61-
"de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff",
62-
"de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff",
63-
"de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff",
64-
"el_GR.zip", "el_GR.dic", "el_GR.aff",
65-
"en_AU.zip", "en_AU.dic", "en_AU.aff",
66-
"en_CA.zip", "en_CA.dic", "en_CA.aff",
67-
"en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff",
68-
"en_GB.zip", "en_GB.dic", "en_GB.aff",
69-
"en_NZ.zip", "en_NZ.dic", "en_NZ.aff",
70-
"eo.zip", "eo_l3.dic", "eo_l3.aff",
71-
"eo_EO.zip", "eo_EO.dic", "eo_EO.aff",
72-
"es_AR.zip", "es_AR.dic", "es_AR.aff",
73-
"es_BO.zip", "es_BO.dic", "es_BO.aff",
74-
"es_CL.zip", "es_CL.dic", "es_CL.aff",
75-
"es_CO.zip", "es_CO.dic", "es_CO.aff",
76-
"es_CR.zip", "es_CR.dic", "es_CR.aff",
77-
"es_CU.zip", "es_CU.dic", "es_CU.aff",
78-
"es_DO.zip", "es_DO.dic", "es_DO.aff",
79-
"es_EC.zip", "es_EC.dic", "es_EC.aff",
80-
"es_ES.zip", "es_ES.dic", "es_ES.aff",
81-
"es_GT.zip", "es_GT.dic", "es_GT.aff",
82-
"es_HN.zip", "es_HN.dic", "es_HN.aff",
83-
"es_MX.zip", "es_MX.dic", "es_MX.aff",
84-
"es_NEW.zip", "es_NEW.dic", "es_NEW.aff",
85-
"es_NI.zip", "es_NI.dic", "es_NI.aff",
86-
"es_PA.zip", "es_PA.dic", "es_PA.aff",
87-
"es_PE.zip", "es_PE.dic", "es_PE.aff",
88-
"es_PR.zip", "es_PR.dic", "es_PR.aff",
89-
"es_PY.zip", "es_PY.dic", "es_PY.aff",
90-
"es_SV.zip", "es_SV.dic", "es_SV.aff",
91-
"es_UY.zip", "es_UY.dic", "es_UY.aff",
92-
"es_VE.zip", "es_VE.dic", "es_VE.aff",
93-
"et_EE.zip", "et_EE.dic", "et_EE.aff",
94-
"fo_FO.zip", "fo_FO.dic", "fo_FO.aff",
95-
"fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff",
96-
"fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
97-
"fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff",
98-
"fy_NL.zip", "fy_NL.dic", "fy_NL.aff",
99-
"ga_IE.zip", "ga_IE.dic", "ga_IE.aff",
100-
"gd_GB.zip", "gd_GB.dic", "gd_GB.aff",
101-
"gl_ES.zip", "gl_ES.dic", "gl_ES.aff",
102-
"gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff",
103-
"gu_IN.zip", "gu_IN.dic", "gu_IN.aff",
104-
"he_IL.zip", "he_IL.dic", "he_IL.aff",
105-
"hi_IN.zip", "hi_IN.dic", "hi_IN.aff",
106-
"hil_PH.zip", "hil_PH.dic", "hil_PH.aff",
107-
"hr_HR.zip", "hr_HR.dic", "hr_HR.aff",
108-
"hu_HU.zip", "hu_HU.dic", "hu_HU.aff",
109-
"hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff",
110-
"ia.zip", "ia.dic", "ia.aff",
111-
"id_ID.zip", "id_ID.dic", "id_ID.aff",
112-
"it_IT.zip", "it_IT.dic", "it_IT.aff",
113-
"ku_TR.zip", "ku_TR.dic", "ku_TR.aff",
114-
"la.zip", "la.dic", "la.aff",
115-
"lt_LT.zip", "lt_LT.dic", "lt_LT.aff",
116-
"lv_LV.zip", "lv_LV.dic", "lv_LV.aff",
117-
"mg_MG.zip", "mg_MG.dic", "mg_MG.aff",
118-
"mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff",
119-
"mk_MK.zip", "mk_MK.dic", "mk_MK.aff",
120-
"mos_BF.zip", "mos_BF.dic", "mos_BF.aff",
121-
"mr_IN.zip", "mr_IN.dic", "mr_IN.aff",
122-
"ms_MY.zip", "ms_MY.dic", "ms_MY.aff",
123-
"nb_NO.zip", "nb_NO.dic", "nb_NO.aff",
124-
"ne_NP.zip", "ne_NP.dic", "ne_NP.aff",
125-
"nl_NL.zip", "nl_NL.dic", "nl_NL.aff",
126-
"nl_med.zip", "nl_med.dic", "nl_med.aff",
127-
"nn_NO.zip", "nn_NO.dic", "nn_NO.aff",
128-
"nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff",
129-
"ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff",
130-
"ny_MW.zip", "ny_MW.dic", "ny_MW.aff",
131-
"oc_FR.zip", "oc_FR.dic", "oc_FR.aff",
132-
"pl_PL.zip", "pl_PL.dic", "pl_PL.aff",
133-
"pt_BR.zip", "pt_BR.dic", "pt_BR.aff",
134-
"pt_PT.zip", "pt_PT.dic", "pt_PT.aff",
135-
"ro_RO.zip", "ro_RO.dic", "ro_RO.aff",
136-
"ru_RU.zip", "ru_RU.dic", "ru_RU.aff",
137-
"ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff",
138-
"ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff",
139-
"rw_RW.zip", "rw_RW.dic", "rw_RW.aff",
140-
"sk_SK.zip", "sk_SK.dic", "sk_SK.aff",
141-
"sl_SI.zip", "sl_SI.dic", "sl_SI.aff",
142-
"sq_AL.zip", "sq_AL.dic", "sq_AL.aff",
143-
"ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff",
144-
"st_ZA.zip", "st_ZA.dic", "st_ZA.aff",
145-
"sv_SE.zip", "sv_SE.dic", "sv_SE.aff",
146-
"sw_KE.zip", "sw_KE.dic", "sw_KE.aff",
147-
"tet_ID.zip", "tet_ID.dic", "tet_ID.aff",
148-
"th_TH.zip", "th_TH.dic", "th_TH.aff",
149-
"tl_PH.zip", "tl_PH.dic", "tl_PH.aff",
150-
"tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff",
151-
"ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff",
152-
"uk_UA.zip", "uk_UA.dic", "uk_UA.aff",
153-
"ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff",
154-
"vi_VN.zip", "vi_VN.dic", "vi_VN.aff",
155-
"xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff",
156-
"zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff",
157-
};
158-
159-
public void test() throws Exception {
160-
Path tmp = LuceneTestCase.createTempDir();
161-
162-
for (int i = 0; i < tests.length; i += 3) {
163-
Path f = DICTIONARY_HOME.resolve(tests[i]);
164-
assert Files.exists(f);
165-
166-
IOUtils.rm(tmp);
167-
Files.createDirectory(tmp);
168-
169-
try (InputStream in = Files.newInputStream(f);
170-
Directory tempDir = getDirectory()) {
171-
TestUtil.unzip(in, tmp);
172-
Path dicEntry = tmp.resolve(tests[i + 1]);
173-
Path affEntry = tmp.resolve(tests[i + 2]);
44+
private static List<Path> findAllAffixFiles() throws IOException {
45+
String dicDir = System.getProperty("hunspell.dictionaries");
46+
Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
47+
return Files.walk(Path.of(dicDir), 2)
48+
.filter(f -> f.toString().endsWith(".aff"))
49+
.collect(Collectors.toList());
50+
}
17451

175-
try (InputStream dictionary = Files.newInputStream(dicEntry);
176-
InputStream affix = Files.newInputStream(affEntry)) {
177-
Dictionary dic = new Dictionary(tempDir, "dictionary", affix, dictionary);
178-
System.out.println(
179-
tests[i]
180-
+ "\t"
181-
+ RamUsageTester.humanSizeOf(dic)
182-
+ "\t("
183-
+ "words="
184-
+ RamUsageTester.humanSizeOf(dic.words)
185-
+ ", "
186-
+ "flags="
187-
+ RamUsageTester.humanSizeOf(dic.flagLookup)
188-
+ ", "
189-
+ "strips="
190-
+ RamUsageTester.humanSizeOf(dic.stripData)
191-
+ ", "
192-
+ "conditions="
193-
+ RamUsageTester.humanSizeOf(dic.patterns)
194-
+ ", "
195-
+ "affixData="
196-
+ RamUsageTester.humanSizeOf(dic.affixData)
197-
+ ", "
198-
+ "prefixes="
199-
+ RamUsageTester.humanSizeOf(dic.prefixes)
200-
+ ", "
201-
+ "suffixes="
202-
+ RamUsageTester.humanSizeOf(dic.suffixes)
203-
+ ")");
204-
}
205-
}
52+
private static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
53+
String affPath = aff.toString();
54+
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
55+
assert Files.exists(dic) : dic;
56+
try (InputStream dictionary = Files.newInputStream(dic);
57+
InputStream affix = Files.newInputStream(aff);
58+
BaseDirectoryWrapper tempDir = newDirectory()) {
59+
return new Dictionary(tempDir, "dictionary", affix, dictionary);
20660
}
20761
}
20862

209-
public void testOneDictionary() throws Exception {
210-
Path tmp = LuceneTestCase.createTempDir();
211-
212-
String toTest = "zu_ZA.zip";
213-
for (int i = 0; i < tests.length; i++) {
214-
if (tests[i].equals(toTest)) {
215-
Path f = DICTIONARY_HOME.resolve(tests[i]);
216-
assert Files.exists(f);
217-
218-
IOUtils.rm(tmp);
219-
Files.createDirectory(tmp);
220-
221-
try (InputStream in = Files.newInputStream(f)) {
222-
TestUtil.unzip(in, tmp);
223-
Path dicEntry = tmp.resolve(tests[i + 1]);
224-
Path affEntry = tmp.resolve(tests[i + 2]);
225-
226-
try (InputStream dictionary = Files.newInputStream(dicEntry);
227-
InputStream affix = Files.newInputStream(affEntry);
228-
Directory tempDir = getDirectory()) {
229-
new Dictionary(tempDir, "dictionary", affix, dictionary);
230-
}
231-
}
63+
public void testDictionariesLoadSuccessfully() throws Exception {
64+
int failures = 0;
65+
for (Path aff : findAllAffixFiles()) {
66+
try {
67+
System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
68+
} catch (Throwable e) {
69+
failures++;
70+
System.err.println("While checking " + aff + ":");
71+
e.printStackTrace();
23272
}
23373
}
74+
assertEquals(failures + " failures!", 0, failures);
23475
}
23576

236-
private Directory getDirectory() {
237-
return newDirectory();
77+
private static String memoryUsage(Dictionary dic) {
78+
return RamUsageTester.humanSizeOf(dic)
79+
+ "\t("
80+
+ "words="
81+
+ RamUsageTester.humanSizeOf(dic.words)
82+
+ ", "
83+
+ "flags="
84+
+ RamUsageTester.humanSizeOf(dic.flagLookup)
85+
+ ", "
86+
+ "strips="
87+
+ RamUsageTester.humanSizeOf(dic.stripData)
88+
+ ", "
89+
+ "conditions="
90+
+ RamUsageTester.humanSizeOf(dic.patterns)
91+
+ ", "
92+
+ "affixData="
93+
+ RamUsageTester.humanSizeOf(dic.affixData)
94+
+ ", "
95+
+ "prefixes="
96+
+ RamUsageTester.humanSizeOf(dic.prefixes)
97+
+ ", "
98+
+ "suffixes="
99+
+ RamUsageTester.humanSizeOf(dic.suffixes)
100+
+ ")";
238101
}
239102
}

0 commit comments

Comments
 (0)