|
16 | 16 | */
|
17 | 17 | package org.apache.lucene.analysis.hunspell;
|
18 | 18 |
|
| 19 | +import java.io.IOException; |
19 | 20 | import java.io.InputStream;
|
20 | 21 | import java.nio.file.Files;
|
21 | 22 | import java.nio.file.Path;
|
22 |
| -import java.nio.file.Paths; |
23 |
| -import org.apache.lucene.store.Directory; |
24 |
| -import org.apache.lucene.util.IOUtils; |
| 23 | +import java.text.ParseException; |
| 24 | +import java.util.List; |
| 25 | +import java.util.stream.Collectors; |
| 26 | +import org.apache.lucene.store.BaseDirectoryWrapper; |
25 | 27 | import org.apache.lucene.util.LuceneTestCase;
|
26 | 28 | import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
|
27 | 29 | import org.apache.lucene.util.RamUsageTester;
|
28 |
| -import org.apache.lucene.util.TestUtil; |
| 30 | +import org.junit.Assume; |
29 | 31 | import org.junit.Ignore;
|
30 | 32 |
|
31 | 33 | /**
|
32 |
| - * Can be retrieved via: wget --mirror -np |
33 |
| - * http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ Note some |
34 |
| - * of the files differ only in case. This may be a problem on your operating system! |
| 34 | + * Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and |
| 35 | + * prints their memory usage. All *.aff files are traversed directly inside the given directory or |
| 36 | + * in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For |
| 37 | + * examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package |
| 38 | + * documentation} |
35 | 39 | */
|
36 | 40 | @Ignore("enable manually")
|
37 | 41 | @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
|
38 | 42 | public class TestAllDictionaries extends LuceneTestCase {
|
39 | 43 |
|
40 |
| - // set this to the location of where you downloaded all the files |
41 |
| - static final Path DICTIONARY_HOME = |
42 |
| - Paths.get( |
43 |
| - "/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); |
44 |
| - |
45 |
| - final String tests[] = { |
46 |
| - /* zip file */ |
47 |
| - /* dictionary */ |
48 |
| - /* affix */ |
49 |
| - "af_ZA.zip", "af_ZA.dic", "af_ZA.aff", |
50 |
| - "ak_GH.zip", "ak_GH.dic", "ak_GH.aff", |
51 |
| - "bg_BG.zip", "bg_BG.dic", "bg_BG.aff", |
52 |
| - "ca_ANY.zip", "catalan.dic", "catalan.aff", |
53 |
| - "ca_ES.zip", "ca_ES.dic", "ca_ES.aff", |
54 |
| - // BUG: broken flag "cop_EG.zip", "cop_EG.dic", "cop_EG.aff", |
55 |
| - "cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff", |
56 |
| - "cy_GB.zip", "cy_GB.dic", "cy_GB.aff", |
57 |
| - "da_DK.zip", "da_DK.dic", "da_DK.aff", |
58 |
| - "de_AT.zip", "de_AT.dic", "de_AT.aff", |
59 |
| - "de_CH.zip", "de_CH.dic", "de_CH.aff", |
60 |
| - "de_DE.zip", "de_DE.dic", "de_DE.aff", |
61 |
| - "de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff", |
62 |
| - "de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff", |
63 |
| - "de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff", |
64 |
| - "el_GR.zip", "el_GR.dic", "el_GR.aff", |
65 |
| - "en_AU.zip", "en_AU.dic", "en_AU.aff", |
66 |
| - "en_CA.zip", "en_CA.dic", "en_CA.aff", |
67 |
| - "en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff", |
68 |
| - "en_GB.zip", "en_GB.dic", "en_GB.aff", |
69 |
| - "en_NZ.zip", "en_NZ.dic", "en_NZ.aff", |
70 |
| - "eo.zip", "eo_l3.dic", "eo_l3.aff", |
71 |
| - "eo_EO.zip", "eo_EO.dic", "eo_EO.aff", |
72 |
| - "es_AR.zip", "es_AR.dic", "es_AR.aff", |
73 |
| - "es_BO.zip", "es_BO.dic", "es_BO.aff", |
74 |
| - "es_CL.zip", "es_CL.dic", "es_CL.aff", |
75 |
| - "es_CO.zip", "es_CO.dic", "es_CO.aff", |
76 |
| - "es_CR.zip", "es_CR.dic", "es_CR.aff", |
77 |
| - "es_CU.zip", "es_CU.dic", "es_CU.aff", |
78 |
| - "es_DO.zip", "es_DO.dic", "es_DO.aff", |
79 |
| - "es_EC.zip", "es_EC.dic", "es_EC.aff", |
80 |
| - "es_ES.zip", "es_ES.dic", "es_ES.aff", |
81 |
| - "es_GT.zip", "es_GT.dic", "es_GT.aff", |
82 |
| - "es_HN.zip", "es_HN.dic", "es_HN.aff", |
83 |
| - "es_MX.zip", "es_MX.dic", "es_MX.aff", |
84 |
| - "es_NEW.zip", "es_NEW.dic", "es_NEW.aff", |
85 |
| - "es_NI.zip", "es_NI.dic", "es_NI.aff", |
86 |
| - "es_PA.zip", "es_PA.dic", "es_PA.aff", |
87 |
| - "es_PE.zip", "es_PE.dic", "es_PE.aff", |
88 |
| - "es_PR.zip", "es_PR.dic", "es_PR.aff", |
89 |
| - "es_PY.zip", "es_PY.dic", "es_PY.aff", |
90 |
| - "es_SV.zip", "es_SV.dic", "es_SV.aff", |
91 |
| - "es_UY.zip", "es_UY.dic", "es_UY.aff", |
92 |
| - "es_VE.zip", "es_VE.dic", "es_VE.aff", |
93 |
| - "et_EE.zip", "et_EE.dic", "et_EE.aff", |
94 |
| - "fo_FO.zip", "fo_FO.dic", "fo_FO.aff", |
95 |
| - "fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff", |
96 |
| - "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff", |
97 |
| - "fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff", |
98 |
| - "fy_NL.zip", "fy_NL.dic", "fy_NL.aff", |
99 |
| - "ga_IE.zip", "ga_IE.dic", "ga_IE.aff", |
100 |
| - "gd_GB.zip", "gd_GB.dic", "gd_GB.aff", |
101 |
| - "gl_ES.zip", "gl_ES.dic", "gl_ES.aff", |
102 |
| - "gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff", |
103 |
| - "gu_IN.zip", "gu_IN.dic", "gu_IN.aff", |
104 |
| - "he_IL.zip", "he_IL.dic", "he_IL.aff", |
105 |
| - "hi_IN.zip", "hi_IN.dic", "hi_IN.aff", |
106 |
| - "hil_PH.zip", "hil_PH.dic", "hil_PH.aff", |
107 |
| - "hr_HR.zip", "hr_HR.dic", "hr_HR.aff", |
108 |
| - "hu_HU.zip", "hu_HU.dic", "hu_HU.aff", |
109 |
| - "hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff", |
110 |
| - "ia.zip", "ia.dic", "ia.aff", |
111 |
| - "id_ID.zip", "id_ID.dic", "id_ID.aff", |
112 |
| - "it_IT.zip", "it_IT.dic", "it_IT.aff", |
113 |
| - "ku_TR.zip", "ku_TR.dic", "ku_TR.aff", |
114 |
| - "la.zip", "la.dic", "la.aff", |
115 |
| - "lt_LT.zip", "lt_LT.dic", "lt_LT.aff", |
116 |
| - "lv_LV.zip", "lv_LV.dic", "lv_LV.aff", |
117 |
| - "mg_MG.zip", "mg_MG.dic", "mg_MG.aff", |
118 |
| - "mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff", |
119 |
| - "mk_MK.zip", "mk_MK.dic", "mk_MK.aff", |
120 |
| - "mos_BF.zip", "mos_BF.dic", "mos_BF.aff", |
121 |
| - "mr_IN.zip", "mr_IN.dic", "mr_IN.aff", |
122 |
| - "ms_MY.zip", "ms_MY.dic", "ms_MY.aff", |
123 |
| - "nb_NO.zip", "nb_NO.dic", "nb_NO.aff", |
124 |
| - "ne_NP.zip", "ne_NP.dic", "ne_NP.aff", |
125 |
| - "nl_NL.zip", "nl_NL.dic", "nl_NL.aff", |
126 |
| - "nl_med.zip", "nl_med.dic", "nl_med.aff", |
127 |
| - "nn_NO.zip", "nn_NO.dic", "nn_NO.aff", |
128 |
| - "nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff", |
129 |
| - "ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff", |
130 |
| - "ny_MW.zip", "ny_MW.dic", "ny_MW.aff", |
131 |
| - "oc_FR.zip", "oc_FR.dic", "oc_FR.aff", |
132 |
| - "pl_PL.zip", "pl_PL.dic", "pl_PL.aff", |
133 |
| - "pt_BR.zip", "pt_BR.dic", "pt_BR.aff", |
134 |
| - "pt_PT.zip", "pt_PT.dic", "pt_PT.aff", |
135 |
| - "ro_RO.zip", "ro_RO.dic", "ro_RO.aff", |
136 |
| - "ru_RU.zip", "ru_RU.dic", "ru_RU.aff", |
137 |
| - "ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff", |
138 |
| - "ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff", |
139 |
| - "rw_RW.zip", "rw_RW.dic", "rw_RW.aff", |
140 |
| - "sk_SK.zip", "sk_SK.dic", "sk_SK.aff", |
141 |
| - "sl_SI.zip", "sl_SI.dic", "sl_SI.aff", |
142 |
| - "sq_AL.zip", "sq_AL.dic", "sq_AL.aff", |
143 |
| - "ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff", |
144 |
| - "st_ZA.zip", "st_ZA.dic", "st_ZA.aff", |
145 |
| - "sv_SE.zip", "sv_SE.dic", "sv_SE.aff", |
146 |
| - "sw_KE.zip", "sw_KE.dic", "sw_KE.aff", |
147 |
| - "tet_ID.zip", "tet_ID.dic", "tet_ID.aff", |
148 |
| - "th_TH.zip", "th_TH.dic", "th_TH.aff", |
149 |
| - "tl_PH.zip", "tl_PH.dic", "tl_PH.aff", |
150 |
| - "tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff", |
151 |
| - "ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff", |
152 |
| - "uk_UA.zip", "uk_UA.dic", "uk_UA.aff", |
153 |
| - "ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff", |
154 |
| - "vi_VN.zip", "vi_VN.dic", "vi_VN.aff", |
155 |
| - "xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff", |
156 |
| - "zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff", |
157 |
| - }; |
158 |
| - |
159 |
| - public void test() throws Exception { |
160 |
| - Path tmp = LuceneTestCase.createTempDir(); |
161 |
| - |
162 |
| - for (int i = 0; i < tests.length; i += 3) { |
163 |
| - Path f = DICTIONARY_HOME.resolve(tests[i]); |
164 |
| - assert Files.exists(f); |
165 |
| - |
166 |
| - IOUtils.rm(tmp); |
167 |
| - Files.createDirectory(tmp); |
168 |
| - |
169 |
| - try (InputStream in = Files.newInputStream(f); |
170 |
| - Directory tempDir = getDirectory()) { |
171 |
| - TestUtil.unzip(in, tmp); |
172 |
| - Path dicEntry = tmp.resolve(tests[i + 1]); |
173 |
| - Path affEntry = tmp.resolve(tests[i + 2]); |
| 44 | + private static List<Path> findAllAffixFiles() throws IOException { |
| 45 | + String dicDir = System.getProperty("hunspell.dictionaries"); |
| 46 | + Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null); |
| 47 | + return Files.walk(Path.of(dicDir), 2) |
| 48 | + .filter(f -> f.toString().endsWith(".aff")) |
| 49 | + .collect(Collectors.toList()); |
| 50 | + } |
174 | 51 |
|
175 |
| - try (InputStream dictionary = Files.newInputStream(dicEntry); |
176 |
| - InputStream affix = Files.newInputStream(affEntry)) { |
177 |
| - Dictionary dic = new Dictionary(tempDir, "dictionary", affix, dictionary); |
178 |
| - System.out.println( |
179 |
| - tests[i] |
180 |
| - + "\t" |
181 |
| - + RamUsageTester.humanSizeOf(dic) |
182 |
| - + "\t(" |
183 |
| - + "words=" |
184 |
| - + RamUsageTester.humanSizeOf(dic.words) |
185 |
| - + ", " |
186 |
| - + "flags=" |
187 |
| - + RamUsageTester.humanSizeOf(dic.flagLookup) |
188 |
| - + ", " |
189 |
| - + "strips=" |
190 |
| - + RamUsageTester.humanSizeOf(dic.stripData) |
191 |
| - + ", " |
192 |
| - + "conditions=" |
193 |
| - + RamUsageTester.humanSizeOf(dic.patterns) |
194 |
| - + ", " |
195 |
| - + "affixData=" |
196 |
| - + RamUsageTester.humanSizeOf(dic.affixData) |
197 |
| - + ", " |
198 |
| - + "prefixes=" |
199 |
| - + RamUsageTester.humanSizeOf(dic.prefixes) |
200 |
| - + ", " |
201 |
| - + "suffixes=" |
202 |
| - + RamUsageTester.humanSizeOf(dic.suffixes) |
203 |
| - + ")"); |
204 |
| - } |
205 |
| - } |
| 52 | + private static Dictionary loadDictionary(Path aff) throws IOException, ParseException { |
| 53 | + String affPath = aff.toString(); |
| 54 | + Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic"); |
| 55 | + assert Files.exists(dic) : dic; |
| 56 | + try (InputStream dictionary = Files.newInputStream(dic); |
| 57 | + InputStream affix = Files.newInputStream(aff); |
| 58 | + BaseDirectoryWrapper tempDir = newDirectory()) { |
| 59 | + return new Dictionary(tempDir, "dictionary", affix, dictionary); |
206 | 60 | }
|
207 | 61 | }
|
208 | 62 |
|
209 |
| - public void testOneDictionary() throws Exception { |
210 |
| - Path tmp = LuceneTestCase.createTempDir(); |
211 |
| - |
212 |
| - String toTest = "zu_ZA.zip"; |
213 |
| - for (int i = 0; i < tests.length; i++) { |
214 |
| - if (tests[i].equals(toTest)) { |
215 |
| - Path f = DICTIONARY_HOME.resolve(tests[i]); |
216 |
| - assert Files.exists(f); |
217 |
| - |
218 |
| - IOUtils.rm(tmp); |
219 |
| - Files.createDirectory(tmp); |
220 |
| - |
221 |
| - try (InputStream in = Files.newInputStream(f)) { |
222 |
| - TestUtil.unzip(in, tmp); |
223 |
| - Path dicEntry = tmp.resolve(tests[i + 1]); |
224 |
| - Path affEntry = tmp.resolve(tests[i + 2]); |
225 |
| - |
226 |
| - try (InputStream dictionary = Files.newInputStream(dicEntry); |
227 |
| - InputStream affix = Files.newInputStream(affEntry); |
228 |
| - Directory tempDir = getDirectory()) { |
229 |
| - new Dictionary(tempDir, "dictionary", affix, dictionary); |
230 |
| - } |
231 |
| - } |
| 63 | + public void testDictionariesLoadSuccessfully() throws Exception { |
| 64 | + int failures = 0; |
| 65 | + for (Path aff : findAllAffixFiles()) { |
| 66 | + try { |
| 67 | + System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff))); |
| 68 | + } catch (Throwable e) { |
| 69 | + failures++; |
| 70 | + System.err.println("While checking " + aff + ":"); |
| 71 | + e.printStackTrace(); |
232 | 72 | }
|
233 | 73 | }
|
| 74 | + assertEquals(failures + " failures!", 0, failures); |
234 | 75 | }
|
235 | 76 |
|
236 |
| - private Directory getDirectory() { |
237 |
| - return newDirectory(); |
| 77 | + private static String memoryUsage(Dictionary dic) { |
| 78 | + return RamUsageTester.humanSizeOf(dic) |
| 79 | + + "\t(" |
| 80 | + + "words=" |
| 81 | + + RamUsageTester.humanSizeOf(dic.words) |
| 82 | + + ", " |
| 83 | + + "flags=" |
| 84 | + + RamUsageTester.humanSizeOf(dic.flagLookup) |
| 85 | + + ", " |
| 86 | + + "strips=" |
| 87 | + + RamUsageTester.humanSizeOf(dic.stripData) |
| 88 | + + ", " |
| 89 | + + "conditions=" |
| 90 | + + RamUsageTester.humanSizeOf(dic.patterns) |
| 91 | + + ", " |
| 92 | + + "affixData=" |
| 93 | + + RamUsageTester.humanSizeOf(dic.affixData) |
| 94 | + + ", " |
| 95 | + + "prefixes=" |
| 96 | + + RamUsageTester.humanSizeOf(dic.prefixes) |
| 97 | + + ", " |
| 98 | + + "suffixes=" |
| 99 | + + RamUsageTester.humanSizeOf(dic.suffixes) |
| 100 | + + ")"; |
238 | 101 | }
|
239 | 102 | }
|
0 commit comments