21
21
* @link http://langdetect.blogspot.com/
22
22
*/
23
23
24
- require_once 'PEAR.php ' ;
25
24
require_once 'Text/LanguageDetect/Exception.php ' ;
26
25
require_once 'Text/LanguageDetect/Parser.php ' ;
27
26
require_once 'Text/LanguageDetect/ISO639.php ' ;
@@ -176,15 +175,13 @@ class Text_LanguageDetect
176
175
/**
177
176
* Constructor
178
177
*
179
- * Will attempt to load the language database.
180
- *
181
- * @throws Text_LanguageDetect_Exception
182
- * @todo Avoid work in the constructor
178
+ * Will attempt to load the language database. If it fails, you will get
179
+ * an exception.
183
180
*/
184
- function Text_LanguageDetect ()
181
+ function __construct ()
185
182
{
186
183
$ data = $ this ->_readdb ($ this ->_db_filename );
187
-
184
+ $ this -> _checkTrigram ( $ data [ ' trigram ' ]);
188
185
$ this ->_lang_db = $ data ['trigram ' ];
189
186
190
187
if (isset ($ data ['trigram-unicodemap ' ])) {
@@ -195,7 +192,6 @@ function Text_LanguageDetect()
195
192
if (isset ($ data ['trigram-clusters ' ])) {
196
193
$ this ->_clusters = $ data ['trigram-clusters ' ];
197
194
}
198
-
199
195
}
200
196
201
197
/**
@@ -240,44 +236,46 @@ function _readdb($fname)
240
236
241
237
// input check
242
238
if (!file_exists ($ fname )) {
243
- throw new Text_LanguageDetect_Exception ('Language database does not exist. ' );
239
+ throw new Text_LanguageDetect_Exception (
240
+ 'Language database does not exist. ' ,
241
+ Text_LanguageDetect_Exception::DB_NOT_FOUND
242
+ );
244
243
} elseif (!is_readable ($ fname )) {
245
- throw new Text_LanguageDetect_Exception ('Language database is not readable. ' );
244
+ throw new Text_LanguageDetect_Exception (
245
+ 'Language database is not readable. ' ,
246
+ Text_LanguageDetect_Exception::DB_NOT_READABLE
247
+ );
246
248
}
247
249
248
- if (function_exists ('file_get_contents ' )) {
249
- return unserialize (file_get_contents ($ fname ));
250
- } else {
251
- // if you don't have file_get_contents(),
252
- // then this is the next fastest way
253
- ob_start ();
254
- readfile ($ fname );
255
- $ contents = ob_get_contents ();
256
- ob_end_clean ();
257
- return unserialize ($ contents );
258
- }
250
+ return unserialize (file_get_contents ($ fname ));
259
251
}
260
252
261
253
262
254
/**
263
255
* Checks if this object is ready to detect languages
264
256
*
265
257
* @access private
266
- * @return bool true if no errors
267
- * @throws Text_LanguageDetect_Exception
258
+ *
259
+ * @return void
268
260
*/
269
- function _setup_ok ( )
261
+ function _checkTrigram ( $ trigram )
270
262
{
271
- if (!is_array ($ this -> _lang_db )) {
263
+ if (!is_array ($ trigram )) {
272
264
if (ini_get ('magic_quotes_runtime ' )) {
273
- throw new Text_LanguageDetect_Exception ('Error loading database. Try turning magic_quotes_runtime off. ' );
274
- } else {
275
- throw new Text_LanguageDetect_Exception ('Language database is not an array. ' );
265
+ throw new Text_LanguageDetect_Exception (
266
+ 'Error loading database. Try turning magic_quotes_runtime off. ' ,
267
+ Text_LanguageDetect_Exception::MAGIC_QUOTES
268
+ );
276
269
}
277
- } elseif (empty ($ this ->_lang_db )) {
278
- throw new Text_LanguageDetect_Exception ('Language database has no elements. ' );
279
- } else {
280
- return true ;
270
+ throw new Text_LanguageDetect_Exception (
271
+ 'Language database is not an array. ' ,
272
+ Text_LanguageDetect_Exception::DB_NOT_ARRAY
273
+ );
274
+ } elseif (empty ($ trigram )) {
275
+ throw new Text_LanguageDetect_Exception (
276
+ 'Language database has no elements. ' ,
277
+ Text_LanguageDetect_Exception::DB_EMPTY
278
+ );
281
279
}
282
280
}
283
281
@@ -299,8 +297,6 @@ function _setup_ok()
299
297
*/
300
298
function omitLanguages ($ omit_list , $ include_only = false )
301
299
{
302
- $ this ->_setup_ok ();
303
-
304
300
$ deleted = 0 ;
305
301
306
302
$ omit_list = $ this ->_convertFromNameMode ($ omit_list );
@@ -360,31 +356,21 @@ function omitLanguages($omit_list, $include_only = false)
360
356
*/
361
357
function getLanguageCount ()
362
358
{
363
- $ this ->_setup_ok ();
364
-
365
359
return count ($ this ->_lang_db );
366
360
}
367
361
368
362
/**
369
- * Returns true if a given language exists
370
- *
371
- * If passed an array of names, will return true only if all exist
372
- *
373
363
* @access public
374
364
* @param mixed $lang language name or array of language names
375
365
* @return bool true if language model exists
376
- * @throws Text_LanguageDetect_Exception
377
366
*/
378
367
function languageExists ($ lang )
379
368
{
380
- $ this ->_setup_ok ();
381
-
382
369
$ lang = $ this ->_convertFromNameMode ($ lang );
383
370
// string
384
371
if (is_string ($ lang )) {
385
372
return isset ($ this ->_lang_db [strtolower ($ lang )]);
386
373
387
- // array
388
374
} elseif (is_array ($ lang )) {
389
375
foreach ($ lang as $ test_lang ) {
390
376
if (!isset ($ this ->_lang_db [strtolower ($ test_lang )])) {
@@ -393,23 +379,23 @@ function languageExists($lang)
393
379
}
394
380
return true ;
395
381
396
- // other (error)
397
382
} else {
398
- throw new Text_LanguageDetect_Exception ('Unknown type passed to languageExists() ' );
383
+ throw new Text_LanguageDetect_Exception (
384
+ 'Unknown type passed to languageExists() ' ,
385
+ Text_LanguageDetect_Exception::UNKNOWN_TYPE
386
+ );
399
387
}
400
388
}
401
389
402
390
/**
403
391
* Returns the list of detectable languages
404
392
*
405
393
* @access public
406
- * @return array the names of the languages known to this object
394
+ * @return array the names of the languages known to this object
407
395
* @throws Text_LanguageDetect_Exception
408
396
*/
409
397
function getLanguages ()
410
398
{
411
- $ this ->_setup_ok ();
412
-
413
399
return $ this ->_convertToNameMode (
414
400
array_keys ($ this ->_lang_db )
415
401
);
@@ -677,8 +663,6 @@ function _normalize_score($score, $base_count = null)
677
663
*/
678
664
function detect ($ sample , $ limit = 0 )
679
665
{
680
- $ this ->_setup_ok ();
681
-
682
666
// input check
683
667
if (!Text_LanguageDetect_Parser::validateString ($ sample )) {
684
668
return array ();
@@ -739,7 +723,10 @@ function detect($sample, $limit = 0)
739
723
if (is_array ($ blocks )) {
740
724
$ present_blocks = array_keys ($ blocks );
741
725
} else {
742
- throw new Text_LanguageDetect_Exception ('Error during block detection ' );
726
+ throw new Text_LanguageDetect_Exception (
727
+ 'Error during block detection ' ,
728
+ Text_LanguageDetect_Exception::ERR_BLOCK_DETECTION
729
+ );
743
730
}
744
731
745
732
$ possible_langs = array ();
@@ -921,19 +908,25 @@ function detectUnicodeBlocks($str, $skip_symbols)
921
908
{
922
909
// input check
923
910
if (!is_bool ($ skip_symbols )) {
924
- throw new Text_LanguageDetect_Exception ('Second parameter must be boolean ' );
911
+ throw new Text_LanguageDetect_Exception (
912
+ 'Second parameter must be boolean ' ,
913
+ Text_LanguageDetect_Exception::ERR_PARAM_TYPE
914
+ );
925
915
}
926
916
927
917
if (!is_string ($ str )) {
928
- throw new Text_LanguageDetect_Exception ('First parameter was not a string ' );
918
+ throw new Text_LanguageDetect_Exception (
919
+ 'First parameter was not a string ' ,
920
+ Text_LanguageDetect_Exception::ERR_PARAM_TYPE
921
+ );
929
922
}
930
923
931
924
$ sample_obj = new Text_LanguageDetect_Parser ($ str );
932
925
$ sample_obj ->prepareUnicode ();
933
926
$ sample_obj ->prepareTrigram (false );
934
927
$ sample_obj ->setUnicodeSkipSymbols ($ skip_symbols );
935
928
$ sample_obj ->analyze ();
936
- $ blocks =& $ sample_obj ->getUnicodeBlocks ();
929
+ $ blocks = $ sample_obj ->getUnicodeBlocks ();
937
930
unset($ sample_obj );
938
931
return $ blocks ;
939
932
}
@@ -958,21 +951,30 @@ function unicodeBlockName($unicode) {
958
951
959
952
// input check
960
953
if ($ this ->utf8strlen ($ unicode ) > 1 ) {
961
- throw new Text_LanguageDetect_Exception ('Pass this function only a single char ' );
954
+ throw new Text_LanguageDetect_Exception (
955
+ 'Pass this function only a single char ' ,
956
+ Text_LanguageDetect_Exception::ERR_PARAM_TYPE
957
+ );
962
958
}
963
959
964
960
$ unicode = $ this ->_utf8char2unicode ($ unicode );
965
961
966
962
if ($ unicode == -1 ) {
967
- throw new Text_LanguageDetect_Exception ('Malformatted char ' );
963
+ throw new Text_LanguageDetect_Exception (
964
+ 'Malformatted char ' ,
965
+ Text_LanguageDetect_Exception::ERR_INVALID_CHAR
966
+ );
968
967
}
969
968
970
969
// input check
971
970
} elseif (!is_int ($ unicode )) {
972
- throw new Text_LanguageDetect_Exception ('Input must be of type string or int. ' );
971
+ throw new Text_LanguageDetect_Exception (
972
+ 'Input must be of type string or int. ' ,
973
+ Text_LanguageDetect_Exception::ERR_PARAM_TYPE
974
+ );
973
975
}
974
976
975
- $ blocks =& $ this ->_read_unicode_block_db ();
977
+ $ blocks = $ this ->_read_unicode_block_db ();
976
978
977
979
$ result = $ this ->_unicode_block_name ($ unicode , $ blocks );
978
980
@@ -1046,9 +1048,9 @@ function _unicode_block_name($unicode, &$blocks, $block_count = -1) {
1046
1048
*
1047
1049
* @access protected
1048
1050
* @return array the database of unicode block definitions
1049
- * @throws Text_LanguageDetect_Exception
1051
+ * @throws Text_LanguageDetect_Exception
1050
1052
*/
1051
- function & _read_unicode_block_db () {
1053
+ function _read_unicode_block_db () {
1052
1054
// since the unicode definitions are always going to be the same,
1053
1055
// might as well share the memory for the db with all other instances
1054
1056
// of this class
@@ -1082,8 +1084,6 @@ function &_read_unicode_block_db() {
1082
1084
*/
1083
1085
function languageSimilarity ($ lang1 = null , $ lang2 = null )
1084
1086
{
1085
- $ this ->_setup_ok ();
1086
-
1087
1087
$ lang1 = $ this ->_convertFromNameMode ($ lang1 );
1088
1088
$ lang2 = $ this ->_convertFromNameMode ($ lang2 );
1089
1089
if ($ lang1 != null ) {
@@ -1184,10 +1184,6 @@ function languageSimilarity($lang1 = null, $lang2 = null)
1184
1184
function clusterLanguages ()
1185
1185
{
1186
1186
// todo: set the maximum number of clusters
1187
-
1188
- // setup check
1189
- $ this ->_setup_ok ();
1190
-
1191
1187
// return cached result, if any
1192
1188
if (isset ($ this ->_clusters )) {
1193
1189
return $ this ->_clusters ;
@@ -1201,7 +1197,10 @@ function clusterLanguages()
1201
1197
1202
1198
foreach ($ langs as $ lang ) {
1203
1199
if (!isset ($ this ->_lang_db [$ lang ])) {
1204
- throw new Text_LanguageDetect_Exception ("missing $ lang! \n" );
1200
+ throw new Text_LanguageDetect_Exception (
1201
+ "missing $ lang! " ,
1202
+ Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
1203
+ );
1205
1204
}
1206
1205
}
1207
1206
@@ -1229,7 +1228,10 @@ function clusterLanguages()
1229
1228
1230
1229
if (!$ highest_key1 ) {
1231
1230
// should not ever happen
1232
- throw new Text_LanguageDetect_Exception ("no highest key? (step: $ i) " );
1231
+ throw new Text_LanguageDetect_Exception (
1232
+ "no highest key? (step: $ i) " ,
1233
+ Text_LanguageDetect_Exception::NO_HIGHEST_KEY
1234
+ );
1233
1235
}
1234
1236
1235
1237
if ($ highest_score == 0 ) {
@@ -1365,11 +1367,10 @@ function clusterLanguages()
1365
1367
* @access public
1366
1368
* @param string $str input string
1367
1369
* @return array language scores (only those compared)
1368
- * @throws Text_LanguageDetect_Exception
1370
+ * @throws Text_LanguageDetect_Exception
1369
1371
*/
1370
1372
function clusteredSearch ($ str )
1371
1373
{
1372
-
1373
1374
// input check
1374
1375
if (!Text_LanguageDetect_Parser::validateString ($ str )) {
1375
1376
return array ();
0 commit comments