1- #!/usr/bin/env python3
21# -*- coding: utf-8 -*-
32
43"""
4140# for nltk, use the environment variable NLTK_DATA to specify a custom data path (instead of $HOME/.nltk).
4241# for spacy, use SPACY_DATA; the latter is a custom Mathics variable.
4342
44- from mathics .builtin .base import Builtin , MessageException
45- from mathics .builtin .numbers .randomnumbers import RandomEnv
46- from mathics .builtin .codetables import iso639_3
47- from mathics .builtin .strings import to_regex , anchor_pattern
48- from mathics .core .atoms import Integer , String , Real
49- from mathics .core .expression import (
50- Expression ,
51- Symbol ,
52- strip_context ,
53- string_list ,
54- )
55- from mathics .core .listg import to_list_expression
56- from mathics .core .symbols import SymbolDivide
57- from mathics .core .systemsymbols import SymbolN
58-
43+ import heapq
44+ import itertools
45+ import math
5946import os
6047import re
61- import itertools
6248from itertools import chain
63- import heapq
64- import math
49+
50+ from mathics .builtin .atomic .strings import anchor_pattern , to_regex
51+ from mathics .builtin .base import Builtin , MessageException
52+ from mathics .builtin .codetables import iso639_3
53+ from mathics .builtin .numbers .randomnumbers import RandomEnv
54+ from mathics .core .atoms import Integer , Real , String
55+ from mathics .core .convert .expression import ListExpression , to_mathics_list
56+ from mathics .core .expression import Expression
57+ from mathics .core .symbols import Symbol , SymbolList , SymbolTrue , strip_context
58+ from mathics .core .systemsymbols import SymbolN
6559
6660
6761def _parse_nltk_lookup_error (e ):
@@ -138,7 +132,6 @@ def _init_nltk_maps():
138132 }
139133 )
140134
141-
142135except ImportError :
143136 pass
144137
@@ -240,7 +233,7 @@ def _load_spacy(self, evaluation, options):
240233 language_name = String ("Undefined" )
241234 if isinstance (language_name , String ):
242235 language_code = _SpacyBuiltin ._language_codes .get (
243- language_name .get_string_value ()
236+ language_name .value
244237 )
245238 if not language_code :
246239 evaluation .message (
@@ -256,7 +249,7 @@ def _load_spacy(self, evaluation, options):
256249 if "SPACY_DATA" in os .environ :
257250 instance = spacy .load (language_code , via = os .environ ["SPACY_DATA" ])
258251 else :
259- instance = spacy .load (language_code )
252+ instance = spacy .load (f" { language_code } _core_web_sm" )
260253
261254 _SpacyBuiltin ._spacy_instances [language_code ] = instance
262255 return instance
@@ -296,7 +289,7 @@ class WordFrequencyData(_SpacyBuiltin):
296289
297290 def apply (self , word , evaluation , options ):
298291 "WordFrequencyData[word_String, OptionsPattern[%(name)s]]"
299- doc = self ._nlp (word .get_string_value () , evaluation , options )
292+ doc = self ._nlp (word .value , evaluation , options )
300293 frequency = 0.0
301294 if doc :
302295 if len (doc ) == 1 :
@@ -317,7 +310,7 @@ class WordCount(_SpacyBuiltin):
317310
318311 def apply (self , text , evaluation , options ):
319312 "WordCount[text_String, OptionsPattern[%(name)s]]"
320- doc = self ._nlp (text .get_string_value () , evaluation , options )
313+ doc = self ._nlp (text .value , evaluation , options )
321314 if doc :
322315 punctuation = spacy .parts_of_speech .PUNCT
323316 return Integer (sum (1 for word in doc if word .pos != punctuation ))
@@ -339,27 +332,24 @@ class TextWords(_SpacyBuiltin):
339332
def apply(self, text, evaluation, options):
    "TextWords[text_String, OptionsPattern[%(name)s]]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Runs the text through self._nlp and returns a Mathics list with one
    # String per token whose part of speech is not punctuation.  When
    # self._nlp yields nothing truthy the method falls through and returns
    # None.
    doc = self._nlp(text.value, evaluation, options)
    if not doc:
        return None

    punct = spacy.parts_of_speech.PUNCT
    tokens = [String(token.text) for token in doc if token.pos != punct]
    return ListExpression(*tokens)
350341
def apply_n(self, text, n, evaluation, options):
    "TextWords[text_String, n_Integer, OptionsPattern[%(name)s]]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Returns a Mathics list holding at most ``n`` tokens of ``text``,
    # skipping tokens whose part of speech is punctuation.  When self._nlp
    # yields nothing truthy the method falls through and returns None.
    #
    # No debugger hook belongs here: importing and invoking one on every
    # call would block evaluation and require an extra package at runtime.
    doc = self._nlp(text.value, evaluation, options)
    if not doc:
        return None

    punctuation = spacy.parts_of_speech.PUNCT
    words = (String(word.text) for word in doc if word.pos != punctuation)
    # islice stops after n items, so the remaining tokens are never wrapped.
    return ListExpression(*itertools.islice(words, n.get_int_value()))
364354
365355
@@ -387,20 +377,18 @@ def apply(self, text, evaluation, options):
387377 "TextSentences[text_String, OptionsPattern[%(name)s]]"
388378 doc = self ._nlp (text .get_string_value (), evaluation , options )
389379 if doc :
390- return string_list (
391- "List" , [String (sent .text ) for sent in doc .sents ], evaluation
380+ return ListExpression (
381+ * [String (sent .text ) for sent in doc .sents ]
392382 )
393383
def apply_n(self, text, n, evaluation, options):
    "TextSentences[text_String, n_Integer, OptionsPattern[%(name)s]]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Returns a Mathics list with the text of at most ``n`` sentences taken
    # from ``doc.sents``.  When self._nlp yields nothing truthy the method
    # falls through and returns None.
    doc = self._nlp(text.get_string_value(), evaluation, options)
    if not doc:
        return None

    sentences = (String(sent.text) for sent in doc.sents)
    # The slice must be unpacked: ListExpression takes its elements as
    # positional arguments, so passing the islice object itself would yield
    # a one-element list containing an iterator instead of the sentences.
    return ListExpression(*itertools.islice(sentences, n.get_int_value()))
405393
406394
@@ -433,11 +421,11 @@ def filter_words(words):
433421 elif not is_stop (s ):
434422 yield String (s )
435423
436- return string_list ( "List" , filter_words (l .leaves ), evaluation )
424+ return ListExpression ( * list ( filter_words (l .elements )) )
437425
438426 def apply_string (self , s , evaluation , options ):
439427 "DeleteStopwords[s_String, OptionsPattern[%(name)s]]"
440- doc = self ._nlp (s .get_string_value () , evaluation , options )
428+ doc = self ._nlp (s .value , evaluation , options )
441429 if doc :
442430 is_stop = self ._is_stop_lambda (evaluation , options )
443431 if is_stop :
@@ -473,18 +461,18 @@ class WordFrequency(_SpacyBuiltin):
473461
474462 def apply (self , text , word , evaluation , options ):
475463 "WordFrequency[text_String, word_, OptionsPattern[%(name)s]]"
476- doc = self ._nlp (text .get_string_value () , evaluation , options )
464+ doc = self ._nlp (text .value , evaluation , options )
477465 if not doc :
478466 return
479467 if isinstance (word , String ):
480- words = set (( word .get_string_value (),) )
468+ words = set ([ word .value ] )
481469 elif word .get_head_name () == "System`Alternatives" :
482- if not all (isinstance (a , String ) for a in word .leaves ):
470+ if not all (isinstance (a , String ) for a in word .elements ):
483471 return # error
484- words = set (a .get_string_value () for a in word .leaves )
472+ words = set (a .value for a in word .elements )
485473 else :
486474 return # error
487- ignore_case = self .get_option (options , "IgnoreCase" , evaluation ). is_true ()
475+ ignore_case = self .get_option (options , "IgnoreCase" , evaluation ) is SymbolTrue
488476 if ignore_case :
489477 words = [w .lower () for w in words ]
490478 n = 0
@@ -494,7 +482,9 @@ def apply(self, text, word, evaluation, options):
494482 text = text .lower ()
495483 if text in words :
496484 n += 1
497- return Expression (SymbolN , Expression (SymbolDivide , Integer (n ), Integer (len (doc ))))
485+ return Expression (
486+ SymbolN , Integer (n ) / Integer (len (doc ))
487+ )
498488
499489
500490class Containing (Builtin ):
@@ -505,12 +495,12 @@ def _cases(doc, form):
505495 if isinstance (form , String ):
506496 generators = [_forms .get (form .get_string_value ())]
507497 elif form .get_head_name () == "System`Alternatives" :
508- if not all (isinstance (f , String ) for f in form .leaves ):
498+ if not all (isinstance (f , String ) for f in form .elements ):
509499 return # error
510- generators = [_forms .get (f .get_string_value ()) for f in form .leaves ]
500+ generators = [_forms .get (f .get_string_value ()) for f in form .elements ]
511501 elif form .get_head_name () == "PyMathics`Containing" :
512- if len (form .leaves ) == 2 :
513- for t in _containing (doc , * form .leaves ):
502+ if len (form .elements ) == 2 :
503+ for t in _containing (doc , * form .elements ):
514504 yield t
515505 return
516506 else :
@@ -582,18 +572,17 @@ class TextCases(_SpacyBuiltin):
582572
def apply(self, text, form, evaluation, options):
    "TextCases[text_String, form_, OptionsPattern[%(name)s]]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Collects the ``.text`` of every item produced by _cases(doc, form) and
    # hands the plain Python strings to to_mathics_list for conversion.
    # Returns None when self._nlp yields nothing truthy.
    doc = self._nlp(text.value, evaluation, options)
    if not doc:
        return None

    matched_texts = [match.text for match in _cases(doc, form)]
    return to_mathics_list(*matched_texts)
588578
def apply_n(self, text, form, n, evaluation, options):
    "TextCases[text_String, form_, n_Integer, OptionsPattern[%(name)s]]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Same as the unbounded variant, but only the first ``n`` items produced
    # by _cases(doc, form) are consumed.  Returns None when self._nlp yields
    # nothing truthy.
    doc = self._nlp(text.value, evaluation, options)
    if not doc:
        return None

    matched_texts = (match.text for match in _cases(doc, form))
    first_n = itertools.islice(matched_texts, n.value)
    return to_mathics_list(*first_n)
599588
@@ -611,9 +600,9 @@ class TextPosition(_SpacyBuiltin):
611600
def apply(self, text, form, evaluation, options):
    "TextPosition[text_String, form_, OptionsPattern[%(name)s]]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Maps every item produced by _cases(doc, form) through _position and
    # returns the results as a Mathics list.  Returns None when self._nlp
    # yields nothing truthy.
    doc = self._nlp(text.value, evaluation, options)
    if not doc:
        return None

    positions = [_position(match) for match in _cases(doc, form)]
    return ListExpression(*positions)
617606
618607 def apply_n (self , text , form , n , evaluation , options ):
619608 "TextPosition[text_String, form_, n_Integer, OptionsPattern[%(name)s]]"
@@ -682,7 +671,7 @@ def apply(self, text, evaluation, options):
682671 if doc :
683672 tree = self ._to_tree (list (doc ))
684673 sents = ["(Sentence, (%s))" % self ._to_constituent_string (x ) for x in tree ]
685- return Expression ( "List" , * [ String ( s ) for s in sents ] )
674+ return to_mathics_list ( * sents , elements_conversion_fn = String )
686675
687676
688677class WordSimilarity (_SpacyBuiltin ):
@@ -737,17 +726,17 @@ def apply_pair(self, text1, i1, text2, i2, evaluation, options):
737726 i1 .get_head_name () == "System`List"
738727 and i2 .get_head_name () == "System`List"
739728 ):
740- if len (i1 .leaves ) != len (i2 .leaves ):
729+ if len (i1 .elements ) != len (i2 .elements ):
741730 evaluation .message ("TextSimilarity" , "idxfmt" )
742731 return
743732 if any (
744- not all (isinstance (i , Integer ) for i in l .leaves )
733+ not all (isinstance (i , Integer ) for i in l .elements )
745734 for l in (i1 , i2 )
746735 ):
747736 evaluation .message ("TextSimilarity" , "idxfmt" )
748737 return
749- indices1 = [i .get_int_value () for i in i1 .leaves ]
750- indices2 = [i .get_int_value () for i in i2 .leaves ]
738+ indices1 = [i .get_int_value () for i in i1 .elements ]
739+ indices2 = [i .get_int_value () for i in i2 .elements ]
751740 multiple = True
752741 elif isinstance (i1 , Integer ) and isinstance (i2 , Integer ):
753742 indices1 = [i1 .get_int_value ()]
@@ -814,11 +803,11 @@ def apply(self, word, evaluation):
814803
def apply_list(self, words, evaluation):
    "WordStem[words_List]"
    # NOTE(review): the string literal above looks like the rule pattern the
    # Builtin machinery reads, not free-form documentation -- keep it
    # byte-identical and document the method with comments instead.
    #
    # Stems every element of ``words`` with the stemmer returned by
    # self._get_porter_stemmer().  If any element is not a String the method
    # does nothing and returns None.
    elements = words.elements
    if not all(isinstance(word, String) for word in elements):
        return None

    stemmer = self._get_porter_stemmer()
    # Build the result with ListExpression and read strings through
    # ``.value``, matching the other list-returning builtins in this module
    # instead of the string-headed Expression("List", ...) form.
    return ListExpression(
        *[String(stemmer.stem(word.value)) for word in elements]
    )
823812
824813
@@ -1163,10 +1152,10 @@ def _parse_word(self, word):
11631152 if isinstance (word , String ):
11641153 return word .get_string_value ().lower ()
11651154 elif word .get_head_name () == "System`List" :
1166- if len (word .leaves ) == 3 and all (
1167- isinstance (s , String ) for s in word .leaves
1155+ if len (word .elements ) == 3 and all (
1156+ isinstance (s , String ) for s in word .elements
11681157 ):
1169- return tuple (s .get_string_value () for s in word .leaves )
1158+ return tuple (s .get_string_value () for s in word .elements )
11701159
11711160 def _standard_property (
11721161 self , py_word , py_form , py_property , wordnet , language_code , evaluation
@@ -1465,7 +1454,6 @@ class LanguageIdentify(Builtin):
14651454 def apply (self , text , evaluation ):
14661455 "LanguageIdentify[text_String]"
14671456 import langid # see https://github.com/saffsd/langid.py
1468-
14691457 # an alternative: https://github.com/Mimino666/langdetect
14701458 import pycountry
14711459
0 commit comments