@@ -45,18 +45,19 @@ def detect_languages(texts: Iterable[str]) -> list[t.Optional[str]]:
4545 ]
4646
4747
@functools.cache
def get_lang_to_models() -> dict[str, str]:
    """
    Get a mapping of ISO language code to an installed spacy language model.

    Model names are expected to look like ``<lang>_<rest>``; the text before
    the first underscore is used as the language code. Names without an
    underscore are reported via a warning log message and skipped.

    If several installed models share a language code, the first one listed by
    ``spacy.util.get_installed_models()`` is kept and later ones are ignored,
    so a later model never silently replaces an earlier one.

    Note:
        The result is memoized with :func:`functools.cache`: every caller gets
        the same dict object, so treat it as read-only. Models installed after
        the first call are not picked up unless the cache is cleared
        (``get_lang_to_models.cache_clear()``).
    """
    lang_to_models: dict[str, str] = {}
    for model in spacy.util.get_installed_models():
        lang, sep, _ = model.partition("_")
        if sep:
            # keep the first model seen for a language; don't overwrite it
            lang_to_models.setdefault(lang, model)
        else:
            LOGGER.warning("found unexpected spacy model name: %s", model)

    return lang_to_models
6061
6162
6263@functools .lru_cache (maxsize = 10 )
@@ -89,6 +90,41 @@ def load_spacy_lang(name: str, **kwargs) -> SpacyLang:
8990 return spacy_lang
9091
9192
def process_text_into_doc(
    text: str,
    *,
    max_len: t.Optional[int] = 1000,
    fallback_lang: t.Optional[str] = "en",
    **kwargs,
) -> t.Optional[SpacyDoc]:
    """
    Process a single text into a spacy document, using an installed spacy model
    for the text's detected language.

    Args:
        text
        max_len: Maximum number of chars (code points) in text to include
            when identifying its language and processing into a spacy document.
            If None, the text is not truncated.
        fallback_lang: Fallback language used when :func:`detect_language()`
            yields no (falsy) result for the text -- per the original note,
            this is meant to cover low-confidence predictions.
        **kwargs: Passed as-is into :func:`load_spacy_lang()`.

    Returns:
        Spacy document for the cleaned (and possibly truncated) text, or None
        if no installed spacy model is available for the resolved language.
    """
    # clean up whitespace, to make it easier on lang detector:
    # trim both ends and turn every newline into a single space
    text = text.strip().replace("\n", " ")
    # truncate texts, optionally
    if max_len is not None:
        text = text[:max_len]
    # identify most probable language (w/ optional fallback) for text
    lang = detect_language(text) or fallback_lang
    lang_models = get_lang_to_models()
    if lang not in lang_models:
        LOGGER.info(
            "unable to load spacy model for text with lang='%s'; doc set to null ...",
            lang,
        )
        return None

    spacy_lang: SpacyLang = load_spacy_lang(lang_models[lang], **kwargs)
    return spacy_lang(text)
126+
127+
92128def process_texts_into_docs (
93129 texts : Iterable [str ],
94130 * ,
@@ -120,7 +156,7 @@ def process_texts_into_docs(
120156 lang_models = get_lang_to_models ()
121157 for lang , tl_grp in itertools .groupby (text_langs , key = itemgetter (1 )):
122158 if lang in lang_models :
123- spacy_lang = load_spacy_lang (lang_models [lang ][ 0 ] , ** kwargs )
159+ spacy_lang = load_spacy_lang (lang_models [lang ], ** kwargs )
124160 spacy_docs = spacy_lang .pipe ((text for text , _ in tl_grp ), n_process = 1 )
125161 for spacy_doc in spacy_docs :
126162 yield spacy_doc
0 commit comments