Skip to content

Commit f019fbe

Browse files
ekaf and tomaarsen authored
Support both iso639-3 codes and BCP-47 language tags (nltk#3060)
* Add support for iso639-3 language codes
* Add support for retired language codes
* Move langnames.py to the top-level
* Add langcode() function
* Add iso639retired dictionary
* Improve wrapper functions
* Add module docstring with doctest
* Add 2-letter language codes
* Add regular expression check
* Improve inverse lookup of retired codes
* Support BCP-47
* Avoid deprecated langcodes
* Set stack level for warnings to warn on the langname call

  Now it throws e.g.
  ```
  ...\nltk_3060.py:9: UserWarning: Shortening 'smo' to 'sm'
    print(f"{lang}: {langname(code)}")
  ```
  Rather than
  ```
  ...\nltk\langnames.py:64: UserWarning: Shortening zha to za
    warn(f"Shortening {code} to {code2}")
  ```
* Dict key membership is equivalent to dict membership
* Resolve bug: subtag -> tag
* Capitalize BCP47 in CorpusReader name
* Reimplement removed type hint changes from nltk#3081

Co-authored-by: Tom Aarsen <[email protected]>
1 parent 3ca43e2 commit f019fbe

File tree

4 files changed

+953
-0
lines changed

4 files changed

+953
-0
lines changed

nltk/corpus/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@
7474
alpino: AlpinoCorpusReader = LazyCorpusLoader(
7575
"alpino", AlpinoCorpusReader, tagset="alpino"
7676
)
77+
bcp47: BCP47CorpusReader = LazyCorpusLoader(
78+
"bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
79+
)
7780
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
7881
"brown",
7982
CategorizedTaggedCorpusReader,

nltk/corpus/reader/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@
105105
from nltk.corpus.reader.comparative_sents import *
106106
from nltk.corpus.reader.panlex_lite import *
107107
from nltk.corpus.reader.panlex_swadesh import *
108+
from nltk.corpus.reader.bcp47 import *
108109

109110
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
110111
# the function bracket_parse() defined in nltk.tree:
@@ -181,4 +182,5 @@
181182
"UnicharsCorpusReader",
182183
"MWAPPDBCorpusReader",
183184
"PanlexSwadeshCorpusReader",
185+
"BCP47CorpusReader",
184186
]

nltk/corpus/reader/bcp47.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
# Natural Language Toolkit: BCP-47 language tags
2+
#
3+
# Copyright (C) 2022 NLTK Project
4+
# Author: Eric Kafe <[email protected]>
5+
# URL: <https://www.nltk.org/>
6+
# For license information, see LICENSE.TXT
7+
8+
import re
9+
from warnings import warn
10+
from xml.etree import ElementTree as et
11+
12+
from nltk.corpus.reader import CorpusReader
13+
14+
15+
class BCP47CorpusReader(CorpusReader):
16+
"""
17+
Parse BCP-47 composite language tags
18+
19+
Supports all the main subtags, and the 'u-sd' extension:
20+
21+
>>> from nltk.corpus import bcp47
22+
>>> bcp47.name('oc-gascon-u-sd-fr64')
23+
'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'
24+
25+
Can load a conversion table to Wikidata Q-codes:
26+
>>> bcp47.load_wiki_q()
27+
>>> bcp47.wiki_q['en-GI-spanglis']
28+
'Q79388'
29+
30+
"""
31+
32+
def __init__(self, root, fileids):
    """
    Set up the reader and load the BCP-47 data files

    Fills ``self.db`` from the IANA language subtag registry and
    ``self.subdiv`` from the CLDR English subdivision names, then builds
    the casing and format tables through ``self.morphology()``.
    """
    super().__init__(root, fileids)
    # data_dict() writes language names into this mapping, so it has to
    # exist before the registry is converted.
    self.langcode = {}
    with self.open("iana/language-subtag-registry.txt") as registry:
        registry_records = registry.read().split("%%\n")
        self.db = self.data_dict(registry_records)
    with self.open("cldr/common-subdivisions-en.xml") as cldr:
        # iterfind() is lazy: consume it while the file is still open.
        subdivisions = et.parse(cldr).iterfind(
            "localeDisplayNames/subdivisions/subdivision"
        )
        self.subdiv = self.subdiv_dict(subdivisions)
    self.morphology()
43+
44+
def load_wiki_q(self):
    """Read the Wikidata Q-code table into ``self.wiki_q`` (on demand only)"""
    with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as table:
        rows = table.read().strip().split("\n")
        # The first row is dropped before conversion.
        self.wiki_q = self.wiki_dict(rows[1:])
48+
49+
def wiki_dict(self, lines):
    """
    Build a BCP-47 tag -> Wikidata Q-code dictionary

    Each line holds two tab-separated columns: an entity path whose last
    "/"-separated component is the Q-code, followed by the BCP-47 tag.
    """
    q_codes = {}
    for line in lines:
        columns = line.strip().split("\t")
        entity, tag = columns[0], columns[1]
        # Keep only what follows the last "/" of the entity path.
        q_codes[tag] = entity.rpartition("/")[2]
    return q_codes
55+
56+
def subdiv_dict(self, subdivs):
    """Map the ``type`` attribute of each CLDR subdivision to its text"""
    names = {}
    for element in subdivs:
        names[element.attrib["type"]] = element.text
    return names
59+
60+
def morphology(self):
    """
    Set up the per-label casing functions and format checks

    ``self.casing`` maps a subtag label to the ``str`` method applied to a
    subtag before lookup; ``self.format`` maps a label to the compiled
    pattern a subtag must fully match to be considered for that label.
    """
    self.casing = dict(
        language=str.lower,
        extlang=str.lower,
        script=str.title,
        region=str.upper,
        variant=str.lower,
    )
    digit, lower, upper, alnum = "[0-9]", "[a-z]", "[A-Z]", "[a-zA-Z0-9]"
    patterns = {
        "language": lower * 3 + "?",  # two or three lowercase letters
        "extlang": lower * 3,
        "script": upper + lower * 3,
        "region": "(" + upper * 2 + ")|(" + digit * 3 + ")",
        "variant": alnum * 4 + (alnum + "?") * 4,  # four to eight characters
        "singleton": lower,
    }
    self.format = {
        label: re.compile(pattern) for label, pattern in patterns.items()
    }
80+
81+
def data_dict(self, records):
    """
    Convert the BCP-47 language subtag registry to a dictionary

    :param records: the registry text split on its ``%%`` separators; the
        first item is the ``File-Date`` header, the others are one record
        each ("Type", then "Subtag" or "Tag", then the remaining fields).
    :return: ``{type: {tag: fields}}`` for current records, plus
        ``{"deprecated": {type: {tag: fields}}}`` for records carrying a
        "Deprecated" field. A field that occurs once is stored as a
        string, a repeated field as a list of strings.

    Side effects: sets ``self.version`` to the registry date and adds every
    description of every non-deprecated language to ``self.langcode``.
    Empty or truncated records (fewer than two fields) are skipped.
    """
    self.version = records[0].replace("File-Date:", "").strip()
    dic = {
        "deprecated": {
            label: {}
            for label in (
                "language",
                "extlang",
                "script",
                "region",
                "variant",
                "redundant",
                "grandfathered",
            )
        }
    }
    for record in records[1:]:
        # Parse the record into [field-name, field-body] pairs.
        fields = []
        for line in record.strip().split("\n"):
            # Split on the first separator only, so that a body which
            # itself contains ": " is kept whole.
            field_name, sep, body = line.partition(": ")
            # A folded (continuation) line starts with whitespace; a line
            # without any separator is treated the same way.
            if (line[:1].isspace() or not sep) and fields:
                fields[-1][1] += " " + line.strip()
            elif sep:
                fields.append([field_name, body])
        if len(fields) < 2:  # blank or truncated record
            continue
        typ = fields[0][1]
        tag = fields[1][1]
        dic.setdefault(typ, {})
        grouped = {}
        for key, val in fields[2:]:
            grouped.setdefault(key, []).append(val)
        # Check for the actual field, not for the word anywhere in the text.
        deprecated = "Deprecated" in grouped
        if typ == "language" and not deprecated:
            # Done after folding, so only complete descriptions are indexed.
            for description in grouped.get("Description", ()):
                self.langcode[description] = tag
        subfields = {
            key: (vals[0] if len(vals) == 1 else vals)
            for key, vals in grouped.items()
        }
        if deprecated:
            dic["deprecated"].setdefault(typ, {})[tag] = subfields
        else:
            dic[typ][tag] = subfields
    return dic
126+
127+
def val2str(self, val):
    """
    Return only the first value of a possibly multi-valued field

    :param val: a single value, or a list of values for a repeated field.
    :return: ``val[0]`` when ``val`` is a list (or a list subclass),
        otherwise ``val`` unchanged.
    """
    if isinstance(val, list):
        return val[0]
    return val
133+
134+
def lang2str(self, lg_record):
    """
    Join the names of a parsed tag into one colon-separated string

    The "language" entry is required and comes first; "extlang", "script",
    "region", "variant" and "extension" follow in that order when present.
    """
    optional = ("extlang", "script", "region", "variant", "extension")
    parts = [f"{lg_record['language']}"]
    parts.extend(f"{lg_record[label]}" for label in optional if label in lg_record)
    return ": ".join(parts)
141+
142+
def parse_tag(self, tag):
    """
    Convert a BCP-47 tag to a dictionary of labelled subtags

    Subtags are matched left to right against the labels "language",
    "extlang", "script", "region" and up to two "variant"s; each label is
    tried at most once, in that order. The first subtag that matches no
    label starts the extension, which consumes the rest of the tag:

    - ``u-sd-<code>`` is resolved through ``self.subdiv``, or reported as
      ``<Unknown subdivision: <code>>`` when the code is not listed;
    - anything else is returned lowercased as-is, or wrapped in
      ``<Invalid extension: ...>`` (with a warning) when it does not start
      with a single lowercase letter.

    :param tag: the hyphen-separated language tag.
    :return: a dict mapping each recognized label (and possibly
        "extension") to a description string.

    A warning is issued for deprecated subtags, naming the preferred value
    when the registry provides one.
    """
    subtags = tag.split("-")
    lang = {}
    labels = ["language", "extlang", "script", "region", "variant", "variant"]
    while subtags and labels:
        subtag = subtags.pop(0)
        found = False
        while labels:
            label = labels.pop(0)
            subtag = self.casing[label](subtag)
            if self.format[label].fullmatch(subtag):
                if subtag in self.db[label]:
                    found = True
                    valstr = self.val2str(self.db[label][subtag]["Description"])
                    if label == "variant" and label in lang:
                        lang[label] += ": " + valstr
                    else:
                        lang[label] = valstr
                    break
                elif subtag in self.db["deprecated"][label]:
                    found = True
                    record = self.db["deprecated"][label][subtag]
                    note = f"The {subtag!r} {label} code is deprecated"
                    if "Preferred-Value" in record:
                        prefer = self.val2str(record["Preferred-Value"])
                        note += f", prefer {prefer!r}"
                    lang[label] = self.val2str(record["Description"])
                    warn(note)
                    break
        if not found:
            # Only take the subdivision branch when the code is really
            # there; "xx-u" and "xx-u-sd" fall through to the generic case.
            if subtag == "u" and len(subtags) >= 2 and subtags[0] == "sd":
                sd = subtags[1]  # CLDR regional subdivisions
                if sd in self.subdiv:
                    ext = self.subdiv[sd]
                else:
                    ext = f"<Unknown subdivision: {sd}>"
            else:  # other extension subtags are not supported yet
                ext = "-".join([subtag, *subtags]).lower()
                if not self.format["singleton"].fullmatch(subtag):
                    ext = f"<Invalid extension: {ext}>"
                    warn(ext)
            lang["extension"] = ext
            subtags = []
    return lang
190+
191+
def name(self, tag):
    """
    Convert a BCP-47 tag to a colon-separated string of subtag names

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('ca-Latn-ES-valencia')
    'Catalan: Latin: Spain: Valencian'

    Whole-tag entries of the registry ("redundant" and "grandfathered",
    current or deprecated) are looked up first and reported with a
    warning; for a multi-valued description only the first value is
    returned. Any other tag is parsed subtag by subtag.

    :param tag: the language tag to name.
    :return: the name, or None (after a warning) when the tag cannot be
        parsed.
    """
    for label in ["redundant", "grandfathered"]:
        val = None
        if tag in self.db[label]:
            val = self.val2str(self.db[label][tag]["Description"])
            note = f"The {tag!r} code is {label}"
        elif tag in self.db["deprecated"][label]:
            record = self.db["deprecated"][label][tag]
            val = self.val2str(record["Description"])
            note = f"The {tag!r} code is {label} and deprecated"
            if "Preferred-Value" in record:
                prefer = record["Preferred-Value"]
                note += f", prefer {self.val2str(prefer)!r}"
        if val:
            warn(note)
            return val
    try:
        return self.lang2str(self.parse_tag(tag))
    except Exception:
        # Best effort: any parsing failure means "unknown tag". Only
        # Exception is caught so KeyboardInterrupt/SystemExit still propagate.
        warn(f"Tag {tag!r} was not recognized")
        return None

0 commit comments

Comments
 (0)