Skip to content

Commit 3432dc8

Browse files
committed
Update grammar, use dataclasses, refactor type cast, version 0.3.0-dev
1 parent a9d2928 commit 3432dc8

File tree

8 files changed

+651
-616
lines changed

8 files changed

+651
-616
lines changed

.github/workflows/test.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ubuntu-latest
1212
strategy:
1313
matrix:
14-
python-version: [3.7, 3.8, 3.9, '3.10']
14+
python-version: [3.9, '3.10', '3.11', '3.12', '3.13']
1515
steps:
1616
- uses: actions/checkout@v2
1717
- name: Set up Python ${{ matrix.python-version }}
@@ -21,8 +21,11 @@ jobs:
2121
- name: Install Python dependencies
2222
run: |
2323
python -m pip install --upgrade pip wheel
24-
pip install pytest
24+
pip install pytest mypy
2525
pip install .
26+
- name: Static type check
27+
run: |
28+
mypy gedcom7
2629
- name: Test with pytest
2730
run: |
2831
pytest

gedcom7/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33
from .parser import loads
44

5-
__version__ = "0.2.1"
5+
__version__ = "0.3.0-dev"

gedcom7/cast.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
"""Cast strings values to the appropriate data type."""
2+
3+
import re
4+
from collections.abc import Callable
5+
6+
from . import const, types, grammar
7+
8+
9+
def cast_value(text: str, type_id: str) -> "types.DataType | None":
    """Cast a payload string to the Python value registered for its type.

    The return annotation is quoted on purpose: an unquoted ``X | None`` is
    evaluated when the function is defined and raises ``TypeError`` on
    Python 3.9, where the ``|`` operator is not available on types.

    Args:
        text: The string to convert.
        type_id: Key looked up in ``const.payloads`` to find the payload type.

    Returns:
        ``None`` when ``text`` is empty or ``const.payloads`` has no
        (non-empty) entry for ``type_id``; ``text`` unchanged when
        ``CAST_FUNCTIONS`` holds no cast function for the payload type;
        otherwise whatever the registered cast function returns.

    Raises:
        ValueError: Propagated from the cast functions in this module when
            ``text`` does not conform to the payload type.
    """
    if not text:
        return None
    payload = const.payloads.get(type_id)
    if not payload:
        return None
    cast_function = CAST_FUNCTIONS.get(payload)
    if not cast_function:
        # Unknown payload types and entries explicitly mapped to ``None``
        # are passed through as plain text.
        return text
    return cast_function(text)
19+
20+
21+
def _cast_bool(value: str) -> bool:
22+
"""Cast a string to a boolean."""
23+
if value == "Y":
24+
return True
25+
if not value:
26+
return False
27+
else:
28+
raise ValueError(f"Cannot interpret {value} as boolean")
29+
30+
31+
def _cast_integer(value: str) -> int:
32+
"""Cast a string to an integer."""
33+
try:
34+
return int(value)
35+
except ValueError:
36+
raise ValueError(f"Cannot interpret {value} as integer")
37+
38+
39+
def _cast_list_text(value: str) -> list[str]:
40+
"""Cast a string to a list of strings."""
41+
return [el.strip() for el in value.split(",")]
42+
43+
44+
def _match(text: str, regex: str, type_name: str) -> re.Match:
45+
"""Match a string and raise if not compatible."""
46+
match = re.fullmatch(regex, text)
47+
if not match:
48+
raise ValueError(f"Cannot interpret {text} as type {type_name}")
49+
return match
50+
51+
52+
def _cast_personal_name(value: str) -> types.PersonalName:
    """Build a ``types.PersonalName`` from a personal-name payload.

    The full name is the payload with every ``/`` removed; the surname is
    taken from the ``surname`` group of ``grammar.personalname``.  Raises
    ``ValueError`` (via ``_match``) when the payload does not match.
    """
    parsed = _match(value, grammar.personalname, "PersonalName")
    without_slashes = "".join(value.split("/"))
    surname = parsed.group("surname")
    return types.PersonalName(fullname=without_slashes, surname=surname)
59+
60+
61+
def _cast_time(value: str) -> types.Time:
    """Build a ``types.Time`` from a time payload.

    Hour and minute are always converted to ``int``; second and fraction
    are converted only when their groups matched something and are ``None``
    otherwise.  Raises ``ValueError`` (via ``_match``) on a mismatch.
    """
    parsed = _match(value, grammar.time, "Time")
    second_text = parsed.group("second")
    fraction_text = parsed.group("fraction")
    second = int(second_text) if second_text else None
    # NOTE(review): int() drops leading zeros, so fractions "05" and "5"
    # both become 5 - confirm that consumers of Time.fraction expect this.
    fraction = int(fraction_text) if fraction_text else None
    return types.Time(
        tz=parsed.group("tz"),
        hour=int(parsed.group("hour")),
        minute=int(parsed.group("minute")),
        second=second,
        fraction=fraction,
    )
71+
72+
73+
def _cast_age(value: str) -> types.Age:
    """Build a ``types.Age`` from an age payload.

    ``grammar.age`` captures each duration component under several
    numbered group names (``months1``/``months2``, ``weeks1``..``weeks3``,
    ``days1``..``days4``); the first non-empty one is used.  The unit
    letter is stripped from each component before conversion to ``int``.
    Components that did not match are ``None``.  Raises ``ValueError``
    (via ``_match``) when the payload does not match.
    """
    parsed = _match(value, grammar.age, "Age")

    def _component(unit, *group_names):
        # Return the first non-empty capture among the groups as an int.
        for group_name in group_names:
            captured = parsed.group(group_name)
            if captured:
                return int(captured.rstrip(unit))
        return None

    return types.Age(
        agebound=parsed.group("agebound"),
        years=_component("y", "years"),
        months=_component("m", "months1", "months2"),
        weeks=_component("w", "weeks1", "weeks2", "weeks3"),
        days=_component("d", "days1", "days2", "days3", "days4"),
    )
95+
96+
97+
def _cast_enum(value: str) -> str:
    """Validate an enum payload against ``grammar.enum`` and return it.

    Raises ``ValueError`` (via ``_match``) when the payload does not match.
    """
    return _match(value, grammar.enum, "Enum").group()
101+
102+
103+
def _cast_list_enum(value: str) -> list[str]:
    """Validate an enum-list payload and split it into trimmed items.

    Raises ``ValueError`` (via ``_match``) when the payload does not match
    ``grammar.list_enum``.
    """
    matched_text = _match(value, grammar.list_enum, "ListEnum").group()
    return list(map(str.strip, matched_text.split(",")))
107+
108+
109+
def _cast_mediatype(value: str) -> types.MediaType:
    """Validate a media-type payload and wrap it in ``types.MediaType``.

    Raises ``ValueError`` (via ``_match``) when the payload does not match
    ``grammar.mediatype``.
    """
    matched_text = _match(value, grammar.mediatype, "MediaType").group()
    return types.MediaType(media_type=matched_text)
113+
114+
115+
def _cast_date_exact(value: str) -> types.DateExact:
    """Build a ``types.DateExact`` from an exact-date payload.

    Day and year are converted to ``int``; the month is kept as the matched
    text.  Raises ``ValueError`` (via ``_match``) when the payload does not
    match ``grammar.dateexact``.
    """
    parsed = _match(value, grammar.dateexact, "DateExact")
    day = int(parsed.group("day"))
    month = parsed.group("month")
    year = int(parsed.group("year"))
    return types.DateExact(day=day, month=month, year=year)
123+
124+
125+
def _cast_date(value: str) -> types.Date:
    """Build a ``types.Date`` from a single date payload.

    The year is always converted to ``int``; the day is converted only when
    present and is ``None`` otherwise.  Calendar, month and epoch are passed
    through as matched.  Raises ``ValueError`` (via ``_match``) when the
    payload does not match ``grammar.date_capture``.
    """
    parsed = _match(value, grammar.date_capture, "Date")
    day_text = parsed.group("day")
    day = int(day_text) if day_text else None
    return types.Date(
        calendar=parsed.group("calendar"),
        day=day,
        month=parsed.group("month"),
        year=int(parsed.group("year")),
        epoch=parsed.group("epoch"),
    )
135+
136+
137+
def _cast_date_period(value: str) -> types.DatePeriod:
    """Cast a string to a DatePeriod.

    The payload is matched against ``grammar.dateperiod``; each bound that
    is present (``TO ...`` and/or ``FROM ...``) is converted with
    ``_cast_date`` so that period bounds and stand-alone dates are parsed
    by the same code.

    Args:
        value: The date-period payload text.

    Returns:
        A ``types.DatePeriod`` built with ``to`` and/or ``from_`` set for
        the bounds found in ``value``; absent bounds are not passed.

    Raises:
        ValueError: If ``value`` does not match ``grammar.dateperiod``, or
            if a captured bound cannot be parsed as a date.  A bound that
            fails to parse is reported rather than silently dropped.
    """
    match = _match(value, grammar.dateperiod, "DatePeriod")
    bounds: dict[str, types.Date] = {}
    # The grammar captures the "TO" date under one of two group names,
    # depending on whether a "FROM" part precedes it.
    to_text = match.group("todate1") or match.group("todate2")
    if to_text:
        bounds["to"] = _cast_date(to_text)
    from_text = match.group("fromdate")
    if from_text:
        bounds["from_"] = _cast_date(from_text)
    return types.DatePeriod(**bounds)
163+
164+
165+
# Maps a payload type identifier to the function that converts its text.
# A ``None`` entry means the payload is kept as plain text: ``cast_value``
# returns the input unchanged when the lookup yields no function.
#
# The annotation is quoted because a module-level ``X | None`` annotation is
# evaluated at import time and raises ``TypeError`` on Python 3.9.
CAST_FUNCTIONS: "dict[str, Callable[[str], types.DataType] | None]" = {
    "Y|<NULL>": _cast_bool,
    "http://www.w3.org/2001/XMLSchema#Language": None,
    "http://www.w3.org/2001/XMLSchema#nonNegativeInteger": _cast_integer,
    "http://www.w3.org/2001/XMLSchema#string": None,
    "http://www.w3.org/ns/dcat#mediaType": _cast_mediatype,
    "https://gedcom.io/terms/v7/type-Age": _cast_age,
    "https://gedcom.io/terms/v7/type-Date": _cast_date,
    "https://gedcom.io/terms/v7/type-Date#exact": _cast_date_exact,
    "https://gedcom.io/terms/v7/type-Date#period": _cast_date_period,
    "https://gedcom.io/terms/v7/type-Enum": _cast_enum,
    "https://gedcom.io/terms/v7/type-List#Enum": _cast_list_enum,
    "https://gedcom.io/terms/v7/type-List#Text": _cast_list_text,
    "https://gedcom.io/terms/v7/type-Name": _cast_personal_name,
    "https://gedcom.io/terms/v7/type-Time": _cast_time,
}

gedcom7/grammar.py

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
f'(?P<days2>{days}))?|(?P<weeks3>{weeks})({d}(?P<days3>{days}))?|(?P<days4>{days}))'
1212
)
1313
age = f'((?P<agebound>{agebound}){d})?{ageduration}'
14+
alphanum = '[a-zA-Z0-9]'
1415
anychar = '[\t-\\U0010ffff]'
1516
atsign = '@'
1617
banned = (
@@ -23,6 +24,16 @@
2324
exttag = f'{underscore}({tagchar})+'
2425
daterestrict = 'FROM|TO|BET|AND|BEF|AFT|ABT|CAL|EST'
2526
calendar = f'(?!{daterestrict})(GREGORIAN|JULIAN|FRENCH_R|HEBREW|{exttag})'
27+
restricted_name_first = '[a-zA-Z0-9]'
28+
restricted_name_chars = '[a-zA-Z0-9!#$&\\-^_.+]'
29+
restricted_name = f'{restricted_name_first}({restricted_name_chars}){{,126}}'
30+
type_name = f'{restricted_name}'
31+
ietf_token = f'{type_name}'
32+
tchar = "[!#$%&'*+\\-.^_`|~0-9a-zA-Z]"
33+
token = f'({tchar})+'
34+
x_token = f'x-{token}'
35+
extension_token = f'({ietf_token}|{x_token})'
36+
composite_type = f'(message|multipart|{extension_token})'
2637
day = f'{integer}'
2738
stdtag = f'{ucletter}({tagchar})*'
2839
month = f'(?!{daterestrict})({stdtag}|{exttag})'
@@ -35,11 +46,38 @@
3546
dateperiod = f'((TO{d}(?P<todate1>{date}))?|FROM{d}(?P<fromdate>{date})({d}TO{d}(?P<todate2>{date}))?)'
3647
daterange = f'(BET{d}(?P<between>{date}){d}AND{d}(?P<and>{date})|AFT{d}(?P<after>{date})|BEF{d}(?P<before>{date}))'
3748
datevalue = f'({date}|{dateperiod}|{daterange}|{dateapprox})?'
49+
discrete_type = f'(text|image|audio|video|application|{extension_token})'
50+
stdenum = f'({stdtag}|{integer})'
51+
enum = f'({stdenum}|{exttag})'
3852
tag = f'({stdtag}|{exttag})'
39-
enum = f'{tag}'
4053
eol = '(\\\r(\\\n)?|\\\n)'
54+
singleton = '[0-9A-WY-Za-wy-z]'
55+
extension = f'{singleton}(-({alphanum}){{2,8}})+'
56+
extlang = '[a-zA-Z]{3}(-[a-zA-Z]{3}){,2}'
4157
fraction = '[0-9]+'
58+
irregular = (
59+
'(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|'
60+
'i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|'
61+
'i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)'
62+
)
63+
regular = (
64+
'(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|'
65+
'zh-min|zh-min-nan|zh-xiang)'
66+
)
67+
grandfathered = f'({irregular}|{regular})'
4268
hour = '([0-9]|[01][0-9]|2[0123])'
69+
subtype_name = f'{restricted_name}'
70+
iana_token = f'{subtype_name}'
71+
language = f'([a-zA-Z]{{2,3}}(-{extlang})?|[a-zA-Z]{{4}}|[a-zA-Z]{{5,8}})'
72+
script = '[a-zA-Z]{4}'
73+
region = '([a-zA-Z]{2}|[0-9]{3})'
74+
variant = f'(({alphanum}){{5,8}}|[0-9]({alphanum}){{3}})'
75+
privateuse = f'[xX](-({alphanum}){{1,8}})+'
76+
langtag = (
77+
f'{language}(-{script})?(-{region})?(-{variant})*(-{extension})*(-'
78+
f'{privateuse})?'
79+
)
80+
language_tag = f'({langtag}|{privateuse}|{grandfathered})'
4381
nonzero = '[1-9]'
4482
level = f'(?P<level>0|{nonzero}[0-9]*)'
4583
xref = f'{atsign}({tagchar})+{atsign}'
@@ -57,17 +95,18 @@
5795
list = f'{listitem}({listdelim}{listitem})*'
5896
list_enum = f'{enum}({listdelim}{enum})*'
5997
list_text = f'{list}'
60-
mt_char = "[ -!#-'*-+\\--.0-9A-Z^-~]"
61-
mt_token = f'({mt_char})+'
62-
mt_type = f'{mt_token}'
63-
mt_subtype = f'{mt_token}'
64-
mt_attribute = f'{mt_token}'
65-
mt_qtext = '[\t-\n -!#-\\[\\]-~]'
66-
mt_qpair = '\\\\[\t-~]'
67-
mt_qstring = f'"({mt_qtext}|{mt_qpair})*"'
68-
mt_value = f'({mt_token}|{mt_qstring})'
69-
mt_parameter = f'{mt_attribute}={mt_value}'
70-
mediatype = f'{mt_type}/{mt_subtype}(;{mt_parameter})*'
98+
type = f'({discrete_type}|{composite_type})'
99+
subtype = f'({extension_token}|{iana_token})'
100+
ows = '[ \t]*'
101+
parameter_name = f'{token}'
102+
obs_text = '[\\x80-\\xff]'
103+
qdtext = f'([\t !#-\\[\\]-~]|{obs_text})'
104+
quoted_pair = f'\\\\([\t !-~]|{obs_text})'
105+
quoted_string = f'"({qdtext}|{quoted_pair})*"'
106+
parameter_value = f'({token}|{quoted_string})'
107+
parameter = f'{parameter_name}={parameter_value}'
108+
parameters = f'({ows};{ows}({parameter})?)*'
109+
mediatype = f'{type}/{subtype}{parameters}'
71110
minute = '[012345][0-9]'
72111
namechar = '[ -.0-\\U0010ffff]'
73112
namestr = f'({namechar})+'

0 commit comments

Comments
 (0)