Skip to content

Commit aef3007

Browse files
committed
Maintenance: Introduced auto-section mapping and adapted sources
- enhanced author hints on lookup failures - more likely unique slugs - make succeeds although the GFM-Plus and HTML items do not yet use the correct new lookups for toc - make render-pdf not yet tested - WIP Signed-off-by: Stefan Hagen <stefan@hagen.link>
1 parent e04a3c6 commit aef3007

14 files changed

+1058
-331
lines changed
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
#! /usr/bin/env python
"""Extract sections from the source files concatenated per binder and write display-to-label (slug) mappings."""
import pathlib
import os
import re
import sys

from inverso.implementation import process as process_inversion, nonjective  # type: ignore
from muuntuu.implementation import json_dump  # type: ignore

# "Un-lit(t)er-al" the code
DASH = '-'  # connector character used when deriving slugs
DOT = '.'  # stripped from the tail of dotted display numbers
ENCODING = 'utf-8'  # text encoding for all file reads
ENC_ERRS = 'ignore'  # decoding error policy for all file reads
FULL_STOP = '.'  # joins per-level counters into dotted display numbers (same char as DOT, kept for intent)
HASH = '#'  # markdown ATX heading marker
NL = '\n'
RS = chr(30)  # Record Separator (sentinel used to preserve incoming dashes in slugify)
SPACE = ' '

PathLike = str | pathlib.Path

# Debug output is enabled by a non-empty SECTIONS_DEBUG environment variable.
DEBUG = bool(os.getenv('SECTIONS_DEBUG', ''))

# Configuration and runtime parameter candidates:
GREMLINS = ' .,;?!_()[]{}<>\\/$:"\'`´'  # characters replaced by the connector when slugifying
BINDER_AT = pathlib.Path('etc') / 'bind.txt'  # default binder (ordered list of source files)
SOURCE_AT = pathlib.Path('src')  # root folder the binder entries are resolved against
SECTION_DISPLAY_TO_LABEL_AT = pathlib.Path('etc') / 'section-display-to-label.json'
SECTION_LABEL_TO_DISPLAY_AT = pathlib.Path('etc') / 'section-label-to-display.json'
SECTION_DISPLAY_TO_TEXT_AT = pathlib.Path('etc') / 'section-display-to-text.json'
TOK_LAB = '{#'  # start token of an explicit heading label, e.g. 'Title {#my-slug}'
CLEAN_MD_START = '# Introduction'  # heading that switches section harvesting on
FENCED_BLOCK_FLIP_FLOP = '```'  # fence marker toggling in/out of code blocks
APPENDIX_INNER_PATTERN = re.compile(r'(?P<display>[A-Z][\.0-9]+)\ +(?P<rest>.+)')  # e.g. 'A.1 Title'
39+
def load_binder(binder_at: PathLike) -> list[pathlib.Path]:
    """Read the binder text file and return its non-empty entries as paths.

    Whitespace-only lines are skipped; surrounding whitespace is stripped
    from each entry before it becomes a path.
    """
    with open(binder_at, 'rt', encoding=ENCODING, errors=ENC_ERRS) as handle:
        paths: list[pathlib.Path] = []
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                paths.append(pathlib.Path(entry))
        return paths
43+
44+
45+
def load_document(path: PathLike) -> list[str]:
    """Return all lines of the text file at *path*, trailing newlines preserved."""
    with open(path, 'rt', encoding=ENCODING, errors=ENC_ERRS) as handle:
        return list(handle)
49+
50+
51+
def slugify(
    text: str,
    connector: str = DASH,
    marker: str = RS,
    gremlins: str = GREMLINS,
    policy: str = 'lower',
) -> str:
    """Derive a kebab style slug from the given text.

    Implementer notes:

    - Every character not listed in gremlins is kept.
    - Incoming connector characters (dashes by default) survive via a
      round trip through the marker character (ASCII RS by default):
      they are swapped to the marker first and swapped back at the end.
      Any marker characters already present in the text therefore end
      up as connector characters as well.
    """
    # Shield pre-existing connectors from the gremlin sweep.
    working = text.strip().replace(connector, marker)

    # Map every gremlin character to the connector in a single pass.
    gremlin_table = str.maketrans({gremlin: connector for gremlin in gremlins})
    working = working.translate(gremlin_table)

    # Drop empty fragments, restore shielded connectors, and re-join.
    fragments = [
        piece.replace(marker, connector)
        for piece in working.split(connector)
        if piece and piece != connector
    ]
    joined = connector.join(fragments)

    # Apply the casing policy (a str method name, e.g. 'lower').
    return getattr(joined, policy)()
79+
80+
81+
def invert(
    source: dict[str, str],
    options: dict[str, bool | int | str] | None = None
) -> dict[str, str]:
    """Invert between and within the known formats (inverso API has a gap)."""
    opts = {} if options is None else options
    auto_serial: bool = opts.get('auto_serial', False)
    auto_serial_step: int = opts.get('auto_serial_step', 1)
    generator_caveat: bool = opts.get('generator_caveat', True)
    marker_token: str = opts.get('marker_token', '')
    marker_is_value: bool = opts.get('marker_is_value', False)

    # Normalize every pair through the inversion processor first.
    cleansed: dict[str, str] = {}
    for raw_key, raw_value in source.items():
        key, value, seen = process_inversion(raw_key, raw_value, auto_serial, marker_token, marker_is_value)
        cleansed[key] = value
        # Advance the serial unless this entry matched the marker token.
        if auto_serial and (not marker_token or not seen):
            auto_serial += auto_serial_step

    # Duplicate values cannot be inverted into unique keys - report and bail.
    findings = nonjective(cleansed)
    if findings:
        print('Error: source has ambiguous values.', file=sys.stderr)
        for finding in findings:
            print(f'- {finding}', file=sys.stderr)
        return {}

    flipped = {value: key for key, value in cleansed.items()}
    # Optionally lead with the generator caveat, then the sorted entries.
    result: dict[str, str] = {'Please do not edit manually!': 'Cf. documentation'} if generator_caveat else {}
    for flipped_key in sorted(flipped):
        result[flipped_key] = flipped[flipped_key]

    return result
114+
115+
116+
def main(argv: list[str]) -> int:
    """Drive the extraction.

    Reads the binder (first CLI argument, or BINDER_AT), concatenates the
    bound source files, validates heading nesting, derives per-section
    display numbers and slugs, and writes three JSON mapping files.

    Returns 0 on success, 1 when a binder entry is not a file.
    """

    bind_seq_path = pathlib.Path(argv[0]) if argv else BINDER_AT
    binder = load_binder(bind_seq_path)
    # Fail fast if any bound resource is missing below SOURCE_AT.
    for resource in binder:
        if not (SOURCE_AT / resource).is_file():
            print(f'Problem reading {resource}')
            return 1

    # Concatenate all parts; append a newline element between parts when
    # the last line of a part is not itself a bare newline.
    # NOTE(review): an empty part file would raise IndexError on
    # part_lines[-1] - confirm empty parts cannot occur in binders.
    lines: list[str] = []
    for resource in binder:
        part_lines = load_document(SOURCE_AT / resource)
        if part_lines[-1] != NL:
            part_lines.append(NL)
        lines.extend(part_lines)

    # Harvest ATX headings ('# ', '## ', ...) once the CLEAN_MD_START
    # heading has been seen, skipping lines inside fenced code blocks.
    in_fenced_block = False
    clean_headings = False
    sections = []
    for line in lines:
        if line.startswith(CLEAN_MD_START):
            clean_headings = True

        if not clean_headings:
            continue

        if line.startswith(FENCED_BLOCK_FLIP_FLOP):
            in_fenced_block = not in_fenced_block

        if line.startswith(HASH) and not in_fenced_block:
            # Only hash runs followed by a space count as headings.
            if line.lstrip(HASH).startswith(SPACE):
                sections.append(line.rstrip(NL))

    print(f'Identified {len(sections)} relevant sections ...')

    # Validate nesting: a heading may go at most one level deeper than
    # its predecessor (e.g. '#' directly to '###' is a defect).
    level_counts = {n: 0 for n in range(1, 7)}
    previous_level = 0
    defects = 0
    for section in sections:
        level = len(section.split(SPACE, 1)[0])  # number of leading hashes
        level_counts[level] += 1
        if level > previous_level:
            if level - previous_level > 1:
                defects += 1
                print(
                    f'! LEVEL_NEST_ERROR jumping from level {previous_level}'
                    f' directly to {level}'
                )
                print(f'>>> {section}')
        previous_level = level

    for level, count in level_counts.items():
        print(f'- {count} level {level} sections')

    if defects:
        print(f'Found {defects} defects in section nesting!')
    else:
        print('Section level nesting is valid')

    # Build the section database as rows of
    # [is_appendix, root, level, display, text, slug].
    db = []
    is_appendix = False  # sticky: stays True after the first appendix heading
    root: int = 0  # running count of level-1 headings (chapter number)
    appr = ''  # current appendix letter, e.g. 'A'
    for section in sections:
        display = ''
        level = len(section.split(SPACE, 1)[0])
        if level == 1:
            root += 1
        text_plus = section[level + 1:]  # heading text after hashes and space
        # Strip ephemeral region markers from the heading text.
        if '<mark title="Ephemeral region marking">' in text_plus:
            text_plus = (
                text_plus
                .replace('<mark title="Ephemeral region marking">', '')
                .replace('</mark>', '')
            )
        if text_plus.startswith('Appendix '):
            # 'Appendix A. Title' -> display 'Appendix A.', text 'Title'.
            appr = text_plus.replace('Appendix ', '')[0]
            display = f'Appendix {appr}.'
            text_plus = text_plus.replace(f'{display} ', '')
            is_appendix = True
        else:
            # Inner appendix headings like 'A.1 Title' carry their own display.
            match = APPENDIX_INNER_PATTERN.match(text_plus)
            if match:
                found = match.groupdict()
                display = found['display']
                text_plus = text_plus.replace(f'{display} ', '')

        # Use an explicit '{#label}' when present, otherwise derive the slug.
        if TOK_LAB in text_plus:
            text, slug = text_plus.rstrip(SPACE).rstrip('}').split(TOK_LAB, 1)
        else:
            text = text_plus.rstrip(SPACE)
            slug = slugify(text)
        if not is_appendix:
            a_root = str(root)
        else:
            a_root = appr
        db.append([is_appendix, a_root, level, display, text, slug])

    if DEBUG:
        for is_appendix, a_root, level, display, text, slug in db:
            print(
                f'{" " if not is_appendix else "APPENDIX"} | {a_root} |'
                f' {(HASH * level).rjust(7)} "{text}" <-- {slug}'
            )

    # Derive dotted display numbers (e.g. '2.3.1') for non-appendix
    # sections by keeping one counter per heading level.
    display_to_label = {}
    display_to_text = {}
    lvl_min, lvl_sup = 1, 7
    level_domain: tuple[int, ...] = tuple(range(lvl_min, lvl_sup))
    sec_cnt: dict[str, int] = {f'{HASH * level} ': 0 for level in level_domain}  # per-level counters
    sec_lvl: dict[str, int] = {f'{HASH * level} ': level for level in level_domain}  # tag -> level
    lvl_sec: dict[int, str] = {level: f'{HASH * level} ' for level in level_domain}  # level -> tag
    cur_lvl: int = sec_lvl[f'{HASH * 1} ']
    for is_appendix, a_root, level, display, text, slug in db:
        if not is_appendix:
            tag = f'{HASH * level} '
            nxt_lvl = sec_lvl[tag]
            sec_cnt[tag] += 1
            if nxt_lvl < cur_lvl:
                # Moving up in the hierarchy: reset all deeper counters.
                for lvl in range(nxt_lvl + 1, lvl_sup):
                    sec_cnt[lvl_sec[lvl]] = 0
            # Join the counters from level 1 down to this section's level.
            sec_cnt_disp_vec = []
            for s_tag, cnt in sec_cnt.items():
                if cnt == 0:
                    # A zero counter above the current level means a
                    # nesting gap slipped through - refuse to number it.
                    raise RuntimeError(f'ERROR: Counting is hard: {sec_cnt} at {tag} for {text}')
                sec_cnt_disp_vec.append(str(cnt))
                if s_tag == tag:
                    break
            sec_cnt_disp = FULL_STOP.join(sec_cnt_disp_vec)
            cur_lvl = nxt_lvl

            display = sec_cnt_disp.rstrip(DOT)
        else:
            pass  # appendix sections keep the display parsed earlier

        display_to_label[display] = slug
        display_to_text[display] = text
        if DEBUG:
            print(f' {display} "{display_to_text[display]}" <-- {slug}')

    # Persist display->label, label->display (inverted), and display->text.
    json_dump(display_to_label, SECTION_DISPLAY_TO_LABEL_AT, options={'debug': DEBUG})
    json_dump(invert(display_to_label), SECTION_LABEL_TO_DISPLAY_AT, options={'debug': DEBUG})
    json_dump(display_to_text, SECTION_DISPLAY_TO_TEXT_AT, options={'debug': DEBUG})

    return 0
261+
262+
263+
if __name__ == '__main__':
    # Script entry point: pass only the arguments after the program name.
    sys.exit(main(sys.argv[1:]))

0 commit comments

Comments
 (0)