Skip to content

Commit 950b46d

Browse files
committed
Maintenance: PDF generator fixed to not bail out with new section mapping (WIP: appendix mappings are still broken)
Signed-off-by: Stefan Hagen <stefan@hagen.link>
1 parent 7eee3cc commit 950b46d

File tree

1 file changed

+81
-24
lines changed

1 file changed

+81
-24
lines changed

csaf_2.1/prose/edit/bin/lapidify.py

Lines changed: 81 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
import yaml
1111

1212
ENCODING = 'utf-8'
13+
ENC_ERRS = 'ignore'
1314
NL = '\n'
15+
RS = chr(30) # Record Separator
1416
CB_END = '}'
1517
COLON = ':'
1618
DASH = '-'
@@ -22,7 +24,7 @@
2224
SPACE = ' '
2325
TM = '™'
2426

25-
DEBUG = bool(os.getenv('LAPIDIFY_DEBUG', ''))
27+
DEBUG = bool(os.getenv('DEBUG_LAPIDIFY', ''))
2628
TARGETS = (
2729
PDF := 'pdf',
2830
GFM_PLUS := 'gfm+vendor_hacks',
@@ -32,11 +34,13 @@
3234
DUMP_LUT = bool(os.getenv('DUMP_LUT', ''))
3335

3436
# Configuration and runtime parameter candidates:
37+
GREMLINS = ' .,;?!_()[]{}<>\\/$:"\'`´'
3538
BINDER_AT = pathlib.Path('etc') / 'bind.txt'
3639
SOURCE_AT = pathlib.Path('src')
3740
BUILD_AT = pathlib.Path('build')
3841
SECTION_DISPLAY_TO_LABEL_AT = pathlib.Path('etc') / 'section-display-to-label.json'
3942
SECTION_LABEL_TO_DISPLAY_AT = pathlib.Path('etc') / 'section-label-to-display.json'
43+
SECTION_DISPLAY_TO_TEXT_AT = pathlib.Path('etc') / 'section-display-to-text.json'
4044
EG_GLOBAL_TO_LABEL_AT = pathlib.Path('etc') / 'example-global-to-local.json'
4145
EG_LABEL_TO_GLOBAL_AT = pathlib.Path('etc') / 'example-local-to-global.json'
4246

@@ -81,6 +85,7 @@
8185
"""
8286
CLEAN_MD_START = '# Introduction'
8387
FENCED_BLOCK_FLIP_FLOP = '```'
88+
APPENDIX_INNER_PATTERN = re.compile(r'(?P<display>[A-Z][\.0-9]+)\ +(?P<rest>.+)')
8489
LOGO_URL = 'https://docs.oasis-open.org/templates/OASISLogo-v3.0.png'
8590
LOGO_LOCAL_PATH = 'images/OASISLogo-v3.0.png'
8691
TOP_LOGO_LINE = f'![OASIS Logo]({LOGO_URL})'
@@ -137,7 +142,7 @@
137142

138143
def load_binder(binder_at: Union[str, pathlib.Path], ignores: Union[list[str], None] = None) -> list[pathlib.Path]:
    """Read the binder text file and return its non-blank entries as paths.

    Every line of the file at ``binder_at`` is stripped of surrounding
    whitespace; lines that are empty after stripping are dropped. When
    ``ignores`` is given and non-empty, entries whose string form is listed
    in it are left out of the result.
    """
    with open(binder_at, 'rt', encoding=ENCODING, errors=ENC_ERRS) as resource:
        entries = [raw.strip() for raw in resource]

    paths = [pathlib.Path(entry) for entry in entries if entry]
    if not ignores:
        return paths
    return [path for path in paths if str(path) not in ignores]
143148

@@ -165,25 +170,44 @@ def detect_meta(text_lines: list[str]) -> tuple[META_TOC_TYPE, list[str]]:
165170

166171
def load_document(path: Union[str, pathlib.Path]) -> tuple[META_TOC_TYPE, list[str]]:
    """Read the text file at path and pass its lines through detect_meta.

    The result of detect_meta is returned unchanged: a pair of meta
    information and text lines. NOTE(review): the removal of any YAML meta
    lines happens inside detect_meta, which is defined elsewhere in the file.
    """
    with open(path, 'rt', encoding=ENCODING, errors=ENC_ERRS) as resource:
        text_lines = resource.readlines()
    return detect_meta(text_lines)
170175

171176

172177
def dump_assembly(text_lines: list[str], to_path: Union[str, pathlib.Path]) -> None:
    """Write the text lines, concatenated without any separator, to to_path.

    The target file is opened for writing in text mode (replacing existing
    content) with the module-wide encoding and encoding-error policy.
    """
    with open(to_path, 'wt', encoding=ENCODING, errors=ENC_ERRS) as sink:
        content = ''.join(text_lines)
        sink.write(content)
176181

177182

178-
def label_derive_from(text: str) -> str:
179-
"""Transform text to kebab style conventional label assuming no newlines present."""
180-
good_nuff = (' ', '.', ',', ';', '?', '!', '_', '(', ')', '[', ']', '{', '}', '<', '>', '\\', '/', '$', ':')
181-
slug = text.strip()
182-
for bad in good_nuff:
183-
slug = slug.replace(bad, DASH)
184-
parts = slug.split(DASH)
185-
slug = DASH.join(s for s in parts if s) # and s != DASH)
186-
return slug.lower()
183+
def slugify(
    text: str,
    connector: str = DASH,
    marker: str = RS,
    gremlins: str = GREMLINS,
    policy: str = 'lower',
) -> str:
    """Derive a kebab style slug from text.

    Implementer notes:

    - Surrounding whitespace of text is stripped first.
    - Every character not in gremlins is kept; each gremlin is turned into
      the connector, and empty pieces between connectors are dropped, so a
      run of gremlins yields a single connector.
    - Connector chars already present in the text (default dashes) survive
      untouched: they are parked as the marker char (default ASCII RS)
      before the gremlin pass and restored afterwards. A marker char that
      occurs in the incoming text therefore comes back as the connector.
    - policy names a no-argument str method (default 'lower') that is
      applied to the joined slug to produce the return value.
    """
    # Park the incoming connectors so the split below cannot consume them.
    shielded = text.strip().replace(connector, marker)
    for unwanted in gremlins:
        shielded = shielded.replace(unwanted, connector)

    # A piece produced by split(connector) can never be the connector itself,
    # so filtering out the empty pieces is all that is required here.
    chunks = [chunk.replace(marker, connector) for chunk in shielded.split(connector) if chunk]
    slug = connector.join(chunks)

    return getattr(slug, policy)()
187211

188212

189213
def label_in(text: str) -> bool:
@@ -217,25 +241,31 @@ def code_block_label_in(text: str) -> bool:
217241

218242
def load_label_to_display_lut(path: Union[str, pathlib.Path] = SECTION_LABEL_TO_DISPLAY_AT) -> dict[str, str]:
    """Read the JSON lookup table mapping section labels to display values."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
222246

223247

224248
def load_display_to_label_lut(path: Union[str, pathlib.Path] = SECTION_DISPLAY_TO_LABEL_AT) -> dict[str, str]:
    """Read the JSON lookup table mapping section display values to labels."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
252+
253+
254+
def load_display_to_text_lut(path: Union[str, pathlib.Path] = SECTION_DISPLAY_TO_TEXT_AT) -> dict[str, str]:
    """Load the LUT for section display -> text.

    The file at path (by default the section-display-to-text JSON file) is
    read with the module-wide encoding and encoding-error policy, and its
    parsed JSON content is returned.
    """
    with pathlib.Path(path).open('rt', encoding=ENCODING, errors=ENC_ERRS) as handle:
        return json.load(handle)
228258

229259

230260
def load_eg_label_to_global_lut(path: Union[str, pathlib.Path] = EG_LABEL_TO_GLOBAL_AT) -> dict[str, str]:
    """Read the JSON lookup table mapping example labels to global values."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
234264

235265

236266
def load_eg_global_to_label_lut(path: Union[str, pathlib.Path] = EG_GLOBAL_TO_LABEL_AT) -> dict[str, str]:
    """Read the JSON lookup table mapping example global values to labels."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
240270

241271

@@ -281,6 +311,13 @@ def insert_any_section_reference(record: str) -> str:
281311
raise RuntimeError(f'false positive sec ref in ({record.rstrip(NL)})')
282312
label = found['label']
283313
if label not in SEC_LABEL_TEXT:
314+
print(f'ERROR: in insert-any-section-reference ({record=})')
315+
print(f'ERROR-CONTEXT: {record=} - {trigger_text=}')
316+
print(f'ERROR-CONTEXT: {record=} - {label=}')
317+
for skey in SEC_LABEL_TEXT:
318+
if skey.startswith(label[:len(label) // 2]):
319+
print(f'DEBUG: - similar {skey=} exists')
320+
print(f'DEBUG: You may want to execute grep -n {label} src/*.md')
284321
raise RuntimeError(f'missing register label for sec ref in ({record.rstrip(NL)})')
285322
text = SEC_LABEL_TEXT[label]
286323
sem_ref = f'[sec](#{label})'
@@ -402,7 +439,7 @@ def main(args: list[str]) -> int:
402439
in_definition = True
403440
# prepare the data triplet
404441
term = line.strip()
405-
label = 'def;' + label_derive_from(term)
442+
label = 'def;' + slugify(term)
406443
definition = ''
407444
continue
408445
if in_definition:
@@ -527,7 +564,7 @@ def main(args: list[str]) -> int:
527564
label = text.split(TOK_LAB, 1)[1].rstrip(CB_END)
528565
# reduced_text = text.split(TOK_LAB, 1)[0]
529566
else:
530-
label = label_derive_from(text)
567+
label = slugify(text)
531568
clean_sec_cnt_disp = (f'{sec_cnt_disp}' if is_plain else sec_cnt_disp).rstrip(FULL_STOP)
532569
SEC_LABEL_TEXT[label] = clean_sec_cnt_disp
533570
SECTION_DISPLAY_TO_LABEL[clean_sec_cnt_disp] = label
@@ -595,12 +632,32 @@ def main(args: list[str]) -> int:
595632
pl_anchor = TOK_EG.replace('$thing$', magic_label)
596633
line = line.rstrip(NL) + pl_anchor + NL
597634
# now the UX bonus:
598-
sec_disp = 'sec-' + display_from[section].replace(FULL_STOP, '-') # type: ignore
635+
try:
636+
sec_disp_context_part = display_from[section] # type: ignore
637+
except KeyError as err:
638+
print(f'ERROR: {slot=} in example-refs-processing ({err})')
639+
print(f'ERROR-CONTEXT: {slot=} - {line=}')
640+
print(f'ERROR-CONTEXT: {slot=} - {section=}')
641+
for skey in display_from:
642+
if skey.startswith(section[:len(section) // 2]): # type: ignore
643+
print(f'DEBUG: - similar {skey=} exists')
644+
return 1
645+
sec_disp = 'sec-' + sec_disp_context_part.replace(FULL_STOP, '-') # type: ignore
599646
sec_disp_num_label = f'{sec_disp}-eg-{num}'
600647
sec_disp_num_anchor = TOK_EG.replace('$thing$', sec_disp_num_label)
601648
line = line.rstrip(NL) + sec_disp_num_anchor + NL
602649
# now the global counter extra:
603-
global_example_num = eg_global_from[magic_label]
650+
try:
651+
global_example_num = eg_global_from[magic_label]
652+
except KeyError as err:
653+
print(f'ERROR: {slot=} in example-refs-global-counter-lookup ({err})')
654+
print(f'ERROR-CONTEXT: {slot=} - {line=}')
655+
print(f'ERROR-CONTEXT: {slot=} - {magic_label=}')
656+
for ekey in eg_global_from:
657+
if ekey.startswith(magic_label[:len(magic_label) // 2]):
658+
print(f'DEBUG: - similar {ekey=} exists')
659+
return 1
660+
604661
global_example_num_label = f'example-{global_example_num}'
605662
global_example_num_anchor = TOK_EG.replace('$thing$', global_example_num_label)
606663
line = line.rstrip(NL) + global_example_num_anchor + NL
@@ -710,16 +767,16 @@ def main(args: list[str]) -> int:
710767
BUILD_AT.mkdir(parents=True, exist_ok=True)
711768
dump_assembly(lines, BUILD_AT / 'pdf.md')
712769

713-
with open(BUILD_AT / 'toc-mint.json', 'wt', encoding=ENCODING) as handle:
770+
with open(BUILD_AT / 'toc-mint.json', 'wt', encoding=ENCODING, errors=ENC_ERRS) as handle:
714771
json.dump(mint, handle, indent=2)
715772

716773
if DUMP_LUT:
717-
with SECTION_DISPLAY_TO_LABEL_AT.open('wt', encoding=ENCODING) as handle:
774+
with SECTION_DISPLAY_TO_LABEL_AT.open('wt', encoding=ENCODING, errors=ENC_ERRS) as handle:
718775
json.dump(SECTION_DISPLAY_TO_LABEL, handle, indent=2)
719776
section_label_to_display = {
720777
label: disp for label, disp in sorted((label, disp) for disp, label in SECTION_DISPLAY_TO_LABEL.items())
721778
}
722-
with SECTION_LABEL_TO_DISPLAY_AT.open('wt', encoding=ENCODING) as handle:
779+
with SECTION_LABEL_TO_DISPLAY_AT.open('wt', encoding=ENCODING, errors=ENC_ERRS) as handle:
723780
json.dump(section_label_to_display, handle, indent=2)
724781

725782
return 0

0 commit comments

Comments
 (0)