|
10 | 10 | import yaml |
11 | 11 |
|
12 | 12 | ENCODING = 'utf-8' |
| 13 | +ENC_ERRS = 'ignore' |
13 | 14 | NL = '\n' |
| 15 | +RS = chr(30) # Record Separator |
14 | 16 | CB_END = '}' |
15 | 17 | COLON = ':' |
16 | 18 | DASH = '-' |
|
22 | 24 | SPACE = ' ' |
23 | 25 | TM = '™' |
24 | 26 |
|
25 | | -DEBUG = bool(os.getenv('LAPIDIFY_DEBUG', '')) |
| 27 | +DEBUG = bool(os.getenv('DEBUG_LAPIDIFY', '')) |
26 | 28 | TARGETS = ( |
27 | 29 | PDF := 'pdf', |
28 | 30 | GFM_PLUS := 'gfm+vendor_hacks', |
|
32 | 34 | DUMP_LUT = bool(os.getenv('DUMP_LUT', '')) |
33 | 35 |
|
34 | 36 | # Configuration and runtime parameter candidates: |
| 37 | +GREMLINS = ' .,;?!_()[]{}<>\\/$:"\'`´' |
35 | 38 | BINDER_AT = pathlib.Path('etc') / 'bind.txt' |
36 | 39 | SOURCE_AT = pathlib.Path('src') |
37 | 40 | BUILD_AT = pathlib.Path('build') |
38 | 41 | SECTION_DISPLAY_TO_LABEL_AT = pathlib.Path('etc') / 'section-display-to-label.json' |
39 | 42 | SECTION_LABEL_TO_DISPLAY_AT = pathlib.Path('etc') / 'section-label-to-display.json' |
| 43 | +SECTION_DISPLAY_TO_TEXT_AT = pathlib.Path('etc') / 'section-display-to-text.json' |
40 | 44 | EG_GLOBAL_TO_LABEL_AT = pathlib.Path('etc') / 'example-global-to-local.json' |
41 | 45 | EG_LABEL_TO_GLOBAL_AT = pathlib.Path('etc') / 'example-local-to-global.json' |
42 | 46 |
|
|
81 | 85 | """ |
82 | 86 | CLEAN_MD_START = '# Introduction' |
83 | 87 | FENCED_BLOCK_FLIP_FLOP = '```' |
| 88 | +APPENDIX_INNER_PATTERN = re.compile(r'(?P<display>[A-Z][\.0-9]+)\ +(?P<rest>.+)') |
84 | 89 | LOGO_URL = 'https://docs.oasis-open.org/templates/OASISLogo-v3.0.png' |
85 | 90 | LOGO_LOCAL_PATH = 'images/OASISLogo-v3.0.png' |
86 | 91 | TOP_LOGO_LINE = f'' |
|
137 | 142 |
|
def load_binder(binder_at: Union[str, pathlib.Path], ignores: Union[list[str], None] = None) -> list[pathlib.Path]:
    """Read the linear binder text file and return its entries as paths, in file order.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. When ignores is a non-empty list, any path
    whose string form occurs in that list is left out of the result.
    """
    with open(binder_at, 'rt', encoding=ENCODING, errors=ENC_ERRS) as resource:
        entries = [raw.strip() for raw in resource.readlines()]

    paths = [pathlib.Path(entry) for entry in entries if entry]
    if not ignores:
        # None and the empty list both mean: keep everything.
        return paths
    return [path for path in paths if str(path) not in ignores]
143 | 148 |
|
@@ -165,25 +170,44 @@ def detect_meta(text_lines: list[str]) -> tuple[META_TOC_TYPE, list[str]]: |
165 | 170 |
|
def load_document(path: Union[str, pathlib.Path]) -> tuple[META_TOC_TYPE, list[str]]:
    """Read the text file at path and hand its lines to detect_meta.

    The result is whatever detect_meta returns for those lines; detect_meta is
    responsible for harvesting any YAML meta info and removing those lines
    when present.
    """
    with open(path, 'rt', encoding=ENCODING, errors=ENC_ERRS) as resource:
        text_lines = resource.readlines()
    return detect_meta(text_lines)
170 | 175 |
|
171 | 176 |
|
def dump_assembly(text_lines: list[str], to_path: Union[str, pathlib.Path]) -> None:
    """Write the lines of text to the text file at to_path.

    The entries are written back to back; no separator is inserted between them.
    """
    with open(to_path, 'wt', encoding=ENCODING, errors=ENC_ERRS) as resource:
        resource.writelines(text_lines)
176 | 181 |
|
177 | 182 |
|
def slugify(
    text: str,
    connector: str = DASH,
    marker: str = RS,
    gremlins: str = GREMLINS,
    policy: str = 'lower',
) -> str:
    """Derive a kebab style slug from text.

    Implementer notes:

    - Surrounding whitespace of text is stripped first.
    - Every character listed in gremlins is replaced by the connector;
      every other character is kept.
    - Empty pieces between connectors produced that way are dropped, so
      runs of gremlins collapse into a single connector.
    - Connector chars already present in the text (default dashes) are
      shielded as marker chars (default ASCII RS) while the gremlins are
      processed and turned back into connectors afterwards.
      A marker char occurring in the text itself therefore also ends up
      as a connector char.
    - policy names the argument-free str method applied to the final
      slug (default 'lower').
    """
    # Shield pre-existing connectors so they survive the split below.
    shielded = text.strip().replace(connector, marker)
    for unwanted in gremlins:
        shielded = shielded.replace(unwanted, connector)

    tokens = []
    for token in shielded.split(connector):
        if not token or token == connector:
            continue
        # Undo the shielding inside each kept token.
        tokens.append(token.replace(marker, connector))

    slug = connector.join(tokens)
    apply_policy = getattr(slug, policy)
    return apply_policy()
187 | 211 |
|
188 | 212 |
|
189 | 213 | def label_in(text: str) -> bool: |
@@ -217,25 +241,31 @@ def code_block_label_in(text: str) -> bool: |
217 | 241 |
|
def load_label_to_display_lut(path: Union[str, pathlib.Path] = SECTION_LABEL_TO_DISPLAY_AT) -> dict[str, str]:
    """Read the JSON file at path and return the section label -> display lookup table."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
222 | 246 |
|
223 | 247 |
|
def load_display_to_label_lut(path: Union[str, pathlib.Path] = SECTION_DISPLAY_TO_LABEL_AT) -> dict[str, str]:
    """Read the JSON file at path and return the section display -> label lookup table."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
| 252 | + |
| 253 | + |
def load_display_to_text_lut(path: Union[str, pathlib.Path] = SECTION_DISPLAY_TO_TEXT_AT) -> dict[str, str]:
    """Load the LUT for section display -> text.

    The JSON document at path (default SECTION_DISPLAY_TO_TEXT_AT) is decoded
    with the module wide ENCODING / ENC_ERRS settings and returned as parsed.
    """
    with pathlib.Path(path).open('rt', encoding=ENCODING, errors=ENC_ERRS) as handle:
        return json.load(handle)
228 | 258 |
|
229 | 259 |
|
def load_eg_label_to_global_lut(path: Union[str, pathlib.Path] = EG_LABEL_TO_GLOBAL_AT) -> dict[str, str]:
    """Read the JSON file at path and return the example label -> global lookup table."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
234 | 264 |
|
235 | 265 |
|
def load_eg_global_to_label_lut(path: Union[str, pathlib.Path] = EG_GLOBAL_TO_LABEL_AT) -> dict[str, str]:
    """Read the JSON file at path and return the example global -> label lookup table."""
    lut_text = pathlib.Path(path).read_text(encoding=ENCODING, errors=ENC_ERRS)
    return json.loads(lut_text)
240 | 270 |
|
241 | 271 |
|
@@ -281,6 +311,13 @@ def insert_any_section_reference(record: str) -> str: |
281 | 311 | raise RuntimeError(f'false positive sec ref in ({record.rstrip(NL)})') |
282 | 312 | label = found['label'] |
283 | 313 | if label not in SEC_LABEL_TEXT: |
| 314 | + print(f'ERROR: in insert-any-section-reference ({record=})') |
| 315 | + print(f'ERROR-CONTEXT: {record=} - {trigger_text=}') |
| 316 | + print(f'ERROR-CONTEXT: {record=} - {label=}') |
| 317 | + for skey in SEC_LABEL_TEXT: |
| 318 | + if skey.startswith(label[:len(label) // 2]): |
| 319 | + print(f'DEBUG: - similar {skey=} exists') |
| 320 | + print(f'DEBUG: You may want to execute grep -n {label} src/*.md') |
284 | 321 | raise RuntimeError(f'missing register label for sec ref in ({record.rstrip(NL)})') |
285 | 322 | text = SEC_LABEL_TEXT[label] |
286 | 323 | sem_ref = f'[sec](#{label})' |
@@ -402,7 +439,7 @@ def main(args: list[str]) -> int: |
402 | 439 | in_definition = True |
403 | 440 | # prepare the data triplet |
404 | 441 | term = line.strip() |
405 | | - label = 'def;' + label_derive_from(term) |
| 442 | + label = 'def;' + slugify(term) |
406 | 443 | definition = '' |
407 | 444 | continue |
408 | 445 | if in_definition: |
@@ -527,7 +564,7 @@ def main(args: list[str]) -> int: |
527 | 564 | label = text.split(TOK_LAB, 1)[1].rstrip(CB_END) |
528 | 565 | # reduced_text = text.split(TOK_LAB, 1)[0] |
529 | 566 | else: |
530 | | - label = label_derive_from(text) |
| 567 | + label = slugify(text) |
531 | 568 | clean_sec_cnt_disp = (f'{sec_cnt_disp}' if is_plain else sec_cnt_disp).rstrip(FULL_STOP) |
532 | 569 | SEC_LABEL_TEXT[label] = clean_sec_cnt_disp |
533 | 570 | SECTION_DISPLAY_TO_LABEL[clean_sec_cnt_disp] = label |
@@ -595,12 +632,32 @@ def main(args: list[str]) -> int: |
595 | 632 | pl_anchor = TOK_EG.replace('$thing$', magic_label) |
596 | 633 | line = line.rstrip(NL) + pl_anchor + NL |
597 | 634 | # now the UX bonus: |
598 | | - sec_disp = 'sec-' + display_from[section].replace(FULL_STOP, '-') # type: ignore |
| 635 | + try: |
| 636 | + sec_disp_context_part = display_from[section] # type: ignore |
| 637 | + except KeyError as err: |
| 638 | + print(f'ERROR: {slot=} in example-refs-processing ({err})') |
| 639 | + print(f'ERROR-CONTEXT: {slot=} - {line=}') |
| 640 | + print(f'ERROR-CONTEXT: {slot=} - {section=}') |
| 641 | + for skey in display_from: |
| 642 | + if skey.startswith(section[:len(section) // 2]): # type: ignore |
| 643 | + print(f'DEBUG: - similar {skey=} exists') |
| 644 | + return 1 |
| 645 | + sec_disp = 'sec-' + sec_disp_context_part.replace(FULL_STOP, '-') # type: ignore |
599 | 646 | sec_disp_num_label = f'{sec_disp}-eg-{num}' |
600 | 647 | sec_disp_num_anchor = TOK_EG.replace('$thing$', sec_disp_num_label) |
601 | 648 | line = line.rstrip(NL) + sec_disp_num_anchor + NL |
602 | 649 | # now the global counter extra: |
603 | | - global_example_num = eg_global_from[magic_label] |
| 650 | + try: |
| 651 | + global_example_num = eg_global_from[magic_label] |
| 652 | + except KeyError as err: |
| 653 | + print(f'ERROR: {slot=} in example-refs-global-counter-lookup ({err})') |
| 654 | + print(f'ERROR-CONTEXT: {slot=} - {line=}') |
| 655 | + print(f'ERROR-CONTEXT: {slot=} - {magic_label=}') |
| 656 | + for ekey in eg_global_from: |
| 657 | + if ekey.startswith(magic_label[:len(magic_label) // 2]): |
| 658 | + print(f'DEBUG: - similar {ekey=} exists') |
| 659 | + return 1 |
| 660 | + |
604 | 661 | global_example_num_label = f'example-{global_example_num}' |
605 | 662 | global_example_num_anchor = TOK_EG.replace('$thing$', global_example_num_label) |
606 | 663 | line = line.rstrip(NL) + global_example_num_anchor + NL |
@@ -710,16 +767,16 @@ def main(args: list[str]) -> int: |
710 | 767 | BUILD_AT.mkdir(parents=True, exist_ok=True) |
711 | 768 | dump_assembly(lines, BUILD_AT / 'pdf.md') |
712 | 769 |
|
713 | | - with open(BUILD_AT / 'toc-mint.json', 'wt', encoding=ENCODING) as handle: |
| 770 | + with open(BUILD_AT / 'toc-mint.json', 'wt', encoding=ENCODING, errors=ENC_ERRS) as handle: |
714 | 771 | json.dump(mint, handle, indent=2) |
715 | 772 |
|
716 | 773 | if DUMP_LUT: |
717 | | - with SECTION_DISPLAY_TO_LABEL_AT.open('wt', encoding=ENCODING) as handle: |
| 774 | + with SECTION_DISPLAY_TO_LABEL_AT.open('wt', encoding=ENCODING, errors=ENC_ERRS) as handle: |
718 | 775 | json.dump(SECTION_DISPLAY_TO_LABEL, handle, indent=2) |
719 | 776 | section_label_to_display = { |
720 | 777 | label: disp for label, disp in sorted((label, disp) for disp, label in SECTION_DISPLAY_TO_LABEL.items()) |
721 | 778 | } |
722 | | - with SECTION_LABEL_TO_DISPLAY_AT.open('wt', encoding=ENCODING) as handle: |
| 779 | + with SECTION_LABEL_TO_DISPLAY_AT.open('wt', encoding=ENCODING, errors=ENC_ERRS) as handle: |
723 | 780 | json.dump(section_label_to_display, handle, indent=2) |
724 | 781 |
|
725 | 782 | return 0 |
|
0 commit comments