|
| 1 | +#! /usr/bin/env python |
"""Extract sections from source files concatenated per binder and write the display to label (slug) mapping."""
| 3 | +import pathlib |
| 4 | +import os |
| 5 | +import re |
| 6 | +import sys |
| 7 | + |
| 8 | +from inverso.implementation import process as process_inversion, nonjective # type: ignore |
| 9 | +from muuntuu.implementation import json_dump # type: ignore |
| 10 | + |
# "Un-lit(t)er-al" the code
DASH = '-'
DOT = '.'
ENCODING = 'utf-8'  # encoding for every file read in this module
ENC_ERRS = 'ignore'  # decoding errors are silently dropped on read
FULL_STOP = '.'
HASH = '#'  # markdown ATX heading marker
NL = '\n'
RS = chr(30)  # Record Separator (ASCII 30) - sentinel char used by slugify
SPACE = ' '

# Path argument types accepted throughout this module.
PathLike = str | pathlib.Path

# Debug output is enabled by setting SECTIONS_DEBUG to any non-empty value.
DEBUG = bool(os.getenv('SECTIONS_DEBUG', ''))

# Configuration and runtime parameter candidates:
GREMLINS = ' .,;?!_()[]{}<>\\/$:"\'`´'  # characters replaced by the connector in slugs
BINDER_AT = pathlib.Path('etc') / 'bind.txt'  # default binder (part sequence) file
SOURCE_AT = pathlib.Path('src')  # root folder containing the bound parts
SECTION_DISPLAY_TO_LABEL_AT = pathlib.Path('etc') / 'section-display-to-label.json'
SECTION_LABEL_TO_DISPLAY_AT = pathlib.Path('etc') / 'section-label-to-display.json'
SECTION_DISPLAY_TO_TEXT_AT = pathlib.Path('etc') / 'section-display-to-text.json'
TOK_LAB = '{#'  # start token of an explicit heading label, e.g. 'Title {#label}'
CLEAN_MD_START = '# Introduction'  # heading that opens the relevant document region
FENCED_BLOCK_FLIP_FLOP = '```'  # each occurrence toggles fenced-code-block state
APPENDIX_INNER_PATTERN = re.compile(r'(?P<display>[A-Z][\.0-9]+)\ +(?P<rest>.+)')  # e.g. 'A.1.2 Title'
| 37 | + |
| 38 | + |
def load_binder(binder_at: PathLike) -> list[pathlib.Path]:
    """Read the binder file and return its non-empty entries as paths.

    Whitespace-only lines are skipped; surrounding whitespace is stripped
    from every kept entry before it becomes a path.
    """
    paths: list[pathlib.Path] = []
    with open(binder_at, 'rt', encoding=ENCODING, errors=ENC_ERRS) as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                paths.append(pathlib.Path(entry))
    return paths
| 43 | + |
| 44 | + |
def load_document(path: PathLike) -> list[str]:
    """Read the text file at path and return its lines (newlines kept)."""
    with open(path, 'rt', encoding=ENCODING, errors=ENC_ERRS) as handle:
        return list(handle)
| 49 | + |
| 50 | + |
def slugify(
    text: str,
    connector: str = DASH,
    marker: str = RS,
    gremlins: str = GREMLINS,
    policy: str = 'lower',
) -> str:
    """Derive kebab style slug from text.

    Parameters:
    - text: display text to derive the slug from
    - connector: character joining the slug words (default dash)
    - marker: sentinel protecting pre-existing connector chars
    - gremlins: characters replaced by the connector
    - policy: name of a str method applied to the finished slug
      (e.g. 'lower', 'upper', 'casefold')

    Implementer notes:

    - Every character not in gremlins is kept.
    - Incoming connector chars (default dashes) are preserved by
      sandwich transform to and from marker char (default ASCII RS).
      If the marker char occurs in the text, it will be replaced
      with the connector char during the back transform.
    """
    ds = connector
    rs = marker

    # Protect pre-existing connectors behind the marker, then map every
    # gremlin character onto the connector.
    sl = text.strip().replace(ds, rs)
    for gremlin in gremlins:
        sl = sl.replace(gremlin, ds)

    # After split(ds) no element can contain - hence never equal - the
    # connector, so filtering on truthiness alone suffices (the former
    # `s != ds` test was dead code and has been removed).
    slug = ds.join(s.replace(rs, ds) for s in sl.split(ds) if s)
    return getattr(slug, policy)()
| 79 | + |
| 80 | + |
def invert(
    source: dict[str, str],
    options: dict[str, bool | int | str] | None = None
) -> dict[str, str]:
    """Invert between and within the known formats (inverso API has a gap).

    Returns a value-to-key mapping of the (cleansed) source, with keys in
    sorted order, optionally prefixed by a generator caveat entry.  Returns
    an empty dict (after reporting to stderr) when the cleansed source has
    ambiguous values and thus cannot be inverted losslessly.

    Recognized options (all optional):
    - auto_serial: serial seed passed to process_inversion; when truthy it
      is advanced by auto_serial_step per processed entry
    - auto_serial_step: increment for the serial (default 1)
    - generator_caveat: prepend the do-not-edit marker entry (default True)
    - marker_token: token forwarded to process_inversion; when set, the
      serial only advances while the token has not been seen
    - marker_is_value: forwarded to process_inversion
    """
    if options is None:
        options = {}
    # bool | int: the seed doubles as a counter below (incremented by an
    # int step), so an int starting serial is a valid option value too.
    auto_serial: bool | int = options.get('auto_serial', False)
    auto_serial_step: int = options.get('auto_serial_step', 1)
    generator_caveat: bool = options.get('generator_caveat', True)
    marker_token: str = options.get('marker_token', '')
    marker_is_value: bool = options.get('marker_is_value', False)

    # NOTE(review): process_inversion's exact contract is defined in the
    # inverso package; `seen` presumably flags that marker_token occurred in
    # this entry - confirm against inverso documentation.
    cleansed: dict[str, str] = {}
    for k, v in source.items():
        key, value, seen = process_inversion(k, v, auto_serial, marker_token, marker_is_value)
        # Duplicate cleansed keys silently overwrite earlier entries here.
        cleansed[key] = value
        if auto_serial:
            if not marker_token or not seen:
                auto_serial += auto_serial_step

    # Ambiguous values would collide after inversion - report and bail out.
    if findings := nonjective(cleansed):
        print('Error: source has ambiguous values.', file=sys.stderr)
        for finding in findings:
            print(f'- {finding}', file=sys.stderr)
        return {}

    inverted = {v: k for k, v in cleansed.items()}
    # The caveat entry is inserted first so it survives the sorted refill.
    ordered = {'Please do not edit manually!': 'Cf. documentation'} if generator_caveat else {}
    for k in sorted(inverted):
        ordered[k] = inverted[k]

    return ordered
| 114 | + |
| 115 | + |
def main(argv: list[str]) -> int:
    """Drive the extraction.

    Accepts at most one positional argument: the path to the binder file
    (defaults to BINDER_AT).  Concatenates every bound part from below
    SOURCE_AT, extracts the markdown headings, reports on heading-level
    nesting, and writes three JSON mappings (display -> label,
    label -> display, display -> text) to the configured paths.

    Returns 0 on success and 1 when a bound part is missing.
    """
    bind_seq_path = pathlib.Path(argv[0]) if argv else BINDER_AT
    binder = load_binder(bind_seq_path)

    # Fail early if any bound part is missing below the source tree.
    for resource in binder:
        if not (SOURCE_AT / resource).is_file():
            print(f'Problem reading {resource}', file=sys.stderr)
            return 1

    # Concatenate all parts; append a bare newline element after any part
    # whose last line is not one, so parts stay visually separated.
    lines: list[str] = []
    for resource in binder:
        part_lines = load_document(SOURCE_AT / resource)
        # The emptiness guard avoids an IndexError on empty parts.
        if not part_lines or part_lines[-1] != NL:
            part_lines.append(NL)
        lines.extend(part_lines)

    # Collect headings, skipping front matter before CLEAN_MD_START and any
    # hash-prefixed lines inside fenced code blocks.
    in_fenced_block = False
    clean_headings = False
    sections = []
    for line in lines:
        if line.startswith(CLEAN_MD_START):
            clean_headings = True

        if not clean_headings:
            continue

        if line.startswith(FENCED_BLOCK_FLIP_FLOP):
            in_fenced_block = not in_fenced_block

        if line.startswith(HASH) and not in_fenced_block:
            # Only real headings carry a space right after the hash run.
            if line.lstrip(HASH).startswith(SPACE):
                sections.append(line.rstrip(NL))

    print(f'Identified {len(sections)} relevant sections ...')

    # Validate the heading-level nesting: descending may only go one level
    # deeper at a time; climbing back up any number of levels is fine.
    level_counts = {n: 0 for n in range(1, 7)}
    previous_level = 0
    defects = 0
    for section in sections:
        level = len(section.split(SPACE, 1)[0])
        # .get keeps a malformed deep heading (7+ hashes) from raising
        # KeyError, so the defect gets reported instead of crashing.
        level_counts[level] = level_counts.get(level, 0) + 1
        if level > previous_level:
            if level - previous_level > 1:
                defects += 1
                print(
                    f'! LEVEL_NEST_ERROR jumping from level {previous_level}'
                    f' directly to {level}'
                )
                print(f'>>> {section}')
        previous_level = level

    for level, count in level_counts.items():
        print(f'- {count} level {level} sections')

    if defects:
        print(f'Found {defects} defects in section nesting!')
    else:
        print('Section level nesting is valid')

    # Build the section database: rows of
    # [is_appendix, root, level, display, text, slug].
    db = []
    is_appendix = False
    root: int = 0
    appr = ''
    for section in sections:
        display = ''
        level = len(section.split(SPACE, 1)[0])
        if level == 1:
            root += 1
        text_plus = section[level + 1:]
        # Strip ephemeral region markup before parsing the heading text.
        if '<mark title="Ephemeral region marking">' in text_plus:
            text_plus = (
                text_plus
                .replace('<mark title="Ephemeral region marking">', '')
                .replace('</mark>', '')
            )
        if text_plus.startswith('Appendix '):
            # Appendix top heading: derive the letter, e.g. 'Appendix A.'.
            # Once seen, every later section counts as appendix material.
            appr = text_plus.replace('Appendix ', '')[0]
            display = f'Appendix {appr}.'
            text_plus = text_plus.replace(f'{display} ', '')
            is_appendix = True
        else:
            # Inner appendix headings carry their display, e.g. 'A.1.2 Title'.
            match = APPENDIX_INNER_PATTERN.match(text_plus)
            if match:
                found = match.groupdict()
                display = found['display']
                text_plus = text_plus.replace(f'{display} ', '')

        if TOK_LAB in text_plus:
            # Explicit label syntax: 'Title {#label}'.
            text, slug = text_plus.rstrip(SPACE).rstrip('}').split(TOK_LAB, 1)
        else:
            text = text_plus.rstrip(SPACE)
            slug = slugify(text)
        a_root = appr if is_appendix else str(root)
        db.append([is_appendix, a_root, level, display, text, slug])

    if DEBUG:
        for is_appendix, a_root, level, display, text, slug in db:
            print(
                f'{" " if not is_appendix else "APPENDIX"} | {a_root} |'
                f' {(HASH * level).rjust(7)} "{text}" <-- {slug}'
            )

    # Derive running numeric displays (e.g. '2.1.3') for non-appendix rows;
    # appendix rows keep the display parsed from their heading above.
    display_to_label = {}
    display_to_text = {}
    lvl_min, lvl_sup = 1, 7
    level_domain: tuple[int, ...] = tuple(range(lvl_min, lvl_sup))
    sec_cnt: dict[str, int] = {f'{HASH * level} ': 0 for level in level_domain}
    sec_lvl: dict[str, int] = {f'{HASH * level} ': level for level in level_domain}
    lvl_sec: dict[int, str] = {level: f'{HASH * level} ' for level in level_domain}
    cur_lvl: int = sec_lvl[f'{HASH * 1} ']
    for is_appendix, a_root, level, display, text, slug in db:
        if not is_appendix:
            tag = f'{HASH * level} '
            nxt_lvl = sec_lvl[tag]
            sec_cnt[tag] += 1
            if nxt_lvl < cur_lvl:
                # Climbing back up: reset the counters of all deeper levels.
                for lvl in range(nxt_lvl + 1, lvl_sup):
                    sec_cnt[lvl_sec[lvl]] = 0
            # Join the counters from level 1 down to this section's level.
            sec_cnt_disp_vec = []
            for s_tag, cnt in sec_cnt.items():
                if cnt == 0:
                    # A zero on the path means a level was skipped upstream.
                    raise RuntimeError(f'ERROR: Counting is hard: {sec_cnt} at {tag} for {text}')
                sec_cnt_disp_vec.append(str(cnt))
                if s_tag == tag:
                    break
            sec_cnt_disp = FULL_STOP.join(sec_cnt_disp_vec)
            cur_lvl = nxt_lvl

            display = sec_cnt_disp.rstrip(DOT)
        display_to_label[display] = slug
        display_to_text[display] = text
        if DEBUG:
            print(f' {display} "{display_to_text[display]}" <-- {slug}')

    json_dump(display_to_label, SECTION_DISPLAY_TO_LABEL_AT, options={'debug': DEBUG})
    json_dump(invert(display_to_label), SECTION_LABEL_TO_DISPLAY_AT, options={'debug': DEBUG})
    json_dump(display_to_text, SECTION_DISPLAY_TO_TEXT_AT, options={'debug': DEBUG})

    return 0
| 261 | + |
| 262 | + |
if __name__ == '__main__':
    # Script entry point: process exit status is main()'s return value.
    raise SystemExit(main(sys.argv[1:]))
0 commit comments