tf docs

dirkroorda · dirkroorda · commit 5d918842bc76 · 2021-11-02T09:41:16.000+01:00
diff --git a/fusus/about/transcriptionl.py b/fusus/about/transcriptionl.py
@@ -0,0 +1,3 @@
+"""
+.. include:: ../docs/about/transcriptionl.md
+"""
diff --git a/fusus/docs/about/transcriptionl.md b/fusus/docs/about/transcriptionl.md
@@ -0,0 +1,111 @@
+# Lakhnawi transcription
+
+The Text-Fabric data is derived from the Lakhnawi PDF by reverse engineering.
+The PDF is a textual PDF with an unusual usage of fonts to obtain desired effects with
+ligatures and diacritics.
+
+# Divisions
+
+The text is divided into the following chunks:
+
+## Piece
+
+**Section level 1**
+
+Logical unit, corresponding to the main division of the work: *bezel*.
+(The title of the work is: *bezels* of wisdom.)
+
+Some pieces are in fact introductory chapters, and not the *bezels* of the
+main work.
+
+**Features**
+
+name | type | description
+--- | --- | ---
+`n` | int | sequence number of a piece, starting with 1
+`np` | int | sequence number of a proper content piece, i.e. a *bezel*
+`title` | str | title of a piece
+
+## Page
+
+**Section level 2**
+
+Physical unit: a printed page.
+
+**Features**
+
+name | type | description
+--- | --- | ---
+`n` | int | sequence number of a page, starting with 1
+
+## Line
+
+**Section level 3**
+
+Physical unit: a printed line within a page.
+
+**Features**
+
+name | type | description
+--- | --- | ---
+`n` | int | sequence number of a line, starting with 1
+
+## Column
+
+Logical/physical unit: a column within a line.
+
+Note that the page is not divided into columns.
+Some lines are divided into columns in
+hemistich poems. See `fusus.lakhnawi.Lakhnawi.columns`.
+
+## Span
+
+Logical/physical unit: a stretch of text with the same writing direction.
+Whenever the writing direction reverses, a new span is started.
+
+
+**Features**
+
+name | type | description
+--- | --- | ---
+`n` | int | sequence number of a span within a column or line
+`dir` | str | writing direction of a span; either `r` or `l`
+
+## Sentence
+
+Logical unit: a sentence, defined by the full-stop marker.
+Whenever a full-stop marker is encountered, a new sentence is started.
+
+
+**Features**
+
+name | type | description
+--- | --- | ---
+`n` | int | sequence number of a sentence
+
+## Word
+
+Logical/physical unit: individual words, in so far as they are separated
+by whitespace.
+
+!!! caution "Imperfect whitespace detection"
+    We do not guarantee that whitespace has been detected
+    perfectly.
+    So we do miss word boundaries on the one hand, and we
+    have spurious word boundaries on the other hand.
+
+**Features**
+
+name | type | description
+--- | --- | ---
+`boxl` | int | left x-coordinate of the bounding box of a word
+`boxt` | int | top y-coordinate of the bounding box of a word
+`boxr` | int | right x-coordinate of the bounding box of a word
+`boxb` | int | bottom y-coordinate of the bounding box of a word
+`letters` | str | the text of a word in Arabic, unicode, without punctuation
+`lettersn` | str | the text of a word in beta code, latin + diacritics
+`lettersp` | str | the text of a word in beta code, ascii
+`letterst` | str | the text of a word in romanized transcription
+`punc` | str | the punctuation and/or space immediately after a word in Arabic, unicode
+`punca` | str | the punctuation and/or space immediately after a word in ascii
+
diff --git a/legacy/notebooks/test copy.py b/legacy/notebooks/test copy.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.11.4
+#   kernelspec:
+#     display_name: Python3.9
+#     language: python
+#     name: python3
+# ---
+
+import re
+
+WORD_RE = re.compile(r"""
+      ([x-z]+)
+      |
+      ([a-d]+)
+""", re.X)  # re.X (verbose): the layout whitespace inside the pattern is ignored
+
+string = "ccxzaayzbbzz"
+
+x = WORD_RE.findall(string)  # two groups -> list of (x-z run, a-d run) tuples; the group that did not match is ""
+x  # bare expression: shows the result as the notebook cell output
+
+CHUNK_RE = re.compile(fr"[{nonLetterRange}]")  # NOTE(review): nonLetterRange is not defined in the visible script -- NameError unless it is defined elsewhere; confirm
+
+string = "..aa"
+
+match = CHUNK_RE.match(string)  # match() only tries the pattern at the start of the string
+match  # bare expression: shows the match object (or None) as the cell output
+
+# +
+PART = r"""!"\#\$%\&\'\(\)\*\+,\-\./:;<=>\?@\[\]\{\}«»ʰʱʲʳʴʵʶʷʸʹʺʻʼʽʾʿˀˁ˂˃˄˅ˆˇˈˉˊˋˌˍˎˏːˑ˒˓˔˕˖˗˘˙˚˛˜˝˞˟ˠˡˢˣˤ˥˦˧˨˩˪˫ˬ˭ˮ˯˰˱˲˳˴˵˶˷˸˹˺˻˼˽˾˿̀́̂̃̄،؛\u061c\u061d؞؟‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․ ‥…‧\u2028\u2029‹›⁅⁆⁌⁍﴾﴿"""
+
+WORD_RE = re.compile(f"""
+(
+[^{PART}]+
+)
+|
+(
+[{PART}]+
+)
+""", re.X)  # group 1: run of characters NOT in PART; group 2: run of characters in PART; exactly one group is non-empty per match
+# -
+
+string = 'إِلىٰأَكْثَرَ،إِلىٰ' 
+
+# +
+parts = []  # each entry is a 3-slot list: [leading non-letters, letters, trailing non-letters]
+first = True  # only the very first match is allowed to fill slot 0
+
+for (letters, nonLetters) in WORD_RE.findall(string):  # one (group 1, group 2) tuple per match
+    print(f"PART {letters=} {nonLetters=}")  # debug trace of every match
+    if first:
+        parts.append([nonLetters, letters, ""])  # NOTE(review): only one of the two is non-empty per match, so a leading non-letter run gets a part with empty letters
+        first = False
+    elif letters:
+        parts.append(["", letters, ""])  # a new letter run starts a new part
+    else:
+        parts[-1][-1] += nonLetters  # non-letters are attached to the trailing slot of the preceding part
+    if parts:  # always true at this point: the first iteration appends and later ones never remove
+        parts[-1][-1] += " "  # a space is appended after every match, letter runs and non-letter runs alike
+
+# -
+
+for part in parts:  # dump the three slots of every collected part
+    print("PART")
+    print(f"\t{part[0]=}")
+    print(f"\t{part[1]=}")
+    print(f"\t{part[2]=}")
+
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+"""`
	`2`	`+.. include:: ../docs/about/transcriptionl.md`
	`3`	`+"""`