Skip to content
This repository was archived by the owner on Oct 10, 2025. It is now read-only.

Commit ad85ef7

Browse files
committed
releasing
1 parent 59dd2b3 commit ad85ef7

File tree

27 files changed

+8534
-30768
lines changed

27 files changed

+8534
-30768
lines changed

fusus/book.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ def __init__(self, cd=None, **params):
136136
cd = os.path.expanduser(cd)
137137
os.chdir(cd)
138138

139+
if cd:
140+
cd = f"{cd}/"
141+
139142
self.cd = cd
140143

141144
tm = Timestamp()
@@ -444,9 +447,9 @@ def process(
444447
outDir = C.outDir
445448
cleanDir = C.cleanDir
446449
proofDir = C.proofDir
447-
textDir = C.textDir
450+
htmlDir = C.htmlDir
448451

449-
for d in (interDir, outDir, cleanDir, proofDir, textDir):
452+
for d in (interDir, outDir, cleanDir, proofDir, htmlDir):
450453
if not os.path.exists(d):
451454
os.makedirs(d, exist_ok=True)
452455

@@ -519,8 +522,6 @@ def measureQuality(self, pages=None, showStats=True, updateProofs=False):
519522
indent = tm.indent
520523

521524
cd = self.cd
522-
if cd:
523-
cd = f"{cd}/"
524525

525526
allPages = self.allPages
526527

@@ -698,8 +699,8 @@ def exportTsv(self, pages=None):
698699
else:
699700
info("Nothing written")
700701

701-
def plainText(self, pages=None):
702-
"""Get the plain text from the ocr output in one file
702+
def htmlPages(self, pages=None):
703+
"""Get the text in html from the ocr output in one file
703704
704705
pages: string | int, optional `None`
705706
Specification of pages to do. If absent or `None`: all pages.
@@ -719,10 +720,10 @@ def plainText(self, pages=None):
719720

720721
C = self.C
721722
cd = self.cd
722-
textDir = C.textDir
723+
htmlDir = C.htmlDir
723724

724-
if not os.path.exists(textDir):
725-
os.makedirs(textDir, exist_ok=True)
725+
if not os.path.exists(htmlDir):
726+
os.makedirs(htmlDir, exist_ok=True)
726727

727728
allPages = self.allPages
728729

@@ -735,7 +736,7 @@ def plainText(self, pages=None):
735736
page = None
736737

737738
fileName = f"{pagesDesc}.html"
738-
path = f"{textDir}/{fileName}"
739+
path = f"{htmlDir}/{fileName}"
739740

740741
doc = """\
741742
<html>
@@ -827,8 +828,9 @@ def plainText(self, pages=None):
827828
)
828829
(prevStripe, prevBlock, prevLine) = (stripe, block, line)
829830

830-
word = fields[-1]
831-
lineMaterial.append(word)
831+
word = fields[-2]
832+
punc = fields[-1]
833+
lineMaterial.append(f"{word}{punc}")
832834

833835
blockMaterial.append(" ".join(lineMaterial))
834836
stripeMaterial.append("\n".join(blockMaterial))

fusus/page.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,6 @@ def show(self, stage=None, band=None, mark=None, **displayParams):
184184
error = tm.error
185185
C = engine.C
186186
cd = engine.cd
187-
if cd:
188-
cd = f"{cd}/"
189187

190188
stages = self.stages
191189
marks = engine.marks
@@ -216,7 +214,8 @@ def show(self, stage=None, band=None, mark=None, **displayParams):
216214
else:
217215
display(
218216
HTML(
219-
f"<hr>\n<div><b>{s}</b>: <i>{showPath} does not exist.</i></div>"
217+
f"<hr>\n<div><b>{s}</b>: "
218+
f"<i>{showPath} does not exist.</i></div>"
220219
)
221220
)
222221
else:
@@ -566,7 +565,7 @@ def _ingest(self, stage, tp, extension, f):
566565
for (i, x) in enumerate(fields)
567566
)
568567
data.append(fields)
569-
elif extension == "tsv":
568+
elif extension == "json":
570569
data = json.load(f)
571570
elif extension == "html":
572571
pass

fusus/parameters.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@
147147
interDir="inter",
148148
cleanDir="clean",
149149
proofDir="proof",
150-
textDir="text",
150+
htmlDir="html",
151151
marksDir="marks",
152152
blurX=21,
153153
blurY=21,

notebooks/Affifi/do.ipynb

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6430,22 +6430,61 @@
64306430
},
64316431
{
64326432
"cell_type": "code",
6433-
"execution_count": 11,
6433+
"execution_count": 5,
6434+
"metadata": {},
6435+
"outputs": [
6436+
{
6437+
"name": "stdout",
6438+
"output_type": "stream",
6439+
"text": [
6440+
" 53s Batch of 7 pages: 52-58\n",
6441+
" 53s Start producing plain text of these pages\n",
6442+
" 53s written to html/52-58.html \n"
6443+
]
6444+
},
6445+
{
6446+
"data": {
6447+
"text/html": [
6448+
"<a target=\"_blank\" href=\"https://nbviewer.jupyter.org/github/among/fusus/blob/master/ur/Affifi/html/52-58.html\">52-58.html</a> (local file: ~/github/among/fusus/ur/Affifi/html/52-58.html)"
6449+
],
6450+
"text/plain": [
6451+
"<IPython.core.display.HTML object>"
6452+
]
6453+
},
6454+
"metadata": {},
6455+
"output_type": "display_data"
6456+
}
6457+
],
6458+
"source": [
6459+
"B.htmlPages(pages=\"52-58\")"
6460+
]
6461+
},
6462+
{
6463+
"cell_type": "markdown",
6464+
"metadata": {},
6465+
"source": [
6466+
"If the OCR for all the pages has already been executed, you can get the plain HTML text from the TSV pages\n",
6467+
"simply by running this:"
6468+
]
6469+
},
6470+
{
6471+
"cell_type": "code",
6472+
"execution_count": 6,
64346473
"metadata": {},
64356474
"outputs": [
64366475
{
64376476
"name": "stdout",
64386477
"output_type": "stream",
64396478
"text": [
6440-
"19m 19s Batch of 7 pages: 52-58\n",
6441-
"19m 19s Start producing plain text of these pages\n",
6442-
"19m 19s written to text/52-58.html \n"
6479+
" 56s Batch of 180 pages: 47-226\n",
6480+
" 56s Start producing plain text of these pages\n",
6481+
" 56s written to html/47-226.html \n"
64436482
]
64446483
},
64456484
{
64466485
"data": {
64476486
"text/html": [
6448-
"<a target=\"_blank\" href=\"https://nbviewer.jupyter.org/github/among/fusus/blob/master/ur/Affifitext/52-58.html\">52-58.html</a> (local file: ~/github/among/fusus/ur/Affifitext/52-58.html)"
6487+
"<a target=\"_blank\" href=\"https://nbviewer.jupyter.org/github/among/fusus/blob/master/ur/Affifi/html/47-226.html\">47-226.html</a> (local file: ~/github/among/fusus/ur/Affifi/html/47-226.html)"
64496488
],
64506489
"text/plain": [
64516490
"<IPython.core.display.HTML object>"
@@ -6456,7 +6495,7 @@
64566495
}
64576496
],
64586497
"source": [
6459-
"B.plainText(pages=\"52-58\")"
6498+
"B.htmlPages()"
64606499
]
64616500
},
64626501
{

0 commit comments

Comments
 (0)