From 00dc0604a9bda18a1f0129ac882535bd7a68fd64 Mon Sep 17 00:00:00 2001 From: Chrysanthemum Lovelace Date: Mon, 8 Sep 2025 10:17:22 -0400 Subject: [PATCH 1/4] font size calculation, and moved xml file saving up out of processor --- FontSizeCalculator.cs | 32 ++++++++++++++++++++++++++++++++ HocrHeaderFixer.cs | 12 ++---------- IHocrXmlProcessor.cs | 12 +++++++++++- OcrGenerator.cs | 7 ++++--- Processor.cs | 28 ++++++++++++++++++++++------ 5 files changed, 71 insertions(+), 20 deletions(-) create mode 100644 FontSizeCalculator.cs diff --git a/FontSizeCalculator.cs b/FontSizeCalculator.cs new file mode 100644 index 0000000..3d6b53a --- /dev/null +++ b/FontSizeCalculator.cs @@ -0,0 +1,32 @@ +using System.Xml.Linq; + +class FontSizeCalculator(double pixelToFontSizeConversionFactor) : IHocrXmlProcessor +{ + public double PixelToFontSizeConversionFactor { get; set; } = pixelToFontSizeConversionFactor; + + public void Init() + { + } + + public bool Process(string hocrFile, XDocument hocrXml) + { + XNamespace ns = "http://www.w3.org/1999/xhtml"; + foreach (var paragraph in hocrXml.Descendants(ns + "p")) + { + var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line"); + foreach (var line in lines) + { + var lineAttributes = line.Attribute("title").Value; + var bbox = lineAttributes.Split(';') + .Select(x => x.Trim()) + .Where(x => x.StartsWith("bbox")) + .First() + .Split(' '); + var pixelHeight = int.Parse(bbox[4]) - int.Parse(bbox[2]); + var fontSize = Math.Floor(pixelHeight * PixelToFontSizeConversionFactor); + line.Attribute("title").SetValue(lineAttributes + $"; x_fsize {fontSize}"); + } + } + return true; + } +} \ No newline at end of file diff --git a/HocrHeaderFixer.cs b/HocrHeaderFixer.cs index beb2e07..74a58cf 100644 --- a/HocrHeaderFixer.cs +++ b/HocrHeaderFixer.cs @@ -19,20 +19,12 @@ public void Init() Console.WriteLine("Fixing hOCR file headers..."); } - public void Process(string hocrFile, XDocument hocrXml) + public bool Process(string hocrFile, XDocument hocrXml) { XNamespace ns = "http://www.w3.org/1999/xhtml"; var head = hocrXml.Element(ns + "html").Element(ns + "head"); head.Element(ns + "title").Value = "Image: " + FileName ?? Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_JP2.jpg"); head.Add(new XElement(ns + "meta", new XAttribute("name", "ocr-system"), new XAttribute("content", HtrId.HasValue ? $"Transkribus-HtrId:{HtrId}" : "Transkribus"))); - using var fileStream = File.Open(hocrFile, FileMode.Truncate); - var writer = XmlWriter.Create(fileStream, new XmlWriterSettings - { - // need to specify false here to stop it from emitting a byte order mark - Encoding = new UTF8Encoding(false), - Indent = true - }); - hocrXml.Save(writer); - writer.Close(); + return true; } } \ No newline at end of file diff --git a/IHocrXmlProcessor.cs b/IHocrXmlProcessor.cs index 874c291..eed261f 100644 --- a/IHocrXmlProcessor.cs +++ b/IHocrXmlProcessor.cs @@ -2,6 +2,16 @@ interface IHocrXmlProcessor { + /// + /// Used to set up any initial state, such as creating directories + /// void Init(); - void Process(string hocrFile, XDocument hocrXml); + + /// + /// Receives each hOCR xml tree for processing + /// + /// Path to the hOCR file + /// Xml tree representing the hOCR file + /// True if the xml tree was modified and needs to be saved back to the file. + bool Process(string hocrFile, XDocument hocrXml); } diff --git a/OcrGenerator.cs b/OcrGenerator.cs index 3f11e9f..e90858f 100644 --- a/OcrGenerator.cs +++ b/OcrGenerator.cs @@ -16,14 +16,14 @@ public void Init() Directory.CreateDirectory(OcrDirectory); } - public void Process(string hocrFile, XDocument hocrXml) + public bool Process(string hocrFile, XDocument hocrXml) { var text = new StringBuilder(); XNamespace ns = "http://www.w3.org/1999/xhtml"; foreach (var paragraph in hocrXml.Descendants(ns + "p")) { - var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line"); - foreach (var line in lines) + var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line"); + foreach (var line in lines) { var words = line.Elements(ns + "span").Select(span => span.Value); text.AppendJoin(' ', words); @@ -33,5 +33,6 @@ public void Process(string hocrFile, XDocument hocrXml) } var ocrFile = Path.Join(OcrDirectory, Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_OCR.asc")); File.WriteAllText(ocrFile, text.ToString().Trim()); + return false; } } diff --git a/Processor.cs b/Processor.cs index e161089..673f6be 100644 --- a/Processor.cs +++ b/Processor.cs @@ -2,6 +2,7 @@ using System.Runtime.InteropServices; using System.Text; using System.Text.RegularExpressions; +using System.Xml; using System.Xml.Linq; using Flurl.Http; @@ -50,10 +51,12 @@ public async Task ProcessSinglePage(Uri fileUri, MicroservicePageOptions Directory.CreateDirectory(Jp2Directory); await File.WriteAllBytesAsync(Path.Join(Jp2Directory, Path.GetFileName(fileUri.LocalPath)), sourceFile); await ConvertJp2sToJpgs(); - var page = await SendSinglePageToTranskribus(options); + var page = await SendSinglePageToTranskribus(options); await GetSinglePageTranskribusAltoXml(page); await ConvertAltoToHocr(); - ProcessHocrXml(new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath))); + ProcessHocrXml( + new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)), + new FontSizeCalculator(double.TryParse(Config["PIXEL_TO_FONTSIZE_CONVERSION"], out var factor) ? factor : 0.1)); var hocrFile = Directory.EnumerateFiles(HocrDirectory).Single(); return await File.ReadAllBytesAsync(hocrFile); } @@ -110,14 +113,14 @@ public async Task CheckProgress(IdCrudOptions options) DeleteDirectoryIfExists(OcrDirectory); } } - + public async Task CreateOcrDatastreamsFromHocr(OcrOptions options) { string pidFilePath = null; try { - pidFilePath = options.PidFile is null - ? await GetPagePids(options, options.Pid) + pidFilePath = options.PidFile is null + ? await GetPagePids(options, options.Pid) : Path.GetFullPath(options.PidFile); await GetHocrDatastreams(options, pidFilePath); ProcessHocrXml(new OcrGenerator(OcrDirectory)); @@ -401,9 +404,22 @@ void ProcessHocrXml(params IHocrXmlProcessor[] processors) foreach (var hocrFile in Directory.EnumerateFiles(HocrDirectory)) { var xml = XDocument.Load(hocrFile); + var xmlModified = false; foreach (var processor in processors) { - processor.Process(hocrFile, xml); + xmlModified |= processor.Process(hocrFile, xml); + } + if (xmlModified) + { + using var fileStream = File.Open(hocrFile, FileMode.Create); + var writer = XmlWriter.Create(fileStream, new XmlWriterSettings + { + // need to specify false here to stop it from emitting a byte order mark + Encoding = new UTF8Encoding(false), + Indent = true + }); + xml.Save(writer); + writer.Close(); } } } From 6412a84408502680d6911dbfc991daad849ac3bd Mon Sep 17 00:00:00 2001 From: Chrysanthemum Lovelace Date: Fri, 19 Sep 2025 13:31:16 -0400 Subject: [PATCH 2/4] try swapping x coords --- FontSizeCalculator.cs | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/FontSizeCalculator.cs b/FontSizeCalculator.cs index 3d6b53a..48c6d40 100644 --- a/FontSizeCalculator.cs +++ b/FontSizeCalculator.cs @@ -16,15 +16,38 @@ public bool Process(string hocrFile, XDocument hocrXml) var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line"); foreach (var line in lines) { - var lineAttributes = line.Attribute("title").Value; - var bbox = lineAttributes.Split(';') - .Select(x => x.Trim()) - .Where(x => x.StartsWith("bbox")) - .First() - .Split(' '); - var pixelHeight = int.Parse(bbox[4]) - int.Parse(bbox[2]); - var fontSize = Math.Floor(pixelHeight * PixelToFontSizeConversionFactor); - line.Attribute("title").SetValue(lineAttributes + $"; x_fsize {fontSize}"); + var words = line.Elements(ns + "span").ToList(); + for (int i = 0; i < words.Count - 1; i++) + { + var leftWord = words[i]; + var rightWord = words[i + 1]; + var leftBbox = leftWord.Attribute("title").Value.Split(';') + .Select(x => x.Trim()) + .First(x => x.StartsWith("bbox")) + .Split(' '); + var rightBbox = rightWord.Attribute("title").Value.Split(';') + .Select(x => x.Trim()) + .First(x => x.StartsWith("bbox")) + .Split(' '); + var leftEnd = int.Parse(leftBbox[3]); + var rightStart = int.Parse(rightBbox[1]); + if (leftEnd > rightStart) + { + leftBbox[3] = rightStart.ToString(); + rightBbox[1] = leftEnd.ToString(); + leftWord.Attribute("title").SetValue(string.Join(' ', leftBbox)); + rightWord.Attribute("title").SetValue(string.Join(' ', rightBbox)); + } + } + + // var lineAttributes = line.Attribute("title").Value; + // var bbox = lineAttributes.Split(';') + // .Select(x => x.Trim()) + // .First(x => x.StartsWith("bbox")) + // .Split(' '); + // var pixelHeight = int.Parse(bbox[4]) - int.Parse(bbox[2]); + // var fontSize = Math.Floor(pixelHeight * PixelToFontSizeConversionFactor); + // line.Attribute("title").SetValue(lineAttributes + $"; x_fsize {fontSize}"); } } return true; From 803b9dd0e7199e2426fee750ce84301ebb1e8577 Mon Sep 17 00:00:00 2001 From: Chrysanthemum Lovelace Date: Tue, 23 Sep 2025 14:32:05 -0400 Subject: [PATCH 3/4] try adjusting bboxes based on char length --- FontSizeCalculator.cs | 55 ------------------------------------------- Processor.cs | 2 +- WordAlignmentFixer.cs | 46 ++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 56 deletions(-) delete mode 100644 FontSizeCalculator.cs create mode 100644 WordAlignmentFixer.cs diff --git a/FontSizeCalculator.cs b/FontSizeCalculator.cs deleted file mode 100644 index 48c6d40..0000000 --- a/FontSizeCalculator.cs +++ /dev/null @@ -1,55 +0,0 @@ -using System.Xml.Linq; - -class FontSizeCalculator(double pixelToFontSizeConversionFactor) : IHocrXmlProcessor -{ - public double PixelToFontSizeConversionFactor { get; set; } = pixelToFontSizeConversionFactor; - - public void Init() - { - } - - public bool Process(string hocrFile, XDocument hocrXml) - { - XNamespace ns = "http://www.w3.org/1999/xhtml"; - foreach (var paragraph in hocrXml.Descendants(ns + "p")) - { - var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line"); - foreach (var line in lines) - { - var words = line.Elements(ns + "span").ToList(); - for (int i = 0; i < words.Count - 1; i++) - { - var leftWord = words[i]; - var rightWord = words[i + 1]; - var leftBbox = leftWord.Attribute("title").Value.Split(';') - .Select(x => x.Trim()) - .First(x => x.StartsWith("bbox")) - .Split(' '); - var rightBbox = rightWord.Attribute("title").Value.Split(';') - .Select(x => x.Trim()) - .First(x => x.StartsWith("bbox")) - .Split(' '); - var leftEnd = int.Parse(leftBbox[3]); - var rightStart = int.Parse(rightBbox[1]); - if (leftEnd > rightStart) - { - leftBbox[3] = rightStart.ToString(); - rightBbox[1] = leftEnd.ToString(); - leftWord.Attribute("title").SetValue(string.Join(' ', leftBbox)); - rightWord.Attribute("title").SetValue(string.Join(' ', rightBbox)); - } - } - - // var lineAttributes = line.Attribute("title").Value; - // var bbox = lineAttributes.Split(';') - // .Select(x => x.Trim()) - // .First(x => x.StartsWith("bbox")) - // .Split(' '); - // var pixelHeight = int.Parse(bbox[4]) - int.Parse(bbox[2]); - // var fontSize = Math.Floor(pixelHeight * PixelToFontSizeConversionFactor); - // line.Attribute("title").SetValue(lineAttributes + $"; x_fsize {fontSize}"); - } - } - return true; - } -} \ No newline at end of file diff --git a/Processor.cs b/Processor.cs index 673f6be..17145ac 100644 --- a/Processor.cs +++ b/Processor.cs @@ -56,7 +56,7 @@ public async Task ProcessSinglePage(Uri fileUri, MicroservicePageOptions await ConvertAltoToHocr(); ProcessHocrXml( new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)), - new FontSizeCalculator(double.TryParse(Config["PIXEL_TO_FONTSIZE_CONVERSION"], out var factor) ? factor : 0.1)); + new WordAlignmentFixer()); var hocrFile = Directory.EnumerateFiles(HocrDirectory).Single(); return await File.ReadAllBytesAsync(hocrFile); } diff --git a/WordAlignmentFixer.cs b/WordAlignmentFixer.cs new file mode 100644 index 0000000..e050ec3 --- /dev/null +++ b/WordAlignmentFixer.cs @@ -0,0 +1,46 @@ +using System.Text.RegularExpressions; +using System.Xml.Linq; + +partial class WordAlignmentFixer : IHocrXmlProcessor +{ + public void Init() + { + Console.WriteLine("Fixing horizontal word alignment..."); + } + + public bool Process(string hocrFile, XDocument hocrXml) + { + XNamespace ns = "http://www.w3.org/1999/xhtml"; + foreach (var paragraph in hocrXml.Descendants(ns + "p")) + { + var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line"); + foreach (var line in lines) + { + var lineAttributes = line.Attribute("title").Value; + var lineBbox = BboxRegex().Match(lineAttributes).Groups + .Cast() + .Skip(1) + .Select(g => int.Parse(g.Value)) + .ToArray(); + var words = line.Elements(ns + "span"); + // # of characters = sum of word lengths + the spaces between + var lineCharLength = words.Sum(w => w.Value.Length) + words.Count() - 1; + var linePixLength = lineBbox[2] - lineBbox[0]; + var pixPerChar = linePixLength / lineCharLength; + var currentPosition = lineBbox[0]; + foreach (var word in words) + { + var newLeft = currentPosition; + var newRight = currentPosition += word.Value.Length * pixPerChar; + var newBbox = $"bbox {newLeft} {lineBbox[1]} {newRight} {lineBbox[3]}"; + word.Attribute("title").SetValue(BboxRegex().Replace(word.Attribute("title").Value, newBbox)); + currentPosition += pixPerChar; + } + } + } + return true; + } + + [GeneratedRegex(@"bbox (\d+) (\d+) (\d+) (\d+)")] + private static partial Regex BboxRegex(); +} \ No newline at end of file From d283ce0bdb1642ed8512bd4e173e61e76caeacb1 Mon Sep 17 00:00:00 2001 From: Chrysanthemum Lovelace Date: Thu, 25 Sep 2025 14:53:19 -0400 Subject: [PATCH 4/4] version number increase --- transkribus-process.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transkribus-process.csproj b/transkribus-process.csproj index b0e856c..d9f8d2f 100644 --- a/transkribus-process.csproj +++ b/transkribus-process.csproj @@ -5,7 +5,7 @@ net8.0 transkribus_process enable - 1.1.0 + 1.2.0