|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using System.Diagnostics; |
| 4 | +using System.IO; |
| 5 | +using System.IO.Compression; |
| 6 | +using System.Linq; |
| 7 | +using System.Text.RegularExpressions; |
| 8 | +using System.Threading.Tasks; |
| 9 | +using System.Xml.Linq; |
| 10 | +using DocSharp.Xml; |
| 11 | +using DocumentFormat.OpenXml; |
| 12 | +using DocumentFormat.OpenXml.Packaging; |
| 13 | +using DocumentFormat.OpenXml.Wordprocessing; |
| 14 | +using EpubCore; |
| 15 | +using HtmlToOpenXml; |
| 16 | + |
| 17 | +namespace DocSharp.Docx; |
| 18 | + |
| 19 | +// NOTE: internal links currently don't work due to the Html2OpenXml library not creating the necessary bookmarks |
| 20 | +// for HTML ids. |
| 21 | + |
/// <summary>
/// Basic experimental EPUB to DOCX converter that performs the following steps:
/// 1. Reads the EPUB using EpubCore to get the chapters in reading order.
/// 2. Extracts the EPUB into a temp folder.
/// 3. For each chapter, replaces image sources with absolute URIs,
///    attempts to fix links to other chapters,
///    and moves CSS styles inline using the PreMailer library.
/// 4. Converts HTML to DOCX using the HtmlToOpenXml library and appends it to the DOCX document.
/// 5. Deletes the temp directory.
/// </summary>
internal class EpubToDocxConverter : IBinaryToDocxConverter
{
    /// <summary>
    /// Id assigned to the bookmark of the first chapter; following chapters get consecutive ids.
    /// A sequential counter guarantees that two chapter bookmarks never share the same id.
    /// </summary>
    private const int FirstChapterBookmarkId = 100000;

    /// <summary>
    /// If true, only the "core" chapters will get converted.
    /// The default is false, thus including cover, table of contents and other transition pages in the output document.
    /// </summary>
    public bool ChaptersOnly { get; set; } = false;

    /// <summary>
    /// If true, adds a page break after each chapter.
    /// </summary>
    public bool PageBreakAfterChapters { get; set; } = true;

    /// <summary>
    /// If true, the converter will attempt to preserve CSS styles embedded in the eBook and create equivalents in the output DOCX document.
    /// The default is true, set to false to produce a more minimal document if unexpected/undesired formatting is present.
    /// </summary>
    public bool PreserveCssStyles { get; set; } = true;

    /// <summary>
    /// The page width in millimeters. If not set, the default page size is A4 (210 x 297 mm) for regions using metric units and Letter (8.5 x 11 inches) for regions using imperial units.
    /// </summary>
    public int PageWidth { get; set; } = -1;

    /// <summary>
    /// The page height in millimeters. If not set, the default page size is A4 (210 x 297 mm) for regions using metric units and Letter (8.5 x 11 inches) for regions using imperial units.
    /// </summary>
    public int PageHeight { get; set; } = -1;

    /// <summary>
    /// The page left margin in millimeters. If not set, the default page margins are top = 25 mm, left/right/bottom = 20 mm
    /// </summary>
    public int PageLeftMargin { get; set; } = -1;

    /// <summary>
    /// The page top margin in millimeters. If not set, the default page margins are top = 25 mm, left/right/bottom = 20 mm
    /// </summary>
    public int PageTopMargin { get; set; } = -1;

    /// <summary>
    /// The page right margin in millimeters. If not set, the default page margins are top = 25 mm, left/right/bottom = 20 mm
    /// </summary>
    public int PageRightMargin { get; set; } = -1;

    /// <summary>
    /// The page bottom margin in millimeters. If not set, the default page margins are top = 25 mm, left/right/bottom = 20 mm
    /// </summary>
    public int PageBottomMargin { get; set; } = -1;

    /// <summary>
    /// Converts the EPUB contained in <paramref name="input"/> and writes the result into
    /// <paramref name="targetDocument"/>, replacing any existing children of its main document element.
    /// </summary>
    /// <param name="input">Stream containing the EPUB archive. It is read by EpubCore (left open) and then extracted as a ZIP archive.</param>
    /// <param name="targetDocument">The DOCX document that receives the converted chapters.</param>
    /// <exception cref="SystemException">The temp directory used for extraction could not be created.</exception>
    public async Task BuildDocxAsync(Stream input, WordprocessingDocument targetDocument)
    {
        // Read EPUB
        var book = EpubReader.Read(input, leaveOpen: true);

        // Get chapters (or all html pages including cover and table of contents),
        // depending on the ChaptersOnly property.
        // The sequence is materialized once because it is enumerated twice below
        // (file name list + conversion loop); a deferred query would fetch every chapter twice.
        var chapters = (ChaptersOnly
            ? book.TableOfContents.Select(chapter => book.FetchHtmlFileForChapter(chapter))
            : book.SpecialResources.HtmlInReadingOrder).ToList();
        var chapterFileNames = chapters.Select(file => file.FileName).ToList();

        var tempDir = CreateTempDirectory();
        try
        {
            // Extract EPUB to temp directory
            ZipFile.ExtractToDirectory(input, tempDir, overwriteFiles: true);

            // Initialize document
            var mainPart = targetDocument.MainDocumentPart ?? targetDocument.AddMainDocumentPart();
            mainPart.Document ??= new Document();
            mainPart.Document.RemoveAllChildren();
            var body = mainPart.Document.AppendChild(new Body());

            // Initialize HTML to DOCX converter
            var converter = new HtmlConverter(mainPart)
            {
                ImageProcessing = ImageProcessingMode.Embed,
                SupportsAnchorLinks = true,
                SupportsHeadingNumbering = true
            };

            int nextBookmarkId = FirstChapterBookmarkId;

            // Enumerate chapters
            foreach (var chapter in chapters)
            {
                // Get chapter file name and XHTML content
                var fileName = chapter.FileName;
                var htmlContent = chapter.TextContent;

                // Attempt to fix external images sources and links pointing to other chapters.
                var normalizedHtml = HtmlUtils.NormalizeHtml(htmlContent, tempDir, chapterFileNames);

                // HtmlToOpenXml can load external images, while styles should be moved inline:
                // https://github.com/onizet/html2openxml/wiki/Style
                // Move styles inline using the PreMailer.Net library, unless style conversion is disabled.
                var htmlWithInlinedCss = PreserveCssStyles
                    ? InlineCss(normalizedHtml, tempDir)
                    : normalizedHtml;

                // Before each chapter, add a bookmark in DOCX to make internal links work.
                string anchorName = $"_{fileName.Replace(' ', '_')}";
                string bookmarkId = nextBookmarkId.ToString(System.Globalization.CultureInfo.InvariantCulture);
                nextBookmarkId++;
                body.AppendChild(new Paragraph([
                    new BookmarkStart() { Name = anchorName, Id = bookmarkId },
                    new BookmarkEnd() { Id = bookmarkId }
                ]));

                // Parse the HTML body, convert to Open XML and append to the DOCX.
                await converter.ParseBody(htmlWithInlinedCss);

                if (PageBreakAfterChapters)
                {
                    // Add a page break after each chapter if desired.
                    body.AppendChild(new Paragraph(new Run(new Break() { Type = BreakValues.Page })));
                }
            }

            // Add default section properties
            body.AppendChild(CreateSectionProperties());

            if (targetDocument.CanSave)
            {
                targetDocument.Save();
            }
        }
        finally
        {
            // Clear temp folder. Failures are only reported to the debug output and otherwise ignored,
            // so that a cleanup problem never hides the conversion result (or the original exception).
            try
            {
                Directory.Delete(tempDir, true);
            }
            catch (Exception ex)
            {
                Debug.WriteLine($"EPUB to DOCX: Unable to delete temp folder \"{tempDir}\". Details: {ex.Message}");
            }
        }
    }

    /// <summary>
    /// Creates a uniquely named directory under the system temp path and returns its path,
    /// terminated by a directory separator.
    /// </summary>
    /// <exception cref="SystemException">The directory could not be created; the cause is available as inner exception.</exception>
    private static string CreateTempDirectory()
    {
        var tempDir = Path.Combine(Path.GetTempPath(), "epub_extract_" + Path.GetRandomFileName());
        if (!tempDir.EndsWith(Path.DirectorySeparatorChar))
        {
            // Add the final slash, as the PreMailer library has issues in finding resources.
            tempDir += Path.DirectorySeparatorChar;
        }

        try
        {
            Directory.CreateDirectory(tempDir);
        }
        catch (Exception ex)
        {
            // Keep the original exception as InnerException so the root cause is not lost.
            throw new SystemException($"Unable to create a temp directory. This step is necessary for EPUB processing. Details: {ex.Message}", ex);
        }

        return tempDir;
    }

    /// <summary>
    /// Moves CSS styles inline using PreMailer.Net, resolving resources relative to <paramref name="baseDirectory"/>.
    /// If inlining fails, the error is written to the debug output and the input HTML is returned unchanged.
    /// </summary>
    private static string InlineCss(string html, string baseDirectory)
    {
        try
        {
            var inliner = new PreMailer.Net.PreMailer(html, new Uri(baseDirectory));
            var inlinerResult = inliner.MoveCssInline(
                removeStyleElements: true,
                stripIdAndClassAttributes: true
            );
            return inlinerResult.Html;
        }
        catch (Exception ex)
        {
            Debug.WriteLine($"Error during style inlining. Details: {ex.Message}");
            return html;
        }
    }

    /// <summary>
    /// Builds the section properties (page size and margins) from the Page* properties,
    /// falling back to the defaults provided by DocumentSettingsHelpers for values that are not set.
    /// </summary>
    private SectionProperties CreateSectionProperties()
    {
        return new SectionProperties(
            new PageSize()
            {
                Width = (uint)(PageWidth > 0 ? UnitMetricHelper.ConvertToTwips(PageWidth, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageWidth()),
                Height = (uint)(PageHeight > 0 ? UnitMetricHelper.ConvertToTwips(PageHeight, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageHeight()),
            },
            new PageMargin()
            {
                // Notes:
                // - PageMargin uses uint for Left and Right margins, and int for top and bottom (enforced by Open XML SDK)
                // - 0 is allowed for margins but not recommended
                Left = (uint)(PageLeftMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageLeftMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageLeftMargin()),
                Right = (uint)(PageRightMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageRightMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageRightMargin()),
                Top = (int)(PageTopMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageTopMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageTopMargin()),
                Bottom = (int)(PageBottomMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageBottomMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageBottomMargin()),
            });
    }
}
0 commit comments