Skip to content

Commit 77a6325

Browse files
committed
Add experimental EPUB to DOCX converter
1 parent c5848f4 commit 77a6325

File tree

10 files changed

+801
-2
lines changed

10 files changed

+801
-2
lines changed

samples/WpfApp1/WpfApp1.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
<ProjectReference Include="..\..\src\DocSharp.Docx\DocSharp.Docx.csproj" />
2323
<ProjectReference Include="..\..\src\DocSharp.ImageSharp\DocSharp.ImageSharp.csproj" />
2424
<ProjectReference Include="..\..\src\DocSharp.Markdown\DocSharp.Markdown.csproj" />
25+
<ProjectReference Include="..\..\src\DocSharp.Epub\DocSharp.Epub.csproj" />
2526
<ProjectReference Include="..\..\src\DocSharp.Renderer\DocSharp.Renderer.csproj" />
2627
<ProjectReference Include="..\..\src\DocSharp.SystemDrawing\DocSharp.SystemDrawing.csproj" />
2728
</ItemGroup>
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace DocSharp.Xml;
2+
3+
/// <summary>
/// Holds XML namespace URI constants.
/// </summary>
public static class XmlNamespaces
{
    /// <summary>
    /// The XHTML namespace URI (<c>http://www.w3.org/1999/xhtml</c>).
    /// </summary>
    public const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";
}

src/DocSharp.Docx/BinaryToDocxExtensions.cs

Lines changed: 321 additions & 1 deletion
Large diffs are not rendered by default.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
using System;
2+
using System.Globalization;
3+
4+
namespace DocSharp.Docx;
5+
6+
/// <summary>
/// Provides default page size and page margin values, all expressed in twips
/// (1 twip = 1/1440 inch).
/// </summary>
public static class DocumentSettingsHelpers
{
    // A4: 21 x 29.7 cm.
    private const int A4WidthTwips = 11906;
    private const int A4HeightTwips = 16838;

    // Letter: 8.5 x 11 inches (8.5 * 1440 and 11 * 1440).
    private const int LetterWidthTwips = 12240;
    private const int LetterHeightTwips = 15840;

    // 2 cm and 2.5 cm rounded to the nearest twip.
    private const int TwoCentimetersTwips = 1134;
    private const int TwoAndHalfCentimetersTwips = 1417;

    /// <summary>
    /// Gets the default page width in twips. The default page size is A4 (21 x 29.7 cm) for regions using metric units and Letter (8.5 x 11 inches) for regions using imperial units.
    /// </summary>
    /// <returns>11906 (A4) when the current region is metric; otherwise 12240 (Letter).</returns>
    public static int GetDefaultPageWidth()
    {
        return RegionInfo.CurrentRegion.IsMetric ? A4WidthTwips : LetterWidthTwips;
    }

    /// <summary>
    /// Gets the default page height in twips. The default page size is A4 (21 x 29.7 cm) for regions using metric units and Letter (8.5 x 11 inches) for regions using imperial units.
    /// </summary>
    /// <returns>16838 (A4) when the current region is metric; otherwise 15840 (Letter).</returns>
    public static int GetDefaultPageHeight()
    {
        return RegionInfo.CurrentRegion.IsMetric ? A4HeightTwips : LetterHeightTwips;
    }

    /// <summary>
    /// Gets the default page left margin (2 cm) in twips.
    /// </summary>
    /// <returns>1134.</returns>
    public static int GetDefaultPageLeftMargin()
    {
        return TwoCentimetersTwips;
    }

    /// <summary>
    /// Gets the default page top margin (2.5 cm) in twips.
    /// </summary>
    /// <returns>1417.</returns>
    public static int GetDefaultPageTopMargin()
    {
        return TwoAndHalfCentimetersTwips;
    }

    /// <summary>
    /// Gets the default page right margin (2 cm) in twips.
    /// </summary>
    /// <returns>1134.</returns>
    public static int GetDefaultPageRightMargin()
    {
        return TwoCentimetersTwips;
    }

    /// <summary>
    /// Gets the default page bottom margin (2 cm) in twips.
    /// </summary>
    /// <returns>1134.</returns>
    public static int GetDefaultPageBottomMargin()
    {
        return TwoCentimetersTwips;
    }
}

src/DocSharp.Docx/IBinaryToDocxConverter.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.IO;
2+
using System.Threading.Tasks;
23
using DocumentFormat.OpenXml;
34
using DocumentFormat.OpenXml.Packaging;
45

@@ -12,5 +13,5 @@ public interface IBinaryToDocxConverter
1213
/// </summary>
1314
/// <param name="input">The input stream.</param>
1415
/// <param name="targetDocument">The target DOCX document.</param>
15-
void BuildDocx(Stream input, WordprocessingDocument targetDocument);
16+
Task BuildDocxAsync(Stream input, WordprocessingDocument targetDocument);
1617
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFrameworks>net8.0;net9.0;net10.0;net462</TargetFrameworks>
5+
<ImplicitUsings>disable</ImplicitUsings>
6+
<Nullable>enable</Nullable>
7+
<LangVersion>latest</LangVersion>
8+
</PropertyGroup>
9+
10+
<!-- NuGet package -->
11+
<PropertyGroup>
12+
<AssemblyName>DocSharp.Epub</AssemblyName>
13+
<AssemblyTitle>DocSharp.Epub</AssemblyTitle>
14+
<PackageId>DocSharp.Epub</PackageId>
15+
<Title>DocSharp.Epub</Title>
16+
<Description>.NET library for converting documents. The DocSharp.Epub package provides EPUB to DOCX conversion.</Description>
17+
<PackageTags>epub docx convert converter renderer openxml office word epubcore</PackageTags>
18+
</PropertyGroup>
19+
20+
<ItemGroup>
21+
<PackageReference Include="EpubCore" Version="1.6.2" />
22+
<PackageReference Include="HtmlToOpenXml.dll" Version="3.2.8" />
23+
<PackageReference Include="PreMailer.Net" Version="2.7.2" />
24+
</ItemGroup>
25+
26+
<ItemGroup>
27+
<ProjectReference Include="..\DocSharp.Docx\DocSharp.Docx.csproj" />
28+
</ItemGroup>
29+
30+
</Project>
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Diagnostics;
4+
using System.IO;
5+
using System.IO.Compression;
6+
using System.Linq;
7+
using System.Text.RegularExpressions;
8+
using System.Threading.Tasks;
9+
using System.Xml.Linq;
10+
using DocSharp.Xml;
11+
using DocumentFormat.OpenXml;
12+
using DocumentFormat.OpenXml.Packaging;
13+
using DocumentFormat.OpenXml.Wordprocessing;
14+
using EpubCore;
15+
using HtmlToOpenXml;
16+
17+
namespace DocSharp.Docx;
18+
19+
// NOTE: internal links currently don't work due to the Html2OpenXml library not creating the necessary bookmarks
20+
// for HTML ids.
21+
22+
/// <summary>
/// Basic experimental EPUB to DOCX converter that performs the following steps:
/// 1. Reads the EPUB using EpubCore to get the chapters in reading order.
/// 2. Extracts the EPUB into a temp folder.
/// 3. For each chapter, replaces image sources with absolute URIs,
///    attempts to fix links to other chapters,
///    and moves CSS styles inline using the PreMailer library.
/// 4. Converts HTML to DOCX using the HtmlToOpenXml library and appends to the DOCX document.
/// 5. Deletes the temp directory.
/// </summary>
internal class EpubToDocxConverter : IBinaryToDocxConverter
{
    // First id assigned to the per-chapter bookmarks. Ids are handed out sequentially from here
    // so that two chapters can never receive the same bookmark id.
    private const int FirstChapterBookmarkId = 100000;

    /// <summary>
    /// If true, only the "core" chapters will get converted.
    /// The default is false, thus including cover, table of contents and other transition pages in the output document.
    /// </summary>
    public bool ChaptersOnly { get; set; } = false;

    /// <summary>
    /// If true (the default), adds a page break after each chapter.
    /// </summary>
    public bool PageBreakAfterChapters { get; set; } = true;

    /// <summary>
    /// If true, the converter will attempt to preserve CSS styles embedded in the eBook and create equivalents in the output DOCX document.
    /// The default is true, set to false to produce a more minimal document if unexpected/undesired formatting is present.
    /// </summary>
    public bool PreserveCssStyles { get; set; } = true;

    /// <summary>
    /// The page width in millimeters. If zero or negative (the default is -1), the default page size is used:
    /// A4 (210 x 297 mm) for regions using metric units and Letter (8.5 x 11 inches) for regions using imperial units.
    /// </summary>
    public int PageWidth { get; set; } = -1;

    /// <summary>
    /// The page height in millimeters. If zero or negative (the default is -1), the default page size is used:
    /// A4 (210 x 297 mm) for regions using metric units and Letter (8.5 x 11 inches) for regions using imperial units.
    /// </summary>
    public int PageHeight { get; set; } = -1;

    /// <summary>
    /// The page left margin in millimeters. If negative (the default is -1), the default page margins are used: top = 25 mm, left/right/bottom = 20 mm.
    /// </summary>
    public int PageLeftMargin { get; set; } = -1;

    /// <summary>
    /// The page top margin in millimeters. If negative (the default is -1), the default page margins are used: top = 25 mm, left/right/bottom = 20 mm.
    /// </summary>
    public int PageTopMargin { get; set; } = -1;

    /// <summary>
    /// The page right margin in millimeters. If negative (the default is -1), the default page margins are used: top = 25 mm, left/right/bottom = 20 mm.
    /// </summary>
    public int PageRightMargin { get; set; } = -1;

    /// <summary>
    /// The page bottom margin in millimeters. If negative (the default is -1), the default page margins are used: top = 25 mm, left/right/bottom = 20 mm.
    /// </summary>
    public int PageBottomMargin { get; set; } = -1;

    /// <summary>
    /// Converts the EPUB contained in <paramref name="input"/> and writes the result into <paramref name="targetDocument"/>.
    /// Any existing content of the main document part is removed first.
    /// </summary>
    /// <param name="input">The EPUB stream. It is left open by this method.</param>
    /// <param name="targetDocument">The target DOCX document.</param>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="targetDocument"/> is null.</exception>
    /// <exception cref="IOException">The temp directory needed for processing could not be created.</exception>
    public async Task BuildDocxAsync(Stream input, WordprocessingDocument targetDocument)
    {
        if (input is null)
        {
            throw new ArgumentNullException(nameof(input));
        }
        if (targetDocument is null)
        {
            throw new ArgumentNullException(nameof(targetDocument));
        }

        // Read EPUB.
        // NOTE(review): the same stream is handed to EpubReader here and to ZipFile below;
        // presumably this requires a seekable stream - confirm against callers.
        var book = EpubReader.Read(input, leaveOpen: true);

        // Get chapters (or all html pages including cover and table of contents),
        // depending on the ChaptersOnly property.
        // Materialized once: the sequence is used both for the file name list and for the
        // conversion loop, and the ChaptersOnly branch would otherwise fetch every file twice.
        var chapters = (ChaptersOnly
            ? book.TableOfContents.Select(chapter => book.FetchHtmlFileForChapter(chapter))
            : book.SpecialResources.HtmlInReadingOrder).ToList();
        var chapterFileNames = chapters.Select(file => file.FileName).ToList();

        var tempDir = CreateTempDirectory();
        try
        {
            // Extract EPUB to temp directory
            ZipFile.ExtractToDirectory(input, tempDir, overwriteFiles: true);

            // Initialize document
            var mainPart = targetDocument.MainDocumentPart ?? targetDocument.AddMainDocumentPart();
            mainPart.Document ??= new Document();
            mainPart.Document.RemoveAllChildren();
            var body = mainPart.Document.AppendChild(new Body());

            // Initialize HTML to DOCX converter
            var converter = new HtmlConverter(mainPart)
            {
                ImageProcessing = ImageProcessingMode.Embed,
                SupportsAnchorLinks = true,
                SupportsHeadingNumbering = true
            };

            var nextBookmarkId = FirstChapterBookmarkId;
            foreach (var chapter in chapters)
            {
                // Get chapter file name and XHTML content
                var fileName = chapter.FileName;
                var htmlContent = chapter.TextContent;

                // Attempt to fix external images sources and links pointing to other chapters.
                var normalizedHtml = HtmlUtils.NormalizeHtml(htmlContent, tempDir, chapterFileNames);

                // HtmlToOpenXml can load external images, while styles should be moved inline:
                // https://github.com/onizet/html2openxml/wiki/Style
                // Move styles inline using the PreMailer.Net library, unless style conversion is disabled.
                var htmlWithInlinedCss = PreserveCssStyles
                    ? InlineCss(normalizedHtml, tempDir)
                    : normalizedHtml;

                // Before each chapter, add a bookmark in DOCX to make internal links work
                string anchorName = $"_{fileName.Replace(' ', '_')}";
                string bookmarkId = nextBookmarkId.ToString(System.Globalization.CultureInfo.InvariantCulture);
                nextBookmarkId++;
                body.AppendChild(new Paragraph(
                    new BookmarkStart() { Name = anchorName, Id = bookmarkId },
                    new BookmarkEnd() { Id = bookmarkId }));

                // Parse the HTML body, convert to Open XML and append to the DOCX.
                // Library code: nothing after this await needs the caller's synchronization context.
                await converter.ParseBody(htmlWithInlinedCss).ConfigureAwait(false);

                if (PageBreakAfterChapters)
                {
                    // Add a page break after each chapter if desired.
                    body.AppendChild(new Paragraph(new Run(new Break() { Type = BreakValues.Page })));
                }
            }

            // Add default section properties
            body.AppendChild(CreateSectionProperties());

            if (targetDocument.CanSave)
            {
                targetDocument.Save();
            }
        }
        finally
        {
            DeleteTempDirectory(tempDir);
        }
    }

    /// <summary>
    /// Creates a uniquely named directory under the system temp path and returns its path,
    /// including a trailing directory separator.
    /// </summary>
    private static string CreateTempDirectory()
    {
        // Add the final slash, as the PreMailer library has issues in finding resources.
        // (Path.GetRandomFileName never ends with a separator, so it is always appended.)
        var tempDir = Path.Combine(Path.GetTempPath(), "epub_extract_" + Path.GetRandomFileName())
                      + Path.DirectorySeparatorChar;
        try
        {
            Directory.CreateDirectory(tempDir);
        }
        catch (Exception ex)
        {
            // IOException derives from SystemException, so existing catch blocks keep working;
            // the original exception is preserved as the inner exception.
            throw new IOException($"Unable to create a temp directory. This step is necessary for EPUB processing. Details: {ex.Message}", ex);
        }
        return tempDir;
    }

    /// <summary>
    /// Moves CSS styles inline using PreMailer. This is best-effort: if inlining fails,
    /// the input HTML is returned unchanged.
    /// </summary>
    private static string InlineCss(string html, string baseDirectory)
    {
        try
        {
            using var inliner = new PreMailer.Net.PreMailer(html, new Uri(baseDirectory));
            var inlinerResult = inliner.MoveCssInline(
                removeStyleElements: true,
                stripIdAndClassAttributes: true
            );
            return inlinerResult.Html;
        }
        catch (Exception ex)
        {
            // Debug.WriteLine is only compiled into DEBUG builds.
            Debug.WriteLine($"Error during style inlining. Details: {ex.Message}");
            return html;
        }
    }

    /// <summary>
    /// Builds the section properties (page size and margins) from the Page* properties,
    /// falling back to the defaults from <see cref="DocumentSettingsHelpers"/> when a value is not set.
    /// </summary>
    private SectionProperties CreateSectionProperties()
    {
        return new SectionProperties(
            new PageSize()
            {
                Width = (uint)(PageWidth > 0 ? UnitMetricHelper.ConvertToTwips(PageWidth, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageWidth()),
                Height = (uint)(PageHeight > 0 ? UnitMetricHelper.ConvertToTwips(PageHeight, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageHeight()),
            },
            new PageMargin()
            {
                // Notes:
                // - PageMargin uses uint for Left and Right margins, and int for top and bottom (enforced by Open XML SDK)
                // - 0 is allowed for margins but not recommended
                Left = (uint)(PageLeftMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageLeftMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageLeftMargin()),
                Right = (uint)(PageRightMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageRightMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageRightMargin()),
                Top = (int)(PageTopMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageTopMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageTopMargin()),
                Bottom = (int)(PageBottomMargin >= 0 ? UnitMetricHelper.ConvertToTwips(PageBottomMargin, UnitMetric.Millimeter) : DocumentSettingsHelpers.GetDefaultPageBottomMargin()),
            });
    }

    /// <summary>
    /// Deletes the temp directory and its content. Failures are deliberately ignored
    /// (only reported to the debug output) so that cleanup never hides the conversion result.
    /// </summary>
    private static void DeleteTempDirectory(string tempDir)
    {
        try
        {
            Directory.Delete(tempDir, true);
        }
        catch (Exception ex)
        {
            // Debug.WriteLine is only compiled into DEBUG builds.
            Debug.WriteLine($"EPUB to DOCX: Unable to delete temp folder \"{tempDir}\". Details: {ex.Message}");
        }
    }
}

0 commit comments

Comments
 (0)