Skip to content
This repository was archived by the owner on Jan 5, 2026. It is now read-only.

Commit e1cc7e7

Browse files
author
Lowell Stewart
committed
emulate Word's treatment of xml:space="preserve"
(when converting DOCX content to Unicode strings)
1 parent e98c87c commit e1cc7e7

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

OpenXmlPowerTools.Tests/UnicodeMapperTests.cs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,40 @@ public void CanStringifySymbols()
142142
Assert.Equal(symFromChar1.ToString(SaveOptions.None), symFromChar2.ToString(SaveOptions.None));
143143
Assert.Equal(symFromChar1.ToString(SaveOptions.None), symFromChar3.ToString(SaveOptions.None));
144144
}
145+
146+
[Fact]
public void HonorsXmlSpace()
{
    // Expected text: trailing whitespace survives only in runs whose w:t carries
    // xml:space="preserve"; the multi-line w:t collapses to "Line 1! Line 2!".
    const string expectedText = @"The following space is retained: but this one is not:. Similarly these two lines should have only a space between them: Line 1! Line 2!";

    // Locate the last paragraph of the fixture document.
    XElement lastParagraph = XDocument.Parse(PreserveSpacingXmlString)
        .Descendants(W.p)
        .Last();

    // Convert every run to its string form and concatenate the results in document order.
    var runStrings = lastParagraph.Descendants(W.r).Select(UnicodeMapper.RunToString);
    string actualText = runStrings.StringConcatenate();

    Assert.Equal(expectedText, actualText);
}
156+
157+
// XML fixture parsed by HonorsXmlSpace: a single w:p holding four w:r runs.
// The first and third w:t elements carry xml:space="preserve"; the second and
// fourth do not, and the fourth spreads its text over several lines.
private const string PreserveSpacingXmlString =
158+
@"<w:document xmlns:w=""http://schemas.openxmlformats.org/wordprocessingml/2006/main"">
159+
<w:body>
160+
<w:p>
161+
<w:r>
162+
<w:t xml:space=""preserve"">The following space is retained: </w:t>
163+
</w:r>
164+
<w:r>
165+
<w:t>but this one is not: </w:t>
166+
</w:r>
167+
<w:r>
168+
<w:t xml:space=""preserve"">. Similarly these two lines should have only a space between them: </w:t>
169+
</w:r>
170+
<w:r>
171+
<w:t>
172+
Line 1!
173+
Line 2!
174+
</w:t>
175+
</w:r>
176+
</w:p>
177+
</w:body>
178+
</w:document>";
145179
}
146180
}
147181

OpenXmlPowerTools/UnicodeMapper.cs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,12 @@ public static string RunToString(XElement element)
6969

7070
// For w:t elements, we obviously want the element's value.
7171
if (element.Name == W.t)
72-
return (string) element;
72+
{
73+
// Emulate Word's handling of the xml:space attribute on text elements
74+
XAttribute spaceAttribute = element.Attribute(XNamespace.Xml + "space");
75+
string space = spaceAttribute != null ? spaceAttribute.Value : null;
76+
return space == "preserve" ? (string)element : IgnoreTextSpacing((string)element);
77+
}
7378

7479
// Turn elements representing special characters into their corresponding
7580
// unicode characters.
@@ -125,6 +130,22 @@ public static string RunToString(XElement element)
125130
return StartOfHeading.ToString();
126131
}
127132

133+
/// <summary>
/// Produces the text for a w:t element that does NOT carry xml:space="preserve",
/// emulating how Word treats such elements.
/// </summary>
/// <param name="text">The entire content of the w:t element.</param>
/// <returns>The content with each run of line-break characters replaced by one
/// space and with leading and trailing whitespace of the whole string removed.
/// Whitespace inside an individual line is left as it is.</returns>
private static string IgnoreTextSpacing(string text)
{
    // Carriage returns and line feeds both count as line breaks; empty segments
    // are dropped so that "\r\n" or consecutive blank lines yield one separator.
    char[] lineBreakChars = { '\n', '\r' };
    string[] segments = text.Split(lineBreakChars, StringSplitOptions.RemoveEmptyEntries);

    // Line breaks are replaced with a single space.
    string joined = string.Join(" ", segments);

    // Whitespace at the beginning and end of the entire string is ignored.
    return joined.Trim();
}
148+
128149
/// <summary>
129150
/// Translate a symbol into a Unicode character, using the specified w:font attribute
130151
/// value and unicode value (represented by the w:sym element's w:char attribute),

0 commit comments

Comments
 (0)