|
| 1 | +using Nikse.SubtitleEdit.Core.Interfaces; |
| 2 | +using System; |
| 3 | +using System.Collections.Generic; |
| 4 | +using System.Text; |
| 5 | +using System.Text.RegularExpressions; |
| 6 | + |
| 7 | +namespace Nikse.SubtitleEdit.Core |
| 8 | +{ |
/// <summary>
/// Converts between Rich Text Format (RTF) markup and plain text.
/// </summary>
/// <remarks>
/// Translated from Python located at:
/// http://stackoverflow.com/a/188877/448
/// to C# by Chris Benard - http://chrisbenard.net/2014/08/20/Extract-Text-from-RTF-in-.Net
/// </remarks>
public static class RichTextToPlainText
{
    /// <summary>
    /// Optional replacement converter. When this is non-null, <see cref="ConvertToText"/> and
    /// <see cref="ConvertToRtf"/> hand the input to it and return its result instead of using
    /// the built-in regex based implementation.
    /// </summary>
    public static IRtfTextConverter NativeRtfTextConverter { get; set; }

    /// <summary>
    /// Parser state saved when a group opens ("{") and restored when it closes ("}").
    /// </summary>
    private class StackEntry
    {
        /// <summary>The <c>\uc</c> value that was in effect when the group was opened.</summary>
        public int NumberOfCharactersToSkip { get; private set; }

        /// <summary>Whether output was being suppressed when the group was opened.</summary>
        public bool Ignorable { get; private set; }

        public StackEntry(int numberOfCharactersToSkip, bool ignorable)
        {
            NumberOfCharactersToSkip = numberOfCharactersToSkip;
            Ignorable = ignorable;
        }
    }

    // Tokenizer. Capture groups:
    //   1 = control word, 2 = its optional numeric argument, 3 = two hex digits of \'xx,
    //   4 = control symbol (\ followed by a non-letter), 5 = group brace, 6 = any other character.
    // Raw CR/LF runs match the alternative without a capture group and therefore produce nothing.
    private static readonly Regex RtfRegex = new Regex(@"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Control words that start a destination whose content must not appear in the plain text.
    // A hash set (default ordinal string comparison, same as the previous List.Contains) keeps
    // the per-token lookup constant time.
    private static readonly HashSet<string> Destinations = new HashSet<string>
    {
        "aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
        "atnparent","atnref","atntime","atrfend","atrfstart","author","background",
        "bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
        "colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
        "do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
        "fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
        "ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
        "fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
        "footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
        "header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
        "hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
        "leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
        "listoverridetable","listpicture","liststylename","listtable","listtext",
        "lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
        "mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
        "mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
        "mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
        "mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
        "mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
        "mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
        "mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
        "mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
        "mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
        "mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
        "mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
        "mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
        "mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
        "msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
        "msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
        "mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
        "objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
        "oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
        "passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
        "pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
        "result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
        "shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
        "svb","tc","template","themedata","title","txe","ud","upr","userprops",
        "wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
        "xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
        "xmlopen"
    };

    // Control words that translate directly into a fixed piece of output text.
    private static readonly Dictionary<string, string> SpecialCharacters = new Dictionary<string, string>
    {
        { "par", "\n" },
        { "sect", "\n\n" },
        { "page", "\n\n" },
        { "line", "\n" },
        { "tab", "\t" },
        { "emdash", "\u2014" },
        { "endash", "\u2013" },
        { "emspace", "\u2003" },
        { "enspace", "\u2002" },
        { "qmspace", "\u2005" },
        { "bullet", "\u2022" },
        { "lquote", "\u2018" },
        { "rquote", "\u2019" },
        { "ldblquote", "\u201C" },
        { "rdblquote", "\u201D" },
    };

    /// <summary>
    /// Parses the numeric argument of a control word without throwing.
    /// </summary>
    /// <param name="arg">Text captured by the tokenizer; empty when the control word had no argument.</param>
    /// <param name="value">The parsed value, or 0 when parsing failed.</param>
    /// <returns><c>false</c> when the argument is missing, does not fit an <see cref="int"/>, or is not made of ASCII digits.</returns>
    private static bool TryParseArgument(string arg, out int value)
    {
        return int.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out value);
    }

    /// <summary>
    /// Strip RTF tags from RTF text.
    /// </summary>
    /// <param name="inputRtf">RTF formatted text</param>
    /// <returns>
    /// Plain text from RTF, or <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>.
    /// When <see cref="NativeRtfTextConverter"/> is set, its result is returned instead.
    /// </returns>
    internal static string ConvertToText(string inputRtf)
    {
        if (inputRtf == null)
        {
            return null;
        }

        // Use interface converter if available. The result has to be returned; previously it
        // was computed and thrown away, so the native converter never had any effect.
        IRtfTextConverter nativeConverter = NativeRtfTextConverter;
        if (nativeConverter != null)
        {
            return nativeConverter.RtfToText(inputRtf);
        }

        var stack = new Stack<StackEntry>();
        bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
        int ucskip = 1; // Number of ASCII characters to skip after a unicode character.
        int curskip = 0; // Number of ASCII characters left to skip
        var output = new StringBuilder(); // Output buffer.

        foreach (Match match in RtfRegex.Matches(inputRtf))
        {
            string word = match.Groups[1].Value;
            string arg = match.Groups[2].Value;
            string hex = match.Groups[3].Value;
            string character = match.Groups[4].Value;
            string brace = match.Groups[5].Value;
            string tchar = match.Groups[6].Value;

            if (brace.Length > 0)
            {
                curskip = 0;
                if (brace == "{")
                {
                    // Push state
                    stack.Push(new StackEntry(ucskip, ignorable));
                }
                else if (stack.Count > 0)
                {
                    // Pop state
                    StackEntry entry = stack.Pop();
                    ucskip = entry.NumberOfCharactersToSkip;
                    ignorable = entry.Ignorable;
                }
                // else: unbalanced "}" in malformed input - nothing to restore, keep going.
            }
            else if (character.Length > 0) // \x (not a letter)
            {
                curskip = 0;
                if (character == "~")
                {
                    if (!ignorable)
                    {
                        output.Append('\xA0'); // non-breaking space
                    }
                }
                else if ("{}\\".Contains(character))
                {
                    // Escaped literal brace or backslash.
                    if (!ignorable)
                    {
                        output.Append(character);
                    }
                }
                else if (character == "*")
                {
                    // "\*" marks the group as ignorable.
                    ignorable = true;
                }
            }
            else if (word.Length > 0) // \foo
            {
                curskip = 0;
                string replacement;
                if (Destinations.Contains(word))
                {
                    ignorable = true;
                }
                else if (ignorable)
                {
                    // Inside an ignored group: control words produce no output.
                }
                else if (SpecialCharacters.TryGetValue(word, out replacement))
                {
                    output.Append(replacement);
                }
                else if (word == "uc")
                {
                    // A missing or unparsable argument leaves the current value untouched
                    // instead of throwing.
                    int skipCount;
                    if (TryParseArgument(arg, out skipCount))
                    {
                        ucskip = skipCount;
                    }
                }
                else if (word == "u")
                {
                    int codeUnit;
                    if (TryParseArgument(arg, out codeUnit))
                    {
                        // \uN is a signed 16-bit number; negative values wrap around.
                        if (codeUnit < 0)
                        {
                            codeUnit += 0x10000;
                        }

                        if (codeUnit >= 0 && codeUnit <= 0xFFFF)
                        {
                            // Append as a UTF-16 code unit. Characters outside the BMP arrive as
                            // two consecutive \u surrogate halves, which Char.ConvertFromUtf32
                            // rejects with an exception; appending the raw units lets the pair
                            // recombine in the output.
                            output.Append((char)codeUnit);
                        }
                        else if (codeUnit > 0xFFFF && codeUnit <= 0x10FFFF)
                        {
                            output.Append(Char.ConvertFromUtf32(codeUnit));
                        }
                        // Anything else is not a valid code point and is dropped.

                        curskip = ucskip;
                    }
                }
            }
            else if (hex.Length > 0) // \'xx
            {
                if (curskip > 0)
                {
                    curskip -= 1;
                }
                else if (!ignorable)
                {
                    int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture);
                    output.Append((char)c); // two hex digits: always 0x00-0xFF
                }
            }
            else if (tchar.Length > 0)
            {
                if (curskip > 0)
                {
                    curskip -= 1;
                }
                else if (!ignorable)
                {
                    output.Append(tchar);
                }
            }
        }

        return output.ToString();
    }

    /// <summary>
    /// Wraps plain text in a minimal RTF document.
    /// </summary>
    /// <param name="value">Plain text to convert.</param>
    /// <returns>
    /// An RTF document containing <paramref name="value"/>, or an empty string when
    /// <paramref name="value"/> is null, empty or whitespace only.
    /// When <see cref="NativeRtfTextConverter"/> is set, its result is returned instead.
    /// </returns>
    internal static string ConvertToRtf(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return string.Empty;
        }

        // Use interface converter if available. The result has to be returned; previously it
        // was computed and thrown away, so the native converter never had any effect.
        IRtfTextConverter nativeConverter = NativeRtfTextConverter;
        if (nativeConverter != null)
        {
            return nativeConverter.TextToRtf(value);
        }

        // Special RTF chars. The backslash must be escaped first so the backslashes
        // introduced by the following replacements are not escaped again.
        var backslashed = new StringBuilder(value);
        backslashed.Replace(@"\", @"\\");
        backslashed.Replace(@"{", @"\{");
        backslashed.Replace(@"}", @"\}");
        // Only Environment.NewLine sequences are turned into paragraph breaks.
        backslashed.Replace(Environment.NewLine, @"\par" + Environment.NewLine);

        // Convert string char by char: 7-bit ASCII is copied, everything else becomes \uN?
        var sb = new StringBuilder();
        foreach (char character in backslashed.ToString())
        {
            if (character <= 0x7f)
            {
                sb.Append(character);
            }
            else
            {
                // RTF defines N as a signed 16-bit value, so code units above 0x7FFF are
                // written as negative numbers. "?" is the one-character fallback for readers
                // that do not understand \u.
                short signedCodeUnit = unchecked((short)character);
                sb.Append("\\u");
                sb.Append(signedCodeUnit.ToString(System.Globalization.CultureInfo.InvariantCulture));
                sb.Append('?');
            }
        }

        return @"{\rtf1\ansi\ansicpg1252\deff0{\fonttbl\f0\fswiss Helvetica;}\f0\pard " + sb + @"\par" + Environment.NewLine + "}";
    }
}
| 265 | + |
| 266 | +} |
0 commit comments