diff --git a/PortablePorts/NReadability/NReadabilityTranscoder.cs b/PortablePorts/NReadability/NReadabilityTranscoder.cs index f7d0125..c6b0847 100644 --- a/PortablePorts/NReadability/NReadabilityTranscoder.cs +++ b/PortablePorts/NReadability/NReadabilityTranscoder.cs @@ -136,7 +136,7 @@ private class LinkData private static readonly Regex _MailtoHrefRegex = new Regex("^\\s*mailto\\s*:", RegexOptions.IgnoreCase); private static readonly Regex _TitleWhitespacesCleanUpRegex = new Regex("\\s+"); - private static readonly Dictionary _articleContentElementHints = new Dictionary + private readonly Dictionary _articleContentElementHints = new Dictionary { { new Regex("^https?://(www|mobile)\\.theverge.com", RegexOptions.IgnoreCase), ".entry-body" }, { new Regex("^https?://(www|blog)\\.bufferapp.com", RegexOptions.IgnoreCase), ".post" }, @@ -146,7 +146,8 @@ private class LinkData { new Regex("^https?://(m\\.|www\\.)?youtube.com", RegexOptions.IgnoreCase), "#watch-description-content" }, { new Regex("^https?://(www.)?vimeo.com", RegexOptions.IgnoreCase), ".description_wrapper" }, { new Regex("^https?://(www.)?zdnet.com", RegexOptions.IgnoreCase), ".storyBody" }, - { new Regex("^https?://(www.)?recode.net", RegexOptions.IgnoreCase), ".postarea" } + { new Regex("^https?://(www.)?recode.net", RegexOptions.IgnoreCase), ".postarea" }, + { new Regex("^https?://(.*\\.)?slashdot.org", RegexOptions.IgnoreCase), ".body" } }; #endregion @@ -172,20 +173,49 @@ private class LinkData private Func _anchorHrefTransformer; private Func _metaTransformer; - #endregion - - #region Constructor(s) + #endregion + + #region Constructor(s) + + /// + /// Initializes a new instance of NReadabilityTranscoder. Allows setting all options. + /// + /// Determines whether elements that are unlikely to be a part of main content will be removed. + /// Determines whether spaces in InnerText properties of elements will be normalized automatically (eg. whether double spaces will be replaced with single spaces). 
+ /// Determines whether 'weight-class' algorithm will be used when cleaning content. + /// Styling for the extracted article. + /// Margin for the extracted article. + /// Font size for the extracted article. + /// Hints for sites with difficult to find article bodies, in form of a dictionary of a regex matching the url of the content and the selector for the element containing the article + public NReadabilityTranscoder( + bool dontStripUnlikelys, + bool dontNormalizeSpacesInTextContent, + bool dontWeightClasses, + ReadingStyle readingStyle, + ReadingMargin readingMargin, + ReadingSize readingSize, + IDictionary articleElementHints) + : this(dontStripUnlikelys, dontNormalizeSpacesInTextContent, dontWeightClasses, readingStyle, readingMargin, readingSize) + { + if (articleElementHints != null) + { + foreach (var kvp in articleElementHints) + { + _articleContentElementHints[kvp.Key] = kvp.Value; + } + } + } - /// - /// Initializes a new instance of NReadabilityTranscoder. Allows setting all options. - /// - /// Determines whether elements that are unlikely to be a part of main content will be removed. - /// Determines whether spaces in InnerText properties of elements will be normalized automatically (eg. whether double spaces will be replaced with single spaces). - /// Determines whether 'weight-class' algorithm will be used when cleaning content. - /// Styling for the extracted article. - /// Margin for the extracted article. - /// Font size for the extracted article. - public NReadabilityTranscoder( + /// + /// Initializes a new instance of NReadabilityTranscoder. Allows setting all options. + /// + /// Determines whether elements that are unlikely to be a part of main content will be removed. + /// Determines whether spaces in InnerText properties of elements will be normalized automatically (eg. whether double spaces will be replaced with single spaces). + /// Determines whether 'weight-class' algorithm will be used when cleaning content. 
+ /// Styling for the extracted article. + /// Margin for the extracted article. + /// Font size for the extracted article. + public NReadabilityTranscoder( bool dontStripUnlikelys, bool dontNormalizeSpacesInTextContent, bool dontWeightClasses, @@ -1928,7 +1958,7 @@ private static XElement TryFindArticleContentElement(XDocument document, string } } - private static string GetArticleContentElementHint(string url) + private string GetArticleContentElementHint(string url) { if (string.IsNullOrEmpty(url)) { diff --git a/ReadSharp/Models/TranscoderOptions.cs b/ReadSharp/Models/TranscoderOptions.cs new file mode 100644 index 0000000..96a741a --- /dev/null +++ b/ReadSharp/Models/TranscoderOptions.cs @@ -0,0 +1,18 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace ReadSharp.Models +{ + public class TranscoderOptions + { + /// + /// A dictionary of url matching regex as key to html element selector as value that represents hints for the transcoder to be able to find the actual article content within downloaded HTML + /// + public IDictionary ArticleElementHints { get; set; } + + public TranscoderOptions() + { + ArticleElementHints = new Dictionary(); + } + } +} diff --git a/ReadSharp/ReadSharp.csproj b/ReadSharp/ReadSharp.csproj index afc7735..5572bd9 100644 --- a/ReadSharp/ReadSharp.csproj +++ b/ReadSharp/ReadSharp.csproj @@ -86,6 +86,7 @@ + diff --git a/ReadSharp/Reader.cs b/ReadSharp/Reader.cs index fa22cfb..3bfede8 100644 --- a/ReadSharp/Reader.cs +++ b/ReadSharp/Reader.cs @@ -10,6 +10,7 @@ using System.Threading; using System.Threading.Tasks; using System.Xml.Linq; +using ReadSharp.Models; namespace ReadSharp @@ -63,26 +64,23 @@ public class Reader : IReader /// Initializes a new instance of the class. /// /// The HTTP options. 
- public Reader(HttpOptions options = null) - { - // initialize transcoder - _transcoder = new NReadabilityTranscoder( - dontStripUnlikelys: false, - dontNormalizeSpacesInTextContent: true, - dontWeightClasses: false, - readingStyle: ReadingStyle.Ebook, - readingMargin: ReadingMargin.Narrow, - readingSize: ReadingSize.Medium - ); - - // get default HTTP options if none available + public Reader(HttpOptions options = null, TranscoderOptions transcoderOptions = null) + { + // get default HTTP options if none available if (options == null) { options = HttpOptions.CreateDefault(); } + if (transcoderOptions == null) + { + transcoderOptions = new TranscoderOptions(); + } + _options = options; + _transcoder = CreateTranscoder(transcoderOptions); + // initialize custom encoder _encoder = new Encodings.Encoder(true); @@ -113,8 +111,7 @@ public Reader(HttpOptions options = null) } - - /// + /// /// Reads article content from the given URI. /// /// An URI to extract the content from. @@ -245,7 +242,18 @@ protected TranscodingResult ExtractReadableInformation( return _transcoder.Transcode(transcodingInput); } - + private static NReadabilityTranscoder CreateTranscoder(TranscoderOptions transcoderOptions) + { + return new NReadabilityTranscoder( + dontStripUnlikelys: false, + dontNormalizeSpacesInTextContent: true, + dontWeightClasses: false, + readingStyle: ReadingStyle.Ebook, + readingMargin: ReadingMargin.Narrow, + readingSize: ReadingSize.Medium, + articleElementHints: transcoderOptions.ArticleElementHints + ); + } /// /// Reverses the deep links.