Skip to content
This repository was archived by the owner on May 11, 2020. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 46 additions & 16 deletions PortablePorts/NReadability/NReadabilityTranscoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ private class LinkData
private static readonly Regex _MailtoHrefRegex = new Regex("^\\s*mailto\\s*:", RegexOptions.IgnoreCase);
private static readonly Regex _TitleWhitespacesCleanUpRegex = new Regex("\\s+");

private static readonly Dictionary<Regex, string> _articleContentElementHints = new Dictionary<Regex, string>
private readonly Dictionary<Regex, string> _articleContentElementHints = new Dictionary<Regex, string>
{
{ new Regex("^https?://(www|mobile)\\.theverge.com", RegexOptions.IgnoreCase), ".entry-body" },
{ new Regex("^https?://(www|blog)\\.bufferapp.com", RegexOptions.IgnoreCase), ".post" },
Expand All @@ -146,7 +146,8 @@ private class LinkData
{ new Regex("^https?://(m\\.|www\\.)?youtube.com", RegexOptions.IgnoreCase), "#watch-description-content" },
{ new Regex("^https?://(www.)?vimeo.com", RegexOptions.IgnoreCase), ".description_wrapper" },
{ new Regex("^https?://(www.)?zdnet.com", RegexOptions.IgnoreCase), ".storyBody" },
{ new Regex("^https?://(www.)?recode.net", RegexOptions.IgnoreCase), ".postarea" }
{ new Regex("^https?://(www.)?recode.net", RegexOptions.IgnoreCase), ".postarea" },
{ new Regex("^https?://(.*\\.)?slashdot.org", RegexOptions.IgnoreCase), ".body" }
};

#endregion
Expand All @@ -172,20 +173,49 @@ private class LinkData
private Func<AttributeTransformationInput, AttributeTransformationResult> _anchorHrefTransformer;
private Func<AttributeTransformationInput, AttributeTransformationResult> _metaTransformer;

#endregion

#region Constructor(s)
#endregion

#region Constructor(s)

/// <summary>
/// Initializes a new instance of NReadabilityTranscoder with every option of the six-argument
/// constructor, plus caller-supplied article element hints that are stored next to the built-in ones.
/// </summary>
/// <param name="dontStripUnlikelys">Determines whether elements that are unlikely to be a part of main content will be removed.</param>
/// <param name="dontNormalizeSpacesInTextContent">Determines whether spaces in InnerText properties of elements will be normalized automatically (eg. whether double spaces will be replaced with single spaces).</param>
/// <param name="dontWeightClasses">Determines whether 'weight-class' algorithm will be used when cleaning content.</param>
/// <param name="readingStyle">Styling for the extracted article.</param>
/// <param name="readingMargin">Margin for the extracted article.</param>
/// <param name="readingSize">Font size for the extracted article.</param>
/// <param name="articleElementHints">
/// Extra hints for sites whose article body is hard to locate: each key is a regex matched against the
/// content url, each value is the selector of the element containing the article.
/// May be null, in which case only the built-in hints are kept.
/// </param>
public NReadabilityTranscoder(
    bool dontStripUnlikelys,
    bool dontNormalizeSpacesInTextContent,
    bool dontWeightClasses,
    ReadingStyle readingStyle,
    ReadingMargin readingMargin,
    ReadingSize readingSize,
    IDictionary<Regex, string> articleElementHints)
    : this(dontStripUnlikelys, dontNormalizeSpacesInTextContent, dontWeightClasses, readingStyle, readingMargin, readingSize)
{
    // Nothing to merge when the caller passed no hints.
    if (articleElementHints == null)
    {
        return;
    }

    // The indexer adds a new entry or replaces the value stored under an equal key.
    // NOTE(review): Regex keys presumably compare by reference, so a caller-supplied regex
    // will not replace a built-in entry that merely has the same pattern - confirm if
    // overriding built-in hints is expected to work.
    foreach (KeyValuePair<Regex, string> hint in articleElementHints)
    {
        _articleContentElementHints[hint.Key] = hint.Value;
    }
}

/// <summary>
/// Initializes a new instance of NReadabilityTranscoder. Allows setting all options.
/// </summary>
/// <param name="dontStripUnlikelys">Determines whether elements that are unlikely to be a part of main content will be removed.</param>
/// <param name="dontNormalizeSpacesInTextContent">Determines whether spaces in InnerText properties of elements will be normalized automatically (eg. whether double spaces will be replaced with single spaces).</param>
/// <param name="dontWeightClasses">Determines whether 'weight-class' algorithm will be used when cleaning content.</param>
/// <param name="readingStyle">Styling for the extracted article.</param>
/// <param name="readingMargin">Margin for the extracted article.</param>
/// <param name="readingSize">Font size for the extracted article.</param>
public NReadabilityTranscoder(
/// <summary>
/// Initializes a new instance of NReadabilityTranscoder. Allows setting all options.
/// </summary>
/// <param name="dontStripUnlikelys">Determines whether elements that are unlikely to be a part of main content will be removed.</param>
/// <param name="dontNormalizeSpacesInTextContent">Determines whether spaces in InnerText properties of elements will be normalized automatically (eg. whether double spaces will be replaced with single spaces).</param>
/// <param name="dontWeightClasses">Determines whether 'weight-class' algorithm will be used when cleaning content.</param>
/// <param name="readingStyle">Styling for the extracted article.</param>
/// <param name="readingMargin">Margin for the extracted article.</param>
/// <param name="readingSize">Font size for the extracted article.</param>
public NReadabilityTranscoder(
bool dontStripUnlikelys,
bool dontNormalizeSpacesInTextContent,
bool dontWeightClasses,
Expand Down Expand Up @@ -1928,7 +1958,7 @@ private static XElement TryFindArticleContentElement(XDocument document, string
}
}

private static string GetArticleContentElementHint(string url)
private string GetArticleContentElementHint(string url)
{
if (string.IsNullOrEmpty(url))
{
Expand Down
18 changes: 18 additions & 0 deletions ReadSharp/Models/TranscoderOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace ReadSharp.Models
{
    /// <summary>
    /// Options that are handed to the transcoder when a reader is created.
    /// </summary>
    public class TranscoderOptions
    {
        /// <summary>
        /// Creates the options with an empty set of article element hints.
        /// </summary>
        public TranscoderOptions()
        {
            ArticleElementHints = new Dictionary<Regex, string>();
        }

        /// <summary>
        /// Hints that help the transcoder locate the actual article content within downloaded HTML.
        /// Each key is a regex that matches the url of the content; each value is the selector of the
        /// HTML element that holds the article.
        /// </summary>
        public IDictionary<Regex, string> ArticleElementHints { get; set; }
    }
}
1 change: 1 addition & 0 deletions ReadSharp/ReadSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
<Compile Include="Models\ArticleImage.cs" />
<Compile Include="Models\HttpOptions.cs" />
<Compile Include="Models\ReadOptions.cs" />
<Compile Include="Models\TranscoderOptions.cs" />
<Compile Include="Reader.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Models\Response.cs" />
Expand Down
40 changes: 24 additions & 16 deletions ReadSharp/Reader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;
using ReadSharp.Models;


namespace ReadSharp
Expand Down Expand Up @@ -63,26 +64,23 @@ public class Reader : IReader
/// Initializes a new instance of the <see cref="Reader" /> class.
/// </summary>
/// <param name="options">The HTTP options.</param>
public Reader(HttpOptions options = null)
{
// initialize transcoder
_transcoder = new NReadabilityTranscoder(
dontStripUnlikelys: false,
dontNormalizeSpacesInTextContent: true,
dontWeightClasses: false,
readingStyle: ReadingStyle.Ebook,
readingMargin: ReadingMargin.Narrow,
readingSize: ReadingSize.Medium
);

// get default HTTP options if none available
public Reader(HttpOptions options = null, TranscoderOptions transcoderOptions = null)
{
// get default HTTP options if none available
if (options == null)
{
options = HttpOptions.CreateDefault();
}

if (transcoderOptions == null)
{
transcoderOptions = new TranscoderOptions();
}

_options = options;

_transcoder = CreateTranscoder(transcoderOptions);

// initialize custom encoder
_encoder = new Encodings.Encoder(true);

Expand Down Expand Up @@ -113,8 +111,7 @@ public Reader(HttpOptions options = null)
}



/// <summary>
/// <summary>
/// Reads article content from the given URI.
/// </summary>
/// <param name="uri">An URI to extract the content from.</param>
Expand Down Expand Up @@ -245,7 +242,18 @@ protected TranscodingResult ExtractReadableInformation(
return _transcoder.Transcode(transcodingInput);
}


/// <summary>
/// Builds the transcoder used by the reader: fixed reading defaults (Ebook style, Narrow margin,
/// Medium size) combined with the article element hints taken from the given options.
/// </summary>
/// <param name="transcoderOptions">Options supplying the article element hints; dereferenced without a null check.</param>
/// <returns>A newly constructed <see cref="NReadabilityTranscoder" />.</returns>
private static NReadabilityTranscoder CreateTranscoder(TranscoderOptions transcoderOptions)
{
    // Read the hints first; every other argument is a constant.
    var hints = transcoderOptions.ArticleElementHints;

    var transcoder = new NReadabilityTranscoder(
        dontStripUnlikelys: false,
        dontNormalizeSpacesInTextContent: true,
        dontWeightClasses: false,
        readingStyle: ReadingStyle.Ebook,
        readingMargin: ReadingMargin.Narrow,
        readingSize: ReadingSize.Medium,
        articleElementHints: hints);

    return transcoder;
}

/// <summary>
/// Reverses the deep links.
Expand Down