Skip to content

Commit 5320b32

Browse files
committed
Additional Yandex parsing
1 parent 2979ce0 commit 5320b32

File tree

4 files changed

+71
-51
lines changed

4 files changed

+71
-51
lines changed

SmartImage.Lib 3/Engines/Impl/Search/EHentaiEngine.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -344,23 +344,23 @@ public static EhResult Parse(INode n)
344344
// ReSharper disable InconsistentNaming
345345
var eh = new EhResult();
346346

347-
var gl1c = n.ChildNodes.TryFindElementByClassName("gl1c");
347+
var gl1c = n.ChildNodes.TryFindSingleElementByClassName("gl1c");
348348

349349
if (gl1c is { }) {
350350
if (gl1c.FirstChild is { } t) {
351351
eh.Type = t.TextContent;
352352
}
353353
}
354354

355-
var gl2c = n.ChildNodes.TryFindElementByClassName("gl2c");
355+
var gl2c = n.ChildNodes.TryFindSingleElementByClassName("gl2c");
356356

357357
if (gl2c is { }) {
358358
if (gl2c.ChildNodes[1].ChildNodes[1].ChildNodes[1].ChildNodes[1] is { } div) {
359359
eh.Pages = div.TextContent;
360360
}
361361
}
362362

363-
var gl3c = n.ChildNodes.TryFindElementByClassName("gl3c glname");
363+
var gl3c = n.ChildNodes.TryFindSingleElementByClassName("gl3c glname");
364364

365365
if (gl3c is { }) {
366366
if (gl3c.FirstChild is { } f) {
@@ -394,7 +394,7 @@ public static EhResult Parse(INode n)
394394
}
395395
}
396396

397-
var gl4c = n.ChildNodes.TryFindElementByClassName("gl4c glhide");
397+
var gl4c = n.ChildNodes.TryFindSingleElementByClassName("gl4c glhide");
398398

399399
if (gl4c is { }) {
400400
if (gl4c.ChildNodes[0] is { FirstChild: { } div1 } div1Outer) {

SmartImage.Lib 3/Engines/Impl/Search/YandexEngine.cs

Lines changed: 59 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
using System.Diagnostics;
2+
using System.Dynamic;
23
using AngleSharp.Dom;
34
using AngleSharp.XPath;
45
using Kantan.Net.Utilities;
56
using Kantan.Text;
67
using SmartImage.Lib.Results;
8+
using SmartImage.Lib.Utilities;
79

810
// ReSharper disable SuggestVarOrType_SimpleTypes
911

@@ -27,8 +29,7 @@ public YandexEngine() : base("https://yandex.com/images/search?rpt=imageview&url
2729

2830
private static string GetAnalysis(IDocument doc)
2931
{
30-
if (doc.Body is not { })
31-
{
32+
if (doc.Body is not { }) {
3233
return null;
3334
}
3435

@@ -38,8 +39,7 @@ private static string GetAnalysis(IDocument doc)
3839

3940
nodes.AddRange(nodes2);
4041

41-
if (!nodes.Any())
42-
{
42+
if (!nodes.Any()) {
4343
return null;
4444
}
4545

@@ -52,32 +52,31 @@ private static IEnumerable<SearchResultItem> GetOtherImages(IDocument doc, Searc
5252
{
5353
var tagsItem = doc.Body.SelectNodes(Serialization.S_Yandex_OtherImages);
5454

55-
if (tagsItem == null)
56-
{
55+
if (tagsItem == null) {
5756
return Enumerable.Empty<SearchResultItem>();
5857
}
5958

6059
SearchResultItem Parse(INode siz)
6160
{
62-
string link = siz.FirstChild.TryGetAttribute(Serialization.Atr_href);
61+
string link = siz.FirstChild.TryGetAttribute(Serialization.Atr_href);
6362
string resText = siz.FirstChild.ChildNodes[1].FirstChild.TextContent;
6463

6564
//other-sites__snippet
6665

6766
var snippet = siz.ChildNodes[1];
68-
var title = snippet.FirstChild;
69-
var site = snippet.ChildNodes[1];
70-
var desc = snippet.ChildNodes[2];
67+
var title = snippet.FirstChild;
68+
var site = snippet.ChildNodes[1];
69+
var desc = snippet.ChildNodes[2];
7170

7271
var (w, h) = ParseResolution(resText);
7372

7473
return new SearchResultItem(r)
7574
{
76-
Url = new Uri(link),
77-
Site = site.TextContent,
75+
Url = new Uri(link),
76+
Site = site.TextContent,
7877
Description = title?.TextContent,
79-
Width = w,
80-
Height = h,
78+
Width = w,
79+
Height = h,
8180
};
8281
}
8382

@@ -90,18 +89,15 @@ private static (int? w, int? h) ParseResolution(string resText)
9089

9190
int? w = null, h = null;
9291

93-
if (resFull.Length == 1 && resFull[0] == resText)
94-
{
92+
if (resFull.Length == 1 && resFull[0] == resText) {
9593
const string TIMES_DELIM = "&times;";
9694

97-
if (resText.Contains(TIMES_DELIM))
98-
{
95+
if (resText.Contains(TIMES_DELIM)) {
9996
resFull = resText.Split(TIMES_DELIM);
10097
}
10198
}
10299

103-
if (resFull.Length == 2)
104-
{
100+
if (resFull.Length == 2) {
105101
w = int.Parse(resFull[0]);
106102
h = int.Parse(resFull[1]);
107103
}
@@ -123,29 +119,25 @@ public override async Task<SearchResult> GetResultAsync(SearchQuery query, Cance
123119

124120
IDocument doc = null;
125121

126-
try
127-
{
122+
try {
128123
doc = await GetDocumentAsync(url, query: query, token: token.Value);
129124
}
130-
catch (Exception e)
131-
{
125+
catch (Exception e) {
132126
// Console.WriteLine(e);
133127
// throw;
134128
doc = null;
135129
Debug.WriteLine($"{Name}: {e.Message}", nameof(GetResultAsync));
136130
}
137131

138-
if (doc is null or { Body: null })
139-
{
132+
if (doc is null or { Body: null }) {
140133
sr.Status = SearchResultStatus.Failure;
141134
return sr;
142135
}
143136

144137
// Automation detected
145138
const string AUTOMATION_ERROR_MSG = "Please confirm that you and not a robot are sending requests";
146139

147-
if (doc.Body.TextContent.Contains(AUTOMATION_ERROR_MSG))
148-
{
140+
if (doc.Body.TextContent.Contains(AUTOMATION_ERROR_MSG)) {
149141
sr.Status = SearchResultStatus.Cooldown;
150142
return sr;
151143
}
@@ -154,19 +146,19 @@ public override async Task<SearchResult> GetResultAsync(SearchQuery query, Cance
154146
* Find and sort through high resolution image matches
155147
*/
156148

157-
foreach (var node in await GetNodes(doc))
158-
{
149+
foreach (var node in await GetNodes(doc)) {
159150
var sri = await ParseNodeToItem(node, sr);
160151

161-
if (sri != null)
162-
{
152+
if (sri != null) {
163153
sr.Results.Add(sri);
164154
}
165155
}
166156

167157
var otherImages = GetOtherImages(doc, sr);
168158
sr.Results.AddRange(otherImages);
169159

160+
await ParseExternalInfo(doc,sr);
161+
170162
//
171163

172164
/*
@@ -175,32 +167,55 @@ public override async Task<SearchResult> GetResultAsync(SearchQuery query, Cance
175167

176168
string looksLike = GetAnalysis(doc);
177169

178-
if (looksLike != null)
179-
{
170+
if (looksLike != null) {
180171
sr.Overview = looksLike;
181172
}
182173

183174
const string NO_MATCHING = "No matching images found";
184175

185-
if (doc.Body.TextContent.Contains(NO_MATCHING))
186-
{
176+
if (doc.Body.TextContent.Contains(NO_MATCHING)) {
187177

188178
sr.ErrorMessage = NO_MATCHING;
189-
sr.Status = SearchResultStatus.Extraneous;
179+
sr.Status = SearchResultStatus.Extraneous;
190180
}
191181

192182
sr.Update();
193183
return sr;
194184
}
195185

186+
/// <summary>
/// Parses <em>sites containing information about the image</em> and adds one
/// <see cref="SearchResultItem"/> per entry to the <c>Results</c> of <paramref name="r"/>.
/// </summary>
/// <param name="doc">Document whose body is searched for <c>li</c> elements with a class containing <c>CbirSites-Item</c>.</param>
/// <param name="r">Search result that the created items are attached to.</param>
/// <returns>An already-completed <see cref="ValueTask"/>; no asynchronous work is performed.</returns>
private static ValueTask ParseExternalInfo(IDocument doc, SearchResult r)
{
	// GetOtherImages null-checks the result of SelectNodes; apply the same guard here
	// so a page without this section does not fail inside the foreach.
	var items = doc.Body?.SelectNodes("//li[contains(@class,'CbirSites-Item')]");

	if (items == null) {
		return ValueTask.CompletedTask;
	}

	foreach (INode item in items) {
		// Layout read below: child 0 holds the thumbnail node, child 1 holds the title node.
		// Entries that do not have this shape are skipped instead of aborting the whole search.
		if (item.ChildNodes is not { Length: >= 2 } children) {
			continue;
		}

		if (children[1].FirstChild is not { FirstChild: { } linkNode } titleNode) {
			continue;
		}

		if (children[0].FirstChild is not { } thumbNode) {
			continue;
		}

		var title = titleNode.TextContent;
		var href  = linkNode.TryGetAttribute(Serialization.Atr_href);

		// NOTE(review): the thumbnail is read from the href attribute, as in the original code;
		// confirm this is the intended attribute for the thumbnail node.
		var thumb = thumbNode.TryGetAttribute(Serialization.Atr_href);

		var sri = new SearchResultItem(r)
		{
			Title = title,
			Url   = href
		};

		sri.Metadata.thumb = thumb;

		r.Results.Add(sri);
	}

	return ValueTask.CompletedTask;
}
211+
196212
// Empty override: this engine performs no cleanup when disposed.
public override void Dispose() { }
197213

198214
protected override async ValueTask<INode[]> GetNodes(IDocument doc)
199215
{
200216
var tagsItem = doc.Body.SelectNodes(NodesSelector);
201217

202-
if (!tagsItem.Any())
203-
{
218+
if (!tagsItem.Any()) {
204219
// return await Task.FromResult(Enumerable.Empty<INode>());
205220
return await Task.FromResult(tagsItem.ToArray());
206221
// return tagsItem;
@@ -213,6 +228,7 @@ protected override async ValueTask<INode[]> GetNodes(IDocument doc)
213228
// return sizeTags;
214229
}
215230

231+
[ICBN]
216232
protected override ValueTask<SearchResultItem> ParseNodeToItem(INode siz, SearchResult r)
217233
{
218234
string link = siz.TryGetAttribute(Serialization.Atr_href);
@@ -221,19 +237,17 @@ protected override ValueTask<SearchResultItem> ParseNodeToItem(INode siz, Search
221237

222238
(int? w, int? h) = ParseResolution(resText!);
223239

224-
if (!w.HasValue || !h.HasValue)
225-
{
240+
if (!w.HasValue || !h.HasValue) {
226241
w = null;
227242
h = null;
228243
//link = null;
229244
}
230245

231-
if (UriUtilities.IsUri(link, out var link2))
232-
{
246+
if (UriUtilities.IsUri(link, out var link2)) {
233247
var sri = new SearchResultItem(r)
234248
{
235-
Url = link2,
236-
Width = w,
249+
Url = link2,
250+
Width = w,
237251
Height = h,
238252
};
239253
return ValueTask.FromResult(sri);

SmartImage.Lib 3/SearchClient.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public async Task<SearchResult[]> RunSearchAsync(SearchQuery query, Cancellation
6969

7070
token ??= CancellationToken.None;
7171

72-
List<Task<SearchResult>> tasks = GetSearchTasks(query, token.Value);
72+
var tasks = GetSearchTasks(query, token.Value);
7373

7474
var results = new SearchResult[tasks.Count];
7575
int i = 0;

SmartImage.Lib 3/Utilities/NodeHelper.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ namespace SmartImage.Lib.Utilities;
77

88
internal static class NodeHelper
99
{
10-
internal static INode TryFindElementByClassName(this INodeList nodes, string className)
10+
/// <summary>
/// Returns the first node in <paramref name="nodes"/> that is an <see cref="IElement"/> whose
/// <c>ClassName</c> equals <paramref name="className"/>, or <c>null</c> when no node matches.
/// </summary>
/// <remarks>
/// The whole <c>ClassName</c> string is compared, so a multi-class value has to be passed verbatim.
/// </remarks>
internal static INode TryFindSingleElementByClassName(this INodeList nodes, string className)
{
	return nodes.OfType<IElement>()
	            .FirstOrDefault(element => element.ClassName == className);
}
14+
15+
/// <summary>
/// Returns every node in <paramref name="nodes"/> that is an <see cref="IElement"/> whose
/// <c>ClassName</c> equals <paramref name="className"/>; the array is empty when no node matches.
/// </summary>
internal static INode[] TryFindElementsByClassName(this INodeList nodes, string className)
	=> nodes.OfType<IElement>()
	        .Where(element => element.ClassName == className)
	        .ToArray<INode>();
19+
1420
}

0 commit comments

Comments
 (0)