Skip to content

Commit 772758c

Browse files
committed
Further improve image scanning
1 parent 400f64f commit 772758c

File tree

3 files changed

+65
-81
lines changed

3 files changed

+65
-81
lines changed

SmartImage.Lib/SearchClient.cs

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -218,32 +218,28 @@ public async void FindDirectResults(SearchResult result)
218218
var b = await ir.TryScanForDirectImages();
219219

220220
if (b && !DirectResults.Contains(ir)) {
221-
ir.Direct.Url = new Uri(UriUtilities.NormalizeUrl(ir.Direct.Url));
222-
221+
223222
Debug.WriteLine($"{nameof(SearchClient)}: Found direct result {ir.Direct.Url}");
224-
225223
DirectResults.Add(ir);
226-
227224
result.PrimaryResult.Direct.Url ??= ir.Direct.Url;
228225

229226
DirectFound?.Invoke(null, new DirectResultsFoundEventArgs
230227
{
231228
DirectResultsSubset = new() { ir },
232229
});
230+
233231
ResultUpdated?.Invoke(null, EventArgs.Empty);
234232
}
235233
}
236234
}
237235

238236

239-
private void FindDirectResults(object state, SearchResult value, int take1 = 10, int take2 = 5)
237+
private void FindDirectResults(object state, SearchResult value, int take2 = 5)
240238
{
241239
var imageResults = value.AllResults;
242240

243241
var images = imageResults.AsParallel()
244-
/*.Where(x => x.CheckDirect(DirectImageCriterion.Regex))
245-
.Take(take1)*/
246-
.Where(x => x.IsAlreadyDirect(DirectImageCriterion.Binary))
242+
.Where(x => x.IsAlreadyDirect())
247243
.Take(take2)
248244
.ToList();
249245

SmartImage.Lib/Searching/ImageResult.cs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -236,15 +236,15 @@ public async Task<bool> TryScanForDirectImages()
236236
}
237237

238238

239-
public bool IsAlreadyDirect(DirectImageCriterion d)
239+
public bool IsAlreadyDirect()
240240
{
241241
if (Url is not { }) {
242242
return false;
243243
}
244244

245245
var s = Url.ToString();
246246

247-
var b = ImageHelper.IsImage(s, out var di, d);
247+
var b = ImageHelper.IsImage(s, out var di);
248248

249249
if (b) {
250250
Image = Image.FromStream(di.Stream);

SmartImage.Lib/Utilities/ImageHelper.cs

Lines changed: 59 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
using System.IO;
77
using System.Linq;
88
using System.Net;
9+
using System.Net.Http;
910
using System.Text.RegularExpressions;
1011
using System.Threading;
1112
using System.Threading.Tasks;
@@ -98,7 +99,6 @@ public static string Download(Uri src, string path)
9899
// using var h = new HttpClient();
99100
// h.DownloadFile(src.ToString(), combine);
100101

101-
102102
return combine;
103103
}
104104
catch (Exception e) {
@@ -124,41 +124,49 @@ public static async Task<List<DirectImage>> ScanForImages(string url, int count
124124
document = WebUtilities.GetHtmlDocument(url);
125125
}
126126
catch (Exception e) {
127-
Debug.WriteLine($"{nameof(ImageHelper)}: {e.Message}", C_ERROR);
127+
Debug.WriteLine($"{nameof(WebUtilities)}: {e.Message}", C_ERROR);
128128

129129
return null;
130130
}
131131

132-
using var cts = new CancellationTokenSource();
133-
var flat = new List<string>();
132+
using var cts = new CancellationTokenSource();
133+
134+
var urls = new List<string>();
134135

135-
flat.AddRange(document.QuerySelectorAttributes("a", "href"));
136-
flat.AddRange(document.QuerySelectorAttributes("img", "src"));
136+
urls.AddRange(document.QuerySelectorAttributes("a", "href"));
137+
urls.AddRange(document.QuerySelectorAttributes("img", "src"));
138+
139+
urls = urls.Where(x => x != null).Select(u1 =>
140+
{
141+
if (UriUtilities.IsUri(u1, out var u2)) {
142+
return UriUtilities.NormalizeUrl(u2);
143+
}
137144

138-
flat = flat.Where(x=>x!=null).Distinct().ToList();
145+
return u1;
146+
}).Distinct().ToList();
139147

140-
var tasks = new List<Task<DirectImage>>();
141148

149+
var tasks = new List<Task<DirectImage>>();
142150
var hostComponent = UriUtilities.GetHostComponent(new Uri(url));
143151

144152
switch (hostComponent) {
145153
case "www.deviantart.com":
146154
//https://images-wixmp-
147-
flat = flat.Where(x => x.StartsWith("https://images-wixmp")).ToList();
155+
urls = urls.Where(x => x.StartsWith("https://images-wixmp")).ToList();
148156
break;
149157
default:
150158
break;
151159
}
152-
153160

154-
for (int i = 0; i < flat.Count; i++) {
161+
162+
for (int i = 0; i < urls.Count; i++) {
155163
int iCopy = i;
156164

157165
tasks.Add(Task<DirectImage>.Factory.StartNew(() =>
158166
{
159-
string s = flat[iCopy];
167+
string s = urls[iCopy];
160168

161-
if (IsImage(s, (int) timeoutMS, DirectImageCriterion.Binary, out var di)) {
169+
if (IsImage(s, (int) timeoutMS, out var di)) {
162170
return di;
163171
}
164172

@@ -174,7 +182,7 @@ public static async Task<List<DirectImage>> ScanForImages(string url, int count
174182
var result = task.Result;
175183

176184
if (result is { } && count > 0) {
177-
result.Url = new Uri(UriUtilities.NormalizeUrl(result.Url));
185+
// result.Url = new Uri(UriUtilities.NormalizeUrl(result.Url));
178186
images.Add(result);
179187
count--;
180188
}
@@ -184,68 +192,54 @@ public static async Task<List<DirectImage>> ScanForImages(string url, int count
184192
return images;
185193
}
186194

187-
public static bool IsImage(string url, out DirectImage di, DirectImageCriterion directCriterion = DirectImageCriterion.Binary)
188-
=> IsImage(url, TimeoutMS, directCriterion, out di);
189-
190-
191-
public static bool IsImage(string url, long timeout, DirectImageCriterion directCriterion, out DirectImage di)
192-
{
193-
di = new DirectImage(){};
194-
195-
switch (directCriterion) {
196-
case DirectImageCriterion.Regex:
197-
var image = Regex.IsMatch(
198-
url,
199-
@"(?:([^:\/?#]+):)?(?:\/\/([^\/?#]*))?([^?#]*\.(?:bmp|gif|ico|jfif|jpe?g|png|svg|tiff?|webp))(?:\?([^#]*))?(?:#(.*))?",
200-
RegexOptions.IgnoreCase);
201-
di.Url = new Uri(url);
202-
203-
return image;
204-
case DirectImageCriterion.Binary:
205-
if (!UriUtilities.IsUri(url, out var u)) {
206-
return false;
207-
}
208-
209-
var response = HttpUtilities.GetResponse(u.ToString(), (int) timeout, Method.HEAD);
210-
211-
if (!response.IsSuccessful) {
195+
public static bool IsImage(string url, out DirectImage di) => IsImage(url, TimeoutMS, out di);
212196

213-
return false;
214-
}
215197

216-
di.Url = new Uri(url);
198+
public static bool IsImage(string url, long timeout, out DirectImage di)
199+
{
200+
di = new DirectImage() { };
217201

218-
di.Response = response;
202+
var response = HttpUtilities.GetResponse(url, (int) timeout, Method.HEAD);
219203

220-
/* Check content-type */
204+
if (!response.IsSuccessful) {
205+
return false;
206+
}
221207

222-
// The content-type returned from the response may not be the actual content-type, so
223-
// we'll resolve it using binary data instead to be sure
224-
bool a, b;
208+
di.Url = new Uri(url);
209+
di.Response = response;
225210

226-
try {
227-
var stream = WebUtilities.GetStream(url);
228-
var buffer = new byte[256];
229-
stream.Read(buffer, 0, buffer.Length);
230-
// var rg = response.RawBytes;
231-
var m = MediaTypes.ResolveFromData(buffer);
232-
a = m.StartsWith("image") && m != "image/svg+xml";
233-
b = response.ContentLength is -1 or >= 50_000;
234-
di.Stream = stream;
235-
}
236-
catch {
237-
a = response.ContentType.StartsWith("image") && response.ContentType != "image/svg+xml";
238-
b = response.ContentLength >= 50_000;
239-
}
211+
/* Check content-type */
240212

213+
// The content-type returned from the response may not be the actual content-type, so
214+
// we'll resolve it using binary data instead to be sure
215+
bool type, size;
241216

242-
// var b = stream.Length >= 50_000;
217+
const string svg_xml = "image/svg+xml";
218+
const string image = "image";
219+
const int min_size_b = 50_000;
243220

244-
return a && b;
245-
default:
246-
throw new ArgumentOutOfRangeException(nameof(directCriterion), directCriterion, null);
221+
try {
222+
using var client = new HttpClient();
223+
var task = client.GetStreamAsync(url);
224+
task.Wait((int) timeout);
225+
226+
var stream = task.Result;
227+
228+
var buffer = new byte[256];
229+
stream.Read(buffer, 0, buffer.Length);
230+
var m = MediaTypes.ResolveFromData(buffer);
231+
type = m.StartsWith(image) && m != svg_xml;
232+
size = response.ContentLength is -1 or >= min_size_b;
233+
di.Stream = stream;
234+
}
235+
catch (Exception x) {
236+
type = response.ContentType.StartsWith(image) && response.ContentType != svg_xml;
237+
size = response.ContentLength >= min_size_b;
238+
Debug.WriteLine($"{x.Message}");
247239
}
248240

241+
return type && size;
242+
249243
}
250244

251245
/*
@@ -339,12 +333,6 @@ public static Bitmap ResizeImage(Bitmap mg, Size newSize)
339333
}
340334
}
341335

342-
public enum DirectImageCriterion
343-
{
344-
Binary,
345-
Regex
346-
}
347-
348336
public enum DisplayResolutionType
349337
{
350338
Unknown,

0 commit comments

Comments
 (0)