Skip to content

Commit 4eb7f9d

Browse files
committed
Fix issue #619 Twitter/X crawler issues
1 parent 444056b commit 4eb7f9d

File tree

12 files changed

+149
-7
lines changed

12 files changed

+149
-7
lines changed

src/TumblThree/SharedAssemblyInfo.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212

1313
[assembly: ComVisible(false)]
1414
[assembly: NeutralResourcesLanguage("en-US", UltimateResourceFallbackLocation.MainAssembly)]
15-
[assembly: AssemblyVersion("2.18.0.0")]
16-
[assembly: AssemblyFileVersion("2.18.0.0")]
15+
[assembly: AssemblyVersion("2.18.1.0")]
16+
[assembly: AssemblyFileVersion("2.18.1.0")]

src/TumblThree/TumblThree.Applications/Crawler/TwitterCrawler.cs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,16 @@ private async Task<string> RequestApiDataAsync(string url, string bearerToken, s
271271
request.Accept = "application/json";
272272

273273
requestRegistration = Ct.Register(() => request.Abort());
274-
return await WebRequestFactory.ReadRequestToEndAsync(request, true);
274+
var content = await WebRequestFactory.ReadRequestToEndAsync(request, true);
275+
276+
if (ShellService.Settings.AdaptToTwitterRateLimits)
277+
{
278+
if (WebRequestFactory.RateLimitHeaders.TryGetValue("x-rate-limit-limit", out var rateLimitLimit)) { ShellService.Settings.RateLimitTwitterLimit = rateLimitLimit; } // header may be absent; keep the previous value instead of throwing KeyNotFoundException
279+
if (WebRequestFactory.RateLimitHeaders.TryGetValue("x-rate-limit-remaining", out var rateLimitRemaining)) { ShellService.Settings.RateLimitTwitterRemaining = rateLimitRemaining; }
280+
if (WebRequestFactory.RateLimitHeaders.TryGetValue("x-rate-limit-reset", out var rateLimitReset)) { ShellService.Settings.RateLimitTwitterReset = rateLimitReset; }
281+
}
282+
283+
return content;
275284
}
276285
finally
277286
{
@@ -346,6 +355,11 @@ private async Task<string> GetApiPageAsync(byte type, string cursor)
346355

347356
if (ShellService.Settings.LimitConnectionsTwitterApi)
348357
{
358+
if (ShellService.Settings.AdaptToTwitterRateLimits && ShellService.Settings.RateLimitTwitterRemaining < 5 && ShellService.Settings.RateLimitTwitterLimit > 0) // skip adaptation until a limit header has been seen, otherwise the computed rate would be 0
359+
{
360+
double permitsPerSecond = (double)ShellService.Settings.RateLimitTwitterLimit / 15 / 60;
361+
if (permitsPerSecond != CrawlerService.TimeconstraintTwitterApi.GetRate()) { CrawlerService.TimeconstraintTwitterApi.SetRate(permitsPerSecond); }
362+
}
349363
CrawlerService.TimeconstraintTwitterApi.Acquire();
350364
}
351365

@@ -523,7 +537,7 @@ private static List<Tweet> SelectTweets(Entry entry)
523537
List<ItemContent> list = new List<ItemContent>();
524538
if (entry?.Content?.ItemContent != null) list.Add(entry.Content.ItemContent);
525539
if (entry?.Content?.Items != null) list.AddRange(entry.Content.Items.Select(s => s.Item.ItemContent));
526-
return list.Select(s => s.TweetResults.Tweet).ToList();
540+
return list.Select(s => s.TweetResults.Tweet).Where(x => x != null).ToList();
527541
}
528542

529543
private async Task CrawlPageAsync(int pageNo)
@@ -608,6 +622,12 @@ private async Task CrawlPageAsync(int pageNo)
608622
retries = 403;
609623
}
610624
}
625+
if ((webException?.Response as HttpWebResponse)?.StatusCode == HttpStatusCode.NotFound) // Response is null when no HTTP response was received (e.g. timeout, connection failure)
626+
{
627+
Logger.Error("TwitterCrawler.CrawlPageAsync: {0}", $"{Blog.Name} ({GetCollectionName(Blog)}) {webException.Message}");
628+
completeGrab = false;
629+
retries = 404;
630+
}
611631
}
612632
catch (TimeoutException timeoutException)
613633
{
@@ -902,8 +922,8 @@ private static string GetTweetText(Tweet post)
902922
{
903923
if (text.Contains(item.Url))
904924
{
905-
text = text.Replace(item.Url, item.ExpandedUrl.EndsWith(item.DisplayUrl) ? item.ExpandedUrl : item.DisplayUrl);
906-
if (!item.ExpandedUrl.EndsWith(item.DisplayUrl)) links.Add(item.DisplayUrl, item.ExpandedUrl);
925+
text = text.Replace(item.Url, item.ExpandedUrl?.EndsWith(item.DisplayUrl) == true ? item.ExpandedUrl : item.DisplayUrl);
926+
if (!links.ContainsKey(item.DisplayUrl) && item.ExpandedUrl?.EndsWith(item.DisplayUrl) == false) { links.Add(item.DisplayUrl, item.ExpandedUrl); }
907927
}
908928
}
909929
var url = post.Legacy.Url ?? $"https://x.com/{post.User.Legacy.ScreenName}/status/{post.RestId}";

src/TumblThree/TumblThree.Applications/Properties/AppSettings.cs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ namespace TumblThree.Applications.Properties
1616
public sealed class AppSettings : IExtensibleDataObject
1717
{
1818
[IgnoreDataMember]
19-
public static readonly string USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36";
19+
public static readonly string USERAGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36";
2020

2121
[IgnoreDataMember]
2222
[System.Diagnostics.CodeAnalysis.SuppressMessage("Naming", "CA1707:Identifiers should not contain underscores", Justification = "<Pending>")]
@@ -422,6 +422,18 @@ public AppSettings()
422422
[DataMember]
423423
public bool NoCrawlerDataUpdate { get; set; }
424424

425+
[DataMember]
426+
public bool AdaptToTwitterRateLimits { get; set; }
427+
428+
[IgnoreDataMember]
429+
public int RateLimitTwitterLimit { get; set; }
430+
431+
[IgnoreDataMember]
432+
public int RateLimitTwitterRemaining { get; set; }
433+
434+
[IgnoreDataMember]
435+
public int RateLimitTwitterReset { get; set; }
436+
425437
[DataMember]
426438
public Dictionary<object, Tuple<int, double, Visibility>> ColumnSettings { get; set; }
427439

@@ -562,6 +574,8 @@ public static bool Upgrade(AppSettings settings)
562574
}
563575
}
564576

577+
settings.RateLimitTwitterRemaining = 150;
578+
565579
return updated;
566580
}
567581

@@ -681,6 +695,7 @@ private void Initialize()
681695
ZipCrawlerData = false;
682696
ZipExistingCrawlerData = false;
683697
NoCrawlerDataUpdate = false;
698+
AdaptToTwitterRateLimits = true;
684699
}
685700

686701
[OnDeserializing]

src/TumblThree/TumblThree.Applications/Services/IWebRequestFactory.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ namespace TumblThree.Applications.Services
77
{
88
public interface IWebRequestFactory
99
{
10+
SortedList<string, int> RateLimitHeaders { get; }
11+
1012
HttpWebRequest CreateGetRequest(string url, string referer = "", Dictionary<string, string> headers = null, bool allowAutoRedirect = true);
1113

1214
HttpWebRequest CreateGetXhrRequest(string url, string referer = "", Dictionary<string, string> headers = null);

src/TumblThree/TumblThree.Applications/Services/WebRequestFactory.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System;
12
using System.Collections.Generic;
23
using System.ComponentModel.Composition;
34
using System.IO;
@@ -9,6 +10,7 @@
910

1011
using TumblThree.Applications.Extensions;
1112
using TumblThree.Applications.Properties;
13+
using TumblThree.Domain;
1214

1315
namespace TumblThree.Applications.Services
1416
{
@@ -18,6 +20,7 @@ public class WebRequestFactory : IWebRequestFactory
1820
private readonly IShellService shellService;
1921
private readonly ISharedCookieService cookieService;
2022
private readonly AppSettings settings;
23+
private readonly SortedList<string, int> _rateLimitHeaders = new SortedList<string, int>();
2124

2225
[ImportingConstructor]
2326
public WebRequestFactory(IShellService shellService, ISharedCookieService cookieService, AppSettings settings)
@@ -27,6 +30,8 @@ public WebRequestFactory(IShellService shellService, ISharedCookieService cookie
2730
this.settings = settings;
2831
}
2932

33+
public SortedList<string, int> RateLimitHeaders => _rateLimitHeaders;
34+
3035
private HttpWebRequest CreateStubRequest(string url, string referer = "", Dictionary<string, string> headers = null, bool allowAutoRedirect = true)
3136
{
3237
var request = (HttpWebRequest)WebRequest.Create(url); //HttpUtility.UrlDecode(url) what was the use case!?
@@ -132,6 +137,13 @@ public async Task<string> ReadRequestToEndAsync(HttpWebRequest request, bool sto
132137
{
133138
cookieService.SetUriCookie(response.Cookies);
134139
}
140+
if (response.Headers.Get("x-rate-limit-limit") != null)
141+
{
142+
if (int.TryParse(response.Headers.Get("x-rate-limit-limit"), out var value)) _rateLimitHeaders["x-rate-limit-limit"] = value;
143+
if (int.TryParse(response.Headers.Get("x-rate-limit-remaining"), out value)) _rateLimitHeaders["x-rate-limit-remaining"] = value;
144+
if (int.TryParse(response.Headers.Get("x-rate-limit-reset"), out value)) _rateLimitHeaders["x-rate-limit-reset"] = value;
145+
if (_rateLimitHeaders.TryGetValue("x-rate-limit-limit", out var loggedLimit) && _rateLimitHeaders.TryGetValue("x-rate-limit-remaining", out var loggedRemaining) && _rateLimitHeaders.TryGetValue("x-rate-limit-reset", out var loggedReset)) { Logger.Verbose($"x-rate-limit limit: {loggedLimit} remaining: {loggedRemaining} reset: {DateTimeOffset.FromUnixTimeSeconds(loggedReset).ToLocalTime()}"); } // log only when all three values exist; a missing or unparsable header must not throw KeyNotFoundException
146+
}
135147
using (Stream stream = GetStreamForApiRequest(response.GetResponseStream()))
136148
{
137149
using (var buffer = new BufferedStream(stream))

src/TumblThree/TumblThree.Applications/ViewModels/SettingsViewModel.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ public class SettingsViewModel : ViewModel<ISettingsView>
156156
private bool _saveTextsIndividualFiles;
157157
private bool _zipExistingCrawlerData;
158158
private bool _noCrawlerDataUpdate;
159+
private bool _adaptToTwitterRateLimits;
159160

160161
[ImportingConstructor]
161162
public SettingsViewModel(ISettingsView view, IShellService shellService, ICrawlerService crawlerService, IManagerService managerService,
@@ -1011,6 +1012,12 @@ public bool ZipExistingCrawlerData
10111012
set => SetProperty(ref _zipExistingCrawlerData, value);
10121013
}
10131014

1015+
public bool AdaptToTwitterRateLimits
1016+
{
1017+
get => _adaptToTwitterRateLimits;
1018+
set => SetProperty(ref _adaptToTwitterRateLimits, value);
1019+
}
1020+
10141021
public void ShowDialog(object owner) => ViewCore.ShowDialog(owner);
10151022

10161023
private void ViewClosed(object sender, EventArgs e)
@@ -1380,6 +1387,7 @@ private void LoadSettings()
13801387
SaveTextsIndividualFiles = _settings.SaveTextsIndividualFiles;
13811388
ZipExistingCrawlerData = _settings.ZipExistingCrawlerData;
13821389
NoCrawlerDataUpdate = _settings.NoCrawlerDataUpdate;
1390+
AdaptToTwitterRateLimits = _settings.AdaptToTwitterRateLimits;
13831391
}
13841392
else
13851393
{
@@ -1481,6 +1489,7 @@ private void LoadSettings()
14811489
SaveTextsIndividualFiles = false;
14821490
ZipExistingCrawlerData = false;
14831491
NoCrawlerDataUpdate = false;
1492+
AdaptToTwitterRateLimits = true;
14841493
}
14851494
}
14861495

src/TumblThree/TumblThree.Presentation/Properties/Resources.Designer.cs

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/TumblThree/TumblThree.Presentation/Properties/Resources.de.resx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,4 +1066,14 @@ ist, werden beim nächsten Crawlen des Blogs auch die bereits vorhandenen Crawle
10661066
<value>Wenn diese Option aktiviert ist, werden vorhandene Crawler-Daten, für die bereits eine Dump-Datei existiert, ignoriert.
10671067
Normalerweise werden beim erneuten Scannen alle Crawler-Datendateien mit der aktuell heruntergeladenen Version überschrieben.</value>
10681068
</data>
1069+
<data name="AdaptToTwitterRateLimits" xml:space="preserve">
1070+
<value>Anpassung an die Ratenbegrenzungen des Twitter/X Servers</value>
1071+
</data>
1072+
<data name="ToolTipAdaptToTwitterRateLimits" xml:space="preserve">
1073+
<value>Anpassung der Geschwindigkeit an die Ratenbegrenzungen des Twitter/X Servers</value>
1074+
</data>
1075+
<data name="ToolTipAdaptToTwitterRateLimitsDescription" xml:space="preserve">
1076+
<value>Wenn diese Option deaktiviert ist, werden die in den Einstellungen konfigurierten Ratenbegrenzungen strikt
1077+
eingehalten. Dies kann zu einer Ablehnung durch den Server führen. Dies geschieht auf eigene Verantwortung.</value>
1078+
</data>
10691079
</root>

src/TumblThree/TumblThree.Presentation/Properties/Resources.es.resx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,4 +1054,14 @@ de volcado ya existentes también se archivarán, si la opción 'Archivar archiv
10541054
<value>Si esta opción está habilitada, se ignoran los datos del rastreador existentes para los que ya existe un archivo de volcado.
10551055
Normalmente, al volver a escanear, todos los archivos de datos del rastreador se sobrescriben con la versión descargada actualmente.</value>
10561056
</data>
1057+
<data name="AdaptToTwitterRateLimits" xml:space="preserve">
1058+
<value>Adaptarse a los límites de velocidad del servidor Twitter/X</value>
1059+
</data>
1060+
<data name="ToolTipAdaptToTwitterRateLimitsDescription" xml:space="preserve">
1061+
<value>Si esta opción está desactivada, los límites de velocidad configurados en la configuración se mantendrán estrictamente,
1062+
lo que podría resultar en el rechazo por parte del servidor. Esto es bajo su propia responsabilidad.</value>
1063+
</data>
1064+
<data name="ToolTipAdaptToTwitterRateLimits" xml:space="preserve">
1065+
<value>Ajusta la velocidad a los límites de velocidad del servidor Twitter/X</value>
1066+
</data>
10571067
</root>

src/TumblThree/TumblThree.Presentation/Properties/Resources.fr.resx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,4 +1058,14 @@ de données du robot déjà existants seront également archivés, si l'option
10581058
<value>Si cette option est activée, les données de robot existantes pour lesquelles un fichier de vidage existe déjà sont ignorées.
10591059
Normalement, lors de la nouvelle analyse, tous les fichiers de données du robot sont écrasés par la version actuellement téléchargée.</value>
10601060
</data>
1061+
<data name="AdaptToTwitterRateLimits" xml:space="preserve">
1062+
<value>Adapter aux limites de débit du serveur Twitter/X</value>
1063+
</data>
1064+
<data name="ToolTipAdaptToTwitterRateLimits" xml:space="preserve">
1065+
<value>Ajuste la vitesse aux limites de débit du serveur Twitter/X</value>
1066+
</data>
1067+
<data name="ToolTipAdaptToTwitterRateLimitsDescription" xml:space="preserve">
1068+
<value>Si cette option est désactivée, les limites de débit configurées dans les paramètres seront strictement
1069+
respectées, ce qui peut entraîner un rejet par le serveur. Cette action est à vos risques et périls.</value>
1070+
</data>
10611071
</root>

0 commit comments

Comments
 (0)