Skip to content

Commit 4a14391

Browse files
author
Andrii Bondarchuk
committed
Add EverestingHOFScraperHttpTriggerFunction
1 parent 68896fe commit 4a14391

File tree

1 file changed

+240
-0
lines changed

1 file changed

+240
-0
lines changed
Lines changed: 240 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,240 @@
1+
using System.Text;
2+
using System.Text.Json;
3+
using System.Text.RegularExpressions;
4+
using HtmlAgilityPack;
5+
using Microsoft.AspNetCore.Http;
6+
using Microsoft.AspNetCore.Mvc;
7+
using Microsoft.AspNetCore.WebUtilities;
8+
using Microsoft.Azure.Cosmos;
9+
using Microsoft.Azure.Functions.Worker;
10+
using Microsoft.Extensions.Logging;
11+
using Microsoft.Extensions.Primitives;
12+
using Newtonsoft.Json.Linq;
13+
14+
namespace FitSyncHub.Functions.Functions;
15+
16+
public sealed class EverestingHOFScraperHttpTriggerFunction
17+
{
18+
// HTTP client used to download the Hall of Fame pages.
private readonly HttpClient _httpClient;

// Cosmos DB container that scraped activities are upserted into and queried from.
private readonly Container _everestingHOFContainer;

private readonly ILogger<EverestingHOFScraperHttpTriggerFunction> _logger;

// Set of modality names; no member visible in this class reads it.
private readonly HashSet<string> _availableModalities = new()
{
    "quarter", "full", "roam", "triple", "half", "10k", "double",
};

/// <summary>
/// Stores the injected dependencies and resolves the "EverestingHOF" container
/// of the "fit-sync-hub" database from the supplied Cosmos client.
/// </summary>
/// <param name="httpClient">Client used for the page downloads.</param>
/// <param name="cosmosClient">Cosmos client the container reference is obtained from.</param>
/// <param name="logger">Logger for progress messages.</param>
public EverestingHOFScraperHttpTriggerFunction(
    HttpClient httpClient,
    CosmosClient cosmosClient,
    ILogger<EverestingHOFScraperHttpTriggerFunction> logger)
{
    var database = cosmosClient.GetDatabase("fit-sync-hub");

    _logger = logger;
    _httpClient = httpClient;
    _everestingHOFContainer = database.GetContainer("EverestingHOF");
}
36+
37+
/// <summary>
/// Walks the Hall of Fame activity pages starting at page 1 and upserts every page into Cosmos DB
/// until a page contains only activities older than the sync cut-off, or the last page is reached.
/// </summary>
/// <remarks>
/// The <c>[Function]</c> attribute is compiled in only for DEBUG builds.
/// </remarks>
/// <param name="req">Incoming HTTP request; not read by this method.</param>
/// <param name="cancellationToken">Token forwarded to every HTTP and Cosmos call.</param>
/// <returns>An <see cref="OkObjectResult"/> carrying the text "Success".</returns>
/// <exception cref="HttpRequestException">A page request returned a non-success status code.</exception>
#if DEBUG
[Function(nameof(EverestingHOFScraperHttpTriggerFunction))]
#endif
public async Task<IActionResult> Run(
    [HttpTrigger(AuthorizationLevel.Function, "get", Route = "everesting-hof-scraper")] HttpRequest req,
    CancellationToken cancellationToken)
{
    const string BaseUrl = "https://hof.everesting.com/activities";

    // It is unknown how the HOF orders or back-fills entries, so re-scan a few extra days
    // before the newest stored date to be sure nothing is missed. The delta may need to grow.
    const int SafetyWindowDays = 5;

    var lastSyncedDateTime = await GetLastSynchronizedDate(cancellationToken);
    lastSyncedDateTime = lastSyncedDateTime.AddDays(-SafetyWindowDays);

    var page = 1;
    int? totalPages = default;

    do
    {
        var url = QueryHelpers.AddQueryString(BaseUrl, new Dictionary<string, StringValues>
        {
            // Invariant formatting keeps the query value independent of the host culture.
            { "page", page.ToString(System.Globalization.CultureInfo.InvariantCulture) },
        });

        using var response = await _httpClient.GetAsync(url, cancellationToken);

        // Fail here with a clear HTTP error instead of feeding an error page to the HTML parser.
        response.EnsureSuccessStatusCode();

        var content = await response.Content.ReadAsStringAsync(cancellationToken);

        // JsonDocument is disposable; it is released at the end of each iteration,
        // after StoreData has finished awaiting every upsert that reads from it.
        using var activitiesPortionJsonDocument = ExtractActivitiesJson(content);

        // The page count is read once, from the first page.
        totalPages ??= GetTotalPages(activitiesPortionJsonDocument);

        if (AllActiviesSynced(activitiesPortionJsonDocument, lastSyncedDateTime))
        {
            break;
        }

        await StoreData(activitiesPortionJsonDocument, cancellationToken);

        page++;
    }
    while (page <= totalPages);

    return new OkObjectResult("Success");
}
94+
95+
/// <summary>
/// Tells whether every activity on the page is dated strictly before <paramref name="lastSyncedDateTime"/>.
/// </summary>
/// <param name="activitiesPortionJsonDocument">Page document holding an "activities" array whose items have a "date" value in yyyy-MM-dd form.</param>
/// <param name="lastSyncedDateTime">Cut-off to compare each activity date against.</param>
/// <returns><c>true</c> when all dates are earlier than the cut-off (also for an empty array); otherwise <c>false</c>.</returns>
/// <exception cref="FormatException">A "date" value is not in yyyy-MM-dd form.</exception>
private static bool AllActiviesSynced(JsonDocument activitiesPortionJsonDocument, DateTime lastSyncedDateTime)
{
    var activities = GetActivitiesArray(activitiesPortionJsonDocument);
    var allSynced = true;

    // Every date is parsed, even after a newer one is found, so a malformed date still fails loudly.
    foreach (var activity in activities.EnumerateArray())
    {
        // Invariant culture: the value is machine-readable and must not be interpreted
        // through the host culture's calendar.
        var activityDate = DateTime.ParseExact(
            activity.GetProperty("date").ToString(),
            "yyyy-MM-dd",
            System.Globalization.CultureInfo.InvariantCulture);

        if (activityDate >= lastSyncedDateTime)
        {
            allSynced = false;
        }
    }

    return allSynced;
}
109+
110+
/// <summary>
/// Reads the greatest "date" value stored in the container (TOP 1 ordered by <c>c.date</c> descending).
/// </summary>
/// <param name="cancellationToken">Token forwarded to the Cosmos query.</param>
/// <returns>The parsed yyyy-MM-dd date of the returned item.</returns>
/// <exception cref="InvalidOperationException">
/// The query returned more than one item, the item has no string "date" property,
/// or no item was returned at all.
/// </exception>
private async Task<DateTime> GetLastSynchronizedDate(CancellationToken cancellationToken)
{
    var query = new QueryDefinition(
        """
        SELECT top 1 * FROM c
        ORDER BY c.date DESC
        """);

    // The iterator is disposable; release it when the method exits.
    using var feed = _everestingHOFContainer.GetItemQueryIterator<JObject>(
        queryDefinition: query
    );

    while (feed.HasMoreResults)
    {
        var response = await feed.ReadNextAsync(cancellationToken);
        if (response.Count > 1)
        {
            throw new InvalidOperationException("Some unexpected count of items");
        }

        // A result page can be empty while more pages remain; keep reading instead of
        // letting Single() throw on an empty sequence.
        if (response.Count == 0)
        {
            continue;
        }

        var item = response.Single();

        var date = item["date"];
        if (date is null || date.Type != JTokenType.String)
        {
            throw new InvalidOperationException("Stored item has no string 'date' property");
        }

        // Invariant culture: the stored value is machine-readable, not user-facing.
        return DateTime.ParseExact(
            date.ToString(),
            "yyyy-MM-dd",
            System.Globalization.CultureInfo.InvariantCulture);
    }

    throw new InvalidOperationException("Can't get last synced date time");
}
143+
144+
/// <summary>
/// Extracts the activities JSON object embedded in the page's script tags.
/// </summary>
/// <remarks>
/// The payload of every script containing <c>self.__next_f.push([1,"</c> is stripped of that prefix and
/// of <c>"])</c>, unescaped, and concatenated. The JSON is the text after the hard-coded
/// <c>12:["$","$L1c",null,</c> marker, minus the final two characters of the concatenated text.
/// NOTE(review): the marker and the two-character tail are tied to the page's current markup and the
/// activities chunk being last — confirm both whenever the site changes.
/// </remarks>
/// <param name="html">Raw HTML of one activities page.</param>
/// <returns>The parsed JSON document.</returns>
/// <exception cref="InvalidOperationException">
/// The HTML has no script tags, the marker is missing, or nothing follows the marker.
/// </exception>
/// <exception cref="JsonException">The extracted text is not valid JSON.</exception>
private static JsonDocument ExtractActivitiesJson(string html)
{
    const string PushPrefix = "self.__next_f.push([1,\"";
    const string PushSuffix = "\"])";
    const string ActivitiesJsonStartPattern = "12:[\"$\",\"$L1c\",null,";

    // Number of characters dropped from the end of the concatenated payload (originally noted as ']}').
    const int TrailingCharsToDrop = 2;

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    var scripts = doc
        .DocumentNode
        .SelectNodes("//script")
        ?? throw new InvalidOperationException("No script tags found");

    var sb = new StringBuilder();
    foreach (var script in scripts)
    {
        var text = script.InnerText;
        if (string.IsNullOrWhiteSpace(text))
        {
            continue;
        }

        if (!text.Contains(PushPrefix))
        {
            continue;
        }

        text = text
            .Replace(PushPrefix, "")
            .Replace(PushSuffix, "");

        // The payload is a string literal inside the script; undo its backslash escapes.
        text = Regex.Unescape(text);
        sb.Append(text);
    }

    var fullScript = sb.ToString();

    // Ordinal search: the marker is a machine token, not linguistic text.
    var activitiesJsonStartIndex = fullScript.IndexOf(ActivitiesJsonStartPattern, StringComparison.Ordinal);
    if (activitiesJsonStartIndex < 0)
    {
        // Without this check a -1 index would silently produce a garbage substring.
        throw new InvalidOperationException("Activities JSON start marker not found in page scripts");
    }

    var startFrom = activitiesJsonStartIndex + ActivitiesJsonStartPattern.Length;
    var length = fullScript.Length - startFrom - TrailingCharsToDrop;
    if (length <= 0)
    {
        throw new InvalidOperationException("Activities JSON payload is empty");
    }

    var json = fullScript.Substring(startFrom, length);

    return JsonDocument.Parse(json);
}
189+
190+
/// <summary>
/// Returns the "activities" property of the document's root element.
/// </summary>
private static JsonElement GetActivitiesArray(JsonDocument doc)
    => doc.RootElement.GetProperty("activities");
195+
196+
/// <summary>
/// Reads the root "currentPage" property as a 32-bit integer.
/// </summary>
private static int GetCurrentPage(JsonDocument doc)
{
    var currentPage = doc.RootElement.GetProperty("currentPage");
    return currentPage.GetInt32();
}
201+
202+
/// <summary>
/// Reads the root "totalPages" property as a 32-bit integer.
/// </summary>
private static int GetTotalPages(JsonDocument doc)
{
    var totalPages = doc.RootElement.GetProperty("totalPages");
    return totalPages.GetInt32();
}
207+
208+
/// <summary>
/// Upserts every element of the page's "activities" array into the container, running the upserts concurrently.
/// </summary>
/// <param name="doc">Page document; it must stay undisposed until the returned task completes.</param>
/// <param name="cancellationToken">Token forwarded to every upsert.</param>
/// <exception cref="CosmosException">An upsert returned a non-success status code.</exception>
private async Task StoreData(
    JsonDocument doc,
    CancellationToken cancellationToken)
{
    var activities = GetActivitiesArray(doc);

    _logger.LogInformation("Page {CurrentPage} / {TotalPages}", GetCurrentPage(doc), GetTotalPages(doc));
    _logger.LogInformation("Activities: {ActivitiesCount}", activities.GetArrayLength());

    var tasks = new List<Task>(activities.GetArrayLength());
    foreach (var activity in activities.EnumerateArray())
    {
        tasks.Add(UpsertActivity(activity, cancellationToken));
    }

    // All requests are in flight at this point; wait for every one of them.
    await Task.WhenAll(tasks);
}

// Serializes one activity and upserts it, keeping its stream alive until the request has completed.
private async Task UpsertActivity(JsonElement activity, CancellationToken cancellationToken)
{
    // The stream is owned here, so it is disposed only after the awaited upsert finishes,
    // not at the end of the caller's loop iteration while the request may still read it.
    await using var stream = new MemoryStream();
    await using (var writer = new Utf8JsonWriter(stream))
    {
        activity.WriteTo(writer);
    }

    stream.Position = 0;

    // The activity's "id" property is used as the partition key value.
    using var response = await _everestingHOFContainer.UpsertItemStreamAsync(
        stream,
        new PartitionKey(activity.GetProperty("id").GetString()),
        cancellationToken: cancellationToken);

    // Stream APIs report failures through the status code rather than by throwing; surface them.
    response.EnsureSuccessStatusCode();
}
240+
}

0 commit comments

Comments
 (0)