Skip to content

Commit 344955f

Browse files
author
Andrii Bondarchuk
committed
Script improvements
1 parent 2373fd4 commit 344955f

File tree

1 file changed

+52
-36
lines changed

1 file changed

+52
-36
lines changed

src/FitSyncHub.Functions/Functions/EverestingHOFScraperHttpTriggerFunction.cs

Lines changed: 52 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
using Microsoft.Azure.Functions.Worker;
1010
using Microsoft.Extensions.Logging;
1111
using Microsoft.Extensions.Primitives;
12-
using Newtonsoft.Json.Linq;
1312

1413
namespace FitSyncHub.Functions.Functions;
1514

@@ -91,9 +90,11 @@ public async Task<IActionResult> Run(
9190
return new OkObjectResult("Success");
9291
}
9392

94-
private static bool AllActiviesSynced(JsonDocument activitiesPortionJsonDocument, DateTime lastSyncedDateTime)
93+
private static bool AllActiviesSynced(
94+
JsonElement root,
95+
DateTime lastSyncedDateTime)
9596
{
96-
var activities = GetActivitiesArray(activitiesPortionJsonDocument);
97+
var activities = GetActivitiesArray(root);
9798
HashSet<DateTime> dates = [];
9899

99100
foreach (var activity in activities.EnumerateArray())
@@ -114,9 +115,9 @@ SELECT top 1 * FROM c
114115
ORDER BY c.date DESC
115116
""");
116117

117-
var feed = _everestingHOFContainer.GetItemQueryIterator<JObject>(
118-
queryDefinition: query
119-
);
118+
var feed = _everestingHOFContainer.GetItemQueryIterator<Newtonsoft.Json.Linq.JObject>(
119+
queryDefinition: query
120+
);
120121

121122
while (feed.HasMoreResults)
122123
{
@@ -129,7 +130,7 @@ ORDER BY c.date DESC
129130
var item = response.Single();
130131

131132
var date = item["date"];
132-
if (date is null || date.Type != JTokenType.String)
133+
if (date is null || date.Type != Newtonsoft.Json.Linq.JTokenType.String)
133134
{
134135
throw new Exception();
135136
}
@@ -140,7 +141,13 @@ ORDER BY c.date DESC
140141
throw new Exception("Can't get last synced date time");
141142
}
142143

143-
private static JsonDocument ExtractActivitiesJson(string html)
144+
private static JsonElement ExtractActivitiesJson(string html)
145+
{
146+
var scriptParts = GetScriptParts(html);
147+
return ParseActivitiesJsonFromFullScript(scriptParts);
148+
}
149+
150+
private static IEnumerable<string> GetScriptParts(string html)
144151
{
145152
var doc = new HtmlDocument();
146153
doc.LoadHtml(html);
@@ -150,7 +157,6 @@ private static JsonDocument ExtractActivitiesJson(string html)
150157
.SelectNodes("//script")
151158
?? throw new Exception("No script tags found");
152159

153-
var sb = new StringBuilder();
154160
foreach (var script in scripts)
155161
{
156162
var text = script.InnerText;
@@ -169,48 +175,58 @@ private static JsonDocument ExtractActivitiesJson(string html)
169175
.Replace("\"])", "");
170176

171177
text = Regex.Unescape(text);
172-
sb.Append(text);
178+
yield return text;
173179
}
180+
}
174181

175-
var fullScript = sb.ToString();
182+
private static JsonElement ParseActivitiesJsonFromFullScript(IEnumerable<string> scripts)
183+
{
184+
const string StartPattern = "12:[\"$\",\"$L1c\",null,";
185+
byte[]? data = default;
176186

177-
const string ActivitiesJsonStartPattern = "12:[\"$\",\"$L1c\",null,";
178-
var activitiesJsonStartIndex = fullScript.IndexOf(ActivitiesJsonStartPattern);
187+
foreach (var script in scripts)
188+
{
189+
if (data is null)
190+
{
191+
var idx = script.IndexOf(StartPattern);
192+
if (idx < 0)
193+
{
194+
continue;
195+
}
179196

180-
var startFrom = activitiesJsonStartIndex + ActivitiesJsonStartPattern.Length;
181-
// -2 to remove ']}'
182-
var length = fullScript.Length - startFrom - 2;
197+
data = Encoding.UTF8.GetBytes(script[(idx + StartPattern.Length)..]);
183198

184-
var json = fullScript.Substring(startFrom, length);
199+
}
200+
else
201+
{
202+
data = [.. data, .. Encoding.UTF8.GetBytes(script)];
203+
}
185204

186-
return JsonDocument.Parse(json);
187-
}
205+
//isFinalBlock: false to avoid exception throwing
206+
var reader = new Utf8JsonReader(data, isFinalBlock: false, new());
207+
if (JsonDocument.TryParseValue(ref reader, out var document))
208+
{
209+
return document.RootElement;
210+
}
188211

189-
private static JsonElement GetActivitiesArray(JsonDocument doc)
190-
{
191-
var root = doc.RootElement;
192-
return root.GetProperty("activities");
193-
}
212+
// Not enough data yet — continue
213+
}
194214

195-
private static int GetCurrentPage(JsonDocument doc)
196-
{
197-
var root = doc.RootElement;
198-
return root.GetProperty("currentPage").GetInt32();
215+
throw new InvalidOperationException("no json");
199216
}
200217

201-
private static int GetTotalPages(JsonDocument doc)
202-
{
203-
var root = doc.RootElement;
204-
return root.GetProperty("totalPages").GetInt32();
205-
}
218+
219+
private static JsonElement GetActivitiesArray(JsonElement root) => root.GetProperty("activities");
220+
private static int GetCurrentPage(JsonElement root) => root.GetProperty("currentPage").GetInt32();
221+
private static int GetTotalPages(JsonElement root) => root.GetProperty("totalPages").GetInt32();
206222

207223
private async Task StoreData(
208-
JsonDocument doc,
224+
JsonElement root,
209225
CancellationToken cancellationToken)
210226
{
211-
var activities = GetActivitiesArray(doc);
227+
var activities = GetActivitiesArray(root);
212228

213-
_logger.LogInformation("Page {CurrentPage} / {TotalPages}", GetCurrentPage(doc), GetTotalPages(doc));
229+
_logger.LogInformation("Page {CurrentPage} / {TotalPages}", GetCurrentPage(root), GetTotalPages(root));
214230
_logger.LogInformation("Activities: {ActivitiesCount}", activities.GetArrayLength());
215231

216232
var tasks = new List<Task>();

0 commit comments

Comments
 (0)