more robust parsing of user counts (#113)

jtracey · web-flow · commit df6ada5e74c1 · 2025-01-29T15:46:43.000-06:00
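The latest userstats-relay-country.csv file has a row whose user count is
written in scientific notation (e.g. '2e+05'), which int() cannot parse
directly. Parsing the value as a float first and then converting it to an
int prevents the parse from failing on such rows.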
diff --git a/tornettools/stage.py b/tornettools/stage.py
@@ -60,7 +60,9 @@ def stage_users(args, min_unix_time, max_unix_time):
 
             date = str(parts[0]) # like '2019-01-01'
             country_code = str(parts[1]) # like 'us'
-            user_count = int(parts[2]) # like '14714'
+            # At least one float has been observed in the file:
+            # <https://gitlab.torproject.org/tpo/network-health/metrics/website/-/issues/40121>
+            user_count = int(float(parts[2])) # like '14714' or '2e+05'
 
             dt = datetime.strptime(date, "%Y-%m-%d").replace(tzinfo=timezone.utc)
             unix_time = int(dt.strftime("%s")) # returns stamp like 1548910800