Skip to content

Commit 109df6a

Browse files
Thomas Stromberg
authored and committed
make timestamp parsing not suck
1 parent 1d9e4b0 commit 109df6a

File tree

4 files changed

+251
-20
lines changed

4 files changed

+251
-20
lines changed

poll/poll.go

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,18 +120,39 @@ func (m *Monitor) CheckAll(ctx context.Context) error {
120120

121121
// Use any subscriber's thread info to check intervals (they should all be the same)
122122
thread := info.thread
123-
interval := calculateInterval(thread.LastPostTime, thread.LastPolledAt)
123+
interval, reason := calculateInterval(thread.LastPostTime, thread.LastPolledAt)
124124
timeSinceLastPoll := time.Now().Sub(thread.LastPolledAt)
125125
needsCheck := timeSinceLastPoll >= interval
126126

127+
// Format times for logging, handling zero values
128+
lastPolledStr := "never"
129+
if !thread.LastPolledAt.IsZero() {
130+
lastPolledStr = thread.LastPolledAt.Format(time.RFC3339)
131+
}
132+
lastPostTimeStr := "none"
133+
if !thread.LastPostTime.IsZero() {
134+
lastPostTimeStr = thread.LastPostTime.Format(time.RFC3339)
135+
}
136+
timeSinceLastPostStr := "n/a"
137+
if !thread.LastPostTime.IsZero() {
138+
timeSinceLastPostStr = time.Since(thread.LastPostTime).Round(time.Second).String()
139+
}
140+
timeSincePollStr := "n/a"
141+
if !thread.LastPolledAt.IsZero() {
142+
timeSincePollStr = timeSinceLastPoll.Round(time.Second).String()
143+
}
144+
127145
m.logger.Info(fmt.Sprintf("Thread %d/%d: Evaluating", threadNum, len(uniqueThreads)),
128146
"cycle", m.cycleNumber,
129147
"thread_url", threadURL,
130148
"thread_title", thread.ThreadTitle,
131149
"subscriber_count", len(info.subscribers),
132-
"last_polled", thread.LastPolledAt.Format(time.RFC3339),
133-
"time_since_poll", timeSinceLastPoll.Round(time.Second).String(),
150+
"last_polled", lastPolledStr,
151+
"last_post_time", lastPostTimeStr,
152+
"time_since_last_post", timeSinceLastPostStr,
153+
"time_since_poll", timeSincePollStr,
134154
"required_interval", interval.String(),
155+
"interval_reason", reason,
135156
"needs_check", needsCheck)
136157

137158
if !needsCheck {
@@ -288,6 +309,13 @@ func (m *Monitor) checkThreadForSubscribers(ctx context.Context, info *threadChe
288309
thread.LastPolledAt = now
289310
if !latestPostTime.IsZero() {
290311
thread.LastPostTime = latestPostTime
312+
} else if thread.LastPostTime.IsZero() {
313+
// Log warning if we still don't have a post time after fetching
314+
m.logger.Warn("No post timestamp available after fetching thread - interval calculation will default to immediate polling",
315+
"cycle", m.cycleNumber,
316+
"email", email,
317+
"thread_url", threadURL,
318+
"thread_title", thread.ThreadTitle)
291319
}
292320

293321
// First check for this subscriber - just record the latest post ID
@@ -445,28 +473,48 @@ func (m *Monitor) checkThreadForSubscribers(ctx context.Context, info *threadChe
445473
}
446474

447475
// calculateInterval determines how often to poll a thread based on activity.
//
// Parameters:
//   - lastPostTime: time of the most recent post recorded for the thread
//     (zero value if none has been recorded).
//   - lastPolledAt: time the thread was last polled (zero value if never).
//
// Returns the interval duration and a human-readable reason explaining the
// decision. It NEVER returns 0s: every path below returns one of a fixed set
// of positive durations, which prevents polling loops.
func calculateInterval(lastPostTime, lastPolledAt time.Time) (time.Duration, string) {
	const (
		minInterval = 5 * time.Minute // Minimum safe interval
		maxInterval = 6 * time.Hour   // Slowest polling rate
	)

	// If never polled before, use minimum interval.
	if lastPolledAt.IsZero() {
		return minInterval, "never polled before (using minimum interval)"
	}

	// If no post time recorded, use maximum interval (something is wrong).
	if lastPostTime.IsZero() {
		return maxInterval, "ERROR: no post time recorded (using maximum interval to avoid polling loop)"
	}

	// The more recent the last post, the more frequently the thread is polled.
	// A post time in the future yields a negative age and therefore lands in
	// the first (most frequent) case.
	timeSinceLastPost := time.Since(lastPostTime)
	switch {
	case timeSinceLastPost < 30*time.Minute:
		return minInterval, "very active thread (post < 30m ago)"
	case timeSinceLastPost < 2*time.Hour:
		return 10 * time.Minute, "active thread (post < 2h ago)"
	case timeSinceLastPost < 6*time.Hour:
		return 20 * time.Minute, "moderately active thread (post < 6h ago)"
	case timeSinceLastPost < 24*time.Hour:
		return 1 * time.Hour, "daily active thread (post < 24h ago)"
	default:
		return maxInterval, "inactive thread (post > 24h ago)"
	}
}

scraper/scraper.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,8 +309,30 @@ func parsePage(body interface{ Read([]byte) (int, error) }, threadURL string) (*
309309
// Extract author
310310
author := strings.TrimSpace(s.Find("a.username").First().Text())
311311

312-
// Extract timestamp
313-
timestamp, _ := s.Find("time").First().Attr("datetime")
312+
// Extract timestamp - ADVRider uses two formats:
313+
// 1. Older posts: <span class="DateTime" title="Jul 24, 2008 at 12:50 PM">
314+
// 2. Recent posts: <abbr class="DateTime" data-time="1760448714" title="Oct 14, 2025 at 9:31 AM">
315+
var timestamp string
316+
dateTimeElem := s.Find(".DateTime").First()
317+
if dateTimeElem.Length() > 0 {
318+
// Try abbr with data-time (Unix timestamp) first - this is the most accurate
319+
if unixStr, exists := dateTimeElem.Attr("data-time"); exists && unixStr != "" {
320+
var unixSec int64
321+
if _, err := fmt.Sscanf(unixStr, "%d", &unixSec); err == nil {
322+
timestamp = time.Unix(unixSec, 0).UTC().Format(time.RFC3339)
323+
}
324+
}
325+
326+
// Fall back to title attribute (human-readable format)
327+
if timestamp == "" {
328+
if titleStr, exists := dateTimeElem.Attr("title"); exists && titleStr != "" {
329+
// Parse ADVRider's title format: "Oct 14, 2025 at 9:31 AM"
330+
if t, err := time.Parse("Jan 2, 2006 at 3:04 PM", titleStr); err == nil {
331+
timestamp = t.UTC().Format(time.RFC3339)
332+
}
333+
}
334+
}
335+
}
314336

315337
// Extract content from blockquote
316338
blockquote := s.Find("blockquote.messageText").First()

scraper/scraper_test.go

Lines changed: 159 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,17 @@ func TestParseDurhamThread(t *testing.T) {
6767
if firstPost.URL == "" {
6868
t.Error("First post should have a URL")
6969
}
70+
if firstPost.Timestamp == "" {
71+
t.Error("First post should have a timestamp")
72+
} else {
73+
// Validate it can be parsed
74+
if _, err := time.Parse(time.RFC3339, firstPost.Timestamp); err != nil {
75+
t.Errorf("Failed to parse timestamp %q: %v", firstPost.Timestamp, err)
76+
}
77+
}
7078

71-
t.Logf("First post ID: %s, Author: %s, Content length: %d bytes",
72-
firstPost.ID, firstPost.Author, len(firstPost.Content))
79+
t.Logf("First post ID: %s, Author: %s, Timestamp: %s, Content length: %d bytes",
80+
firstPost.ID, firstPost.Author, firstPost.Timestamp, len(firstPost.Content))
7381

7482
// Validate URL format
7583
expectedURLPrefix := threadURL + "#post-"
@@ -78,8 +86,8 @@ func TestParseDurhamThread(t *testing.T) {
7886
}
7987
}
8088

81-
// TestParseDurhamThreadLastPage validates we can parse a specific page number.
82-
func TestParseDurhamThreadLastPage(t *testing.T) {
89+
// TestParseDurhamThreadPage293 validates we can parse a specific page number.
90+
func TestParseDurhamThreadPage293(t *testing.T) {
8391
if testing.Short() {
8492
t.Skip("skipping integration test in short mode")
8593
}
@@ -108,4 +116,151 @@ func TestParseDurhamThreadLastPage(t *testing.T) {
108116
t.Fatal("No posts found on page 293")
109117
}
110118
t.Logf("Found %d posts on page 293", len(page.Posts))
119+
120+
// Validate last post has timestamp (most important for latest post tracking)
121+
lastPost := page.Posts[len(page.Posts)-1]
122+
if lastPost.Timestamp == "" {
123+
t.Error("Last post on page 293 should have a timestamp")
124+
} else {
125+
// Validate it can be parsed
126+
parsedTime, err := time.Parse(time.RFC3339, lastPost.Timestamp)
127+
if err != nil {
128+
t.Errorf("Failed to parse timestamp %q: %v", lastPost.Timestamp, err)
129+
} else {
130+
t.Logf("Last post ID: %s, Author: %s, Timestamp: %s",
131+
lastPost.ID, lastPost.Author, parsedTime.Format(time.RFC3339))
132+
}
133+
}
134+
}
135+
136+
// TestParseDurhamThreadLatestPost validates we can fetch the absolute latest post.
137+
func TestParseDurhamThreadLatestPost(t *testing.T) {
138+
if testing.Short() {
139+
t.Skip("skipping integration test in short mode")
140+
}
141+
142+
client := &http.Client{Timeout: 30 * time.Second}
143+
logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))
144+
s := New(client, logger)
145+
146+
ctx := context.Background()
147+
threadURL := "https://advrider.com/f/threads/durham-rtp-wednesday-advlunch.365943/"
148+
149+
// Use LatestPost to fetch the most recent post (same as subscription creation does)
150+
post, title, err := s.LatestPost(ctx, threadURL)
151+
if err != nil {
152+
t.Fatalf("Failed to fetch Durham thread latest post: %v", err)
153+
}
154+
155+
t.Logf("Thread title: %q", title)
156+
if title == "" {
157+
t.Error("Thread title should not be empty")
158+
}
159+
160+
// Validate post ID
161+
if post.ID == "" {
162+
t.Fatal("Latest post should have an ID")
163+
}
164+
t.Logf("Latest post ID: %s", post.ID)
165+
166+
// Validate post timestamp - CRITICAL
167+
if post.Timestamp == "" {
168+
t.Fatal("Latest post should have a timestamp")
169+
}
170+
t.Logf("Latest post timestamp: %s", post.Timestamp)
171+
172+
// Validate timestamp can be parsed as RFC3339
173+
parsedTime, err := time.Parse(time.RFC3339, post.Timestamp)
174+
if err != nil {
175+
t.Fatalf("Failed to parse timestamp %q as RFC3339: %v", post.Timestamp, err)
176+
}
177+
t.Logf("Parsed timestamp: %s", parsedTime.Format(time.RFC3339))
178+
179+
// Validate timestamp is reasonable (thread started in 2008, so posts should be after that)
180+
now := time.Now()
181+
if parsedTime.IsZero() {
182+
t.Error("Parsed timestamp should not be zero")
183+
}
184+
if parsedTime.After(now) {
185+
t.Errorf("Parsed timestamp %s is in the future (now: %s)", parsedTime, now)
186+
}
187+
if parsedTime.Year() < 2008 {
188+
t.Errorf("Parsed timestamp %s is before thread creation (2008)", parsedTime)
189+
}
190+
191+
// Validate other post fields
192+
if post.Author == "" {
193+
t.Error("Latest post should have an author")
194+
}
195+
if post.Content == "" {
196+
t.Error("Latest post should have content")
197+
}
198+
t.Logf("Latest post by %s, content length: %d bytes", post.Author, len(post.Content))
199+
}
200+
201+
// TestParseElectricMotorcycleThread validates timestamp parsing for the Electric Motorcycle thread.
202+
func TestParseElectricMotorcycleThread(t *testing.T) {
203+
if testing.Short() {
204+
t.Skip("skipping integration test in short mode")
205+
}
206+
207+
client := &http.Client{Timeout: 30 * time.Second}
208+
logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo}))
209+
s := New(client, logger)
210+
211+
ctx := context.Background()
212+
threadURL := "https://advrider.com/f/threads/electric-motorcycle-scooter-news-updates.1154248/"
213+
214+
// Fetch the latest post
215+
post, title, err := s.LatestPost(ctx, threadURL)
216+
if err != nil {
217+
t.Fatalf("Failed to fetch Electric Motorcycle thread: %v", err)
218+
}
219+
220+
// Validate thread title
221+
t.Logf("Thread title: %q", title)
222+
if title == "" {
223+
t.Error("Thread title should not be empty")
224+
}
225+
226+
// Validate post ID
227+
if post.ID == "" {
228+
t.Fatal("Latest post should have an ID")
229+
}
230+
t.Logf("Latest post ID: %s", post.ID)
231+
232+
// Validate post timestamp - THIS IS THE CRITICAL CHECK
233+
if post.Timestamp == "" {
234+
t.Fatal("Latest post should have a timestamp")
235+
}
236+
t.Logf("Latest post timestamp: %s", post.Timestamp)
237+
238+
// Validate timestamp can be parsed as RFC3339
239+
parsedTime, err := time.Parse(time.RFC3339, post.Timestamp)
240+
if err != nil {
241+
t.Fatalf("Failed to parse timestamp %q as RFC3339: %v", post.Timestamp, err)
242+
}
243+
t.Logf("Parsed timestamp: %s", parsedTime.Format(time.RFC3339))
244+
245+
// Validate timestamp is not zero and is reasonable (not in the future, not too old)
246+
now := time.Now()
247+
if parsedTime.IsZero() {
248+
t.Error("Parsed timestamp should not be zero")
249+
}
250+
if parsedTime.After(now) {
251+
t.Errorf("Parsed timestamp %s is in the future (now: %s)", parsedTime, now)
252+
}
253+
// Electric motorcycle thread started in 2013, so posts should be after that
254+
if parsedTime.Year() < 2013 {
255+
t.Errorf("Parsed timestamp %s is before thread creation (2013)", parsedTime)
256+
}
257+
258+
// Validate other post fields
259+
if post.Author == "" {
260+
t.Error("Latest post should have an author")
261+
}
262+
if post.Content == "" {
263+
t.Error("Latest post should have content")
264+
}
265+
t.Logf("Latest post by %s, content length: %d bytes", post.Author, len(post.Content))
111266
}

server/subscribe.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,18 @@ func (s *Server) handleSubscribe(w http.ResponseWriter, r *http.Request) {
132132
return
133133
}
134134

135-
// Parse post timestamp to initialize LastPostTime
136-
var lastPostTime time.Time
137-
if post.Timestamp != "" {
138-
if t, err := time.Parse(time.RFC3339, post.Timestamp); err == nil {
139-
lastPostTime = t
140-
}
135+
// Validate and parse post timestamp to initialize LastPostTime
136+
if post.Timestamp == "" {
137+
s.logger.Error("Latest post has empty timestamp", "url", baseThreadURL, "title", threadTitle, "post_id", post.ID)
138+
http.Error(w, "Could not determine post timestamp - the page structure may have changed", http.StatusInternalServerError)
139+
return
140+
}
141+
142+
lastPostTime, err := time.Parse(time.RFC3339, post.Timestamp)
143+
if err != nil {
144+
s.logger.Error("Failed to parse post timestamp", "url", baseThreadURL, "title", threadTitle, "post_id", post.ID, "timestamp", post.Timestamp, "error", err)
145+
http.Error(w, "Could not parse post timestamp - the page structure may have changed", http.StatusInternalServerError)
146+
return
141147
}
142148

143149
s.logger.Info("Creating subscription with latest post ID",

0 commit comments

Comments
 (0)