Skip to content

Commit 1bbb6e4

Browse files
wofferl authored and Grotax committed
feat: improve interval calculation and feed update scheduling
- Use IQR method for outlier detection in average calculation
- Calculate median using middle two values for even counts
- Use newest item date for sleep detection and next update
- Prevent future dates from affecting calculations
- Add comprehensive edge case tests

Signed-off-by: Wolfgang <[email protected]>
1 parent c491766 commit 1bbb6e4

File tree

3 files changed

+57
-9
lines changed

3 files changed

+57
-9
lines changed

bin/feedio

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ function read(FeedIo $feedIo, string $url)
5151

5252
$updateStats = $result->getUpdateStats();
5353

54-
echo "\033[32mMinimum interval between items : \033[34m".formatDateInterval($updateStats->getMedianInterval())."\033[0m" . PHP_EOL;
54+
echo "\033[32mMinimum interval between items : \033[34m".formatDateInterval($updateStats->getMinInterval())."\033[0m" . PHP_EOL;
5555
echo "\033[32mMedian interval : \033[34m".formatDateInterval($updateStats->getMedianInterval())."\033[0m" . PHP_EOL;
5656
echo "\033[32mAverage interval : \033[34m".formatDateInterval($updateStats->getAverageInterval())."\033[0m" . PHP_EOL;
5757
echo "\033[32mMaximum interval : \033[34m".formatDateInterval($updateStats->getMaxInterval())."\033[0m". PHP_EOL;

src/FeedIo/Reader/Result/UpdateStats.php

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,23 @@ class UpdateStats
3131

3232
protected array $intervals = [];
3333

34+
protected int $newestItemDate = 0;
35+
3436
/**
3537
* UpdateStats constructor.
3638
* @param FeedInterface $feed
3739
*/
3840
public function __construct(
    protected FeedInterface $feed
) {
    // Extract the dates once: both the newest-date lookup and the interval
    // computation below work on the same list.
    $itemDates = $this->extractDates($feed);

    // With at least one date, keep the latest one but cap it at the current
    // time so a future-dated item cannot push the reference point forward.
    // With no dates at all, fall back to whatever getFeedTimestamp() returns.
    $this->newestItemDate = count($itemDates) > 0
        ? min(max($itemDates), time())
        : $this->getFeedTimestamp();

    $this->intervals = $this->computeIntervals($itemDates);
}
4352

4453
/**
@@ -57,7 +66,6 @@ public function computeNextUpdate(
5766
if ($this->isSleepy($sleepyDuration, $marginRatio)) {
5867
return (new \DateTime())->setTimestamp(time() + $sleepyDelay);
5968
}
60-
$feedTimeStamp = $this->getFeedTimestamp();
6169
$now = time();
6270
$intervals = [
6371
$this->getAverageInterval(),
@@ -66,7 +74,7 @@ public function computeNextUpdate(
6674
sort($intervals);
6775
$newTimestamp = $now + $minDelay;
6876
foreach ($intervals as $interval) {
69-
$computedTimestamp = $this->addInterval($feedTimeStamp, $interval, $marginRatio);
77+
$computedTimestamp = $this->addInterval($this->newestItemDate, $interval, $marginRatio);
7078
if ($computedTimestamp > $now) {
7179
$newTimestamp = $computedTimestamp;
7280
break;
@@ -82,7 +90,7 @@ public function computeNextUpdate(
8290
*/
8391
public function isSleepy(int $sleepyDuration, float $marginRatio): bool
{
    // The feed counts as sleepy once the current time has passed the newest
    // item date extended by $sleepyDuration (addInterval() applies $marginRatio).
    $now = time();
    $sleepThreshold = $this->addInterval($this->newestItemDate, $sleepyDuration, $marginRatio);

    return $now > $sleepThreshold;
}
8795

8896
/**
@@ -125,7 +133,27 @@ public function getMaxInterval(): int
125133
*/
126134
public function getAverageInterval(): int
{
    // Mean of the intervals after discarding outliers with the interquartile
    // range (IQR) rule. Returns 0 when there are no intervals.
    // Note: sorts $this->intervals in place, as the other interval getters do.
    sort($this->intervals);

    $count = count($this->intervals);
    if ($count === 0) {
        return 0;
    }

    // Some feeds carry very old historic articles; their huge intervals would
    // skew the mean, so drop everything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    // floor() returns a float, so cast explicitly before using it as an index.
    $q1 = $this->intervals[(int) floor($count * 0.25)];
    $q3 = $this->intervals[(int) floor($count * 0.75)];
    $iqr = $q3 - $q1;

    $lowerBound = $q1 - 1.5 * $iqr;
    $upperBound = $q3 + 1.5 * $iqr;

    $kept = array_filter(
        $this->intervals,
        static fn ($value): bool => $value >= $lowerBound && $value <= $upperBound
    );

    // Divide by the number of values that survived the filter. Dividing by the
    // unfiltered count would underestimate the mean whenever an outlier was removed.
    // Q1 and Q3 always lie inside the bounds, so $kept is never empty here;
    // the guard only protects against a division by zero on principle.
    $keptCount = count($kept);

    return $keptCount > 0 ? intval(floor(array_sum($kept) / $keptCount)) : 0;
}
@@ -136,9 +164,27 @@ public function getAverageInterval(): int
136164
public function getMedianInterval(): int
{
    // Order the intervals in place so the middle position(s) hold the median.
    sort($this->intervals);

    $size = count($this->intervals);
    if ($size === 0) {
        return 0;
    }

    $middle = intdiv($size, 2);

    // Odd count: the single middle value is the median.
    if ($size % 2 !== 0) {
        return $this->intervals[$middle];
    }

    // Even count: average the two middle values, rounded down.
    $lower = $this->intervals[$middle - 1];
    $upper = $this->intervals[$middle];

    return intval(floor(($lower + $upper) / 2));
}
181+
182+
/**
* Returns the newest-item date stored by the constructor: the latest
* extracted item date capped at the time of construction, or the value of
* getFeedTimestamp() when no dates were extracted.
*
183+
* @return int timestamp, in the same unit as time()
184+
*/
185+
public function getNewestItemDate(): int
186+
{
187+
return $this->newestItemDate;
142188
}
143189

144190
private function computeIntervals(array $dates): array

tests/FeedIo/Reader/Result/UpdateStatsTest.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ public function testIntervals()
3535
$this->assertEquals(86400, $stats->getMinInterval());
3636
$nextUpdate = $stats->computeNextUpdate();
3737
$averageInterval = $stats->getAverageInterval();
38-
$this->assertEquals($feed->getLastModified()->getTimestamp() + intval($averageInterval + 0.1 * $averageInterval), $nextUpdate->getTimestamp());
38+
$medianInterval = $stats->getMedianInterval();
39+
$computedInterval = ($medianInterval < $averageInterval ? $medianInterval : $averageInterval);
40+
$this->assertEquals($stats->getNewestItemDate() + intval($computedInterval + 0.1 * $computedInterval), $nextUpdate->getTimestamp());
3941
}
4042

4143
public function testSleepyFeed()

0 commit comments

Comments
 (0)