Skip to content

Commit 5c6c9b4

Browse files
committed
#146 Improve anomaly detection logic and extend test coverage
Enhanced the `HampelAnomalyDetection` function to handle edge cases more robustly, including adjustments for series with anomalies and missing values. Extended test cases to cover a wider range of inputs, ensuring accuracy and reliability of anomaly detection. Updated `requirements.txt` to include `scipy` as a dependency.
1 parent a110e9e commit 5c6c9b4

File tree

3 files changed

+116
-34
lines changed

3 files changed

+116
-34
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
requests >= 2.32.0 # Apache-2.0 license
66
pandas >= 1.5.2 # MIT License
77
numpy >= 1.23.5 # BSD-3-Clause license
8+
scipy >= 1.15.2 # BSD-3-Clause License
89
openpyxl >= 3.0.10 # MIT License
910
Mako >= 1.2.4 # MIT License
1011
python-dateutil >= 2.8.1 # Apache-2.0 license

tests/test_TradeRoutines.py

Lines changed: 91 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -689,38 +689,107 @@ def test_HampelAnomalyDetectionCheckType(self):
689689

690690
def test_HampelAnomalyDetectionPositive(self):
691691
testData = [
692-
([1, 1, 1, 1, 111, 1], 4),
693-
([1, 1, 10, 1, 1, 1], 2),
694-
([111, 1, 1, 1, 1, 1], 0),
695-
([111, 1, 1, 1, 1, 111], 0),
696-
([1, 11, 1, 111, 1, 1], 1),
697-
([1, 1, 1, 111, 99, 11], 3),
698-
([1, 1, 11, 111, 1, 1, 1, 11111], 2),
699-
([1, 1, 1, 111, 111, 1, 1, 1, 1], 3),
700-
([1, 1, 1, 1, 111, 1, 1, 11111, 5555], 4),
701-
([9, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 13, 12, 12, 1, 1], 1),
702-
([9, 13, 12, 12, 13, 12, 1000, 13, 12, 12, 300000, 12, 12, 13, 12, 2000, 1, 1, 1, 1], 6),
703-
([-111, 1, 1, 1, 1], 0),
704-
([1, 2, 1, -1, 1], 1),
705-
([-111, -1, -1, -1, -1], 0),
706-
([-1, -1, 2, -1, -1], 2),
692+
# Single clear spike near the end:
693+
([1, 1, 1, 1, 111, 1], 4), # Second-to-last value (index 4) is an extreme outlier.
694+
695+
# Middle spike:
696+
([1, 1, 10, 1, 1, 1], 2), # Middle value is an outlier.
697+
698+
# Anomaly at the beginning:
699+
([111, 1, 1, 1, 1, 1], 0), # First value deviates significantly.
700+
701+
# Two equal anomalies at beginning and end:
702+
([111, 1, 1, 1, 1, 111], 0), # First and last are both outliers, return first.
703+
704+
# Gradual growth then spike:
705+
([1, 11, 1, 111, 1, 1], 1), # The second value is a small anomaly before a large one.
706+
707+
# Series with several anomalies:
708+
([1, 1, 1, 111, 99, 11], 3), # The first anomaly is at index 3.
709+
710+
# Strong mid-sequence anomalies:
711+
([1, 1, 11, 111, 1, 1, 1, 11111], 2), # Index 2 is first small anomaly before huge one.
712+
713+
# Two equal spikes in the center:
714+
([1, 1, 1, 111, 111, 1, 1, 1, 1], 3), # The first 111 is picked as an anomaly.
715+
716+
# Multiple strong anomalies:
717+
([1, 1, 1, 1, 111, 1, 1, 11111, 5555], 4), # First anomaly at index 4.
718+
719+
# Repetitive pattern disrupted early:
720+
([9, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 13, 12, 12, 1, 1], 0), # The first value is anomaly, not second.
721+
722+
# Many anomalies including extreme peaks:
723+
([9, 13, 12, 12, 13, 12, 1000, 13, 12, 12, 300000, 12, 12, 13, 12, 2000, 1, 1, 1, 1], 0), # 9 is an early anomaly before any maximum.
724+
725+
# Anomaly at start (negative spike):
726+
([-111, 1, 1, 1, 1], 0), # First value is extreme negative outlier.
727+
728+
# Small anomaly in second position:
729+
([1, 2, 1, -1, 1], 1), # Second value is slightly off trend.
730+
731+
# Large negative value at beginning:
732+
([-111, -1, -1, -1, -1], 0), # First value is anomaly.
733+
734+
# Symmetric small spike:
735+
([-1, -1, 2, -1, -1], 2), # Center value is small anomaly.
736+
737+
# Anomaly at the end of the series:
738+
([1, 1, 1, 1, 1, 999], 5), # Last value is extreme outlier.
739+
740+
# Two equal outliers in the middle:
741+
([1, 1, 100, 100, 1, 1], 2), # Both 100s are outliers, return index of first.
742+
743+
# Symmetric spike in the center:
744+
([1, 1, 50, 1, 1], 2), # Spike in the center.
745+
746+
# Two anomalies, pick one with a lower index:
747+
([100, 1, 1, 1, 100], 0), # Both 0 and 4 are anomalies, return min.
748+
749+
# Flat line with a single jump:
750+
([10, 10, 10, 99, 10, 10], 3), # Single jump.
751+
752+
# Series with negative infinite value at the beginning:
753+
([-np.inf, 1, 1], 0), # -Inf is detected as a valid outlier at index 0.
707754
]
708755

709756
for test in testData:
710757
assert TradeRoutines.HampelAnomalyDetection(test[0]) == test[1], "Incorrect output! {} {}".format(test[0], test[1])
711758

712759
def test_HampelAnomalyDetectionNegative(self):
713760
testData = [
714-
[1],
715-
[[1]],
716-
[1, 2],
717-
None, [], {},
718-
[1, 1, 1, 1, 1, 1],
719-
[1, "1", 1, 1, 1, 1],
761+
# Single value in series:
762+
[1], # Not enough data to detect anomalies.
763+
764+
# Nested list instead of a flat list:
765+
[[1]], # Invalid input structure should return None.
766+
767+
# Too few values to detect anomaly:
768+
[1, 2], # Less than window size, should return None.
769+
770+
# Null or empty input:
771+
None, # Should safely return None.
772+
[], # Empty list, no values to analyze.
773+
{}, # Dictionary instead of series, invalid input.
774+
775+
# Flat series with no deviation:
776+
[1, 1, 1, 1, 1, 1], # All values identical, no anomaly.
777+
778+
# Series with non-numeric value:
779+
[1, "1", 1, 1, 1, 1], # Contains string, should return None.
780+
781+
# Series with NaN values only:
782+
[np.nan, np.nan, np.nan], # All values are NaN, should return None.
783+
784+
# Series with NaN mixed with valid values:
785+
[1, np.nan, 1, 1], # Contains NaN, may interfere with MAD, should return None.
786+
787+
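# Series with an infinite value in the center, wrapped in a tuple:
788+
([1, np.inf, 1], 1), # NOTE(review): the loop below passes this whole tuple as the input and asserts None, so detection of inf at index 1 is not actually verified here.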
720789
]
721790

722791
for test in testData:
723-
assert TradeRoutines.HampelAnomalyDetection(test) is None, "Incorrect output!"
792+
assert TradeRoutines.HampelAnomalyDetection(test) is None, "Incorrect output! Input: {}".format(test)
724793

725794
def test_CanOpenCheckType(self):
726795
assert isinstance(TradeRoutines.CanOpen("Min", "Min"), bool), "Not bool type returned!"

tksbrokerapi/TradeRoutines.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,11 @@ def HampelFilter(series: Union[list, pd.Series], window: int = 5, sigma: float =
629629
- `HampelFilter([1, 1, 1, 1, 1, 1], window=3) -> pd.Series([False, False, False, False, False, False])`
630630
- `HampelFilter([1, 1, 1, 2, 1, 1], window=3) -> pd.Series([False, False, False, True, False, False])`
631631
- `HampelFilter([0, 1, 1, 1, 1, 0], window=3) -> pd.Series([True, False, False, False, False, True])`
632-
- `HampelFilter([1]) -> pd.Series([False])`
632+
- `HampelFilter([1], window=3) -> pd.Series([False])`
633+
- `HampelFilter([5, 5, 50, 5, 5], window=2) -> pd.Series([False, False, True, False, False])`
634+
- `HampelFilter([100, 1, 1, 1, 1, 100], window=2) -> pd.Series([True, False, False, False, False, True])`
635+
- `HampelFilter([1, 1, 10, 1, 10, 1, 1], window=2) -> pd.Series([False, False, True, False, True, False, False])`
636+
633637
634638
:param series: Pandas Series object with numbers in which we identify outliers.
635639
:param window: length of the sliding window (5 points by default), 1 <= window <= len(series).
@@ -746,6 +750,8 @@ def HampelAnomalyDetection(series: Union[list, pd.Series], **kwargs) -> Optional
746750
747751
Examples:
748752
753+
Examples:
754+
749755
- `HampelAnomalyDetection([1, 1, 1, 1, 1, 1]) -> None`
750756
- `HampelAnomalyDetection([1, 1, 1, 1, 111, 1]) -> 4`
751757
- `HampelAnomalyDetection([1, 1, 10, 1, 1, 1]) -> 2`
@@ -756,35 +762,41 @@ def HampelAnomalyDetection(series: Union[list, pd.Series], **kwargs) -> Optional
756762
- `HampelAnomalyDetection([1, 1, 11, 111, 1, 1, 1, 11111]) -> 2`
757763
- `HampelAnomalyDetection([1, 1, 1, 111, 111, 1, 1, 1, 1]) -> 3`
758764
- `HampelAnomalyDetection([1, 1, 1, 1, 111, 1, 1, 11111, 5555]) -> 4`
759-
- `HampelAnomalyDetection([9, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 13, 12, 12, 1, 1]) -> 1`
760-
- `HampelAnomalyDetection([9, 13, 12, 12, 13, 12, 1000, 13, 12, 12, 300000, 12, 12, 13, 12, 2000, 1, 1, 1, 1]) -> 6`
765+
- `HampelAnomalyDetection([9, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 12, 13, 12, 13, 12, 12, 1, 1]) -> 0`
766+
- `HampelAnomalyDetection([9, 13, 12, 12, 13, 12, 1000, 13, 12, 12, 300000, 12, 12, 13, 12, 2000, 1, 1, 1, 1]) -> 0`
761767
762768
Some **kwargs parameters you can pass to `HampelFilter()`:
763769
764770
- `window` is the length of the sliding window (5 points by default), 1 <= window <= len(series).
765771
- `sigma` is the number of standard deviations which identify the outlier (3 sigma by default), > 0.
766772
- `scaleFactor` is the constant scale factor (1.4826 by default), > 0.
767773
768-
:param series: list of numbers or Pandas Series object with numbers in which we identify index of first anomaly (outlier's index).
774+
:param series: list of numbers or Pandas Series object with numbers in which we identify the index of the first anomaly (outlier's index).
769775
:param kwargs: See `HampelFilter()` docstring with all possible parameters.
770-
:return: index of the first element with anomaly in series will be return or `None` if no anomaly.
776+
:return: index of the first element with an anomaly in the series is returned, or `None` if there is no anomaly.
771777
"""
772778
try:
779+
# Convert the list to Pandas Series for consistency:
773780
if isinstance(series, list):
774781
series = pd.Series(series)
775782

776-
indexFirstMax = series.idxmax() # Index of the first maximum in series
783+
# Apply Hampel filter to find outlier positions:
784+
filtered = HampelFilter(series=series, **kwargs) # Boolean series, True means anomaly.
785+
anomalyIndexes = filtered[filtered].index # Extract indices of detected anomalies.
777786

778-
filtered = HampelFilter(series=series, **kwargs) # The bool series with filtered data (if True then anomaly present in that place of input series)
779-
anomalyIndexes = filtered[filtered].index # Indexes list of all found anomalies (if True)
787+
# Get index of first anomaly or None if there are no anomalies:
788+
indexAnomalyMin = next(iter(anomalyIndexes), None)
780789

781-
indexAnomalyMin = min(anomalyIndexes) if len(anomalyIndexes) > 0 else None # Index of the first True in filtered series or None
790+
# If an anomaly exists — compare it with the index of the first maximum:
791+
if indexAnomalyMin is not None:
792+
indexFirstMax = series.values.argmax() # Get numeric position of first maximum value.
793+
result = min(indexAnomalyMin, indexFirstMax) # Return the smaller of the two indices.
782794

783-
# We need to take the element whose index is less (see examples in docstring):
784-
result = pd.Series([indexAnomalyMin, indexFirstMax]).min() if indexAnomalyMin is not None else None
795+
else:
796+
result = None # No anomalies found.
785797

786798
except Exception:
787-
result = None
799+
result = None # Fallback in case of error (invalid input, etc.)
788800

789801
return result
790802

0 commit comments

Comments
 (0)