Skip to content

Commit d076571

Browse files
authored
Merge pull request #404 from kaituo/log4.1
Add Python Wrapper for RCF and Fix Error Message
2 parents 0859252 + f5c078a commit d076571

File tree

7 files changed

+257
-1
lines changed

7 files changed

+257
-1
lines changed

Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
import java.util.Arrays;
2929
import java.util.List;
30+
import java.util.Locale;
3031
import java.util.Random;
3132

3233
import com.amazon.randomcutforest.RandomCutForest;
@@ -960,7 +961,7 @@ public void setLastScore(double[] score) {
960961
}
961962

962963
void validateIgnore(double[] shift, int length) {
963-
checkArgument(shift.length == length, () -> "has to be of length " + 4 * baseDimension);
964+
checkArgument(shift.length == length, () -> String.format(Locale.ROOT, "has to be of length %d but is %d", length, shift.length));
964965
for (double element : shift) {
965966
checkArgument(element >= 0, "has to be non-negative");
966967
}

python_rcf_wrapper/README.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Random Cut Forest (RCF) in Python
2+
3+
RCF (Random Cut Forest) is implemented in Java and Rust. To use it in Python, follow these steps:
4+
5+
## Step 1: Install JPype
6+
7+
Install JPype to enable the interaction between Python and Java. You can find the installation instructions at [JPype Installation](https://jpype.readthedocs.io/en/latest/install.html).
8+
9+
## Step 2: Import and Use TRCF from `python_rcf_wrapper`
10+
11+
Import `TRandomCutForestModel` (aliased as `TRCF` below) from `python_rcf_wrapper.trcf_model` and call its `process` method once per data point. The following Python script demonstrates this:
12+
13+
```python
14+
import numpy as np

from python_rcf_wrapper.trcf_model import TRandomCutForestModel as TRCF

# Model configuration
shingle_size = 8
dimensions = 2
num_trees = 50
output_after = 32
sample_size = 256

# Build the model; the forest dimension is the shingle size times the
# number of values in each input point.
model = TRCF(
    rcf_dimensions=shingle_size * dimensions,
    shingle_size=shingle_size,
    num_trees=num_trees,
    output_after=output_after,
    anomaly_rate=0.001,
    z_factor=3,
    score_differencing=0.5,
    sample_size=sample_size,
)

# 300 random two-value points to feed through the model
samples = np.random.normal(size=(300, 2))

# Feed the points one at a time and report the score and grade of each
for point in samples:
    result = model.process(point)
    score, grade = result.getRCFScore(), result.getAnomalyGrade()
    print("RCF score: {}, Anomaly grade: {}".format(score, grade))
43+
```

python_rcf_wrapper/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from pathlib import Path
2+
3+
import logging
4+
5+
# Import JPype Module for Java imports
6+
import jpype.imports
7+
from jpype.types import *
8+
9+
import os
10+
11+
# JAVA_HOME is read here but not used anywhere else in this module.
java_home = os.environ.get("JAVA_HOME", None)

# Default location of the Java libraries: the "lib" directory next to this file.
DEFAULT_JAVA_PATH = Path(__file__).parent / "lib"

# The JAVA_LIB environment variable overrides the default directory; the
# trailing "*" adds every entry of that directory to the class path.
java_path = str(Path(os.environ.get("JAVA_LIB", DEFAULT_JAVA_PATH)) / "*")

jpype.addClassPath(java_path)

# Launch the JVM only if this process has not started one yet:
# jpype.startJVM() raises when a JVM is already running, which would make
# importing this package fail inside an application that uses JPype itself.
if not jpype.isJVMStarted():
    jpype.startJVM(convertStrings=False)

logging.info("availableProcess %s", jpype.java.lang.Runtime.getRuntime().availableProcessors())
312 KB
Binary file not shown.
Binary file not shown.

python_rcf_wrapper/rcf_model.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Java imports
2+
from typing import List, Optional, Tuple, Any
3+
4+
import numpy as np
5+
import logging
6+
from com.amazon.randomcutforest import RandomCutForest
7+
import jpype
8+
9+
class RandomCutForestModel:
    """
    Random Cut Forest Python Binding around the AWS Random Cut Forest Official Java version:
    https://github.com/aws/random-cut-forest-by-aws

    Every method delegates to the Java ``RandomCutForest`` stored in ``self.forest``.
    """

    def __init__(self, forest: "Optional[RandomCutForest]" = None, shingle_size: int = 8,
                 num_trees: int = 100, random_seed: Optional[int] = None,
                 sample_size: int = 256, parallel_execution_enabled: bool = True,
                 thread_pool_size: Optional[int] = None, lam: float = 0.0001,
                 output_after: int = 256):
        """
        Wrap an existing forest or build a new one.

        Parameters
        ----------
        forest: RandomCutForest, optional
            An already built Java forest. When given, it is used as is and all
            other arguments are ignored.
        shingle_size: int
            Passed to the builder as the forest's ``dimensions``.
        num_trees: int
            Number of trees in the forest.
        random_seed: int, optional
            Seed handed to the builder; not set when None.
        sample_size: int
            Sample size handed to the builder.
        parallel_execution_enabled: bool
            Passed to the builder's ``parallelExecutionEnabled``.
        thread_pool_size: int, optional
            Thread pool size handed to the builder; not set when None.
        lam: float
            Passed to the builder as ``timeDecay``.
        output_after: int
            Passed to the builder as ``outputAfter``.
        """
        if forest is not None:
            self.forest = forest
            return

        builder = (RandomCutForest.builder()
                   .numberOfTrees(num_trees)
                   .sampleSize(sample_size)
                   .dimensions(shingle_size)
                   .storeSequenceIndexesEnabled(True)
                   .centerOfMassEnabled(True)
                   .parallelExecutionEnabled(parallel_execution_enabled)
                   .timeDecay(lam)
                   .outputAfter(output_after))
        if thread_pool_size is not None:
            builder.threadPoolSize(thread_pool_size)

        if random_seed is not None:
            builder = builder.randomSeed(random_seed)

        self.forest = builder.build()

    def score(self, point: List[float]) -> float:
        """
        Compute an anomaly score for the given point.

        Parameters
        ----------
        point: List[float]
            A data point with shingle size

        Returns
        -------
        float
            The anomaly score for the given point

        """
        return self.forest.getAnomalyScore(point)

    def update(self, point: List[float]):
        """
        Update the model with the data point.

        Parameters
        ----------
        point: List[float]
            Point with shingle size
        """
        self.forest.update(point)

    def impute(self, point: List[float]) -> List[float]:
        """
        Given a point with missing values (NaN), return a new point with the missing values imputed.
        The imputation itself is performed by the Java forest's ``imputeMissingValues``.

        Parameters
        ----------
        point: List[float]
            The point with shingle size

        Returns
        -------
        List[float]
            The imputed point. When nothing is missing, ``point`` itself is returned unchanged.
        """
        # Hand plain Python ints to the Java call: NumPy scalars and int64
        # arrays are not guaranteed to convert to Java ``int`` / ``int[]``.
        missing_index = np.flatnonzero(np.isnan(point)).tolist()
        if not missing_index:
            return point
        return list(self.forest.imputeMissingValues(point, len(missing_index), missing_index))

    def forecast(self, point: List[float]) -> float:
        """
        Given one shingled data point, return one step forecast containing the next value.

        Parameters
        ----------
        point: List[float]
            The point with shingle size

        Returns
        -------
        float
            Forecast value of next timestamp.

        """
        # Only the first element of the extrapolated sequence is returned.
        return list(self.forest.extrapolateBasic(point, 1, 1, False, 0))[0]

    @property
    def shingle_size(self) -> int:
        """
        Returns
        -------
        int
            Shingle size of random cut trees (the forest's ``getDimensions()``).
        """
        return self.forest.getDimensions()

    def get_attribution(self, point: List[float]) -> Tuple[List[float], List[float]]:
        """
        Compute the anomaly attribution of the given point.

        Parameters
        ----------
        point: List[float]
            The point with shingle size

        Returns
        -------
        Tuple[List[float], List[float]]
            The ``low`` and ``high`` components of the attribution vector
            returned by the forest.

        Raises
        ------
        jpype.JException
            Any Java exception raised by the forest is logged and re-raised.
        """
        try:
            attribution_di_vec: Any = self.forest.getAnomalyAttribution(point)
            low: List[float] = list(attribution_di_vec.low)
            high: List[float] = list(attribution_di_vec.high)
            return low, high
        except jpype.JException as exception:
            logging.error("Error when computing anomaly attribution: %s", exception.message())
            logging.error("Stack trace: %s", exception.stacktrace())
            # Bare raise keeps the original traceback intact.
            raise
131+

python_rcf_wrapper/trcf_model.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Java imports
2+
from typing import List, Optional, Tuple, Any
3+
4+
import numpy as np
5+
import logging
6+
from com.amazon.randomcutforest.parkservices import ThresholdedRandomCutForest
7+
from com.amazon.randomcutforest.config import Precision
8+
from com.amazon.randomcutforest.parkservices import AnomalyDescriptor
9+
from com.amazon.randomcutforest.config import TransformMethod
10+
import jpype
11+
12+
class TRandomCutForestModel:
    """
    Random Cut Forest Python Binding around the AWS Random Cut Forest Official Java version:
    https://github.com/aws/random-cut-forest-by-aws

    Wraps a Java ``ThresholdedRandomCutForest`` stored in ``self.forest``.
    """

    def __init__(self, rcf_dimensions, shingle_size, num_trees: int = 30, output_after: int = 256,
                 anomaly_rate=0.005, z_factor=2.5, score_differencing=0.5, ignore_delta_threshold=0,
                 sample_size=256):
        """
        Build the thresholded forest.

        Parameters
        ----------
        rcf_dimensions:
            Passed to the builder as ``dimensions``.
        shingle_size:
            Passed to the builder as ``shingleSize``.
        num_trees: int
            Number of trees in the forest.
        output_after: int
            Passed to the builder as ``outputAfter``; also used, divided by
            ``sample_size``, as the initial accept fraction.
        anomaly_rate:
            Passed to the builder as ``anomalyRate``.
        z_factor:
            Applied after the build through ``setZfactor``.
        score_differencing:
            Accepted for compatibility but currently NOT applied to the forest.
        ignore_delta_threshold:
            Accepted for compatibility but currently NOT applied to the forest.
        sample_size:
            Sample size handed to the builder.
        """
        # These two arguments are never forwarded to the Java builder; say so
        # instead of silently dropping a value the caller chose on purpose.
        if score_differencing != 0.5 or ignore_delta_threshold != 0:
            logging.warning(
                "score_differencing and ignore_delta_threshold are currently ignored by TRandomCutForestModel"
            )

        self.forest = (ThresholdedRandomCutForest
                       .builder()
                       .dimensions(rcf_dimensions)
                       .sampleSize(sample_size)
                       .numberOfTrees(num_trees)
                       .timeDecay(0.0001)
                       .initialAcceptFraction(output_after * 1.0 / sample_size)
                       .parallelExecutionEnabled(True)
                       .compact(True)
                       .precision(Precision.FLOAT_32)
                       .boundingBoxCacheFraction(1)
                       .shingleSize(shingle_size)
                       .anomalyRate(anomaly_rate)
                       .outputAfter(output_after)
                       .internalShinglingEnabled(True)
                       .transformMethod(TransformMethod.NORMALIZE)
                       .alertOnce(True)
                       .autoAdjust(True)
                       .build())
        self.forest.setZfactor(z_factor)

    def process(self, point: List[float]) -> "AnomalyDescriptor":
        """
        A single call that preprocesses the data, computes score/grade and updates
        state.

        Parameters
        ----------
        point: List[float]
            One input observation. The forest is built with internal shingling
            enabled, so the point is passed un-shingled (presumably
            ``rcf_dimensions / shingle_size`` values, as in the README example).

        Returns
        -------
        AnomalyDescriptor
            Encapsulate detailed information about anomalies detected by RCF model. This class stores various attributes
            related to an anomaly, such as confidence levels, attribution scores, and expected values.

        """
        # The second argument (the timestamp) is always 0 here.
        return self.forest.process(point, 0)

0 commit comments

Comments
 (0)