66import desbordante
77
# ANSI SGR escape sequences used to colour the example's console output.
# Each one must be the ESC byte immediately followed by '[' with no spaces,
# otherwise terminals print the sequence literally instead of applying it.
RED = '\033[31m'
YELLOW = '\033[33m'
BOLD_YELLOW = '\033[1;33m'
GREEN = '\033[32m'
BLUE = '\033[34m'
CYAN = '\033[36m'
BOLD = '\033[1;37m'
ENDC = '\033[0m'  # resets every attribute set by the codes above

# Single-column CSV tables with engine temperature readings, loaded below
# both by the verifier and by the pretty-printing helpers.
ENGINE_TEMPS_BAD = 'examples/datasets/verifying_pac/engine_temps_bad.csv'
ENGINE_TEMPS_GOOD = 'examples/datasets/verifying_pac/engine_temps_good.csv'
1719
1820
19- def column_to_str (filename : str ) -> str :
def read_column(filename: str, col_num: int) -> tuple[str, list[str]]:
    """Read a single column from a comma-separated table.

    The first row of the file is treated as the header row; every following
    row contributes one value.

    Args:
        filename: path to the CSV file to read.
        col_num: zero-based index of the column to extract.

    Returns:
        A ``(header, values)`` pair: the header cell of the requested column
        and the cells of all remaining rows, as strings in file order.

    Raises:
        IndexError: if the file has no rows, or a row has no cell at
            ``col_num``.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filename, newline='') as table:
        rows = list(reader(table, delimiter=','))
    header = rows[0][col_num]
    values = [row[col_num] for row in rows[1:]]
    return header, values
27+
28+
def column_to_str(filename: str, col_num: int) -> str:
    """Render one CSV column on a single bold line as ``header: [v1, v2, ...]``.

    Args:
        filename: path to the CSV file to read.
        col_num: zero-based index of the column to render.

    Returns:
        The column header followed by the comma-separated values in square
        brackets, wrapped in the BOLD / ENDC escape codes.
    """
    name, cells = read_column(filename, col_num)
    return '{}{}: [{}]{}'.format(BOLD, name, ', '.join(cells), ENDC)
33+
34+
def display_columns_diff(filename_old: str, col_num_old: int,
                         filename_new: str, col_num_new: int) -> str:
    """Render the new column on one line, highlighting values that changed.

    Values are compared position by position with the old column. A value
    that differs from its old counterpart, or that has no counterpart because
    the new column is longer, is printed in bold yellow.

    Args:
        filename_old: path to the CSV file holding the old column.
        col_num_old: zero-based index of the old column.
        filename_new: path to the CSV file holding the new column.
        col_num_new: zero-based index of the new column.

    Returns:
        ``header: [v1, v2, ...]`` for the new column, wrapped in the
        BOLD / ENDC escape codes, with modified values in BOLD_YELLOW.
    """
    _, values_old = read_column(filename_old, col_num_old)
    header, values_new = read_column(filename_new, col_num_new)
    rendered = []
    for i, value in enumerate(values_new):
        # Rows beyond the end of the old column count as modified; indexing
        # values_old unconditionally would raise IndexError for them.
        is_modified = i >= len(values_old) or values_old[i] != value
        if is_modified:
            # Switch back to BOLD (not ENDC) so the rest of the line stays bold.
            value = f'{BOLD_YELLOW}{value}{BOLD}'
        rendered.append(value)
    values_str = ', '.join(rendered)
    return f'{BOLD}{header}: [{values_str}]{ENDC}'
2647
2748
2849print (
29- f'''{ CYAN } This example illustrates the usage of Domain Probabilistic Approximate Constraints (PACs).
30- Domain PAC on column set X and domain D, with given epsilon and delta means that Pr(x ∈ D±epsilon ) ≥ delta .
50+ f'''This example illustrates the usage of Domain Probabilistic Approximate Constraints (PACs).
51+ A Domain PAC on column set X and domain D, with given ε and δ means that Pr(x ∈ D±ε ) ≥ δ .
3152For more information consult "Checks and Balances: Monitoring Data Quality Problems in Network
32- Traffic Databases" by Flip Korn et al.
53+ Traffic Databases" by Flip Korn et al. (Proceedings of the 29th VLDB Conference, Berlin, 2003) .
3354
34- This is the first example in "Basic Domain PAC verification" series. Others can be found in
35- examples/basic/verifying_pac/{ ENDC } directory.
55+ This is the first example in the "Basic Domain PAC verification" series. Others can be found in
56+ { CYAN } examples/basic/verifying_pac/{ ENDC } directory.
3657''' )
3758
3859print (
39- f'''Consider we are working on a new model of engine. It \' s working temperatures lie in span { BLUE } [85, 95]{ ENDC } °C.
40- Engine is made of high-strenght metal, so slight short-term temperature deviations won \' t kill it.
41- In other words, engine works properly when Pr(t ∈ [85, 95]±epsilon ) ≥ delta .
42- Our engieneers have figured out limits: epsilon = { BLUE } 5{ ENDC } , delta = { BLUE } 0.9{ ENDC } .
43- So, in terms of Domain PACs, { BLUE } Domain PAC Pr(x ∈ [85, 95]±5) ≥ 0.9{ ENDC } should hold .
60+ f'''Suppose we are working on a new model of engine. Its operating temperature range is { BLUE } [85, 95]{ ENDC } °C.
61+ The engine is made of high-strength metal, so short-term temperature deviations are acceptable and
62+ will not cause immediate damage. In other words, the engine operates properly when Pr(t ∈ [85, 95]±ε ) ≥ δ .
63+ Based on engineering analysis, the acceptable limits are: ε = { BLUE } 5{ ENDC } , δ = { BLUE } 0.9{ ENDC } .
64+ In terms of Domain PACs, the following constraint should hold: { BLUE } Pr(x ∈ [85, 95]±5) ≥ 0.9{ ENDC } .
4465''' )
4566
46- print ('The following table contains readings of engine temperature sensor:' )
67+ print (
68+ 'The following table contains readings from the engine temperature sensor:'
69+ )
4770# Values are printed in one line for brevity, original table is single-column
48- print (f'{ GRAY } { column_to_str (ENGINE_TEMPS_BAD ) } { ENDC } ' )
71+ print (f'{ column_to_str (ENGINE_TEMPS_BAD , 0 ) } ' )
4972print ()
5073
51- print ('Let\' s use Domain PAC verifier to check if engine will be damaged\n ' )
74+ print (
75+ 'We now use the Domain PAC verifier to determine whether the engine is operating safely.'
76+ )
5277
5378print (
54- 'Firstly we need to create domain. Segment is a special case of parallelepiped, so let \' s use it.'
79+ 'First, we need to define the domain. A segment is a special case of a parallelepiped, so we use it here .'
5580)
5681# Parallelepiped has a special constructor for segment.
57- # Notice the usage of quotes: these strings will be converted to values once table is loaded.
82+ # Notice the usage of quotes: these strings will be converted to values once the table is loaded.
5883segment = desbordante .pac .domains .Parallelepiped ('85' , '95' )
5984
60- # TODO(senichenkov): diagonal_threshold example
6185print (
62- f'''Now let\' s run algorithm with the following options: domain={ BLUE } { segment } { ENDC } , max_epsilon={ BLUE } 10{ ENDC } , min_delta={ BLUE } 0.85{ ENDC } .
63- Max_epsilon should be greater than desired epsilon, min_delta -- a little less than the expected one.
64- We will use default values of other options: min_epsilon={ BLUE } 0{ ENDC } , epsilon_steps={ BLUE } 100{ ENDC } , diagonal_threshold={ BLUE } 1e-5{ ENDC } .
65- Min_epsilon, max_epsilon and epsilon_steps contol which epsilon values and how many of them will be checked by the algorithm.
66- Diagonal threshold is advanced parameter, that is explained in { CYAN } examples/advanced/verifying_pac/##EXAMPLE_NAME##{ ENDC } .
86+ f'''We run the algorithm with the following options: domain={ BLUE } { segment } { ENDC } .
87+ All other parameters use default values: min_epsilon={ BLUE } 0{ ENDC } , max_epsilon={ BLUE } ∞{ ENDC } , min_delta={ BLUE } 0.9{ ENDC } , delta_steps={ BLUE } 100{ ENDC } .
6788''' )
89+
6890algo = desbordante .pac_verification .algorithms .DomainPACVerifier ()
6991# Note that domain should be set in `load_data`, not `execute`
7092algo .load_data (table = (ENGINE_TEMPS_BAD , ',' , True ),
7193 column_indices = [0 ],
7294 domain = segment )
73- algo .execute (max_epsilon = 10 , min_delta = 0.85 )
95+ algo .execute ()
96+
97+ print (f'Algorithm result: { YELLOW } { algo .get_pac ()} { ENDC } .' )
98+ print (
99+ f'''This PAC is not very informative. Let\' s run the algorithm with min_epsilon={ BLUE } 5{ ENDC } and max_epsilon={ BLUE } 5{ ENDC } .
100+ This will give us the exact δ for which the PAC with ε={ BLUE } 5{ ENDC } holds.
101+ ''' )
102+
103+ # Note that, when min_epsilon or max_epsilon is specified, default min_delta becomes 0
104+ algo .execute (min_epsilon = 5 , max_epsilon = 5 )
74105
75106print (f'Algorithm result: { RED } { algo .get_pac ()} { ENDC } .' )
76- print ('Uh-oh! The desired PAC doesn\' t hold, so engine can blow up!\n ' )
107+ print (
108+ f'''Also, let\' s run the algorithm with max_epsilon={ BLUE } 0{ ENDC } and min_delta={ BLUE } 0.9{ ENDC } to check which ε
109+ is needed to satisfy δ={ BLUE } 0.9{ ENDC } . With these parameters the algorithm enters a special mode and returns
110+ a pair (ε, min_delta), so that we can validate the PAC with the given δ.
111+ ''' )
112+
113+ # Actually, the algorithm enters this mode whenever max_epsilon is less than the epsilon needed to satisfy
114+ # min_delta.
115+ algo .execute (max_epsilon = 0 , min_delta = 0.9 )
116+
117+ pac = algo .get_pac ()
118+ print (f'Algorithm result: { RED } { pac } { ENDC } .' )
119+ print (
120+ f'''Here the algorithm gives δ={ BLUE } { pac .delta } { ENDC } , which is greater than { BLUE } 0.9{ ENDC } , because achieving δ={ BLUE } 0.9{ ENDC } requires
121+ ε={ BLUE } { pac .epsilon } { ENDC } and PAC ({ BLUE } { pac .epsilon } { ENDC } , { BLUE } { pac .delta } { ENDC } ) holds. So, this means that δ={ BLUE } 0.9{ ENDC } would also require ε={ BLUE } { pac .epsilon } { ENDC } .
122+ ''' )
123+
124+ print (
125+ 'We can see that the desired PAC doesn\' t hold, so the engine can blow up!\n ' )
77126
78127print (
79128 f'''Let\' s look at values violating PAC. Domain PAC verifier can detect values between eps_1
80- and eps_2, i. e. values that lie in D±eps_2 \\ D±eps_1. Such values are called highlights.
81- Let\' s check highlights for different eps_1, eps_2 values:''' )
129+ and eps_2, i. e. values that lie in D±eps_2 \\ D±eps_1. Such values are called highlights or outliers .
130+ Let\' s find outliers for different eps_1, eps_2 values:''' )
82131
83132value_ranges = [(0 , 1 ), (1 , 2 ), (2 , 3 ), (3 , 5 ), (5 , 7 ), (7 , 10 )]
84133highlights_table = [(f'{ BLUE } { v_range [0 ]} { ENDC } ' , f'{ BLUE } { v_range [1 ]} { ENDC } ' ,
@@ -88,36 +137,25 @@ def column_to_str(filename: str) -> str:
88137print ()
89138
90139print ('''We can see two problems:
91- 1. Time engine worked on low temperature was too long, but these temperatures were just a little lower than 80°C.
92- 2. Peak temperature was too high, but it has been reached only once.\n '''
93- )
94-
95- print ('''Second version of engine has:
96- 1. pre-heating system to prevent engine from working on low temperatures;
97- 2. emergency cooling system to lower peak temperatures.
98- Let\' s look at second version\' s sensor readings:''' )
99- print (f'{ GRAY } { column_to_str (ENGINE_TEMPS_GOOD )} { ENDC } ' )
140+ 1. The engine operated at low temperatures for an extended period, slightly below 80°C.
141+ 2. The peak temperature was too high, but this occurred only once.\n ''' )
142+
143+ print ('''The second version of the engine has:
144+ 1. A pre-heating system to prevent operation at low temperatures.
145+ 2. An emergency cooling system to limit peak temperatures.
146+ The updated sensor readings (modified values highlighted) are:''' )
147+ print (f'{ display_columns_diff (ENGINE_TEMPS_BAD , 0 , ENGINE_TEMPS_GOOD , 0 )} ' )
100148print ()
101149
102- print (
103- f'''Let\' s run Domain PAC verifier with same parameters (domain={ BLUE } { segment } { ENDC } , max_epsilon={ BLUE } 10{ ENDC } ).'''
104- )
150+ print (f'''We run the Domain PAC verifier again.''' )
105151algo = desbordante .pac_verification .algorithms .DomainPACVerifier ()
106152algo .load_data (table = (ENGINE_TEMPS_GOOD , ',' , True ),
107153 column_indices = [0 ],
108154 domain = segment )
109- algo .execute (max_epsilon = 10 , min_delta = 0.85 )
110-
111- print (f'Algorithm result: { RED } { algo .get_pac ()} { ENDC } .' )
112- print (
113- f'''This PAC says that epsilon={ BLUE } 6{ ENDC } is enough to cover all possible values. It\' s true, but this information is useless.
114- Let\' s select max_epsilon={ BLUE } 5.5{ ENDC } and min_delta={ BLUE } 0.87{ ENDC } to give algorithm a hint which values do we want.'''
115- )
116- # Max_epsilon is an "execute option", so `load_data` is not needed
117- algo .execute (max_epsilon = 5.5 , min_delta = 0.87 )
155+ algo .execute ()
118156
119157print (f'''Algorithm result: { GREEN } { algo .get_pac ()} { ENDC } .
120- Our desired PAC holds, which means that engine works well .
158+ The desired PAC now holds, which means the improved engine operates within acceptable limits .
121159
122- It's recommended to continue with reading second example ({ CYAN } examples/basic/verifying_pac/verifying_domain_pac2.py{ ENDC } ),
160+ It is recommended to continue with the second example ({ CYAN } examples/basic/verifying_pac/verifying_domain_pac2.py{ ENDC } ),
123161which demonstrates more advanced usage of the Parallelepiped domain.''' )
0 commit comments