# Source: AMLGentex/src/data_creation/generator.py at 535cd3f8760446e4b585fbe5b808882f07bf4d87 (aidotse/AMLGentex, GitHub)

import os
import sys
import time
from contextlib import contextmanager
from pathlib import Path

import yaml

from src.utils.logging import get_logger

logger = get_logger(__name__)


class DataGenerator:
    """
    Wrapper for the AMLGentex data generation pipeline.

    For optimization/tuning workflows:
    - Spatial simulation (graph generation) is run once or reused
    - Temporal simulation can be run multiple times with different parameters

    Side effects to be aware of:
    - Constructing an instance inserts this module's directory at the front of
      ``sys.path`` so the simulation packages can be imported by bare name.
    - Every ``run_*`` method temporarily changes the process working directory
      to this module's directory and restores it afterwards.
    """

    # File name of the baseline checkpoint inside the spatial output directory.
    _BASELINE_CHECKPOINT_NAME = 'baseline_checkpoint.pkl'

    # Config sections whose 'directory' entry is resolved against the project root.
    _DIRECTORY_SECTIONS = ('input', 'spatial', 'output')

    def __init__(self, conf_file: str):
        """
        Args:
            conf_file: Absolute path to the data configuration YAML file

        Raises:
            ValueError: If conf_file is not absolute, or the file does not
                contain a YAML mapping
            FileNotFoundError: If conf_file does not exist
        """
        if not os.path.isabs(conf_file):
            raise ValueError(f'conf_file must be an absolute path, got: {conf_file}')

        if not os.path.exists(conf_file):
            raise FileNotFoundError(f'Config file not found: {conf_file}')

        self.conf_file = conf_file

        # Directory containing this module
        self.data_creation_dir = Path(__file__).parent

        # Project root (parent of src/)
        self.project_root = self.data_creation_dir.parent.parent

        # Load YAML config and convert its relative paths to absolute paths
        self._load_config()

        # Add data_creation directory to path for imports
        if str(self.data_creation_dir) not in sys.path:
            sys.path.insert(0, str(self.data_creation_dir))

    def _load_config(self):
        """
        (Re)load the YAML config from self.conf_file into self.config.

        Relative directories in the loaded config are made absolute. If the
        file does not contain a mapping, self.config is left untouched.

        Raises:
            ValueError: If the file is empty or its top level is not a mapping
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(self.conf_file, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # safe_load returns None for an empty document; fail with a clear
        # message instead of a TypeError on the first subscript.
        if not isinstance(config, dict):
            raise ValueError(
                f'Config file is empty or not a YAML mapping: {self.conf_file}'
            )

        self.config = config
        self._make_paths_absolute()

    def _make_paths_absolute(self):
        """Convert relative paths in config to absolute paths relative to project root"""
        # input   -> input directory
        # spatial -> spatial simulation output directory
        # output  -> temporal simulation output directory
        for section in self._DIRECTORY_SECTIONS:
            directory = self.config[section]['directory']
            if not os.path.isabs(directory):
                self.config[section]['directory'] = str(self.project_root / directory)

    @contextmanager
    def _simulation_context(self, *, restore_argv=False):
        """
        Run the enclosed code with the data_creation directory as working directory.

        The original working directory is always restored on exit, including
        when the enclosed code raises.

        Args:
            restore_argv: If True, also snapshot sys.argv and put it back on
                exit. This is a precaution; whether the simulation modules
                modify sys.argv is not visible from this file.
        """
        orig_cwd = os.getcwd()
        orig_argv = sys.argv.copy() if restore_argv else None

        try:
            os.chdir(self.data_creation_dir)
            yield
        finally:
            os.chdir(orig_cwd)
            if orig_argv is not None:
                sys.argv = orig_argv

    def _ensure_degree_file(self, generate_scalefree, degree_path, force):
        """
        Step 1 of spatial generation: create the degree distribution if needed.

        Args:
            generate_scalefree: The imported spatial_simulation.generate_scalefree module
            degree_path: Expected location of the degree file
            force: If True, regenerate even if degree_path exists
        """
        if force or not degree_path.exists():
            logger.info("  [1/2] Generating degree distribution...")
            start = time.time()
            # Call directly with config dict (has absolute paths)
            generate_scalefree.generate_degree_file_from_config(self.config)
            logger.info(f"        Complete ({time.time() - start:.2f}s)")
        else:
            logger.info(f"  [1/2] Degree distribution found: {degree_path}")

    def run_spatial(self, force=False):
        """
        Run spatial simulation (graph generation).

        Generation is skipped when the spatial output directory already
        contains at least one CSV file, unless force is set.

        Args:
            force: If True, regenerate even if outputs exist

        Returns:
            Path to the spatial simulation output directory
        """
        import spatial_simulation.generate_scalefree as generate_scalefree
        import spatial_simulation.transaction_graph_generator as txgraph

        degree_file = self.config['input']['degree']

        # Spatial output directory from config (already absolute)
        spatial_output = Path(self.config['spatial']['directory'])

        # Reuse existing spatial outputs unless regeneration is forced
        if not force and spatial_output.exists():
            spatial_files = list(spatial_output.glob('*.csv'))
            if spatial_files:
                logger.info(f"Spatial simulation outputs found: {spatial_output}")
                logger.info(f"  Found {len(spatial_files)} CSV files")
                logger.info("Skipping spatial simulation (use force=True to regenerate)")
                return str(spatial_output)

        logger.info("Running spatial simulation...")

        with self._simulation_context(restore_argv=True):
            # Step 1: Generate degree distribution if needed (goes to spatial output)
            self._ensure_degree_file(generate_scalefree, spatial_output / degree_file, force)

            # Step 2: Generate transaction graph
            logger.info("  [2/2] Generating transaction graph...")
            start = time.time()
            # Call directly with config dict (has absolute paths)
            txgraph.generate_transaction_graph_from_config(self.config)
            logger.info(f"        Complete ({time.time() - start:.2f}s)")

        return str(spatial_output)

    def run_temporal(self):
        """
        Run temporal simulation (time-step execution).

        Reloads config from file to pick up any parameter changes for optimization workflows.

        Returns:
            Path to the generated transaction log file

        Raises:
            ValueError: If the reloaded config file is empty or not a mapping
            FileNotFoundError: If the transaction log is missing after the run
        """
        from temporal_simulation.simulator import AMLSimulator

        logger.info("Running temporal simulation...")

        # Reload config from file to pick up parameter changes
        self._load_config()

        # Temporal output directory from config (already absolute)
        temporal_output = Path(self.config['output']['directory'])

        with self._simulation_context():
            # Initialize and run simulator with config dict
            simulator = AMLSimulator(self.config)
            simulator.load_accounts()
            simulator.load_transactions()
            simulator.load_normal_models()
            simulator.load_alert_members()

            # Only the run itself is timed, not loading or writing
            start = time.time()
            simulator.run()
            elapsed = time.time() - start

            # Write output
            simulator.write_output()

            logger.info(f"  Complete: {len(simulator.transactions):,} transactions in {elapsed:.2f}s")

            # Construct tx_log path and verify the simulator produced it
            tx_log_path = temporal_output / self.config['output']['transaction_log']
            if not tx_log_path.exists():
                raise FileNotFoundError(f'Transaction log not found: {tx_log_path}')

            return str(tx_log_path)

    def run_spatial_baseline(self, force=False):
        """
        Run spatial simulation Phase 1: Generate baseline and save checkpoint.

        Creates the transaction graph up to demographics assignment and saves a checkpoint.
        This baseline can then be used for multiple alert injection trials with different
        ML selector configurations via run_spatial_from_baseline().

        Args:
            force: If True, regenerate even if checkpoint exists

        Returns:
            Path to the saved checkpoint file
        """
        import spatial_simulation.generate_scalefree as generate_scalefree
        import spatial_simulation.transaction_graph_generator as txgraph

        output_dir = Path(self.config['spatial']['directory'])  # spatial output
        degree_file = self.config['input']['degree']

        # Checkpoint path (in spatial output directory, not config)
        checkpoint_path = output_dir / self._BASELINE_CHECKPOINT_NAME

        # Reuse an existing checkpoint unless regeneration is forced
        if checkpoint_path.exists() and not force:
            logger.info(f"Baseline checkpoint found: {checkpoint_path}")
            logger.info("Skipping baseline generation (use force=True to regenerate)")
            return str(checkpoint_path)

        logger.info("Running spatial baseline generation (Phase 1)...")

        with self._simulation_context(restore_argv=True):
            # Step 1: Generate degree distribution if needed (goes to spatial output)
            self._ensure_degree_file(generate_scalefree, output_dir / degree_file, force)

            # Step 2: Generate baseline (stops before alert injection)
            logger.info("  [2/2] Generating baseline graph...")
            start = time.time()
            saved_path = txgraph.generate_baseline(self.config, checkpoint_path=str(checkpoint_path))
            logger.info(f"        Complete ({time.time() - start:.2f}s)")
            logger.info(f"        Checkpoint saved to: {saved_path}")

        return str(checkpoint_path)

    def run_spatial_from_baseline(self, checkpoint_path=None):
        """
        Run spatial simulation Phase 2: Inject alerts from baseline checkpoint.

        Loads a previously saved baseline and injects alert patterns using the ML selector
        configuration from the current config. This allows testing different ML selection
        parameters without regenerating the entire graph.

        Note: this method uses self.config as currently held in memory; it does
        not reload the config file.

        Args:
            checkpoint_path: Path to baseline checkpoint. If None, uses default location.
                A relative path is interpreted against the caller's working directory.

        Returns:
            Path to the spatial simulation output directory

        Raises:
            FileNotFoundError: If the checkpoint file does not exist
        """
        import spatial_simulation.transaction_graph_generator as txgraph

        # Spatial output directory from config (already absolute)
        spatial_output = Path(self.config['spatial']['directory'])

        # Default checkpoint path (in spatial output directory, not config)
        if checkpoint_path is None:
            checkpoint_path = spatial_output / self._BASELINE_CHECKPOINT_NAME

        # Anchor a relative path to the caller's working directory now, because
        # the working directory is changed before the checkpoint is loaded.
        checkpoint_path = Path(checkpoint_path).absolute()

        if not checkpoint_path.exists():
            raise FileNotFoundError(
                f"Baseline checkpoint not found: {checkpoint_path}\n"
                "Run run_spatial_baseline() first to generate the baseline."
            )

        logger.info("Running alert injection from baseline (Phase 2)...")

        with self._simulation_context():
            start = time.time()
            txgraph.inject_alerts_from_baseline(
                self.config,
                checkpoint_path=str(checkpoint_path)
            )
            logger.info(f"        Complete ({time.time() - start:.2f}s)")

        return str(spatial_output)

    def __call__(self, spatial=True, force_spatial=False):
        """
        Run the complete simulation pipeline or just temporal simulation.

        Args:
            spatial: If True, run spatial simulation first (or check if exists)
            force_spatial: If True, force regeneration of spatial graph

        Returns:
            Path to the transaction log file
        """
        if spatial:
            self.run_spatial(force=force_spatial)

        return self.run_temporal()