26
26
field_validator ,
27
27
model_validator ,
28
28
)
29
+ from requests import RequestException , head
29
30
30
31
from contentctl .helper .splunk_app import SplunkApp
31
32
from contentctl .helper .utils import Utils
@@ -261,6 +262,13 @@ class init(Config_Base):
261
262
)
262
263
263
264
265
+ # There can be a number of attack data file warning mapping exceptions, or errors,
266
+ # that can occur when using attack data caches. In order to avoid very complex
267
+ # output, we will only emit the verbose versions of these message once per file.
268
+ # This is a non-intuitive place to put this, but it is good enough for now.
269
+ ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS : set [str ] = set ()
270
+
271
+
264
272
class AttackDataCache (BaseModel ):
265
273
base_url : str = Field (
266
274
"This is the beginning of a URL that the data must begin with to map to this cache object."
@@ -270,7 +278,19 @@ class AttackDataCache(BaseModel):
270
278
pattern = r"^external_repos/.+" ,
271
279
)
272
280
# suggested checkout information for our attack_data repo
273
- # curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C datasets
281
+ # curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C attack_data/
282
+ # suggested YML values for this:
283
+ helptext : str | None = Field (
284
+ default = "This repo is set up to use test_data_caches. This can be extremely helpful in validating correct links for test attack_data and speeding up testing.\n "
285
+ "Include the following in your contentctl.yml file to use this cache:\n \n "
286
+ "test_data_caches:\n "
287
+ "- base_url: https://media.githubusercontent.com/media/splunk/attack_data/master/\n "
288
+ " base_directory_name: external_repos/attack_data\n \n "
289
+ "In order to check out STRT Attack Data, you can use the following command:\n "
290
+ "mkdir -p external_repos; curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C external_repos/\n "
291
+ "or\n "
292
+ """echo "First ensure git-lfs is enabled"; git clone https://github.com/splunk/attack_data external_repos/attack_data"""
293
+ )
274
294
275
295
276
296
class validate (Config_Base ):
@@ -317,9 +337,36 @@ class validate(Config_Base):
317
337
def external_repos_path (self ) -> pathlib .Path :
318
338
return self .path / "external_repos"
319
339
340
+ # We can't make this a validator because the constructor
341
+ # is called many times - we don't want to print this out many times.
342
+ def check_test_data_caches (self ) -> Self :
343
+ """
344
+ Check that the test data caches actually exist at the specified paths.
345
+ If they do exist, then do nothing. If they do not, then emit the helpext, but
346
+ do not raise an exception. They are not required, but can significantly speed up
347
+ and reduce the flakiness of tests by reducing failed HTTP requests.
348
+ """
349
+ if not self .verbose :
350
+ # Ignore the check and error output if we are not in verbose mode
351
+ return self
352
+
353
+ for cache in self .test_data_caches :
354
+ cache_path = self .path / cache .base_directory_name
355
+ if not cache_path .is_dir ():
356
+ print (cache .helptext )
357
+ else :
358
+ print (f"Found attack data cache at { cache_path } " )
359
+ return self
360
+
320
361
def map_to_attack_data_cache (
321
- self , filename : HttpUrl | FilePath
362
+ self , filename : HttpUrl | FilePath , verbose : bool = False
322
363
) -> HttpUrl | FilePath :
364
+ if str (filename ) in ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS :
365
+ # This is already something that we have emitted a warning or
366
+ # Exception for. We don't want to emit it again as it will
367
+ # pollute the output.
368
+ return filename
369
+
323
370
# If this is simply a link to a file directly, then no mapping
324
371
# needs to take place. Return the link to the file.
325
372
if isinstance (filename , pathlib .Path ):
@@ -339,33 +386,54 @@ def map_to_attack_data_cache(
339
386
new_file_path = root_folder_path / new_file_name
340
387
341
388
if not root_folder_path .is_dir ():
342
- # This has not been checked out. If a cache file was listed in the config AND we hit
343
- # on a prefix, it MUST be checked out
344
- raise ValueError (
345
- f"The following directory does not exist: [{ root_folder_path } ]. "
346
- "You must check out this directory since you have supplied 1 or more test_data_caches in contentctl.yml."
347
- )
348
-
349
- if not new_file_path .is_file ():
350
- raise ValueError (
351
- f"The following file does not the test_data_cache { cache .base_directory_name } :\n "
352
- f"\t File: { new_file_name } "
389
+ # This has not been checked out. Even though we want to use this cache
390
+ # whenever possible, we don't want to force it.
391
+ return filename
392
+
393
+ if new_file_path .is_file ():
394
+ # We found the file in the cache. Return the new path
395
+ return new_file_path
396
+
397
+ # Any thing below here is non standard behavior that will produce either a warning message,
398
+ # an error, or both. We onyl want to do this once for each file, even if it is used
399
+ # across multiple different detections.
400
+ ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS .add (str (filename ))
401
+
402
+ # The cache exists, but we didn't find the file. We will emit an informational warning
403
+ # for this, but this is not an exception. Instead, we will just fall back to using
404
+ # the original URL.
405
+ if verbose :
406
+ # Give some extra context about missing attack data files/bad mapping
407
+ try :
408
+ h = head (str (filename ))
409
+ h .raise_for_status ()
410
+
411
+ except RequestException :
412
+ raise ValueError (
413
+ f"Error resolving the attack_data file { filename } . "
414
+ f"It was missing from the cache { cache .base_directory_name } and a download from the server failed."
415
+ )
416
+ print (
417
+ f"\n Filename { filename } not found in cache { cache .base_directory_name } , but exists on the server. "
418
+ f"Your cache { cache .base_directory_name } may be out of date."
353
419
)
354
- return new_file_path
355
-
356
- print ("raising here\n " )
357
- x = "\n \t " .join (
358
- [
359
- f"base_url{ index :02} : { url } "
360
- for index , url in enumerate (
361
- [cache .base_url for cache in self .test_data_caches ]
362
- )
363
- ]
364
- )
365
- raise ValueError (
366
- "Test data file does not start with any of the following base_url's. If you are supplying caches, any HTTP file MUST be located in a cache:\n "
367
- f"\t File : { filename } \n \t { x } "
368
- )
420
+ return filename
421
+ if verbose :
422
+ # Any thing below here is non standard behavior that will produce either a warning message,
423
+ # an error, or both. We onyl want to do this once for each file, even if it is used
424
+ # across multiple different detections.
425
+ ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS .add (str (filename ))
426
+
427
+ # Give some extra context about missing attack data files/bad mapping
428
+ url = f"Attack Data : { filename } "
429
+ prefixes = "" .join (
430
+ [
431
+ f"\n Valid Prefix: { cache .base_url } "
432
+ for cache in self .test_data_caches
433
+ ]
434
+ )
435
+ print (f"\n Attack Data Missing from all caches:\n { url } { prefixes } " )
436
+ return filename
369
437
370
438
@property
371
439
def mitre_cti_repo_path (self ) -> pathlib .Path :
0 commit comments