Skip to content

Commit f167048

Browse files
committed
final cleanup to make
attack data cache work easy to use, intuitive, and minimally intrusive for the average content developer
1 parent 25fde48 commit f167048

File tree

4 files changed

+100
-31
lines changed

4 files changed

+100
-31
lines changed

contentctl/contentctl.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def init_func(config: test):
6868

6969

7070
def validate_func(config: validate) -> DirectorOutputDto:
71+
config.check_test_data_caches()
7172
validate = Validate()
7273
return validate.execute(config)
7374

contentctl/objects/abstract_security_content_objects/detection_abstract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -913,7 +913,7 @@ def search_rba_fields_exist_validate(self):
913913
return self
914914

915915
@field_validator("tests", mode="before")
916-
def ensure_yml_test_is_unittest(cls, v: list[dict]):
916+
def ensure_yml_test_is_unittest(cls, v: list[dict], info: ValidationInfo):
917917
"""The typing for the tests field allows it to be one of
918918
a number of different types of tests. However, ONLY
919919
UnitTest should be allowed to be defined in the YML
@@ -941,7 +941,7 @@ def ensure_yml_test_is_unittest(cls, v: list[dict]):
941941
for unitTest in v:
942942
# This raises a ValueError on a failed UnitTest.
943943
try:
944-
UnitTest.model_validate(unitTest)
944+
UnitTest.model_validate(unitTest, context=info.context)
945945
except ValueError as e:
946946
valueErrors.append(e)
947947
if len(valueErrors):

contentctl/objects/config.py

Lines changed: 96 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
field_validator,
2727
model_validator,
2828
)
29+
from requests import RequestException, head
2930

3031
from contentctl.helper.splunk_app import SplunkApp
3132
from contentctl.helper.utils import Utils
@@ -261,6 +262,13 @@ class init(Config_Base):
261262
)
262263

263264

265+
# There can be a number of attack data file warning mapping exceptions, or errors,
266+
# that can occur when using attack data caches. In order to avoid very complex
267+
# output, we will only emit the verbose versions of these message once per file.
268+
# This is a non-intuitive place to put this, but it is good enough for now.
269+
ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS: set[str] = set()
270+
271+
264272
class AttackDataCache(BaseModel):
265273
base_url: str = Field(
266274
"This is the beginning of a URL that the data must begin with to map to this cache object."
@@ -270,7 +278,19 @@ class AttackDataCache(BaseModel):
270278
pattern=r"^external_repos/.+",
271279
)
272280
# suggested checkout information for our attack_data repo
273-
# curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C datasets
281+
# curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C attack_data/
282+
# suggested YML values for this:
283+
helptext: str | None = Field(
284+
default="This repo is set up to use test_data_caches. This can be extremely helpful in validating correct links for test attack_data and speeding up testing.\n"
285+
"Include the following in your contentctl.yml file to use this cache:\n\n"
286+
"test_data_caches:\n"
287+
"- base_url: https://media.githubusercontent.com/media/splunk/attack_data/master/\n"
288+
" base_directory_name: external_repos/attack_data\n\n"
289+
"In order to check out STRT Attack Data, you can use the following command:\n"
290+
"mkdir -p external_repos; curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C external_repos/\n"
291+
"or\n"
292+
"""echo "First ensure git-lfs is enabled"; git clone https://github.com/splunk/attack_data external_repos/attack_data"""
293+
)
274294

275295

276296
class validate(Config_Base):
@@ -317,9 +337,36 @@ class validate(Config_Base):
317337
def external_repos_path(self) -> pathlib.Path:
318338
return self.path / "external_repos"
319339

340+
# We can't make this a validator because the constructor
341+
# is called many times - we don't want to print this out many times.
342+
def check_test_data_caches(self) -> Self:
343+
"""
344+
Check that the test data caches actually exist at the specified paths.
345+
If they do exist, then do nothing. If they do not, then emit the helpext, but
346+
do not raise an exception. They are not required, but can significantly speed up
347+
and reduce the flakiness of tests by reducing failed HTTP requests.
348+
"""
349+
if not self.verbose:
350+
# Ignore the check and error output if we are not in verbose mode
351+
return self
352+
353+
for cache in self.test_data_caches:
354+
cache_path = self.path / cache.base_directory_name
355+
if not cache_path.is_dir():
356+
print(cache.helptext)
357+
else:
358+
print(f"Found attack data cache at {cache_path}")
359+
return self
360+
320361
def map_to_attack_data_cache(
321-
self, filename: HttpUrl | FilePath
362+
self, filename: HttpUrl | FilePath, verbose: bool = False
322363
) -> HttpUrl | FilePath:
364+
if str(filename) in ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS:
365+
# This is already something that we have emitted a warning or
366+
# Exception for. We don't want to emit it again as it will
367+
# pollute the output.
368+
return filename
369+
323370
# If this is simply a link to a file directly, then no mapping
324371
# needs to take place. Return the link to the file.
325372
if isinstance(filename, pathlib.Path):
@@ -339,33 +386,54 @@ def map_to_attack_data_cache(
339386
new_file_path = root_folder_path / new_file_name
340387

341388
if not root_folder_path.is_dir():
342-
# This has not been checked out. If a cache file was listed in the config AND we hit
343-
# on a prefix, it MUST be checked out
344-
raise ValueError(
345-
f"The following directory does not exist: [{root_folder_path}]. "
346-
"You must check out this directory since you have supplied 1 or more test_data_caches in contentctl.yml."
347-
)
348-
349-
if not new_file_path.is_file():
350-
raise ValueError(
351-
f"The following file does not the test_data_cache {cache.base_directory_name}:\n"
352-
f"\tFile: {new_file_name}"
389+
# This has not been checked out. Even though we want to use this cache
390+
# whenever possible, we don't want to force it.
391+
return filename
392+
393+
if new_file_path.is_file():
394+
# We found the file in the cache. Return the new path
395+
return new_file_path
396+
397+
# Any thing below here is non standard behavior that will produce either a warning message,
398+
# an error, or both. We onyl want to do this once for each file, even if it is used
399+
# across multiple different detections.
400+
ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS.add(str(filename))
401+
402+
# The cache exists, but we didn't find the file. We will emit an informational warning
403+
# for this, but this is not an exception. Instead, we will just fall back to using
404+
# the original URL.
405+
if verbose:
406+
# Give some extra context about missing attack data files/bad mapping
407+
try:
408+
h = head(str(filename))
409+
h.raise_for_status()
410+
411+
except RequestException:
412+
raise ValueError(
413+
f"Error resolving the attack_data file {filename}. "
414+
f"It was missing from the cache {cache.base_directory_name} and a download from the server failed."
415+
)
416+
print(
417+
f"\nFilename {filename} not found in cache {cache.base_directory_name}, but exists on the server. "
418+
f"Your cache {cache.base_directory_name} may be out of date."
353419
)
354-
return new_file_path
355-
356-
print("raising here\n")
357-
x = "\n\t".join(
358-
[
359-
f"base_url{index:02}: {url}"
360-
for index, url in enumerate(
361-
[cache.base_url for cache in self.test_data_caches]
362-
)
363-
]
364-
)
365-
raise ValueError(
366-
"Test data file does not start with any of the following base_url's. If you are supplying caches, any HTTP file MUST be located in a cache:\n"
367-
f"\tFile : {filename}\n\t{x}"
368-
)
420+
return filename
421+
if verbose:
422+
# Any thing below here is non standard behavior that will produce either a warning message,
423+
# an error, or both. We onyl want to do this once for each file, even if it is used
424+
# across multiple different detections.
425+
ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS.add(str(filename))
426+
427+
# Give some extra context about missing attack data files/bad mapping
428+
url = f"Attack Data : {filename}"
429+
prefixes = "".join(
430+
[
431+
f"\n Valid Prefix: {cache.base_url}"
432+
for cache in self.test_data_caches
433+
]
434+
)
435+
print(f"\nAttack Data Missing from all caches:\n{url}{prefixes}")
436+
return filename
369437

370438
@property
371439
def mitre_cti_repo_path(self) -> pathlib.Path:

contentctl/objects/test_attack_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def check_for_existence_of_attack_data_repo(
4141
# the test data to a file on disk
4242
if info.context.get("config", None):
4343
config: validate = info.context.get("config", None)
44-
return config.map_to_attack_data_cache(value)
44+
return config.map_to_attack_data_cache(value, verbose=config.verbose)
4545
else:
4646
raise ValueError(
4747
"config not passed to TestAttackData constructor in context"

0 commit comments

Comments
 (0)