
Commit 2b633b6

Merge pull request #392 from splunk/attack_data_cache_2
Enable Attack Data Download before Test
2 parents 881f5d5 + 8b489a0 commit 2b633b6

File tree: 6 files changed, +215 −13 lines changed

contentctl/actions/detection_testing/infrastructures/DetectionTestingInfrastructure.py

Lines changed: 2 additions & 9 deletions

@@ -7,7 +7,6 @@
 import time
 import urllib.parse
 import uuid
-from shutil import copyfile
 from ssl import SSLEOFError, SSLZeroReturnError
 from sys import stdout
 from tempfile import TemporaryDirectory, mktemp
@@ -1402,7 +1401,6 @@ def replay_attack_data_file(
                    f"The only valid indexes on the server are {self.all_indexes_on_server}"
                )

-        tempfile = mktemp(dir=tmp_dir)
         if not (
             str(attack_data_file.data).startswith("http://")
             or str(attack_data_file.data).startswith("https://")
@@ -1415,13 +1413,7 @@ def replay_attack_data_file(
                    test_group_start_time,
                )

-                try:
-                    copyfile(str(attack_data_file.data), tempfile)
-                except Exception as e:
-                    raise Exception(
-                        f"Error copying local Attack Data File for [{test_group.name}] - [{attack_data_file.data}]: "
-                        f"{str(e)}"
-                    )
+                tempfile = str(attack_data_file.data)
            else:
                raise Exception(
                    f"Attack Data File for [{test_group.name}] is local [{attack_data_file.data}], but does not exist."
@@ -1432,6 +1424,7 @@ def replay_attack_data_file(
            # We need to overwrite the file - mkstemp will create an empty file with the
            # given name
            try:
+                tempfile = mktemp(dir=tmp_dir)
                # In case the path is a local file, try to get it

                self.format_pbar_string(

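The net effect of these hunks: a locally supplied attack data file is now replayed directly from its original path instead of being copied into the temporary directory, and the mktemp scratch file is only created once an HTTP(S) download is actually required. Below is a minimal sketch of the resulting resolution logic; resolve_attack_data_path is a hypothetical helper name used for illustration, not the repository's actual method.

from pathlib import Path
from tempfile import mktemp

def resolve_attack_data_path(data: str, tmp_dir: str) -> str:
    # "data" is either an HTTP(S) URL or a path on the local filesystem.
    if not (data.startswith("http://") or data.startswith("https://")):
        if Path(data).is_file():
            # Local file: use it in place; no copy into tmp_dir is made anymore.
            return data
        raise Exception(f"Attack data file is local [{data}], but does not exist.")
    # URL: only now is a scratch file created to download into.
    tempfile = mktemp(dir=tmp_dir)
    # ... download the URL contents into `tempfile` here ...
    return tempfile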
contentctl/contentctl.py

Lines changed: 1 addition & 0 deletions

@@ -68,6 +68,7 @@ def init_func(config: test):
 
 
 def validate_func(config: validate) -> DirectorOutputDto:
+    config.check_test_data_caches()
     validate = Validate()
     return validate.execute(config)
 

contentctl/objects/abstract_security_content_objects/detection_abstract.py

Lines changed: 2 additions & 2 deletions

@@ -913,7 +913,7 @@ def search_rba_fields_exist_validate(self):
        return self

    @field_validator("tests", mode="before")
-    def ensure_yml_test_is_unittest(cls, v: list[dict]):
+    def ensure_yml_test_is_unittest(cls, v: list[dict], info: ValidationInfo):
        """The typing for the tests field allows it to be one of
        a number of different types of tests. However, ONLY
        UnitTest should be allowed to be defined in the YML
@@ -941,7 +941,7 @@ def ensure_yml_test_is_unittest(cls, v: list[dict]):
        for unitTest in v:
            # This raises a ValueError on a failed UnitTest.
            try:
-                UnitTest.model_validate(unitTest)
+                UnitTest.model_validate(unitTest, context=info.context)
            except ValueError as e:
                valueErrors.append(e)
        if len(valueErrors):

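This change forwards Pydantic's validation context into the nested UnitTest.model_validate() call. The self-contained Pydantic v2 example below (not contentctl code) shows why the forwarding is necessary: a manually invoked model_validate() does not inherit the caller's context unless it is passed along explicitly.

from pydantic import BaseModel, ValidationInfo, field_validator

class UnitTestLike(BaseModel):
    name: str

    @field_validator("name", mode="after")
    @classmethod
    def show_context(cls, value: str, info: ValidationInfo) -> str:
        # info.context is whatever dict the caller passed to model_validate(..., context=...)
        print("context seen by validator:", info.context)
        return value

UnitTestLike.model_validate({"name": "demo"})                         # context seen by validator: None
UnitTestLike.model_validate({"name": "demo"}, context={"config": 1})  # context seen by validator: {'config': 1}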
contentctl/objects/config.py

Lines changed: 173 additions & 0 deletions

@@ -26,6 +26,7 @@
     field_validator,
     model_validator,
 )
+from requests import RequestException, head
 
 from contentctl.helper.splunk_app import SplunkApp
 from contentctl.helper.utils import Utils
@@ -261,6 +262,37 @@ class init(Config_Base):
     )
 
 
+# There can be a number of attack data file warning mapping exceptions, or errors,
+# that can occur when using attack data caches. In order to avoid very complex
+# output, we will only emit the verbose versions of these message once per file.
+# This is a non-intuitive place to put this, but it is good enough for now.
+ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS: set[str] = set()
+
+
+class AttackDataCache(BaseModel):
+    base_url: str = Field(
+        "This is the beginning of a URL that the data must begin with to map to this cache object."
+    )
+    base_directory_name: str = Field(
+        "This is the root folder name where the attack data should be downloaded to. Note that this path MUST be in the external_repos/ folder",
+        pattern=r"^external_repos/.+",
+    )
+    # suggested checkout information for our attack_data repo
+    # curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C attack_data/
+    # suggested YML values for this:
+    helptext: str | None = Field(
+        default="This repo is set up to use test_data_caches. This can be extremely helpful in validating correct links for test attack_data and speeding up testing.\n"
+        "Include the following in your contentctl.yml file to use this cache:\n\n"
+        "test_data_caches:\n"
+        "- base_url: https://media.githubusercontent.com/media/splunk/attack_data/master/\n"
+        "  base_directory_name: external_repos/attack_data\n\n"
+        "In order to check out STRT Attack Data, you can use the following command:\n"
+        "mkdir -p external_repos; curl https://attack-range-attack-data.s3.us-west-2.amazonaws.com/attack_data.tar.zstd | zstd --decompress | tar -x -C external_repos/\n"
+        "or\n"
+        """echo "First ensure git-lfs is enabled"; git clone https://github.com/splunk/attack_data external_repos/attack_data"""
+    )
+
+
 class validate(Config_Base):
     model_config = ConfigDict(validate_default=True, arbitrary_types_allowed=True)
     enforce_deprecation_mapping_requirement: bool = Field(
@@ -291,10 +323,151 @@ class validate(Config_Base):
         default=False, description="Validate latest TA information from Splunkbase"
     )
 
+    test_data_caches: list[AttackDataCache] = Field(
+        default=[],
+        description="A list of attack data that can "
+        "be used in lieu of the HTTPS download links "
+        "of each test data file. This cache can significantly "
+        "increase overall test speed, ensure the correctness of "
+        "links at 'contentctl validate' time, and reduce errors "
+        "associated with failed responses from file servers.",
+    )
+
     @property
     def external_repos_path(self) -> pathlib.Path:
         return self.path / "external_repos"
 
+    # We can't make this a validator because the constructor
+    # is called many times - we don't want to print this out many times.
+    def check_test_data_caches(self) -> Self:
+        """
+        Check that the test data caches actually exist at the specified paths.
+        If they do exist, then do nothing. If they do not, then emit the helpext, but
+        do not raise an exception. They are not required, but can significantly speed up
+        and reduce the flakiness of tests by reducing failed HTTP requests.
+        """
+        if not self.verbose:
+            # Ignore the check and error output if we are not in verbose mode
+            return self
+        for cache in self.test_data_caches:
+            cache_path = self.path / cache.base_directory_name
+            if not cache_path.is_dir():
+                print(cache.helptext)
+            else:
+                build_date_file = cache_path / "cache_build_date.txt"
+                git_hash_file = cache_path / "git_hash.txt"
+
+                if build_date_file.is_file():
+                    # This is a cache that was built by contentctl. We can use this to
+                    # determine if the cache is out of date.
+                    with open(build_date_file, "r") as f:
+                        build_date = f.read().strip()
+                else:
+                    build_date = "<UNKNOWN_DATE>"
+                if git_hash_file.is_file():
+                    # This is a cache that was built by contentctl. We can use this to
+                    # determine if the cache is out of date.
+                    with open(git_hash_file, "r") as f:
+                        git_hash = f.read().strip()
+                else:
+                    git_hash = "<UNKNOWN_HASH>"
+
+                print(
+                    f"Found attack data cache at [{cache_path}]\n**Cache Build Date: {build_date}\n**Repo Git Hash : {git_hash}\n"
+                )
+
+        return self
+
+    def map_to_attack_data_cache(
+        self, filename: HttpUrl | FilePath, verbose: bool = False
+    ) -> HttpUrl | FilePath:
+        if str(filename) in ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS:
+            # This is already something that we have emitted a warning or
+            # Exception for. We don't want to emit it again as it will
+            # pollute the output.
+            return filename
+
+        # If this is simply a link to a file directly, then no mapping
+        # needs to take place. Return the link to the file.
+        if isinstance(filename, pathlib.Path):
+            return filename
+
+        if len(self.test_data_caches) == 0:
+            return filename
+
+        # Otherwise, this is a URL. See if its prefix matches one of the
+        # prefixes in the list of caches
+        for cache in self.test_data_caches:
+            root_folder_path = self.path / cache.base_directory_name
+            # See if this data file was in that path
+
+            if str(filename).startswith(cache.base_url):
+                new_file_name = str(filename).replace(cache.base_url, "")
+                new_file_path = root_folder_path / new_file_name
+
+                if not root_folder_path.is_dir():
+                    # This has not been checked out. Even though we want to use this cache
+                    # whenever possible, we don't want to force it.
+                    return filename
+
+                if new_file_path.is_file():
+                    # We found the file in the cache. Return the new path
+                    return new_file_path
+
+                # Any thing below here is non standard behavior that will produce either a warning message,
+                # an error, or both. We onyl want to do this once for each file, even if it is used
+                # across multiple different detections.
+                ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS.add(str(filename))
+
+                # The cache exists, but we didn't find the file. We will emit an informational warning
+                # for this, but this is not an exception. Instead, we will just fall back to using
+                # the original URL.
+                if verbose:
+                    # Give some extra context about missing attack data files/bad mapping
+                    try:
+                        h = head(str(filename))
+                        h.raise_for_status()
+
+                    except RequestException:
+                        raise ValueError(
+                            f"Error resolving the attack_data file {filename}. "
+                            f"It was missing from the cache {cache.base_directory_name} and a download from the server failed."
+                        )
+                    print(
+                        f"\nFilename {filename} not found in cache {cache.base_directory_name}, but exists on the server. "
+                        f"Your cache {cache.base_directory_name} may be out of date."
+                    )
+                return filename
+        if verbose:
+            # Any thing below here is non standard behavior that will produce either a warning message,
+            # an error, or both. We onyl want to do this once for each file, even if it is used
+            # across multiple different detections.
+            ATTACK_DATA_CACHE_MAPPING_EXCEPTIONS.add(str(filename))
+
+            # Give some extra context about missing attack data files/bad mapping
+            url = f"Attack Data : {filename}"
+            prefixes = "".join(
+                [
+                    f"\n Valid Prefix: {cache.base_url}"
+                    for cache in self.test_data_caches
+                ]
+            )
+            # Give some extra context about missing attack data files/bad mapping
+            try:
+                h = head(str(filename))
+                h.raise_for_status()
+            except RequestException:
+                raise ValueError(
+                    f"Error resolving the attack_data file {filename}. It was missing from all caches and a download from the server failed.\n"
+                    f"{url}{prefixes}\n"
+                )
+
+            print(
+                f"\nAttack Data Missing from all caches, but present at URL:\n{url}{prefixes}"
+            )
+
+        return filename
+
     @property
     def mitre_cti_repo_path(self) -> pathlib.Path:
         return self.external_repos_path / "cti"

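In short, map_to_attack_data_cache() rewrites a test data URL to a local path whenever the URL begins with a configured cache's base_url and the corresponding file exists under base_directory_name; otherwise it falls back to the URL, optionally confirming availability with an HTTP HEAD request in verbose mode. An illustrative walk-through of the prefix rewrite, using hypothetical file names:

# Hypothetical example of the prefix rewrite performed by map_to_attack_data_cache().
base_url = "https://media.githubusercontent.com/media/splunk/attack_data/master/"
base_directory_name = "external_repos/attack_data"

url = base_url + "datasets/attack_techniques/T1003.001/windows-sysmon.log"

# Strip the matching prefix and re-root the remainder under the local cache folder.
relative = url.replace(base_url, "")
local_path = f"{base_directory_name}/{relative}"
print(local_path)
# external_repos/attack_data/datasets/attack_techniques/T1003.001/windows-sysmon.log
#
# If that path exists on disk it is returned in place of the URL; if the cache is
# checked out but the file is missing, the original URL is used and (in verbose
# mode) a HEAD request verifies that the server can still supply it.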
contentctl/objects/test_attack_data.py

Lines changed: 36 additions & 1 deletion

@@ -1,5 +1,19 @@
 from __future__ import annotations
-from pydantic import BaseModel, HttpUrl, FilePath, Field, ConfigDict
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from contentctl.objects.config import validate
+
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    FilePath,
+    HttpUrl,
+    ValidationInfo,
+    field_validator,
+)
 
 
 class TestAttackData(BaseModel):
@@ -11,3 +25,24 @@ class TestAttackData(BaseModel):
     sourcetype: str = Field(...)
     custom_index: str | None = None
     host: str | None = None
+
+    @field_validator("data", mode="after")
+    @classmethod
+    def check_for_existence_of_attack_data_repo(
+        cls, value: HttpUrl | FilePath, info: ValidationInfo
+    ) -> HttpUrl | FilePath:
+        # this appears to be called more than once, the first time
+        # info.context is always None. In this case, just return what
+        # was passed.
+        if not info.context:
+            return value
+
+        # When the config is passed, used it to determine if we can map
+        # the test data to a file on disk
+        if info.context.get("config", None):
+            config: validate = info.context.get("config", None)
+            return config.map_to_attack_data_cache(value, verbose=config.verbose)
+        else:
+            raise ValueError(
+                "config not passed to TestAttackData constructor in context"
+            )

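The new validator only performs the cache mapping when a config object is supplied under the "config" key of the Pydantic validation context; on a pass with no context, the value is returned untouched. A hedged usage sketch follows — the field values and the validate_config variable are illustrative, not taken from contentctl's actual call site:

from contentctl.objects.test_attack_data import TestAttackData

# validate_config: a contentctl.objects.config.validate instance built elsewhere (hypothetical).
raw = {
    "data": "https://media.githubusercontent.com/media/splunk/attack_data/master/some/file.log",
    "source": "example_source",
    "sourcetype": "example:sourcetype",
}

# Without a context, the validator returns the value as-is.
no_mapping = TestAttackData.model_validate(raw)

# With the config in the context, a matching, checked-out cache rewrites the data
# field to a FilePath under external_repos/; otherwise the original HttpUrl is kept.
attack_data = TestAttackData.model_validate(raw, context={"config": validate_config})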
pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "contentctl"
 
-version = "5.4.1"
+version = "5.5.0"
 
 description = "Splunk Content Control Tool"
 authors = ["STRT <[email protected]>"]

0 commit comments
