Skip to content

Commit 7b10d64

Browse files
authored
Merge pull request #200 from splunk/validate_csv
Parse and validate CSV files
2 parents 3c7df89 + c1c2a40 commit 7b10d64

File tree

4 files changed

+130
-51
lines changed

4 files changed

+130
-51
lines changed

contentctl/actions/validate.py

Lines changed: 39 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,11 @@
1-
import sys
21

3-
from dataclasses import dataclass
4-
5-
from pydantic import ValidationError
6-
from typing import Union
7-
8-
from contentctl.objects.enums import SecurityContentProduct
9-
from contentctl.objects.abstract_security_content_objects.security_content_object_abstract import (
10-
SecurityContentObject_Abstract,
11-
)
2+
import pathlib
123
from contentctl.input.director import Director, DirectorOutputDto
13-
144
from contentctl.objects.config import validate
155
from contentctl.enrichments.attack_enrichment import AttackEnrichment
166
from contentctl.enrichments.cve_enrichment import CveEnrichment
177
from contentctl.objects.atomic import AtomicTest
8+
from contentctl.helper.utils import Utils
189

1910

2011
class Validate:
@@ -42,38 +33,44 @@ def execute(self, input_dto: validate) -> DirectorOutputDto:
4233

4334
director = Director(director_output_dto)
4435
director.execute(input_dto)
36+
self.ensure_no_orphaned_files_in_lookups(input_dto.path, director_output_dto)
4537
return director_output_dto
4638

47-
def validate_duplicate_uuids(
48-
self, security_content_objects: list[SecurityContentObject_Abstract]
49-
):
50-
all_uuids = set()
51-
duplicate_uuids = set()
52-
for elem in security_content_objects:
53-
if elem.id in all_uuids:
54-
# The uuid has been found more than once
55-
duplicate_uuids.add(elem.id)
56-
else:
57-
# This is the first time the uuid has been found
58-
all_uuids.add(elem.id)
39+
40+
def ensure_no_orphaned_files_in_lookups(self, repo_path: pathlib.Path, director_output_dto: DirectorOutputDto) -> None:
    """
    Ensure that only files which are relevant to lookups are included in the lookups folder.

    This means that a file must be either:
        1. A lookup YML (.yml)
        2. A lookup CSV (.csv) which is referenced by a YML
        3. A lookup MLMODEL (.mlmodel) which is referenced by a YML

    All other files, including CSV and MLMODEL files which are NOT
    referenced by a YML, will generate an exception from this function.

    Args:
        repo_path (pathlib.Path): path to the root of the app
        director_output_dto (DirectorOutputDto): director object with all constructed content

    Raises:
        Exception: An Exception will be raised if there are any non .yml, .csv, or .mlmodel
            files in this directory. Additionally, an exception will be raised if there
            exists one or more .csv or .mlmodel files that are not referenced by at least 1
            detection .yml file in this directory.
            This avoids having additional, unused files in this directory that may be copied into
            the app when it is built (which can cause appinspect errors or larger app size.)
    """
    lookupsDirectory = repo_path / "lookups"

    # Get all of the files referenced by Lookups. A lookup may reference its
    # payload via .filename and/or .file_path; collect both, skipping None.
    # A set gives O(1) membership tests in the unused-file check below.
    usedLookupFiles: set[pathlib.Path] = {
        lookup.filename for lookup in director_output_dto.lookups if lookup.filename is not None
    } | {
        lookup.file_path for lookup in director_output_dto.lookups if lookup.file_path is not None
    }

    # Get all of the mlmodel and csv files in the lookups directory. This call also
    # raises if any file with a disallowed extension exists in the directory.
    csvAndMlmodelFiles = Utils.get_security_content_files_from_directory(
        lookupsDirectory,
        allowedFileExtensions=[".yml", ".csv", ".mlmodel"],
        fileExtensionsToReturn=[".csv", ".mlmodel"],
    )

    # Generate an exception if any csv or mlmodel files exist but are not used
    unusedLookupFiles: list[pathlib.Path] = [testFile for testFile in csvAndMlmodelFiles if testFile not in usedLookupFiles]
    if len(unusedLookupFiles) > 0:
        raise Exception(f"The following .csv or .mlmodel files exist in '{lookupsDirectory}', but are not referenced by a lookup file: {[str(path) for path in unusedLookupFiles]}")
    return
76+

contentctl/helper/utils.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,49 @@ def get_all_yml_files_from_directory(path: str) -> list[pathlib.Path]:
3434
listOfFiles.append(pathlib.Path(os.path.join(dirpath, file)))
3535

3636
return sorted(listOfFiles)
37+
38+
@staticmethod
def get_security_content_files_from_directory(path: pathlib.Path, allowedFileExtensions: list[str] = [".yml"], fileExtensionsToReturn: list[str] = [".yml"]) -> list[pathlib.Path]:
    """
    Get all of the Security Content Object files rooted in a given directory. These will almost
    certainly be YML files, but could be other file types as specified by the user.

    Args:
        path (pathlib.Path): The root path at which to enumerate all Security Content Files. All directories will be traversed.
        allowedFileExtensions (list[str], optional): File extensions which are allowed to be present in this directory. In most cases, we do not want to allow the presence of non-YML files. Defaults to [".yml"].
        fileExtensionsToReturn (list[str], optional): Filenames with extensions that should be returned from this function. For example, the lookups/ directory contains YML, CSV, and MLMODEL files, but only the YMLs are Security Content Objects for constructing Lookups. Defaults to [".yml"].

    Raises:
        Exception: Will raise an exception if fileExtensionsToReturn is not a subset of allowedFileExtensions.
        Exception: Will raise an exception if the path passed to the function does not exist or is not a directory.
        Exception: Will raise an exception if there are any files rooted in the directory which are not in allowedFileExtensions.

    Returns:
        list[pathlib.Path]: sorted list of files with an extension in fileExtensionsToReturn found in path
    """
    # We can only return files whose extensions are also allowed in the directory.
    # NOTE: the original message stated this relation backwards.
    if not set(fileExtensionsToReturn).issubset(set(allowedFileExtensions)):
        raise Exception(f"fileExtensionsToReturn {fileExtensionsToReturn} MUST be a subset of allowedFileExtensions {allowedFileExtensions}, but it is not")

    if not path.exists() or not path.is_dir():
        raise Exception(f"Unable to get security_content files, required directory '{str(path)}' does not exist or is not a directory")

    allowedFiles: list[pathlib.Path] = []
    erroneousFiles: list[pathlib.Path] = []
    # Enumerate every file. rglob("*") + is_file() (rather than glob("**/*.*"))
    # also catches extensionless files, which must be flagged as erroneous, and
    # avoids treating directories with a dot in their name as files.
    for filePath in path.rglob("*"):
        if not filePath.is_file():
            continue
        if filePath.suffix in allowedFileExtensions:
            # Yes, this file is allowed
            allowedFiles.append(filePath)
        else:
            # No, this file has not been allowed
            erroneousFiles.append(filePath)

    if len(erroneousFiles):
        raise Exception(f"The following files are not allowed in the directory '{path}'. Only files with the extensions {allowedFileExtensions} are allowed:{[str(filePath) for filePath in erroneousFiles]}")

    # There were no erroneous files, so return the requested files
    return sorted([filePath for filePath in allowedFiles if filePath.suffix in fileExtensionsToReturn])
3780

3881
@staticmethod
3982
def get_all_yml_files_from_directory_one_layer_deep(path: str) -> list[pathlib.Path]:

contentctl/objects/lookup.py

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from pydantic import field_validator, ValidationInfo, model_validator, FilePath, model_serializer
33
from typing import TYPE_CHECKING, Optional, Any, Union
44
import re
5+
import csv
56
if TYPE_CHECKING:
67
from contentctl.input.director import DirectorOutputDto
78
from contentctl.objects.config import validate
@@ -61,15 +62,53 @@ def fix_lookup_path(cls, data:Any, info: ValidationInfo)->Any:
6162
raise ValueError("config required for constructing lookup filename, but it was not")
6263
return data
6364

64-
@field_validator('filename')
65-
@classmethod
66-
def lookup_file_valid(cls, v: Union[FilePath,None], info: ValidationInfo):
67-
if not v:
68-
return v
69-
if not (v.name.endswith(".csv") or v.name.endswith(".mlmodel")):
70-
raise ValueError(f"All Lookup files must be CSV files and end in .csv. The following file does not: '{v}'")
7165

72-
return v
66+
def model_post_init(self, ctx: dict[str, Any]):
    """
    Validate the file referenced by this lookup after model construction.

    The backing file must be a .csv or .mlmodel file. For CSV files, the file is
    also parsed to verify that every data row has exactly the same number of
    columns as the header row.

    Raises:
        ValueError: if the file's extension is not .csv or .mlmodel, if the CSV
            is empty (no header row), or if any CSV row has more or fewer
            columns than the header.
    """
    if not self.filename:
        # Nothing to validate when the lookup has no backing file.
        return
    import pathlib
    filenamePath = pathlib.Path(self.filename)

    if filenamePath.suffix not in [".csv", ".mlmodel"]:
        # NOTE: the original message claimed only .csv was allowed, contradicting
        # the check above which also accepts .mlmodel.
        raise ValueError(f"All Lookup files must be .csv or .mlmodel files. The following file is neither: '{filenamePath}'")

    if filenamePath.suffix == ".mlmodel":
        # Do not need any additional checks for an mlmodel file
        return

    # https://docs.python.org/3/library/csv.html#csv.DictReader
    # Column Names (fieldnames) are determined by the number of columns in the first row.
    # If a row has MORE fields than fieldnames, the extras are collected in a list under
    # the key 'restkey' - this should produce an error.
    # If a row has FEWER fields than fieldnames, the missing fields contain None by
    # default (restval) - this should also produce an error.
    csv_errors: list[str] = []
    with open(filenamePath, "r") as csv_fp:
        RESTKEY = "extra_fields_in_a_row"
        csv_dict = csv.DictReader(csv_fp, restkey=RESTKEY)
        if csv_dict.fieldnames is None:
            raise ValueError(f"Error validating the CSV referenced by the lookup: {filenamePath}:\n\t"
                             "Unable to read fieldnames from CSV. Is the CSV empty?\n"
                             "  Please try opening the file with a CSV Editor to ensure that it is correct.")
        expected_columns = len(csv_dict.fieldnames)
        # Remember that row 1 has the headers and we do not iterate over it in the loop below.
        # CSVs are typically indexed starting at row 1 for the header.
        for row_index, data_row in enumerate(csv_dict, start=2):
            extra_fields = data_row.get(RESTKEY, [])
            if len(extra_fields) > 0:
                csv_errors.append(f"row [{row_index}] should have [{expected_columns}] columns,"
                                  f" but instead had [{expected_columns + len(extra_fields)}].")

            # Count the None-filled (missing) fields once, so a short row produces a
            # single, correctly-counted error. (The original emitted one error per
            # missing column, with a wrong actual-column count after the first.)
            missing_count = sum(1 for column_name in csv_dict.fieldnames if data_row.get(column_name) is None)
            if missing_count > 0:
                csv_errors.append(f"row [{row_index}] should have [{expected_columns}] columns, "
                                  f"but instead had [{expected_columns - missing_count}].")
    if len(csv_errors) > 0:
        err_string = '\n\t'.join(csv_errors)
        raise ValueError(f"Error validating the CSV referenced by the lookup: {filenamePath}:\n\t{err_string}\n"
                         f"  Please try opening the file with a CSV Editor to ensure that it is correct.")

    return
111+
73112

74113
@field_validator('match_type')
75114
@classmethod

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "contentctl"
3-
version = "4.1.4"
3+
version = "4.1.5"
44
description = "Splunk Content Control Tool"
55
authors = ["STRT <[email protected]>"]
66
license = "Apache 2.0"

0 commit comments

Comments
 (0)