Skip to content

Commit d2b4109

Browse files
authored
Data library (#10)
* Initial code for data query library supporting Log Analytics, Security graph and WDATP * Updates to base64unpack and iocextract. Updating and expanding test cases. * Adding unit tests and fixing some things * Adding more unit tests and refactoring the driver/provider structure. Also simplified the entityschema stuff a little to use internal __dict__ for properties. * A few fixes + black formatting * Updating gitignore to ignore vscode settings * Fixing linting errors * Flake8 line len and pylint error * And updating the version to match Pete's PR
1 parent c247063 commit d2b4109

35 files changed

+3469
-525
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,4 @@ venv.bak/
104104
.mypy_cache/
105105
/msticpy.code-workspace
106106
/docs/source/_build/**
107+
**/.vscode*

msticpy/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
"""Version file."""
2-
VERSION = "0.1.7"
2+
VERSION = "0.1.8"

msticpy/data/data_providers.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
"""Data provider loader."""
7+
from functools import partial
from os import path
from typing import Any, Optional, Union

import pandas as pd

from .drivers import DriverBase, KqlDriver, SecurityGraphDriver
from .query_store import QueryStore
from .param_extractor import extract_query_params
from ..nbtools.query_defns import DataEnvironment
from ..nbtools.utility import export
from .._version import VERSION
19+
20+
__version__ = VERSION
__author__ = "Ian Hellen"

_PROVIDER_DIR = "providers"
_QUERY_DEF_DIR = "queries"

# Maps each supported DataEnvironment to the driver class that
# executes queries against it. Environments missing from this map
# cannot be used with QueryProvider.
_ENVIRONMENT_DRIVERS = {
    DataEnvironment.LogAnalytics: KqlDriver,
    DataEnvironment.AzureSecurityCenter: KqlDriver,
    DataEnvironment.SecurityGraph: SecurityGraphDriver,
}
31+
32+
33+
class AttribHolder:
    """Empty container class used to build hierarchical attribute trees."""

    def __len__(self):
        """Return the number of attributes attached to this holder."""
        return len(vars(self))

    def __iter__(self):
        """Iterate over (name, value) pairs of the attached attributes."""
        return iter(vars(self).items())
43+
44+
45+
@export
class QueryProvider:
    """
    Container for query store and query execution provider.

    Instances of this class hold the query set and execution
    methods for a specific data environment.

    """

    def __init__(
        self,
        data_environment: Union[str, DataEnvironment],
        driver: Optional[DriverBase] = None,
    ):
        """
        Query provider interface to queries.

        Parameters
        ----------
        data_environment : Union[str, DataEnvironment]
            Name or Enum of environment for the QueryProvider
        driver : DriverBase, optional
            Override the builtin driver (query execution class)
            and use your own driver (must inherit from
            `DriverBase`)

        Raises
        ------
        LookupError
            If no driver is registered for `data_environment`.

        See Also
        --------
        DataProviderBase : base class for data query providers.

        """
        if isinstance(data_environment, str):
            data_environment = DataEnvironment.parse(data_environment)

        self._environment = data_environment.name

        if driver is None:
            # Unknown environments now raise a descriptive LookupError
            # instead of a bare KeyError (KeyError is a LookupError
            # subclass, so existing handlers still work).
            driver_class = _ENVIRONMENT_DRIVERS.get(data_environment)
            if driver_class is not None and issubclass(driver_class, DriverBase):
                driver = driver_class()
            else:
                raise LookupError(
                    "Could not find suitable data provider for"
                    f" {data_environment.name}"
                )

        self._query_provider = driver

        # Find the path of this module and build sub-path
        query_path = path.join(path.dirname(__file__), _QUERY_DEF_DIR)

        # Load data query definitions for environment
        data_environments = QueryStore.import_files(
            source_path=query_path, recursive=True
        )
        self._query_store = data_environments[data_environment.name]

        self.all_queries = AttribHolder()
        self._add_query_functions()

    def connect(self, connection_str: str, **kwargs):
        """
        Connect to data source.

        Parameters
        ----------
        connection_str : str
            Connection string for the data source

        """
        return self._query_provider.connect(connection_str=connection_str, **kwargs)

    def import_query_file(self, query_file: str):
        """
        Import a yaml data source definition.

        Parameters
        ----------
        query_file : str
            Path to the file to import

        """
        self._query_store.import_file(query_file)

    def list_queries(self):
        """
        Return list of family.query in the store.

        Returns
        -------
        Iterable[str]
            List of queries

        """
        return self._query_store.query_names

    def query_help(self, query_name: str):
        """Print help for query `query_name`."""
        self._query_store[query_name].help()

    def _execute_query(self, *args, **kwargs) -> Union[pd.DataFrame, Any]:
        """
        Run the named query against the connected provider.

        `query_name` and `data_family` are popped from kwargs (they are
        bound by the partials created in `_add_query_functions`); the
        remaining args/kwargs supply the query parameters.
        """
        if not self._query_provider.loaded:
            raise ValueError("Provider is not loaded.")
        if not self._query_provider.connected:
            raise ValueError(
                "No connection to a data source. "
                "Please call connect(connection_str) and retry."
            )
        query_name = kwargs.pop("query_name")
        family = kwargs.pop("data_family")

        query_source = self._query_store.get_query(
            data_family=family, query_name=query_name
        )
        # "help"/"?" as a positional arg prints help instead of running
        if "help" in args or "?" in args:
            query_source.help()
            return None

        params, missing = extract_query_params(query_source, *args, **kwargs)
        if missing:
            query_source.help()
            raise ValueError(f"No values found for these parameters: {missing}")

        query_str = query_source.create_query(**params)
        return self._query_provider.query(query_str)

    def _add_query_functions(self):
        """Add queries to the module as callable methods."""
        for qual_query_name in self.list_queries():
            family, query_name = qual_query_name.split(".")
            # Group queries by family: provider.<family>.<query>(...)
            if not hasattr(self, family):
                setattr(self, family, AttribHolder())
            query_family = getattr(self, family)

            # Create the partial function
            query_func = partial(
                self._execute_query, data_family=family, query_name=query_name
            )
            query_func.__doc__ = self._query_store.get_query(
                family, query_name
            ).create_doc_string()

            setattr(query_family, query_name, query_func)
            setattr(self.all_queries, query_name, query_func)

msticpy/data/data_query_reader.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
"""Data query definition reader."""
7+
from typing import Tuple, Dict, Iterable, Any
8+
from pathlib import Path
9+
import yaml
10+
11+
from ..nbtools.query_defns import DataFamily, DataEnvironment
12+
from .._version import VERSION
13+
14+
__version__ = VERSION
15+
__author__ = "Ian Hellen"
16+
17+
18+
def find_yaml_files(source_path: str, recursive: bool = False) -> Iterable[Path]:
    """
    Return iterable of yaml files found in `source_path`.

    Parameters
    ----------
    source_path : str
        The source path to search in.
    recursive : bool, optional
        Whether to recurse through subfolders.
        By default False

    Yields
    ------
    Path
        File paths of yaml files found.

    """
    recurse_pfx = "**/" if recursive else ""
    file_glob = Path(source_path).glob(f"{recurse_pfx}*.yaml")
    for file_path in file_glob:
        # glob can also match directories named "*.yaml" - skip them
        if not file_path.is_file():
            continue
        yield file_path
41+
42+
43+
def read_query_def_file(query_file: str) -> Tuple[Dict, Dict, Dict]:
    """
    Read a yaml data query definition file.

    Parameters
    ----------
    query_file : str
        Path to yaml query definition file

    Returns
    -------
    Tuple[Dict, Dict, Dict]
        Tuple of dictionaries.
        sources - dictionary of query definitions
        defaults - the default parameters from the file
        metadata - the global metadata from the file

    Raises
    ------
    ValueError
        If the file content fails validation
        (see `validate_query_defs`).

    """
    # use safe_load rather than load so that arbitrary yaml tags in the
    # file cannot execute code
    with open(query_file, encoding="utf-8") as f_handle:
        data_map = yaml.safe_load(f_handle)

    validate_query_defs(query_def_dict=data_map)

    defaults = data_map.get("defaults", {})
    sources = data_map.get("sources", {})
    metadata = data_map.get("metadata", {})

    return sources, defaults, metadata
73+
74+
75+
def validate_query_defs(query_def_dict: Dict[str, Any]) -> bool:
    """
    Validate content of query definition.

    Parameters
    ----------
    query_def_dict : dict
        Dictionary of query definition yaml file contents.

    Returns
    -------
    bool
        True if validation succeeds.

    Raises
    ------
    ValueError
        The validation failure reason is returned in the
        exception message (arg[0])

    """
    # A non-empty "sources" section and a non-empty "metadata"
    # section are both mandatory.
    if not query_def_dict.get("sources"):
        raise ValueError("Imported file has no sources defined")
    if not query_def_dict.get("metadata"):
        raise ValueError("Imported file has no metadata defined")

    # data_environments and data_families must each be defined
    # with at least one value
    _validate_data_categories(query_def_dict)

    return True
106+
107+
108+
def _validate_data_categories(query_def_dict: Dict):
    """
    Check that file metadata defines valid data categories.

    Both `data_environments` and `data_families` must be present in
    the metadata section with at least one recognized value each.

    Raises
    ------
    ValueError
        If either category is missing, empty or contains a value
        not recognized by DataEnvironment/DataFamily.

    """
    metadata = query_def_dict["metadata"]
    if not metadata.get("data_environments"):
        raise ValueError("Imported file has no data_environments defined")

    for env in metadata["data_environments"]:
        if not DataEnvironment.parse(env):
            # single message string (the original passed several args
            # to ValueError) and corrected "evironment" typo
            raise ValueError(
                f"Unknown data environment {env} in metadata. "
                "Valid values are\n"
                + ", ".join(e.name for e in DataEnvironment)
            )
    if not metadata.get("data_families"):
        raise ValueError("Imported file has no data families defined")

    for fam in metadata["data_families"]:
        if not DataFamily.parse(fam):
            raise ValueError(
                f"Unknown data family {fam} in metadata. "
                "Valid values are\n"
                + ", ".join(f.name for f in DataFamily)
            )

msticpy/data/drivers/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# -------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# --------------------------------------------------------------------------
6+
"""Data provider sub-package."""
7+
# flake8: noqa: F403
8+
from . driver_base import DriverBase
9+
from . kql_driver import KqlDriver
10+
from . security_graph_driver import SecurityGraphDriver

0 commit comments

Comments
 (0)