Skip to content

Commit 6fa550f

Browse files
authored
Merge pull request #152 from DomainTools/2.3
Python Wrapper v.2.3.0 Release
2 parents 48da7e0 + 4e15601 commit 6fa550f

File tree

60 files changed

+1070283
-227
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+1070283
-227
lines changed

README.md

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,5 +211,63 @@ API_KEY
211211
Python Version Support Policy
212212
===================
213213

214-
Please see the [supported versions](https://github.com/DomainTools/python_api/raw/main/PYTHON_SUPPORT.md) document
214+
Please see the [supported versions](https://github.com/DomainTools/python_api/raw/main/PYTHON_SUPPORT.md) document
215215
for the DomainTools Python support policy.
216+
217+
218+
Real-Time Threat Intelligence Feeds
219+
===================
220+
221+
Real-Time Threat Intelligence Feeds provide data on the different stages of the domain lifecycle: from first-observed in the wild, to newly re-activated after a period of quiet. Access current feed data in real-time or retrieve historical feed data through separate APIs.
222+
223+
Custom parameters aside from the common `GET` Request parameters:
224+
- `endpoint` (choose either `download` or `feed` API endpoint - default is `feed`)
225+
```python
226+
api = API(USERNAME, KEY, always_sign_api_key=False)
227+
api.nod(endpoint="feed", **kwargs)
228+
```
229+
- `header_authentication`: API Header Authentication is used by default. Set this to `False` if you want to use API Key and Secret Authentication instead. API Header Authentication cannot be used for `download` endpoints, so this defaults to `False` for them even without explicitly setting it.
230+
```python
231+
api = API(USERNAME, KEY, always_sign_api_key=False)
232+
api.nod(header_authentication=False, **kwargs)
233+
```
234+
- `output_format`: (choose either `csv` or `jsonl` - default is `jsonl`). Cannot be used in `domainrdap` feeds. Additionally, `csv` is not available for `download` endpoints.
235+
```python
236+
api = API(USERNAME, KEY, always_sign_api_key=False)
237+
api.nod(output_format="csv", **kwargs)
238+
```
239+
240+
The Feed API standard access pattern is to periodically request the most recent feed data, as often as every 60 seconds. Specify the range of data you receive in one of two ways:
241+
242+
1. With `sessionID`: Make a call and provide a new `sessionID` parameter of your choosing. The API will return the last hour of data by default.
243+
- Each subsequent call to the API using your `sessionID` will return all data since the last.
244+
- Any single request returns a maximum of 10M results. Requests that exceed 10M results will return an HTTP 206 response code; repeat the same request (with the same `sessionID`) to receive the next tranche of data until receiving an HTTP 200 response code.
245+
2. Or, specify the time range in one of two ways:
246+
- Either an `after=-60` query parameter, where (in this example) -60 indicates the previous 60 seconds.
247+
- Or `after` and `before` query parameters for a time range, with each parameter accepting an ISO-8601 UTC formatted timestamp (a UTC date and time of the format YYYY-MM-DDThh:mm:ssZ)
248+
249+
## Handling iterative response from RTUF endpoints:
250+
251+
Since we may be dealing with large feed datasets, the Python wrapper uses a `generator` for efficient memory handling. Therefore, we need to iterate through the `generator` when accessing the partial results of the feed data.
252+
253+
### Single request because the requested data is within the maximum result:
254+
```python
255+
from domaintools import API
256+
257+
api = API(USERNAME, KEY, always_sign_api_key=False)
258+
results = api.nod(sessionID="my-session-id", after=-60)
259+
260+
for result in results.response():  # generator that holds NOD feeds data for the past 60 seconds and is expected to request only once
261+
# do things to result
262+
```
263+
264+
### Multiple requests because the requested data is more than the maximum result per request:
265+
```python
266+
from domaintools import API
267+
268+
api = API(USERNAME, KEY, always_sign_api_key=False)
269+
results = api.nod(sessionID="my-session-id", after=-7200)
270+
271+
for partial_result in results.response():  # generator that holds NOD feeds data for the past 2 hours and is expected to request multiple times
272+
# do things to partial_result
273+
```

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.0
1+
2.3.0

domaintools/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@
2020
2121
"""
2222

23-
current = "2.2.0"
23+
current = "2.3.0"

domaintools/api.py

Lines changed: 65 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from datetime import datetime, timedelta, timezone
22
from hashlib import sha1, sha256, md5
33
from hmac import new as hmac
4+
45
import re
56

7+
from domaintools.constants import Endpoint, ENDPOINT_TO_SOURCE_MAP, FEEDS_PRODUCTS_LIST, OutputFormat
68
from domaintools._version import current as version
79
from domaintools.results import (
810
GroupedIterable,
911
ParsedWhois,
1012
ParsedDomainRdap,
1113
Reputation,
1214
Results,
15+
FeedsResults,
1316
)
1417
from domaintools.filters import (
1518
filter_by_riskscore,
@@ -18,6 +21,8 @@
1821
filter_by_field,
1922
DTResultFilter,
2023
)
24+
from domaintools.utils import validate_feeds_parameters
25+
2126

2227
AVAILABLE_KEY_SIGN_HASHES = ["sha1", "sha256", "md5"]
2328

@@ -84,11 +89,8 @@ def __init__(
8489

8590
if not https:
8691
raise Exception("The DomainTools API endpoints no longer support http traffic. Please make sure https=True.")
87-
if proxy_url:
88-
if isinstance(proxy_url, str):
89-
self.proxy_url = {"http://": proxy_url, "https://": proxy_url}
90-
else:
91-
raise Exception("Proxy URL must be a string. For example: '127.0.0.1:8888'")
92+
if proxy_url and not isinstance(proxy_url, str):
93+
raise Exception("Proxy URL must be a string. For example: '127.0.0.1:8888'")
9294

9395
def _build_api_url(self, api_url=None, api_port=None):
9496
"""Build the API url based on the given url and port. Defaults to `https://api.domaintools.com`"""
@@ -122,14 +124,18 @@ def _results(self, product, path, cls=Results, **kwargs):
122124
uri = "/".join((self._rest_api_url, path.lstrip("/")))
123125
parameters = self.default_parameters.copy()
124126
parameters["api_username"] = self.username
125-
self.handle_api_key(path, parameters)
127+
header_authentication = kwargs.pop("header_authentication", True) # Used only by Real-Time Threat Intelligence Feeds endpoints for now
128+
self.handle_api_key(product, path, parameters, header_authentication)
126129
parameters.update({key: str(value).lower() if value in (True, False) else value for key, value in kwargs.items() if value is not None})
127130

128131
return cls(self, product, uri, **parameters)
129132

130-
def handle_api_key(self, path, parameters):
133+
def handle_api_key(self, product, path, parameters, header_authentication):
131134
if self.https and not self.always_sign_api_key:
132-
parameters["api_key"] = self.key
135+
if product in FEEDS_PRODUCTS_LIST and header_authentication:
136+
parameters["X-Api-Key"] = self.key
137+
else:
138+
parameters["api_key"] = self.key
133139
else:
134140
if self.key_sign_hash and self.key_sign_hash in AVAILABLE_KEY_SIGN_HASHES:
135141
signing_hash = eval(self.key_sign_hash)
@@ -1058,30 +1064,67 @@ def iris_detect_ignored_domains(
10581064
**kwargs,
10591065
)
10601066

1061-
def nod(self, **kwargs):
1067+
def nod(self, **kwargs) -> FeedsResults:
10621068
"""Returns back list of the newly observed domains feed"""
1063-
sessionID = kwargs.get("sessionID")
1064-
after = kwargs.get("after")
1065-
if not (sessionID or after):
1066-
raise ValueError("sessionID or after (can be both) must be defined")
1069+
validate_feeds_parameters(kwargs)
1070+
endpoint = kwargs.pop("endpoint", Endpoint.FEED.value)
1071+
source = ENDPOINT_TO_SOURCE_MAP.get(endpoint)
1072+
if endpoint == Endpoint.DOWNLOAD.value or kwargs.get("output_format", OutputFormat.JSONL.value) != OutputFormat.CSV.value:
1073+
# headers param is allowed only in Feed API and CSV format
1074+
kwargs.pop("headers", None)
10671075

10681076
return self._results(
1069-
"newly-observed-domains-feed-(api)",
1070-
"v1/feed/nod/",
1077+
f"newly-observed-domains-feed-({source.value})",
1078+
f"v1/{endpoint}/nod/",
10711079
response_path=(),
1080+
cls=FeedsResults,
10721081
**kwargs,
10731082
)
10741083

1075-
def nad(self, **kwargs):
1084+
def nad(self, **kwargs) -> FeedsResults:
10761085
"""Returns back list of the newly active domains feed"""
1077-
sessionID = kwargs.get("sessionID")
1078-
after = kwargs.get("after")
1079-
if not (sessionID or after):
1080-
raise ValueError("sessionID or after (can be both) must be defined")
1086+
validate_feeds_parameters(kwargs)
1087+
endpoint = kwargs.pop("endpoint", Endpoint.FEED.value)
1088+
source = ENDPOINT_TO_SOURCE_MAP.get(endpoint).value
1089+
if endpoint == Endpoint.DOWNLOAD.value or kwargs.get("output_format", OutputFormat.JSONL.value) != OutputFormat.CSV.value:
1090+
# headers param is allowed only in Feed API and CSV format
1091+
kwargs.pop("headers", None)
1092+
1093+
return self._results(
1094+
f"newly-active-domains-feed-({source})",
1095+
f"v1/{endpoint}/nad/",
1096+
response_path=(),
1097+
cls=FeedsResults,
1098+
**kwargs,
1099+
)
1100+
1101+
def domainrdap(self, **kwargs) -> FeedsResults:
1102+
"""Returns changes to global domain registration information, populated by the Registration Data Access Protocol (RDAP)"""
1103+
validate_feeds_parameters(kwargs)
1104+
endpoint = kwargs.pop("endpoint", Endpoint.FEED.value)
1105+
source = ENDPOINT_TO_SOURCE_MAP.get(endpoint).value
1106+
1107+
return self._results(
1108+
f"domain-registration-data-access-protocol-feed-({source})",
1109+
f"v1/{endpoint}/domainrdap/",
1110+
response_path=(),
1111+
cls=FeedsResults,
1112+
**kwargs,
1113+
)
1114+
1115+
def domaindiscovery(self, **kwargs) -> FeedsResults:
1116+
"""Returns new domains as they are either discovered in domain registration information, observed by our global sensor network, or reported by trusted third parties"""
1117+
validate_feeds_parameters(kwargs)
1118+
endpoint = kwargs.pop("endpoint", Endpoint.FEED.value)
1119+
source = ENDPOINT_TO_SOURCE_MAP.get(endpoint).value
1120+
if endpoint == Endpoint.DOWNLOAD.value or kwargs.get("output_format", OutputFormat.JSONL.value) != OutputFormat.CSV.value:
1121+
# headers param is allowed only in Feed API and CSV format
1122+
kwargs.pop("headers", None)
10811123

10821124
return self._results(
1083-
"newly-active-domains-feed-(api)",
1084-
"v1/feed/nad/",
1125+
f"real-time-domain-discovery-feed-({source})",
1126+
f"v1/{endpoint}/domaindiscovery/",
10851127
response_path=(),
1128+
cls=FeedsResults,
10861129
**kwargs,
10871130
)

domaintools/base_results.py

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
import re
55
import time
66
import logging
7+
8+
from copy import deepcopy
79
from datetime import datetime
10+
from httpx import Client
811

12+
from domaintools.constants import FEEDS_PRODUCTS_LIST, OutputFormat, HEADER_ACCEPT_KEY_CSV_FORMAT
913
from domaintools.exceptions import (
1014
BadRequestException,
1115
InternalServerErrorException,
@@ -16,9 +20,7 @@
1620
IncompleteResponseException,
1721
RequestUriTooLongException,
1822
)
19-
from domaintools.utils import get_feeds_products_list
2023

21-
from httpx import Client
2224

2325
try: # pragma: no cover
2426
from collections.abc import MutableMapping, MutableSequence
@@ -51,8 +53,6 @@ def __init__(
5153
self._response = None
5254
self._items_list = None
5355
self._data = None
54-
self._limit_exceeded = None
55-
self._limit_exceeded_message = None
5656

5757
def _wait_time(self):
5858
if not self.api.rate_limit or not self.product in self.api.limits:
@@ -75,6 +75,23 @@ def _wait_time(self):
7575

7676
return wait_for
7777

78+
def _get_session_params(self):
79+
parameters = deepcopy(self.kwargs)
80+
parameters.pop("output_format", None)
81+
parameters.pop(
82+
"format", None
83+
) # For some unknown reason, even if "format" is not included in the CLI params for a feeds endpoint, it is being populated, so we need to remove it. Happens only when using the CLI.
84+
headers = {}
85+
if self.kwargs.get("output_format", OutputFormat.JSONL.value) == OutputFormat.CSV.value:
86+
parameters["headers"] = int(bool(self.kwargs.get("headers", False)))
87+
headers["accept"] = HEADER_ACCEPT_KEY_CSV_FORMAT
88+
89+
header_api_key = parameters.pop("X-Api-Key", None)
90+
if header_api_key:
91+
headers["X-Api-Key"] = header_api_key
92+
93+
return {"parameters": parameters, "headers": headers}
94+
7895
def _make_request(self):
7996

8097
with Client(verify=self.api.verify_ssl, proxy=self.api.proxy_url, timeout=None) as session:
@@ -90,6 +107,11 @@ def _make_request(self):
90107
patch_data = self.kwargs.copy()
91108
patch_data.update(self.api.extra_request_params)
92109
return session.patch(url=self.url, json=patch_data)
110+
elif self.product in FEEDS_PRODUCTS_LIST:
111+
session_params = self._get_session_params()
112+
parameters = session_params.get("parameters")
113+
headers = session_params.get("headers")
114+
return session.get(url=self.url, params=parameters, headers=headers, **self.api.extra_request_params)
93115
else:
94116
return session.get(url=self.url, params=self.kwargs, **self.api.extra_request_params)
95117

@@ -118,33 +140,26 @@ def data(self):
118140
if self._data is None:
119141
results = self._get_results()
120142
self.setStatus(results.status_code, results)
121-
if (
122-
self.kwargs.get("format", "json") == "json"
123-
and self.product
124-
not in get_feeds_products_list() # Special handling of feeds products' data to preserve the result in jsonline format
125-
):
143+
if self.kwargs.get("format", "json") == "json":
126144
self._data = results.json()
127145
else:
128146
self._data = results.text
129-
limit_exceeded, message = self.check_limit_exceeded()
130147

131-
if limit_exceeded:
132-
self._limit_exceeded = True
133-
self._limit_exceeded_message = message
148+
self.check_limit_exceeded()
134149

135-
if self._limit_exceeded is True:
136-
raise ServiceException(503, "Limit Exceeded{}".format(self._limit_exceeded_message))
137-
else:
138-
return self._data
150+
return self._data
139151

140152
def check_limit_exceeded(self):
141-
if self.kwargs.get("format", "json") == "json":
142-
if "response" in self._data and "limit_exceeded" in self._data["response"] and self._data["response"]["limit_exceeded"] is True:
143-
return True, self._data["response"]["message"]
144-
# TODO: handle html, xml response errors better.
153+
limit_exceeded, reason = False, ""
154+
if isinstance(self._data, dict) and (
155+
"response" in self._data and "limit_exceeded" in self._data["response"] and self._data["response"]["limit_exceeded"] is True
156+
):
157+
limit_exceeded, reason = True, self._data["response"]["message"]
145158
elif "response" in self._data and "limit_exceeded" in self._data:
146-
return True, "limit exceeded"
147-
return False, ""
159+
limit_exceeded = True
160+
161+
if limit_exceeded:
162+
raise ServiceException(503, f"Limit Exceeded {reason}")
148163

149164
@property
150165
def status(self):
@@ -155,7 +170,7 @@ def status(self):
155170

156171
def setStatus(self, code, response=None):
157172
self._status = code
158-
if code == 200:
173+
if code == 200 or (self.product in FEEDS_PRODUCTS_LIST and code == 206):
159174
return
160175

161176
reason = None
@@ -167,9 +182,9 @@ def setStatus(self, code, response=None):
167182
if callable(reason):
168183
reason = reason()
169184

170-
if code == 400:
185+
if code in (400, 422):
171186
raise BadRequestException(code, reason)
172-
elif code == 403:
187+
elif code in (401, 403):
173188
raise NotAuthorizedException(code, reason)
174189
elif code == 404:
175190
raise NotFoundException(code, reason)
@@ -259,6 +274,32 @@ def json(self):
259274
**self.kwargs,
260275
)
261276

277+
@property
278+
def jsonl(self):
279+
self.kwargs.pop("format", None)
280+
return self.__class__(
281+
format="jsonl",
282+
product=self.product,
283+
url=self.url,
284+
items_path=self.items_path,
285+
response_path=self.response_path,
286+
api=self.api,
287+
**self.kwargs,
288+
)
289+
290+
@property
291+
def csv(self):
292+
self.kwargs.pop("format", None)
293+
return self.__class__(
294+
format="csv",
295+
product=self.product,
296+
url=self.url,
297+
items_path=self.items_path,
298+
response_path=self.response_path,
299+
api=self.api,
300+
**self.kwargs,
301+
)
302+
262303
@property
263304
def xml(self):
264305
self.kwargs.pop("format", None)

0 commit comments

Comments
 (0)