Skip to content

Commit e3e76ac

Browse files
authored
Merge pull request #144 from DomainTools/IDEV-1996-implement-domain-discovery-feed
IDEV-1996: Implement domain discovery feed
2 parents f081335 + 7d748eb commit e3e76ac

File tree

10 files changed

+587
-12
lines changed

10 files changed

+587
-12
lines changed

domaintools/api.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from hmac import new as hmac
44
import re
55

6+
from domaintools.constants import Endpoint, ENDPOINT_TO_SOURCE_MAP, OutputFormat
67
from domaintools._version import current as version
78
from domaintools.results import (
89
GroupedIterable,
@@ -18,6 +19,8 @@
1819
filter_by_field,
1920
DTResultFilter,
2021
)
22+
from domaintools.utils import validate_feeds_parameters
23+
2124

2225
AVAILABLE_KEY_SIGN_HASHES = ["sha1", "sha256", "md5"]
2326

@@ -1088,15 +1091,29 @@ def nad(self, **kwargs):
10881091

10891092
def domainrdap(self, **kwargs):
    """Returns changes to global domain registration information, populated by the Registration Data Access Protocol (RDAP)

    The keyword arguments are first checked by ``validate_feeds_parameters``.
    ``endpoint`` (default ``Endpoint.FEED.value``) is consumed here: it selects
    the URL segment and, through ``ENDPOINT_TO_SOURCE_MAP``, the suffix of the
    product name. Every remaining keyword argument is forwarded to
    ``self._results``.

    Raises:
        ValueError: if ``validate_feeds_parameters`` rejects the arguments, or
            if ``endpoint`` has no entry in ``ENDPOINT_TO_SOURCE_MAP``.
    """
    validate_feeds_parameters(kwargs)
    endpoint = kwargs.pop("endpoint", Endpoint.FEED.value)
    source = ENDPOINT_TO_SOURCE_MAP.get(endpoint)
    if source is None:
        # An unknown endpoint would otherwise fail below with an opaque
        # AttributeError on ``source.value``.
        raise ValueError(f"{endpoint} is not in available endpoints: {tuple(ENDPOINT_TO_SOURCE_MAP)}")

    return self._results(
        f"domain-registration-data-access-protocol-feed-({source.value})",
        f"v1/{endpoint}/domainrdap/",
        response_path=(),
        **kwargs,
    )
1104+
1105+
def domaindiscovery(self, **kwargs):
    """Returns new domains as they are either discovered in domain registration information, observed by our global sensor network, or reported by trusted third parties

    The keyword arguments are first checked by ``validate_feeds_parameters``.
    ``endpoint`` (default ``Endpoint.FEED.value``) is consumed here: it selects
    the URL segment and, through ``ENDPOINT_TO_SOURCE_MAP``, the suffix of the
    product name. Every remaining keyword argument is forwarded to
    ``self._results``.

    Raises:
        ValueError: if ``validate_feeds_parameters`` rejects the arguments, or
            if ``endpoint`` has no entry in ``ENDPOINT_TO_SOURCE_MAP``.
    """
    validate_feeds_parameters(kwargs)
    endpoint = kwargs.pop("endpoint", Endpoint.FEED.value)
    source = ENDPOINT_TO_SOURCE_MAP.get(endpoint)
    if source is None:
        # An unknown endpoint would otherwise fail below with an opaque
        # AttributeError on ``source.value``.
        raise ValueError(f"{endpoint} is not in available endpoints: {tuple(ENDPOINT_TO_SOURCE_MAP)}")

    requested_format = kwargs.get("output_format", OutputFormat.JSONL.value)
    if endpoint == Endpoint.DOWNLOAD.value or requested_format != OutputFormat.CSV.value:
        # headers param is allowed only in Feed API and CSV format
        kwargs.pop("headers", None)

    return self._results(
        f"real-time-domain-discovery-feed-({source.value})",
        f"v1/{endpoint}/domaindiscovery/",
        response_path=(),
        **kwargs,
    )

domaintools/base_results.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@
44
import re
55
import time
66
import logging
7+
8+
from copy import deepcopy
79
from datetime import datetime
10+
from httpx import Client
811

12+
from domaintools.constants import OutputFormat, HEADER_ACCEPT_KEY_CSV_FORMAT
913
from domaintools.exceptions import (
1014
BadRequestException,
1115
InternalServerErrorException,
@@ -18,7 +22,6 @@
1822
)
1923
from domaintools.utils import get_feeds_products_list
2024

21-
from httpx import Client
2225

2326
try: # pragma: no cover
2427
from collections.abc import MutableMapping, MutableSequence
@@ -90,6 +93,18 @@ def _make_request(self):
9093
patch_data = self.kwargs.copy()
9194
patch_data.update(self.api.extra_request_params)
9295
return session.patch(url=self.url, json=patch_data)
96+
elif self.product in get_feeds_products_list():
97+
parameters = deepcopy(self.kwargs)
98+
parameters.pop("output_format", None)
99+
parameters.pop(
100+
"format", None
101+
)  # For some unknown reason, "format" is populated for feeds endpoints even when it is not included in the CLI params, so we need to remove it. Happens only when using the CLI.
102+
headers = {}
103+
if self.kwargs.get("output_format", OutputFormat.JSONL.value) == OutputFormat.CSV.value:
104+
parameters["headers"] = int(bool(self.kwargs.get("headers", False)))
105+
headers["accept"] = HEADER_ACCEPT_KEY_CSV_FORMAT
106+
107+
return session.get(url=self.url, params=parameters, headers=headers, **self.api.extra_request_params)
93108
else:
94109
return session.get(url=self.url, params=self.kwargs, **self.api.extra_request_params)
95110

@@ -259,6 +274,32 @@ def json(self):
259274
**self.kwargs,
260275
)
261276

277+
@property
def jsonl(self):
    """Build a new object of this same class that asks for the "jsonl" format.

    Any "format" entry is dropped from ``self.kwargs`` (in place) before the
    remaining keyword arguments are handed to the new object, together with
    this object's product, url, items_path, response_path and api.
    """
    self.kwargs.pop("format", None)
    result_cls = self.__class__
    carried_over = dict(
        product=self.product,
        url=self.url,
        items_path=self.items_path,
        response_path=self.response_path,
        api=self.api,
    )
    return result_cls(format="jsonl", **carried_over, **self.kwargs)
289+
290+
@property
def csv(self):
    """Build a new object of this same class that asks for the "csv" format.

    Any "format" entry is dropped from ``self.kwargs`` (in place) before the
    remaining keyword arguments are handed to the new object, together with
    this object's product, url, items_path, response_path and api.
    """
    self.kwargs.pop("format", None)
    result_cls = self.__class__
    carried_over = dict(
        product=self.product,
        url=self.url,
        items_path=self.items_path,
        response_path=self.response_path,
        api=self.api,
    )
    return result_cls(format="csv", **carried_over, **self.kwargs)
302+
262303
@property
263304
def xml(self):
264305
self.kwargs.pop("format", None)

domaintools/cli/api.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Optional, Dict, Tuple
99
from rich.progress import Progress, SpinnerColumn, TextColumn
1010

11+
from domaintools.constants import Endpoint, OutputFormat
1112
from domaintools.api import API
1213
from domaintools.exceptions import ServiceException
1314
from domaintools.cli.utils import get_file_extension
@@ -32,6 +33,20 @@ def validate_format_input(value: str):
3233
raise typer.BadParameter(f"{value} is not in available formats: {VALID_FORMATS}")
3334
return value
3435

36+
@staticmethod
def validate_feeds_format_input(value: str):
    """Return ``value`` unchanged when it is one of the feeds output formats.

    Raises:
        typer.BadParameter: if ``value`` is neither "jsonl" nor "csv".
    """
    allowed_formats = ("jsonl", "csv")
    if value in allowed_formats:
        return value
    raise typer.BadParameter(f"{value} is not in available formats: {allowed_formats}")
42+
43+
@staticmethod
def validate_endpoint_input(value: str):
    """Return ``value`` unchanged when it names one of the known endpoints.

    Raises:
        typer.BadParameter: if ``value`` is neither ``Endpoint.FEED.value``
            nor ``Endpoint.DOWNLOAD.value``.
    """
    allowed_endpoints = (Endpoint.FEED.value, Endpoint.DOWNLOAD.value)
    if value in allowed_endpoints:
        return value
    raise typer.BadParameter(f"{value} is not in available endpoints: {allowed_endpoints}")
49+
3550
@staticmethod
3651
def validate_after_or_before_input(value: str):
3752
if value is None or value.replace("-", "").isdigit():
@@ -152,7 +167,13 @@ def run(cls, name: str, params: Optional[Dict] = {}, **kwargs):
152167
"""
153168
try:
154169
rate_limit = params.pop("rate_limit", False)
155-
response_format = params.pop("format", "json")
170+
response_format = (
171+
params.pop("format", "json")
172+
if params.get("format", None)
173+
else params.get(
174+
"output_format", OutputFormat.JSONL.value
175+
) # Using output_format for RTUF endpoints to separate from other endpoints. This will be needed further along the process
176+
)
156177
out_file = params.pop("out_file", sys.stdout)
157178
verify_ssl = params.pop("no_verify_ssl", False)
158179
always_sign_api_key = params.pop("no_sign_api_key", False)

domaintools/cli/commands/feeds.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from domaintools.cli.api import DTCLICommand
77
from domaintools.cli.utils import get_cli_helptext_by_name
88
from domaintools.cli import constants as c
9+
from domaintools.constants import Endpoint, OutputFormat
910

1011

1112
@dt_cli.command(
@@ -158,6 +159,13 @@ def feeds_domainrdap(
158159
"--no-sign-api-key",
159160
help="Skip signing of api key",
160161
),
162+
endpoint: str = typer.Option(
163+
Endpoint.FEED.value,
164+
"-e",
165+
"--endpoint",
166+
help=f"Valid endpoints: [{Endpoint.FEED.value}, {Endpoint.DOWNLOAD.value}]",
167+
callback=DTCLICommand.validate_endpoint_input,
168+
),
161169
sessionID: str = typer.Option(
162170
None,
163171
"--session-id",
@@ -188,3 +196,78 @@ def feeds_domainrdap(
188196
),
189197
):
190198
DTCLICommand.run(name=c.FEEDS_DOMAINRDAP, params=ctx.params)
199+
200+
201+
# CLI command for the domain discovery feed: every option declared below ends
# up in ``ctx.params``, which is passed as-is to ``DTCLICommand.run``.
@dt_cli.command(
    name=c.FEEDS_DOMAINDISCOVERY,
    help=get_cli_helptext_by_name(command_name=c.FEEDS_DOMAINDISCOVERY),
)
def feeds_domaindiscovery(
    ctx: typer.Context,
    # --- credentials / transport options ---
    user: str = typer.Option(None, "-u", "--user", help="Domaintools API Username."),
    key: str = typer.Option(None, "-k", "--key", help="DomainTools API key"),
    creds_file: str = typer.Option(
        "~/.dtapi", "-c", "--credfile", help="Optional file with API username and API key, one per line."
    ),
    no_verify_ssl: bool = typer.Option(
        False, "--no-verify-ssl", help="Skip verification of SSL certificate when making HTTPs API calls"
    ),
    no_sign_api_key: bool = typer.Option(False, "--no-sign-api-key", help="Skip signing of api key"),
    # --- output format and endpoint selection (both validated by callbacks) ---
    output_format: str = typer.Option(
        "jsonl",
        "-f",
        "--format",
        callback=DTCLICommand.validate_feeds_format_input,
        help=f"Output format in [{OutputFormat.JSONL.value}, {OutputFormat.CSV.value}]",
    ),
    endpoint: str = typer.Option(
        Endpoint.FEED.value,
        "-e",
        "--endpoint",
        callback=DTCLICommand.validate_endpoint_input,
        help=f"Valid endpoints: [{Endpoint.FEED.value}, {Endpoint.DOWNLOAD.value}]",
    ),
    # --- feed query parameters ---
    sessionID: str = typer.Option(None, "--session-id", help="Unique identifier for the session"),
    after: str = typer.Option(
        None,
        "--after",
        callback=DTCLICommand.validate_after_or_before_input,
        help="Start of the time window, relative to the current time in seconds, for which data will be provided",
    ),
    before: str = typer.Option(
        None,
        "--before",
        callback=DTCLICommand.validate_after_or_before_input,
        help="The end of the query window in seconds, relative to the current time, inclusive",
    ),
    domain: str = typer.Option(None, "-d", "--domain", help="A string value used to filter feed results"),
    headers: bool = typer.Option(
        False,
        "--headers",
        help="Adds a header to the first line of response when text/csv is set in header parameters",
    ),
    top: str = typer.Option(
        None,
        "--top",
        help="Number of results to return in the response payload. This is ignored in download endpoint",
    ),
):
    DTCLICommand.run(name=c.FEEDS_DOMAINDISCOVERY, params=ctx.params)

domaintools/cli/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,4 @@
4747
FEEDS_NAD = "nad"
4848
FEEDS_NOD = "nod"
4949
FEEDS_DOMAINRDAP = "domainrdap"
50+
FEEDS_DOMAINDISCOVERY = "domaindiscovery"

domaintools/constants.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from enum import Enum
2+
3+
4+
class Endpoint(Enum):
    """Names of the feeds endpoints; each value is the string form of the endpoint."""

    FEED = "feed"
    DOWNLOAD = "download"
7+
8+
9+
class Source(Enum):
    """Source identifiers that the endpoints are mapped to."""

    API = "api"
    S3 = "s3"
12+
13+
14+
class OutputFormat(Enum):
    """Output formats that can be requested for feeds."""

    JSONL = "jsonl"
    CSV = "csv"
17+
18+
19+
# Media type used as the "accept" header value when CSV output is requested.
HEADER_ACCEPT_KEY_CSV_FORMAT = "text/csv"


# Lookup from an endpoint's string value to the Source member associated with it.
ENDPOINT_TO_SOURCE_MAP = {
    Endpoint.FEED.value: Source.API,
    Endpoint.DOWNLOAD.value: Source.S3,
}

domaintools/utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from datetime import datetime
2-
32
from typing import Optional
43

4+
from domaintools.constants import Endpoint, OutputFormat
5+
56
import re
67

78

@@ -176,4 +177,19 @@ def get_feeds_products_list():
176177
"newly-active-domains-feed-(api)",
177178
"newly-observed-domains-feed-(api)",
178179
"domain-registration-data-access-protocol-feed-(api)",
180+
"domain-registration-data-access-protocol-feed-(s3)",
181+
"real-time-domain-discovery-feed-(api)",
182+
"real-time-domain-discovery-feed-(s3)",
179183
]
184+
185+
186+
def validate_feeds_parameters(params):
    """Validate the keyword parameters of a feeds request.

    Args:
        params: mapping of request parameters. It is only read, never modified.

    Raises:
        ValueError: if none of "sessionID", "after" or "before" has a truthy
            value, or if the CSV output format is requested together with the
            download endpoint.
    """
    if not any(params.get(name) for name in ("sessionID", "after", "before")):
        raise ValueError("sessionID or after or before must be defined")

    # Named ``output_format`` rather than ``format`` so the builtin is not shadowed.
    output_format = params.get("output_format")
    if params.get("endpoint") == Endpoint.DOWNLOAD.value and output_format == OutputFormat.CSV.value:
        raise ValueError(f"{output_format} format is not available in {Endpoint.DOWNLOAD.value} API.")

domaintools_async/__init__.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
"""Adds async capabilities to the base product object"""
22

33
import asyncio
4+
5+
from copy import deepcopy
46
from httpx import AsyncClient
57

68
from domaintools.base_results import Results
7-
8-
from domaintools.exceptions import ServiceUnavailableException, ServiceException
9+
from domaintools.constants import OutputFormat, HEADER_ACCEPT_KEY_CSV_FORMAT
10+
from domaintools.exceptions import ServiceUnavailableException
11+
from domaintools.utils import get_feeds_products_list
912

1013

1114
class _AIter(object):
@@ -49,6 +52,17 @@ async def _make_async_request(self, session):
4952
patch_data = self.kwargs.copy()
5053
patch_data.update(self.api.extra_request_params)
5154
results = await session.patch(url=self.url, json=patch_data)
55+
elif self.product in get_feeds_products_list():
56+
parameters = deepcopy(self.kwargs)
57+
parameters.pop("output_format", None)
58+
parameters.pop(
59+
"format", None
60+
)  # For some unknown reason, "format" is populated for feeds endpoints even when it is not included in the CLI params, so we need to remove it. Happens only when using the CLI.
61+
headers = {}
62+
if self.kwargs.get("output_format", OutputFormat.JSONL.value) == OutputFormat.CSV.value:
63+
parameters["headers"] = int(bool(self.kwargs.get("headers", False)))
64+
headers["accept"] = HEADER_ACCEPT_KEY_CSV_FORMAT
65+
results = await session.get(url=self.url, params=parameters, headers=headers, **self.api.extra_request_params)
5266
else:
5367
results = await session.get(url=self.url, params=self.kwargs, **self.api.extra_request_params)
5468
if results:

0 commit comments

Comments
 (0)