Skip to content

Commit d11e9e1

Browse files
authored
Merge pull request #14 from Moineau54/dev
Error rework
2 parents 5c28f6f + 17ce300 commit d11e9e1

File tree

3 files changed

+225
-129
lines changed

3 files changed

+225
-129
lines changed

extractors/js_extractor.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ def __init__(self):
2222

2323
def extract_embedded_domains(self, js_code, url, exceptions, verbose=False):
2424
"""
25-
Extract domains from JavaScript code using regex.
25+
Extract domains from any code using regex.
2626
2727
Args:
2828
js_code (list): List of JavaScript code strings to analyze
@@ -36,8 +36,9 @@ def extract_embedded_domains(self, js_code, url, exceptions, verbose=False):
3636
domains = []
3737

3838
# Pattern to extract domains from URLs
39-
pattern = r'https?:\/\/(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})'
40-
39+
# pattern = r'https?:\/\/(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})'
40+
pattern = r'https?:\/\/(?:[a-zA-Z0-9-]{1,10}\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})'
41+
4142
# Get base domain for comparison
4243
try:
4344
base_domain_parts = url.split('.')[1] if len(url.split('.')) > 1 else ''
@@ -52,13 +53,12 @@ def extract_embedded_domains(self, js_code, url, exceptions, verbose=False):
5253

5354
# Deduplicate findings
5455
for item in found_domains:
55-
if item not in domains and item not in unique_findings:
56+
if item not in domains and item not in unique_findings and item not in exceptions and item.__contains__(base_domain_parts) == False:
5657
unique_findings.append(item)
5758

5859
# Filter and add domains
5960
for domain in unique_findings:
60-
if (domain not in domains and
61-
base_domain_parts not in domain and
61+
if (domain not in domains and domain.__contains__(base_domain_parts) == False and
6262
domain not in exceptions):
6363
domains.append(domain)
6464
if verbose:

0 commit comments

Comments
 (0)