@@ -22,7 +22,7 @@ def __init__(self):
2222
2323 def extract_embedded_domains (self , js_code , url , exceptions , verbose = False ):
2424 """
25- Extract domains from JavaScript code using regex.
25+ Extract domains from any code using regex.
2626
2727 Args:
2828 js_code (list): List of JavaScript code strings to analyze
@@ -36,8 +36,9 @@ def extract_embedded_domains(self, js_code, url, exceptions, verbose=False):
3636 domains = []
3737
3838 # Pattern to extract domains from URLs
39- pattern = r'https?:\/\/(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})'
40-
39+ # pattern = r'https?:\/\/(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})'
40+ pattern = r'https?:\/\/(?:[a-zA-Z0-9-]{1,10}\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})'
41+
4142 # Get base domain for comparison
4243 try :
4344 base_domain_parts = url .split ('.' )[1 ] if len (url .split ('.' )) > 1 else ''
@@ -52,13 +53,12 @@ def extract_embedded_domains(self, js_code, url, exceptions, verbose=False):
5253
5354 # Deduplicate findings
5455 for item in found_domains :
55- if item not in domains and item not in unique_findings :
56+ if item not in domains and item not in unique_findings and item not in exceptions and item . __contains__ ( base_domain_parts ) == False :
5657 unique_findings .append (item )
5758
5859 # Filter and add domains
5960 for domain in unique_findings :
60- if (domain not in domains and
61- base_domain_parts not in domain and
61+ if (domain not in domains and domain .__contains__ (base_domain_parts ) == False and
6262 domain not in exceptions ):
6363 domains .append (domain )
6464 if verbose :
0 commit comments