urllib.parse for getting the registered domain part only #135519

Description

@hazho

Feature or enhancement

Proposal:

from urllib.parse import urlsplit
import urllib.error
import urllib.request

_psl_cache = None # This will store our parsed Public Suffix List rules.

def _fetch_and_parse_psl():
	"""Fetch the Public Suffix List, parse it, and cache it in memory.

	This internal function is called only when the cache is empty.
	"""
	global _psl_cache
	psl_url = "https://publicsuffix.org/list/public_suffix_list.dat" # any other safely public lists can be used
	print("Fetching and caching Public Suffix List...") # This message will only appear once per process.
	rules = {'exceptions': set(), 'wildcards': set(), 'rules': set()}
	try:
		with urllib.request.urlopen(psl_url) as response:
			if response.status != 200: # Defensive: urlopen raises HTTPError for most non-2xx responses.
				print(f"Error: Failed to fetch PSL file. Status: {response.status}")
				_psl_cache = {} # Cache empty dict on failure to prevent retries on every call.
				return _psl_cache
			for line in response.read().decode("utf-8").splitlines():
				line = line.strip() # Remove leading/trailing whitespace.
				if not line or line.startswith("//"): continue  # Ignore empty lines and comments.
				if line.startswith("!"): rules['exceptions'].add(line[1:])  # Handle exception rules like '!metro.tokyo.jp'.
				elif line.startswith("*."): rules['wildcards'].add(line[2:]) # Handle wildcard rules like '*.ar'.
				else: rules['rules'].add(line) # Handle normal rules like 'com' or 'co.uk'.
		_psl_cache = rules # Store the parsed rules in our cache.
	except urllib.error.URLError as e:
		print(f"Error: Could not fetch Public Suffix List. {e}")
		_psl_cache = {} # Cache empty dict on failure.
	return _psl_cache
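
# For illustration (not part of the proposal): given these sample PSL lines
#     com
#     co.uk
#     *.kawasaki.jp
#     !city.kawasaki.jp
# the parser above would cache a structure shaped like:
#     {'rules': {'com', 'co.uk'},
#      'wildcards': {'kawasaki.jp'},
#      'exceptions': {'city.kawasaki.jp'}}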

def get_registered_domain(host):
	""" Extracts the registered domain from a host using a cached Public Suffix List. It avoids external libraries by fetching the list directly.
    Args: host (str): A hostname, with or without port (e.g., 'sub.blog.company.co.uk:8080').
    Returns: str or None: The registered domain (e.g., 'company.co.uk') or None if it cannot be determined.
	"""
	if _psl_cache is None: _fetch_and_parse_psl() # # Check if the cache is empty. If so, fetch and parse the list.
	# 1. Clean the hostname (remove port, convert to lowercase)
	hostname = urlsplit(f"//{host}").hostname or host
	hostname = hostname.lower()
	parts = hostname.split('.')
	# 2. Iterate through parts to find the longest matching public suffix
	for i in range(len(parts)):
		possible_suffix = ".".join(parts[i:])
		# The PSL algorithm states that an exception rule negates a standard rule.
		# For example, city.kawasaki.jp is an exception to *.kawasaki.jp.
		if possible_suffix in _psl_cache.get('exceptions', set()):
			# This is an exception, so the public suffix is the next part.
			# e.g., for 'city.kawasaki.jp', the suffix is 'kawasaki.jp'.
			public_suffix = ".".join(parts[i+1:])
			break
		# Check if the suffix matches a wildcard rule or a normal rule.
		# A wildcard rule '*.X' makes any single label under X a public suffix,
		# so the remainder after this part is checked against the wildcard set.
		is_wildcard = ".".join(parts[i+1:]) in _psl_cache.get('wildcards', set())
		is_rule = possible_suffix in _psl_cache.get('rules', set())
		if is_wildcard or is_rule:
			public_suffix = possible_suffix
			break
	else:
		# If no specific rule is found, fall back to the TLD.
		if len(parts) > 1: public_suffix = parts[-1]
		else: return hostname # Cannot determine a suffix, return the original host.
	# 3. The registered domain is the public suffix plus one preceding part.
	public_suffix_parts_count = len(public_suffix.split('.'))
	if len(parts) > public_suffix_parts_count:
		registered_domain_parts_count = public_suffix_parts_count + 1
		return ".".join(parts[-registered_domain_parts_count:])
	return hostname
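
A brief usage sketch (assuming the PSL rules current at the time of writing; the first call triggers the one-time fetch, and any port is stripped automatically):

print(get_registered_domain("sub.blog.company.co.uk:8080")) # company.co.uk
print(get_registered_domain("city.kawasaki.jp")) # city.kawasaki.jp (PSL exception rule)
print(get_registered_domain("example.com")) # example.com
print(get_registered_domain("localhost")) # localhost (no matching suffix; hostname returned)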

Has this already been discussed elsewhere?

This is a minor feature, which does not require prior discussion elsewhere.

Links to previous discussion of this feature:

I have no link; I hope you understand that I did not have enough time to go to Discourse and discuss this, though I think it is really needed. The final decision is yours, of course. Here is the explanation:
While there are external libraries offering similar functionality, installing a dependency for a simple feature like this is discouraged; if the language itself provided it, that would reduce carbon emissions and storage consumption, among many other benefits (see the code and its comments for details).

Labels: stdlib (Standard Library Python modules in the Lib/ directory), type-feature (A feature request or enhancement)
