Skip to content

Commit 170d8df

Browse files
committed
add tests and unicode handling.
1 parent bc057d4 commit 170d8df

File tree

10 files changed

+824
-32
lines changed

10 files changed

+824
-32
lines changed

CLAUDE.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,14 @@ This is a Python library for managing hosts files (`/etc/hosts` on Unix, `C:\Win
5353
- Validate IPv4/IPv6 addresses and hostnames
5454
- Handle comments and blank lines
5555
- Support for duplicate name detection with `allow_name_dupliction` parameter
56+
- **Unicode Support**: Full Unicode support for hostnames and comments (Python 2.7 & 3.x compatible)
57+
- **IDN Support**: Automatic conversion of internationalized domain names to ASCII-compatible encoding
5658

5759
### Module Structure
5860
- `python_hosts/hosts.py` - Main classes (`Hosts`, `HostsEntry`)
5961
- `python_hosts/utils.py` - Utility functions for validation (`is_ipv4`, `is_ipv6`, `valid_hostnames`)
6062
- `python_hosts/exception.py` - Custom exceptions (`HostsException`, `InvalidIPv4Address`, etc.)
63+
- `python_hosts/unicode_utils.py` - Unicode compatibility utilities for Python 2/3 support
6164
- `python_hosts/__init__.py` - Package exports
6265

6366
### Testing

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@
114114
# The theme to use for HTML and HTML Help pages. See the documentation for
115115
# a list of builtin themes.
116116
# html_theme = 'alabaster'
117-
html_theme = "sphinx_rtd_theme"
117+
html_theme = "furo"
118118

119119
# Theme options are theme-specific and customize the look and feel of a theme
120120
# further. For a list of options available for each theme, see the

python_hosts/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,7 @@
1818
from python_hosts.exception import (HostsException, HostsEntryException, # noqa: F401
1919
InvalidIPv4Address, InvalidIPv6Address,
2020
InvalidComment)
21+
from python_hosts.unicode_utils import (ensure_text, ensure_binary, # noqa: F401
22+
normalize_hostname, normalize_comment)
2123

2224
name = "python_hosts"

python_hosts/hosts.py

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
of the HostsEntry class.
1212
"""
1313

14+
from __future__ import unicode_literals
1415
import sys
1516

1617
try:
@@ -21,6 +22,9 @@
2122
dedupe_list)
2223
from python_hosts.exception import (InvalidIPv6Address, InvalidIPv4Address,
2324
UnableToWriteHosts)
25+
from python_hosts.unicode_utils import (ensure_text, ensure_binary, safe_open,
26+
normalize_hostname, normalize_comment,
27+
text_type, string_types)
2428

2529

2630
class HostsEntry(object):
@@ -61,10 +65,16 @@ def __init__(self,
6165
if not is_ipv6(address):
6266
raise InvalidIPv6Address()
6367

64-
self.entry_type = entry_type
65-
self.address = address
66-
self.comment = comment
67-
self.names = names
68+
# Normalize all string inputs to Unicode
69+
self.entry_type = ensure_text(entry_type) if entry_type else entry_type
70+
self.address = ensure_text(address) if address else address
71+
self.comment = normalize_comment(comment) if comment else comment
72+
73+
# Normalize hostnames to Unicode and handle IDN
74+
if names:
75+
self.names = [normalize_hostname(name) for name in names]
76+
else:
77+
self.names = names
6878

6979
def is_real_entry(self):
7080
return self.entry_type in ('ipv4', 'ipv6')
@@ -98,16 +108,18 @@ def get_entry_type(hosts_entry=None):
98108
:param hosts_entry: A line from the hosts file
99109
:return: 'comment' | 'blank' | 'ipv4' | 'ipv6'
100110
"""
101-
if hosts_entry and isinstance(hosts_entry, str):
111+
if hosts_entry and isinstance(hosts_entry, string_types):
112+
# Ensure the entry is Unicode text
113+
hosts_entry = ensure_text(hosts_entry)
102114
entry = hosts_entry.strip()
103115
if not entry or not entry[0] or entry[0] == "\n":
104116
return 'blank'
105117
if entry[0] == "#":
106118
return 'comment'
107119
entry_chunks = entry.split()
108-
if is_ipv6(entry_chunks[0]):
120+
if entry_chunks and is_ipv6(entry_chunks[0]):
109121
return 'ipv6'
110-
if is_ipv4(entry_chunks[0]):
122+
if entry_chunks and is_ipv4(entry_chunks[0]):
111123
return 'ipv4'
112124

113125
@staticmethod
@@ -117,14 +129,17 @@ def str_to_hostentry(entry):
117129
:param entry: A line from the hosts file
118130
:return: An instance of HostsEntry
119131
"""
132+
# Ensure the entry is Unicode text
133+
entry = ensure_text(entry)
134+
120135
split_line = entry.split('#', 1)
121136
line = split_line[0].strip().split()
122137
inline_comment = split_line[1].strip() if len(split_line) == 2 else None
123138

124-
if is_ipv4(line[0]) and valid_hostnames(line[1:]):
139+
if line and is_ipv4(line[0]) and valid_hostnames(line[1:]):
125140
return HostsEntry('ipv4', address=line[0], names=line[1:],
126141
comment=inline_comment)
127-
if is_ipv6(line[0]) and valid_hostnames(line[1:]):
142+
if line and is_ipv6(line[0]) and valid_hostnames(line[1:]):
128143
return HostsEntry('ipv6', address=line[0], names=line[1:],
129144
comment=inline_comment)
130145
return False
@@ -206,7 +221,7 @@ def write(self, path=None, mode='w'):
206221
}
207222
output_file_path = path if path else self.path
208223
try:
209-
with open(output_file_path, mode) as hosts_file:
224+
with safe_open(output_file_path, mode, encoding='utf-8') as hosts_file:
210225
for entry in self.entries:
211226
if entry.entry_type == 'comment':
212227
hosts_file.write(entry.comment + "\n")
@@ -303,13 +318,19 @@ def import_url(self, url=None, force=None):
303318
:param url: The URL of where to download a hosts file
304319
:return: Counts reflecting the attempted additions
305320
"""
306-
file_contents = self.get_hosts_by_url(url=url).decode('utf-8')
321+
file_contents = self.get_hosts_by_url(url=url)
322+
# Handle both Python 2 and 3 URL content
323+
if hasattr(file_contents, 'decode'):
324+
file_contents = file_contents.decode('utf-8')
325+
file_contents = ensure_text(file_contents)
307326
file_contents = file_contents.rstrip().replace('^M', '\n')
308327
file_contents = file_contents.rstrip().replace('\r\n', '\n')
309328
lines = file_contents.split('\n')
310329
skipped = 0
311330
import_entries = []
312331
for line in lines:
332+
# Ensure each line is Unicode text
333+
line = ensure_text(line)
313334
stripped_entry = line.strip()
314335
if (not stripped_entry) or (stripped_entry.startswith('#')):
315336
skipped += 1
@@ -337,8 +358,10 @@ def import_file(self, import_file_path=None):
337358
invalid_count = 0
338359
if is_readable(import_file_path):
339360
import_entries = []
340-
with open(import_file_path, 'r') as infile:
361+
with safe_open(import_file_path, 'r', encoding='utf-8') as infile:
341362
for line in infile:
363+
# Ensure line is Unicode text
364+
line = ensure_text(line)
342365
stripped_entry = line.strip()
343366
if (not stripped_entry) or (stripped_entry.startswith('#')):
344367
skipped += 1
@@ -463,9 +486,11 @@ def populate_entries(self):
463486
:return: None
464487
"""
465488
try:
466-
with open(self.path, 'r') as hosts_file:
489+
with safe_open(self.path, 'r', encoding='utf-8') as hosts_file:
467490
hosts_entries = [line for line in hosts_file]
468491
for hosts_entry in hosts_entries:
492+
# Ensure line is Unicode text
493+
hosts_entry = ensure_text(hosts_entry)
469494
entry_type = HostsEntry.get_entry_type(hosts_entry)
470495
if entry_type == "comment":
471496
hosts_entry = hosts_entry.replace("\r", "")

python_hosts/unicode_utils.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Unicode compatibility utilities for Python 2.7 and 3.x support.
4+
5+
This module provides utilities to handle Unicode strings consistently
6+
across Python versions while maintaining backward compatibility.
7+
"""
8+
9+
import sys
10+
11+
# Python 2/3 compatibility
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

# Select the text/bytes aliases once; the helpers below are written purely
# in terms of these aliases so they only need to be defined a single time.
if PY3:
    text_type = str
    binary_type = bytes
    string_types = (str,)
else:  # Python 2
    text_type = unicode  # noqa: F821 - only evaluated on Python 2
    binary_type = str
    string_types = (basestring,)  # noqa: F821 - only evaluated on Python 2


def ensure_text(s, encoding='utf-8', errors='strict'):
    """
    Coerce ``s`` to the interpreter's Unicode text type.

    :param s: Value to convert (text, bytes or any other object)
    :param encoding: Codec used to decode binary input (default: utf-8)
    :param errors: Decoding error handling strategy (default: strict)
    :return: ``s`` itself when it is already text, the decoded value when
        it is binary, otherwise ``text_type(s)``
    """
    if isinstance(s, text_type):
        return s
    if isinstance(s, binary_type):
        return s.decode(encoding, errors)
    # Anything else (numbers, None, ...) is stringified.
    return text_type(s)


def ensure_binary(s, encoding='utf-8', errors='strict'):
    """
    Coerce ``s`` to the interpreter's binary string type.

    :param s: Value to convert (text, bytes or any other object)
    :param encoding: Codec used to encode text input (default: utf-8)
    :param errors: Encoding error handling strategy (default: strict)
    :return: ``s`` itself when it is already binary, the encoded value when
        it is text, otherwise a binary rendering of ``s``
    """
    if isinstance(s, binary_type):
        return s
    if isinstance(s, text_type):
        return s.encode(encoding, errors)
    # The fallback differs per branch: under PY3 the object is stringified
    # and then encoded, otherwise it is passed straight to binary_type.
    if PY3:
        return text_type(s).encode(encoding, errors)
    return binary_type(s)
60+
61+
62+
def safe_open(file_path, mode='r', encoding='utf-8', errors='strict'):
    """
    Open ``file_path`` so that text modes yield Unicode on Python 2 and 3.

    :param file_path: Path to the file
    :param mode: File mode ('r', 'w', 'rb', ...)
    :param encoding: Text encoding applied in text modes (default: utf-8)
    :param errors: Error handling strategy applied in text modes
    :return: File object; ``encoding`` and ``errors`` are not used when
        ``mode`` contains 'b'
    """
    # Binary modes go straight to the builtin, whatever the Python version.
    if 'b' in mode:
        return open(file_path, mode)

    if PY3:
        return open(file_path, mode, encoding=encoding, errors=errors)

    # Python 2: text-mode decoding is delegated to codecs.open.
    import codecs
    return codecs.open(file_path, mode, encoding=encoding, errors=errors)
83+
84+
85+
def normalize_hostname(hostname):
    """
    Lowercase and trim a hostname, then convert it to its IDNA form.

    :param hostname: Hostname string (text or bytes); falsy values are
        returned unchanged
    :return: The hostname as text in ASCII-compatible (IDNA) encoding, or
        the lowercased, stripped text when IDNA encoding fails
    """
    if not hostname:
        return hostname

    cleaned = ensure_text(hostname).lower().strip()

    try:
        # Internationalized names become their ASCII-compatible encoding.
        return cleaned.encode('idna').decode('ascii')
    except UnicodeError:
        # UnicodeError also covers UnicodeDecodeError (its subclass).
        # Best effort: fall back to the cleaned Unicode text.
        return cleaned
110+
111+
112+
def normalize_comment(comment):
    """
    Convert a comment to Unicode text and trim surrounding whitespace.

    :param comment: Comment string (text or bytes); falsy values are
        returned unchanged
    :return: The comment as Unicode text with leading/trailing whitespace
        removed; internal spacing is left untouched
    """
    if comment:
        return ensure_text(comment).strip()
    return comment
129+
130+
131+
def is_unicode_string(s):
    """
    Tell whether ``s`` is an instance of the Unicode text type.

    :param s: Value to check
    :return: True when ``s`` is an instance of ``text_type``, else False
    """
    return isinstance(s, text_type)
139+
140+
141+
def to_native_string(s):
    """
    Convert ``s`` to the native string type of the running interpreter.

    When ``PY3`` is set the value is always converted to Unicode text.
    Otherwise Unicode text is encoded to an ASCII byte string when that is
    possible and kept as Unicode when it is not; any non-text value is
    returned unchanged.

    :param s: String to convert
    :return: Native string type
    """
    if PY3:
        return ensure_text(s)

    # Python 2 path: only Unicode text is a candidate for conversion.
    if not isinstance(s, text_type):
        return s
    try:
        return s.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII text cannot become a native str; keep it as Unicode.
        return s

0 commit comments

Comments
 (0)