Skip to content

Commit 18df15f

Browse files
authored
WIP: Add missing protocols in reference with go-multiaddr (#94)
* Added http-path protocol * Added tests for http-path * Added tests for http-path * Fix HTTP path URL encoding to match Go implementation - Update http_path codec to use quote(s, safe=) for consistent URL encoding - Remove redundant URL encoding from transforms.py to prevent double encoding - Update all HTTP path tests to expect URL-encoded values consistently - Fix protocol tests to use same URL encoding approach as codec - Ensure cross-language compatibility with Go multiaddr implementation Fixes test failures in: - test_http_path_url_encoding - test_http_path_edge_cases - test_http_path_bytes_string_roundtrip - test_http_path_special_characters All 251 tests now pass. * Add missing HTTP path tests and fix protocol parsing - Add test_http_path_only_reads_http_path_part: Test that http-path only reads its own part, not subsequent protocols - Add test_http_path_malformed_percent_escape: Test rejection of malformed percent-escapes like %f - Add test_http_path_raw_value_access: Test accessing raw unescaped values (similar to Go's SplitLast/RawValue) - Fix http-path protocol parsing: Remove http-path from special path handling in _from_string - Fix IS_PATH = False for http-path codec (should not consume all remaining parts) - Fix string_to_bytes to handle protocols with SIZE=0 (like p2p-circuit) - Fix codec SIZE attribute check to handle codecs without SIZE attribute These changes ensure the Python implementation matches Go behavior exactly: - http-path only consumes its immediate value, not subsequent protocols - Proper handling of flag protocols (SIZE=0) in string-to-bytes conversion - Complete test coverage matching Go implementation All 254 tests pass, ensuring cross-language compatibility. * Added news fragment --------- Co-authored-by: acul71
1 parent 8bace85 commit 18df15f

File tree

7 files changed

+278
-10
lines changed

7 files changed

+278
-10
lines changed

multiaddr/codecs/http_path.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import re
2+
from typing import Any
3+
from urllib.parse import quote, unquote
4+
5+
from ..codecs import CodecBase
6+
from ..exceptions import BinaryParseError, StringParseError
7+
8+
IS_PATH = False
9+
SIZE = -1 # LengthPrefixedVarSize
10+
11+
12+
class Codec(CodecBase):
    """
    Codec for the value of the ``http-path`` protocol.

    The string form is a percent-encoded path; the binary form is the
    percent-decoded bytes. Conversion is done at byte level in both
    directions so that any percent-escape round-trips unchanged, including
    escapes that do not form valid UTF-8.
    """

    # Mirror the module-level constants so the settings are also reachable
    # through the class.
    SIZE = SIZE
    IS_PATH = IS_PATH

    def to_bytes(self, proto: Any, string: str) -> bytes:
        """
        Convert a percent-encoded HTTP path string to its binary form.

        :param proto: protocol descriptor (not used by this codec)
        :param string: percent-encoded path, e.g. ``"foo%2Fbar"``
        :return: the percent-decoded bytes (never empty)
        :raises StringParseError: if a ``%`` is not followed by two hex
            digits, if the string cannot be encoded, or if the decoded
            path is empty
        """
        # Imported locally: only ``quote``/``unquote`` are imported at file level.
        from urllib.parse import unquote_to_bytes

        # Every "%" must start a complete escape ("%XX"). This rejects
        # "%zz", "%f" and a lone or trailing "%" alike.
        if re.search(r"%(?![0-9A-Fa-f]{2})", string):
            raise StringParseError("Invalid percent-escape in path", string)

        # Decode straight to bytes. Going through ``unquote`` would replace
        # escapes that are not valid UTF-8 (e.g. "%FF") with U+FFFD and
        # silently store different bytes than the caller wrote.
        try:
            unescaped = unquote_to_bytes(string)
        except UnicodeEncodeError as err:
            # e.g. lone surrogates that cannot be encoded as UTF-8
            raise StringParseError("Invalid HTTP path string", string) from err

        if not unescaped:
            raise StringParseError("empty http path is not allowed", string)

        return unescaped

    def to_string(self, proto: Any, buf: bytes) -> str:
        """
        Convert the binary form of an HTTP path to its percent-encoded string.

        :param proto: protocol descriptor (not used by this codec)
        :param buf: raw path bytes
        :return: the path with every byte outside the unreserved set
            (letters, digits and ``_.-~``) percent-encoded
        :raises BinaryParseError: if ``buf`` is empty
        """
        if len(buf) == 0:
            raise BinaryParseError("Empty http path is not allowed", buf, "http-path")

        # ``quote`` accepts bytes directly; for valid UTF-8 this yields the
        # same text as decoding first, and it does not fail on other bytes.
        return quote(bytes(buf), safe="")

    def validate(self, b: bytes) -> None:
        """
        Validate an HTTP path buffer.

        The only requirement is that the buffer is non-empty.

        :param b: raw path bytes
        :raises ValueError: if ``b`` is empty
        """
        if len(b) == 0:
            raise ValueError("Empty http path is not allowed")

multiaddr/multiaddr.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -354,12 +354,12 @@ def _from_string(self, addr: str) -> None:
354354
continue
355355

356356
# Special handling for unix paths
357-
if part == "unix":
357+
if part in ("unix",):
358358
try:
359359
# Get the next part as the path value
360-
unix_path_value = next(parts)
361-
if not unix_path_value:
362-
raise exceptions.StringParseError("empty unix path", addr)
360+
protocol_path_value = next(parts)
361+
if not protocol_path_value:
362+
raise exceptions.StringParseError("empty protocol path", addr)
363363

364364
# Join any remaining parts as part of the path
365365
remaining_parts = []
@@ -373,16 +373,16 @@ def _from_string(self, addr: str) -> None:
373373
break
374374

375375
if remaining_parts:
376-
unix_path_value = unix_path_value + "/" + "/".join(remaining_parts)
376+
protocol_path_value = protocol_path_value + "/" + "/".join(remaining_parts)
377377

378-
proto = protocol_with_name("unix")
378+
proto = protocol_with_name(part)
379379
codec = codec_by_name(proto.codec)
380380
if not codec:
381381
raise exceptions.StringParseError(f"unknown codec: {proto.codec}", addr)
382382

383383
try:
384384
self._bytes += varint.encode(proto.code)
385-
buf = codec.to_bytes(proto, unix_path_value)
385+
buf = codec.to_bytes(proto, protocol_path_value)
386386
# Add length prefix for variable-sized or zero-sized codecs
387387
if codec.SIZE <= 0:
388388
self._bytes += varint.encode(len(buf))

multiaddr/protocols.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def __repr__(self) -> str:
158158
Protocol(P_QUIC1, "quic-v1", None),
159159
Protocol(P_HTTP, "http", None),
160160
Protocol(P_HTTPS, "https", None),
161+
Protocol(P_HTTP_PATH, "http-path", "http_path"),
161162
Protocol(P_TLS, "tls", None),
162163
Protocol(P_WS, "ws", None),
163164
Protocol(P_WSS, "wss", None),

multiaddr/transforms.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ def string_to_bytes(string: str) -> bytes:
2727
logger.debug(f"[DEBUG string_to_bytes] Encoded protocol code: {encoded_code}")
2828
bs.append(encoded_code)
2929

30-
# Special case: protocols with codec=None are flag protocols
30+
# Special case: protocols with codec=None or SIZE=0 are flag protocols
3131
# (no value, no length prefix, no buffer)
32-
if codec is None:
32+
if codec is None or getattr(codec, "SIZE", None) == 0:
3333
logger.debug(
3434
f"[DEBUG string_to_bytes] Protocol {proto.name} has no data, "
3535
"skipping value encoding"
@@ -93,6 +93,7 @@ def bytes_to_string(buf: bytes) -> str:
9393
value = codec.to_string(proto, bs.read(size))
9494
logger.debug(f"[DEBUG] bytes_to_string: proto={proto.name}, value='{value}'")
9595
if codec.IS_PATH and value.startswith("/"):
96+
# For path protocols, the codec already handles URL encoding
9697
strings.append("/" + proto.name + value) # type: ignore[arg-type]
9798
else:
9899
strings.append("/" + proto.name + "/" + value) # type: ignore[arg-type]

newsfragments/94.feature.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added the ``http-path`` protocol, following the go-multiaddr reference implementation.

tests/test_multiaddr.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from multiaddr.multiaddr import Multiaddr
1111
from multiaddr.protocols import (
1212
P_DNS,
13+
P_HTTP_PATH,
1314
P_IP4,
1415
P_IP6,
1516
P_P2P,
@@ -825,3 +826,158 @@ def test_memory_protocol_properties():
825826
assert proto.code == 777
826827
assert proto.name == "memory"
827828
assert proto.codec == "memory"
829+
830+
831+
def test_http_path_multiaddr_roundtrip():
    """An http-path multiaddr stringifies back to its input and exposes its value."""
    addresses = (
        "/http-path/foo",
        "/http-path/foo%2Fbar",  # percent-encoded "/"
        "/http-path/api%2Fv1%2Fusers",  # several percent-encoded "/"
    )

    for address in addresses:
        parsed = Multiaddr(address)
        # Parsing and re-serialising must not alter the text.
        assert str(parsed) == address
        # The protocol value is the address with the protocol prefix removed.
        without_prefix = address.replace("/http-path/", "")
        assert parsed.value_for_protocol(P_HTTP_PATH) == without_prefix
846+
847+
848+
def test_http_path_url_encoding():
    """Already percent-encoded paths keep their encoding in the string form."""
    # Each pair is (suffix appended to "/http-path", suffix expected back).
    encoded_pairs = [
        ("/foo%20bar", "/foo%20bar"),
        ("/path%2Fwith%2Fspecial%21%40%23", "/path%2Fwith%2Fspecial%21%40%23"),
        (
            "/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
            "/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
        ),
        ("/tmp%2Fbar", "/tmp%2Fbar"),
    ]

    for suffix, expected_suffix in encoded_pairs:
        rendered = str(Multiaddr(f"/http-path{suffix}"))
        # The rendered address must show the percent-encoded path.
        assert rendered == f"/http-path{expected_suffix}"
868+
869+
870+
def test_http_path_in_complex_multiaddr():
    """http-path works as the last component of a longer protocol chain."""
    # Maps the full address to the http-path value it should carry.
    expected_by_address = {
        "/ip4/127.0.0.1/tcp/443/tls/http/http-path/api%2Fv1": "api%2Fv1",
        "/ip4/127.0.0.1/tcp/80/http/http-path/static%2Fcss": "static%2Fcss",
        "/dns/example.com/tcp/443/tls/http/http-path/docs": "docs",
    }

    for address, wanted in expected_by_address.items():
        parsed = Multiaddr(address)
        assert str(parsed) == address
        # Only the http-path component's value is returned.
        assert parsed.value_for_protocol(P_HTTP_PATH) == wanted
885+
886+
887+
def test_http_path_error_cases():
    """Invalid http-path addresses are rejected with StringParseError."""
    rejected = (
        "/http-path/",  # empty path
        "/http-path",  # path value missing entirely
        "/http-path/invalid%zz",  # malformed percent-escape
    )

    for address in rejected:
        with pytest.raises(StringParseError):
            Multiaddr(address)
901+
902+
903+
def test_http_path_value_extraction():
    """value_for_protocol returns the percent-encoded http-path value."""
    expected_by_address = {
        "/http-path/foo": "foo",
        "/http-path/foo%2Fbar": "foo%2Fbar",
        "/http-path/api%2Fv1%2Fusers": "api%2Fv1%2Fusers",
        "/ip4/127.0.0.1/tcp/80/http/http-path/docs": "docs",
    }

    for address, wanted in expected_by_address.items():
        extracted = Multiaddr(address).value_for_protocol(P_HTTP_PATH)
        assert extracted == wanted
916+
917+
918+
def test_http_path_edge_cases():
    """Percent-encoded spaces, slashes, unicode and symbols survive parsing."""
    encoded_paths = (
        "path%20with%20spaces",
        "path%2Fwith%2Fmultiple%2Fslashes",
        "path%2Fwith%2Funicode%2F%E6%B5%8B%E8%AF%95",
        "path%2Fwith%2Fsymbols%21%40%23%24%25%5E%26%2A%28%29",
    )

    for encoded in encoded_paths:
        parsed = Multiaddr(f"/http-path/{encoded}")
        # The value must come back in exactly the encoded form supplied.
        assert parsed.value_for_protocol(P_HTTP_PATH) == encoded
934+
935+
936+
def test_http_path_only_reads_http_path_part():
    """http-path consumes only its own value, not the protocols that follow it."""
    # In "/http-path/tmp%2Fbar/p2p-circuit" the value is "tmp%2Fbar";
    # "/p2p-circuit" is a separate component and must not be absorbed.
    address = "/http-path/tmp%2Fbar/p2p-circuit"
    parsed = Multiaddr(address)

    assert parsed.value_for_protocol(P_HTTP_PATH) == "tmp%2Fbar"

    # Both components are still present in the full string form.
    assert str(parsed) == address
950+
951+
952+
def test_http_path_malformed_percent_escape():
    """An incomplete percent-escape such as a trailing ``%f`` is rejected."""
    # Same input as the Go case: "%f" lacks its second hex digit.
    incomplete_escape = "/http-path/thisIsMissingAfullByte%f"

    with pytest.raises(StringParseError, match="Invalid percent-escape"):
        Multiaddr(incomplete_escape)
960+
961+
962+
def test_http_path_raw_value_access():
    """The raw path can be recovered by unquoting the encoded protocol value."""
    from urllib.parse import quote, unquote

    parsed = Multiaddr("/http-path/tmp%2Fbar")

    # value_for_protocol yields the percent-encoded form.
    encoded = parsed.value_for_protocol(P_HTTP_PATH)
    assert encoded == "tmp%2Fbar"

    # Unquoting it gives the raw value (comparable to Go's RawValue()).
    raw = unquote(encoded)
    assert raw == "tmp/bar"

    # Re-encoding the raw value reproduces the encoded form.
    assert quote(raw, safe="") == encoded

tests/test_protocols.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import varint
33

44
from multiaddr import Multiaddr, exceptions, protocols
5-
from multiaddr.codecs import memory
5+
from multiaddr.codecs import http_path, memory
66
from multiaddr.exceptions import BinaryParseError
77

88

@@ -269,3 +269,52 @@ def test_memory_integration_invalid_values():
269269
# Too large (overflow > uint64)
270270
with pytest.raises(ValueError):
271271
Multiaddr(f"/memory/{2**64}")
272+
273+
274+
def test_http_path_bytes_string_roundtrip():
    """to_bytes followed by to_string returns the original percent-encoded text."""
    from urllib.parse import quote

    codec = http_path.Codec()
    plain_paths = ("/foo", "/foo/bar", "/a b", "/こんにちは", "/path/with/special!@#")

    for plain in plain_paths:
        # Encode the same way the codec does so the round trip is comparable.
        encoded = quote(plain, safe="")
        binary = codec.to_bytes(None, encoded)
        assert isinstance(binary, bytes)
        assert codec.to_string(None, binary) == encoded
287+
288+
289+
def test_http_path_empty_string_raises():
    """Encoding an empty path string raises a ValueError."""
    empty_text = ""
    with pytest.raises(ValueError):
        http_path.Codec().to_bytes(None, empty_text)
293+
294+
295+
def test_http_path_empty_bytes_raises():
    """Decoding an empty buffer raises BinaryParseError."""
    empty_buffer = b""
    with pytest.raises(BinaryParseError):
        http_path.Codec().to_string(None, empty_buffer)
299+
300+
301+
def test_http_path_special_characters():
    """A path with a space and non-ASCII text round-trips through the codec."""
    from urllib.parse import quote

    # Encode the same way the codec does.
    encoded = quote("/foo bar/あいうえお", safe="")

    codec = http_path.Codec()
    binary = codec.to_bytes(None, encoded)
    decoded_again = codec.to_string(None, binary)

    assert decoded_again == encoded
310+
311+
312+
def test_http_path_validate_function():
    """validate accepts a non-empty buffer and rejects an empty one."""
    validate = http_path.Codec().validate

    # A non-empty buffer passes without raising.
    validate(b"/valid/path")

    # An empty buffer is refused.
    with pytest.raises(ValueError):
        validate(b"")

0 commit comments

Comments
 (0)