diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml index 74f3ea8..3b4a189 100644 --- a/.github/workflows/tox.yml +++ b/.github/workflows/tox.yml @@ -28,12 +28,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Set TOXENV run: | - python_version="${{ matrix.python-version }}" - toxenv="${{ matrix.toxenv }}" - if [[ "$toxenv" == "docs" ]]; then - echo "TOXENV=docs" | tee -a "$GITHUB_ENV" + if [[ "${{ matrix.toxenv }}" == "docs" ]]; then + echo "TOXENV=docs" >> "$GITHUB_ENV" else - echo "TOXENV=py${python_version}-${toxenv}" | tr -d '.' | tee -a "$GITHUB_ENV" + python_version="${{ matrix.python-version }}" + echo "TOXENV=py${python_version//./}-${{ matrix.toxenv }}" >> "$GITHUB_ENV" fi - run: | python -m pip install --upgrade pip @@ -58,8 +57,7 @@ jobs: shell: bash run: | python_version="${{ matrix.python-version }}" - toxenv="${{ matrix.toxenv }}" - echo "TOXENV=py${python_version}-${toxenv}" | tr -d '.' | tee -a "$GITHUB_ENV" + echo "TOXENV=py${python_version//./}-${{ matrix.toxenv }}" >> "$GITHUB_ENV" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 5748eca..7bdce28 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -96,6 +96,33 @@ Ready to contribute? Here's how to set up `multibase` for local development. If you installed pre-commit hooks (step 4), they will run automatically on commit. +Development Workflow Commands +------------------------------- + +The project provides several ``make`` targets to help with development: + +* ``make fix`` - Automatically fix formatting and linting issues using ruff. + Use this when you want to auto-fix code style issues. + +* ``make lint`` - Run all pre-commit hooks on all files to check for code quality + issues. This includes YAML/TOML validation, trailing whitespace checks, pyupgrade, + ruff linting and formatting, and mypy type checking. + +* ``make typecheck`` - Run mypy type checking only. 
Use this when you want to + quickly check for type errors without running all other checks. + +* ``make test`` - Run the test suite with pytest using the default Python version. + For testing across multiple Python versions, use ``tox`` instead. + +* ``make pr`` - Run a complete pre-PR check: clean build artifacts, fix formatting, + run linting, type checking, and tests. This is the recommended command to run + before submitting a pull request. + +* ``make coverage`` - Run tests with coverage reporting and open the HTML report + in your browser. + +For a full list of available commands, run ``make help``. + 7. Commit your changes and push your branch to GitHub:: $ git add . diff --git a/README.rst b/README.rst index e80181a..c3e0b15 100644 --- a/README.rst +++ b/README.rst @@ -61,6 +61,29 @@ Sample Usage >>> decode(encode('base2', b'hello world')) b'hello world' + >>> # Using reusable Encoder/Decoder classes + >>> from multibase import Encoder, Decoder + >>> encoder = Encoder('base64') + >>> encoded1 = encoder.encode('data1') + >>> encoded2 = encoder.encode('data2') + + >>> decoder = Decoder() + >>> decoded = decoder.decode(encoded1) + + >>> # Getting encoding information + >>> from multibase import get_encoding_info, list_encodings, is_encoding_supported + >>> info = get_encoding_info('base64') + >>> print(info.encoding, info.code) + base64 b'm' + >>> all_encodings = list_encodings() + >>> is_encoding_supported('base64') + True + + >>> # Decode with encoding return + >>> encoding, data = decode(encoded1, return_encoding=True) + >>> print(f'Encoded with {encoding}: {data}') + Encoded with base64: b'data1' + Supported codecs ================ @@ -69,14 +92,22 @@ Supported codecs * base8 * base10 * base16 -* base16 -* base16 +* base16upper * base32hex +* base32hexupper +* base32hexpad +* base32hexpadupper * base32 +* base32upper +* base32pad +* base32padupper * base32z * base36 * base36upper * base58flickr * base58btc * base64 +* base64pad * base64url +* base64urlpad +* base256emoji diff --git 
a/multibase/__init__.py b/multibase/__init__.py index dc94527..fb85a46 100644 --- a/multibase/__init__.py +++ b/multibase/__init__.py @@ -4,4 +4,23 @@ __email__ = "dhruv@dhruvb.com" __version__ = "1.0.3" -from .multibase import ENCODINGS, Encoding, decode, encode, get_codec, is_encoded # noqa: F401 +from .exceptions import ( # noqa: F401 + DecodingError, + InvalidMultibaseStringError, + MultibaseError, + UnsupportedEncodingError, +) +from .multibase import ( # noqa: F401 + ENCODINGS, + ComposedDecoder, + Decoder, + Encoder, + Encoding, + decode, + encode, + get_codec, + get_encoding_info, + is_encoded, + is_encoding_supported, + list_encodings, +) diff --git a/multibase/converters.py b/multibase/converters.py index f44d8eb..4202658 100644 --- a/multibase/converters.py +++ b/multibase/converters.py @@ -28,8 +28,28 @@ def decode(self, bytes): class Base16StringConverter(BaseStringConverter): + def __init__(self, digits): + super().__init__(digits) + self.uppercase = digits.isupper() + def encode(self, bytes): - return ensure_bytes("".join([f"{byte:02x}" for byte in bytes])) + result = "".join([f"{byte:02x}" for byte in bytes]) + if self.uppercase: + result = result.upper() + return ensure_bytes(result) + + def decode(self, data): + # Base16 decode is case-insensitive, normalize to our digits case + if isinstance(data, bytes): + data_str = data.decode("utf-8") + else: + data_str = data + # Convert to match our digits case + if self.uppercase: + data_str = data_str.upper() + else: + data_str = data_str.lower() + return super().decode(data_str.encode("utf-8")) class BaseByteStringConverter: @@ -37,8 +57,9 @@ class BaseByteStringConverter: ENCODING_BITS = 1 DECODING_BITS = 1 - def __init__(self, digits): + def __init__(self, digits, pad=False): self.digits = digits + self.pad = pad def _chunk_with_padding(self, iterable, n, fillvalue=None): "Collect data into fixed-length chunks or blocks" @@ -49,9 +70,11 @@ def _chunk_with_padding(self, iterable, n, fillvalue=None): def 
_chunk_without_padding(self, iterable, n): return map("".join, zip(*[iter(iterable)] * n)) - def _encode_bytes(self, bytes_, group_bytes, encoding_bits, decoding_bits): + def _encode_bytes(self, bytes_, group_bytes, encoding_bits, decoding_bits, output_chars): buffer = BytesIO(bytes_) encoded_bytes = BytesIO() + input_length = len(bytes_) + while True: byte_ = buffer.read(group_bytes) if not byte_: @@ -67,9 +90,26 @@ def _encode_bytes(self, bytes_, group_bytes, encoding_bits, decoding_bits): # convert binary representation to an integer encoded_bytes.write(ensure_bytes(self.digits[digit])) - return encoded_bytes.getvalue() + result = encoded_bytes.getvalue() + + # Add padding if needed (RFC 4648) + if self.pad: + remainder = input_length % group_bytes + if remainder > 0: + # For partial groups, we need to pad the output + # The padding makes the output length a multiple of output_chars + chars_produced = len(result) + # Calculate padding needed to reach next multiple of output_chars + padding_needed = output_chars - (chars_produced % output_chars) + result += ensure_bytes("=" * padding_needed) + + return result def _decode_bytes(self, bytes_, group_bytes, decoding_bits, encoding_bits): + # Remove padding if present + if self.pad: + bytes_ = bytes_.rstrip(b"=") + buffer = BytesIO() decoded_bytes = BytesIO() @@ -104,7 +144,7 @@ def decode(self, bytes): class Base64StringConverter(BaseByteStringConverter): def encode(self, bytes): - return self._encode_bytes(ensure_bytes(bytes), 3, 8, 6) + return self._encode_bytes(ensure_bytes(bytes), 3, 8, 6, 4) def decode(self, bytes): return self._decode_bytes(ensure_bytes(bytes), 4, 6, 8) @@ -112,12 +152,110 @@ def decode(self, bytes): class Base32StringConverter(BaseByteStringConverter): def encode(self, bytes): - return self._encode_bytes(ensure_bytes(bytes), 5, 8, 5) + return self._encode_bytes(ensure_bytes(bytes), 5, 8, 5, 8) def decode(self, bytes): return self._decode_bytes(ensure_bytes(bytes), 8, 5, 8) +class 
Base256EmojiConverter: + """Base256 emoji encoding using 256 unique emoji characters. + + This implementation uses the exact same hardcoded emoji alphabet as + js-multiformats and go-multibase reference implementations to ensure + full compatibility. The alphabet is curated from Unicode emoji frequency + data, excluding modifier-based emojis (such as flags) that are bigger + than one single code point. + """ + + # Hardcoded emoji alphabet matching js-multiformats and go-multibase + # This is the exact same alphabet used in reference implementations + # Source: js-multiformats/src/bases/base256emoji.ts and go-multibase/base256emoji.go + _EMOJI_ALPHABET = ( + "๐Ÿš€๐Ÿชโ˜„๐Ÿ›ฐ๐ŸŒŒ" # Space + "๐ŸŒ‘๐ŸŒ’๐ŸŒ“๐ŸŒ”๐ŸŒ•๐ŸŒ–๐ŸŒ—๐ŸŒ˜" # Moon + "๐ŸŒ๐ŸŒ๐ŸŒŽ" # Earth + "๐Ÿ‰" # Dragon + "โ˜€" # Sun + "๐Ÿ’ป๐Ÿ–ฅ๐Ÿ’พ๐Ÿ’ฟ" # Computer + # Rest from Unicode emoji frequency data (most used first) + "๐Ÿ˜‚โค๐Ÿ˜๐Ÿคฃ๐Ÿ˜Š๐Ÿ™๐Ÿ’•๐Ÿ˜ญ๐Ÿ˜˜๐Ÿ‘" + "๐Ÿ˜…๐Ÿ‘๐Ÿ˜๐Ÿ”ฅ๐Ÿฅฐ๐Ÿ’”๐Ÿ’–๐Ÿ’™๐Ÿ˜ข๐Ÿค”" + "๐Ÿ˜†๐Ÿ™„๐Ÿ’ช๐Ÿ˜‰โ˜บ๐Ÿ‘Œ๐Ÿค—๐Ÿ’œ๐Ÿ˜”๐Ÿ˜Ž" + "๐Ÿ˜‡๐ŸŒน๐Ÿคฆ๐ŸŽ‰๐Ÿ’žโœŒโœจ๐Ÿคท๐Ÿ˜ฑ๐Ÿ˜Œ" + "๐ŸŒธ๐Ÿ™Œ๐Ÿ˜‹๐Ÿ’—๐Ÿ’š๐Ÿ˜๐Ÿ’›๐Ÿ™‚๐Ÿ’“๐Ÿคฉ" + "๐Ÿ˜„๐Ÿ˜€๐Ÿ–ค๐Ÿ˜ƒ๐Ÿ’ฏ๐Ÿ™ˆ๐Ÿ‘‡๐ŸŽถ๐Ÿ˜’๐Ÿคญ" + "โฃ๐Ÿ˜œ๐Ÿ’‹๐Ÿ‘€๐Ÿ˜ช๐Ÿ˜‘๐Ÿ’ฅ๐Ÿ™‹๐Ÿ˜ž๐Ÿ˜ฉ" + "๐Ÿ˜ก๐Ÿคช๐Ÿ‘Š๐Ÿฅณ๐Ÿ˜ฅ๐Ÿคค๐Ÿ‘‰๐Ÿ’ƒ๐Ÿ˜ณโœ‹" + "๐Ÿ˜š๐Ÿ˜๐Ÿ˜ด๐ŸŒŸ๐Ÿ˜ฌ๐Ÿ™ƒ๐Ÿ€๐ŸŒท๐Ÿ˜ป๐Ÿ˜“" + "โญโœ…๐Ÿฅบ๐ŸŒˆ๐Ÿ˜ˆ๐Ÿค˜๐Ÿ’ฆโœ”๐Ÿ˜ฃ๐Ÿƒ" + "๐Ÿ’โ˜น๐ŸŽŠ๐Ÿ’˜๐Ÿ˜ โ˜๐Ÿ˜•๐ŸŒบ๐ŸŽ‚๐ŸŒป" + "๐Ÿ˜๐Ÿ–•๐Ÿ’๐Ÿ™Š๐Ÿ˜น๐Ÿ—ฃ๐Ÿ’ซ๐Ÿ’€๐Ÿ‘‘๐ŸŽต" + "๐Ÿคž๐Ÿ˜›๐Ÿ”ด๐Ÿ˜ค๐ŸŒผ๐Ÿ˜ซโšฝ๐Ÿค™โ˜•๐Ÿ†" + "๐Ÿคซ๐Ÿ‘ˆ๐Ÿ˜ฎ๐Ÿ™†๐Ÿป๐Ÿƒ๐Ÿถ๐Ÿ’๐Ÿ˜ฒ๐ŸŒฟ" + "๐Ÿงก๐ŸŽโšก๐ŸŒž๐ŸŽˆโŒโœŠ๐Ÿ‘‹๐Ÿ˜ฐ๐Ÿคจ" + "๐Ÿ˜ถ๐Ÿค๐Ÿšถ๐Ÿ’ฐ๐Ÿ“๐Ÿ’ข๐ŸคŸ๐Ÿ™๐Ÿšจ๐Ÿ’จ" + "๐Ÿคฌโœˆ๐ŸŽ€๐Ÿบ๐Ÿค“๐Ÿ˜™๐Ÿ’Ÿ๐ŸŒฑ๐Ÿ˜–๐Ÿ‘ถ" + "๐Ÿฅดโ–ถโžกโ“๐Ÿ’Ž๐Ÿ’ธโฌ‡๐Ÿ˜จ๐ŸŒš๐Ÿฆ‹" + "๐Ÿ˜ท๐Ÿ•บโš ๐Ÿ™…๐Ÿ˜Ÿ๐Ÿ˜ต๐Ÿ‘Ž๐Ÿคฒ๐Ÿค ๐Ÿคง" + "๐Ÿ“Œ๐Ÿ”ต๐Ÿ’…๐Ÿง๐Ÿพ๐Ÿ’๐Ÿ˜—๐Ÿค‘๐ŸŒŠ๐Ÿคฏ" + "๐Ÿทโ˜Ž๐Ÿ’ง๐Ÿ˜ฏ๐Ÿ’†๐Ÿ‘†๐ŸŽค๐Ÿ™‡๐Ÿ‘โ„" + "๐ŸŒด๐Ÿ’ฃ๐Ÿธ๐Ÿ’Œ๐Ÿ“๐Ÿฅ€๐Ÿคข๐Ÿ‘…๐Ÿ’ก๐Ÿ’ฉ" + "๐Ÿ‘๐Ÿ“ธ๐Ÿ‘ป๐Ÿค๐Ÿคฎ๐ŸŽผ๐Ÿฅต๐Ÿšฉ๐ŸŽ๐ŸŠ" + "๐Ÿ‘ผ๐Ÿ’๐Ÿ“ฃ๐Ÿฅ‚" + ) + + def __init__(self): + # Verify alphabet length + if 
len(self._EMOJI_ALPHABET) != 256: + raise ValueError(f"EMOJI_ALPHABET must contain exactly 256 characters, got {len(self._EMOJI_ALPHABET)}") + # Create mapping from byte value to emoji character + self.byte_to_emoji = {i: self._EMOJI_ALPHABET[i] for i in range(256)} + # Create reverse mapping from emoji character to byte value + # This matches the approach in js-multiformats and go-multibase + self.emoji_to_byte = {emoji: byte for byte, emoji in self.byte_to_emoji.items()} + + def encode(self, bytes_) -> bytes: + """Encode bytes to emoji string. + + :param bytes_: Bytes to encode + :type bytes_: bytes or str + :return: UTF-8 encoded emoji string + :rtype: bytes + """ + bytes_ = ensure_bytes(bytes_) + result = [] + for byte_val in bytes_: + result.append(self.byte_to_emoji[byte_val]) + return "".join(result).encode("utf-8") + + def decode(self, bytes_) -> bytes: + """Decode emoji string to bytes. + + Decodes character-by-character, matching the behavior of js-multiformats + and go-multibase reference implementations. Each emoji in the alphabet + is a single Unicode code point, so we can safely iterate character by + character. 
+ + :param bytes_: UTF-8 encoded emoji string + :type bytes_: bytes or str + :return: Decoded bytes + :rtype: bytes + :raises ValueError: if an invalid emoji character is encountered + """ + bytes_ = ensure_bytes(bytes_, "utf8") + # Decode UTF-8 to get emoji string + emoji_str = bytes_.decode("utf-8") + result = bytearray() + # Iterate character by character (Python string iteration handles + # single code point emojis correctly, matching js-multiformats and go-multibase) + for char in emoji_str: + if char not in self.emoji_to_byte: + raise ValueError(f"Non-base256emoji character: {char}") + result.append(self.emoji_to_byte[char]) + return bytes(result) + + class IdentityConverter: def encode(self, x): return x diff --git a/multibase/exceptions.py b/multibase/exceptions.py new file mode 100644 index 0000000..f609045 --- /dev/null +++ b/multibase/exceptions.py @@ -0,0 +1,25 @@ +"""Custom exceptions for multibase encoding/decoding errors.""" + + +class MultibaseError(ValueError): + """Base exception for all multibase errors.""" + + pass + + +class UnsupportedEncodingError(MultibaseError): + """Raised when an encoding is not supported.""" + + pass + + +class InvalidMultibaseStringError(MultibaseError): + """Raised when a multibase string is invalid or cannot be decoded.""" + + pass + + +class DecodingError(MultibaseError): + """Raised when decoding fails.""" + + pass diff --git a/multibase/multibase.py b/multibase/multibase.py index 3007b99..9447cbb 100644 --- a/multibase/multibase.py +++ b/multibase/multibase.py @@ -6,9 +6,15 @@ Base16StringConverter, Base32StringConverter, Base64StringConverter, + Base256EmojiConverter, BaseStringConverter, IdentityConverter, ) +from .exceptions import ( + DecodingError, + InvalidMultibaseStringError, + UnsupportedEncodingError, +) Encoding = namedtuple("Encoding", "encoding,code,converter") CODE_LENGTH = 1 @@ -18,19 +24,37 @@ Encoding("base8", b"7", BaseStringConverter("01234567")), Encoding("base10", b"9", 
BaseStringConverter("0123456789")), Encoding("base16", b"f", Base16StringConverter("0123456789abcdef")), + Encoding("base16upper", b"F", Base16StringConverter("0123456789ABCDEF")), Encoding("base32hex", b"v", Base32StringConverter("0123456789abcdefghijklmnopqrstuv")), + Encoding("base32hexupper", b"V", Base32StringConverter("0123456789ABCDEFGHIJKLMNOPQRSTUV")), + Encoding("base32hexpad", b"t", Base32StringConverter("0123456789abcdefghijklmnopqrstuv", pad=True)), + Encoding("base32hexpadupper", b"T", Base32StringConverter("0123456789ABCDEFGHIJKLMNOPQRSTUV", pad=True)), Encoding("base32", b"b", Base32StringConverter("abcdefghijklmnopqrstuvwxyz234567")), + Encoding("base32upper", b"B", Base32StringConverter("ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")), + Encoding("base32pad", b"c", Base32StringConverter("abcdefghijklmnopqrstuvwxyz234567", pad=True)), + Encoding("base32padupper", b"C", Base32StringConverter("ABCDEFGHIJKLMNOPQRSTUVWXYZ234567", pad=True)), Encoding("base32z", b"h", BaseStringConverter("ybndrfg8ejkmcpqxot1uwisza345h769")), Encoding("base36", b"k", BaseStringConverter("0123456789abcdefghijklmnopqrstuvwxyz")), Encoding("base36upper", b"K", BaseStringConverter("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")), Encoding("base58flickr", b"Z", BaseStringConverter("123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ")), Encoding("base58btc", b"z", BaseStringConverter("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")), Encoding("base64", b"m", Base64StringConverter("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")), + Encoding( + "base64pad", + b"M", + Base64StringConverter("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", pad=True), + ), Encoding( "base64url", b"u", Base64StringConverter("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"), ), + Encoding( + "base64urlpad", + b"U", + Base64StringConverter("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_", pad=True), + ), + 
Encoding("base256emoji", "๐Ÿš€".encode(), Base256EmojiConverter()), ] ENCODINGS_LOOKUP = {} @@ -48,13 +72,13 @@ def encode(encoding, data): :type data: str or bytes :return: multibase encoded data :rtype: bytes - :raises ValueError: if the encoding is not supported + :raises UnsupportedEncodingError: if the encoding is not supported """ data = ensure_bytes(data, "utf8") try: return ENCODINGS_LOOKUP[encoding].code + ENCODINGS_LOOKUP[encoding].converter.encode(data) except KeyError: - raise ValueError(f"Encoding {encoding} not supported.") + raise UnsupportedEncodingError(f"Encoding {encoding} not supported.") def get_codec(data): @@ -64,13 +88,21 @@ def get_codec(data): :param data: multibase encoded data :type data: str or bytes :return: the :py:obj:`multibase.Encoding` object for the data's codec - :raises ValueError: if the codec is not supported + :raises InvalidMultibaseStringError: if the codec is not supported """ + data = ensure_bytes(data, "utf8") + # Check for base256emoji first (4-byte UTF-8 prefix) + if len(data) >= 4: + emoji_prefix = data[:4] + if emoji_prefix in ENCODINGS_LOOKUP: + return ENCODINGS_LOOKUP[emoji_prefix] + + # Check for single-byte prefixes try: - key = ensure_bytes(data[:CODE_LENGTH], "utf8") + key = data[:CODE_LENGTH] codec = ENCODINGS_LOOKUP[key] except KeyError: - raise ValueError(f"Can not determine encoding for {data}") + raise InvalidMultibaseStringError(f"Can not determine encoding for {data}") else: return codec @@ -87,19 +119,173 @@ def is_encoded(data): try: get_codec(data) return True - except ValueError: + except (ValueError, InvalidMultibaseStringError): return False -def decode(data): +def is_encoding_supported(encoding): + """ + Check if an encoding is supported. + + :param encoding: encoding name to check + :type encoding: str + :return: True if encoding is supported, False otherwise + :rtype: bool + """ + return encoding in ENCODINGS_LOOKUP + + +def list_encodings(): + """ + List all supported encodings. 
+ + :return: list of encoding names + :rtype: list + """ + return [enc.encoding for enc in ENCODINGS] + + +def get_encoding_info(encoding): + """ + Get information about a specific encoding. + + :param encoding: encoding name + :type encoding: str + :return: Encoding namedtuple with encoding, code, and converter + :rtype: Encoding + :raises UnsupportedEncodingError: if encoding is not supported + """ + if encoding not in ENCODINGS_LOOKUP: + raise UnsupportedEncodingError(f"Encoding {encoding} not supported.") + return ENCODINGS_LOOKUP[encoding] + + +def decode(data, return_encoding=False): """ Decode the multibase decoded data + :param data: multibase encoded data :type data: str or bytes - :return: decoded data - :rtype: str - :raises ValueError: if the data is not multibase encoded + :param return_encoding: if True, return tuple (encoding, decoded_data) + :type return_encoding: bool + :return: decoded data, or tuple (encoding, decoded_data) if return_encoding=True + :rtype: bytes or tuple + :raises InvalidMultibaseStringError: if the data is not multibase encoded + :raises DecodingError: if decoding fails """ data = ensure_bytes(data, "utf8") - codec = get_codec(data) - return codec.converter.decode(data[CODE_LENGTH:]) + try: + codec = get_codec(data) + # Handle base256emoji which has a 4-byte prefix + prefix_length = len(codec.code) + decoded = codec.converter.decode(data[prefix_length:]) + if return_encoding: + return (codec.encoding, decoded) + return decoded + except (InvalidMultibaseStringError, UnsupportedEncodingError): + # Re-raise these specific exceptions as-is since they already provide + # appropriate context about what went wrong (invalid format or unsupported encoding) + raise + except Exception as e: + # Wrap all other exceptions (e.g., converter errors, invalid data) + # in DecodingError to provide consistent error handling + raise DecodingError(f"Failed to decode multibase data: {e}") from e + + +class Encoder: + """Reusable encoder for a 
specific encoding.""" + + def __init__(self, encoding): + """ + Initialize an encoder for a specific encoding. + + :param encoding: encoding name to use + :type encoding: str + :raises UnsupportedEncodingError: if encoding is not supported + """ + if encoding not in ENCODINGS_LOOKUP: + raise UnsupportedEncodingError(f"Encoding {encoding} not supported.") + self.encoding = encoding + self._codec = ENCODINGS_LOOKUP[encoding] + + def encode(self, data): + """ + Encode data using this encoder's encoding. + + :param data: data to encode + :type data: str or bytes + :return: multibase encoded data + :rtype: bytes + """ + data = ensure_bytes(data, "utf8") + return self._codec.code + self._codec.converter.encode(data) + + +class Decoder: + """Reusable decoder for multibase data.""" + + def __init__(self): + """Initialize a decoder.""" + pass + + def decode(self, data, return_encoding=False): + """ + Decode multibase encoded data. + + :param data: multibase encoded data + :type data: str or bytes + :param return_encoding: if True, return tuple (encoding, decoded_data) + :type return_encoding: bool + :return: decoded data, or tuple (encoding, decoded_data) if return_encoding=True + :rtype: bytes or tuple + :raises InvalidMultibaseStringError: if the data is not multibase encoded + :raises DecodingError: if decoding fails + """ + return decode(data, return_encoding=return_encoding) + + def or_(self, other_decoder): + """ + Compose this decoder with another, trying this one first. + + This allows trying multiple decoders in sequence. + + :param other_decoder: another decoder to try if this one fails + :type other_decoder: Decoder + :return: a composed decoder + :rtype: ComposedDecoder + """ + return ComposedDecoder([self, other_decoder]) + + +class ComposedDecoder: + """A decoder that tries multiple decoders in sequence.""" + + def __init__(self, decoders): + """ + Initialize a composed decoder. 
+ + :param decoders: list of decoders to try in order + :type decoders: list + """ + self.decoders = decoders + + def decode(self, data, return_encoding=False): + """ + Try to decode with each decoder in sequence. + + :param data: multibase encoded data + :type data: str or bytes + :param return_encoding: if True, return tuple (encoding, decoded_data) + :type return_encoding: bool + :return: decoded data, or tuple (encoding, decoded_data) if return_encoding=True + :rtype: bytes or tuple + :raises DecodingError: if all decoders fail + """ + last_error = None + for decoder in self.decoders: + try: + return decoder.decode(data, return_encoding=return_encoding) + except (InvalidMultibaseStringError, DecodingError) as e: + last_error = e + continue + raise DecodingError(f"All decoders failed. Last error: {last_error}") from last_error diff --git a/newsfragments/18.internal.rst b/newsfragments/18.internal.rst index abbfcde..a44c043 100644 --- a/newsfragments/18.internal.rst +++ b/newsfragments/18.internal.rst @@ -1 +1 @@ -Modernized project setup and infrastructure. Migrated from legacy setup.py/setup.cfg to modern pyproject.toml, replaced Travis CI with GitHub Actions, updated Python version support to 3.10-3.14, replaced flake8 with ruff, and added pre-commit hooks. This is an internal change that does not affect the public API. +Modernized project infrastructure: migrated to pyproject.toml, replaced Travis CI with GitHub Actions, updated Python support to 3.10-3.14, replaced flake8 with ruff, and added pre-commit hooks. This is an internal change that does not affect the public API. diff --git a/newsfragments/20.feature.rst b/newsfragments/20.feature.rst new file mode 100644 index 0000000..1f9008e --- /dev/null +++ b/newsfragments/20.feature.rst @@ -0,0 +1,24 @@ +Added complete multibase encoding support and enhanced API features. 
+ +**New Encodings (10 total):** + +- base16upper (prefix F) - Uppercase hexadecimal encoding +- base32upper (prefix B) - Uppercase base32 encoding +- base32pad (prefix c) - Base32 with RFC 4648 padding +- base32padupper (prefix C) - Base32 uppercase with padding +- base32hexupper (prefix V) - Base32hex uppercase variant +- base32hexpad (prefix t) - Base32hex with RFC 4648 padding +- base32hexpadupper (prefix T) - Base32hex uppercase with padding +- base64pad (prefix M) - Base64 with RFC 4648 padding +- base64urlpad (prefix U) - Base64url with padding +- base256emoji (prefix 🚀) - Emoji-based encoding + +**API Enhancements:** + +- Added ``Encoder`` and ``Decoder`` classes for reusable encoding/decoding +- Added a ``return_encoding`` parameter to ``decode()`` that returns the encoding name along with the decoded data +- Added structured exception classes: ``MultibaseError`` (base class), ``UnsupportedEncodingError``, ``InvalidMultibaseStringError``, ``DecodingError`` +- Added encoding metadata functions: ``get_encoding_info()``, ``list_encodings()``, ``is_encoding_supported()`` +- Added decoder composition support via ``Decoder.or_()`` method + +This brings py-multibase to 100% encoding coverage (24/24 encodings) matching reference implementations (go-multibase, rust-multibase, js-multiformats). 
diff --git a/tests/test_multibase.py b/tests/test_multibase.py index b50c00a..77bad99 100644 --- a/tests/test_multibase.py +++ b/tests/test_multibase.py @@ -5,7 +5,19 @@ import pytest from morphys import ensure_bytes -from multibase import decode, encode, is_encoded +from multibase import ( + Decoder, + DecodingError, + Encoder, + InvalidMultibaseStringError, + UnsupportedEncodingError, + decode, + encode, + get_encoding_info, + is_encoded, + is_encoding_supported, + list_encodings, +) TEST_FIXTURES = ( ("identity", "yes mani !", "\x00yes mani !"), @@ -21,6 +33,10 @@ ("base16", "foob", "f666f6f62"), ("base16", "fooba", "f666f6f6261"), ("base16", "foobar", "f666f6f626172"), + ("base16upper", "yes mani !", "F796573206D616E692021"), + ("base16upper", "f", "F66"), + ("base16upper", "fo", "F666F"), + ("base16upper", "foo", "F666F6F"), ("base32", "yes mani !", "bpfsxgidnmfxgsibb"), ("base32", "f", "bmy"), ("base32", "fo", "bmzxq"), @@ -28,13 +44,21 @@ ("base32", "foob", "bmzxw6yq"), ("base32", "fooba", "bmzxw6ytb"), ("base32", "foobar", "bmzxw6ytboi"), - # ('base32pad', 'yes mani !', 'cpfsxgidnmfxgsibb'), - # ('base32pad', 'f', 'cmy======'), - # ('base32pad', 'fo', 'cmzxq===='), - # ('base32pad', 'foo', 'cmzxw6==='), - # ('base32pad', 'foob', 'cmzxw6yq='), - # ('base32pad', 'fooba', 'cmzxw6ytb'), - # ('base32pad', 'foobar', 'cmzxw6ytboi======'), + ("base32upper", "yes mani !", "BPFSXGIDNMFXGSIBB"), + ("base32upper", "f", "BMY"), + ("base32upper", "fo", "BMZXQ"), + ("base32upper", "foo", "BMZXW6"), + ("base32pad", "yes mani !", "cpfsxgidnmfxgsibb"), + ("base32pad", "f", "cmy======"), + ("base32pad", "fo", "cmzxq===="), + ("base32pad", "foo", "cmzxw6==="), + ("base32pad", "foob", "cmzxw6yq="), + ("base32pad", "fooba", "cmzxw6ytb"), + ("base32pad", "foobar", "cmzxw6ytboi======"), + ("base32padupper", "yes mani !", "CPFSXGIDNMFXGSIBB"), + ("base32padupper", "f", "CMY======"), + ("base32padupper", "fo", "CMZXQ===="), + ("base32padupper", "foo", "CMZXW6==="), ("base32hex", 
"yes mani !", "vf5in683dc5n6i811"), ("base32hex", "f", "vco"), ("base32hex", "fo", "vcpng"), @@ -42,13 +66,19 @@ ("base32hex", "foob", "vcpnmuog"), ("base32hex", "fooba", "vcpnmuoj1"), ("base32hex", "foobar", "vcpnmuoj1e8"), - # ('base32hexpad', 'yes mani !', 'tf5in683dc5n6i811'), - # ('base32hexpad', 'f', 'tco======'), - # ('base32hexpad', 'fo', 'tcpng===='), - # ('base32hexpad', 'foo', 'tcpnmu==='), - # ('base32hexpad', 'foob', 'tcpnmuog='), - # ('base32hexpad', 'fooba', 'tcpnmuoj1'), - # ('base32hexpad', 'foobar', 'tcpnmuoj1e8======'), + ("base32hexupper", "yes mani !", "VF5IN683DC5N6I811"), + ("base32hexupper", "f", "VCO"), + ("base32hexupper", "fo", "VCPNG"), + ("base32hexpad", "yes mani !", "tf5in683dc5n6i811"), + ("base32hexpad", "f", "tco======"), + ("base32hexpad", "fo", "tcpng===="), + ("base32hexpad", "foo", "tcpnmu==="), + ("base32hexpad", "foob", "tcpnmuog="), + ("base32hexpad", "fooba", "tcpnmuoj1"), + ("base32hexpad", "foobar", "tcpnmuoj1e8======"), + ("base32hexpadupper", "yes mani !", "TF5IN683DC5N6I811"), + ("base32hexpadupper", "f", "TCO======"), + ("base32hexpadupper", "fo", "TCPNG===="), ("base32z", "yes mani !", "hxf1zgedpcfzg1ebb"), ("base36", "Decentralize everything!!!", "km552ng4dabi4neu1oo8l4i5mndwmpc3mkukwtxy9"), ("base36upper", "Decentralize everything!!!", "KM552NG4DABI4NEU1OO8L4I5MNDWMPC3MKUKWTXY9"), @@ -61,19 +91,19 @@ ("base64", "foob", "mZm9vYg"), ("base64", "fooba", "mZm9vYmE"), ("base64", "foobar", "mZm9vYmFy"), - # ('base64pad', 'f', 'MZg=='), - # ('base64pad', 'fo', 'MZm8='), - # ('base64pad', 'foo', 'MZm9v'), - # ('base64pad', 'foob', 'MZm9vYg=='), - # ('base64pad', 'fooba', 'MZm9vYmE='), - # ('base64pad', 'foobar', 'MZm9vYmFy'), + ("base64pad", "f", "MZg=="), + ("base64pad", "fo", "MZm8="), + ("base64pad", "foo", "MZm9v"), + ("base64pad", "foob", "MZm9vYg=="), + ("base64pad", "fooba", "MZm9vYmE="), + ("base64pad", "foobar", "MZm9vYmFy"), ("base64url", "รทรฏรฟ", "uw7fDr8O_"), - # ('base64urlpad', 'f', 'UZg=='), - # 
('base64urlpad', 'fo', 'UZm8='), - # ('base64urlpad', 'foo', 'UZm9v'), - # ('base64urlpad', 'foob', 'UZm9vYg=='), - # ('base64urlpad', 'fooba', 'UZm9vYmE='), - # ('base64urlpad', 'foobar', 'UZm9vYmFy'), + ("base64urlpad", "f", "UZg=="), + ("base64urlpad", "fo", "UZm8="), + ("base64urlpad", "foo", "UZm9v"), + ("base64urlpad", "foob", "UZm9vYg=="), + ("base64urlpad", "fooba", "UZm9vYmE="), + ("base64urlpad", "foobar", "UZm9vYmFy"), ) @@ -88,7 +118,7 @@ def test_encode(encoding, data, encoded_data): @pytest.mark.parametrize("encoding", INCORRECT_ENCODINGS) def test_encode_incorrect_encoding(encoding): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(UnsupportedEncodingError) as excinfo: encode(encoding, "test data") assert "not supported" in str(excinfo.value) @@ -100,7 +130,7 @@ def test_decode(_, data, encoded_data): @pytest.mark.parametrize("encoded_data", INCORRECT_ENCODED_DATA) def test_decode_incorrect_encoding(encoded_data): - with pytest.raises(ValueError) as excinfo: + with pytest.raises(InvalidMultibaseStringError) as excinfo: decode(encoded_data) assert "Can not determine encoding" in str(excinfo.value) @@ -113,3 +143,91 @@ def test_is_encoded(_, data, encoded_data): @pytest.mark.parametrize("encoded_data", INCORRECT_ENCODED_DATA) def test_is_encoded_incorrect_encoding(encoded_data): assert not is_encoded(encoded_data) + + +def test_decode_return_encoding(): + """Test decode with return_encoding parameter.""" + encoding, decoded = decode("f796573206d616e692021", return_encoding=True) + assert encoding == "base16" + assert decoded == ensure_bytes("yes mani !") + + +def test_is_encoding_supported(): + """Test is_encoding_supported function.""" + assert is_encoding_supported("base64") + assert is_encoding_supported("base16") + assert not is_encoding_supported("base999") + + +def test_list_encodings(): + """Test list_encodings function.""" + encodings = list_encodings() + assert "base64" in encodings + assert "base16" in encodings + assert 
"base32pad" in encodings + assert "base64pad" in encodings + assert len(encodings) >= 24 # Should have at least 24 encodings + + +def test_get_encoding_info(): + """Test get_encoding_info function.""" + info = get_encoding_info("base64") + assert info.encoding == "base64" + assert info.code == b"m" + assert info.converter is not None + + with pytest.raises(UnsupportedEncodingError): + get_encoding_info("base999") + + +def test_encoder_class(): + """Test Encoder class.""" + encoder = Encoder("base64") + assert encoder.encoding == "base64" + + encoded = encoder.encode("hello") + assert encoded.startswith(b"m") + + with pytest.raises(UnsupportedEncodingError): + Encoder("base999") + + +def test_decoder_class(): + """Test Decoder class.""" + decoder = Decoder() + # Use a known good encoding + test_data = encode("base64", "hello") + decoded = decoder.decode(test_data) + assert decoded == ensure_bytes("hello") + + encoding, decoded = decoder.decode(test_data, return_encoding=True) + assert encoding == "base64" + assert decoded == ensure_bytes("hello") + + +def test_decoder_composition(): + """Test decoder composition with or_ method.""" + decoder1 = Decoder() + decoder2 = Decoder() + + # This should work with any valid multibase string + composed = decoder1.or_(decoder2) + test_data = encode("base64", "hello") + decoded = composed.decode(test_data) + assert decoded == ensure_bytes("hello") + + # Should fail with invalid data + with pytest.raises(DecodingError): + composed.decode("invalid") + + +def test_composed_decoder_all_fail(): + """Test ComposedDecoder error message when all decoders fail.""" + decoder1 = Decoder() + decoder2 = Decoder() + composed = decoder1.or_(decoder2) + + with pytest.raises(DecodingError) as excinfo: + composed.decode("invalid") + assert "All decoders failed" in str(excinfo.value) + assert "Last error" in str(excinfo.value)