# Source: dag-cbrrr/src/cbrrr/__init__.py (DavidBuchanan314/dag-cbrrr, main branch)

from typing import Type, Iterator, Union, Callable, Any, List, Dict, BinaryIO
import base64
import hashlib
import io
from . import _cbrrr  # type: ignore

# Re-export the exception type from the private `_cbrrr` module at package
# level (it is listed in `__all__`), so callers need not import `_cbrrr`.
# NOTE(review): presumably raised by the decoder on malformed input - confirm
# against the `_cbrrr` implementation.
CbrrrDecodeError = _cbrrr.CbrrrDecodeError


class CID:
	"""
	A thin wrapper around the raw binary form of a CID.

	This class is very minimal, intended to support atproto use cases and not
	much else.
	"""

	# Byte prefixes of the two CID layouts this class can recognise. In order,
	# the four bytes are: CID version (1), content type, hash type, and hash
	# length (0x20 == 32).
	# fmt: off
	CIDV1_DAG_CBOR_SHA256_32_PFX = b"\x01\x71\x12\x20"
	CIDV1_RAW_SHA256_32_PFX      = b"\x01\x55\x12\x20"
	# fmt: on

	# Only one attribute is ever stored; slots avoid a per-instance __dict__.
	__slots__ = ("cid_bytes",)

	def __init__(self, cid_bytes: bytes) -> None:
		"""
		Expects raw bytes, without a multibase prefix.

		If you don't have raw bytes, you probably want CID.decode()

		NOTE: No validation is performed here! You're responsible for ensuring
		the CID has a format you recognise. The is_cidv1_dag_cbor_sha256_32()
		and is_cidv1_raw_sha256_32() methods may be useful for this.
		"""
		self.cid_bytes = cid_bytes

	@classmethod
	def cidv1_dag_cbor_sha256_32_from(cls, data: bytes) -> "CID":
		"""
		Build a CID from the DAG-CBOR prefix followed by the SHA-256 digest of
		`data`.
		"""
		return cls(cls.CIDV1_DAG_CBOR_SHA256_32_PFX + hashlib.sha256(data).digest())

	@classmethod
	def cidv1_raw_sha256_32_from(cls, data: bytes) -> "CID":
		"""
		Build a CID from the raw prefix followed by the SHA-256 digest of
		`data`.
		"""
		return cls(cls.CIDV1_RAW_SHA256_32_PFX + hashlib.sha256(data).digest())

	@classmethod
	def decode(cls, data: Union[bytes, str]) -> "CID":
		"""
		Parse a multibase-prefixed CID, given as either str or bytes.

		Currently supported codecs: identity/raw (leading NUL byte) and
		base32 (leading "b", unpadded, decoded case-insensitively).

		Raises ValueError if the prefix is not recognised or if the base32
		payload carries "=" padding. Malformed base32 is reported by
		base64.b32decode (binascii.Error, which is a ValueError subclass).
		"""

		if isinstance(data, str):
			data = data.encode()

		if data.startswith(b"\x00"):  # identity multibase codec
			return cls(data[1:])

		if data.startswith(b"b"):  # base32 multibase codec
			data = data[1:]  # strip prefix
			if data.endswith(b"="):
				raise ValueError("unexpected base32 padding")
			# add back correct amount of padding (python is fussy)
			data += b"=" * ((-len(data)) % 8)
			decoded = base64.b32decode(data, casefold=True)
			return cls(decoded)

		raise ValueError("I don't know how to decode this CID")

	def encode(self, base: str = "base32") -> str:
		"""
		Return the multibase string form of this CID.

		Only "base32" is supported: a "b" prefix followed by the lowercase,
		unpadded base32 encoding of the raw bytes. Any other `base` raises
		ValueError.
		"""
		if base == "base32":
			return "b" + base64.b32encode(self.cid_bytes).decode().lower().rstrip("=")
		# this function might support other encodings in the future
		raise ValueError("unsupported base encoding")

	def is_cidv1_dag_cbor_sha256_32(self) -> bool:
		"""True if the bytes are the DAG-CBOR prefix plus exactly 32 more bytes."""
		return (
			self.cid_bytes.startswith(self.CIDV1_DAG_CBOR_SHA256_32_PFX)
			and len(self.cid_bytes) == 36
		)

	def is_cidv1_raw_sha256_32(self) -> bool:
		"""True if the bytes are the raw prefix plus exactly 32 more bytes."""
		return (
			self.cid_bytes.startswith(self.CIDV1_RAW_SHA256_32_PFX)
			and len(self.cid_bytes) == 36
		)

	def __bytes__(self) -> bytes:
		return self.cid_bytes

	def __repr__(self) -> str:
		return f"CID({self.encode()})"

	def __hash__(self) -> int:
		# Hash and equality are both defined purely by the raw bytes.
		return hash(self.cid_bytes)

	def __eq__(self, __value: object) -> bool:
		if not isinstance(__value, CID):
			# Defer to the other operand's __eq__ rather than answering for
			# it. `cid == unrelated` still evaluates to False, because Python
			# falls back to identity comparison when both sides decline.
			return NotImplemented
		return self.cid_bytes == __value.cid_bytes


def decode_varint(stream: BinaryIO) -> int:
	"""
	Read one unsigned varint from `stream` and return its integer value.

	Bytes are read one at a time. Each contributes its low 7 bits, least
	significant group first, and a set high bit means another byte follows.
	The stream is left positioned just after the last byte consumed.

	Raises ValueError if the stream ends mid-varint, if a multi-byte varint
	ends in a zero byte (i.e. it is not minimally encoded), or if the
	continuation bit is still set after 9 bytes.
	"""
	n = 0
	# Shifts 0, 7, ..., 56: at most 9 bytes, so the result is below 2**63.
	for shift in range(0, 63, 7):
		chunk = stream.read(1)
		if not chunk:
			raise ValueError("unexpected end of varint input")
		val = chunk[0]
		n |= (val & 0x7F) << shift
		if not val & 0x80:
			# A trailing zero byte after the first adds nothing to the value,
			# so a shorter encoding of the same number exists.
			if shift and not val:
				raise ValueError("varint not minimally encoded")
			return n
	raise ValueError("varint too long")


# I'm adding this so I can pass more CID tests at https://hyphacoop.github.io/dasl-testing/
# I may later decide to perform some or all of the checks inside the C code, for better perf,
# while also making it the default behaviour.
class StrictCID(CID):
	"""
	A CID whose byte layout is checked at construction time.

	Two layouts are accepted:

	- CIDv0: exactly 34 bytes beginning with the bytes 0x12 0x20.
	- CIDv1: a varint version equal to 1, a varint content type, a varint
	  hash type, a varint hash length, then exactly that many hash bytes and
	  nothing else.

	Anything else raises ValueError, either here or from decode_varint(). The
	content type and hash type values themselves are not checked.
	"""

	# CID declares __slots__; without an (empty) declaration here every
	# StrictCID instance would silently regain a per-instance __dict__.
	__slots__ = ()

	def __init__(self, cid_bytes: bytes) -> None:
		# Stored before validation; if a check below raises, construction
		# fails and the caller never receives the instance.
		self.cid_bytes = cid_bytes

		if len(cid_bytes) == 34 and cid_bytes.startswith(b"\x12\x20"):
			return # valid CIDv0

		stream = io.BytesIO(cid_bytes)
		cid_version = decode_varint(stream)

		if cid_version != 1:
			raise ValueError("Unsupported CID version")

		decode_varint(stream) # multicodec content type, ignored
		decode_varint(stream) # hash type, value ignored

		hash_length = decode_varint(stream)
		# Everything left in the stream must be the hash, so both truncated
		# hashes and trailing bytes are rejected by the length comparison.
		hash_value = stream.read()

		if len(hash_value) != hash_length:
			raise ValueError("Invalid CID hash length")


# Every Python type that can appear in a DAG-CBOR value handled by this module.
# nb: spelled with typing.Union because | syntax is not supported in <=py3.9
DagCborTypes = Union[
	str,
	bytes,
	int,
	bool,
	float,
	CID,
	List["DagCborTypes"],
	Dict[str, "DagCborTypes"],
	None,
]


def decode_dag_cbor(
	data: bytes, atjson_mode: bool = False, cid_ctor: Callable[[bytes], Any] = CID
) -> DagCborTypes:
	"""
	Decode a buffer holding exactly one DAG-CBOR object into python objects.

	With atjson_mode=False (the default), byte strings come back as bytes
	objects and CIDs as whatever cid_ctor returns (CID instances by default).
	With atjson_mode=True, they are represented as {"$bytes": "b64..."} and
	{"$link": "b32..."} respectively.

	Raises ValueError if the decoded object does not span the whole of
	`data`. Errors raised by the underlying `_cbrrr` decoder propagate
	unchanged.
	"""

	value, consumed = _cbrrr.decode_dag_cbor(data, cid_ctor, atjson_mode)
	if consumed == len(data):
		return value
	# Leftover bytes mean the input was not a single top-level object.
	raise ValueError("did not parse to end of buffer")


def decode_multi_dag_cbor_in_violation_of_the_spec(
	data: bytes, atjson_mode: bool = False, cid_ctor: Callable[[bytes], Any] = CID
) -> Iterator[DagCborTypes]:
	"""
	Lazily decode back-to-back concatenated DAG-CBOR objects from `data`,
	yielding each one in order. atjson_mode and cid_ctor are forwarded to the
	decoder for every object.

	As the name says, this is not spec-compliant behaviour:

	https://ipld.io/specs/codecs/dag-cbor/spec/#strictness

	"Encode and decode must operate on a single top-level CBOR object.
	Back-to-back concatenated objects are not allowed or supported, as suggested
	by section 5.1 of RFC 8949 for streaming applications."
	"""
	# Slicing a memoryview hands the decoder the remaining tail without
	# copying it on every iteration.
	buf = memoryview(data)
	total = len(data)
	pos = 0
	while pos < total:
		item, consumed = _cbrrr.decode_dag_cbor(buf[pos:], cid_ctor, atjson_mode)
		yield item
		pos += consumed
	assert pos == total  # should never fail!


def encode_dag_cbor(
	obj: DagCborTypes, atjson_mode: bool = False, cid_type: Type = CID
) -> bytes:
	"""
	Serialise a python object tree to DAG-CBOR bytes.

	cid_type is passed through to the `_cbrrr` encoder (defaults to CID).

	When atjson_mode is True, dicts in the format {"$bytes": "b64..."} are
	written as CBOR bytes, and dicts in the format {"$link": "b32..."} are
	written as CIDs (CBOR tag value 42).
	"""
	encoded = _cbrrr.encode_dag_cbor(obj, cid_type, atjson_mode)
	return encoded


# Names exported by `from <package> import *`.
# NOTE(review): StrictCID and decode_varint are defined in this module but are
# not listed here - confirm whether leaving them out is intentional.
__all__ = [
	"CbrrrDecodeError",
	"CID",
	"DagCborTypes",
	"decode_dag_cbor",
	"decode_multi_dag_cbor_in_violation_of_the_spec",
	"encode_dag_cbor",
]