@@ -80,6 +80,7 @@ class DataStore:
     recent_merkle_blobs: LRUCache[bytes32, MerkleBlob]
     merkle_blobs_path: Path
     key_value_blobs_path: Path
+    prefer_file_kv_blob_length: int = 4096

     @classmethod
     @contextlib.asynccontextmanager
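The new `prefer_file_kv_blob_length` field sets the size cutoff above which key/value blobs are stored as files rather than inline in SQLite. A minimal sketch of the routing rule it controls (mirroring `_use_file_for_new_kv_blob()` further down; note the strict `>`, so a blob of exactly 4096 bytes stays inline):

```python
# Sketch of the size-based routing controlled by prefer_file_kv_blob_length.
PREFER_FILE_KV_BLOB_LENGTH = 4096

def stored_in_file(blob: bytes, threshold: int = PREFER_FILE_KV_BLOB_LENGTH) -> bool:
    # Strictly larger than the threshold goes to disk; everything else
    # stays in the SQLite `blob` column.
    return len(blob) > threshold

assert not stored_in_file(b"x" * 4096)  # exactly at the threshold: inline
assert stored_in_file(b"x" * 4097)      # one byte over: file-backed
```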
@@ -156,6 +157,7 @@ async def managed(
                 """
                 CREATE TABLE IF NOT EXISTS ids(
                     kv_id INTEGER PRIMARY KEY,
+                    hash BLOB NOT NULL CHECK(length(hash) == 32),
                     blob BLOB,
                     store_id BLOB NOT NULL CHECK(length(store_id) == 32)
                 )
@@ -175,7 +177,7 @@ async def managed(
             )
             await writer.execute(
                 """
-                CREATE UNIQUE INDEX IF NOT EXISTS ids_blob_index ON ids(blob, store_id)
+                CREATE UNIQUE INDEX IF NOT EXISTS ids_hash_index ON ids(hash, store_id)
                 """
             )
             await writer.execute(
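With the new `hash` column and the unique `(hash, store_id)` index, the `ids` table becomes content-addressed per store: every row carries the 32-byte SHA-256 of its blob, while `blob` itself may be NULL for file-backed entries. A self-contained sketch of the dedup behavior this buys (plain `sqlite3`, outside the project's async wrapper):

```python
import hashlib
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE ids(
        kv_id INTEGER PRIMARY KEY,
        hash BLOB NOT NULL CHECK(length(hash) == 32),
        blob BLOB,
        store_id BLOB NOT NULL CHECK(length(store_id) == 32)
    );
    CREATE UNIQUE INDEX ids_hash_index ON ids(hash, store_id);
    """
)

store_id = bytes(32)
blob = b"small value"
blob_hash = hashlib.sha256(blob).digest()

conn.execute("INSERT INTO ids (hash, blob, store_id) VALUES (?, ?, ?)", (blob_hash, blob, store_id))
try:
    # Re-inserting the same blob into the same store now trips the
    # unique (hash, store_id) index instead of creating a duplicate row.
    conn.execute("INSERT INTO ids (hash, blob, store_id) VALUES (?, ?, ?)", (blob_hash, blob, store_id))
except sqlite3.IntegrityError:
    print("duplicate detected via ids_hash_index")
```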
@@ -562,20 +564,17 @@ async def insert_root_from_merkle_blob(

         return await self._insert_root(store_id, root_hash, status)

-    def _kvid_blob_is_file(self, blob: bytes) -> bool:
-        return len(blob) >= len(bytes32.zeros)
+    def _use_file_for_new_kv_blob(self, blob: bytes) -> bool:
+        return len(blob) > self.prefer_file_kv_blob_length

     async def get_kvid(self, blob: bytes, store_id: bytes32) -> Optional[KeyOrValueId]:
-        if self._kvid_blob_is_file(blob):
-            table_blob = sha256(blob).digest()
-        else:
-            table_blob = blob
+        blob_hash = bytes32(sha256(blob).digest())

         async with self.db_wrapper.reader() as reader:
             cursor = await reader.execute(
-                "SELECT kv_id FROM ids WHERE blob = ? AND store_id = ?",
+                "SELECT kv_id FROM ids WHERE hash = ? AND store_id = ?",
                 (
-                    table_blob,
+                    blob_hash,
                     store_id,
                 ),
             )
@@ -586,19 +585,15 @@ async def get_kvid(self, blob: bytes, store_id: bytes32) -> Optional[KeyOrValueId]:

         return KeyOrValueId(row[0])

-    def get_blob_from_table_blob(self, table_blob: bytes, store_id: bytes32) -> bytes:
-        if not self._kvid_blob_is_file(table_blob):
-            return table_blob
-
-        blob_hash = bytes32(table_blob)
+    def get_blob_from_file(self, blob_hash: bytes32, store_id: bytes32) -> bytes:
         # TODO: seems that zstd needs hinting
         # TODO: consider file-system based locking of either the file or the store directory
         return zstd.decompress(self.get_key_value_path(store_id=store_id, blob_hash=blob_hash).read_bytes())  # type: ignore[no-any-return]

     async def get_blob_from_kvid(self, kv_id: KeyOrValueId, store_id: bytes32) -> Optional[bytes]:
         async with self.db_wrapper.reader() as reader:
             cursor = await reader.execute(
-                "SELECT blob FROM ids WHERE kv_id = ? AND store_id = ?",
+                "SELECT hash, blob FROM ids WHERE kv_id = ? AND store_id = ?",
                 (
                     kv_id,
                     store_id,
@@ -609,7 +604,12 @@ async def get_blob_from_kvid(self, kv_id: KeyOrValueId, store_id: bytes32) -> Optional[bytes]:
             if row is None:
                 return None

-        return self.get_blob_from_table_blob(bytes(row[0]), store_id)
+        blob: Optional[bytes] = row["blob"]
+        if blob is not None:
+            return blob
+
+        blob_hash = bytes32(row["hash"])
+        return self.get_blob_from_file(blob_hash, store_id)

     async def get_terminal_node(self, kid: KeyId, vid: ValueId, store_id: bytes32) -> TerminalNode:
         key = await self.get_blob_from_kvid(kid.raw, store_id)
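On reads, the `blob` column is the discriminator: non-NULL means inline, NULL means fetch from disk and decompress. A minimal round-trip sketch of the file side, using the same `zstd` module as the diff and a placeholder path where `get_key_value_path()` would be called:

```python
from pathlib import Path

import zstd  # the same compression module the diff uses

def write_kv_file(path: Path, blob: bytes) -> None:
    # Mirrors add_kvid(): parents are created, contents are zstd-compressed.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(zstd.compress(blob))

def read_kv_file(path: Path) -> bytes:
    # Mirrors get_blob_from_file(): the file holds zstd-compressed bytes.
    return zstd.decompress(path.read_bytes())

path = Path("/tmp/example-kv-blob")  # placeholder for get_key_value_path(...)
write_kv_file(path, b"a" * 10_000)
assert read_kv_file(path) == b"a" * 10_000
```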
@@ -620,15 +620,17 @@ async def get_terminal_node(self, kid: KeyId, vid: ValueId, store_id: bytes32) -> TerminalNode:
         return TerminalNode(hash=leaf_hash(key, value), key=key, value=value)

     async def add_kvid(self, blob: bytes, store_id: bytes32, writer: aiosqlite.Connection) -> KeyOrValueId:
-        is_file = self._kvid_blob_is_file(blob)
-        if is_file:
-            table_blob = sha256(blob).digest()
+        use_file = self._use_file_for_new_kv_blob(blob)
+        blob_hash = bytes32(sha256(blob).digest())
+        if use_file:
+            table_blob = None
         else:
             table_blob = blob
         try:
             row = await writer.execute_insert(
-                "INSERT INTO ids (blob, store_id) VALUES (?, ?)",
+                "INSERT INTO ids (hash, blob, store_id) VALUES (?, ?, ?)",
                 (
+                    blob_hash,
                     table_blob,
                     store_id,
                 ),
@@ -644,14 +646,12 @@ async def add_kvid(self, blob: bytes, store_id: bytes32, writer: aiosqlite.Connection) -> KeyOrValueId:

         if row is None:
             raise Exception("Internal error")
-        kv_id = KeyOrValueId(row[0])
-        if is_file:
-            blob_hash = bytes32(table_blob)
+        if use_file:
             path = self.get_key_value_path(store_id=store_id, blob_hash=blob_hash)
             path.parent.mkdir(parents=True, exist_ok=True)
             # TODO: consider file-system based locking of either the file or the store directory
             path.write_bytes(zstd.compress(blob))
-        return kv_id
+        return KeyOrValueId(row[0])

     async def add_key_value(
         self, key: bytes, value: bytes, store_id: bytes32, writer: aiosqlite.Connection
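`add_kvid()` derives the on-disk location from the blob hash, making the file store content-addressed. `get_key_value_path()` itself is not part of this diff, so the layout below is only a hypothetical illustration of that idea, with a prefix fan-out to keep directories small:

```python
from pathlib import Path

# Hypothetical layout, not the actual get_key_value_path() implementation:
# <key_value_blobs_path>/<store_id hex>/<first hash byte>/<full hash hex>
def key_value_path(key_value_blobs_path: Path, store_id: bytes, blob_hash: bytes) -> Path:
    hex_hash = blob_hash.hex()
    return key_value_blobs_path / store_id.hex() / hex_hash[:2] / hex_hash
```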
@@ -1050,10 +1050,22 @@ async def get_internal_nodes(self, store_id: bytes32, root_hash: Optional[bytes32]
         return internal_nodes

     def get_terminal_node_from_table_blobs(
-        self, kid: KeyId, vid: ValueId, table_blobs: dict[KeyOrValueId, bytes], store_id: bytes32
+        self,
+        kid: KeyId,
+        vid: ValueId,
+        table_blobs: dict[KeyOrValueId, tuple[bytes32, Optional[bytes]]],
+        store_id: bytes32,
     ) -> TerminalNode:
-        key = self.get_blob_from_table_blob(table_blobs[KeyOrValueId(kid.raw)], store_id)
-        value = self.get_blob_from_table_blob(table_blobs[KeyOrValueId(vid.raw)], store_id)
+        key = table_blobs[KeyOrValueId(kid.raw)][1]
+        if key is None:
+            key_hash = table_blobs[KeyOrValueId(kid.raw)][0]
+            key = self.get_blob_from_file(key_hash, store_id)
+
+        value = table_blobs[KeyOrValueId(vid.raw)][1]
+        if value is None:
+            value_hash = table_blobs[KeyOrValueId(vid.raw)][0]
+            value = self.get_blob_from_file(value_hash, store_id)
+
         return TerminalNode(hash=leaf_hash(key, value), key=key, value=value)

     async def get_keys_values(
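The hash/blob tuple resolution above is the same pattern `get_keys()` and `write_tree_to_file()` adopt below, so it could plausibly be factored into one helper; a sketch of that shared shape:

```python
from typing import Callable, Optional

def resolve_blob(
    entry: tuple[bytes, Optional[bytes]],
    read_file: Callable[[bytes], bytes],
) -> bytes:
    # entry is the (hash, inline-blob-or-None) pair kept in table_blobs;
    # read_file stands in for get_blob_from_file bound to a store_id.
    blob_hash, blob = entry
    if blob is not None:
        return blob
    return read_file(blob_hash)

assert resolve_blob((b"\x00" * 32, b"inline"), lambda h: b"") == b"inline"
assert resolve_blob((b"\x00" * 32, None), lambda h: b"from file") == b"from file"
```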
@@ -1277,8 +1289,10 @@ async def get_keys(
         table_blobs = await self.get_table_blobs(raw_key_ids, store_id)
         keys: list[bytes] = []
         for kid in kv_ids.keys():
-            key = self.get_blob_from_table_blob(table_blobs[KeyOrValueId(kid.raw)], store_id)
-            keys.append(key)
+            blob_hash, blob = table_blobs[KeyOrValueId(kid.raw)]
+            if blob is None:
+                blob = self.get_blob_from_file(blob_hash, store_id)
+            keys.append(blob)

         return keys

@@ -1653,22 +1667,25 @@ async def get_nodes_for_file(

     async def get_table_blobs(
         self, kv_ids_iter: Iterable[KeyOrValueId], store_id: bytes32
-    ) -> dict[KeyOrValueId, bytes]:
-        result: dict[KeyOrValueId, bytes] = {}
+    ) -> dict[KeyOrValueId, tuple[bytes32, Optional[bytes]]]:
+        result: dict[KeyOrValueId, tuple[bytes32, Optional[bytes]]] = {}
         batch_size = min(500, SQLITE_MAX_VARIABLE_NUMBER - 10)
         kv_ids = list(dict.fromkeys(kv_ids_iter))

         async with self.db_wrapper.reader() as reader:
             for i in range(0, len(kv_ids), batch_size):
                 chunk = kv_ids[i : i + batch_size]
                 placeholders = ",".join(["?"] * len(chunk))
-                query = (
-                    f"SELECT blob, kv_id FROM ids WHERE store_id = ? AND kv_id IN ({placeholders}) LIMIT {len(chunk)}"
-                )
+                query = f"""
+                    SELECT hash, blob, kv_id
+                    FROM ids
+                    WHERE store_id = ? AND kv_id IN ({placeholders})
+                    LIMIT {len(chunk)}
+                """

                 async with reader.execute(query, (store_id, *chunk)) as cursor:
                     rows = await cursor.fetchall()
-                    result.update({row["kv_id"]: row["blob"] for row in rows})
+                    result.update({row["kv_id"]: (row["hash"], row["blob"]) for row in rows})

         if len(result) != len(kv_ids):
             raise Exception("Cannot retrieve all the requested kv_ids")
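`get_table_blobs()` keeps its batching because SQLite caps the number of bound parameters per statement; ids are de-duplicated, chunked, and expanded into `IN (...)` placeholders. A standalone sketch of the arithmetic (999 is SQLite's historical default for the variable limit; real builds vary, which is why the code reads `SQLITE_MAX_VARIABLE_NUMBER` instead of hard-coding):

```python
SQLITE_MAX_VARIABLE_NUMBER = 999  # historical SQLite default; newer builds allow more
batch_size = min(500, SQLITE_MAX_VARIABLE_NUMBER - 10)  # headroom for the non-id bindings

kv_ids = list(range(1234))  # pretend KeyOrValueIds
for i in range(0, len(kv_ids), batch_size):
    chunk = kv_ids[i : i + batch_size]
    placeholders = ",".join(["?"] * len(chunk))
    # executed with (store_id, *chunk): 1 store_id binding + len(chunk) id bindings
    query = f"SELECT hash, blob, kv_id FROM ids WHERE store_id = ? AND kv_id IN ({placeholders})"
```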
@@ -1713,8 +1730,10 @@ async def write_tree_to_file(
             blobs = []
             for raw_id in (node.value1, node.value2):
                 id = KeyOrValueId.from_bytes(raw_id)
-                id_table_blob = table_blobs[id]
-                blobs.append(self.get_blob_from_table_blob(id_table_blob, store_id))
+                blob_hash, blob = table_blobs[id]
+                if blob is None:
+                    blob = self.get_blob_from_file(blob_hash, store_id)
+                blobs.append(blob)
             to_write = bytes(SerializedNode(True, blobs[0], blobs[1]))
         else:
             to_write = bytes(node)