22
33import itertools
44import re
5- import struct
65from functools import lru_cache
76from io import BytesIO
7+ from pathlib import Path
88from typing import TYPE_CHECKING , Any , BinaryIO
99
1010from dissect .database .sqlite3 .c_sqlite3 import c_sqlite3
1515 NoCellData ,
1616)
1717from dissect .database .sqlite3 .util import parse_table_columns_constraints
18+ from dissect .database .sqlite3 .wal import WAL , Checkpoint
1819
1920if TYPE_CHECKING :
2021 from collections .abc import Iterator
4748 9 : lambda fh : 1 ,
4849}
4950
51+ # See https://sqlite.org/fileformat2.html#magic_header_string
5052SQLITE3_HEADER_MAGIC = b"SQLite format 3\x00 "
5153
52- WAL_HEADER_MAGIC_LE = 0x377F0682
53- WAL_HEADER_MAGIC_BE = 0x377F0683
54- WAL_HEADER_MAGIC = {WAL_HEADER_MAGIC_LE , WAL_HEADER_MAGIC_BE }
55-
5654
5755class SQLite3 :
58- def __init__ (self , fh : BinaryIO , wal_fh : BinaryIO | None = None ):
56+ """SQLite3 database class.
57+
58+ Loads a SQLite3 database from the given file-like object or path. If a path is provided (or can be deduced
59+ from the file-like object), a WAL file will be automatically looked for with a few common suffixes.
60+ Optionally a WAL file-like object or path can be directly provided to read changes from the WAL (this takes
61+ priority over the aforementioned WAL lookup). Additionally, a specific checkpoint from the WAL can be applied.
62+
63+ Args:
64+ fh: The path or file-like object to open a SQLite3 database on.
65+ wal: The path or file-like object to open a SQLite3 WAL file on.
66+ checkpoint: The checkpoint to apply from the WAL file. Can be a :class:`Checkpoint` object or an integer index.
67+
68+ Raises:
69+ InvalidDatabase: If the file-like object does not look like a SQLite3 database based on the header magic.
70+
71+ References:
72+ - https://sqlite.org/fileformat2.html
73+ """
74+
75+ def __init__ (
76+ self ,
77+ fh : Path | BinaryIO ,
78+ wal : WAL | Path | BinaryIO | None = None ,
79+ checkpoint : Checkpoint | int | None = None ,
80+ ):
81+ # Use the provided file handle or try to open the file path.
82+ if hasattr (fh , "read" ):
83+ name = getattr (fh , "name" , None )
84+ path = Path (name ) if name else None
85+ else :
86+ path = fh
87+ fh = path .open ("rb" )
88+
5989 self .fh = fh
60- self .wal = WAL (wal_fh ) if wal_fh else None
90+ self .path = path
91+ self .wal = None
92+ self .checkpoint = None
6193
62- self .header = c_sqlite3 .header (fh )
94+ self .header = c_sqlite3 .header (self . fh )
6395 if self .header .magic != SQLITE3_HEADER_MAGIC :
6496 raise InvalidDatabase ("Invalid header magic" )
6597
@@ -72,10 +104,31 @@ def __init__(self, fh: BinaryIO, wal_fh: BinaryIO | None = None):
72104 if self .usable_page_size < 480 :
73105 raise InvalidDatabase ("Usable page size is too small" )
74106
107+ if wal :
108+ self .wal = WAL (wal ) if not isinstance (wal , WAL ) else wal
109+ elif path :
110+ # Check for WAL sidecar next to the DB.
111+ wal_path = path .with_name (f"{ path .name } -wal" )
112+ if wal_path .exists ():
113+ self .wal = WAL (wal_path )
114+
115+ # If a checkpoint index was provided, resolve it to a Checkpoint object.
116+ if self .wal and isinstance (checkpoint , int ):
117+ if checkpoint < 0 or checkpoint >= len (self .wal .checkpoints ):
118+ raise IndexError ("WAL checkpoint index out of range" )
119+ self .checkpoint = self .wal .checkpoints [checkpoint ]
120+ else :
121+ self .checkpoint = checkpoint
122+
75123 self .page = lru_cache (256 )(self .page )
76124
77- def open_wal (self , fh : BinaryIO ) -> None :
78- self .wal = WAL (fh )
def checkpoints(self) -> Iterator[SQLite3]:
    """Yield instances of the database at all available checkpoints in the WAL file, if applicable.

    Every yielded instance shares this database's file handle and WAL object and
    differs only in the checkpoint that is applied. Nothing is yielded when no
    WAL is attached.

    Yields:
        One :class:`SQLite3` instance per entry in ``self.wal.checkpoints``.
    """
    if not self.wal:
        return

    for checkpoint in self.wal.checkpoints:
        # The constructor parses the database header from the handle's current
        # position, and page reads (by this instance or a previously yielded
        # one) move that position. Rewind so the header is read from offset 0.
        self.fh.seek(0)
        yield SQLite3(self.fh, self.wal, checkpoint)
79132
80133 def table (self , name : str ) -> Table | None :
81134 name = name .lower ()
@@ -108,10 +161,33 @@ def indices(self) -> Iterator[Index]:
108161 yield Index (self , * cell .values )
109162
110163 def raw_page (self , num : int ) -> bytes :
164+ """Retrieve the raw frame data for the given page number.
165+
166+ Reads the page from a checkpoint, if this class was initialized with a WAL checkpoint.
167+
168+ If a WAL is available, will first check if the WAL contains a more recent version of the page,
169+ otherwise it will read the page from the database file.
170+
171+ References:
172+ - https://sqlite.org/fileformat2.html#reader_algorithm
173+ """
111174 # Only throw an out of bounds exception if the header contains a page_count.
112175 # Some old versions of SQLite3 do not set/update the page_count correctly.
113176 if (num < 1 or num > self .header .page_count ) and self .header .page_count > 0 :
114177 raise InvalidPageNumber ("Page number exceeds boundaries" )
178+
179+ # If a specific WAL checkpoint was provided, use it instead of the on-disk page.
180+ if self .checkpoint is not None and (frame := self .checkpoint .get (num )):
181+ return frame .data
182+
183+ # Check if the latest valid instance of the page is committed (either the frame itself
184+ # is the commit frame or it is included in a commit's frames). If so, return that frame's data.
185+ if self .wal :
186+ for commit in reversed (self .wal .commits ):
187+ if (frame := commit .get (num )) and frame .valid :
188+ return frame .data
189+
190+ # Else we read the page from the database file.
115191 if num == 1 : # Page 1 is root
116192 self .fh .seek (len (c_sqlite3 .header ))
117193 else :
@@ -465,127 +541,6 @@ def values(self) -> list[int | float | str | bytes | None]:
465541 return self ._values
466542
467543
class WAL:
    """Reader for a SQLite3 write-ahead log (WAL) file.

    The WAL header is parsed from ``fh`` at its current position on construction.
    Frames are addressed by index and laid out back to back after the header,
    each being a frame header followed by ``header.page_size`` bytes of data.

    Args:
        fh: File-like object to read the WAL from.

    Raises:
        InvalidDatabase: If the header magic is not a known WAL magic value.
    """

    def __init__(self, fh: BinaryIO):
        self.fh = fh
        self.header = c_sqlite3.wal_header(fh)

        if self.header.magic not in WAL_HEADER_MAGIC:
            raise InvalidDatabase("Invalid header magic")

        # struct-style byte order prefix, selected by which magic value was found.
        self.checksum_endian = "<" if self.header.magic == WAL_HEADER_MAGIC_LE else ">"
        # ``None`` means "not scanned yet"; an empty list is a valid cached result.
        self._checkpoints = None

        # Per-instance cache so repeated lookups of the same frame index reuse the object.
        self.frame = lru_cache(1024)(self.frame)

    def frame(self, frame_idx: int) -> WALFrame:
        """Return the frame at the given zero-based index."""
        frame_size = len(c_sqlite3.wal_frame) + self.header.page_size
        offset = len(c_sqlite3.wal_header) + frame_idx * frame_size
        return WALFrame(self, offset)

    def frames(self) -> Iterator[WALFrame]:
        """Yield frames in file order until reading a frame raises ``EOFError``."""
        frame_idx = 0
        while True:
            try:
                yield self.frame(frame_idx)
                frame_idx += 1
            except EOFError:  # noqa: PERF203
                break

    def checkpoints(self) -> list[WALCheckpoint]:
        """Return the frames grouped into checkpoints, scanning the file only once.

        Frames are collected in order and a group is closed by every frame whose
        ``page_count`` is non-zero. Trailing frames that are not followed by such
        a frame are not part of any checkpoint.
        """
        if self._checkpoints is None:
            checkpoints = []
            pending = []

            for frame in self.frames():
                pending.append(frame)

                if frame.page_count != 0:
                    checkpoints.append(WALCheckpoint(self, pending))
                    pending = []

            self._checkpoints = checkpoints

        return self._checkpoints
510-
511-
class WALFrame:
    """A single frame of a WAL file: a frame header followed by one page of data.

    The frame header is parsed eagerly on construction; the page data is read
    lazily on first access of :attr:`data` and cached afterwards.

    Args:
        wal: The WAL this frame belongs to.
        offset: Byte offset of the frame header within the WAL file.
    """

    def __init__(self, wal: WAL, offset: int):
        self.wal = wal
        self.offset = offset

        self.fh = wal.fh
        # ``None`` means "not read yet"; an empty read is cached like any other.
        self._data = None

        self.fh.seek(offset)
        self.header = c_sqlite3.wal_frame(self.fh)

    def __repr__(self) -> str:
        return f"<WALFrame page_number={self.page_number} page_count={self.page_count}>"

    @property
    def valid(self) -> bool:
        """Whether both salt values of the frame header match those of the WAL header."""
        salt1_match = self.header.salt1 == self.wal.header.salt1
        salt2_match = self.header.salt2 == self.wal.header.salt2

        return salt1_match and salt2_match

    @property
    def data(self) -> bytes:
        """The page data stored directly after the frame header."""
        if self._data is None:
            self.fh.seek(self.offset + len(c_sqlite3.wal_frame))
            self._data = self.fh.read(self.wal.header.page_size)
        return self._data

    @property
    def page_number(self) -> int:
        """The ``page_number`` field of the frame header."""
        return self.header.page_number

    @property
    def page_count(self) -> int:
        """The ``page_count`` field of the frame header."""
        return self.header.page_count
548-
class WALCheckpoint:
    """A group of WAL frames that can be looked up by page number.

    Supports ``page in checkpoint`` and ``checkpoint[page]`` in addition to
    :meth:`get`. When several frames in the group refer to the same page, the
    frame that appears last in ``frames`` is the one returned.

    Args:
        wal: The WAL the frames belong to.
        frames: The frames that make up this checkpoint, in file order.
    """

    def __init__(self, wal: WAL, frames: list[WALFrame]):
        self.wal = wal
        self.frames = frames
        # ``None`` means "not built yet"; an empty mapping is a valid cached result.
        self._page_map = None

    def __contains__(self, page: int) -> bool:
        return page in self.page_map

    def __getitem__(self, page: int) -> WALFrame:
        return self.page_map[page]

    def __repr__(self) -> str:
        return f"<WALCheckpoint frames={len(self.frames)}>"

    @property
    def page_map(self) -> dict[int, WALFrame]:
        """Mapping of page number to frame, built on first access and then reused."""
        if self._page_map is None:
            # Later frames overwrite earlier ones for the same page number.
            self._page_map = {frame.page_number: frame for frame in self.frames}

        return self._page_map

    def get(self, page: int, default: Any = None) -> WALFrame | Any:
        """Return the frame for ``page``, or ``default`` if the page is not in this checkpoint."""
        return self.page_map.get(page, default)
574-
def wal_checksum(buf: bytes, endian: str = ">", seed: tuple[int, int] = (0, 0)) -> tuple[int, int]:
    """Compute the SQLite3 WAL checksum over ``buf``.

    The input is interpreted as pairs of unsigned 32-bit integers which are
    folded into two running 32-bit sums.

    Args:
        buf: The bytes to checksum; the length must be a multiple of 8.
        endian: ``struct`` byte order prefix used to decode the integers.
        seed: Initial ``(s0, s1)`` values. Passing the result of a previous call
            continues that checksum, which allows checksums to be chained.

    Returns:
        The ``(s0, s1)`` checksum pair.

    Raises:
        ValueError: If the length of ``buf`` is not a multiple of 8 bytes.
    """
    if len(buf) % 8:
        raise ValueError("WAL checksum input length must be a multiple of 8 bytes")

    s0, s1 = seed
    for x0, x1 in struct.iter_unpack(f"{endian}2I", buf):
        # Both sums wrap around at 32 bits.
        s0 = (s0 + x0 + s1) & 0xFFFFFFFF
        s1 = (s1 + x1 + s0) & 0xFFFFFFFF

    return s0, s1
587-
588-
589544def walk_tree (sqlite : SQLite3 , page : Page ) -> Iterator [Cell ]:
590545 if page .header .flags in (
591546 c_sqlite3 .PAGE_TYPE_LEAF_TABLE ,
0 commit comments