Skip to content

Commit 33bc02a

Browse files
PimSanders and Schamper authored
Add proper support for SQLite3 WAL (#9)
Co-authored-by: Erik Schamper <[email protected]>
1 parent 14741f8 commit 33bc02a

File tree

9 files changed

+577
-148
lines changed

9 files changed

+577
-148
lines changed

dissect/database/sqlite3/sqlite3.py

Lines changed: 86 additions & 131 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,9 @@
22

33
import itertools
44
import re
5-
import struct
65
from functools import lru_cache
76
from io import BytesIO
7+
from pathlib import Path
88
from typing import TYPE_CHECKING, Any, BinaryIO
99

1010
from dissect.database.sqlite3.c_sqlite3 import c_sqlite3
@@ -15,6 +15,7 @@
1515
NoCellData,
1616
)
1717
from dissect.database.sqlite3.util import parse_table_columns_constraints
18+
from dissect.database.sqlite3.wal import WAL, Checkpoint
1819

1920
if TYPE_CHECKING:
2021
from collections.abc import Iterator
@@ -47,19 +48,50 @@
4748
9: lambda fh: 1,
4849
}
4950

51+
# See https://sqlite.org/fileformat2.html#magic_header_string
5052
SQLITE3_HEADER_MAGIC = b"SQLite format 3\x00"
5153

52-
WAL_HEADER_MAGIC_LE = 0x377F0682
53-
WAL_HEADER_MAGIC_BE = 0x377F0683
54-
WAL_HEADER_MAGIC = {WAL_HEADER_MAGIC_LE, WAL_HEADER_MAGIC_BE}
55-
5654

5755
class SQLite3:
58-
def __init__(self, fh: BinaryIO, wal_fh: BinaryIO | None = None):
56+
"""SQLite3 database class.
57+
58+
Loads a SQLite3 database from the given file-like object or path. If a path is provided (or can be deduced
59+
from the file-like object), a WAL file will be automatically looked for with a few common suffixes.
60+
Optionally a WAL file-like object or path can be directly provided to read changes from the WAL (this takes
61+
priority over the aforementioned WAL lookup). Additionally, a specific checkpoint from the WAL can be applied.
62+
63+
Args:
64+
fh: The path or file-like object to open a SQLite3 database on.
65+
wal: The path or file-like object to open a SQLite3 WAL file on.
66+
checkpoint: The checkpoint to apply from the WAL file. Can be a :class:`Checkpoint` object or an integer index.
67+
68+
Raises:
69+
InvalidDatabase: If the file-like object does not look like a SQLite3 database based on the header magic.
70+
71+
References:
72+
- https://sqlite.org/fileformat2.html
73+
"""
74+
75+
def __init__(
76+
self,
77+
fh: Path | BinaryIO,
78+
wal: WAL | Path | BinaryIO | None = None,
79+
checkpoint: Checkpoint | int | None = None,
80+
):
81+
# Use the provided file handle or try to open the file path.
82+
if hasattr(fh, "read"):
83+
name = getattr(fh, "name", None)
84+
path = Path(name) if name else None
85+
else:
86+
path = fh
87+
fh = path.open("rb")
88+
5989
self.fh = fh
60-
self.wal = WAL(wal_fh) if wal_fh else None
90+
self.path = path
91+
self.wal = None
92+
self.checkpoint = None
6193

62-
self.header = c_sqlite3.header(fh)
94+
self.header = c_sqlite3.header(self.fh)
6395
if self.header.magic != SQLITE3_HEADER_MAGIC:
6496
raise InvalidDatabase("Invalid header magic")
6597

@@ -72,10 +104,31 @@ def __init__(self, fh: BinaryIO, wal_fh: BinaryIO | None = None):
72104
if self.usable_page_size < 480:
73105
raise InvalidDatabase("Usable page size is too small")
74106

107+
if wal:
108+
self.wal = WAL(wal) if not isinstance(wal, WAL) else wal
109+
elif path:
110+
# Check for WAL sidecar next to the DB.
111+
wal_path = path.with_name(f"{path.name}-wal")
112+
if wal_path.exists():
113+
self.wal = WAL(wal_path)
114+
115+
# If a checkpoint index was provided, resolve it to a Checkpoint object.
116+
if self.wal and isinstance(checkpoint, int):
117+
if checkpoint < 0 or checkpoint >= len(self.wal.checkpoints):
118+
raise IndexError("WAL checkpoint index out of range")
119+
self.checkpoint = self.wal.checkpoints[checkpoint]
120+
else:
121+
self.checkpoint = checkpoint
122+
75123
self.page = lru_cache(256)(self.page)
76124

77-
def open_wal(self, fh: BinaryIO) -> None:
78-
self.wal = WAL(fh)
125+
def checkpoints(self) -> Iterator[SQLite3]:
126+
"""Yield instances of the database at all available checkpoints in the WAL file, if applicable."""
127+
if not self.wal:
128+
return
129+
130+
for checkpoint in self.wal.checkpoints:
131+
yield SQLite3(self.fh, self.wal, checkpoint)
79132

80133
def table(self, name: str) -> Table | None:
81134
name = name.lower()
@@ -108,10 +161,33 @@ def indices(self) -> Iterator[Index]:
108161
yield Index(self, *cell.values)
109162

110163
def raw_page(self, num: int) -> bytes:
164+
"""Retrieve the raw frame data for the given page number.
165+
166+
Reads the page from a checkpoint, if this class was initialized with a WAL checkpoint.
167+
168+
If a WAL is available, will first check if the WAL contains a more recent version of the page,
169+
otherwise it will read the page from the database file.
170+
171+
References:
172+
- https://sqlite.org/fileformat2.html#reader_algorithm
173+
"""
111174
# Only throw an out of bounds exception if the header contains a page_count.
112175
# Some old versions of SQLite3 do not set/update the page_count correctly.
113176
if (num < 1 or num > self.header.page_count) and self.header.page_count > 0:
114177
raise InvalidPageNumber("Page number exceeds boundaries")
178+
179+
# If a specific WAL checkpoint was provided, use it instead of the on-disk page.
180+
if self.checkpoint is not None and (frame := self.checkpoint.get(num)):
181+
return frame.data
182+
183+
# Check if the latest valid instance of the page is committed (either the frame itself
184+
# is the commit frame or it is included in a commit's frames). If so, return that frame's data.
185+
if self.wal:
186+
for commit in reversed(self.wal.commits):
187+
if (frame := commit.get(num)) and frame.valid:
188+
return frame.data
189+
190+
# Else we read the page from the database file.
115191
if num == 1: # Page 1 is root
116192
self.fh.seek(len(c_sqlite3.header))
117193
else:
@@ -465,127 +541,6 @@ def values(self) -> list[int | float | str | bytes | None]:
465541
return self._values
466542

467543

468-
class WAL:
469-
def __init__(self, fh: BinaryIO):
470-
self.fh = fh
471-
self.header = c_sqlite3.wal_header(fh)
472-
473-
if self.header.magic not in WAL_HEADER_MAGIC:
474-
raise InvalidDatabase("Invalid header magic")
475-
476-
self.checksum_endian = "<" if self.header.magic == WAL_HEADER_MAGIC_LE else ">"
477-
self._checkpoints = None
478-
479-
self.frame = lru_cache(1024)(self.frame)
480-
481-
def frame(self, frame_idx: int) -> WALFrame:
482-
frame_size = len(c_sqlite3.wal_frame) + self.header.page_size
483-
offset = len(c_sqlite3.wal_header) + frame_idx * frame_size
484-
return WALFrame(self, offset)
485-
486-
def frames(self) -> Iterator[WALFrame]:
487-
frame_idx = 0
488-
while True:
489-
try:
490-
yield self.frame(frame_idx)
491-
frame_idx += 1
492-
except EOFError: # noqa: PERF203
493-
break
494-
495-
def checkpoints(self) -> list[WALCheckpoint]:
496-
if not self._checkpoints:
497-
checkpoints = []
498-
frames = []
499-
500-
for frame in self.frames():
501-
frames.append(frame)
502-
503-
if frame.page_count != 0:
504-
checkpoints.append(WALCheckpoint(self, frames))
505-
frames = []
506-
507-
self._checkpoints = checkpoints
508-
509-
return self._checkpoints
510-
511-
512-
class WALFrame:
513-
def __init__(self, wal: WAL, offset: int):
514-
self.wal = wal
515-
self.offset = offset
516-
517-
self.fh = wal.fh
518-
self._data = None
519-
520-
self.fh.seek(offset)
521-
self.header = c_sqlite3.wal_frame(self.fh)
522-
523-
def __repr__(self) -> str:
524-
return f"<WALFrame page_number={self.page_number} page_count={self.page_count}>"
525-
526-
@property
527-
def valid(self) -> bool:
528-
salt1_match = self.header.salt1 == self.wal.header.salt1
529-
salt2_match = self.header.salt2 == self.wal.header.salt2
530-
531-
return salt1_match and salt2_match
532-
533-
@property
534-
def data(self) -> bytes:
535-
if not self._data:
536-
self.fh.seek(self.offset + len(c_sqlite3.wal_frame))
537-
self._data = self.fh.read(self.wal.header.page_size)
538-
return self._data
539-
540-
@property
541-
def page_number(self) -> int:
542-
return self.header.page_number
543-
544-
@property
545-
def page_count(self) -> int:
546-
return self.header.page_count
547-
548-
549-
class WALCheckpoint:
550-
def __init__(self, wal: WAL, frames: list[WALFrame]):
551-
self.wal = wal
552-
self.frames = frames
553-
self._page_map = None
554-
555-
def __contains__(self, page: int) -> bool:
556-
return page in self.page_map
557-
558-
def __getitem__(self, page: int) -> WALFrame:
559-
return self.page_map[page]
560-
561-
def __repr__(self) -> str:
562-
return f"<WALCheckpoint frames={len(self.frames)}>"
563-
564-
@property
565-
def page_map(self) -> dict[int, WALFrame]:
566-
if not self._page_map:
567-
self._page_map = {frame.page_number: frame for frame in self.frames}
568-
569-
return self._page_map
570-
571-
def get(self, page: int, default: Any = None) -> WALFrame:
572-
return self.page_map.get(page, default)
573-
574-
575-
def wal_checksum(buf: bytes, endian: str = ">") -> tuple[int, int]:
576-
"""For future use, will be used when WAL is fully implemented"""
577-
578-
s0 = s1 = 0
579-
num_ints = len(buf) // 4
580-
arr = struct.unpack(f"{endian}{num_ints}I", buf)
581-
582-
for int_num in range(0, num_ints, 2):
583-
s0 = (s0 + (arr[int_num] + s1)) & 0xFFFFFFFF
584-
s1 = (s1 + (arr[int_num + 1] + s0)) & 0xFFFFFFFF
585-
586-
return s0, s1
587-
588-
589544
def walk_tree(sqlite: SQLite3, page: Page) -> Iterator[Cell]:
590545
if page.header.flags in (
591546
c_sqlite3.PAGE_TYPE_LEAF_TABLE,

0 commit comments

Comments
 (0)