Skip to content

Commit 7a88b42

Browse files
committed
refactor
- update `gix-fsck` crate status with all obvious features of `git-fsck` - clarify that fsck is only partial just to be sure people won't use it in place of `git fsck` - if BufWriter's are truly needed, let's expect it as type to avoid double-buffering unnecessarily. - consistent naming of the changelog markdown fail (all caps) - set version to 0.1.0 for fsck crate as 0.0 is used only for name-keeping when there is no functionality. - general cleanup - avoid allocating for each tree (but it's still recursive)
1 parent 8f795e8 commit 7a88b42

File tree

12 files changed

+156
-124
lines changed

12 files changed

+156
-124
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crate-status.md

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -776,17 +776,21 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
776776
* [x] [validate][tagname-validation] tag names
777777

778778
### gix-fsck
779-
* [x] validate connectivity when provided a specific commit as a starting point
780-
* [ ] validate connectivity of all `refs` in the index
779+
* [x] validate connectivity and find missing objects starting from…
780+
- [x] commits
781+
- [ ] tags
782+
- [ ] tree-cache in the `index` or any entry within
783+
* [ ] validate object hashes during connectivity traversal
781784
* [ ] progress reporting and interruptability
782-
* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit)
783-
* [ ] add support for various options
784-
* [ ] write dangling objects to the `.git/log-found` directory structure
785-
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
786-
* [ ] consider reflog entries as reference nodes/heads
787-
* [ ] when reporting reachable objects, include _how_ they are reachable
788-
* [ ] which reference node(s) made them reachable
789-
* [ ] parent commit(s)
785+
* [ ] skipList to exclude objects which are known to be broken
786+
* [ ] validate blob hashes (connectivity check
787+
* [ ] identify objects that exist but are not reachable (i.e. what remains after a full graph traversal from all valid starting points)
788+
* [ ] write dangling objects to the `.git/log-found` directory structure
789+
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
790+
* [ ] consider reflog entries from `ref` starting points
791+
* [ ] when reporting reachable objects, provide the path through which they are reachable, i.e. ref-log@{3} -> commit -> tree -> path-in-tree
792+
* [ ] limit search to ODB without alternates (default is equivalent to `git fsck --full` due to ODB implementation)
793+
* [ ] all individual [checks available in `git fsck`](https://git-scm.com/docs/git-fsck#_fsck_messages) (*too many to print here*)
790794

791795
### gix-ref
792796
* [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.

etc/check-package-size.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ function indent () {
1515
}
1616

1717
echo "in root: gitoxide CLI"
18+
(enter gix-fsck && indent cargo diet -n --package-size-limit 10KB)
1819
(enter gix-actor && indent cargo diet -n --package-size-limit 10KB)
1920
(enter gix-archive && indent cargo diet -n --package-size-limit 10KB)
2021
(enter gix-worktree-stream && indent cargo diet -n --package-size-limit 40KB)

gitoxide-core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
4949
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
5050
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
5151
gix-status = { version = "^0.2.0", path = "../gix-status" }
52-
gix-fsck = { version = "^0.0.0", path = "../gix-fsck" }
52+
gix-fsck = { version = "^0.1.0", path = "../gix-fsck" }
5353
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
5454
anyhow = "1.0.42"
5555
thiserror = "1.0.34"

gitoxide-core/src/repository/fsck.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
use std::io::{BufWriter, Write};
2-
31
use anyhow::Context;
42
use gix::{objs::Kind, ObjectId};
53

6-
pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> {
7-
let mut out = BufWriter::with_capacity(64 * 1024, out);
4+
pub fn connectivity(
5+
mut repo: gix::Repository,
6+
spec: Option<String>,
7+
mut out: impl std::io::Write,
8+
) -> anyhow::Result<()> {
89
let spec = spec.unwrap_or("HEAD".into());
910

1011
repo.object_cache_size_if_unset(4 * 1024 * 1024);
@@ -22,19 +23,18 @@ pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl s
2223
.ancestors()
2324
.all()?;
2425

25-
let missing_cb = |oid: &ObjectId, kind: Kind| {
26+
let on_missing = |oid: &ObjectId, kind: Kind| {
2627
writeln!(out, "{oid}: {kind}").expect("failed to write output");
2728
};
28-
let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb);
2929

30+
let mut check = gix_fsck::Connectivity::new(&repo.objects, on_missing);
3031
// Walk all commits, checking each one for connectivity
3132
for commit in commits {
3233
let commit = commit?;
33-
conn.check_commit(&commit.id);
34-
for parent in commit.parent_ids {
35-
conn.check_commit(&parent);
36-
}
34+
check.check_commit(&commit.id);
35+
// Note that we leave parent-iteration to the commits iterator, as it will
36+
// correctly handle shallow repositories which are expected to have the commits
37+
// along the shallow boundary missing.
3738
}
38-
3939
Ok(())
4040
}
File renamed without changes.

gix-fsck/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
[package]
22
name = "gix-fsck"
3-
version = "0.0.0"
3+
version = "0.1.0"
44
repository = "https://github.com/Byron/gitoxide"
55
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
66
license = "MIT OR Apache-2.0"
77
description = "Verifies the connectivity and validity of objects in the database"
88
edition = "2021"
9-
include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"]
9+
include = ["src/**/*", "LICENSE-*"]
1010
rust-version = "1.65"
1111

1212
[lib]

gix-fsck/src/lib.rs

Lines changed: 112 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,161 @@
11
//! A library for performing object database integrity and connectivity checks
2-
#![deny(rust_2018_idioms)]
2+
#![deny(rust_2018_idioms, unsafe_code, missing_docs)]
33

44
use gix_hash::ObjectId;
55
use gix_hashtable::HashSet;
66
use gix_object::{tree::EntryMode, Exists, FindExt, Kind};
7+
use std::cell::RefCell;
8+
use std::ops::{Deref, DerefMut};
79

8-
pub struct ConnectivityCheck<'a, T, F>
10+
/// Perform a connectivity check.
11+
pub struct Connectivity<T, F>
912
where
1013
T: FindExt + Exists,
1114
F: FnMut(&ObjectId, Kind),
1215
{
1316
/// ODB handle to use for the check
14-
db: &'a T,
17+
db: T,
1518
/// Closure to invoke when a missing object is encountered
1619
missing_cb: F,
1720
/// Set of Object IDs already (or about to be) scanned during the check
1821
oid_set: HashSet,
19-
/// Single buffer for decoding objects from the ODB
20-
/// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements)
21-
buf: Vec<u8>,
22+
/// A free-list of buffers for recursive tree decoding.
23+
free_list: FreeList,
2224
}
2325

24-
impl<'a, T, F> ConnectivityCheck<'a, T, F>
26+
impl<T, F> Connectivity<T, F>
2527
where
2628
T: FindExt + Exists,
2729
F: FnMut(&ObjectId, Kind),
2830
{
29-
/// Instantiate a connectivity check
30-
pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> {
31-
ConnectivityCheck {
31+
/// Instantiate a connectivity check.
32+
pub fn new(db: T, missing_cb: F) -> Connectivity<T, F> {
33+
Connectivity {
3234
db,
3335
missing_cb,
3436
oid_set: HashSet::default(),
35-
buf: Vec::new(),
37+
free_list: Default::default(),
3638
}
3739
}
3840

39-
/// Run the connectivity check on the provided commit object ID
40-
/// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB
41-
/// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped
42-
/// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`
43-
/// - Missing commits or trees will currently result in panic
41+
/// Run the connectivity check on the provided commit `oid`.
42+
///
43+
/// ### Algorithm
44+
///
45+
/// Walk the trees and blobs referenced by the commit and verify they exist in the ODB.
46+
/// Any objects previously encountered by this instance will be skipped silently.
47+
/// Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`.
48+
/// Missing commits or trees will cause an error to be returned.
4449
/// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
45-
pub fn check_commit(&mut self, oid: &ObjectId) {
50+
pub fn check_commit(&mut self, oid: &ObjectId) -> Result<(), gix_object::find::existing_object::Error> {
4651
// Attempt to insert the commit ID in the set, and if already present, return immediately
4752
if !self.oid_set.insert(*oid) {
48-
return;
53+
return Ok(());
4954
}
5055
// Obtain the commit's tree ID
5156
let tree_id = {
52-
let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit");
57+
let mut buf = self.free_list.buf();
58+
let commit = self.db.find_commit(oid, &mut buf)?;
5359
commit.tree()
5460
};
5561

56-
// Attempt to insert the tree ID in the set, and if already present, return immediately
5762
if self.oid_set.insert(tree_id) {
58-
self.check_tree(&tree_id);
63+
check_tree(
64+
&tree_id,
65+
&self.db,
66+
&mut self.free_list,
67+
&mut self.missing_cb,
68+
&mut self.oid_set,
69+
);
5970
}
71+
72+
Ok(())
6073
}
74+
}
6175

62-
fn check_tree(&mut self, oid: &ObjectId) {
63-
let tree = match self.db.find_tree(oid, &mut self.buf) {
64-
Ok(tree) => tree,
65-
Err(_) => {
66-
// Tree is missing, so invoke `missing_cb`
67-
(self.missing_cb)(oid, Kind::Tree);
68-
return;
69-
}
70-
};
76+
#[derive(Default)]
77+
struct FreeList(RefCell<Vec<Vec<u8>>>);
7178

72-
// Keeping separate sets for trees and blobs for now...
73-
// This is about a wash when compared to using a HashMap<ObjectID, Kind>
74-
struct TreeEntries {
75-
trees: HashSet<ObjectId>,
76-
blobs: HashSet<ObjectId>,
77-
}
79+
impl FreeList {
80+
fn buf(&self) -> ReturnToFreeListOnDrop<'_> {
81+
let buf = self.0.borrow_mut().pop().unwrap_or_default();
82+
ReturnToFreeListOnDrop { buf, list: &self.0 }
83+
}
84+
}
7885

79-
// Build up a set of trees and a set of blobs
80-
let entries: TreeEntries = {
81-
let mut entries = TreeEntries {
82-
trees: HashSet::default(),
83-
blobs: HashSet::default(),
84-
};
85-
86-
// For each entry in the tree
87-
for entry_ref in tree.entries.iter() {
88-
match entry_ref.mode {
89-
EntryMode::Tree => {
90-
let tree_id = entry_ref.oid.to_owned();
91-
// Check if the tree has already been encountered
92-
if self.oid_set.insert(tree_id) {
93-
entries.trees.insert(tree_id);
94-
}
95-
}
96-
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
97-
let blob_id = entry_ref.oid.to_owned();
98-
// Check if the blob has already been encountered
99-
if self.oid_set.insert(blob_id) {
100-
entries.blobs.insert(blob_id);
101-
}
102-
}
103-
EntryMode::Commit => {
104-
// This implies a submodule (OID is the commit hash of the submodule)
105-
// Skip it as it's not in this repository!
106-
}
107-
}
108-
}
109-
entries
110-
};
86+
struct ReturnToFreeListOnDrop<'a> {
87+
list: &'a RefCell<Vec<Vec<u8>>>,
88+
buf: Vec<u8>,
89+
}
11190

112-
for tree_id in entries.trees.iter() {
113-
self.check_tree(tree_id);
114-
}
115-
for blob_id in entries.blobs.iter() {
116-
self.check_blob(blob_id);
91+
impl Drop for ReturnToFreeListOnDrop<'_> {
92+
fn drop(&mut self) {
93+
if !self.buf.is_empty() {
94+
self.list.borrow_mut().push(std::mem::take(&mut self.buf));
11795
}
11896
}
97+
}
98+
99+
impl Deref for ReturnToFreeListOnDrop<'_> {
100+
type Target = Vec<u8>;
101+
102+
fn deref(&self) -> &Self::Target {
103+
&self.buf
104+
}
105+
}
106+
107+
impl DerefMut for ReturnToFreeListOnDrop<'_> {
108+
fn deref_mut(&mut self) -> &mut Self::Target {
109+
&mut self.buf
110+
}
111+
}
112+
113+
fn check_blob<F>(db: impl Exists, oid: &ObjectId, mut missing_cb: F)
114+
where
115+
F: FnMut(&ObjectId, Kind),
116+
{
117+
// Check if the blob is missing from the ODB
118+
if !db.exists(oid) {
119+
// Blob is missing, so invoke `missing_cb`
120+
missing_cb(oid, Kind::Blob);
121+
}
122+
}
119123

120-
fn check_blob(&mut self, oid: &ObjectId) {
121-
// Check if the blob is missing from the ODB
122-
if !self.db.exists(oid) {
123-
// Blob is missing, so invoke `missing_cb`
124-
(self.missing_cb)(oid, Kind::Blob);
124+
fn check_tree<F>(
125+
oid: &ObjectId,
126+
db: &(impl FindExt + Exists),
127+
list: &FreeList,
128+
missing_cb: &mut F,
129+
oid_set: &mut HashSet,
130+
) where
131+
F: FnMut(&ObjectId, Kind),
132+
{
133+
let mut buf = list.buf();
134+
let Ok(tree) = db.find_tree(oid, &mut buf) else {
135+
missing_cb(oid, Kind::Tree);
136+
return;
137+
};
138+
139+
// Build up a set of trees and a set of blobs
140+
// For each entry in the tree
141+
for entry_ref in tree.entries.iter() {
142+
match entry_ref.mode {
143+
EntryMode::Tree => {
144+
let tree_id = entry_ref.oid.to_owned();
145+
if oid_set.insert(tree_id) {
146+
check_tree(&tree_id, &*db, list, &mut *missing_cb, oid_set);
147+
}
148+
}
149+
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
150+
let blob_id = entry_ref.oid.to_owned();
151+
if oid_set.insert(blob_id) {
152+
check_blob(&*db, &blob_id, &mut *missing_cb);
153+
}
154+
}
155+
EntryMode::Commit => {
156+
// This implies a submodule (OID is the commit hash of the submodule)
157+
// Skip it as it's not in this repository!
158+
}
125159
}
126160
}
127161
}

0 commit comments

Comments
 (0)