Skip to content

Commit eb6c745

Browse files
committed
docs: doc comments for remote registry
1 parent 1bc13e9 commit eb6c745

File tree

1 file changed

+88
-21
lines changed

1 file changed

+88
-21
lines changed

src/cargo/sources/registry/remote.rs

Lines changed: 88 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//! Access to a Git index based registry. See [`RemoteRegistry`] for details.
2+
13
use crate::core::{GitReference, PackageId, SourceId};
24
use crate::sources::git;
35
use crate::sources::git::fetch::RemoteKind;
@@ -21,29 +23,73 @@ use std::task::{ready, Poll};
2123
/// A remote registry is a registry that lives at a remote URL (such as
2224
/// crates.io). The git index is cloned locally, and `.crate` files are
2325
/// downloaded as needed and cached locally.
26+
///
27+
/// This type is primarily accessed through the [`RegistryData`] trait.
28+
///
29+
/// See the [module-level documentation](super) for the index format and layout.
30+
///
31+
/// ## History of Git-based index registry
32+
///
33+
/// Using Git to host this index used to be quite efficient. The full index can
34+
/// be stored efficiently locally on disk, and once it is downloaded, all
35+
/// queries of a registry can happen locally and needn't touch the network.
36+
/// Git-based index was a reasonable design choice at the time when HTTP/2
37+
/// was just introduced.
38+
///
39+
/// However, the full index keeps growing as crates.io grows. It becomes
40+
/// relatively big and slows down the first use of Cargo. Git (specifically
41+
/// libgit2) is not efficient at handling huge amounts of small files either.
42+
/// On the other hand, newer protocols like HTTP/2 are prevalent and capable of
43+
/// serving a bunch of tiny files. Today, it is encouraged to use [`HttpRegistry`],
44+
/// which is the default from 1.70.0. That being said, Cargo will continue
45+
/// supporting Git-based index for a pretty long while.
46+
///
47+
/// [`HttpRegistry`]: super::http_remote::HttpRegistry
2448
pub struct RemoteRegistry<'cfg> {
49+
/// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`).
2550
index_path: Filesystem,
26-
/// Path to the cache of `.crate` files (`$CARGO_HOME/registry/path/$REG-HASH`).
51+
/// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`).
2752
cache_path: Filesystem,
53+
/// The unique identifier of this registry source.
2854
source_id: SourceId,
55+
/// This reference is stored so that when a registry needs update, it knows
56+
/// where to fetch from.
2957
index_git_ref: GitReference,
3058
config: &'cfg Config,
59+
/// A Git [tree object] to help this registry find crate metadata from the
60+
/// underlying Git repository.
61+
///
62+
/// This is stored here to prevent Git from repeatedly creating a tree object
63+
/// during each call into `load()`.
64+
///
65+
/// [tree object]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_tree_objects
3166
tree: RefCell<Option<git2::Tree<'static>>>,
67+
/// A Git repository that contains the actual index we want.
3268
repo: LazyCell<git2::Repository>,
69+
/// The current HEAD commit of the underlying Git repository.
3370
head: Cell<Option<git2::Oid>>,
71+
/// This stores the SHA value of the current HEAD commit for convenience.
3472
current_sha: Cell<Option<InternedString>>,
35-
needs_update: bool, // Does this registry need to be updated?
73+
/// Whether this registry needs to update package information.
74+
///
75+
/// See [`RemoteRegistry::mark_updated`] on how to make sure a registry
76+
/// index is updated only once per session.
77+
needs_update: bool,
78+
/// Disables status messages.
3679
quiet: bool,
3780
}
3881

3982
impl<'cfg> RemoteRegistry<'cfg> {
83+
/// Creates a Git-based remote registry for `source_id`.
84+
///
85+
/// * `name` --- Name of a path segment where `.crate` tarballs and the
86+
/// registry index are stored. Expected to be unique.
4087
pub fn new(source_id: SourceId, config: &'cfg Config, name: &str) -> RemoteRegistry<'cfg> {
4188
RemoteRegistry {
4289
index_path: config.registry_index_path().join(name),
4390
cache_path: config.registry_cache_path().join(name),
4491
source_id,
4592
config,
46-
// TODO: we should probably make this configurable
4793
index_git_ref: GitReference::DefaultBranch,
4894
tree: RefCell::new(None),
4995
repo: LazyCell::new(),
@@ -54,17 +100,16 @@ impl<'cfg> RemoteRegistry<'cfg> {
54100
}
55101
}
56102

103+
/// Creates intermediate dirs and initializes the repository.
57104
fn repo(&self) -> CargoResult<&git2::Repository> {
58105
self.repo.try_borrow_with(|| {
59106
let path = self.config.assert_package_cache_locked(&self.index_path);
60107

61-
// Fast path without a lock
62108
if let Ok(repo) = git2::Repository::open(&path) {
63109
trace!("opened a repo without a lock");
64110
return Ok(repo);
65111
}
66112

67-
// Ok, now we need to lock and try the whole thing over again.
68113
trace!("acquiring registry index lock");
69114
match git2::Repository::open(&path) {
70115
Ok(repo) => Ok(repo),
@@ -97,6 +142,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
97142
})
98143
}
99144

145+
/// Get the object ID of the HEAD commit from the underlying Git repository.
100146
fn head(&self) -> CargoResult<git2::Oid> {
101147
if self.head.get().is_none() {
102148
let repo = self.repo()?;
@@ -106,6 +152,8 @@ impl<'cfg> RemoteRegistry<'cfg> {
106152
Ok(self.head.get().unwrap())
107153
}
108154

155+
/// Returns a [`git2::Tree`] object of the current HEAD commit of the
156+
/// underlying Git repository.
109157
fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
110158
{
111159
let tree = self.tree.borrow();
@@ -117,6 +165,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
117165
let commit = repo.find_commit(self.head()?)?;
118166
let tree = commit.tree()?;
119167

168+
// SAFETY:
120169
// Unfortunately in libgit2 the tree objects look like they've got a
121170
// reference to the repository object which means that a tree cannot
122171
// outlive the repository that it came from. Here we want to cache this
@@ -134,6 +183,9 @@ impl<'cfg> RemoteRegistry<'cfg> {
134183
Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
135184
}
136185

186+
/// Gets the current version of the registry index.
187+
///
188+
/// It is usually the SHA of the HEAD commit from the underlying Git repository.
137189
fn current_version(&self) -> Option<InternedString> {
138190
if let Some(sha) = self.current_sha.get() {
139191
return Some(sha);
@@ -143,10 +195,16 @@ impl<'cfg> RemoteRegistry<'cfg> {
143195
Some(sha)
144196
}
145197

198+
/// Whether the registry is up-to-date. See [`Self::mark_updated`] for more.
146199
fn is_updated(&self) -> bool {
147200
self.config.updated_sources().contains(&self.source_id)
148201
}
149202

203+
/// Marks this registry as up-to-date.
204+
///
205+
/// This makes sure the index is only updated once per session since it is
206+
/// an expensive operation. This generally only happens when the resolver
207+
/// is run multiple times, such as during `cargo publish`.
150208
fn mark_updated(&self) {
151209
self.config.updated_sources().insert(self.source_id);
152210
}
@@ -156,7 +214,7 @@ const LAST_UPDATED_FILE: &str = ".last-updated";
156214

157215
impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
158216
fn prepare(&self) -> CargoResult<()> {
159-
self.repo()?; // create intermediate dirs and initialize the repo
217+
self.repo()?;
160218
Ok(())
161219
}
162220

@@ -168,13 +226,20 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
168226
self.config.assert_package_cache_locked(path)
169227
}
170228

171-
// `index_version` Is a string representing the version of the file used to construct the cached copy.
172-
// Older versions of Cargo used the single value of the hash of the HEAD commit as a `index_version`.
173-
// This is technically correct but a little too conservative. If a new commit is fetched all cached
174-
// files need to be regenerated even if a particular file was not changed.
175-
// However if an old cargo has written such a file we still know how to read it, as long as we check for that hash value.
176-
//
177-
// Cargo now uses a hash of the file's contents as provided by git.
229+
/// Read the general concept for `load()` on [`RegistryData::load`].
230+
///
231+
/// `index_version` is a string representing the version of the file used
232+
/// to construct the cached copy.
233+
///
234+
/// Older versions of Cargo used the single value of the hash of the HEAD
235+
/// commit as an `index_version`. This is technically correct but a little
236+
/// too conservative. If a new commit is fetched all cached files need to
237+
/// be regenerated even if a particular file was not changed.
238+
///
239+
/// However if an old cargo has written such a file we still know how to
240+
/// read it, as long as we check for that hash value.
241+
///
242+
/// Cargo now uses a hash of the file's contents as provided by git.
178243
fn load(
179244
&mut self,
180245
_root: &Path,
@@ -187,7 +252,8 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
187252
// Check if the cache is valid.
188253
let git_commit_hash = self.current_version();
189254
if index_version.is_some() && index_version == git_commit_hash.as_deref() {
190-
// This file was written by an old version of cargo, but it is still up-to-date.
255+
// This file was written by an old version of cargo, but it is
256+
// still up-to-date.
191257
return Poll::Ready(Ok(LoadResponse::CacheValid));
192258
}
193259
// Note that the index calls this method and the filesystem is locked
@@ -224,8 +290,8 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
224290
match load_helper(&self, path, index_version) {
225291
Ok(result) => Poll::Ready(Ok(result)),
226292
Err(_) if !self.is_updated() => {
227-
// If git returns an error and we haven't updated the repo, return
228-
// pending to allow an update to try again.
293+
// If git returns an error and we haven't updated the repo,
294+
// return pending to allow an update to try again.
229295
self.needs_update = true;
230296
Poll::Pending
231297
}
@@ -265,9 +331,6 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
265331

266332
self.needs_update = false;
267333

268-
// Make sure the index is only updated once per session since it is an
269-
// expensive operation. This generally only happens when the resolver
270-
// is run multiple times, such as during `cargo publish`.
271334
if self.is_updated() {
272335
return Ok(());
273336
}
@@ -321,8 +384,11 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
321384
Ok(())
322385
}
323386

387+
/// Read the general concept for `invalidate_cache()` on
388+
/// [`RegistryData::invalidate_cache`].
389+
///
390+
/// To fully invalidate, undo [`RemoteRegistry::mark_updated`]'s work.
324391
fn invalidate_cache(&mut self) {
325-
// To fully invalidate, undo `mark_updated`s work
326392
self.needs_update = true;
327393
}
328394

@@ -365,9 +431,10 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
365431
}
366432
}
367433

434+
/// Implemented to just be sure to drop `tree` field before our other fields.
435+
/// See SAFETY inside [`RemoteRegistry::tree()`] for more.
368436
impl<'cfg> Drop for RemoteRegistry<'cfg> {
369437
fn drop(&mut self) {
370-
// Just be sure to drop this before our other fields
371438
self.tree.borrow_mut().take();
372439
}
373440
}

0 commit comments

Comments
 (0)