Skip to content

Commit fc99a21

Browse files
committed
chore: URI caching now avoids hash collisions and reduces lock contention
Signed-off-by: Dmitry Dygalo <[email protected]>
1 parent 010ab95 commit fc99a21

File tree

6 files changed

+116
-134
lines changed

6 files changed

+116
-134
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
### Changed
66

7+
- `referencing`: URI caching now avoids hash collisions and reduces lock contention.
78
- Update `fluent-uri` to `0.4.1`.
89
- Bump MSRV to `1.83.0`.
910
- Drop the `Send + Sync` bounds from `Retrieve`/`AsyncRetrieve` on `wasm32`.

crates/jsonschema-referencing/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ futures = { version = "0.3.31", optional = true }
2525
parking_lot = "0.12.3"
2626
percent-encoding = "2.3.1"
2727
serde_json.workspace = true
28+
hashbrown = "0.16"
2829

2930
[dev-dependencies]
3031
benchmark = { path = "../benchmark/" }

crates/jsonschema-referencing/src/cache.rs

Lines changed: 80 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,29 @@
1-
use core::hash::{BuildHasherDefault, Hash, Hasher};
2-
use std::{
3-
collections::{hash_map::Entry, HashMap},
4-
sync::Arc,
5-
};
1+
use std::sync::Arc;
62

7-
use ahash::AHasher;
83
use fluent_uri::Uri;
9-
use parking_lot::RwLock;
4+
use hashbrown::hash_map::{EntryRef, HashMap};
5+
use parking_lot::{RwLock, RwLockUpgradableReadGuard};
106

11-
use crate::{hasher::BuildNoHashHasher, uri, Error};
7+
use crate::{uri, Error};
8+
9+
type CacheBucket = HashMap<String, Arc<Uri<String>>>;
10+
type CacheMap = HashMap<String, CacheBucket>;
1211

1312
#[derive(Debug, Clone)]
1413
pub(crate) struct UriCache {
15-
cache: HashMap<u64, Arc<Uri<String>>, BuildNoHashHasher>,
14+
cache: CacheMap,
1615
}
1716

1817
impl UriCache {
1918
pub(crate) fn new() -> Self {
2019
Self {
21-
cache: HashMap::with_hasher(BuildHasherDefault::default()),
20+
cache: HashMap::new(),
2221
}
2322
}
2423

2524
pub(crate) fn with_capacity(capacity: usize) -> Self {
2625
Self {
27-
cache: HashMap::with_capacity_and_hasher(capacity, BuildHasherDefault::default()),
26+
cache: HashMap::with_capacity(capacity),
2827
}
2928
}
3029

@@ -33,17 +32,31 @@ impl UriCache {
3332
base: &Uri<&str>,
3433
uri: impl AsRef<str>,
3534
) -> Result<Arc<Uri<String>>, Error> {
36-
let mut hasher = AHasher::default();
37-
(base.as_str(), uri.as_ref()).hash(&mut hasher);
38-
let hash = hasher.finish();
39-
40-
Ok(match self.cache.entry(hash) {
41-
Entry::Occupied(entry) => Arc::clone(entry.get()),
42-
Entry::Vacant(entry) => {
43-
let new = Arc::new(uri::resolve_against(base, uri.as_ref())?);
44-
Arc::clone(entry.insert(new))
35+
let base_str = base.as_str();
36+
let reference = uri.as_ref();
37+
38+
let resolved = match self.cache.entry_ref(base_str) {
39+
EntryRef::Occupied(mut entry) => {
40+
if let Some(cached) = entry.get().get(reference) {
41+
return Ok(Arc::clone(cached));
42+
}
43+
44+
let resolved = Arc::new(uri::resolve_against(base, reference)?);
45+
entry
46+
.get_mut()
47+
.insert(reference.to_owned(), Arc::clone(&resolved));
48+
resolved
4549
}
46-
})
50+
EntryRef::Vacant(entry) => {
51+
let resolved = Arc::new(uri::resolve_against(base, reference)?);
52+
let mut inner = HashMap::with_capacity(1);
53+
inner.insert(reference.to_owned(), Arc::clone(&resolved));
54+
entry.insert(inner);
55+
resolved
56+
}
57+
};
58+
59+
Ok(resolved)
4760
}
4861

4962
pub(crate) fn into_shared(self) -> SharedUriCache {
@@ -56,7 +69,7 @@ impl UriCache {
5669
/// A dedicated type for URI resolution caching.
5770
#[derive(Debug)]
5871
pub(crate) struct SharedUriCache {
59-
cache: RwLock<HashMap<u64, Arc<Uri<String>>, BuildNoHashHasher>>,
72+
cache: RwLock<CacheMap>,
6073
}
6174

6275
impl Clone for SharedUriCache {
@@ -66,7 +79,15 @@ impl Clone for SharedUriCache {
6679
self.cache
6780
.read()
6881
.iter()
69-
.map(|(k, v)| (*k, Arc::clone(v)))
82+
.map(|(base, entries)| {
83+
(
84+
base.clone(),
85+
entries
86+
.iter()
87+
.map(|(reference, value)| (reference.clone(), Arc::clone(value)))
88+
.collect(),
89+
)
90+
})
7091
.collect(),
7192
),
7293
}
@@ -79,17 +100,45 @@ impl SharedUriCache {
79100
base: &Uri<&str>,
80101
uri: impl AsRef<str>,
81102
) -> Result<Arc<Uri<String>>, Error> {
82-
let mut hasher = AHasher::default();
83-
(base.as_str(), uri.as_ref()).hash(&mut hasher);
84-
let hash = hasher.finish();
103+
let base_str = base.as_str();
104+
let reference = uri.as_ref();
85105

86-
if let Some(cached) = self.cache.read().get(&hash).cloned() {
87-
return Ok(cached);
106+
if let Some(cached) = self
107+
.cache
108+
.read()
109+
.get(base_str)
110+
.and_then(|inner| inner.get(reference))
111+
{
112+
return Ok(Arc::clone(cached));
88113
}
89114

90-
let new = Arc::new(uri::resolve_against(base, uri.as_ref())?);
91-
self.cache.write().insert(hash, Arc::clone(&new));
92-
Ok(new)
115+
let cache = self.cache.upgradable_read();
116+
if let Some(inner) = cache.get(base_str).and_then(|inner| inner.get(reference)) {
117+
return Ok(Arc::clone(inner));
118+
}
119+
120+
let resolved = Arc::new(uri::resolve_against(base, reference)?);
121+
122+
let mut cache = RwLockUpgradableReadGuard::upgrade(cache);
123+
let inserted = match cache.entry_ref(base_str) {
124+
EntryRef::Occupied(mut entry) => {
125+
if let Some(existing) = entry.get().get(reference) {
126+
return Ok(Arc::clone(existing));
127+
}
128+
entry
129+
.get_mut()
130+
.insert(reference.to_owned(), Arc::clone(&resolved));
131+
resolved
132+
}
133+
EntryRef::Vacant(entry) => {
134+
let mut inner = HashMap::with_capacity(1);
135+
inner.insert(reference.to_owned(), Arc::clone(&resolved));
136+
entry.insert(inner);
137+
resolved
138+
}
139+
};
140+
141+
Ok(inserted)
93142
}
94143

95144
pub(crate) fn into_local(self) -> UriCache {

crates/jsonschema-referencing/src/hasher.rs

Lines changed: 0 additions & 88 deletions
This file was deleted.

crates/jsonschema-referencing/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
mod anchors;
55
mod cache;
66
mod error;
7-
mod hasher;
87
mod list;
98
pub mod meta;
109
mod registry;

crates/jsonschema-referencing/src/registry.rs

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
11
use std::{
2-
collections::{hash_map::Entry, HashSet, VecDeque},
3-
hash::{Hash, Hasher},
2+
collections::{hash_map::Entry, VecDeque},
3+
num::NonZeroUsize,
44
pin::Pin,
55
sync::{Arc, LazyLock},
66
};
77

8-
use ahash::{AHashMap, AHashSet, AHasher};
8+
use ahash::{AHashMap, AHashSet};
99
use fluent_uri::Uri;
1010
use serde_json::Value;
1111

1212
use crate::{
1313
anchors::{AnchorKey, AnchorKeyRef},
1414
cache::{SharedUriCache, UriCache},
15-
hasher::BuildNoHashHasher,
1615
list::List,
1716
meta::{self, metas_for_draft},
1817
resource::{unescape_segment, InnerResourcePtr, JsonSchemaResource},
@@ -616,9 +615,27 @@ fn process_meta_schemas(
616615
Ok(())
617616
}
618617

618+
#[derive(Hash, Eq, PartialEq)]
619+
struct ReferenceKey {
620+
base_ptr: NonZeroUsize,
621+
reference: String,
622+
}
623+
624+
impl ReferenceKey {
625+
fn new(base: &Arc<Uri<String>>, reference: &str) -> Self {
626+
Self {
627+
base_ptr: NonZeroUsize::new(Arc::as_ptr(base) as usize)
628+
.expect("Arc pointer should never be null"),
629+
reference: reference.to_owned(),
630+
}
631+
}
632+
}
633+
634+
type ReferenceTracker = AHashSet<ReferenceKey>;
635+
619636
struct ProcessingState {
620637
queue: VecDeque<(Arc<Uri<String>>, InnerResourcePtr)>,
621-
seen: HashSet<u64, BuildNoHashHasher>,
638+
seen: ReferenceTracker,
622639
external: AHashSet<(String, Uri<String>)>,
623640
scratch: String,
624641
refers_metaschemas: bool,
@@ -628,7 +645,7 @@ impl ProcessingState {
628645
fn new() -> Self {
629646
Self {
630647
queue: VecDeque::with_capacity(32),
631-
seen: HashSet::with_hasher(BuildNoHashHasher::default()),
648+
seen: ReferenceTracker::new(),
632649
external: AHashSet::new(),
633650
scratch: String::new(),
634651
refers_metaschemas: false,
@@ -879,10 +896,10 @@ async fn process_resources_async(
879896
}
880897

881898
fn collect_external_resources(
882-
base: &Uri<String>,
899+
base: &Arc<Uri<String>>,
883900
contents: &Value,
884901
collected: &mut AHashSet<(String, Uri<String>)>,
885-
seen: &mut HashSet<u64, BuildNoHashHasher>,
902+
seen: &mut ReferenceTracker,
886903
resolution_cache: &mut UriCache,
887904
scratch: &mut String,
888905
refers_metaschemas: &mut bool,
@@ -903,10 +920,7 @@ fn collect_external_resources(
903920
*refers_metaschemas = true;
904921
}
905922
} else if $reference != "#" {
906-
let mut hasher = AHasher::default();
907-
(base.as_str(), $reference).hash(&mut hasher);
908-
let hash = hasher.finish();
909-
if seen.insert(hash) {
923+
if mark_reference(seen, base, $reference) {
910924
// Handle local references separately as they may have nested references to external resources
911925
if $reference.starts_with('#') {
912926
if let Some(referenced) =
@@ -924,7 +938,7 @@ fn collect_external_resources(
924938
}
925939
} else {
926940
let resolved = if base.has_fragment() {
927-
let mut base_without_fragment = base.clone();
941+
let mut base_without_fragment = base.as_ref().clone();
928942
base_without_fragment.set_fragment(None);
929943

930944
let (path, fragment) = match $reference.split_once('#') {
@@ -951,7 +965,9 @@ fn collect_external_resources(
951965
}
952966
resolved
953967
} else {
954-
(*resolution_cache.resolve_against(&base.borrow(), $reference)?).clone()
968+
(*resolution_cache
969+
.resolve_against(&base.borrow(), $reference)?)
970+
.clone()
955971
};
956972

957973
collected.insert(($reference.to_string(), resolved));
@@ -986,6 +1002,10 @@ fn collect_external_resources(
9861002
Ok(())
9871003
}
9881004

1005+
fn mark_reference(seen: &mut ReferenceTracker, base: &Arc<Uri<String>>, reference: &str) -> bool {
1006+
seen.insert(ReferenceKey::new(base, reference))
1007+
}
1008+
9891009
/// Look up a value by a JSON Pointer.
9901010
///
9911011
/// **NOTE**: A slightly faster version of pointer resolution based on `Value::pointer` from `serde_json`.

0 commit comments

Comments
 (0)