Skip to content

Commit 051a616

Browse files
committed
fix: Local $ref resolution within fragment-extracted external resources
Signed-off-by: Dmitry Dygalo <dmitry@dygalo.dev>
1 parent b4ec427 commit 051a616

File tree

4 files changed

+268
-42
lines changed

4 files changed

+268
-42
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## [Unreleased]
44

5+
### Fixed
6+
7+
- Local `$ref` resolution within fragment-extracted external resources. [#892](https://github.com/Stranger6667/jsonschema/issues/892)
8+
59
## [0.37.3] - 2025-11-28
610

711
### Fixed

crates/jsonschema-py/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## [Unreleased]
44

5+
### Fixed
6+
7+
- Local `$ref` resolution within fragment-extracted external resources. [#892](https://github.com/Stranger6667/jsonschema/issues/892)
8+
59
## [0.37.3] - 2025-11-28
610

711
### Fixed

crates/jsonschema-referencing/src/registry.rs

Lines changed: 181 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::{
66
};
77

88
use ahash::{AHashMap, AHashSet};
9-
use fluent_uri::Uri;
9+
use fluent_uri::{pct_enc::EStr, Uri};
1010
use serde_json::Value;
1111

1212
use crate::{
@@ -653,13 +653,22 @@ enum ReferenceKind {
653653
Schema,
654654
}
655655

656+
/// An entry in the processing queue.
657+
/// The optional third element is the document root URI, used when the resource
658+
/// was extracted from a fragment of a larger document. Local `$ref`s need to be
659+
/// resolved against the document root, not just the fragment content.
660+
type QueueEntry = (Arc<Uri<String>>, InnerResourcePtr, Option<Arc<Uri<String>>>);
661+
656662
struct ProcessingState {
657-
queue: VecDeque<(Arc<Uri<String>>, InnerResourcePtr)>,
663+
queue: VecDeque<QueueEntry>,
658664
seen: ReferenceTracker,
659665
external: AHashSet<(String, Uri<String>, ReferenceKind)>,
660666
scratch: String,
661667
refers_metaschemas: bool,
662668
custom_metaschemas: Vec<Arc<Uri<String>>>,
669+
/// Tracks schema pointers we've visited during recursive external resource collection.
670+
/// This prevents infinite recursion when schemas reference each other.
671+
visited_schemas: AHashSet<usize>,
663672
}
664673

665674
impl ProcessingState {
@@ -671,6 +680,7 @@ impl ProcessingState {
671680
scratch: String::new(),
672681
refers_metaschemas: false,
673682
custom_metaschemas: Vec::new(),
683+
visited_schemas: AHashSet::new(),
674684
}
675685
}
676686
}
@@ -697,7 +707,7 @@ fn process_input_resources(
697707
state.custom_metaschemas.push(Arc::clone(&key));
698708
}
699709

700-
state.queue.push_back((key, resource));
710+
state.queue.push_back((key, resource, None));
701711
entry.insert(wrapped_value);
702712
}
703713
}
@@ -711,31 +721,52 @@ fn process_queue(
711721
anchors: &mut AHashMap<AnchorKey, Anchor>,
712722
resolution_cache: &mut UriCache,
713723
) -> Result<(), Error> {
714-
while let Some((mut base, resource)) = state.queue.pop_front() {
724+
while let Some((mut base, resource, document_root_uri)) = state.queue.pop_front() {
715725
if let Some(id) = resource.id() {
716-
base = resolution_cache.resolve_against(&base.borrow(), id)?;
726+
base = resolve_id(&base, id, resolution_cache)?;
717727
resources.insert(base.clone(), resource.clone());
718728
}
719729

720730
for anchor in resource.anchors() {
721731
anchors.insert(AnchorKey::new(base.clone(), anchor.name()), anchor);
722732
}
723733

724-
collect_external_resources(
725-
&base,
726-
resource.contents(),
727-
resource.contents(),
728-
&mut state.external,
729-
&mut state.seen,
730-
resolution_cache,
731-
&mut state.scratch,
732-
&mut state.refers_metaschemas,
733-
resource.draft(),
734-
)?;
734+
// Determine the document root for resolving local $refs.
735+
// If document_root_uri is set (e.g., for fragment-extracted resources),
736+
// look up the full document. Otherwise, this resource IS the document root.
737+
let root = document_root_uri
738+
.as_ref()
739+
.and_then(|uri| resources.get(uri))
740+
.map_or_else(|| resource.contents(), InnerResourcePtr::contents);
741+
742+
// Skip if already visited during local $ref resolution
743+
let contents_ptr = std::ptr::from_ref::<Value>(resource.contents()) as usize;
744+
if state.visited_schemas.insert(contents_ptr) {
745+
collect_external_resources(
746+
&base,
747+
root,
748+
resource.contents(),
749+
&mut state.external,
750+
&mut state.seen,
751+
resolution_cache,
752+
&mut state.scratch,
753+
&mut state.refers_metaschemas,
754+
resource.draft(),
755+
&mut state.visited_schemas,
756+
)?;
757+
}
735758

759+
// Subresources inherit the document root URI, or use the current base if none set
760+
let subresource_root_uri = document_root_uri.or_else(|| Some(base.clone()));
736761
for contents in resource.draft().subresources_of(resource.contents()) {
737-
let subresource = InnerResourcePtr::new(contents, resource.draft());
738-
state.queue.push_back((base.clone(), subresource));
762+
// Skip subresources already visited during local $ref resolution
763+
let sub_ptr = std::ptr::from_ref::<Value>(contents) as usize;
764+
if !state.visited_schemas.contains(&sub_ptr) {
765+
let subresource = InnerResourcePtr::new(contents, resource.draft());
766+
state
767+
.queue
768+
.push_back((base.clone(), subresource, subresource_root_uri.clone()));
769+
}
739770
}
740771
}
741772
Ok(())
@@ -746,14 +777,15 @@ fn handle_fragment(
746777
resource: &InnerResourcePtr,
747778
key: &Arc<Uri<String>>,
748779
default_draft: Draft,
749-
queue: &mut VecDeque<(Arc<Uri<String>>, InnerResourcePtr)>,
780+
queue: &mut VecDeque<QueueEntry>,
781+
document_root_uri: Arc<Uri<String>>,
750782
) {
751783
if let Some(fragment) = uri.fragment() {
752784
if let Some(resolved) = pointer(resource.contents(), fragment.as_str()) {
753785
let draft = default_draft.detect(resolved);
754786
let contents = std::ptr::addr_of!(*resolved);
755787
let resource = InnerResourcePtr::new(contents, draft);
756-
queue.push_back((Arc::clone(key), resource));
788+
queue.push_back((Arc::clone(key), resource, Some(document_root_uri)));
757789
}
758790
}
759791
}
@@ -841,8 +873,15 @@ fn process_resources(
841873
resources,
842874
&mut state.custom_metaschemas,
843875
);
844-
handle_fragment(&uri, &resource, &key, default_draft, &mut state.queue);
845-
state.queue.push_back((key, resource));
876+
handle_fragment(
877+
&uri,
878+
&resource,
879+
&key,
880+
default_draft,
881+
&mut state.queue,
882+
Arc::clone(&key),
883+
);
884+
state.queue.push_back((key, resource, None));
846885
}
847886
}
848887
}
@@ -911,8 +950,15 @@ async fn process_resources_async(
911950
resources,
912951
&mut state.custom_metaschemas,
913952
);
914-
handle_fragment(uri, &resource, &key, default_draft, &mut state.queue);
915-
state.queue.push_back((key, resource));
953+
handle_fragment(
954+
uri,
955+
&resource,
956+
&key,
957+
default_draft,
958+
&mut state.queue,
959+
Arc::clone(&key),
960+
);
961+
state.queue.push_back((key, resource, None));
916962
}
917963
}
918964
}
@@ -992,6 +1038,7 @@ fn collect_external_resources(
9921038
scratch: &mut String,
9931039
refers_metaschemas: &mut bool,
9941040
draft: Draft,
1041+
visited: &mut AHashSet<usize>,
9951042
) -> Result<(), Error> {
9961043
// URN schemes are not supported for external resolution
9971044
if base.scheme().as_str() == "urn" {
@@ -1013,13 +1060,18 @@ fn collect_external_resources(
10131060
// Handle local references separately as they may have nested references to external resources
10141061
if $reference.starts_with('#') {
10151062
// Use the root document for pointer resolution since local refs are always
1016-
// relative to the document root, not the current subschema
1017-
if let Some(referenced) =
1018-
pointer(root, $reference.trim_start_matches('#'))
1019-
{
1063+
// relative to the document root, not the current subschema.
1064+
// Also track $id changes along the path to get the correct base URI.
1065+
if let Some((referenced, resolved_base)) = pointer_with_base(
1066+
root,
1067+
$reference.trim_start_matches('#'),
1068+
base,
1069+
resolution_cache,
1070+
draft,
1071+
)? {
10201072
// Recursively collect from the referenced schema and all its subresources
10211073
collect_external_resources_recursive(
1022-
base,
1074+
&resolved_base,
10231075
root,
10241076
referenced,
10251077
collected,
@@ -1028,6 +1080,7 @@ fn collect_external_resources(
10281080
scratch,
10291081
refers_metaschemas,
10301082
draft,
1083+
visited,
10311084
)?;
10321085
}
10331086
} else {
@@ -1102,6 +1155,9 @@ fn collect_external_resources(
11021155
}
11031156

11041157
/// Recursively collect external resources from a schema and all its subresources.
1158+
///
1159+
/// The `visited` set tracks schema pointers we've already processed to avoid infinite
1160+
/// recursion when schemas reference each other (directly or through subresources).
11051161
fn collect_external_resources_recursive(
11061162
base: &Arc<Uri<String>>,
11071163
root: &Value,
@@ -1112,10 +1168,22 @@ fn collect_external_resources_recursive(
11121168
scratch: &mut String,
11131169
refers_metaschemas: &mut bool,
11141170
draft: Draft,
1171+
visited: &mut AHashSet<usize>,
11151172
) -> Result<(), Error> {
1173+
// Track by pointer address to avoid processing the same schema twice
1174+
let ptr = std::ptr::from_ref::<Value>(contents) as usize;
1175+
if !visited.insert(ptr) {
1176+
return Ok(());
1177+
}
1178+
1179+
let current_base = match draft.id_of(contents) {
1180+
Some(id) => resolve_id(base, id, resolution_cache)?,
1181+
None => Arc::clone(base),
1182+
};
1183+
11161184
// First, collect from the current schema
11171185
collect_external_resources(
1118-
base,
1186+
&current_base,
11191187
root,
11201188
contents,
11211189
collected,
@@ -1124,21 +1192,26 @@ fn collect_external_resources_recursive(
11241192
scratch,
11251193
refers_metaschemas,
11261194
draft,
1195+
visited,
11271196
)?;
11281197

1129-
// Then recursively process all subresources
1198+
// Then recursively process all subresources (skip already-visited ones early)
11301199
for subresource in draft.subresources_of(contents) {
1131-
collect_external_resources_recursive(
1132-
base,
1133-
root,
1134-
subresource,
1135-
collected,
1136-
seen,
1137-
resolution_cache,
1138-
scratch,
1139-
refers_metaschemas,
1140-
draft,
1141-
)?;
1200+
let sub_ptr = std::ptr::from_ref::<Value>(subresource) as usize;
1201+
if !visited.contains(&sub_ptr) {
1202+
collect_external_resources_recursive(
1203+
&current_base,
1204+
root,
1205+
subresource,
1206+
collected,
1207+
seen,
1208+
resolution_cache,
1209+
scratch,
1210+
refers_metaschemas,
1211+
draft,
1212+
visited,
1213+
)?;
1214+
}
11421215
}
11431216
Ok(())
11441217
}
@@ -1147,6 +1220,25 @@ fn mark_reference(seen: &mut ReferenceTracker, base: &Arc<Uri<String>>, referenc
11471220
seen.insert(ReferenceKey::new(base, reference))
11481221
}
11491222

1223+
/// Resolve an `$id` against a base URI, handling anchor-style IDs and empty fragments.
1224+
///
1225+
/// Anchor-style `$id` values (starting with `#`) don't change the base URI.
1226+
/// Empty fragments are stripped from the resolved URI.
1227+
fn resolve_id(
1228+
base: &Arc<Uri<String>>,
1229+
id: &str,
1230+
resolution_cache: &mut UriCache,
1231+
) -> Result<Arc<Uri<String>>, Error> {
1232+
if id.starts_with('#') {
1233+
return Ok(Arc::clone(base));
1234+
}
1235+
let mut resolved = (*resolution_cache.resolve_against(&base.borrow(), id)?).clone();
1236+
if resolved.fragment().is_some_and(EStr::is_empty) {
1237+
resolved.set_fragment(None);
1238+
}
1239+
Ok(Arc::new(resolved))
1240+
}
1241+
11501242
/// Look up a value by a JSON Pointer.
11511243
///
11521244
/// **NOTE**: A slightly faster version of pointer resolution based on `Value::pointer` from `serde_json`.
@@ -1167,6 +1259,53 @@ pub fn pointer<'a>(document: &'a Value, pointer: &str) -> Option<&'a Value> {
11671259
)
11681260
}
11691261

1262+
/// Look up a value by a JSON Pointer, tracking `$id` changes along the path.
1263+
///
1264+
/// Returns both the resolved value and the accumulated base URI after processing
1265+
/// any `$id` declarations encountered along the path. Note that anchor-style `$id`
1266+
/// values (starting with `#`) don't change the base URI.
1267+
#[allow(clippy::type_complexity)]
1268+
fn pointer_with_base<'a>(
1269+
document: &'a Value,
1270+
pointer: &str,
1271+
base: &Arc<Uri<String>>,
1272+
resolution_cache: &mut UriCache,
1273+
draft: Draft,
1274+
) -> Result<Option<(&'a Value, Arc<Uri<String>>)>, Error> {
1275+
if pointer.is_empty() {
1276+
return Ok(Some((document, Arc::clone(base))));
1277+
}
1278+
if !pointer.starts_with('/') {
1279+
return Ok(None);
1280+
}
1281+
1282+
let mut current = document;
1283+
let mut current_base = Arc::clone(base);
1284+
1285+
for token in pointer.split('/').skip(1).map(unescape_segment) {
1286+
// Check for $id in the current value before traversing deeper
1287+
if let Some(id) = draft.id_of(current) {
1288+
current_base = resolve_id(&current_base, id, resolution_cache)?;
1289+
}
1290+
1291+
current = match current {
1292+
Value::Object(map) => match map.get(&*token) {
1293+
Some(v) => v,
1294+
None => return Ok(None),
1295+
},
1296+
Value::Array(list) => match parse_index(&token).and_then(|x| list.get(x)) {
1297+
Some(v) => v,
1298+
None => return Ok(None),
1299+
},
1300+
_ => return Ok(None),
1301+
};
1302+
}
1303+
1304+
// Note: We don't check $id in the final value here because
1305+
// `collect_external_resources_recursive` will handle it
1306+
Ok(Some((current, current_base)))
1307+
}
1308+
11701309
// Taken from `serde_json`.
11711310
#[must_use]
11721311
pub fn parse_index(s: &str) -> Option<usize> {

0 commit comments

Comments
 (0)