Add near-miss suggestions for unresolved symbol link error messages (#420)

d-ronnqvist · web-flow · commit 150eb7d295b6 · 2022-12-13T13:01:00.000-08:00
* Add a type to compute consecutive sequence lengths

* Add a type to find "near-misses" for diagnostics

* Identify near misses for unresolved symbol link error messages

* Update comments about adding near-misses with current status.

* Silence a deprecation warning in one of the unit tests

* Update error message to reflect that suggestions can be non-symbols.

* Add missing variable binding in condition for Swift < 5.7 compatibility

* Small scoring adjustments for best-match ranking

* Add link to documentation for `CollectionDifference` enumeration order

* Simplify the implementations of segment insertions and removals

* Update some terminology in implementation comments

* Update previous FIXME comments to describe the follow up work

* Assert that insert segments aren't followed by other insert segments.

Also, lower one `precondition` to `assert`.

* Make micro optimization in near-miss score calculation
diff --git a/Sources/SwiftDocC/Infrastructure/Link Resolution/PathHierarchy.swift b/Sources/SwiftDocC/Infrastructure/Link Resolution/PathHierarchy.swift
@@ -829,8 +829,17 @@ extension PathHierarchy.Error {
     func errorMessage(context: DocumentationContext) -> String {
         switch self {
         case .partialResult(let partialResult, let remaining, let available):
-            return "Reference at \(partialResult.pathWithoutDisambiguation().singleQuoted) can't resolve \(remaining.singleQuoted). Available children: \(available.joined(separator: ", "))."
-            
+            let nearMisses = NearMiss.bestMatches(for: available, against: remaining)
+            let suggestion: String
+            switch nearMisses.count {
+            case 0:
+                suggestion = "No similar pages. Available children: \(available.joined(separator: ", "))."
+            case 1:
+                suggestion = "Did you mean: \(nearMisses[0])?"
+            default:
+                suggestion = "Did you mean one of: \(nearMisses.joined(separator: ", "))?"
+            }
+            return "Reference at \(partialResult.pathWithoutDisambiguation().singleQuoted) can't resolve \(remaining.singleQuoted). \(suggestion)"
         case .notFound, .unfindableMatch:
             return "No local documentation matches this reference."
             
diff --git a/Sources/SwiftDocC/Semantics/MarkupReferenceResolver.swift b/Sources/SwiftDocC/Semantics/MarkupReferenceResolver.swift
@@ -70,7 +70,7 @@ struct MarkupReferenceResolver: MarkupRewriter {
                 return nil
             }
             
-            // FIXME: Provide near-miss suggestion here. The user is likely to make mistakes with capitalization because of character input (rdar://59660520).
+            // FIXME: Structure the `PathHierarchyBasedLinkResolver` near-miss suggestions as fixits. https://github.com/apple/swift-docc/issues/438 (rdar://103279313)
             let uncuratedArticleMatch = context.uncuratedArticles[bundle.articlesDocumentationRootReference.appendingPathOfReference(unresolved)]?.source
             problems.append(unresolvedReferenceProblem(reference: reference, source: source, range: range, severity: severity, uncuratedArticleMatch: uncuratedArticleMatch, underlyingErrorMessage: errorMessage))
             return nil
diff --git a/Sources/SwiftDocC/Semantics/ReferenceResolver.swift b/Sources/SwiftDocC/Semantics/ReferenceResolver.swift
@@ -84,7 +84,7 @@ struct ReferenceResolver: SemanticVisitor {
             return .success(resolved)
             
         case let .failure(unresolved, errorMessage):
-            // FIXME: Provide near-miss suggestion here. The user is likely to make mistakes with capitalization because of character input.
+            // FIXME: Structure the `PathHierarchyBasedLinkResolver` near-miss suggestions as fixits. https://github.com/apple/swift-docc/issues/438 (rdar://103279313)
             let uncuratedArticleMatch = context.uncuratedArticles[bundle.documentationRootReference.appendingPathOfReference(unresolved)]?.source
             problems.append(unresolvedReferenceProblem(reference: reference, source: source, range: range, severity: severity, uncuratedArticleMatch: uncuratedArticleMatch, underlyingErrorMessage: errorMessage))
             return .failure(unresolved, errorMessage: errorMessage)
diff --git a/Sources/SwiftDocC/Utility/CollectionChanges.swift b/Sources/SwiftDocC/Utility/CollectionChanges.swift
@@ -0,0 +1,214 @@
+/*
+ This source file is part of the Swift.org open source project
+
+ Copyright (c) 2022 Apple Inc. and the Swift project authors
+ Licensed under Apache License v2.0 with Runtime Library Exception
+
+ See https://swift.org/LICENSE.txt for license information
+ See https://swift.org/CONTRIBUTORS.txt for Swift project authors
+*/
+
+import Foundation
+
+/// A collection of sparse segments that describe the subsequences that are common or different between two collections.
+struct CollectionChanges {
+    /// The segments of common elements, removed elements, and inserted elements.
+    let segments: [Segment]
+    
+    /// A single segment that describes a number of elements that are either common between both collections, or that are removed or inserted in the second collection.
+    struct Segment: Equatable {
+        var kind: Kind // Whether the elements in this segment are common, removed, or inserted.
+        var count: Int // The number of consecutive elements that this segment describes.
+        
+        enum Kind: Equatable {
+            /// These elements are common between both collections.
+            case common
+            /// These elements are removed from the first collection to produce the second collection.
+            case remove
+            /// These elements are inserted in the first collection to produce the second collection.
+            case insert
+        }
+    }
+    
+    /// Creates a new collection changes value from the differences between two collections.
+    ///
+    /// - Parameters:
+    ///   - from: The original (first) collection that the changes are relative to.
+    ///   - to: The changed (second) collection that the removals and insertions produce.
+    ///   - areEquivalent: A closure that returns a Boolean value indicating whether two elements are equivalent.
+    init<C>(from: C, to: C, by areEquivalent: (C.Element, C.Element) -> Bool = (==)) where C: BidirectionalCollection, C.Element: Hashable {
+        guard !from.isEmpty else { // Nothing in common with an empty original; all of 'to' is one insertion (of count 0 if 'to' is also empty).
+            segments = [.init(kind: .insert, count: to.count)]
+            return
+        }
+        guard !to.isEmpty else { // Nothing remains of the original; all of 'from' is one removal.
+            segments = [.init(kind: .remove, count: from.count)]
+            return
+        }
+        
+        var changes = ChangeSegmentBuilder(originalCount: from.count)
+        // The `CollectionDifference` enumeration order is documented; first removals in descending order then insertions in ascending order.
+        // https://github.com/apple/swift/blob/main/stdlib/public/core/CollectionDifference.swift#L216-L235
+        for change in to.difference(from: from, by: areEquivalent) {
+            switch change {
+            case .remove(let offset, _, _):
+                changes.remove(at: offset)
+            case .insert(let offset, _, _):
+                changes.insert(at: offset)
+            }
+        }
+        segments = changes.segments
+    }
+}
+
+/// A builder that applies collection differences to construct an array of ``Segment`` values.
+///
+/// - Important:
+/// Removals need to be applied in reverse order. All removals need to be applied before applying any insertions. Insertions need to be applied in order.
+private struct ChangeSegmentBuilder {
+    typealias Segment = CollectionChanges.Segment
+    
+    private(set) var segments: [Segment]
+    
+    private var insertStartIndex = 0 // The index in 'segments' of the segment that the previous insertion was applied to.
+    private var insertStartOffset = 0 // The number of non-removed elements before the segment at 'insertStartIndex'.
+    
+    init(originalCount: Int) {
+        self.segments = [ Segment(kind: .common, count: originalCount) ]
+    }
+    
+    mutating func remove(at removalIndex: Int) {
+        // Removals are applied in reverse order. When the first removal is applied, the only segment is the 'common' count.
+        //
+        // Each removal can either be at the start of the segment, middle of the segment, or end of the segment.
+        // - After removing from the start of the segment there can be no more removals (since those indices would be in ascending order).
+        // - After removing from the middle, the 'common' segment is split in two with a 'remove' segment in between.
+        //   Since the next removal has to be at a lower index, it can only be applied to the lower of the two split 'common' segments.
+        // - After removing from the end, the 'common' segment is made shorter and a new 'remove' segment is added after it.
+        //   Since the next removal has to be at a lower index, it can only be applied to the shortened 'common' segment.
+        //
+        // This process repeats, meaning that every removal is always applied to the first segment.
+        let segment = segments[0]
+        assert(segment.kind == .common && removalIndex < segment.count, """
+            The first segment should always be a 'common' segment (was \(segment.kind)) and (0 ..< \(segment.count)) should always contain the removal index (\(removalIndex)).
+            If it's not, then that's means that the remove operations wasn't performed in reverse order.
+            """)
+        
+        if removalIndex == 0 {
+            // Removing at the start of the segment
+            if segment.count == 1 {
+                segments.remove(at: 0)
+            } else {
+                segments[0].count -= 1
+            }
+            
+            if segments.first?.kind == .remove {
+                segments[0].count += 1
+            } else {
+                segments.insert(Segment(kind: .remove, count: 1), at: 0)
+            }
+        }
+        else if removalIndex == segment.count - 1 {
+            // Removing at end of segment
+            segments[0].count -= 1
+
+            if segments.count > 1, segments[1].kind == .remove {
+                segments[1].count += 1
+            } else {
+                // Insert at `endIndex` is equivalent to `append()`
+                segments.insert(Segment(kind: .remove, count: 1), at: 1)
+            }
+        } else {
+            // Removal within segment
+            let lowerSegmentCount  = removalIndex
+            let higherSegmentCount = segment.count - lowerSegmentCount - 1 // the 1 is for the removed element
+            
+            // Split the segment in two with a new removal segment in-between.
+            segments[0...0] = [
+                Segment(kind: .common, count: lowerSegmentCount),
+                Segment(kind: .remove, count: 1),
+                Segment(kind: .common, count: higherSegmentCount),
+            ]
+        }
+    }
+    
+    private func findSegment(toInsertAt index: Int) -> (segment: Segment, startOffset: Int, segmentIndex: Int)? {
+        // Insertions are applied in order. This means that we can start with the previous offset and index.
+        var offset = insertStartOffset
+        for segmentIndex in insertStartIndex ..< segments.count {
+            let segment = segments[segmentIndex]
+            if segment.kind == .remove {
+                continue
+            }
+            
+            if index <= offset + segment.count { // '<=' so that an insertion right after the last element of a segment matches that segment.
+                return (segment, offset, segmentIndex)
+            }
+            offset += segment.count
+        }
+        return nil
+    }
+    
+    mutating func insert(at insertIndex: Int) {
+        guard let (segment, startOffset, segmentIndex) = findSegment(toInsertAt: insertIndex) else {
+            assert(segments.count == 1 && segments[0].kind == .remove, """
+                The only case when a segment can't be found in the loop is if the only segment is a 'remove' segment.
+                This happens when all the 'common' elements are removed (meaning that the 'from' and 'to' values have nothing in common.
+                """)
+            
+            segments.append(Segment(kind: .insert, count: 1))
+            return
+        }
+        assert(segment.kind != .remove)
+        
+        insertStartOffset = startOffset
+        insertStartIndex  = segmentIndex
+        
+        guard segment.kind != .insert else {
+            segments[segmentIndex].count += 1
+            return
+        }
+        assert(segment.kind == .common)
+        
+        if insertIndex == startOffset {
+            // Insert at start of segment
+            segments.insert(Segment(kind: .insert, count: 1), at: segmentIndex)
+        } else if insertIndex == startOffset + segment.count {
+            // Insert at end of segment
+            let insertSegmentIndex = segmentIndex + 1
+            
+            // If this is the last segment, append a new 'insert' segment
+            guard insertSegmentIndex < segments.count else {
+                segments.append(Segment(kind: .insert, count: 1))
+                return
+            }
+            
+            switch segments[insertSegmentIndex].kind {
+            case .insert:
+                assertionFailure("Inserts are processed from low to high. There shouldn't be another 'insert' segment after 'segmentIndex'.")
+                
+            case .common:
+                // If the next segment is a 'common' segment, insert a new 'insert' segment before it
+                segments.insert(Segment(kind: .insert, count: 1), at: insertSegmentIndex)
+                
+            case .remove:
+                // If the next segment is a 'remove' segment, skip over it so that insertions are always after removals.
+                segments.insert(Segment(kind: .insert, count: 1), at: insertSegmentIndex + 1)
+                
+                assert(insertSegmentIndex + 2 == segments.count || segments[insertSegmentIndex + 2].kind == .common,
+                       "If there's a segment after the remove segment, that is a segment of 'common' characters.")
+            }
+        } else {
+            // Insert within segment
+            let lowerSegmentCount  = insertIndex - startOffset
+            let higherSegmentCount = segment.count - lowerSegmentCount // nothing to add
+            
+            // Split the segment in two with a new insertion segment in-between.
+            segments[segmentIndex...segmentIndex] = [
+                Segment(kind: .common, count: lowerSegmentCount),
+                Segment(kind: .insert, count: 1),
+                Segment(kind: .common, count: higherSegmentCount),
+            ]
+        }
+    }
+}
diff --git a/Sources/SwiftDocC/Utility/NearMiss.swift b/Sources/SwiftDocC/Utility/NearMiss.swift
@@ -0,0 +1,111 @@
+/*
+ This source file is part of the Swift.org open source project
+
+ Copyright (c) 2022 Apple Inc. and the Swift project authors
+ Licensed under Apache License v2.0 with Runtime Library Exception
+
+ See https://swift.org/LICENSE.txt for license information
+ See https://swift.org/CONTRIBUTORS.txt for Swift project authors
+*/
+
+import Foundation
+
+/// A type that sorts and filters a list of strings based on how "similar" they are to a given string.
+///
+/// This is meant mainly for diagnostics that want to offer meaningful suggestions to the end user.
+enum NearMiss {
+    
+    /// Returns the "best matches" among a list of possibilities based on how "similar" they are to a given string.
+    static func bestMatches(for possibilities: [String], against authored: String) -> [String] {
+        // There is no single right or wrong way to score changes. This implementation is completely arbitrary.
+        // It's chosen because the relative scores that it computes provide "best match" results that are close
+        // to what a person would expect. See ``NearMissTests``.
+        
+        let goodMatches = possibilities.lazy
+            .map { (text: String) -> (text: String, score: Double) in
+                (text, NearMiss.score(CollectionChanges(from: authored, to: text)))
+            }
+            .filter {
+                // Only a positive score is considered "similar" in this implementation; zero and negative scores are dropped.
+                0 < $0.score
+            }
+            .sorted(by: { lhs, rhs in
+                if lhs.score == rhs.score {
+                    return lhs.text < rhs.text // Sort same score alphabetically
+                }
+                return lhs.score > rhs.score // Sort by high score
+            })
+        
+        // Some common prefixes result in a large number of matches. For example, many types in Swift-DocC have
+        // a "Documentation" prefix which yields a fairly high score in this implementation. To counteract this
+        // we additionally filter out any match with a score that isn't greater than 25% of the highest match's score.
+        guard let bestScore = goodMatches.first?.score else {
+            return []
+        }
+        let matchThreshold = bestScore / 4
+        
+        return goodMatches
+            .prefix(while: { matchThreshold < $0.score })
+            // More than 10 results are likely not helpful to the user.
+            .prefix(10)
+            .map { $0.text }
+    }
+    
+    /// Computes the "score" for a collection of change segments.
+    private static func score(_ changes: CollectionChanges) -> Double {
+        // Again, there is no right or wrong way to score changes and this implementation is completely arbitrary.
+        
+        // Give the first segment a bit more weight to its contribution to the total score
+        guard let first = changes.segments.first else { return 0 }
+        var score = NearMiss.score(first) * 1.75
+        
+        for segment in changes.segments.dropFirst() {
+            score += NearMiss.score(segment)
+        }
+        return score
+    }
+        
+    /// Computes the "score" for a single collection change segment.
+    private static func score(_ segment: CollectionChanges.Segment) -> Double {
+        // Again, there is no right or wrong way to score changes and this implementation is completely arbitrary.
+        
+        // This implementation is built around a few basic ideas:
+        //
+        //  - Common segments _add_ to a change collection's score,
+        //  - Inserted and removed segments _subtract from_ a change collection's score.
+        //  - Short "common segments" occur in differences that are very different ("orange" and "lemon" both contain an "e").
+        //  - A long sequence of common elements should contribute more than an equal length sequence of different characters.
+        //    In other words; a 50% match is still "good".
+        //  - The longer a common segment is, the more "similar" the two strings are.
+        //  - A removed segment contributes more than an inserted segment (since the author had written those characters).
+        
+        switch segment.kind {
+        case .common:
+            if segment.count < 3 {
+                // 1, or 2 common characters are too few to be what a person would consider a similarity.
+                return 0.0
+            } else {
+                // To produce higher contributions for longer common sequences, this implementation sums the sequence (1...length)
+                // and adds an arbitrary constant.
+                return Double((1...segment.count).sum()) + 3
+            }
+            
+        // Segments of removed or inserted characters contribute to the score no matter the segment length.
+        //
+        // The score is linear to the length with scale factors that are tweaked to provide "best match" results that are close
+        // to what a person would expect. See ``NearMissTests``.
+        case .insert:
+            return -Double(segment.count) * 1.5
+        case .remove:
+            // Removed characters contribute more than inserted characters since they represent something that the author wrote
+            // that is missing in this match.
+            return -Double(segment.count) * 3.0
+        }
+    }
+}
+
+private extension ClosedRange where Bound == Int {
+    func sum() -> Int { // The sum of every integer in the range, computed in constant time.
+        return (lowerBound + upperBound) * count / 2 // Arithmetic series: (first + last) * count / 2. The product is always even, so the division is exact.
+    }
+}
diff --git a/Tests/SwiftDocCTests/Infrastructure/SymbolGraph/SymbolGraphLoaderTests.swift b/Tests/SwiftDocCTests/Infrastructure/SymbolGraph/SymbolGraphLoaderTests.swift
@@ -112,6 +112,9 @@ class SymbolGraphLoaderTests: XCTestCase {
         XCTAssertEqual(moduleNameFrequency, ["Main": 1, "One": 1, "Two": 1, "Three": 1])
     }
     
+    // This test calls ``SymbolGraph.relationships`` which is deprecated.
+    // Deprecating the test silences the deprecation warning when running the tests. It doesn't skip the test.
+    @available(*, deprecated)
     func testLoadingHighNumberOfModulesConcurrently() throws {
         let tempURL = try createTemporaryDirectory()
 
diff --git a/Tests/SwiftDocCTests/Semantics/SymbolTests.swift b/Tests/SwiftDocCTests/Semantics/SymbolTests.swift
diff --git a/Tests/SwiftDocCTests/Utility/CollectionChangesTests.swift b/Tests/SwiftDocCTests/Utility/CollectionChangesTests.swift
diff --git a/Tests/SwiftDocCTests/Utility/NearMissTests.swift b/Tests/SwiftDocCTests/Utility/NearMissTests.swift

Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ struct MarkupReferenceResolver: MarkupRewriter {`
`70`	`70`	`return nil`
`71`	`71`	`}`
`72`	`72`
`73`		`- // FIXME: Provide near-miss suggestion here. The user is likely to make mistakes with capitalization because of character input (rdar://59660520).`
	`73`	+ // FIXME: Structure the `PathHierarchyBasedLinkResolver` near-miss suggestions as fixits. https://github.com/apple/swift-docc/issues/438 (rdar://103279313)
`74`	`74`	`let uncuratedArticleMatch = context.uncuratedArticles[bundle.articlesDocumentationRootReference.appendingPathOfReference(unresolved)]?.source`
`75`	`75`	`problems.append(unresolvedReferenceProblem(reference: reference, source: source, range: range, severity: severity, uncuratedArticleMatch: uncuratedArticleMatch, underlyingErrorMessage: errorMessage))`
`76`	`76`	`return nil`
Original file line number	Diff line number	Diff line change
`@@ -112,6 +112,9 @@ class SymbolGraphLoaderTests: XCTestCase {`
`112`	`112`	`XCTAssertEqual(moduleNameFrequency, ["Main": 1, "One": 1, "Two": 1, "Three": 1])`
`113`	`113`	`}`
`114`	`114`
	`115`	+ // This test calls ``SymbolGraph.relationships`` which is deprecated.
	`116`	`+ // Deprecating the test silences the deprecation warning when running the tests. It doesn't skip the test.`
	`117`	`+ @available(*, deprecated)`
`115`	`118`	`func testLoadingHighNumberOfModulesConcurrently() throws {`
`116`	`119`	`let tempURL = try createTemporaryDirectory()`
`117`	`120`