Skip to content

Commit e9abfc2

Browse files
authored
Merge pull request #8 from jhrcook/stopwords
User can supply a custom list of stopwords to TextRank
2 parents ce70e1b + 423e40c commit e9abfc2

File tree

4 files changed

+95
-14
lines changed

4 files changed

+95
-14
lines changed

Sources/TextRank/Sentence.swift

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,11 @@ public struct Sentence: Hashable {
1616

1717
public let originalTextIndex: Int
1818

19-
public init(text: String, originalTextIndex: Int) {
19+
public init(text: String, originalTextIndex: Int, additionalStopwords: [String] = [String]()) {
2020
self.text = text
2121
self.originalTextIndex = originalTextIndex
22-
words = Sentence.removeStopWords(from: Sentence.clean(self.text))
22+
words = Sentence.removeStopWords(from: Sentence.clean(self.text),
23+
additionalStopwords: additionalStopwords)
2324
}
2425

2526
public func hash(into hasher: inout Hasher) {
@@ -37,9 +38,9 @@ public struct Sentence: Hashable {
3738
.words
3839
}
3940

40-
static func removeStopWords(from w: [String]) -> Set<String> {
41+
static func removeStopWords(from w: [String], additionalStopwords: [String] = [String]()) -> Set<String> {
4142
var wordSet = Set(w)
42-
wordSet.subtract(Stopwords.English)
43+
wordSet.subtract(Stopwords.English + additionalStopwords)
4344
return wordSet
4445
}
4546
}

Sources/TextRank/TextRank.swift

Lines changed: 20 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,19 @@ import Foundation
1010
public class TextRank {
1111
public var text: String {
1212
didSet {
13-
sentences = TextRank.splitIntoSentences(text).filter { $0.length > 0 }
13+
textToSentences()
1414
}
1515
}
1616

17-
public var summarizationFraction: Float = 0.2
1817
public var graph: TextGraph
19-
public var graphDamping: Float = 0.85
2018
public var sentences = [Sentence]()
19+
public var summarizationFraction: Float = 0.2
20+
public var graphDamping: Float = 0.85
21+
public var stopwords = [String]() {
22+
didSet {
23+
textToSentences()
24+
}
25+
}
2126

2227
public init() {
2328
text = ""
@@ -26,16 +31,20 @@ public class TextRank {
2631

2732
public init(text: String) {
2833
self.text = text
29-
sentences = TextRank.splitIntoSentences(text).filter { $0.length > 0 }
3034
graph = TextGraph(damping: graphDamping)
35+
textToSentences()
3136
}
3237

3338
public init(text: String, summarizationFraction: Float = 0.2, graphDamping: Float = 0.85) {
3439
self.text = text
3540
self.summarizationFraction = summarizationFraction
3641
self.graphDamping = graphDamping
37-
sentences = TextRank.splitIntoSentences(text).filter { $0.length > 0 }
3842
graph = TextGraph(damping: graphDamping)
43+
textToSentences()
44+
}
45+
46+
func textToSentences() {
47+
sentences = TextRank.splitIntoSentences(text, additionalStopwords: stopwords).filter { $0.length > 0 }
3948
}
4049
}
4150

@@ -78,13 +87,17 @@ extension TextRank {
7887
/// Split text into sentences.
7988
/// - Parameter text: Original text.
8089
/// - Returns: An array of sentences.
81-
static func splitIntoSentences(_ text: String) -> [Sentence] {
90+
static func splitIntoSentences(_ text: String, additionalStopwords stopwords: [String] = [String]()) -> [Sentence] {
8291
if text.isEmpty { return [] }
8392

8493
var x = [Sentence]()
8594
text.enumerateSubstrings(in: text.range(of: text)!, options: [.bySentences, .localized]) { substring, _, _, _ in
8695
if let substring = substring, !substring.isEmpty {
87-
x.append(Sentence(text: substring.trimmingCharacters(in: .whitespacesAndNewlines), originalTextIndex: x.count))
96+
x.append(
97+
Sentence(text: substring.trimmingCharacters(in: .whitespacesAndNewlines),
98+
originalTextIndex: x.count,
99+
additionalStopwords: stopwords)
100+
)
88101
}
89102
}
90103
return Array(Set(x))
@@ -101,15 +114,12 @@ public extension TextRank {
101114
func filterTopSentencesFrom(_ results: TextGraph.PageRankResult, top percentile: Float) -> TextGraph.NodeList {
102115
let idx = Int(Float(results.results.count) * percentile)
103116
let cutoffScore: Float = results.results.values.sorted()[min(idx, results.results.count - 1)]
104-
105117
var filteredNodeList: TextGraph.NodeList = [:]
106-
107118
for (sentence, value) in results.results {
108119
if value >= cutoffScore {
109120
filteredNodeList[sentence] = value
110121
}
111122
}
112-
113123
return filteredNodeList
114124
}
115125
}

Tests/TextRankTests/SentenceTests.swift

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,4 +19,38 @@ class SentenceTests: XCTestCase {
1919
XCTAssertEqual(s.words, Set(clean))
2020
}
2121
}
22+
23+
func testRemovalOfStopWords() {
24+
// Given
25+
let text = "here are some words to be"
26+
27+
// When
28+
let sentence = Sentence(text: text, originalTextIndex: 0)
29+
30+
// Then
31+
XCTAssertEqual(sentence.length, 0)
32+
}
33+
34+
func testRemovalOfStopWordsButNotMeaningfulWords() {
35+
// Given
36+
let text = "here are some words to be lion"
37+
38+
// When
39+
let sentence = Sentence(text: text, originalTextIndex: 0)
40+
41+
// Then
42+
XCTAssertEqual(sentence.length, 1)
43+
XCTAssertEqual(sentence.words, Set(["lion"]))
44+
}
45+
46+
func testRemovalOfStopWordsAndAdditionalStopwords() {
47+
// Given
48+
let text = "here are some words to be lion"
49+
50+
// When
51+
let sentence = Sentence(text: text, originalTextIndex: 0, additionalStopwords: ["lion"])
52+
53+
// Then
54+
XCTAssertEqual(sentence.length, 0)
55+
}
2256
}

Tests/TextRankTests/TextRankTests.swift

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -87,4 +87,40 @@ class TextRankTests: XCTestCase {
8787
XCTAssertTrue(filteredResults.count < results.results.count)
8888
XCTAssertTrue(filteredResults.count == 2)
8989
}
90+
91+
func testStopwordsAreRemoved() {
92+
// Given
93+
let text = "Here are some sentences dog cat. With intentional stopwords gator. And some words that are not."
94+
95+
// When
96+
let textRank = TextRank(text: text)
97+
98+
// Then
99+
XCTAssertEqual(textRank.sentences.count, 2)
100+
XCTAssertEqual(textRank.sentences[0].length, 3)
101+
XCTAssertEqual(textRank.sentences.filter { $0.originalTextIndex == 0 }[0].words,
102+
Set(["sentences", "dog", "cat"]))
103+
XCTAssertEqual(textRank.sentences.filter { $0.originalTextIndex == 1 }[0].words,
104+
Set(["intentional", "stopwords", "gator"]))
105+
XCTAssertEqual(textRank.sentences[1].length, 3)
106+
}
107+
108+
func testAdditionalStopwords() {
109+
// Given
110+
let text = "Here are some sentences dog cat. With intentional stopwords gator. And some words that are not."
111+
let additionalStopwords = ["dog", "gator"]
112+
113+
// When
114+
let textRank = TextRank(text: text)
115+
textRank.stopwords = additionalStopwords
116+
117+
// Then
118+
XCTAssertEqual(textRank.sentences.count, 2)
119+
XCTAssertEqual(textRank.sentences[0].length, 2)
120+
XCTAssertEqual(textRank.sentences.filter { $0.originalTextIndex == 0 }[0].words,
121+
Set(["sentences", "cat"]))
122+
XCTAssertEqual(textRank.sentences.filter { $0.originalTextIndex == 1 }[0].words,
123+
Set(["intentional", "stopwords"]))
124+
XCTAssertEqual(textRank.sentences[1].length, 2)
125+
}
90126
}

0 commit comments

Comments
 (0)