Skip to content

Commit 0da6fcf

Browse files
committed
Add support for extracting scheme-less URLs that start with "www." (in addition to URLs with an explicit scheme)
1 parent 5c1c3e0 commit 0da6fcf

File tree

9 files changed

+131
-10
lines changed

9 files changed

+131
-10
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99

1010
# mvn
1111
target/
12+
/bin/

src/main/java/org/nibor/autolink/LinkExtractor.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.nibor.autolink.internal.EmailScanner;
44
import org.nibor.autolink.internal.Scanner;
55
import org.nibor.autolink.internal.UrlScanner;
6+
import org.nibor.autolink.internal.WwwUrlScanner;
67

78
import java.util.*;
89

@@ -14,10 +15,12 @@
1415
public class LinkExtractor {
1516

1617
private final Scanner urlScanner;
18+
private final Scanner wwwScanner;
1719
private final Scanner emailScanner;
1820

19-
private LinkExtractor(UrlScanner urlScanner, EmailScanner emailScanner) {
21+
private LinkExtractor(UrlScanner urlScanner, WwwUrlScanner wwwScanner, EmailScanner emailScanner) {
2022
this.urlScanner = urlScanner;
23+
this.wwwScanner = wwwScanner;
2124
this.emailScanner = emailScanner;
2225
}
2326

@@ -46,6 +49,8 @@ private Scanner trigger(char c) {
4649
return urlScanner;
4750
case '@':
4851
return emailScanner;
52+
case 'w':
53+
return wwwScanner;
4954
}
5055
return null;
5156
}
@@ -88,8 +93,9 @@ public Builder emailDomainMustHaveDot(boolean emailDomainMustHaveDot) {
8893
*/
8994
public LinkExtractor build() {
9095
UrlScanner urlScanner = linkTypes.contains(LinkType.URL) ? new UrlScanner() : null;
96+
WwwUrlScanner wwwScanner = linkTypes.contains(LinkType.URL) ? new WwwUrlScanner() : null;
9197
EmailScanner emailScanner = linkTypes.contains(LinkType.EMAIL) ? new EmailScanner(emailDomainMustHaveDot) : null;
92-
return new LinkExtractor(urlScanner, emailScanner);
98+
return new LinkExtractor(urlScanner, wwwScanner, emailScanner);
9399
}
94100
}
95101

src/main/java/org/nibor/autolink/LinkSpan.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,8 @@ public interface LinkSpan {
2121
*/
2222
int getEndIndex();
2323

24+
/**
25+
* @return the found sequence
26+
*/
27+
CharSequence sequence();
2428
}

src/main/java/org/nibor/autolink/internal/EmailScanner.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ public LinkSpan scan(CharSequence input, int triggerIndex, int rewindIndex) {
2525
}
2626

2727
int afterAt = triggerIndex + 1;
28-
int last = findLast(input, afterAt);
29-
if (last == -1) {
28+
int last = findLast(input, afterAt) + 1;
29+
if (last == 0) {
3030
return null;
3131
}
3232

33-
return new LinkSpanImpl(LinkType.EMAIL, first, last + 1);
33+
return new LinkSpanImpl(LinkType.EMAIL, first, last, input.subSequence(first, last));
3434
}
3535

3636
// See "Local-part" in RFC 5321, plus extensions in RFC 6531

src/main/java/org/nibor/autolink/internal/LinkSpanImpl.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,13 @@ public class LinkSpanImpl implements LinkSpan {
88
private final LinkType linkType;
99
private final int beginIndex;
1010
private final int endIndex;
11+
private final CharSequence sequence;
1112

12-
public LinkSpanImpl(LinkType linkType, int beginIndex, int endIndex) {
13+
public LinkSpanImpl(LinkType linkType, int beginIndex, int endIndex, CharSequence sequence) {
1314
this.linkType = linkType;
1415
this.beginIndex = beginIndex;
1516
this.endIndex = endIndex;
17+
this.sequence = sequence;
1618
}
1719

1820
@Override
@@ -29,6 +31,11 @@ public int getBeginIndex() {
2931
public int getEndIndex() {
3032
return endIndex;
3133
}
34+
35+
@Override
36+
public CharSequence sequence() {
37+
return sequence;
38+
}
3239

3340
@Override
3441
public String toString() {

src/main/java/org/nibor/autolink/internal/UrlScanner.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ public LinkSpan scan(CharSequence input, int triggerIndex, int rewindIndex) {
2323
return null;
2424
}
2525

26-
int last = findLast(input, afterSlashSlash);
26+
int last = findLast(input, afterSlashSlash) + 1;
2727

28-
return new LinkSpanImpl(LinkType.URL, first, last + 1);
28+
return new LinkSpanImpl(LinkType.URL, first, last, input.subSequence(first, last));
2929
}
3030

3131
// See "scheme" in RFC 3986
32-
private int findFirst(CharSequence input, int beginIndex, int rewindIndex) {
32+
protected int findFirst(CharSequence input, int beginIndex, int rewindIndex) {
3333
int first = -1;
3434
int digit = -1;
3535
for (int i = beginIndex; i >= rewindIndex; i--) {
@@ -50,7 +50,7 @@ private int findFirst(CharSequence input, int beginIndex, int rewindIndex) {
5050
return first;
5151
}
5252

53-
private int findLast(CharSequence input, int beginIndex) {
53+
protected int findLast(CharSequence input, int beginIndex) {
5454
int round = 0;
5555
int square = 0;
5656
int curly = 0;
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package org.nibor.autolink.internal;
2+
3+
import org.nibor.autolink.LinkSpan;
4+
import org.nibor.autolink.LinkType;
5+
6+
/**
7+
* Scan for URLs starting from the trigger character "w", requires "www.".
8+
* <p>
9+
* Based on RFC 3986.
10+
*/
11+
public class WwwUrlScanner extends UrlScanner {
12+
13+
@Override
14+
public LinkSpan scan(final CharSequence input, int triggerIndex, int rewindIndex) {
15+
int afterDot = triggerIndex + 4;
16+
if (afterDot >= input.length() || input.charAt(triggerIndex + 1) != 'w' || input.charAt(triggerIndex + 2) != 'w' || input.charAt(triggerIndex + 3) != '.') {
17+
return null;
18+
}
19+
20+
int first = triggerIndex;
21+
int last = findLast(input, afterDot) + 1;
22+
if (last == 0) {
23+
return null;
24+
}
25+
26+
return new LinkSpanImpl(LinkType.URL, first, last, input.subSequence(first, last));
27+
}
28+
29+
@Override
30+
protected int findLast(CharSequence input, int beginIndex) {
31+
int last = super.findLast(input, beginIndex);
32+
33+
// Make sure there is at least one dot after the first dot,
34+
// so www.something is not allowed, but www.something.co.uk is
35+
int pointer = last;
36+
while (--pointer > beginIndex) {
37+
if (input.charAt(pointer) == '.' && pointer > beginIndex) return last;
38+
}
39+
40+
return -1;
41+
}
42+
}

src/test/java/org/nibor/autolink/AutolinkBenchmark.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ public class AutolinkBenchmark extends AutolinkTestCase {
1212
private static final List<String> WORDS = Arrays.asList(
1313
"Lorem ", "ipsum ", "dolor ", "sit ", "amet ", "consectetur ", "adipiscing ", "elit ",
1414
".", ",", ":", "@", "(", ")", "http://example.com", "https://test.com/foo_(bar)",
15+
"www.something.com", "www.another.uk.co",
1516
1617
);
1718

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package org.nibor.autolink;
2+
3+
import java.util.Arrays;
4+
import java.util.EnumSet;
5+
6+
import org.junit.Test;
7+
import org.junit.runner.RunWith;
8+
import org.junit.runners.Parameterized;
9+
import org.junit.runners.Parameterized.Parameter;
10+
import org.junit.runners.Parameterized.Parameters;
11+
12+
@RunWith(Parameterized.class)
13+
public class AutolinkWwwUrlTest extends AutolinkUrlTest {
14+
15+
@Parameters(name = "{1}")
16+
public static Iterable<Object[]> data() {
17+
return Arrays.asList(new Object[][]{
18+
{LinkExtractor.builder().linkTypes(EnumSet.of(LinkType.URL)).build(), "URL"},
19+
{LinkExtractor.builder().build(), "all"}
20+
});
21+
}
22+
23+
@Parameter(0)
24+
public LinkExtractor linkExtractor;
25+
26+
@Parameter(1)
27+
public String description;
28+
29+
@Test
30+
public void notLinked() {
31+
assertNotLinked("");
32+
assertNotLinked("wwwsomething.com");
33+
assertNotLinked("ww.foo.com");
34+
assertNotLinked("w.bar.foo.co");
35+
assertNotLinked("www.something");
36+
assertNotLinked("www.go");
37+
}
38+
39+
@Test
40+
public void linked() {
41+
assertLinked("www.s.com","|www.s.com|");
42+
assertLinked("www.fo.uk","|www.fo.uk|");
43+
}
44+
45+
@Test
46+
public void schemes() {
47+
assertLinked("http://www.something.com","|http://www.something.com|");
48+
assertLinked("http://something.com","|http://something.com|");
49+
assertLinked("http://something.co.uk","|http://something.co.uk|");
50+
}
51+
52+
@Override
53+
protected LinkExtractor getLinkExtractor() {
54+
return linkExtractor;
55+
}
56+
57+
private void assertLinked(String input, String expected) {
58+
super.assertLinked(input, expected, LinkType.URL);
59+
}
60+
}

0 commit comments

Comments
 (0)