robinst
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/main/java/org/nibor/autolink/LinkExtractor.java‎
Lines changed: 8 additions & 2 deletions b/‎src/main/java/org/nibor/autolink/LinkExtractor.java‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎src/main/java/org/nibor/autolink/LinkType.java‎
Lines changed: 5 additions & 1 deletion b/‎src/main/java/org/nibor/autolink/LinkType.java‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎src/main/java/org/nibor/autolink/internal/LinkSpanImpl.java‎
Lines changed: 1 addition & 1 deletion b/‎src/main/java/org/nibor/autolink/internal/LinkSpanImpl.java‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/main/java/org/nibor/autolink/internal/Scanners.java‎
Lines changed: 156 additions & 0 deletions b/‎src/main/java/org/nibor/autolink/internal/Scanners.java‎
Lines changed: 156 additions & 0 deletions
@@ -9,3 +9,4 @@
 
 # mvn
 target/
+/bin/
@@ -3,6 +3,7 @@
 import org.nibor.autolink.internal.EmailScanner;
 import org.nibor.autolink.internal.Scanner;
 import org.nibor.autolink.internal.UrlScanner;
+import org.nibor.autolink.internal.WwwUrlScanner;
 
 import java.util.*;
 
@@ -14,10 +15,12 @@
 public class LinkExtractor {
 
     private final Scanner urlScanner;
+    private final Scanner wwwScanner;
     private final Scanner emailScanner;
 
-    private LinkExtractor(UrlScanner urlScanner, EmailScanner emailScanner) {
+    private LinkExtractor(UrlScanner urlScanner, WwwUrlScanner wwwScanner, EmailScanner emailScanner) {
         this.urlScanner = urlScanner;
+        this.wwwScanner = wwwScanner;
         this.emailScanner = emailScanner;
     }
 
@@ -46,6 +49,8 @@ private Scanner trigger(char c) {
                 return urlScanner;
             case '@':
                 return emailScanner;
+            case 'w':
+                return wwwScanner;
         }
         return null;
     }
@@ -88,8 +93,9 @@ public Builder emailDomainMustHaveDot(boolean emailDomainMustHaveDot) {
          */
         public LinkExtractor build() {
             UrlScanner urlScanner = linkTypes.contains(LinkType.URL) ? new UrlScanner() : null;
+            WwwUrlScanner wwwScanner = linkTypes.contains(LinkType.WWW) ? new WwwUrlScanner() : null;
             EmailScanner emailScanner = linkTypes.contains(LinkType.EMAIL) ? new EmailScanner(emailDomainMustHaveDot) : null;
-            return new LinkExtractor(urlScanner, emailScanner);
+            return new LinkExtractor(urlScanner, wwwScanner, emailScanner);
         }
     }
 
 
@@ -11,5 +11,9 @@ public enum LinkType {
     /**
      * Email address such as {@code foo@example.com}
      */
-    EMAIL
+    EMAIL,
+    /**
+     * URL such as {@code www.example.com}
+     */
+    WWW
 }
@@ -29,7 +29,7 @@ public int getBeginIndex() {
     public int getEndIndex() {
         return endIndex;
     }
-
+    
     @Override
     public String toString() {
         return "Link{type=" + getType() + ", beginIndex=" + beginIndex + ", endIndex=" + endIndex + "}";
 
@@ -17,4 +17,160 @@ public static boolean isAlnum(char c) {
     public static boolean isNonAscii(char c) {
         return c >= 0x80;
     }
+
+    public static final int findUrlEnd(CharSequence input, int beginIndex) { // Scans input from beginIndex and returns the index of the last character that may end a URL.
+        int round = 0; // opening minus closing round brackets seen so far
+        int square = 0; // opening minus closing square brackets seen so far
+        int curly = 0; // opening minus closing curly braces seen so far
+        boolean doubleQuote = false; // true while a double quote is open
+        boolean singleQuote = false; // true while a single quote is open
+        int last = beginIndex; // inclusive index of the best end found so far
+        loop: // labelled so that the switch below can leave the for loop with a single break
+        for (int i = beginIndex; i < input.length(); i++) {
+            char c = input.charAt(i);
+            switch (c) {
+                case '\u0000': // control characters U+0000 through U+001F follow
+                case '\u0001':
+                case '\u0002':
+                case '\u0003':
+                case '\u0004':
+                case '\u0005':
+                case '\u0006':
+                case '\u0007':
+                case '\u0008':
+                case '\t':
+                case '\n':
+                case '\u000B':
+                case '\f':
+                case '\r':
+                case '\u000E':
+                case '\u000F':
+                case '\u0010':
+                case '\u0011':
+                case '\u0012':
+                case '\u0013':
+                case '\u0014':
+                case '\u0015':
+                case '\u0016':
+                case '\u0017':
+                case '\u0018':
+                case '\u0019':
+                case '\u001A':
+                case '\u001B':
+                case '\u001C':
+                case '\u001D':
+                case '\u001E':
+                case '\u001F':
+                case ' ':
+                case '<':
+                case '>':
+                case '\u007F': // DEL, then control characters U+0080 through U+009F follow
+                case '\u0080':
+                case '\u0081':
+                case '\u0082':
+                case '\u0083':
+                case '\u0084':
+                case '\u0085':
+                case '\u0086':
+                case '\u0087':
+                case '\u0088':
+                case '\u0089':
+                case '\u008A':
+                case '\u008B':
+                case '\u008C':
+                case '\u008D':
+                case '\u008E':
+                case '\u008F':
+                case '\u0090':
+                case '\u0091':
+                case '\u0092':
+                case '\u0093':
+                case '\u0094':
+                case '\u0095':
+                case '\u0096':
+                case '\u0097':
+                case '\u0098':
+                case '\u0099':
+                case '\u009A':
+                case '\u009B':
+                case '\u009C':
+                case '\u009D':
+                case '\u009E':
+                case '\u009F':
+                    // These can never be part of a URL, so stop now. See RFC 3986 and RFC 3987.
+                    // Some characters are not in the above list, even though they are in neither "unreserved" nor "reserved":
+                    //   '"', '\\', '^', '`', '{', '|', '}'
+                    // The reason for this is that other link detectors also allow them. Also see below: we require
+                    // the quotes and the braces to be balanced.
+                    break loop;
+                case '?':
+                case '!':
+                case '.':
+                case ',':
+                case ':':
+                case ';':
+                    // These may be part of a URL but not at the end, so scanning continues and last is left unchanged
+                    break;
+                case '/':
+                    // This may be part of a URL and at the end, but not if the previous character cannot be the end of a URL
+                    if (last == i - 1) { // the previous index is the current last (an accepted end, or the start index)
+                        last = i;
+                    }
+                    break;
+                case '(':
+                    round++; // an opening bracket on its own never moves last
+                    break;
+                case ')':
+                    round--;
+                    if (round >= 0) {
+                        last = i; // closes an earlier opening bracket, so it may end the URL
+                    } else {
+                        // More closing than opening brackets, stop now
+                        break loop;
+                    }
+                    break;
+                case '[':
+                    // Allowed in IPv6 address host
+                    square++;
+                    break;
+                case ']':
+                    // Allowed in IPv6 address host
+                    square--;
+                    if (square >= 0) {
+                        last = i; // closes an earlier opening bracket, so it may end the URL
+                    } else {
+                        // More closing than opening brackets, stop now
+                        break loop;
+                    }
+                    break;
+                case '{':
+                    curly++;
+                    break;
+                case '}':
+                    curly--;
+                    if (curly >= 0) {
+                        last = i; // closes an earlier opening brace, so it may end the URL
+                    } else {
+                        // More closing than opening brackets, stop now
+                        break loop;
+                    }
+                    break;
+                case '"':
+                    doubleQuote = !doubleQuote; // toggle: open on odd occurrences, closed on even ones
+                    if (!doubleQuote) { // only a closing quote may end the URL
+                        last = i;
+                    }
+                    break;
+                case '\'':
+                    singleQuote = !singleQuote; // toggle: open on odd occurrences, closed on even ones
+                    if (!singleQuote) { // only a closing quote may end the URL
+                        last = i;
+                    }
+                    break;
+                default:
+                    last = i; // every other character is accepted and may end the URL
+            }
+        }
+        return last; // inclusive index; equals beginIndex when no scanned character moved it
+    }
 }
Original file line number	Diff line number	Diff line change
`@@ -9,3 +9,4 @@`
`9`	`9`
`10`	`10`	`# mvn`
`11`	`11`	`target/`
	`12`	`+/bin/`
Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ public int getBeginIndex() {`
`29`	`29`	`public int getEndIndex() {`
`30`	`30`	`return endIndex;`
`31`	`31`	`}`
`32`		`-`
	`32`	`+`
`33`	`33`	`@Override`
`34`	`34`	`public String toString() {`
`35`	`35`	`return "Link{type=" + getType() + ", beginIndex=" + beginIndex + ", endIndex=" + endIndex + "}";`