Skip to content

Commit dfaff24

Browse files
Add simple URI parser library
1 parent dc2e4a8 commit dfaff24

File tree

1 file changed

+251
-0
lines changed

1 file changed

+251
-0
lines changed

libraries/common/io/uri.effekt

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
import scanner
2+
import stream
3+
4+
/// Returns the upper-case hexadecimal digit for a value between 0 and 15:
/// 0..9 map to '0'..'9' and 10..15 map to 'A'..'F'.
///
/// Any other value ends up in an unimplemented hole (`<>`).
def hexDigit(for: Int): Char =
  if (for >= 0 && for < 10) {
    // decimal digits are offsets from '0'
    ('0'.toInt + for).toChar
  } else if (for >= 10 && for < 16) {
    // letters are offsets from 'A', starting at the value 10
    ('A'.toInt + (for - 10)).toChar
  } else {
    <>
  }
9+
10+
/// Percent-encodes every character of `s` for which `shouldEncode` returns true.
/// A literal `%` is always encoded; `shouldEncode` is not consulted for it.
///
/// Each encoded character becomes `%` followed by two upper-case hex digits
/// of its code. Panics on characters whose code is 256 or larger.
def urlencode(s: String){ shouldEncode: Char => Bool }: String = collectString {
  // Emits the escape sequence `%XY` for a single character.
  def escape(ch: Char): Unit = {
    do emit('%')
    val code = ch.toInt
    if (code >= 256) { panic("Unicode not supported") } // TODO
    do emit(hexDigit(code / 16))
    do emit(hexDigit(mod(code, 16)))
  }
  for[Char] { s.each } {
    case '%' => escape('%')
    case ch and shouldEncode(ch) => escape(ch)
    case ch => do emit(ch)
  }
}
26+
27+
/// Is `c` one of the gen-delims of RFC 3986, i.e. one of `: / ? # [ ] @`?
def isGenDelim(c: Char): Bool =
  c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || c == ']' || c == '@'
38+
39+
/// Is `c` one of the sub-delims of RFC 3986,
/// i.e. one of `! $ & ' ( ) * + , ; =`?
def isSubDelim(c: Char): Bool = {
  val isPunctuation = c == '!' || c == '$' || c == '&' || c == '\'' || c == ',' || c == ';'
  val isBracketOrOperator = c == '(' || c == ')' || c == '*' || c == '+' || c == '='
  isPunctuation || isBracketOrOperator
}
54+
55+
/// Percent-encodes `s` for use in a URL: `%`, the space character, and all
/// gen-delims and sub-delims are escaped; every other character is kept.
def urlencode(s: String): String =
  urlencode(s) { c => c == '%' || c == ' ' || c.isGenDelim || c.isSubDelim }
62+
63+
/// Is `c` an unreserved character as per RFC 3986,
/// i.e. alphanumeric or one of `- . _ ~`?
def isUnreserved(c: Char): Bool =
  c.isAlphanumeric || c == '-' || c == '.' || c == '_' || c == '~'
72+
73+
/// Strict variant of `urlencode`: percent-encodes every character of `s`
/// that is not an unreserved character as per RFC 3986.
def urlencodeStrict(s: String): String = {
  def mustEscape(c: Char): Bool = not(isUnreserved(c))
  urlencode(s) { c => mustEscape(c) }
}
78+
79+
80+
/// Decodes the %-escapes in `s`: each `%` followed by two hex digits is
/// replaced by the character with that code; all other characters are
/// copied unchanged.
def urldecode(s: String): String = collectString {
  with feed(s)
  with scanner[Char]

  exhaustively {
    val next = do read[Char]()
    if (next == '%') {
      // the first hex digit is the high nibble, the second the low nibble
      val high = readHexDigit()
      val low = readHexDigit()
      do emit((high * 16 + low).toChar)
    } else {
      do emit(next)
    }
  }
}
95+
96+
/// Receives the components of a URI, one operation call per component,
/// as they are recognized by `parseURI`.
interface URIBuilder {
  /// The scheme, i.e. the part in front of the first `:`.
  def scheme(s: String): Unit
  /// The userinfo part of the authority (the text in front of `@`).
  def userinfo(a: String): Unit
  /// The host part of the authority; IP-literals include their brackets.
  def host(h: String): Unit
  /// The port number following the host.
  def port(p: Int): Unit
  /// The path.
  def path(p: String): Unit
  /// The query, without the leading `?`.
  def query(q: String): Unit
  /// The fragment, without the leading `#`.
  def fragment(f: String): Unit
}
105+
106+
/// Reads a URI scheme from the scanner and returns it: one alphabetic
/// character followed by any number of alphanumeric characters, `+`, `-`
/// or `.`.
///
/// Stops if the next character is not alphabetic.
def parseScheme(): String / { Scan[Char], stop } = collectString {
  val first = readIf { c => c.isAlphabetic }
  do emit(first)
  readWhile {
    case '+' => true
    case '-' => true
    case '.' => true
    case c => c.isAlphanumeric
  }
}
111+
112+
/// Runs `body` with a scanner that first yields the single element `c` and
/// afterwards forwards to the enclosing scanner. This "puts back" an element
/// that has already been consumed.
def unread[A, R](c: A){ body: => R / Scan[A] }: R / Scan[A] = {
  // true as long as `c` has not been skipped by `body`
  var pending = true
  try body() with Scan[A] {
    def peek() = if (pending) { resume{ () => c } } else { resume{do peek()} }
    def skip() = if (pending) { resume{pending = false} } else { resume{do skip[A]()} }
  }
}
119+
/// Runs `body` with a scanner that first yields the characters of `s` and
/// afterwards forwards to the enclosing scanner. This "puts back" a string
/// that has already been consumed.
def unread[R](s: String){ body: => R / Scan[Char] }: R / Scan[Char] = {
  // number of characters of `s` that `body` has skipped so far
  var consumed = 0
  try body() with Scan[Char] {
    def peek() = if (consumed >= s.length) { resume{do peek()} } else { resume{s.unsafeCharAt(consumed)} }
    def skip() = if (consumed >= s.length) { resume{do skip[Char]()} } else { resume{consumed = consumed + 1} }
  }
}
126+
127+
/// Parses the host and the optional `:port` of an authority and reports
/// them via `URIBuilder`.
///
/// - If the input starts with `[`, everything up to the closing `]` is
///   reported as host (IP-literal, brackets included).
/// - Otherwise the longest run of `%`, unreserved and sub-delim characters
///   is reported as host.
/// - If reading the host stops, the empty host `""` is reported.
///
/// `host` is reported exactly once. For a bracketed literal it is reported
/// only after the closing `]` has been read, so that a missing `]` results
/// in a single `host("")` instead of a host followed by a second, empty one.
def parseHostAndPort(): Unit / { URIBuilder, Scan[Char] } = {
  try {
    do peek[Char]() match {
      case '[' => // IP-literal
        // this is more permissive than the spec
        val literal = collectString{ readWhile{ c => c != ']' } }
        // must succeed before we report anything, since it may stop
        readIf(']')
        do host(literal ++ "]")
      case _ =>
        do host(collectString{ readWhile{
          case '%' => true
          case c and c.isUnreserved => true
          case c and c.isSubDelim => true
          case _ => false
        } })
    }
  } with stop { () =>
    do host("")
  }
  attempt{
    readIf(':')
    do port(readInteger())
  }{
    // no port
    ()
  }
}
153+
154+
/// Parses the authority component (`[userinfo@]host[:port]`) and reports
/// its parts via `URIBuilder`.
///
/// Only one character of lookahead is available, so the parser first reads
/// everything that could be userinfo. If an `@` follows, that text was the
/// userinfo; otherwise the text is put back with `unread` and parsed again
/// as host and port.
def parseAuthority(): Unit / { URIBuilder, Scan[Char] } = {
  // characters accepted while speculatively reading the userinfo
  def userinfoChar(c: Char): Bool =
    c == '%' || c == ':' || c.isUnreserved || c.isSubDelim

  val prefix = collectString { readWhile { c => userinfoChar(c) } }
  attempt {
    // the prefix was the userinfo
    readIf('@')
    do userinfo(prefix)
    parseHostAndPort()
  } {
    // the prefix was not userinfo: replay it as the start of the host
    with unread(prefix)
    parseHostAndPort()
  }
}
172+
173+
/// Parses the remainder of a URI — path, optional `?query` and optional
/// `#fragment` — and reports the parts via `URIBuilder`.
///
/// The path is always reported (possibly empty); query and fragment are only
/// reported if their introducing `?` / `#` is present.
def parsePathQueryFragment(): Unit / { URIBuilder, Scan[Char], Exception[WrongFormat] } = {
  // the path extends up to the first `?` or `#`
  val pathPart = collectString { readWhile { c => c != '?' && c != '#' } }
  do path(pathPart)

  boundary {
    readIf('?')
    // the query extends up to the first `#`
    val queryPart = collectString { readWhile { c => c != '#' } }
    do query(queryPart)
  }

  boundary {
    readIf('#')
    // the fragment is everything that is left
    val fragmentPart = collectString { readWhile[Char] { c => true } }
    do fragment(fragmentPart)
  }
}
188+
189+
/// Parses `uri` and reports its components via `URIBuilder`, in the order
/// in which they occur: scheme, then (if the scheme is followed by `//`)
/// the authority, then path, query and fragment.
///
/// Raises `WrongFormat` if
/// - parsing stops unexpectedly ("Could not parse URI"), or
/// - an authority is followed by something other than `/`, `?`, `#`.
///
/// NOTE(review): `read[Char]()` right after the `:` presumably stops at the
/// end of the input, in which case a URI such as `foo:` is reported as
/// `WrongFormat` — confirm whether that is intended.
def parseURI(uri: String): Unit / { URIBuilder, Exception[WrongFormat] } = {
  try {
    with feed(uri)
    with scanner[Char]

    val schemeName = parseScheme()
    do scheme(schemeName)
    readIf(':')

    // One character is consumed to find out whether `//` follows;
    // it is put back with `unread` if it does not.
    val first = read[Char]()
    if (first == '/' and do peek[Char]() == '/') {
      // starts with `//`: an authority follows
      readIf('/')
      parseAuthority()
      boundary {
        val next = do peek[Char]()
        if (not(next == '?' || next == '#' || next == '/')) {
          do raise(WrongFormat(), "Path must be empty or start with / if there is an authority component.")
        }
      }
      parsePathQueryFragment()
    } else {
      with unread(first)
      parsePathQueryFragment()
    }
  } with stop { () =>
    do raise(WrongFormat(), "Could not parse URI")
  }
}
219+
220+
namespace example {
  /// Prints `uri`, followed by one line for each component that `parseURI`
  /// reports for it. Panics if the URI cannot be parsed.
  def report(uri: String): Unit = {
    with on[WrongFormat].panic
    println(uri)
    try parseURI(uri) with URIBuilder {
      def scheme(s) = resume(println(" Scheme: " ++ s))
      def userinfo(u) = resume(println(" Userinfo: " ++ u))
      def host(h) = resume(println(" Host: " ++ h))
      def path(p) = resume(println(" Path: " ++ p))
      def port(p) = resume(println(" Port: " ++ p.show))
      def query(q) = resume(println(" Query: " ++ q))
      def fragment(q) = resume(println(" Fragment: " ++ q))
    }
  }

  /// Demonstrates encoding / decoding round trips and parses the example
  /// URIs from RFC 3986.
  def main() = {
    println(urldecode("%2F%20%20!"))
    println(urldecode(urlencode("Hallo Welt!/&^$@^*(&$)(*!_!_+\")")))
    println(urlencode("Hallo Welt!/&^$@^*(&$)(*!_!_+\")"))
    println(urldecode(urlencodeStrict("Hallo Welt!/&^$@^*(&$)(*!_!_+\")")))
    println(urlencodeStrict("Hallo Welt!/&^$@^*(&$)(*!_!_+\")"))

    // examples from the spec
    report("ftp://ftp.is.co.za/rfc/rfc1808.txt")
    report("http://www.ietf.org/rfc/rfc2396.txt")
    report("ldap://[2001:db8::7]/c=GB?objectClass?one")
    // the spec's mailto example; the address had been mangled into an
    // "email protected" placeholder
    report("mailto:John.Doe@example.com")
    report("news:comp.infosystems.www.servers.unix")
    report("tel:+1-816-555-1212")
    report("telnet://192.0.2.16:80/")
    report("urn:oasis:names:specification:docbook:dtd:xml:4.1")
  }
}

0 commit comments

Comments
 (0)