1+ #lang racket/base
2+
3+ (require racket/match
4+ racket/contract
5+ (only-in racket/list add-between))
6+
7+ (provide
8+ (contract-out [parse-content-disposition-header
9+ (-> bytes?
10+ (or/c
11+ (list/c 'parsefail string?)
12+ (list/c bytes?
13+ (listof (list/c bytes? bytes?)))))]))
14+
15+ (struct parsefail exn ())
16+
17+ ;; this file parses the Content-Disposition line of HTTP headers
18+
19+ ;; from RFC6266:
20+ #|content-disposition = "Content-Disposition" ":"
21+ disposition-type *( ";" disposition-parm )
22+
23+ disposition-type = "inline" | "attachment" | disp-ext-type
24+ ; case-insensitive
25+ disp-ext-type = token
26+
27+ disposition-parm = filename-parm | disp-ext-parm
28+
29+ filename-parm = "filename" "=" value
30+ | "filename*" "=" ext-value
31+
32+ disp-ext-parm = token "=" value
33+ | ext-token "=" ext-value
34+ ext-token = <the characters in token, followed by "*">
35+ |#
36+
37+ ;; HOWEVER: we're explicitly giving up on RFC5987-style ext-values. If
38+ ;; we see a parm or filename-parm whose token ends with a star, we
39+ ;; just give up.
40+
41+ ;; SUPER-LIGHTWEIGHT PARSER FRAMEWORK:
42+
43+ ;; this is about the most lightweight parser framework that I could
44+ ;; come up with:
45+
46+ ;; a parser returns a list containing a parsed value and a byte string
47+ ;; containing the remainder, or it returns false. Only one parsing is
48+ ;; possible with this scheme. Also, this parser does not support
49+ ;; backtracking; the kleene star insists on eating the rest of the
50+ ;; input. This should ensure that every parser created with this framework
51+ ;; is nice and fast.
52+
53+ ;; it would be fun to convert this to TR, and I think it would work fine.
54+ ;; I don't have time to do it right now... :(
55+
56+ ;; given a bunch of parsers, use each of them and combine
57+ ;; their parsed values in a list. This is basically just the I/O
58+ ;; monad. Or, to be more specific, just the "I" monad. Er, with
59+ ;; the exception monad mixed in. Kinda.
60+ (define (seq . parsers)
61+ (cond [(null? parsers) (λ (bstr) (list '() bstr))]
62+ [else (λ (bstr)
63+ (match ((car parsers) bstr)
64+ [(list yay leftover)
65+ ((postproc (apply seq (cdr parsers))
66+ (λ (restyay) (cons yay restyay)))
67+ leftover)]
68+ [#f #f ]))]))
69+
70+ ;; given two parsers, use the first one that succeeds. No backtracking.
71+ (define (orparse p1 p2)
72+ (λ (bstr)
73+ (match (p1 bstr)
74+ [(list yay leftover) (list yay leftover)]
75+ [#f (p2 bstr)])))
76+
77+
78+ ;; Kleene star: given a parser, parse until you can't parse any more
79+ (define (kstar parser)
80+ (λ (bstr)
81+ (match (parser bstr)
82+ [(list yay leftover)
83+ ((postproc (kstar parser)
84+ (λ (v) (cons yay v)))
85+ leftover)]
86+ [#f (list '() bstr)])))
87+
88+
89+ ;; given a regexp (good idea for it to start with ^),
90+ ;; and a function to apply to the matched bytes before
91+ ;; returning, return a parser for that regexp
92+ (define (rx-matcher regexp postproc)
93+ (λ (bstr)
94+ (define maybe-matches (regexp-match-positions regexp bstr))
95+ (match maybe-matches
96+ [(list (cons 0 end) other ... )
97+ (list (postproc (subbytes bstr 0 end))
98+ (subbytes bstr end))]
99+ [other #f ])))
100+
101+
102+ ;; given a regexp (good idea for it to start with ^),
103+ ;; return a parser for that regexp
104+ (define (rx-matcher/raw regexp)
105+ (rx-matcher regexp (λ (x) x)))
106+
107+ ;; given a regexp (good idea for it to start with ^),
108+ ;; and a constant, return an rx parser that just returns
109+ ;; the contstant (if it matches)
110+ (define (rx-matcher/const regexp const)
111+ (rx-matcher regexp (λ (_ ) const)))
112+
113+ ;; given a parser, return a new parser that strips 'v' from the list
114+ ;; in the result position (if the whole parse result is #f, just
115+ ;; return it). Doesn't recur into sublists.
116+ (define (strip v p)
117+ (postproc p (λ (l) (filter (λ (elt) (not (equal? elt v))) l))))
118+
119+ ;; apply the given 'pp-fun' to the value in the result position
120+ ;; of the parser. If the parser fails, just return the fail
121+ (define (postproc parser pp-fun)
122+ (λ (bstr)
123+ (match (parser bstr)
124+ [(list result leftover)
125+ (list (pp-fun result) leftover)]
126+ [#f #f ])))
127+
128+ ;; try to use parser p. if it fails, pretend it succeeded, and use
129+ ;; the given value as the result
130+ (define (opt p val)
131+ (orparse p (λ (bstr) (list val bstr))))
132+
133+ ;; linear white space
134+ ;; NB: it looks like the request parser actually cleans up line breaks
135+ ;; for us... no problem.
136+ (define LWS (rx-matcher/const #px#"^(\r\n)?[ \t]+ " 'LWS ))
137+ ;; optional linear white space
138+ (define OPTLWS (opt LWS 'LWS ))
139+ ;; optional leading whitespace
140+ ;; IMPL: can't just staple OPTLWS on the front of a seq, because
141+ ;; then on #""i n a kstar it gets partway through (viz, the optlws) and
142+ ;; then thinks it's failed partway through a seq. Grr.
143+ (define (leadingLWS parser)
144+ (orparse (postproc (seq LWS parser) cadr) parser))
145+
146+ ;; a sequence where linear whitespace is allowed (and discarded)
147+ ;; before and between every pair of elements
148+ (define (seq/ws . parsers)
149+ (strip 'LWS (leadingLWS (apply seq (add-between parsers OPTLWS)))))
150+
151+
152+ (define SEMI (rx-matcher/const #px#"^; " 'SEMI ))
153+ (define EQ (rx-matcher/const #px#"^= " 'EQ ))
154+ (define DQ (rx-matcher/const #px#"^\" " 'DQ ))
155+
156+ ;; a quoted string. a quote followed by any character from 32-255 not
157+ ;; including backslash or quote, but optionally a backslash followed
158+ ;; by any char (can only be 0-127), and finally a close quote.
159+ ;; IMPL NOTE: you can do all of this with a single regexp, but you
160+ ;; wind up doing all the same work over again in cleaning up the
161+ ;; string.
162+ (define CLEANCHARSEQ (rx-matcher/raw #px#"^([ -!#-[]|[\\]-\377])+ " ))
163+ (define QDESCAPED (rx-matcher #px#"^\\\\[\0-\177] "
164+ (λ (v) (list 'escaped v))))
165+ (define QTDSTR
166+ (postproc (seq DQ (kstar (orparse CLEANCHARSEQ QDESCAPED)) DQ)
167+ (λ (v) (list 'quoted (cadr v)))))
168+ (define TOKEN (rx-matcher/raw #px#"^([-!#-'*-+.0-9A-Z^-z|~]+) " ))
169+ (define VALUE (orparse TOKEN QTDSTR))
170+
171+ ;; give up if we see a token ending with a star; these signal
172+ ;; RFC5987 ext-values, and we don't handle them correctly.
173+ (define CLAUSE
174+ (postproc
175+ (seq/ws TOKEN EQ VALUE)
176+ (λ (v)
177+ (when (regexp-match #px#"\\*$ " (car v))
178+ (raise
179+ (parsefail
180+ (format "token ending with * indicates unsupported ext-value: ~e "
181+ (car v))
182+ (current-continuation-marks))))
183+ v)))
184+
185+ (define content-disposition-parser
186+ (seq/ws TOKEN (kstar (seq/ws SEMI CLAUSE))))
187+
188+
189+ ;; given the right-hand-side of a content-disposition header
190+ ;; line, return a list containing the content-disposition-type
191+ ;; and a list of token/value lists
192+ (define (parse-content-disposition-header rhs)
193+ (with-handlers ([parsefail?
194+ (lambda (pf)
195+ (list 'parsefail (exn-message pf)))])
196+ (match (content-disposition-parser rhs)
197+ [(list matched #"" )
198+ (match matched
199+ [(list ty clauses)
200+ (list ty (for/list ([c (in-list clauses)])
201+ (match c
202+ [(list 'SEMI (list tok 'EQ val))
203+ (list tok (val-cleanup val))]
204+ [other (error
205+ 'parse-content-disposition-header
206+ "internal error, unexpected parse shape: ~e "
207+ c)])))]
208+ [other
209+ (error 'parse-content-disposition-header
210+ "internal error, unexpected parse shape 2: ~e "
211+ other)])]
212+ [other
213+ (list 'parsefail
214+ (format
215+ (string-append
216+ "expected: byte string matching RFC6266 spec with "
217+ "no RFC5987 ext-values, got: ~e " )
218+ rhs))])))
219+
220+ ;; clean up a quoted string by removing the quotes and undoing escaping
221+ (define (val-cleanup val)
222+ (match val
223+ [(? bytes? b) b]
224+ [(list 'quoted l)
225+ (apply bytes-append (for/list ([chunk (in-list l)])
226+ (match chunk
227+ [(? bytes? b) b]
228+ [(list 'escaped eseq)
229+ (subbytes eseq 1 2 )])))]))
230+
231+ (module+ test
232+ (require rackunit)
233+
234+ (check-equal? (QDESCAPED #"\\\003 3 " )
235+ (list '(escaped #"\\\003 " ) #" 3 " ))
236+ (check-equal? ((orparse CLEANCHARSEQ QDESCAPED) #"\\\003 3 " )
237+ (list '(escaped #"\\\003 " ) #" 3 " ))
238+ (check-equal? (QTDSTR #"\"abc\\\003\\\"def\" " )
239+ (list '(quoted (#"abc "
240+ (escaped #"\\\003 " )
241+ (escaped #"\\\" " )
242+ #"def " ))
243+ #"" ))
244+
245+ (check-equal?
246+ (parse-content-disposition-header
247+ #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\" " )
248+ '(#"form-data "
249+ ((#"name " #"abcz " )
250+ (#"filename " #"abc\"d " ))))
251+
252+ (check-equal? (TOKEN #"form-data ; " )
253+ (list #"form-data " #" ; " ))
254+
255+ (check-equal? ((seq LWS TOKEN) #" form-data ; " )
256+ (list (list 'LWS #"form-data " ) #" ; " ))
257+
258+ (check-equal? ((seq/ws TOKEN) #" form-data ; " )
259+ (list (list #"form-data " ) #" ; " ))
260+
261+ (check-equal? (QTDSTR #"\"abcz\"; filename=\"abc\\\"d\"
262+ zokbar=abc24 " )
263+ (list '(quoted (#"abcz " ))
264+ #"; filename=\"abc\\\"d\"
265+ zokbar=abc24 " ))
266+
267+ (check-equal? (QTDSTR #"\"abc\\\"d\"
268+ ; zokbar=abc24 " )
269+ (list '(quoted (#"abc " (escaped #"\\\" " ) #"d " ))
270+ #"
271+ ; zokbar=abc24 " ))
272+
273+ (check-equal? (content-disposition-parser
274+ #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"\r
275+ ; zokbar=abc24 " )
276+ (list `(#"form-data "
277+ ((SEMI (#"name " EQ (quoted (#"abcz " ))))
278+ (SEMI (#"filename " EQ (quoted (#"abc "
279+ (escaped #"\\\" " )
280+ #"d " ))))
281+ (SEMI (#"zokbar " EQ #"abc24 " ))))
282+ #"" ))
283+
284+ (check-equal? (QTDSTR #"\"filename=\" " )
285+ (list '(quoted (#"filename= " )) #"" ))
286+
287+
288+ (check-equal?
289+ (content-disposition-parser
290+ #"form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
291+ (list `(#"form-data "
292+ ((SEMI (#"name " EQ (quoted (#"filename= " ))))
293+ (SEMI (#"zokbar " EQ (quoted (#"dingo " ))))
294+ (SEMI (#"filename " EQ (quoted (#"wallaby " ))))))
295+ #"" ))
296+
297+ (check-match
298+ (parse-content-disposition-header
299+ #"form-data; name=\"filen\"ame=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
300+ (list 'parsefail (regexp #px"expected: byte string matching RFC6266 " )))
301+
302+ (check-match
303+ (parse-content-disposition-header
304+ #"form-data; name=\"filename=\"; zokbar*=\"dingo\"; filename=\"wallaby\" " )
305+ (list 'parsefail (regexp #px"token ending with * " )))
306+
307+ )
308+
309+ ;; this code was used to generate the regexp for tokens. In principle,
310+ ;; you shouldn't need this code unless you need to re-generate this
311+ ;; regexp
312+ (module background racket
313+
314+ (require rackunit)
315+
316+ ;; from RFC 2616:
317+ #|token = 1*<any CHAR except CTLs or separators>
318+ separators = "(" | ")" | "<" | ">" | "@"
319+ | "," | ";" | ":" | "\" | <">
320+ | "/" | "[" | "]" | "?" | "="
321+ | "{" | "}" | SP | HT
322+ |#
323+
324+ (define separators
325+ (map (λ (s) (first (string->list s)))
326+ '("( " ") " "< " "> " "@ "
327+ ", " "; " ": " "\\ " "\" "
328+ "/ " "[ " "] " "? " "= "
329+ "{ " "} " " " "\t " )))
330+
331+ (define CTLs
332+ (cons #\u007f
333+ (for/list ([i (in-range 0 32 )])
334+ (integer->char i))))
335+
336+ ;; add hyphen because it has to be treated
337+ ;; specially in regexps:
338+ (define separators-plus-ctls-plus-hyphen
339+ (cons #\- (append separators CTLs)))
340+
341+ (define omitted-integers
342+ (remove-duplicates
343+ (sort (map char->integer separators-plus-ctls-plus-hyphen) <)))
344+
345+ (define ranges
346+ (let loop ([range-begin 0 ]
347+ [badchars omitted-integers])
348+ (cond [(null? badchars)
349+ (cond [(< range-begin 127 )
350+ (list (list range-begin 126 ))]
351+ [else (list)])]
352+ [else
353+ (define nextbad (first badchars))
354+ (cond [(< range-begin nextbad)
355+ (cons (list range-begin (sub1 nextbad))
356+ (loop (add1 nextbad) (rest badchars)))]
357+ [(= range-begin nextbad)
358+ (loop (add1 range-begin)
359+ (rest badchars))]
360+ [else
361+ (error 'impossible-i-thought
362+ "~a ~a " range-begin nextbad)])])))
363+
364+ (define token-regexp-bstr
365+ (string->bytes/utf-8
366+ (call-with-output-string
367+ (λ (port)
368+ ;; adding the hyphen back in here:
369+ (fprintf port "[- " )
370+ (for/list ([r (in-list ranges)])
371+ (cond [(equal? (first r) (second r))
372+ (fprintf port "~a " (string (integer->char (first r))))]
373+ [else
374+ (fprintf port "~a~a~a " (string (integer->char (first r))) "- "
375+ (string (integer->char (second r))))]))
376+ (fprintf port "] " )))))
377+
378+ ;; check that it works:
379+
380+ (for ([i (in-range 0 128 )])
381+ (define ch (integer->char i))
382+ (cond [(member ch (append separators CTLs))
383+ (check-pred (λ (ch) (not (regexp-match? token-regexp-bstr
384+ (string ch))))
385+ ch)]
386+ [else
387+ (check-pred (λ (ch) (regexp-match? token-regexp-bstr (string ch)))
388+ ch)])))
0 commit comments