1010 (or/c
1111 (list/c 'parsefail string?)
1212 (list/c bytes?
13- (listof (list/c bytes? bytes ?)))))]))
13+ (listof (list/c bytes? string ?)))))]))
1414
1515(struct parsefail exn ())
1616
152152(define SEMI (rx-matcher/const #px#"^; " 'SEMI ))
153153(define EQ (rx-matcher/const #px#"^= " 'EQ ))
154154(define DQ (rx-matcher/const #px#"^\" " 'DQ ))
155+ (define SQ (rx-matcher/const #px"^' " 'SQ ))
155156
156157;; a quoted string. a quote followed by any character from 32-255 not
157158;; including backslash or quote, but optionally a backslash followed
166167 (postproc (seq DQ (kstar (orparse CLEANCHARSEQ QDESCAPED)) DQ)
167168 (λ (v) (list 'quoted (cadr v)))))
168169(define TOKEN (rx-matcher/raw #px#"^([-!#-'*-+.0-9A-Z^-z|~]+) " ))
169- (define VALUE (orparse TOKEN QTDSTR))
170+ (define VALUE (postproc (orparse TOKEN QTDSTR) (λ (x) (list 'val x))))
171+
172+ (define ISO-8859-1-TOKEN (rx-matcher/const #px"^[iI][sS][oO]-8859-1 " 'iso-8559-1 ))
173+ (define UTF-8-TOKEN (rx-matcher/const #px"^[uU][tT][fF]-8 " 'utf-8 ))
174+ (define LANG-TOKEN (rx-matcher/const #px"^[-a-zA-Z0-9]* " 'LANG-TAG ))
175+ (define PCT-ENCODED
176+ (postproc (rx-matcher/raw #px"^%[0-9a-fA-F][0-9a-fA-F] " )
177+ (λ (x) (list 'pct (string->number
178+ (bytes->string/utf-8 (subbytes x 1 3 ))
179+ 16 )))))
180+ (define ATTR-CHARS (rx-matcher/raw #px"^[-A-Za-z0-9!#$&+.^_`|~]+ " ))
181+ (define EXT-VALUE-CHARS (kstar (orparse PCT-ENCODED ATTR-CHARS)))
182+ (define EXT-VALUE
183+ (postproc (seq (orparse ISO-8859-1-TOKEN UTF-8-TOKEN)
184+ SQ LANG-TOKEN SQ EXT-VALUE-CHARS)
185+ (λ (x) (list 'extval x))))
186+
187+
170188
171189;; give up if we see a token ending with a star; these signal
172190;; RFC5987 ext-values, and we don't handle them correctly.
173191(define CLAUSE
174- (postproc
175- (seq/ws TOKEN EQ VALUE)
176- (λ (v)
177- (when (regexp-match #px#"\\*$ " (car v))
178- (raise
179- (parsefail
180- (format "token ending with * indicates unsupported ext-value: ~e "
181- (car v))
182- (current-continuation-marks))))
183- v)))
192+ (seq/ws TOKEN EQ (orparse VALUE EXT-VALUE)))
184193
185194(define content-disposition-parser
186195 (seq/ws TOKEN (kstar (seq/ws SEMI CLAUSE))))
200209 (list ty (for/list ([c (in-list clauses)])
201210 (match c
202211 [(list 'SEMI (list tok 'EQ val))
203- (list tok ( val-cleanup val) )]
212+ (clause-postproc tok val)]
204213 [other (error
205214 'parse-content-disposition-header
206215 "internal error, unexpected parse shape: ~e "
217226 "no RFC5987 ext-values, got: ~e " )
218227 rhs))])))
219228
229+ ;; clean up a clause by undoing escaping and joining strings
230+ (define (clause-postproc token val)
231+ (define token-ends-with-star?
232+ (regexp-match? #px"\\*$ " token))
233+ (define cleaned-val
234+ (match val
235+ [(list 'extval v)
236+ (cond [token-ends-with-star? (extval-cleanup v)]
237+ [else
238+ (raise
239+ (parsefail
240+ "illegal extended value attached to non-asterisk token: ~e "
241+ token))])]
242+ [(list 'val v) (val-cleanup v)]))
243+ (list token cleaned-val))
244+
220245;; clean up a quoted string by removing the quotes and undoing escaping
221246(define (val-cleanup val)
222247 (match val
223248 [(? bytes? b) b]
224249 [(list 'quoted l)
225- (apply bytes-append (for/list ([chunk (in-list l)])
226- (match chunk
227- [(? bytes? b) b]
228- [(list 'escaped eseq)
229- (subbytes eseq 1 2 )])))]))
250+ ;; quoted strings are supposed to be interpreted using
251+ ;; iso-8859-1, often known as latin-1.
252+ ;;
253+ ;; Here's a frightening passage from RFC2612, concerning the
254+ ;; definition of TEXT, the stuff in between the quotes:
255+ #|Words
256+ of *TEXT MAY contain characters from character sets other than ISO-
257+ 8859-1 [22] only when encoded according to the rules of RFC 2047
258+ [14]. |#
259+ ;; ... which leaves open the possibility that interpreting these
260+ ;; strictly as ISO-8859-1 strings may be incorrect. However, given
261+ ;; the existence of ext-values, I think that no provider would
262+ ;; use this mechanims. Famous last words. Lemme ask.
263+ (bytes->string/latin-1
264+ (apply bytes-append
265+ (for/list ([chunk (in-list l)])
266+ (match chunk
267+ [(? bytes? b) b]
268+ [(list 'escaped eseq)
269+ (subbytes eseq 1 2 )]))))]))
270+
271+ ;; clean up an extval by unescaping pct-encoded strings
272+ (define (extval-cleanup extval)
273+ (match extval
274+ [(list encoding _ _ _ pieces)
275+ (define unencoder
276+ (match encoding
277+ ['utf-8 bytes->string/utf-8]
278+ ['iso-8559-1 bytes->string/latin-1]))
279+ (define bstrs
280+ (for/list ([p (in-list pieces)])
281+ (match p
282+ [(list 'pct n) (bytes n)]
283+ [other other])))
284+ (unencoder (apply bytes-append bstrs))]))
230285
231286(module+ test
232287 (require rackunit)
241296 (escaped #"\\\" " )
242297 #"def " ))
243298 #"" ))
299+
300+ ;; move down later
301+ (check-equal? (EXT-VALUE #"UTF-8'en-li-SS'abcd " )
302+ '((extval (utf-8 SQ LANG-TAG SQ (#"abcd " ))) #"" ))
303+ (check-equal? (EXT-VALUE #"UTF-8'en-li-SS'abcd%20%5c " )
304+ '((extval
305+ (utf-8 SQ LANG-TAG SQ (#"abcd " (pct #x20 ) (pct #x5c ))))
306+ #"" ))
307+
308+
244309
245- (check-equal?
246- (parse-content-disposition-header
247- #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\" " )
248- '(#"form-data "
249- ((#"name " #"abcz " )
250- (#"filename " #"abc\"d " ))))
310+ (check-equal?
311+ (parse-content-disposition-header
312+ #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\" " )
313+ '(#"form-data "
314+ ((#"name " "abcz " )
315+ (#"filename " "abc\"d " ))))
316+
317+ ;; try a high latin-1 character:
318+ (check-equal?
319+ (parse-content-disposition-header
320+ #" form-data;filename=\"ab\330cd\" " )
321+ '(#"form-data "
322+ ((#"filename " "abØcd " ))))
251323
252324 (check-equal?
253325 (parse-content-disposition-header
254326 #" attachment; filename=\"\\\\foo.html\"\n " )
255327 '(#"attachment "
256- ((#"filename " # "\\foo.html " ))))
328+ ((#"filename " "\\foo.html " ))))
257329
258330(check-equal? (TOKEN #"form-data ; " )
259331 (list #"form-data " #" ; " ))
280352 #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"\r
281353 ; zokbar=abc24 " )
282354 (list `(#"form-data "
283- ((SEMI (#"name " EQ (quoted (#"abcz " ))))
284- (SEMI (#"filename " EQ (quoted (#"abc "
285- (escaped #"\\\" " )
286- #"d " ))))
287- (SEMI (#"zokbar " EQ #"abc24 " ))))
355+ ((SEMI (#"name " EQ (val ( quoted (#"abcz " ) ))))
356+ (SEMI (#"filename " EQ (val ( quoted (#"abc "
357+ (escaped #"\\\" " )
358+ #"d " ) ))))
359+ (SEMI (#"zokbar " EQ (val #"abc24 " ) ))))
288360 #"" ))
289361
290362 (check-equal? (QTDSTR #"\"filename=\" " )
296368 (content-disposition-parser
297369 #"form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
298370 (list `(#"form-data "
299- ((SEMI (#"name " EQ (quoted (#"filename= " ))))
300- (SEMI (#"zokbar " EQ (quoted (#"dingo " ))))
301- (SEMI (#"filename " EQ (quoted (#"wallaby " ))))))
371+ ((SEMI (#"name " EQ (val ( quoted (#"filename= " ) ))))
372+ (SEMI (#"zokbar " EQ (val ( quoted (#"dingo " ) ))))
373+ (SEMI (#"filename " EQ (val ( quoted (#"wallaby " ) ))))))
302374 #"" ))
303375
304376 (check-equal?
305377 (content-disposition-parser
306378 #" form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
307379 (list `(#"form-data "
308- ((SEMI (#"name " EQ (quoted (#"filename= " ))))
309- (SEMI (#"zokbar " EQ (quoted (#"dingo " ))))
310- (SEMI (#"filename " EQ (quoted (#"wallaby " ))))))
380+ ((SEMI (#"name " EQ (val ( quoted (#"filename= " ) ))))
381+ (SEMI (#"zokbar " EQ (val ( quoted (#"dingo " ) ))))
382+ (SEMI (#"filename " EQ (val ( quoted (#"wallaby " ) ))))))
311383 #"" ))
312384
313385 (check-match
314386 (parse-content-disposition-header
315387 #"form-data; name=\"filen\"ame=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
316388 (list 'parsefail (regexp #px"expected: byte string matching RFC6266 " )))
317389
318- (check-match
390+ (check-equal?
319391 (parse-content-disposition-header
320- #"form-data; name=\"filename=\"; zokbar*=\"dingo\"; filename=\"wallaby\" " )
321- (list 'parsefail (regexp #px"token ending with * " )))
392+ #" attachment; filename=\"foo-ae.html\"; filename*=UTF-8''foo-%c3%a4.html\n " )
393+ '(#"attachment " (#"filename " "foo-ae.html " )
394+ (#"filename* " "foo-ä.html " )))
322395
323396 )
397+
324398
325399;; this code was used to generate the regexp for tokens. In principle,
326400;; you shouldn't need this code unless you need to re-generate this
401475 ch)]
402476 [else
403477 (check-pred (λ (ch) (regexp-match? token-regexp-bstr (string ch)))
404- ch)])))
478+ ch)]))
479+
480+
481+ #|attr-char = ALPHA / DIGIT
482+ / "!" / "#" / "$" / "&" / "+" / "-" / "."
483+ / "^" / "_" / "`" / "|" / "~"
484+ ; token except ( "*" / "'" / "%" )
485+ |#
486+
487+ )
488+
489+
490+
491+
0 commit comments