|
1 | 1 | # frozen_string_literal: true |
2 | 2 |
|
| 3 | +require "strscan" |
| 4 | + |
3 | 5 | module Prism |
4 | 6 | module Translation |
5 | 7 | class Parser |
@@ -251,6 +253,8 @@ def to_a |
251 | 253 | end |
252 | 254 | when :tCHARACTER |
253 | 255 | value.delete_prefix!("?") |
| 256 | + # Character literals behave similarly to double-quoted strings, so we can use the same unescaping mechanism. |
| 257 | + value = unescape_string(value, "?") |
254 | 258 | when :tCOMMENT |
255 | 259 | if token.type == :EMBDOC_BEGIN |
256 | 260 | start_index = index |
@@ -432,6 +436,156 @@ def parse_rational(value) |
432 | 436 | rescue ArgumentError |
433 | 437 | 0r |
434 | 438 | end |
| 439 | + |
| 440 | + # Wonky heredoc tab/spaces rules. |
| 441 | + # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558 |
def calculate_heredoc_whitespace(heredoc_token_index)
  # Walks the lexed tokens starting at +heredoc_token_index+ until the matching
  # HEREDOC_END and returns the smallest indentation width (tabs advance to the
  # next multiple of 8) found at the start of any line of the heredoc body.
  # Returns Float::MAX when no line contributes an indentation.
  #
  # Only tokens that begin a line (start column 0) are measured. Tracking the
  # line of the last token that lowered the result is not enough: content that
  # follows an interpolation on the same line would be mistaken for the start
  # of a line and could wrongly reduce the result.
  next_token_index = heredoc_token_index
  nesting_level = 0
  result = Float::MAX

  while (next_token = lexed[next_token_index]&.first)
    next_token_index += 1
    next_next_token = lexed[next_token_index]&.first
    starts_line = next_token.location.start_column == 0

    case next_token.type
    when :EMBEXPR_BEGIN
      # A line that opens with interpolation has no leading whitespace at all,
      # so nothing can be removed from any line.
      result = 0 if nesting_level == 0 && starts_line
      nesting_level += 1
    when :HEREDOC_START
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      # String content inside nested heredocs and interpolation is ignored,
      # as is content that does not start its line.
      if nesting_level == 0 && starts_line
        common_whitespace = 0
        next_token.value[/^\s*/].each_char do |char|
          if char == "\t"
            common_whitespace = (common_whitespace / 8 + 1) * 8
          else
            common_whitespace += 1
          end
        end

        # Whitespace-only lines do not count, unless the whitespace is followed
        # by interpolation on the same line, in which case it is significant.
        whitespace_only = common_whitespace == next_token.value.length &&
          next_next_token&.location&.start_line != next_token.location.start_line

        result = common_whitespace if !whitespace_only && common_whitespace < result
      end
    end
  end

  result
end
| 480 | + |
| 481 | + # Wonky heredoc tab/spaces rules. |
| 482 | + # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545 |
def trim_heredoc_whitespace(string, heredoc)
  # Strips leading tabs and spaces from +string+ until the removed width
  # reaches +heredoc.common_whitespace+. A tab advances the width to the next
  # multiple of 8; a tab that would overshoot the limit is left in place.
  # Returns the remainder of the string.
  width = 0
  cut = 0

  string.each_char do |char|
    break unless char == "\t" || char == " "
    break if width >= heredoc.common_whitespace

    if char == "\t"
      next_tab_stop = (width / 8 + 1) * 8
      break if next_tab_stop > heredoc.common_whitespace

      width = next_tab_stop
    else
      width += 1
    end

    cut += 1
  end

  string[cut..]
end
| 498 | + |
| 499 | + # Escape sequences that have a special meaning and should appear unescaped in the resulting string. |
ESCAPES = {
  "a" => "\a",
  "b" => "\b",
  "e" => "\e",
  "f" => "\f",
  "n" => "\n",
  "r" => "\r",
  "s" => "\s",
  "t" => "\t",
  "v" => "\v",
  "\\" => "\\"
}.freeze
private_constant :ESCAPES

# Maps an opening bracket-style delimiter to its closing counterpart.
# When a literal uses one of these as its delimiter, the matching
# closing delimiter may be backslash-escaped as well.
DELIMITER_SYMETRY = {
  "[" => "]",
  "(" => ")",
  "{" => "}",
  "<" => ">"
}.freeze
private_constant :DELIMITER_SYMETRY
| 511 | + |
| 512 | + # Apply Ruby string escaping rules |
def unescape_string(string, quote)
  # Applies Ruby's string escaping rules to +string+, the raw source text of a
  # string part, according to the opening +quote+ (e.g. "'", "\"", "%q(", "<<'").
  # Returns +string+ itself when nothing needs to change, otherwise a new string.
  #
  # NOTE: Control and meta escapes (\cx, \C-x, \M-x) and line continuations are
  # not interpreted here: the backslash is dropped and the characters after it
  # are kept as-is.

  # In single-quoted heredocs, everything is taken literally.
  return string if quote == "<<'"

  # TODO: Implement regexp escaping
  return string if quote == "/" || quote.start_with?("%r")

  # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
  return string unless string.include?("\\")

  unless interpolation?(quote)
    # Without interpolation only the backslash itself and the delimiter(s)
    # of the literal can be escaped.
    delimiter = quote == "'" ? "'" : quote[2]
    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    return string.gsub(/\\([\\#{delimiters}])/, '\1')
  end

  # Appending individual escape sequences may force the string out of its intended
  # encoding. Start out with binary and force it back later. Every appended piece
  # must be binary too: appending non-ASCII text in another encoding would switch
  # the buffer to that encoding and make a later raw byte raise
  # Encoding::CompatibilityError.
  # NOTE: When Ruby 3.4 is required, the appends can become result.append_as_bytes(...)
  result = "".b

  scanner = StringScanner.new(string)
  while (skipped = scanner.skip_until(/\\/))
    # Append what was just skipped over, excluding the found backslash.
    result << string.byteslice(scanner.pos - skipped, skipped - 1).b

    if (replacement = ESCAPES[scanner.peek(1)])
      # Simple single-character escape sequences like \n
      result << replacement
      scanner.pos += 1
    elsif (octal = scanner.check(/[0-7]{1,3}/))
      # \nnn - values above 0o377 wrap around to a single byte, like Ruby does.
      result << (octal.to_i(8) & 0xFF).chr.b
      scanner.pos += octal.bytesize
    elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
      # \xnn
      result << hex[1..].to_i(16).chr.b
      scanner.pos += hex.bytesize
    elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
      # \unnnn
      result << unicode[1..].hex.chr(Encoding::UTF_8).b
      scanner.pos += unicode.bytesize
    elsif scanner.peek(3) == "u{}"
      # https://github.com/whitequark/parser/issues/856
      scanner.pos += 3
    elsif (unicode_parts = scanner.check(/u\{.*?\}/))
      # \u{nnnn ...} - non-greedy, so only this escape's own braces are consumed.
      unicode_parts[2..-2].split.each do |codepoint|
        result << codepoint.hex.chr(Encoding::UTF_8).b
      end
      scanner.pos += unicode_parts.bytesize
    end
  end

  # Add remaining chars
  result << string.byteslice(scanner.pos..).b

  result.force_encoding(source_buffer.source.encoding)
end
| 579 | + |
| 580 | + # Determine if characters preceded by a backslash should be unescaped or not |
def interpolation?(quote)
  # Plain single quotes and the lowercase percent literals (%q, %w, %i)
  # answer false; every other opening quote answers true.
  return false if quote == "'"

  %w[%q %w %i].none? { |prefix| quote.start_with?(prefix) }
end
| 584 | + |
| 585 | + # Determine if the string is part of a %-style array. |
def percent_array?(quote)
  # True when the opening quote begins with %w, %W, %i or %I.
  %w[%w %W %i %I].any? { |prefix| quote.start_with?(prefix) }
end
435 | 589 | end |
436 | 590 | end |
437 | 591 | end |
|
0 commit comments