Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ memchr.workspace = true
thiserror.workspace = true
tracing.workspace = true
bumpalo.workspace = true
encoding_rs = "0.8.35"

[lints]
workspace = true
23 changes: 13 additions & 10 deletions lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#[cfg(test)]
mod tests;

mod locale_encoding;

use core::str;
use std::{
cmp::Ordering,
Expand All @@ -15,14 +17,15 @@ use std::{
};

use bumpalo::{Bump, collections::Vec};
pub use locale_encoding::LocaleEncoding;
use logos::{Logos, Skip};
pub use logos::{Span, SpannedIter};
use memchr::{memchr, memchr3};
use thiserror::Error;

pub type Lexer<'a> = logos::Lexer<'a, Token<'a>>;
pub type Result<T, E = LexingError> = std::result::Result<T, E>;

use memchr::{memchr, memchr3};
use thiserror::Error;

// TODO: check what GNU does about potentially reserved `@ident`; add error branch if so.
#[derive(Logos, Debug, PartialEq)]
#[logos(utf8 = false)]
Expand Down Expand Up @@ -236,6 +239,7 @@ pub struct Extra {
arena: NonNull<Bump>,
posix_strict: bool,
gnu_strict: bool,
encoding: LocaleEncoding,
}

#[derive(Debug, Default, PartialEq, Eq)]
Expand Down Expand Up @@ -276,6 +280,7 @@ impl<'a> Token<'a> {
arena: &'a Bump,
posix_strict: bool,
gnu_strict: bool,
encoding: LocaleEncoding,
) -> logos::Lexer<'a, Self> {
Lexer::with_extras(
source,
Expand All @@ -284,6 +289,7 @@ impl<'a> Token<'a> {
arena: NonNull::from_ref(arena),
posix_strict,
gnu_strict,
encoding,
},
)
}
Expand Down Expand Up @@ -368,6 +374,7 @@ fn parse_content<'a, const REGEX: bool, const DELIMITER: char>(
&rest[i..],
out.to_mut(lex.extras.arena()),
lex.extras.posix_strict,
lex.extras.encoding,
)?;
start = i + consumed;
}
Expand All @@ -385,6 +392,7 @@ fn parse_escape<const REGEX: bool>(
slice: &[u8],
out: &mut Vec<u8>,
posix_strict: bool,
encoding: LocaleEncoding,
) -> Result<usize> {
let mut count = 2;
let is_oct = |x: char| ('0'..'8').contains(&x);
Expand Down Expand Up @@ -444,13 +452,8 @@ fn parse_escape<const REGEX: bool>(
count += num_digits;

let codepoint = parse_hex_digits(&slice[2..2 + num_digits]);

// FIXME: assumes UTF-8 locale; replacement character and encoding may differ
// for non-UTF-8 locales.
let c = char::from_u32(codepoint).unwrap_or('\u{FFFD}');
let mut buf = [0u8; 4];
let encoded = c.encode_utf8(&mut buf);
out.extend_from_slice_copy(encoded.as_bytes());
let bytes = encoding.encode_unicode_escape(codepoint);
out.extend_from_slice_copy(&bytes);

return Ok(count);
}
Expand Down
102 changes: 102 additions & 0 deletions lexer/src/locale_encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// This file is part of the uutils awk package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

//! Locale-aware encoding for `\u` escape sequences, matching gawk behavior.
//!
//! See: <https://www.gnu.org/software/gawk/manual/html_node/Escape-Sequences.html>

use encoding_rs::{EncoderResult, Encoding, UTF_8};

/// Character encoding used to turn `\u` escapes into bytes, derived from the
/// process locale (`LC_*` / `LANG`).
///
/// The type is `Copy`, so it is passed around by value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LocaleEncoding {
/// Target `encoding_rs` encoding that escaped code points are encoded into.
encoding: &'static Encoding,
/// `C` / `POSIX` locales only accept ASCII via `\u`; when set, code points
/// above U+007F are emitted as `?`.
ascii_only: bool,
}

impl LocaleEncoding {
    /// UTF-8 locale: `\u` escapes are emitted as UTF-8 byte sequences.
    pub fn utf8() -> Self {
        Self { encoding: UTF_8, ascii_only: false }
    }

    /// `C` / `POSIX` locale: `\u` only encodes code points in ASCII.
    pub fn ascii() -> Self {
        Self { encoding: UTF_8, ascii_only: true }
    }

    /// ISO-8859-1 (Latin-1).
    ///
    /// Falls back to UTF-8 if the label cannot be resolved.
    ///
    /// NOTE(review): `encoding_rs` follows the WHATWG label table, where the
    /// `iso-8859-1` label is believed to resolve to windows-1252. The two agree
    /// on U+00A0..=U+00FF but may differ for 0x80..=0x9F — confirm this is
    /// acceptable for gawk compatibility.
    pub fn iso_8859_1() -> Self {
        Self {
            encoding: Encoding::for_label(b"iso-8859-1").unwrap_or(UTF_8),
            ascii_only: false,
        }
    }

    /// Detect encoding from `LC_ALL`, `LC_CTYPE`, or `LANG`.
    ///
    /// The variables are consulted in that order and the first one holding a
    /// non-empty value wins. An empty value is treated as unset (as POSIX
    /// specifies), so `LC_ALL=""` no longer hides a usable `LANG`. Values that
    /// are not valid Unicode are skipped as well. When nothing is set the
    /// locale name `C.UTF-8` is assumed.
    pub fn detect() -> Self {
        let name = ["LC_ALL", "LC_CTYPE", "LANG"]
            .into_iter()
            .filter_map(|key| std::env::var(key).ok())
            .find(|value| !value.is_empty())
            .unwrap_or_else(|| "C.UTF-8".to_string());
        from_locale_name(&name)
    }

    /// Encode a Unicode scalar value for a `\u` escape in the current locale.
    ///
    /// Invalid code points and characters that cannot be represented in the
    /// locale encoding become `?`, matching gawk.
    pub fn encode_unicode_escape(self, codepoint: u32) -> Vec<u8> {
        // `char::from_u32` rejects exactly the values above U+10FFFF and the
        // surrogate range, so no separate range check (or `unwrap`) is needed.
        let Some(c) = char::from_u32(codepoint) else {
            return vec![b'?'];
        };

        if self.ascii_only {
            // An ASCII code point is its own single-byte encoding.
            return match u8::try_from(c) {
                Ok(byte) if byte.is_ascii() => vec![byte],
                _ => vec![b'?'],
            };
        }

        // Encode on the stack; this is both the UTF-8 result and the encoder's
        // input, which avoids allocating a `String` per escape.
        let mut utf8_buf = [0u8; 4];
        let utf8: &str = c.encode_utf8(&mut utf8_buf);

        if self.encoding == UTF_8 {
            return utf8.as_bytes().to_vec();
        }

        let mut encoder = self.encoding.new_encoder();
        let mut buf = [0u8; 8];
        // `last = true` flushes any state the encoder holds after the character.
        match encoder.encode_from_utf8_without_replacement(utf8, &mut buf, true) {
            (EncoderResult::InputEmpty, _, written) if written > 0 => buf[..written].to_vec(),
            // Unmappable character (or nothing written): substitute `?`.
            _ => vec![b'?'],
        }
    }
}

impl Default for LocaleEncoding {
fn default() -> Self {
Self::utf8()
}
}

/// Map a locale name such as `en_US.ISO-8859-1` or `C` to a [`LocaleEncoding`].
///
/// The name is expected in the POSIX form
/// `language[_territory][.codeset][@modifier]` and is matched
/// case-insensitively:
///
/// * the `@modifier` suffix is ignored, so `de_DE.ISO-8859-15@euro` resolves
///   through its codeset instead of silently falling back to UTF-8;
/// * `C` / `POSIX`, as the whole name or as the codeset, select the
///   ASCII-only encoding;
/// * any codeset mentioning `utf-8` (or spelled `utf8`) selects UTF-8;
/// * otherwise the codeset (or the whole name when there is no `.`) is looked
///   up as an `encoding_rs` label, with `_` normalised to `-`;
/// * unknown names fall back to UTF-8.
fn from_locale_name(name: &str) -> LocaleEncoding {
    // Drop the optional `@modifier`; it never names a character set.
    let base = name.split_once('@').map_or(name, |(head, _)| head);
    let lower = base.to_ascii_lowercase();
    let codeset = lower.rsplit_once('.').map(|(_, codeset)| codeset);

    if matches!(lower.as_str(), "c" | "posix") || matches!(codeset, Some("c" | "posix")) {
        return LocaleEncoding::ascii();
    }

    // Without an explicit codeset the whole name is tried as a label.
    let label = codeset.unwrap_or(&lower).replace('_', "-");

    if label.contains("utf-8") || label == "utf8" {
        return LocaleEncoding::utf8();
    }

    Encoding::for_label(label.as_bytes()).map_or_else(LocaleEncoding::utf8, |encoding| {
        LocaleEncoding { encoding, ascii_only: false }
    })
}
93 changes: 91 additions & 2 deletions lexer/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,25 @@ use bumpalo::{
collections::{CollectIn, Vec},
};

use crate::{Identifier, Token};
use crate::{Identifier, LocaleEncoding, Token};

fn lex<'a>(
src: &'a [u8],
arena: &'a Bump,
posix_strict: bool,
gnu_strict: bool,
) -> Vec<'a, Token<'a>> {
Token::lex(src, arena, posix_strict, gnu_strict)
lex_with_encoding(src, arena, posix_strict, gnu_strict, LocaleEncoding::utf8())
}

/// Lex `src` with an explicit locale encoding and collect every token into an
/// arena-backed vector, panicking if any token fails to lex.
fn lex_with_encoding<'a>(
    src: &'a [u8],
    arena: &'a Bump,
    posix_strict: bool,
    gnu_strict: bool,
    encoding: LocaleEncoding,
) -> Vec<'a, Token<'a>> {
    let tokens = Token::lex(src, arena, posix_strict, gnu_strict, encoding);
    let collected = tokens.collect_in::<Result<Vec<_>, _>>(arena);
    collected.unwrap()
}
Expand Down Expand Up @@ -355,3 +365,82 @@ fn lexer_test_unicode_escape_eight_digits() {
&[Token::String(b"2".into())]
);
}

/// `\u` escapes under the ISO-8859-1 locale encoding: a representable Latin-1
/// character becomes one byte, ASCII is unchanged, and an unrepresentable
/// character becomes `?`.
#[test]
fn lexer_test_unicode_escape_iso8859_1() {
    let arena = Bump::new();
    let latin1 = LocaleEncoding::iso_8859_1();

    // U+00E9 is representable as the single byte 0xE9.
    let e_acute = lex_with_encoding(b"\"\\u00e9\"", &arena, false, false, latin1);
    assert_eq!(&e_acute, &[Token::String(b"\xe9".into())]);

    // ASCII passes through unchanged.
    let capital_a = lex_with_encoding(b"\"\\u0041\"", &arena, false, false, latin1);
    assert_eq!(&capital_a, &[Token::String(b"A".into())]);

    // U+4E2D cannot be represented and is replaced by `?`.
    let cjk = lex_with_encoding(b"\"\\u4e2d\"", &arena, false, false, latin1);
    assert_eq!(&cjk, &[Token::String(b"?".into())]);
}

/// `\u` escapes under the ASCII-only (`C` / `POSIX`) encoding: ASCII code
/// points are kept, everything else becomes `?`.
#[test]
fn lexer_test_unicode_escape_ascii_locale() {
    let arena = Bump::new();
    let ascii = LocaleEncoding::ascii();

    // An ASCII code point is emitted as-is.
    let capital_a = lex_with_encoding(b"\"\\u0041\"", &arena, false, false, ascii);
    assert_eq!(&capital_a, &[Token::String(b"A".into())]);

    // A non-ASCII code point is replaced by `?`.
    let e_acute = lex_with_encoding(b"\"\\u00e9\"", &arena, false, false, ascii);
    assert_eq!(&e_acute, &[Token::String(b"?".into())]);
}

/// Code points that are not Unicode scalar values are replaced by `?`, even
/// in a UTF-8 locale.
#[test]
fn lexer_test_unicode_escape_invalid_codepoint() {
    let arena = Bump::new();
    let utf8 = LocaleEncoding::utf8();

    // Above the Unicode maximum of U+10FFFF.
    let too_large = lex_with_encoding(b"\"\\u110000\"", &arena, false, false, utf8);
    assert_eq!(&too_large, &[Token::String(b"?".into())]);

    // A lone surrogate is not a scalar value.
    let surrogate = lex_with_encoding(b"\"\\uD800\"", &arena, false, false, utf8);
    assert_eq!(&surrogate, &[Token::String(b"?".into())]);
}
9 changes: 6 additions & 3 deletions parser/src/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
use std::{fmt::Debug, iter::Peekable};

use bumpalo::Bump;
use lexer::{Identifier, LexingError, Slice, Span, SpannedIter, Token};
use lexer::{Identifier, LexingError, LocaleEncoding, Slice, Span, SpannedIter, Token};

use super::Result;
use crate::{
Expand All @@ -26,7 +26,9 @@ impl<'a> Lexer<'a> {
pub fn new(source: &'a [u8], arena: &'a Bump) -> Self {
Self {
// TODO: wire in POSIX & GNU strict conformance.
inner: Token::lex(source, arena, false, false).spanned().peekable(),
inner: Token::lex(source, arena, false, false, LocaleEncoding::detect())
.spanned()
.peekable(),
span: Span::default(),
// source,
}
Expand Down Expand Up @@ -96,7 +98,8 @@ impl<'a> Lexer<'a> {
}

pub fn lex_ident(&self, source: &[u8], arena: &'a Bump) -> Result<&'a str> {
let Some(Ok(Token::Identifier(ident))) = Token::lex(source, arena, true, true).next()
let Some(Ok(Token::Identifier(ident))) =
Token::lex(source, arena, true, true, LocaleEncoding::detect()).next()
else {
return Err(ParsingError::UnexpectedToken(
self.span().start + 1..self.span().end - 1,
Expand Down
Loading