Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unescaping cleanups #103919

Merged
merged 10 commits into from
Nov 9, 2022
29 changes: 11 additions & 18 deletions compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@

use crate::ast::{self, Lit, LitKind};
use crate::token::{self, Token};

use rustc_lexer::unescape::{unescape_byte, unescape_char};
use rustc_lexer::unescape::{unescape_byte_literal, unescape_literal, Mode};
use rustc_lexer::unescape::{byte_from_char, unescape_byte, unescape_char, unescape_literal, Mode};
use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span;

use std::ascii;

pub enum LitError {
Expand Down Expand Up @@ -109,13 +106,11 @@ impl LitKind {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_byte_literal(&s, Mode::ByteStr, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
unescape_literal(&s, Mode::ByteStr, &mut |_, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
Expand All @@ -127,13 +122,11 @@ impl LitKind {
let bytes = if s.contains('\r') {
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_byte_literal(&s, Mode::RawByteStr, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
unescape_literal(&s, Mode::RawByteStr, &mut |_, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
Expand Down
10 changes: 5 additions & 5 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,13 @@ pub enum RawStrError {
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
/// Literal starts with "0b".
Binary,
Binary = 2,
/// Literal starts with "0o".
Octal,
/// Literal starts with "0x".
Hexadecimal,
Octal = 8,
/// Literal doesn't contain a prefix.
Decimal,
Decimal = 10,
/// Literal starts with "0x".
Hexadecimal = 16,
}

/// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun",
Expand Down
130 changes: 51 additions & 79 deletions compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,8 @@ pub enum EscapeError {

/// Unicode escape code in byte literal.
UnicodeEscapeInByte,
/// Non-ascii character in byte literal.
/// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
NonAsciiCharInByte,
/// Non-ascii character in byte string literal.
NonAsciiCharInByteString,

/// After a line ending with '\', the next line contains whitespace
/// characters that are not skipped.
Expand All @@ -78,54 +76,33 @@ impl EscapeError {
/// Takes a contents of a literal (without quotes) and produces a
/// sequence of escaped characters or errors.
/// Values are returned through invoking of the provided callback.
pub fn unescape_literal<F>(literal_text: &str, mode: Mode, callback: &mut F)
pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
match mode {
Mode::Char | Mode::Byte => {
let mut chars = literal_text.chars();
let result = unescape_char_or_byte(&mut chars, mode);
// The Chars iterator moved forward.
callback(0..(literal_text.len() - chars.as_str().len()), result);
let mut chars = src.chars();
let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte);
callback(0..(src.len() - chars.as_str().len()), res);
}
Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback),
// NOTE: Raw strings do not perform any explicit character escaping, here we
// only translate CRLF to LF and produce errors on bare CR.
Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback),
Mode::RawStr | Mode::RawByteStr => {
unescape_raw_str_or_raw_byte_str(literal_text, mode, callback)
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback)
}
}
}

/// Takes a contents of a byte, byte string or raw byte string (without quotes)
/// and produces a sequence of bytes or errors.
/// Values are returned through invoking of the provided callback.
pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
debug_assert!(mode.is_bytes());
unescape_literal(literal_text, mode, &mut |range, result| {
callback(range, result.map(byte_from_char));
})
}

/// Takes a contents of a char literal (without quotes), and returns an
/// unescaped char or an error
pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Char)
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
/// unescaped char or an error.
pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
unescape_char_or_byte(&mut src.chars(), false)
}

/// Takes a contents of a byte literal (without quotes), and returns an
/// unescaped byte or an error.
pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Byte)
.map(byte_from_char)
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
unescape_char_or_byte(&mut src.chars(), true).map(byte_from_char)
}

/// What kind of literal do we parse.
Expand All @@ -147,20 +124,17 @@ impl Mode {
}
}

pub fn is_bytes(self) -> bool {
pub fn is_byte(self) -> bool {
match self {
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
Mode::Char | Mode::Str | Mode::RawStr => false,
}
}
}

fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
nnethercote marked this conversation as resolved.
Show resolved Hide resolved
// Previous character was '\\', unescape what follows.

let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;

let res = match second_char {
let res = match chars.next().ok_or(EscapeError::LoneSlash)? {
'"' => '"',
'n' => '\n',
'r' => '\r',
Expand All @@ -181,7 +155,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
let value = hi * 16 + lo;

// For a non-byte literal verify that it is within ASCII range.
if !mode.is_bytes() && !is_ascii(value) {
if !is_byte && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
let value = value as u8;
Expand Down Expand Up @@ -217,7 +191,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {

// Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal.
if mode.is_bytes() {
if is_byte {
return Err(EscapeError::UnicodeEscapeInByte);
}

Expand Down Expand Up @@ -249,23 +223,22 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
}

#[inline]
fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> {
if mode.is_bytes() && !first_char.is_ascii() {
fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> {
if is_byte && !c.is_ascii() {
// Byte literal can't be a non-ascii character.
Err(EscapeError::NonAsciiCharInByte)
} else {
Ok(first_char)
Ok(c)
}
}

fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
debug_assert!(mode == Mode::Char || mode == Mode::Byte);
let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = match first_char {
'\\' => scan_escape(chars, mode),
fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> {
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
let res = match c {
'\\' => scan_escape(chars, is_byte),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(first_char, mode),
_ => ascii_check(c, is_byte),
}?;
if chars.next().is_some() {
return Err(EscapeError::MoreThanOneChar);
Expand All @@ -275,20 +248,20 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca

/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
debug_assert!(mode == Mode::Str || mode == Mode::ByteStr);
let initial_len = src.len();
let mut chars = src.chars();
while let Some(first_char) = chars.next() {
let start = initial_len - chars.as_str().len() - first_char.len_utf8();

let unescaped_char = match first_char {
// The `start` and `end` computation here is complicated because
// `skip_ascii_whitespace` makes us to skip over chars without counting
// them in the range computation.
while let Some(c) = chars.next() {
let start = src.len() - chars.as_str().len() - c.len_utf8();
let res = match c {
'\\' => {
let second_char = chars.clone().next();
match second_char {
match chars.clone().next() {
Some('\n') => {
// Rust language specification requires us to skip whitespaces
// if unescaped '\' character is followed by '\n'.
Expand All @@ -297,17 +270,17 @@ where
skip_ascii_whitespace(&mut chars, start, callback);
continue;
}
_ => scan_escape(&mut chars, mode),
_ => scan_escape(&mut chars, is_byte),
}
}
'\n' => Ok('\n'),
'\t' => Ok('\t'),
'"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(first_char, mode),
_ => ascii_check(c, is_byte),
};
let end = initial_len - chars.as_str().len();
callback(start..end, unescaped_char);
let end = src.len() - chars.as_str().len();
callback(start..end, res);
}

fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
Expand Down Expand Up @@ -340,30 +313,29 @@ where
/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of characters or errors.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only translate CRLF to LF and produce errors on bare CR.
fn unescape_raw_str_or_raw_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
/// only produce errors on bare CR.
fn unescape_raw_str_or_raw_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr);
let initial_len = literal_text.len();

let mut chars = literal_text.chars();
while let Some(curr) = chars.next() {
let start = initial_len - chars.as_str().len() - curr.len_utf8();
let mut chars = src.chars();

let result = match curr {
// The `start` and `end` computation here matches the one in
// `unescape_str_or_byte_str` for consistency, even though this function
// doesn't have to worry about skipping any chars.
while let Some(c) = chars.next() {
let start = src.len() - chars.as_str().len() - c.len_utf8();
let res = match c {
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString),
c => Ok(c),
_ => ascii_check(c, is_byte),
};
let end = initial_len - chars.as_str().len();

callback(start..end, result);
let end = src.len() - chars.as_str().len();
callback(start..end, res);
}
}

fn byte_from_char(c: char) -> u8 {
#[inline]
pub fn byte_from_char(c: char) -> u8 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is a bit dicey. I agree that in the context of compiler we don't really care about nice abstrction boundaries, and this is a simplification.

But we want rustc_lexer to also be a nice crates.io crates, and from that point of view this feels a bit too type-unsafe.

No strong opinion overall.

Though, if we do this, this one surely wants to be tagged as #[inline]? tiny cross-crate fn.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a fair point. A couple of questions:

  • Is rustc_lexer actually usable outside of rustc/rust-analyzer? the rustc-lexer crate is v0.1.0 and hasn't been updated in three years. I see some auto-updated variants but none of them have been updated in the past year.
  • I'm not sure how this is type-unsafe. You'll get u8/char type mismatches if you do something wrong?
  • Inlining is a good idea.

let res = c as u32;
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
res as u8
Expand Down
29 changes: 10 additions & 19 deletions compiler/rustc_lexer/src/unescape/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ use super::*;
#[test]
fn test_unescape_char_bad() {
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
assert_eq!(unescape_char(literal_text), Err(expected_error));
}

check("", EscapeError::ZeroChars);
Expand Down Expand Up @@ -68,8 +67,7 @@ fn test_unescape_char_bad() {
#[test]
fn test_unescape_char_good() {
fn check(literal_text: &str, expected_char: char) {
let actual_result = unescape_char(literal_text);
assert_eq!(actual_result, Ok(expected_char));
assert_eq!(unescape_char(literal_text), Ok(expected_char));
}

check("a", 'a');
Expand Down Expand Up @@ -149,8 +147,7 @@ fn test_unescape_str_good() {
#[test]
fn test_unescape_byte_bad() {
fn check(literal_text: &str, expected_error: EscapeError) {
let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
assert_eq!(actual_result, Err(expected_error));
assert_eq!(unescape_byte(literal_text), Err(expected_error));
}

check("", EscapeError::ZeroChars);
Expand Down Expand Up @@ -219,8 +216,7 @@ fn test_unescape_byte_bad() {
#[test]
fn test_unescape_byte_good() {
fn check(literal_text: &str, expected_byte: u8) {
let actual_result = unescape_byte(literal_text);
assert_eq!(actual_result, Ok(expected_byte));
assert_eq!(unescape_byte(literal_text), Ok(expected_byte));
}

check("a", b'a');
Expand All @@ -246,10 +242,10 @@ fn test_unescape_byte_good() {
fn test_unescape_byte_str_good() {
fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_byte_literal(literal_text, Mode::ByteStr, &mut |range, c| {
unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Ok(c) => b.push(byte_from_char(c)),
Err(e) => buf = Err((range, e)),
}
}
Expand Down Expand Up @@ -280,18 +276,13 @@ fn test_unescape_raw_str() {

#[test]
fn test_unescape_raw_byte_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<u8, EscapeError>)]) {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_byte_literal(literal, Mode::RawByteStr, &mut |range, res| {
unescaped.push((range, res))
});
unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected);
}

check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
check(
"🦀a",
&[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))],
);
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]);
check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]);
}
Loading