From f32e6781b2932aed55342ad8a4a7f1023acb30b4 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 Nov 2022 11:31:07 +1100 Subject: [PATCH 01/10] Rename some variables. These have been bugging me for a while. - `literal_text`: `src` is also used and is shorter and better. - `first_char`: used even when "first" doesn't make sense; `c` is shorter and better. - `curr`: `c` is shorter and better. - `unescaped_char`: `result` is also used and is shorter and better. - `second_char`: these have a single use and can be elided. --- compiler/rustc_lexer/src/unescape.rs | 70 +++++++++++++--------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 8f64b5f5158e4..a6752c82bd3c5 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -78,54 +78,52 @@ impl EscapeError { /// Takes a contents of a literal (without quotes) and produces a /// sequence of escaped characters or errors. /// Values are returned through invoking of the provided callback. -pub fn unescape_literal(literal_text: &str, mode: Mode, callback: &mut F) +pub fn unescape_literal(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { match mode { Mode::Char | Mode::Byte => { - let mut chars = literal_text.chars(); + let mut chars = src.chars(); let result = unescape_char_or_byte(&mut chars, mode); // The Chars iterator moved forward. - callback(0..(literal_text.len() - chars.as_str().len()), result); + callback(0..(src.len() - chars.as_str().len()), result); } - Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback), + Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode, callback), // NOTE: Raw strings do not perform any explicit character escaping, here we // only translate CRLF to LF and produce errors on bare CR. 
- Mode::RawStr | Mode::RawByteStr => { - unescape_raw_str_or_raw_byte_str(literal_text, mode, callback) - } + Mode::RawStr | Mode::RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), } } /// Takes a contents of a byte, byte string or raw byte string (without quotes) /// and produces a sequence of bytes or errors. /// Values are returned through invoking of the provided callback. -pub fn unescape_byte_literal(literal_text: &str, mode: Mode, callback: &mut F) +pub fn unescape_byte_literal(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { debug_assert!(mode.is_bytes()); - unescape_literal(literal_text, mode, &mut |range, result| { + unescape_literal(src, mode, &mut |range, result| { callback(range, result.map(byte_from_char)); }) } /// Takes a contents of a char literal (without quotes), and returns an /// unescaped char or an error -pub fn unescape_char(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); +pub fn unescape_char(src: &str) -> Result { + let mut chars = src.chars(); unescape_char_or_byte(&mut chars, Mode::Char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) + .map_err(|err| (src.len() - chars.as_str().len(), err)) } /// Takes a contents of a byte literal (without quotes), and returns an /// unescaped byte or an error. -pub fn unescape_byte(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); +pub fn unescape_byte(src: &str) -> Result { + let mut chars = src.chars(); unescape_char_or_byte(&mut chars, Mode::Byte) .map(byte_from_char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) + .map_err(|err| (src.len() - chars.as_str().len(), err)) } /// What kind of literal do we parse. @@ -157,10 +155,7 @@ impl Mode { fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { // Previous character was '\\', unescape what follows. 
- - let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; - - let res = match second_char { + let res = match chars.next().ok_or(EscapeError::LoneSlash)? { '"' => '"', 'n' => '\n', 'r' => '\r', @@ -249,23 +244,23 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { } #[inline] -fn ascii_check(first_char: char, mode: Mode) -> Result { - if mode.is_bytes() && !first_char.is_ascii() { +fn ascii_check(c: char, mode: Mode) -> Result { + if mode.is_bytes() && !c.is_ascii() { // Byte literal can't be a non-ascii character. Err(EscapeError::NonAsciiCharInByte) } else { - Ok(first_char) + Ok(c) } } fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { debug_assert!(mode == Mode::Char || mode == Mode::Byte); - let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = match first_char { + let c = chars.next().ok_or(EscapeError::ZeroChars)?; + let res = match c { '\\' => scan_escape(chars, mode), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(first_char, mode), + _ => ascii_check(c, mode), }?; if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); @@ -282,13 +277,12 @@ where debug_assert!(mode == Mode::Str || mode == Mode::ByteStr); let initial_len = src.len(); let mut chars = src.chars(); - while let Some(first_char) = chars.next() { - let start = initial_len - chars.as_str().len() - first_char.len_utf8(); + while let Some(c) = chars.next() { + let start = initial_len - chars.as_str().len() - c.len_utf8(); - let unescaped_char = match first_char { + let result = match c { '\\' => { - let second_char = chars.clone().next(); - match second_char { + match chars.clone().next() { Some('\n') => { // Rust language specification requires us to skip whitespaces // if unescaped '\' character is followed by '\n'. 
@@ -304,10 +298,10 @@ where '\t' => Ok('\t'), '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(first_char, mode), + _ => ascii_check(c, mode), }; let end = initial_len - chars.as_str().len(); - callback(start..end, unescaped_char); + callback(start..end, result); } fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) @@ -341,18 +335,18 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(literal_text: &str, mode: Mode, callback: &mut F) +fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr); - let initial_len = literal_text.len(); + let initial_len = src.len(); - let mut chars = literal_text.chars(); - while let Some(curr) = chars.next() { - let start = initial_len - chars.as_str().len() - curr.len_utf8(); + let mut chars = src.chars(); + while let Some(c) = chars.next() { + let start = initial_len - chars.as_str().len() - c.len_utf8(); - let result = match curr { + let result = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), c => Ok(c), From 84ca2c3bab370ee58ebd23050e9286e1d9e664b9 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 Nov 2022 12:15:55 +1100 Subject: [PATCH 02/10] Clarify range calculations. There is some subtlety here. 
--- compiler/rustc_lexer/src/unescape.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index a6752c82bd3c5..dc2fd359e278e 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -275,11 +275,13 @@ where F: FnMut(Range, Result), { debug_assert!(mode == Mode::Str || mode == Mode::ByteStr); - let initial_len = src.len(); let mut chars = src.chars(); - while let Some(c) = chars.next() { - let start = initial_len - chars.as_str().len() - c.len_utf8(); + // The `start` and `end` computation here is complicated because + // `skip_ascii_whitespace` makes us skip over chars without counting + // them in the range computation. + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); let result = match c { '\\' => { match chars.clone().next() { @@ -300,7 +302,7 @@ where '\r' => Err(EscapeError::BareCarriageReturn), _ => ascii_check(c, mode), }; - let end = initial_len - chars.as_str().len(); + let end = src.len() - chars.as_str().len(); callback(start..end, result); } @@ -340,19 +342,19 @@ where F: FnMut(Range, Result), { debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr); - let initial_len = src.len(); - let mut chars = src.chars(); - while let Some(c) = chars.next() { - let start = initial_len - chars.as_str().len() - c.len_utf8(); + // The `start` and `end` computation here matches the one in + // `unescape_str_or_byte_str` for consistency, even though this function + // doesn't have to worry about skipping any chars. 
+ while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); let result = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), c => Ok(c), }; - let end = initial_len - chars.as_str().len(); - + let end = src.len() - chars.as_str().len(); callback(start..end, result); } } From 34b32b0dac9da3fad7861bdc2bad89d771172bb3 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 Nov 2022 13:35:49 +1100 Subject: [PATCH 03/10] Use `Mode` less. It's passed to numerous places where we just need an `is_byte` bool. Passing the bool avoids the need for some assertions. Also rename `is_bytes()` as `is_byte()`, to better match `Mode::Byte`, `Mode::ByteStr`, and `Mode::RawByteStr`. --- compiler/rustc_lexer/src/unescape.rs | 46 +++++++++---------- .../src/lexer/unescape_error_reporting.rs | 14 +++--- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index dc2fd359e278e..f0042a397c2c5 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -85,14 +85,16 @@ where match mode { Mode::Char | Mode::Byte => { let mut chars = src.chars(); - let result = unescape_char_or_byte(&mut chars, mode); + let result = unescape_char_or_byte(&mut chars, mode == Mode::Byte); // The Chars iterator moved forward. callback(0..(src.len() - chars.as_str().len()), result); } - Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode, callback), + Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), // NOTE: Raw strings do not perform any explicit character escaping, here we // only translate CRLF to LF and produce errors on bare CR. 
- Mode::RawStr | Mode::RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), + Mode::RawStr | Mode::RawByteStr => { + unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) + } } } @@ -103,7 +105,7 @@ pub fn unescape_byte_literal(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { - debug_assert!(mode.is_bytes()); + debug_assert!(mode.is_byte()); unescape_literal(src, mode, &mut |range, result| { callback(range, result.map(byte_from_char)); }) @@ -113,15 +115,14 @@ where /// unescaped char or an error pub fn unescape_char(src: &str) -> Result { let mut chars = src.chars(); - unescape_char_or_byte(&mut chars, Mode::Char) - .map_err(|err| (src.len() - chars.as_str().len(), err)) + unescape_char_or_byte(&mut chars, false).map_err(|err| (src.len() - chars.as_str().len(), err)) } /// Takes a contents of a byte literal (without quotes), and returns an /// unescaped byte or an error. pub fn unescape_byte(src: &str) -> Result { let mut chars = src.chars(); - unescape_char_or_byte(&mut chars, Mode::Byte) + unescape_char_or_byte(&mut chars, true) .map(byte_from_char) .map_err(|err| (src.len() - chars.as_str().len(), err)) } @@ -145,7 +146,7 @@ impl Mode { } } - pub fn is_bytes(self) -> bool { + pub fn is_byte(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, Mode::Char | Mode::Str | Mode::RawStr => false, @@ -153,7 +154,7 @@ impl Mode { } } -fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { +fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result { // Previous character was '\\', unescape what follows. let res = match chars.next().ok_or(EscapeError::LoneSlash)? { '"' => '"', @@ -176,7 +177,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { let value = hi * 16 + lo; // For a non-byte literal verify that it is within ASCII range. 
- if !mode.is_bytes() && !is_ascii(value) { + if !is_byte && !is_ascii(value) { return Err(EscapeError::OutOfRangeHexEscape); } let value = value as u8; @@ -212,7 +213,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. - if mode.is_bytes() { + if is_byte { return Err(EscapeError::UnicodeEscapeInByte); } @@ -244,8 +245,8 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { } #[inline] -fn ascii_check(c: char, mode: Mode) -> Result { - if mode.is_bytes() && !c.is_ascii() { +fn ascii_check(c: char, is_byte: bool) -> Result { + if is_byte && !c.is_ascii() { // Byte literal can't be a non-ascii character. Err(EscapeError::NonAsciiCharInByte) } else { @@ -253,14 +254,13 @@ fn ascii_check(c: char, mode: Mode) -> Result { } } -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - debug_assert!(mode == Mode::Char || mode == Mode::Byte); +fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result { let c = chars.next().ok_or(EscapeError::ZeroChars)?; let res = match c { - '\\' => scan_escape(chars, mode), + '\\' => scan_escape(chars, is_byte), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode), + _ => ascii_check(c, is_byte), }?; if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); @@ -270,11 +270,10 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result(src: &str, mode: Mode, callback: &mut F) +fn unescape_str_or_byte_str(src: &str, is_byte: bool, callback: &mut F) where F: FnMut(Range, Result), { - debug_assert!(mode == Mode::Str || mode == Mode::ByteStr); let mut chars = src.chars(); // The `start` and `end` computation here is complicated because @@ -293,14 +292,14 @@ where skip_ascii_whitespace(&mut chars, start, callback); continue; } - _ => scan_escape(&mut chars, mode), + _ => scan_escape(&mut 
chars, is_byte), } } '\n' => Ok('\n'), '\t' => Ok('\t'), '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode), + _ => ascii_check(c, is_byte), }; let end = src.len() - chars.as_str().len(); callback(start..end, result); @@ -337,11 +336,10 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) +fn unescape_raw_str_or_raw_byte_str(src: &str, is_byte: bool, callback: &mut F) where F: FnMut(Range, Result), { - debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr); let mut chars = src.chars(); // The `start` and `end` computation here matches the one in @@ -351,7 +349,7 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let result = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), + c if is_byte && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), c => Ok(c), }; let end = src.len() - chars.as_str().len(); diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index f075de7142676..055ee98a00aa3 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -108,7 +108,7 @@ pub(crate) fn emit_unescape_error( } if !has_help { - let (prefix, msg) = if mode.is_bytes() { + let (prefix, msg) = if mode.is_byte() { ("b", "if you meant to write a byte string literal, use double quotes") } else { ("", "if you meant to write a `str` literal, use double quotes") @@ -142,7 +142,7 @@ pub(crate) fn emit_unescape_error( EscapeError::EscapeOnlyChar => { let (c, char_span) = last_char(); - let msg = if mode.is_bytes() { + let 
msg = if mode.is_byte() { "byte constant must be escaped" } else { "character constant must be escaped" @@ -182,11 +182,11 @@ pub(crate) fn emit_unescape_error( let (c, span) = last_char(); let label = - if mode.is_bytes() { "unknown byte escape" } else { "unknown character escape" }; + if mode.is_byte() { "unknown byte escape" } else { "unknown character escape" }; let ec = escaped_char(c); let mut diag = handler.struct_span_err(span, &format!("{}: `{}`", label, ec)); diag.span_label(span, label); - if c == '{' || c == '}' && !mode.is_bytes() { + if c == '{' || c == '}' && !mode.is_byte() { diag.help( "if used in a formatting string, curly braces are escaped with `{{` and `}}`", ); @@ -196,7 +196,7 @@ pub(crate) fn emit_unescape_error( version control settings", ); } else { - if !mode.is_bytes() { + if !mode.is_byte() { diag.span_suggestion( span_with_quotes, "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", @@ -231,7 +231,7 @@ pub(crate) fn emit_unescape_error( .emit(); } EscapeError::NonAsciiCharInByte => { - assert!(mode.is_bytes()); + assert!(mode.is_byte()); let (c, span) = last_char(); let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant"); let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { @@ -271,7 +271,7 @@ pub(crate) fn emit_unescape_error( err.emit(); } EscapeError::NonAsciiCharInByteString => { - assert!(mode.is_bytes()); + assert!(mode.is_byte()); let (c, span) = last_char(); let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { format!(" but is {:?}", c) From 7dbf2c0ed86a6fc97aa0b93bc2ac865d6f2cc438 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 Nov 2022 15:17:37 +1100 Subject: [PATCH 04/10] Make non-ASCII errors more consistent. There are three kinds of "byte" literals: byte literals, byte string literals, and raw byte string literals. None are allowed to have non-ASCII chars in them. 
Two `EscapeError` variants exist for when that constraint is violated. - `NonAsciiCharInByte`: used for byte literals and byte string literals. - `NonAsciiCharInByteString`: used for raw byte string literals. As a result, the messages for raw byte string literals use different wording, without good reason. Also, byte string literals are incorrectly described as "byte constants" in some error messages. This commit eliminates `NonAsciiCharInByteString` so the three cases are handled similarly, and described correctly. The `mode` is enough to distinguish them. Note: Some existing error messages mention "byte constants" and some mention "byte literals". I went with the latter here, because it's a more correct name, as used by the Reference. --- compiler/rustc_lexer/src/unescape.rs | 7 ++-- compiler/rustc_lexer/src/unescape/tests.rs | 7 ++-- .../src/lexer/unescape_error_reporting.rs | 32 ++++++++----------- src/test/ui/attributes/key-value-non-ascii.rs | 2 +- .../ui/attributes/key-value-non-ascii.stderr | 4 +-- src/test/ui/parser/byte-literals.rs | 2 +- src/test/ui/parser/byte-literals.stderr | 4 +-- src/test/ui/parser/byte-string-literals.rs | 4 +-- .../ui/parser/byte-string-literals.stderr | 6 ++-- .../ui/parser/raw/raw-byte-string-literals.rs | 2 +- .../raw/raw-byte-string-literals.stderr | 2 +- .../ui/parser/unicode-control-codepoints.rs | 16 +++++----- .../parser/unicode-control-codepoints.stderr | 24 +++++++------- src/test/ui/suggestions/multibyte-escapes.rs | 12 +++---- .../ui/suggestions/multibyte-escapes.stderr | 12 +++---- 15 files changed, 62 insertions(+), 74 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index f0042a397c2c5..9c9cce7cbd48e 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -52,10 +52,8 @@ pub enum EscapeError { /// Unicode escape code in byte literal. UnicodeEscapeInByte, - /// Non-ascii character in byte literal. 
+ /// Non-ascii character in byte literal, byte string literal, or raw byte string literal. NonAsciiCharInByte, - /// Non-ascii character in byte string literal. - NonAsciiCharInByteString, /// After a line ending with '\', the next line contains whitespace /// characters that are not skipped. @@ -349,8 +347,7 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let result = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - c if is_byte && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - c => Ok(c), + _ => ascii_check(c, is_byte), }; let end = src.len() - chars.as_str().len(); callback(start..end, result); diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index fa61554afde6c..008edef5a6385 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -289,9 +289,6 @@ fn test_unescape_raw_byte_str() { } check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); - check( - "🦀a", - &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))], - ); + check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); + check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(byte_from_char('a')))]); } diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index 055ee98a00aa3..6373f5b4fd6ff 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -231,16 +231,23 @@ pub(crate) fn emit_unescape_error( .emit(); } EscapeError::NonAsciiCharInByte => { - assert!(mode.is_byte()); let (c, span) = last_char(); - let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant"); + let desc = match mode { + Mode::Byte => "byte literal", + 
Mode::ByteStr => "byte string literal", Mode::RawByteStr => "raw byte string literal", _ => panic!("non-is_byte literal paired with NonAsciiCharInByte"), }; let mut err = handler.struct_span_err(span, format!("non-ASCII character in {}", desc)); let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { format!(" but is {:?}", c) } else { String::new() }; - err.span_label(span, &format!("byte constant must be ASCII{}", postfix)); - if (c as u32) <= 0xFF { + err.span_label(span, &format!("must be ASCII{}", postfix)); + // Note: the \\xHH suggestions are not given for raw byte string + // literals, because they are raw and so cannot use any escapes. + if (c as u32) <= 0xFF && mode != Mode::RawByteStr { err.span_suggestion( span, &format!( @@ -250,9 +257,9 @@ pub(crate) fn emit_unescape_error( format!("\\x{:X}", c as u32), Applicability::MaybeIncorrect, ); - } else if matches!(mode, Mode::Byte) { + } else if mode == Mode::Byte { err.span_label(span, "this multibyte character does not fit into a single byte"); - } else if matches!(mode, Mode::ByteStr) { + } else if mode != Mode::RawByteStr { let mut utf8 = String::new(); utf8.push(c); err.span_suggestion( @@ -270,19 +277,6 @@ pub(crate) fn emit_unescape_error( } err.emit(); } - EscapeError::NonAsciiCharInByteString => { - assert!(mode.is_byte()); - let (c, span) = last_char(); - let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { - format!(" but is {:?}", c) - } else { - String::new() - }; - handler - .struct_span_err(span, "raw byte string must be ASCII") - .span_label(span, &format!("must be ASCII{}", postfix)) - .emit(); - } EscapeError::OutOfRangeHexEscape => { handler .struct_span_err(span, "out of range hex escape") diff --git a/src/test/ui/attributes/key-value-non-ascii.rs b/src/test/ui/attributes/key-value-non-ascii.rs index 12942eabdf7b5..e14e2fc05ad39 100644 --- a/src/test/ui/attributes/key-value-non-ascii.rs +++ 
b/src/test/ui/attributes/key-value-non-ascii.rs @@ -1,4 +1,4 @@ #![feature(rustc_attrs)] -#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte constant +#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal fn main() {} diff --git a/src/test/ui/attributes/key-value-non-ascii.stderr b/src/test/ui/attributes/key-value-non-ascii.stderr index 422107867f7f9..23d482de6a868 100644 --- a/src/test/ui/attributes/key-value-non-ascii.stderr +++ b/src/test/ui/attributes/key-value-non-ascii.stderr @@ -1,8 +1,8 @@ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/key-value-non-ascii.rs:3:19 | LL | #[rustc_dummy = b"ffi.rs"] - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes | diff --git a/src/test/ui/parser/byte-literals.rs b/src/test/ui/parser/byte-literals.rs index 05a510b24a7ab..896dc1a1a5fba 100644 --- a/src/test/ui/parser/byte-literals.rs +++ b/src/test/ui/parser/byte-literals.rs @@ -7,6 +7,6 @@ pub fn main() { b'\x0Z'; //~ ERROR invalid character in numeric character escape: `Z` b' '; //~ ERROR byte constant must be escaped b'''; //~ ERROR byte constant must be escaped - b'é'; //~ ERROR non-ASCII character in byte constant + b'é'; //~ ERROR non-ASCII character in byte literal b'a //~ ERROR unterminated byte constant [E0763] } diff --git a/src/test/ui/parser/byte-literals.stderr b/src/test/ui/parser/byte-literals.stderr index c3d0006163005..efa55ae05bd37 100644 --- a/src/test/ui/parser/byte-literals.stderr +++ b/src/test/ui/parser/byte-literals.stderr @@ -32,11 +32,11 @@ error: byte constant must be escaped: `'` LL | b'''; | ^ help: escape the character: `\'` -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/byte-literals.rs:10:7 | LL | b'é'; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'é', use a 
\xHH escape | diff --git a/src/test/ui/parser/byte-string-literals.rs b/src/test/ui/parser/byte-string-literals.rs index b1f11024a7bb6..30a4f50c4e40b 100644 --- a/src/test/ui/parser/byte-string-literals.rs +++ b/src/test/ui/parser/byte-string-literals.rs @@ -3,7 +3,7 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape pub fn main() { b"\f"; //~ ERROR unknown byte escape b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z` - b"é"; //~ ERROR non-ASCII character in byte constant - br##"é"##; //~ ERROR raw byte string must be ASCII + b"é"; //~ ERROR non-ASCII character in byte string literal + br##"é"##; //~ ERROR non-ASCII character in raw byte string literal b"a //~ ERROR unterminated double quote byte string } diff --git a/src/test/ui/parser/byte-string-literals.stderr b/src/test/ui/parser/byte-string-literals.stderr index 3b8b3692e053f..5b96cc3d18abc 100644 --- a/src/test/ui/parser/byte-string-literals.stderr +++ b/src/test/ui/parser/byte-string-literals.stderr @@ -20,18 +20,18 @@ error: invalid character in numeric character escape: `Z` LL | b"\x0Z"; | ^ invalid character in numeric character escape -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/byte-string-literals.rs:6:7 | LL | b"é"; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'é', use a \xHH escape | LL | b"\xE9"; | ~~~~ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/byte-string-literals.rs:7:10 | LL | br##"é"##; diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.rs b/src/test/ui/parser/raw/raw-byte-string-literals.rs index 163c8ac66b022..1b859fee596ad 100644 --- a/src/test/ui/parser/raw/raw-byte-string-literals.rs +++ b/src/test/ui/parser/raw/raw-byte-string-literals.rs @@ -2,6 +2,6 @@ pub fn main() { br"a "; //~ ERROR bare CR not allowed in raw string - br"é"; //~ ERROR raw byte string must 
be ASCII + br"é"; //~ ERROR non-ASCII character in raw byte string literal br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.stderr b/src/test/ui/parser/raw/raw-byte-string-literals.stderr index cfc877104bd9f..a2f27d1ed70ae 100644 --- a/src/test/ui/parser/raw/raw-byte-string-literals.stderr +++ b/src/test/ui/parser/raw/raw-byte-string-literals.stderr @@ -4,7 +4,7 @@ error: bare CR not allowed in raw string LL | br"a "; | ^ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/raw-byte-string-literals.rs:5:8 | LL | br"é"; diff --git a/src/test/ui/parser/unicode-control-codepoints.rs b/src/test/ui/parser/unicode-control-codepoints.rs index 5af0b585a1275..df099bb62ad1e 100644 --- a/src/test/ui/parser/unicode-control-codepoints.rs +++ b/src/test/ui/parser/unicode-control-codepoints.rs @@ -14,15 +14,15 @@ fn main() { println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); //~^ ERROR unicode codepoint changing visible direction of text present in literal println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); - //~^ ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant + //~^ ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); - //~^ ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII + //~^ ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte 
string literal + //~| ERROR non-ASCII character in raw byte string literal println!("{:?}", '‮'); //~^ ERROR unicode codepoint changing visible direction of text present in literal } diff --git a/src/test/ui/parser/unicode-control-codepoints.stderr b/src/test/ui/parser/unicode-control-codepoints.stderr index 44548c72ff5d0..fc071a9419142 100644 --- a/src/test/ui/parser/unicode-control-codepoints.stderr +++ b/src/test/ui/parser/unicode-control-codepoints.stderr @@ -14,69 +14,69 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); | = help: unicode escape sequences cannot be used as a byte or in a byte string -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:26 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{202e}' + | ^ must be ASCII but is '\u{202e}' | help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes | LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:30 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2066}' + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:41 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2069}' + | ^ must be ASCII but is '\u{2069}' | help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes | LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); | 
~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:43 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2066}' + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); | ~~~~~~~~~~~~ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:29 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{202e}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:33 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{2066}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:44 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{2069}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:46 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); diff --git a/src/test/ui/suggestions/multibyte-escapes.rs b/src/test/ui/suggestions/multibyte-escapes.rs index fd5d46a4e923e..c4105186244db 100644 --- a/src/test/ui/suggestions/multibyte-escapes.rs +++ b/src/test/ui/suggestions/multibyte-escapes.rs @@ -2,17 +2,17 @@ fn main() { b'µ'; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte literal //~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII b'字'; - //~^ ERROR: non-ASCII character in byte 
constant + //~^ ERROR: non-ASCII character in byte literal //~| NOTE: this multibyte character does not fit into a single byte - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII b"字"; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte string literal //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII } diff --git a/src/test/ui/suggestions/multibyte-escapes.stderr b/src/test/ui/suggestions/multibyte-escapes.stderr index 6e26bc1f01cef..1e7c43e6538f6 100644 --- a/src/test/ui/suggestions/multibyte-escapes.stderr +++ b/src/test/ui/suggestions/multibyte-escapes.stderr @@ -1,28 +1,28 @@ -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/multibyte-escapes.rs:4:7 | LL | b'µ'; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'µ', use a \xHH escape | LL | b'\xB5'; | ~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/multibyte-escapes.rs:9:7 | LL | b'字'; | ^^ | | - | byte constant must be ASCII + | must be ASCII | this multibyte character does not fit into a single byte -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/multibyte-escapes.rs:14:7 | LL | b"字"; - | ^^ byte constant must be ASCII + | ^^ must be ASCII | help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes | From a21c0458979d786d821c2d75a1b109fe38914da0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 Nov 2022 16:26:27 +1100 Subject: [PATCH 05/10] Improve comments. Remove a low-value comment, remove a duplicate comment, and correct a third comment. 
--- compiler/rustc_lexer/src/unescape.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 9c9cce7cbd48e..db7bf02e71adc 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -84,12 +84,9 @@ where Mode::Char | Mode::Byte => { let mut chars = src.chars(); let result = unescape_char_or_byte(&mut chars, mode == Mode::Byte); - // The Chars iterator moved forward. callback(0..(src.len() - chars.as_str().len()), result); } Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), - // NOTE: Raw strings do not perform any explicit character escaping, here we - // only translate CRLF to LF and produce errors on bare CR. Mode::RawStr | Mode::RawByteStr => { unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) } @@ -333,7 +330,7 @@ where /// Takes a contents of a string literal (without quotes) and produces a /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. +/// only produce errors on bare CR. fn unescape_raw_str_or_raw_byte_str(src: &str, is_byte: bool, callback: &mut F) where F: FnMut(Range, Result), From d963686f5a87b9eaa2ac2bdc29ddb796e0e83f1f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 4 Nov 2022 09:19:34 +1100 Subject: [PATCH 06/10] Refactor `cook_lexer_literal`. It deals with eight cases: ints, floats, and the six quoted types (char/byte/strings). For ints and floats we have an early return, and the other six types fall through to the code at the end, which makes the function hard to read. This commit rearranges things to avoid the early returns. 
--- compiler/rustc_parse/src/lexer/mod.rs | 78 +++++++++++++-------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 462bce16ad717..61b5be4240414 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -363,55 +363,55 @@ impl<'a> StringReader<'a> { fn cook_lexer_literal( &self, start: BytePos, - suffix_start: BytePos, + end: BytePos, kind: rustc_lexer::LiteralKind, ) -> (token::LitKind, Symbol) { - // prefix means `"` or `br"` or `r###"`, ... - let (lit_kind, mode, prefix_len, postfix_len) = match kind { + match kind { rustc_lexer::LiteralKind::Char { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start, suffix_start), + self.mk_sp(start, end), "unterminated character literal", error_code!(E0762), ) } - (token::Char, Mode::Char, 1, 1) // ' ' + self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' ' } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start + BytePos(1), suffix_start), + self.mk_sp(start + BytePos(1), end), "unterminated byte constant", error_code!(E0763), ) } - (token::Byte, Mode::Byte, 2, 1) // b' ' + self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' } rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start, suffix_start), + self.mk_sp(start, end), "unterminated double quote string", error_code!(E0765), ) } - (token::Str, Mode::Str, 1, 1) // " " + self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " " } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start + BytePos(1), suffix_start), + self.mk_sp(start + BytePos(1), end), "unterminated double quote byte string", 
error_code!(E0766), ) } - (token::ByteStr, Mode::ByteStr, 2, 1) // b" " + self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } rustc_lexer::LiteralKind::RawStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); - (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "## + let kind = token::StrRaw(n_hashes); + self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## } else { self.report_raw_str_error(start, 1); } @@ -419,56 +419,47 @@ impl<'a> StringReader<'a> { rustc_lexer::LiteralKind::RawByteStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); - (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "## + let kind = token::ByteStrRaw(n_hashes); + self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## } else { self.report_raw_str_error(start, 2); } } rustc_lexer::LiteralKind::Int { base, empty_int } => { - return if empty_int { + if empty_int { self.sess .span_diagnostic .struct_span_err_with_code( - self.mk_sp(start, suffix_start), + self.mk_sp(start, end), "no valid digits found for number", error_code!(E0768), ) .emit(); (token::Integer, sym::integer(0)) } else { - self.validate_int_literal(base, start, suffix_start); - (token::Integer, self.symbol_from_to(start, suffix_start)) - }; + self.validate_int_literal(base, start, end); + (token::Integer, self.symbol_from_to(start, end)) + } } rustc_lexer::LiteralKind::Float { base, empty_exponent } => { if empty_exponent { self.err_span_(start, self.pos, "expected at least one digit in exponent"); } - match base { - Base::Hexadecimal => self.err_span_( - start, - suffix_start, - "hexadecimal float literal is not supported", - ), + Base::Hexadecimal => { + self.err_span_(start, end, "hexadecimal float literal is not supported") + } Base::Octal => { - self.err_span_(start, suffix_start, "octal float literal is not supported") + self.err_span_(start, end, "octal float 
literal is not supported") } Base::Binary => { - self.err_span_(start, suffix_start, "binary float literal is not supported") + self.err_span_(start, end, "binary float literal is not supported") } - _ => (), + _ => {} } - - let id = self.symbol_from_to(start, suffix_start); - return (token::Float, id); + (token::Float, self.symbol_from_to(start, end)) } - }; - let content_start = start + BytePos(prefix_len); - let content_end = suffix_start - BytePos(postfix_len); - let id = self.symbol_from_to(content_start, content_end); - self.validate_literal_escape(mode, content_start, content_end, prefix_len, postfix_len); - (lit_kind, id) + } } #[inline] @@ -659,20 +650,22 @@ impl<'a> StringReader<'a> { ) } - fn validate_literal_escape( + fn cook_quoted( &self, + kind: token::LitKind, mode: Mode, - content_start: BytePos, - content_end: BytePos, + start: BytePos, + end: BytePos, prefix_len: u32, postfix_len: u32, - ) { + ) -> (token::LitKind, Symbol) { + let content_start = start + BytePos(prefix_len); + let content_end = end - BytePos(postfix_len); let lit_content = self.str_from_to(content_start, content_end); unescape::unescape_literal(lit_content, mode, &mut |range, result| { // Here we only check for errors. The actual unescaping is done later. if let Err(err) = result { - let span_with_quotes = self - .mk_sp(content_start - BytePos(prefix_len), content_end + BytePos(postfix_len)); + let span_with_quotes = self.mk_sp(start, end); let (start, end) = (range.start as u32, range.end as u32); let lo = content_start + BytePos(start); let hi = lo + BytePos(end - start); @@ -688,6 +681,7 @@ impl<'a> StringReader<'a> { ); } }); + (kind, Symbol::intern(lit_content)) } fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) { From a203482d2a20cba0c86298334ebd74438bd477ba Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 4 Nov 2022 10:02:29 +1100 Subject: [PATCH 07/10] Inline and remove `validate_int_literal`. 
It has a single callsite, and is fairly small. The `Float` match arm already has base-specific checking inline, so this makes things more consistent. --- compiler/rustc_lexer/src/lib.rs | 10 ++++----- compiler/rustc_parse/src/lexer/mod.rs | 31 +++++++++++---------------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 51515976e4ee9..0d29d7b1e3d9b 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -203,13 +203,13 @@ pub enum RawStrError { #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Base { /// Literal starts with "0b". - Binary, + Binary = 2, /// Literal starts with "0o". - Octal, - /// Literal starts with "0x". - Hexadecimal, + Octal = 8, /// Literal doesn't contain a prefix. - Decimal, + Decimal = 10, + /// Literal starts with "0x". + Hexadecimal = 16, } /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 61b5be4240414..9de0c74f4b1d2 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -437,7 +437,19 @@ impl<'a> StringReader<'a> { .emit(); (token::Integer, sym::integer(0)) } else { - self.validate_int_literal(base, start, end); + if matches!(base, Base::Binary | Base::Octal) { + let base = base as u32; + let s = self.str_from_to(start + BytePos(2), end); + for (idx, c) in s.char_indices() { + if c != '_' && c.to_digit(base).is_none() { + self.err_span_( + start + BytePos::from_usize(2 + idx), + start + BytePos::from_usize(2 + idx + c.len_utf8()), + &format!("invalid digit for a base {} literal", base), + ); + } + } + } (token::Integer, self.symbol_from_to(start, end)) } } @@ -683,23 +695,6 @@ impl<'a> StringReader<'a> { }); (kind, Symbol::intern(lit_content)) } - - fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) { - let 
base = match base { - Base::Binary => 2, - Base::Octal => 8, - _ => return, - }; - let s = self.str_from_to(content_start + BytePos(2), content_end); - for (idx, c) in s.char_indices() { - let idx = idx as u32; - if c != '_' && c.to_digit(base).is_none() { - let lo = content_start + BytePos(2 + idx); - let hi = content_start + BytePos(2 + idx + c.len_utf8() as u32); - self.err_span_(lo, hi, &format!("invalid digit for a base {} literal", base)); - } - } - } } pub fn nfc_normalize(string: &str) -> Symbol { From a838952239493d9dafe2d5f2ca1204f326841ae9 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 4 Nov 2022 11:09:23 +1100 Subject: [PATCH 08/10] Remove `unescape_byte_literal`. It's easy to just use `unescape_literal` + `byte_from_char`. --- compiler/rustc_ast/src/util/literal.rs | 29 +++++++------------ compiler/rustc_lexer/src/unescape.rs | 16 ++-------- compiler/rustc_lexer/src/unescape/tests.rs | 12 ++++---- .../crates/syntax/src/validation.rs | 6 ++-- 4 files changed, 20 insertions(+), 43 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 536b385606c69..8f342175f7d37 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -2,12 +2,9 @@ use crate::ast::{self, Lit, LitKind}; use crate::token::{self, Token}; - -use rustc_lexer::unescape::{unescape_byte, unescape_char}; -use rustc_lexer::unescape::{unescape_byte_literal, unescape_literal, Mode}; +use rustc_lexer::unescape::{byte_from_char, unescape_byte, unescape_char, unescape_literal, Mode}; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; - use std::ascii; pub enum LitError { @@ -109,13 +106,11 @@ impl LitKind { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); - unescape_byte_literal(&s, Mode::ByteStr, &mut |_, unescaped_byte| { - match unescaped_byte { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = 
Err(LitError::LexerError); - } + unescape_literal(&s, Mode::ByteStr, &mut |_, c| match c { + Ok(c) => buf.push(byte_from_char(c)), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } } }); @@ -127,13 +122,11 @@ impl LitKind { let bytes = if s.contains('\r') { let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); - unescape_byte_literal(&s, Mode::RawByteStr, &mut |_, unescaped_byte| { - match unescaped_byte { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + unescape_literal(&s, Mode::RawByteStr, &mut |_, c| match c { + Ok(c) => buf.push(byte_from_char(c)), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } } }); diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index db7bf02e71adc..8d5eac29452e7 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -93,19 +93,6 @@ where } } -/// Takes a contents of a byte, byte string or raw byte string (without quotes) -/// and produces a sequence of bytes or errors. -/// Values are returned through invoking of the provided callback. 
-pub fn unescape_byte_literal(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - debug_assert!(mode.is_byte()); - unescape_literal(src, mode, &mut |range, result| { - callback(range, result.map(byte_from_char)); - }) -} - /// Takes a contents of a char literal (without quotes), and returns an /// unescaped char or an error pub fn unescape_char(src: &str) -> Result { @@ -351,7 +338,8 @@ where } } -fn byte_from_char(c: char) -> u8 { +#[inline] +pub fn byte_from_char(c: char) -> u8 { let res = c as u32; debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr"); res as u8 diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 008edef5a6385..00c8401efdfe4 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -246,10 +246,10 @@ fn test_unescape_byte_good() { fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_byte_literal(literal_text, Mode::ByteStr, &mut |range, c| { + unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| { if let Ok(b) = &mut buf { match c { - Ok(c) => b.push(c), + Ok(c) => b.push(byte_from_char(c)), Err(e) => buf = Err((range, e)), } } @@ -280,15 +280,13 @@ fn test_unescape_raw_str() { #[test] fn test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { + fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_byte_literal(literal, Mode::RawByteStr, &mut |range, res| { - unescaped.push((range, res)) - }); + unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); - check("🦀a", &[(0..4, 
Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(byte_from_char('a')))]); + check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]); } diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index b9f2b5132353c..1eea2346451dd 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -5,9 +5,7 @@ mod block; use rowan::Direction; -use rustc_lexer::unescape::{ - self, unescape_byte, unescape_byte_literal, unescape_char, unescape_literal, Mode, -}; +use rustc_lexer::unescape::{self, unescape_byte, unescape_char, unescape_literal, Mode}; use crate::{ algo, @@ -143,7 +141,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_byte_literal(without_quotes, Mode::ByteStr, &mut |range, char| { + unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| { if let Err(err) = char { push_err(2, (range.start, err)); } From 43d21b535f003c81a55331c31e16313a90050b18 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 4 Nov 2022 13:52:44 +1100 Subject: [PATCH 09/10] Rename some `result` variables as `res`, for consistency. 
--- compiler/rustc_lexer/src/unescape.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 8d5eac29452e7..674bbff0878ce 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -83,8 +83,8 @@ where match mode { Mode::Char | Mode::Byte => { let mut chars = src.chars(); - let result = unescape_char_or_byte(&mut chars, mode == Mode::Byte); - callback(0..(src.len() - chars.as_str().len()), result); + let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); + callback(0..(src.len() - chars.as_str().len()), res); } Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), Mode::RawStr | Mode::RawByteStr => { @@ -263,7 +263,7 @@ where // them in the range computation. while let Some(c) = chars.next() { let start = src.len() - chars.as_str().len() - c.len_utf8(); - let result = match c { + let res = match c { '\\' => { match chars.clone().next() { Some('\n') => { @@ -284,7 +284,7 @@ where _ => ascii_check(c, is_byte), }; let end = src.len() - chars.as_str().len(); - callback(start..end, result); + callback(start..end, res); } fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) @@ -329,12 +329,12 @@ where // doesn't have to worry about skipping any chars. while let Some(c) = chars.next() { let start = src.len() - chars.as_str().len() - c.len_utf8(); - let result = match c { + let res = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), _ => ascii_check(c, is_byte), }; let end = src.len() - chars.as_str().len(); - callback(start..end, result); + callback(start..end, res); } } From d6c97a32b4f5e38c0e85010df4438dc7205c44f4 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 8 Nov 2022 15:59:19 +1100 Subject: [PATCH 10/10] Simplify `unescape_{char,byte}`. The `usize` isn't needed in the error case. 
--- compiler/rustc_lexer/src/unescape.rs | 14 +++++--------- compiler/rustc_lexer/src/unescape/tests.rs | 12 ++++-------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 674bbff0878ce..e405013dcabf8 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -94,19 +94,15 @@ where } /// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error -pub fn unescape_char(src: &str) -> Result { - let mut chars = src.chars(); - unescape_char_or_byte(&mut chars, false).map_err(|err| (src.len() - chars.as_str().len(), err)) +/// unescaped char or an error. +pub fn unescape_char(src: &str) -> Result { + unescape_char_or_byte(&mut src.chars(), false) } /// Takes a contents of a byte literal (without quotes), and returns an /// unescaped byte or an error. -pub fn unescape_byte(src: &str) -> Result { - let mut chars = src.chars(); - unescape_char_or_byte(&mut chars, true) - .map(byte_from_char) - .map_err(|err| (src.len() - chars.as_str().len(), err)) +pub fn unescape_byte(src: &str) -> Result { + unescape_char_or_byte(&mut src.chars(), true).map(byte_from_char) } /// What kind of literal do we parse. 
diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 00c8401efdfe4..c7ca8fd16ae47 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -3,8 +3,7 @@ use super::*; #[test] fn test_unescape_char_bad() { fn check(literal_text: &str, expected_error: EscapeError) { - let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err); - assert_eq!(actual_result, Err(expected_error)); + assert_eq!(unescape_char(literal_text), Err(expected_error)); } check("", EscapeError::ZeroChars); @@ -68,8 +67,7 @@ fn test_unescape_char_bad() { #[test] fn test_unescape_char_good() { fn check(literal_text: &str, expected_char: char) { - let actual_result = unescape_char(literal_text); - assert_eq!(actual_result, Ok(expected_char)); + assert_eq!(unescape_char(literal_text), Ok(expected_char)); } check("a", 'a'); @@ -149,8 +147,7 @@ fn test_unescape_str_good() { #[test] fn test_unescape_byte_bad() { fn check(literal_text: &str, expected_error: EscapeError) { - let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err); - assert_eq!(actual_result, Err(expected_error)); + assert_eq!(unescape_byte(literal_text), Err(expected_error)); } check("", EscapeError::ZeroChars); @@ -219,8 +216,7 @@ fn test_unescape_byte_bad() { #[test] fn test_unescape_byte_good() { fn check(literal_text: &str, expected_byte: u8) { - let actual_result = unescape_byte(literal_text); - assert_eq!(actual_result, Ok(expected_byte)); + assert_eq!(unescape_byte(literal_text), Ok(expected_byte)); } check("a", b'a');