diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 536b385606c69..8f342175f7d37 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -2,12 +2,9 @@ use crate::ast::{self, Lit, LitKind}; use crate::token::{self, Token}; - -use rustc_lexer::unescape::{unescape_byte, unescape_char}; -use rustc_lexer::unescape::{unescape_byte_literal, unescape_literal, Mode}; +use rustc_lexer::unescape::{byte_from_char, unescape_byte, unescape_char, unescape_literal, Mode}; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; - use std::ascii; pub enum LitError { @@ -109,13 +106,11 @@ impl LitKind { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); - unescape_byte_literal(&s, Mode::ByteStr, &mut |_, unescaped_byte| { - match unescaped_byte { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + unescape_literal(&s, Mode::ByteStr, &mut |_, c| match c { + Ok(c) => buf.push(byte_from_char(c)), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } } }); @@ -127,13 +122,11 @@ impl LitKind { let bytes = if s.contains('\r') { let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); - unescape_byte_literal(&s, Mode::RawByteStr, &mut |_, unescaped_byte| { - match unescaped_byte { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + unescape_literal(&s, Mode::RawByteStr, &mut |_, c| match c { + Ok(c) => buf.push(byte_from_char(c)), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } } }); diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index dd2c09cae02fd..d4140cb295f32 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -205,13 +205,13 @@ pub enum RawStrError { #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Base { /// Literal starts with "0b". - Binary, + Binary = 2, /// Literal starts with "0o". - Octal, - /// Literal starts with "0x". - Hexadecimal, + Octal = 8, /// Literal doesn't contain a prefix. - Decimal, + Decimal = 10, + /// Literal starts with "0x". + Hexadecimal = 16, } /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 8f64b5f5158e4..674bbff0878ce 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -52,10 +52,8 @@ pub enum EscapeError { /// Unicode escape code in byte literal. UnicodeEscapeInByte, - /// Non-ascii character in byte literal. + /// Non-ascii character in byte literal, byte string literal, or raw byte string literal. NonAsciiCharInByte, - /// Non-ascii character in byte string literal. - NonAsciiCharInByteString, /// After a line ending with '\', the next line contains whitespace /// characters that are not skipped. @@ -78,54 +76,37 @@ impl EscapeError { /// Takes a contents of a literal (without quotes) and produces a /// sequence of escaped characters or errors. /// Values are returned through invoking of the provided callback. -pub fn unescape_literal(literal_text: &str, mode: Mode, callback: &mut F) +pub fn unescape_literal(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { match mode { Mode::Char | Mode::Byte => { - let mut chars = literal_text.chars(); - let result = unescape_char_or_byte(&mut chars, mode); - // The Chars iterator moved forward. - callback(0..(literal_text.len() - chars.as_str().len()), result); + let mut chars = src.chars(); + let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); + callback(0..(src.len() - chars.as_str().len()), res); } - Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback), - // NOTE: Raw strings do not perform any explicit character escaping, here we - // only translate CRLF to LF and produce errors on bare CR. + Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), Mode::RawStr | Mode::RawByteStr => { - unescape_raw_str_or_raw_byte_str(literal_text, mode, callback) + unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) } } } -/// Takes a contents of a byte, byte string or raw byte string (without quotes) -/// and produces a sequence of bytes or errors. -/// Values are returned through invoking of the provided callback. -pub fn unescape_byte_literal(literal_text: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - debug_assert!(mode.is_bytes()); - unescape_literal(literal_text, mode, &mut |range, result| { - callback(range, result.map(byte_from_char)); - }) -} - /// Takes a contents of a char literal (without quotes), and returns an /// unescaped char or an error -pub fn unescape_char(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); - unescape_char_or_byte(&mut chars, Mode::Char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +pub fn unescape_char(src: &str) -> Result { + let mut chars = src.chars(); + unescape_char_or_byte(&mut chars, false).map_err(|err| (src.len() - chars.as_str().len(), err)) } /// Takes a contents of a byte literal (without quotes), and returns an /// unescaped byte or an error. -pub fn unescape_byte(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); - unescape_char_or_byte(&mut chars, Mode::Byte) +pub fn unescape_byte(src: &str) -> Result { + let mut chars = src.chars(); + unescape_char_or_byte(&mut chars, true) .map(byte_from_char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) + .map_err(|err| (src.len() - chars.as_str().len(), err)) } /// What kind of literal do we parse. @@ -147,7 +128,7 @@ impl Mode { } } - pub fn is_bytes(self) -> bool { + pub fn is_byte(self) -> bool { match self { Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, Mode::Char | Mode::Str | Mode::RawStr => false, @@ -155,12 +136,9 @@ impl Mode { } } -fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { +fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result { // Previous character was '\\', unescape what follows. - - let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; - - let res = match second_char { + let res = match chars.next().ok_or(EscapeError::LoneSlash)? { '"' => '"', 'n' => '\n', 'r' => '\r', @@ -181,7 +159,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { let value = hi * 16 + lo; // For a non-byte literal verify that it is within ASCII range. - if !mode.is_bytes() && !is_ascii(value) { + if !is_byte && !is_ascii(value) { return Err(EscapeError::OutOfRangeHexEscape); } let value = value as u8; @@ -217,7 +195,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. - if mode.is_bytes() { + if is_byte { return Err(EscapeError::UnicodeEscapeInByte); } @@ -249,23 +227,22 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { } #[inline] -fn ascii_check(first_char: char, mode: Mode) -> Result { - if mode.is_bytes() && !first_char.is_ascii() { +fn ascii_check(c: char, is_byte: bool) -> Result { + if is_byte && !c.is_ascii() { // Byte literal can't be a non-ascii character. Err(EscapeError::NonAsciiCharInByte) } else { - Ok(first_char) + Ok(c) } } -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - debug_assert!(mode == Mode::Char || mode == Mode::Byte); - let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = match first_char { - '\\' => scan_escape(chars, mode), +fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result { + let c = chars.next().ok_or(EscapeError::ZeroChars)?; + let res = match c { + '\\' => scan_escape(chars, is_byte), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(first_char, mode), + _ => ascii_check(c, is_byte), }?; if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); @@ -275,20 +252,20 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result(src: &str, mode: Mode, callback: &mut F) +fn unescape_str_or_byte_str(src: &str, is_byte: bool, callback: &mut F) where F: FnMut(Range, Result), { - debug_assert!(mode == Mode::Str || mode == Mode::ByteStr); - let initial_len = src.len(); let mut chars = src.chars(); - while let Some(first_char) = chars.next() { - let start = initial_len - chars.as_str().len() - first_char.len_utf8(); - let unescaped_char = match first_char { + // The `start` and `end` computation here is complicated because + // `skip_ascii_whitespace` makes us to skip over chars without counting + // them in the range computation. + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { '\\' => { - let second_char = chars.clone().next(); - match second_char { + match chars.clone().next() { Some('\n') => { // Rust language specification requires us to skip whitespaces // if unescaped '\' character is followed by '\n'. @@ -297,17 +274,17 @@ where skip_ascii_whitespace(&mut chars, start, callback); continue; } - _ => scan_escape(&mut chars, mode), + _ => scan_escape(&mut chars, is_byte), } } '\n' => Ok('\n'), '\t' => Ok('\t'), '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(first_char, mode), + _ => ascii_check(c, is_byte), }; - let end = initial_len - chars.as_str().len(); - callback(start..end, unescaped_char); + let end = src.len() - chars.as_str().len(); + callback(start..end, res); } fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) @@ -340,30 +317,29 @@ where /// Takes a contents of a string literal (without quotes) and produces a /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(literal_text: &str, mode: Mode, callback: &mut F) +/// only produce errors on bare CR. +fn unescape_raw_str_or_raw_byte_str(src: &str, is_byte: bool, callback: &mut F) where F: FnMut(Range, Result), { - debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr); - let initial_len = literal_text.len(); - - let mut chars = literal_text.chars(); - while let Some(curr) = chars.next() { - let start = initial_len - chars.as_str().len() - curr.len_utf8(); + let mut chars = src.chars(); - let result = match curr { + // The `start` and `end` computation here matches the one in + // `unescape_str_or_byte_str` for consistency, even though this function + // doesn't have to worry about skipping any chars. + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - c => Ok(c), + _ => ascii_check(c, is_byte), }; - let end = initial_len - chars.as_str().len(); - - callback(start..end, result); + let end = src.len() - chars.as_str().len(); + callback(start..end, res); } } -fn byte_from_char(c: char) -> u8 { +#[inline] +pub fn byte_from_char(c: char) -> u8 { let res = c as u32; debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr"); res as u8 diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index fa61554afde6c..00c8401efdfe4 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -246,10 +246,10 @@ fn test_unescape_byte_good() { fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_byte_literal(literal_text, Mode::ByteStr, &mut |range, c| { + unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| { if let Ok(b) = &mut buf { match c { - Ok(c) => b.push(c), + Ok(c) => b.push(byte_from_char(c)), Err(e) => buf = Err((range, e)), } } @@ -280,18 +280,13 @@ fn test_unescape_raw_str() { #[test] fn test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { + fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_byte_literal(literal, Mode::RawByteStr, &mut |range, res| { - unescaped.push((range, res)) - }); + unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); - check( - "🦀a", - &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))], - ); + check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); + check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]); } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index de8f1c00c1295..645262bd2f1d3 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -353,55 +353,55 @@ impl<'a> StringReader<'a> { fn cook_lexer_literal( &self, start: BytePos, - suffix_start: BytePos, + end: BytePos, kind: rustc_lexer::LiteralKind, ) -> (token::LitKind, Symbol) { - // prefix means `"` or `br"` or `r###"`, ... - let (lit_kind, mode, prefix_len, postfix_len) = match kind { + match kind { rustc_lexer::LiteralKind::Char { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start, suffix_start), + self.mk_sp(start, end), "unterminated character literal", error_code!(E0762), ) } - (token::Char, Mode::Char, 1, 1) // ' ' + self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' ' } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start + BytePos(1), suffix_start), + self.mk_sp(start + BytePos(1), end), "unterminated byte constant", error_code!(E0763), ) } - (token::Byte, Mode::Byte, 2, 1) // b' ' + self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' } rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start, suffix_start), + self.mk_sp(start, end), "unterminated double quote string", error_code!(E0765), ) } - (token::Str, Mode::Str, 1, 1) // " " + self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " " } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { self.sess.span_diagnostic.span_fatal_with_code( - self.mk_sp(start + BytePos(1), suffix_start), + self.mk_sp(start + BytePos(1), end), "unterminated double quote byte string", error_code!(E0766), ) } - (token::ByteStr, Mode::ByteStr, 2, 1) // b" " + self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } rustc_lexer::LiteralKind::RawStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); - (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "## + let kind = token::StrRaw(n_hashes); + self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## } else { self.report_raw_str_error(start, 1); } @@ -409,56 +409,59 @@ impl<'a> StringReader<'a> { rustc_lexer::LiteralKind::RawByteStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); - (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "## + let kind = token::ByteStrRaw(n_hashes); + self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## } else { self.report_raw_str_error(start, 2); } } rustc_lexer::LiteralKind::Int { base, empty_int } => { - return if empty_int { + if empty_int { self.sess .span_diagnostic .struct_span_err_with_code( - self.mk_sp(start, suffix_start), + self.mk_sp(start, end), "no valid digits found for number", error_code!(E0768), ) .emit(); (token::Integer, sym::integer(0)) } else { - self.validate_int_literal(base, start, suffix_start); - (token::Integer, self.symbol_from_to(start, suffix_start)) - }; + if matches!(base, Base::Binary | Base::Octal) { + let base = base as u32; + let s = self.str_from_to(start + BytePos(2), end); + for (idx, c) in s.char_indices() { + if c != '_' && c.to_digit(base).is_none() { + self.err_span_( + start + BytePos::from_usize(2 + idx), + start + BytePos::from_usize(2 + idx + c.len_utf8()), + &format!("invalid digit for a base {} literal", base), + ); + } + } + } + (token::Integer, self.symbol_from_to(start, end)) + } } rustc_lexer::LiteralKind::Float { base, empty_exponent } => { if empty_exponent { self.err_span_(start, self.pos, "expected at least one digit in exponent"); } - match base { - Base::Hexadecimal => self.err_span_( - start, - suffix_start, - "hexadecimal float literal is not supported", - ), + Base::Hexadecimal => { + self.err_span_(start, end, "hexadecimal float literal is not supported") + } Base::Octal => { - self.err_span_(start, suffix_start, "octal float literal is not supported") + self.err_span_(start, end, "octal float literal is not supported") } Base::Binary => { - self.err_span_(start, suffix_start, "binary float literal is not supported") + self.err_span_(start, end, "binary float literal is not supported") } - _ => (), + _ => {} } - - let id = self.symbol_from_to(start, suffix_start); - return (token::Float, id); + (token::Float, self.symbol_from_to(start, end)) } - }; - let content_start = start + BytePos(prefix_len); - let content_end = suffix_start - BytePos(postfix_len); - let id = self.symbol_from_to(content_start, content_end); - self.validate_literal_escape(mode, content_start, content_end, prefix_len, postfix_len); - (lit_kind, id) + } } #[inline] @@ -649,20 +652,22 @@ impl<'a> StringReader<'a> { ) } - fn validate_literal_escape( + fn cook_quoted( &self, + kind: token::LitKind, mode: Mode, - content_start: BytePos, - content_end: BytePos, + start: BytePos, + end: BytePos, prefix_len: u32, postfix_len: u32, - ) { + ) -> (token::LitKind, Symbol) { + let content_start = start + BytePos(prefix_len); + let content_end = end - BytePos(postfix_len); let lit_content = self.str_from_to(content_start, content_end); unescape::unescape_literal(lit_content, mode, &mut |range, result| { // Here we only check for errors. The actual unescaping is done later. if let Err(err) = result { - let span_with_quotes = self - .mk_sp(content_start - BytePos(prefix_len), content_end + BytePos(postfix_len)); + let span_with_quotes = self.mk_sp(start, end); let (start, end) = (range.start as u32, range.end as u32); let lo = content_start + BytePos(start); let hi = lo + BytePos(end - start); @@ -678,23 +683,7 @@ impl<'a> StringReader<'a> { ); } }); - } - - fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) { - let base = match base { - Base::Binary => 2, - Base::Octal => 8, - _ => return, - }; - let s = self.str_from_to(content_start + BytePos(2), content_end); - for (idx, c) in s.char_indices() { - let idx = idx as u32; - if c != '_' && c.to_digit(base).is_none() { - let lo = content_start + BytePos(2 + idx); - let hi = content_start + BytePos(2 + idx + c.len_utf8() as u32); - self.err_span_(lo, hi, &format!("invalid digit for a base {} literal", base)); - } - } + (kind, Symbol::intern(lit_content)) } } diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index f075de7142676..6373f5b4fd6ff 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -108,7 +108,7 @@ pub(crate) fn emit_unescape_error( } if !has_help { - let (prefix, msg) = if mode.is_bytes() { + let (prefix, msg) = if mode.is_byte() { ("b", "if you meant to write a byte string literal, use double quotes") } else { ("", "if you meant to write a `str` literal, use double quotes") @@ -142,7 +142,7 @@ pub(crate) fn emit_unescape_error( EscapeError::EscapeOnlyChar => { let (c, char_span) = last_char(); - let msg = if mode.is_bytes() { + let msg = if mode.is_byte() { "byte constant must be escaped" } else { "character constant must be escaped" @@ -182,11 +182,11 @@ pub(crate) fn emit_unescape_error( let (c, span) = last_char(); let label = - if mode.is_bytes() { "unknown byte escape" } else { "unknown character escape" }; + if mode.is_byte() { "unknown byte escape" } else { "unknown character escape" }; let ec = escaped_char(c); let mut diag = handler.struct_span_err(span, &format!("{}: `{}`", label, ec)); diag.span_label(span, label); - if c == '{' || c == '}' && !mode.is_bytes() { + if c == '{' || c == '}' && !mode.is_byte() { diag.help( "if used in a formatting string, curly braces are escaped with `{{` and `}}`", ); @@ -196,7 +196,7 @@ pub(crate) fn emit_unescape_error( version control settings", ); } else { - if !mode.is_bytes() { + if !mode.is_byte() { diag.span_suggestion( span_with_quotes, "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", @@ -231,16 +231,23 @@ pub(crate) fn emit_unescape_error( .emit(); } EscapeError::NonAsciiCharInByte => { - assert!(mode.is_bytes()); let (c, span) = last_char(); - let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant"); + let desc = match mode { + Mode::Byte => "byte literal", + Mode::ByteStr => "byte string literal", + Mode::RawByteStr => "raw byte string literal", + _ => panic!("non-is_byte literal paired with NonAsciiCharInByte"), + }; + let mut err = handler.struct_span_err(span, format!("non-ASCII character in {}", desc)); let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { format!(" but is {:?}", c) } else { String::new() }; - err.span_label(span, &format!("byte constant must be ASCII{}", postfix)); - if (c as u32) <= 0xFF { + err.span_label(span, &format!("must be ASCII{}", postfix)); + // Note: the \\xHH suggestions are not given for raw byte string + // literals, because they are araw and so cannot use any escapes. + if (c as u32) <= 0xFF && mode != Mode::RawByteStr { err.span_suggestion( span, &format!( @@ -250,9 +257,9 @@ pub(crate) fn emit_unescape_error( format!("\\x{:X}", c as u32), Applicability::MaybeIncorrect, ); - } else if matches!(mode, Mode::Byte) { + } else if mode == Mode::Byte { err.span_label(span, "this multibyte character does not fit into a single byte"); - } else if matches!(mode, Mode::ByteStr) { + } else if mode != Mode::RawByteStr { let mut utf8 = String::new(); utf8.push(c); err.span_suggestion( @@ -270,19 +277,6 @@ pub(crate) fn emit_unescape_error( } err.emit(); } - EscapeError::NonAsciiCharInByteString => { - assert!(mode.is_bytes()); - let (c, span) = last_char(); - let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { - format!(" but is {:?}", c) - } else { - String::new() - }; - handler - .struct_span_err(span, "raw byte string must be ASCII") - .span_label(span, &format!("must be ASCII{}", postfix)) - .emit(); - } EscapeError::OutOfRangeHexEscape => { handler .struct_span_err(span, "out of range hex escape") diff --git a/src/test/ui/attributes/key-value-non-ascii.rs b/src/test/ui/attributes/key-value-non-ascii.rs index 12942eabdf7b5..e14e2fc05ad39 100644 --- a/src/test/ui/attributes/key-value-non-ascii.rs +++ b/src/test/ui/attributes/key-value-non-ascii.rs @@ -1,4 +1,4 @@ #![feature(rustc_attrs)] -#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte constant +#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal fn main() {} diff --git a/src/test/ui/attributes/key-value-non-ascii.stderr b/src/test/ui/attributes/key-value-non-ascii.stderr index 422107867f7f9..23d482de6a868 100644 --- a/src/test/ui/attributes/key-value-non-ascii.stderr +++ b/src/test/ui/attributes/key-value-non-ascii.stderr @@ -1,8 +1,8 @@ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/key-value-non-ascii.rs:3:19 | LL | #[rustc_dummy = b"ffi.rs"] - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes | diff --git a/src/test/ui/parser/byte-literals.rs b/src/test/ui/parser/byte-literals.rs index 05a510b24a7ab..896dc1a1a5fba 100644 --- a/src/test/ui/parser/byte-literals.rs +++ b/src/test/ui/parser/byte-literals.rs @@ -7,6 +7,6 @@ pub fn main() { b'\x0Z'; //~ ERROR invalid character in numeric character escape: `Z` b' '; //~ ERROR byte constant must be escaped b'''; //~ ERROR byte constant must be escaped - b'é'; //~ ERROR non-ASCII character in byte constant + b'é'; //~ ERROR non-ASCII character in byte literal b'a //~ ERROR unterminated byte constant [E0763] } diff --git a/src/test/ui/parser/byte-literals.stderr b/src/test/ui/parser/byte-literals.stderr index c3d0006163005..efa55ae05bd37 100644 --- a/src/test/ui/parser/byte-literals.stderr +++ b/src/test/ui/parser/byte-literals.stderr @@ -32,11 +32,11 @@ error: byte constant must be escaped: `'` LL | b'''; | ^ help: escape the character: `\'` -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/byte-literals.rs:10:7 | LL | b'é'; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'é', use a \xHH escape | diff --git a/src/test/ui/parser/byte-string-literals.rs b/src/test/ui/parser/byte-string-literals.rs index b1f11024a7bb6..30a4f50c4e40b 100644 --- a/src/test/ui/parser/byte-string-literals.rs +++ b/src/test/ui/parser/byte-string-literals.rs @@ -3,7 +3,7 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape pub fn main() { b"\f"; //~ ERROR unknown byte escape b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z` - b"é"; //~ ERROR non-ASCII character in byte constant - br##"é"##; //~ ERROR raw byte string must be ASCII + b"é"; //~ ERROR non-ASCII character in byte string literal + br##"é"##; //~ ERROR non-ASCII character in raw byte string literal b"a //~ ERROR unterminated double quote byte string } diff --git a/src/test/ui/parser/byte-string-literals.stderr b/src/test/ui/parser/byte-string-literals.stderr index 3b8b3692e053f..5b96cc3d18abc 100644 --- a/src/test/ui/parser/byte-string-literals.stderr +++ b/src/test/ui/parser/byte-string-literals.stderr @@ -20,18 +20,18 @@ error: invalid character in numeric character escape: `Z` LL | b"\x0Z"; | ^ invalid character in numeric character escape -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/byte-string-literals.rs:6:7 | LL | b"é"; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'é', use a \xHH escape | LL | b"\xE9"; | ~~~~ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/byte-string-literals.rs:7:10 | LL | br##"é"##; diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.rs b/src/test/ui/parser/raw/raw-byte-string-literals.rs index 163c8ac66b022..1b859fee596ad 100644 --- a/src/test/ui/parser/raw/raw-byte-string-literals.rs +++ b/src/test/ui/parser/raw/raw-byte-string-literals.rs @@ -2,6 +2,6 @@ pub fn main() { br"a "; //~ ERROR bare CR not allowed in raw string - br"é"; //~ ERROR raw byte string must be ASCII + br"é"; //~ ERROR non-ASCII character in raw byte string literal br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/src/test/ui/parser/raw/raw-byte-string-literals.stderr b/src/test/ui/parser/raw/raw-byte-string-literals.stderr index cfc877104bd9f..a2f27d1ed70ae 100644 --- a/src/test/ui/parser/raw/raw-byte-string-literals.stderr +++ b/src/test/ui/parser/raw/raw-byte-string-literals.stderr @@ -4,7 +4,7 @@ error: bare CR not allowed in raw string LL | br"a "; | ^ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/raw-byte-string-literals.rs:5:8 | LL | br"é"; diff --git a/src/test/ui/parser/unicode-control-codepoints.rs b/src/test/ui/parser/unicode-control-codepoints.rs index 5af0b585a1275..df099bb62ad1e 100644 --- a/src/test/ui/parser/unicode-control-codepoints.rs +++ b/src/test/ui/parser/unicode-control-codepoints.rs @@ -14,15 +14,15 @@ fn main() { println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); //~^ ERROR unicode codepoint changing visible direction of text present in literal println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); - //~^ ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant - //~| ERROR non-ASCII character in byte constant + //~^ ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal + //~| ERROR non-ASCII character in byte string literal println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); - //~^ ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII - //~| ERROR raw byte string must be ASCII + //~^ ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal + //~| ERROR non-ASCII character in raw byte string literal println!("{:?}", '‮'); //~^ ERROR unicode codepoint changing visible direction of text present in literal } diff --git a/src/test/ui/parser/unicode-control-codepoints.stderr b/src/test/ui/parser/unicode-control-codepoints.stderr index 44548c72ff5d0..fc071a9419142 100644 --- a/src/test/ui/parser/unicode-control-codepoints.stderr +++ b/src/test/ui/parser/unicode-control-codepoints.stderr @@ -14,69 +14,69 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); | = help: unicode escape sequences cannot be used as a byte or in a byte string -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:26 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{202e}' + | ^ must be ASCII but is '\u{202e}' | help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes | LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:30 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2066}' + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:41 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2069}' + | ^ must be ASCII but is '\u{2069}' | help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes | LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); | ~~~~~~~~~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:43 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ byte constant must be ASCII but is '\u{2066}' + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); | ~~~~~~~~~~~~ -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:29 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{202e}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:33 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{2066}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:44 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); | ^ must be ASCII but is '\u{2069}' -error: raw byte string must be ASCII +error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:46 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); diff --git a/src/test/ui/suggestions/multibyte-escapes.rs b/src/test/ui/suggestions/multibyte-escapes.rs index fd5d46a4e923e..c4105186244db 100644 --- a/src/test/ui/suggestions/multibyte-escapes.rs +++ b/src/test/ui/suggestions/multibyte-escapes.rs @@ -2,17 +2,17 @@ fn main() { b'µ'; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte literal //~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII b'字'; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte literal //~| NOTE: this multibyte character does not fit into a single byte - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII b"字"; - //~^ ERROR: non-ASCII character in byte constant + //~^ ERROR: non-ASCII character in byte string literal //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - //~| NOTE: byte constant must be ASCII + //~| NOTE: must be ASCII } diff --git a/src/test/ui/suggestions/multibyte-escapes.stderr b/src/test/ui/suggestions/multibyte-escapes.stderr index 6e26bc1f01cef..1e7c43e6538f6 100644 --- a/src/test/ui/suggestions/multibyte-escapes.stderr +++ b/src/test/ui/suggestions/multibyte-escapes.stderr @@ -1,28 +1,28 @@ -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/multibyte-escapes.rs:4:7 | LL | b'µ'; - | ^ byte constant must be ASCII + | ^ must be ASCII | help: if you meant to use the unicode code point for 'µ', use a \xHH escape | LL | b'\xB5'; | ~~~~ -error: non-ASCII character in byte constant +error: non-ASCII character in byte literal --> $DIR/multibyte-escapes.rs:9:7 | LL | b'字'; | ^^ | | - | byte constant must be ASCII + | must be ASCII | this multibyte character does not fit into a single byte -error: non-ASCII character in byte constant +error: non-ASCII character in byte string literal --> $DIR/multibyte-escapes.rs:14:7 | LL | b"字"; - | ^^ byte constant must be ASCII + | ^^ must be ASCII | help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes | diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index b9f2b5132353c..1eea2346451dd 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -5,9 +5,7 @@ mod block; use rowan::Direction; -use rustc_lexer::unescape::{ - self, unescape_byte, unescape_byte_literal, unescape_char, unescape_literal, Mode, -}; +use rustc_lexer::unescape::{self, unescape_byte, unescape_char, unescape_literal, Mode}; use crate::{ algo, @@ -143,7 +141,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_byte_literal(without_quotes, Mode::ByteStr, &mut |range, char| { + unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| { if let Err(err) = char { push_err(2, (range.start, err)); }