From 652fc86b7d0dfaf4f2ce76177f0e4411208a669a Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Mon, 12 Jun 2023 15:39:50 -0500
Subject: [PATCH] refactor(embedded): Switch to `syn` for parsing doc comments

The hope is this will result in more resilient comment handling, being
more consistent with rustdoc.

I also hoped for less code but `syn` is doing less than I had expected,
requiring us to copy code over from other parts of rust.  It seems every
proc macro has to do this but there is no guide to it, so they all do it
differently, only covering the cases they thought to test for.

Note that this still won't support `include_str!()`.
---
 Cargo.lock                      |   1 -
 Cargo.toml                      |   2 -
 src/cargo/util/toml/embedded.rs | 391 ++++++++++++++++++++------------
 3 files changed, 249 insertions(+), 145 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b8d4d9d3056..4a3d126e479 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -325,7 +325,6 @@ dependencies = [
  "pretty_env_logger",
  "pulldown-cmark",
  "rand",
- "regex",
  "rustfix",
  "same-file",
  "semver",
diff --git a/Cargo.toml b/Cargo.toml
index d6d7e76d669..0e788a6afc8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -67,7 +67,6 @@ pretty_env_logger = "0.4"
 proptest = "1.1.0"
 pulldown-cmark = { version = "0.9.2", default-features = false }
 rand = "0.8.5"
-regex = "1.8.3"
 rustfix = "0.6.0"
 same-file = "1.0.6"
 security-framework = "2.0.0"
@@ -153,7 +152,6 @@ pathdiff.workspace = true
 pretty_env_logger = { workspace = true, optional = true }
 pulldown-cmark.workspace = true
 rand.workspace = true
-regex.workspace = true
 rustfix.workspace = true
 semver.workspace = true
 serde = { workspace = true, features = ["derive"] }
diff --git a/src/cargo/util/toml/embedded.rs b/src/cargo/util/toml/embedded.rs
index 43c964abc66..74f1b9e32b9 100644
--- a/src/cargo/util/toml/embedded.rs
+++ b/src/cargo/util/toml/embedded.rs
@@ -230,156 +230,271 @@ fn write_if_changed(path: &std::path::Path, new: &str) -> CargoResult<()> {
 
 /// Locates a "code block manifest" in Rust source.
 fn extract_comment(input: &str) -> CargoResult<String> {
-    let re_crate_comment = regex::Regex::new(
-        // We need to find the first `/*!` or `//!` that *isn't* preceded by something that would
-        // make it apply to anything other than the crate itself.  Because we can't do this
-        // accurately, we'll just require that the doc-comment is the *first* thing in the file
-        // (after the optional shebang).
-        r"(?x)(^\s*|^\#![^\[].*?(\r\n|\n))(/\*!|//(!|/))",
-    )
-    .unwrap();
-    let re_margin = regex::Regex::new(r"^\s*\*( |$)").unwrap();
-    let re_space = regex::Regex::new(r"^(\s+)").unwrap();
-    let re_nesting = regex::Regex::new(r"/\*|\*/").unwrap();
-    let re_comment = regex::Regex::new(r"^\s*//(!|/)").unwrap();
-
-    fn n_leading_spaces(s: &str, n: usize) -> anyhow::Result<()> {
-        if !s.chars().take(n).all(|c| c == ' ') {
-            anyhow::bail!("leading {n:?} chars aren't all spaces: {s:?}")
+    let mut doc_fragments = Vec::new();
+    let file = syn::parse_file(input)?;
+    // HACK: `syn` doesn't tell us what kind of comment was used, so infer it from how many
+    // attributes were used
+    let kind = if 1 < file
+        .attrs
+        .iter()
+        .filter(|attr| attr.meta.path().is_ident("doc"))
+        .count()
+    {
+        CommentKind::Line
+    } else {
+        CommentKind::Block
+    };
+    for attr in &file.attrs {
+        if attr.meta.path().is_ident("doc") {
+            doc_fragments.push(DocFragment::new(attr, kind)?);
         }
-        Ok(())
-    }
-
-    /// Returns a slice of the input string with the leading shebang, if there is one, omitted.
-    fn strip_shebang(s: &str) -> &str {
-        let re_shebang = regex::Regex::new(r"^#![^\[].*?(\r\n|\n)").unwrap();
-        re_shebang.find(s).map(|m| &s[m.end()..]).unwrap_or(s)
-    }
-
-    // First, we will look for and slice out a contiguous, inner doc-comment which must be *the
-    // very first thing* in the file.  `#[doc(...)]` attributes *are not supported*.  Multiple
-    // single-line comments cannot have any blank lines between them.
-    let input = strip_shebang(input); // `re_crate_comment` doesn't work with shebangs
-    let start = re_crate_comment
-        .captures(input)
-        .ok_or_else(|| anyhow::format_err!("no doc-comment found"))?
-        .get(3)
-        .ok_or_else(|| anyhow::format_err!("no doc-comment found"))?
-        .start();
-
-    let input = &input[start..];
-
-    if let Some(input) = input.strip_prefix("/*!") {
-        // On every line:
-        //
-        // - update nesting level and detect end-of-comment
-        // - if margin is None:
-        //     - if there appears to be a margin, set margin.
-        // - strip off margin marker
-        // - update the leading space counter
-        // - strip leading space
-        // - append content
-        let mut r = String::new();
-
-        let mut leading_space = None;
-        let mut margin = None;
-        let mut depth: u32 = 1;
-
-        for line in input.lines() {
-            if depth == 0 {
-                break;
-            }
-
-            // Update nesting and look for end-of-comment.
-            let mut end_of_comment = None;
+    }
+    if doc_fragments.is_empty() {
+        anyhow::bail!("no doc-comment found");
+    }
+    unindent_doc_fragments(&mut doc_fragments);
 
-            for (end, marker) in re_nesting.find_iter(line).map(|m| (m.start(), m.as_str())) {
-                match (marker, depth) {
-                    ("/*", _) => depth += 1,
-                    ("*/", 1) => {
-                        end_of_comment = Some(end);
-                        depth = 0;
-                        break;
-                    }
-                    ("*/", _) => depth -= 1,
-                    _ => panic!("got a comment marker other than /* or */"),
-                }
-            }
+    let mut doc_comment = String::new();
+    for frag in &doc_fragments {
+        add_doc_fragment(&mut doc_comment, frag);
+    }
 
-            let line = end_of_comment.map(|end| &line[..end]).unwrap_or(line);
+    Ok(doc_comment)
+}
 
-            // Detect and strip margin.
-            margin = margin.or_else(|| re_margin.find(line).map(|m| m.as_str()));
+/// A `#[doc]`
+#[derive(Clone, Debug)]
+struct DocFragment {
+    /// The attribute value
+    doc: String,
+    /// Indentation used within `doc
+    indent: usize,
+}
 
-            let line = if let Some(margin) = margin {
-                let end = line
-                    .char_indices()
-                    .take(margin.len())
-                    .map(|(i, c)| i + c.len_utf8())
-                    .last()
-                    .unwrap_or(0);
-                &line[end..]
-            } else {
-                line
+impl DocFragment {
+    fn new(attr: &syn::Attribute, kind: CommentKind) -> CargoResult<Self> {
+        let syn::Meta::NameValue(nv) = &attr.meta else {
+                anyhow::bail!("unsupported attr meta for {:?}", attr.meta.path())
             };
+        let syn::Expr::Lit(syn::ExprLit { lit: syn::Lit::Str(lit), .. }) = &nv.value else {
+                anyhow::bail!("only string literals are supported")
+            };
+        Ok(Self {
+            doc: beautify_doc_string(lit.value(), kind),
+            indent: 0,
+        })
+    }
+}
 
-            // Detect and strip leading indentation.
-            leading_space = leading_space.or_else(|| re_space.find(line).map(|m| m.end()));
-
-            // Make sure we have only leading spaces.
-            //
-            // If we see a tab, fall over.  I *would* expand them, but that gets into the question of how *many* spaces to expand them to, and *where* is the tab, because tabs are tab stops and not just N spaces.
-            n_leading_spaces(line, leading_space.unwrap_or(0))?;
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub enum CommentKind {
+    Line,
+    Block,
+}
 
-            let strip_len = line.len().min(leading_space.unwrap_or(0));
-            let line = &line[strip_len..];
+/// Makes a doc string more presentable to users.
+/// Used by rustdoc and perhaps other tools, but not by rustc.
+///
+/// See `rustc_ast/util/comments.rs`
+fn beautify_doc_string(data: String, kind: CommentKind) -> String {
+    fn get_vertical_trim(lines: &[&str]) -> Option<(usize, usize)> {
+        let mut i = 0;
+        let mut j = lines.len();
+        // first line of all-stars should be omitted
+        if !lines.is_empty() && lines[0].chars().all(|c| c == '*') {
+            i += 1;
+        }
 
-            // Done.
-            r.push_str(line);
+        // like the first, a last line of all stars should be omitted
+        if j > i && !lines[j - 1].is_empty() && lines[j - 1].chars().all(|c| c == '*') {
+            j -= 1;
+        }
 
-            // `lines` removes newlines.  Ideally, it wouldn't do that, but hopefully this shouldn't cause any *real* problems.
-            r.push('\n');
+        if i != 0 || j != lines.len() {
+            Some((i, j))
+        } else {
+            None
         }
+    }
 
-        Ok(r)
-    } else if input.starts_with("//!") || input.starts_with("///") {
-        let mut r = String::new();
+    fn get_horizontal_trim(lines: &[&str], kind: CommentKind) -> Option<String> {
+        let mut i = usize::MAX;
+        let mut first = true;
+
+        // In case we have doc comments like `/**` or `/*!`, we want to remove stars if they are
+        // present. However, we first need to strip the empty lines so they don't get in the middle
+        // when we try to compute the "horizontal trim".
+        let lines = match kind {
+            CommentKind::Block => {
+                // Whatever happens, we skip the first line.
+                let mut i = lines
+                    .get(0)
+                    .map(|l| {
+                        if l.trim_start().starts_with('*') {
+                            0
+                        } else {
+                            1
+                        }
+                    })
+                    .unwrap_or(0);
+                let mut j = lines.len();
 
-        let mut leading_space = None;
+                while i < j && lines[i].trim().is_empty() {
+                    i += 1;
+                }
+                while j > i && lines[j - 1].trim().is_empty() {
+                    j -= 1;
+                }
+                &lines[i..j]
+            }
+            CommentKind::Line => lines,
+        };
 
-        for line in input.lines() {
-            // Strip leading comment marker.
-            let content = match re_comment.find(line) {
-                Some(m) => &line[m.end()..],
-                None => break,
-            };
+        for line in lines {
+            for (j, c) in line.chars().enumerate() {
+                if j > i || !"* \t".contains(c) {
+                    return None;
+                }
+                if c == '*' {
+                    if first {
+                        i = j;
+                        first = false;
+                    } else if i != j {
+                        return None;
+                    }
+                    break;
+                }
+            }
+            if i >= line.len() {
+                return None;
+            }
+        }
+        if lines.is_empty() {
+            None
+        } else {
+            Some(lines[0][..i].into())
+        }
+    }
 
-            // Detect and strip leading indentation.
-            leading_space = leading_space.or_else(|| {
-                re_space
-                    .captures(content)
-                    .and_then(|c| c.get(1))
-                    .map(|m| m.end())
-            });
+    let data_s = data.as_str();
+    if data_s.contains('\n') {
+        let mut lines = data_s.lines().collect::<Vec<&str>>();
+        let mut changes = false;
+        let lines = if let Some((i, j)) = get_vertical_trim(&lines) {
+            changes = true;
+            // remove whitespace-only lines from the start/end of lines
+            &mut lines[i..j]
+        } else {
+            &mut lines
+        };
+        if let Some(horizontal) = get_horizontal_trim(lines, kind) {
+            changes = true;
+            // remove a "[ \t]*\*" block from each line, if possible
+            for line in lines.iter_mut() {
+                if let Some(tmp) = line.strip_prefix(&horizontal) {
+                    *line = tmp;
+                    if kind == CommentKind::Block
+                        && (*line == "*" || line.starts_with("* ") || line.starts_with("**"))
+                    {
+                        *line = &line[1..];
+                    }
+                }
+            }
+        }
+        if changes {
+            return lines.join("\n");
+        }
+    }
+    data
+}
 
-            // Make sure we have only leading spaces.
-            //
-            // If we see a tab, fall over.  I *would* expand them, but that gets into the question of how *many* spaces to expand them to, and *where* is the tab, because tabs are tab stops and not just N spaces.
-            n_leading_spaces(content, leading_space.unwrap_or(0))?;
+/// Removes excess indentation on comments in order for the Markdown
+/// to be parsed correctly. This is necessary because the convention for
+/// writing documentation is to provide a space between the /// or //! marker
+/// and the doc text, but Markdown is whitespace-sensitive. For example,
+/// a block of text with four-space indentation is parsed as a code block,
+/// so if we didn't unindent comments, these list items
+///
+/// /// A list:
+/// ///
+/// ///    - Foo
+/// ///    - Bar
+///
+/// would be parsed as if they were in a code block, which is likely not what the user intended.
+///
+/// See also `rustc_resolve/rustdoc.rs`
+fn unindent_doc_fragments(docs: &mut [DocFragment]) {
+    // HACK: We can't tell the difference between `#[doc]` and doc-comments, so we can't specialize
+    // the indentation like rustodc does
+    let add = 0;
+
+    // `min_indent` is used to know how much whitespaces from the start of each lines must be
+    // removed. Example:
+    //
+    // ```
+    // ///     hello!
+    // #[doc = "another"]
+    // ```
+    //
+    // In here, the `min_indent` is 1 (because non-sugared fragment are always counted with minimum
+    // 1 whitespace), meaning that "hello!" will be considered a codeblock because it starts with 4
+    // (5 - 1) whitespaces.
+    let Some(min_indent) = docs
+        .iter()
+        .map(|fragment| {
+            fragment.doc.as_str().lines().fold(usize::MAX, |min_indent, line| {
+                if line.chars().all(|c| c.is_whitespace()) {
+                    min_indent
+                } else {
+                    // Compare against either space or tab, ignoring whether they are
+                    // mixed or not.
+                    let whitespace = line.chars().take_while(|c| *c == ' ' || *c == '\t').count();
+                    min_indent.min(whitespace)
+                }
+            })
+        })
+        .min()
+    else {
+        return;
+    };
 
-            let strip_len = content.len().min(leading_space.unwrap_or(0));
-            let content = &content[strip_len..];
+    for fragment in docs {
+        if fragment.doc.is_empty() {
+            continue;
+        }
 
-            // Done.
-            r.push_str(content);
+        let min_indent = if min_indent > 0 {
+            min_indent - add
+        } else {
+            min_indent
+        };
 
-            // `lines` removes newlines.  Ideally, it wouldn't do that, but hopefully this shouldn't cause any *real* problems.
-            r.push('\n');
-        }
+        fragment.indent = min_indent;
+    }
+}
 
-        Ok(r)
-    } else {
-        Err(anyhow::format_err!("no doc-comment found"))
+/// The goal of this function is to apply the `DocFragment` transformation that is required when
+/// transforming into the final Markdown, which is applying the computed indent to each line in
+/// each doc fragment (a `DocFragment` can contain multiple lines in case of `#[doc = ""]`).
+///
+/// Note: remove the trailing newline where appropriate
+///
+/// See also `rustc_resolve/rustdoc.rs`
+fn add_doc_fragment(out: &mut String, frag: &DocFragment) {
+    let s = frag.doc.as_str();
+    let mut iter = s.lines();
+    if s.is_empty() {
+        out.push('\n');
+        return;
+    }
+    while let Some(line) = iter.next() {
+        if line.chars().any(|c| !c.is_whitespace()) {
+            assert!(line.len() >= frag.indent);
+            out.push_str(&line[frag.indent..]);
+        } else {
+            out.push_str(line);
+        }
+        out.push('\n');
     }
 }
 
@@ -576,14 +691,12 @@ fn main() {}
     #[test]
     fn test_multiline_comment() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"/*!
 Here is a manifest:
@@ -603,14 +716,12 @@ fn main() {
     #[test]
     fn test_multiline_comment_shebang() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"#!/usr/bin/env cargo-eval
 
@@ -632,14 +743,12 @@ fn main() {
     #[test]
     fn test_multiline_block_comment() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"/*!
  * Here is a manifest:
@@ -657,14 +766,12 @@ fn main() {}
     #[test]
     fn test_multiline_block_comment_shebang() {
         snapbox::assert_eq(
-            r#"
-Here is a manifest:
+            r#"Here is a manifest:
 
 ```cargo
 [dependencies]
 time = "*"
 ```
-
 "#,
             ec!(r#"#!/usr/bin/env cargo-eval