From 57d021f722e1dbb0afbc561b548fbc085c4e453f Mon Sep 17 00:00:00 2001
From: Ben Hamilton <beng@fb.com>
Date: Wed, 1 Mar 2017 14:05:39 -0800
Subject: [PATCH] New \p{Letter} Unicode property escape

---
 .../v4/test/tool/TestATNConstruction.java     | 123 ++++++++++
 .../v4/test/tool/TestToolSyntaxErrors.java    |  38 ++++
 .../antlr/v4/automata/LexerATNFactory.java    | 211 +++++++++++++++---
 tool/src/org/antlr/v4/tool/ErrorType.java     |  24 +-
 4 files changed, 361 insertions(+), 35 deletions(-)

diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
index 5764175a5b3..d68321a3432 100644
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
@@ -115,6 +115,129 @@ public void testA() throws Exception {
 				"s4->RuleStop_A_2\n";
 		checkTokensRule(g, null, expecting);
 	}
+	/** Individual characters listed in a charset merge into one set transition. */
+	@Test public void testCharSet() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [abc] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A dash between two characters in a charset denotes an inclusive range. */
+	@Test public void testCharSetRange() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [a-c] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A four-hex-digit Unicode escape in a charset yields a single code point transition. */
+	@Test public void testCharSetUnicodeBMPEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\uABCD] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-43981->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** Four-hex-digit Unicode escapes can be range endpoints alongside a plain range. */
+	@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [a-c\\uABCD-\\uABFF] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99, 43981..44031}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A braced Unicode escape can name a code point beyond the BMP. */
+	@Test public void testCharSetUnicodeSMPEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\u{10ABCD}] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-1092557->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** Braced Unicode escapes can serve as both endpoints of a range. */
+	@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99, 1092557..1092607}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/**
+	 * Uses Gothic: the script is long dead, so its range should not change.
+	 */
+	@Test public void testCharSetUnicodePropertyEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\p{Gothic}] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{66352..66378}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** An upper-case P escape selects every code point outside the property. */
+	@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\P{Gothic}] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{0..66351, 66379..1114111}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	// Two property escapes in one charset contribute their union. Mahajani, like
+	// Gothic, is not expected to change soon.
+	@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\p{Gothic}\\p{Mahajani}] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{66352..66378, 69968..70006}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** Properties whose code points overlap are merged into one set. */
+	@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
+		String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;";
+		String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
 	@Test public void testRangeOrRange() throws Exception {
 		LexerGrammar g = new LexerGrammar(
 			"lexer grammar P;\n"+
diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
index 6b828ef9649..cf16905bd84 100644
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
@@ -529,6 +529,44 @@ public void testSetUp() throws Exception {
 		super.testErrors(pair, true);
 	}
 
+	/**
+	 * Checks the diagnostics reported for malformed code point and property
+	 * escapes inside a lexer charset, and for property escapes used as a
+	 * range endpoint.
+	 */
+	@Test public void testInvalidUnicodeEscapesInCharSet() {
+		String grammarText =
+				"lexer grammar Test;\n" +
+				"INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
+				"INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
+				"INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
+				"INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
+				"INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
+				"INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Letter}-Z];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Number}];\n" +
+				"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n";
+
+		String expectedErrors =
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:32: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:3:41: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:35: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:5:32: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:41: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:41: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:8:34: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:9:43: invalid escape sequence\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:10:39: unicode property escapes not allowed in lexer charset range: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:11:41: unicode property escapes not allowed in lexer charset range: [\\p{Letter}-Z]\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:12:41: unicode property escapes not allowed in lexer charset range: [A-\\p{Number}]\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:13:48: unicode property escapes not allowed in lexer charset range: [\\P{Uppercase_Letter}-\\P{Number}]\n";
+
+		super.testErrors(new String[] { grammarText, expectedErrors }, true);
+	}
+
 	/**
 	 * This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
 	 * is produced as described in the documentation.
diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
index 5c1f9d78e1c..fb3279a19f4 100644
--- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java
+++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@@ -10,6 +10,7 @@
 import org.antlr.runtime.Token;
 import org.antlr.v4.codegen.CodeGenerator;
 import org.antlr.v4.misc.CharSupport;
+import org.antlr.v4.misc.EscapeSequenceParsing;
 import org.antlr.v4.parse.ANTLRParser;
 import org.antlr.v4.runtime.IntStream;
 import org.antlr.v4.runtime.Lexer;
@@ -49,6 +50,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Set;
 
 public class LexerATNFactory extends ParserATNFactory {
@@ -365,7 +367,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
 		return new Handle(left, right);
 	}
 
-	/** [Aa\t \u1234a-z\]\-] char sets */
+	/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
 	@Override
 	public Handle charSetLiteral(GrammarAST charSetAST) {
 		ATNState left = newState(charSetAST);
@@ -376,10 +378,68 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
 		return new Handle(left, right);
 	}
 
+	/**
+	 * Immutable snapshot of charset parsing progress: the element most recently
+	 * read but not yet added to the set, and whether a '-' has been seen after it.
+	 */
+	private static class CharSetParseState {
+		enum Mode { NONE, ERROR, PREV_CODE_POINT, PREV_PROPERTY }
+
+		public static final CharSetParseState NONE = new CharSetParseState(Mode.NONE, false, -1, IntervalSet.EMPTY_SET);
+		public static final CharSetParseState ERROR = new CharSetParseState(Mode.ERROR, false, -1, IntervalSet.EMPTY_SET);
+
+		public final Mode mode;
+		/** Whether an unescaped '-' followed the held element. */
+		public final boolean inRange;
+		/** Held code point; -1 in the shared NONE and ERROR instances. */
+		public final int prevCodePoint;
+		/** Held property interval set; EMPTY_SET in the NONE and ERROR instances. */
+		public final IntervalSet prevProperty;
+
+		public CharSetParseState(
+				Mode mode,
+				boolean inRange,
+				int prevCodePoint,
+				IntervalSet prevProperty) {
+			this.mode = mode;
+			this.inRange = inRange;
+			this.prevCodePoint = prevCodePoint;
+			this.prevProperty = prevProperty;
+		}
+
+		@Override
+		public String toString() {
+			String identity = super.toString();
+			return String.format(
+					"%s mode=%s inRange=%s prevCodePoint=%d prevProperty=%s",
+					identity, mode, inRange, prevCodePoint, prevProperty);
+		}
+
+		@Override
+		public boolean equals(Object other) {
+			if (this == other) {
+				return true;
+			}
+			if (!(other instanceof CharSetParseState)) {
+				return false;
+			}
+			CharSetParseState that = (CharSetParseState) other;
+			// Enum and primitive fields compare by value with ==; only the interval set needs equals().
+			return mode == that.mode
+				&& inRange == that.inRange
+				&& prevCodePoint == that.prevCodePoint
+				&& Objects.equals(prevProperty, that.prevProperty);
+		}
+
+		@Override
+		public int hashCode() {
+			return Objects.hash(mode, inRange, prevCodePoint, prevProperty);
+		}
+	}
+
 	public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
 		String chars = charSetAST.getText();
 		chars = chars.substring(1, chars.length() - 1);
-		String cset = '"' + chars + '"';
 		IntervalSet set = new IntervalSet();
 
 		if (chars.length() == 0) {
@@ -387,46 +447,179 @@ public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
 					g.fileName, charSetAST.getToken(), "[]");
 			return set;
 		}
-		// unescape all valid escape char like \n, leaving escaped dashes as '\-'
-		// so we can avoid seeing them as '-' range ops.
-		chars = CharSupport.getStringFromGrammarStringLiteral(cset);
-		if (chars == null) {
-			g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
-			                           g.fileName, charSetAST.getToken());
-			return set;
-		}
+
+		// The charset text is consumed one element at a time. The most recent element
+		// is held in 'state' rather than added to 'set' right away, so that a following
+		// '-' can still turn it into the lower bound of a range.
+		CharSetParseState state = CharSetParseState.NONE;
+
 		int n = chars.length();
-		// now make x-y become set of char
 		for (int i = 0; i < n; ) {
+			if (state.mode == CharSetParseState.Mode.ERROR) {
+				return new IntervalSet();
+			}
 			int c = chars.codePointAt(i);
 			int offset = Character.charCount(c);
-			if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
-				checkSetCollision(charSetAST, set, '-');
-				set.add('-');
-				offset++;
-			}
-			else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
-				int x = c;
-				int y = chars.codePointAt(i+offset+1);
-				if (x <= y) {
-					checkSetCollision(charSetAST, set, x, y);
-					set.add(x,y);
+			if (c == '\\') {
+				EscapeSequenceParsing.Result escapeParseResult =
+					EscapeSequenceParsing.parseEscape(chars, i);
+				switch (escapeParseResult.type) {
+					case INVALID:
+						g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
+									   g.fileName, charSetAST.getToken(), charSetAST.getText());
+						state = CharSetParseState.ERROR;
+						break;
+					case CODE_POINT:
+						state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, escapeParseResult.codePoint);
+						break;
+					case PROPERTY:
+						state = applyPrevStateAndMoveToProperty(charSetAST, set, state, escapeParseResult.propertyIntervalSet);
+						break;
 				}
-				else {
-					g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
-								   g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
+				offset = escapeParseResult.parseLength;
+			} else if (c == '-' && !state.inRange && state.mode != CharSetParseState.Mode.NONE) {
+				// A '-' only acts as a range operator when an element precedes it. With
+				// nothing held (start of the set, or right after a completed range) it
+				// falls through to the branch below and is treated as a literal hyphen;
+				// otherwise the range would start at the placeholder code point -1.
+				if (state.mode == CharSetParseState.Mode.PREV_PROPERTY) {
+					g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
+								   g.fileName, charSetAST.getToken(), charSetAST.getText());
+					state = CharSetParseState.ERROR;
+				} else {
+					state = new CharSetParseState(state.mode, true, state.prevCodePoint, state.prevProperty);
 				}
-				offset += Character.charCount(y) + 1;
-			}
-			else {
-				checkSetCollision(charSetAST, set, c);
-				set.add(c);
+			} else {
+				state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, c);
 			}
 			i += offset;
 		}
+		if (state.mode == CharSetParseState.Mode.ERROR) {
+			return new IntervalSet();
+		}
+		// Whether or not we were in a range, we'll add the last code point found to the set.
+		// If the range wasn't terminated, we'll treat it as a standalone codepoint.
+		applyPrevState(charSetAST, set, state);
+		if (state.inRange) {
+			// Unterminated range; add a literal hyphen to the set.
+			checkSetCollision(charSetAST, set, '-');
+			set.add('-');
+		}
 		return set;
 	}
 
+	/**
+	 * Consumes one code point element of the charset.
+	 *
+	 * <p>When {@code state.inRange} is set, {@code codePoint} closes the range that
+	 * started at {@code state.prevCodePoint}: the range is added to {@code set}, or
+	 * {@link ErrorType#EMPTY_STRINGS_AND_SETS_NOT_ALLOWED} is reported if its bounds
+	 * are reversed. Otherwise the element held in {@code state} is committed and
+	 * {@code codePoint} becomes the new held element.</p>
+	 *
+	 * @return the parse state to continue with
+	 */
+	private CharSetParseState applyPrevStateAndMoveToCodePoint(
+			GrammarAST charSetAST,
+			IntervalSet set,
+			CharSetParseState state,
+			int codePoint) {
+		if (state.inRange) {
+			if (state.prevCodePoint > codePoint) {
+				// Reversed bounds: report the empty range and leave the set untouched.
+				g.tool.errMgr.grammarError(
+						ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
+						g.fileName,
+						charSetAST.getToken(),
+						CharSupport.toRange(state.prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED));
+			} else {
+				checkSetCollision(charSetAST, set, state.prevCodePoint, codePoint);
+				set.add(state.prevCodePoint, codePoint);
+			}
+			state = CharSetParseState.NONE;
+		} else {
+			applyPrevState(charSetAST, set, state);
+			state = new CharSetParseState(
+					CharSetParseState.Mode.PREV_CODE_POINT,
+					false,
+					codePoint,
+					IntervalSet.EMPTY_SET);
+		}
+		return state;
+	}
+
+	/**
+	 * Consumes one Unicode property element of the charset.
+	 *
+	 * <p>A property cannot close a range, so when {@code state.inRange} is set
+	 * {@link ErrorType#UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE} is reported and the
+	 * error state is returned. Otherwise the element held in {@code state} is
+	 * committed and {@code property} becomes the new held element.</p>
+	 *
+	 * @return the parse state to continue with
+	 */
+	private CharSetParseState applyPrevStateAndMoveToProperty(
+			GrammarAST charSetAST,
+			IntervalSet set,
+			CharSetParseState state,
+			IntervalSet property) {
+		if (state.inRange) {
+			g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE,
+						   g.fileName, charSetAST.getToken(), charSetAST.getText());
+			return CharSetParseState.ERROR;
+		} else {
+			applyPrevState(charSetAST, set, state);
+			state = new CharSetParseState(
+					CharSetParseState.Mode.PREV_PROPERTY,
+					false,
+					-1,
+					property);
+		}
+		return state;
+	}
+
+	/**
+	 * Commits the element held in {@code state}, if any, to {@code set}.
+	 *
+	 * <p>A held code point is checked for collisions before it is added; a held
+	 * property interval set is merged in without a collision check.</p>
+	 */
+	private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParseState state) {
+		switch (state.mode) {
+			case NONE:
+			case ERROR:
+				break;
+			case PREV_CODE_POINT:
+				checkSetCollision(charSetAST, set, state.prevCodePoint);
+				set.add(state.prevCodePoint);
+				break;
+			case PREV_PROPERTY:
+				set.addAll(state.prevProperty);
+				break;
+		}
+	}
+
+	/**
+	 * Returns the sole element of {@code set}, or -1 when the set does not
+	 * consist of exactly one element.
+	 *
+	 * <p>NOTE(review): no caller is visible in this patch; confirm it is needed.</p>
+	 */
+	private static int getSingleElement(IntervalSet set) {
+		// We don't use IntervalSet.getSingleElement() because it can't handle sets containing 0.
+		// See https://github.com/antlr/antlr4/issues/1703 .
+		if (set.size() != 1) {
+			return -1;
+		} else {
+			Interval interval = set.getIntervals().get(0);
+			if (interval.length() == 1) {
+				return interval.a;
+			} else {
+				return -1;
+			}
+		}
+	}
+
 	protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) {
 		if (set.contains(el)) {
 			g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
diff --git a/tool/src/org/antlr/v4/tool/ErrorType.java b/tool/src/org/antlr/v4/tool/ErrorType.java
index aca32da364e..c9ac268be70 100644
--- a/tool/src/org/antlr/v4/tool/ErrorType.java
+++ b/tool/src/org/antlr/v4/tool/ErrorType.java
@@ -152,11 +152,11 @@ public enum ErrorType {
 	 */
 	NO_MODEL_TO_TEMPLATE_MAPPING(34, "no mapping to template name for output model class <arg>", ErrorSeverity.ERROR),
     /**
-   	 * Compiler Error 35.
-   	 *
-   	 * <p>templates/target and tool aren't compatible</p>
-   	 */
-   	INCOMPATIBLE_TOOL_AND_TEMPLATES(35, "<arg3> code generation target requires ANTLR <arg2>; it can't be loaded by the current ANTLR <arg>", ErrorSeverity.ERROR),
+	 * Compiler Error 35.
+	 *
+	 * <p>templates/target and tool aren't compatible</p>
+	 */
+	INCOMPATIBLE_TOOL_AND_TEMPLATES(35, "<arg3> code generation target requires ANTLR <arg2>; it can't be loaded by the current ANTLR <arg>", ErrorSeverity.ERROR),
 
 	/*
 	 * Grammar errors
@@ -1060,6 +1060,20 @@ public enum ErrorType {
 	 */
 	TOKEN_RANGE_IN_PARSER(181, "token ranges not allowed in parser: <arg>..<arg2>", ErrorSeverity.ERROR),
 
+	/**
+	 * Compiler Error 182.
+	 *
+	 * <p>Unicode property escapes cannot be an endpoint of a lexer charset range</p>
+	 *
+	 * <pre>
+	 * A: [\p{Letter}-\p{Number}];
+	 * </pre>
+	 */
+	UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE(
+			182,
+			"unicode property escapes not allowed in lexer charset range: <arg>",
+			ErrorSeverity.ERROR),
+
 	/*
 	 * Backward incompatibility errors
 	 */