New \p{Letter} Unicode property escape

antlr · Mar 1, 2017 · ca03e6a · ca03e6a
1 parent d11968d
commit ca03e6a
Show file tree

Hide file tree

Showing 4 changed files with 341 additions and 30 deletions.
diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
@@ -115,6 +115,129 @@ public void testA() throws Exception {
 				"s4->RuleStop_A_2\n";
 		checkTokensRule(g, null, expecting);
 	}
+	/** Checks the ATN built for a lexer rule whose only element is the char set {@code [abc]}. */
+	@Test
+	public void testCharSet() throws Exception {
+		// Three individually listed characters; the expected transition carries the single label 97..99.
+		String grammarText = "lexer grammar P;\n" + "A : [abc] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{97..99}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN built for a lexer rule whose only element is the char set {@code [a-c]}. */
+	@Test
+	public void testCharSetRange() throws Exception {
+		// The range a-c is expected to show up as the code point interval 97..99.
+		String grammarText = "lexer grammar P;\n" + "A : [a-c] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{97..99}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set holding one four-hex-digit Unicode escape (U+ABCD). */
+	@Test
+	public void testCharSetUnicodeBMPEscape() throws Exception {
+		// U+ABCD is 43981 in decimal; the expected label for a lone code point has no set braces.
+		String grammarText = "lexer grammar P;\n" + "A : [\\uABCD] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-43981->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set mixing a plain range with a range of four-hex-digit escapes. */
+	@Test
+	public void testCharSetUnicodeBMPEscapeRange() throws Exception {
+		// Expects both intervals on one transition: 97..99 and 43981..44031 (U+ABCD..U+ABFF).
+		String grammarText = "lexer grammar P;\n" + "A : [a-c\\uABCD-\\uABFF] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{97..99, 43981..44031}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set holding one braced Unicode escape above U+FFFF (U+10ABCD). */
+	@Test
+	public void testCharSetUnicodeSMPEscape() throws Exception {
+		// U+10ABCD is 1092557 in decimal; the expected label is that single code point.
+		String grammarText = "lexer grammar P;\n" + "A : [\\u{10ABCD}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-1092557->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set mixing a plain range with a range of braced Unicode escapes. */
+	@Test
+	public void testCharSetUnicodeSMPEscapeRange() throws Exception {
+		// Expects 97..99 plus 1092557..1092607 (U+10ABCD..U+10ABFF) on the same transition.
+		String grammarText = "lexer grammar P;\n" + "A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{97..99, 1092557..1092607}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set holding a single Unicode property escape for the Gothic script. */
+	@Test
+	public void testCharSetUnicodePropertyEscape() throws Exception {
+		// Gothic was picked because the script is long dead, so its code point
+		// range is unlikely to change; if it ever did, the interval expected
+		// below (66352..66378) would no longer match and this test would fail.
+		String grammarText = "lexer grammar P;\n" + "A : [\\p{Gothic}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{66352..66378}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set holding an inverted (capital P) Gothic property escape. */
+	@Test
+	public void testCharSetUnicodePropertyInvertEscape() throws Exception {
+		// Expects every code point from 0 through 1114111 except the interval 66352..66378.
+		String grammarText = "lexer grammar P;\n" + "A : [\\P{Gothic}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{0..66351, 66379..1114111}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set that combines two Unicode property escapes. */
+	@Test
+	public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
+		// Gothic and Mahajani were chosen in the hope that neither script's range
+		// changes soon; a change to either would break the intervals expected below.
+		String grammarText = "lexer grammar P;\n" + "A : [\\p{Gothic}\\p{Mahajani}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{66352..66378, 69968..70006}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
+	/** Checks the ATN for a char set whose two property escapes cover overlapping code points. */
+	@Test
+	public void testCharSetUnicodePropertyOverlap() throws Exception {
+		// The expected label lists each interval exactly once even though both properties are given.
+		String grammarText = "lexer grammar P;\n" + "A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;";
+		LexerGrammar lexerGrammar = new LexerGrammar(grammarText);
+		String expectedAtn = "s0->RuleStart_A_1\n"
+			+ "RuleStart_A_1->s3\n"
+			+ "s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n"
+			+ "s4->RuleStop_A_2\n";
+		checkTokensRule(lexerGrammar, null, expectedAtn);
+	}
 	@Test public void testRangeOrRange() throws Exception {
 		LexerGrammar g = new LexerGrammar(
 			"lexer grammar P;\n"+

diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
@@ -529,6 +529,44 @@ public void testSetUp() throws Exception {
 		super.testErrors(pair, true);
 	}
 
+	@Test public void testInvalidUnicodeEscapesInCharSet() {
+		String grammar =
+				"lexer grammar Test;\n" +
+				"INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
+				"INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
+				"INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
+				"INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
+				"INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
+				"INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Letter}-Z];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Number}];\n" +
+				"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n";
+
+		String expected =
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:32: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:3:41: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:35: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:5:32: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:41: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:41: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:8:34: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:9:43: invalid escape sequence\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:10:39: unicode property escapes not allowed in lexer charset range: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:11:41: unicode property escapes not allowed in lexer charset range: [\\p{Letter}-Z]\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:12:41: unicode property escapes not allowed in lexer charset range: [A-\\p{Number}]\n" +
+				"error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:13:48: unicode property escapes not allowed in lexer charset range: [\\P{Uppercase_Letter}-\\P{Number}]\n";
+
+		String[] pair = new String[] {
+				grammar,
+				expected
+		};
+
+		super.testErrors(pair, true);
+	}
+
 	/**
 	 * This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
 	 * is produced as described in the documentation.