From 57d021f722e1dbb0afbc561b548fbc085c4e453f Mon Sep 17 00:00:00 2001 From: Ben Hamilton Date: Wed, 1 Mar 2017 14:05:39 -0800 Subject: [PATCH] New \p{Letter} Unicode property escape --- .../v4/test/tool/TestATNConstruction.java | 123 ++++++++++ .../v4/test/tool/TestToolSyntaxErrors.java | 38 ++++ .../antlr/v4/automata/LexerATNFactory.java | 211 +++++++++++++++--- tool/src/org/antlr/v4/tool/ErrorType.java | 24 +- 4 files changed, 361 insertions(+), 35 deletions(-) diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java index 5764175a5b3..d68321a3432 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java @@ -115,6 +115,129 @@ public void testA() throws Exception { "s4->RuleStop_A_2\n"; checkTokensRule(g, null, expecting); } + @Test public void testCharSet() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [abc] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{97..99}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetRange() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [a-c] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{97..99}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodeBMPEscape() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [\\uABCD] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-43981->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodeBMPEscapeRange() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : 
[a-c\\uABCD-\\uABFF] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{97..99, 43981..44031}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodeSMPEscape() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [\\u{10ABCD}] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-1092557->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodeSMPEscapeRange() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{97..99, 1092557..1092607}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodePropertyEscape() throws Exception { + // The Gothic script is long dead and unlikely to change (which would + // cause this test to fail) + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [\\p{Gothic}] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{66352..66378}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodePropertyInvertEscape() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [\\P{Gothic}] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{0..66351, 66379..1114111}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception { + // Ditto the Mahajani script. Not going to change soon. I hope. 
+ LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [\\p{Gothic}\\p{Mahajani}] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{66352..66378, 69968..70006}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } + @Test public void testCharSetUnicodePropertyOverlap() throws Exception { + LexerGrammar g = new LexerGrammar( + "lexer grammar P;\n"+ + "A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;" + ); + String expecting = + "s0->RuleStart_A_1\n" + + "RuleStart_A_1->s3\n" + + "s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" + + "s4->RuleStop_A_2\n"; + checkTokensRule(g, null, expecting); + } @Test public void testRangeOrRange() throws Exception { LexerGrammar g = new LexerGrammar( "lexer grammar P;\n"+ diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java index 6b828ef9649..cf16905bd84 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java @@ -529,6 +529,44 @@ public void testSetUp() throws Exception { super.testErrors(pair, true); } + @Test public void testInvalidUnicodeEscapesInCharSet() { + String grammar = + "lexer grammar Test;\n" + + "INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" + + "INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" + + "INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" + + "INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" + + "INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" + + "INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" + + "INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" + + "INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" + + "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}];\n" + + "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Letter}-Z];\n" + + 
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Number}];\n" + + "INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Uppercase_Letter}-\\P{Number}];\n"; + + String expected = + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:2:32: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:3:41: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:4:35: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:5:32: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:41: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:41: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:8:34: invalid escape sequence\n" + + "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:9:43: invalid escape sequence\n" + + "error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:10:39: unicode property escapes not allowed in lexer charset range: [\\p{Uppercase_Letter}-\\p{Lowercase_Letter}]\n" + + "error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:11:41: unicode property escapes not allowed in lexer charset range: [\\p{Letter}-Z]\n" + + "error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:12:41: unicode property escapes not allowed in lexer charset range: [A-\\p{Number}]\n" + + "error(" + ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE.code + "): Test.g4:13:48: unicode property escapes not allowed in lexer charset range: [\\P{Uppercase_Letter}-\\P{Number}]\n"; + + String[] pair = new String[] { + grammar, + expected + }; + + super.testErrors(pair, true); + } + /** * This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning * is produced as described in the documentation. 
diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java index 5c1f9d78e1c..fb3279a19f4 100644 --- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java +++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java @@ -10,6 +10,7 @@ import org.antlr.runtime.Token; import org.antlr.v4.codegen.CodeGenerator; import org.antlr.v4.misc.CharSupport; +import org.antlr.v4.misc.EscapeSequenceParsing; import org.antlr.v4.parse.ANTLRParser; import org.antlr.v4.runtime.IntStream; import org.antlr.v4.runtime.Lexer; @@ -49,6 +50,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; public class LexerATNFactory extends ParserATNFactory { @@ -365,7 +367,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) { return new Handle(left, right); } - /** [Aa\t \u1234a-z\]\-] char sets */ + /** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */ @Override public Handle charSetLiteral(GrammarAST charSetAST) { ATNState left = newState(charSetAST); @@ -376,10 +378,68 @@ public Handle charSetLiteral(GrammarAST charSetAST) { return new Handle(left, right); } + private static class CharSetParseState { + enum Mode { + NONE, + ERROR, + PREV_CODE_POINT, + PREV_PROPERTY + } + + public static final CharSetParseState NONE = new CharSetParseState(Mode.NONE, false, -1, IntervalSet.EMPTY_SET); + public static final CharSetParseState ERROR = new CharSetParseState(Mode.ERROR, false, -1, IntervalSet.EMPTY_SET); + + public final Mode mode; + public final boolean inRange; + public final int prevCodePoint; + public final IntervalSet prevProperty; + + public CharSetParseState( + Mode mode, + boolean inRange, + int prevCodePoint, + IntervalSet prevProperty) { + this.mode = mode; + this.inRange = inRange; + this.prevCodePoint = prevCodePoint; + this.prevProperty = prevProperty; + } + + @Override + public String toString() { + return String.format( + "%s mode=%s inRange=%s 
prevCodePoint=%d prevProperty=%s", + super.toString(), + mode, + inRange, + prevCodePoint, + prevProperty); + } + + @Override + public boolean equals(Object other) { + if (!(other instanceof CharSetParseState)) { + return false; + } + CharSetParseState that = (CharSetParseState) other; + if (this == that) { + return true; + } + return Objects.equals(this.mode, that.mode) && + Objects.equals(this.inRange, that.inRange) && + Objects.equals(this.prevCodePoint, that.prevCodePoint) && + Objects.equals(this.prevProperty, that.prevProperty); + } + + @Override + public int hashCode() { + return Objects.hash(mode, inRange, prevCodePoint, prevProperty); + } + } + public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) { String chars = charSetAST.getText(); chars = chars.substring(1, chars.length() - 1); - String cset = '"' + chars + '"'; IntervalSet set = new IntervalSet(); if (chars.length() == 0) { @@ -387,46 +447,137 @@ public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) { g.fileName, charSetAST.getToken(), "[]"); return set; } - // unescape all valid escape char like \n, leaving escaped dashes as '\-' - // so we can avoid seeing them as '-' range ops. 
- chars = CharSupport.getStringFromGrammarStringLiteral(cset); - if (chars == null) { - g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, - g.fileName, charSetAST.getToken()); - return set; - } + + CharSetParseState state = CharSetParseState.NONE; + int n = chars.length(); - // now make x-y become set of char for (int i = 0; i < n; ) { + if (state.mode == CharSetParseState.Mode.ERROR) { + return new IntervalSet(); + } int c = chars.codePointAt(i); int offset = Character.charCount(c); - if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \- - checkSetCollision(charSetAST, set, '-'); - set.add('-'); - offset++; - } - else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y - int x = c; - int y = chars.codePointAt(i+offset+1); - if (x <= y) { - checkSetCollision(charSetAST, set, x, y); - set.add(x,y); + if (c == '\\') { + EscapeSequenceParsing.Result escapeParseResult = + EscapeSequenceParsing.parseEscape(chars, i); + switch (escapeParseResult.type) { + case INVALID: + g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, + g.fileName, charSetAST.getToken(), charSetAST.getText()); + state = CharSetParseState.ERROR; + break; + case CODE_POINT: + state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, escapeParseResult.codePoint); + break; + case PROPERTY: + state = applyPrevStateAndMoveToProperty(charSetAST, set, state, escapeParseResult.propertyIntervalSet); + break; } - else { - g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, - g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED)); + offset = escapeParseResult.parseLength; + } else if (c == '-' && !state.inRange) { + if (state.mode == CharSetParseState.Mode.PREV_PROPERTY) { + g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE, + g.fileName, charSetAST.getToken(), charSetAST.getText()); + state = CharSetParseState.ERROR; + } else { + state = new 
CharSetParseState(state.mode, true, state.prevCodePoint, state.prevProperty); } - offset += Character.charCount(y) + 1; - } - else { - checkSetCollision(charSetAST, set, c); - set.add(c); + } else { + state = applyPrevStateAndMoveToCodePoint(charSetAST, set, state, c); } i += offset; } + if (state.mode == CharSetParseState.Mode.ERROR) { + return new IntervalSet(); + } + // Whether or not we were in a range, we'll add the last code point found to the set. + // If the range wasn't terminated, we'll treat it as a standalone codepoint. + applyPrevState(charSetAST, set, state); + if (state.inRange) { + // Unterminated range; add a literal hyphen to the set. + checkSetCollision(charSetAST, set, '-'); + set.add('-'); + } return set; } + private CharSetParseState applyPrevStateAndMoveToCodePoint( + GrammarAST charSetAST, + IntervalSet set, + CharSetParseState state, + int codePoint) { + if (state.inRange) { + if (state.prevCodePoint > codePoint) { + g.tool.errMgr.grammarError( + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, + g.fileName, + charSetAST.getToken(), + CharSupport.toRange(state.prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED)); + } + checkSetCollision(charSetAST, set, state.prevCodePoint, codePoint); + set.add(state.prevCodePoint, codePoint); + state = CharSetParseState.NONE; + } else { + applyPrevState(charSetAST, set, state); + state = new CharSetParseState( + CharSetParseState.Mode.PREV_CODE_POINT, + false, + codePoint, + IntervalSet.EMPTY_SET); + } + return state; + } + + private CharSetParseState applyPrevStateAndMoveToProperty( + GrammarAST charSetAST, + IntervalSet set, + CharSetParseState state, + IntervalSet property) { + if (state.inRange) { + g.tool.errMgr.grammarError(ErrorType.UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE, + g.fileName, charSetAST.getToken(), charSetAST.getText()); + return CharSetParseState.ERROR; + } else { + applyPrevState(charSetAST, set, state); + state = new CharSetParseState( + CharSetParseState.Mode.PREV_PROPERTY, + 
false, + -1, + property); + } + return state; + } + + private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParseState state) { + switch (state.mode) { + case NONE: + case ERROR: + break; + case PREV_CODE_POINT: + checkSetCollision(charSetAST, set, state.prevCodePoint); + set.add(state.prevCodePoint); + break; + case PREV_PROPERTY: + set.addAll(state.prevProperty); + break; + } + } + + private static int getSingleElement(IntervalSet set) { + // We don't use IntervalSet.getSingleElement() because it can't handle sets containing 0. + // See https://github.com/antlr/antlr4/issues/1703 . + if (set.size() != 1) { + return -1; + } else { + Interval interval = set.getIntervals().get(0); + if (interval.length() == 1) { + return interval.a; + } else { + return -1; + } + } + } + protected void checkSetCollision(GrammarAST ast, IntervalSet set, int el) { if (set.contains(el)) { g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(), diff --git a/tool/src/org/antlr/v4/tool/ErrorType.java b/tool/src/org/antlr/v4/tool/ErrorType.java index aca32da364e..c9ac268be70 100644 --- a/tool/src/org/antlr/v4/tool/ErrorType.java +++ b/tool/src/org/antlr/v4/tool/ErrorType.java @@ -152,11 +152,11 @@ public enum ErrorType { */ NO_MODEL_TO_TEMPLATE_MAPPING(34, "no mapping to template name for output model class ", ErrorSeverity.ERROR), /** - * Compiler Error 35. - * - *

<p>templates/target and tool aren't compatible</p>

- */ - INCOMPATIBLE_TOOL_AND_TEMPLATES(35, " code generation target requires ANTLR ; it can't be loaded by the current ANTLR ", ErrorSeverity.ERROR), + * Compiler Error 35. + * + *

<p>templates/target and tool aren't compatible</p>

+ */ + INCOMPATIBLE_TOOL_AND_TEMPLATES(35, " code generation target requires ANTLR ; it can't be loaded by the current ANTLR ", ErrorSeverity.ERROR), /* * Grammar errors @@ -1060,6 +1060,20 @@ public enum ErrorType { */ TOKEN_RANGE_IN_PARSER(181, "token ranges not allowed in parser: ..", ErrorSeverity.ERROR), + /** + * Compiler Error 182. + * + *

<p>Unicode properties cannot be part of a lexer charset range</p>

+ * + * <pre>
+	 * A: [\\p{Letter}-\\p{Number}];
+	 * </pre>
+ */ + UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE( + 182, + "unicode property escapes not allowed in lexer charset range: <arg>", + ErrorSeverity.ERROR), + /* * Backward incompatibility errors */