Implement new extended Unicode escape \u{10ABCD}. Bump UUID. Add lots…

… more tests.
antlr · Feb 21, 2017 · 01f5374 · 01f5374
1 parent 91df265
commit 01f5374
Show file tree

Hide file tree

Showing 25 changed files with 1,361 additions and 209 deletions.
diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/descriptors/SetsDescriptors.java
@@ -402,4 +402,282 @@ public static class StarSet extends BaseParserTestDescriptor {
 		public String grammar;
 
 	}
+
+	public static class UnicodeUnescapedBMPSet extends BaseParserTestDescriptor {
+		public String input = "a\u00E4\u3042\u4E9Cc";
+		public String output = "a\u00E4\u3042\u4E9Cc\n";
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// LETTERS is a set of raw BMP characters; the expected output is the input echoed plus a newline.
+		/**
+		 grammar T;
+		 a : LETTERS {<InputText():writeln()>} ;
+		 // These are actually not escaped -- Java passes the
+		 // raw unescaped Unicode values to the grammar compiler.
+		 LETTERS : ('a'|'\u00E4'|'\u4E9C'|'\u3042')* 'c';
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	public static class UnicodeUnescapedBMPRangeSet extends BaseParserTestDescriptor {
+		public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
+		public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// LETTERS is 'a' or the raw BMP range U+00E0..U+00E5; every input letter before 'd' falls in it.
+		/**
+		 grammar T;
+		 a : LETTERS* 'd' {<InputText():writeln()>} ;
+		 // These are actually not escaped -- Java passes the
+		 // raw unescaped Unicode values to the grammar compiler.
+		 LETTERS : ('a'|'\u00E0'..'\u00E5');
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	public static class UnicodeEscapedBMPSet extends BaseParserTestDescriptor {
+		public String input = "a\u00E4\u3042\u4E9Cc";
+		public String output = "a\u00E4\u3042\u4E9Cc\n";
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// LETTERS lists BMP characters as four-hex-digit escapes kept escaped in the grammar text; output echoes the input.
+		/**
+		 grammar T;
+		 a : LETTERS {<InputText():writeln()>} ;
+		 // Note the double-backslash to avoid Java passing
+		 // unescaped values as part of the grammar.
+		 LETTERS : ('a'|'\\u00E4'|'\\u4E9C'|'\\u3042')* 'c';
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	public static class UnicodeEscapedBMPRangeSet extends BaseParserTestDescriptor {
+		public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
+		public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// LETTERS is 'a' or the range U+00E0..U+00E5 written as four-hex-digit escapes; output echoes the input.
+		/**
+		 grammar T;
+		 a : LETTERS* 'd' {<InputText():writeln()>} ;
+		 // Note the double-backslash to avoid Java passing
+		 // unescaped values as part of the grammar.
+		 LETTERS : ('a'|'\\u00E0'..'\\u00E5');
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	// TODO(bhamiltoncx): This needs to be an error: the ANTLR 3 runtime
+	// used by the tool doesn't really understand unescaped code points
+	// above U+FFFF.
+	// public static class UnicodeUnescapedSMPSet extends BaseParserTestDescriptor {
+	//	public String input = new StringBuilder()
+	//			.append("a")
+	//			.appendCodePoint(0x1D5C2)
+	//			.appendCodePoint(0x1D5CE)
+	//			.appendCodePoint(0x1D5BA)
+	//			.append("c")
+	//			.toString();
+	//	public String output = new StringBuilder()
+	//			.append("a")
+	//			.appendCodePoint(0x1D5C2)
+	//			.appendCodePoint(0x1D5CE)
+	//			.appendCodePoint(0x1D5BA)
+	//			.append("c\n")
+	//			.toString();
+	//	public String errors = null;
+	//	public String startRule = "a";
+	//	public String grammarName = "T";
+
+	//	/**
+	//	 grammar T;
+	//	 a : LETTERS  {<InputText():writeln()>} ;
+	//	 // These are actually not escaped -- Java passes the
+	//	 // raw unescaped Unicode values to the grammar compiler.
+	//	 //
+	//	 // Each sequence is the UTF-16 encoding of a raw Unicode
+	//	 // SMP code point.
+	//	 LETTERS : ('a'|'\uD835\uDDBA'|'\uD835\uDDBE'|'\uD835\uDDC2'|'\uD835\uDDC8'|'\uD835\uDDCE')* 'c';
+	//	 */
+	//	@CommentHasStringValue
+	//	public String grammar;
+
+	// }
+
+	public static class UnicodeEscapedSMPSet extends BaseParserTestDescriptor {
+		public String input = new StringBuilder()
+				.append("a")
+				.appendCodePoint(0x1D5C2)
+				.appendCodePoint(0x1D5CE)
+				.appendCodePoint(0x1D5BA)
+				.append("c")
+				.toString();
+		public String output = new StringBuilder()
+				.append("a")
+				.appendCodePoint(0x1D5C2)
+				.appendCodePoint(0x1D5CE)
+				.appendCodePoint(0x1D5BA)
+				.append("c\n")
+				.toString();
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// LETTERS lists code points above U+FFFF with the extended brace escape; the input uses three of the five listed.
+		/**
+		 grammar T;
+		 a : LETTERS  {<InputText():writeln()>} ;
+		 // Note the double-backslash to avoid Java passing
+		 // unescaped values as part of the grammar.
+		 LETTERS : ('a'|'\\u{1D5BA}'|'\\u{1D5BE}'|'\\u{1D5C2}'|'\\u{1D5C8}'|'\\u{1D5CE}')* 'c';
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	// Tool.java uses ANTLR 3's runtime, which means it can't use
+	// CodePointCharStream to understand unescaped code points > U+FFFF.
+	// TODO(bhamiltoncx): This needs to be an error, since we don't currently plan
+	// to port Tool.java to use ANTLR 4's runtime.
+	// NOTE(review): the disabled input below (U+1D5C2, U+1D5CE, U+1D5BA) is outside its range U+1F600..U+1F943.
+
+	// public static class UnicodeUnescapedSMPRangeSet extends BaseParserTestDescriptor {
+	//	public String input = new StringBuilder()
+	//			.append("a")
+	//			.appendCodePoint(0x1D5C2)
+	//			.appendCodePoint(0x1D5CE)
+	//			.appendCodePoint(0x1D5BA)
+	//			.append("d")
+	//			.toString();
+	//	public String output = new StringBuilder()
+	//			.append("a")
+	//			.appendCodePoint(0x1D5C2)
+	//			.appendCodePoint(0x1D5CE)
+	//			.appendCodePoint(0x1D5BA)
+	//			.append("d\n")
+	//			.toString();
+	//	public String errors = null;
+	//	public String startRule = "a";
+	//	public String grammarName = "T";
+
+	//	/**
+	//	 grammar T;
+	//	 a : LETTERS* 'd' {<InputText():writeln()>} ;
+	//	 // These are actually not escaped -- Java passes the
+	//	 // raw unescaped Unicode values to the grammar compiler.
+	//	 LETTERS : ('a'|'\uD83D\uDE00'..'\uD83E\uDD43');
+	//	 */
+	//	@CommentHasStringValue
+	//	public String grammar;
+
+	// }
+
+	public static class UnicodeEscapedSMPRangeSet extends BaseParserTestDescriptor {
+		public String input = new StringBuilder()
+				.append("a")
+				.appendCodePoint(0x1F609)
+				.appendCodePoint(0x1F942)
+				.appendCodePoint(0x1F700)
+				.append("d")
+				.toString();
+		public String output = new StringBuilder()
+				.append("a")
+				.appendCodePoint(0x1F609)
+				.appendCodePoint(0x1F942)
+				.appendCodePoint(0x1F700)
+				.append("d\n")
+				.toString();
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// LETTERS is 'a' or the range U+1F600..U+1F943 in brace escapes; all three input code points lie inside it.
+		/**
+		 grammar T;
+		 a : LETTERS* 'd' {<InputText():writeln()>} ;
+		 // Note the double-backslash to avoid Java passing
+		 // unescaped values as part of the grammar.
+		 LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	public static class UnicodeEscapedSMPRangeSetMismatch extends BaseParserTestDescriptor {
+		// Test the code points just before and just after the range.
+		public String input = new StringBuilder()
+				.append("a")
+				.appendCodePoint(0x1F5FF)
+				.appendCodePoint(0x1F944)
+				.append("d")
+				.toString();
+		public String output = "ad\n";
+		public String errors = new StringBuilder()
+				.append("line 1:1 token recognition error at: '")
+				.appendCodePoint(0x1F5FF)
+				.append("'\n")
+				.append("line 1:2 token recognition error at: '")
+				.appendCodePoint(0x1F944)
+				.append("'\n")
+				.toString();
+		public String startRule = "a";
+		public String grammarName = "T";
+		// One recognition error is expected per out-of-range code point (at columns 1 and 2); only "ad" is output.
+		/**
+		 grammar T;
+		 a : LETTERS* 'd' {<InputText():writeln()>} ;
+		 // Note the double-backslash to avoid Java passing
+		 // unescaped values as part of the grammar.
+		 LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	public static class UnicodeNegatedBMPSetIncludesSMPCodePoints extends BaseParserTestDescriptor {
+		public String input = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c";
+		public String output = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c\n";
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// The negated set ~('b') must also accept code points above U+FFFF (the surrogate pairs in the input).
+		/**
+		 grammar T;
+		 a : LETTERS {<InputText():writeln()>} ;
+		 LETTERS : 'a' ~('b')+ 'c';
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
+
+	public static class UnicodeNegatedSMPSetIncludesBMPCodePoints extends BaseParserTestDescriptor {
+		public String input = "abc";
+		public String output = "abc\n";
+		public String errors = null;
+		public String startRule = "a";
+		public String grammarName = "T";
+		// Negating the range U+1F600..U+1F943 must still accept the BMP character 'b' between 'a' and 'c'.
+		/**
+		 grammar T;
+		 a : LETTERS {<InputText():writeln()>} ;
+		 LETTERS : 'a' ~('\\u{1F600}'..'\\u{1F943}')+ 'c';
+		 */
+		@CommentHasStringValue
+		public String grammar;
+
+	}
 }
diff --git a/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs b/runtime/CSharp/runtime/CSharp/Antlr4.Runtime/Atn/ATNDeserializer.cs
@@ -22,6 +22,18 @@ public class ATNDeserializer
         /// <remarks>This is the earliest supported serialized UUID.</remarks>
         private static readonly Guid BaseSerializedUuid;
 
+        /// <summary>
+        /// UUID identifying serialized ATNs that carry a second group of
+        /// IntervalSets whose values are read as 32-bit integers
+        /// (see the ReadSets calls in Deserialize).
+        /// </summary>
+        /// <remarks>
+        /// This UUID indicates the serialized ATN contains two sets of
+        /// IntervalSets, where the second set's values are encoded as
+        /// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
+        /// </remarks>
+        private static readonly Guid AddedUnicodeSmp;
+
         /// <summary>
         /// This list contains all of the currently supported UUIDs, ordered by when
         /// the feature first appeared in this branch.
@@ -39,14 +51,18 @@ public class ATNDeserializer
         static ATNDeserializer()
         {
 			BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
+			AddedUnicodeSmp = new Guid("59627784-3BE5-417A-B9EB-8131A7286089");
             SupportedUuids = new List<Guid>();
             SupportedUuids.Add(BaseSerializedUuid);
-			SerializedUuid = BaseSerializedUuid;
+            SupportedUuids.Add(AddedUnicodeSmp); // appended last: the list is ordered by when each feature first appeared
+			SerializedUuid = AddedUnicodeSmp; // CheckUUID reports this value as the expected UUID in its error message
         }
 
         [NotNull]
         private readonly ATNDeserializationOptions deserializationOptions;
 
+        private Guid uuid; // assigned from ReadUUID() in CheckUUID(); Deserialize() tests it for AddedUnicodeSmp support
+
         public ATNDeserializer()
             : this(ATNDeserializationOptions.Default)
         {
@@ -115,7 +131,11 @@ public virtual ATN Deserialize(char[] data)
 			ReadStates (atn);
 			ReadRules (atn);
 			ReadModes (atn);
-			IList<IntervalSet> sets = ReadSets (atn);
+			IList<IntervalSet> sets = new List<IntervalSet>();
+			ReadSets (atn, sets, this.ReadInt);
+			if (IsFeatureSupported(AddedUnicodeSmp, uuid)) {
+				ReadSets (atn, sets, this.ReadInt32);
+			}
 			ReadEdges (atn, sets);
 			ReadDecisions (atn);
 			ReadLexerActions (atn);
@@ -378,12 +398,11 @@ protected internal virtual void ReadEdges(ATN atn, IList<IntervalSet> sets)
 			}
 		}
 
-		protected internal virtual IList<IntervalSet> ReadSets(ATN atn)
+		protected internal virtual void ReadSets(ATN atn, IList<IntervalSet> sets, Func<int> readUnicode)
 		{
 			//
 			// SETS
 			//
-			IList<IntervalSet> sets = new List<IntervalSet>();
 			int nsets = ReadInt();
 			for (int i_8 = 0; i_8 < nsets; i_8++)
 			{
@@ -397,10 +416,9 @@ protected internal virtual IList<IntervalSet> ReadSets(ATN atn)
 				}
 				for (int j = 0; j < nintervals; j++)
 				{
-					set.Add(ReadInt(), ReadInt());
+					set.Add(readUnicode(), readUnicode());
 				}
 			}
-			return sets;
 		}
 
 		protected internal virtual void ReadModes(ATN atn)
@@ -530,7 +548,7 @@ protected internal virtual ATN ReadATN()
 
 		protected internal virtual void CheckUUID()
 		{
-			Guid uuid = ReadUUID();
+			uuid = ReadUUID();
 			if (!SupportedUuids.Contains(uuid))
 			{
 				string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid);