Skip to content

Commit

Permalink
Implement new extended Unicode escape \u{10ABCD}. Bump UUID. Add lots…
Browse files Browse the repository at this point in the history
… more tests.
  • Loading branch information
bhamiltoncx committed Feb 21, 2017
1 parent 91df265 commit 01f5374
Show file tree
Hide file tree
Showing 25 changed files with 1,361 additions and 209 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -402,4 +402,282 @@ public static class StarSet extends BaseParserTestDescriptor {
public String grammar;

}

// Test descriptor: a lexer set made of individual BMP characters that, per the
// note inside the grammar text, reach the grammar compiler as raw characters
// because Java has already processed the Unicode escapes.
// The expected output is the input followed by a newline; errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeUnescapedBMPSet extends BaseParserTestDescriptor {
public String input = "a\u00E4\u3042\u4E9Cc";
public String output = "a\u00E4\u3042\u4E9Cc\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
// These are actually not escaped -- Java passes the
// raw unescaped Unicode values to the grammar compiler.
LETTERS : ('a'|'\u00E4'|'\u4E9C'|'\u3042')* 'c';
*/
@CommentHasStringValue
public String grammar;

}

// Test descriptor: a lexer rule matching 'a' or a character range U+00E0..U+00E5
// whose endpoints, per the note inside the grammar text, reach the grammar
// compiler as raw characters. The parser rule accepts any number of LETTERS
// tokens followed by 'd'. Every non-ASCII character in the input lies inside
// the range; the expected output is the input followed by a newline and
// errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeUnescapedBMPRangeSet extends BaseParserTestDescriptor {
public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// These are actually not escaped -- Java passes the
// raw unescaped Unicode values to the grammar compiler.
LETTERS : ('a'|'\u00E0'..'\u00E5');
*/
@CommentHasStringValue
public String grammar;

}

// Test descriptor: a lexer set made of individual BMP characters written as
// four-hex-digit Unicode escapes that are meant to survive into the grammar
// (the grammar text doubles the backslash so Java leaves them alone, as its
// own note explains). The input uses the same characters as the set; the
// expected output is the input followed by a newline and errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeEscapedBMPSet extends BaseParserTestDescriptor {
public String input = "a\u00E4\u3042\u4E9Cc";
public String output = "a\u00E4\u3042\u4E9Cc\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u00E4'|'\\u4E9C'|'\\u3042')* 'c';
*/
@CommentHasStringValue
public String grammar;

}

// Test descriptor: a lexer rule matching 'a' or the range U+00E0..U+00E5, with
// the range endpoints written as four-hex-digit Unicode escapes that are meant
// to survive into the grammar (the grammar text doubles the backslash, as its
// own note explains). The parser rule accepts any number of LETTERS tokens
// followed by 'd'. The expected output is the input followed by a newline and
// errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeEscapedBMPRangeSet extends BaseParserTestDescriptor {
public String input = "a\u00E1\u00E4\u00E1\u00E2\u00E5d";
public String output = "a\u00E1\u00E4\u00E1\u00E2\u00E5d\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u00E0'..'\\u00E5');
*/
@CommentHasStringValue
public String grammar;

}

// TODO(bhamiltoncx): This needs to be an error, because the V3
// runtime used by the tool doesn't really understand unescaped code points >
// U+FFFF.
// public static class UnicodeUnescapedSMPSet extends BaseParserTestDescriptor {
// public String input = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("c")
// .toString();
// public String output = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("c\n")
// .toString();
// public String errors = null;
// public String startRule = "a";
// public String grammarName = "T";

// /**
// grammar T;
// a : LETTERS {<InputText():writeln()>} ;
// // These are actually not escaped -- Java passes the
// // raw unescaped Unicode values to the grammar compiler.
// //
// // Each sequence is the UTF-16 encoding of a raw Unicode
// // SMP code point.
// LETTERS : ('a'|'\uD835\uDDBA'|'\uD835\uDDBE'|'\uD835\uDDC2'|'\uD835\uDDC8'|'\uD835\uDDCE')* 'c';
// */
// @CommentHasStringValue
// public String grammar;

// }

// Test descriptor: a lexer set whose members above U+FFFF are written with the
// braced extended escape form. The input is 'a', three code points taken from
// that set, then 'c'; the expected output is the input followed by a newline
// and errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeEscapedSMPSet extends BaseParserTestDescriptor {
// Built from code points so the supplementary characters need no surrogate pairs here.
public String input =
    new String(new int[] { 'a', 0x1D5C2, 0x1D5CE, 0x1D5BA, 'c' }, 0, 5);
// Initialized after input, so this is exactly the input text plus a newline.
public String output = input + "\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u{1D5BA}'|'\\u{1D5BE}'|'\\u{1D5C2}'|'\\u{1D5C8}'|'\\u{1D5CE}')* 'c';
*/
@CommentHasStringValue
public String grammar;

}

// Turns out Tool.java uses ANTLR 3's runtime, which means it can't use
// CodePointCharStream to understand unescaped code points > U+FFFF.
//
// TODO(bhamiltoncx): This needs to be an error, since we don't currently plan
// to port Tool.java to use ANTLR 4's runtime.

// public static class UnicodeUnescapedSMPRangeSet extends BaseParserTestDescriptor {
// public String input = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("d")
// .toString();
// public String output = new StringBuilder()
// .append("a")
// .appendCodePoint(0x1D5C2)
// .appendCodePoint(0x1D5CE)
// .appendCodePoint(0x1D5BA)
// .append("d\n")
// .toString();
// public String errors = null;
// public String startRule = "a";
// public String grammarName = "T";

// /**
// grammar T;
// a : LETTERS* 'd' {<InputText():writeln()>} ;
// // These are actually not escaped -- Java passes the
// // raw unescaped Unicode values to the grammar compiler.
// LETTERS : ('a'|'\uD83D\uDE00'..'\uD83E\uDD43');
// */
// @CommentHasStringValue
// public String grammar;

// }

// Test descriptor: a lexer rule matching 'a' or the range U+1F600..U+1F943,
// with the endpoints written in the braced extended escape form. The input is
// 'a', three code points inside that range, then 'd'; the expected output is
// the input followed by a newline and errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeEscapedSMPRangeSet extends BaseParserTestDescriptor {
// Built from code points so the supplementary characters need no surrogate pairs here.
public String input =
    new String(new int[] { 'a', 0x1F609, 0x1F942, 0x1F700, 'd' }, 0, 5);
// Initialized after input, so this is exactly the input text plus a newline.
public String output = input + "\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
*/
@CommentHasStringValue
public String grammar;

}

// Test descriptor: the same U+1F600..U+1F943 range rule as the matching
// range-set test, but fed the two code points immediately outside the range
// (U+1F5FF just below, U+1F944 just above). Each is expected to produce a
// token recognition error, leaving only "ad" in the output.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeEscapedSMPRangeSetMismatch extends BaseParserTestDescriptor {
// 'a', the code point one below the range, the code point one above it, 'd'.
public String input =
    new String(new int[] { 'a', 0x1F5FF, 0x1F944, 'd' }, 0, 4);
public String output = "ad\n";
// One error line per out-of-range code point, in input order.
public String errors =
    "line 1:1 token recognition error at: '" + new String(Character.toChars(0x1F5FF)) + "'\n"
    + "line 1:2 token recognition error at: '" + new String(Character.toChars(0x1F944)) + "'\n";
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS* 'd' {<InputText():writeln()>} ;
// Note the double-backslash to avoid Java passing
// unescaped values as part of the grammar.
LETTERS : ('a'|'\\u{1F600}'..'\\u{1F943}');
*/
@CommentHasStringValue
public String grammar;

}

// Test descriptor: a negated set that excludes only the BMP character 'b' is
// expected to accept characters outside the BMP. The input places four such
// characters (written as UTF-16 surrogate pairs) between 'a' and 'c'; the
// expected output is the input followed by a newline and errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeNegatedBMPSetIncludesSMPCodePoints extends BaseParserTestDescriptor {
public String input = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c";
public String output = "a\uD83D\uDE33\uD83D\uDE21\uD83D\uDE1D\uD83E\uDD13c\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
LETTERS : 'a' ~('b')+ 'c';
*/
@CommentHasStringValue
public String grammar;

}

// Test descriptor: a negated set that excludes only the range
// U+1F600..U+1F943 is expected to accept ordinary BMP characters. The input is
// the plain ASCII text "abc"; the expected output is the input followed by a
// newline and errors is null.
//
// NOTE(review): the block comment on the grammar field carries
// @CommentHasStringValue, so it is presumably consumed as the grammar text
// itself -- treat it as data and do not reword it.
public static class UnicodeNegatedSMPSetIncludesBMPCodePoints extends BaseParserTestDescriptor {
public String input = "abc";
public String output = "abc\n";
public String errors = null;
public String startRule = "a";
public String grammarName = "T";

/**
grammar T;
a : LETTERS {<InputText():writeln()>} ;
LETTERS : 'a' ~('\\u{1F600}'..'\\u{1F943}')+ 'c';
*/
@CommentHasStringValue
public String grammar;

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@ public class ATNDeserializer
/// <remarks>This is the earliest supported serialized UUID.</remarks>
private static readonly Guid BaseSerializedUuid;

/// <summary>
/// This UUID indicates the serialized ATN contains two sets of
/// IntervalSets, where the second set's values are encoded as
/// 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
/// </summary>
/// <remarks>
/// When the UUID read from the serialized data supports this feature,
/// <c>Deserialize</c> makes a second <c>ReadSets</c> pass whose interval
/// bounds are read with <c>ReadInt32</c> instead of <c>ReadInt</c>.
/// </remarks>
private static readonly Guid AddedUnicodeSmp;

/// <summary>
/// This list contains all of the currently supported UUIDs, ordered by when
/// the feature first appeared in this branch.
Expand All @@ -39,14 +51,18 @@ public class ATNDeserializer
/// <summary>
/// Initializes the serialization UUID constants and the list of UUIDs this
/// deserializer accepts.
/// </summary>
static ATNDeserializer()
{
    BaseSerializedUuid = new Guid("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
    AddedUnicodeSmp = new Guid("59627784-3BE5-417A-B9EB-8131A7286089");

    // Keep the entries in the order the features were introduced (oldest first).
    SupportedUuids = new List<Guid> { BaseSerializedUuid, AddedUnicodeSmp };

    // Assigned exactly once: this is the UUID CheckUUID reports as the expected one.
    SerializedUuid = AddedUnicodeSmp;
}

[NotNull]
private readonly ATNDeserializationOptions deserializationOptions;

/// <summary>
/// UUID read from the serialized data by <c>CheckUUID</c>. <c>Deserialize</c>
/// passes it to <c>IsFeatureSupported</c> to decide whether the second,
/// 32-bit group of sets must be read.
/// </summary>
private Guid uuid;

public ATNDeserializer()
: this(ATNDeserializationOptions.Default)
{
Expand Down Expand Up @@ -115,7 +131,11 @@ public virtual ATN Deserialize(char[] data)
ReadStates (atn);
ReadRules (atn);
ReadModes (atn);
IList<IntervalSet> sets = ReadSets (atn);
IList<IntervalSet> sets = new List<IntervalSet>();
ReadSets (atn, sets, this.ReadInt);
if (IsFeatureSupported(AddedUnicodeSmp, uuid)) {
ReadSets (atn, sets, this.ReadInt32);
}
ReadEdges (atn, sets);
ReadDecisions (atn);
ReadLexerActions (atn);
Expand Down Expand Up @@ -378,12 +398,11 @@ protected internal virtual void ReadEdges(ATN atn, IList<IntervalSet> sets)
}
}

protected internal virtual IList<IntervalSet> ReadSets(ATN atn)
protected internal virtual void ReadSets(ATN atn, IList<IntervalSet> sets, Func<int> readUnicode)
{
//
// SETS
//
IList<IntervalSet> sets = new List<IntervalSet>();
int nsets = ReadInt();
for (int i_8 = 0; i_8 < nsets; i_8++)
{
Expand All @@ -397,10 +416,9 @@ protected internal virtual IList<IntervalSet> ReadSets(ATN atn)
}
for (int j = 0; j < nintervals; j++)
{
set.Add(ReadInt(), ReadInt());
set.Add(readUnicode(), readUnicode());
}
}
return sets;
}

protected internal virtual void ReadModes(ATN atn)
Expand Down Expand Up @@ -530,7 +548,7 @@ protected internal virtual ATN ReadATN()

protected internal virtual void CheckUUID()
{
Guid uuid = ReadUUID();
uuid = ReadUUID();
if (!SupportedUuids.Contains(uuid))
{
string reason = string.Format(CultureInfo.CurrentCulture, "Could not deserialize ATN with UUID {0} (expected {1} or a legacy UUID).", uuid, SerializedUuid);
Expand Down
Loading

0 comments on commit 01f5374

Please sign in to comment.