Skip to content

Commit

Permalink
Merge pull request #1739 from bhamiltoncx/unicode-enum-property
Browse files Browse the repository at this point in the history
Also support Unicode enumerated properties via \p{Foo=Bar} syntax
  • Loading branch information
parrt authored Mar 8, 2017
2 parents b8c74be + 6007132 commit e8657fa
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 17 deletions.
8 changes: 4 additions & 4 deletions doc/lexer-rules.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ Match that character or sequence of characters. E.g., ’while’ or ’=’.</t
<td>[char set]</td><td>
<p>Match one of the characters specified in the character set. Interpret <tt>x-y</tt> as the set of characters between range <tt>x</tt> and <tt>y</tt>, inclusively. The following escaped characters are interpreted as single special characters: <tt>\n</tt>, <tt>\r</tt>, <tt>\b</tt>, <tt>\t</tt>, <tt>\f</tt>, <tt>\uXXXX</tt>, and <tt>\u{XXXXXX}</tt>. To get <tt>]</tt>, <tt>\</tt>, or <tt>-</tt> you must escape them with <tt>\</tt>.</p>

<p>You can also include all characters matching Unicode properties (general category, boolean, script, or block) with <tt>\p{PropertyName}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt>).</p>
<p>You can also include all characters matching Unicode properties (general category, boolean, or enumerated including scripts and blocks) with <tt>\p{PropertyName}</tt> or <tt>\p{EnumProperty=Value}</tt>. (You can invert the test with <tt>\P{PropertyName}</tt> or <tt>\P{EnumProperty=Value}</tt>).</p>

<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, and <tt>\p{Symbol}</tt>.)</p>
<p>For a list of valid Unicode property names, see <a href="http://unicode.org/reports/tr44/#Properties">Unicode Standard Annex #44</a>. (ANTLR also supports <a href="http://unicode.org/reports/tr44/#General_Category_Values">short and long Unicode general category names and values</a> like <tt>\p{Lu}</tt>, <tt>\p{Z}</tt>, <tt>\p{Symbol}</tt>, <tt>\p{Blk=Latin_1_Sup}</tt>, and <tt>\p{Block=Latin_1_Supplement}</tt>.)</p>

<p>Property names include <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> (they overlap with script names) and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>
<p>As a shortcut for <tt>\p{Block=Latin_1_Supplement}</tt>, you can refer to blocks using <a href="http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt">Unicode block names</a> prefixed with <tt>In</tt> and with spaces changed to <tt>_</tt>. For example: <tt>\p{InLatin_1_Supplement}</tt>, <tt>\p{InYijing_Hexagram_Symbols}</tt>, and <tt>\p{InAncient_Greek_Numbers}</tt>.</p>

<p>Property names are <b>case-insensitive</b>, and <tt>_</tt> and <tt>-</tt> are treated identically.</p>

Expand All @@ -77,7 +77,7 @@ UNICODE_WS : [\p{White_Space}] -> skip; // match all Unicode whitespace

ID : [a-zA-Z] [a-zA-Z0-9]* ; // match usual identifier spec

UNICODE_ID : [\p{Alpha}] [\p{Alnum}]* ; // match full Unicode alphabetic ids
UNICODE_ID : [\p{Alpha}\p{General_Category=Other_Letter}] [\p{Alnum}\p{General_Category=Other_Letter}]* ; // match full Unicode alphabetic ids

EMOJI : [\u{1F4A9}\u{1F926}] ; // note Unicode code points > U+FFFF

Expand Down
4 changes: 2 additions & 2 deletions tool-codegen/src/main/string-template/unicodedata.st
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public abstract class UnicodeData {
// initialization into one method per Unicode property

<propertyCodePointRanges.keys:{ k | // Unicode code points with property "<k>"
static private void addProperty<k>() {
static private void addProperty<i>() {
List\<Interval\> intervals = Arrays.asList(
<propertyCodePointRanges.(k).intervals:{ interval | Interval.of(<interval.a>, <interval.b>)}; separator=",\n">
);
Expand All @@ -37,7 +37,7 @@ static private void addProperty<k>() {

// Put it all together
static {
<propertyCodePointRanges.keys:{ k | addProperty<k>(); }; separator="\n">
<propertyCodePointRanges.keys:{ k | addProperty<i>(); }; separator="\n">
addPropertyAliases();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,14 @@ public static Map<String, Object> getProperties() {
Map<String, IntervalSet> propertyCodePointRanges = new LinkedHashMap<>();
addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBlocksToCodePointRanges(propertyCodePointRanges);
addUnicodeIntPropertyCodesToCodePointRanges(propertyCodePointRanges);

Map<String, String> propertyAliases = new LinkedHashMap<>();
addUnicodeCategoryCodesToNames(propertyAliases);
addUnicodeBinaryPropertyCodesToNames(propertyAliases);
addUnicodeScriptCodesToNames(propertyAliases);
addUnicodeBlocksToNames(propertyAliases);
addUnicodeIntPropertyCodesToNames(propertyAliases);

Map<String, Object> properties = new LinkedHashMap<>();
properties.put("propertyCodePointRanges", propertyCodePointRanges);
Expand Down Expand Up @@ -191,20 +191,22 @@ private static void addIntPropertyRanges(int property, String namePrefix, Map<St
}
}

// Delegates to addIntPropertyRanges for UProperty.SCRIPT, passing an empty name prefix.
private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
	final String noPrefix = "";
	addIntPropertyRanges(UProperty.SCRIPT, noPrefix, propertyCodePointRanges);
}

private static void addUnicodeBlocksToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
addIntPropertyRanges(UProperty.BLOCK, "In", propertyCodePointRanges);
/**
 * Walks every ICU integer-valued property id in
 * [UProperty.INT_START, UProperty.INT_LIMIT) and delegates to
 * addIntPropertyRanges, passing the property's short name followed by
 * "=" as the name prefix.
 *
 * @param propertyCodePointRanges map handed through to addIntPropertyRanges
 */
private static void addUnicodeIntPropertyCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
	int property = UProperty.INT_START;
	while (property < UProperty.INT_LIMIT) {
		String prefix = getShortPropertyName(property) + "=";
		addIntPropertyRanges(property, prefix, propertyCodePointRanges);
		property++;
	}
}

private static void addIntPropertyAliases(int property, String namePrefix, Map<String, String> propertyAliases) {
String propertyName = getShortPropertyName(property);
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
propertyValue++) {
String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.LONG;
String aliasTarget = propertyName + "=" + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.SHORT;
String alias;
while (true) {
try {
Expand All @@ -214,7 +216,7 @@ private static void addIntPropertyAliases(int property, String namePrefix, Map<S
break;
}
assert alias != null;
addPropertyAlias(propertyAliases, alias, propertyName);
addPropertyAlias(propertyAliases, alias, aliasTarget);
nameChoice++;
}
}
Expand All @@ -227,4 +229,23 @@ private static void addUnicodeScriptCodesToNames(Map<String, String> propertyAli
// Delegates to addIntPropertyAliases for UProperty.BLOCK, using "In" as the name prefix.
private static void addUnicodeBlocksToNames(Map<String, String> propertyAliases) {
	final String blockPrefix = "In";
	addIntPropertyAliases(UProperty.BLOCK, blockPrefix, propertyAliases);
}

/**
 * For every ICU integer-valued property id in
 * [UProperty.INT_START, UProperty.INT_LIMIT), enumerates the property's
 * names starting at the name choice right after UProperty.NameChoice.SHORT
 * and calls addIntPropertyAliases once per name, with "name=" as the prefix.
 *
 * @param propertyAliases map handed through to addIntPropertyAliases
 */
private static void addUnicodeIntPropertyCodesToNames(Map<String, String> propertyAliases) {
	for (int property = UProperty.INT_START; property < UProperty.INT_LIMIT; property++) {
		for (int choice = UProperty.NameChoice.SHORT + 1; ; choice++) {
			String aliasName;
			try {
				aliasName = UCharacter.getPropertyName(property, choice);
			} catch (IllegalArgumentException e) {
				// No more aliases.
				break;
			}
			addIntPropertyAliases(property, aliasName + "=", propertyAliases);
		}
	}
}
}
44 changes: 44 additions & 0 deletions tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ public void testUnicodeScripts() {
assertTrue(UnicodeData.getPropertyCodePoints("Cyrl").contains(0x0404));
}

@Test
public void testUnicodeScriptEquals() {
	// Each "Script=<code>" lookup must contain the sample code point at the same index.
	String[] properties = { "Script=Zyyy", "Script=Latn", "Script=Hani", "Script=Cyrl" };
	int[] samples = { '0', 'X', 0x4E04, 0x0404 };
	for (int i = 0; i < properties.length; i++) {
		assertTrue(UnicodeData.getPropertyCodePoints(properties[i]).contains(samples[i]));
	}
}

@Test
public void testUnicodeScriptAliases() {
assertTrue(UnicodeData.getPropertyCodePoints("Common").contains('0'));
Expand All @@ -116,12 +124,48 @@ public void testUnicodeBlocks() {
assertTrue(UnicodeData.getPropertyCodePoints("InMisc_Pictographs").contains(0x1F4A9));
}

@Test
public void testUnicodeBlockEquals() {
	// Each "Block=<name>" lookup must contain the sample code point at the same index.
	String[] blocks = { "Block=ASCII", "Block=CJK", "Block=Cyrillic", "Block=Misc_Pictographs" };
	int[] members = { '0', 0x4E04, 0x0404, 0x1F4A9 };
	int index = 0;
	while (index < blocks.length) {
		assertTrue(UnicodeData.getPropertyCodePoints(blocks[index]).contains(members[index]));
		index++;
	}
}

@Test
public void testUnicodeBlockAliases() {
	// Block names with the "In" prefix must resolve to sets containing these code points.
	boolean digitZeroInBasicLatin =
			UnicodeData.getPropertyCodePoints("InBasic_Latin").contains('0');
	assertTrue(digitZeroInBasicLatin);

	boolean u29beInMathSymbolsB =
			UnicodeData.getPropertyCodePoints("InMiscellaneous_Mathematical_Symbols_B").contains(0x29BE);
	assertTrue(u29beInMathSymbolsB);
}

@Test
public void testEnumeratedPropertyEquals() {
	// Grapheme_Cluster_Break=E_Base: one member and one non-member.
	assertTrue(
			"U+1F481 INFORMATION DESK PERSON is an emoji modifier base",
			UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F481));

	assertFalse(
			"U+1F47E ALIEN MONSTER is not an emoji modifier",
			UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=E_Base").contains(0x1F47E));

	// Grapheme_Cluster_Break=SpacingMark: one member and one non-member.
	// These two assertions previously queried E_Base (and re-checked U+1F481),
	// so the spacing-mark behaviour their messages describe was never tested.
	assertTrue(
			"U+0E33 THAI CHARACTER SARA AM is a spacing mark",
			UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=SpacingMark").contains(0x0E33));

	assertFalse(
			"U+1038 MYANMAR SIGN VISARGA is not a spacing mark",
			UnicodeData.getPropertyCodePoints("Grapheme_Cluster_Break=SpacingMark").contains(0x1038));

	// East_Asian_Width=Ambiguous: one member and one non-member.
	assertTrue(
			"U+00A1 INVERTED EXCLAMATION MARK has ambiguous East Asian Width",
			UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A1));

	assertFalse(
			"U+00A2 CENT SIGN does not have ambiguous East Asian Width",
			UnicodeData.getPropertyCodePoints("East_Asian_Width=Ambiguous").contains(0x00A2));
}

@Test
public void testPropertyCaseInsensitivity() {
assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));
Expand Down
2 changes: 2 additions & 0 deletions tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
* \\u{10ABCD}
* \\p{Foo}
* \\P{Bar}
* \\p{Baz=Blech}
* \\P{Baz=Blech}
*/
public abstract class EscapeSequenceParsing {
public static class Result {
Expand Down

0 comments on commit e8657fa

Please sign in to comment.