From 8e90f0bc0b70d91086d4f2ff1625fa2b778ce6d2 Mon Sep 17 00:00:00 2001
From: Ben Hamilton
Date: Wed, 29 Mar 2017 12:36:58 -0700
Subject: [PATCH] Change UnbufferedCharStream to use 32-bit Unicode code points and 32-bit buffer

---
 .../v4/runtime/UnbufferedCharStream.java      |  6 ++++-
 .../test/tool/TestUnbufferedCharStream.java   | 24 +++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java b/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java
index 563abec7c56..8043b0bd05b 100644
--- a/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java
@@ -156,10 +156,14 @@ protected int fill(int n) {
 					add(c);
 				} else {
 					char ch = (char) c;
-					if (Character.isHighSurrogate(ch)) {
+					if (Character.isLowSurrogate(ch)) {
+						throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
+					} else if (Character.isHighSurrogate(ch)) {
 						int lowSurrogate = nextChar();
 						if (lowSurrogate > Character.MAX_VALUE) {
 							throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
+						} else if (lowSurrogate == IntStream.EOF) {
+							throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)");
 						} else {
 							char lowSurrogateChar = (char) lowSurrogate;
 							if (Character.isLowSurrogate(lowSurrogateChar)) {
diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnbufferedCharStream.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnbufferedCharStream.java
index 9c6b297c2ab..4f0e4cfbfa3 100644
--- a/tool-testsuite/test/org/antlr/v4/test/tool/TestUnbufferedCharStream.java
+++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUnbufferedCharStream.java
@@ -313,6 +313,30 @@ public void testLastChar() {
 		assertEquals(expecting, tokens.getTokens().toString());
 	}
 
+	@Test public void testUnicodeSMP() throws Exception {
+		TestingUnbufferedCharStream input = createStream("\uD83C\uDF0E");
+		assertEquals(0x1F30E, input.LA(1));
+		assertEquals("\uD83C\uDF0E", input.getBuffer());
+		input.consume();
+		assertEquals(IntStream.EOF, input.LA(1));
+		assertEquals("\uFFFF", input.getBuffer());
+	}
+
+	@Test(expected = RuntimeException.class)
+	public void testDanglingHighSurrogateAtEOFThrows() throws Exception {
+		createStream("\uD83C");
+	}
+
+	@Test(expected = RuntimeException.class)
+	public void testDanglingHighSurrogateThrows() throws Exception {
+		createStream("\uD83C\u0123");
+	}
+
+	@Test(expected = RuntimeException.class)
+	public void testDanglingLowSurrogateThrows() throws Exception {
+		createStream("\uDF0E");
+	}
+
 	protected static TestingUnbufferedCharStream createStream(String text) {
 		return new TestingUnbufferedCharStream(new StringReader(text));
 	}