From 13db7c90d0f8c88bd1e8740aa88239382c114eca Mon Sep 17 00:00:00 2001 From: cpplearner Date: Sun, 3 Mar 2024 12:31:21 +0800 Subject: [PATCH 1/5] Tools: add a script to download Unicode data files --- .../download_unicode_data_files.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tools/unicode_properties_parse/download_unicode_data_files.py diff --git a/tools/unicode_properties_parse/download_unicode_data_files.py b/tools/unicode_properties_parse/download_unicode_data_files.py new file mode 100644 index 0000000000..dec81017b4 --- /dev/null +++ b/tools/unicode_properties_parse/download_unicode_data_files.py @@ -0,0 +1,20 @@ +from urllib.request import urlretrieve + + +Unicode_data_files = { + "DerivedCoreProperties.txt": "https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt", + "DerivedGeneralCategory.txt": "https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt", + "EastAsianWidth.txt": "https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt", + "GraphemeBreakProperty.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt", + "GraphemeBreakText.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt", + "emoji-data.txt": "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt", +} + +def download_unicode_data_files(): + for filename, url in Unicode_data_files.items(): + print(f"downloading {filename} from {url}") + urlretrieve(url, filename) + + +if __name__ == "__main__": + download_unicode_data_files() From d11ad04e422048eb334cb145863d6cd3d3aa9a46 Mon Sep 17 00:00:00 2001 From: cpplearner Date: Sun, 3 Mar 2024 12:42:23 +0800 Subject: [PATCH 2/5] Rename `grapheme_break_property_data_gen.py` to `unicode_properties_data_gen.py` --- stl/inc/__msvc_format_ucd_tables.hpp | 4 ++-- ...ak_property_data_gen.py => unicode_properties_data_gen.py} | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename tools/unicode_properties_parse/{grapheme_break_property_data_gen.py => unicode_properties_data_gen.py} (96%) diff --git a/stl/inc/__msvc_format_ucd_tables.hpp b/stl/inc/__msvc_format_ucd_tables.hpp index 5f0d7161b2..61159396c0 100644 --- a/stl/inc/__msvc_format_ucd_tables.hpp +++ b/stl/inc/__msvc_format_ucd_tables.hpp @@ -4,7 +4,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // WARNING, this entire header is generated by -// tools/unicode_properties_parse/grapheme_break_property_data_gen.py +// tools/unicode_properties_parse/unicode_properties_data_gen.py // DO NOT MODIFY! // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE @@ -112,7 +112,7 @@ struct _Unicode_property_data { // The enums containing the values for the properties are also generated, in order to ensure they match // up correctly with how we're parsing them. // -// All sets of data tables are generated by tools/unicode_properties_parse/grapheme_break_property_data_gen.py in the +// All sets of data tables are generated by tools/unicode_properties_parse/unicode_properties_data_gen.py in the // https://github.com/microsoft/stl repository. // // The data format is a set of arrays for each character property. The first is an array of uint32_t encoding diff --git a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py b/tools/unicode_properties_parse/unicode_properties_data_gen.py similarity index 96% rename from tools/unicode_properties_parse/grapheme_break_property_data_gen.py rename to tools/unicode_properties_parse/unicode_properties_data_gen.py index d330700495..706471c7f0 100644 --- a/tools/unicode_properties_parse/grapheme_break_property_data_gen.py +++ b/tools/unicode_properties_parse/unicode_properties_data_gen.py @@ -92,7 +92,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // WARNING, this entire header is generated by -// tools/unicode_properties_parse/grapheme_break_property_data_gen.py +// tools/unicode_properties_parse/unicode_properties_data_gen.py // DO NOT MODIFY! // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE @@ -201,7 +201,7 @@ def compact_property_ranges(input: list[PropertyRange]) -> list[PropertyRange]: // The enums containing the values for the properties are also generated, in order to ensure they match // up correctly with how we're parsing them. // -// All sets of data tables are generated by tools/unicode_properties_parse/grapheme_break_property_data_gen.py in the +// All sets of data tables are generated by tools/unicode_properties_parse/unicode_properties_data_gen.py in the // https://github.com/microsoft/stl repository. // // The data format is a set of arrays for each character property. The first is an array of uint32_t encoding From 0b73e363e0416086691711a5a8522500e6212aaf Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 5 Mar 2024 15:29:55 -0800 Subject: [PATCH 3/5] Add banner. --- tools/unicode_properties_parse/download_unicode_data_files.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/unicode_properties_parse/download_unicode_data_files.py b/tools/unicode_properties_parse/download_unicode_data_files.py index dec81017b4..7aec3a1783 100644 --- a/tools/unicode_properties_parse/download_unicode_data_files.py +++ b/tools/unicode_properties_parse/download_unicode_data_files.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + from urllib.request import urlretrieve From d4dee1aa638fdaa39ebe1abe25c474b16040522d Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 5 Mar 2024 15:31:24 -0800 Subject: [PATCH 4/5] Fix .gitignore typo. --- tools/unicode_properties_parse/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/unicode_properties_parse/.gitignore b/tools/unicode_properties_parse/.gitignore index 4fb9d76d45..7b336bfe76 100644 --- a/tools/unicode_properties_parse/.gitignore +++ b/tools/unicode_properties_parse/.gitignore @@ -9,4 +9,4 @@ DerivedCoreProperties.txt DerivedGeneralCategory.txt EastAsianWidth.txt GraphemeBreakProperty.txt -GraphemeBreakTest.txt +GraphemeBreakText.txt From 211febfa2b2072a528dba3c086e74c0d4eed2d93 Mon Sep 17 00:00:00 2001 From: cpplearner Date: Wed, 6 Mar 2024 11:26:12 +0800 Subject: [PATCH 5/5] Fix typo --- tools/unicode_properties_parse/.gitignore | 2 +- tools/unicode_properties_parse/download_unicode_data_files.py | 2 +- tools/unicode_properties_parse/grapheme_break_test_data_gen.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/unicode_properties_parse/.gitignore b/tools/unicode_properties_parse/.gitignore index 7b336bfe76..4fb9d76d45 100644 --- a/tools/unicode_properties_parse/.gitignore +++ b/tools/unicode_properties_parse/.gitignore @@ -9,4 +9,4 @@ DerivedCoreProperties.txt DerivedGeneralCategory.txt EastAsianWidth.txt GraphemeBreakProperty.txt -GraphemeBreakText.txt +GraphemeBreakTest.txt diff --git a/tools/unicode_properties_parse/download_unicode_data_files.py b/tools/unicode_properties_parse/download_unicode_data_files.py index 7aec3a1783..bf5c587fd1 100644 --- a/tools/unicode_properties_parse/download_unicode_data_files.py +++ b/tools/unicode_properties_parse/download_unicode_data_files.py @@ -9,7 +9,7 @@ "DerivedGeneralCategory.txt": "https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt", "EastAsianWidth.txt": "https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt", "GraphemeBreakProperty.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt", - "GraphemeBreakText.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt", + "GraphemeBreakTest.txt": "https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt", "emoji-data.txt": "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt", } diff --git a/tools/unicode_properties_parse/grapheme_break_test_data_gen.py b/tools/unicode_properties_parse/grapheme_break_test_data_gen.py index c7f85bac60..196f199c5b 100644 --- a/tools/unicode_properties_parse/grapheme_break_test_data_gen.py +++ b/tools/unicode_properties_parse/grapheme_break_test_data_gen.py @@ -91,7 +91,7 @@ def line_to_cpp_data_line_utf8(line: BreakTestItem) -> str: """ -Generate test data from "GraphemeBreakText.txt" +Generate test data from "GraphemeBreakTest.txt" This file can be downloaded from: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt This script looks for GraphemeBreakTest.txt in same directory as this script """