Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize size/speed of Unicode datasets #68232

Merged
merged 3 commits into from
Jan 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,7 @@ __pycache__/
# Created by default with `src/ci/docker/run.sh`:
/obj/
/rustllvm/
/src/libcore/unicode/DerivedCoreProperties.txt
/src/libcore/unicode/DerivedNormalizationProps.txt
/src/libcore/unicode/PropList.txt
/src/libcore/unicode/ReadMe.txt
/src/libcore/unicode/Scripts.txt
/src/libcore/unicode/SpecialCasing.txt
/src/libcore/unicode/UnicodeData.txt
/src/libcore/unicode/downloaded
/unicode-downloads
/target/
# Generated by compiletest for incremental:
/tmp/
Expand Down
17 changes: 17 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4930,6 +4930,16 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"

[[package]]
name = "ucd-parse"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca6b52bf4da6512f0f07785a04769222e50d29639e7ecd016b7806fd2de306b4"
dependencies = [
"lazy_static 1.3.0",
"regex",
]

[[package]]
name = "ucd-trie"
version = "0.1.1"
Expand All @@ -4951,6 +4961,13 @@ dependencies = [
"version_check 0.1.5",
]

[[package]]
name = "unicode-bdd"
version = "0.1.0"
dependencies = [
"ucd-parse",
]

[[package]]
name = "unicode-bidi"
version = "0.3.4"
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ members = [
"src/tools/rustfmt",
"src/tools/miri",
"src/tools/rustdoc-themes",
"src/tools/unicode-table-generator",
]
exclude = [
"build",
Expand Down
16 changes: 8 additions & 8 deletions src/libcore/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use crate::slice;
use crate::str::from_utf8_unchecked_mut;
use crate::unicode::printable::is_printable;
use crate::unicode::tables::{conversions, derived_property, general_category, property};
use crate::unicode::{self, conversions};

use super::*;

Expand Down Expand Up @@ -552,7 +552,7 @@ impl char {
pub fn is_alphabetic(self) -> bool {
match self {
'a'..='z' | 'A'..='Z' => true,
c => c > '\x7f' && derived_property::Alphabetic(c),
c => c > '\x7f' && unicode::Alphabetic(c),
}
}

Expand Down Expand Up @@ -583,7 +583,7 @@ impl char {
pub fn is_lowercase(self) -> bool {
match self {
'a'..='z' => true,
c => c > '\x7f' && derived_property::Lowercase(c),
c => c > '\x7f' && unicode::Lowercase(c),
}
}

Expand Down Expand Up @@ -614,7 +614,7 @@ impl char {
pub fn is_uppercase(self) -> bool {
match self {
'A'..='Z' => true,
c => c > '\x7f' && derived_property::Uppercase(c),
c => c > '\x7f' && unicode::Uppercase(c),
}
}

Expand Down Expand Up @@ -642,7 +642,7 @@ impl char {
pub fn is_whitespace(self) -> bool {
match self {
' ' | '\x09'..='\x0d' => true,
c => c > '\x7f' && property::White_Space(c),
c => c > '\x7f' && unicode::White_Space(c),
}
}

Expand Down Expand Up @@ -693,7 +693,7 @@ impl char {
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn is_control(self) -> bool {
general_category::Cc(self)
unicode::Cc(self)
}

/// Returns `true` if this `char` has the `Grapheme_Extend` property.
Expand All @@ -707,7 +707,7 @@ impl char {
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
#[inline]
pub(crate) fn is_grapheme_extended(self) -> bool {
derived_property::Grapheme_Extend(self)
unicode::Grapheme_Extend(self)
}

/// Returns `true` if this `char` has one of the general categories for numbers.
Expand Down Expand Up @@ -739,7 +739,7 @@ impl char {
pub fn is_numeric(self) -> bool {
match self {
'0'..='9' => true,
c => c > '\x7f' && general_category::N(c),
c => c > '\x7f' && unicode::N(c),
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/libcore/char/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};

// unstable re-exports
#[unstable(feature = "unicode_version", issue = "49726")]
pub use crate::unicode::tables::UNICODE_VERSION;
#[unstable(feature = "unicode_version", issue = "49726")]
pub use crate::unicode::version::UnicodeVersion;
#[unstable(feature = "unicode_version", issue = "49726")]
pub use crate::unicode::UNICODE_VERSION;

use crate::fmt::{self, Write};
use crate::iter::FusedIterator;
Expand Down
66 changes: 0 additions & 66 deletions src/libcore/unicode/bool_trie.rs

This file was deleted.

54 changes: 49 additions & 5 deletions src/libcore/unicode/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,59 @@
#![unstable(feature = "unicode_internals", issue = "none")]
#![allow(missing_docs)]

mod bool_trie;
pub(crate) mod printable;
pub(crate) mod tables;
mod unicode_data;
pub(crate) mod version;

use version::UnicodeVersion;

/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on.
#[unstable(feature = "unicode_version", issue = "49726")]
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
major: unicode_data::UNICODE_VERSION.0,
minor: unicode_data::UNICODE_VERSION.1,
micro: unicode_data::UNICODE_VERSION.2,
_priv: (),
};

// For use in liballoc, not re-exported in libstd.
pub mod derived_property {
pub use crate::unicode::tables::derived_property::{Case_Ignorable, Cased};
pub use super::{Case_Ignorable, Cased};
}
pub mod conversions {
pub use crate::unicode::tables::conversions::{to_lower, to_upper};

pub use unicode_data::alphabetic::lookup as Alphabetic;
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
pub use unicode_data::cased::lookup as Cased;
pub use unicode_data::cc::lookup as Cc;
pub use unicode_data::conversions;
pub use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
pub use unicode_data::lowercase::lookup as Lowercase;
pub use unicode_data::n::lookup as N;
pub use unicode_data::uppercase::lookup as Uppercase;
pub use unicode_data::white_space::lookup as White_Space;

/// Tests whether the bit for `needle` is set in a three-level table.
///
/// The lookup proceeds as follows:
///
/// 1. `needle / 64` selects a 64-bit word; that word number is split into a
///    chunk slot (`/ 16`) and a position within the chunk (`% 16`).
/// 2. The chunk slot is resolved to a chunk number through `chunk_idx_map`.
///    A slot past the end of the map is only valid when it equals
///    `last_chunk_idx`, in which case `last_chunk_mapping` is used; any other
///    out-of-range slot yields `false`.
/// 3. `bitset_chunk_idx[chunk][position]` gives the index of the word in
///    `bitset`, and bit `needle % 64` of that word is the result.
///
/// Indexing into `bitset_chunk_idx` and `bitset` is bounds-checked, so
/// inconsistent tables cause a panic rather than a wrong answer.
#[inline(always)]
fn range_search<const N: usize, const N1: usize, const N2: usize>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    (last_chunk_idx, last_chunk_mapping): (u16, u8),
    bitset_chunk_idx: &[[u8; 16]; N1],
    bitset: &[u64; N2],
) -> bool {
    // Which 64-bit word the needle falls into, and where that word sits
    // inside its 16-word chunk.
    let word_no = (needle / 64) as usize;
    let (map_slot, piece) = (word_no / 16, word_no % 16);

    let chunk = match chunk_idx_map.get(map_slot) {
        Some(&mapped) => mapped,
        // Beyond the map: only the single trailing chunk is recognised.
        None if map_slot == last_chunk_idx as usize => last_chunk_mapping,
        None => return false,
    };

    let word_idx = bitset_chunk_idx[usize::from(chunk)][piece];
    let bit = needle % 64;
    (bitset[usize::from(word_idx)] >> bit) & 1 == 1
}
Loading