perf: filter out length-1 byte sentences during pre-tokenization step
gautierdag committed Aug 23, 2024
1 parent 3b232e9 · commit a9d7a5b
Showing 3 changed files with 20 additions and 8 deletions.
Cargo.toml: 2 changes (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
 [package]
 name = "bpeasy"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2021"
 
 [lib]
benchmarks/README.md: 4 changes (2 additions, 2 deletions)
@@ -4,8 +4,8 @@ Using varying vocab sizes from (5k:100k)
 
 | Library/Operation | Time (seconds) | Standard Deviation |
 |----------------------------|---------------------------------|--------------------------------|
-| HuggingFace Train | 0.7369 | ±1.55 |
-| `bpeasy` Train | 0.6528 | ±0.386 |
+| HuggingFace Train | 0.8165 | ±0.62 |
+| `bpeasy` Train | 0.68815 | ±0.41 |
 | HuggingFace Encode | 0.6247 | ±0.051 |
 | `bpeasy` Encode (uses `tiktoken`) | 0.2679 | ±0.035 |

src/lib.rs: 22 changes (17 additions, 5 deletions)
@@ -146,6 +146,7 @@ fn pretokenize<'a>(text: &'a str, regex: &Regex) -> Vec<&'a str> {
 
 fn pretokenize_strings(strings: Vec<&str>, pattern: &str) -> (Vec<Sentence>, Vec<u64>) {
     let regex: Regex = Regex::new(pattern).expect("Invalid regex pattern");
+    // Tokenize strings in parallel
     let (tokens, counts): (Vec<&str>, Vec<u64>) = strings
         .par_iter()
         .flat_map(|&text| pretokenize(text, &regex))
@@ -168,8 +169,15 @@ fn pretokenize_strings(strings: Vec<&str>, pattern: &str) -> (Vec<Sentence>, Vec<u64>) {
         .into_iter()
         .unzip();
 
-    let sentences: Vec<Sentence> = tokens.into_iter().map(Sentence::from_str).collect();
-    (sentences, counts)
+    // Convert tokens to sentences, filtering out single-byte sentences and their counts
+    let (filtered_sentences, filtered_counts): (Vec<Sentence>, Vec<u64>) = tokens
+        .into_iter()
+        .map(Sentence::from_str)
+        .zip(counts.into_iter())
+        .filter(|(sentence, _)| sentence.symbols.len() > 1)
+        .unzip();
+
+    (filtered_sentences, filtered_counts)
 }
 
 fn initialize_vocab_bytes(vocab_size: usize) -> (HashMap<Vec<u8>, u32>, Vec<Vec<u8>>) {
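
The hunk above is the heart of the commit: a pre-token whose `Sentence` holds a single byte is already covered by the base byte-level vocabulary and can never form a merge pair on its own, so it can be dropped, together with its count, before BPE training begins. Single-byte separators (tabs, spaces) then no longer occupy slots in the training set, which is where the performance win comes from. A minimal, self-contained sketch of the pattern follows; the `Sentence` here is a simplified stand-in, not the crate's actual type:

// Sketch only: simplified stand-in for the crate's `Sentence`.
struct Sentence {
    symbols: Vec<u8>, // one symbol per byte of the pre-token
}

impl Sentence {
    fn from_str(s: &str) -> Self {
        Sentence { symbols: s.bytes().collect() }
    }
}

fn filter_single_byte(tokens: Vec<&str>, counts: Vec<u64>) -> (Vec<Sentence>, Vec<u64>) {
    tokens
        .into_iter()
        .map(Sentence::from_str)
        .zip(counts) // zipping keeps each count aligned with its sentence
        .filter(|(sentence, _)| sentence.symbols.len() > 1)
        .unzip()
}

fn main() {
    let tokens = vec!["\t", "hear", " ", "£"];
    let counts = vec![3, 1, 5, 2];
    let (kept, kept_counts) = filter_single_byte(tokens, counts);
    // "\t" and " " (1 byte each) are dropped; "hear" (4 bytes) and "£"
    // (2 bytes in UTF-8) survive, still paired with their counts 1 and 2.
    assert_eq!(kept.len(), 2);
    assert_eq!(kept_counts, vec![1, 2]);
}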
@@ -412,11 +420,15 @@ fn bpeasy(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
 mod tests {
     #[test]
     fn test_all() {
-        let text: &str = "\tYou hear £ £ £ here";
+        let text: &str = "\tYou hear a £ £ £ here";
         let pattern = r"([^\s]+)|(\s+)";
-        let compiled_regex = fancy_regex::Regex::new(pattern).expect("Invalid regex pattern");
+        let compiled_regex: fancy_regex::Regex =
+            fancy_regex::Regex::new(pattern).expect("Invalid regex pattern");
         let pretokenized_sentences = crate::pretokenize(text, &compiled_regex);
         println!("{:?}", pretokenized_sentences);
+        assert_eq!(
+            pretokenized_sentences,
+            vec!["\t", "You", " ", "hear", " ", "a", " ", "£", " ", "£", " ", "£", " ", "here"]
+        );
 
         let text_2: &str = "You hear £ £ £ here";
 
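The behavior the new assertion locks in can be reproduced standalone with the `fancy-regex` crate. This sketch assumes `pretokenize` simply collects the pattern's non-overlapping matches in order, which is what the expected output implies:

use fancy_regex::Regex;

fn main() {
    let pattern = r"([^\s]+)|(\s+)";
    let re = Regex::new(pattern).expect("Invalid regex pattern");
    let text = "\tYou hear a £ £ £ here";
    // find_iter yields Result<Match, _>; collect the matched slices in order.
    let pieces: Vec<&str> = re
        .find_iter(text)
        .map(|m| m.expect("regex error").as_str())
        .collect();
    // The pattern alternates runs of non-whitespace with runs of whitespace,
    // so every separator is preserved as its own pre-token.
    assert_eq!(
        pieces,
        vec!["\t", "You", " ", "hear", " ", "a", " ", "£", " ", "£", " ", "£", " ", "here"]
    );
}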
