Skip to content

Commit

Permalink
Load independent and minimal syntax sets when using --language
Browse files Browse the repository at this point in the history
This significantly speeds up the startup time of bat, since only a single
linked SyntaxDefinition is loaded for each file. The size increase of the
binary is just ~400 kB.

In order for startup time to be improved, the --language arg must be used, and
it must match one of the following names:

"Plain Text", "ActionScript", "AppleScript", "Batch File", "NAnt Build File",
"C#", "C", "CSS", "D", "Diff", "Erlang", "Go", "Haskell", "JSON", "Java
Properties", "BibTeX", "LaTeX Log", "TeX", "Lisp", "Lua", "MATLAB", "Pascal",
"R", "Regular Expression", "Rust", "SQL", "Scala", "Tcl", "XML", "YAML", "Apache
Conf", "ARM Assembly", "Assembly (x86_64)", "CMakeCache", "Comma Separated
Values", "Cabal", "CoffeeScript", "CpuInfo", "Dart Analysis Output", "Dart",
"Dockerfile", "DotENV", "F#", "Friendly Interactive Shell (fish)", "Fortran
(Fixed Form)", "Fortran (Modern)", "Fortran Namelist", "fstab", "GLSL",
"GraphQL", "Groff/troff", "group", "hosts", "INI", "Jinja2", "jsonnet",
"Kotlin", "Less", "LLVM", "Lean", "MemInfo", "Nim", "Ninja", "Nix", "passwd",
"PowerShell", "Protocol Buffer (TEXT)", "Puppet", "Rego", "resolv", "Robot
Framework", "SML", "Strace", "Stylus", "Solidity", "Vyper", "Swift",
"SystemVerilog", "TOML", "Terraform", "TypeScript", "TypeScriptReact",
"Verilog", "VimL", "Zig", "gnuplot", "log", "requirements.txt", "Highlight
non-printables", "Private Key", "varlink"

Later commits will improve startup time for more code paths.
  • Loading branch information
Enselic committed Aug 8, 2021
1 parent cb49739 commit 2403984
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 26 deletions.
Binary file added assets/independent_syntax_sets.bin
Binary file not shown.
Binary file added assets/independent_syntax_sets_map.bin
Binary file not shown.
212 changes: 203 additions & 9 deletions src/assets.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use std::ffi::OsStr;
use std::fs;
use std::path::{Path, PathBuf};

use lazycell::LazyCell;

use syntect::dumps::{dump_to_file, from_binary, from_reader};
use syntect::dumps::{dump_to_file, dump_to_writer, from_binary, from_reader};
use syntect::highlighting::{Theme, ThemeSet};
use syntect::parsing::{SyntaxReference, SyntaxSet, SyntaxSetBuilder};

Expand All @@ -21,6 +21,18 @@ use crate::syntax_mapping::{MappingTarget, SyntaxMapping};
pub struct HighlightingAssets {
/// Lazily populated cell for the [SyntaxSet] that contains all syntaxes.
/// `new` fills it right away when it is handed an already built [SyntaxSet].
syntax_set_cell: LazyCell<SyntaxSet>,
/// Serialized form of the [SyntaxSet] that contains all syntaxes, if any.
/// `new` asserts that either this or an already built [SyntaxSet] is given.
serialized_syntax_set: Option<SerializedSyntaxSet>,

/// We only want to load an independent [SyntaxSet] once.
/// Here we keep track of which ones we loaded already.
independent_syntax_sets: HashMap<OffsetAndSize, LazyCell<SyntaxSet>>,

/// All independent [SyntaxSet]s in a binary format, concatenated together
serialized_independent_syntax_sets: SerializedIndependentSyntaxSets,

/// Used to lookup what byte offset and size a given independent [SyntaxSet]
/// has in `serialized_independent_syntax_sets`.
independent_syntax_sets_map: IndependentSyntaxSetsMap,

/// The themes available for highlighting.
theme_set: ThemeSet,
/// Optional name of a fallback theme; `new` initializes it to `None`.
fallback_theme: Option<&'static str>,
}
Expand Down Expand Up @@ -52,6 +64,8 @@ impl HighlightingAssets {
fn new(
syntax_set: Option<SyntaxSet>,
serialized_syntax_set: Option<SerializedSyntaxSet>,
serialized_independent_syntax_sets: SerializedIndependentSyntaxSets,
independent_syntax_sets_map: IndependentSyntaxSetsMap,
theme_set: ThemeSet,
) -> Self {
assert!(syntax_set.is_some() || serialized_syntax_set.is_some());
Expand All @@ -61,9 +75,18 @@ impl HighlightingAssets {
syntax_set_cell.fill(syntax_set).expect("can never fail");
}

// Prepare the map so we can lazily load syntaxes without a mut reference
let mut independent_syntax_sets = HashMap::new();
for value in independent_syntax_sets_map.by_name.values() {
independent_syntax_sets.insert(*value, LazyCell::new());
}

HighlightingAssets {
syntax_set_cell,
serialized_syntax_set,
independent_syntax_sets,
serialized_independent_syntax_sets,
independent_syntax_sets_map,
theme_set,
fallback_theme: None,
}
Expand Down Expand Up @@ -117,11 +140,8 @@ impl HighlightingAssets {
);
}

if std::env::var("BAT_PRINT_SYNTAX_DEPENDENCIES").is_ok() {
// To trigger this code, run:
// BAT_PRINT_SYNTAX_DEPENDENCIES=1 cargo run -- cache --build --source assets --blank --target /tmp
crate::syntax_dependencies::print_syntax_dependencies(&syntax_set_builder);
}
let (serialized_independent_syntax_sets, independent_syntax_sets_map) =
Self::build_independent_syntax_sets_data(&syntax_set_builder)?;

let syntax_set = syntax_set_builder.build();
let missing_contexts = syntax_set.find_unlinked_contexts();
Expand All @@ -132,15 +152,132 @@ impl HighlightingAssets {
}
}

Ok(HighlightingAssets::new(Some(syntax_set), None, theme_set))
Ok(HighlightingAssets::new(
Some(syntax_set),
None,
serialized_independent_syntax_sets,
independent_syntax_sets_map,
theme_set,
))
}

/// Serializes the independent [SyntaxSet]s derived from `syntax_set_builder`
/// into one concatenated binary blob, together with a lookup map.
///
/// Only independent syntax sets with at most one syntax are stored; larger
/// ones are skipped to keep the resulting binary small. Each stored set is
/// registered in the returned [IndependentSyntaxSetsMap] under the
/// ASCII-lowercased name of every syntax it contains, pointing at the byte
/// offset and size of its serialized form inside the blob.
///
/// # Errors
///
/// Returns an error if a syntax set cannot be serialized.
fn build_independent_syntax_sets_data(
    syntax_set_builder: &SyntaxSetBuilder,
) -> Result<(SerializedIndependentSyntaxSets, IndependentSyntaxSetsMap)> {
    // `syntax_set_builder` already is a reference, no need to borrow it again
    let independent_syntax_sets =
        crate::syntax_dependencies::build_independent_syntax_sets(syntax_set_builder);

    let mut concatenated_binary_data: Vec<u8> = Vec::new();

    let mut independent_syntax_sets_map = IndependentSyntaxSetsMap {
        by_name: HashMap::new(),
    };

    for independent_syntax_set in independent_syntax_sets {
        // For now, only store syntax sets with one syntax, otherwise
        // the binary grows by several megs
        if independent_syntax_set.syntaxes().len() > 1 {
            continue;
        }

        // bincode this syntax set
        let mut serialized_syntax_set = Vec::new();
        dump_to_writer(&independent_syntax_set, &mut serialized_syntax_set)
            .chain_err(|| format!("failed to serialize {:?}", independent_syntax_set))?;

        // Remember where in the binary blob we can find it when we need it again.
        // The next syntax set always starts where the blob currently ends, so the
        // offset is taken from the blob itself instead of a separate counter.
        let offset_and_size = OffsetAndSize {
            offset: concatenated_binary_data.len(),
            size: serialized_syntax_set.len(),
        };
        for syntax in independent_syntax_set.syntaxes() {
            // `to_ascii_lowercase()` already returns a fresh `String`
            independent_syntax_sets_map
                .by_name
                .insert(syntax.name.to_ascii_lowercase(), offset_and_size);
        }

        // Append the binary blob with the data
        concatenated_binary_data.extend(serialized_syntax_set);
    }

    Ok((
        SerializedIndependentSyntaxSets::FromData(concatenated_binary_data),
        independent_syntax_sets_map,
    ))
}

/// Deserializes the independent [SyntaxSet] stored at `offset_and_size`
/// inside `serialized_independent_syntax_sets`.
///
/// # Errors
///
/// Returns an error if the requested byte range does not lie within the
/// serialized data (for instance when the map and the data do not belong
/// together), or if the bytes in that range cannot be parsed.
fn load_independent_syntax_set_with_offset_and_size(
    &self,
    offset_and_size: &OffsetAndSize,
) -> Result<SyntaxSet> {
    let ref_to_data: &[u8] = match self.serialized_independent_syntax_sets {
        SerializedIndependentSyntaxSets::FromBinary(referenced) => referenced,
        SerializedIndependentSyntaxSets::FromData(ref data) => data,
    };

    // Use checked arithmetic and checked slicing: an out-of-range entry must
    // surface as an error rather than a panic, so that callers can fall back
    // to another way of finding the syntax.
    let start = offset_and_size.offset;
    let slice_of_syntax_set = start
        .checked_add(offset_and_size.size)
        .and_then(|end| ref_to_data.get(start..end))
        .ok_or_else(|| {
            format!(
                "Independent syntax set at {:?} is outside of the {} bytes of serialized data",
                offset_and_size,
                ref_to_data.len()
            )
        })?;

    from_reader(slice_of_syntax_set).chain_err(|| {
        format!(
            "Could not parse independent syntax set at {:?}",
            offset_and_size
        )
    })
}

/// Returns a [SyntaxSet] in which a syntax called `name` can be looked up.
/// The name is matched ASCII-case-insensitively against the independent
/// syntax sets map. When no independent [SyntaxSet] is registered under that
/// name, or it can not be loaded, the [SyntaxSet] with all syntaxes is used.
fn get_syntax_set_by_name(&self, name: &str) -> Result<&SyntaxSet> {
    let lowercase_name = name.to_ascii_lowercase();
    let by_name = &self.get_independent_syntax_sets_map().by_name;

    if let Some(offset_and_size) = by_name.get(&lowercase_name) {
        if let Some(syntax_set) =
            self.get_independent_syntax_set_with_offset_and_size(offset_and_size)
        {
            return Ok(syntax_set);
        }
    }

    self.get_syntax_set()
}

/// Returns the independent [SyntaxSet] located at `offset_and_size`, loading
/// it into its cell on first use.
///
/// Yields `None` when there is no cell for `offset_and_size`, and also when
/// loading fails (the load error is discarded).
fn get_independent_syntax_set_with_offset_and_size(
    &self,
    offset_and_size: &OffsetAndSize,
) -> Option<&SyntaxSet> {
    let cell = self.independent_syntax_sets.get(offset_and_size)?;
    let loaded = cell.try_borrow_with(|| {
        self.load_independent_syntax_set_with_offset_and_size(offset_and_size)
    });
    loaded.ok()
}

/// Creates [HighlightingAssets] from the files found in `cache_path`.
///
/// `syntaxes.bin` is only referenced by path here, while
/// `independent_syntax_sets.bin`, `independent_syntax_sets_map.bin` and
/// `themes.bin` are read right away; failing to read one of those three
/// results in an error.
pub fn from_cache(cache_path: &Path) -> Result<Self> {
    let independent_syntax_sets_path = cache_path.join("independent_syntax_sets.bin");

    let serialized_syntax_set = SerializedSyntaxSet::FromFile(cache_path.join("syntaxes.bin"));

    let independent_syntax_sets_data = std::fs::read(&independent_syntax_sets_path)
        .chain_err(|| format!("Could not load {:?}", &independent_syntax_sets_path))?;

    let independent_syntax_sets_map = asset_from_cache(
        &cache_path.join("independent_syntax_sets_map.bin"),
        "independent syntax sets map",
    )?;

    let theme_set = asset_from_cache(&cache_path.join("themes.bin"), "theme set")?;

    Ok(HighlightingAssets::new(
        None,
        Some(serialized_syntax_set),
        SerializedIndependentSyntaxSets::FromData(independent_syntax_sets_data),
        independent_syntax_sets_map,
        theme_set,
    ))
}
Expand All @@ -151,6 +288,10 @@ impl HighlightingAssets {
Some(SerializedSyntaxSet::FromBinary(
get_serialized_integrated_syntaxset(),
)),
SerializedIndependentSyntaxSets::FromBinary(
get_serialized_integrated_independent_syntax_sets(),
),
get_serialized_integrated_independent_syntax_sets_map(),
get_integrated_themeset(),
)
}
Expand All @@ -168,6 +309,14 @@ impl HighlightingAssets {
"syntax set",
)?;

asset_to_cache(
&self.independent_syntax_sets_map,
&target_dir.join("independent_syntax_sets_map.bin"),
"independent syntax sets map",
)?;
self.serialized_independent_syntax_sets
.save(&target_dir.join("independent_syntax_sets.bin"))?;

print!(
"Writing metadata to folder {} ... ",
target_dir.to_string_lossy()
Expand Down Expand Up @@ -208,6 +357,10 @@ impl HighlightingAssets {
Ok(self.get_syntax_set()?.syntaxes())
}

/// Returns the map used to look up where each independent [SyntaxSet] is
/// located in the serialized independent syntax sets data.
fn get_independent_syntax_sets_map(&self) -> &IndependentSyntaxSetsMap {
&self.independent_syntax_sets_map
}

/// Returns a reference to the [ThemeSet] held by these assets.
fn get_theme_set(&self) -> &ThemeSet {
&self.theme_set
}
Expand Down Expand Up @@ -272,7 +425,7 @@ impl HighlightingAssets {
mapping: &SyntaxMapping,
) -> Result<SyntaxReferenceInSet> {
if let Some(language) = language {
let syntax_set = self.get_syntax_set()?;
let syntax_set = self.get_syntax_set_by_name(language)?;
syntax_set
.find_syntax_by_token(language)
.map(|syntax| SyntaxReferenceInSet { syntax, syntax_set })
Expand Down Expand Up @@ -415,10 +568,51 @@ impl SerializedSyntaxSet {
}
}

/// All independent [SyntaxSet]s in serialized form, concatenated together.
/// Individual syntax sets are located in the data through an [OffsetAndSize].
#[derive(Debug)]
pub enum SerializedIndependentSyntaxSets {
/// Borrowed `'static` bytes, used for the data embedded with `include_bytes!`.
FromBinary(&'static [u8]),
/// Owned bytes, used for freshly built data and for data read from the cache.
FromData(Vec<u8>),
}

impl SerializedIndependentSyntaxSets {
    /// Writes the serialized data to `path`, regardless of which variant
    /// holds it. A failed write is reported as an error that names `path`.
    fn save(&self, path: &Path) -> Result<()> {
        let bytes: &[u8] = match self {
            Self::FromBinary(data) => *data,
            Self::FromData(data) => data.as_slice(),
        };

        std::fs::write(path, bytes).chain_err(|| format!("Could not save to {:?}", path))
    }
}

/// Location of one serialized [SyntaxSet] inside the concatenated data of
/// all independent syntax sets. It is hashable so that it can also serve as
/// the key under which an already loaded [SyntaxSet] is remembered.
#[derive(Debug, Eq, PartialEq, Clone, Copy, serde::Deserialize, serde::Serialize, Hash)]
struct OffsetAndSize {
/// Offset in bytes where the start of a serialized [SyntaxSet] can be found
pub offset: usize,

/// Size in bytes of said serialized [SyntaxSet]
pub size: usize,
}

/// Serializable index that tells where each independent [SyntaxSet] is
/// located in the concatenated independent syntax sets data.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub struct IndependentSyntaxSetsMap {
/// Lookup the [OffsetAndSize] of a [SyntaxSet] by the name of
/// any [SyntaxReference] inside the [SyntaxSet]
///
/// Keys are ASCII-lowercased syntax names; lookups lowercase the
/// requested name in the same way.
by_name: HashMap<String, OffsetAndSize>,
// Here we will later add `by_extension`, `by_first_line`, etc
}

/// Returns the bytes of `assets/syntaxes.bin`, which are embedded into the
/// executable at compile time.
fn get_serialized_integrated_syntaxset() -> &'static [u8] {
include_bytes!("../assets/syntaxes.bin")
}

/// Returns the bytes of `assets/independent_syntax_sets.bin`, which are
/// embedded into the executable at compile time.
fn get_serialized_integrated_independent_syntax_sets() -> &'static [u8] {
include_bytes!("../assets/independent_syntax_sets.bin")
}

/// Deserializes the [IndependentSyntaxSetsMap] from the embedded
/// `assets/independent_syntax_sets_map.bin`.
///
/// NOTE(review): the result of `from_binary` is returned directly and not
/// as a `Result`, so presumably malformed data leads to a panic - confirm
/// against the syntect documentation.
fn get_serialized_integrated_independent_syntax_sets_map() -> IndependentSyntaxSetsMap {
from_binary(include_bytes!("../assets/independent_syntax_sets_map.bin"))
}

/// Deserializes the [SyntaxSet] from the embedded `assets/syntaxes.bin`.
fn get_integrated_syntaxset() -> SyntaxSet {
from_binary(get_serialized_integrated_syntaxset())
}
Expand Down
8 changes: 8 additions & 0 deletions src/bin/bat/assets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ pub fn cache_dir() -> Cow<'static, str> {

pub fn clear_assets() {
clear_asset("themes.bin", "theme set cache");
clear_asset(
"independent_syntax_sets_map.bin",
"independent syntax sets map cache",
);
clear_asset(
"independent_syntax_sets.bin",
"independent syntax sets cache",
);
clear_asset("syntaxes.bin", "syntax set cache");
clear_asset("metadata.yaml", "metadata file");
}
Expand Down
32 changes: 15 additions & 17 deletions src/syntax_dependencies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,18 @@ enum Dependency {
ByScope(Scope),
}

/// Generates independent [SyntaxSet]s after analyzing dependencies between syntaxes
/// in a [SyntaxSetBuilder], and then prints the results.
pub(crate) fn print_syntax_dependencies(syntax_set_builder: &SyntaxSetBuilder) {
println!("Constructing independent SyntaxSets...");
let independent_syntax_sets = build_independent_syntax_sets(syntax_set_builder);

println!("Independent SyntaxSets:");
for syntax_set in independent_syntax_sets {
let names = syntax_set
.syntaxes()
.iter()
.map(|syntax| &syntax.name)
.collect::<Vec<_>>();
println!("{:?}", names);
}
/// Prints the names of all syntaxes in `syntax_set` to stdout, as one
/// debug-formatted list on a single line.
fn print_syntax_set_names(syntax_set: &SyntaxSet) {
    let mut names = Vec::new();
    for syntax in syntax_set.syntaxes() {
        names.push(&syntax.name);
    }
    println!("{:?}", names);
}

/// Analyzes dependencies between syntaxes in a [SyntaxSetBuilder].
/// From that, it builds independent [SyntaxSet]s.
fn build_independent_syntax_sets(
pub(crate) fn build_independent_syntax_sets(
syntax_set_builder: &'_ SyntaxSetBuilder,
) -> impl Iterator<Item = SyntaxSet> + '_ {
let syntaxes = syntax_set_builder.syntaxes();
Expand All @@ -57,7 +49,13 @@ fn build_independent_syntax_sets(

let mut builder = SyntaxSetDependencyBuilder::new();
builder.add_with_dependencies(syntax, &syntax_to_dependencies, &dependency_to_syntax);
Some(builder.build())
let syntax_set = builder.build();

if std::env::var("BAT_PRINT_SYNTAX_DEPENDENCIES").is_ok() {
print_syntax_set_names(&syntax_set);
}

Some(syntax_set)
})
}

Expand Down

0 comments on commit 2403984

Please sign in to comment.