diff --git a/.gitmodules b/.gitmodules index ee264b99c477..60efedba66ac 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "crates/wasi-crypto/spec"] path = crates/wasi-crypto/spec url = https://github.com/WebAssembly/wasi-crypto.git +[submodule "crates/wasi-parallel/spec"] + path = crates/wasi-parallel/spec + url = https://github.com/WebAssembly/wasi-parallel diff --git a/Cargo.lock b/Cargo.lock index 7dc3628f4a9f..90c01e180eef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,6 +78,12 @@ version = "0.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec8ad6edb4840b78c5c3d88de606b22252d552b55f3a4699fbb10fc070ec3049" +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anyhow" version = "1.0.57" @@ -338,6 +344,12 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.0.73" @@ -378,6 +390,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "ciborium" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" + +[[package]] +name = "ciborium-ll" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "cipher" version = "0.3.0" @@ -533,7 +572,7 @@ dependencies = [ "cranelift-codegen-shared", "cranelift-entity", "cranelift-isle", - "criterion", + "criterion 0.3.5", "gimli", "hashbrown", "log", @@ -788,9 +827,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" dependencies = [ "atty", - "cast", + "cast 0.2.7", "clap 2.34.0", - "criterion-plot", + "criterion-plot 0.4.4", "csv", "itertools", "lazy_static", @@ -807,13 +846,49 @@ dependencies = [ "walkdir", ] +[[package]] +name = "criterion" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +dependencies = [ + "anes", + "atty", + "cast 0.3.0", + "ciborium", + "clap 3.2.8", + "criterion-plot 0.5.0", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + [[package]] name = "criterion-plot" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ - "cast", + "cast 0.2.7", + "itertools", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast 0.3.0", "itertools", ] @@ -2535,6 +2610,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scoped_threadpool" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" + [[package]] name = "scopeguard" version = "1.1.0" @@ -3449,7 +3530,7 @@ dependencies = [ "clap 3.2.8", "component-macro-test", "component-test-util", - "criterion", + "criterion 0.3.5", "env_logger 0.9.0", "filecheck", "humantime 2.1.0", @@ -3475,6 +3556,7 @@ dependencies = [ "wasmtime-wasi", "wasmtime-wasi-crypto", "wasmtime-wasi-nn", + "wasmtime-wasi-parallel", "wasmtime-wast", "wast 46.0.0", "wat", @@ -3728,6 +3810,25 @@ dependencies = [ "wiggle", ] +[[package]] +name = "wasmtime-wasi-parallel" +version = "2.0.0" +dependencies = [ + "anyhow", + "criterion 0.4.0", + "indexmap", + "log", + "num_cpus", + "pretty_env_logger", + "rand 0.8.5", + "scoped_threadpool", + "walkdir", + "wasmtime", + "wasmtime-runtime", + "wasmtime-wasi", + "wiggle", +] + [[package]] name = "wasmtime-wast" version = "2.0.0" diff --git a/Cargo.toml b/Cargo.toml index 3b6daea89c75..f493d42a91a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ wasmtime-wast = { workspace = true } wasmtime-wasi = { workspace = true } wasmtime-wasi-crypto = { workspace = true, optional = true } wasmtime-wasi-nn = { workspace = true, optional = true } +wasmtime-wasi-parallel = { workspace = true, optional = true } clap = { workspace = true, features = ["color", "suggestions", "derive"] } anyhow = { workspace = true } target-lexicon = { workspace = true } @@ -114,6 +115,7 @@ wasmtime-wast = { path = "crates/wast", version = "=2.0.0" } wasmtime-wasi = { path = "crates/wasi", version = "2.0.0" } wasmtime-wasi-crypto = { path = "crates/wasi-crypto", version = "2.0.0" } wasmtime-wasi-nn = { path = "crates/wasi-nn", version = "2.0.0" } +wasmtime-wasi-parallel = { path = "crates/wasi-parallel", version = "2.0.0" } wasmtime-component-util = { path = "crates/component-util", version = "=2.0.0" } wasmtime-component-macro = { path = "crates/component-macro", version = "=2.0.0" } wasmtime-asm-macros = { path = "crates/asm-macros", version = "=2.0.0" } @@ -178,6 +180,7 @@ jitdump = ["wasmtime/jitdump"] vtune = ["wasmtime/vtune"] wasi-crypto = ["dep:wasmtime-wasi-crypto"] wasi-nn = ["dep:wasmtime-wasi-nn"] +wasi-parallel = ["wasmtime-wasi-parallel"] memory-init-cow = ["wasmtime/memory-init-cow", "wasmtime-cli-flags/memory-init-cow"] pooling-allocator = ["wasmtime/pooling-allocator", "wasmtime-cli-flags/pooling-allocator"] all-arch = ["wasmtime/all-arch"] diff --git a/crates/bench-api/src/lib.rs b/crates/bench-api/src/lib.rs index a946fdb6b551..9c38de1c4356 100644 --- a/crates/bench-api/src/lib.rs +++ b/crates/bench-api/src/lib.rs @@ -473,6 +473,9 @@ impl BenchState { wasmtime_wasi_crypto::add_to_linker(&mut linker, |cx| &mut cx.wasi_crypto)?; } + #[cfg(feature = "wasi-parallel")] + wasmtime_wasi_parallel::add_to_linker(&mut linker, |cx| &mut cx.wasi_parallel)?; + Ok(Self { linker, compilation_timer, diff --git a/crates/cli-flags/src/lib.rs b/crates/cli-flags/src/lib.rs index 058522bbb94e..13f88a6482f1 100644 --- a/crates/cli-flags/src/lib.rs +++ b/crates/cli-flags/src/lib.rs @@ -60,6 +60,10 @@ pub const SUPPORTED_WASI_MODULES: &[(&str, &str)] = &[ "experimental-wasi-crypto", "enables support for the WASI cryptography APIs (experimental), see https://github.com/WebAssembly/wasi-crypto", ), + ( + "experimental-wasi-parallel", + "enables support for the WASI parallel APIs (experimental)", + ), ]; fn pick_profiling_strategy(jitdump: bool, vtune: bool) -> Result { @@ -478,6 +482,7 @@ fn parse_wasi_modules(modules: &str) -> Result { "wasi-common" => Ok(wasi_modules.wasi_common = enable), "experimental-wasi-nn" => Ok(wasi_modules.wasi_nn = enable), "experimental-wasi-crypto" => Ok(wasi_modules.wasi_crypto = enable), + "experimental-wasi-parallel" => Ok(wasi_modules.wasi_parallel = enable), "default" => bail!("'default' cannot be specified with other WASI modules"), _ => bail!("unsupported WASI module '{}'", module), }; @@ -509,6 +514,9 @@ pub struct WasiModules { /// Enable the experimental wasi-crypto implementation. pub wasi_crypto: bool, + + /// Enable the experimental wasi-parallel implementation. + pub wasi_parallel: bool, } impl Default for WasiModules { @@ -517,6 +525,7 @@ impl Default for WasiModules { wasi_common: true, wasi_nn: false, wasi_crypto: false, + wasi_parallel: false, } } } @@ -528,6 +537,7 @@ impl WasiModules { wasi_common: false, wasi_nn: false, wasi_crypto: false, + wasi_parallel: false, } } } @@ -674,7 +684,8 @@ mod test { WasiModules { wasi_common: true, wasi_nn: false, - wasi_crypto: false + wasi_crypto: false, + wasi_parallel: false } ); } @@ -687,7 +698,8 @@ mod test { WasiModules { wasi_common: true, wasi_nn: false, - wasi_crypto: false + wasi_crypto: false, + wasi_parallel: false } ); } @@ -704,7 +716,8 @@ mod test { WasiModules { wasi_common: false, wasi_nn: true, - wasi_crypto: false + wasi_crypto: false, + wasi_parallel: false } ); } @@ -718,7 +731,8 @@ mod test { WasiModules { wasi_common: false, wasi_nn: false, - wasi_crypto: false + wasi_crypto: false, + wasi_parallel: false } ); } diff --git a/crates/wasi-parallel/.gitignore b/crates/wasi-parallel/.gitignore new file mode 100644 index 000000000000..1440348a9f51 --- /dev/null +++ b/crates/wasi-parallel/.gitignore @@ -0,0 +1 @@ +/tests/wasm/* diff --git a/crates/wasi-parallel/Cargo.toml b/crates/wasi-parallel/Cargo.toml new file mode 100644 index 000000000000..a504b859470e --- /dev/null +++ b/crates/wasi-parallel/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "wasmtime-wasi-parallel" +version.workspace = true +authors.workspace = true +description = "Wasmtime implementation of the wasi-parallel API" +documentation = "https://docs.rs/wasmtime-wasi-parallel" +license = "Apache-2.0 WITH LLVM-exception" +categories = ["wasm", "parallel"] +keywords = ["webassembly", "wasm", "parallel"] +repository = "https://github.com/bytecodealliance/wasmtime" +readme = "README.md" +edition = "2018" + +[dependencies] +# These dependencies are necessary for the witx-generation macros to work: +anyhow = { workspace = true } +wiggle = { workspace = true } + +# These dependencies are necessary for the wasi-parallel implementation: +indexmap = "1.6" +log = { workspace = true } +num_cpus = "1.13" +rand = "0.8" +scoped_threadpool = "0.1" +wasmtime = { workspace = true, default-features = false, features = ["cranelift", "wat"] } +wasmtime-runtime = { workspace = true } +wasmtime-wasi = { workspace = true } + +[dev-dependencies] +criterion = "0.4" +pretty_env_logger = "0.4" + +[build-dependencies] +walkdir = "2.3" + +[features] +# Enable building Rust tests to Wasm. Not all environments will have a +# `wasm32-wasi` target installed so this feature is not enabled by default. +build-tests = [] + +[[bench]] +name = "nstream" +harness = false + +[[bench]] +name = "sum" +harness = false diff --git a/crates/wasi-parallel/LICENSE b/crates/wasi-parallel/LICENSE new file mode 100644 index 000000000000..f9d81955f4bc --- /dev/null +++ b/crates/wasi-parallel/LICENSE @@ -0,0 +1,220 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/crates/wasi-parallel/README.md b/crates/wasi-parallel/README.md new file mode 100644 index 000000000000..bcfdc97bc5bf --- /dev/null +++ b/crates/wasi-parallel/README.md @@ -0,0 +1,50 @@ +# wasmtime-wasi-parallel + +This crate enables experimental support for the [wasi-parallel] API in Wasmtime. + +> __WARNING__: _this implementation is highly experimental, subject to change, +> and abuses the Wasmtime API!_ It is published as a proof-of-concept to discuss +> the design of the wasi-parallel specification. + +Please open any issues related to this implementation in the [wasi-parallel] +repository. Feedback is appreciated! + +The main idea is to expose a "parallel for" mechanism using WASI (see the +[explainer] for more details). The "parallel for" call is not limited to CPU +execution; this proof-of-concept implementation can execute parallel code on +both the CPU (using Wasmtime's JIT compiled functions) and eventually the GPU +(using OpenCL). If you plan to experiment with this crate, see the "Use" section +below. + +[wasi-parallel]: https://github.com/WebAssembly/wasi-parallel +[explainer]: https://github.com/WebAssembly/wasi-parallel#readme + +### Build + +``` +cargo build +``` + +### Test + +``` +cargo test +``` + +Note: the Rust code in `tests/rust` is compiled by `build.rs` to `tests/wasm`. + +### Benchmark + +``` +cargo bench +``` + +### Use + +When compiled with the `wasi-parallel` feature, this crate is usable from the +Wasmtime CLI: + +```console +$ cargo build --features wasi-parallel +$ .../wasmtime run --wasi-modules experimental-wasi-parallel +``` diff --git a/crates/wasi-parallel/benches/nstream.rs b/crates/wasi-parallel/benches/nstream.rs new file mode 100644 index 000000000000..759e6a811c36 --- /dev/null +++ b/crates/wasi-parallel/benches/nstream.rs @@ -0,0 +1,58 @@ +//! Compare the memory throughput of parallel and sequential executions using +//! `nstream`. +//! +//! `nstream` is a memory throughput benchmark and will likely hit memory +//! bandwidth limitations as more threads are used. Do not expect a linear speed +//! up between the parallel and sequential versions. For reference, the +//! [Parallel Research Kernels] (PRK) repository has examples of `nstream` in +//! many languages. +//! +//! [Parallel Research Kernels]: https://github.com/ParRes/Kernels +mod test_case; + +use criterion::{ + criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion, +}; +use test_case::*; + +fn bench_nstream(c: &mut Criterion) { + let mut group = c.benchmark_group("nstream"); + measure(&mut group, Parallelism::Cpu); + measure(&mut group, Parallelism::Sequential); +} + +fn measure(group: &mut BenchmarkGroup, parallelism: Parallelism) { + let name = format!("{:?}", parallelism); + let mut test_case = TestCase::new("tests/wat/nstream.wat", default_engine(), None).unwrap(); + let num_threads = num_cpus::get() as i32; + // The guidance for nstream is to pick a number of items equivalent to a + // buffer that is `4 * LLC size`. Since we use 4-byte floating point numbers + // and assuming a very tame 2MB for the LLC, we set the number of items to + // ~8 million. Note what the nstream PRK does here: + // https://github.com/ParRes/Kernels/blob/default/scripts/small/runopenmp#L9. + let num_items = 8_000_000; + let device_kind = parallelism.as_device_kind(); + + let _ = test_case + .invoke( + "setup", + &[num_threads.into(), num_items.into(), device_kind.into()], + ) + .expect("failed in benchmark `setup()`"); + + group.bench_function(name, |b| { + b.iter(|| { + let _ = test_case + .invoke("execute", &[]) + .expect("failed in benchmark `execute()`"); + }); + }); + + // We cannot invoke `finish` here since the check will necessarily fail: + // criterion will invoke `execute` repeatedly and since the nstream work + // includes `A[i] += A[i] ...` then the result region `A` will quickly + // diverge from its single-run value, which is what `finish` checks. +} + +criterion_group!(benches, bench_nstream); +criterion_main!(benches); diff --git a/crates/wasi-parallel/benches/sum.rs b/crates/wasi-parallel/benches/sum.rs new file mode 100644 index 000000000000..3cd98b3622a2 --- /dev/null +++ b/crates/wasi-parallel/benches/sum.rs @@ -0,0 +1,50 @@ +//! Add 100 million integers for each available CPU. +//! +//! This benchmark measures the upper ends of the speed-up available with +//! `wasi-parallel`. Each iteration of the kernel performs CPU-bound work, so +//! the difference between the sequential and parallel versions should be +//! related to the number of cores available. + +mod test_case; + +use criterion::{ + criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion, +}; +use test_case::*; + +fn bench_nstream(c: &mut Criterion) { + let mut group = c.benchmark_group("sum"); + measure(&mut group, Parallelism::Cpu); + measure(&mut group, Parallelism::Sequential); +} + +fn measure(group: &mut BenchmarkGroup, parallelism: Parallelism) { + let name = format!("{:?}", parallelism); + let mut test_case = TestCase::new("tests/wat/sum.wat", default_engine(), None).unwrap(); + let num_threads = num_cpus::get() as i32; + let buffer_size = 100_000_000 as i32; + let device_kind = parallelism.as_device_kind(); + + let _ = test_case + .invoke( + "setup", + &[num_threads.into(), buffer_size.into(), device_kind.into()], + ) + .expect("failed in benchmark `setup()`"); + + group.bench_function(name, |b| { + b.iter(|| { + let _ = test_case + .invoke("execute", &[]) + .expect("failed in benchmark `execute()`"); + }); + }); + + let results = test_case + .invoke("finish", &[]) + .expect("failed in benchmark `finish`"); + assert_eq!(results[0].i32().unwrap(), 0); +} + +criterion_group!(benches, bench_nstream); +criterion_main!(benches); diff --git a/crates/wasi-parallel/benches/test_case.rs b/crates/wasi-parallel/benches/test_case.rs new file mode 120000 index 000000000000..dab38122630b --- /dev/null +++ b/crates/wasi-parallel/benches/test_case.rs @@ -0,0 +1 @@ +../tests/test_case.rs \ No newline at end of file diff --git a/crates/wasi-parallel/build.rs b/crates/wasi-parallel/build.rs new file mode 100644 index 000000000000..f5e64192b6c9 --- /dev/null +++ b/crates/wasi-parallel/build.rs @@ -0,0 +1,77 @@ +//! This build script: +//! - has the configuration necessary for the wiggle and witx macros +//! - generates Wasm from the files in `tests/rust` to `tests/wasm` + +#[cfg(feature = "build-tests")] +use std::{ + fs::DirBuilder, + path::{Path, PathBuf}, + process::Command, +}; + +fn main() { + // This is necessary for wiggle/witx macros. + let cwd = std::env::current_dir().unwrap(); + let wasi_root = cwd.join("spec"); + println!("cargo:rustc-env=WASI_ROOT={}", wasi_root.display()); + + // Also automatically rebuild if the WITX files change. + for entry in walkdir::WalkDir::new(wasi_root) { + println!("cargo:rerun-if-changed={}", entry.unwrap().path().display()); + } + + // If requested, rebuild the test files. + #[cfg(feature = "build-tests")] + { + build_wasm("tests"); + build_wasm("benches"); + } +} + +#[cfg(feature = "build-tests")] +fn build_wasm>(root: P) { + let root_dir = Path::new(root.as_ref().as_os_str()); + let wasm_dir = root_dir.join("wasm"); + + DirBuilder::new().recursive(true).create(&wasm_dir).unwrap(); + + // Automatically rebuild any Rust tests. + if root_dir.join("rust").exists() { + for entry in walkdir::WalkDir::new(root_dir.join("rust")) { + let entry = entry.unwrap(); + println!("cargo:rerun-if-changed={}", entry.path().display()); + if entry.path().is_file() && entry.file_name() != "wasi_parallel.rs" { + compile_rust(entry.path(), &wasm_dir) + } + } + } +} + +/// Use rustc to compile a Rust file to a Wasm file that uses the wasi-parallel +/// API. +#[cfg(feature = "build-tests")] +fn compile_rust, P2: AsRef>(source_file: P1, destination_dir: P2) { + let stem = source_file.as_ref().file_stem().unwrap(); + let mut destination_file: PathBuf = [destination_dir.as_ref().as_os_str(), stem] + .iter() + .collect(); + destination_file.set_extension("wasm"); + + let mut command = Command::new("rustc"); + command + .arg("--target") + .arg("wasm32-wasi") + .arg(source_file.as_ref().to_str().unwrap()) + .arg("-o") + .arg(destination_file.to_str().unwrap()); + + let status = command + .status() + .expect("Failed to execute 'rustc' command to generate Wasm file."); + + assert!( + status.success(), + "Failed to compile test program: {:?}", + command + ) +} diff --git a/crates/wasi-parallel/spec b/crates/wasi-parallel/spec new file mode 160000 index 000000000000..b404517fbf50 --- /dev/null +++ b/crates/wasi-parallel/spec @@ -0,0 +1 @@ +Subproject commit b404517fbf50496b58d51d08c297585488864739 diff --git a/crates/wasi-parallel/src/context.rs b/crates/wasi-parallel/src/context.rs new file mode 100644 index 000000000000..302feb6fcf78 --- /dev/null +++ b/crates/wasi-parallel/src/context.rs @@ -0,0 +1,198 @@ +//! This module implements the state held by `wasi-parallel`. E.g., when +//! `wasi-parallel` returns a handle to a device, it must maintain a mapping of +//! which device was returned. The `WasiParallelContext` proxies on calls to the +//! correct parallel device. + +use crate::device::{discover, Buffer, Device}; +use crate::witx::types::{BufferAccessKind, DeviceKind}; +use anyhow::{anyhow, Result}; +use indexmap::IndexMap; +use rand::Rng; +use std::collections::HashMap; +use wasmtime::SharedMemory; + +#[derive(Debug)] +pub struct WasiParallelContext { + pub spirv: HashMap>, + pub devices: IndexMap>, + pub buffers: HashMap>, + pub device_for_buffers: HashMap, +} + +impl WasiParallelContext { + pub fn new() -> Self { + // Perform some rudimentary device discovery. + let mut devices = IndexMap::new(); + for device in discover() { + devices.insert(Self::random_id(), device); + } + + Self { + spirv: HashMap::new(), + devices, + buffers: HashMap::new(), + device_for_buffers: HashMap::new(), + } + } + + /// Retrieve a device based on a hint, using the default device if the hint + /// cannot be satisfied. + pub fn get_device(&self, hint: DeviceKind) -> Result { + match self + .devices + .iter() + .find(|(_, device)| device.kind() == hint) + { + // If we can find a device matching the hint, return it... + Some((&id, _)) => Ok(id), + // ...otherwise, use the default device. + None => self.get_default_device(), + } + } + + // Retrieve the default device, which currently is the first registered + // device (TODO). + pub fn get_default_device(&self) -> Result { + match self.devices.iter().next().as_ref() { + // Use the first available device (TODO: implicit default)... + Some((&id, _)) => Ok(id), + // ...or fail if none are available. + None => Err(anyhow!("no devices available")), + } + } + + /// Create a buffer linked to a device. + pub fn create_buffer( + &mut self, + device_id: i32, + size: i32, + access: BufferAccessKind, + ) -> Result { + let device = match self.devices.get(&device_id) { + Some(val) => val, + None => return Err(anyhow!("unrecognized device")), + }; + + if size < 0 { + return Err(anyhow!("invalid size (less than 0)")); + } + + let id = Self::random_id(); + self.buffers + .insert(id, device.as_ref().create_buffer(size, access)); + self.device_for_buffers.insert(id, device_id); + Ok(id) + } + + /// Retrieve a created buffer by its ID. + pub fn get_buffer(&self, buffer_id: i32) -> Result<&dyn Buffer> { + match self.buffers.get(&buffer_id) { + Some(buffer) => Ok(buffer.as_ref()), + None => Err(anyhow!("invalid buffer ID")), + } + } + + /// Retrieve a created buffer by its ID. + pub fn get_buffer_mut(&mut self, buffer_id: i32) -> Result<&mut dyn Buffer> { + match self.buffers.get_mut(&buffer_id) { + Some(buffer) => Ok(buffer.as_mut()), + None => Err(anyhow!("invalid buffer ID")), + } + } + + /// Invoke the `kernel` in parallel on the devices indicated by the input + /// and output buffers. + pub fn invoke_parallel_for( + &mut self, + device_id: i32, + kernel: &[u8], + engine: &wasmtime::Engine, + shared_memory: SharedMemory, + num_threads: i32, + block_size: i32, + in_buffers: &[i32], + out_buffers: &[i32], + ) -> Result<()> { + // Collect the input buffers. + let mut in_buffers_ = Vec::new(); + for (i, b) in in_buffers.iter().enumerate() { + match self.buffers.get(b) { + Some(b) => in_buffers_.push(b), + None => return Err(anyhow!("in buffer {} has an invalid ID", i)), + } + } + + // Collect the output buffers. + let mut out_buffers_ = Vec::new(); + for (i, b) in out_buffers.iter().enumerate() { + match self.buffers.get(b) { + Some(b) => out_buffers_.push(b), + None => return Err(anyhow!("out buffer {} has an invalid ID", i)), + } + } + + // Check that all buffers are assigned to the right device. + if !in_buffers + .iter() + .chain(out_buffers.iter()) + .map(|b| *self.device_for_buffers.get(b).unwrap()) + .all(|d| d == device_id) + { + return Err(anyhow!("buffers are assigned to different devices")); + } + + // Check that the device is valid. + if let Some(device) = self.devices.get_mut(&device_id) { + log::debug!( + "starting parallel iterations = {}, block_size = {}, device = {:?}", + num_threads, + block_size, + device + ); + device.parallelize( + Kernel::new(kernel.to_owned(), engine.clone(), shared_memory), + num_threads, + block_size, + in_buffers_, + out_buffers_, + )? + } else { + return Err(anyhow!("invalid device ID")); + } + + Ok(()) + } + + fn random_id() -> i32 { + rand::thread_rng().gen() + } +} + +/// A binary-encoded WebAssembly module containing the function to be run in +/// parallel. The engine is included so that the WebAssembly code can be +/// JIT-compiled with the same configuration as the currently-running +/// WebAssembly. +pub struct Kernel { + module: Vec, + engine: wasmtime::Engine, + memory: SharedMemory, +} +impl Kernel { + pub const NAME: &'static str = "kernel"; + pub fn new(module: Vec, engine: wasmtime::Engine, memory: SharedMemory) -> Self { + Self { + module, + engine, + memory, + } + } + pub fn module(&self) -> &[u8] { + &self.module + } + pub fn engine(&self) -> &wasmtime::Engine { + &self.engine + } + pub fn memory(&self) -> &SharedMemory { + &self.memory + } +} diff --git a/crates/wasi-parallel/src/device/cpu.rs b/crates/wasi-parallel/src/device/cpu.rs new file mode 100644 index 000000000000..6253a7321566 --- /dev/null +++ b/crates/wasi-parallel/src/device/cpu.rs @@ -0,0 +1,84 @@ +//! Implement a `wasi-parallel` device using all available cores on the CPU. + +use super::wasm_memory_buffer::{as_pointer_and_length, WasmMemoryBuffer}; +use super::{Buffer, Device}; +use crate::context::Kernel; +use crate::witx::types::{BufferAccessKind, DeviceKind}; +use anyhow::Result; +use std::convert::TryInto; + +pub struct CpuDevice { + pool: scoped_threadpool::Pool, +} + +impl CpuDevice { + pub fn new() -> Box { + let pool = scoped_threadpool::Pool::new(num_cpus::get().try_into().unwrap()); + Box::new(Self { pool }) + } +} + +impl Device for CpuDevice { + fn kind(&self) -> DeviceKind { + DeviceKind::Cpu + } + + fn name(&self) -> String { + "thread pool implementation".into() // TODO retrieve CPU name from system. + } + + fn create_buffer(&self, size: i32, access: BufferAccessKind) -> Box { + Box::new(WasmMemoryBuffer::new(size as u32, access)) + } + + fn parallelize( + &mut self, + kernel: Kernel, + num_iterations: i32, + block_size: i32, + in_buffers: Vec<&Box>, + out_buffers: Vec<&Box>, + ) -> Result<()> { + self.pool.scoped(|scoped| { + let module = wasmtime::Module::new(kernel.engine(), kernel.module()) + .expect("unable to compile module"); + + // Setup the buffer pointers. + let buffers = + as_pointer_and_length(in_buffers.into_iter().chain(out_buffers.into_iter())) + .unwrap(); + + for iteration_id in 0..num_iterations { + let engine = kernel.engine().clone(); + let module = module.clone(); + let memory = kernel.memory().clone(); + let buffers = buffers.clone(); + scoped.execute(move || { + // Instantiate again in a new thread. + let mut store = wasmtime::Store::new(&engine, ()); + let imports = vec![memory.clone().into()]; + let instance = wasmtime::Instance::new(&mut store, &module, &imports) + .expect("failed to construct thread instance"); + + // Setup the parameters for the parallel execution. + let mut params = vec![ + iteration_id.into(), + num_iterations.into(), + block_size.into(), + ]; + params.extend_from_slice(&buffers); + + // Call the `kernel` function. + log::debug!("executing iteration {}", iteration_id); + let kernel_fn = instance + .get_func(&mut store, Kernel::NAME) + .expect("failed to find kernel function"); + kernel_fn + .call(&mut store, ¶ms[..], &mut []) + .expect("failed to run kernel") + }); + } + }); + Ok(()) + } +} diff --git a/crates/wasi-parallel/src/device/mod.rs b/crates/wasi-parallel/src/device/mod.rs new file mode 100644 index 000000000000..0556fed2ce4c --- /dev/null +++ b/crates/wasi-parallel/src/device/mod.rs @@ -0,0 +1,76 @@ +pub mod cpu; +pub mod sequential; +pub mod wasm_memory_buffer; + +use crate::witx::types::{BufferAccessKind, DeviceKind}; +use crate::{ + context::Kernel, + device::{cpu::CpuDevice, sequential::SequentialDevice}, +}; +use anyhow::Result; +use std::{any::Any, fmt::Debug}; +use wiggle::GuestPtr; + +/// Discover available devices. +pub fn discover() -> Vec> { + vec![CpuDevice::new(), SequentialDevice::new()] +} + +/// Define the operations possible on a device. +pub trait Device { + /// Return the device kind. + fn kind(&self) -> DeviceKind; + + /// Return the device name. + fn name(&self) -> String; + + /// Create a buffer associated with this device. The created buffer is held + /// in `WasiParallelContext`, which must guarantee that buffers are sent to + /// the correct devices. + fn create_buffer(&self, size: i32, access: BufferAccessKind) -> Box; + + /// Invoke a parallel "for" on the device. + fn parallelize( + &mut self, + kernel: Kernel, + num_threads: i32, + block_size: i32, + in_buffers: Vec<&Box>, + out_buffers: Vec<&Box>, + ) -> Result<()>; +} + +impl Debug for dyn Device { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}: {}", self.kind(), self.name()) + } +} + +/// Define the operations possible on a buffer. +pub trait Buffer: Send + Sync { + /// Returns the size of the buffer in bytes. + fn len(&self) -> u32; + + /// Describes how the buffer can be used by the device. + fn access(&self) -> BufferAccessKind; + + /// Write the given slice of Wasm memory into the buffer. + fn write(&mut self, data: GuestPtr<[u8]>) -> Result<()>; + + /// Read the buffer into a slice. + fn read(&self, slice: GuestPtr<[u8]>) -> Result<()>; + + /// Allow for downcasting the buffer: `buffer.as_any().downcast_ref::<...>()`. + fn as_any(&self) -> &dyn Any; +} + +impl Debug for dyn Buffer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Buffer: size: {} access: {:?}", + self.len(), + self.access() + ) + } +} diff --git a/crates/wasi-parallel/src/device/sequential.rs b/crates/wasi-parallel/src/device/sequential.rs new file mode 100644 index 000000000000..bdcb24a608f5 --- /dev/null +++ b/crates/wasi-parallel/src/device/sequential.rs @@ -0,0 +1,74 @@ +//! Implements a `wasi-parallel` device that solely executes sequentially. + +use super::wasm_memory_buffer::as_pointer_and_length; +use super::{wasm_memory_buffer::WasmMemoryBuffer, Buffer, Device}; +use crate::context::Kernel; +use crate::witx::types::{BufferAccessKind, DeviceKind}; +use anyhow::{Context, Result}; + +pub struct SequentialDevice; + +impl SequentialDevice { + pub fn new() -> Box { + Box::new(Self) + } +} + +impl Device for SequentialDevice { + fn kind(&self) -> DeviceKind { + DeviceKind::Sequential + } + + fn name(&self) -> String { + "sequential implementation".into() + } + + fn create_buffer(&self, size: i32, access: BufferAccessKind) -> Box { + Box::new(WasmMemoryBuffer::new(size as u32, access)) + } + + fn parallelize( + &mut self, + kernel: Kernel, + num_iterations: i32, + block_size: i32, + in_buffers: Vec<&Box>, + out_buffers: Vec<&Box>, + ) -> Result<()> { + // JIT-compile and instantiate the parallel kernel. + let module = wasmtime::Module::new(kernel.engine(), kernel.module()) + .context("unable to compile kernel module")?; + let mut store = wasmtime::Store::new(kernel.engine(), ()); + let imports = vec![kernel.memory().clone().into()]; + let instance = wasmtime::Instance::new(&mut store, &module, &imports) + .context("failed to construct kernel instance")?; + + // Setup the buffer pointers. + let buffers = + as_pointer_and_length(in_buffers.into_iter().chain(out_buffers.into_iter())).unwrap(); + + let kernel_fn = instance + .get_func(&mut store, Kernel::NAME) + .expect("failed to find kernel function"); + + // Run each iteration of the parallel kernel sequentially. + for iteration_id in 0..num_iterations { + log::debug!("executing iteration {}", iteration_id); + + // Setup the parameters for the parallel execution. + let mut params = vec![ + iteration_id.into(), + num_iterations.into(), + block_size.into(), + ]; + params.extend_from_slice(&buffers); + + // Call the `kernel` function. + kernel_fn + .call(&mut store, ¶ms[..], &mut []) + .expect("failed to run kernel") + } + + Ok(()) + } +} diff --git a/crates/wasi-parallel/src/device/wasm_memory_buffer.rs b/crates/wasi-parallel/src/device/wasm_memory_buffer.rs new file mode 100644 index 000000000000..a8404191135f --- /dev/null +++ b/crates/wasi-parallel/src/device/wasm_memory_buffer.rs @@ -0,0 +1,164 @@ +//! Contains a reference to a slice of Wasm memory. +//! +use super::Buffer; +use crate::witx::types::BufferAccessKind; +use anyhow::{anyhow, bail, Context, Result}; +use std::{any::Any, convert::TryInto, vec}; +use wasmtime::Val; +use wiggle::GuestPtr; + +/// This kind of buffer is designed to live exclusively in Wasm memory--it +/// contains the information necessary for reading and writing to Wasm memory. +/// Its lifecycle involves: +/// - The buffer is created by `create_buffer`; this associates a buffer ID with +/// a device ID, but the buffer has no knowledge of what data it may contain, +/// only its length. +/// - The buffer may then be written to by `write_buffer`; at this point the +/// buffer will record the its offset within the Wasm memory. +/// - When used by `parallel_exec`, this buffer will simply pass its offset and +/// length to the parallel kernel, where it will be mutated by a Wasm +/// function. +/// - The buffer contents may "read" from one section of the Wasm memory to +/// another. +pub struct WasmMemoryBuffer { + offset: Option, + length: u32, + access: BufferAccessKind, +} + +impl WasmMemoryBuffer { + pub fn new(size: u32, access: BufferAccessKind) -> Self { + Self { + offset: None, + length: size, + access, + } + } +} + +impl Buffer for WasmMemoryBuffer { + fn len(&self) -> u32 { + self.length + } + + fn access(&self) -> BufferAccessKind { + self.access + } + + /// Does not copy data: simply checks that the lengths of the buffer and + /// guest slice match and then records the starting location of the guest + /// pointer. This will require some re-thinking once multiple memories are + /// possible (TODO). + fn write(&mut self, slice: GuestPtr<[u8]>) -> Result<()> { + if slice.len() == self.len() { + self.offset = Some(slice.offset_base()); + Ok(()) + } else { + Err(anyhow!( + "The slice to write did not match the buffer size: {} != {}", + slice.len(), + self.len(), + )) + } + } + + /// This implementation of `read` will attempt to copy the device data, held + /// in Wasm memory, to another location in Wasm memory. Currently it will + /// fail if the slices are overlapping (TODO). At some point, this should + /// also see if the `read` is from and to the same slice and avoid the copy + /// entirely (TODO). + fn read(&self, slice: GuestPtr<[u8]>) -> Result<()> { + debug_assert_eq!(slice.len(), self.len()); + let mem = slice.mem().base(); + let mem = unsafe { std::slice::from_raw_parts_mut(mem.0, mem.1 as usize) }; + copy_within_a_slice( + mem, + self.offset.unwrap() as usize, + slice.offset_base() as usize, + slice.len() as usize, + ); + Ok(()) + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} + +/// This helper copies one sub-slice to another within a mutable slice. It will +/// panic if the slices are overlapping. +fn copy_within_a_slice(v: &mut [T], from: usize, to: usize, len: usize) { + if from == to { + // Do nothing. + } else if from > to { + let (dst, src) = v.split_at_mut(from); + dst[to..to + len].clone_from_slice(&src[..len]); + } else { + let (src, dst) = v.split_at_mut(to); + dst[..len].clone_from_slice(&src[from..from + len]); + } +} + +/// Convert an iterator of [`WasmMemoryBuffer`] into their corresponding +/// WebAssembly pointer address and length. +pub fn as_pointer_and_length<'a, I>(buffers: I) -> Result> +where + I: Iterator>, +{ + let mut results = vec![]; + for b in buffers { + if let Some(b_) = b.as_any().downcast_ref::() { + let len = b + .len() + .try_into() + .context("the buffer length is too large for an i32")?; + if let Some(offset) = b_.offset { + results.push(Val::I32(offset.try_into().unwrap())); + results.push(Val::I32(len)); + } else { + bail!("the buffer has not been written to: {:?}", b); + // TODO there must be a way to set up the buffer without writing + // to it; e.g., for write buffers that only the device touches. + } + } else { + bail!("the buffer is invalid; any buffer used by the CPU should be castable to a pointer + length"); + } + } + Ok(results) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn copy_before() { + let mut buffer = [0u8; 1024]; + buffer[42] = 42; + copy_within_a_slice(&mut buffer, 42, 41, 1); + assert_eq!(buffer[41], 42); + } + + #[test] + fn copy_same_location() { + let mut buffer = [0u8; 1024]; + buffer[42] = 42; + copy_within_a_slice(&mut buffer, 42, 42, 1); + assert_eq!(buffer[42], 42); + } + + #[test] + fn copy_after() { + let mut buffer = [0u8; 1024]; + buffer[42] = 42; + copy_within_a_slice(&mut buffer, 42, 43, 1); + assert_eq!(buffer[43], 42); + } + + #[test] + #[should_panic] + fn copy_overlapping() { + let mut buffer = [0u8; 1024]; + copy_within_a_slice(&mut buffer, 42, 43, 2); + } +} diff --git a/crates/wasi-parallel/src/impl.rs b/crates/wasi-parallel/src/impl.rs new file mode 100644 index 000000000000..d41c7d72e374 --- /dev/null +++ b/crates/wasi-parallel/src/impl.rs @@ -0,0 +1,55 @@ +//! This module translates the calls from the Wasm program (running in a Wasm +//! engine) into something usable by the `wasi-parallel` context (running in the +//! Wasm host). Wiggle performs most of the conversions using `from_witx!` in +//! `witx.rs` but the special nature of `parallel_exec` (it can call back into a +//! function in the Wasm module) involves a manual implementation of this glue +//! code elsewhere. + +use crate::witx::types::{ + Buffer, BufferAccessKind, BufferData, BufferSize, DeviceKind, ParallelDevice, +}; +use crate::witx::wasi_ephemeral_parallel::WasiEphemeralParallel; +use crate::{WasiParallel, WasiParallelError}; + +impl WasiEphemeralParallel for WasiParallel { + fn get_device(&mut self, hint: DeviceKind) -> Result { + let id = self.ctx.borrow().get_device(hint)?; + Ok(ParallelDevice::from(id)) + } + + fn create_buffer( + &mut self, + device: ParallelDevice, + size: BufferSize, + kind: BufferAccessKind, + ) -> Result { + let id = self + .ctx + .borrow_mut() + .create_buffer(device.into(), size as i32, kind)?; + Ok(Buffer::from(id)) + } + + fn write_buffer<'a>( + &mut self, + data: &BufferData<'a>, + buffer: Buffer, + ) -> Result<(), super::WasiParallelError> { + let mut ctx = self.ctx.borrow_mut(); + let buffer = ctx.get_buffer_mut(buffer.into())?; + buffer.write(*data)?; + Ok(()) + } + + fn read_buffer<'a>( + &mut self, + buffer: Buffer, + data: &BufferData<'a>, + ) -> Result<(), super::WasiParallelError> { + let ctx = self.ctx.borrow_mut(); + let buffer = ctx.get_buffer(buffer.into())?; + buffer.read(*data) + } + + // Note: `parallel_exec` is manually linked in `lib.rs`. +} diff --git a/crates/wasi-parallel/src/lib.rs b/crates/wasi-parallel/src/lib.rs new file mode 100644 index 000000000000..e0c058c9cb9d --- /dev/null +++ b/crates/wasi-parallel/src/lib.rs @@ -0,0 +1,126 @@ +//! Implement [`wasi-parallel`]. +//! +//! [`wasi-parallel`]: https://github.com/WebAssembly/wasi-parallel + +mod context; +mod device; +mod r#impl; +mod witx; + +use anyhow::Result; +use context::WasiParallelContext; +use std::cell::RefCell; +use wasmtime::{Caller, Extern, SharedMemory, Trap}; +use wiggle::GuestError; + +/// This struct solely wraps a `wasi-parallel` context in a `RefCell`. +pub struct WasiParallel { + pub(crate) ctx: RefCell, +} + +impl WasiParallel { + pub fn new() -> Self { + Self { + ctx: RefCell::new(WasiParallelContext::new()), + } + } +} + +/// Define the ways wasi-parallel can fail. +pub type WasiParallelError = anyhow::Error; + +/// Re-export the Wiggle-generated `add_to_linker` function. Because +/// `WasiParallelContext` needs access to `Caller` (i.e., to retrieve the +/// `Engine`) and the wiggle infrastructure does not support this, this API call +/// is skipped during wiggle generation (see `skip` in `witx.rs`) and manually +/// implemented here. +pub fn add_to_linker( + linker: &mut wasmtime::Linker, + get_cx: impl Fn(&mut T) -> &mut WasiParallel + Send + Sync + Copy + 'static, +) -> anyhow::Result<()> { + witx::wasi_ephemeral_parallel::add_to_linker(linker, get_cx)?; + + // At one time, this code was auto-generated by + // `wiggle_generate::wasmtime::generate_func`. + linker.func_wrap( + "wasi_ephemeral_parallel", + "parallel_exec", + move |mut caller: Caller<'_, T>, + arg0: i32, + arg1: i32, + arg2: i32, + arg3: i32, + arg4: i32, + arg5: i32, + arg6: i32, + arg7: i32, + arg8: i32| + -> Result { + // A module using wasi-parallel must export a shared memory named + // 'memory'. This is what is passed to Wiggle. + let shared_memory = get_exported_shared_memory(&mut caller)?; + let mem = unsafe { + std::slice::from_raw_parts_mut( + shared_memory.data() as *mut u8, + shared_memory.data_size(), + ) + }; + let mem = wiggle::wasmtime::WasmtimeGuestMemory::new(mem); + + // Parse the arguments using wiggle. Ideally this would happen + // directly in wiggle-generated code but our need for `Caller` + // forces us to do it here. Changes from the auto-generated + // are marked with TODOs. + let device_id = arg0; + let kernel = wiggle::GuestPtr::<[u8]>::new(&mem, (arg1 as u32, arg2 as u32)); + let num_threads = arg3; // TODO: as u32 + let block_size = arg4; // TODO: as u32 + let in_buffers = wiggle::GuestPtr::<[i32]>::new(&mem, (arg5 as u32, arg6 as u32)); // TODO ...<[Buffer]> + let out_buffers = wiggle::GuestPtr::<[i32]>::new(&mem, (arg7 as u32, arg8 as u32)); // TODO: ...<[Buffer]> + + // To properly compile the function we need to use the same engine + // as the caller. + let engine = &caller.engine().clone(); + + // Call the wasi-parallel context with all the right Rust types. + let outer_ctx = get_cx(caller.data_mut()); + let mut inner_ctx = outer_ctx.ctx.borrow_mut(); + let kernel = &*kernel.as_slice().map_err(stringify_wiggle_err)?; + let in_buffers = &*in_buffers.as_slice().map_err(stringify_wiggle_err)?; + let out_buffers = &*out_buffers.as_slice().map_err(stringify_wiggle_err)?; + let result = inner_ctx.invoke_parallel_for( + device_id, + kernel, + engine, + shared_memory, + num_threads, + block_size, + in_buffers, + out_buffers, + ); + match result { + Ok(_) => Ok(0), // TODO: ::success() as i32 + Err(e) => Err(Trap::new(e.to_string())), + } + }, + )?; + + Ok(()) +} + +/// Retrieve the exported `"memory"` from the `Caller`; this is a helper +/// implementation for simplifying the `parallel_for` closure. Usually this is +/// auto-generated inline by `wiggle_generate::wasmtime::generate_func`. +pub(crate) fn get_exported_shared_memory(caller: &mut Caller) -> Result { + match caller.get_export("memory") { + Some(Extern::SharedMemory(m)) => Ok(m), + _ => Err(Trap::new("missing required shared memory export: 'memory'")), + } +} + +/// Construct a Wasmtime [`Trap`] from a Wiggle [`GuestError`]; this uses the +/// same logic as `From for wiggle::Trap` but with one less +/// conversion. +fn stringify_wiggle_err(err: GuestError) -> Trap { + Trap::new(err.to_string()) +} diff --git a/crates/wasi-parallel/src/witx.rs b/crates/wasi-parallel/src/witx.rs new file mode 100644 index 000000000000..7f7cf40692ef --- /dev/null +++ b/crates/wasi-parallel/src/witx.rs @@ -0,0 +1,33 @@ +//! Contains the macro-generated implementation of wasi-nn from its WITX +//! definition file. +use crate::{WasiParallel, WasiParallelError}; + +// Generate the traits and types of wasi-parallel to several Rust modules (e.g. +// `types`). TODO: eventually re-add Git submodule for auto-retrieval of the +// specification. +wiggle::from_witx!({ + witx: ["$WASI_ROOT/wasi-parallel.witx"], + errors: { par_errno => WasiParallelError }, + skip: ["parallel_exec"], +}); + +use types::ParErrno; + +// Additionally, we must let Wiggle know which of our error codes represents a +// successful operation. +impl wiggle::GuestErrorType for ParErrno { + fn success() -> Self { + Self::Success + } +} + +// Provide a way to map errors from the `WasiEphemeralTrait` (see `impl.rs`) to +// the WITX-defined error type. +impl wasi_ephemeral_parallel::UserErrorConversion for WasiParallel { + fn par_errno_from_wasi_parallel_error( + &mut self, + e: anyhow::Error, + ) -> Result { + todo!("must handle error: {}", e) + } +} diff --git a/crates/wasi-parallel/tests/rust/buffer.rs b/crates/wasi-parallel/tests/rust/buffer.rs new file mode 100644 index 000000000000..00459950cf07 --- /dev/null +++ b/crates/wasi-parallel/tests/rust/buffer.rs @@ -0,0 +1,27 @@ +//! This test checks that wasi-parallel's `read_buffer`/`write_buffer` work as +//! expected on a CPU. This is intended to be compiled to Wasm by `build.rs`, +//! but to run it directly: +//! +//! ``` +//! rustc tests/rust/buffer.rs --target wasm32-wasi +//! RUST_BACKTRACE=1 wasmtime run --wasi-modules=experimental-wasi-parallel ./buffer.wasm +//! ``` +#[allow(dead_code)] +mod wasi_parallel; + +use wasi_parallel::{BufferAccessKind, DeviceKind}; + +fn main() -> Result<(), wasi_parallel::ParErrno> { + let source = [0xFF; 1024]; + let destination = [0x00; 1024]; + assert!(source != destination); + + let device = wasi_parallel::get_device(DeviceKind::Cpu)?; + let source_buffer = + wasi_parallel::create_buffer(&device, source.len() as u32, BufferAccessKind::Read)?; + wasi_parallel::write_buffer(&source, &source_buffer)?; + wasi_parallel::read_buffer(&source_buffer, &destination)?; + + assert_eq!(source, destination); + Ok(()) +} diff --git a/crates/wasi-parallel/tests/rust/wasi_parallel.rs b/crates/wasi-parallel/tests/rust/wasi_parallel.rs new file mode 100644 index 000000000000..4592e816491f --- /dev/null +++ b/crates/wasi-parallel/tests/rust/wasi_parallel.rs @@ -0,0 +1,227 @@ +//! This module provides Rust bindings to the wasi-parallel API as implemented +//! in this crate. It was generated at +//! https://alexcrichton.github.io/witx-bindgen using the WITX below and altered +//! slightly since Wiggle's generated signatures are slightly different than +//! witx-bindgen's: +//! +//! ``` +//! resource Device +//! enum DeviceKind { +//! Cpu, +//! DiscreteGpu, +//! IntegratedGpu +//! } +//! resource Buffer +//! enum BufferAccessKind { +//! Read, +//! Write, +//! ReadWrite, +//! } +//! enum ParErrno { +//! Success +//! } +//! get_device: function(hint: DeviceKind) -> expected +//! create_buffer: function(device: Device, size: u32, buffer_access_kind: BufferAccessKind) -> expected +//! write_buffer: function(source: list, destination: Buffer) -> expected<_, ParErrno> +//! read_buffer: function(source: Buffer, destination: list) -> expected<_, ParErrno> +//! parallel_for: function(device: Device, kernel: list, num_iterations: u32, block_size: u32, in_buffers: list, out_buffers: list) -> expected<_, ParErrno> +//! ``` + +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum DeviceKind { + Cpu, + DiscreteGpu, + IntegratedGpu, +} + +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum BufferAccessKind { + Read, + Write, + ReadWrite, +} + +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum ParErrno { + Success, + Error(i32), +} + +#[derive(Debug)] +#[repr(transparent)] +pub struct Device(i32); +impl Device { + pub unsafe fn from_raw(raw: i32) -> Self { + Self(raw) + } + + pub fn into_raw(self) -> i32 { + let ret = self.0; + core::mem::forget(self); + return ret; + } + + pub fn as_raw(&self) -> i32 { + self.0 + } +} +// Removed `impl Drop for Device { ... }` + +#[derive(Debug)] +#[repr(transparent)] +pub struct Buffer(i32); +impl Buffer { + pub unsafe fn from_raw(raw: i32) -> Self { + Self(raw) + } + + pub fn into_raw(self) -> i32 { + let ret = self.0; + core::mem::forget(self); + return ret; + } + + pub fn as_raw(&self) -> i32 { + self.0 + } +} +// Removed `impl Drop for Buffer { ... }` + +pub fn get_device(hint: DeviceKind) -> Result { + unsafe { + let ptr0 = RET_AREA.as_mut_ptr() as i32; + #[link(wasm_import_module = "wasi_ephemeral_parallel")] + extern "C" { + #[cfg_attr(target_arch = "wasm32", link_name = "get_device")] + #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_get_device")] + fn witx_import(_: i32, _: i32) -> i32; + } + let result = witx_import(hint as i32, ptr0); + match result { + 0 => Ok(Device(*(ptr0 as *const i32))), + _ => Err(ParErrno::Error(result)), + } + } +} + +pub fn create_buffer( + device: &Device, + size: u32, + buffer_access_kind: BufferAccessKind, +) -> Result { + unsafe { + let ptr1 = RET_AREA.as_mut_ptr() as i32; + #[link(wasm_import_module = "wasi_ephemeral_parallel")] + extern "C" { + #[cfg_attr(target_arch = "wasm32", link_name = "create_buffer")] + #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_create_buffer")] + fn witx_import(_: i32, _: i32, _: i32, _: i32) -> i32; + } + let result = witx_import(device.0, size as i32, buffer_access_kind as i32, ptr1); + match result { + 0 => Ok(Buffer(*(ptr1 as *const i32))), + _ => Err(ParErrno::Error(result)), + } + } +} + +pub fn write_buffer(source: &[u8], destination: &Buffer) -> Result<(), ParErrno> { + unsafe { + let vec2 = source; + let ptr2 = vec2.as_ptr() as i32; + let len2 = vec2.len() as i32; + #[link(wasm_import_module = "wasi_ephemeral_parallel")] + extern "C" { + #[cfg_attr(target_arch = "wasm32", link_name = "write_buffer")] + #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_write_buffer")] + fn witx_import(_: i32, _: i32, _: i32) -> i32; + } + let result = witx_import(ptr2, len2, destination.0); + match result { + 0 => Ok(()), + _ => Err(ParErrno::Error(result)), + } + } +} + +pub fn read_buffer(source: &Buffer, destination: &[u8]) -> Result<(), ParErrno> { + unsafe { + let vec4 = destination; + let ptr4 = vec4.as_ptr() as i32; + let len4 = vec4.len() as i32; + #[link(wasm_import_module = "wasi_ephemeral_parallel")] + extern "C" { + #[cfg_attr(target_arch = "wasm32", link_name = "read_buffer")] + #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_read_buffer")] + fn witx_import(_: i32, _: i32, _: i32) -> i32; + } + let result = witx_import(source.0, ptr4, len4); + match result { + 0 => Ok(()), + _ => Err(ParErrno::Error(result)), + } + } +} + +pub fn parallel_for( + kernel: u32, + num_threads: u32, + block_size: u32, + in_buffers: &[&Buffer], + out_buffers: &[&Buffer], +) -> Result<(), ParErrno> { + unsafe { + let vec6 = in_buffers; + let len6 = vec6.len() as i32; + let layout6 = core::alloc::Layout::from_size_align_unchecked(vec6.len() * 4, 4); + let result6 = std::alloc::alloc(layout6); + if result6.is_null() { + std::alloc::handle_alloc_error(layout6); + } + for (i, e) in vec6.into_iter().enumerate() { + let base = result6 as i32 + (i as i32) * 4; + { + *((base + 0) as *mut i32) = e.0; + } + } + let vec7 = out_buffers; + let len7 = vec7.len() as i32; + let layout7 = core::alloc::Layout::from_size_align_unchecked(vec7.len() * 4, 4); + let result7 = std::alloc::alloc(layout7); + if result7.is_null() { + std::alloc::handle_alloc_error(layout7); + } + for (i, e) in vec7.into_iter().enumerate() { + let base = result7 as i32 + (i as i32) * 4; + { + *((base + 0) as *mut i32) = e.0; + } + } + #[link(wasm_import_module = "wasi_ephemeral_parallel")] + extern "C" { + #[cfg_attr(target_arch = "wasm32", link_name = "parallel_for")] + #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_parallel_for")] + fn witx_import(_: i32, _: i32, _: i32, _: i32, _: i32, _: i32, _: i32) -> i32; + } + let result = witx_import( + kernel as i32, + num_threads as i32, + block_size as i32, + result6 as i32, + len6, + result7 as i32, + len7, + ); + std::alloc::dealloc(result6, layout6); + std::alloc::dealloc(result7, layout7); + match result { + 0 => Ok(()), + _ => Err(ParErrno::Error(result)), + } + } +} + +static mut RET_AREA: [i64; 2] = [0; 2]; diff --git a/crates/wasi-parallel/tests/test_case.rs b/crates/wasi-parallel/tests/test_case.rs new file mode 100644 index 000000000000..d397b5f081dc --- /dev/null +++ b/crates/wasi-parallel/tests/test_case.rs @@ -0,0 +1,145 @@ +//! Simplify testing of `wasi-parallel` modules. +//! +//! Due to several factors (toolchains, APIs, etc.) the set up for running +//! `wasi-parallel` code is still quite complicated. This module helps set up +//! the required bits for testing and benchmarking. + +#![allow(dead_code)] + +use anyhow::{Context, Result}; +use wasmtime::{Config, Engine, Func, Linker, Module, SharedMemory, Store, Val}; + +/// Helper structure for setting up a wasi-parallel environment. +pub struct TestCase { + store: Store, + linker: Linker, + memory: Option, +} + +impl TestCase { + /// Compile the WebAssembly at path. + pub fn new(path: &str, engine: Engine, import_memory: Option) -> Result { + let _ = pretty_env_logger::try_init(); + + // Import the WASI definitions. + let mut store = Store::new(&engine, TestEnvironment::new()); + let mut linker = Linker::::new(&engine); + wasmtime_wasi_parallel::add_to_linker(&mut linker, |cx| &mut cx.parallel)?; + wasmtime_wasi::add_to_linker(&mut linker, |cx| &mut cx.common)?; + + // Import the shared memory, if provided. + if let Some(import_memory) = &import_memory { + linker.define("", "memory", import_memory.clone())?; + } + + // Compile the module. + let module = Module::from_file(&engine, path)?; + linker.module(&mut store, "", &module)?; + + // Gather up either the imported or exported memory for later use. + let memory = if let Some(import_memory) = import_memory { + Some(import_memory) + } else if let Some(export) = linker.get(&mut store, "", "memory") { + let export_memory = export + .into_shared_memory() + .context("expect the 'memory' export to be a shared memory")?; + Some(export_memory) + } else { + None + }; + + Ok(Self { + store, + linker, + memory, + }) + } + + /// Provide access to the store. + pub fn store(&mut self) -> &mut Store { + &mut self.store + } + + /// Conveniently return the shared memory as a slice. + pub fn memory_as_slice(&self) -> &[u8] { + let memory = self + .memory + .as_ref() + .expect("expected an imported or exported shared memory"); + unsafe { std::slice::from_raw_parts_mut(memory.data() as *mut u8, memory.data_size()) } + } + + /// Provide a convenient way to invoke a function. + pub fn invoke(&mut self, name: &str, args: &[Val]) -> Result> { + let func = self.get_function(name)?; + let num_results = func.ty(&self.store).results().len(); + let mut results = vec![Val::I32(-1); num_results]; + func.call(&mut self.store, &args, &mut results)?; + Ok(results) + } + + /// Retrieve the exported `name` function. + fn get_function(&mut self, name: &str) -> Result { + Ok(self + .linker + .get(&mut self.store, "", name) + .context("unable to find function of the given name")? + .into_func() + .context("the export was not a function")?) + } + + /// Retrieve the default entry function. + fn get_default_function(&mut self) -> Result { + self.linker.get_default(&mut self.store, "") + } +} + +pub struct TestEnvironment { + common: wasmtime_wasi::WasiCtx, + parallel: wasmtime_wasi_parallel::WasiParallel, +} + +impl TestEnvironment { + fn new() -> Self { + Self { + common: wasmtime_wasi::WasiCtxBuilder::new().inherit_stdio().build(), + parallel: wasmtime_wasi_parallel::WasiParallel::new(), + } + } +} + +/// Configure the engine to use the threads proposal (i.e., to enable shared +/// memory). +pub fn default_engine() -> Engine { + let mut config = Config::new(); + config.wasm_threads(true); + let engine = Engine::new(&config).unwrap(); + engine +} + +/// Execute the default function of a test case and return its exit code. +pub fn exec(path: &str) -> Result { + let mut test_case = TestCase::new(path, default_engine(), None).unwrap(); + let func = test_case.get_default_function().unwrap(); + let func = func.typed::<(), i32, _>(&mut test_case.store).unwrap(); + func.call(&mut test_case.store, ()) + .context("failed to execute default function") +} + +/// Helper for describing the kinds of parallel devices. +#[derive(Clone, Copy, Debug)] +pub enum Parallelism { + Cpu, + Sequential, +} + +impl Parallelism { + pub fn as_device_kind(&self) -> i32 { + // These values must be kept up-to-date with + // https://github.com/WebAssembly/wasi-parallel/blob/main/wasi-parallel.witx. + match self { + Parallelism::Sequential => 0, + Parallelism::Cpu => 1, + } + } +} diff --git a/crates/wasi-parallel/tests/wasm.rs b/crates/wasi-parallel/tests/wasm.rs new file mode 100644 index 000000000000..e8d36728f4fa --- /dev/null +++ b/crates/wasi-parallel/tests/wasm.rs @@ -0,0 +1,15 @@ +//! Run the test cases in the `wasm` directory. +//! +//! See the `build.rs` file for how these WebAssembly binaries are generated. + +mod test_case; + +#[cfg(feature = "build-tests")] +#[test] +fn run_buffer_rs() { + let mut test_case = + test_case::TestCase::new("tests/wasm/buffer.wasm", test_case::default_engine(), None) + .unwrap(); + let results = test_case.invoke("main", &[0.into(), 0.into()]).unwrap(); + assert_eq!(results[0].i32().unwrap(), 0); +} diff --git a/crates/wasi-parallel/tests/wat.rs b/crates/wasi-parallel/tests/wat.rs new file mode 100644 index 000000000000..26f798afa9bd --- /dev/null +++ b/crates/wasi-parallel/tests/wat.rs @@ -0,0 +1,151 @@ +//! Run the test cases in the `wat` directory. + +mod test_case; + +use std::convert::TryInto; +use test_case::*; +use wasmtime::{MemoryType, SharedMemory}; + +#[test] +fn run_example_wat() { + assert_eq!(exec("tests/wat/example.wat").unwrap(), 0); +} + +// Though a bit overkill, it is very convenient when troubleshooting to be able +// to check that we can successfully run a single iteration of a kernel as well. +#[test] +fn run_example_kernel_wat() { + // Setup the imported memory. + let engine = default_engine(); + let memory = SharedMemory::new(&engine, MemoryType::shared(1, 1)).unwrap(); + + // Execute the kernel once (mimic running the 42nd iteration). + let mut test_case = + TestCase::new("tests/wat/example-kernel.wat", engine, Some(memory)).unwrap(); + let _ = test_case + .invoke("kernel", &[42.into(), 0.into(), 0.into()]) + .unwrap(); + + // Check that the first four bytes were stored correctly. + let first_4_bytes = &test_case.memory_as_slice()[0..4]; + let first_i32 = i32::from_le_bytes(first_4_bytes.try_into().unwrap()); + assert_eq!(first_i32, 43); +} + +// Here we check that we can actually run the nstream benchmark. This is also +// available using `cargo bench`, but this checks different parameters. +#[test] +fn run_nstream_wat() { + const NUM_THREADS: i32 = 24; + const NUM_ITEMS: i32 = 200000; + const DEVICE_KIND: i32 = 0x1; // CPU + + // Setup the benchmark. + let mut test_case = TestCase::new("tests/wat/nstream.wat", default_engine(), None).unwrap(); + let _ = test_case + .invoke( + "setup", + &[NUM_THREADS.into(), NUM_ITEMS.into(), DEVICE_KIND.into()], + ) + .unwrap(); + + // Execute the benchmark. + let _ = test_case.invoke("execute", &[]).unwrap(); + + // Run the finalize step. + let results = test_case.invoke("finish", &[]).unwrap(); + assert_eq!(results[0].i32().unwrap(), 0); +} + +// For troubleshooting, we to check that we can successfully run a single +// iteration of the nstream kernel as well. +#[test] +fn run_nstream_kernel_wat() { + // Set up the shared memory; nstream operates on memory buffers with + // specific floating point values that we initialize manually here. + const NUM_ITERATIONS: usize = 10; + const BLOCK_SIZE: usize = 10; + const ITEM_SIZE: usize = std::mem::size_of::(); + const BUFFER_SIZE: usize = NUM_ITERATIONS * BLOCK_SIZE * ITEM_SIZE; + let mut memory_image = [0u8; BUFFER_SIZE * 3]; + for i in 0..NUM_ITERATIONS * BLOCK_SIZE { + let a_i = i * ITEM_SIZE; + memory_image[a_i..a_i + ITEM_SIZE].copy_from_slice(&0.0f32.to_le_bytes()); + let b_i = BUFFER_SIZE + i * ITEM_SIZE; + memory_image[b_i..b_i + ITEM_SIZE].copy_from_slice(&2.0f32.to_le_bytes()); + let c_i = BUFFER_SIZE + BUFFER_SIZE + i * ITEM_SIZE; + memory_image[c_i..c_i + ITEM_SIZE].copy_from_slice(&2.0f32.to_le_bytes()); + } + + // Copy to the imported shared memory. + let engine = default_engine(); + let memory = SharedMemory::new(&engine, MemoryType::shared(0x800, 0x800)).unwrap(); + assert!(memory.data_size() > memory_image.len()); + unsafe { + std::ptr::copy( + memory_image.as_ptr(), + memory.data() as *mut u8, + memory_image.len(), + ); + } + + // Execute the kernel once (mimic running only the second iteration). + let mut test_case = + TestCase::new("tests/wat/nstream-kernel.wat", engine, Some(memory)).unwrap(); + let params = vec![ + // Second iteration (0-based). + 1.into(), + (NUM_ITERATIONS as i32).into(), + (BLOCK_SIZE as i32).into(), + // Buffer A. + 0.into(), + (BUFFER_SIZE as i32).into(), + // Buffer B. + (BUFFER_SIZE as i32).into(), + (BUFFER_SIZE as i32).into(), + // Buffer C. + (BUFFER_SIZE as i32 * 2).into(), + (BUFFER_SIZE as i32).into(), + ]; + let _ = test_case.invoke("kernel", ¶ms).unwrap(); + + // Check that the second iteration of the A buffer is filled in correctly. + let memory_as_f32s = unsafe { + std::slice::from_raw_parts( + test_case.memory_as_slice().as_ptr() as *const _ as *const f32, + NUM_ITERATIONS * BLOCK_SIZE, + ) + }; + assert_eq!(memory_as_f32s.len() * ITEM_SIZE, BUFFER_SIZE); + let iteration_memory = &memory_as_f32s[BLOCK_SIZE..BLOCK_SIZE + BLOCK_SIZE]; + assert!(iteration_memory.iter().all(|f| f == &8.0)); +} + +// Here we check that we can actually run the nstream benchmark. This is also +// available using `cargo bench`, but this checks different parameters. +#[test] +fn run_sum_wat() { + const NUM_THREADS: i32 = 8; + const ADDITIONS_PER_THREAD: i32 = 0x200_000; + const DEVICE_KIND: i32 = 0x1; // CPU + + // Setup the benchmark. + let mut test_case = TestCase::new("tests/wat/sum.wat", default_engine(), None).unwrap(); + let _ = test_case + .invoke( + "setup", + &[ + NUM_THREADS.into(), + ADDITIONS_PER_THREAD.into(), + DEVICE_KIND.into(), + ], + ) + .unwrap(); + + // Execute the benchmark. + let _ = test_case.invoke("execute", &[]).unwrap(); + + // Run the finalize step. + let results = test_case.invoke("finish", &[]).unwrap(); + assert_eq!(results[0].i32().unwrap(), 0); +} diff --git a/crates/wasi-parallel/tests/wat/example-kernel.wat b/crates/wasi-parallel/tests/wat/example-kernel.wat new file mode 100644 index 000000000000..00502c12c27e --- /dev/null +++ b/crates/wasi-parallel/tests/wat/example-kernel.wat @@ -0,0 +1,10 @@ +(module + (memory (import "" "memory") 1 1 shared) + (func $kernel (export "kernel") (param $iteration_id i32) (param $num_iterations i32) (param $block_size i32) + (i32.store + ;; Address of shared memory to store at--no buffer used. + (i32.const 0) + ;; Increment the iteration ID to avoid 0 and store this. + (i32.add (local.get $iteration_id) (i32.const 1)) + )) +) diff --git a/crates/wasi-parallel/tests/wat/example.wat b/crates/wasi-parallel/tests/wat/example.wat new file mode 100644 index 000000000000..310b2fb618dd --- /dev/null +++ b/crates/wasi-parallel/tests/wat/example.wat @@ -0,0 +1,56 @@ +;; A minimal example of wasi-parallel execution--each thread of execution writes +;; to the same location in memory. This makes use of the implicit detail in the +;; current implementation that allows a CPU-run kernel to modify the module's +;; memory. +;; +;; Note how both the kernel code is accessible in linear memory. Without +;; read-only access support in WebAssembly, the kernel could be overwritten +;; (accidentally or maliciously), which is a risk in this "bag-of-bytes" +;; paradigm. + +(module + (import "wasi_ephemeral_parallel" "get_device" (func $get_device + (param $hint i32) + (param $out_device i32) + (result i32))) + (import "wasi_ephemeral_parallel" "parallel_exec" (func $par_exec + (param $device i32) + (param $kernel_start i32) + (param $kernel_len i32) + (param $num_iterations i32) + (param $block_size i32) + (param $in_buffers_start i32) + (param $in_buffers_len i32) + (param $out_buffers_start i32) + (param $out_buffers_len i32) + (result i32))) + + ;; The kernel here is the binary-encoded version of `example-kernel.wat`, + ;; using: + ;; + ;; $ wat2wasm tests/wat/example-kernel.wat --enable-threads --output=- | xxd -g 1 -p | sed -r 's/.{2}/\\&/g' | tr -d '\n' + ;; + ;; The length is calculated using `wc -c` and dividing by 3. + (memory (export "memory") 1 1 shared) + ;; Reserve 8 bytes for the return area, then emit the kernel: + (data (i32.const 8) "\00\61\73\6d\01\00\00\00\01\07\01\60\03\7f\7f\7f\00\02\0d\01\00\06\6d\65\6d\6f\72\79\02\03\01\01\03\02\01\00\07\0a\01\06\6b\65\72\6e\65\6c\00\00\0a\0e\01\0c\00\41\00\20\00\41\01\6a\36\02\00\0b") + + (func (export "_start") (result i32) + (local $return_area i32) + (local $device i32) + (local.set $return_area (i32.const 0)) + + ;; Set up a CPU device. + (drop (call $get_device (i32.const 0x01) (local.get $return_area))) + (local.set $device (i32.load (local.get $return_area))) + + ;; Execute the kernel in parallel. + (call $par_exec (local.get $device) (i32.const 8) (i32.const 64) (i32.const 12) (i32.const 4) + ;; Empty buffers: + (i32.const 0) (i32.const 0) (i32.const 0) (i32.const 0)) + + ;; Check that the parallel execution returned 0 (success) and that the + ;; memory was updated by an invocation of the kernel--if so, return 0. + (i32.eq (i32.load (i32.const 0)) (i32.const 0)) + (i32.or)) +) diff --git a/crates/wasi-parallel/tests/wat/nstream-kernel.wat b/crates/wasi-parallel/tests/wat/nstream-kernel.wat new file mode 100644 index 000000000000..0e415cbfcd4a --- /dev/null +++ b/crates/wasi-parallel/tests/wat/nstream-kernel.wat @@ -0,0 +1,47 @@ +;; This hand-coded implementation of the nstream splits the work +;; dynamically--i.e., in the kernel itself. See, e.g., +;; https://github.com/ParRes/Kernels/blob/default/Cxx11/nstream-tbb.cc#L132 for +;; a higher-level implementation. + +(module + (memory (import "" "memory") 0x800 0x800 shared) + (func $kernel (export "kernel") (param $iteration_id i32) (param $num_iterations i32) (param $block_size i32) (param $A i32) (param $A_len i32) (param $B i32) (param $B_len i32) (param $C i32) (param $C_len i32) + (local $A_i i32) + (local $B_i i32) + (local $C_i i32) + (local $i i32) + (local $end i32) + + ;; The division of the buffers between iterations happens here: the + ;; block size defines how many floating-point numbers each iteration + ;; will touch: + ;; i = iteration_id * block_size * 4; + ;; end = i + block_size * 4; + (local.set $i (i32.mul (i32.mul (local.get $iteration_id) (local.get $block_size)) (i32.const 4))) + (local.set $end (i32.add (local.get $i) (i32.mul (local.get $block_size) (i32.const 4)))) + + (block + (loop + (local.set $A_i (i32.add (local.get $A) (local.get $i))) + (local.set $B_i (i32.add (local.get $B) (local.get $i))) + (local.set $C_i (i32.add (local.get $C) (local.get $i))) + + ;; Offset to store: A[i] = ... + (local.get $A_i) + ;; Value to store: ... = A[i] + B[i] + 3.0 * C[i]; + (f32.add + (f32.load (local.get $A_i)) + (f32.add + (f32.load (local.get $B_i)) + (f32.mul + (f32.const 3.0) + (f32.load (local.get $C_i))))) + (f32.store) + + ;; Loop control--exit once we have looped through this + ;; iteration's portion of the buffer. + (local.set $i (i32.add (local.get $i) (i32.const 4))) + (i32.ge_s (local.get $i) (local.get $end)) + (br_if 1) + (br 0)))) +) diff --git a/crates/wasi-parallel/tests/wat/nstream.wat b/crates/wasi-parallel/tests/wat/nstream.wat new file mode 100644 index 000000000000..c7ea29c165c3 --- /dev/null +++ b/crates/wasi-parallel/tests/wat/nstream.wat @@ -0,0 +1,175 @@ +;; This hand-coded implementation of `nstream` uses wasi-parallel to split the +;; `nstream` work--some light arithmetic with heavy memory access--across the +;; parallelism available to `wasi-parallel`. +;; +;; See, e.g., +;; https://github.com/ParRes/Kernels/blob/default/Cxx11/nstream-tbb.cc#L132 for +;; a higher-level implementation. + +(module + (import "wasi_ephemeral_parallel" "get_device" (func $get_device + (param $hint i32) + (param $out_device i32) + (result i32))) + (import "wasi_ephemeral_parallel" "create_buffer" (func $create_buffer + (param $device i32) + (param $size i32) + (param $access i32) + (param $out_buffer i32) + (result i32))) + (import "wasi_ephemeral_parallel" "write_buffer" (func $write_buffer + (param $data_offset i32) + (param $data_len i32) + (param $buffer i32) + (result i32))) + (import "wasi_ephemeral_parallel" "read_buffer" (func $read_buffer + (param $buffer i32) + (param $data_offset i32) + (param $data_len i32) + (result i32))) + (import "wasi_ephemeral_parallel" "parallel_exec" (func $par_exec + (param $device i32) + (param $kernel_start i32) + (param $kernel_len i32) + (param $num_iterations i32) + (param $block_size i32) + (param $in_buffers_start i32) + (param $in_buffers_len i32) + (param $out_buffers_start i32) + (param $out_buffers_len i32) + (result i32))) + + ;; The kernel here is the binary-encoded version of `nstream-kernel.wat`, + ;; using: + ;; + ;; $ wat2wasm tests/wat/nstream-kernel.wat --enable-threads --output=- | xxd -g 1 -p | sed -r 's/.{2}/\\&/g' | tr -d '\n' + ;; + ;; The length is calculated using `wc -c` and dividing by 3. + (memory (export "memory") 0x800 0x800 shared) + ;; Reserve 12 bytes for the return area and buffer lists, then emit the + ;; kernel: + (data (i32.const 12) "\00\61\73\6d\01\00\00\00\01\0d\01\60\09\7f\7f\7f\7f\7f\7f\7f\7f\7f\00\02\0f\01\00\06\6d\65\6d\6f\72\79\02\03\80\10\80\10\03\02\01\00\07\0a\01\06\6b\65\72\6e\65\6c\00\00\0a\61\01\5f\01\05\7f\20\00\20\02\6c\41\04\6c\21\0c\20\0c\20\02\41\04\6c\6a\21\0d\02\40\03\40\20\03\20\0c\6a\21\09\20\05\20\0c\6a\21\0a\20\07\20\0c\6a\21\0b\20\09\20\09\2a\02\00\20\0a\2a\02\00\43\00\00\40\40\20\0b\2a\02\00\94\92\92\38\02\00\20\0c\41\04\6a\21\0c\20\0c\20\0d\4e\0d\01\0c\00\0b\0b\0b") + + ;; Global values overwritten by `setup`. + (global $num_threads (mut i32) (i32.const -1)) + (global $block_size (mut i32) (i32.const -1)) + (global $buffer_size (mut i32) (i32.const -1)) + (global $device (mut i32) (i32.const -1)) + (global $A (mut i32) (i32.const -1)) + (global $B (mut i32) (i32.const -1)) + (global $C (mut i32) (i32.const -1)) + + + (func (export "setup") (param $num_threads i32) (param $num_items i32) (param $device_kind i32) + (local $return_area i32) + (local $device i32) + (local $len i32) + (local $memA i32) + (local $memB i32) + (local $memC i32) + (local $A i32) + (local $B i32) + (local $C i32) + + ;; Save some setup parameters for later. + (global.set $num_threads (local.get $num_threads)) + (global.set $block_size (i32.add + (i32.div_u (local.get $num_items) (local.get $num_threads)) + ;; Over-estimate the block size by 1 if there is a remainder to + ;; account for. + (if (result i32) (i32.rem_u (local.get $num_items) (local.get $num_threads)) + (then (i32.const 1)) + (else (i32.const 0)) + ) + )) + (global.set $buffer_size (i32.mul + (i32.mul (global.get $block_size) (i32.const 4)) + (global.get $num_threads) + )) + + ;; Assign some pointers, skipping the first section of memory because + ;; it contains the return area, the kernel bytes, etc. + (local.set $return_area (i32.const 0x00)) + (local.set $memA (global.get $buffer_size)) + (local.set $memB (i32.mul (global.get $buffer_size) (i32.const 2))) + (local.set $memC (i32.mul (global.get $buffer_size) (i32.const 3))) + + ;; Set up the device. + (drop (call $get_device (local.get $device_kind) (local.get $return_area))) + (global.set $device (i32.load (local.get $return_area))) + + ;; Create the buffers. Note that `0x00 = read` and `0x01 = read-write`. + (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x01) (local.get $return_area))) + (global.set $A (i32.load (local.get $return_area))) + (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x00) (local.get $return_area))) + (global.set $B (i32.load (local.get $return_area))) + (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x00) (local.get $return_area))) + (global.set $C (i32.load (local.get $return_area))) + + ;; Fill the buffers with the correct values. + (call $initialize (local.get $memA) (global.get $buffer_size) (f32.const 0)) + (call $initialize (local.get $memB) (global.get $buffer_size) (f32.const 2)) + (call $initialize (local.get $memC) (global.get $buffer_size) (f32.const 2)) + + ;; Assign the buffers their contents. + (drop (call $write_buffer (local.get $memA) (global.get $buffer_size) (global.get $A))) + (drop (call $write_buffer (local.get $memB) (global.get $buffer_size) (global.get $B))) + (drop (call $write_buffer (local.get $memC) (global.get $buffer_size) (global.get $C)))) + + (func (export "execute") + ;; Set up the list of buffers. + (i32.store (i32.const 0) (global.get $A)) + (i32.store (i32.const 4) (global.get $B)) + (i32.store (i32.const 8) (global.get $C)) + + ;; Execute the kernel in parallel. + (call $par_exec (global.get $device) + ;; Kernel bytes. + (i32.const 12) (i32.const 155) + ;; Number of iterations and block size + (global.get $num_threads) (global.get $block_size) + ;; Input buffers. + (i32.const 0) (i32.const 3) + ;; Output buffers. + (i32.const 0) (i32.const 0)) + + drop) + + (func (export "finish") (result i32) + (local $memA i32) + (local.set $memA (global.get $buffer_size)) + ;; Assert that all values in A equal 8.0. + (call $check (local.get $memA) (global.get $buffer_size) (f32.const 8.0)) + ) + + ;; Helper function to (inefficiently) initialize a block of memory. + (func $initialize (param $offset i32) (param $len i32) (param $value f32) + (block + (loop + (local.set $len (i32.sub (local.get $len) (i32.const 4))) + (f32.store (i32.add (local.get $offset) (local.get $len)) (local.get $value)) + (i32.le_s (local.get $len) (i32.const 0)) + (br_if 1) + (br 0) + ) + ) + ) + + ;; Helper function to check that an entire memory region matches the value. + (func $check (param $offset i32) (param $len i32) (param $value f32) (result i32) + (loop $cont + (local.set $len (i32.sub (local.get $len) (i32.const 4))) + ;; If the loaded value does not match, early return with a `1` + ;; code. + (i32.const 1) + (br_if 1 (f32.ne + (local.get $value) + (f32.load (i32.add (local.get $offset) (local.get $len))))) + (drop) + ;; Continue iterating until we reach the end, exiting with a `0` + ;; success code. + (br_if $cont (i32.gt_s (local.get $len) (i32.const 0))) + ) + (i32.const 0) + ) +) diff --git a/crates/wasi-parallel/tests/wat/sum-kernel.wat b/crates/wasi-parallel/tests/wat/sum-kernel.wat new file mode 100644 index 000000000000..b05e7c38bf05 --- /dev/null +++ b/crates/wasi-parallel/tests/wat/sum-kernel.wat @@ -0,0 +1,22 @@ +;; This trivial kernel increments a local until it matches the block size, +;; then it stores this value in a buffer to be aggregated later. + +(module + (memory (import "" "memory") 1 1 shared) + (func $kernel (export "kernel") (param $iteration_id i32) (param $num_iterations i32) (param $block_size i32) (param $A i32) (param $A_len i32) + (local $i i64) + (local $end i64) + (local.set $i (i64.const 0)) + (local.set $end (i64.extend_i32_u (local.get $block_size))) + (loop $cont + (local.set $i (i64.add (local.get $i) (i64.const 1))) + (br_if $cont (i64.lt_u (local.get $i) (local.get $end))) + ) + (i64.store + ;; Address to store at. + (i32.add (local.get $A) (i32.mul (local.get $iteration_id) (i32.const 8))) + ;; The summed value. + (local.get $i) + ) + ) +) diff --git a/crates/wasi-parallel/tests/wat/sum.wat b/crates/wasi-parallel/tests/wat/sum.wat new file mode 100644 index 000000000000..f3831ae2cab3 --- /dev/null +++ b/crates/wasi-parallel/tests/wat/sum.wat @@ -0,0 +1,129 @@ +;; This hand-coded implementation of nstream splits the work--a 32MB buffer (4 * +;; LLC)--evenly among 4 cores and uses wasi-parallel to distribute the work. +;; See, e.g., +;; https://github.com/ParRes/Kernels/blob/default/Cxx11/nstream-tbb.cc#L132 for +;; a higher-level implementation. + +(module + (import "wasi_ephemeral_parallel" "get_device" (func $get_device + (param $hint i32) + (param $out_device i32) + (result i32))) + (import "wasi_ephemeral_parallel" "create_buffer" (func $create_buffer + (param $device i32) + (param $size i32) + (param $access i32) + (param $out_buffer i32) + (result i32))) + (import "wasi_ephemeral_parallel" "write_buffer" (func $write_buffer + (param $data_offset i32) + (param $data_len i32) + (param $buffer i32) + (result i32))) + (import "wasi_ephemeral_parallel" "read_buffer" (func $read_buffer + (param $buffer i32) + (param $data_offset i32) + (param $data_len i32) + (result i32))) + (import "wasi_ephemeral_parallel" "parallel_exec" (func $par_exec + (param $device i32) + (param $kernel_start i32) + (param $kernel_len i32) + (param $num_iterations i32) + (param $block_size i32) + (param $in_buffers_start i32) + (param $in_buffers_len i32) + (param $out_buffers_start i32) + (param $out_buffers_len i32) + (result i32))) + + ;; The kernel here is the binary-encoded version of `nstream-kernel.wat`, + ;; using: + ;; + ;; $ wat2wasm tests/wat/sum-kernel.wat --enable-threads --output=- | xxd -g 1 -p | sed -r 's/.{2}/\\&/g' | tr -d '\n' + ;; + ;; The length is calculated using `wc -c` and dividing by 3. + (memory (export "memory") 1 1 shared) + ;; Reserve 8 bytes for the return area and buffer lists, then emit the + ;; kernel: + (data (i32.const 8) "\00\61\73\6d\01\00\00\00\01\09\01\60\05\7f\7f\7f\7f\7f\00\02\0d\01\00\06\6d\65\6d\6f\72\79\02\03\01\01\03\02\01\00\07\0a\01\06\6b\65\72\6e\65\6c\00\00\0a\2d\01\2b\01\02\7e\42\00\21\05\20\02\ad\21\06\03\40\20\05\42\01\7c\21\05\20\05\20\06\54\0d\00\0b\20\03\20\00\41\08\6c\6a\20\05\37\03\00\0b") + + ;; Global values overwritten by `setup`. + (global $num_threads (mut i32) (i32.const 4)) + (global $block_size (mut i32) (i32.const 0x2000000)) + (global $buffer_size (mut i32) (i32.const 0x2000000)) + (global $device (mut i32) (i32.const -1)) + (global $A (mut i32) (i32.const -1)) + (global $memA i32 (i32.const 0x1000)) + + (func (export "setup") (param $num_threads i32) (param $block_size i32) (param $device_kind i32) + (local $return_area i32) + ;; Assign the return area pointer. + (local.set $return_area (i32.const 0x00)) + + ;; Save some setup parameters for later. + (global.set $num_threads (local.get $num_threads)) + (global.set $buffer_size (i32.mul (local.get $num_threads) (i32.const 8))) + (global.set $block_size (local.get $block_size)) + + ;; Set up the device. + (drop (call $get_device (local.get $device_kind) (local.get $return_area))) + (global.set $device (i32.load (local.get $return_area))) + + ;; Create a buffer to store the intermediate results. Note that `0x01 = read-write`. + (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x01) (local.get $return_area))) + (global.set $A (i32.load (local.get $return_area))) + + ;; Assign the buffer its (empty) contents. + (drop (call $write_buffer (global.get $memA) (global.get $buffer_size) (global.get $A))) + ) + + (func (export "execute") + (local $sum i64) + (local $i i32) + + ;; Set up the list of buffers. + (i32.store (i32.const 0) (global.get $A)) + + ;; Execute the kernel in parallel. + (call $par_exec (global.get $device) + ;; Kernel bytes. + (i32.const 8) (i32.const 97) + ;; Number of iterations and block size + (global.get $num_threads) (global.get $block_size) + ;; Input buffers. + (i32.const 0) (i32.const 1) + ;; Output buffers. + (i32.const 0) (i32.const 0)) + (drop) + + ;; Read the buffer contents. + (drop (call $read_buffer (global.get $A) (global.get $memA) (global.get $buffer_size))) + + ;; Calculate the sum of each iteration's work and store it at address 0. + (local.set $i (i32.const 0)) + (loop $cont + (local.set $i (i32.add (local.get $i) (i32.const 1))) + (local.set $sum (i64.add + (local.get $sum) + (i64.load (i32.add + (global.get $memA) + (i32.mul (local.get $i) (i32.const 8)) + )) + )) + (br_if $cont (i32.lt_u (local.get $i) (global.get $num_threads))) + ) + (i64.store (i32.const 0) (local.get $sum)) + ) + + (func (export "finish") (result i32) + ;; Assert that the aggregate sum is what is expected. + (i64.eq + (i64.load (i32.const 0)) + (i64.mul + (i64.extend_i32_u (global.get $num_threads)) + (i64.extend_i32_u (global.get $block_size)) + ) + ) + ) +) diff --git a/crates/wiggle/generate/src/codegen_settings.rs b/crates/wiggle/generate/src/codegen_settings.rs index 144d2be282b6..637b5264ae8d 100644 --- a/crates/wiggle/generate/src/codegen_settings.rs +++ b/crates/wiggle/generate/src/codegen_settings.rs @@ -1,4 +1,4 @@ -use crate::config::{AsyncConf, ErrorConf}; +use crate::config::{AsyncConf, ErrorConf, SkipNames}; use anyhow::{anyhow, Error}; use proc_macro2::TokenStream; use quote::quote; @@ -12,6 +12,7 @@ pub struct CodegenSettings { pub errors: ErrorTransform, pub async_: AsyncConf, pub wasmtime: bool, + pub skip_names: SkipNames, } impl CodegenSettings { pub fn new( @@ -19,12 +20,14 @@ impl CodegenSettings { async_: &AsyncConf, doc: &Document, wasmtime: bool, + skip_names: &SkipNames, ) -> Result { let errors = ErrorTransform::new(error_conf, doc)?; Ok(Self { errors, async_: async_.clone(), wasmtime, + skip_names: skip_names.clone(), }) } pub fn get_async(&self, module: &Module, func: &InterfaceFunc) -> Asyncness { diff --git a/crates/wiggle/generate/src/config.rs b/crates/wiggle/generate/src/config.rs index 3d164d70f1d9..81c25c44bf06 100644 --- a/crates/wiggle/generate/src/config.rs +++ b/crates/wiggle/generate/src/config.rs @@ -15,6 +15,7 @@ pub struct Config { pub errors: ErrorConf, pub async_: AsyncConf, pub wasmtime: bool, + pub skip: SkipNames, } mod kw { @@ -22,6 +23,7 @@ mod kw { syn::custom_keyword!(witx_literal); syn::custom_keyword!(block_on); syn::custom_keyword!(errors); + syn::custom_keyword!(skip); syn::custom_keyword!(target); syn::custom_keyword!(wasmtime); } @@ -32,6 +34,7 @@ pub enum ConfigField { Error(ErrorConf), Async(AsyncConf), Wasmtime(bool), + Skip(SkipNames), } impl Parse for ConfigField { @@ -67,6 +70,10 @@ impl Parse for ConfigField { input.parse::()?; input.parse::()?; Ok(ConfigField::Wasmtime(input.parse::()?.value)) + } else if lookahead.peek(kw::skip) { + input.parse::()?; + input.parse::()?; + Ok(ConfigField::Skip(input.parse()?)) } else { Err(lookahead.error()) } @@ -79,6 +86,7 @@ impl Config { let mut errors = None; let mut async_ = None; let mut wasmtime = None; + let mut skip = None; for f in fields { match f { ConfigField::Witx(c) => { @@ -105,6 +113,12 @@ impl Config { } wasmtime = Some(c); } + ConfigField::Skip(c) => { + if skip.is_some() { + return Err(Error::new(err_loc, "duplicate `skip` field")); + } + skip = Some(c); + } } } Ok(Config { @@ -114,6 +128,7 @@ impl Config { errors: errors.take().unwrap_or_default(), async_: async_.take().unwrap_or_default(), wasmtime: wasmtime.unwrap_or(true), + skip: skip.unwrap_or_default(), }) } @@ -550,3 +565,25 @@ impl Parse for WasmtimeConfigField { } } } + +#[derive(Debug, Default, Clone)] +pub struct SkipNames(Vec); +impl Parse for SkipNames { + fn parse(input: ParseStream) -> Result { + let content; + let _ = bracketed!(content in input); + let name_literals: Punctuated = + content.parse_terminated(Parse::parse)?; + let names = name_literals + .iter() + .map(LitStr::value) + .collect::>(); + + Ok(SkipNames(names)) + } +} +impl SkipNames { + pub fn should_skip(&self, func: &witx::InterfaceFunc) -> bool { + self.0.iter().any(|n| n.as_str() == func.name) + } +} diff --git a/crates/wiggle/generate/src/lib.rs b/crates/wiggle/generate/src/lib.rs index 47e82f75870b..de1766f72926 100644 --- a/crates/wiggle/generate/src/lib.rs +++ b/crates/wiggle/generate/src/lib.rs @@ -54,6 +54,7 @@ pub fn generate(doc: &witx::Document, names: &Names, settings: &CodegenSettings) let modname = names.module(&module.name); let fs = module .funcs() + .filter(|f| !settings.skip_names.should_skip(f)) .map(|f| define_func(&names, &module, &f, &settings)); let modtrait = define_module_trait(&names, &module, &settings); let wasmtime = if settings.wasmtime { diff --git a/crates/wiggle/generate/src/module_trait.rs b/crates/wiggle/generate/src/module_trait.rs index 2e108f56f097..c3c4f4918f69 100644 --- a/crates/wiggle/generate/src/module_trait.rs +++ b/crates/wiggle/generate/src/module_trait.rs @@ -18,75 +18,78 @@ pub fn passed_by_reference(ty: &witx::Type) -> bool { pub fn define_module_trait(names: &Names, m: &Module, settings: &CodegenSettings) -> TokenStream { let traitname = names.trait_name(&m.name); let rt = names.runtime_mod(); - let traitmethods = m.funcs().map(|f| { - // Check if we're returning an entity anotated with a lifetime, - // in which case, we'll need to annotate the function itself, and - // hence will need an explicit lifetime (rather than anonymous) - let (lifetime, is_anonymous) = if f - .params - .iter() - .chain(&f.results) - .any(|ret| ret.tref.needs_lifetime()) - { - (quote!('a), false) - } else { - (anon_lifetime(), true) - }; - let funcname = names.func(&f.name); - let args = f.params.iter().map(|arg| { - let arg_name = names.func_param(&arg.name); - let arg_typename = names.type_ref(&arg.tref, lifetime.clone()); - let arg_type = if passed_by_reference(&*arg.tref.type_()) { - quote!(&#arg_typename) + let traitmethods = m + .funcs() + .filter(|f| !settings.skip_names.should_skip(f)) + .map(|f| { + // Check if we're returning an entity anotated with a lifetime, + // in which case, we'll need to annotate the function itself, and + // hence will need an explicit lifetime (rather than anonymous) + let (lifetime, is_anonymous) = if f + .params + .iter() + .chain(&f.results) + .any(|ret| ret.tref.needs_lifetime()) + { + (quote!('a), false) } else { - quote!(#arg_typename) + (anon_lifetime(), true) }; - quote!(#arg_name: #arg_type) - }); - - let result = match f.results.len() { - 0 if f.noreturn => quote!(#rt::Trap), - 0 => quote!(()), - 1 => { - let (ok, err) = match &**f.results[0].tref.type_() { - witx::Type::Variant(v) => match v.as_expected() { - Some(p) => p, - None => unimplemented!("anonymous variant ref {:?}", v), - }, - _ => unimplemented!(), + let funcname = names.func(&f.name); + let args = f.params.iter().map(|arg| { + let arg_name = names.func_param(&arg.name); + let arg_typename = names.type_ref(&arg.tref, lifetime.clone()); + let arg_type = if passed_by_reference(&*arg.tref.type_()) { + quote!(&#arg_typename) + } else { + quote!(#arg_typename) }; + quote!(#arg_name: #arg_type) + }); - let ok = match ok { - Some(ty) => names.type_ref(ty, lifetime.clone()), - None => quote!(()), - }; - let err = match err { - Some(ty) => match settings.errors.for_abi_error(ty) { - Some(custom) => { - let tn = custom.typename(); - quote!(super::#tn) - } - None => names.type_ref(ty, lifetime.clone()), - }, - None => quote!(()), - }; - quote!(Result<#ok, #err>) - } - _ => unimplemented!(), - }; + let result = match f.results.len() { + 0 if f.noreturn => quote!(#rt::Trap), + 0 => quote!(()), + 1 => { + let (ok, err) = match &**f.results[0].tref.type_() { + witx::Type::Variant(v) => match v.as_expected() { + Some(p) => p, + None => unimplemented!("anonymous variant ref {:?}", v), + }, + _ => unimplemented!(), + }; - let asyncness = if settings.get_async(&m, &f).is_sync() { - quote!() - } else { - quote!(async) - }; + let ok = match ok { + Some(ty) => names.type_ref(ty, lifetime.clone()), + None => quote!(()), + }; + let err = match err { + Some(ty) => match settings.errors.for_abi_error(ty) { + Some(custom) => { + let tn = custom.typename(); + quote!(super::#tn) + } + None => names.type_ref(ty, lifetime.clone()), + }, + None => quote!(()), + }; + quote!(Result<#ok, #err>) + } + _ => unimplemented!(), + }; - if is_anonymous { - quote!(#asyncness fn #funcname(&mut self, #(#args),*) -> #result; ) - } else { - quote!(#asyncness fn #funcname<#lifetime>(&mut self, #(#args),*) -> #result;) - } - }); + let asyncness = if settings.get_async(&m, &f).is_sync() { + quote!() + } else { + quote!(async) + }; + + if is_anonymous { + quote!(#asyncness fn #funcname(&mut self, #(#args),*) -> #result; ) + } else { + quote!(#asyncness fn #funcname<#lifetime>(&mut self, #(#args),*) -> #result;) + } + }); quote! { #[#rt::async_trait] diff --git a/crates/wiggle/generate/src/wasmtime.rs b/crates/wiggle/generate/src/wasmtime.rs index 8f89bc06f082..f3a3cdb01da1 100644 --- a/crates/wiggle/generate/src/wasmtime.rs +++ b/crates/wiggle/generate/src/wasmtime.rs @@ -21,7 +21,10 @@ pub fn link_module( let mut bodies = Vec::new(); let mut bounds = HashSet::new(); - for f in module.funcs() { + for f in module + .funcs() + .filter(|f| !settings.skip_names.should_skip(f)) + { let asyncness = settings.async_.get(module.name.as_str(), f.name.as_str()); bodies.push(generate_func(&module, &f, names, target_path, asyncness)); let bound = func_bounds(names, module, &f, settings); @@ -112,14 +115,21 @@ fn generate_func( }; let body = quote! { - let mem = match caller.get_export("memory") { - Some(#rt::wasmtime_crate::Extern::Memory(m)) => m, + let (mem, ctx) = match caller.get_export("memory") { + Some(#rt::wasmtime_crate::Extern::Memory(m)) => { + let (mem, ctx) = m.data_and_store_mut(&mut caller); + let ctx = get_cx(ctx); + (mem, ctx) + } + Some(#rt::wasmtime_crate::Extern::SharedMemory(m)) => { + let mem = unsafe { std::slice::from_raw_parts_mut(m.data() as *mut u8, m.data_size()) }; + let ctx = get_cx(caller.data_mut()); + (mem, ctx) + } _ => { return Err(#rt::wasmtime_crate::Trap::new("missing required memory export")); } }; - let (mem , ctx) = mem.data_and_store_mut(&mut caller); - let ctx = get_cx(ctx); let mem = #rt::wasmtime::WasmtimeGuestMemory::new(mem); match #abi_func(ctx, &mem #(, #arg_names)*) #await_ { Ok(r) => Ok(<#ret_ty>::from(r)), diff --git a/crates/wiggle/macro/src/lib.rs b/crates/wiggle/macro/src/lib.rs index 394ee47bb366..785785d73461 100644 --- a/crates/wiggle/macro/src/lib.rs +++ b/crates/wiggle/macro/src/lib.rs @@ -41,6 +41,9 @@ use syn::parse_macro_input; /// WebAssembly execution by trapping. /// * Optional: `async` takes a set of witx modules and functions which are /// made Rust `async` functions in the module trait. +/// * Optional: `skip` takes a list of function names and skips them when +/// generating code; this can be helpful when attempting to use Wiggle +/// alongside raw implementions of a witx interface. /// /// ## Example /// @@ -153,6 +156,7 @@ pub fn from_witx(args: TokenStream) -> TokenStream { &config.async_, &doc, cfg!(feature = "wasmtime") && config.wasmtime, + &config.skip, ) .expect("validating codegen settings"); @@ -195,6 +199,7 @@ pub fn wasmtime_integration(args: TokenStream) -> TokenStream { &config.c.async_, &doc, cfg!(feature = "wasmtime"), + &config.c.skip, ) .expect("validating codegen settings"); diff --git a/scripts/publish.rs b/scripts/publish.rs index 600ab1372d1f..20b5cb3e8482 100644 --- a/scripts/publish.rs +++ b/scripts/publish.rs @@ -60,6 +60,7 @@ const CRATES_TO_PUBLISH: &[&str] = &[ // other misc wasmtime crates "wasmtime-wasi", "wasmtime-wasi-nn", + "wasmtime-wasi-parallel", "wasmtime-wasi-crypto", "wasmtime-wast", "wasmtime-cli-flags", @@ -79,8 +80,9 @@ const PUBLIC_CRATES: &[&str] = &[ // patch releases. "wasmtime", "wasmtime-wasi", - "wasmtime-wasi-nn", "wasmtime-wasi-crypto", + "wasmtime-wasi-nn", + "wasmtime-wasi-parallel", "wasmtime-cli", // all cranelift crates are considered "public" in that they can't // have breaking API changes in patch releases diff --git a/src/commands/run.rs b/src/commands/run.rs index 8185bf7e0053..f4deccec2a13 100644 --- a/src/commands/run.rs +++ b/src/commands/run.rs @@ -22,6 +22,9 @@ use wasmtime_wasi_nn::WasiNnCtx; #[cfg(feature = "wasi-crypto")] use wasmtime_wasi_crypto::WasiCryptoCtx; +#[cfg(feature = "wasi-parallel")] +use wasmtime_wasi_parallel::WasiParallel; + fn parse_module(s: &OsStr) -> anyhow::Result { // Do not accept wasmtime subcommand names as the module name match s.to_str() { @@ -449,6 +452,8 @@ struct Host { wasi_nn: Option, #[cfg(feature = "wasi-crypto")] wasi_crypto: Option, + #[cfg(feature = "wasi-parallel")] + wasi_parallel: Option, } /// Populates the given `Linker` with WASI APIs. @@ -512,6 +517,20 @@ fn populate_with_wasi( } } + if wasi_modules.wasi_parallel { + #[cfg(not(feature = "wasi-parallel"))] + { + bail!("Cannot enable wasi-parallel when the binary is not compiled with this feature."); + } + #[cfg(feature = "wasi-parallel")] + { + wasmtime_wasi_parallel::add_to_linker(linker, |host| { + host.wasi_parallel.as_mut().unwrap() + })?; + store.data_mut().wasi_parallel = Some(WasiParallel::new()); + } + } + Ok(()) } diff --git a/supply-chain/config.toml b/supply-chain/config.toml index cb3d8ca8f558..0a55393ccf01 100644 --- a/supply-chain/config.toml +++ b/supply-chain/config.toml @@ -42,6 +42,10 @@ criteria = "safe-to-deploy" version = "0.0.1" criteria = "safe-to-deploy" +[[exemptions.anes]] +version = "0.1.6" +criteria = "safe-to-run" + [[exemptions.anyhow]] version = "1.0.57" criteria = "safe-to-deploy" @@ -102,6 +106,10 @@ criteria = "safe-to-deploy" version = "0.2.7" criteria = "safe-to-run" +[[exemptions.cast]] +version = "0.3.0" +criteria = "safe-to-run" + [[exemptions.chacha20]] version = "0.8.1" criteria = "safe-to-deploy" @@ -110,6 +118,18 @@ criteria = "safe-to-deploy" version = "0.9.0" criteria = "safe-to-deploy" +[[exemptions.ciborium]] +version = "0.2.0" +criteria = "safe-to-run" + +[[exemptions.ciborium-io]] +version = "0.2.0" +criteria = "safe-to-run" + +[[exemptions.ciborium-ll]] +version = "0.2.0" +criteria = "safe-to-run" + [[exemptions.cipher]] version = "0.3.0" criteria = "safe-to-deploy" @@ -154,10 +174,18 @@ criteria = "safe-to-deploy" version = "0.3.5" criteria = "safe-to-run" +[[exemptions.criterion]] +version = "0.4.0" +criteria = "safe-to-run" + [[exemptions.criterion-plot]] version = "0.4.4" criteria = "safe-to-run" +[[exemptions.criterion-plot]] +version = "0.5.0" +criteria = "safe-to-run" + [[exemptions.crossbeam-channel]] version = "0.5.4" criteria = "safe-to-deploy" @@ -802,6 +830,10 @@ criteria = "safe-to-deploy" version = "1.0.6" criteria = "safe-to-deploy" +[[exemptions.scoped_threadpool]] +version = "0.1.9" +criteria = "safe-to-deploy" + [[exemptions.scopeguard]] version = "1.1.0" criteria = "safe-to-deploy"