diff --git a/.gitmodules b/.gitmodules
index ee264b99c477..60efedba66ac 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "crates/wasi-crypto/spec"]
 	path = crates/wasi-crypto/spec
 	url = https://github.com/WebAssembly/wasi-crypto.git
+[submodule "crates/wasi-parallel/spec"]
+	path = crates/wasi-parallel/spec
+	url = https://github.com/WebAssembly/wasi-parallel
diff --git a/Cargo.lock b/Cargo.lock
index 7dc3628f4a9f..90c01e180eef 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -78,6 +78,12 @@ version = "0.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec8ad6edb4840b78c5c3d88de606b22252d552b55f3a4699fbb10fc070ec3049"
 
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
 [[package]]
 name = "anyhow"
 version = "1.0.57"
@@ -338,6 +344,12 @@ dependencies = [
  "rustc_version",
 ]
 
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
 [[package]]
 name = "cc"
 version = "1.0.73"
@@ -378,6 +390,33 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "ciborium"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
 [[package]]
 name = "cipher"
 version = "0.3.0"
@@ -533,7 +572,7 @@ dependencies = [
  "cranelift-codegen-shared",
  "cranelift-entity",
  "cranelift-isle",
- "criterion",
+ "criterion 0.3.5",
  "gimli",
  "hashbrown",
  "log",
@@ -788,9 +827,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10"
 dependencies = [
  "atty",
- "cast",
+ "cast 0.2.7",
  "clap 2.34.0",
- "criterion-plot",
+ "criterion-plot 0.4.4",
  "csv",
  "itertools",
  "lazy_static",
@@ -807,13 +846,49 @@ dependencies = [
  "walkdir",
 ]
 
+[[package]]
+name = "criterion"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb"
+dependencies = [
+ "anes",
+ "atty",
+ "cast 0.3.0",
+ "ciborium",
+ "clap 3.2.8",
+ "criterion-plot 0.5.0",
+ "itertools",
+ "lazy_static",
+ "num-traits",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
 [[package]]
 name = "criterion-plot"
 version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
 dependencies = [
- "cast",
+ "cast 0.2.7",
+ "itertools",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast 0.3.0",
  "itertools",
 ]
 
@@ -2535,6 +2610,12 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "scoped_threadpool"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8"
+
 [[package]]
 name = "scopeguard"
 version = "1.1.0"
@@ -3449,7 +3530,7 @@ dependencies = [
  "clap 3.2.8",
  "component-macro-test",
  "component-test-util",
- "criterion",
+ "criterion 0.3.5",
  "env_logger 0.9.0",
  "filecheck",
  "humantime 2.1.0",
@@ -3475,6 +3556,7 @@ dependencies = [
  "wasmtime-wasi",
  "wasmtime-wasi-crypto",
  "wasmtime-wasi-nn",
+ "wasmtime-wasi-parallel",
  "wasmtime-wast",
  "wast 46.0.0",
  "wat",
@@ -3728,6 +3810,25 @@ dependencies = [
  "wiggle",
 ]
 
+[[package]]
+name = "wasmtime-wasi-parallel"
+version = "2.0.0"
+dependencies = [
+ "anyhow",
+ "criterion 0.4.0",
+ "indexmap",
+ "log",
+ "num_cpus",
+ "pretty_env_logger",
+ "rand 0.8.5",
+ "scoped_threadpool",
+ "walkdir",
+ "wasmtime",
+ "wasmtime-runtime",
+ "wasmtime-wasi",
+ "wiggle",
+]
+
 [[package]]
 name = "wasmtime-wast"
 version = "2.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index 3b6daea89c75..f493d42a91a2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ wasmtime-wast = { workspace = true }
 wasmtime-wasi = { workspace = true }
 wasmtime-wasi-crypto = { workspace = true, optional = true }
 wasmtime-wasi-nn = { workspace = true, optional = true }
+wasmtime-wasi-parallel = { workspace = true, optional = true }
 clap = { workspace = true, features = ["color", "suggestions", "derive"] }
 anyhow = { workspace = true }
 target-lexicon = { workspace = true }
@@ -114,6 +115,7 @@ wasmtime-wast = { path = "crates/wast", version = "=2.0.0" }
 wasmtime-wasi = { path = "crates/wasi", version = "2.0.0" }
 wasmtime-wasi-crypto = { path = "crates/wasi-crypto", version = "2.0.0" }
 wasmtime-wasi-nn = { path = "crates/wasi-nn", version = "2.0.0" }
+wasmtime-wasi-parallel = { path = "crates/wasi-parallel", version = "2.0.0" }
 wasmtime-component-util = { path = "crates/component-util", version = "=2.0.0" }
 wasmtime-component-macro = { path = "crates/component-macro", version = "=2.0.0" }
 wasmtime-asm-macros = { path = "crates/asm-macros", version = "=2.0.0" }
@@ -178,6 +180,7 @@ jitdump = ["wasmtime/jitdump"]
 vtune = ["wasmtime/vtune"]
 wasi-crypto = ["dep:wasmtime-wasi-crypto"]
 wasi-nn = ["dep:wasmtime-wasi-nn"]
+wasi-parallel = ["wasmtime-wasi-parallel"]
 memory-init-cow = ["wasmtime/memory-init-cow", "wasmtime-cli-flags/memory-init-cow"]
 pooling-allocator = ["wasmtime/pooling-allocator", "wasmtime-cli-flags/pooling-allocator"]
 all-arch = ["wasmtime/all-arch"]
diff --git a/crates/bench-api/src/lib.rs b/crates/bench-api/src/lib.rs
index a946fdb6b551..9c38de1c4356 100644
--- a/crates/bench-api/src/lib.rs
+++ b/crates/bench-api/src/lib.rs
@@ -473,6 +473,9 @@ impl BenchState {
             wasmtime_wasi_crypto::add_to_linker(&mut linker, |cx| &mut cx.wasi_crypto)?;
         }
 
+        #[cfg(feature = "wasi-parallel")]
+        wasmtime_wasi_parallel::add_to_linker(&mut linker, |cx| &mut cx.wasi_parallel)?;
+
         Ok(Self {
             linker,
             compilation_timer,
diff --git a/crates/cli-flags/src/lib.rs b/crates/cli-flags/src/lib.rs
index 058522bbb94e..13f88a6482f1 100644
--- a/crates/cli-flags/src/lib.rs
+++ b/crates/cli-flags/src/lib.rs
@@ -60,6 +60,10 @@ pub const SUPPORTED_WASI_MODULES: &[(&str, &str)] = &[
         "experimental-wasi-crypto",
         "enables support for the WASI cryptography APIs (experimental), see https://github.com/WebAssembly/wasi-crypto",
     ),
+    (
+        "experimental-wasi-parallel",
+        "enables support for the WASI parallel APIs (experimental)",
+    ),
 ];
 
 fn pick_profiling_strategy(jitdump: bool, vtune: bool) -> Result<ProfilingStrategy> {
@@ -478,6 +482,7 @@ fn parse_wasi_modules(modules: &str) -> Result<WasiModules> {
                 "wasi-common" => Ok(wasi_modules.wasi_common = enable),
                 "experimental-wasi-nn" => Ok(wasi_modules.wasi_nn = enable),
                 "experimental-wasi-crypto" => Ok(wasi_modules.wasi_crypto = enable),
+                "experimental-wasi-parallel" => Ok(wasi_modules.wasi_parallel = enable),
                 "default" => bail!("'default' cannot be specified with other WASI modules"),
                 _ => bail!("unsupported WASI module '{}'", module),
             };
@@ -509,6 +514,9 @@ pub struct WasiModules {
 
     /// Enable the experimental wasi-crypto implementation.
     pub wasi_crypto: bool,
+
+    /// Enable the experimental wasi-parallel implementation.
+    pub wasi_parallel: bool,
 }
 
 impl Default for WasiModules {
@@ -517,6 +525,7 @@ impl Default for WasiModules {
             wasi_common: true,
             wasi_nn: false,
             wasi_crypto: false,
+            wasi_parallel: false,
         }
     }
 }
@@ -528,6 +537,7 @@ impl WasiModules {
             wasi_common: false,
             wasi_nn: false,
             wasi_crypto: false,
+            wasi_parallel: false,
         }
     }
 }
@@ -674,7 +684,8 @@ mod test {
             WasiModules {
                 wasi_common: true,
                 wasi_nn: false,
-                wasi_crypto: false
+                wasi_crypto: false,
+                wasi_parallel: false
             }
         );
     }
@@ -687,7 +698,8 @@ mod test {
             WasiModules {
                 wasi_common: true,
                 wasi_nn: false,
-                wasi_crypto: false
+                wasi_crypto: false,
+                wasi_parallel: false
             }
         );
     }
@@ -704,7 +716,8 @@ mod test {
             WasiModules {
                 wasi_common: false,
                 wasi_nn: true,
-                wasi_crypto: false
+                wasi_crypto: false,
+                wasi_parallel: false
             }
         );
     }
@@ -718,7 +731,8 @@ mod test {
             WasiModules {
                 wasi_common: false,
                 wasi_nn: false,
-                wasi_crypto: false
+                wasi_crypto: false,
+                wasi_parallel: false
             }
         );
     }
diff --git a/crates/wasi-parallel/.gitignore b/crates/wasi-parallel/.gitignore
new file mode 100644
index 000000000000..1440348a9f51
--- /dev/null
+++ b/crates/wasi-parallel/.gitignore
@@ -0,0 +1 @@
+/tests/wasm/*
diff --git a/crates/wasi-parallel/Cargo.toml b/crates/wasi-parallel/Cargo.toml
new file mode 100644
index 000000000000..a504b859470e
--- /dev/null
+++ b/crates/wasi-parallel/Cargo.toml
@@ -0,0 +1,47 @@
+[package]
+name = "wasmtime-wasi-parallel"
+version.workspace = true
+authors.workspace = true
+description = "Wasmtime implementation of the wasi-parallel API"
+documentation = "https://docs.rs/wasmtime-wasi-parallel"
+license = "Apache-2.0 WITH LLVM-exception"
+categories = ["wasm", "parallel"]
+keywords = ["webassembly", "wasm", "parallel"]
+repository = "https://github.com/bytecodealliance/wasmtime"
+readme = "README.md"
+edition = "2018"
+
+[dependencies]
+# These dependencies are necessary for the witx-generation macros to work:
+anyhow = { workspace = true }
+wiggle = { workspace = true }
+
+# These dependencies are necessary for the wasi-parallel implementation:
+indexmap = "1.6"
+log = { workspace = true }
+num_cpus = "1.13"
+rand = "0.8"
+scoped_threadpool = "0.1"
+wasmtime = { workspace = true, default-features = false, features = ["cranelift", "wat"] }
+wasmtime-runtime = { workspace = true }
+wasmtime-wasi = { workspace = true }
+
+[dev-dependencies]
+criterion = "0.4"
+pretty_env_logger = "0.4"
+
+[build-dependencies]
+walkdir = "2.3"
+
+[features]
+# Enable building Rust tests to Wasm. Not all environments will have a
+# `wasm32-wasi` target installed so this feature is not enabled by default.
+build-tests = []
+
+[[bench]]
+name = "nstream"
+harness = false
+
+[[bench]]
+name = "sum"
+harness = false
diff --git a/crates/wasi-parallel/LICENSE b/crates/wasi-parallel/LICENSE
new file mode 100644
index 000000000000..f9d81955f4bc
--- /dev/null
+++ b/crates/wasi-parallel/LICENSE
@@ -0,0 +1,220 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+--- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
diff --git a/crates/wasi-parallel/README.md b/crates/wasi-parallel/README.md
new file mode 100644
index 000000000000..bcfdc97bc5bf
--- /dev/null
+++ b/crates/wasi-parallel/README.md
@@ -0,0 +1,50 @@
+# wasmtime-wasi-parallel
+
+This crate enables experimental support for the [wasi-parallel] API in Wasmtime.
+
+> __WARNING__: _this implementation is highly experimental, subject to change,
+> and abuses the Wasmtime API!_ It is published as a proof-of-concept to discuss
+> the design of the wasi-parallel specification.
+
+Please open any issues related to this implementation in the [wasi-parallel]
+repository. Feedback is appreciated!
+
+The main idea is to expose a "parallel for" mechanism using WASI (see the
+[explainer] for more details). The "parallel for" call is not limited to CPU
+execution; this proof-of-concept implementation can execute parallel code on
+both the CPU (using Wasmtime's JIT compiled functions) and eventually the GPU
+(using OpenCL). If you plan to experiment with this crate, see the "Use" section
+below.
+
+[wasi-parallel]: https://github.com/WebAssembly/wasi-parallel
+[explainer]: https://github.com/WebAssembly/wasi-parallel#readme
+
+### Build
+
+```
+cargo build
+```
+
+### Test
+
+```
+cargo test
+```
+
+Note: the Rust code in `tests/rust` is compiled by `build.rs` to `tests/wasm`.
+
+### Benchmark
+
+```
+cargo bench
+```
+
+### Use
+
+When compiled with the `wasi-parallel` feature, this crate is usable from the
+Wasmtime CLI:
+
+```console
+$ cargo build --features wasi-parallel
+$ .../wasmtime run --wasi-modules experimental-wasi-parallel <module>
+```
diff --git a/crates/wasi-parallel/benches/nstream.rs b/crates/wasi-parallel/benches/nstream.rs
new file mode 100644
index 000000000000..759e6a811c36
--- /dev/null
+++ b/crates/wasi-parallel/benches/nstream.rs
@@ -0,0 +1,58 @@
+//! Compare the memory throughput of parallel and sequential executions using
+//! `nstream`.
+//!
+//! `nstream` is a memory throughput benchmark and will likely hit memory
+//! bandwidth limitations as more threads are used. Do not expect a linear speed
+//! up between the parallel and sequential versions. For reference, the
+//! [Parallel Research Kernels] (PRK) repository has examples of `nstream` in
+//! many languages.
+//!
+//! [Parallel Research Kernels]: https://github.com/ParRes/Kernels
+mod test_case;
+
+use criterion::{
+    criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion,
+};
+use test_case::*;
+
+fn bench_nstream(c: &mut Criterion) {
+    let mut group = c.benchmark_group("nstream");
+    measure(&mut group, Parallelism::Cpu);
+    measure(&mut group, Parallelism::Sequential);
+}
+
+fn measure(group: &mut BenchmarkGroup<WallTime>, parallelism: Parallelism) {
+    let name = format!("{:?}", parallelism);
+    let mut test_case = TestCase::new("tests/wat/nstream.wat", default_engine(), None).unwrap();
+    let num_threads = num_cpus::get() as i32;
+    // The guidance for nstream is to pick a number of items equivalent to a
+    // buffer that is `4 * LLC size`. Since we use 4-byte floating point numbers
+    // and assuming a very tame 2MB for the LLC, we set the number of items to
+    // ~8 million. Note what the nstream PRK does here:
+    // https://github.com/ParRes/Kernels/blob/default/scripts/small/runopenmp#L9.
+    let num_items = 8_000_000;
+    let device_kind = parallelism.as_device_kind();
+
+    let _ = test_case
+        .invoke(
+            "setup",
+            &[num_threads.into(), num_items.into(), device_kind.into()],
+        )
+        .expect("failed in benchmark `setup()`");
+
+    group.bench_function(name, |b| {
+        b.iter(|| {
+            let _ = test_case
+                .invoke("execute", &[])
+                .expect("failed in benchmark `execute()`");
+        });
+    });
+
+    // We cannot invoke `finish` here since the check will necessarily fail:
+    // criterion will invoke `execute` repeatedly and since the nstream work
+    // includes `A[i] += A[i] ...` then the result region `A` will quickly
+    // diverge from its single-run value, which is what `finish` checks.
+}
+
+criterion_group!(benches, bench_nstream);
+criterion_main!(benches);
diff --git a/crates/wasi-parallel/benches/sum.rs b/crates/wasi-parallel/benches/sum.rs
new file mode 100644
index 000000000000..3cd98b3622a2
--- /dev/null
+++ b/crates/wasi-parallel/benches/sum.rs
@@ -0,0 +1,50 @@
+//! Add 100 million integers for each available CPU.
+//!
+//! This benchmark measures the upper ends of the speed-up available with
+//! `wasi-parallel`. Each iteration of the kernel performs CPU-bound work, so
+//! the difference between the sequential and parallel versions should be
+//! related to the number of cores available.
+
+mod test_case;
+
+use criterion::{
+    criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion,
+};
+use test_case::*;
+
+fn bench_nstream(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sum");
+    measure(&mut group, Parallelism::Cpu);
+    measure(&mut group, Parallelism::Sequential);
+}
+
+fn measure(group: &mut BenchmarkGroup<WallTime>, parallelism: Parallelism) {
+    let name = format!("{:?}", parallelism);
+    let mut test_case = TestCase::new("tests/wat/sum.wat", default_engine(), None).unwrap();
+    let num_threads = num_cpus::get() as i32;
+    let buffer_size = 100_000_000 as i32;
+    let device_kind = parallelism.as_device_kind();
+
+    let _ = test_case
+        .invoke(
+            "setup",
+            &[num_threads.into(), buffer_size.into(), device_kind.into()],
+        )
+        .expect("failed in benchmark `setup()`");
+
+    group.bench_function(name, |b| {
+        b.iter(|| {
+            let _ = test_case
+                .invoke("execute", &[])
+                .expect("failed in benchmark `execute()`");
+        });
+    });
+
+    let results = test_case
+        .invoke("finish", &[])
+        .expect("failed in benchmark `finish`");
+    assert_eq!(results[0].i32().unwrap(), 0);
+}
+
+criterion_group!(benches, bench_nstream);
+criterion_main!(benches);
diff --git a/crates/wasi-parallel/benches/test_case.rs b/crates/wasi-parallel/benches/test_case.rs
new file mode 120000
index 000000000000..dab38122630b
--- /dev/null
+++ b/crates/wasi-parallel/benches/test_case.rs
@@ -0,0 +1 @@
+../tests/test_case.rs
\ No newline at end of file
diff --git a/crates/wasi-parallel/build.rs b/crates/wasi-parallel/build.rs
new file mode 100644
index 000000000000..f5e64192b6c9
--- /dev/null
+++ b/crates/wasi-parallel/build.rs
@@ -0,0 +1,77 @@
+//! This build script:
+//!  - has the configuration necessary for the wiggle and witx macros
+//!  - generates Wasm from the files in `tests/rust` to `tests/wasm`
+
+#[cfg(feature = "build-tests")]
+use std::{
+    fs::DirBuilder,
+    path::{Path, PathBuf},
+    process::Command,
+};
+
+fn main() {
+    // This is necessary for wiggle/witx macros.
+    let cwd = std::env::current_dir().unwrap();
+    let wasi_root = cwd.join("spec");
+    println!("cargo:rustc-env=WASI_ROOT={}", wasi_root.display());
+
+    // Also automatically rebuild if the WITX files change.
+    for entry in walkdir::WalkDir::new(wasi_root) {
+        println!("cargo:rerun-if-changed={}", entry.unwrap().path().display());
+    }
+
+    // If requested, rebuild the test files.
+    #[cfg(feature = "build-tests")]
+    {
+        build_wasm("tests");
+        build_wasm("benches");
+    }
+}
+
+#[cfg(feature = "build-tests")]
+fn build_wasm<P: AsRef<Path>>(root: P) {
+    let root_dir = Path::new(root.as_ref().as_os_str());
+    let wasm_dir = root_dir.join("wasm");
+
+    DirBuilder::new().recursive(true).create(&wasm_dir).unwrap();
+
+    // Automatically rebuild any Rust tests.
+    if root_dir.join("rust").exists() {
+        for entry in walkdir::WalkDir::new(root_dir.join("rust")) {
+            let entry = entry.unwrap();
+            println!("cargo:rerun-if-changed={}", entry.path().display());
+            if entry.path().is_file() && entry.file_name() != "wasi_parallel.rs" {
+                compile_rust(entry.path(), &wasm_dir)
+            }
+        }
+    }
+}
+
+/// Use rustc to compile a Rust file to a Wasm file that uses the wasi-parallel
+/// API.
+#[cfg(feature = "build-tests")]
+fn compile_rust<P1: AsRef<Path>, P2: AsRef<Path>>(source_file: P1, destination_dir: P2) {
+    let stem = source_file.as_ref().file_stem().unwrap();
+    let mut destination_file: PathBuf = [destination_dir.as_ref().as_os_str(), stem]
+        .iter()
+        .collect();
+    destination_file.set_extension("wasm");
+
+    let mut command = Command::new("rustc");
+    command
+        .arg("--target")
+        .arg("wasm32-wasi")
+        .arg(source_file.as_ref().to_str().unwrap())
+        .arg("-o")
+        .arg(destination_file.to_str().unwrap());
+
+    let status = command
+        .status()
+        .expect("Failed to execute 'rustc' command to generate Wasm file.");
+
+    assert!(
+        status.success(),
+        "Failed to compile test program: {:?}",
+        command
+    )
+}
diff --git a/crates/wasi-parallel/spec b/crates/wasi-parallel/spec
new file mode 160000
index 000000000000..b404517fbf50
--- /dev/null
+++ b/crates/wasi-parallel/spec
@@ -0,0 +1 @@
+Subproject commit b404517fbf50496b58d51d08c297585488864739
diff --git a/crates/wasi-parallel/src/context.rs b/crates/wasi-parallel/src/context.rs
new file mode 100644
index 000000000000..302feb6fcf78
--- /dev/null
+++ b/crates/wasi-parallel/src/context.rs
@@ -0,0 +1,198 @@
+//! This module implements the state held by `wasi-parallel`. E.g., when
+//! `wasi-parallel` returns a handle to a device, it must maintain a mapping of
+//! which device was returned. The `WasiParallelContext` proxies on calls to the
+//! correct parallel device.
+
+use crate::device::{discover, Buffer, Device};
+use crate::witx::types::{BufferAccessKind, DeviceKind};
+use anyhow::{anyhow, Result};
+use indexmap::IndexMap;
+use rand::Rng;
+use std::collections::HashMap;
+use wasmtime::SharedMemory;
+
+#[derive(Debug)]
+pub struct WasiParallelContext {
+    pub spirv: HashMap<i32, Vec<u8>>,
+    pub devices: IndexMap<i32, Box<dyn Device>>,
+    pub buffers: HashMap<i32, Box<dyn Buffer>>,
+    pub device_for_buffers: HashMap<i32, i32>,
+}
+
+impl WasiParallelContext {
+    pub fn new() -> Self {
+        // Perform some rudimentary device discovery.
+        let mut devices = IndexMap::new();
+        for device in discover() {
+            devices.insert(Self::random_id(), device);
+        }
+
+        Self {
+            spirv: HashMap::new(),
+            devices,
+            buffers: HashMap::new(),
+            device_for_buffers: HashMap::new(),
+        }
+    }
+
+    /// Retrieve a device based on a hint, using the default device if the hint
+    /// cannot be satisfied.
+    pub fn get_device(&self, hint: DeviceKind) -> Result<i32> {
+        match self
+            .devices
+            .iter()
+            .find(|(_, device)| device.kind() == hint)
+        {
+            // If we can find a device matching the hint, return it...
+            Some((&id, _)) => Ok(id),
+            // ...otherwise, use the default device.
+            None => self.get_default_device(),
+        }
+    }
+
+    // Retrieve the default device, which currently is the first registered
+    // device (TODO).
+    pub fn get_default_device(&self) -> Result<i32> {
+        match self.devices.iter().next().as_ref() {
+            // Use the first available device (TODO: implicit default)...
+            Some((&id, _)) => Ok(id),
+            // ...or fail if none are available.
+            None => Err(anyhow!("no devices available")),
+        }
+    }
+
+    /// Create a buffer linked to a device.
+    pub fn create_buffer(
+        &mut self,
+        device_id: i32,
+        size: i32,
+        access: BufferAccessKind,
+    ) -> Result<i32> {
+        let device = match self.devices.get(&device_id) {
+            Some(val) => val,
+            None => return Err(anyhow!("unrecognized device")),
+        };
+
+        if size < 0 {
+            return Err(anyhow!("invalid size (less than 0)"));
+        }
+
+        let id = Self::random_id();
+        self.buffers
+            .insert(id, device.as_ref().create_buffer(size, access));
+        self.device_for_buffers.insert(id, device_id);
+        Ok(id)
+    }
+
+    /// Retrieve a created buffer by its ID.
+    pub fn get_buffer(&self, buffer_id: i32) -> Result<&dyn Buffer> {
+        match self.buffers.get(&buffer_id) {
+            Some(buffer) => Ok(buffer.as_ref()),
+            None => Err(anyhow!("invalid buffer ID")),
+        }
+    }
+
+    /// Retrieve a created buffer by its ID.
+    pub fn get_buffer_mut(&mut self, buffer_id: i32) -> Result<&mut dyn Buffer> {
+        match self.buffers.get_mut(&buffer_id) {
+            Some(buffer) => Ok(buffer.as_mut()),
+            None => Err(anyhow!("invalid buffer ID")),
+        }
+    }
+
+    /// Invoke the `kernel` in parallel on the devices indicated by the input
+    /// and output buffers.
+    pub fn invoke_parallel_for(
+        &mut self,
+        device_id: i32,
+        kernel: &[u8],
+        engine: &wasmtime::Engine,
+        shared_memory: SharedMemory,
+        num_threads: i32,
+        block_size: i32,
+        in_buffers: &[i32],
+        out_buffers: &[i32],
+    ) -> Result<()> {
+        // Collect the input buffers.
+        let mut in_buffers_ = Vec::new();
+        for (i, b) in in_buffers.iter().enumerate() {
+            match self.buffers.get(b) {
+                Some(b) => in_buffers_.push(b),
+                None => return Err(anyhow!("in buffer {} has an invalid ID", i)),
+            }
+        }
+
+        // Collect the output buffers.
+        let mut out_buffers_ = Vec::new();
+        for (i, b) in out_buffers.iter().enumerate() {
+            match self.buffers.get(b) {
+                Some(b) => out_buffers_.push(b),
+                None => return Err(anyhow!("out buffer {} has an invalid ID", i)),
+            }
+        }
+
+        // Check that all buffers are assigned to the right device.
+        if !in_buffers
+            .iter()
+            .chain(out_buffers.iter())
+            .map(|b| *self.device_for_buffers.get(b).unwrap())
+            .all(|d| d == device_id)
+        {
+            return Err(anyhow!("buffers are assigned to different devices"));
+        }
+
+        // Check that the device is valid.
+        if let Some(device) = self.devices.get_mut(&device_id) {
+            log::debug!(
+                "starting parallel iterations = {}, block_size = {}, device = {:?}",
+                num_threads,
+                block_size,
+                device
+            );
+            device.parallelize(
+                Kernel::new(kernel.to_owned(), engine.clone(), shared_memory),
+                num_threads,
+                block_size,
+                in_buffers_,
+                out_buffers_,
+            )?
+        } else {
+            return Err(anyhow!("invalid device ID"));
+        }
+
+        Ok(())
+    }
+
+    fn random_id() -> i32 {
+        rand::thread_rng().gen()
+    }
+}
+
+/// A binary-encoded WebAssembly module containing the function to be run in
+/// parallel. The engine is included so that the WebAssembly code can be
+/// JIT-compiled with the same configuration as the currently-running
+/// WebAssembly.
+pub struct Kernel {
+    module: Vec<u8>,
+    engine: wasmtime::Engine,
+    memory: SharedMemory,
+}
+impl Kernel {
+    pub const NAME: &'static str = "kernel";
+    pub fn new(module: Vec<u8>, engine: wasmtime::Engine, memory: SharedMemory) -> Self {
+        Self {
+            module,
+            engine,
+            memory,
+        }
+    }
+    pub fn module(&self) -> &[u8] {
+        &self.module
+    }
+    pub fn engine(&self) -> &wasmtime::Engine {
+        &self.engine
+    }
+    pub fn memory(&self) -> &SharedMemory {
+        &self.memory
+    }
+}
diff --git a/crates/wasi-parallel/src/device/cpu.rs b/crates/wasi-parallel/src/device/cpu.rs
new file mode 100644
index 000000000000..6253a7321566
--- /dev/null
+++ b/crates/wasi-parallel/src/device/cpu.rs
@@ -0,0 +1,84 @@
+//! Implement a `wasi-parallel` device using all available cores on the CPU.
+
+use super::wasm_memory_buffer::{as_pointer_and_length, WasmMemoryBuffer};
+use super::{Buffer, Device};
+use crate::context::Kernel;
+use crate::witx::types::{BufferAccessKind, DeviceKind};
+use anyhow::Result;
+use std::convert::TryInto;
+
+pub struct CpuDevice {
+    pool: scoped_threadpool::Pool,
+}
+
+impl CpuDevice {
+    pub fn new() -> Box<dyn Device> {
+        let pool = scoped_threadpool::Pool::new(num_cpus::get().try_into().unwrap());
+        Box::new(Self { pool })
+    }
+}
+
+impl Device for CpuDevice {
+    fn kind(&self) -> DeviceKind {
+        DeviceKind::Cpu
+    }
+
+    fn name(&self) -> String {
+        "thread pool implementation".into() // TODO retrieve CPU name from system.
+    }
+
+    fn create_buffer(&self, size: i32, access: BufferAccessKind) -> Box<dyn Buffer> {
+        Box::new(WasmMemoryBuffer::new(size as u32, access))
+    }
+
+    fn parallelize(
+        &mut self,
+        kernel: Kernel,
+        num_iterations: i32,
+        block_size: i32,
+        in_buffers: Vec<&Box<dyn Buffer>>,
+        out_buffers: Vec<&Box<dyn Buffer>>,
+    ) -> Result<()> {
+        self.pool.scoped(|scoped| {
+            let module = wasmtime::Module::new(kernel.engine(), kernel.module())
+                .expect("unable to compile module");
+
+            // Setup the buffer pointers.
+            let buffers =
+                as_pointer_and_length(in_buffers.into_iter().chain(out_buffers.into_iter()))
+                    .unwrap();
+
+            for iteration_id in 0..num_iterations {
+                let engine = kernel.engine().clone();
+                let module = module.clone();
+                let memory = kernel.memory().clone();
+                let buffers = buffers.clone();
+                scoped.execute(move || {
+                    // Instantiate again in a new thread.
+                    let mut store = wasmtime::Store::new(&engine, ());
+                    let imports = vec![memory.clone().into()];
+                    let instance = wasmtime::Instance::new(&mut store, &module, &imports)
+                        .expect("failed to construct thread instance");
+
+                    // Setup the parameters for the parallel execution.
+                    let mut params = vec![
+                        iteration_id.into(),
+                        num_iterations.into(),
+                        block_size.into(),
+                    ];
+                    params.extend_from_slice(&buffers);
+
+                    // Call the `kernel` function.
+                    log::debug!("executing iteration {}", iteration_id);
+                    let kernel_fn = instance
+                        .get_func(&mut store, Kernel::NAME)
+                        .expect("failed to find kernel function");
+                    kernel_fn
+                        .call(&mut store, &params[..], &mut [])
+                        .expect("failed to run kernel")
+                });
+            }
+        });
+        Ok(())
+    }
+}
diff --git a/crates/wasi-parallel/src/device/mod.rs b/crates/wasi-parallel/src/device/mod.rs
new file mode 100644
index 000000000000..0556fed2ce4c
--- /dev/null
+++ b/crates/wasi-parallel/src/device/mod.rs
@@ -0,0 +1,76 @@
+pub mod cpu;
+pub mod sequential;
+pub mod wasm_memory_buffer;
+
+use crate::witx::types::{BufferAccessKind, DeviceKind};
+use crate::{
+    context::Kernel,
+    device::{cpu::CpuDevice, sequential::SequentialDevice},
+};
+use anyhow::Result;
+use std::{any::Any, fmt::Debug};
+use wiggle::GuestPtr;
+
+/// Discover available devices.
+pub fn discover() -> Vec<Box<dyn Device>> {
+    vec![CpuDevice::new(), SequentialDevice::new()]
+}
+
+/// Define the operations possible on a device.
+pub trait Device {
+    /// Return the device kind.
+    fn kind(&self) -> DeviceKind;
+
+    /// Return the device name.
+    fn name(&self) -> String;
+
+    /// Create a buffer associated with this device. The created buffer is held
+    /// in `WasiParallelContext`, which must guarantee that buffers are sent to
+    /// the correct devices.
+    fn create_buffer(&self, size: i32, access: BufferAccessKind) -> Box<dyn Buffer>;
+
+    /// Invoke a parallel "for" on the device.
+    fn parallelize(
+        &mut self,
+        kernel: Kernel,
+        num_threads: i32,
+        block_size: i32,
+        in_buffers: Vec<&Box<dyn Buffer>>,
+        out_buffers: Vec<&Box<dyn Buffer>>,
+    ) -> Result<()>;
+}
+
+impl Debug for dyn Device {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}: {}", self.kind(), self.name())
+    }
+}
+
+/// Define the operations possible on a buffer.
+pub trait Buffer: Send + Sync {
+    /// Returns the size of the buffer in bytes.
+    fn len(&self) -> u32;
+
+    /// Describes how the buffer can be used by the device.
+    fn access(&self) -> BufferAccessKind;
+
+    /// Write the given slice of Wasm memory into the buffer.
+    fn write(&mut self, data: GuestPtr<[u8]>) -> Result<()>;
+
+    /// Read the buffer into a slice.
+    fn read(&self, slice: GuestPtr<[u8]>) -> Result<()>;
+
+    /// Allow for downcasting the buffer: `buffer.as_any().downcast_ref::<...>()`.
+    fn as_any(&self) -> &dyn Any;
+}
+
+impl Debug for dyn Buffer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Buffer: size: {} access: {:?}",
+            self.len(),
+            self.access()
+        )
+    }
+}
diff --git a/crates/wasi-parallel/src/device/sequential.rs b/crates/wasi-parallel/src/device/sequential.rs
new file mode 100644
index 000000000000..bdcb24a608f5
--- /dev/null
+++ b/crates/wasi-parallel/src/device/sequential.rs
@@ -0,0 +1,74 @@
+//! Implements a `wasi-parallel` device that solely executes sequentially.
+
+use super::wasm_memory_buffer::as_pointer_and_length;
+use super::{wasm_memory_buffer::WasmMemoryBuffer, Buffer, Device};
+use crate::context::Kernel;
+use crate::witx::types::{BufferAccessKind, DeviceKind};
+use anyhow::{Context, Result};
+
+pub struct SequentialDevice;
+
+impl SequentialDevice {
+    pub fn new() -> Box<dyn Device> {
+        Box::new(Self)
+    }
+}
+
+impl Device for SequentialDevice {
+    fn kind(&self) -> DeviceKind {
+        DeviceKind::Sequential
+    }
+
+    fn name(&self) -> String {
+        "sequential implementation".into()
+    }
+
+    fn create_buffer(&self, size: i32, access: BufferAccessKind) -> Box<dyn Buffer> {
+        Box::new(WasmMemoryBuffer::new(size as u32, access))
+    }
+
+    fn parallelize(
+        &mut self,
+        kernel: Kernel,
+        num_iterations: i32,
+        block_size: i32,
+        in_buffers: Vec<&Box<dyn Buffer>>,
+        out_buffers: Vec<&Box<dyn Buffer>>,
+    ) -> Result<()> {
+        // JIT-compile and instantiate the parallel kernel.
+        let module = wasmtime::Module::new(kernel.engine(), kernel.module())
+            .context("unable to compile kernel module")?;
+        let mut store = wasmtime::Store::new(kernel.engine(), ());
+        let imports = vec![kernel.memory().clone().into()];
+        let instance = wasmtime::Instance::new(&mut store, &module, &imports)
+            .context("failed to construct kernel instance")?;
+
+        // Setup the buffer pointers.
+        let buffers =
+            as_pointer_and_length(in_buffers.into_iter().chain(out_buffers.into_iter())).unwrap();
+
+        let kernel_fn = instance
+            .get_func(&mut store, Kernel::NAME)
+            .expect("failed to find kernel function");
+
+        // Run each iteration of the parallel kernel sequentially.
+        for iteration_id in 0..num_iterations {
+            log::debug!("executing iteration {}", iteration_id);
+
+            // Setup the parameters for the parallel execution.
+            let mut params = vec![
+                iteration_id.into(),
+                num_iterations.into(),
+                block_size.into(),
+            ];
+            params.extend_from_slice(&buffers);
+
+            // Call the `kernel` function.
+            kernel_fn
+                .call(&mut store, &params[..], &mut [])
+                .expect("failed to run kernel")
+        }
+
+        Ok(())
+    }
+}
diff --git a/crates/wasi-parallel/src/device/wasm_memory_buffer.rs b/crates/wasi-parallel/src/device/wasm_memory_buffer.rs
new file mode 100644
index 000000000000..a8404191135f
--- /dev/null
+++ b/crates/wasi-parallel/src/device/wasm_memory_buffer.rs
@@ -0,0 +1,164 @@
+//! Contains a reference to a slice of Wasm memory.
+//!
+use super::Buffer;
+use crate::witx::types::BufferAccessKind;
+use anyhow::{anyhow, bail, Context, Result};
+use std::{any::Any, convert::TryInto, vec};
+use wasmtime::Val;
+use wiggle::GuestPtr;
+
+/// This kind of buffer is designed to live exclusively in Wasm memory--it
+/// contains the information necessary for reading and writing to Wasm memory.
+/// Its lifecycle involves:
+/// - The buffer is created by `create_buffer`; this associates a buffer ID with
+///   a device ID, but the buffer has no knowledge of what data it may contain,
+///   only its length.
+/// - The buffer may then be written to by `write_buffer`; at this point the
+///   buffer will record the its offset within the Wasm memory.
+/// - When used by `parallel_exec`, this buffer will simply pass its offset and
+///   length to the parallel kernel, where it will be mutated by a Wasm
+///   function.
+/// - The buffer contents may "read" from one section of the Wasm memory to
+///   another.
+pub struct WasmMemoryBuffer {
+    offset: Option<u32>,
+    length: u32,
+    access: BufferAccessKind,
+}
+
+impl WasmMemoryBuffer {
+    pub fn new(size: u32, access: BufferAccessKind) -> Self {
+        Self {
+            offset: None,
+            length: size,
+            access,
+        }
+    }
+}
+
+impl Buffer for WasmMemoryBuffer {
+    fn len(&self) -> u32 {
+        self.length
+    }
+
+    fn access(&self) -> BufferAccessKind {
+        self.access
+    }
+
+    /// Does not copy data: simply checks that the lengths of the buffer and
+    /// guest slice match and then records the starting location of the guest
+    /// pointer. This will require some re-thinking once multiple memories are
+    /// possible (TODO).
+    fn write(&mut self, slice: GuestPtr<[u8]>) -> Result<()> {
+        if slice.len() == self.len() {
+            self.offset = Some(slice.offset_base());
+            Ok(())
+        } else {
+            Err(anyhow!(
+                "The slice to write did not match the buffer size: {} != {}",
+                slice.len(),
+                self.len(),
+            ))
+        }
+    }
+
+    /// This implementation of `read` will attempt to copy the device data, held
+    /// in Wasm memory, to another location in Wasm memory. Currently it will
+    /// fail if the slices are overlapping (TODO). At some point, this should
+    /// also see if the `read` is from and to the same slice and avoid the copy
+    /// entirely (TODO).
+    fn read(&self, slice: GuestPtr<[u8]>) -> Result<()> {
+        debug_assert_eq!(slice.len(), self.len());
+        let mem = slice.mem().base();
+        let mem = unsafe { std::slice::from_raw_parts_mut(mem.0, mem.1 as usize) };
+        copy_within_a_slice(
+            mem,
+            self.offset.unwrap() as usize,
+            slice.offset_base() as usize,
+            slice.len() as usize,
+        );
+        Ok(())
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self as &dyn Any
+    }
+}
+
+/// This helper copies one sub-slice to another within a mutable slice. It will
+/// panic if the slices are overlapping.
+fn copy_within_a_slice<T: Clone>(v: &mut [T], from: usize, to: usize, len: usize) {
+    if from == to {
+        // Do nothing.
+    } else if from > to {
+        let (dst, src) = v.split_at_mut(from);
+        dst[to..to + len].clone_from_slice(&src[..len]);
+    } else {
+        let (src, dst) = v.split_at_mut(to);
+        dst[..len].clone_from_slice(&src[from..from + len]);
+    }
+}
+
+/// Convert an iterator of [`WasmMemoryBuffer`] into their corresponding
+/// WebAssembly pointer address and length.
+pub fn as_pointer_and_length<'a, I>(buffers: I) -> Result<Vec<Val>>
+where
+    I: Iterator<Item = &'a Box<dyn Buffer>>,
+{
+    let mut results = vec![];
+    for b in buffers {
+        if let Some(b_) = b.as_any().downcast_ref::<WasmMemoryBuffer>() {
+            let len = b
+                .len()
+                .try_into()
+                .context("the buffer length is too large for an i32")?;
+            if let Some(offset) = b_.offset {
+                results.push(Val::I32(offset.try_into().unwrap()));
+                results.push(Val::I32(len));
+            } else {
+                bail!("the buffer has not been written to: {:?}", b);
+                // TODO there must be a way to set up the buffer without writing
+                // to it; e.g., for write buffers that only the device touches.
+            }
+        } else {
+            bail!("the buffer is invalid; any buffer used by the CPU should be castable to a pointer + length");
+        }
+    }
+    Ok(results)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn copy_before() {
+        let mut buffer = [0u8; 1024];
+        buffer[42] = 42;
+        copy_within_a_slice(&mut buffer, 42, 41, 1);
+        assert_eq!(buffer[41], 42);
+    }
+
+    #[test]
+    fn copy_same_location() {
+        let mut buffer = [0u8; 1024];
+        buffer[42] = 42;
+        copy_within_a_slice(&mut buffer, 42, 42, 1);
+        assert_eq!(buffer[42], 42);
+    }
+
+    #[test]
+    fn copy_after() {
+        let mut buffer = [0u8; 1024];
+        buffer[42] = 42;
+        copy_within_a_slice(&mut buffer, 42, 43, 1);
+        assert_eq!(buffer[43], 42);
+    }
+
+    #[test]
+    #[should_panic]
+    fn copy_overlapping() {
+        let mut buffer = [0u8; 1024];
+        copy_within_a_slice(&mut buffer, 42, 43, 2);
+    }
+}
diff --git a/crates/wasi-parallel/src/impl.rs b/crates/wasi-parallel/src/impl.rs
new file mode 100644
index 000000000000..d41c7d72e374
--- /dev/null
+++ b/crates/wasi-parallel/src/impl.rs
@@ -0,0 +1,55 @@
+//! This module translates the calls from the Wasm program (running in a Wasm
+//! engine) into something usable by the `wasi-parallel` context (running in the
+//! Wasm host). Wiggle performs most of the conversions using `from_witx!` in
+//! `witx.rs` but the special nature of `parallel_exec` (it can call back into a
+//! function in the Wasm module) involves a manual implementation of this glue
+//! code elsewhere.
+
+use crate::witx::types::{
+    Buffer, BufferAccessKind, BufferData, BufferSize, DeviceKind, ParallelDevice,
+};
+use crate::witx::wasi_ephemeral_parallel::WasiEphemeralParallel;
+use crate::{WasiParallel, WasiParallelError};
+
+impl WasiEphemeralParallel for WasiParallel {
+    fn get_device(&mut self, hint: DeviceKind) -> Result<ParallelDevice, WasiParallelError> {
+        let id = self.ctx.borrow().get_device(hint)?;
+        Ok(ParallelDevice::from(id))
+    }
+
+    fn create_buffer(
+        &mut self,
+        device: ParallelDevice,
+        size: BufferSize,
+        kind: BufferAccessKind,
+    ) -> Result<Buffer, super::WasiParallelError> {
+        let id = self
+            .ctx
+            .borrow_mut()
+            .create_buffer(device.into(), size as i32, kind)?;
+        Ok(Buffer::from(id))
+    }
+
+    fn write_buffer<'a>(
+        &mut self,
+        data: &BufferData<'a>,
+        buffer: Buffer,
+    ) -> Result<(), super::WasiParallelError> {
+        let mut ctx = self.ctx.borrow_mut();
+        let buffer = ctx.get_buffer_mut(buffer.into())?;
+        buffer.write(*data)?;
+        Ok(())
+    }
+
+    fn read_buffer<'a>(
+        &mut self,
+        buffer: Buffer,
+        data: &BufferData<'a>,
+    ) -> Result<(), super::WasiParallelError> {
+        let ctx = self.ctx.borrow_mut();
+        let buffer = ctx.get_buffer(buffer.into())?;
+        buffer.read(*data)
+    }
+
+    // Note: `parallel_exec` is manually linked in `lib.rs`.
+}
diff --git a/crates/wasi-parallel/src/lib.rs b/crates/wasi-parallel/src/lib.rs
new file mode 100644
index 000000000000..e0c058c9cb9d
--- /dev/null
+++ b/crates/wasi-parallel/src/lib.rs
@@ -0,0 +1,126 @@
+//! Implement [`wasi-parallel`].
+//!
+//! [`wasi-parallel`]: https://github.com/WebAssembly/wasi-parallel
+
+mod context;
+mod device;
+mod r#impl;
+mod witx;
+
+use anyhow::Result;
+use context::WasiParallelContext;
+use std::cell::RefCell;
+use wasmtime::{Caller, Extern, SharedMemory, Trap};
+use wiggle::GuestError;
+
+/// This struct solely wraps a `wasi-parallel` context in a `RefCell`.
+pub struct WasiParallel {
+    pub(crate) ctx: RefCell<WasiParallelContext>,
+}
+
+impl WasiParallel {
+    pub fn new() -> Self {
+        Self {
+            ctx: RefCell::new(WasiParallelContext::new()),
+        }
+    }
+}
+
+/// Define the ways wasi-parallel can fail.
+pub type WasiParallelError = anyhow::Error;
+
+/// Re-export the Wiggle-generated `add_to_linker` function. Because
+/// `WasiParallelContext` needs access to `Caller` (i.e., to retrieve the
+/// `Engine`) and the wiggle infrastructure does not support this, this API call
+/// is skipped during wiggle generation (see `skip` in `witx.rs`) and manually
+/// implemented here.
+pub fn add_to_linker<T>(
+    linker: &mut wasmtime::Linker<T>,
+    get_cx: impl Fn(&mut T) -> &mut WasiParallel + Send + Sync + Copy + 'static,
+) -> anyhow::Result<()> {
+    witx::wasi_ephemeral_parallel::add_to_linker(linker, get_cx)?;
+
+    // At one time, this code was auto-generated by
+    // `wiggle_generate::wasmtime::generate_func`.
+    linker.func_wrap(
+        "wasi_ephemeral_parallel",
+        "parallel_exec",
+        move |mut caller: Caller<'_, T>,
+              arg0: i32,
+              arg1: i32,
+              arg2: i32,
+              arg3: i32,
+              arg4: i32,
+              arg5: i32,
+              arg6: i32,
+              arg7: i32,
+              arg8: i32|
+              -> Result<i32, Trap> {
+            // A module using wasi-parallel must export a shared memory named
+            // 'memory'. This is what is passed to Wiggle.
+            let shared_memory = get_exported_shared_memory(&mut caller)?;
+            let mem = unsafe {
+                std::slice::from_raw_parts_mut(
+                    shared_memory.data() as *mut u8,
+                    shared_memory.data_size(),
+                )
+            };
+            let mem = wiggle::wasmtime::WasmtimeGuestMemory::new(mem);
+
+            // Parse the arguments using wiggle. Ideally this would happen
+            // directly in wiggle-generated code but our need for `Caller`
+            // forces us to do it here. Changes from the auto-generated
+            // are marked with TODOs.
+            let device_id = arg0;
+            let kernel = wiggle::GuestPtr::<[u8]>::new(&mem, (arg1 as u32, arg2 as u32));
+            let num_threads = arg3; // TODO: as u32
+            let block_size = arg4; // TODO: as u32
+            let in_buffers = wiggle::GuestPtr::<[i32]>::new(&mem, (arg5 as u32, arg6 as u32)); // TODO ...<[Buffer]>
+            let out_buffers = wiggle::GuestPtr::<[i32]>::new(&mem, (arg7 as u32, arg8 as u32)); // TODO: ...<[Buffer]>
+
+            // To properly compile the function we need to use the same engine
+            // as the caller.
+            let engine = &caller.engine().clone();
+
+            // Call the wasi-parallel context with all the right Rust types.
+            let outer_ctx = get_cx(caller.data_mut());
+            let mut inner_ctx = outer_ctx.ctx.borrow_mut();
+            let kernel = &*kernel.as_slice().map_err(stringify_wiggle_err)?;
+            let in_buffers = &*in_buffers.as_slice().map_err(stringify_wiggle_err)?;
+            let out_buffers = &*out_buffers.as_slice().map_err(stringify_wiggle_err)?;
+            let result = inner_ctx.invoke_parallel_for(
+                device_id,
+                kernel,
+                engine,
+                shared_memory,
+                num_threads,
+                block_size,
+                in_buffers,
+                out_buffers,
+            );
+            match result {
+                Ok(_) => Ok(0), // TODO: <ParErrno as wiggle::GuestErrorType>::success() as i32
+                Err(e) => Err(Trap::new(e.to_string())),
+            }
+        },
+    )?;
+
+    Ok(())
+}
+
+/// Retrieve the exported `"memory"` from the `Caller`; this is a helper
+/// implementation for simplifying the `parallel_for` closure. Usually this is
+/// auto-generated inline by `wiggle_generate::wasmtime::generate_func`.
+pub(crate) fn get_exported_shared_memory<T>(caller: &mut Caller<T>) -> Result<SharedMemory, Trap> {
+    match caller.get_export("memory") {
+        Some(Extern::SharedMemory(m)) => Ok(m),
+        _ => Err(Trap::new("missing required shared memory export: 'memory'")),
+    }
+}
+
+/// Construct a Wasmtime [`Trap`] from a Wiggle [`GuestError`]; this uses the
+/// same logic as `From<GuestError> for wiggle::Trap` but with one less
+/// conversion.
+fn stringify_wiggle_err(err: GuestError) -> Trap {
+    Trap::new(err.to_string())
+}
diff --git a/crates/wasi-parallel/src/witx.rs b/crates/wasi-parallel/src/witx.rs
new file mode 100644
index 000000000000..7f7cf40692ef
--- /dev/null
+++ b/crates/wasi-parallel/src/witx.rs
@@ -0,0 +1,33 @@
+//! Contains the macro-generated implementation of wasi-nn from its WITX
+//! definition file.
+use crate::{WasiParallel, WasiParallelError};
+
+// Generate the traits and types of wasi-parallel to several Rust modules (e.g.
+// `types`). TODO: eventually re-add Git submodule for auto-retrieval of the
+// specification.
+wiggle::from_witx!({
+    witx: ["$WASI_ROOT/wasi-parallel.witx"],
+    errors: { par_errno => WasiParallelError },
+    skip: ["parallel_exec"],
+});
+
+use types::ParErrno;
+
+// Additionally, we must let Wiggle know which of our error codes represents a
+// successful operation.
+impl wiggle::GuestErrorType for ParErrno {
+    fn success() -> Self {
+        Self::Success
+    }
+}
+
+// Provide a way to map errors from the `WasiEphemeralTrait` (see `impl.rs`) to
+// the WITX-defined error type.
+impl wasi_ephemeral_parallel::UserErrorConversion for WasiParallel {
+    fn par_errno_from_wasi_parallel_error(
+        &mut self,
+        e: anyhow::Error,
+    ) -> Result<crate::witx::types::ParErrno, wiggle::Trap> {
+        todo!("must handle error: {}", e)
+    }
+}
diff --git a/crates/wasi-parallel/tests/rust/buffer.rs b/crates/wasi-parallel/tests/rust/buffer.rs
new file mode 100644
index 000000000000..00459950cf07
--- /dev/null
+++ b/crates/wasi-parallel/tests/rust/buffer.rs
@@ -0,0 +1,27 @@
+//! This test checks that wasi-parallel's `read_buffer`/`write_buffer` work as
+//! expected on a CPU. This is intended to be compiled to Wasm by `build.rs`,
+//! but to run it directly:
+//!
+//! ```
+//! rustc tests/rust/buffer.rs --target wasm32-wasi
+//! RUST_BACKTRACE=1 wasmtime run --wasi-modules=experimental-wasi-parallel ./buffer.wasm
+//! ```
+#[allow(dead_code)]
+mod wasi_parallel;
+
+use wasi_parallel::{BufferAccessKind, DeviceKind};
+
+fn main() -> Result<(), wasi_parallel::ParErrno> {
+    let source = [0xFF; 1024];
+    let destination = [0x00; 1024];
+    assert!(source != destination);
+
+    let device = wasi_parallel::get_device(DeviceKind::Cpu)?;
+    let source_buffer =
+        wasi_parallel::create_buffer(&device, source.len() as u32, BufferAccessKind::Read)?;
+    wasi_parallel::write_buffer(&source, &source_buffer)?;
+    wasi_parallel::read_buffer(&source_buffer, &destination)?;
+
+    assert_eq!(source, destination);
+    Ok(())
+}
diff --git a/crates/wasi-parallel/tests/rust/wasi_parallel.rs b/crates/wasi-parallel/tests/rust/wasi_parallel.rs
new file mode 100644
index 000000000000..4592e816491f
--- /dev/null
+++ b/crates/wasi-parallel/tests/rust/wasi_parallel.rs
@@ -0,0 +1,227 @@
+//! This module provides Rust bindings to the wasi-parallel API as implemented
+//! in this crate. It was generated at
+//! https://alexcrichton.github.io/witx-bindgen using the WITX below and altered
+//! slightly since Wiggle's generated signatures are slightly different than
+//! witx-bindgen's:
+//!
+//! ```
+//! resource Device
+//! enum DeviceKind {
+//!     Cpu,
+//!     DiscreteGpu,
+//!     IntegratedGpu
+//! }
+//! resource Buffer
+//! enum BufferAccessKind {
+//!     Read,
+//!     Write,
+//!     ReadWrite,
+//! }
+//! enum ParErrno {
+//!     Success
+//! }
+//! get_device: function(hint: DeviceKind) -> expected<Device, ParErrno>
+//! create_buffer: function(device: Device, size: u32, buffer_access_kind: BufferAccessKind) -> expected<Buffer, ParErrno>
+//! write_buffer: function(source: list<u8>, destination: Buffer) -> expected<_, ParErrno>
+//! read_buffer: function(source: Buffer, destination: list<u8>) -> expected<_, ParErrno>
+//! parallel_for: function(device: Device, kernel: list<u8>, num_iterations: u32, block_size: u32, in_buffers: list<Buffer>, out_buffers: list<Buffer>) -> expected<_, ParErrno>
+//! ```
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum DeviceKind {
+    Cpu,
+    DiscreteGpu,
+    IntegratedGpu,
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum BufferAccessKind {
+    Read,
+    Write,
+    ReadWrite,
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum ParErrno {
+    Success,
+    Error(i32),
+}
+
+#[derive(Debug)]
+#[repr(transparent)]
+pub struct Device(i32);
+impl Device {
+    pub unsafe fn from_raw(raw: i32) -> Self {
+        Self(raw)
+    }
+
+    pub fn into_raw(self) -> i32 {
+        let ret = self.0;
+        core::mem::forget(self);
+        return ret;
+    }
+
+    pub fn as_raw(&self) -> i32 {
+        self.0
+    }
+}
+// Removed `impl Drop for Device { ... }`
+
+#[derive(Debug)]
+#[repr(transparent)]
+pub struct Buffer(i32);
+impl Buffer {
+    pub unsafe fn from_raw(raw: i32) -> Self {
+        Self(raw)
+    }
+
+    pub fn into_raw(self) -> i32 {
+        let ret = self.0;
+        core::mem::forget(self);
+        return ret;
+    }
+
+    pub fn as_raw(&self) -> i32 {
+        self.0
+    }
+}
+// Removed `impl Drop for Buffer { ... }`
+
+pub fn get_device(hint: DeviceKind) -> Result<Device, ParErrno> {
+    unsafe {
+        let ptr0 = RET_AREA.as_mut_ptr() as i32;
+        #[link(wasm_import_module = "wasi_ephemeral_parallel")]
+        extern "C" {
+            #[cfg_attr(target_arch = "wasm32", link_name = "get_device")]
+            #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_get_device")]
+            fn witx_import(_: i32, _: i32) -> i32;
+        }
+        let result = witx_import(hint as i32, ptr0);
+        match result {
+            0 => Ok(Device(*(ptr0 as *const i32))),
+            _ => Err(ParErrno::Error(result)),
+        }
+    }
+}
+
+pub fn create_buffer(
+    device: &Device,
+    size: u32,
+    buffer_access_kind: BufferAccessKind,
+) -> Result<Buffer, ParErrno> {
+    unsafe {
+        let ptr1 = RET_AREA.as_mut_ptr() as i32;
+        #[link(wasm_import_module = "wasi_ephemeral_parallel")]
+        extern "C" {
+            #[cfg_attr(target_arch = "wasm32", link_name = "create_buffer")]
+            #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_create_buffer")]
+            fn witx_import(_: i32, _: i32, _: i32, _: i32) -> i32;
+        }
+        let result = witx_import(device.0, size as i32, buffer_access_kind as i32, ptr1);
+        match result {
+            0 => Ok(Buffer(*(ptr1 as *const i32))),
+            _ => Err(ParErrno::Error(result)),
+        }
+    }
+}
+
+pub fn write_buffer(source: &[u8], destination: &Buffer) -> Result<(), ParErrno> {
+    unsafe {
+        let vec2 = source;
+        let ptr2 = vec2.as_ptr() as i32;
+        let len2 = vec2.len() as i32;
+        #[link(wasm_import_module = "wasi_ephemeral_parallel")]
+        extern "C" {
+            #[cfg_attr(target_arch = "wasm32", link_name = "write_buffer")]
+            #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_write_buffer")]
+            fn witx_import(_: i32, _: i32, _: i32) -> i32;
+        }
+        let result = witx_import(ptr2, len2, destination.0);
+        match result {
+            0 => Ok(()),
+            _ => Err(ParErrno::Error(result)),
+        }
+    }
+}
+
+pub fn read_buffer(source: &Buffer, destination: &[u8]) -> Result<(), ParErrno> {
+    unsafe {
+        let vec4 = destination;
+        let ptr4 = vec4.as_ptr() as i32;
+        let len4 = vec4.len() as i32;
+        #[link(wasm_import_module = "wasi_ephemeral_parallel")]
+        extern "C" {
+            #[cfg_attr(target_arch = "wasm32", link_name = "read_buffer")]
+            #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_read_buffer")]
+            fn witx_import(_: i32, _: i32, _: i32) -> i32;
+        }
+        let result = witx_import(source.0, ptr4, len4);
+        match result {
+            0 => Ok(()),
+            _ => Err(ParErrno::Error(result)),
+        }
+    }
+}
+
+pub fn parallel_for(
+    kernel: u32,
+    num_threads: u32,
+    block_size: u32,
+    in_buffers: &[&Buffer],
+    out_buffers: &[&Buffer],
+) -> Result<(), ParErrno> {
+    unsafe {
+        let vec6 = in_buffers;
+        let len6 = vec6.len() as i32;
+        let layout6 = core::alloc::Layout::from_size_align_unchecked(vec6.len() * 4, 4);
+        let result6 = std::alloc::alloc(layout6);
+        if result6.is_null() {
+            std::alloc::handle_alloc_error(layout6);
+        }
+        for (i, e) in vec6.into_iter().enumerate() {
+            let base = result6 as i32 + (i as i32) * 4;
+            {
+                *((base + 0) as *mut i32) = e.0;
+            }
+        }
+        let vec7 = out_buffers;
+        let len7 = vec7.len() as i32;
+        let layout7 = core::alloc::Layout::from_size_align_unchecked(vec7.len() * 4, 4);
+        let result7 = std::alloc::alloc(layout7);
+        if result7.is_null() {
+            std::alloc::handle_alloc_error(layout7);
+        }
+        for (i, e) in vec7.into_iter().enumerate() {
+            let base = result7 as i32 + (i as i32) * 4;
+            {
+                *((base + 0) as *mut i32) = e.0;
+            }
+        }
+        #[link(wasm_import_module = "wasi_ephemeral_parallel")]
+        extern "C" {
+            #[cfg_attr(target_arch = "wasm32", link_name = "parallel_for")]
+            #[cfg_attr(not(target_arch = "wasm32"), link_name = "input_parallel_for")]
+            fn witx_import(_: i32, _: i32, _: i32, _: i32, _: i32, _: i32, _: i32) -> i32;
+        }
+        let result = witx_import(
+            kernel as i32,
+            num_threads as i32,
+            block_size as i32,
+            result6 as i32,
+            len6,
+            result7 as i32,
+            len7,
+        );
+        std::alloc::dealloc(result6, layout6);
+        std::alloc::dealloc(result7, layout7);
+        match result {
+            0 => Ok(()),
+            _ => Err(ParErrno::Error(result)),
+        }
+    }
+}
+
+static mut RET_AREA: [i64; 2] = [0; 2];
diff --git a/crates/wasi-parallel/tests/test_case.rs b/crates/wasi-parallel/tests/test_case.rs
new file mode 100644
index 000000000000..d397b5f081dc
--- /dev/null
+++ b/crates/wasi-parallel/tests/test_case.rs
@@ -0,0 +1,145 @@
+//! Simplify testing of `wasi-parallel` modules.
+//!
+//! Due to several factors (toolchains, APIs, etc.) the set up for running
+//! `wasi-parallel` code is still quite complicated. This module helps set up
+//! the required bits for testing and benchmarking.
+
+#![allow(dead_code)]
+
+use anyhow::{Context, Result};
+use wasmtime::{Config, Engine, Func, Linker, Module, SharedMemory, Store, Val};
+
+/// Helper structure for setting up a wasi-parallel environment.
+pub struct TestCase {
+    store: Store<TestEnvironment>,
+    linker: Linker<TestEnvironment>,
+    memory: Option<SharedMemory>,
+}
+
+impl TestCase {
+    /// Compile the WebAssembly at path.
+    pub fn new(path: &str, engine: Engine, import_memory: Option<SharedMemory>) -> Result<Self> {
+        let _ = pretty_env_logger::try_init();
+
+        // Import the WASI definitions.
+        let mut store = Store::new(&engine, TestEnvironment::new());
+        let mut linker = Linker::<TestEnvironment>::new(&engine);
+        wasmtime_wasi_parallel::add_to_linker(&mut linker, |cx| &mut cx.parallel)?;
+        wasmtime_wasi::add_to_linker(&mut linker, |cx| &mut cx.common)?;
+
+        // Import the shared memory, if provided.
+        if let Some(import_memory) = &import_memory {
+            linker.define("", "memory", import_memory.clone())?;
+        }
+
+        // Compile the module.
+        let module = Module::from_file(&engine, path)?;
+        linker.module(&mut store, "", &module)?;
+
+        // Gather up either the imported or exported memory for later use.
+        let memory = if let Some(import_memory) = import_memory {
+            Some(import_memory)
+        } else if let Some(export) = linker.get(&mut store, "", "memory") {
+            let export_memory = export
+                .into_shared_memory()
+                .context("expect the 'memory' export to be a shared memory")?;
+            Some(export_memory)
+        } else {
+            None
+        };
+
+        Ok(Self {
+            store,
+            linker,
+            memory,
+        })
+    }
+
+    /// Provide access to the store.
+    pub fn store(&mut self) -> &mut Store<TestEnvironment> {
+        &mut self.store
+    }
+
+    /// Conveniently return the shared memory as a slice.
+    pub fn memory_as_slice(&self) -> &[u8] {
+        let memory = self
+            .memory
+            .as_ref()
+            .expect("expected an imported or exported shared memory");
+        unsafe { std::slice::from_raw_parts_mut(memory.data() as *mut u8, memory.data_size()) }
+    }
+
+    /// Provide a convenient way to invoke a function.
+    pub fn invoke(&mut self, name: &str, args: &[Val]) -> Result<Vec<Val>> {
+        let func = self.get_function(name)?;
+        let num_results = func.ty(&self.store).results().len();
+        let mut results = vec![Val::I32(-1); num_results];
+        func.call(&mut self.store, &args, &mut results)?;
+        Ok(results)
+    }
+
+    /// Retrieve the exported `name` function.
+    fn get_function(&mut self, name: &str) -> Result<Func> {
+        Ok(self
+            .linker
+            .get(&mut self.store, "", name)
+            .context("unable to find function of the given name")?
+            .into_func()
+            .context("the export was not a function")?)
+    }
+
+    /// Retrieve the default entry function.
+    fn get_default_function(&mut self) -> Result<Func> {
+        self.linker.get_default(&mut self.store, "")
+    }
+}
+
+pub struct TestEnvironment {
+    common: wasmtime_wasi::WasiCtx,
+    parallel: wasmtime_wasi_parallel::WasiParallel,
+}
+
+impl TestEnvironment {
+    fn new() -> Self {
+        Self {
+            common: wasmtime_wasi::WasiCtxBuilder::new().inherit_stdio().build(),
+            parallel: wasmtime_wasi_parallel::WasiParallel::new(),
+        }
+    }
+}
+
+/// Configure the engine to use the threads proposal (i.e., to enable shared
+/// memory).
+pub fn default_engine() -> Engine {
+    let mut config = Config::new();
+    config.wasm_threads(true);
+    let engine = Engine::new(&config).unwrap();
+    engine
+}
+
+/// Execute the default function of a test case and return its exit code.
+pub fn exec(path: &str) -> Result<i32> {
+    let mut test_case = TestCase::new(path, default_engine(), None).unwrap();
+    let func = test_case.get_default_function().unwrap();
+    let func = func.typed::<(), i32, _>(&mut test_case.store).unwrap();
+    func.call(&mut test_case.store, ())
+        .context("failed to execute default function")
+}
+
+/// Helper for describing the kinds of parallel devices.
+#[derive(Clone, Copy, Debug)]
+pub enum Parallelism {
+    Cpu,
+    Sequential,
+}
+
+impl Parallelism {
+    pub fn as_device_kind(&self) -> i32 {
+        // These values must be kept up-to-date with
+        // https://github.com/WebAssembly/wasi-parallel/blob/main/wasi-parallel.witx.
+        match self {
+            Parallelism::Sequential => 0,
+            Parallelism::Cpu => 1,
+        }
+    }
+}
diff --git a/crates/wasi-parallel/tests/wasm.rs b/crates/wasi-parallel/tests/wasm.rs
new file mode 100644
index 000000000000..e8d36728f4fa
--- /dev/null
+++ b/crates/wasi-parallel/tests/wasm.rs
@@ -0,0 +1,15 @@
+//! Run the test cases in the `wasm` directory.
+//!
+//! See the `build.rs` file for how these WebAssembly binaries are generated.
+
+mod test_case;
+
+#[cfg(feature = "build-tests")]
+#[test]
+fn run_buffer_rs() {
+    let mut test_case =
+        test_case::TestCase::new("tests/wasm/buffer.wasm", test_case::default_engine(), None)
+            .unwrap();
+    let results = test_case.invoke("main", &[0.into(), 0.into()]).unwrap();
+    assert_eq!(results[0].i32().unwrap(), 0);
+}
diff --git a/crates/wasi-parallel/tests/wat.rs b/crates/wasi-parallel/tests/wat.rs
new file mode 100644
index 000000000000..26f798afa9bd
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat.rs
@@ -0,0 +1,151 @@
+//! Run the test cases in the `wat` directory.
+
+mod test_case;
+
+use std::convert::TryInto;
+use test_case::*;
+use wasmtime::{MemoryType, SharedMemory};
+
+#[test]
+fn run_example_wat() {
+    assert_eq!(exec("tests/wat/example.wat").unwrap(), 0);
+}
+
+// Though a bit overkill, it is very convenient when troubleshooting to be able
+// to check that we can successfully run a single iteration of a kernel as well.
+#[test]
+fn run_example_kernel_wat() {
+    // Setup the imported memory.
+    let engine = default_engine();
+    let memory = SharedMemory::new(&engine, MemoryType::shared(1, 1)).unwrap();
+
+    // Execute the kernel once (mimic running the 42nd iteration).
+    let mut test_case =
+        TestCase::new("tests/wat/example-kernel.wat", engine, Some(memory)).unwrap();
+    let _ = test_case
+        .invoke("kernel", &[42.into(), 0.into(), 0.into()])
+        .unwrap();
+
+    // Check that the first four bytes were stored correctly.
+    let first_4_bytes = &test_case.memory_as_slice()[0..4];
+    let first_i32 = i32::from_le_bytes(first_4_bytes.try_into().unwrap());
+    assert_eq!(first_i32, 43);
+}
+
+// Here we check that we can actually run the nstream benchmark. This is also
+// available using `cargo bench`, but this checks different parameters.
+#[test]
+fn run_nstream_wat() {
+    const NUM_THREADS: i32 = 24;
+    const NUM_ITEMS: i32 = 200000;
+    const DEVICE_KIND: i32 = 0x1; // CPU
+
+    // Setup the benchmark.
+    let mut test_case = TestCase::new("tests/wat/nstream.wat", default_engine(), None).unwrap();
+    let _ = test_case
+        .invoke(
+            "setup",
+            &[NUM_THREADS.into(), NUM_ITEMS.into(), DEVICE_KIND.into()],
+        )
+        .unwrap();
+
+    // Execute the benchmark.
+    let _ = test_case.invoke("execute", &[]).unwrap();
+
+    // Run the finalize step.
+    let results = test_case.invoke("finish", &[]).unwrap();
+    assert_eq!(results[0].i32().unwrap(), 0);
+}
+
+// For troubleshooting, we to check that we can successfully run a single
+// iteration of the nstream kernel as well.
+#[test]
+fn run_nstream_kernel_wat() {
+    // Set up the shared memory; nstream operates on memory buffers with
+    // specific floating point values that we initialize manually here.
+    const NUM_ITERATIONS: usize = 10;
+    const BLOCK_SIZE: usize = 10;
+    const ITEM_SIZE: usize = std::mem::size_of::<f32>();
+    const BUFFER_SIZE: usize = NUM_ITERATIONS * BLOCK_SIZE * ITEM_SIZE;
+    let mut memory_image = [0u8; BUFFER_SIZE * 3];
+    for i in 0..NUM_ITERATIONS * BLOCK_SIZE {
+        let a_i = i * ITEM_SIZE;
+        memory_image[a_i..a_i + ITEM_SIZE].copy_from_slice(&0.0f32.to_le_bytes());
+        let b_i = BUFFER_SIZE + i * ITEM_SIZE;
+        memory_image[b_i..b_i + ITEM_SIZE].copy_from_slice(&2.0f32.to_le_bytes());
+        let c_i = BUFFER_SIZE + BUFFER_SIZE + i * ITEM_SIZE;
+        memory_image[c_i..c_i + ITEM_SIZE].copy_from_slice(&2.0f32.to_le_bytes());
+    }
+
+    // Copy to the imported shared memory.
+    let engine = default_engine();
+    let memory = SharedMemory::new(&engine, MemoryType::shared(0x800, 0x800)).unwrap();
+    assert!(memory.data_size() > memory_image.len());
+    unsafe {
+        std::ptr::copy(
+            memory_image.as_ptr(),
+            memory.data() as *mut u8,
+            memory_image.len(),
+        );
+    }
+
+    // Execute the kernel once (mimic running only the second iteration).
+    let mut test_case =
+        TestCase::new("tests/wat/nstream-kernel.wat", engine, Some(memory)).unwrap();
+    let params = vec![
+        // Second iteration (0-based).
+        1.into(),
+        (NUM_ITERATIONS as i32).into(),
+        (BLOCK_SIZE as i32).into(),
+        // Buffer A.
+        0.into(),
+        (BUFFER_SIZE as i32).into(),
+        // Buffer B.
+        (BUFFER_SIZE as i32).into(),
+        (BUFFER_SIZE as i32).into(),
+        // Buffer C.
+        (BUFFER_SIZE as i32 * 2).into(),
+        (BUFFER_SIZE as i32).into(),
+    ];
+    let _ = test_case.invoke("kernel", &params).unwrap();
+
+    // Check that the second iteration of the A buffer is filled in correctly.
+    let memory_as_f32s = unsafe {
+        std::slice::from_raw_parts(
+            test_case.memory_as_slice().as_ptr() as *const _ as *const f32,
+            NUM_ITERATIONS * BLOCK_SIZE,
+        )
+    };
+    assert_eq!(memory_as_f32s.len() * ITEM_SIZE, BUFFER_SIZE);
+    let iteration_memory = &memory_as_f32s[BLOCK_SIZE..BLOCK_SIZE + BLOCK_SIZE];
+    assert!(iteration_memory.iter().all(|f| f == &8.0));
+}
+
+// Here we check that we can actually run the nstream benchmark. This is also
+// available using `cargo bench`, but this checks different parameters.
+#[test]
+fn run_sum_wat() {
+    const NUM_THREADS: i32 = 8;
+    const ADDITIONS_PER_THREAD: i32 = 0x200_000;
+    const DEVICE_KIND: i32 = 0x1; // CPU
+
+    // Setup the benchmark.
+    let mut test_case = TestCase::new("tests/wat/sum.wat", default_engine(), None).unwrap();
+    let _ = test_case
+        .invoke(
+            "setup",
+            &[
+                NUM_THREADS.into(),
+                ADDITIONS_PER_THREAD.into(),
+                DEVICE_KIND.into(),
+            ],
+        )
+        .unwrap();
+
+    // Execute the benchmark.
+    let _ = test_case.invoke("execute", &[]).unwrap();
+
+    // Run the finalize step.
+    let results = test_case.invoke("finish", &[]).unwrap();
+    assert_eq!(results[0].i32().unwrap(), 0);
+}
diff --git a/crates/wasi-parallel/tests/wat/example-kernel.wat b/crates/wasi-parallel/tests/wat/example-kernel.wat
new file mode 100644
index 000000000000..00502c12c27e
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat/example-kernel.wat
@@ -0,0 +1,10 @@
+(module
+    (memory (import "" "memory") 1 1 shared)
+    (func $kernel (export "kernel") (param $iteration_id i32) (param $num_iterations i32) (param $block_size i32)
+        (i32.store
+            ;; Address of shared memory to store at--no buffer used.
+            (i32.const 0)
+            ;; Increment the iteration ID to avoid 0 and store this.
+            (i32.add (local.get $iteration_id) (i32.const 1))
+        ))
+)
diff --git a/crates/wasi-parallel/tests/wat/example.wat b/crates/wasi-parallel/tests/wat/example.wat
new file mode 100644
index 000000000000..310b2fb618dd
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat/example.wat
@@ -0,0 +1,56 @@
+;; A minimal example of wasi-parallel execution--each thread of execution writes
+;; to the same location in memory. This makes use of the implicit detail in the
+;; current implementation that allows a CPU-run kernel to modify the module's
+;; memory.
+;;
+;; Note how both the kernel code is accessible in linear memory. Without
+;; read-only access support in WebAssembly, the kernel could be overwritten
+;; (accidentally or maliciously), which is a risk in this "bag-of-bytes"
+;; paradigm.
+
+(module
+    (import "wasi_ephemeral_parallel" "get_device" (func $get_device
+        (param $hint i32)
+        (param $out_device i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "parallel_exec" (func $par_exec
+        (param $device i32)
+        (param $kernel_start i32)
+        (param $kernel_len i32)
+        (param $num_iterations i32)
+        (param $block_size i32)
+        (param $in_buffers_start i32)
+        (param $in_buffers_len i32)
+        (param $out_buffers_start i32)
+        (param $out_buffers_len i32)
+        (result i32)))
+
+    ;; The kernel here is the binary-encoded version of `example-kernel.wat`,
+    ;; using:
+    ;;
+    ;; $ wat2wasm tests/wat/example-kernel.wat --enable-threads --output=- | xxd -g 1 -p | sed -r 's/.{2}/\\&/g' | tr -d '\n'
+    ;;
+    ;; The length is calculated using `wc -c` and dividing by 3.
+    (memory (export "memory") 1 1 shared)
+    ;; Reserve 8 bytes for the return area, then emit the kernel:
+    (data (i32.const 8) "\00\61\73\6d\01\00\00\00\01\07\01\60\03\7f\7f\7f\00\02\0d\01\00\06\6d\65\6d\6f\72\79\02\03\01\01\03\02\01\00\07\0a\01\06\6b\65\72\6e\65\6c\00\00\0a\0e\01\0c\00\41\00\20\00\41\01\6a\36\02\00\0b")
+
+    (func (export "_start") (result i32)
+        (local $return_area i32)
+        (local $device i32)
+        (local.set $return_area (i32.const 0))
+
+        ;; Set up a CPU device.
+        (drop (call $get_device (i32.const 0x01) (local.get $return_area)))
+        (local.set $device (i32.load (local.get $return_area)))
+
+        ;; Execute the kernel in parallel.
+        (call $par_exec (local.get $device) (i32.const 8) (i32.const 64) (i32.const 12) (i32.const 4)
+            ;; Empty buffers:
+            (i32.const 0) (i32.const 0) (i32.const 0) (i32.const 0))
+
+        ;; Check that the parallel execution returned 0 (success) and that the
+        ;; memory was updated by an invocation of the kernel--if so, return 0.
+        (i32.eq (i32.load (i32.const 0)) (i32.const 0))
+        (i32.or))
+)
diff --git a/crates/wasi-parallel/tests/wat/nstream-kernel.wat b/crates/wasi-parallel/tests/wat/nstream-kernel.wat
new file mode 100644
index 000000000000..0e415cbfcd4a
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat/nstream-kernel.wat
@@ -0,0 +1,47 @@
+;; This hand-coded implementation of the nstream splits the work
+;; dynamically--i.e., in the kernel itself. See, e.g.,
+;; https://github.com/ParRes/Kernels/blob/default/Cxx11/nstream-tbb.cc#L132 for
+;; a higher-level implementation.
+
+(module
+    (memory (import "" "memory") 0x800 0x800 shared)
+    (func $kernel (export "kernel") (param $iteration_id i32) (param $num_iterations i32) (param $block_size i32) (param $A i32) (param $A_len i32) (param $B i32) (param $B_len i32) (param $C i32) (param $C_len i32)
+        (local $A_i i32)
+        (local $B_i i32)
+        (local $C_i i32)
+        (local $i i32)
+        (local $end i32)
+
+        ;; The division of the buffers between iterations happens here: the
+        ;; block size defines how many floating-point numbers each iteration
+        ;; will touch:
+        ;; i = iteration_id * block_size * 4;
+        ;; end = i + block_size * 4;
+        (local.set $i (i32.mul (i32.mul (local.get $iteration_id) (local.get $block_size)) (i32.const 4)))
+        (local.set $end (i32.add (local.get $i) (i32.mul (local.get $block_size) (i32.const 4))))
+
+        (block
+            (loop
+                (local.set $A_i (i32.add (local.get $A) (local.get $i)))
+                (local.set $B_i (i32.add (local.get $B) (local.get $i)))
+                (local.set $C_i (i32.add (local.get $C) (local.get $i)))
+
+                ;; Offset to store: A[i] = ...
+                (local.get $A_i)
+                ;; Value to store:   ... = A[i] + B[i] + 3.0 * C[i];
+                (f32.add
+                    (f32.load (local.get $A_i))
+                    (f32.add
+                        (f32.load (local.get $B_i))
+                        (f32.mul
+                            (f32.const 3.0)
+                            (f32.load (local.get $C_i)))))
+                (f32.store)
+
+                ;; Loop control--exit once we have looped through this
+                ;; iteration's portion of the buffer.
+                (local.set $i (i32.add (local.get $i) (i32.const 4)))
+                (i32.ge_s (local.get $i) (local.get $end))
+                (br_if 1)
+                (br 0))))
+)
diff --git a/crates/wasi-parallel/tests/wat/nstream.wat b/crates/wasi-parallel/tests/wat/nstream.wat
new file mode 100644
index 000000000000..c7ea29c165c3
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat/nstream.wat
@@ -0,0 +1,175 @@
+;; This hand-coded implementation of `nstream` uses wasi-parallel to split the
+;; `nstream` work--some light arithmetic with heavy memory access--across the
+;; parallelism available to `wasi-parallel`.
+;;
+;; See, e.g.,
+;; https://github.com/ParRes/Kernels/blob/default/Cxx11/nstream-tbb.cc#L132 for
+;; a higher-level implementation.
+
+(module
+    (import "wasi_ephemeral_parallel" "get_device" (func $get_device
+        (param $hint i32)
+        (param $out_device i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "create_buffer" (func $create_buffer
+        (param $device i32)
+        (param $size i32)
+        (param $access i32)
+        (param $out_buffer i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "write_buffer" (func $write_buffer
+        (param $data_offset i32)
+        (param $data_len i32)
+        (param $buffer i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "read_buffer" (func $read_buffer
+        (param $buffer i32)
+        (param $data_offset i32)
+        (param $data_len i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "parallel_exec" (func $par_exec
+        (param $device i32)
+        (param $kernel_start i32)
+        (param $kernel_len i32)
+        (param $num_iterations i32)
+        (param $block_size i32)
+        (param $in_buffers_start i32)
+        (param $in_buffers_len i32)
+        (param $out_buffers_start i32)
+        (param $out_buffers_len i32)
+        (result i32)))
+
+    ;; The kernel here is the binary-encoded version of `nstream-kernel.wat`,
+    ;; using:
+    ;;
+    ;; $ wat2wasm tests/wat/nstream-kernel.wat --enable-threads --output=- | xxd -g 1 -p | sed -r 's/.{2}/\\&/g' | tr -d '\n'
+    ;;
+    ;; The length is calculated using `wc -c` and dividing by 3.
+    (memory (export "memory") 0x800 0x800 shared)
+    ;; Reserve 12 bytes for the return area and buffer lists, then emit the
+    ;; kernel:
+    (data (i32.const 12) "\00\61\73\6d\01\00\00\00\01\0d\01\60\09\7f\7f\7f\7f\7f\7f\7f\7f\7f\00\02\0f\01\00\06\6d\65\6d\6f\72\79\02\03\80\10\80\10\03\02\01\00\07\0a\01\06\6b\65\72\6e\65\6c\00\00\0a\61\01\5f\01\05\7f\20\00\20\02\6c\41\04\6c\21\0c\20\0c\20\02\41\04\6c\6a\21\0d\02\40\03\40\20\03\20\0c\6a\21\09\20\05\20\0c\6a\21\0a\20\07\20\0c\6a\21\0b\20\09\20\09\2a\02\00\20\0a\2a\02\00\43\00\00\40\40\20\0b\2a\02\00\94\92\92\38\02\00\20\0c\41\04\6a\21\0c\20\0c\20\0d\4e\0d\01\0c\00\0b\0b\0b")
+
+    ;; Global values overwritten by `setup`.
+    (global $num_threads (mut i32) (i32.const -1))
+    (global $block_size (mut i32) (i32.const -1))
+    (global $buffer_size (mut i32) (i32.const -1))
+    (global $device (mut i32) (i32.const -1))
+    (global $A (mut i32) (i32.const -1))
+    (global $B (mut i32) (i32.const -1))
+    (global $C (mut i32) (i32.const -1))
+
+
+    (func (export "setup") (param $num_threads i32) (param $num_items i32) (param $device_kind i32)
+        (local $return_area i32)
+        (local $device i32)
+        (local $len i32)
+        (local $memA i32)
+        (local $memB i32)
+        (local $memC i32)
+        (local $A i32)
+        (local $B i32)
+        (local $C i32)
+
+        ;; Save some setup parameters for later.
+        (global.set $num_threads (local.get $num_threads))
+        (global.set $block_size (i32.add
+            (i32.div_u (local.get $num_items) (local.get $num_threads))
+            ;; Over-estimate the block size by 1 if there is a remainder to
+            ;; account for.
+            (if (result i32) (i32.rem_u (local.get $num_items) (local.get $num_threads))
+                (then (i32.const 1))
+                (else (i32.const 0))
+            )
+        ))
+        (global.set $buffer_size (i32.mul
+            (i32.mul (global.get $block_size) (i32.const 4))
+            (global.get $num_threads)
+        ))
+
+        ;; Assign some pointers, skipping the first section of memory because
+        ;; it contains the return area, the kernel bytes, etc.
+        (local.set $return_area (i32.const 0x00))
+        (local.set $memA (global.get $buffer_size))
+        (local.set $memB (i32.mul (global.get $buffer_size) (i32.const 2)))
+        (local.set $memC (i32.mul (global.get $buffer_size) (i32.const 3)))
+
+        ;; Set up the device.
+        (drop (call $get_device (local.get $device_kind) (local.get $return_area)))
+        (global.set $device (i32.load (local.get $return_area)))
+
+        ;; Create the buffers. Note that `0x00 = read` and `0x01 = read-write`.
+        (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x01) (local.get $return_area)))
+        (global.set $A (i32.load (local.get $return_area)))
+        (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x00) (local.get $return_area)))
+        (global.set $B (i32.load (local.get $return_area)))
+        (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x00) (local.get $return_area)))
+        (global.set $C (i32.load (local.get $return_area)))
+
+        ;; Fill the buffers with the correct values.
+        (call $initialize (local.get $memA) (global.get $buffer_size) (f32.const 0))
+        (call $initialize (local.get $memB) (global.get $buffer_size) (f32.const 2))
+        (call $initialize (local.get $memC) (global.get $buffer_size) (f32.const 2))
+
+        ;; Assign the buffers their contents.
+        (drop (call $write_buffer (local.get $memA) (global.get $buffer_size) (global.get $A)))
+        (drop (call $write_buffer (local.get $memB) (global.get $buffer_size) (global.get $B)))
+        (drop (call $write_buffer (local.get $memC) (global.get $buffer_size) (global.get $C))))
+
+    (func (export "execute")
+        ;; Set up the list of buffers.
+        (i32.store (i32.const 0) (global.get $A))
+        (i32.store (i32.const 4) (global.get $B))
+        (i32.store (i32.const 8) (global.get $C))
+
+        ;; Execute the kernel in parallel.
+        (call $par_exec (global.get $device)
+            ;; Kernel bytes.
+            (i32.const 12) (i32.const 155)
+            ;; Number of iterations and block size
+            (global.get $num_threads) (global.get $block_size)
+            ;; Input buffers.
+            (i32.const 0) (i32.const 3)
+            ;; Output buffers.
+            (i32.const 0) (i32.const 0))
+
+        drop)
+
+    (func (export "finish") (result i32)
+        (local $memA i32)
+        (local.set $memA (global.get $buffer_size))
+        ;; Assert that all values in A equal 8.0.
+        (call $check (local.get $memA) (global.get $buffer_size) (f32.const 8.0))
+    )
+
+    ;; Helper function to (inefficiently) initialize a block of memory.
+    (func $initialize (param $offset i32) (param $len i32) (param $value f32)
+        (block
+            (loop
+                (local.set $len (i32.sub (local.get $len) (i32.const 4)))
+                (f32.store (i32.add (local.get $offset) (local.get $len)) (local.get $value))
+                (i32.le_s (local.get $len) (i32.const 0))
+                (br_if 1)
+                (br 0)
+            )
+        )
+    )
+
+    ;; Helper function to check that an entire memory region matches the value.
+    (func $check (param $offset i32) (param $len i32) (param $value f32) (result i32)
+        (loop $cont
+            (local.set $len (i32.sub (local.get $len) (i32.const 4)))
+            ;; If the loaded value does not match, early return with a `1`
+            ;; code.
+            (i32.const 1)
+            (br_if 1 (f32.ne
+                (local.get $value)
+                (f32.load (i32.add (local.get $offset) (local.get $len)))))
+            (drop)
+            ;; Continue iterating until we reach the end, exiting with a `0`
+            ;; success code.
+            (br_if $cont (i32.gt_s (local.get $len) (i32.const 0)))
+        )
+        (i32.const 0)
+    )
+)
diff --git a/crates/wasi-parallel/tests/wat/sum-kernel.wat b/crates/wasi-parallel/tests/wat/sum-kernel.wat
new file mode 100644
index 000000000000..b05e7c38bf05
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat/sum-kernel.wat
@@ -0,0 +1,22 @@
+;; This trivial kernel increments a local until it matches the block size,
+;; then it stores this value in a buffer to be aggregated later.
+
+(module
+    (memory (import "" "memory") 1 1 shared)
+    (func $kernel (export "kernel") (param $iteration_id i32) (param $num_iterations i32) (param $block_size i32) (param $A i32) (param $A_len i32)
+        (local $i i64)
+        (local $end i64)
+        (local.set $i (i64.const 0))
+        (local.set $end (i64.extend_i32_u (local.get $block_size)))
+        (loop $cont
+            (local.set $i (i64.add (local.get $i) (i64.const 1)))
+            (br_if $cont (i64.lt_u (local.get $i) (local.get $end)))
+        )
+        (i64.store
+            ;; Address to store at.
+            (i32.add (local.get $A) (i32.mul (local.get $iteration_id) (i32.const 8)))
+            ;; The summed value.
+            (local.get $i)
+        )
+    )
+)
diff --git a/crates/wasi-parallel/tests/wat/sum.wat b/crates/wasi-parallel/tests/wat/sum.wat
new file mode 100644
index 000000000000..f3831ae2cab3
--- /dev/null
+++ b/crates/wasi-parallel/tests/wat/sum.wat
@@ -0,0 +1,129 @@
+;; This hand-coded implementation of nstream splits the work--a 32MB buffer (4 *
+;; LLC)--evenly among 4 cores and uses wasi-parallel to distribute the work.
+;; See, e.g.,
+;; https://github.com/ParRes/Kernels/blob/default/Cxx11/nstream-tbb.cc#L132 for
+;; a higher-level implementation.
+
+(module
+    (import "wasi_ephemeral_parallel" "get_device" (func $get_device
+        (param $hint i32)
+        (param $out_device i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "create_buffer" (func $create_buffer
+        (param $device i32)
+        (param $size i32)
+        (param $access i32)
+        (param $out_buffer i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "write_buffer" (func $write_buffer
+        (param $data_offset i32)
+        (param $data_len i32)
+        (param $buffer i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "read_buffer" (func $read_buffer
+        (param $buffer i32)
+        (param $data_offset i32)
+        (param $data_len i32)
+        (result i32)))
+    (import "wasi_ephemeral_parallel" "parallel_exec" (func $par_exec
+        (param $device i32)
+        (param $kernel_start i32)
+        (param $kernel_len i32)
+        (param $num_iterations i32)
+        (param $block_size i32)
+        (param $in_buffers_start i32)
+        (param $in_buffers_len i32)
+        (param $out_buffers_start i32)
+        (param $out_buffers_len i32)
+        (result i32)))
+
+    ;; The kernel here is the binary-encoded version of `nstream-kernel.wat`,
+    ;; using:
+    ;;
+    ;; $ wat2wasm tests/wat/sum-kernel.wat --enable-threads --output=- | xxd -g 1 -p | sed -r 's/.{2}/\\&/g' | tr -d '\n'
+    ;;
+    ;; The length is calculated using `wc -c` and dividing by 3.
+    (memory (export "memory") 1 1 shared)
+    ;; Reserve 8 bytes for the return area and buffer lists, then emit the
+    ;; kernel:
+    (data (i32.const 8) "\00\61\73\6d\01\00\00\00\01\09\01\60\05\7f\7f\7f\7f\7f\00\02\0d\01\00\06\6d\65\6d\6f\72\79\02\03\01\01\03\02\01\00\07\0a\01\06\6b\65\72\6e\65\6c\00\00\0a\2d\01\2b\01\02\7e\42\00\21\05\20\02\ad\21\06\03\40\20\05\42\01\7c\21\05\20\05\20\06\54\0d\00\0b\20\03\20\00\41\08\6c\6a\20\05\37\03\00\0b")
+
+    ;; Global values overwritten by `setup`.
+    (global $num_threads (mut i32) (i32.const 4))
+    (global $block_size (mut i32) (i32.const 0x2000000))
+    (global $buffer_size (mut i32) (i32.const 0x2000000))
+    (global $device (mut i32) (i32.const -1))
+    (global $A (mut i32) (i32.const -1))
+    (global $memA i32 (i32.const 0x1000))
+
+    (func (export "setup") (param $num_threads i32) (param $block_size i32) (param $device_kind i32)
+        (local $return_area i32)
+        ;; Assign the return area pointer.
+        (local.set $return_area (i32.const 0x00))
+
+        ;; Save some setup parameters for later.
+        (global.set $num_threads (local.get $num_threads))
+        (global.set $buffer_size (i32.mul (local.get $num_threads) (i32.const 8)))
+        (global.set $block_size (local.get $block_size))
+
+        ;; Set up the device.
+        (drop (call $get_device (local.get $device_kind) (local.get $return_area)))
+        (global.set $device (i32.load (local.get $return_area)))
+
+        ;; Create a buffer to store the intermediate results. Note that `0x01 = read-write`.
+        (drop (call $create_buffer (global.get $device) (global.get $buffer_size) (i32.const 0x01) (local.get $return_area)))
+        (global.set $A (i32.load (local.get $return_area)))
+
+        ;; Assign the buffer its (empty) contents.
+        (drop (call $write_buffer (global.get $memA) (global.get $buffer_size) (global.get $A)))
+    )
+
+    (func (export "execute")
+        (local $sum i64)
+        (local $i i32)
+
+        ;; Set up the list of buffers.
+        (i32.store (i32.const 0) (global.get $A))
+
+        ;; Execute the kernel in parallel.
+        (call $par_exec (global.get $device)
+            ;; Kernel bytes.
+            (i32.const 8) (i32.const 97)
+            ;; Number of iterations and block size
+            (global.get $num_threads) (global.get $block_size)
+            ;; Input buffers.
+            (i32.const 0) (i32.const 1)
+            ;; Output buffers.
+            (i32.const 0) (i32.const 0))
+        (drop)
+
+        ;; Read the buffer contents.
+        (drop (call $read_buffer (global.get $A) (global.get $memA) (global.get $buffer_size)))
+
+        ;; Calculate the sum of each iteration's work and store it at address 0.
+        (local.set $i (i32.const 0))
+        (loop $cont
+            (local.set $i (i32.add (local.get $i) (i32.const 1)))
+            (local.set $sum (i64.add
+                (local.get $sum)
+                (i64.load (i32.add
+                    (global.get $memA)
+                    (i32.mul (local.get $i) (i32.const 8))
+                ))
+            ))
+            (br_if $cont (i32.lt_u (local.get $i) (global.get $num_threads)))
+        )
+        (i64.store (i32.const 0) (local.get $sum))
+    )
+
+    (func (export "finish") (result i32)
+        ;; Assert that the aggregate sum is what is expected.
+        (i64.eq
+            (i64.load (i32.const 0))
+            (i64.mul
+                (i64.extend_i32_u (global.get $num_threads))
+                (i64.extend_i32_u (global.get $block_size))
+            )
+        )
+    )
+)
diff --git a/crates/wiggle/generate/src/codegen_settings.rs b/crates/wiggle/generate/src/codegen_settings.rs
index 144d2be282b6..637b5264ae8d 100644
--- a/crates/wiggle/generate/src/codegen_settings.rs
+++ b/crates/wiggle/generate/src/codegen_settings.rs
@@ -1,4 +1,4 @@
-use crate::config::{AsyncConf, ErrorConf};
+use crate::config::{AsyncConf, ErrorConf, SkipNames};
 use anyhow::{anyhow, Error};
 use proc_macro2::TokenStream;
 use quote::quote;
@@ -12,6 +12,7 @@ pub struct CodegenSettings {
     pub errors: ErrorTransform,
     pub async_: AsyncConf,
     pub wasmtime: bool,
+    pub skip_names: SkipNames,
 }
 impl CodegenSettings {
     pub fn new(
@@ -19,12 +20,14 @@ impl CodegenSettings {
         async_: &AsyncConf,
         doc: &Document,
         wasmtime: bool,
+        skip_names: &SkipNames,
     ) -> Result<Self, Error> {
         let errors = ErrorTransform::new(error_conf, doc)?;
         Ok(Self {
             errors,
             async_: async_.clone(),
             wasmtime,
+            skip_names: skip_names.clone(),
         })
     }
     pub fn get_async(&self, module: &Module, func: &InterfaceFunc) -> Asyncness {
diff --git a/crates/wiggle/generate/src/config.rs b/crates/wiggle/generate/src/config.rs
index 3d164d70f1d9..81c25c44bf06 100644
--- a/crates/wiggle/generate/src/config.rs
+++ b/crates/wiggle/generate/src/config.rs
@@ -15,6 +15,7 @@ pub struct Config {
     pub errors: ErrorConf,
     pub async_: AsyncConf,
     pub wasmtime: bool,
+    pub skip: SkipNames,
 }
 
 mod kw {
@@ -22,6 +23,7 @@ mod kw {
     syn::custom_keyword!(witx_literal);
     syn::custom_keyword!(block_on);
     syn::custom_keyword!(errors);
+    syn::custom_keyword!(skip);
     syn::custom_keyword!(target);
     syn::custom_keyword!(wasmtime);
 }
@@ -32,6 +34,7 @@ pub enum ConfigField {
     Error(ErrorConf),
     Async(AsyncConf),
     Wasmtime(bool),
+    Skip(SkipNames),
 }
 
 impl Parse for ConfigField {
@@ -67,6 +70,10 @@ impl Parse for ConfigField {
             input.parse::<kw::wasmtime>()?;
             input.parse::<Token![:]>()?;
             Ok(ConfigField::Wasmtime(input.parse::<syn::LitBool>()?.value))
+        } else if lookahead.peek(kw::skip) {
+            input.parse::<kw::skip>()?;
+            input.parse::<Token![:]>()?;
+            Ok(ConfigField::Skip(input.parse()?))
         } else {
             Err(lookahead.error())
         }
@@ -79,6 +86,7 @@ impl Config {
         let mut errors = None;
         let mut async_ = None;
         let mut wasmtime = None;
+        let mut skip = None;
         for f in fields {
             match f {
                 ConfigField::Witx(c) => {
@@ -105,6 +113,12 @@ impl Config {
                     }
                     wasmtime = Some(c);
                 }
+                ConfigField::Skip(c) => {
+                    if skip.is_some() {
+                        return Err(Error::new(err_loc, "duplicate `skip` field"));
+                    }
+                    skip = Some(c);
+                }
             }
         }
         Ok(Config {
@@ -114,6 +128,7 @@ impl Config {
             errors: errors.take().unwrap_or_default(),
             async_: async_.take().unwrap_or_default(),
             wasmtime: wasmtime.unwrap_or(true),
+            skip: skip.unwrap_or_default(),
         })
     }
 
@@ -550,3 +565,25 @@ impl Parse for WasmtimeConfigField {
         }
     }
 }
+
+#[derive(Debug, Default, Clone)]
+pub struct SkipNames(Vec<String>);
+impl Parse for SkipNames {
+    fn parse(input: ParseStream) -> Result<Self> {
+        let content;
+        let _ = bracketed!(content in input);
+        let name_literals: Punctuated<LitStr, Token![,]> =
+            content.parse_terminated(Parse::parse)?;
+        let names = name_literals
+            .iter()
+            .map(LitStr::value)
+            .collect::<Vec<String>>();
+
+        Ok(SkipNames(names))
+    }
+}
+impl SkipNames {
+    pub fn should_skip(&self, func: &witx::InterfaceFunc) -> bool {
+        self.0.iter().any(|n| n.as_str() == func.name)
+    }
+}
diff --git a/crates/wiggle/generate/src/lib.rs b/crates/wiggle/generate/src/lib.rs
index 47e82f75870b..de1766f72926 100644
--- a/crates/wiggle/generate/src/lib.rs
+++ b/crates/wiggle/generate/src/lib.rs
@@ -54,6 +54,7 @@ pub fn generate(doc: &witx::Document, names: &Names, settings: &CodegenSettings)
         let modname = names.module(&module.name);
         let fs = module
             .funcs()
+            .filter(|f| !settings.skip_names.should_skip(f))
             .map(|f| define_func(&names, &module, &f, &settings));
         let modtrait = define_module_trait(&names, &module, &settings);
         let wasmtime = if settings.wasmtime {
diff --git a/crates/wiggle/generate/src/module_trait.rs b/crates/wiggle/generate/src/module_trait.rs
index 2e108f56f097..c3c4f4918f69 100644
--- a/crates/wiggle/generate/src/module_trait.rs
+++ b/crates/wiggle/generate/src/module_trait.rs
@@ -18,75 +18,78 @@ pub fn passed_by_reference(ty: &witx::Type) -> bool {
 pub fn define_module_trait(names: &Names, m: &Module, settings: &CodegenSettings) -> TokenStream {
     let traitname = names.trait_name(&m.name);
     let rt = names.runtime_mod();
-    let traitmethods = m.funcs().map(|f| {
-        // Check if we're returning an entity anotated with a lifetime,
-        // in which case, we'll need to annotate the function itself, and
-        // hence will need an explicit lifetime (rather than anonymous)
-        let (lifetime, is_anonymous) = if f
-            .params
-            .iter()
-            .chain(&f.results)
-            .any(|ret| ret.tref.needs_lifetime())
-        {
-            (quote!('a), false)
-        } else {
-            (anon_lifetime(), true)
-        };
-        let funcname = names.func(&f.name);
-        let args = f.params.iter().map(|arg| {
-            let arg_name = names.func_param(&arg.name);
-            let arg_typename = names.type_ref(&arg.tref, lifetime.clone());
-            let arg_type = if passed_by_reference(&*arg.tref.type_()) {
-                quote!(&#arg_typename)
+    let traitmethods = m
+        .funcs()
+        .filter(|f| !settings.skip_names.should_skip(f))
+        .map(|f| {
+            // Check if we're returning an entity anotated with a lifetime,
+            // in which case, we'll need to annotate the function itself, and
+            // hence will need an explicit lifetime (rather than anonymous)
+            let (lifetime, is_anonymous) = if f
+                .params
+                .iter()
+                .chain(&f.results)
+                .any(|ret| ret.tref.needs_lifetime())
+            {
+                (quote!('a), false)
             } else {
-                quote!(#arg_typename)
+                (anon_lifetime(), true)
             };
-            quote!(#arg_name: #arg_type)
-        });
-
-        let result = match f.results.len() {
-            0 if f.noreturn => quote!(#rt::Trap),
-            0 => quote!(()),
-            1 => {
-                let (ok, err) = match &**f.results[0].tref.type_() {
-                    witx::Type::Variant(v) => match v.as_expected() {
-                        Some(p) => p,
-                        None => unimplemented!("anonymous variant ref {:?}", v),
-                    },
-                    _ => unimplemented!(),
+            let funcname = names.func(&f.name);
+            let args = f.params.iter().map(|arg| {
+                let arg_name = names.func_param(&arg.name);
+                let arg_typename = names.type_ref(&arg.tref, lifetime.clone());
+                let arg_type = if passed_by_reference(&*arg.tref.type_()) {
+                    quote!(&#arg_typename)
+                } else {
+                    quote!(#arg_typename)
                 };
+                quote!(#arg_name: #arg_type)
+            });
 
-                let ok = match ok {
-                    Some(ty) => names.type_ref(ty, lifetime.clone()),
-                    None => quote!(()),
-                };
-                let err = match err {
-                    Some(ty) => match settings.errors.for_abi_error(ty) {
-                        Some(custom) => {
-                            let tn = custom.typename();
-                            quote!(super::#tn)
-                        }
-                        None => names.type_ref(ty, lifetime.clone()),
-                    },
-                    None => quote!(()),
-                };
-                quote!(Result<#ok, #err>)
-            }
-            _ => unimplemented!(),
-        };
+            let result = match f.results.len() {
+                0 if f.noreturn => quote!(#rt::Trap),
+                0 => quote!(()),
+                1 => {
+                    let (ok, err) = match &**f.results[0].tref.type_() {
+                        witx::Type::Variant(v) => match v.as_expected() {
+                            Some(p) => p,
+                            None => unimplemented!("anonymous variant ref {:?}", v),
+                        },
+                        _ => unimplemented!(),
+                    };
 
-        let asyncness = if settings.get_async(&m, &f).is_sync() {
-            quote!()
-        } else {
-            quote!(async)
-        };
+                    let ok = match ok {
+                        Some(ty) => names.type_ref(ty, lifetime.clone()),
+                        None => quote!(()),
+                    };
+                    let err = match err {
+                        Some(ty) => match settings.errors.for_abi_error(ty) {
+                            Some(custom) => {
+                                let tn = custom.typename();
+                                quote!(super::#tn)
+                            }
+                            None => names.type_ref(ty, lifetime.clone()),
+                        },
+                        None => quote!(()),
+                    };
+                    quote!(Result<#ok, #err>)
+                }
+                _ => unimplemented!(),
+            };
 
-        if is_anonymous {
-            quote!(#asyncness fn #funcname(&mut self, #(#args),*) -> #result; )
-        } else {
-            quote!(#asyncness fn #funcname<#lifetime>(&mut self, #(#args),*) -> #result;)
-        }
-    });
+            let asyncness = if settings.get_async(&m, &f).is_sync() {
+                quote!()
+            } else {
+                quote!(async)
+            };
+
+            if is_anonymous {
+                quote!(#asyncness fn #funcname(&mut self, #(#args),*) -> #result; )
+            } else {
+                quote!(#asyncness fn #funcname<#lifetime>(&mut self, #(#args),*) -> #result;)
+            }
+        });
 
     quote! {
         #[#rt::async_trait]
diff --git a/crates/wiggle/generate/src/wasmtime.rs b/crates/wiggle/generate/src/wasmtime.rs
index 8f89bc06f082..f3a3cdb01da1 100644
--- a/crates/wiggle/generate/src/wasmtime.rs
+++ b/crates/wiggle/generate/src/wasmtime.rs
@@ -21,7 +21,10 @@ pub fn link_module(
 
     let mut bodies = Vec::new();
     let mut bounds = HashSet::new();
-    for f in module.funcs() {
+    for f in module
+        .funcs()
+        .filter(|f| !settings.skip_names.should_skip(f))
+    {
         let asyncness = settings.async_.get(module.name.as_str(), f.name.as_str());
         bodies.push(generate_func(&module, &f, names, target_path, asyncness));
         let bound = func_bounds(names, module, &f, settings);
@@ -112,14 +115,21 @@ fn generate_func(
     };
 
     let body = quote! {
-        let mem = match caller.get_export("memory") {
-            Some(#rt::wasmtime_crate::Extern::Memory(m)) => m,
+        let (mem, ctx) = match caller.get_export("memory") {
+            Some(#rt::wasmtime_crate::Extern::Memory(m)) => {
+                let (mem, ctx) = m.data_and_store_mut(&mut caller);
+                let ctx = get_cx(ctx);
+                (mem, ctx)
+            }
+            Some(#rt::wasmtime_crate::Extern::SharedMemory(m)) => {
+                let mem = unsafe { std::slice::from_raw_parts_mut(m.data() as *mut u8, m.data_size()) };
+                let ctx = get_cx(caller.data_mut());
+                (mem, ctx)
+            }
             _ => {
                 return Err(#rt::wasmtime_crate::Trap::new("missing required memory export"));
             }
         };
-        let (mem , ctx) = mem.data_and_store_mut(&mut caller);
-        let ctx = get_cx(ctx);
         let mem = #rt::wasmtime::WasmtimeGuestMemory::new(mem);
         match #abi_func(ctx, &mem #(, #arg_names)*) #await_ {
             Ok(r) => Ok(<#ret_ty>::from(r)),
diff --git a/crates/wiggle/macro/src/lib.rs b/crates/wiggle/macro/src/lib.rs
index 394ee47bb366..785785d73461 100644
--- a/crates/wiggle/macro/src/lib.rs
+++ b/crates/wiggle/macro/src/lib.rs
@@ -41,6 +41,9 @@ use syn::parse_macro_input;
 ///   WebAssembly execution by trapping.
 /// * Optional: `async` takes a set of witx modules and functions which are
 ///   made Rust `async` functions in the module trait.
+/// * Optional: `skip` takes a list of function names and skips them when
+///   generating code; this can be helpful when attempting to use Wiggle
+///   alongside raw implementions of a witx interface.
 ///
 /// ## Example
 ///
@@ -153,6 +156,7 @@ pub fn from_witx(args: TokenStream) -> TokenStream {
         &config.async_,
         &doc,
         cfg!(feature = "wasmtime") && config.wasmtime,
+        &config.skip,
     )
     .expect("validating codegen settings");
 
@@ -195,6 +199,7 @@ pub fn wasmtime_integration(args: TokenStream) -> TokenStream {
         &config.c.async_,
         &doc,
         cfg!(feature = "wasmtime"),
+        &config.c.skip,
     )
     .expect("validating codegen settings");
 
diff --git a/scripts/publish.rs b/scripts/publish.rs
index 600ab1372d1f..20b5cb3e8482 100644
--- a/scripts/publish.rs
+++ b/scripts/publish.rs
@@ -60,6 +60,7 @@ const CRATES_TO_PUBLISH: &[&str] = &[
     // other misc wasmtime crates
     "wasmtime-wasi",
     "wasmtime-wasi-nn",
+    "wasmtime-wasi-parallel",
     "wasmtime-wasi-crypto",
     "wasmtime-wast",
     "wasmtime-cli-flags",
@@ -79,8 +80,9 @@ const PUBLIC_CRATES: &[&str] = &[
     // patch releases.
     "wasmtime",
     "wasmtime-wasi",
-    "wasmtime-wasi-nn",
     "wasmtime-wasi-crypto",
+    "wasmtime-wasi-nn",
+    "wasmtime-wasi-parallel",
     "wasmtime-cli",
     // all cranelift crates are considered "public" in that they can't
     // have breaking API changes in patch releases
diff --git a/src/commands/run.rs b/src/commands/run.rs
index 8185bf7e0053..f4deccec2a13 100644
--- a/src/commands/run.rs
+++ b/src/commands/run.rs
@@ -22,6 +22,9 @@ use wasmtime_wasi_nn::WasiNnCtx;
 #[cfg(feature = "wasi-crypto")]
 use wasmtime_wasi_crypto::WasiCryptoCtx;
 
+#[cfg(feature = "wasi-parallel")]
+use wasmtime_wasi_parallel::WasiParallel;
+
 fn parse_module(s: &OsStr) -> anyhow::Result<PathBuf> {
     // Do not accept wasmtime subcommand names as the module name
     match s.to_str() {
@@ -449,6 +452,8 @@ struct Host {
     wasi_nn: Option<WasiNnCtx>,
     #[cfg(feature = "wasi-crypto")]
     wasi_crypto: Option<WasiCryptoCtx>,
+    #[cfg(feature = "wasi-parallel")]
+    wasi_parallel: Option<WasiParallel>,
 }
 
 /// Populates the given `Linker` with WASI APIs.
@@ -512,6 +517,20 @@ fn populate_with_wasi(
         }
     }
 
+    if wasi_modules.wasi_parallel {
+        #[cfg(not(feature = "wasi-parallel"))]
+        {
+            bail!("Cannot enable wasi-parallel when the binary is not compiled with this feature.");
+        }
+        #[cfg(feature = "wasi-parallel")]
+        {
+            wasmtime_wasi_parallel::add_to_linker(linker, |host| {
+                host.wasi_parallel.as_mut().unwrap()
+            })?;
+            store.data_mut().wasi_parallel = Some(WasiParallel::new());
+        }
+    }
+
     Ok(())
 }
 
diff --git a/supply-chain/config.toml b/supply-chain/config.toml
index cb3d8ca8f558..0a55393ccf01 100644
--- a/supply-chain/config.toml
+++ b/supply-chain/config.toml
@@ -42,6 +42,10 @@ criteria = "safe-to-deploy"
 version = "0.0.1"
 criteria = "safe-to-deploy"
 
+[[exemptions.anes]]
+version = "0.1.6"
+criteria = "safe-to-run"
+
 [[exemptions.anyhow]]
 version = "1.0.57"
 criteria = "safe-to-deploy"
@@ -102,6 +106,10 @@ criteria = "safe-to-deploy"
 version = "0.2.7"
 criteria = "safe-to-run"
 
+[[exemptions.cast]]
+version = "0.3.0"
+criteria = "safe-to-run"
+
 [[exemptions.chacha20]]
 version = "0.8.1"
 criteria = "safe-to-deploy"
@@ -110,6 +118,18 @@ criteria = "safe-to-deploy"
 version = "0.9.0"
 criteria = "safe-to-deploy"
 
+[[exemptions.ciborium]]
+version = "0.2.0"
+criteria = "safe-to-run"
+
+[[exemptions.ciborium-io]]
+version = "0.2.0"
+criteria = "safe-to-run"
+
+[[exemptions.ciborium-ll]]
+version = "0.2.0"
+criteria = "safe-to-run"
+
 [[exemptions.cipher]]
 version = "0.3.0"
 criteria = "safe-to-deploy"
@@ -154,10 +174,18 @@ criteria = "safe-to-deploy"
 version = "0.3.5"
 criteria = "safe-to-run"
 
+[[exemptions.criterion]]
+version = "0.4.0"
+criteria = "safe-to-run"
+
 [[exemptions.criterion-plot]]
 version = "0.4.4"
 criteria = "safe-to-run"
 
+[[exemptions.criterion-plot]]
+version = "0.5.0"
+criteria = "safe-to-run"
+
 [[exemptions.crossbeam-channel]]
 version = "0.5.4"
 criteria = "safe-to-deploy"
@@ -802,6 +830,10 @@ criteria = "safe-to-deploy"
 version = "1.0.6"
 criteria = "safe-to-deploy"
 
+[[exemptions.scoped_threadpool]]
+version = "0.1.9"
+criteria = "safe-to-deploy"
+
 [[exemptions.scopeguard]]
 version = "1.1.0"
 criteria = "safe-to-deploy"