Merge 'origin/master' into hipblas
SlyEcho committed Jul 11, 2023
2 parents e610466 + 2347463 commit 8c2c497
Showing 22 changed files with 481 additions and 48 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/build.yml
@@ -104,6 +104,40 @@ jobs:
          cd build
          ctest --verbose --timeout 900

  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest

    continue-on-error: true

    strategy:
      matrix:
        mpi_library: [mpich, libopenmpi-dev]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ${{ matrix.mpi_library }}
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_MPI=ON ..
          cmake --build . --config Release
      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest --verbose

  macOS-latest-make:
    runs-on: macos-latest

Expand Down
1 change: 1 addition & 0 deletions .gitignore
@@ -20,6 +20,7 @@ build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
24 changes: 24 additions & 0 deletions CMakeLists.txt
@@ -76,6 +76,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -309,6 +310,28 @@ if (LLAMA_METAL)
)
endif()

if (LLAMA_MPI)
    cmake_minimum_required(VERSION 3.10)
    find_package(MPI)
    if (MPI_C_FOUND)
        message(STATUS "MPI found")
        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
        set(c_flags ${c_flags} -Wno-cast-qual)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
        # Even if you're only using the C header, C++ programs may bring in MPI
        # C++ functions, so more linkage is needed
        if (MPI_CXX_FOUND)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
        endif()
    else()
        message(WARNING "MPI not found")
    endif()
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -509,6 +532,7 @@ add_library(ggml OBJECT
            ${GGML_SOURCES_CUDA}
            ${GGML_SOURCES_OPENCL}
            ${GGML_SOURCES_METAL}
            ${GGML_SOURCES_MPI}
            ${GGML_SOURCES_EXTRA}
            )

9 changes: 9 additions & 0 deletions Makefile
@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
OBJS += ggml-mpi.o

ggml-mpi.o: ggml-mpi.c ggml-mpi.h
	$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
39 changes: 39 additions & 0 deletions README.md
@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```

### MPI Build

MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
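
For example (the `apt` package names below are the ones used by the CI workflow above; the Homebrew formula name is assumed):

```bash
# Debian/Ubuntu: install MPICH (or use libopenmpi-dev for OpenMPI)
sudo apt-get install mpich
# macOS with Homebrew (assumed formula name)
brew install open-mpi
```
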
Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):

- Using `make`:

```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```

- Using `CMake`:

```bash
cmake -S . -B build -DLLAMA_MPI=ON
```

Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
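
For example, one way to mirror a converted model to a second host (the hostname is taken from the example hostfile below; the destination directory is illustrative and should match your local layout):

```bash
# copy the quantized model to the same relative path on another machine
rsync -av ./models/7B/ggml-model-q4_0.bin malvolio.local:llama.cpp/models/7B/
```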

Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".

Here is an example hostfile:

```
192.168.0.1:2
malvolio.local:1
```

The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

Finally, you're ready to run a computation using `mpirun`:

```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
3 changes: 1 addition & 2 deletions examples/common.cpp
@@ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
@@ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
2 changes: 1 addition & 1 deletion examples/embd-input/embd-input-lib.cpp
@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
4 changes: 3 additions & 1 deletion examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
2 changes: 1 addition & 1 deletion examples/main/README.md
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
4 changes: 3 additions & 1 deletion examples/main/main.cpp
@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/perplexity/perplexity.cpp
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/quantize/quantize.cpp
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
usage(argv[0]);
}

llama_init_backend(false);
llama_backend_init(false);

// parse command line arguments
const std::string fname_inp = argv[arg_idx];
@@ -257,5 +257,7 @@ int main(int argc, char ** argv) {
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
}

llama_backend_free();

return 0;
}
2 changes: 1 addition & 1 deletion examples/server/README.md
@@ -16,7 +16,7 @@ Command line options:
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
7 changes: 4 additions & 3 deletions examples/server/server.cpp
@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
@@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
}
else if (arg == "--lora-base")
{
@@ -1079,7 +1078,7 @@ int main(int argc, char **argv)
params.model_alias = params.model;
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

LOG_INFO("build info", {{"build", BUILD_NUMBER},
{"commit", BUILD_COMMIT}});
@@ -1309,5 +1308,7 @@ int main(int argc, char **argv)
return 1;
}

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
// Init LLM :
//---------------------------------

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
llama_free( ctx );
llama_free_model( model );

llama_backend_free();

return 0;
}

1 change: 1 addition & 0 deletions ggml-metal.m
@@ -450,6 +450,7 @@ void ggml_metal_graph_compute(
//}

switch (dst->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
