From 7dfcba101e7bd9f21b6bd1f3ff78234d1387e375 Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:04:10 -0800 Subject: [PATCH 01/21] initial commit --- examples/unsloth/train.py | 61 +++++++++++++++++++++++++++++++++++ examples/unsloth/unsloth.yaml | 14 ++++++++ 2 files changed, 75 insertions(+) create mode 100644 examples/unsloth/train.py create mode 100644 examples/unsloth/unsloth.yaml diff --git a/examples/unsloth/train.py b/examples/unsloth/train.py new file mode 100644 index 00000000000..530980ba392 --- /dev/null +++ b/examples/unsloth/train.py @@ -0,0 +1,61 @@ +from unsloth import FastLanguageModel +import torch +from trl import SFTTrainer +from transformers import TrainingArguments +from datasets import load_dataset +max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! +# Get LAION dataset +url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" +dataset = load_dataset("json", data_files = {"train" : url}, split = "train") + +# 4bit pre quantized models we support - 4x faster downloading! +fourbit_models = [ + "unsloth/mistral-7b-bnb-4bit", + "unsloth/llama-2-7b-bnb-4bit", + "unsloth/llama-2-13b-bnb-4bit", + "unsloth/codellama-34b-bnb-4bit", + "unsloth/tinyllama-bnb-4bit", +] +# Load Llama model +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this! 
+ max_seq_length = max_seq_length, + dtype = None, + load_in_4bit = True, +) + +# Do model patching and add fast LoRA weights +model = FastLanguageModel.get_peft_model( + model, + r = 16, + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj",], + lora_alpha = 16, + lora_dropout = 0, # Supports any, but = 0 is optimized + bias = "none", # Supports any, but = "none" is optimized + use_gradient_checkpointing = True, + random_state = 3407, + max_seq_length = max_seq_length, +) + +trainer = SFTTrainer( + model = model, + train_dataset = dataset, + dataset_text_field = "text", + max_seq_length = max_seq_length, + tokenizer = tokenizer, + args = TrainingArguments( + per_device_train_batch_size = 2, + gradient_accumulation_steps = 4, + warmup_steps = 10, + max_steps = 60, + fp16 = not torch.cuda.is_bf16_supported(), + bf16 = torch.cuda.is_bf16_supported(), + logging_steps = 1, + output_dir = "outputs", + optim = "adamw_8bit", + seed = 3407, + ), +) +trainer.train() + diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml new file mode 100644 index 00000000000..9f66955fa6f --- /dev/null +++ b/examples/unsloth/unsloth.yaml @@ -0,0 +1,14 @@ +resources: + accelerators: T4:1 + disk_size: 128 + +workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth + +setup: | + set -ex + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton --index-url https://download.pytorch.org/whl/cu118 + pip install ipython + pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" + +run: | + python train.py \ No newline at end of file From a08c0344f641eadbd7b91634f2b4575e972de405 Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:05:58 -0800 Subject: [PATCH 02/21] newline --- examples/unsloth/unsloth.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 9f66955fa6f..825e0dd8aa1 100644 --- 
a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -11,4 +11,5 @@ setup: | pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" run: | - python train.py \ No newline at end of file + python train.py + \ No newline at end of file From b85cbf92c10d51249b4e5f0697ee4e468fab0769 Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:17:10 -0800 Subject: [PATCH 03/21] comments --- examples/unsloth/{train.py => unsloth.py} | 20 ++++++++++++-------- examples/unsloth/unsloth.yaml | 16 +++++++++++++--- 2 files changed, 25 insertions(+), 11 deletions(-) rename examples/unsloth/{train.py => unsloth.py} (77%) diff --git a/examples/unsloth/train.py b/examples/unsloth/unsloth.py similarity index 77% rename from examples/unsloth/train.py rename to examples/unsloth/unsloth.py index 530980ba392..8d5f8071c68 100644 --- a/examples/unsloth/train.py +++ b/examples/unsloth/unsloth.py @@ -1,14 +1,16 @@ +# Use the unsloth library to fine-tune a Mistral model + from unsloth import FastLanguageModel import torch from trl import SFTTrainer from transformers import TrainingArguments from datasets import load_dataset -max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! -# Get LAION dataset +max_seq_length = 2048 + +# [1] Get LAION dataset url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" dataset = load_dataset("json", data_files = {"train" : url}, split = "train") -# 4bit pre quantized models we support - 4x faster downloading! fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", @@ -16,28 +18,30 @@ "unsloth/codellama-34b-bnb-4bit", "unsloth/tinyllama-bnb-4bit", ] -# Load Llama model + +# [2] Load Mistral model model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this! 
+ model_name = "unsloth/mistral-7b-bnb-4bit", max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) -# Do model patching and add fast LoRA weights +# [3] Do model patching and add fast LoRA weights model = FastLanguageModel.get_peft_model( model, r = 16, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, - lora_dropout = 0, # Supports any, but = 0 is optimized - bias = "none", # Supports any, but = "none" is optimized + lora_dropout = 0, + bias = "none", use_gradient_checkpointing = True, random_state = 3407, max_seq_length = max_seq_length, ) +# [4] Initialize and train the model using the SFTTrainer trainer = SFTTrainer( model = model, train_dataset = dataset, diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 825e0dd8aa1..44ad3963ecb 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -1,8 +1,19 @@ +# Runs the unsloth example app via SkyPilot +# +# The example app starts by obtaining the LAION dataset, +# loads the Mistral model with 4-bit precision, performs model +# patching with fast LoRA weights, and finally initializes and +# trains the model using the SFTTrainer with specified hyperparameters +# and the LAION dataset. +# +# Usage: +# sky launch -c myclus unsloth.yaml + resources: accelerators: T4:1 disk_size: 128 -workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth +workdir: . 
setup: | set -ex @@ -11,5 +22,4 @@ setup: | pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" run: | - python train.py - \ No newline at end of file + python unsloth.py From 4fbfe1713f78e9acb0a8f083c09ffa18855bb14b Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:26:34 -0800 Subject: [PATCH 04/21] run linter --- examples/unsloth/unsloth.py | 7 ++++--- examples/unsloth/unsloth.yaml | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/unsloth/unsloth.py b/examples/unsloth/unsloth.py index 8d5f8071c68..ee9ab2b2f05 100644 --- a/examples/unsloth/unsloth.py +++ b/examples/unsloth/unsloth.py @@ -1,10 +1,11 @@ # Use the unsloth library to fine-tune a Mistral model -from unsloth import FastLanguageModel +from datasets import load_dataset import torch -from trl import SFTTrainer from transformers import TrainingArguments -from datasets import load_dataset +from trl import SFTTrainer +from unsloth import FastLanguageModel + max_seq_length = 2048 # [1] Get LAION dataset diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 44ad3963ecb..b900f0d0752 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -3,8 +3,8 @@ # The example app starts by obtaining the LAION dataset, # loads the Mistral model with 4-bit precision, performs model # patching with fast LoRA weights, and finally initializes and -# trains the model using the SFTTrainer with specified hyperparameters -# and the LAION dataset. +# trains the model using the SFTTrainer with specified +# hyperparameters and the LAION dataset. # # Usage: # sky launch -c myclus unsloth.yaml @@ -17,7 +17,8 @@ workdir: . 
setup: | set -ex - pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton --index-url https://download.pytorch.org/whl/cu118 + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu118 pip install ipython pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" From 6fc77e12809bf67cf6d60e77bd694fa6e1f0581f Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 18 Feb 2024 00:47:45 -0800 Subject: [PATCH 05/21] reminder for down --- examples/unsloth/unsloth.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index b900f0d0752..7fac6770df2 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -8,6 +8,7 @@ # # Usage: # sky launch -c myclus unsloth.yaml +# sky down myclus resources: accelerators: T4:1 From d6cb99316b4ce23fc09915e378105d8148888581 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 18 Feb 2024 02:42:30 -0800 Subject: [PATCH 06/21] tentatively done with example --- examples/unsloth/unsloth.yaml | 6 +++++- examples/unsloth/{unsloth.py => unsloth_example.py} | 11 +++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) rename examples/unsloth/{unsloth.py => unsloth_example.py} (84%) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 7fac6770df2..4c01a5ca220 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -14,6 +14,10 @@ resources: accelerators: T4:1 disk_size: 128 +file_mounts: + /outputs: + name: my-unsloth-checkpoints + workdir: . 
setup: | @@ -24,4 +28,4 @@ setup: | pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" run: | - python unsloth.py + python unsloth_example.py --output-dir /outputs diff --git a/examples/unsloth/unsloth.py b/examples/unsloth/unsloth_example.py similarity index 84% rename from examples/unsloth/unsloth.py rename to examples/unsloth/unsloth_example.py index ee9ab2b2f05..95ead037859 100644 --- a/examples/unsloth/unsloth.py +++ b/examples/unsloth/unsloth_example.py @@ -1,5 +1,6 @@ # Use the unsloth library to fine-tune a Mistral model +import argparse from datasets import load_dataset import torch from transformers import TrainingArguments @@ -42,7 +43,12 @@ max_seq_length = max_seq_length, ) -# [4] Initialize and train the model using the SFTTrainer +# [4] Parse output directory of checkpoints +parser = argparse.ArgumentParser() +parser.add_argument("--output-dir", type=str, default="/outputs") +args = parser.parse_args() + +# [5] Initialize and train the model using the SFTTrainer trainer = SFTTrainer( model = model, train_dataset = dataset, @@ -57,9 +63,10 @@ fp16 = not torch.cuda.is_bf16_supported(), bf16 = torch.cuda.is_bf16_supported(), logging_steps = 1, - output_dir = "outputs", + output_dir = args.output_dir[1:], optim = "adamw_8bit", seed = 3407, + save_steps = 10, ), ) trainer.train() From 2d5aceb894a2cf7223f5443444699581bfe4aad0 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 18 Feb 2024 02:53:11 -0800 Subject: [PATCH 07/21] formatting --- examples/unsloth/unsloth_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 95ead037859..52c0a306c67 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -1,6 +1,7 @@ # Use the unsloth library to fine-tune a Mistral model import argparse + from datasets import load_dataset import torch from transformers import TrainingArguments From 
4e1954a0027dbdea3375d3afd73177d36a18f2eb Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 19 Feb 2024 01:11:30 -0800 Subject: [PATCH 08/21] yapf --- examples/unsloth/unsloth_example.py | 70 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 52c0a306c67..400f2c8402b 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -12,7 +12,7 @@ # [1] Get LAION dataset url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" -dataset = load_dataset("json", data_files = {"train" : url}, split = "train") +dataset = load_dataset("json", data_files={"train": url}, split="train") fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", @@ -24,24 +24,31 @@ # [2] Load Mistral model model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", - max_seq_length = max_seq_length, - dtype = None, - load_in_4bit = True, + model_name="unsloth/mistral-7b-bnb-4bit", + max_seq_length=max_seq_length, + dtype=None, + load_in_4bit=True, ) # [3] Do model patching and add fast LoRA weights model = FastLanguageModel.get_peft_model( model, - r = 16, - target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj",], - lora_alpha = 16, - lora_dropout = 0, - bias = "none", - use_gradient_checkpointing = True, - random_state = 3407, - max_seq_length = max_seq_length, + r=16, + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ], + lora_alpha=16, + lora_dropout=0, + bias="none", + use_gradient_checkpointing=True, + random_state=3407, + max_seq_length=max_seq_length, ) # [4] Parse output directory of checkpoints @@ -51,24 +58,23 @@ # [5] Initialize and train the model using the SFTTrainer trainer = SFTTrainer( - model = model, - train_dataset = dataset, - dataset_text_field = "text", - max_seq_length 
= max_seq_length, - tokenizer = tokenizer, - args = TrainingArguments( - per_device_train_batch_size = 2, - gradient_accumulation_steps = 4, - warmup_steps = 10, - max_steps = 60, - fp16 = not torch.cuda.is_bf16_supported(), - bf16 = torch.cuda.is_bf16_supported(), - logging_steps = 1, - output_dir = args.output_dir[1:], - optim = "adamw_8bit", - seed = 3407, - save_steps = 10, + model=model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=max_seq_length, + tokenizer=tokenizer, + args=TrainingArguments( + per_device_train_batch_size=2, + gradient_accumulation_steps=4, + warmup_steps=10, + max_steps=60, + fp16=not torch.cuda.is_bf16_supported(), + bf16=torch.cuda.is_bf16_supported(), + logging_steps=1, + output_dir=args.output_dir[1:], + optim="adamw_8bit", + seed=3407, + save_steps=10, ), ) trainer.train() - From 27a89050cf929f4f9a54d3078c44e2e229a3f14a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 22 Feb 2024 14:55:15 -0800 Subject: [PATCH 09/21] [Storage] Storage mounting tool permissions fix (#3215) * fix permissions * fix permissions --- sky/data/mounting_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 32904ada517..2f4e37a1b66 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -19,7 +19,7 @@ def get_s3_mount_install_cmd() -> str: install_cmd = ('sudo wget -nc https://github.com/romilbhardwaj/goofys/' 'releases/download/0.24.0-romilb-upstream/goofys ' '-O /usr/local/bin/goofys && ' - 'sudo chmod +x /usr/local/bin/goofys') + 'sudo chmod 755 /usr/local/bin/goofys') return install_cmd From 41a63df344d3b3cea0ae837d7391f3c1e86bb5da Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 22 Feb 2024 17:43:06 -0800 Subject: [PATCH 10/21] [LLM] Example for Serving Gemma (#3207) * Add serve for gemma and fix mixtral dependency * Add hf token * fix model len * Add comment * Serve your private gemma * fix serve yaml * readme * 
Remove chat completion due to the wrong template * add readme * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * address comments * Update README.md Co-authored-by: Zongheng Yang * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * Change to it * Add chat API * use HF_TOKEN env * typo --------- Co-authored-by: Zongheng Yang --- README.md | 2 + docs/source/index.rst | 1 + llm/gemma/README.md | 103 ++++++++++++++++++++++++++++++++++++++++++ llm/gemma/serve.yaml | 47 +++++++++++++++++++ 4 files changed, 153 insertions(+) create mode 100644 llm/gemma/README.md create mode 100644 llm/gemma/serve.yaml diff --git a/README.md b/README.md index 2c03d5afa06..606ac06e2f0 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ ---- :fire: *News* :fire: +- [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/) - [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/) - [Feb, 2024] Serving [**Code Llama 70B**](https://ai.meta.com/blog/code-llama-large-language-model-coding/) with vLLM and SkyServe: [**example**](./llm/codellama/) - [Dec, 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/) @@ -148,6 +149,7 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest Runnable examples: - LLMs on SkyPilot + - [Gemma](./llm/gemma/) - [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team) - [Code Llama](./llm/codellama/) - [vLLM: Serving LLM 24x Faster On the Cloud](./llm/vllm/) (from official vLLM team) diff --git a/docs/source/index.rst 
b/docs/source/index.rst index 493c7459a9a..fbf03b3f552 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -69,6 +69,7 @@ Runnable examples: * **LLMs on SkyPilot** + * `Gemma `_ * `Mixtral 8x7B `_; `Mistral 7B `_ (from official Mistral team) * `Code Llama `_ * `vLLM: Serving LLM 24x Faster On the Cloud `_ (from official vLLM team) diff --git a/llm/gemma/README.md b/llm/gemma/README.md new file mode 100644 index 00000000000..d0ff0114ff8 --- /dev/null +++ b/llm/gemma/README.md @@ -0,0 +1,103 @@ +# Serve Your Gemma on Any Cloud + +Google released [Gemma](https://blog.google/technology/developers/gemma-open-models/) and has made a big wave in the AI community. +It opens the opportunity for the open-source community to serve and finetune private Gemini. + +## Serve Gemma on any Cloud + +Serving Gemma on any cloud is easy with SkyPilot. With [serve.yaml](serve.yaml) in this directory, you host the model on any cloud with a single command. + +### Prerequisites + +1. Apply for access to the Gemma model + +Go to the [application page](https://huggingface.co/google/gemma-7b) and click **Acknowledge license** to apply for access to the model weights. + + +2. Get the access token from huggingface + +Generate a read-only access token on huggingface [here](https://huggingface.co/settings/token), and make sure your huggingface account can access the Gemma models [here](https://huggingface.co/google/gemma-7b). + +3. Install SkyPilot + +```bash +pip install "skypilot-nightly[all]" +``` +For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). 
+ +### Host on a Single Instance + +We can host the model with a single instance: +```bash +HF_TOKEN="xxx" sky launch -c gemma serve.yaml --env HF_TOKEN +``` + +After the cluster is launched, we can access the model with the following command: +```bash +IP=$(sky status --ip gemma) + +curl -L http://$IP:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "prompt": "My favourite condiment is", + "max_tokens": 25 + }' | jq . +``` + +Chat API is also supported: +```bash +IP=$(sky status --ip gemma) + +curl -L http://$IP:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "messages": [ + { + "role": "user", + "content": "Hello! What is your name?" + } + ], + "max_tokens": 25 + }' +``` + +### Scale the Serving with SkyServe + + +Using the same YAML, we can easily scale the model serving across multiple instances, regions and clouds with SkyServe: +```bash +HF_TOKEN="xxx" sky serve up -n gemma serve.yaml --env HF_TOKEN +``` + +> Notice the only change is from `sky launch` to `sky serve up`. The same YAML can be used without changes. + +After the cluster is launched, we can access the model with the following command: +```bash +ENDPOINT=$(sky serve status --endpoint gemma) + +curl -L http://$ENDPOINT/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "prompt": "My favourite condiment is", + "max_tokens": 25 + }' | jq . +``` + +Chat API is also supported: +```bash +curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "messages": [ + { + "role": "user", + "content": "Hello! What is your name?" 
+ } + ], + "max_tokens": 25 + }' +``` diff --git a/llm/gemma/serve.yaml b/llm/gemma/serve.yaml new file mode 100644 index 00000000000..a477554d47a --- /dev/null +++ b/llm/gemma/serve.yaml @@ -0,0 +1,47 @@ +# An example yaml for serving the Gemma model from Google with an OpenAI API. +# Usage: +# 1. Launch on a single instance: `sky launch -c gemma ./serve.yaml` +# 2. Scale up to multiple instances with a single endpoint: +# `sky serve up -n gemma ./serve.yaml` +service: + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + initial_delay_seconds: 1200 + replicas: 2 + +envs: + MODEL_NAME: google/gemma-7b-it + HF_TOKEN: # TODO: Replace with huggingface token + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} + ports: 8000 + disk_tier: best + +setup: | + conda activate gemma + if [ $? -ne 0 ]; then + conda create -n gemma -y python=3.10 + conda activate gemma + fi + pip install vllm==0.3.2 + pip install transformers==4.38.0 + python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + +run: | + conda activate gemma + export PATH=$PATH:/sbin + # --max-model-len is set to 1024 to avoid taking too much GPU memory on L4 and + # A10g with small memory. 
+ python -u -m vllm.entrypoints.openai.api_server \ + --host 0.0.0.0 \ + --model $MODEL_NAME \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 1024 | tee ~/openai_api_server.log + From 2b17e91d93f9d54e9a0b7f44e837dba9d0c1f837 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 23 Feb 2024 13:18:59 -0800 Subject: [PATCH 11/21] [LLM] Add logo for Gemma (#3220) --- llm/gemma/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/gemma/README.md b/llm/gemma/README.md index d0ff0114ff8..676afce6606 100644 --- a/llm/gemma/README.md +++ b/llm/gemma/README.md @@ -1,4 +1,5 @@ # Serve Your Gemma on Any Cloud +![image](https://github.com/skypilot-org/skypilot/assets/6753189/e452c39e-b5ef-4cb2-ab48-053f9e6f67b7) Google released [Gemma](https://blog.google/technology/developers/gemma-open-models/) and has made a big wave in the AI community. It opens the opportunity for the open-source community to serve and finetune private Gemini. From b326d12610acbdae15e155ca12c0c6e2ef800004 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 23 Feb 2024 13:22:42 -0800 Subject: [PATCH 12/21] Minor fixes for release 0.5.0 (#3212) * when removing cudo credential, sky check fails * remove tips * minor hint fix * fix cluster version for k8s * fix typo --- docs/source/examples/auto-failover.rst | 6 ------ sky/clouds/cudo.py | 9 ++++++++- tests/backward_compatibility_tests.sh | 2 +- tests/kubernetes/README.md | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst index c8f77c533a1..bbfc3cb469b 100644 --- a/docs/source/examples/auto-failover.rst +++ b/docs/source/examples/auto-failover.rst @@ -108,12 +108,6 @@ AWS, where it succeeded after two regions: Multiple Candidate GPUs ------------------------- -.. tip:: - - Support for multiple resources via ``any_of`` or ``ordered`` was added after v0.4.1. 
- - To use this feature, :ref:`install the nightly release `: ``pip install -U skypilot-nightly`` - If a task can be run on different GPUs, the user can specify multiple candidate GPUs, and SkyPilot will automatically find the cheapest available GPU. diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 855bdaf59ae..ad7a22e6e03 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -276,7 +276,14 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: # pylint: disable=import-outside-toplevel,unused-import from cudo_compute import cudo_api from cudo_compute.rest import ApiException - _, error = cudo_api.client() + try: + _, error = cudo_api.client() + except FileNotFoundError as e: + return False, ( + 'Cudo credentials are not set. ' + f'{cls._CREDENTIAL_HINT}\n' + f'{cls._INDENT_PREFIX}' + f'{common_utils.format_exception(e, use_bracket=True)}') if error is not None: return False, ( diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 9fd7586e22b..47381294afe 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -112,7 +112,7 @@ sky logs ${CLUSTER_NAME}-4 2 fi # (1 node) sky start + sky exec + sky queue + sky logs -if [ "$start_form" -le 5 ]; then +if [ "$start_from" -le 5 ]; then conda activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud ${CLOUD} -y --cpus 2 -c ${CLUSTER_NAME}-5 examples/minimal.yaml diff --git a/tests/kubernetes/README.md b/tests/kubernetes/README.md index 220f96f6a9e..4a882352703 100644 --- a/tests/kubernetes/README.md +++ b/tests/kubernetes/README.md @@ -32,7 +32,7 @@ sky local up ```bash PROJECT_ID=$(gcloud config get-value project) CLUSTER_NAME=testclusterromil - gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.27.3-gke.100" --release-channel "regular" --machine-type "n1-standard-8" --accelerator 
"type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" 
--zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" + gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.0-gke.1381000" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta 
container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" ``` 2. 
Get the kubeconfig for your cluster and place it in `~/.kube/config`: ```bash From 6d778726c0ef70da6e9cb614be774006f4fcd075 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 23 Feb 2024 19:18:39 -0800 Subject: [PATCH 13/21] [Docker] Add retry for docker pull due to daemon not ready (#3218) * Add retry for docker pull due to daemon not ready * longer wait time * longer wait time * retry earlier * add retry for retries as well * longer wait time * change wait time * format * Add comment * Fix * Fix indent for azure docker config * Fix docker login config * Fix comments * More robust docker login config * Add retry for docker check * minor fix * Add additional test for stop and start with docker * Fix cancelled --- sky/backends/backend_utils.py | 2 +- sky/backends/cloud_vm_ray_backend.py | 2 + sky/provision/docker_utils.py | 58 ++++++++++++++------ sky/skylet/providers/command_runner.py | 74 ++++++++++++++++++++------ sky/templates/aws-ray.yml.j2 | 9 ---- sky/templates/azure-ray.yml.j2 | 38 ++++++------- sky/templates/gcp-ray.yml.j2 | 18 +++---- tests/test_smoke.py | 8 +++ 8 files changed, 139 insertions(+), 70 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ad98e2b4e0e..596d0bec043 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -136,7 +136,7 @@ # should take the latest security group name. _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [ ('provider', 'availability_zone'), - # AWS with new provisioner has docker_login_config in the + # Clouds with new provisioner has docker_login_config in the # docker field, instead of the provider field. 
('docker', 'docker_login_config'), # Other clouds diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index fc5d3d34b56..12f8bd8ac28 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1837,6 +1837,8 @@ def need_ray_up( logger.info( 'Retrying launching in {:.1f} seconds.'.format(sleep)) time.sleep(sleep) + # TODO(zhwu): when we retry ray up, it is possible that the ray + # cluster fail to start because --no-restart flag is used. ray_up_return_value = ray_up() assert ray_up_return_value is not None diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 2df14ce39fd..303032128e3 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -2,6 +2,7 @@ import dataclasses import shlex +import time import typing from typing import Any, Dict, List @@ -14,6 +15,9 @@ logger = sky_logging.init_logger(__name__) +DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ' + 'the Docker daemon socket') + @dataclasses.dataclass class DockerLoginConfig: @@ -120,7 +124,11 @@ def __init__(self, docker_config: Dict[str, Any], self.docker_cmd = 'podman' if use_podman else 'docker' self.log_path = log_path - def _run(self, cmd, run_env='host') -> str: + def _run(self, + cmd, + run_env='host', + wait_for_docker_daemon: bool = False) -> str: + if run_env == 'docker': cmd = self._docker_expand_user(cmd, any_char=True) cmd = ' '.join(_with_interactive(cmd)) @@ -132,10 +140,24 @@ def _run(self, cmd, run_env='host') -> str: f' {shlex.quote(cmd)} ') logger.debug(f'+ {cmd}') - rc, stdout, stderr = self.runner.run(cmd, - require_outputs=True, - stream_logs=False, - log_path=self.log_path) + cnt = 0 + retry = 3 + while True: + rc, stdout, stderr = self.runner.run(cmd, + require_outputs=True, + stream_logs=False, + log_path=self.log_path) + if (not wait_for_docker_daemon or + DOCKER_PERMISSION_DENIED_STR not in stdout + stderr): + break + + cnt += 1 
+ if cnt > retry: + break + logger.info( + 'Failed to run docker command, retrying in 10 seconds... ' + f'({cnt}/{retry})') + time.sleep(10) subprocess_utils.handle_returncode( rc, cmd, @@ -164,10 +186,12 @@ def initialize(self) -> str: # TODO(tian): Maybe support a command to get the login password? docker_login_config = DockerLoginConfig( **self.docker_config['docker_login_config']) - self._run(f'{self.docker_cmd} login --username ' - f'{docker_login_config.username} ' - f'--password {docker_login_config.password} ' - f'{docker_login_config.server}') + self._run( + f'{self.docker_cmd} login --username ' + f'{docker_login_config.username} ' + f'--password {docker_login_config.password} ' + f'{docker_login_config.server}', + wait_for_docker_daemon=True) # We automatically add the server prefix to the image name if # the user did not add it. server_prefix = f'{docker_login_config.server}/' @@ -177,11 +201,14 @@ def initialize(self) -> str: if self.docker_config.get('pull_before_run', True): assert specific_image, ('Image must be included in config if ' + 'pull_before_run is specified') - self._run(f'{self.docker_cmd} pull {specific_image}') + self._run(f'{self.docker_cmd} pull {specific_image}', + wait_for_docker_daemon=True) else: - self._run(f'{self.docker_cmd} image inspect {specific_image} ' - '1> /dev/null 2>&1 || ' - f'{self.docker_cmd} pull {specific_image}') + self._run( + f'{self.docker_cmd} image inspect {specific_image} ' + '1> /dev/null 2>&1 || ' + f'{self.docker_cmd} pull {specific_image}', + wait_for_docker_daemon=True) logger.info(f'Starting container {self.container_name} with image ' f'{specific_image}') @@ -347,7 +374,8 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]: def _check_container_exited(self) -> bool: if self.initialized: return True - output = (self._run( - check_docker_running_cmd(self.container_name, self.docker_cmd))) + output = (self._run(check_docker_running_cmd(self.container_name, + self.docker_cmd), + 
wait_for_docker_daemon=True)) return 'false' in output.lower( ) and 'no such object' not in output.lower() diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index 83abe476151..b6ea52c6eeb 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -1,8 +1,10 @@ """Sky's DockerCommandRunner.""" import json import os +import time from typing import Dict +import click from ray.autoscaler._private.cli_logger import cli_logger from ray.autoscaler._private.command_runner import DockerCommandRunner from ray.autoscaler._private.docker import check_docker_running_cmd @@ -81,16 +83,53 @@ class SkyDockerCommandRunner(DockerCommandRunner): `ray.autoscaler._private.command_runner.DockerCommandRunner`. """ + def _run_with_retry(self, cmd, **kwargs): + """Run a command with retries for docker.""" + cnt = 0 + max_retry = 3 + while True: + try: + return self.run(cmd, **kwargs) + except click.ClickException as e: + # We retry the command if it fails, because docker commands can + # fail due to the docker daemon not being ready yet. + # Ray command runner raise ClickException when the command + # fails. + cnt += 1 + if cnt >= max_retry: + raise e + cli_logger.warning( + f'Failed to run command {cmd!r}. ' + f'Retrying in 10 seconds. Retry count: {cnt}') + time.sleep(10) + # SkyPilot: New function to check whether a container is exited # (but not removed). This is due to previous `sky stop` command, # which will stop the container but not remove it. 
def _check_container_exited(self) -> bool: if self.initialized: return True - output = (self.ssh_command_runner.run( - check_docker_running_cmd(self.container_name, self.docker_cmd), - with_output=True, - ).decode('utf-8').strip()) + cnt = 0 + max_retry = 3 + cmd = check_docker_running_cmd(self.container_name, self.docker_cmd) + # We manually retry the command based on the output, as the command will + # not fail even if the docker daemon is not ready, due to the underlying + # usage of `|| true` in the command. + while True: + output = (self.run(cmd, with_output=True, + run_env='host').decode('utf-8').strip()) + if docker_utils.DOCKER_PERMISSION_DENIED_STR in output: + cnt += 1 + if cnt >= max_retry: + raise click.ClickException( + f'Failed to run command {cmd!r}. ' + f'Retry count: {cnt}. Output: {output}') + cli_logger.warning( + f'Failed to run command {cmd!r}. ' + f'Retrying in 10 seconds. Retry count: {cnt}') + time.sleep(10) + else: + break return 'false' in output.lower( ) and 'no such object' not in output.lower() @@ -110,6 +149,9 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], # If true, then we can start the container directly. # Notice that we will skip all setup commands, so we need to # manually start the ssh service. + # We also add retries when checking the container status to make sure + # the docker daemon is ready, as it may not be ready immediately after + # the VM is started. if self._check_container_exited(): self.initialized = True self.run(f'docker start {self.container_name}', run_env='host') @@ -121,12 +163,10 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], # TODO(tian): Maybe support a command to get the login password? 
docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[ "docker_login_config"] - self.run('{} login --username {} --password {} {}'.format( - self.docker_cmd, - docker_login_config.username, - docker_login_config.password, - docker_login_config.server, - )) + self._run_with_retry( + f'{self.docker_cmd} login --username ' + f'{docker_login_config.username} --password ' + f'{docker_login_config.password} {docker_login_config.server}') # We automatically add the server prefix to the image name if # the user did not add it. server_prefix = f'{docker_login_config.server}/' @@ -134,15 +174,15 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], specific_image = f'{server_prefix}{specific_image}' if self.docker_config.get('pull_before_run', True): - assert specific_image, ('Image must be included in config if ' + + assert specific_image, ('Image must be included in config if ' 'pull_before_run is specified') - self.run('{} pull {}'.format(self.docker_cmd, specific_image), - run_env='host') + self._run_with_retry(f'{self.docker_cmd} pull {specific_image}', + run_env='host') else: - - self.run(f'{self.docker_cmd} image inspect {specific_image} ' - '1> /dev/null 2>&1 || ' - f'{self.docker_cmd} pull {specific_image}') + self._run_with_retry( + f'{self.docker_cmd} image inspect {specific_image} ' + '1> /dev/null 2>&1 || ' + f'{self.docker_cmd} pull {specific_image}') # Bootstrap files cannot be bind mounted because docker opens the # underlying inode. When the file is switched, docker becomes outdated. diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index fd56448791c..e834ee1d0c8 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -40,15 +40,6 @@ provider: # NOTE: This is a new field added by SkyPilot to force use a specific VPC. 
vpc_name: {{vpc_name}} {% endif %} -{%- if docker_login_config is not none %} - # We put docker login config in provider section because ray's schema disabled - # additionalProperties for docker config. - # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227 - docker_login_config: - username: {{docker_login_config.username}} - password: {{docker_login_config.password}} - server: {{docker_login_config.server}} -{%- endif %} use_internal_ips: {{use_internal_ips}} # Disable launch config check for worker nodes as it can cause resource # leakage. diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 369e7a52ec6..9ffe2a7958e 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -17,14 +17,14 @@ docker: {%- endif %} provider: - type: external - module: sky.skylet.providers.azure.AzureNodeProvider - location: {{region}} - # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87 - # For Azure, ray distinguishes different instances by the resource_group, - # instead of the cluster_name. This ensures that ray creates new instances - # for different cluster_name. - resource_group: {{resource_group}} + type: external + module: sky.skylet.providers.azure.AzureNodeProvider + location: {{region}} + # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87 + # For Azure, ray distinguishes different instances by the resource_group, + # instead of the cluster_name. This ensures that ray creates new instances + # for different cluster_name. + resource_group: {{resource_group}} {%- if docker_login_config is not none %} # We put docker login config in provider section because ray's schema disabled # additionalProperties for docker config. 
@@ -34,17 +34,17 @@ provider: password: {{docker_login_config.password}} server: {{docker_login_config.server}} {%- endif %} - # Keep (otherwise cannot reuse when re-provisioning). - # teardown(terminate=True) will override this. - cache_stopped_nodes: True - # subscription id of the azure user - subscription_id: {{azure_subscription_id}} - # Disable launch config check for worker nodes as it can cause resource - # leakage. - # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115 - # The upper-level SkyPilot code has make sure there will not be resource - # leakage. - disable_launch_config_check: true + # Keep (otherwise cannot reuse when re-provisioning). + # teardown(terminate=True) will override this. + cache_stopped_nodes: True + # subscription id of the azure user + subscription_id: {{azure_subscription_id}} + # Disable launch config check for worker nodes as it can cause resource + # leakage. + # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115 + # The upper-level SkyPilot code has make sure there will not be resource + # leakage. 
+ disable_launch_config_check: true auth: diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 76c818b8aef..7a92c4d4429 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -15,6 +15,15 @@ docker: {%- if gpu is not none %} --gpus all {%- endif %} +{%- if docker_login_config is not none %} + docker_login_config: + username: |- + {{docker_login_config.username}} + password: |- + {{docker_login_config.password}} + server: |- + {{docker_login_config.server}} +{%- endif %} {%- endif %} provider: @@ -37,15 +46,6 @@ provider: {% if firewall_rule is not none %} firewall_rule: {{firewall_rule}} {% endif %} -{%- if docker_login_config is not none %} - # We put docker login config in provider section because ray's schema disabled - # additionalProperties for docker config. - # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227 - docker_login_config: - username: {{docker_login_config.username}} - password: {{docker_login_config.password}} - server: {{docker_login_config.server}} -{%- endif %} use_internal_ips: {{use_internal_ips}} {%- if tpu_vm %} _has_tpus: True diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6206f50a8df..25a7764686b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1100,6 +1100,14 @@ def test_job_queue_with_docker(generic_cloud: str): 'sleep 5', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', f'sky cancel -y {name} 3', + f'sky stop -y {name}', + # Make sure the job status preserve after stop and start the + # cluster. This is also a test for the docker container to be + # preserved after stop and start. 
+ f'sky start -y {name}', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', f'sky logs {name} 4 --status', From cb695d53d99f78be96d1f690f8b4097c5e309750 Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 26 Feb 2024 13:56:24 -0800 Subject: [PATCH 14/21] added comments --- examples/unsloth/unsloth.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 4c01a5ca220..548871753aa 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -15,10 +15,11 @@ resources: disk_size: 128 file_mounts: + # Creates a new bucket my-unsloth-checkpoints and mounts it at /outputs /outputs: - name: my-unsloth-checkpoints + name: my-unsloth-checkpoints # Ensure this name is unique -workdir: . 
+workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth setup: | set -ex From 888f1a8b39abedef37361a66f42c353fe94b9dc0 Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 26 Feb 2024 21:58:29 -0800 Subject: [PATCH 15/21] quick fix --- examples/unsloth/unsloth_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 400f2c8402b..255dfb560cf 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -71,7 +71,7 @@ fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, - output_dir=args.output_dir[1:], + output_dir=("~" + args.output_dir), optim="adamw_8bit", seed=3407, save_steps=10, From 48776693be2f25a25a38c6cc8de8df2044c8826f Mon Sep 17 00:00:00 2001 From: Sheth Date: Wed, 28 Feb 2024 18:39:49 -0800 Subject: [PATCH 16/21] finished pip issues --- examples/unsloth/unsloth.yaml | 18 ++++++++++++++---- examples/unsloth/unsloth_example.py | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 548871753aa..e0e07d95c94 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -23,10 +23,20 @@ workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth setup: | set -ex - pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ - --index-url https://download.pytorch.org/whl/cu118 - pip install ipython - pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" + pip install --upgrade pip + cuda_version=$(nvcc --version | grep "release" | awk '{print $6}' | cut -c 2-) + + if [[ "$cuda_version" == "12.1"* ]]; then + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu121 + pip install ipython + pip install "unsloth[cu121-torch220] @ 
git+https://github.com/unslothai/unsloth.git" + else + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu118 + pip install ipython + pip install "unsloth[cu118-torch220] @ git+https://github.com/unslothai/unsloth.git" + fi run: | python unsloth_example.py --output-dir /outputs diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 255dfb560cf..9f0895096d0 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -71,7 +71,7 @@ fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, - output_dir=("~" + args.output_dir), + output_dir=(args.output_dir), optim="adamw_8bit", seed=3407, save_steps=10, From 7a208dd9c91aceaff871efb8cefd61ac4510725a Mon Sep 17 00:00:00 2001 From: Sheth Date: Wed, 28 Feb 2024 18:43:25 -0800 Subject: [PATCH 17/21] fix --- examples/unsloth/unsloth.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index e0e07d95c94..2c163b42121 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -19,7 +19,7 @@ file_mounts: /outputs: name: my-unsloth-checkpoints # Ensure this name is unique -workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth +workdir: . 
setup: | set -ex From 152e36a6a40417fb1be3fcaf0bdd9b5eb671eefa Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 29 Feb 2024 11:00:16 -0800 Subject: [PATCH 18/21] fix storage error message, add example link to docs --- README.md | 2 +- docs/source/index.rst | 2 +- examples/unsloth/unsloth_example.py | 2 +- sky/data/storage.py | 10 +++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a1af8c63f7d..93abecb0fa4 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ Runnable examples: - [LocalGPT](./llm/localgpt) - [Falcon](./llm/falcon) - Add yours here & see more in [`llm/`](./llm)! -- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), and [many more (`examples/`)](./examples). 
+- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml) and [many more (`examples/`)](./examples). Follow updates: - [Twitter](https://twitter.com/skypilot_org) diff --git a/docs/source/index.rst b/docs/source/index.rst index 5c0f8a7f7c5..8e140c33ed9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,7 +85,7 @@ Runnable examples: * `Falcon `_ * Add yours here & see more in `llm/ `_! -* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, and `many more `_. +* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, `Unsloth `_ and `many more `_. 
Follow updates: diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 9f0895096d0..4e488ad82f0 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -71,7 +71,7 @@ fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, - output_dir=(args.output_dir), + output_dir=args.output_dir, optim="adamw_8bit", seed=3407, save_steps=10, diff --git a/sky/data/storage.py b/sky/data/storage.py index 4ca8441be3a..9356a50f365 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -333,14 +333,18 @@ def _validate_existing_bucket(self): # bucket's URL as 'source'. if handle is None: with ux_utils.print_exception_no_traceback(): + store_prefix = get_store_prefix(StoreType.from_store(self)) raise exceptions.StorageSpecError( 'Attempted to mount a non-sky managed bucket ' f'{self.name!r} without specifying the storage source.' - ' To mount an externally created bucket (e.g., ' + f' Bucket {self.name!r} already exists. \n' + ' • To create a new bucket, specify a unique name.\n' + ' • To mount an externally created bucket (e.g., ' 'created through cloud console or cloud cli), ' 'specify the bucket URL in the source field ' - 'instead of its name. E.g., replace `name: external-' - 'bucket` with `source: gs://external-bucket`.') + 'instead of its name. 
I.e., replace ' + f'`name: {self.name}` with ' + f'`source: {store_prefix}{self.name}`.') class Storage(object): From 3d124395d18785a1b1180fe1473fbbae644b23ca Mon Sep 17 00:00:00 2001 From: Sheth Date: Wed, 27 Mar 2024 13:09:54 -0700 Subject: [PATCH 19/21] changed error message if default nc installed on mac --- sky/provision/kubernetes/utils.py | 32 ++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 67acb7b1985..a52644341fb 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1038,15 +1038,29 @@ def check_port_forward_mode_dependencies() -> None: stderr=subprocess.DEVNULL, check=True) except (FileNotFoundError, subprocess.CalledProcessError): - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'`{name}` is required to setup Kubernetes cloud with ' - f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long - 'default networking mode and it is not installed. ' - 'On Debian/Ubuntu, install it with:\n' - f' $ sudo apt install {install_cmd}\n' - f'On MacOS, install it with: \n' - f' $ brew install {install_cmd}') from None + mac_nc_error = name == 'nc' and (os.path.exists('/usr/bin/nc') or + os.path.exists('/usr/bin/netcat')) + current_nc_path = (os.path.exists('/usr/bin/nc') and + '/usr/bin/nc') or '/usr/bin/netcat' + if mac_nc_error: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'The default MacOS `nc` is installed at ' + f'{current_nc_path}. However, for ' + f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long + 'default networking mode, GNU netcat is required. 
' + f'On MacOS, install it with: \n' + f' $ brew install {install_cmd}') from None + else: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + f'`{name}` is required to setup Kubernetes cloud with ' + f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long + 'default networking mode and it is not installed. ' + 'On Debian/Ubuntu, install it with:\n' + f' $ sudo apt install {install_cmd}\n' + f'On MacOS, install it with: \n' + f' $ brew install {install_cmd}') from None def get_endpoint_debug_message() -> str: From 0746b457ce6d9be16311792c4e739966fd62c441 Mon Sep 17 00:00:00 2001 From: Sheth Date: Thu, 28 Mar 2024 00:07:54 -0700 Subject: [PATCH 20/21] refactored check_port_forward_mode_dependencies function --- sky/provision/kubernetes/utils.py | 94 +++++++++++++++++++------------ 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index a52644341fb..9af18ebbc49 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1025,42 +1025,64 @@ def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str, def check_port_forward_mode_dependencies() -> None: - """Checks if 'socat' is installed""" - # We store the dependency list as a list of lists. Each inner list - # contains the name of the dependency, the command to check if it is - # installed, and the package name to install it. 
- dependency_list = [['socat', ['socat', '-V'], 'socat'], - ['nc', ['nc', '-h'], 'netcat']] - for name, check_cmd, install_cmd in dependency_list: - try: - subprocess.run(check_cmd, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=True) - except (FileNotFoundError, subprocess.CalledProcessError): - mac_nc_error = name == 'nc' and (os.path.exists('/usr/bin/nc') or - os.path.exists('/usr/bin/netcat')) - current_nc_path = (os.path.exists('/usr/bin/nc') and - '/usr/bin/nc') or '/usr/bin/netcat' - if mac_nc_error: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'The default MacOS `nc` is installed at ' - f'{current_nc_path}. However, for ' - f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long - 'default networking mode, GNU netcat is required. ' - f'On MacOS, install it with: \n' - f' $ brew install {install_cmd}') from None - else: - with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'`{name}` is required to setup Kubernetes cloud with ' - f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long - 'default networking mode and it is not installed. ' - 'On Debian/Ubuntu, install it with:\n' - f' $ sudo apt install {install_cmd}\n' - f'On MacOS, install it with: \n' - f' $ brew install {install_cmd}') from None + """Checks if 'socat' and 'nc' are installed""" + + # Construct runtime errors + socat_default_error = RuntimeError( + f'`socat` is required to setup Kubernetes cloud with ' + f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long + 'default networking mode and it is not installed. 
' + 'On Debian/Ubuntu, install it with:\n' + f' $ sudo apt install socat\n' + f'On MacOS, install it with: \n' + f' $ brew install socat') + netcat_default_error = RuntimeError( + f'`nc` is required to setup Kubernetes cloud with ' + f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long + 'default networking mode and it is not installed. ' + 'On Debian/Ubuntu, install it with:\n' + f' $ sudo apt install netcat\n' + f'On MacOS, install it with: \n' + f' $ brew install netcat') + mac_installed_error = RuntimeError( + f'The default MacOS `nc` is installed. However, for ' + f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long + 'default networking mode, GNU netcat is required. ' + f'On MacOS, install it with: \n' + f' $ brew install netcat') + + # Ensure socat is installed + try: + subprocess.run(['socat', '-V'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True) + except (FileNotFoundError, subprocess.CalledProcessError): + with ux_utils.print_exception_no_traceback(): + raise socat_default_error from None + + # Ensure netcat is installed + # + # In some cases, the user may have the default MacOS nc installed but + # they need GNU nc installed. 
Checking for this case, reflected in + # `nc_mac_installed`, helps give a more specific error message + try: + netcat_output = subprocess.run(['nc', '-h'], + capture_output=True, + check=False) + nc_mac_installed = netcat_output.returncode == 1 and 'apple' in str( + netcat_output.stderr) + + if nc_mac_installed: + with ux_utils.print_exception_no_traceback(): + raise mac_installed_error from None + elif netcat_output.returncode != 0: + with ux_utils.print_exception_no_traceback(): + raise netcat_default_error from None + + except FileNotFoundError: + with ux_utils.print_exception_no_traceback(): + raise netcat_default_error from None def get_endpoint_debug_message() -> str: From e42d5b33f8b1f4e99eaf39cf9a2c1f142d54abfd Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 15 Apr 2024 12:29:05 -0700 Subject: [PATCH 21/21] update comment --- sky/provision/kubernetes/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 9af18ebbc49..94d9696f880 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -1063,9 +1063,9 @@ def check_port_forward_mode_dependencies() -> None: # Ensure netcat is installed # - # In some cases, the user may have the default MacOS nc installed but - # they need GNU nc installed. Checking for this case, reflected in - # `nc_mac_installed`, helps give a more specific error message + # In some cases, the user may have the default MacOS nc installed, which + # does not support the -z flag. To use the -z flag for port scanning, + # they need GNU nc installed. We check for this case and raise an error. try: netcat_output = subprocess.run(['nc', '-h'], capture_output=True,