From 7dfcba101e7bd9f21b6bd1f3ff78234d1387e375 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Fri, 16 Feb 2024 23:04:10 -0800
Subject: [PATCH 01/21] initial commit

---
 examples/unsloth/train.py     | 61 +++++++++++++++++++++++++++++++++++
 examples/unsloth/unsloth.yaml | 14 ++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 examples/unsloth/train.py
 create mode 100644 examples/unsloth/unsloth.yaml

diff --git a/examples/unsloth/train.py b/examples/unsloth/train.py
new file mode 100644
index 00000000000..530980ba392
--- /dev/null
+++ b/examples/unsloth/train.py
@@ -0,0 +1,61 @@
+from unsloth import FastLanguageModel
+import torch
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any!
+# Get LAION dataset
+url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
+dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
+
+# 4bit pre quantized models we support - 4x faster downloading!
+fourbit_models = [
+    "unsloth/mistral-7b-bnb-4bit",
+    "unsloth/llama-2-7b-bnb-4bit",
+    "unsloth/llama-2-13b-bnb-4bit",
+    "unsloth/codellama-34b-bnb-4bit",
+    "unsloth/tinyllama-bnb-4bit",
+]
+# Load Llama model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this!
+    max_seq_length = max_seq_length,
+    dtype = None,
+    load_in_4bit = True,
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 16,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",    # Supports any, but = "none" is optimized
+    use_gradient_checkpointing = True,
+    random_state = 3407,
+    max_seq_length = max_seq_length,
+)
+
+trainer = SFTTrainer(
+    model = model,
+    train_dataset = dataset,
+    dataset_text_field = "text",
+    max_seq_length = max_seq_length,
+    tokenizer = tokenizer,
+    args = TrainingArguments(
+        per_device_train_batch_size = 2,
+        gradient_accumulation_steps = 4,
+        warmup_steps = 10,
+        max_steps = 60,
+        fp16 = not torch.cuda.is_bf16_supported(),
+        bf16 = torch.cuda.is_bf16_supported(),
+        logging_steps = 1,
+        output_dir = "outputs",
+        optim = "adamw_8bit",
+        seed = 3407,
+    ),
+)
+trainer.train()
+
diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
new file mode 100644
index 00000000000..9f66955fa6f
--- /dev/null
+++ b/examples/unsloth/unsloth.yaml
@@ -0,0 +1,14 @@
+resources:
+  accelerators: T4:1
+  disk_size: 128
+
+workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth
+
+setup: |
+  set -ex
+  pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton --index-url https://download.pytorch.org/whl/cu118
+  pip install ipython
+  pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git"
+
+run: |
+  python train.py
\ No newline at end of file

From a08c0344f641eadbd7b91634f2b4575e972de405 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Fri, 16 Feb 2024 23:05:58 -0800
Subject: [PATCH 02/21] newline

---
 examples/unsloth/unsloth.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index 9f66955fa6f..825e0dd8aa1 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -11,4 +11,5 @@ setup: |
   pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git"
 
 run: |
-  python train.py
\ No newline at end of file
+  python train.py
+  
\ No newline at end of file

From b85cbf92c10d51249b4e5f0697ee4e468fab0769 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Fri, 16 Feb 2024 23:17:10 -0800
Subject: [PATCH 03/21] comments

---
 examples/unsloth/{train.py => unsloth.py} | 20 ++++++++++++--------
 examples/unsloth/unsloth.yaml             | 16 +++++++++++++---
 2 files changed, 25 insertions(+), 11 deletions(-)
 rename examples/unsloth/{train.py => unsloth.py} (77%)

diff --git a/examples/unsloth/train.py b/examples/unsloth/unsloth.py
similarity index 77%
rename from examples/unsloth/train.py
rename to examples/unsloth/unsloth.py
index 530980ba392..8d5f8071c68 100644
--- a/examples/unsloth/train.py
+++ b/examples/unsloth/unsloth.py
@@ -1,14 +1,16 @@
+# Use the unsloth library to fine-tune a Mistral model
+
 from unsloth import FastLanguageModel
 import torch
 from trl import SFTTrainer
 from transformers import TrainingArguments
 from datasets import load_dataset
-max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any!
-# Get LAION dataset
+max_seq_length = 2048
+
+# [1] Get LAION dataset
 url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
 dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
 
-# 4bit pre quantized models we support - 4x faster downloading!
 fourbit_models = [
     "unsloth/mistral-7b-bnb-4bit",
     "unsloth/llama-2-7b-bnb-4bit",
@@ -16,28 +18,30 @@
     "unsloth/codellama-34b-bnb-4bit",
     "unsloth/tinyllama-bnb-4bit",
 ]
-# Load Llama model
+
+# [2] Load Mistral model
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this!
+    model_name = "unsloth/mistral-7b-bnb-4bit",
     max_seq_length = max_seq_length,
     dtype = None,
     load_in_4bit = True,
 )
 
-# Do model patching and add fast LoRA weights
+# [3] Do model patching and add fast LoRA weights
 model = FastLanguageModel.get_peft_model(
     model,
     r = 16,
     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                       "gate_proj", "up_proj", "down_proj",],
     lora_alpha = 16,
-    lora_dropout = 0, # Supports any, but = 0 is optimized
-    bias = "none",    # Supports any, but = "none" is optimized
+    lora_dropout = 0,
+    bias = "none",
     use_gradient_checkpointing = True,
     random_state = 3407,
     max_seq_length = max_seq_length,
 )
 
+# [4] Initialize and train the model using the SFTTrainer
 trainer = SFTTrainer(
     model = model,
     train_dataset = dataset,
diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index 825e0dd8aa1..44ad3963ecb 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -1,8 +1,19 @@
+# Runs the unsloth example app via SkyPilot
+#
+# The example app starts by obtaining the LAION dataset,
+# loads the Mistral model with 4-bit precision, performs model
+# patching with fast LoRA weights, and finally initializes and
+# trains the model using the SFTTrainer with specified hyperparameters
+# and the LAION dataset.
+#
+# Usage:
+#   sky launch -c myclus unsloth.yaml
+
 resources:
   accelerators: T4:1
   disk_size: 128
 
-workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth
+workdir: .
 
 setup: |
   set -ex
@@ -11,5 +22,4 @@ setup: |
   pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git"
 
 run: |
-  python train.py
-  
\ No newline at end of file
+  python unsloth.py

From 4fbfe1713f78e9acb0a8f083c09ffa18855bb14b Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Fri, 16 Feb 2024 23:26:34 -0800
Subject: [PATCH 04/21] run linter

---
 examples/unsloth/unsloth.py   | 7 ++++---
 examples/unsloth/unsloth.yaml | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/unsloth/unsloth.py b/examples/unsloth/unsloth.py
index 8d5f8071c68..ee9ab2b2f05 100644
--- a/examples/unsloth/unsloth.py
+++ b/examples/unsloth/unsloth.py
@@ -1,10 +1,11 @@
 # Use the unsloth library to fine-tune a Mistral model
 
-from unsloth import FastLanguageModel
+from datasets import load_dataset
 import torch
-from trl import SFTTrainer
 from transformers import TrainingArguments
-from datasets import load_dataset
+from trl import SFTTrainer
+from unsloth import FastLanguageModel
+
 max_seq_length = 2048
 
 # [1] Get LAION dataset
diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index 44ad3963ecb..b900f0d0752 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -3,8 +3,8 @@
 # The example app starts by obtaining the LAION dataset,
 # loads the Mistral model with 4-bit precision, performs model
 # patching with fast LoRA weights, and finally initializes and
-# trains the model using the SFTTrainer with specified hyperparameters
-# and the LAION dataset.
+# trains the model using the SFTTrainer with specified
+# hyperparameters and the LAION dataset.
 #
 # Usage:
 #   sky launch -c myclus unsloth.yaml
@@ -17,7 +17,8 @@ workdir: .
 
 setup: |
   set -ex
-  pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton --index-url https://download.pytorch.org/whl/cu118
+  pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \
+    --index-url https://download.pytorch.org/whl/cu118
   pip install ipython
   pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git"
 

From 6fc77e12809bf67cf6d60e77bd694fa6e1f0581f Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Sun, 18 Feb 2024 00:47:45 -0800
Subject: [PATCH 05/21] reminder for down

---
 examples/unsloth/unsloth.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index b900f0d0752..7fac6770df2 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -8,6 +8,7 @@
 #
 # Usage:
 #   sky launch -c myclus unsloth.yaml
+#   sky down myclus
 
 resources:
   accelerators: T4:1

From d6cb99316b4ce23fc09915e378105d8148888581 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Sun, 18 Feb 2024 02:42:30 -0800
Subject: [PATCH 06/21] tentatively done with example

---
 examples/unsloth/unsloth.yaml                       |  6 +++++-
 examples/unsloth/{unsloth.py => unsloth_example.py} | 11 +++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)
 rename examples/unsloth/{unsloth.py => unsloth_example.py} (84%)

diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index 7fac6770df2..4c01a5ca220 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -14,6 +14,10 @@ resources:
   accelerators: T4:1
   disk_size: 128
 
+file_mounts:
+  /outputs:
+    name: my-unsloth-checkpoints
+
 workdir: .
 
 setup: |
@@ -24,4 +28,4 @@ setup: |
   pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git"
 
 run: |
-  python unsloth.py
+  python unsloth_example.py --output-dir /outputs
diff --git a/examples/unsloth/unsloth.py b/examples/unsloth/unsloth_example.py
similarity index 84%
rename from examples/unsloth/unsloth.py
rename to examples/unsloth/unsloth_example.py
index ee9ab2b2f05..95ead037859 100644
--- a/examples/unsloth/unsloth.py
+++ b/examples/unsloth/unsloth_example.py
@@ -1,5 +1,6 @@
 # Use the unsloth library to fine-tune a Mistral model
 
+import argparse
 from datasets import load_dataset
 import torch
 from transformers import TrainingArguments
@@ -42,7 +43,12 @@
     max_seq_length = max_seq_length,
 )
 
-# [4] Initialize and train the model using the SFTTrainer
+# [4] Parse output directory of checkpoints
+parser = argparse.ArgumentParser()
+parser.add_argument("--output-dir", type=str, default="/outputs")
+args = parser.parse_args()
+
+# [5] Initialize and train the model using the SFTTrainer
 trainer = SFTTrainer(
     model = model,
     train_dataset = dataset,
@@ -57,9 +63,10 @@
         fp16 = not torch.cuda.is_bf16_supported(),
         bf16 = torch.cuda.is_bf16_supported(),
         logging_steps = 1,
-        output_dir = "outputs",
+        output_dir = args.output_dir[1:],
         optim = "adamw_8bit",
         seed = 3407,
+        save_steps = 10,
     ),
 )
 trainer.train()

From 2d5aceb894a2cf7223f5443444699581bfe4aad0 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Sun, 18 Feb 2024 02:53:11 -0800
Subject: [PATCH 07/21] formatting

---
 examples/unsloth/unsloth_example.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py
index 95ead037859..52c0a306c67 100644
--- a/examples/unsloth/unsloth_example.py
+++ b/examples/unsloth/unsloth_example.py
@@ -1,6 +1,7 @@
 # Use the unsloth library to fine-tune a Mistral model
 
 import argparse
+
 from datasets import load_dataset
 import torch
 from transformers import TrainingArguments

From 4e1954a0027dbdea3375d3afd73177d36a18f2eb Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Mon, 19 Feb 2024 01:11:30 -0800
Subject: [PATCH 08/21] yapf

---
 examples/unsloth/unsloth_example.py | 70 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py
index 52c0a306c67..400f2c8402b 100644
--- a/examples/unsloth/unsloth_example.py
+++ b/examples/unsloth/unsloth_example.py
@@ -12,7 +12,7 @@
 
 # [1] Get LAION dataset
 url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
-dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
+dataset = load_dataset("json", data_files={"train": url}, split="train")
 
 fourbit_models = [
     "unsloth/mistral-7b-bnb-4bit",
@@ -24,24 +24,31 @@
 
 # [2] Load Mistral model
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "unsloth/mistral-7b-bnb-4bit",
-    max_seq_length = max_seq_length,
-    dtype = None,
-    load_in_4bit = True,
+    model_name="unsloth/mistral-7b-bnb-4bit",
+    max_seq_length=max_seq_length,
+    dtype=None,
+    load_in_4bit=True,
 )
 
 # [3] Do model patching and add fast LoRA weights
 model = FastLanguageModel.get_peft_model(
     model,
-    r = 16,
-    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
-                      "gate_proj", "up_proj", "down_proj",],
-    lora_alpha = 16,
-    lora_dropout = 0,
-    bias = "none",
-    use_gradient_checkpointing = True,
-    random_state = 3407,
-    max_seq_length = max_seq_length,
+    r=16,
+    target_modules=[
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "gate_proj",
+        "up_proj",
+        "down_proj",
+    ],
+    lora_alpha=16,
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing=True,
+    random_state=3407,
+    max_seq_length=max_seq_length,
 )
 
 # [4] Parse output directory of checkpoints
@@ -51,24 +58,23 @@
 
 # [5] Initialize and train the model using the SFTTrainer
 trainer = SFTTrainer(
-    model = model,
-    train_dataset = dataset,
-    dataset_text_field = "text",
-    max_seq_length = max_seq_length,
-    tokenizer = tokenizer,
-    args = TrainingArguments(
-        per_device_train_batch_size = 2,
-        gradient_accumulation_steps = 4,
-        warmup_steps = 10,
-        max_steps = 60,
-        fp16 = not torch.cuda.is_bf16_supported(),
-        bf16 = torch.cuda.is_bf16_supported(),
-        logging_steps = 1,
-        output_dir = args.output_dir[1:],
-        optim = "adamw_8bit",
-        seed = 3407,
-        save_steps = 10,
+    model=model,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=max_seq_length,
+    tokenizer=tokenizer,
+    args=TrainingArguments(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        warmup_steps=10,
+        max_steps=60,
+        fp16=not torch.cuda.is_bf16_supported(),
+        bf16=torch.cuda.is_bf16_supported(),
+        logging_steps=1,
+        output_dir=args.output_dir,  # keep absolute: /outputs is the mounted bucket
+        optim="adamw_8bit",
+        seed=3407,
+        save_steps=10,
     ),
 )
 trainer.train()
-

From 27a89050cf929f4f9a54d3078c44e2e229a3f14a Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@berkeley.edu>
Date: Thu, 22 Feb 2024 14:55:15 -0800
Subject: [PATCH 09/21] [Storage] Storage mounting tool permissions fix (#3215)

* fix permissions

* fix permissions
---
 sky/data/mounting_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py
index 32904ada517..2f4e37a1b66 100644
--- a/sky/data/mounting_utils.py
+++ b/sky/data/mounting_utils.py
@@ -19,7 +19,7 @@ def get_s3_mount_install_cmd() -> str:
     install_cmd = ('sudo wget -nc https://github.com/romilbhardwaj/goofys/'
                    'releases/download/0.24.0-romilb-upstream/goofys '
                    '-O /usr/local/bin/goofys && '
-                   'sudo chmod +x /usr/local/bin/goofys')
+                   'sudo chmod 755 /usr/local/bin/goofys')
     return install_cmd
 
 

From 41a63df344d3b3cea0ae837d7391f3c1e86bb5da Mon Sep 17 00:00:00 2001
From: Zhanghao Wu <zhanghao.wu@outlook.com>
Date: Thu, 22 Feb 2024 17:43:06 -0800
Subject: [PATCH 10/21] [LLM] Example for Serving Gemma (#3207)

* Add serve for gemma and fix mixtral dependency

* Add hf token

* fix model len

* Add comment

* Serve your private gemma

* fix serve yaml

* readme

* Remove chat completion due to the wrong template

* add readme

* Update llm/gemma/README.md

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* address comments

* Update README.md

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update llm/gemma/README.md

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update llm/gemma/README.md

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update llm/gemma/README.md

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Change to it

* Add chat API

* use HF_TOKEN env

* typo

---------

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
---
 README.md             |   2 +
 docs/source/index.rst |   1 +
 llm/gemma/README.md   | 103 ++++++++++++++++++++++++++++++++++++++++++
 llm/gemma/serve.yaml  |  47 +++++++++++++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 llm/gemma/README.md
 create mode 100644 llm/gemma/serve.yaml

diff --git a/README.md b/README.md
index 2c03d5afa06..606ac06e2f0 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@
 
 ----
 :fire: *News* :fire:
+- [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/)
 - [Feb, 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/)
 - [Feb, 2024] Serving [**Code Llama 70B**](https://ai.meta.com/blog/code-llama-large-language-model-coding/) with vLLM and SkyServe: [**example**](./llm/codellama/)
 - [Dec, 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/)
@@ -148,6 +149,7 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest
 <!-- Keep this section in sync with index.rst in SkyPilot Docs -->
 Runnable examples:
 - LLMs on SkyPilot
+  - [Gemma](./llm/gemma/)
   - [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team)
   - [Code Llama](./llm/codellama/)
   - [vLLM: Serving LLM 24x Faster On the Cloud](./llm/vllm/) (from official vLLM team)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 493c7459a9a..fbf03b3f552 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -69,6 +69,7 @@ Runnable examples:
 
 * **LLMs on SkyPilot**
 
+  * `Gemma <https://github.com/skypilot-org/skypilot/tree/master/llm/gemma>`_
   * `Mixtral 8x7B <https://github.com/skypilot-org/skypilot/tree/master/llm/mixtral>`_; `Mistral 7B <https://docs.mistral.ai/self-deployment/skypilot>`_ (from official Mistral team)
   * `Code Llama <https://github.com/skypilot-org/skypilot/tree/master/llm/codellama/>`_
   * `vLLM: Serving LLM 24x Faster On the Cloud <https://github.com/skypilot-org/skypilot/tree/master/llm/vllm>`_ (from official vLLM team)
diff --git a/llm/gemma/README.md b/llm/gemma/README.md
new file mode 100644
index 00000000000..d0ff0114ff8
--- /dev/null
+++ b/llm/gemma/README.md
@@ -0,0 +1,103 @@
+# Serve Your Gemma on Any Cloud
+
+Google released [Gemma](https://blog.google/technology/developers/gemma-open-models/) and has made a big wave in the AI community.
+It opens the opportunity for the open-source community to serve and finetune private Gemini.
+
+## Serve Gemma on any Cloud
+
+Serving Gemma on any cloud is easy with SkyPilot. With [serve.yaml](serve.yaml) in this directory, you can host the model on any cloud with a single command.
+
+### Prerequisites
+
+1. Apply for access to the Gemma model
+
+Go to the [application page](https://huggingface.co/google/gemma-7b) and click **Acknowledge license** to apply for access to the model weights.
+
+
+2. Get the access token from huggingface
+
+Generate a read-only access token on huggingface [here](https://huggingface.co/settings/token), and make sure your huggingface account can access the Gemma models [here](https://huggingface.co/google/gemma-7b).
+
+3. Install SkyPilot
+
+```bash
+pip install "skypilot-nightly[all]"
+```
+For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+
+### Host on a Single Instance
+
+We can host the model with a single instance:
+```bash
+HF_TOKEN="xxx" sky launch -c gemma serve.yaml --env HF_TOKEN
+```
+
+After the cluster is launched, we can access the model with the following command:
+```bash
+IP=$(sky status --ip gemma)
+
+curl -L http://$IP:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+      "model": "google/gemma-7b-it",
+      "prompt": "My favourite condiment is",
+      "max_tokens": 25
+  }' | jq .
+```
+
+Chat API is also supported:
+```bash
+IP=$(sky status --ip gemma)
+
+curl -L http://$IP:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+      "model": "google/gemma-7b-it",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello! What is your name?"
+        }
+      ],
+      "max_tokens": 25
+  }'
+```
+
+### Scale the Serving with SkyServe
+
+
+Using the same YAML, we can easily scale the model serving across multiple instances, regions and clouds with SkyServe:
+```bash
+HF_TOKEN="xxx" sky serve up -n gemma serve.yaml --env HF_TOKEN
+```
+
+> Notice the only change is from `sky launch` to `sky serve up`. The same YAML can be used without changes.
+
+After the service is launched and its replicas are ready, we can access the model with the following command:
+```bash
+ENDPOINT=$(sky serve status --endpoint gemma)
+
+curl -L http://$ENDPOINT/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+      "model": "google/gemma-7b-it",
+      "prompt": "My favourite condiment is",
+      "max_tokens": 25
+  }' | jq .
+```
+
+Chat API is also supported:
+```bash
+curl -L http://$ENDPOINT/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+      "model": "google/gemma-7b-it",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello! What is your name?"
+        }
+      ],
+      "max_tokens": 25
+  }'
+```
diff --git a/llm/gemma/serve.yaml b/llm/gemma/serve.yaml
new file mode 100644
index 00000000000..a477554d47a
--- /dev/null
+++ b/llm/gemma/serve.yaml
@@ -0,0 +1,47 @@
+# An example YAML for serving the Gemma model from Google with an OpenAI-compatible API.
+# Usage:
+#  1. Launch on a single instance: `sky launch -c gemma ./serve.yaml`
+#  2. Scale up to multiple instances with a single endpoint:
+#     `sky serve up -n gemma ./serve.yaml`
+service:
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_tokens: 1
+    initial_delay_seconds: 1200
+  replicas: 2
+
+envs:
+  MODEL_NAME: google/gemma-7b-it
+  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
+
+resources: 
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  ports: 8000
+  disk_tier: best
+
+setup: |
+  conda activate gemma
+  if [ $? -ne 0 ]; then
+    conda create -n gemma -y python=3.10
+    conda activate gemma
+  fi
+  pip install vllm==0.3.2
+  pip install transformers==4.38.0
+  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+run: |
+  conda activate gemma
+  export PATH=$PATH:/sbin
+  # --max-model-len is set to 1024 to avoid taking too much GPU memory on L4 and
+  # A10g with small memory.
+  python -u -m vllm.entrypoints.openai.api_server \
+    --host 0.0.0.0 \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    --max-model-len 1024 | tee ~/openai_api_server.log
+

From 2b17e91d93f9d54e9a0b7f44e837dba9d0c1f837 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu <zhanghao.wu@outlook.com>
Date: Fri, 23 Feb 2024 13:18:59 -0800
Subject: [PATCH 11/21] [LLM] Add logo for Gemma (#3220)

---
 llm/gemma/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llm/gemma/README.md b/llm/gemma/README.md
index d0ff0114ff8..676afce6606 100644
--- a/llm/gemma/README.md
+++ b/llm/gemma/README.md
@@ -1,4 +1,5 @@
 # Serve Your Gemma on Any Cloud
+![image](https://github.com/skypilot-org/skypilot/assets/6753189/e452c39e-b5ef-4cb2-ab48-053f9e6f67b7)
 
 Google released [Gemma](https://blog.google/technology/developers/gemma-open-models/) and has made a big wave in the AI community.
 It opens the opportunity for the open-source community to serve and finetune private Gemini.

From b326d12610acbdae15e155ca12c0c6e2ef800004 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu <zhanghao.wu@outlook.com>
Date: Fri, 23 Feb 2024 13:22:42 -0800
Subject: [PATCH 12/21] Minor fixes for release 0.5.0 (#3212)

* when removing cudo credential, sky check fails

* remove tips

* minor hint fix

* fix cluster version for k8s

* fix typo
---
 docs/source/examples/auto-failover.rst | 6 ------
 sky/clouds/cudo.py                     | 9 ++++++++-
 tests/backward_compatibility_tests.sh  | 2 +-
 tests/kubernetes/README.md             | 2 +-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst
index c8f77c533a1..bbfc3cb469b 100644
--- a/docs/source/examples/auto-failover.rst
+++ b/docs/source/examples/auto-failover.rst
@@ -108,12 +108,6 @@ AWS, where it succeeded after two regions:
 Multiple Candidate GPUs
 -------------------------
 
-.. tip::
-
-  Support for multiple resources via ``any_of`` or ``ordered`` was added after v0.4.1.
-
-  To use this feature, :ref:`install the nightly release <installation>`: ``pip install -U skypilot-nightly``
-
 If a task can be run on different GPUs, the user can specify multiple candidate GPUs,
 and SkyPilot will automatically find the cheapest available GPU.
 
diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py
index 855bdaf59ae..ad7a22e6e03 100644
--- a/sky/clouds/cudo.py
+++ b/sky/clouds/cudo.py
@@ -276,7 +276,14 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         # pylint: disable=import-outside-toplevel,unused-import
         from cudo_compute import cudo_api
         from cudo_compute.rest import ApiException
-        _, error = cudo_api.client()
+        try:
+            _, error = cudo_api.client()
+        except FileNotFoundError as e:
+            return False, (
+                'Cudo credentials are not set. '
+                f'{cls._CREDENTIAL_HINT}\n'
+                f'{cls._INDENT_PREFIX}'
+                f'{common_utils.format_exception(e, use_bracket=True)}')
 
         if error is not None:
             return False, (
diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh
index 9fd7586e22b..47381294afe 100644
--- a/tests/backward_compatibility_tests.sh
+++ b/tests/backward_compatibility_tests.sh
@@ -112,7 +112,7 @@ sky logs ${CLUSTER_NAME}-4 2
 fi
 
 # (1 node) sky start + sky exec + sky queue + sky logs
-if [ "$start_form" -le 5 ]; then
+if [ "$start_from" -le 5 ]; then
 conda activate sky-back-compat-master
 rm -r  ~/.sky/wheels || true
 sky launch --cloud ${CLOUD} -y --cpus 2 -c ${CLUSTER_NAME}-5 examples/minimal.yaml
diff --git a/tests/kubernetes/README.md b/tests/kubernetes/README.md
index 220f96f6a9e..4a882352703 100644
--- a/tests/kubernetes/README.md
+++ b/tests/kubernetes/README.md
@@ -32,7 +32,7 @@ sky local up
      ```bash
       PROJECT_ID=$(gcloud config get-value project)
       CLUSTER_NAME=testclusterromil
-      gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.27.3-gke.100" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" 
--num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c"
+      gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.0-gke.1381000" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" 
--num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c"
       ```
 2. Get the kubeconfig for your cluster and place it in `~/.kube/config`:
    ```bash

From 6d778726c0ef70da6e9cb614be774006f4fcd075 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu <zhanghao.wu@outlook.com>
Date: Fri, 23 Feb 2024 19:18:39 -0800
Subject: [PATCH 13/21] [Docker] Add retry for docker pull due to daemon not
 ready (#3218)

* Add retry for docker pull due to daemon not ready

* longer wait time

* longer wait time

* retry earlier

* add retry for retries as well

* longer wait time

* change wait time

* format

* Add comment

* Fix

* Fix indent for azure docker config

* Fix docker login config

* Fix comments

* More robust docker login config

* Add retry for docker check

* minor fix

* Add additional test for stop and start with docker

* Fix cancelled
---
 sky/backends/backend_utils.py          |  2 +-
 sky/backends/cloud_vm_ray_backend.py   |  2 +
 sky/provision/docker_utils.py          | 58 ++++++++++++++------
 sky/skylet/providers/command_runner.py | 74 ++++++++++++++++++++------
 sky/templates/aws-ray.yml.j2           |  9 ----
 sky/templates/azure-ray.yml.j2         | 38 ++++++-------
 sky/templates/gcp-ray.yml.j2           | 18 +++----
 tests/test_smoke.py                    |  8 +++
 8 files changed, 139 insertions(+), 70 deletions(-)

diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py
index ad98e2b4e0e..596d0bec043 100644
--- a/sky/backends/backend_utils.py
+++ b/sky/backends/backend_utils.py
@@ -136,7 +136,7 @@
 #   should take the latest security group name.
 _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
     ('provider', 'availability_zone'),
-    # AWS with new provisioner has docker_login_config in the
+    # Clouds with the new provisioner have docker_login_config in the
     # docker field, instead of the provider field.
     ('docker', 'docker_login_config'),
     # Other clouds
diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
index fc5d3d34b56..12f8bd8ac28 100644
--- a/sky/backends/cloud_vm_ray_backend.py
+++ b/sky/backends/cloud_vm_ray_backend.py
@@ -1837,6 +1837,8 @@ def need_ray_up(
                 logger.info(
                     'Retrying launching in {:.1f} seconds.'.format(sleep))
                 time.sleep(sleep)
+            # TODO(zhwu): when we retry ray up, it is possible that the ray
+            # cluster fails to start because the --no-restart flag is used.
             ray_up_return_value = ray_up()
 
         assert ray_up_return_value is not None
diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py
index 2df14ce39fd..303032128e3 100644
--- a/sky/provision/docker_utils.py
+++ b/sky/provision/docker_utils.py
@@ -2,6 +2,7 @@
 
 import dataclasses
 import shlex
+import time
 import typing
 from typing import Any, Dict, List
 
@@ -14,6 +15,9 @@
 
 logger = sky_logging.init_logger(__name__)
 
+DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
+                                'the Docker daemon socket')
+
 
 @dataclasses.dataclass
 class DockerLoginConfig:
@@ -120,7 +124,11 @@ def __init__(self, docker_config: Dict[str, Any],
         self.docker_cmd = 'podman' if use_podman else 'docker'
         self.log_path = log_path
 
-    def _run(self, cmd, run_env='host') -> str:
+    def _run(self,
+             cmd,
+             run_env='host',
+             wait_for_docker_daemon: bool = False) -> str:
+
         if run_env == 'docker':
             cmd = self._docker_expand_user(cmd, any_char=True)
             cmd = ' '.join(_with_interactive(cmd))
@@ -132,10 +140,24 @@ def _run(self, cmd, run_env='host') -> str:
                    f' {shlex.quote(cmd)} ')
 
         logger.debug(f'+ {cmd}')
-        rc, stdout, stderr = self.runner.run(cmd,
-                                             require_outputs=True,
-                                             stream_logs=False,
-                                             log_path=self.log_path)
+        cnt = 0
+        retry = 3
+        while True:
+            rc, stdout, stderr = self.runner.run(cmd,
+                                                 require_outputs=True,
+                                                 stream_logs=False,
+                                                 log_path=self.log_path)
+            if (not wait_for_docker_daemon or
+                    DOCKER_PERMISSION_DENIED_STR not in stdout + stderr):
+                break
+
+            cnt += 1
+            if cnt > retry:
+                break
+            logger.info(
+                'Failed to run docker command, retrying in 10 seconds... '
+                f'({cnt}/{retry})')
+            time.sleep(10)
         subprocess_utils.handle_returncode(
             rc,
             cmd,
@@ -164,10 +186,12 @@ def initialize(self) -> str:
             # TODO(tian): Maybe support a command to get the login password?
             docker_login_config = DockerLoginConfig(
                 **self.docker_config['docker_login_config'])
-            self._run(f'{self.docker_cmd} login --username '
-                      f'{docker_login_config.username} '
-                      f'--password {docker_login_config.password} '
-                      f'{docker_login_config.server}')
+            self._run(
+                f'{self.docker_cmd} login --username '
+                f'{docker_login_config.username} '
+                f'--password {docker_login_config.password} '
+                f'{docker_login_config.server}',
+                wait_for_docker_daemon=True)
             # We automatically add the server prefix to the image name if
             # the user did not add it.
             server_prefix = f'{docker_login_config.server}/'
@@ -177,11 +201,14 @@ def initialize(self) -> str:
         if self.docker_config.get('pull_before_run', True):
             assert specific_image, ('Image must be included in config if ' +
                                     'pull_before_run is specified')
-            self._run(f'{self.docker_cmd} pull {specific_image}')
+            self._run(f'{self.docker_cmd} pull {specific_image}',
+                      wait_for_docker_daemon=True)
         else:
-            self._run(f'{self.docker_cmd} image inspect {specific_image} '
-                      '1> /dev/null  2>&1 || '
-                      f'{self.docker_cmd} pull {specific_image}')
+            self._run(
+                f'{self.docker_cmd} image inspect {specific_image} '
+                '1> /dev/null  2>&1 || '
+                f'{self.docker_cmd} pull {specific_image}',
+                wait_for_docker_daemon=True)
 
         logger.info(f'Starting container {self.container_name} with image '
                     f'{specific_image}')
@@ -347,7 +374,8 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
     def _check_container_exited(self) -> bool:
         if self.initialized:
             return True
-        output = (self._run(
-            check_docker_running_cmd(self.container_name, self.docker_cmd)))
+        output = (self._run(check_docker_running_cmd(self.container_name,
+                                                     self.docker_cmd),
+                            wait_for_docker_daemon=True))
         return 'false' in output.lower(
         ) and 'no such object' not in output.lower()
diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py
index 83abe476151..b6ea52c6eeb 100644
--- a/sky/skylet/providers/command_runner.py
+++ b/sky/skylet/providers/command_runner.py
@@ -1,8 +1,10 @@
 """Sky's DockerCommandRunner."""
 import json
 import os
+import time
 from typing import Dict
 
+import click
 from ray.autoscaler._private.cli_logger import cli_logger
 from ray.autoscaler._private.command_runner import DockerCommandRunner
 from ray.autoscaler._private.docker import check_docker_running_cmd
@@ -81,16 +83,53 @@ class SkyDockerCommandRunner(DockerCommandRunner):
     `ray.autoscaler._private.command_runner.DockerCommandRunner`.
     """
 
+    def _run_with_retry(self, cmd, **kwargs):
+        """Run a command with retries for docker."""
+        cnt = 0
+        max_retry = 3
+        while True:
+            try:
+                return self.run(cmd, **kwargs)
+            except click.ClickException as e:
+                # We retry the command if it fails, because docker commands can
+                # fail due to the docker daemon not being ready yet.
+                # Ray's command runner raises ClickException when the command
+                # fails.
+                cnt += 1
+                if cnt >= max_retry:
+                    raise e
+                cli_logger.warning(
+                    f'Failed to run command {cmd!r}. '
+                    f'Retrying in 10 seconds. Retry count: {cnt}')
+                time.sleep(10)
+
     # SkyPilot: New function to check whether a container is exited
     # (but not removed). This is due to previous `sky stop` command,
     # which will stop the container but not remove it.
     def _check_container_exited(self) -> bool:
         if self.initialized:
             return True
-        output = (self.ssh_command_runner.run(
-            check_docker_running_cmd(self.container_name, self.docker_cmd),
-            with_output=True,
-        ).decode('utf-8').strip())
+        cnt = 0
+        max_retry = 3
+        cmd = check_docker_running_cmd(self.container_name, self.docker_cmd)
+        # We manually retry the command based on the output, as the command will
+        # not fail even if the docker daemon is not ready, due to the underlying
+        # usage of `|| true` in the command.
+        while True:
+            output = (self.run(cmd, with_output=True,
+                               run_env='host').decode('utf-8').strip())
+            if docker_utils.DOCKER_PERMISSION_DENIED_STR in output:
+                cnt += 1
+                if cnt >= max_retry:
+                    raise click.ClickException(
+                        f'Failed to run command {cmd!r}. '
+                        f'Retry count: {cnt}. Output: {output}')
+                cli_logger.warning(
+                    f'Failed to run command {cmd!r}. '
+                    f'Retrying in 10 seconds. Retry count: {cnt}')
+                time.sleep(10)
+            else:
+                break
         return 'false' in output.lower(
         ) and 'no such object' not in output.lower()
 
@@ -110,6 +149,9 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
         # If true, then we can start the container directly.
         # Notice that we will skip all setup commands, so we need to
         # manually start the ssh service.
+        # We also add retries when checking the container status to make sure
+        # the docker daemon is ready, as it may not be ready immediately after
+        # the VM is started.
         if self._check_container_exited():
             self.initialized = True
             self.run(f'docker start {self.container_name}', run_env='host')
@@ -121,12 +163,10 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
             # TODO(tian): Maybe support a command to get the login password?
             docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[
                 "docker_login_config"]
-            self.run('{} login --username {} --password {} {}'.format(
-                self.docker_cmd,
-                docker_login_config.username,
-                docker_login_config.password,
-                docker_login_config.server,
-            ))
+            self._run_with_retry(
+                f'{self.docker_cmd} login --username '
+                f'{docker_login_config.username} --password '
+                f'{docker_login_config.password} {docker_login_config.server}')
             # We automatically add the server prefix to the image name if
             # the user did not add it.
             server_prefix = f'{docker_login_config.server}/'
@@ -134,15 +174,15 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
                 specific_image = f'{server_prefix}{specific_image}'
 
         if self.docker_config.get('pull_before_run', True):
-            assert specific_image, ('Image must be included in config if ' +
+            assert specific_image, ('Image must be included in config if '
                                     'pull_before_run is specified')
-            self.run('{} pull {}'.format(self.docker_cmd, specific_image),
-                     run_env='host')
+            self._run_with_retry(f'{self.docker_cmd} pull {specific_image}',
+                                 run_env='host')
         else:
-
-            self.run(f'{self.docker_cmd} image inspect {specific_image} '
-                     '1> /dev/null  2>&1 || '
-                     f'{self.docker_cmd} pull {specific_image}')
+            self._run_with_retry(
+                f'{self.docker_cmd} image inspect {specific_image} '
+                '1> /dev/null  2>&1 || '
+                f'{self.docker_cmd} pull {specific_image}')
 
         # Bootstrap files cannot be bind mounted because docker opens the
         # underlying inode. When the file is switched, docker becomes outdated.
diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2
index fd56448791c..e834ee1d0c8 100644
--- a/sky/templates/aws-ray.yml.j2
+++ b/sky/templates/aws-ray.yml.j2
@@ -40,15 +40,6 @@ provider:
   # NOTE: This is a new field added by SkyPilot to force use a specific VPC.
   vpc_name: {{vpc_name}}
 {% endif %}
-{%- if docker_login_config is not none %}
-  # We put docker login config in provider section because ray's schema disabled
-  # additionalProperties for docker config.
-  # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227
-  docker_login_config:
-    username: {{docker_login_config.username}}
-    password: {{docker_login_config.password}}
-    server: {{docker_login_config.server}}
-{%- endif %}
   use_internal_ips: {{use_internal_ips}}
   # Disable launch config check for worker nodes as it can cause resource
   # leakage.
diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2
index 369e7a52ec6..9ffe2a7958e 100644
--- a/sky/templates/azure-ray.yml.j2
+++ b/sky/templates/azure-ray.yml.j2
@@ -17,14 +17,14 @@ docker:
 {%- endif %}
 
 provider:
-    type: external
-    module: sky.skylet.providers.azure.AzureNodeProvider
-    location: {{region}}
-    # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
-    # For Azure, ray distinguishes different instances by the resource_group,
-    # instead of the cluster_name. This ensures that ray creates new instances
-    # for different cluster_name.
-    resource_group: {{resource_group}}
+  type: external
+  module: sky.skylet.providers.azure.AzureNodeProvider
+  location: {{region}}
+  # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
+  # For Azure, ray distinguishes different instances by the resource_group,
+  # instead of the cluster_name. This ensures that ray creates new instances
+  # for different cluster_name.
+  resource_group: {{resource_group}}
 {%- if docker_login_config is not none %}
   # We put docker login config in provider section because ray's schema disabled
   # additionalProperties for docker config.
@@ -34,17 +34,17 @@ provider:
     password: {{docker_login_config.password}}
     server: {{docker_login_config.server}}
 {%- endif %}
-    # Keep (otherwise cannot reuse when re-provisioning).
-    # teardown(terminate=True) will override this.
-    cache_stopped_nodes: True
-    # subscription id of the azure user
-    subscription_id: {{azure_subscription_id}}
-    # Disable launch config check for worker nodes as it can cause resource
-    # leakage.
-    # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115
-    # The upper-level SkyPilot code has make sure there will not be resource
-    # leakage.
-    disable_launch_config_check: true
+  # Keep (otherwise cannot reuse when re-provisioning).
+  # teardown(terminate=True) will override this.
+  cache_stopped_nodes: True
+  # subscription id of the azure user
+  subscription_id: {{azure_subscription_id}}
+  # Disable launch config check for worker nodes as it can cause resource
+  # leakage.
+  # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115
+  # The upper-level SkyPilot code has make sure there will not be resource
+  # leakage.
+  disable_launch_config_check: true
 
 
 auth:
diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2
index 76c818b8aef..7a92c4d4429 100644
--- a/sky/templates/gcp-ray.yml.j2
+++ b/sky/templates/gcp-ray.yml.j2
@@ -15,6 +15,15 @@ docker:
     {%- if gpu is not none %}
       --gpus all
     {%- endif %}
+{%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+{%- endif %}
 {%- endif %}
 
 provider:
@@ -37,15 +46,6 @@ provider:
 {% if firewall_rule is not none %}
   firewall_rule: {{firewall_rule}}
 {% endif %}
-{%- if docker_login_config is not none %}
-  # We put docker login config in provider section because ray's schema disabled
-  # additionalProperties for docker config.
-  # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227
-  docker_login_config:
-    username: {{docker_login_config.username}}
-    password: {{docker_login_config.password}}
-    server: {{docker_login_config.server}}
-{%- endif %}
   use_internal_ips: {{use_internal_ips}}
 {%- if tpu_vm %}
   _has_tpus: True
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 6206f50a8df..25a7764686b 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -1100,6 +1100,14 @@ def test_job_queue_with_docker(generic_cloud: str):
             'sleep 5',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
             f'sky cancel -y {name} 3',
+            f'sky stop -y {name}',
+            # Make sure the job status is preserved after stopping and
+            # starting the cluster. This also tests that the docker
+            # container is preserved after stop and start.
+            f'sky start -y {name}',
+            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED',
+            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED',
+            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
             f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky logs {name} 4 --status',

From cb695d53d99f78be96d1f690f8b4097c5e309750 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Mon, 26 Feb 2024 13:56:24 -0800
Subject: [PATCH 14/21] added comments

---
 examples/unsloth/unsloth.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index 4c01a5ca220..548871753aa 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -15,10 +15,11 @@ resources:
   disk_size: 128
 
 file_mounts:
+  # Creates a new bucket my-unsloth-checkpoints and mounts it at /outputs  
   /outputs:
-    name: my-unsloth-checkpoints
+    name: my-unsloth-checkpoints # Ensure this name is unique
 
-workdir: .
+workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth
 
 setup: |
   set -ex

From 888f1a8b39abedef37361a66f42c353fe94b9dc0 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Mon, 26 Feb 2024 21:58:29 -0800
Subject: [PATCH 15/21] quick fix

---
 examples/unsloth/unsloth_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py
index 400f2c8402b..255dfb560cf 100644
--- a/examples/unsloth/unsloth_example.py
+++ b/examples/unsloth/unsloth_example.py
@@ -71,7 +71,7 @@
         fp16=not torch.cuda.is_bf16_supported(),
         bf16=torch.cuda.is_bf16_supported(),
         logging_steps=1,
-        output_dir=args.output_dir[1:],
+        output_dir=("~" + args.output_dir),
         optim="adamw_8bit",
         seed=3407,
         save_steps=10,

From 48776693be2f25a25a38c6cc8de8df2044c8826f Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Wed, 28 Feb 2024 18:39:49 -0800
Subject: [PATCH 16/21] finished pip issues

---
 examples/unsloth/unsloth.yaml       | 18 ++++++++++++++----
 examples/unsloth/unsloth_example.py |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index 548871753aa..e0e07d95c94 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -23,10 +23,20 @@ workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth
 
 setup: |
   set -ex
-  pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \
-    --index-url https://download.pytorch.org/whl/cu118
-  pip install ipython
-  pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git"
+  pip install --upgrade pip
+  cuda_version=$(nvcc --version | grep "release" | awk '{print $6}' | cut -c 2-)
+  
+  if [[ "$cuda_version" == "12.1"* ]]; then
+    pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \
+      --index-url https://download.pytorch.org/whl/cu121
+    pip install ipython
+    pip install "unsloth[cu121-torch220] @ git+https://github.com/unslothai/unsloth.git"
+  else
+    pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \
+      --index-url https://download.pytorch.org/whl/cu118
+    pip install ipython
+    pip install "unsloth[cu118-torch220] @ git+https://github.com/unslothai/unsloth.git"
+  fi
 
 run: |
   python unsloth_example.py --output-dir /outputs
diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py
index 255dfb560cf..9f0895096d0 100644
--- a/examples/unsloth/unsloth_example.py
+++ b/examples/unsloth/unsloth_example.py
@@ -71,7 +71,7 @@
         fp16=not torch.cuda.is_bf16_supported(),
         bf16=torch.cuda.is_bf16_supported(),
         logging_steps=1,
-        output_dir=("~" + args.output_dir),
+        output_dir=(args.output_dir),
         optim="adamw_8bit",
         seed=3407,
         save_steps=10,

From 7a208dd9c91aceaff871efb8cefd61ac4510725a Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Wed, 28 Feb 2024 18:43:25 -0800
Subject: [PATCH 17/21] fix

---
 examples/unsloth/unsloth.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml
index e0e07d95c94..2c163b42121 100644
--- a/examples/unsloth/unsloth.yaml
+++ b/examples/unsloth/unsloth.yaml
@@ -19,7 +19,7 @@ file_mounts:
   /outputs:
     name: my-unsloth-checkpoints # Ensure this name is unique
 
-workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth
+workdir: .
 
 setup: |
   set -ex

From 152e36a6a40417fb1be3fcaf0bdd9b5eb671eefa Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Thu, 29 Feb 2024 11:00:16 -0800
Subject: [PATCH 18/21] fix storage error message, add example link to docs

---
 README.md                           |  2 +-
 docs/source/index.rst               |  2 +-
 examples/unsloth/unsloth_example.py |  2 +-
 sky/data/storage.py                 | 10 +++++++---
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index a1af8c63f7d..93abecb0fa4 100644
--- a/README.md
+++ b/README.md
@@ -164,7 +164,7 @@ Runnable examples:
   - [LocalGPT](./llm/localgpt)
   - [Falcon](./llm/falcon)
   - Add yours here & see more in [`llm/`](./llm)!
-- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), and [many more (`examples/`)](./examples).
+- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml), and [many more (`examples/`)](./examples).
 
 Follow updates:
 - [Twitter](https://twitter.com/skypilot_org)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5c0f8a7f7c5..8e140c33ed9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -85,7 +85,7 @@ Runnable examples:
   * `Falcon <https://github.com/skypilot-org/skypilot/tree/master/llm/falcon>`_
   * Add yours here & see more in `llm/ <https://github.com/skypilot-org/skypilot/tree/master/llm>`_!
 
-* Framework examples: `PyTorch DDP <https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml>`_, `DeepSpeed <https://github.com/skypilot-org/skypilot/blob/master/examples/deepspeed-multinode/sky.yaml>`_, `JAX/Flax on TPU <https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml>`_, `Stable Diffusion <https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion>`_, `Detectron2 <https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml>`_, `Distributed <https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py>`_ `TensorFlow <https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml>`_, `NeMo <https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml>`_, `programmatic grid search <https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py>`_, `Docker <https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml>`_, and `many more <https://github.com/skypilot-org/skypilot/tree/master/examples>`_.
+* Framework examples: `PyTorch DDP <https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml>`_, `DeepSpeed <https://github.com/skypilot-org/skypilot/blob/master/examples/deepspeed-multinode/sky.yaml>`_, `JAX/Flax on TPU <https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml>`_, `Stable Diffusion <https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion>`_, `Detectron2 <https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml>`_, `Distributed <https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py>`_ `TensorFlow <https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml>`_, `NeMo <https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml>`_, `programmatic grid search <https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py>`_, `Docker <https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml>`_, `Unsloth <https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml>`_, and `many more <https://github.com/skypilot-org/skypilot/tree/master/examples>`_.
 
 Follow updates:
 
diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py
index 9f0895096d0..4e488ad82f0 100644
--- a/examples/unsloth/unsloth_example.py
+++ b/examples/unsloth/unsloth_example.py
@@ -71,7 +71,7 @@
         fp16=not torch.cuda.is_bf16_supported(),
         bf16=torch.cuda.is_bf16_supported(),
         logging_steps=1,
-        output_dir=(args.output_dir),
+        output_dir=args.output_dir,
         optim="adamw_8bit",
         seed=3407,
         save_steps=10,
diff --git a/sky/data/storage.py b/sky/data/storage.py
index 4ca8441be3a..9356a50f365 100644
--- a/sky/data/storage.py
+++ b/sky/data/storage.py
@@ -333,14 +333,18 @@ def _validate_existing_bucket(self):
             # bucket's URL as 'source'.
             if handle is None:
                 with ux_utils.print_exception_no_traceback():
+                    store_prefix = get_store_prefix(StoreType.from_store(self))
                     raise exceptions.StorageSpecError(
                         'Attempted to mount a non-sky managed bucket '
                         f'{self.name!r} without specifying the storage source.'
-                        ' To mount an externally created bucket (e.g., '
+                        f' Bucket {self.name!r} already exists. \n'
+                        '    • To create a new bucket, specify a unique name.\n'
+                        '    • To mount an externally created bucket (e.g., '
                         'created through cloud console or cloud cli), '
                         'specify the bucket URL in the source field '
-                        'instead of its name. E.g., replace `name: external-'
-                        'bucket` with `source: gs://external-bucket`.')
+                        'instead of its name. I.e., replace '
+                        f'`name: {self.name}` with '
+                        f'`source: {store_prefix}{self.name}`.')
 
 
 class Storage(object):

From 3d124395d18785a1b1180fe1473fbbae644b23ca Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Wed, 27 Mar 2024 13:09:54 -0700
Subject: [PATCH 19/21] changed error message if default nc installed on mac

---
 sky/provision/kubernetes/utils.py | 32 ++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 67acb7b1985..a52644341fb 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -1038,15 +1038,29 @@ def check_port_forward_mode_dependencies() -> None:
                            stderr=subprocess.DEVNULL,
                            check=True)
         except (FileNotFoundError, subprocess.CalledProcessError):
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError(
-                    f'`{name}` is required to setup Kubernetes cloud with '
-                    f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
-                    'default networking mode and it is not installed. '
-                    'On Debian/Ubuntu, install it with:\n'
-                    f'  $ sudo apt install {install_cmd}\n'
-                    f'On MacOS, install it with: \n'
-                    f'  $ brew install {install_cmd}') from None
+            mac_nc_error = name == 'nc' and (os.path.exists('/usr/bin/nc') or
+                                             os.path.exists('/usr/bin/netcat'))
+            current_nc_path = (os.path.exists('/usr/bin/nc') and
+                               '/usr/bin/nc') or '/usr/bin/netcat'
+            if mac_nc_error:
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        f'The default MacOS `nc` is installed at '
+                        f'{current_nc_path}. However, for '
+                        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
+                        'default networking mode, GNU netcat is required. '
+                        f'On MacOS, install it with: \n'
+                        f'  $ brew install {install_cmd}') from None
+            else:
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        f'`{name}` is required to setup Kubernetes cloud with '
+                        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
+                        'default networking mode and it is not installed. '
+                        'On Debian/Ubuntu, install it with:\n'
+                        f'  $ sudo apt install {install_cmd}\n'
+                        f'On MacOS, install it with: \n'
+                        f'  $ brew install {install_cmd}') from None
 
 
 def get_endpoint_debug_message() -> str:

From 0746b457ce6d9be16311792c4e739966fd62c441 Mon Sep 17 00:00:00 2001
From: Sheth <shethhriday29@berkeley.edu>
Date: Thu, 28 Mar 2024 00:07:54 -0700
Subject: [PATCH 20/21] refactored check_port_forward_mode_dependencies
 function

---
 sky/provision/kubernetes/utils.py | 94 +++++++++++++++++++------------
 1 file changed, 58 insertions(+), 36 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index a52644341fb..9af18ebbc49 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -1025,42 +1025,64 @@ def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
 
 
 def check_port_forward_mode_dependencies() -> None:
-    """Checks if 'socat' is installed"""
-    # We store the dependency list as a list of lists. Each inner list
-    # contains the name of the dependency, the command to check if it is
-    # installed, and the package name to install it.
-    dependency_list = [['socat', ['socat', '-V'], 'socat'],
-                       ['nc', ['nc', '-h'], 'netcat']]
-    for name, check_cmd, install_cmd in dependency_list:
-        try:
-            subprocess.run(check_cmd,
-                           stdout=subprocess.DEVNULL,
-                           stderr=subprocess.DEVNULL,
-                           check=True)
-        except (FileNotFoundError, subprocess.CalledProcessError):
-            mac_nc_error = name == 'nc' and (os.path.exists('/usr/bin/nc') or
-                                             os.path.exists('/usr/bin/netcat'))
-            current_nc_path = (os.path.exists('/usr/bin/nc') and
-                               '/usr/bin/nc') or '/usr/bin/netcat'
-            if mac_nc_error:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        f'The default MacOS `nc` is installed at '
-                        f'{current_nc_path}. However, for '
-                        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
-                        'default networking mode, GNU netcat is required. '
-                        f'On MacOS, install it with: \n'
-                        f'  $ brew install {install_cmd}') from None
-            else:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        f'`{name}` is required to setup Kubernetes cloud with '
-                        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
-                        'default networking mode and it is not installed. '
-                        'On Debian/Ubuntu, install it with:\n'
-                        f'  $ sudo apt install {install_cmd}\n'
-                        f'On MacOS, install it with: \n'
-                        f'  $ brew install {install_cmd}') from None
+    """Checks if 'socat' and 'nc' are installed"""
+
+    # Construct runtime errors
+    socat_default_error = RuntimeError(
+        f'`socat` is required to setup Kubernetes cloud with '
+        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
+        'default networking mode and it is not installed. '
+        'On Debian/Ubuntu, install it with:\n'
+        f'  $ sudo apt install socat\n'
+        f'On MacOS, install it with: \n'
+        f'  $ brew install socat')
+    netcat_default_error = RuntimeError(
+        f'`nc` is required to setup Kubernetes cloud with '
+        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
+        'default networking mode and it is not installed. '
+        'On Debian/Ubuntu, install it with:\n'
+        f'  $ sudo apt install netcat\n'
+        f'On MacOS, install it with: \n'
+        f'  $ brew install netcat')
+    mac_installed_error = RuntimeError(
+        f'The default MacOS `nc` is installed. However, for '
+        f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` '  # pylint: disable=line-too-long
+        'default networking mode, GNU netcat is required. '
+        f'On MacOS, install it with: \n'
+        f'  $ brew install netcat')
+
+    # Ensure socat is installed
+    try:
+        subprocess.run(['socat', '-V'],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=True)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        with ux_utils.print_exception_no_traceback():
+            raise socat_default_error from None
+
+    # Ensure netcat is installed
+    #
+    # In some cases, the user may have the default MacOS nc installed but
+    # they need GNU nc installed. Checking for this case, reflected in
+    # `nc_mac_installed`, helps give a more specific error message
+    try:
+        netcat_output = subprocess.run(['nc', '-h'],
+                                       capture_output=True,
+                                       check=False)
+        nc_mac_installed = netcat_output.returncode == 1 and 'apple' in str(
+            netcat_output.stderr)
+
+        if nc_mac_installed:
+            with ux_utils.print_exception_no_traceback():
+                raise mac_installed_error from None
+        elif netcat_output.returncode != 0:
+            with ux_utils.print_exception_no_traceback():
+                raise netcat_default_error from None
+
+    except FileNotFoundError:
+        with ux_utils.print_exception_no_traceback():
+            raise netcat_default_error from None
 
 
 def get_endpoint_debug_message() -> str:

From e42d5b33f8b1f4e99eaf39cf9a2c1f142d54abfd Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Mon, 15 Apr 2024 12:29:05 -0700
Subject: [PATCH 21/21] update comment

---
 sky/provision/kubernetes/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 9af18ebbc49..94d9696f880 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -1063,9 +1063,9 @@ def check_port_forward_mode_dependencies() -> None:
 
     # Ensure netcat is installed
     #
-    # In some cases, the user may have the default MacOS nc installed but
-    # they need GNU nc installed. Checking for this case, reflected in
-    # `nc_mac_installed`, helps give a more specific error message
+    # In some cases, the user may have the default MacOS nc installed, which
+    # does not support the -z flag. To use the -z flag for port scanning,
+    # they need GNU nc installed. We check for this case and raise an error.
     try:
         netcat_output = subprocess.run(['nc', '-h'],
                                        capture_output=True,