From 3f50915ae4a54e920087e5d6a0472232e548b143 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Jul 2022 23:12:58 -0700 Subject: [PATCH 01/21] Add host VM - GPU compatibility check for GCP --- sky/clouds/gcp.py | 8 ++ sky/clouds/service_catalog/__init__.py | 9 ++ sky/clouds/service_catalog/gcp_catalog.py | 117 ++++++++++++++++++++++ sky/resources.py | 9 +- 4 files changed, 141 insertions(+), 2 deletions(-) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 45d2dec6278..a59ccce18a0 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -345,3 +345,11 @@ def get_project_id(cls, dryrun: bool = False) -> str: project_id = gcp_credentials.get('quota_project_id', None) or gcp_credentials['project_id'] return project_id + + @staticmethod + def check_host_accelerator_compatibility( + instance_type: str, + accelerators: Optional[Dict[str, int]], + zone: Optional[str] = None) -> None: + service_catalog.check_host_accelerator_compatibility( + instance_type, accelerators, zone, 'gcp') diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 088ff0a7d4c..df39119aae8 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -159,6 +159,15 @@ def get_region_zones_for_accelerators( acc_name, acc_count, use_spot) +def check_host_accelerator_compatibility(instance_type: str, + accelerators: Optional[Dict[str, int]], + zone: Optional[str] = None, + clouds: CloudFilter = None) -> None: + """Check if GCP host VM type is compatible with the accelerators.""" + _map_clouds_catalog(clouds, 'check_host_accelerator_compatibility', + instance_type, accelerators, zone) + + def get_common_gpus() -> List[str]: """Returns a list of commonly used GPU names.""" return [ diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index a121f725dbf..956bbcbc06c 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -9,6 +9,7 @@ import pandas as pd from sky.clouds.service_catalog import common +from sky.utils import ux_utils if typing.TYPE_CHECKING: from sky.clouds import cloud @@ -189,6 +190,38 @@ }, } +# num_gpus -> (max_num_cpus, max_memory_gb) +# Refer to: https://cloud.google.com/compute/docs/gpus +_NUM_ACC_TO_MAX_CPU_AND_MEMORY = { + 'K80': { + 1: (8, 52), + 2: (16, 104), + 4: (32, 208), + 8: (64, 416), # exception: asia-east1-a, us-east1-d + }, + 'V100': { + 1: (12, 78), + 2: (24, 156), + 4: (48, 312), + 8: (96, 624), + }, + 'T4': { + 1: (48, 312), + 2: (48, 312), + 4: (96, 624), + }, + 'P4': { + 1: (24, 156), + 2: (48, 312), + 4: (96, 624), + }, + 'P100': { + 1: (16, 104), + 2: (32, 208), + 4: (96, 624), # exception: us-east1-c, europe-west1-d, europe-west1-b + } +} + def _is_power_of_two(x: int) -> bool: """Returns true if x is a power of two.""" @@ -337,3 +370,87 @@ def get_region_zones_for_accelerators( """Returns a list of regions for a given accelerators.""" df = _get_accelerator(_df, accelerator, count, region=None) return common.get_region_zones(df, use_spot) + + +def check_host_accelerator_compatibility(instance_type: str, + accelerators: Optional[Dict[str, int]], + zone: Optional[str] = None) -> None: + """Check if the instance type is compatible with the accelerators.""" + if accelerators is None: + if instance_type.startswith('a2-highgpu-'): + # NOTE: While it is allowed to use A2 machines as CPU-only nodes, + # we exclude this case as it is uncommon and undesirable. + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'A2 instance types should be used with A100 GPUs. ' + 'Either use other instance types or specify the ' + 'accelerators as A100.') + return + + acc = list(accelerators.items()) + assert len(acc) == 1, acc + acc_name, acc_count = acc[0] + + if 'tpu' in acc_name.lower(): + # TODO(woosuk): Add validation for TPUs. + return + acc_name = acc_name.upper() + + # Treat A100 as a special case. + if acc_name == 'A100': + # A100 must be attached to A2 instance type. + if not instance_type.startswith('a2-highgpu-'): + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'A100 GPUs cannot be attached to {instance_type}. ' + 'Use A2 instance type instead.') + a100_instance_type = _A100_INSTANCE_TYPES[acc_count] + if instance_type != a100_instance_type: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'A100:{acc_count} cannot be attached to {instance_type}. ' + f'Use {a100_instance_type} instead.') + return + + # Other GPUs must be attached to N1 machines. + # Refer to: https://cloud.google.com/compute/docs/machine-types#gpus + if not instance_type.startswith('n1-'): + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'{acc_name} GPUs cannot be attached to {instance_type}. ' + f'Use N1 instance types instead.') + + # Memory/CPU ratios of N1 machines. + num_cpus = int(instance_type.split('-')[2]) + machine_type = instance_type.split('-')[1] + if machine_type == 'standard': + memory = 3.75 * num_cpus + elif machine_type == 'highmem': + memory = 6.5 * num_cpus + elif machine_type == 'highcpu': + memory = 0.9 * num_cpus + + if acc_name not in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'{acc_name} is not supported by GCP.') + + # Check available vCPUs and memory. + max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][acc_count] + if acc_name == 'K80' and acc_count == 8: + if zone in ['asia-east1-a', 'us-east1-d']: + max_memory = 416 + elif acc_name == 'P100' and acc_count == 4: + if zone in ['us-east1-c', 'europe-west1-d', 'europe-west1-b']: + max_cpus = 64 + max_memory = 208 + + if num_cpus > max_cpus: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'{acc_name}:{acc_count} cannot be attached to ' + f'{instance_type}. The maximum number of vCPUs is {max_cpus}.') + if memory > max_memory: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'{acc_name}:{acc_count} cannot be attached to ' + f'{instance_type}. The maximum CPU memory is {max_memory} GB.') diff --git a/sky/resources.py b/sky/resources.py index 2a82c7b7868..c7deffcc6c3 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -305,8 +305,13 @@ def _try_validate_instance_type(self): def _try_validate_accelerators(self) -> None: """Try-validates accelerators against the instance type.""" - if self.is_launchable() and not isinstance(self.cloud, clouds.GCP): - # GCP attaches accelerators to VMs, so no need for this check. + if not self.is_launchable(): + return + + if isinstance(self.cloud, clouds.GCP): + self.cloud.check_host_accelerator_compatibility( + self._instance_type, self.accelerators) + else: acc_requested = self.accelerators if acc_requested is None: return From 4d1b1420edc4b407558f27a7ba2b1b964bc1f381 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Jul 2022 23:14:10 -0700 Subject: [PATCH 02/21] Minor fix --- sky/clouds/service_catalog/gcp_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 956bbcbc06c..e1fce26bad4 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -197,7 +197,7 @@ 1: (8, 52), 2: (16, 104), 4: (32, 208), - 8: (64, 416), # exception: asia-east1-a, us-east1-d + 8: (64, 416), # except for asia-east1-a, us-east1-d }, 'V100': { 1: (12, 78), @@ -218,7 +218,7 @@ 'P100': { 1: (16, 104), 2: (32, 208), - 4: (96, 624), # exception: us-east1-c, europe-west1-d, europe-west1-b + 4: (96, 624), # except for us-east1-c, europe-west1-d, europe-west1-b } } From 9adb053f9e91821b46814defdb6db7774a388641 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Jul 2022 23:31:31 -0700 Subject: [PATCH 03/21] Fix test_spot --- tests/test_spot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_spot.py b/tests/test_spot.py index 429990804d6..2366cdce0f4 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -59,7 +59,7 @@ def _mock_cluster_state(self, _mock_db_conn): head_ip='1.1.1.2', launched_nodes=1, launched_resources=sky.Resources(sky.GCP(), - instance_type='n1-highmem-8', + instance_type='a2-highgpu-4g', accelerators={'A100': 4}, region='us-west1'), ) From a5f27e22313edf1e4dc8cf440238fbf222db26b9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Jul 2022 23:48:04 -0700 Subject: [PATCH 04/21] Fix optimizer test --- tests/test_optimizer_random_dag.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_optimizer_random_dag.py b/tests/test_optimizer_random_dag.py index 133c408bd2d..46309688fb2 100644 --- a/tests/test_optimizer_random_dag.py +++ b/tests/test_optimizer_random_dag.py @@ -2,8 +2,8 @@ import random import numpy as np +import pandas as pd import sky -from sky.clouds.service_catalog import gcp_catalog CLOUDS = { 'AWS': sky.AWS(), @@ -11,7 +11,6 @@ 'Azure': sky.Azure(), } ALL_INSTANCE_TYPES = sum(sky.list_accelerators(gpus_only=True).values(), []) -GCP_INSTANCE_TYPES = list(gcp_catalog._ON_DEMAND_PRICES.keys()) DUMMY_NODES = [ sky.optimizer._DUMMY_SOURCE_NAME, @@ -65,8 +64,8 @@ def generate_random_dag( sky.Resources( cloud=CLOUDS[candidate.cloud], instance_type=candidate.instance_type \ - if candidate.cloud != 'GCP' \ - else random.choice(GCP_INSTANCE_TYPES), + if not pd.isna(candidate.instance_type) \ + else CLOUDS['GCP'].get_default_instance_type(), accelerators={ candidate.accelerator_name: candidate.accelerator_count}, ) From 6a4b345b12c71e3490105e59b304251cf31922ee Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 29 Jul 2022 23:42:15 -0700 Subject: [PATCH 05/21] Fix optimizer test --- tests/test_optimizer_random_dag.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/test_optimizer_random_dag.py b/tests/test_optimizer_random_dag.py index 230dfc882e7..f4b98fa18e2 100644 --- a/tests/test_optimizer_random_dag.py +++ b/tests/test_optimizer_random_dag.py @@ -62,17 +62,20 @@ def generate_random_dag( num_candidates = random.randint(1, max_num_candidate_resources) candidate_instance_types = random.choices(ALL_INSTANCE_TYPES, k=num_candidates) - op.set_resources({ - sky.Resources( + + candidate_resources = set() + for candidate in candidate_instance_types: + instance_type = candidate.instance_type + if pd.isna(instance_type): + instance_type = GCP_DEFAULT_INSTANCE_TYPE + resources = sky.Resources( cloud=CLOUDS[candidate.cloud], - instance_type=candidate.instance_type \ - if not pd.isna(candidate.instance_type) \ - else GCP_DEFAULT_INSTANCE_TYPE, + instance_type=instance_type, accelerators={ - candidate.accelerator_name: candidate.accelerator_count}, - ) - for candidate in candidate_instance_types - }) + candidate.accelerator_name: candidate.accelerator_count + }) + candidate_resources.add(resources) + op.set_resources(candidate_resources) return dag From 0cd3bf9450cd6db0dd2d90b4ee36edc1eaa57aa0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 29 Jul 2022 23:42:57 -0700 Subject: [PATCH 06/21] Minor bugfix + Add reference URL in error message --- sky/clouds/service_catalog/gcp_catalog.py | 25 +++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 4df21c4b2f5..924c05d6c85 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -87,7 +87,7 @@ 1: (8, 52), 2: (16, 104), 4: (32, 208), - 8: (64, 416), # except for asia-east1-a, us-east1-d + 8: (64, 208), # except for asia-east1-a, us-east1-d }, 'V100': { 1: (12, 78), @@ -288,7 +288,7 @@ def check_host_accelerator_compatibility(instance_type: str, zone: Optional[str] = None) -> None: """Check if the instance type is compatible with the accelerators.""" if accelerators is None: - if instance_type.startswith('a2-highgpu-'): + if instance_type.startswith('a2-'): # NOTE: While it is allowed to use A2 machines as CPU-only nodes, # we exclude this case as it is uncommon and undesirable. with ux_utils.print_exception_no_traceback(): @@ -310,17 +310,19 @@ def check_host_accelerator_compatibility(instance_type: str, # Treat A100 as a special case. if acc_name == 'A100': # A100 must be attached to A2 instance type. - if not instance_type.startswith('a2-highgpu-'): + a100_instance_type = _A100_INSTANCE_TYPES[acc_count] + if not instance_type.startswith('a2-'): with ux_utils.print_exception_no_traceback(): raise ValueError( f'A100 GPUs cannot be attached to {instance_type}. ' - 'Use A2 instance type instead.') - a100_instance_type = _A100_INSTANCE_TYPES[acc_count] + f'Use {a100_instance_type} instead. Please refer to ' + 'https://cloud.google.com/compute/docs/gpus#a100-gpus') if instance_type != a100_instance_type: with ux_utils.print_exception_no_traceback(): raise ValueError( f'A100:{acc_count} cannot be attached to {instance_type}. ' - f'Use {a100_instance_type} instead.') + f'Use {a100_instance_type} instead. Please refer to ' + 'https://cloud.google.com/compute/docs/gpus#a100-gpus') return # Other GPUs must be attached to N1 machines. @@ -329,7 +331,8 @@ def check_host_accelerator_compatibility(instance_type: str, with ux_utils.print_exception_no_traceback(): raise ValueError( f'{acc_name} GPUs cannot be attached to {instance_type}. ' - f'Use N1 instance types instead.') + 'Use N1 instance types instead. Please refer to: ' + 'https://cloud.google.com/compute/docs/machine-types#gpus') # Memory/CPU ratios of N1 machines. num_cpus = int(instance_type.split('-')[2]) @@ -340,6 +343,8 @@ def check_host_accelerator_compatibility(instance_type: str, memory = 6.5 * num_cpus elif machine_type == 'highcpu': memory = 0.9 * num_cpus + else: + raise ValueError(f'Unknown machine type: {machine_type}') if acc_name not in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: with ux_utils.print_exception_no_traceback(): @@ -359,9 +364,11 @@ def check_host_accelerator_compatibility(instance_type: str, with ux_utils.print_exception_no_traceback(): raise ValueError( f'{acc_name}:{acc_count} cannot be attached to ' - f'{instance_type}. The maximum number of vCPUs is {max_cpus}.') + f'{instance_type}. The maximum number of vCPUs is {max_cpus}. ' + 'Please refer to: https://cloud.google.com/compute/docs/gpus') if memory > max_memory: with ux_utils.print_exception_no_traceback(): raise ValueError( f'{acc_name}:{acc_count} cannot be attached to ' - f'{instance_type}. The maximum CPU memory is {max_memory} GB.') + f'{instance_type}. The maximum CPU memory is {max_memory} GB. ' + 'Please refer to: https://cloud.google.com/compute/docs/gpus') From e5e5f9d5fe022de77e2519961bd9451d2d367c18 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 29 Jul 2022 23:43:07 -0700 Subject: [PATCH 07/21] Minor fix in docstring --- sky/clouds/service_catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 2b761a6248d..0148e113bdc 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -169,7 +169,7 @@ def check_host_accelerator_compatibility(instance_type: str, accelerators: Optional[Dict[str, int]], zone: Optional[str] = None, clouds: CloudFilter = None) -> None: - """Check if GCP host VM type is compatible with the accelerators.""" + """GCP only: Check if host VM type is compatible with the accelerators.""" _map_clouds_catalog(clouds, 'check_host_accelerator_compatibility', instance_type, accelerators, zone) From 135fa71d087e9a23806fb6d6a88ade7699fea658 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 30 Jul 2022 00:26:24 -0700 Subject: [PATCH 08/21] Get memory size from catalog --- sky/clouds/service_catalog/gcp_catalog.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 924c05d6c85..274cf8ee092 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -334,23 +334,10 @@ def check_host_accelerator_compatibility(instance_type: str, 'Use N1 instance types instead. Please refer to: ' 'https://cloud.google.com/compute/docs/machine-types#gpus') - # Memory/CPU ratios of N1 machines. - num_cpus = int(instance_type.split('-')[2]) - machine_type = instance_type.split('-')[1] - if machine_type == 'standard': - memory = 3.75 * num_cpus - elif machine_type == 'highmem': - memory = 6.5 * num_cpus - elif machine_type == 'highcpu': - memory = 0.9 * num_cpus - else: - raise ValueError(f'Unknown machine type: {machine_type}') - + # Check maximum vCPUs and memory. if acc_name not in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: with ux_utils.print_exception_no_traceback(): raise ValueError(f'{acc_name} is not supported by GCP.') - - # Check available vCPUs and memory. max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][acc_count] if acc_name == 'K80' and acc_count == 8: if zone in ['asia-east1-a', 'us-east1-d']: @@ -360,6 +347,11 @@ def check_host_accelerator_compatibility(instance_type: str, max_cpus = 64 max_memory = 208 + # vCPU counts and memory sizes of N1 machines. + num_cpus = int(instance_type.split('-')[2]) + df = _df[_df['InstanceType'] == instance_type] + memory = df['MemoryGiB'].iloc[0] + if num_cpus > max_cpus: with ux_utils.print_exception_no_traceback(): raise ValueError( From 940827c6d839e9323a5db32005856cedb0996a77 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 30 Jul 2022 00:26:58 -0700 Subject: [PATCH 09/21] Add TODO --- sky/clouds/service_catalog/gcp_catalog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 274cf8ee092..187e1295c45 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -348,6 +348,7 @@ def check_host_accelerator_compatibility(instance_type: str, max_memory = 208 # vCPU counts and memory sizes of N1 machines. + # TODO(woosuk): Query vCPU counts from the GCP catalog. num_cpus = int(instance_type.split('-')[2]) df = _df[_df['InstanceType'] == instance_type] memory = df['MemoryGiB'].iloc[0] From fc237093c84c96229188682790712eb6075eb64f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 00:50:08 -0700 Subject: [PATCH 10/21] Resolve merge conflicts & Address TODOs --- sky/clouds/service_catalog/gcp_catalog.py | 14 +++-- sky/resources.py | 69 ++++++++++++----------- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index fd9e87065c9..4b3c67f9698 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -321,10 +321,12 @@ def check_host_accelerator_compatibility(instance_type: str, assert len(acc) == 1, acc acc_name, acc_count = acc[0] - if 'tpu' in acc_name.lower(): - # TODO(woosuk): Add validation for TPUs. + if acc_name.startswith('tpu-'): + if instance_type != 'TPU-VM' and not instance_type.startswith('n1-'): + raise ValueError( + 'TPUs can be only used with N1 machines. Please refer to: ' + 'https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines') # pylint: disable=line-too-long return - acc_name = acc_name.upper() # Treat A100 as a special case. if acc_name == 'A100': @@ -356,7 +358,8 @@ def check_host_accelerator_compatibility(instance_type: str, # Check maximum vCPUs and memory. if acc_name not in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'{acc_name} is not supported by GCP.') + raise ValueError(f'{acc_name} is not available in GCP. ' + 'See \'sky show-gpus --cloud gcp\'') max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][acc_count] if acc_name == 'K80' and acc_count == 8: if zone in ['asia-east1-a', 'us-east1-d']: @@ -367,9 +370,8 @@ def check_host_accelerator_compatibility(instance_type: str, max_memory = 208 # vCPU counts and memory sizes of N1 machines. - # TODO(woosuk): Query vCPU counts from the GCP catalog. - num_cpus = int(instance_type.split('-')[2]) df = _df[_df['InstanceType'] == instance_type] + num_cpus = df['vCPUs'].iloc[0] memory = df['MemoryGiB'].iloc[0] if num_cpus > max_cpus: diff --git a/sky/resources.py b/sky/resources.py index 127a45d59f2..85172110dbc 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -301,43 +301,48 @@ def _try_validate_instance_type(self): def _try_validate_accelerators(self) -> None: """Try-validates accelerators against the instance type.""" - if self.is_launchable() and not isinstance(self.cloud, clouds.GCP): - # GCP attaches accelerators to VMs, so no need for this check. - acc_requested = self.accelerators - if acc_requested is None: - return - acc_from_instance_type = ( - self.cloud.get_accelerators_from_instance_type( - self._instance_type)) - if not Resources(accelerators=acc_requested).less_demanding_than( - Resources(accelerators=acc_from_instance_type)): - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Infeasible resource demands found:\n' - f' Instance type requested: {self._instance_type}\n' - f' Accelerators for {self._instance_type}: ' - f'{acc_from_instance_type}\n' - f' Accelerators requested: {acc_requested}\n' - f'To fix: either only specify instance_type, or change ' - 'the accelerators field to be consistent.') + acc_requested = self.accelerators + if acc_requested is None: + return + + if self.is_launchable(): + if isinstance(self.cloud, clouds.GCP): + clouds.GCP.check_host_accelerator_compatibility( + self.instance_type, acc_requested, self.zone) + else: + acc_from_instance_type = ( + self.cloud.get_accelerators_from_instance_type( + self._instance_type)) + if not Resources( + accelerators=acc_requested).less_demanding_than( + Resources(accelerators=acc_from_instance_type)): + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Infeasible resource demands found:\n' + ' Instance type requested: ' + f'{self._instance_type}\n' + f' Accelerators for {self._instance_type}: ' + f'{acc_from_instance_type}\n' + f' Accelerators requested: {acc_requested}\n' + f'To fix: either only specify instance_type, or ' + 'change the accelerators field to be consistent.') # NOTE: should not clear 'self.accelerators' even for AWS/Azure, # because e.g., the instance may have 4 GPUs, while the task # specifies to use 1 GPU. # Validate whether accelerator is available in specified region/zone. - if self.accelerators is not None: - acc, acc_count = list(self.accelerators.items())[0] - if self.region is not None or self.zone is not None: - if not self._cloud.accelerator_in_region_or_zone( - acc, acc_count, self.region, self.zone): - error_str = (f'Accelerator "{acc}" is not available in ' - '"{}" region/zone.') - if self.zone: - error_str = error_str.format(self.zone) - else: - error_str = error_str.format(self.region) - with ux_utils.print_exception_no_traceback(): - raise ValueError(error_str) + acc, acc_count = list(acc_requested.items())[0] + if self.region is not None or self.zone is not None: + if not self._cloud.accelerator_in_region_or_zone( + acc, acc_count, self.region, self.zone): + error_str = (f'Accelerator "{acc}" is not available in ' + '"{}" region/zone.') + if self.zone: + error_str = error_str.format(self.zone) + else: + error_str = error_str.format(self.region) + with ux_utils.print_exception_no_traceback(): + raise ValueError(error_str) def _try_validate_spot(self) -> None: if self._spot_recovery is None: From c9ad4212aef457d401a19b7787e6f119884c6db7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 14:15:23 -0700 Subject: [PATCH 11/21] Move compatibility check to optimizer --- sky/optimizer.py | 5 ++++ sky/resources.py | 71 ++++++++++++++++++++++-------------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index afd415ae0e0..b8dd39fa5b3 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -847,6 +847,11 @@ def _fill_in_launchable_resources( 'enabled. Run `sky check` to enable access to it, ' 'or change the cloud requirement.') elif resources.is_launchable(): + if (isinstance(resources.cloud, clouds.GCP) and + resources.accelerators is not None): + clouds.GCP.check_host_accelerator_compatibility( + resources.instance_type, resources.accelerators, + resources.zone) launchable[resources] = [resources] else: clouds_list = [resources.cloud diff --git a/sky/resources.py b/sky/resources.py index 85172110dbc..ad65c11b69a 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -300,49 +300,44 @@ def _try_validate_instance_type(self): self._cloud = valid_clouds[0] def _try_validate_accelerators(self) -> None: - """Try-validates accelerators against the instance type.""" - acc_requested = self.accelerators - if acc_requested is None: - return - - if self.is_launchable(): - if isinstance(self.cloud, clouds.GCP): - clouds.GCP.check_host_accelerator_compatibility( - self.instance_type, acc_requested, self.zone) - else: - acc_from_instance_type = ( - self.cloud.get_accelerators_from_instance_type( - self._instance_type)) - if not Resources( - accelerators=acc_requested).less_demanding_than( - Resources(accelerators=acc_from_instance_type)): - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Infeasible resource demands found:\n' - ' Instance type requested: ' - f'{self._instance_type}\n' - f' Accelerators for {self._instance_type}: ' - f'{acc_from_instance_type}\n' - f' Accelerators requested: {acc_requested}\n' - f'To fix: either only specify instance_type, or ' - 'change the accelerators field to be consistent.') + """Validate accelerators against the instance type and region/zone.""" + if self.is_launchable() and not isinstance(self.cloud, clouds.GCP): + # GCP attaches accelerators to VMs, so no need for this check. + acc_requested = self.accelerators + if acc_requested is None: + return + acc_from_instance_type = ( + self.cloud.get_accelerators_from_instance_type( + self._instance_type)) + if not Resources(accelerators=acc_requested).less_demanding_than( + Resources(accelerators=acc_from_instance_type)): + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Infeasible resource demands found:\n' + f' Instance type requested: {self._instance_type}\n' + f' Accelerators for {self._instance_type}: ' + f'{acc_from_instance_type}\n' + f' Accelerators requested: {acc_requested}\n' + f'To fix: either only specify instance_type, or change ' + 'the accelerators field to be consistent.') # NOTE: should not clear 'self.accelerators' even for AWS/Azure, # because e.g., the instance may have 4 GPUs, while the task # specifies to use 1 GPU. # Validate whether accelerator is available in specified region/zone. - acc, acc_count = list(acc_requested.items())[0] - if self.region is not None or self.zone is not None: - if not self._cloud.accelerator_in_region_or_zone( - acc, acc_count, self.region, self.zone): - error_str = (f'Accelerator "{acc}" is not available in ' - '"{}" region/zone.') - if self.zone: - error_str = error_str.format(self.zone) - else: - error_str = error_str.format(self.region) - with ux_utils.print_exception_no_traceback(): - raise ValueError(error_str) + if self.accelerators is not None: + acc, acc_count = list(self.accelerators.items())[0] + if self.region is not None or self.zone is not None: + if not self._cloud.accelerator_in_region_or_zone( + acc, acc_count, self.region, self.zone): + error_str = (f'Accelerator "{acc}" is not available in ' + '"{}" region/zone.') + if self.zone: + error_str = error_str.format(self.zone) + else: + error_str = error_str.format(self.region) + with ux_utils.print_exception_no_traceback(): + raise ValueError(error_str) def _try_validate_spot(self) -> None: if self._spot_recovery is None: From 02d41590ad6e4fa33b8ea024982c8623a5b3b68f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 14:15:54 -0700 Subject: [PATCH 12/21] ValueError -> ResourcesMismatchError --- sky/clouds/service_catalog/gcp_catalog.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 4b3c67f9698..c0420b27308 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -8,6 +8,7 @@ import pandas as pd +from sky import exceptions from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -311,7 +312,7 @@ def check_host_accelerator_compatibility(instance_type: str, # NOTE: While it is allowed to use A2 machines as CPU-only nodes, # we exclude this case as it is uncommon and undesirable. with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ResourcesMismatchError( 'A2 instance types should be used with A100 GPUs. ' 'Either use other instance types or specify the ' 'accelerators as A100.') @@ -323,7 +324,7 @@ def check_host_accelerator_compatibility(instance_type: str, if acc_name.startswith('tpu-'): if instance_type != 'TPU-VM' and not instance_type.startswith('n1-'): - raise ValueError( + raise exceptions.ResourcesMismatchError( 'TPUs can be only used with N1 machines. Please refer to: ' 'https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines') # pylint: disable=line-too-long return @@ -334,13 +335,13 @@ def check_host_accelerator_compatibility(instance_type: str, a100_instance_type = _A100_INSTANCE_TYPES[acc_count] if not instance_type.startswith('a2-'): with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ResourcesMismatchError( f'A100 GPUs cannot be attached to {instance_type}. ' f'Use {a100_instance_type} instead. Please refer to ' 'https://cloud.google.com/compute/docs/gpus#a100-gpus') if instance_type != a100_instance_type: with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ResourcesMismatchError( f'A100:{acc_count} cannot be attached to {instance_type}. ' f'Use {a100_instance_type} instead. Please refer to ' 'https://cloud.google.com/compute/docs/gpus#a100-gpus') @@ -350,7 +351,7 @@ def check_host_accelerator_compatibility(instance_type: str, # Refer to: https://cloud.google.com/compute/docs/machine-types#gpus if not instance_type.startswith('n1-'): with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ResourcesMismatchError( f'{acc_name} GPUs cannot be attached to {instance_type}. ' 'Use N1 instance types instead. Please refer to: ' 'https://cloud.google.com/compute/docs/machine-types#gpus') @@ -358,8 +359,9 @@ def check_host_accelerator_compatibility(instance_type: str, # Check maximum vCPUs and memory. if acc_name not in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'{acc_name} is not available in GCP. ' - 'See \'sky show-gpus --cloud gcp\'') + raise exceptions.ResourcesUnavailableError( + f'{acc_name} is not available in GCP. ' + 'See \'sky show-gpus --cloud gcp\'') max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][acc_count] if acc_name == 'K80' and acc_count == 8: if zone in ['asia-east1-a', 'us-east1-d']: @@ -376,13 +378,13 @@ def check_host_accelerator_compatibility(instance_type: str, if num_cpus > max_cpus: with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ResourcesMismatchError( f'{acc_name}:{acc_count} cannot be attached to ' f'{instance_type}. The maximum number of vCPUs is {max_cpus}. ' 'Please refer to: https://cloud.google.com/compute/docs/gpus') if memory > max_memory: with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ResourcesMismatchError( f'{acc_name}:{acc_count} cannot be attached to ' f'{instance_type}. The maximum CPU memory is {max_memory} GB. ' 'Please refer to: https://cloud.google.com/compute/docs/gpus') From 974ad50c4ede64900f66d1b71ec97c29790204ff Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 14:19:44 -0700 Subject: [PATCH 13/21] Fix TPU error msg --- sky/clouds/service_catalog/gcp_catalog.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index c0420b27308..87401d8b374 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -324,9 +324,12 @@ def check_host_accelerator_compatibility(instance_type: str, if acc_name.startswith('tpu-'): if instance_type != 'TPU-VM' and not instance_type.startswith('n1-'): - raise exceptions.ResourcesMismatchError( - 'TPUs can be only used with N1 machines. Please refer to: ' - 'https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines') # pylint: disable=line-too-long + # TODO(woosuk): Check max vcpus and memory for each TPU type. + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesMismatchError( + 'TPU Nodes can be only used with N1 machines. ' + 'Please refer to: ' + 'https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines') # pylint: disable=line-too-long return # Treat A100 as a special case. From 50404605173390fe57313975d3ac15439619b3fe Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 14:25:11 -0700 Subject: [PATCH 14/21] Minor bugfix --- sky/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index b8dd39fa5b3..11304123e77 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -847,8 +847,7 @@ def _fill_in_launchable_resources( 'enabled. Run `sky check` to enable access to it, ' 'or change the cloud requirement.') elif resources.is_launchable(): - if (isinstance(resources.cloud, clouds.GCP) and - resources.accelerators is not None): + if isinstance(resources.cloud, clouds.GCP): clouds.GCP.check_host_accelerator_compatibility( resources.instance_type, resources.accelerators, resources.zone) From cc32bcdd115a6d74e1f3c80dbe01ee364eab7ca6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 18:37:11 -0700 Subject: [PATCH 15/21] Move compatibility check to resources & Add attachability check in optimizer --- sky/clouds/gcp.py | 9 ++- sky/clouds/service_catalog/__init__.py | 10 ++- sky/clouds/service_catalog/gcp_catalog.py | 70 +++++++++++++++------ sky/optimizer.py | 9 +-- sky/resources.py | 74 ++++++++++++----------- 5 files changed, 113 insertions(+), 59 deletions(-) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index f6bdc0d62ce..66c198dd335 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -377,8 +377,15 @@ def get_project_id(cls, dryrun: bool = False) -> str: @staticmethod def check_host_accelerator_compatibility( + instance_type: str, + accelerators: Optional[Dict[str, int]]) -> None: + service_catalog.check_host_accelerator_compatibility( + instance_type, accelerators, 'gcp') + + @staticmethod + def check_accelerator_attachable_to_host( instance_type: str, accelerators: Optional[Dict[str, int]], zone: Optional[str] = None) -> None: - service_catalog.check_host_accelerator_compatibility( + service_catalog.check_accelerator_attachable_to_host( instance_type, accelerators, zone, 'gcp') diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index e7dc833c06c..ae459cdeccc 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -190,10 +190,18 @@ def get_region_zones_for_accelerators( def check_host_accelerator_compatibility(instance_type: str, accelerators: Optional[Dict[str, int]], - zone: Optional[str] = None, clouds: CloudFilter = None) -> None: """GCP only: Check if host VM type is compatible with the accelerators.""" _map_clouds_catalog(clouds, 'check_host_accelerator_compatibility', + instance_type, accelerators) + + +def check_accelerator_attachable_to_host(instance_type: str, + accelerators: Optional[Dict[str, int]], + zone: Optional[str] = None, + clouds: CloudFilter = None) -> None: + """GCP only: Check if the accelerators can be attached to the host VM.""" + _map_clouds_catalog(clouds, 'check_accelerator_attachable_to_host', instance_type, accelerators, zone) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 87401d8b374..cabdb7d0522 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -304,9 +304,12 @@ def get_region_zones_for_accelerators( def check_host_accelerator_compatibility(instance_type: str, - accelerators: Optional[Dict[str, int]], - zone: Optional[str] = None) -> None: - """Check if the instance type is compatible with the accelerators.""" + accelerators: Optional[Dict[str, int]]) -> None: + """Check if the instance type is compatible with the accelerators. + + This function ensures that TPUs and GPUs except A100 are attached to N1 machines, + and A100 are attached to A2 machines. + """ if accelerators is None: if instance_type.startswith('a2-'): # NOTE: While it is allowed to use A2 machines as CPU-only nodes, @@ -320,11 +323,17 @@ def check_host_accelerator_compatibility(instance_type: str, acc = list(accelerators.items()) assert len(acc) == 1, acc - acc_name, acc_count = acc[0] + acc_name, _ = acc[0] + + # Check if the accelerator is supported by GCP. + if not list_accelerators(gpus_only=False, name_filter=acc_name): + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesUnavailableError( + f'{acc_name} is not available in GCP. ' + 'See \'sky show-gpus --cloud gcp\'') if acc_name.startswith('tpu-'): if instance_type != 'TPU-VM' and not instance_type.startswith('n1-'): - # TODO(woosuk): Check max vcpus and memory for each TPU type. with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( 'TPU Nodes can be only used with N1 machines. ' @@ -335,18 +344,11 @@ def check_host_accelerator_compatibility(instance_type: str, # Treat A100 as a special case. if acc_name == 'A100': # A100 must be attached to A2 instance type. - a100_instance_type = _A100_INSTANCE_TYPES[acc_count] if not instance_type.startswith('a2-'): with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( f'A100 GPUs cannot be attached to {instance_type}. ' - f'Use {a100_instance_type} instead. Please refer to ' - 'https://cloud.google.com/compute/docs/gpus#a100-gpus') - if instance_type != a100_instance_type: - with ux_utils.print_exception_no_traceback(): - raise exceptions.ResourcesMismatchError( - f'A100:{acc_count} cannot be attached to {instance_type}. ' - f'Use {a100_instance_type} instead. Please refer to ' + f'Use A2 machines instead. Please refer to ' 'https://cloud.google.com/compute/docs/gpus#a100-gpus') return @@ -359,12 +361,44 @@ def check_host_accelerator_compatibility(instance_type: str, 'Use N1 instance types instead. Please refer to: ' 'https://cloud.google.com/compute/docs/machine-types#gpus') - # Check maximum vCPUs and memory. - if acc_name not in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: + +def check_accelerator_attachable_to_host(instance_type: str, + accelerators: Dict[str, int], + zone: Optional[str] = None) -> None: + """Check if the accelerators can be attached to the host. + + This function checks the max CPU count and memory of the host that + the accelerator can be attached to. + """ + acc = list(accelerators.items()) + assert len(acc) == 1, acc + acc_name, acc_count = acc[0] + + if acc_name.startswith('tpu-'): + # TODO(woosuk): Check max vcpus and memory for each TPU type. + assert instance_type == 'TPU-VM' or instance_type.startswith('n1-'), instance_type + return + + if acc_name == 'A100': + valid_counts = list(_A100_INSTANCE_TYPES.keys()) + else: + valid_counts = list(_NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name].keys()) + if acc_count not in valid_counts: with ux_utils.print_exception_no_traceback(): - raise exceptions.ResourcesUnavailableError( - f'{acc_name} is not available in GCP. ' - 'See \'sky show-gpus --cloud gcp\'') + raise exceptions.ResourcesMismatchError( + f'{acc_name}:{acc_count} is not launchable on GCP. ' + f'Valid accelerator counts are {valid_counts}.') + + if acc_name == 'A100': + a100_instance_type = _A100_INSTANCE_TYPES[acc_count] + if instance_type != a100_instance_type: + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesMismatchError( + f'A100:{acc_count} cannot be attached to {instance_type}. ' + f'Use {a100_instance_type} instead. Please refer to ' + 'https://cloud.google.com/compute/docs/gpus#a100-gpus') + + # Check maximum vCPUs and memory. max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][acc_count] if acc_name == 'K80' and acc_count == 8: if zone in ['asia-east1-a', 'us-east1-d']: diff --git a/sky/optimizer.py b/sky/optimizer.py index 11304123e77..23379504e06 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -847,10 +847,6 @@ def _fill_in_launchable_resources( 'enabled. Run `sky check` to enable access to it, ' 'or change the cloud requirement.') elif resources.is_launchable(): - if isinstance(resources.cloud, clouds.GCP): - clouds.GCP.check_host_accelerator_compatibility( - resources.instance_type, resources.accelerators, - resources.zone) launchable[resources] = [resources] else: clouds_list = [resources.cloud @@ -887,4 +883,9 @@ def _fill_in_launchable_resources( launchable[resources] = _filter_out_blocked_launchable_resources( launchable[resources], blocked_launchable_resources) + for r in launchable[resources]: + if isinstance(r.cloud, clouds.GCP): + clouds.GCP.check_accelerator_attachable_to_host( + r.instance_type, r.accelerators, r.zone) + return launchable, cloud_candidates diff --git a/sky/resources.py b/sky/resources.py index ad65c11b69a..1cebad6b1a8 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -301,43 +301,47 @@ def _try_validate_instance_type(self): def _try_validate_accelerators(self) -> None: """Validate accelerators against the instance type and region/zone.""" - if self.is_launchable() and not isinstance(self.cloud, clouds.GCP): - # GCP attaches accelerators to VMs, so no need for this check. - acc_requested = self.accelerators - if acc_requested is None: - return - acc_from_instance_type = ( - self.cloud.get_accelerators_from_instance_type( - self._instance_type)) - if not Resources(accelerators=acc_requested).less_demanding_than( - Resources(accelerators=acc_from_instance_type)): - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Infeasible resource demands found:\n' - f' Instance type requested: {self._instance_type}\n' - f' Accelerators for {self._instance_type}: ' - f'{acc_from_instance_type}\n' - f' Accelerators requested: {acc_requested}\n' - f'To fix: either only specify instance_type, or change ' - 'the accelerators field to be consistent.') - # NOTE: should not clear 'self.accelerators' even for AWS/Azure, - # because e.g., the instance may have 4 GPUs, while the task - # specifies to use 1 GPU. + acc_requested = self.accelerators + if acc_requested is None: + return - # Validate whether accelerator is available in specified region/zone. - if self.accelerators is not None: - acc, acc_count = list(self.accelerators.items())[0] - if self.region is not None or self.zone is not None: - if not self._cloud.accelerator_in_region_or_zone( - acc, acc_count, self.region, self.zone): - error_str = (f'Accelerator "{acc}" is not available in ' - '"{}" region/zone.') - if self.zone: - error_str = error_str.format(self.zone) - else: - error_str = error_str.format(self.region) + if self.is_launchable(): + if isinstance(self.cloud, clouds.GCP): + clouds.GCP.check_host_accelerator_compatibility( + self.instance_type, acc_requested) + else: + # GCP attaches accelerators to VMs, so no need for this check. + acc_from_instance_type = ( + self.cloud.get_accelerators_from_instance_type( + self._instance_type)) + if not Resources(accelerators=acc_requested).less_demanding_than( + Resources(accelerators=acc_from_instance_type)): with ux_utils.print_exception_no_traceback(): - raise ValueError(error_str) + raise ValueError( + 'Infeasible resource demands found:\n' + f' Instance type requested: {self._instance_type}\n' + f' Accelerators for {self._instance_type}: ' + f'{acc_from_instance_type}\n' + f' Accelerators requested: {acc_requested}\n' + f'To fix: either only specify instance_type, or change ' + 'the accelerators field to be consistent.') + # NOTE: should not clear 'self.accelerators' even for AWS/Azure, + # because e.g., the instance may have 4 GPUs, while the task + # specifies to use 1 GPU. + + # Validate whether accelerator is available in specified region/zone. + acc, acc_count = list(acc_requested.items())[0] + if self.region is not None or self.zone is not None: + if not self._cloud.accelerator_in_region_or_zone( + acc, acc_count, self.region, self.zone): + error_str = (f'Accelerator "{acc}" is not available in ' + '"{}" region/zone.') + if self.zone: + error_str = error_str.format(self.zone) + else: + error_str = error_str.format(self.region) + with ux_utils.print_exception_no_traceback(): + raise ValueError(error_str) def _try_validate_spot(self) -> None: if self._spot_recovery is None: From 102f0c940bab9bb09daa685d6ca7300460a9951d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 18:40:10 -0700 Subject: [PATCH 16/21] yapf --- sky/clouds/gcp.py | 3 +-- sky/clouds/service_catalog/gcp_catalog.py | 12 ++++++------ sky/resources.py | 14 ++++++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 66c198dd335..46ab4e2eb4f 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -377,8 +377,7 @@ def get_project_id(cls, dryrun: bool = False) -> str: @staticmethod def check_host_accelerator_compatibility( - instance_type: str, - accelerators: Optional[Dict[str, int]]) -> None: + instance_type: str, accelerators: Optional[Dict[str, int]]) -> None: service_catalog.check_host_accelerator_compatibility( instance_type, accelerators, 'gcp') diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index cabdb7d0522..3f2406527b8 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -303,11 +303,11 @@ def get_region_zones_for_accelerators( return common.get_region_zones(df, use_spot) -def check_host_accelerator_compatibility(instance_type: str, - accelerators: Optional[Dict[str, int]]) -> None: +def check_host_accelerator_compatibility( + instance_type: str, accelerators: Optional[Dict[str, int]]) -> None: """Check if the instance type is compatible with the accelerators. - - This function ensures that TPUs and GPUs except A100 are attached to N1 machines, + + This function ensures that TPUs and GPUs except A100 are attached to N1, and A100 are attached to A2 machines. """ if accelerators is None: @@ -366,7 +366,7 @@ def check_accelerator_attachable_to_host(instance_type: str, accelerators: Dict[str, int], zone: Optional[str] = None) -> None: """Check if the accelerators can be attached to the host. - + This function checks the max CPU count and memory of the host that the accelerator can be attached to. """ @@ -376,7 +376,7 @@ def check_accelerator_attachable_to_host(instance_type: str, if acc_name.startswith('tpu-'): # TODO(woosuk): Check max vcpus and memory for each TPU type. - assert instance_type == 'TPU-VM' or instance_type.startswith('n1-'), instance_type + assert instance_type == 'TPU-VM' or instance_type.startswith('n1-') return if acc_name == 'A100': diff --git a/sky/resources.py b/sky/resources.py index 1cebad6b1a8..49b6a97725b 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -314,17 +314,19 @@ def _try_validate_accelerators(self) -> None: acc_from_instance_type = ( self.cloud.get_accelerators_from_instance_type( self._instance_type)) - if not Resources(accelerators=acc_requested).less_demanding_than( - Resources(accelerators=acc_from_instance_type)): + if not Resources( + accelerators=acc_requested).less_demanding_than( + Resources(accelerators=acc_from_instance_type)): with ux_utils.print_exception_no_traceback(): raise ValueError( 'Infeasible resource demands found:\n' - f' Instance type requested: {self._instance_type}\n' + ' Instance type requested: ' + f'{self._instance_type}\n' f' Accelerators for {self._instance_type}: ' f'{acc_from_instance_type}\n' f' Accelerators requested: {acc_requested}\n' - f'To fix: either only specify instance_type, or change ' - 'the accelerators field to be consistent.') + f'To fix: either only specify instance_type, or ' + 'change the accelerators field to be consistent.') # NOTE: should not clear 'self.accelerators' even for AWS/Azure, # because e.g., the instance may have 4 GPUs, while the task # specifies to use 1 GPU. @@ -335,7 +337,7 @@ def _try_validate_accelerators(self) -> None: if not self._cloud.accelerator_in_region_or_zone( acc, acc_count, self.region, self.zone): error_str = (f'Accelerator "{acc}" is not available in ' - '"{}" region/zone.') + '"{}" region/zone.') if self.zone: error_str = error_str.format(self.zone) else: From b182e76cc7664d689ddc7e5325056bf8aebdb20b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 18:58:36 -0700 Subject: [PATCH 17/21] Consider accelerators == None --- sky/clouds/service_catalog/gcp_catalog.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 3f2406527b8..63a7954ac80 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -363,13 +363,16 @@ def check_host_accelerator_compatibility( def check_accelerator_attachable_to_host(instance_type: str, - accelerators: Dict[str, int], + accelerators: Optional[Dict[str, int]], zone: Optional[str] = None) -> None: """Check if the accelerators can be attached to the host. This function checks the max CPU count and memory of the host that - the accelerator can be attached to. + the accelerators can be attached to. """ + if accelerators is None: + return + acc = list(accelerators.items()) assert len(acc) == 1, acc acc_name, acc_count = acc[0] From eaa28ffb28b3f74227a648be8836292fe3b0e38b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 29 Aug 2022 18:58:47 -0700 Subject: [PATCH 18/21] Consider accelerators == None --- sky/resources.py | 50 ++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/sky/resources.py b/sky/resources.py index 49b6a97725b..8a81584b252 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -302,34 +302,34 @@ def _try_validate_instance_type(self): def _try_validate_accelerators(self) -> None: """Validate accelerators against the instance type and region/zone.""" acc_requested = self.accelerators + if (isinstance(self.cloud, clouds.GCP) and + self.instance_type is not None): + # Do this check even if acc_requested is None. + clouds.GCP.check_host_accelerator_compatibility( + self.instance_type, acc_requested) + if acc_requested is None: return - if self.is_launchable(): - if isinstance(self.cloud, clouds.GCP): - clouds.GCP.check_host_accelerator_compatibility( - self.instance_type, acc_requested) - else: - # GCP attaches accelerators to VMs, so no need for this check. - acc_from_instance_type = ( - self.cloud.get_accelerators_from_instance_type( - self._instance_type)) - if not Resources( - accelerators=acc_requested).less_demanding_than( - Resources(accelerators=acc_from_instance_type)): - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Infeasible resource demands found:\n' - ' Instance type requested: ' - f'{self._instance_type}\n' - f' Accelerators for {self._instance_type}: ' - f'{acc_from_instance_type}\n' - f' Accelerators requested: {acc_requested}\n' - f'To fix: either only specify instance_type, or ' - 'change the accelerators field to be consistent.') - # NOTE: should not clear 'self.accelerators' even for AWS/Azure, - # because e.g., the instance may have 4 GPUs, while the task - # specifies to use 1 GPU. + if self.is_launchable() and not isinstance(self.cloud, clouds.GCP): + # GCP attaches accelerators to VMs, so no need for this check. + acc_from_instance_type = ( + self.cloud.get_accelerators_from_instance_type( + self._instance_type)) + if not Resources(accelerators=acc_requested).less_demanding_than( + Resources(accelerators=acc_from_instance_type)): + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Infeasible resource demands found:\n' + f' Instance type requested: {self._instance_type}\n' + f' Accelerators for {self._instance_type}: ' + f'{acc_from_instance_type}\n' + f' Accelerators requested: {acc_requested}\n' + f'To fix: either only specify instance_type, or change ' + 'the accelerators field to be consistent.') + # NOTE: should not clear 'self.accelerators' even for AWS/Azure, + # because e.g., the instance may have 4 GPUs, while the task + # specifies to use 1 GPU. # Validate whether accelerator is available in specified region/zone. acc, acc_count = list(acc_requested.items())[0] From 309081d81e1a94a754e0930c65479ad5febf8dbe Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 30 Aug 2022 22:04:00 -0700 Subject: [PATCH 19/21] Add comments --- sky/clouds/service_catalog/__init__.py | 12 ++++++++++-- sky/clouds/service_catalog/gcp_catalog.py | 4 ++-- sky/optimizer.py | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index ae459cdeccc..722798005c6 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -191,7 +191,11 @@ def get_region_zones_for_accelerators( def check_host_accelerator_compatibility(instance_type: str, accelerators: Optional[Dict[str, int]], clouds: CloudFilter = None) -> None: - """GCP only: Check if host VM type is compatible with the accelerators.""" + """GCP only: Check if host VM type is compatible with the accelerators. + + That is, this function ensures that TPUs and GPUs except A100 are attached + to N1, and A100 GPUs are attached to A2 machines. + """ _map_clouds_catalog(clouds, 'check_host_accelerator_compatibility', instance_type, accelerators) @@ -200,7 +204,11 @@ def check_accelerator_attachable_to_host(instance_type: str, accelerators: Optional[Dict[str, int]], zone: Optional[str] = None, clouds: CloudFilter = None) -> None: - """GCP only: Check if the accelerators can be attached to the host VM.""" + """GCP only: Check if the accelerators can be attached to the host VM. + + Specifically, this function checks the max CPU count and memory of the host + that the accelerators can be attached to. + """ _map_clouds_catalog(clouds, 'check_accelerator_attachable_to_host', instance_type, accelerators, zone) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 63a7954ac80..0c55d6335e9 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -308,7 +308,7 @@ def check_host_accelerator_compatibility( """Check if the instance type is compatible with the accelerators. This function ensures that TPUs and GPUs except A100 are attached to N1, - and A100 are attached to A2 machines. + and A100 GPUs are attached to A2 machines. """ if accelerators is None: if instance_type.startswith('a2-'): @@ -390,7 +390,7 @@ def check_accelerator_attachable_to_host(instance_type: str, with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( f'{acc_name}:{acc_count} is not launchable on GCP. ' - f'Valid accelerator counts are {valid_counts}.') + f'The valid {acc_name} counts are {valid_counts}.') if acc_name == 'A100': a100_instance_type = _A100_INSTANCE_TYPES[acc_count] diff --git a/sky/optimizer.py b/sky/optimizer.py index 23379504e06..1cc9c5f1f4e 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -885,6 +885,7 @@ def _fill_in_launchable_resources( for r in launchable[resources]: if isinstance(r.cloud, clouds.GCP): + # Check if the host VM satisfies the max vCPU and memory limits. clouds.GCP.check_accelerator_attachable_to_host( r.instance_type, r.accelerators, r.zone) From 50bbd913686c75a3a1ef4ab0c651888b4ca2a5d0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 31 Aug 2022 12:27:19 -0700 Subject: [PATCH 20/21] Add comments --- sky/clouds/service_catalog/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 722798005c6..26df7a02813 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -193,8 +193,12 @@ def check_host_accelerator_compatibility(instance_type: str, clouds: CloudFilter = None) -> None: """GCP only: Check if host VM type is compatible with the accelerators. - That is, this function ensures that TPUs and GPUs except A100 are attached - to N1, and A100 GPUs are attached to A2 machines. + This function is invoked whenever a Resources object is created. + This function ensures that TPUs and GPUs (except A100) are attached to N1, + and A100 GPUs are attached to A2 machines. However, it does NOT check + the maximum vCPU count and maximum memory limits for the accelerators + because any Resources like GCP(n1-highmem-64, {'V100': 0.01}) can be valid + for sky exec/launch on an existing cluster. """ _map_clouds_catalog(clouds, 'check_host_accelerator_compatibility', instance_type, accelerators) @@ -207,7 +211,8 @@ def check_accelerator_attachable_to_host(instance_type: str, """GCP only: Check if the accelerators can be attached to the host VM. Specifically, this function checks the max CPU count and memory of the host - that the accelerators can be attached to. + that the accelerators can be attached to. It is invoked by the optimizer, + so sky exec will not execute this function. """ _map_clouds_catalog(clouds, 'check_accelerator_attachable_to_host', instance_type, accelerators, zone) From b90549b3842711314a4f7024b075add0e53cf1d3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 31 Aug 2022 13:22:52 -0700 Subject: [PATCH 21/21] Address comments --- sky/clouds/service_catalog/__init__.py | 2 +- sky/optimizer.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 26df7a02813..1b63dcac301 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -193,7 +193,7 @@ def check_host_accelerator_compatibility(instance_type: str, clouds: CloudFilter = None) -> None: """GCP only: Check if host VM type is compatible with the accelerators. - This function is invoked whenever a Resources object is created. + This function is invoked whenever a Resources object is created. This function ensures that TPUs and GPUs (except A100) are attached to N1, and A100 GPUs are attached to A2 machines. However, it does NOT check the maximum vCPU count and maximum memory limits for the accelerators diff --git a/sky/optimizer.py b/sky/optimizer.py index 1cc9c5f1f4e..34a910c05bf 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -847,6 +847,11 @@ def _fill_in_launchable_resources( 'enabled. Run `sky check` to enable access to it, ' 'or change the cloud requirement.') elif resources.is_launchable(): + if isinstance(resources.cloud, clouds.GCP): + # Check if the host VM satisfies the max vCPU and memory limits. + clouds.GCP.check_accelerator_attachable_to_host( + resources.instance_type, resources.accelerators, + resources.zone) launchable[resources] = [resources] else: clouds_list = [resources.cloud @@ -883,10 +888,4 @@ def _fill_in_launchable_resources( launchable[resources] = _filter_out_blocked_launchable_resources( launchable[resources], blocked_launchable_resources) - for r in launchable[resources]: - if isinstance(r.cloud, clouds.GCP): - # Check if the host VM satisfies the max vCPU and memory limits. - clouds.GCP.check_accelerator_attachable_to_host( - r.instance_type, r.accelerators, r.zone) - return launchable, cloud_candidates