Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add host VM - GPU compatibility checks for GCP #989

Merged
merged 26 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3f50915
Add host VM - GPU compatibility check for GCP
WoosukKwon Jul 18, 2022
4d1b142
Minor fix
WoosukKwon Jul 18, 2022
8319e23
Merge branch 'master' into gcp-host-vm
WoosukKwon Jul 18, 2022
9adb053
Fix test_spot
WoosukKwon Jul 18, 2022
a5f27e2
Fix optimizer test
WoosukKwon Jul 18, 2022
a3fd8d0
Merge branch 'master' into gcp-host-vm
WoosukKwon Jul 30, 2022
6a4b345
Fix optimizer test
WoosukKwon Jul 30, 2022
0cd3bf9
Minor bugfix + Add reference URL in error message
WoosukKwon Jul 30, 2022
e5e5f9d
Minor fix in docstring
WoosukKwon Jul 30, 2022
135fa71
Get memory size from catalog
WoosukKwon Jul 30, 2022
940827c
Add TODO
WoosukKwon Jul 30, 2022
737cb71
Merge branch 'master' into gcp-host-vm
WoosukKwon Aug 29, 2022
fc23709
Resolve merge conflicts & Address TODOs
WoosukKwon Aug 29, 2022
b8e5864
Merge branch 'master' into gcp-host-vm
WoosukKwon Aug 29, 2022
c9ad421
Move compatibility check to optimizer
WoosukKwon Aug 29, 2022
02d4159
ValueError -> ResourcesMismatchError
WoosukKwon Aug 29, 2022
974ad50
Fix TPU error msg
WoosukKwon Aug 29, 2022
5040460
Minor bugfix
WoosukKwon Aug 29, 2022
cc32bcd
Move compatibility check to resources & Add attachability check in op…
WoosukKwon Aug 30, 2022
102f0c9
yapf
WoosukKwon Aug 30, 2022
b182e76
Consider accelerators == None
WoosukKwon Aug 30, 2022
eaa28ff
Consider accelerators == None
WoosukKwon Aug 30, 2022
b2a19e4
Merge branch 'master' into gcp-host-vm
WoosukKwon Aug 31, 2022
309081d
Add comments
WoosukKwon Aug 31, 2022
50bbd91
Add comments
WoosukKwon Aug 31, 2022
b90549b
Address comments
WoosukKwon Aug 31, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,17 @@ def get_project_id(cls, dryrun: bool = False) -> str:
project_id = gcp_credentials.get('quota_project_id',
None) or gcp_credentials['project_id']
return project_id

@staticmethod
def check_host_accelerator_compatibility(
        instance_type: str, accelerators: Optional[Dict[str, int]]) -> None:
    """Check that the host VM type is compatible with the accelerators.

    Forwards to the service catalog function of the same name with the
    cloud filter fixed to 'gcp'. Returns nothing; any exception raised by
    the catalog check propagates to the caller.

    Args:
        instance_type: Name of the host VM instance type.
        accelerators: Accelerator name -> count mapping, or None when no
            accelerator is requested.
    """
    service_catalog.check_host_accelerator_compatibility(
        instance_type=instance_type,
        accelerators=accelerators,
        clouds='gcp')

@staticmethod
def check_accelerator_attachable_to_host(
        instance_type: str,
        accelerators: Optional[Dict[str, int]],
        zone: Optional[str] = None) -> None:
    """Check that the accelerators can be attached to the host VM.

    Forwards to the service catalog function of the same name with the
    cloud filter fixed to 'gcp'. Returns nothing; any exception raised by
    the catalog check propagates to the caller.

    Args:
        instance_type: Name of the host VM instance type.
        accelerators: Accelerator name -> count mapping, or None when no
            accelerator is requested.
        zone: Optional zone passed through to the catalog check.
    """
    service_catalog.check_accelerator_attachable_to_host(
        instance_type=instance_type,
        accelerators=accelerators,
        zone=zone,
        clouds='gcp')
25 changes: 25 additions & 0 deletions sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,31 @@ def get_region_zones_for_accelerators(
acc_name, acc_count, use_spot)


def check_host_accelerator_compatibility(instance_type: str,
accelerators: Optional[Dict[str, int]],
clouds: CloudFilter = None) -> None:
"""GCP only: Check if host VM type is compatible with the accelerators.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add #989 (comment) to this func and the next func (L207+)? It's a great explanation of why these two funcs are structured this way.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.


That is, this function ensures that TPUs and GPUs except A100 are attached
to N1, and A100 GPUs are attached to A2 machines.
"""
_map_clouds_catalog(clouds, 'check_host_accelerator_compatibility',
instance_type, accelerators)


def check_accelerator_attachable_to_host(instance_type: str,
                                         accelerators: Optional[Dict[str, int]],
                                         zone: Optional[str] = None,
                                         clouds: CloudFilter = None) -> None:
    """GCP only: Verify that the host VM can take the requested accelerators.

    The check is about the maximum CPU count and memory that a host may have
    for the accelerators to be attachable to it. The arguments are handed to
    `_map_clouds_catalog` together with the `clouds` filter; nothing is
    returned.
    """
    catalog_method = 'check_accelerator_attachable_to_host'
    forwarded_args = (instance_type, accelerators, zone)
    _map_clouds_catalog(clouds, catalog_method, *forwarded_args)


def get_common_gpus() -> List[str]:
"""Returns a list of commonly used GPU names."""
return [
Expand Down
161 changes: 161 additions & 0 deletions sky/clouds/service_catalog/gcp_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import pandas as pd

from sky import exceptions
from sky.clouds.service_catalog import common
from sky.utils import ux_utils

if typing.TYPE_CHECKING:
from sky.clouds import cloud
Expand Down Expand Up @@ -79,6 +81,38 @@
},
}

# Host VM limits per GPU type and GPU count:
#   acc_name -> num_gpus -> (max_num_cpus, max_memory_gb)
# The keys of each inner dict are the GPU counts accepted for that GPU type.
# A100 has no entry in this table.
# Refer to: https://cloud.google.com/compute/docs/gpus
_NUM_ACC_TO_MAX_CPU_AND_MEMORY = {
    'K80': {
        1: (8, 52),
        2: (16, 104),
        4: (32, 208),
        # Default limits; in asia-east1-a and us-east1-d the memory limit is
        # raised to 416 by check_accelerator_attachable_to_host.
        8: (64, 208),
    },
    'V100': {
        1: (12, 78),
        2: (24, 156),
        4: (48, 312),
        8: (96, 624),
    },
    'T4': {
        1: (48, 312),
        2: (48, 312),
        4: (96, 624),
    },
    'P4': {
        1: (24, 156),
        2: (48, 312),
        4: (96, 624),
    },
    'P100': {
        1: (16, 104),
        2: (32, 208),
        # Default limits; in us-east1-c, europe-west1-d and europe-west1-b
        # check_accelerator_attachable_to_host lowers them to (64, 208).
        4: (96, 624),
    }
}


def _is_power_of_two(x: int) -> bool:
"""Returns true if x is a power of two."""
Expand Down Expand Up @@ -267,3 +301,130 @@ def get_region_zones_for_accelerators(
"""Returns a list of regions for a given accelerators."""
df = _get_accelerator(_df, accelerator, count, region=None)
return common.get_region_zones(df, use_spot)


def check_host_accelerator_compatibility(
        instance_type: str, accelerators: Optional[Dict[str, int]]) -> None:
    """Validate the pairing of a host instance type and its accelerators.

    Rules enforced:
      * No accelerators: the host must not be an A2 ('a2-') machine.
      * TPUs ('tpu-' prefix): the host must be 'TPU-VM' or an N1 machine.
      * A100: the host must be an A2 machine.
      * Any other GPU: the host must be an N1 machine.

    Args:
        instance_type: Name of the host VM instance type.
        accelerators: Single-entry accelerator name -> count mapping, or None
            when no accelerator is requested.

    Raises:
        exceptions.ResourcesUnavailableError: If `list_accelerators` returns
            nothing for the accelerator name.
        exceptions.ResourcesMismatchError: If one of the rules above is
            violated.
    """
    if accelerators is None:
        if instance_type.startswith('a2-'):
            # NOTE: While it is allowed to use A2 machines as CPU-only nodes,
            # we exclude this case as it is uncommon and undesirable.
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ResourcesMismatchError(
                    'A2 instance types should be used with A100 GPUs. '
                    'Either use other instance types or specify the '
                    'accelerators as A100.')
        return

    requested = list(accelerators.items())
    assert len(requested) == 1, requested
    acc_name = requested[0][0]

    # The accelerator must be one that GCP offers at all.
    offered = list_accelerators(gpus_only=False, name_filter=acc_name)
    if not offered:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesUnavailableError(
                f'{acc_name} is not available in GCP. '
                'See \'sky show-gpus --cloud gcp\'')

    # Decide, per accelerator family, whether the host is acceptable and
    # which message to show if it is not.
    if acc_name.startswith('tpu-'):
        host_ok = (instance_type == 'TPU-VM' or
                   instance_type.startswith('n1-'))
        mismatch_msg = (
            'TPU Nodes can be only used with N1 machines. '
            'Please refer to: '
            'https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines')  # pylint: disable=line-too-long
    elif acc_name == 'A100':
        # A100 is special: it only comes with A2 instance types.
        host_ok = instance_type.startswith('a2-')
        mismatch_msg = (
            f'A100 GPUs cannot be attached to {instance_type}. '
            f'Use A2 machines instead. Please refer to '
            'https://cloud.google.com/compute/docs/gpus#a100-gpus')
    else:
        # All remaining GPUs attach to N1 machines only.
        # Refer to: https://cloud.google.com/compute/docs/machine-types#gpus
        host_ok = instance_type.startswith('n1-')
        mismatch_msg = (
            f'{acc_name} GPUs cannot be attached to {instance_type}. '
            'Use N1 instance types instead. Please refer to: '
            'https://cloud.google.com/compute/docs/machine-types#gpus')

    if not host_ok:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesMismatchError(mismatch_msg)


def check_accelerator_attachable_to_host(instance_type: str,
                                         accelerators: Optional[Dict[str, int]],
                                         zone: Optional[str] = None) -> None:
    """Check if the accelerators can be attached to the host.

    For A100 GPUs, the host must be exactly the instance type that
    `_A100_INSTANCE_TYPES` maps the requested count to. For the other GPUs,
    the host's vCPU count and memory (read from the catalog dataframe) must
    not exceed the limits in `_NUM_ACC_TO_MAX_CPU_AND_MEMORY` for the
    requested count. TPUs are not checked yet.

    Args:
        instance_type: Name of the host VM instance type.
        accelerators: Single-entry accelerator name -> count mapping, or None
            when no accelerator is requested (nothing is checked then).
        zone: Optional zone; used only for the zone-specific limits of K80:8
            and P100:4.

    Raises:
        exceptions.ResourcesMismatchError: If the accelerator count is not a
            valid count for the accelerator, if an A100 count is paired with
            the wrong instance type, or if the host exceeds the vCPU or
            memory limit.
    """
    if accelerators is None:
        return

    acc = list(accelerators.items())
    assert len(acc) == 1, acc
    acc_name, acc_count = acc[0]

    if acc_name.startswith('tpu-'):
        # TODO(woosuk): Check max vcpus and memory for each TPU type.
        assert instance_type == 'TPU-VM' or instance_type.startswith('n1-')
        return

    if acc_name == 'A100':
        valid_counts = list(_A100_INSTANCE_TYPES)
    else:
        valid_counts = list(_NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name])
    if acc_count not in valid_counts:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesMismatchError(
                f'{acc_name}:{acc_count} is not launchable on GCP. '
                f'The valid {acc_name} counts are {valid_counts}.')

    if acc_name == 'A100':
        a100_instance_type = _A100_INSTANCE_TYPES[acc_count]
        if instance_type != a100_instance_type:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ResourcesMismatchError(
                    f'A100:{acc_count} cannot be attached to {instance_type}. '
                    f'Use {a100_instance_type} instead. Please refer to '
                    'https://cloud.google.com/compute/docs/gpus#a100-gpus')
        # A100 has no entry in _NUM_ACC_TO_MAX_CPU_AND_MEMORY: the instance
        # type match above is the whole check. Without this return, a valid
        # A100 request would hit a KeyError in the lookup below.
        return

    # Check maximum vCPUs and memory.
    max_cpus, max_memory = _NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name][acc_count]
    # Some zones have limits that differ from the table defaults.
    if acc_name == 'K80' and acc_count == 8:
        if zone in ('asia-east1-a', 'us-east1-d'):
            max_memory = 416
    elif acc_name == 'P100' and acc_count == 4:
        if zone in ('us-east1-c', 'europe-west1-d', 'europe-west1-b'):
            max_cpus = 64
            max_memory = 208

    # vCPU counts and memory sizes of N1 machines.
    df = _df[_df['InstanceType'] == instance_type]
    num_cpus = df['vCPUs'].iloc[0]
    memory = df['MemoryGiB'].iloc[0]

    if num_cpus > max_cpus:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesMismatchError(
                f'{acc_name}:{acc_count} cannot be attached to '
                f'{instance_type}. The maximum number of vCPUs is {max_cpus}. '
                'Please refer to: https://cloud.google.com/compute/docs/gpus')
    if memory > max_memory:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesMismatchError(
                f'{acc_name}:{acc_count} cannot be attached to '
                f'{instance_type}. The maximum CPU memory is {max_memory} GB. '
                'Please refer to: https://cloud.google.com/compute/docs/gpus')
6 changes: 6 additions & 0 deletions sky/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,4 +883,10 @@ def _fill_in_launchable_resources(
launchable[resources] = _filter_out_blocked_launchable_resources(
launchable[resources], blocked_launchable_resources)

for r in launchable[resources]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: why move it to here, rather than after L849? Was thinking checking resources in that loop makes more sense, as it represents a validation of the user-requested resources. Here, it may be possible that launchable[resources] has more than 1 "expanded" resource, and throwing an error on these may be unexpected?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by the "expanded" resources? I thought this check should be applied to every case, as the max cpu and memory limits must be respected to launch an instance on GCP.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant that here launchable[resources] may have more than 1 element, - can some of them pass the check, while some fail? In these cases it may make sense to remove the candidates that fail rather than raising an error to the whole program.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. That makes sense. I've rolled back the change.

if isinstance(r.cloud, clouds.GCP):
# Check if the host VM satisfies the max vCPU and memory limits.
clouds.GCP.check_accelerator_attachable_to_host(
r.instance_type, r.accelerators, r.zone)

return launchable, cloud_candidates
38 changes: 22 additions & 16 deletions sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,11 +301,18 @@ def _try_validate_instance_type(self):

def _try_validate_accelerators(self) -> None:
"""Validate accelerators against the instance type and region/zone."""
acc_requested = self.accelerators
if (isinstance(self.cloud, clouds.GCP) and
self.instance_type is not None):
# Do this check even if acc_requested is None.
clouds.GCP.check_host_accelerator_compatibility(
self.instance_type, acc_requested)

if acc_requested is None:
return

if self.is_launchable() and not isinstance(self.cloud, clouds.GCP):
# GCP attaches accelerators to VMs, so no need for this check.
acc_requested = self.accelerators
if acc_requested is None:
return
acc_from_instance_type = (
self.cloud.get_accelerators_from_instance_type(
self._instance_type))
Expand All @@ -325,19 +332,18 @@ def _try_validate_accelerators(self) -> None:
# specifies to use 1 GPU.

# Validate whether accelerator is available in specified region/zone.
if self.accelerators is not None:
acc, acc_count = list(self.accelerators.items())[0]
if self.region is not None or self.zone is not None:
if not self._cloud.accelerator_in_region_or_zone(
acc, acc_count, self.region, self.zone):
error_str = (f'Accelerator "{acc}" is not available in '
'"{}" region/zone.')
if self.zone:
error_str = error_str.format(self.zone)
else:
error_str = error_str.format(self.region)
with ux_utils.print_exception_no_traceback():
raise ValueError(error_str)
acc, acc_count = list(acc_requested.items())[0]
if self.region is not None or self.zone is not None:
if not self._cloud.accelerator_in_region_or_zone(
acc, acc_count, self.region, self.zone):
error_str = (f'Accelerator "{acc}" is not available in '
'"{}" region/zone.')
if self.zone:
error_str = error_str.format(self.zone)
else:
error_str = error_str.format(self.region)
with ux_utils.print_exception_no_traceback():
raise ValueError(error_str)

def _try_validate_spot(self) -> None:
if self._spot_recovery is None:
Expand Down
21 changes: 12 additions & 9 deletions tests/test_optimizer_random_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,20 @@ def generate_random_dag(
num_candidates = random.randint(1, max_num_candidate_resources)
candidate_instance_types = random.choices(ALL_INSTANCE_TYPES,
k=num_candidates)
op.set_resources({
sky.Resources(

candidate_resources = set()
for candidate in candidate_instance_types:
instance_type = candidate.instance_type
if pd.isna(instance_type):
instance_type = GCP_DEFAULT_INSTANCE_TYPE
resources = sky.Resources(
cloud=CLOUDS[candidate.cloud],
instance_type=candidate.instance_type \
if not pd.isna(candidate.instance_type) \
else GCP_DEFAULT_INSTANCE_TYPE,
instance_type=instance_type,
accelerators={
candidate.accelerator_name: candidate.accelerator_count},
)
for candidate in candidate_instance_types
})
candidate.accelerator_name: candidate.accelerator_count
})
candidate_resources.add(resources)
op.set_resources(candidate_resources)
return dag


Expand Down
2 changes: 1 addition & 1 deletion tests/test_spot.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _mock_cluster_state(self, _mock_db_conn):
head_ip='1.1.1.2',
launched_nodes=1,
launched_resources=sky.Resources(sky.GCP(),
instance_type='n1-highmem-8',
instance_type='a2-highgpu-4g',
accelerators={'A100': 4},
region='us-west1'),
)
Expand Down