From c54ef6cacb50dae5b20dc170fef5a18cd14f82be Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 18:11:05 -0700 Subject: [PATCH 01/36] Support for autodown --- sky/backends/cloud_vm_ray_backend.py | 3 +- sky/cli.py | 53 ++++++++++++++++++++++++---- sky/core.py | 17 +++++---- sky/execution.py | 4 +-- sky/skylet/autostop_lib.py | 15 +++++--- sky/skylet/events.py | 9 +++-- 6 files changed, 76 insertions(+), 25 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4043051196c..feff4b74c6b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2608,10 +2608,11 @@ def post_teardown_cleanup(self, def set_autostop(self, handle: ResourceHandle, idle_minutes_to_autostop: Optional[int], + teardown: bool = True, stream_logs: bool = True) -> None: if idle_minutes_to_autostop is not None: code = autostop_lib.AutostopCodeGen.set_autostop( - idle_minutes_to_autostop, self.NAME) + idle_minutes_to_autostop, self.NAME, teardown) returncode, _, stderr = self.run_on_head(handle, code, require_outputs=True, diff --git a/sky/cli.py b/sky/cli.py index 0f36ad71cf2..62a9e0c0a9c 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -604,6 +604,7 @@ def _launch_with_confirm( detach_run: bool, no_confirm: bool = False, idle_minutes_to_autostop: Optional[int] = None, + teardown: bool = False, retry_until_up: bool = False, no_setup: bool = False, node_type: Optional[str] = None, @@ -654,6 +655,7 @@ def _launch_with_confirm( detach_run=detach_run, backend=backend, idle_minutes_to_autostop=idle_minutes_to_autostop, + teardown=teardown, retry_until_up=retry_until_up, no_setup=no_setup, ) @@ -992,6 +994,14 @@ def cli(): 'Setting this flag is equivalent to ' 'running ``sky launch -d ...`` and then ``sky autostop -i ``' '. If not set, the cluster will not be auto-stopped.')) +@click.option( + '--idle-minutes-to-autodown', + '-I', + default=None, + type=int, + required=False, + help=('Same as --idle-minutes-to-autostop, but tears down the cluster.'), +) @click.option( '--retry-until-up', '-r', @@ -1033,6 +1043,7 @@ def launch( env: List[Dict[str, str]], disk_size: Optional[int], idle_minutes_to_autostop: Optional[int], + idle_minutes_to_autodown: Optional[int], retry_until_up: bool, yes: bool, no_setup: bool, @@ -1045,6 +1056,14 @@ def launch( In both cases, the commands are run under the task's workdir (if specified) and they undergo job queue scheduling. """ + autodown = False + if idle_minutes_to_autostop is not None and idle_minutes_to_autodown is not None: + raise click.UsageError( + 'Only one of --idle-minutes-to-autostop and --idle-minutes-to-autodown should be specified. ' + f'autostop: {idle_minutes_to_autostop}, autodown: {idle_minutes_to_autodown}') + if idle_minutes_to_autodown is not None: + idle_minutes_to_autostop = idle_minutes_to_autodown + autodown = True backend_utils.check_cluster_name_not_reserved( cluster, operation_str='Launching task on it') if backend_name is None: @@ -1083,6 +1102,7 @@ def launch( detach_run=detach_run, no_confirm=yes, idle_minutes_to_autostop=idle_minutes_to_autostop, + teardown=autodown, retry_until_up=retry_until_up, no_setup=no_setup, is_local_cloud=onprem_utils.check_if_local_cloud(cluster)) @@ -1499,6 +1519,11 @@ def stop( is_flag=True, required=False, help='Cancel the auto-stopping.') +@click.option('--down', + default=False, + is_flag=True, + required=False, + help='Tear down the cluster instead of stopping it, when auto-stopping.') @click.option('--yes', '-y', is_flag=True, @@ -1512,6 +1537,7 @@ def autostop( idle_minutes: Optional[int], cancel: bool, # pylint: disable=redefined-outer-name yes: bool, + down: bool, ): """Schedule or cancel auto-stopping for cluster(s). @@ -1548,7 +1574,7 @@ def autostop( idle_minutes = 5 _terminate_or_stop_clusters(clusters, apply_to_all=all, - terminate=False, + terminate=down, no_confirm=yes, idle_minutes_to_autostop=idle_minutes) @@ -1584,6 +1610,14 @@ def autostop( 'Setting this flag is equivalent to ' 'running ``sky launch -d ...`` and then ``sky autostop -i ``' '. If not set, the cluster will not be auto-stopped.')) +@click.option( + '--idle-minutes-to-autodown', + '-I', + default=None, + type=int, + required=False, + help=('Same as --idle-minutes-to-autostop, but tears down the cluster.'), +) @click.option( '--retry-until-up', '-r', @@ -1595,7 +1629,7 @@ def autostop( @usage_lib.entrypoint # pylint: disable=redefined-builtin def start(clusters: Tuple[str], all: bool, yes: bool, - idle_minutes_to_autostop: int, retry_until_up: bool): + idle_minutes_to_autostop: Optional[int], idle_minutes_to_autodown: Optional[int], retry_until_up: bool): """Restart cluster(s). If a cluster is previously stopped (status is STOPPED) or failed in @@ -1623,6 +1657,12 @@ def start(clusters: Tuple[str], all: bool, yes: bool, sky start -a """ + if idle_minutes_to_autostop is not None and idle_minutes_to_autodown is not None: + raise click.UsageError( + 'Only one of --idle-minutes-to-autostop and --idle-minutes-to-autodown should be specified. ' + f'autostop: {idle_minutes_to_autostop}, autodown: {idle_minutes_to_autodown}') + autodown = idle_minutes_to_autodown is not None + to_start = [] if not clusters and not all: @@ -1711,7 +1751,7 @@ def start(clusters: Tuple[str], all: bool, yes: bool, for name in to_start: try: - core.start(name, idle_minutes_to_autostop, retry_until_up) + core.start(name, idle_minutes_to_autostop, retry_until_up, autodown=autodown) except exceptions.NotSupportedError as e: click.echo(str(e)) click.secho(f'Cluster {name} started.', fg='green') @@ -1796,8 +1836,6 @@ def _terminate_or_stop_clusters( name is explicitly and uniquely specified (not via glob) and purge is set to True. """ - assert idle_minutes_to_autostop is None or not terminate, ( - idle_minutes_to_autostop, terminate) command = 'down' if terminate else 'stop' if not names and apply_to_all is None: raise click.UsageError( @@ -1807,7 +1845,8 @@ def _terminate_or_stop_clusters( operation = 'Terminating' if terminate else 'Stopping' if idle_minutes_to_autostop is not None: verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - operation = f'{verb} auto-stop on' + down_str = f' (tear down)' if terminate else '' + operation = f'{verb} auto-stop{down_str} on' if len(names) > 0: reserved_clusters = [ @@ -1902,7 +1941,7 @@ def _terminate_or_stop(name: str): success_progress = False if idle_minutes_to_autostop is not None: try: - core.autostop(name, idle_minutes_to_autostop) + core.autostop(name, idle_minutes_to_autostop, terminate) except (exceptions.NotSupportedError, exceptions.ClusterNotUpError) as e: message = str(e) diff --git a/sky/core.py b/sky/core.py index 87d2a9846ac..9b1cb4e4bce 100644 --- a/sky/core.py +++ b/sky/core.py @@ -53,7 +53,8 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: def _start(cluster_name: str, idle_minutes_to_autostop: Optional[int] = None, - retry_until_up: bool = False) -> backends.Backend.ResourceHandle: + retry_until_up: bool = False, + autodown: bool=False) -> backends.Backend.ResourceHandle: cluster_status, handle = backend_utils.refresh_cluster_status_handle( cluster_name) @@ -79,13 +80,14 @@ def _start(cluster_name: str, cluster_name=cluster_name, retry_until_up=retry_until_up) if idle_minutes_to_autostop is not None: - backend.set_autostop(handle, idle_minutes_to_autostop) + backend.set_autostop(handle, idle_minutes_to_autostop, teardown=autodown) return handle def start(cluster_name: str, idle_minutes_to_autostop: Optional[int] = None, - retry_until_up: bool = False): + retry_until_up: bool = False, + autodown: bool = False): """Start the cluster. Please refer to the sky.cli.start for the document. @@ -93,7 +95,7 @@ def start(cluster_name: str, Raises: sky.exceptions.NotSupportedError: the cluster is not supported. """ - _start(cluster_name, idle_minutes_to_autostop, retry_until_up) + _start(cluster_name, idle_minutes_to_autostop, retry_until_up, autodown) def stop(cluster_name: str, purge: bool = False): @@ -156,7 +158,7 @@ def down(cluster_name: str, purge: bool = False): backend.teardown(handle, terminate=True, purge=purge) -def autostop(cluster_name: str, idle_minutes_to_autostop: int): +def autostop(cluster_name: str, idle_minutes_to_autostop: int, terminate: bool=False): """Set the autostop time of the cluster. Please refer to the sky.cli.autostop for the document. @@ -170,7 +172,8 @@ def autostop(cluster_name: str, idle_minutes_to_autostop: int): sky.exceptions.ClusterNotUpError: the cluster is not UP. """ verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - operation = f'{verb} auto-stop on' + down_str = f' (tear down)' if terminate else '' + operation = f'{verb} auto-stop{down_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' @@ -202,7 +205,7 @@ def autostop(cluster_name: str, idle_minutes_to_autostop: int): f'{colorama.Style.RESET_ALL}' '\n Auto-stop can only be set/unset for ' f'{global_user_state.ClusterStatus.UP.value} clusters.') - backend.set_autostop(handle, idle_minutes_to_autostop) + backend.set_autostop(handle, idle_minutes_to_autostop, terminate) # ================== diff --git a/sky/execution.py b/sky/execution.py index da60737feae..d9eac829b7f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -214,7 +214,7 @@ def _execute( if stages is None or Stage.PRE_EXEC in stages: if idle_minutes_to_autostop is not None: - backend.set_autostop(handle, idle_minutes_to_autostop) + backend.set_autostop(handle, idle_minutes_to_autostop, teardown=teardown) if stages is None or Stage.EXEC in stages: try: @@ -225,7 +225,7 @@ def _execute( backend.post_execute(handle, teardown) if stages is None or Stage.TEARDOWN in stages: - if teardown: + if teardown and idle_minutes_to_autostop is None: backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: diff --git a/sky/skylet/autostop_lib.py b/sky/skylet/autostop_lib.py index 73c74f3f510..8c9f9453098 100644 --- a/sky/skylet/autostop_lib.py +++ b/sky/skylet/autostop_lib.py @@ -12,12 +12,17 @@ class AutostopConfig: def __init__(self, autostop_idle_minutes: int, boot_time: int, - backend: Optional[str]): + backend: Optional[str], teardown: bool = False): assert autostop_idle_minutes < 0 or backend is not None, ( autostop_idle_minutes, backend) self.autostop_idle_minutes = autostop_idle_minutes self.boot_time = boot_time self.backend = backend + self.teardown = teardown + + def __set_state__(self, state: dict): + state.setdefault('teardown', False) + self.__dict__.update(state) def get_autostop_config() -> Optional[AutostopConfig]: @@ -27,9 +32,9 @@ def get_autostop_config() -> Optional[AutostopConfig]: return pickle.loads(config_str) -def set_autostop(idle_minutes: int, backend: Optional[str]) -> None: +def set_autostop(idle_minutes: int, backend: Optional[str], teardown: bool) -> None: boot_time = psutil.boot_time() - autostop_config = AutostopConfig(idle_minutes, boot_time, backend) + autostop_config = AutostopConfig(idle_minutes, boot_time, backend, teardown) configs.set_config(AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config)) @@ -43,9 +48,9 @@ class AutostopCodeGen: _PREFIX = ['from sky.skylet import autostop_lib'] @classmethod - def set_autostop(cls, idle_minutes: int, backend: str) -> str: + def set_autostop(cls, idle_minutes: int, backend: str, teardown: bool) -> str: code = [ - f'autostop_lib.set_autostop({idle_minutes}, {backend!r})', + f'autostop_lib.set_autostop({idle_minutes}, {backend!r}, {teardown})', ] return cls._build(code) diff --git a/sky/skylet/events.py b/sky/skylet/events.py index 7f5fbaba17a..e8a382d8ee8 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -119,7 +119,7 @@ def _run(self): def _stop_cluster(self, autostop_config): if (autostop_config.backend == cloud_vm_ray_backend.CloudVmRayBackend.NAME): - self._replace_yaml_for_stopping(self.ray_yaml_path) + self._replace_yaml_for_stopping(self.ray_yaml_path, autostop_config.teardown) # `ray up` is required to reset the upscaling speed and min/max # workers. Otherwise, `ray down --workers-only` will continuously # scale down and up. @@ -135,13 +135,16 @@ def _stop_cluster(self, autostop_config): else: raise NotImplementedError - def _replace_yaml_for_stopping(self, yaml_path: str): + def _replace_yaml_for_stopping(self, yaml_path: str, teardown: bool): with open(yaml_path, 'r') as f: yaml_str = f.read() # Update the number of workers to 0. yaml_str = self._NUM_WORKER_PATTERN.sub(r'\g<1>_workers: 0', yaml_str) yaml_str = self._UPSCALING_PATTERN.sub(r'upscaling_speed: 0', yaml_str) - yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: true', yaml_str) + if teardown: + yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: false', yaml_str) + else: + yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: true', yaml_str) config = yaml.safe_load(yaml_str) # Set the private key with the existed key on the remote instance. config['auth']['ssh_private_key'] = '~/ray_bootstrap_key.pem' From 10f41e6e8af2547d943b32b0aaa1143d0c9664f1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 20:51:41 -0700 Subject: [PATCH 02/36] Change API to terminate --- sky/backends/cloud_vm_ray_backend.py | 4 +- sky/cli.py | 69 +++++++++++++--------------- sky/core.py | 16 ++++--- sky/execution.py | 24 +++++----- sky/skylet/autostop_lib.py | 24 ++++++---- sky/skylet/events.py | 9 ++-- 6 files changed, 78 insertions(+), 68 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index feff4b74c6b..2c3caaf0765 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2608,11 +2608,11 @@ def post_teardown_cleanup(self, def set_autostop(self, handle: ResourceHandle, idle_minutes_to_autostop: Optional[int], - teardown: bool = True, + terminate: bool = True, stream_logs: bool = True) -> None: if idle_minutes_to_autostop is not None: code = autostop_lib.AutostopCodeGen.set_autostop( - idle_minutes_to_autostop, self.NAME, teardown) + idle_minutes_to_autostop, self.NAME, terminate) returncode, _, stderr = self.run_on_head(handle, code, require_outputs=True, diff --git a/sky/cli.py b/sky/cli.py index 62a9e0c0a9c..86541b5659d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -604,7 +604,7 @@ def _launch_with_confirm( detach_run: bool, no_confirm: bool = False, idle_minutes_to_autostop: Optional[int] = None, - teardown: bool = False, + terminate: bool = False, retry_until_up: bool = False, no_setup: bool = False, node_type: Optional[str] = None, @@ -655,7 +655,7 @@ def _launch_with_confirm( detach_run=detach_run, backend=backend, idle_minutes_to_autostop=idle_minutes_to_autostop, - teardown=teardown, + terminate=terminate, retry_until_up=retry_until_up, no_setup=no_setup, ) @@ -995,12 +995,13 @@ def cli(): 'running ``sky launch -d ...`` and then ``sky autostop -i ``' '. If not set, the cluster will not be auto-stopped.')) @click.option( - '--idle-minutes-to-autodown', - '-I', - default=None, - type=int, + '--terminate', + default=False, + type=bool, required=False, - help=('Same as --idle-minutes-to-autostop, but tears down the cluster.'), + help=( + 'Terminate the cluster after execution. If --idle-minutes-to-autostop ' + 'is set, the cluster will be torn down after the idle time.'), ) @click.option( '--retry-until-up', @@ -1043,7 +1044,7 @@ def launch( env: List[Dict[str, str]], disk_size: Optional[int], idle_minutes_to_autostop: Optional[int], - idle_minutes_to_autodown: Optional[int], + terminate: bool, retry_until_up: bool, yes: bool, no_setup: bool, @@ -1056,14 +1057,6 @@ def launch( In both cases, the commands are run under the task's workdir (if specified) and they undergo job queue scheduling. """ - autodown = False - if idle_minutes_to_autostop is not None and idle_minutes_to_autodown is not None: - raise click.UsageError( - 'Only one of --idle-minutes-to-autostop and --idle-minutes-to-autodown should be specified. ' - f'autostop: {idle_minutes_to_autostop}, autodown: {idle_minutes_to_autodown}') - if idle_minutes_to_autodown is not None: - idle_minutes_to_autostop = idle_minutes_to_autodown - autodown = True backend_utils.check_cluster_name_not_reserved( cluster, operation_str='Launching task on it') if backend_name is None: @@ -1102,7 +1095,7 @@ def launch( detach_run=detach_run, no_confirm=yes, idle_minutes_to_autostop=idle_minutes_to_autostop, - teardown=autodown, + terminate=terminate, retry_until_up=retry_until_up, no_setup=no_setup, is_local_cloud=onprem_utils.check_if_local_cloud(cluster)) @@ -1519,11 +1512,12 @@ def stop( is_flag=True, required=False, help='Cancel the auto-stopping.') -@click.option('--down', - default=False, - is_flag=True, - required=False, - help='Tear down the cluster instead of stopping it, when auto-stopping.') +@click.option( + '--terminate', + default=False, + is_flag=True, + required=False, + help='Terminate the cluster instead of stopping it, when auto-stopping.') @click.option('--yes', '-y', is_flag=True, @@ -1536,8 +1530,8 @@ def autostop( all: Optional[bool], # pylint: disable=redefined-builtin idle_minutes: Optional[int], cancel: bool, # pylint: disable=redefined-outer-name + terminate: bool, yes: bool, - down: bool, ): """Schedule or cancel auto-stopping for cluster(s). @@ -1574,7 +1568,7 @@ def autostop( idle_minutes = 5 _terminate_or_stop_clusters(clusters, apply_to_all=all, - terminate=down, + terminate=terminate, no_confirm=yes, idle_minutes_to_autostop=idle_minutes) @@ -1611,12 +1605,13 @@ def autostop( 'running ``sky launch -d ...`` and then ``sky autostop -i ``' '. If not set, the cluster will not be auto-stopped.')) @click.option( - '--idle-minutes-to-autodown', - '-I', - default=None, - type=int, + '--terminate', + default=False, + type=bool, required=False, - help=('Same as --idle-minutes-to-autostop, but tears down the cluster.'), + help=( + 'Terminate the cluster after execution. If --idle-minutes-to-autostop ' + 'is set, the cluster will be torn down after the idle time.'), ) @click.option( '--retry-until-up', @@ -1629,7 +1624,8 @@ def autostop( @usage_lib.entrypoint # pylint: disable=redefined-builtin def start(clusters: Tuple[str], all: bool, yes: bool, - idle_minutes_to_autostop: Optional[int], idle_minutes_to_autodown: Optional[int], retry_until_up: bool): + idle_minutes_to_autostop: Optional[int], terminate: bool, + retry_until_up: bool): """Restart cluster(s). If a cluster is previously stopped (status is STOPPED) or failed in @@ -1657,12 +1653,6 @@ def start(clusters: Tuple[str], all: bool, yes: bool, sky start -a """ - if idle_minutes_to_autostop is not None and idle_minutes_to_autodown is not None: - raise click.UsageError( - 'Only one of --idle-minutes-to-autostop and --idle-minutes-to-autodown should be specified. ' - f'autostop: {idle_minutes_to_autostop}, autodown: {idle_minutes_to_autodown}') - autodown = idle_minutes_to_autodown is not None - to_start = [] if not clusters and not all: @@ -1751,7 +1741,10 @@ def start(clusters: Tuple[str], all: bool, yes: bool, for name in to_start: try: - core.start(name, idle_minutes_to_autostop, retry_until_up, autodown=autodown) + core.start(name, + idle_minutes_to_autostop, + retry_until_up, + terminate=terminate) except exceptions.NotSupportedError as e: click.echo(str(e)) click.secho(f'Cluster {name} started.', fg='green') @@ -1845,7 +1838,7 @@ def _terminate_or_stop_clusters( operation = 'Terminating' if terminate else 'Stopping' if idle_minutes_to_autostop is not None: verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - down_str = f' (tear down)' if terminate else '' + down_str = ' (terminate)' if terminate else '' operation = f'{verb} auto-stop{down_str} on' if len(names) > 0: diff --git a/sky/core.py b/sky/core.py index 9b1cb4e4bce..00774983c46 100644 --- a/sky/core.py +++ b/sky/core.py @@ -54,7 +54,7 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: def _start(cluster_name: str, idle_minutes_to_autostop: Optional[int] = None, retry_until_up: bool = False, - autodown: bool=False) -> backends.Backend.ResourceHandle: + terminate: bool = False) -> backends.Backend.ResourceHandle: cluster_status, handle = backend_utils.refresh_cluster_status_handle( cluster_name) @@ -80,14 +80,16 @@ def _start(cluster_name: str, cluster_name=cluster_name, retry_until_up=retry_until_up) if idle_minutes_to_autostop is not None: - backend.set_autostop(handle, idle_minutes_to_autostop, teardown=autodown) + backend.set_autostop(handle, + idle_minutes_to_autostop, + terminate=terminate) return handle def start(cluster_name: str, idle_minutes_to_autostop: Optional[int] = None, retry_until_up: bool = False, - autodown: bool = False): + terminate: bool = False): """Start the cluster. Please refer to the sky.cli.start for the document. @@ -95,7 +97,7 @@ def start(cluster_name: str, Raises: sky.exceptions.NotSupportedError: the cluster is not supported. """ - _start(cluster_name, idle_minutes_to_autostop, retry_until_up, autodown) + _start(cluster_name, idle_minutes_to_autostop, retry_until_up, terminate) def stop(cluster_name: str, purge: bool = False): @@ -158,7 +160,9 @@ def down(cluster_name: str, purge: bool = False): backend.teardown(handle, terminate=True, purge=purge) -def autostop(cluster_name: str, idle_minutes_to_autostop: int, terminate: bool=False): +def autostop(cluster_name: str, + idle_minutes_to_autostop: int, + terminate: bool = False): """Set the autostop time of the cluster. Please refer to the sky.cli.autostop for the document. @@ -172,7 +176,7 @@ def autostop(cluster_name: str, idle_minutes_to_autostop: int, terminate: bool=F sky.exceptions.ClusterNotUpError: the cluster is not UP. """ verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - down_str = f' (tear down)' if terminate else '' + down_str = ' (tear down)' if terminate else '' operation = f'{verb} auto-stop{down_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( diff --git a/sky/execution.py b/sky/execution.py index d9eac829b7f..41500c684d0 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -93,13 +93,13 @@ class Stage(enum.Enum): SETUP = enum.auto() PRE_EXEC = enum.auto() EXEC = enum.auto() - TEARDOWN = enum.auto() + TERMINATE = enum.auto() def _execute( dag: sky.Dag, dryrun: bool = False, - teardown: bool = False, + terminate: bool = False, stream_logs: bool = True, handle: Any = None, backend: Optional[backends.Backend] = None, @@ -120,7 +120,7 @@ def _execute( dag: sky.Dag. dryrun: bool; if True, only print the provision info (e.g., cluster yaml). - teardown: bool; whether to teardown the launched resources after + terminate: bool; whether to terminate the launched resources after execution. stream_logs: bool; whether to stream all tasks' outputs to the client. handle: Any; if provided, execution will use an existing backend cluster @@ -214,7 +214,9 @@ def _execute( if stages is None or Stage.PRE_EXEC in stages: if idle_minutes_to_autostop is not None: - backend.set_autostop(handle, idle_minutes_to_autostop, teardown=teardown) + backend.set_autostop(handle, + idle_minutes_to_autostop, + terminate=terminate) if stages is None or Stage.EXEC in stages: try: @@ -222,10 +224,10 @@ def _execute( backend.execute(handle, task, detach_run) finally: # Enables post_execute() to be run after KeyboardInterrupt. - backend.post_execute(handle, teardown) + backend.post_execute(handle, terminate) - if stages is None or Stage.TEARDOWN in stages: - if teardown and idle_minutes_to_autostop is None: + if stages is None or Stage.TERMINATE in stages: + if terminate and idle_minutes_to_autostop is None: backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) finally: @@ -253,7 +255,7 @@ def launch( retry_until_up: bool = False, idle_minutes_to_autostop: Optional[int] = None, dryrun: bool = False, - teardown: bool = False, + terminate: bool = False, stream_logs: bool = True, backend: Optional[backends.Backend] = None, optimize_target: OptimizeTarget = OptimizeTarget.COST, @@ -290,7 +292,7 @@ def launch( _execute( dag=dag, dryrun=dryrun, - teardown=teardown, + terminate=terminate, stream_logs=stream_logs, handle=None, backend=backend, @@ -308,7 +310,7 @@ def exec( # pylint: disable=redefined-builtin dag: sky.Dag, cluster_name: str, dryrun: bool = False, - teardown: bool = False, + terminate: bool = False, stream_logs: bool = True, backend: Optional[backends.Backend] = None, optimize_target: OptimizeTarget = OptimizeTarget.COST, @@ -327,7 +329,7 @@ def exec( # pylint: disable=redefined-builtin 'Use `sky status` to check the status.') _execute(dag=dag, dryrun=dryrun, - teardown=teardown, + terminate=terminate, stream_logs=stream_logs, handle=handle, backend=backend, diff --git a/sky/skylet/autostop_lib.py b/sky/skylet/autostop_lib.py index 8c9f9453098..80a990dd23f 100644 --- a/sky/skylet/autostop_lib.py +++ b/sky/skylet/autostop_lib.py @@ -10,18 +10,22 @@ class AutostopConfig: + """Autostop configuration.""" - def __init__(self, autostop_idle_minutes: int, boot_time: int, - backend: Optional[str], teardown: bool = False): + def __init__(self, + autostop_idle_minutes: int, + boot_time: int, + backend: Optional[str], + terminate: bool = False): assert autostop_idle_minutes < 0 or backend is not None, ( autostop_idle_minutes, backend) self.autostop_idle_minutes = autostop_idle_minutes self.boot_time = boot_time self.backend = backend - self.teardown = teardown + self.terminate = terminate def __set_state__(self, state: dict): - state.setdefault('teardown', False) + state.setdefault('terminate', False) self.__dict__.update(state) @@ -32,9 +36,11 @@ def get_autostop_config() -> Optional[AutostopConfig]: return pickle.loads(config_str) -def set_autostop(idle_minutes: int, backend: Optional[str], teardown: bool) -> None: +def set_autostop(idle_minutes: int, backend: Optional[str], + terminate: bool) -> None: boot_time = psutil.boot_time() - autostop_config = AutostopConfig(idle_minutes, boot_time, backend, teardown) + autostop_config = AutostopConfig(idle_minutes, boot_time, backend, + terminate) configs.set_config(AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config)) @@ -48,9 +54,11 @@ class AutostopCodeGen: _PREFIX = ['from sky.skylet import autostop_lib'] @classmethod - def set_autostop(cls, idle_minutes: int, backend: str, teardown: bool) -> str: + def set_autostop(cls, idle_minutes: int, backend: str, + terminate: bool) -> str: code = [ - f'autostop_lib.set_autostop({idle_minutes}, {backend!r}, {teardown})', + f'autostop_lib.set_autostop({idle_minutes}, {backend!r},' + f' {terminate})', ] return cls._build(code) diff --git a/sky/skylet/events.py b/sky/skylet/events.py index e8a382d8ee8..c166bb32c2f 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -119,7 +119,8 @@ def _run(self): def _stop_cluster(self, autostop_config): if (autostop_config.backend == cloud_vm_ray_backend.CloudVmRayBackend.NAME): - self._replace_yaml_for_stopping(self.ray_yaml_path, autostop_config.teardown) + self._replace_yaml_for_stopping(self.ray_yaml_path, + autostop_config.teardown) # `ray up` is required to reset the upscaling speed and min/max # workers. Otherwise, `ray down --workers-only` will continuously # scale down and up. @@ -142,9 +143,11 @@ def _replace_yaml_for_stopping(self, yaml_path: str, teardown: bool): yaml_str = self._NUM_WORKER_PATTERN.sub(r'\g<1>_workers: 0', yaml_str) yaml_str = self._UPSCALING_PATTERN.sub(r'upscaling_speed: 0', yaml_str) if teardown: - yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: false', yaml_str) + yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: false', + yaml_str) else: - yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: true', yaml_str) + yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: true', + yaml_str) config = yaml.safe_load(yaml_str) # Set the private key with the existed key on the remote instance. config['auth']['ssh_private_key'] = '~/ray_bootstrap_key.pem' From 420510f33064cdc7a2602ddd188fc57388e1a678 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 20:53:27 -0700 Subject: [PATCH 03/36] fix flag --- sky/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 86541b5659d..88237d8fb37 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -997,7 +997,7 @@ def cli(): @click.option( '--terminate', default=False, - type=bool, + is_flag=True, required=False, help=( 'Terminate the cluster after execution. If --idle-minutes-to-autostop ' @@ -1607,7 +1607,7 @@ def autostop( @click.option( '--terminate', default=False, - type=bool, + is_flag=True, required=False, help=( 'Terminate the cluster after execution. If --idle-minutes-to-autostop ' From 9a741aa94c1c44fb821046a6e579d8ba7e3a1b24 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 21:25:36 -0700 Subject: [PATCH 04/36] fix autostop --- sky/skylet/events.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/skylet/events.py b/sky/skylet/events.py index c166bb32c2f..df9102670f3 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -120,7 +120,7 @@ def _stop_cluster(self, autostop_config): if (autostop_config.backend == cloud_vm_ray_backend.CloudVmRayBackend.NAME): self._replace_yaml_for_stopping(self.ray_yaml_path, - autostop_config.teardown) + autostop_config.terminate) # `ray up` is required to reset the upscaling speed and min/max # workers. Otherwise, `ray down --workers-only` will continuously # scale down and up. @@ -136,13 +136,13 @@ def _stop_cluster(self, autostop_config): else: raise NotImplementedError - def _replace_yaml_for_stopping(self, yaml_path: str, teardown: bool): + def _replace_yaml_for_stopping(self, yaml_path: str, terminate: bool): with open(yaml_path, 'r') as f: yaml_str = f.read() # Update the number of workers to 0. yaml_str = self._NUM_WORKER_PATTERN.sub(r'\g<1>_workers: 0', yaml_str) yaml_str = self._UPSCALING_PATTERN.sub(r'upscaling_speed: 0', yaml_str) - if teardown: + if terminate: yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: false', yaml_str) else: From f057f3d4723df74c742c20d892b55a59f82af794 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 21:42:49 -0700 Subject: [PATCH 05/36] fix comment --- sky/execution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/execution.py b/sky/execution.py index 41500c684d0..b87eea0e44e 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -121,7 +121,8 @@ def _execute( dryrun: bool; if True, only print the provision info (e.g., cluster yaml). terminate: bool; whether to terminate the launched resources after - execution. + execution. If idle_minutes_to_autostop is set, the cluster will be + terminated after the specified idle time. stream_logs: bool; whether to stream all tasks' outputs to the client. handle: Any; if provided, execution will use an existing backend cluster handle instead of provisioning a new one. From b646909166a556b7011b91b4bcd6acecaa751ee0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 13:37:09 -0700 Subject: [PATCH 06/36] address comment --- sky/backends/cloud_vm_ray_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 2c3caaf0765..9bdddb1753b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2608,7 +2608,7 @@ def post_teardown_cleanup(self, def set_autostop(self, handle: ResourceHandle, idle_minutes_to_autostop: Optional[int], - terminate: bool = True, + terminate: bool = False, stream_logs: bool = True) -> None: if idle_minutes_to_autostop is not None: code = autostop_lib.AutostopCodeGen.set_autostop( From a6ccef80a265ebf67460eab055317b4972746c1f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 16:21:44 -0700 Subject: [PATCH 07/36] address comment --- sky/cli.py | 22 +++++++++++++++------- sky/core.py | 4 ++-- sky/execution.py | 8 ++++---- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 88237d8fb37..c7bb7b162fa 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1000,8 +1000,9 @@ def cli(): is_flag=True, required=False, help=( - 'Terminate the cluster after execution. If --idle-minutes-to-autostop ' - 'is set, the cluster will be torn down after the idle time.'), + 'Terminate the cluster after execution (successfully or abnormally). If ' + '--idle-minutes-to-autostop is set, the cluster will be torn down after ' + 'the idle time.'), ) @click.option( '--retry-until-up', @@ -1517,7 +1518,8 @@ def stop( default=False, is_flag=True, required=False, - help='Terminate the cluster instead of stopping it, when auto-stopping.') + help='Terminate the cluster instead of stopping it, when auto-stopping ' + '(i.e., autodown rather than autostop).') @click.option('--yes', '-y', is_flag=True, @@ -1540,6 +1542,8 @@ def autostop( ``--idle-minutes`` is the number of minutes of idleness (no pending/running jobs) after which the cluster will be stopped automatically. + Scheduling autostop twice on the same cluster will overwrite the previous + autostop schedule. ``--cancel`` will cancel the autostopping. If the cluster was not scheduled autostop, this will do nothing to autostop. @@ -1610,8 +1614,9 @@ def autostop( is_flag=True, required=False, help=( - 'Terminate the cluster after execution. If --idle-minutes-to-autostop ' - 'is set, the cluster will be torn down after the idle time.'), + 'Terminate the cluster after execution (successfully or abnormally). If ' + '--idle-minutes-to-autostop is set, the cluster will be torn down after ' + 'the idle time.'), ) @click.option( '--retry-until-up', @@ -1653,6 +1658,9 @@ def start(clusters: Tuple[str], all: bool, yes: bool, sky start -a """ + if terminate and idle_minutes_to_autostop is None: + raise click.UsageError( + '--idle-minutes-to-autostop must be set if --terminate is set.') to_start = [] if not clusters and not all: @@ -1838,8 +1846,8 @@ def _terminate_or_stop_clusters( operation = 'Terminating' if terminate else 'Stopping' if idle_minutes_to_autostop is not None: verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - down_str = ' (terminate)' if terminate else '' - operation = f'{verb} auto-stop{down_str} on' + option_str = 'down' if terminate else 'stop' + operation = f'{verb} auto-{option_str} on' if len(names) > 0: reserved_clusters = [ diff --git a/sky/core.py b/sky/core.py index 00774983c46..5cc67986399 100644 --- a/sky/core.py +++ b/sky/core.py @@ -176,8 +176,8 @@ def autostop(cluster_name: str, sky.exceptions.ClusterNotUpError: the cluster is not UP. """ verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - down_str = ' (tear down)' if terminate else '' - operation = f'{verb} auto-stop{down_str} on' + option_str = 'stop' if terminate else 'down' + operation = f'{verb} auto-{option_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' diff --git a/sky/execution.py b/sky/execution.py index b87eea0e44e..812dccf895f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -121,8 +121,8 @@ def _execute( dryrun: bool; if True, only print the provision info (e.g., cluster yaml). terminate: bool; whether to terminate the launched resources after - execution. If idle_minutes_to_autostop is set, the cluster will be - terminated after the specified idle time. + execution (successfully or abnormally). If idle_minutes_to_autostop + is set, the cluster will be terminated after the specified idle time. stream_logs: bool; whether to stream all tasks' outputs to the client. handle: Any; if provided, execution will use an existing backend cluster handle instead of provisioning a new one. @@ -226,12 +226,12 @@ def _execute( finally: # Enables post_execute() to be run after KeyboardInterrupt. backend.post_execute(handle, terminate) - + finally: if stages is None or Stage.TERMINATE in stages: if terminate and idle_minutes_to_autostop is None: backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) - finally: + if cluster_name != spot.SPOT_CONTROLLER_NAME: # UX: print live clusters to make users aware (to save costs). # From 0660eb1153c0a903e2fa8f84b24720ed3b94cf69 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 16:23:07 -0700 Subject: [PATCH 08/36] format --- sky/cli.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index c7bb7b162fa..3327d452b83 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -999,10 +999,10 @@ def cli(): default=False, is_flag=True, required=False, - help=( - 'Terminate the cluster after execution (successfully or abnormally). If ' - '--idle-minutes-to-autostop is set, the cluster will be torn down after ' - 'the idle time.'), + help= + ('Terminate the cluster after execution (successfully or abnormally). If ' + '--idle-minutes-to-autostop is set, the cluster will be torn down after ' + 'the idle time.'), ) @click.option( '--retry-until-up', @@ -1613,10 +1613,10 @@ def autostop( default=False, is_flag=True, required=False, - help=( - 'Terminate the cluster after execution (successfully or abnormally). If ' - '--idle-minutes-to-autostop is set, the cluster will be torn down after ' - 'the idle time.'), + help= + ('Terminate the cluster after execution (successfully or abnormally). If ' + '--idle-minutes-to-autostop is set, the cluster will be torn down after ' + 'the idle time.'), ) @click.option( '--retry-until-up', From 2a4567108fd3e369ae0a95dad78be907799944e4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 16:34:27 -0700 Subject: [PATCH 09/36] Rename terminate to down --- sky/backends/cloud_vm_ray_backend.py | 4 +- sky/cli.py | 102 ++++++++++++++------------- sky/core.py | 38 +++++----- sky/execution.py | 24 +++---- sky/skylet/autostop_lib.py | 17 ++--- 5 files changed, 95 insertions(+), 90 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9bdddb1753b..0986a64ec50 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2608,11 +2608,11 @@ def post_teardown_cleanup(self, def set_autostop(self, handle: ResourceHandle, idle_minutes_to_autostop: Optional[int], - terminate: bool = False, + down: bool = False, stream_logs: bool = True) -> None: if idle_minutes_to_autostop is not None: code = autostop_lib.AutostopCodeGen.set_autostop( - idle_minutes_to_autostop, self.NAME, terminate) + idle_minutes_to_autostop, self.NAME, down) returncode, _, stderr = self.run_on_head(handle, code, require_outputs=True, diff --git a/sky/cli.py b/sky/cli.py index 3327d452b83..39c10481332 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -604,7 +604,7 @@ def _launch_with_confirm( detach_run: bool, no_confirm: bool = False, idle_minutes_to_autostop: Optional[int] = None, - terminate: bool = False, + down: bool = False, # pylint: disable=redefined-outer-name retry_until_up: bool = False, no_setup: bool = False, node_type: Optional[str] = None, @@ -655,7 +655,7 @@ def _launch_with_confirm( detach_run=detach_run, backend=backend, idle_minutes_to_autostop=idle_minutes_to_autostop, - terminate=terminate, + down=down, retry_until_up=retry_until_up, no_setup=no_setup, ) @@ -995,12 +995,12 @@ def cli(): 'running ``sky launch -d ...`` and then ``sky autostop -i ``' '. If not set, the cluster will not be auto-stopped.')) @click.option( - '--terminate', + '--down', default=False, is_flag=True, required=False, help= - ('Terminate the cluster after execution (successfully or abnormally). If ' + ('Tear down the cluster after execution (successfully or abnormally). If ' '--idle-minutes-to-autostop is set, the cluster will be torn down after ' 'the idle time.'), ) @@ -1045,7 +1045,7 @@ def launch( env: List[Dict[str, str]], disk_size: Optional[int], idle_minutes_to_autostop: Optional[int], - terminate: bool, + down: bool, # pylint: disable=redefined-outer-name retry_until_up: bool, yes: bool, no_setup: bool, @@ -1096,7 +1096,7 @@ def launch( detach_run=detach_run, no_confirm=yes, idle_minutes_to_autostop=idle_minutes_to_autostop, - terminate=terminate, + down=down, retry_until_up=retry_until_up, no_setup=no_setup, is_local_cloud=onprem_utils.check_if_local_cloud(cluster)) @@ -1486,10 +1486,10 @@ def stop( sky stop -a """ - _terminate_or_stop_clusters(clusters, - apply_to_all=all, - terminate=False, - no_confirm=yes) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=False, + no_confirm=yes) @cli.command(cls=_DocumentedCodeCommand) @@ -1514,11 +1514,11 @@ def stop( required=False, help='Cancel the auto-stopping.') @click.option( - '--terminate', + '--down', default=False, is_flag=True, required=False, - help='Terminate the cluster instead of stopping it, when auto-stopping ' + help='Tear down the cluster instead of stopping it, when auto-stopping ' '(i.e., autodown rather than autostop).') @click.option('--yes', '-y', @@ -1532,7 +1532,7 @@ def autostop( all: Optional[bool], # pylint: disable=redefined-builtin idle_minutes: Optional[int], cancel: bool, # pylint: disable=redefined-outer-name - terminate: bool, + down: bool, # pylint: disable=redefined-outer-name yes: bool, ): """Schedule or cancel auto-stopping for cluster(s). @@ -1570,11 +1570,11 @@ def autostop( idle_minutes = -1 elif idle_minutes is None: idle_minutes = 5 - _terminate_or_stop_clusters(clusters, - apply_to_all=all, - terminate=terminate, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=down, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes) @cli.command(cls=_DocumentedCodeCommand) @@ -1609,12 +1609,12 @@ def autostop( 'running ``sky launch -d ...`` and then ``sky autostop -i ``' '. If not set, the cluster will not be auto-stopped.')) @click.option( - '--terminate', + '--down', default=False, is_flag=True, required=False, help= - ('Terminate the cluster after execution (successfully or abnormally). If ' + ('Tear down the cluster after execution (successfully or abnormally). If ' '--idle-minutes-to-autostop is set, the cluster will be torn down after ' 'the idle time.'), ) @@ -1628,9 +1628,13 @@ def autostop( 'if we fail to start the cluster due to unavailability errors.')) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def start(clusters: Tuple[str], all: bool, yes: bool, - idle_minutes_to_autostop: Optional[int], terminate: bool, - retry_until_up: bool): +def start( + clusters: Tuple[str], + all: bool, + yes: bool, + idle_minutes_to_autostop: Optional[int], + down: bool, # pylint: disable=redefined-outer-name + retry_until_up: bool): """Restart cluster(s). If a cluster is previously stopped (status is STOPPED) or failed in @@ -1658,9 +1662,9 @@ def start(clusters: Tuple[str], all: bool, yes: bool, sky start -a """ - if terminate and idle_minutes_to_autostop is None: + if down and idle_minutes_to_autostop is None: raise click.UsageError( - '--idle-minutes-to-autostop must be set if --terminate is set.') + '--idle-minutes-to-autostop must be set if --down is set.') to_start = [] if not clusters and not all: @@ -1752,7 +1756,7 @@ def start(clusters: Tuple[str], all: bool, yes: bool, core.start(name, idle_minutes_to_autostop, retry_until_up, - terminate=terminate) + down=down) except exceptions.NotSupportedError as e: click.echo(str(e)) click.secho(f'Cluster {name} started.', fg='green') @@ -1817,36 +1821,36 @@ def down( sky down -a """ - _terminate_or_stop_clusters(clusters, - apply_to_all=all, - terminate=True, - no_confirm=yes, - purge=purge) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=True, + no_confirm=yes, + purge=purge) -def _terminate_or_stop_clusters( +def _down_or_stop_clusters( names: Tuple[str], apply_to_all: Optional[bool], - terminate: bool, + down: bool, # pylint: disable=redefined-outer-name no_confirm: bool, purge: bool = False, idle_minutes_to_autostop: Optional[int] = None) -> None: - """Terminates or (auto-)stops a cluster (or all clusters). + """Tears down or (auto-)stops a cluster (or all clusters). Reserved clusters (spot controller) can only be terminated if the cluster name is explicitly and uniquely specified (not via glob) and purge is set to True. """ - command = 'down' if terminate else 'stop' + command = 'down' if down else 'stop' if not names and apply_to_all is None: raise click.UsageError( f'sky {command} requires either a cluster name (see `sky status`) ' 'or --all.') - operation = 'Terminating' if terminate else 'Stopping' + operation = 'Terminating' if down else 'Stopping' if idle_minutes_to_autostop is not None: verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - option_str = 'down' if terminate else 'stop' + option_str = 'down' if down else 'stop' operation = f'{verb} auto-{option_str} on' if len(names) > 0: @@ -1859,7 +1863,7 @@ def _terminate_or_stop_clusters( name for name in _get_glob_clusters(names) if name not in backend_utils.SKY_RESERVED_CLUSTER_NAMES ] - if not terminate: + if not down: local_clusters = onprem_utils.check_and_get_local_clusters() # Local clusters are allowed to `sky down`, but not # `sky start/stop`. `sky down` unregisters the local cluster @@ -1876,7 +1880,7 @@ def _terminate_or_stop_clusters( if not purge: msg = (f'{operation} reserved cluster(s) ' f'{reserved_clusters_str} is not supported.') - if terminate: + if down: msg += ( '\nPlease specify --purge (-p) to force-terminate the ' 'reserved cluster(s).') @@ -1938,11 +1942,11 @@ def _terminate_or_stop_clusters( f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]', total=len(clusters)) - def _terminate_or_stop(name: str): + def _down_or_stop(name: str): success_progress = False if idle_minutes_to_autostop is not None: try: - core.autostop(name, idle_minutes_to_autostop, terminate) + core.autostop(name, idle_minutes_to_autostop, down) except (exceptions.NotSupportedError, exceptions.ClusterNotUpError) as e: message = str(e) @@ -1960,7 +1964,7 @@ def _terminate_or_stop(name: str): f'{colorama.Style.RESET_ALL}') else: try: - if terminate: + if down: core.down(name, purge=purge) else: core.stop(name, purge=purge) @@ -1975,7 +1979,7 @@ def _terminate_or_stop(name: str): message = ( f'{colorama.Fore.GREEN}{operation} cluster {name}...done.' f'{colorama.Style.RESET_ALL}') - if not terminate: + if not down: message += ('\n To restart the cluster, run: ' f'{colorama.Style.BRIGHT}sky start {name}' f'{colorama.Style.RESET_ALL}') @@ -1987,7 +1991,7 @@ def _terminate_or_stop(name: str): progress.start() with progress: - subprocess_utils.run_in_parallel(_terminate_or_stop, clusters) + subprocess_utils.run_in_parallel(_down_or_stop, clusters) progress.live.transient = False # Make sure the progress bar not mess up the terminal. progress.refresh() @@ -3220,7 +3224,7 @@ def benchmark_down( clusters_to_exclude: List[str], yes: bool, ) -> None: - """Terminate all clusters belonging to a benchmark.""" + """Tear down all clusters belonging to a benchmark.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: raise click.BadParameter(f'Benchmark {benchmark} does not exist.') @@ -3234,10 +3238,10 @@ def benchmark_down( continue to_stop.append(cluster) - _terminate_or_stop_clusters(to_stop, - apply_to_all=False, - terminate=True, - no_confirm=yes) + _down_or_stop_clusters(to_stop, + apply_to_all=False, + down=True, + no_confirm=yes) @bench.command('delete', cls=_DocumentedCodeCommand) diff --git a/sky/core.py b/sky/core.py index 5cc67986399..5de804bc6f8 100644 --- a/sky/core.py +++ b/sky/core.py @@ -51,10 +51,12 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: return cluster_records -def _start(cluster_name: str, - idle_minutes_to_autostop: Optional[int] = None, - retry_until_up: bool = False, - terminate: bool = False) -> backends.Backend.ResourceHandle: +def _start( + cluster_name: str, + idle_minutes_to_autostop: Optional[int] = None, + retry_until_up: bool = False, + down: bool = False, #pylint: disable=redefined-outer-name +) -> backends.Backend.ResourceHandle: cluster_status, handle = backend_utils.refresh_cluster_status_handle( cluster_name) @@ -80,16 +82,16 @@ def _start(cluster_name: str, cluster_name=cluster_name, retry_until_up=retry_until_up) if idle_minutes_to_autostop is not None: - backend.set_autostop(handle, - idle_minutes_to_autostop, - terminate=terminate) + backend.set_autostop(handle, idle_minutes_to_autostop, down=down) return handle -def start(cluster_name: str, - idle_minutes_to_autostop: Optional[int] = None, - retry_until_up: bool = False, - terminate: bool = False): +def start( + cluster_name: str, + idle_minutes_to_autostop: Optional[int] = None, + retry_until_up: bool = False, + down: bool = False, #pylint: disable=redefined-outer-name +): """Start the cluster. Please refer to the sky.cli.start for the document. @@ -97,7 +99,7 @@ def start(cluster_name: str, Raises: sky.exceptions.NotSupportedError: the cluster is not supported. """ - _start(cluster_name, idle_minutes_to_autostop, retry_until_up, terminate) + _start(cluster_name, idle_minutes_to_autostop, retry_until_up, down) def stop(cluster_name: str, purge: bool = False): @@ -160,9 +162,11 @@ def down(cluster_name: str, purge: bool = False): backend.teardown(handle, terminate=True, purge=purge) -def autostop(cluster_name: str, - idle_minutes_to_autostop: int, - terminate: bool = False): +def autostop( + cluster_name: str, + idle_minutes_to_autostop: int, + down: bool = False, #pylint: disable=redefined-outer-name +): """Set the autostop time of the cluster. Please refer to the sky.cli.autostop for the document. @@ -176,7 +180,7 @@ def autostop(cluster_name: str, sky.exceptions.ClusterNotUpError: the cluster is not UP. """ verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - option_str = 'stop' if terminate else 'down' + option_str = 'stop' if down else 'down' operation = f'{verb} auto-{option_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( @@ -209,7 +213,7 @@ def autostop(cluster_name: str, f'{colorama.Style.RESET_ALL}' '\n Auto-stop can only be set/unset for ' f'{global_user_state.ClusterStatus.UP.value} clusters.') - backend.set_autostop(handle, idle_minutes_to_autostop, terminate) + backend.set_autostop(handle, idle_minutes_to_autostop, down) # ================== diff --git a/sky/execution.py b/sky/execution.py index 812dccf895f..1e014bd7be8 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -93,13 +93,13 @@ class Stage(enum.Enum): SETUP = enum.auto() PRE_EXEC = enum.auto() EXEC = enum.auto() - TERMINATE = enum.auto() + DOWN = enum.auto() def _execute( dag: sky.Dag, dryrun: bool = False, - terminate: bool = False, + down: bool = False, stream_logs: bool = True, handle: Any = None, backend: Optional[backends.Backend] = None, @@ -120,9 +120,9 @@ def _execute( dag: sky.Dag. dryrun: bool; if True, only print the provision info (e.g., cluster yaml). - terminate: bool; whether to terminate the launched resources after + down: bool; whether to tear down the launched resources after execution (successfully or abnormally). If idle_minutes_to_autostop - is set, the cluster will be terminated after the specified idle time. + is set, the cluster will be torn down after the specified idle time. stream_logs: bool; whether to stream all tasks' outputs to the client. handle: Any; if provided, execution will use an existing backend cluster handle instead of provisioning a new one. @@ -217,7 +217,7 @@ def _execute( if idle_minutes_to_autostop is not None: backend.set_autostop(handle, idle_minutes_to_autostop, - terminate=terminate) + down=down) if stages is None or Stage.EXEC in stages: try: @@ -225,10 +225,10 @@ def _execute( backend.execute(handle, task, detach_run) finally: # Enables post_execute() to be run after KeyboardInterrupt. - backend.post_execute(handle, terminate) + backend.post_execute(handle, down) finally: - if stages is None or Stage.TERMINATE in stages: - if terminate and idle_minutes_to_autostop is None: + if stages is None or Stage.DOWN in stages: + if down and idle_minutes_to_autostop is None: backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) @@ -256,7 +256,7 @@ def launch( retry_until_up: bool = False, idle_minutes_to_autostop: Optional[int] = None, dryrun: bool = False, - terminate: bool = False, + down: bool = False, stream_logs: bool = True, backend: Optional[backends.Backend] = None, optimize_target: OptimizeTarget = OptimizeTarget.COST, @@ -293,7 +293,7 @@ def launch( _execute( dag=dag, dryrun=dryrun, - terminate=terminate, + down=down, stream_logs=stream_logs, handle=None, backend=backend, @@ -311,7 +311,7 @@ def exec( # pylint: disable=redefined-builtin dag: sky.Dag, cluster_name: str, dryrun: bool = False, - terminate: bool = False, + down: bool = False, stream_logs: bool = True, backend: Optional[backends.Backend] = None, optimize_target: OptimizeTarget = OptimizeTarget.COST, @@ -330,7 +330,7 @@ def exec( # pylint: disable=redefined-builtin 'Use `sky status` to check the status.') _execute(dag=dag, dryrun=dryrun, - terminate=terminate, + down=down, stream_logs=stream_logs, handle=handle, backend=backend, diff --git a/sky/skylet/autostop_lib.py b/sky/skylet/autostop_lib.py index 80a990dd23f..1e6c95f70fa 100644 --- a/sky/skylet/autostop_lib.py +++ b/sky/skylet/autostop_lib.py @@ -16,16 +16,16 @@ def __init__(self, autostop_idle_minutes: int, boot_time: int, backend: Optional[str], - terminate: bool = False): + down: bool = False): assert autostop_idle_minutes < 0 or backend is not None, ( autostop_idle_minutes, backend) self.autostop_idle_minutes = autostop_idle_minutes self.boot_time = boot_time self.backend = backend - self.terminate = terminate + self.down = down def __set_state__(self, state: dict): - state.setdefault('terminate', False) + state.setdefault('down', False) self.__dict__.update(state) @@ -36,11 +36,9 @@ def get_autostop_config() -> Optional[AutostopConfig]: return pickle.loads(config_str) -def set_autostop(idle_minutes: int, backend: Optional[str], - terminate: bool) -> None: +def set_autostop(idle_minutes: int, backend: Optional[str], down: bool) -> None: boot_time = psutil.boot_time() - autostop_config = AutostopConfig(idle_minutes, boot_time, backend, - terminate) + autostop_config = AutostopConfig(idle_minutes, boot_time, backend, down) configs.set_config(AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config)) @@ -54,11 +52,10 @@ class AutostopCodeGen: _PREFIX = ['from sky.skylet import autostop_lib'] @classmethod - def set_autostop(cls, idle_minutes: int, backend: str, - terminate: bool) -> str: + def set_autostop(cls, idle_minutes: int, backend: str, down: bool) -> str: code = [ f'autostop_lib.set_autostop({idle_minutes}, {backend!r},' - f' {terminate})', + f' {down})', ] return cls._build(code) From d89fe932f08f1544799249cb97ff818f7078226d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 20:39:40 -0700 Subject: [PATCH 10/36] add smoke test --- sky/skylet/events.py | 6 +++--- tests/test_smoke.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/sky/skylet/events.py b/sky/skylet/events.py index df9102670f3..a592a55f56c 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -120,7 +120,7 @@ def _stop_cluster(self, autostop_config): if (autostop_config.backend == cloud_vm_ray_backend.CloudVmRayBackend.NAME): self._replace_yaml_for_stopping(self.ray_yaml_path, - autostop_config.terminate) + autostop_config.down) # `ray up` is required to reset the upscaling speed and min/max # workers. Otherwise, `ray down --workers-only` will continuously # scale down and up. @@ -136,13 +136,13 @@ def _stop_cluster(self, autostop_config): else: raise NotImplementedError - def _replace_yaml_for_stopping(self, yaml_path: str, terminate: bool): + def _replace_yaml_for_stopping(self, yaml_path: str, down: bool): with open(yaml_path, 'r') as f: yaml_str = f.read() # Update the number of workers to 0. yaml_str = self._NUM_WORKER_PATTERN.sub(r'\g<1>_workers: 0', yaml_str) yaml_str = self._UPSCALING_PATTERN.sub(r'upscaling_speed: 0', yaml_str) - if terminate: + if down: yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: false', yaml_str) else: diff --git a/tests/test_smoke.py b/tests/test_smoke.py index ba7f1e37610..9166f97a62a 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -525,6 +525,29 @@ def test_autostop(): ) run_one_test(test) +# ---------- Testing Autostopping ---------- +def test_autodown(): + name = _get_cluster_name() + test = Test( + 'autodown', + [ + f'sky launch -y -d -c {name} --num-nodes 2 --cloud gcp examples/minimal.yaml', + f'sky autostop -y {name} --down -i 1', + # Ensure autostop is set. + f'sky status | grep {name} | grep "1 min"', + 'sleep 180', + # Ensure the cluster is STOPPED. + f's=$(sky status --refresh) && {{echo $s |grep {name} | grep "was terminated";}} || {{echo $s | grep {name} && exit 1 || exit 0;}}', # Ensure the cluster is DOWN. + f'sky launch -y -d -c {name} --cloud gcp -i 2 --down examples/minimal.yaml', + f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. + f'sky exec {name} examples/minimal.yaml', + f's=$(sky status --refresh) && {{echo $s |grep {name} | grep "was terminated";}} || {{echo $s | grep {name} && exit 1 || exit 0;}}', # Ensure the cluster is DOWN. + ], + f'sky down -y {name}', + timeout=20 * 60, + ) + run_one_test(test) + def _get_cancel_task_with_cloud(name, cloud, timeout=15 * 60): test = Test( From c2a4c4a0fd826aee5b081e2f992afebb1d6808e4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 21:15:20 -0700 Subject: [PATCH 11/36] fix autodown for multi-node --- sky/skylet/events.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sky/skylet/events.py b/sky/skylet/events.py index a592a55f56c..5599c0151c6 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -80,7 +80,6 @@ class AutostopEvent(SkyletEvent): """Skylet event for autostop.""" EVENT_INTERVAL_SECONDS = 60 - _NUM_WORKER_PATTERN = re.compile(r'((?:min|max))_workers: (\d+)') _UPSCALING_PATTERN = re.compile(r'upscaling_speed: (\d+)') _CATCH_NODES = re.compile(r'cache_stopped_nodes: (.*)') @@ -125,7 +124,7 @@ def _stop_cluster(self, autostop_config): # workers. Otherwise, `ray down --workers-only` will continuously # scale down and up. subprocess.run( - ['ray', 'up', '-y', '--restart-only', self.ray_yaml_path], + ['ray', 'up', '-y', '--restart-only', '--disable-usage-stats', self.ray_yaml_path], check=True) # Stop the workers first to avoid orphan workers. subprocess.run( @@ -139,8 +138,6 @@ def _stop_cluster(self, autostop_config): def _replace_yaml_for_stopping(self, yaml_path: str, down: bool): with open(yaml_path, 'r') as f: yaml_str = f.read() - # Update the number of workers to 0. - yaml_str = self._NUM_WORKER_PATTERN.sub(r'\g<1>_workers: 0', yaml_str) yaml_str = self._UPSCALING_PATTERN.sub(r'upscaling_speed: 0', yaml_str) if down: yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: false', @@ -154,4 +151,5 @@ def _replace_yaml_for_stopping(self, yaml_path: str, down: bool): # Empty the file_mounts. config['file_mounts'] = dict() common_utils.dump_yaml(yaml_path, config) - logger.debug('Replaced worker num and upscaling speed to 0.') + logger.debug('Replaced upscaling speed to 0.') + From 63546198c6d527bcbbb66dae29314ee60d805aec Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 21:16:43 -0700 Subject: [PATCH 12/36] format --- sky/skylet/events.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/skylet/events.py b/sky/skylet/events.py index 5599c0151c6..4d736fee9e4 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -123,9 +123,11 @@ def _stop_cluster(self, autostop_config): # `ray up` is required to reset the upscaling speed and min/max # workers. Otherwise, `ray down --workers-only` will continuously # scale down and up. - subprocess.run( - ['ray', 'up', '-y', '--restart-only', '--disable-usage-stats', self.ray_yaml_path], - check=True) + subprocess.run([ + 'ray', 'up', '-y', '--restart-only', '--disable-usage-stats', + self.ray_yaml_path + ], + check=True) # Stop the workers first to avoid orphan workers. subprocess.run( ['ray', 'down', '-y', '--workers-only', self.ray_yaml_path], @@ -152,4 +154,3 @@ def _replace_yaml_for_stopping(self, yaml_path: str, down: bool): config['file_mounts'] = dict() common_utils.dump_yaml(yaml_path, config) logger.debug('Replaced upscaling speed to 0.') - From 18bc534b09ea39219b5ff147232500d34da19ec6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 21:29:01 -0700 Subject: [PATCH 13/36] fix syntax --- tests/test_smoke.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9166f97a62a..20c71a766a4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -525,23 +525,24 @@ def test_autostop(): ) run_one_test(test) + # ---------- Testing Autostopping ---------- def test_autodown(): name = _get_cluster_name() test = Test( 'autodown', [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud gcp examples/minimal.yaml', + f'sky launch -y -d -c {name} --num-nodes 2 --cloud aws examples/minimal.yaml', f'sky autostop -y {name} --down -i 1', # Ensure autostop is set. f'sky status | grep {name} | grep "1 min"', 'sleep 180', # Ensure the cluster is STOPPED. - f's=$(sky status --refresh) && {{echo $s |grep {name} | grep "was terminated";}} || {{echo $s | grep {name} && exit 1 || exit 0;}}', # Ensure the cluster is DOWN. + f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. f'sky launch -y -d -c {name} --cloud gcp -i 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} examples/minimal.yaml', - f's=$(sky status --refresh) && {{echo $s |grep {name} | grep "was terminated";}} || {{echo $s | grep {name} && exit 1 || exit 0;}}', # Ensure the cluster is DOWN. + f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. ], f'sky down -y {name}', timeout=20 * 60, From 8658d3e0799d2bfedaf1a3ab643c48a4b337d573 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 21:41:12 -0700 Subject: [PATCH 14/36] use gcp for autodown test --- tests/test_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 20c71a766a4..8ec31979c89 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -532,7 +532,7 @@ def test_autodown(): test = Test( 'autodown', [ - f'sky launch -y -d -c {name} --num-nodes 2 --cloud aws examples/minimal.yaml', + f'sky launch -y -d -c {name} --num-nodes 2 --cloud gcp examples/minimal.yaml', f'sky autostop -y {name} --down -i 1', # Ensure autostop is set. f'sky status | grep {name} | grep "1 min"', @@ -541,7 +541,7 @@ def test_autodown(): f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. f'sky launch -y -d -c {name} --cloud gcp -i 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} examples/minimal.yaml', + f'sky exec {name} --cloud gcp examples/minimal.yaml', f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. ], f'sky down -y {name}', From 8280c9d1eb17003243f02732014572c92f004586 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 21:59:23 -0700 Subject: [PATCH 15/36] fix smoke test --- tests/test_smoke.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8ec31979c89..360ba466b92 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -542,6 +542,7 @@ def test_autodown(): f'sky launch -y -d -c {name} --cloud gcp -i 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud gcp examples/minimal.yaml', + 'sleep 180', f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. ], f'sky down -y {name}', From 5a08c842bc4ffd7401aa187704d9a4357b7f0103 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 22:49:28 -0700 Subject: [PATCH 16/36] fix smoke test --- tests/test_smoke.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 360ba466b92..76ff20e5ae8 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -536,14 +536,14 @@ def test_autodown(): f'sky autostop -y {name} --down -i 1', # Ensure autostop is set. f'sky status | grep {name} | grep "1 min"', - 'sleep 180', + 'sleep 240', # Ensure the cluster is STOPPED. - f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. - f'sky launch -y -d -c {name} --cloud gcp -i 2 --down examples/minimal.yaml', + f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. + f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 -i 1 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. - f'sky exec {name} --cloud gcp examples/minimal.yaml', - 'sleep 180', - f's=$(sky status --refresh) && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. + f'sky exec {name} --cloud aws examples/minimal.yaml', + 'sleep 240', + f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. ], f'sky down -y {name}', timeout=20 * 60, From 59ced1df9e130f814eb5d85e75c3614f1ebb42ed Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 00:48:58 -0700 Subject: [PATCH 17/36] address comments --- sky/backends/backend_utils.py | 4 +++- sky/backends/cloud_vm_ray_backend.py | 19 ++++++++++++++-- sky/cli.py | 21 +++++++++++------- sky/core.py | 14 +++++++----- sky/execution.py | 4 ++-- sky/global_user_state.py | 33 ++++++++++++++++++++-------- sky/utils/cli_utils/status_utils.py | 9 +++++++- tests/test_smoke.py | 9 ++++---- 8 files changed, 80 insertions(+), 33 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 51acef19ef8..0c46422927a 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1589,7 +1589,9 @@ def _update_cluster_status_no_lock( backend.set_autostop(handle, -1, stream_logs=False) except (Exception, SystemExit): # pylint: disable=broad-except logger.debug('Failed to reset autostop.') - global_user_state.set_cluster_autostop_value(handle.cluster_name, -1) + global_user_state.set_cluster_autostop_value(handle.cluster_name, + -1, + to_down=False) # If the user starts part of a STOPPED cluster, we still need a status to # represent the abnormal status. For spot cluster, it can also represent diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 0986a64ec50..df4993ac050 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1116,6 +1116,11 @@ def ray_up(): # different order from directly running in the console. The # `--log-style` and `--log-color` flags do not work. To reproduce, # `ray up --log-style pretty --log-color true | tee tmp.out`. + + # Use environment variables to disable the ray usage collection (to + # avoid the 10 second wait for usage collection confirmation), as + # the ray version on the user's machine may be lower version that + # does not support the `--disable-usage-stats` flag. returncode, stdout, stderr = log_lib.run_with_log( # NOTE: --no-restart solves the following bug. Without it, if # 'ray up' (sky launch) twice on a cluster with >1 node, the @@ -1133,7 +1138,9 @@ def ray_up(): line_processor=log_utils.RayUpLineProcessor(), # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging # time during 'ray up' if insufficient capacity occurs. - env=dict(os.environ, BOTO_MAX_RETRIES='5'), + env=dict(BOTO_MAX_RETRIES='5', + RAY_USAGE_STATS_ENABLED='0', + **os.environ), require_outputs=True, # Disable stdin to avoid ray outputs mess up the terminal with # misaligned output when multithreading/multiprocessing are used @@ -1333,10 +1340,18 @@ def _ensure_cluster_ray_started(self, 'of the local cluster. Check if ray[default]==1.13.0 ' 'is installed or running correctly.') backend.run_on_head(handle, 'ray stop', use_cached_head_ip=False) + + # Use environment variables to disable the ray usage collection (avoid + # the 10 second wait for usage collection confirmation), as the ray + # version on the user's machine may be lower version that does not + # support the `--disable-usage-stats` flag. + env = os.environ.copy() + env['RAY_USAGE_STATS_ENABLED'] = '0' log_lib.run_with_log( ['ray', 'up', '-y', '--restart-only', handle.cluster_yaml], log_abs_path, stream_logs=False, + env=dict(RAY_USAGE_STATS_ENABLED='0', **os.environ), # Disable stdin to avoid ray outputs mess up the terminal with # misaligned output when multithreading/multiprocessing is used. # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long @@ -2623,7 +2638,7 @@ def set_autostop(self, stderr=stderr, stream_logs=stream_logs) global_user_state.set_cluster_autostop_value( - handle.cluster_name, idle_minutes_to_autostop) + handle.cluster_name, idle_minutes_to_autostop, down) # TODO(zhwu): Refactor this to a CommandRunner class, so different backends # can support its own command runner. diff --git a/sky/cli.py b/sky/cli.py index 39c10481332..b6689b46404 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1000,9 +1000,10 @@ def cli(): is_flag=True, required=False, help= - ('Tear down the cluster after execution (successfully or abnormally). If ' - '--idle-minutes-to-autostop is set, the cluster will be torn down after ' - 'the idle time.'), + ('Tear down the cluster after execution finishes (successfully or ' + 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' + 'be torn down after the idle time, rather than immediately after execution' + ' finishes.'), ) @click.option( '--retry-until-up', @@ -1519,7 +1520,7 @@ def stop( is_flag=True, required=False, help='Tear down the cluster instead of stopping it, when auto-stopping ' - '(i.e., autodown rather than autostop).') + '(i.e., auto-down rather than auto-stop).') @click.option('--yes', '-y', is_flag=True, @@ -1535,11 +1536,14 @@ def autostop( down: bool, # pylint: disable=redefined-outer-name yes: bool, ): - """Schedule or cancel auto-stopping for cluster(s). + """Schedule or cancel an auto-stop or auto-down for cluster(s). CLUSTERS are the name (or glob pattern) of the clusters to stop. If both CLUSTERS and ``--all`` are supplied, the latter takes precedence. + If --down is passed, autodown (tear down the cluster; non-restartable) is + used, rather than autostop (restartable). + ``--idle-minutes`` is the number of minutes of idleness (no pending/running jobs) after which the cluster will be stopped automatically. Scheduling autostop twice on the same cluster will overwrite the previous @@ -1614,9 +1618,10 @@ def autostop( is_flag=True, required=False, help= - ('Tear down the cluster after execution (successfully or abnormally). If ' - '--idle-minutes-to-autostop is set, the cluster will be torn down after ' - 'the idle time.'), + ('Tear down the cluster after execution finishes (successfully or ' + 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' + 'be torn down after the idle time, rather than immediately after execution' + ' finishes.'), ) @click.option( '--retry-until-up', diff --git a/sky/core.py b/sky/core.py index 5de804bc6f8..340e178256a 100644 --- a/sky/core.py +++ b/sky/core.py @@ -42,6 +42,8 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: 'last_use': (int) timestamp of last use, 'status': (sky.ClusterStatus) cluster status, 'autostop': (int) idle time before autostop, + 'to_down': (bool) whether to tear down the cluster after + execution (autostop), 'metadata': (dict) metadata of the cluster, } ] @@ -55,7 +57,7 @@ def _start( cluster_name: str, idle_minutes_to_autostop: Optional[int] = None, retry_until_up: bool = False, - down: bool = False, #pylint: disable=redefined-outer-name + down: bool = False, # pylint: disable=redefined-outer-name ) -> backends.Backend.ResourceHandle: cluster_status, handle = backend_utils.refresh_cluster_status_handle( @@ -90,7 +92,7 @@ def start( cluster_name: str, idle_minutes_to_autostop: Optional[int] = None, retry_until_up: bool = False, - down: bool = False, #pylint: disable=redefined-outer-name + down: bool = False, # pylint: disable=redefined-outer-name ): """Start the cluster. @@ -165,7 +167,7 @@ def down(cluster_name: str, purge: bool = False): def autostop( cluster_name: str, idle_minutes_to_autostop: int, - down: bool = False, #pylint: disable=redefined-outer-name + down: bool = False, # pylint: disable=redefined-outer-name ): """Set the autostop time of the cluster. @@ -180,7 +182,7 @@ def autostop( sky.exceptions.ClusterNotUpError: the cluster is not UP. """ verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' - option_str = 'stop' if down else 'down' + option_str = 'down' if down else 'stop' operation = f'{verb} auto-{option_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( @@ -203,7 +205,7 @@ def autostop( raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}{operation} cluster ' f'{cluster_name!r}... skipped{colorama.Style.RESET_ALL}' - '\n Auto-stopping is only supported by backend: ' + f'\n Auto-{option_str} is only supported by backend: ' f'{backends.CloudVmRayBackend.NAME}') if cluster_status != global_user_state.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): @@ -211,7 +213,7 @@ def autostop( f'{colorama.Fore.YELLOW}{operation} cluster ' f'{cluster_name!r} (status: {cluster_status.value})... skipped' f'{colorama.Style.RESET_ALL}' - '\n Auto-stop can only be set/unset for ' + f'\n Auto-{option_str} can only be set/unset for ' f'{global_user_state.ClusterStatus.UP.value} clusters.') backend.set_autostop(handle, idle_minutes_to_autostop, down) diff --git a/sky/execution.py b/sky/execution.py index 1e014bd7be8..b2b2f24f42b 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -226,12 +226,12 @@ def _execute( finally: # Enables post_execute() to be run after KeyboardInterrupt. backend.post_execute(handle, down) - finally: + if stages is None or Stage.DOWN in stages: if down and idle_minutes_to_autostop is None: backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) - + finally: if cluster_name != spot.SPOT_CONTROLLER_NAME: # UX: print live clusters to make users aware (to save costs). # diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 4a03be70bb4..ae8717420e4 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -61,6 +61,8 @@ def create_table(cursor, conn): db_utils.add_column_to_table(cursor, conn, 'clusters', 'metadata', 'TEXT DEFAULT "{}"') + db_utils.add_column_to_table(cursor, conn, 'clusters', 'to_down', + 'INTEGER DEFAULT 0') conn.commit() @@ -114,7 +116,7 @@ def add_or_update_cluster(cluster_name: str, status = ClusterStatus.UP if ready else ClusterStatus.INIT _DB.cursor.execute( 'INSERT or REPLACE INTO clusters' - '(name, launched_at, handle, last_use, status, autostop) ' + '(name, launched_at, handle, last_use, status, autostop, to_down) ' 'VALUES (' # name '?, ' @@ -132,7 +134,11 @@ def add_or_update_cluster(cluster_name: str, # Keep the old autostop value if it exists, otherwise set it to # default -1. 'COALESCE(' - '(SELECT autostop FROM clusters WHERE name=? AND status!=?), -1)' + '(SELECT autostop FROM clusters WHERE name=? AND status!=?), -1), ' + # Keep the old to_down value if it exists, otherwise set it to + # default 0. + 'COALESCE(' + '(SELECT to_down FROM clusters WHERE name=? AND status!=?), 0)' ')', ( # name @@ -150,6 +156,8 @@ def add_or_update_cluster(cluster_name: str, # autostop cluster_name, ClusterStatus.STOPPED.value, + cluster_name, + ClusterStatus.STOPPED.value, )) _DB.conn.commit() @@ -211,11 +219,14 @@ def set_cluster_status(cluster_name: str, status: ClusterStatus) -> None: raise ValueError(f'Cluster {cluster_name} not found.') -def set_cluster_autostop_value(cluster_name: str, idle_minutes: int) -> None: - _DB.cursor.execute('UPDATE clusters SET autostop=(?) WHERE name=(?)', ( - idle_minutes, - cluster_name, - )) +def set_cluster_autostop_value(cluster_name: str, idle_minutes: int, + to_down: bool) -> None: + _DB.cursor.execute( + 'UPDATE clusters SET autostop=(?), to_down=(?) WHERE name=(?)', ( + idle_minutes, + int(to_down), + cluster_name, + )) count = _DB.cursor.rowcount _DB.conn.commit() assert count <= 1, count @@ -248,7 +259,8 @@ def get_cluster_from_name( cluster_name: Optional[str]) -> Optional[Dict[str, Any]]: rows = _DB.cursor.execute('SELECT * FROM clusters WHERE name=(?)', (cluster_name,)) - for name, launched_at, handle, last_use, status, autostop, metadata in rows: + for (name, launched_at, handle, last_use, status, autostop, metadata, + to_down) in rows: record = { 'name': name, 'launched_at': launched_at, @@ -256,6 +268,7 @@ def get_cluster_from_name( 'last_use': last_use, 'status': ClusterStatus[status], 'autostop': autostop, + 'to_down': bool(to_down), 'metadata': json.loads(metadata), } return record @@ -265,7 +278,8 @@ def get_clusters() -> List[Dict[str, Any]]: rows = _DB.cursor.execute( 'select * from clusters order by launched_at desc') records = [] - for name, launched_at, handle, last_use, status, autostop, metadata in rows: + for (name, launched_at, handle, last_use, status, autostop, metadata, + to_down) in rows: # TODO: use namedtuple instead of dict record = { 'name': name, @@ -274,6 +288,7 @@ def get_clusters() -> List[Dict[str, Any]]: 'last_use': last_use, 'status': ClusterStatus[status], 'autostop': autostop, + 'to_down': bool(to_down), 'metadata': json.loads(metadata), } records.append(record) diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 94528e37c4f..5d464dfd9df 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -208,10 +208,17 @@ def _get_zone(cluster_status): def _get_autostop(cluster_status): - autostop_str = '-' + autostop_str = '' + separtion = '' if cluster_status['autostop'] >= 0: # TODO(zhwu): check the status of the autostop cluster. autostop_str = str(cluster_status['autostop']) + ' min' + separtion = ' ' + + if cluster_status['to_down']: + autostop_str += f'{separtion}(down)' + if autostop_str == '': + autostop_str = '-' return autostop_str diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 76ff20e5ae8..27942d7fcce 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -526,7 +526,7 @@ def test_autostop(): run_one_test(test) -# ---------- Testing Autostopping ---------- +# ---------- Testing Autodowning ---------- def test_autodown(): name = _get_cluster_name() test = Test( @@ -537,13 +537,14 @@ def test_autodown(): # Ensure autostop is set. f'sky status | grep {name} | grep "1 min"', 'sleep 240', - # Ensure the cluster is STOPPED. - f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. + # Ensure the cluster is terminated. + f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 -i 1 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud aws examples/minimal.yaml', 'sleep 240', - f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', # Ensure the cluster is DOWN. + # Ensure the cluster is terminated. + f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', ], f'sky down -y {name}', timeout=20 * 60, From f3b357e2595d42e52eac3d4d6076332c58fbb996 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 00:56:01 -0700 Subject: [PATCH 18/36] Add comment --- sky/cli.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sky/cli.py b/sky/cli.py index b6689b46404..5e59656ec8a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1262,6 +1262,10 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin - STOPPED: The cluster is stopped and the storage is persisted. Use ``sky start`` to restart the cluster. + The autostop column indicates how long the cluster will be auto-stopped + after idling (no jobs running). If the time is followed by '(down)', e.g. + '1 min (down)', the cluster will be auto-downed, rather than auto-stopped. + """ cluster_records = core.status(all=all, refresh=refresh) local_clusters = onprem_utils.check_and_get_local_clusters( From 5198b1ada10bcaa4cdc34f76caa678a8fd8e06bf Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 01:05:26 -0700 Subject: [PATCH 19/36] Switch back to terminate --- sky/cli.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 5e59656ec8a..5ca8e275e33 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -13,10 +13,10 @@ # Show the list of running clusters. >> sky status - # Tear down a specific cluster. + # Terminate a specific cluster. >> sky down cluster_name - # Tear down all existing clusters. + # Terminate all existing clusters. >> sky down -a TODO: @@ -747,7 +747,7 @@ def _create_and_ssh_into_node( click.secho(f'sky {node_type}{option}', bold=True) click.echo('To stop the node:\t', nl=False) click.secho(f'sky stop {cluster_name}', bold=True) - click.echo('To tear down the node:\t', nl=False) + click.echo('To terminate the node:\t', nl=False) click.secho(f'sky down {cluster_name}', bold=True) click.echo('To upload a folder:\t', nl=False) click.secho(f'rsync -rP /local/path {cluster_name}:/remote/path', bold=True) @@ -1000,7 +1000,7 @@ def cli(): is_flag=True, required=False, help= - ('Tear down the cluster after execution finishes (successfully or ' + ('Terminate the cluster after execution finishes (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' 'be torn down after the idle time, rather than immediately after execution' ' finishes.'), @@ -1450,7 +1450,7 @@ def cancel(cluster: str, all: bool, jobs: List[int]): # pylint: disable=redefin '-a', default=None, is_flag=True, - help='Tear down all existing clusters.') + help='Stop all existing clusters.') @click.option('--yes', '-y', is_flag=True, @@ -1523,7 +1523,7 @@ def stop( default=False, is_flag=True, required=False, - help='Tear down the cluster instead of stopping it, when auto-stopping ' + help='Terminate the cluster instead of stopping it, when auto-stopping ' '(i.e., auto-down rather than auto-stop).') @click.option('--yes', '-y', @@ -1545,7 +1545,7 @@ def autostop( CLUSTERS are the name (or glob pattern) of the clusters to stop. If both CLUSTERS and ``--all`` are supplied, the latter takes precedence. - If --down is passed, autodown (tear down the cluster; non-restartable) is + If --down is passed, autodown (terminate the cluster; non-restartable) is used, rather than autostop (restartable). ``--idle-minutes`` is the number of minutes of idleness (no pending/running @@ -1622,7 +1622,7 @@ def autostop( is_flag=True, required=False, help= - ('Tear down the cluster after execution finishes (successfully or ' + ('Terminate the cluster after execution finishes (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' 'be torn down after the idle time, rather than immediately after execution' ' finishes.'), @@ -1780,7 +1780,7 @@ def start( '-a', default=None, is_flag=True, - help='Tear down all existing clusters.') + help='Terminate all existing clusters.') @click.option('--yes', '-y', is_flag=True, @@ -1801,9 +1801,9 @@ def down( yes: bool, purge: bool, ): - """Tear down cluster(s). + """Terminate cluster(s). - CLUSTER is the name of the cluster (or glob pattern) to tear down. If both + CLUSTER is the name of the cluster (or glob pattern) to terminate. If both CLUSTER and ``--all`` are supplied, the latter takes precedence. Terminating a cluster will delete all associated resources (all billing @@ -1817,16 +1817,16 @@ def down( .. code-block:: bash - # Tear down a specific cluster. + # Terminate a specific cluster. sky down cluster_name \b - # Tear down multiple clusters. + # Terminate multiple clusters. sky down cluster1 cluster2 \b - # Tear down all clusters matching glob pattern 'cluster*'. + # Terminate all clusters matching glob pattern 'cluster*'. sky down "cluster*" \b - # Tear down all existing clusters. + # Terminate all existing clusters. sky down -a """ @@ -3233,7 +3233,7 @@ def benchmark_down( clusters_to_exclude: List[str], yes: bool, ) -> None: - """Tear down all clusters belonging to a benchmark.""" + """Terminate all clusters belonging to a benchmark.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: raise click.BadParameter(f'Benchmark {benchmark} does not exist.') @@ -3259,7 +3259,7 @@ def benchmark_down( '-a', default=None, is_flag=True, - help='Tear down all existing clusters.') + help='Terminate all existing clusters.') @click.option('--yes', '-y', is_flag=True, From bce99fc736e2ec62d816de3d8deaf4acc4808d97 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 01:09:37 -0700 Subject: [PATCH 20/36] fix comments --- sky/backends/cloud_vm_ray_backend.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index df4993ac050..8aae605a96d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1117,10 +1117,6 @@ def ray_up(): # `--log-style` and `--log-color` flags do not work. To reproduce, # `ray up --log-style pretty --log-color true | tee tmp.out`. - # Use environment variables to disable the ray usage collection (to - # avoid the 10 second wait for usage collection confirmation), as - # the ray version on the user's machine may be lower version that - # does not support the `--disable-usage-stats` flag. returncode, stdout, stderr = log_lib.run_with_log( # NOTE: --no-restart solves the following bug. Without it, if # 'ray up' (sky launch) twice on a cluster with >1 node, the @@ -1138,9 +1134,15 @@ def ray_up(): line_processor=log_utils.RayUpLineProcessor(), # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging # time during 'ray up' if insufficient capacity occurs. - env=dict(BOTO_MAX_RETRIES='5', - RAY_USAGE_STATS_ENABLED='0', - **os.environ), + env=dict( + BOTO_MAX_RETRIES='5', + # Use environment variables to disable the ray usage stats + # (to avoid the 10 second wait for usage collection + # confirmation), as the ray version on the user's machine + # may be lower version that does not support the + # `--disable-usage-stats` flag. + RAY_USAGE_STATS_ENABLED='0', + **os.environ), require_outputs=True, # Disable stdin to avoid ray outputs mess up the terminal with # misaligned output when multithreading/multiprocessing are used @@ -1341,16 +1343,14 @@ def _ensure_cluster_ray_started(self, 'is installed or running correctly.') backend.run_on_head(handle, 'ray stop', use_cached_head_ip=False) - # Use environment variables to disable the ray usage collection (avoid - # the 10 second wait for usage collection confirmation), as the ray - # version on the user's machine may be lower version that does not - # support the `--disable-usage-stats` flag. - env = os.environ.copy() - env['RAY_USAGE_STATS_ENABLED'] = '0' log_lib.run_with_log( ['ray', 'up', '-y', '--restart-only', handle.cluster_yaml], log_abs_path, stream_logs=False, + # Use environment variables to disable the ray usage collection + # (avoid the 10 second wait for usage collection confirmation), + # as the ray version on the user's machine may be lower version + # that does not support the `--disable-usage-stats` flag. env=dict(RAY_USAGE_STATS_ENABLED='0', **os.environ), # Disable stdin to avoid ray outputs mess up the terminal with # misaligned output when multithreading/multiprocessing is used. From c214028687c1ea0e476f3cab0c423835c190cd04 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 01:19:46 -0700 Subject: [PATCH 21/36] Change back to tear down --- sky/cli.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 5ca8e275e33..d28fb39d39e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -13,10 +13,10 @@ # Show the list of running clusters. >> sky status - # Terminate a specific cluster. + # Tear down a specific cluster. >> sky down cluster_name - # Terminate all existing clusters. + # Tear down all existing clusters. >> sky down -a TODO: @@ -747,7 +747,7 @@ def _create_and_ssh_into_node( click.secho(f'sky {node_type}{option}', bold=True) click.echo('To stop the node:\t', nl=False) click.secho(f'sky stop {cluster_name}', bold=True) - click.echo('To terminate the node:\t', nl=False) + click.echo('To tear down the node:\t', nl=False) click.secho(f'sky down {cluster_name}', bold=True) click.echo('To upload a folder:\t', nl=False) click.secho(f'rsync -rP /local/path {cluster_name}:/remote/path', bold=True) @@ -1000,7 +1000,7 @@ def cli(): is_flag=True, required=False, help= - ('Terminate the cluster after execution finishes (successfully or ' + ('Tear down the cluster after execution finishes (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' 'be torn down after the idle time, rather than immediately after execution' ' finishes.'), @@ -1523,7 +1523,7 @@ def stop( default=False, is_flag=True, required=False, - help='Terminate the cluster instead of stopping it, when auto-stopping ' + help='Tear down the cluster instead of stopping it, when auto-stopping ' '(i.e., auto-down rather than auto-stop).') @click.option('--yes', '-y', @@ -1545,7 +1545,7 @@ def autostop( CLUSTERS are the name (or glob pattern) of the clusters to stop. If both CLUSTERS and ``--all`` are supplied, the latter takes precedence. - If --down is passed, autodown (terminate the cluster; non-restartable) is + If --down is passed, autodown (tear down the cluster; non-restartable) is used, rather than autostop (restartable). ``--idle-minutes`` is the number of minutes of idleness (no pending/running @@ -1622,7 +1622,7 @@ def autostop( is_flag=True, required=False, help= - ('Terminate the cluster after execution finishes (successfully or ' + ('Tear down the cluster after execution finishes (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' 'be torn down after the idle time, rather than immediately after execution' ' finishes.'), @@ -1801,9 +1801,9 @@ def down( yes: bool, purge: bool, ): - """Terminate cluster(s). + """Tear down cluster(s). - CLUSTER is the name of the cluster (or glob pattern) to terminate. If both + CLUSTER is the name of the cluster (or glob pattern) to tear down. If both CLUSTER and ``--all`` are supplied, the latter takes precedence. Terminating a cluster will delete all associated resources (all billing @@ -1817,16 +1817,16 @@ def down( .. code-block:: bash - # Terminate a specific cluster. + # Tear down a specific cluster. sky down cluster_name \b - # Terminate multiple clusters. + # Tear down multiple clusters. sky down cluster1 cluster2 \b - # Terminate all clusters matching glob pattern 'cluster*'. + # Tear down all clusters matching glob pattern 'cluster*'. sky down "cluster*" \b - # Terminate all existing clusters. + # Tear down all existing clusters. sky down -a """ @@ -3233,7 +3233,7 @@ def benchmark_down( clusters_to_exclude: List[str], yes: bool, ) -> None: - """Terminate all clusters belonging to a benchmark.""" + """Tear down all clusters belonging to a benchmark.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: raise click.BadParameter(f'Benchmark {benchmark} does not exist.') @@ -3259,7 +3259,7 @@ def benchmark_down( '-a', default=None, is_flag=True, - help='Terminate all existing clusters.') + help='Delete all benchmark reports from the history.') @click.option('--yes', '-y', is_flag=True, From ccdd792661b71522644baa0ce285a1b33bd1e01a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 01:23:20 -0700 Subject: [PATCH 22/36] Change to tear down --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index d28fb39d39e..7a87b3fb57e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1780,7 +1780,7 @@ def start( '-a', default=None, is_flag=True, - help='Terminate all existing clusters.') + help='Tear down all existing clusters.') @click.option('--yes', '-y', is_flag=True, From 5425c21bcae61d09b89ec758ae78e3c0ee667be1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 01:24:55 -0700 Subject: [PATCH 23/36] fix comment --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 7a87b3fb57e..ac2eabd6f0f 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1806,7 +1806,7 @@ def down( CLUSTER is the name of the cluster (or glob pattern) to tear down. If both CLUSTER and ``--all`` are supplied, the latter takes precedence. - Terminating a cluster will delete all associated resources (all billing + Tearing down a cluster will delete all associated resources (all billing stops), and any data on the attached disks will be lost. For local clusters, `sky down` does not terminate the local cluster, but instead removes the cluster from `sky status` and terminates the calling user's running jobs. From 7e309b4198e2de13f2c83894c0d7fd87745c1341 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 14:47:06 -0700 Subject: [PATCH 24/36] change the logic of --down to use auto-down by default --- sky/cli.py | 10 ++++------ sky/execution.py | 45 +++++++++++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index ac2eabd6f0f..38b37ae4945 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1000,10 +1000,9 @@ def cli(): is_flag=True, required=False, help= - ('Tear down the cluster after execution finishes (successfully or ' + ('Tear down the cluster after all jobs completed (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' - 'be torn down after the idle time, rather than immediately after execution' - ' finishes.'), + 'be torn down after the specified idle time.'), ) @click.option( '--retry-until-up', @@ -1622,10 +1621,9 @@ def autostop( is_flag=True, required=False, help= - ('Tear down the cluster after execution finishes (successfully or ' + ('Tear down the cluster after all jobs completed (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' - 'be torn down after the idle time, rather than immediately after execution' - ' finishes.'), + 'be torn down after the specified idle time.'), ) @click.option( '--retry-until-up', diff --git a/sky/execution.py b/sky/execution.py index b2b2f24f42b..a466bf3d525 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -43,7 +43,6 @@ logger = sky_logging.init_logger(__name__) OptimizeTarget = optimizer.OptimizeTarget -_MAX_SPOT_JOB_LENGTH = 10 # Message thrown when APIs sky.{exec,launch,spot_launch}() received a string # instead of a Dag. CLI (cli.py) is implemented by us so should not trigger @@ -120,8 +119,8 @@ def _execute( dag: sky.Dag. dryrun: bool; if True, only print the provision info (e.g., cluster yaml). - down: bool; whether to tear down the launched resources after - execution (successfully or abnormally). If idle_minutes_to_autostop + down: bool; whether to tear down the launched resources after all jobs + completed (successfully or abnormally). If idle_minutes_to_autostop is set, the cluster will be torn down after the specified idle time. stream_logs: bool; whether to stream all tasks' outputs to the client. handle: Any; if provided, execution will use an existing backend cluster @@ -157,17 +156,27 @@ def _execute( existing_handle = global_user_state.get_handle_from_cluster_name( cluster_name) cluster_exists = existing_handle is not None + + stages = stages if stages is not None else list(Stage) backend = backend if backend is not None else backends.CloudVmRayBackend() - if not isinstance(backend, backends.CloudVmRayBackend - ) and idle_minutes_to_autostop is not None: - # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend. - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Backend {backend.NAME} does not support autostop, please try ' - f'{backends.CloudVmRayBackend.NAME}') + if isinstance(backend, backends.CloudVmRayBackend): + if down: + # Set the idle minutes to >= 1 to avoid the cluster being torn + # down during the task submission. + idle_minutes_to_autostop = idle_minutes_to_autostop or 1 + # Use auto-down to terminate the cluster after the task is done. + # Otherwise, the cluster will be immediately terminated when the user + # detach from the execution. + stages.remove(Stage.DOWN) + elif idle_minutes_to_autostop is not None: + # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend. + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Backend {backend.NAME} does not support autostop, please try ' + f'{backends.CloudVmRayBackend.NAME}') - if not cluster_exists and (stages is None or Stage.OPTIMIZE in stages): + if not cluster_exists and Stage.OPTIMIZE in stages: if task.best_resources is None: # TODO: fix this for the situation where number of requested # accelerators is not an integer. @@ -187,7 +196,7 @@ def _execute( task.sync_storage_mounts() try: - if stages is None or Stage.PROVISION in stages: + if Stage.PROVISION in stages: if handle is None: handle = backend.provision(task, task.best_resources, @@ -200,26 +209,26 @@ def _execute( logger.info('Dry run finished.') return - if stages is None or Stage.SYNC_WORKDIR in stages: + if Stage.SYNC_WORKDIR in stages: if task.workdir is not None: backend.sync_workdir(handle, task.workdir) - if stages is None or Stage.SYNC_FILE_MOUNTS in stages: + if Stage.SYNC_FILE_MOUNTS in stages: backend.sync_file_mounts(handle, task.file_mounts, task.storage_mounts) if no_setup: logger.info('Setup commands skipped.') - elif stages is None or Stage.SETUP in stages: + elif Stage.SETUP in stages: backend.setup(handle, task) - if stages is None or Stage.PRE_EXEC in stages: + if Stage.PRE_EXEC in stages: if idle_minutes_to_autostop is not None: backend.set_autostop(handle, idle_minutes_to_autostop, down=down) - if stages is None or Stage.EXEC in stages: + if Stage.EXEC in stages: try: global_user_state.update_last_use(handle.get_cluster_name()) backend.execute(handle, task, detach_run) @@ -227,7 +236,7 @@ def _execute( # Enables post_execute() to be run after KeyboardInterrupt. backend.post_execute(handle, down) - if stages is None or Stage.DOWN in stages: + if Stage.DOWN in stages: if down and idle_minutes_to_autostop is None: backend.teardown_ephemeral_storage(task) backend.teardown(handle, terminate=True) From b625173ae58a7dcf9db2db445cf377b014a47932 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 21:40:55 -0700 Subject: [PATCH 25/36] Use autodown for --down and address comments --- sky/cli.py | 27 +++++++++++----------- sky/core.py | 10 ++++---- sky/execution.py | 36 +++++++++++++++++------------ sky/utils/cli_utils/status_utils.py | 4 ++-- tests/test_smoke.py | 2 +- tests/test_spot.py | 2 +- 6 files changed, 44 insertions(+), 37 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 38b37ae4945..ed516a1d46b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -993,7 +993,7 @@ def cli(): 'job queue. ' 'Setting this flag is equivalent to ' 'running ``sky launch -d ...`` and then ``sky autostop -i ``' - '. If not set, the cluster will not be auto-stopped.')) + '. If not set, the cluster will not be autostopped.')) @click.option( '--down', default=False, @@ -1002,7 +1002,9 @@ def cli(): help= ('Tear down the cluster after all jobs completed (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' - 'be torn down after the specified idle time.'), + 'be torn down after the specified idle time. ' + 'Note that if errors occur during provisioning/data syncing/setting up, ' + 'the cluster will not be torn down for debugging purposes.'), ) @click.option( '--retry-until-up', @@ -1261,10 +1263,9 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin - STOPPED: The cluster is stopped and the storage is persisted. Use ``sky start`` to restart the cluster. - The autostop column indicates how long the cluster will be auto-stopped + The autostop column indicates how long the cluster will be autostopped after idling (no jobs running). If the time is followed by '(down)', e.g. - '1 min (down)', the cluster will be auto-downed, rather than auto-stopped. - + '1 min (down)', the cluster will be autodowned, rather than autostopped. """ cluster_records = core.status(all=all, refresh=refresh) local_clusters = onprem_utils.check_and_get_local_clusters( @@ -1511,19 +1512,19 @@ def stop( type=int, default=None, required=False, - help='Set the idle minutes before auto-stopping the cluster.') + help='Set the idle minutes before autostopping the cluster.') @click.option('--cancel', default=False, is_flag=True, required=False, - help='Cancel the auto-stopping.') + help='Cancel the autostopping.') @click.option( '--down', default=False, is_flag=True, required=False, - help='Tear down the cluster instead of stopping it, when auto-stopping ' - '(i.e., auto-down rather than auto-stop).') + help='Use autodown (tear down the cluster; non-restartable), instead ' + 'of autostop (restartable).') @click.option('--yes', '-y', is_flag=True, @@ -1539,7 +1540,7 @@ def autostop( down: bool, # pylint: disable=redefined-outer-name yes: bool, ): - """Schedule or cancel an auto-stop or auto-down for cluster(s). + """Schedule or cancel an autostop or autodown for cluster(s). CLUSTERS are the name (or glob pattern) of the clusters to stop. If both CLUSTERS and ``--all`` are supplied, the latter takes precedence. @@ -1614,7 +1615,7 @@ def autostop( 'job queue. ' 'Setting this flag is equivalent to ' 'running ``sky launch -d ...`` and then ``sky autostop -i ``' - '. If not set, the cluster will not be auto-stopped.')) + '. If not set, the cluster will not be autostopped.')) @click.option( '--down', default=False, @@ -1858,7 +1859,7 @@ def _down_or_stop_clusters( if idle_minutes_to_autostop is not None: verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' option_str = 'down' if down else 'stop' - operation = f'{verb} auto-{option_str} on' + operation = f'{verb} auto{option_str} on' if len(names) > 0: reserved_clusters = [ @@ -2838,7 +2839,7 @@ def bench(): help=('Automatically stop the cluster after this many minutes ' 'of idleness after setup/file_mounts. This is equivalent to ' 'running `sky launch -d ...` and then `sky autostop -i `. ' - 'If not set, the cluster will not be auto-stopped.')) + 'If not set, the cluster will not be autostopped.')) @click.option('--yes', '-y', is_flag=True, diff --git a/sky/core.py b/sky/core.py index 340e178256a..e1cc790cc6d 100644 --- a/sky/core.py +++ b/sky/core.py @@ -42,8 +42,8 @@ def status(all: bool, refresh: bool) -> List[Dict[str, Any]]: 'last_use': (int) timestamp of last use, 'status': (sky.ClusterStatus) cluster status, 'autostop': (int) idle time before autostop, - 'to_down': (bool) whether to tear down the cluster after - execution (autostop), + 'to_down': (bool) whether autodown is used instead of + autostop, 'metadata': (dict) metadata of the cluster, } ] @@ -183,7 +183,7 @@ def autostop( """ verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' option_str = 'down' if down else 'stop' - operation = f'{verb} auto-{option_str} on' + operation = f'{verb} auto{option_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( f'{operation} sky reserved cluster {cluster_name!r} ' @@ -205,7 +205,7 @@ def autostop( raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}{operation} cluster ' f'{cluster_name!r}... skipped{colorama.Style.RESET_ALL}' - f'\n Auto-{option_str} is only supported by backend: ' + f'\n auto{option_str} is only supported by backend: ' f'{backends.CloudVmRayBackend.NAME}') if cluster_status != global_user_state.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): @@ -213,7 +213,7 @@ def autostop( f'{colorama.Fore.YELLOW}{operation} cluster ' f'{cluster_name!r} (status: {cluster_status.value})... skipped' f'{colorama.Style.RESET_ALL}' - f'\n Auto-{option_str} can only be set/unset for ' + f'\n auto{option_str} can only be set/unset for ' f'{global_user_state.ClusterStatus.UP.value} clusters.') backend.set_autostop(handle, idle_minutes_to_autostop, down) diff --git a/sky/execution.py b/sky/execution.py index a466bf3d525..cd12d19df6f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -122,6 +122,8 @@ def _execute( down: bool; whether to tear down the launched resources after all jobs completed (successfully or abnormally). If idle_minutes_to_autostop is set, the cluster will be torn down after the specified idle time. + Note that if errors occur during provisioning/data syncing/setting up, + the cluster will not be torn down for debugging purposes. stream_logs: bool; whether to stream all tasks' outputs to the client. handle: Any; if provided, execution will use an existing backend cluster handle instead of provisioning a new one. @@ -156,25 +158,29 @@ def _execute( existing_handle = global_user_state.get_handle_from_cluster_name( cluster_name) cluster_exists = existing_handle is not None - + stages = stages if stages is not None else list(Stage) backend = backend if backend is not None else backends.CloudVmRayBackend() if isinstance(backend, backends.CloudVmRayBackend): - if down: - # Set the idle minutes to >= 1 to avoid the cluster being torn - # down during the task submission. - idle_minutes_to_autostop = idle_minutes_to_autostop or 1 - # Use auto-down to terminate the cluster after the task is done. - # Otherwise, the cluster will be immediately terminated when the user - # detach from the execution. - stages.remove(Stage.DOWN) + if down and idle_minutes_to_autostop is None: + # Use autostop(down) to terminate the cluster after the task is + # done. Otherwise, the cluster will be immediately terminated when + # the user detach from the execution. + idle_minutes_to_autostop = 0 + if idle_minutes_to_autostop is not None: + if idle_minutes_to_autostop == 0: + verb = 'torn down' if down else 'stopped' + logger.debug('Setting idle_minutes_to_autostop to 1, to avoid ' + f'cluster being {verb} during task submission.') + idle_minutes_to_autostop = 1 + stages.remove(Stage.DOWN) elif idle_minutes_to_autostop is not None: - # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend. - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Backend {backend.NAME} does not support autostop, please try ' - f'{backends.CloudVmRayBackend.NAME}') + # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend. + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Backend {backend.NAME} does not support autostop, please try ' + f'{backends.CloudVmRayBackend.NAME}') if not cluster_exists and Stage.OPTIMIZE in stages: if task.best_resources is None: @@ -280,7 +286,7 @@ def launch( auto-generate a name. retry_until_up: whether to retry launching the cluster until it is up. - idle_minutes_to_autostop: if provided, the cluster will be auto-stop + idle_minutes_to_autostop: if provided, the cluster will be autostop after this many minutes of idleness. no_setup: if true, the cluster will not re-run setup instructions diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 5d464dfd9df..264aa74b131 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -34,7 +34,7 @@ def calc(self, record): def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): """Compute cluster table values and display.""" - # TODO(zhwu): Update the information for auto-stop clusters. + # TODO(zhwu): Update the information for autostop clusters. status_columns = [ StatusColumn('NAME', _get_name), @@ -212,7 +212,7 @@ def _get_autostop(cluster_status): separtion = '' if cluster_status['autostop'] >= 0: # TODO(zhwu): check the status of the autostop cluster. - autostop_str = str(cluster_status['autostop']) + ' min' + autostop_str = str(cluster_status['autostop']) + 'm' separtion = ' ' if cluster_status['to_down']: diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 27942d7fcce..004c131e05c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -539,7 +539,7 @@ def test_autodown(): 'sleep 240', # Ensure the cluster is terminated. f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', - f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 -i 1 --down examples/minimal.yaml', + f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud aws examples/minimal.yaml', 'sleep 240', diff --git a/tests/test_spot.py b/tests/test_spot.py index d1bc6a7c5fd..549411a4dcc 100644 --- a/tests/test_spot.py +++ b/tests/test_spot.py @@ -141,7 +141,7 @@ def test_autostop_spot_controller(self, _mock_cluster_state): cli_runner = cli_testing.CliRunner() result = cli_runner.invoke(cli.autostop, [spot.SPOT_CONTROLLER_NAME]) assert result.exit_code == click.UsageError.exit_code - assert ('Scheduling auto-stop on reserved cluster(s) ' + assert ('Scheduling autostop on reserved cluster(s) ' f'\'{spot.SPOT_CONTROLLER_NAME}\' is not supported' in result.output) From 306671d934c213a67e449347a81d7f85da21d601 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 21:45:18 -0700 Subject: [PATCH 26/36] fix comment --- sky/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index ed516a1d46b..54975df2424 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1264,8 +1264,9 @@ def status(all: bool, refresh: bool): # pylint: disable=redefined-builtin ``sky start`` to restart the cluster. The autostop column indicates how long the cluster will be autostopped - after idling (no jobs running). If the time is followed by '(down)', e.g. - '1 min (down)', the cluster will be autodowned, rather than autostopped. + after minutes of idling (no jobs running). If the time is followed by + '(down)', e.g. '1m (down)', the cluster will be autodowned, rather than + autostopped. """ cluster_records = core.status(all=all, refresh=refresh) local_clusters = onprem_utils.check_and_get_local_clusters( From 5aff9e431c96d156c1fda5b88e5c4e9845136c25 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 21:53:20 -0700 Subject: [PATCH 27/36] fix ux --- sky/execution.py | 6 ++++-- sky/utils/cli_utils/status_utils.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index cd12d19df6f..28f244612f9 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -171,8 +171,10 @@ def _execute( if idle_minutes_to_autostop is not None: if idle_minutes_to_autostop == 0: verb = 'torn down' if down else 'stopped' - logger.debug('Setting idle_minutes_to_autostop to 1, to avoid ' - f'cluster being {verb} during task submission.') + logger.warning(f'{colorama.Fore.LIGHTBLACK_EX}Setting ' + 'idle_minutes_to_autostop to 1, to avoid ' + f'cluster being {verb} during task submission.' + f'{colorama.Style.RESET_ALL}') idle_minutes_to_autostop = 1 stages.remove(Stage.DOWN) elif idle_minutes_to_autostop is not None: diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 264aa74b131..266aac91c2b 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -72,8 +72,8 @@ def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): if pending_autostop: click.echo( '\n' - f'You have {pending_autostop} clusters with autostop scheduled.' - ' Refresh statuses with: `sky status --refresh`.') + f'You have {pending_autostop} clusters with autostop(down) ' + 'scheduled. Refresh statuses with: `sky status --refresh`.') else: click.echo('No existing clusters.') From 2cda239aa87a23a7bb5e2e976fb126382b9722be Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 22:04:00 -0700 Subject: [PATCH 28/36] Add test for cancel --- tests/test_smoke.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 004c131e05c..52a461a6c2f 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -535,16 +535,22 @@ def test_autodown(): f'sky launch -y -d -c {name} --num-nodes 2 --cloud gcp examples/minimal.yaml', f'sky autostop -y {name} --down -i 1', # Ensure autostop is set. - f'sky status | grep {name} | grep "1 min"', + f'sky status | grep {name} | grep "1m (down)"', 'sleep 240', # Ensure the cluster is terminated. f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud aws examples/minimal.yaml', + f'sky status | grep {name} | grep "1m (down)"', 'sleep 240', # Ensure the cluster is terminated. f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', + f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', + f'sky autostop -y {name} --cancel', + 'sleep 240', + # Ensure the cluster is still UP. + f's=$(sky status --refresh) && printf "$s" && echo $s | grep {name} | grep UP', ], f'sky down -y {name}', timeout=20 * 60, From 787ac90ddaab23416ab7b465ba29eb84d9b76f2f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 23:00:33 -0700 Subject: [PATCH 29/36] fix UX --- sky/cli.py | 5 ++++- sky/core.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 54975df2424..564ad450aa4 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1858,8 +1858,11 @@ def _down_or_stop_clusters( operation = 'Terminating' if down else 'Stopping' if idle_minutes_to_autostop is not None: - verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' + is_cancel = idle_minutes_to_autostop < 0 + verb = 'Cancelling' if is_cancel else 'Scheduling' option_str = 'down' if down else 'stop' + if is_cancel: + option_str = 'stop(down)' operation = f'{verb} auto{option_str} on' if len(names) > 0: diff --git a/sky/core.py b/sky/core.py index e1cc790cc6d..e8a326eda9f 100644 --- a/sky/core.py +++ b/sky/core.py @@ -181,8 +181,11 @@ def autostop( sky.exceptions.NotSupportedError: the cluster is not supported. sky.exceptions.ClusterNotUpError: the cluster is not UP. """ - verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling' + is_cancel = idle_minutes_to_autostop < 0 + verb = 'Cancelling' if is_cancel else 'Scheduling' option_str = 'down' if down else 'stop' + if is_cancel: + option_str = 'stop(down)' operation = f'{verb} auto{option_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( From b7596b7827533a75e8cc414abd5c8a6661949d82 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 12 Oct 2022 23:08:05 -0700 Subject: [PATCH 30/36] fix test_smoke --- tests/test_smoke.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 52a461a6c2f..1dffb2c00f4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -538,19 +538,19 @@ def test_autodown(): f'sky status | grep {name} | grep "1m (down)"', 'sleep 240', # Ensure the cluster is terminated. - f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud aws examples/minimal.yaml', f'sky status | grep {name} | grep "1m (down)"', 'sleep 240', # Ensure the cluster is terminated. - f's=$(sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', f'sky autostop -y {name} --cancel', 'sleep 240', # Ensure the cluster is still UP. - f's=$(sky status --refresh) && printf "$s" && echo $s | grep {name} | grep UP', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && echo $s | grep {name} | grep UP', ], f'sky down -y {name}', timeout=20 * 60, From e34f88e911d6b0c077353ab42329c0de51d1de56 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 22:21:15 -0700 Subject: [PATCH 31/36] address comments --- sky/cli.py | 21 +++++++++++------- sky/core.py | 2 +- sky/execution.py | 34 ++++++++++++++++++++++------- sky/utils/cli_utils/status_utils.py | 2 +- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 564ad450aa4..80c13f043d3 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1000,7 +1000,7 @@ def cli(): is_flag=True, required=False, help= - ('Tear down the cluster after all jobs completed (successfully or ' + ('Tear down the cluster after all jobs finish (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' 'be torn down after the specified idle time. ' 'Note that if errors occur during provisioning/data syncing/setting up, ' @@ -1514,11 +1514,13 @@ def stop( default=None, required=False, help='Set the idle minutes before autostopping the cluster.') -@click.option('--cancel', - default=False, - is_flag=True, - required=False, - help='Cancel the autostopping.') +@click.option( + '--cancel', + default=False, + is_flag=True, + required=False, + help='Cancel the currently active autostop/autodown setting for the ' + 'cluster.') @click.option( '--down', default=False, @@ -1560,6 +1562,9 @@ def autostop( If ``--idle-minutes`` and ``--cancel`` are not specified, default to 5 minutes. + When multiple configurations are specified for the same cluster, e.g. using + ``sky autostop`` or ``sky launch -i``, the last one takes precedence. + Examples: .. code-block:: bash @@ -1623,7 +1628,7 @@ def autostop( is_flag=True, required=False, help= - ('Tear down the cluster after all jobs completed (successfully or ' + ('Tear down the cluster after all jobs finish (successfully or ' 'abnormally). If --idle-minutes-to-autostop is also set, the cluster will ' 'be torn down after the specified idle time.'), ) @@ -1862,7 +1867,7 @@ def _down_or_stop_clusters( verb = 'Cancelling' if is_cancel else 'Scheduling' option_str = 'down' if down else 'stop' if is_cancel: - option_str = 'stop(down)' + option_str = '{stop,down}' operation = f'{verb} auto{option_str} on' if len(names) > 0: diff --git a/sky/core.py b/sky/core.py index e8a326eda9f..6bbe81a972c 100644 --- a/sky/core.py +++ b/sky/core.py @@ -185,7 +185,7 @@ def autostop( verb = 'Cancelling' if is_cancel else 'Scheduling' option_str = 'down' if down else 'stop' if is_cancel: - option_str = 'stop(down)' + option_str = '{stop,down}' operation = f'{verb} auto{option_str} on' if cluster_name in backend_utils.SKY_RESERVED_CLUSTER_NAMES: raise exceptions.NotSupportedError( diff --git a/sky/execution.py b/sky/execution.py index 28f244612f9..88e4e1223fa 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -120,8 +120,8 @@ def _execute( dryrun: bool; if True, only print the provision info (e.g., cluster yaml). down: bool; whether to tear down the launched resources after all jobs - completed (successfully or abnormally). If idle_minutes_to_autostop - is set, the cluster will be torn down after the specified idle time. + finish (successfully or abnormally). If idle_minutes_to_autostop is + also set, the cluster will be torn down after the specified idle time. Note that if errors occur during provisioning/data syncing/setting up, the cluster will not be torn down for debugging purposes. stream_logs: bool; whether to stream all tasks' outputs to the client. @@ -139,7 +139,7 @@ def _execute( cluster_name: Name of the cluster to create/reuse. If None, auto-generate a name. detach_run: bool; whether to detach the process after the job submitted. - autostop_idle_minutes: int; if provided, the cluster will be set to + idle_minutes_to_autostop: int; if provided, the cluster will be set to autostop after this many minutes of idleness. no_setup: bool; whether to skip setup commands or not when (re-)launching. """ @@ -164,9 +164,8 @@ def _execute( backend = backend if backend is not None else backends.CloudVmRayBackend() if isinstance(backend, backends.CloudVmRayBackend): if down and idle_minutes_to_autostop is None: - # Use autostop(down) to terminate the cluster after the task is - # done. Otherwise, the cluster will be immediately terminated when - # the user detach from the execution. + # Use auto{stop,down} to terminate the cluster after the task is + # done. idle_minutes_to_autostop = 0 if idle_minutes_to_autostop is not None: if idle_minutes_to_autostop == 0: @@ -288,8 +287,27 @@ def launch( auto-generate a name. retry_until_up: whether to retry launching the cluster until it is up. - idle_minutes_to_autostop: if provided, the cluster will be autostop - after this many minutes of idleness. + idle_minutes_to_autostop: automatically stop the cluster after this + many minute of idleness, i.e., no running or pending jobs in the + cluster's job queue. Idleness starts counting after + setup/file_mounts are done; the clock gets reset whenever there + are running/pending jobs in the job queue. Setting this flag is + equivalent to running ``sky.launch(..., detach_run=True, ...)`` + and then ``sky.autostop(idle_minutes=)``. If not set, + the cluster will not be autostopped. + down: Tear down the cluster after all jobs finish (successfully or + abnormally). If --idle-minutes-to-autostop is also set, the + cluster will be torn down after the specified idle time. + Note that if errors occur during provisioning/data syncing/setting + up, the cluster will not be torn down for debugging purposes. + dryrun: if True, do not actually launch the cluster. + stream_logs: if True, show the logs in the terminal. + backend: backend to use. If None, use the default backend + (CloudVMRayBackend). + optimize_target: target to optimize for. Choices: OptimizeTarget.COST, + OptimizeTarget.TIME. + detach_run: If True, run setup first (blocking), then detach from the + job's execution. no_setup: if true, the cluster will not re-run setup instructions Examples: diff --git a/sky/utils/cli_utils/status_utils.py b/sky/utils/cli_utils/status_utils.py index 266aac91c2b..6931ee875ce 100644 --- a/sky/utils/cli_utils/status_utils.py +++ b/sky/utils/cli_utils/status_utils.py @@ -72,7 +72,7 @@ def show_status_table(cluster_records: List[Dict[str, Any]], show_all: bool): if pending_autostop: click.echo( '\n' - f'You have {pending_autostop} clusters with autostop(down) ' + f'You have {pending_autostop} clusters with auto{{stop,down}} ' 'scheduled. Refresh statuses with: `sky status --refresh`.') else: click.echo('No existing clusters.') From faee1a01cbed2980307497b80baf9a02a2f6dc94 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 22:22:36 -0700 Subject: [PATCH 32/36] fix --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 80c13f043d3..163abfa9bd5 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1519,7 +1519,7 @@ def stop( default=False, is_flag=True, required=False, - help='Cancel the currently active autostop/autodown setting for the ' + help='Cancel the currently active auto{stop,down} setting for the ' 'cluster.') @click.option( '--down', From f653c849e7a36f7642709bc1517108a07760469f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 23:30:19 -0700 Subject: [PATCH 33/36] fix logging and comment --- sky/execution.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 892eccea6fe..412aac0a0d2 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -169,11 +169,16 @@ def _execute( idle_minutes_to_autostop = 0 if idle_minutes_to_autostop is not None: if idle_minutes_to_autostop == 0: + # idle_minutes_to_autostop=0 can cause the following problem: + # After we set the autostop in the PRE_EXEC stage with -i 0, + # it could be possible that the cluster immediately found + # itself have no task running and start the auto{stop,down} + # process, before the task is submitted in the EXEC stage. verb = 'torn down' if down else 'stopped' - logger.warning(f'{colorama.Fore.LIGHTBLACK_EX}Setting ' - 'idle_minutes_to_autostop to 1, to avoid ' - f'cluster being {verb} during task submission.' - f'{colorama.Style.RESET_ALL}') + logger.info(f'{colorama.Fore.LIGHTBLACK_EX}The cluster will ' + f'be {verb} after 1 minutes of idleness ' + '(after all jobs finish).' + f'{colorama.Style.RESET_ALL}') idle_minutes_to_autostop = 1 stages.remove(Stage.DOWN) elif idle_minutes_to_autostop is not None: From 57343b8c1ccdeac3e118640f1da79fb5d5db1ffb Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 14 Oct 2022 01:13:46 -0700 Subject: [PATCH 34/36] fix environment variable overwrite --- sky/backends/cloud_vm_ray_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index e06f3f9eb09..fc22bdf91a6 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1137,14 +1137,14 @@ def ray_up(): # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging # time during 'ray up' if insufficient capacity occurs. env=dict( + os.environ, BOTO_MAX_RETRIES='5', # Use environment variables to disable the ray usage stats # (to avoid the 10 second wait for usage collection # confirmation), as the ray version on the user's machine # may be lower version that does not support the # `--disable-usage-stats` flag. - RAY_USAGE_STATS_ENABLED='0', - **os.environ), + RAY_USAGE_STATS_ENABLED='0'), require_outputs=True, # Disable stdin to avoid ray outputs mess up the terminal with # misaligned output when multithreading/multiprocessing are used @@ -1353,7 +1353,7 @@ def _ensure_cluster_ray_started(self, # (avoid the 10 second wait for usage collection confirmation), # as the ray version on the user's machine may be lower version # that does not support the `--disable-usage-stats` flag. - env=dict(RAY_USAGE_STATS_ENABLED='0', **os.environ), + env=dict(os.environ, RAY_USAGE_STATS_ENABLED='0'), # Disable stdin to avoid ray outputs mess up the terminal with # misaligned output when multithreading/multiprocessing is used. # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long From 1d3219738a5e75dfbba6900b569f3689f2d18fa4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 14 Oct 2022 01:51:54 -0700 Subject: [PATCH 35/36] fix smoke test --- tests/test_smoke.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5369e072317..57af9ffa9cf 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -512,7 +512,7 @@ def test_autostop(): f'sky launch -y -d -c {name} --num-nodes 2 examples/minimal.yaml', f'sky autostop -y {name} -i 1', # Ensure autostop is set. - f'sky status | grep {name} | grep "1 min"', + f'sky status | grep {name} | grep "1m"', 'sleep 180', # Ensure the cluster is STOPPED. f'sky status --refresh | grep {name} | grep STOPPED', @@ -539,14 +539,14 @@ def test_autodown(): f'sky status | grep {name} | grep "1m (down)"', 'sleep 240', # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "The following cluster" | grep "terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud aws examples/minimal.yaml', f'sky status | grep {name} | grep "1m (down)"', 'sleep 240', # Ensure the cluster is terminated. - f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "was terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', + f's=$(SKYPILOT_DEBUG=0 sky status --refresh) && printf "$s" && {{ echo $s | grep {name} | grep "The following cluster" | grep "terminated"; }} || {{ echo $s | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud aws --num-nodes 2 --down examples/minimal.yaml', f'sky autostop -y {name} --cancel', 'sleep 240', From 83dbdff6af10e35a3fb2c337fb3957dbd8c2b457 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 14 Oct 2022 01:53:04 -0700 Subject: [PATCH 36/36] print info --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 57af9ffa9cf..3560feab72c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -763,7 +763,7 @@ def test_inline_spot_env(): [ f'sky spot launch -n {name} -y --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKY_NODE_IPS\\" ]] && [[ ! -z \\"\$SKY_NODE_RANK\\" ]]) || exit 1"', 'sleep 10', - f'sky spot status | grep {name} | grep SUCCEEDED', + f's=$(sky spot status) && printf "$s" && echo "$s" | grep {name} | grep SUCCEEDED', ], f'sky spot cancel -y -n {name}', )