Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Autodown] Support for autodown #1217

Merged
merged 38 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c54ef6c
Support for autodown
Michaelvll Oct 10, 2022
10f41e6
Change API to terminate
Michaelvll Oct 10, 2022
420510f
fix flag
Michaelvll Oct 10, 2022
9a741aa
fix autostop
Michaelvll Oct 10, 2022
f057f3d
fix comment
Michaelvll Oct 10, 2022
b646909
address comment
Michaelvll Oct 10, 2022
a6ccef8
address comment
Michaelvll Oct 10, 2022
0660eb1
format
Michaelvll Oct 10, 2022
2a45671
Rename terminate to down
Michaelvll Oct 10, 2022
d89fe93
add smoke test
Michaelvll Oct 11, 2022
c2a4c4a
fix autodown for multi-node
Michaelvll Oct 11, 2022
6354619
format
Michaelvll Oct 11, 2022
18bc534
fix syntax
Michaelvll Oct 11, 2022
8658d3e
use gcp for autodown test
Michaelvll Oct 11, 2022
8280c9d
fix smoke test
Michaelvll Oct 11, 2022
5a08c84
fix smoke test
Michaelvll Oct 11, 2022
59ced1d
address comments
Michaelvll Oct 12, 2022
f3b357e
Add comment
Michaelvll Oct 12, 2022
5198b1a
Switch back to terminate
Michaelvll Oct 12, 2022
bce99fc
fix comments
Michaelvll Oct 12, 2022
c214028
Change back to tear down
Michaelvll Oct 12, 2022
ccdd792
Change to tear down
Michaelvll Oct 12, 2022
5425c21
fix comment
Michaelvll Oct 12, 2022
7e309b4
change the logic of --down to use auto-down by default
Michaelvll Oct 12, 2022
b625173
Use autodown for --down and address comments
Michaelvll Oct 13, 2022
306671d
fix comment
Michaelvll Oct 13, 2022
5aff9e4
fix ux
Michaelvll Oct 13, 2022
2cda239
Add test for cancel
Michaelvll Oct 13, 2022
787ac90
fix UX
Michaelvll Oct 13, 2022
b7596b7
fix test_smoke
Michaelvll Oct 13, 2022
e34f88e
address comments
Michaelvll Oct 14, 2022
faee1a0
fix
Michaelvll Oct 14, 2022
e012373
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Oct 14, 2022
f653c84
fix logging and comment
Michaelvll Oct 14, 2022
ca57e69
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Oct 14, 2022
57343b8
fix environment variable overwrite
Michaelvll Oct 14, 2022
1d32197
fix smoke test
Michaelvll Oct 14, 2022
83dbdff
print info
Michaelvll Oct 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2608,11 +2608,11 @@ def post_teardown_cleanup(self,
def set_autostop(self,
handle: ResourceHandle,
idle_minutes_to_autostop: Optional[int],
terminate: bool = True,
down: bool = False,
stream_logs: bool = True) -> None:
if idle_minutes_to_autostop is not None:
code = autostop_lib.AutostopCodeGen.set_autostop(
idle_minutes_to_autostop, self.NAME, terminate)
idle_minutes_to_autostop, self.NAME, down)
returncode, _, stderr = self.run_on_head(handle,
code,
require_outputs=True,
Expand Down
116 changes: 64 additions & 52 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ def _launch_with_confirm(
detach_run: bool,
no_confirm: bool = False,
idle_minutes_to_autostop: Optional[int] = None,
terminate: bool = False,
down: bool = False, # pylint: disable=redefined-outer-name
retry_until_up: bool = False,
no_setup: bool = False,
node_type: Optional[str] = None,
Expand Down Expand Up @@ -655,7 +655,7 @@ def _launch_with_confirm(
detach_run=detach_run,
backend=backend,
idle_minutes_to_autostop=idle_minutes_to_autostop,
terminate=terminate,
down=down,
retry_until_up=retry_until_up,
no_setup=no_setup,
)
Expand Down Expand Up @@ -995,13 +995,14 @@ def cli():
'running ``sky launch -d ...`` and then ``sky autostop -i <minutes>``'
'. If not set, the cluster will not be auto-stopped.'))
@click.option(
'--terminate',
'--down',
default=False,
is_flag=True,
required=False,
help=(
'Terminate the cluster after execution. If --idle-minutes-to-autostop '
'is set, the cluster will be torn down after the idle time.'),
help=
('Tear down the cluster after execution (successfully or abnormally). If '
'--idle-minutes-to-autostop is set, the cluster will be torn down after '
'the idle time.'),
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
)
@click.option(
'--retry-until-up',
Expand Down Expand Up @@ -1044,7 +1045,7 @@ def launch(
env: List[Dict[str, str]],
disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int],
terminate: bool,
down: bool, # pylint: disable=redefined-outer-name
retry_until_up: bool,
yes: bool,
no_setup: bool,
Expand Down Expand Up @@ -1095,7 +1096,7 @@ def launch(
detach_run=detach_run,
no_confirm=yes,
idle_minutes_to_autostop=idle_minutes_to_autostop,
terminate=terminate,
down=down,
retry_until_up=retry_until_up,
no_setup=no_setup,
is_local_cloud=onprem_utils.check_if_local_cloud(cluster))
Expand Down Expand Up @@ -1485,10 +1486,10 @@ def stop(
sky stop -a

"""
_terminate_or_stop_clusters(clusters,
apply_to_all=all,
terminate=False,
no_confirm=yes)
_down_or_stop_clusters(clusters,
apply_to_all=all,
down=False,
no_confirm=yes)


@cli.command(cls=_DocumentedCodeCommand)
Expand All @@ -1513,11 +1514,12 @@ def stop(
required=False,
help='Cancel the auto-stopping.')
@click.option(
'--terminate',
'--down',
default=False,
is_flag=True,
required=False,
help='Terminate the cluster instead of stopping it, when auto-stopping.')
help='Tear down the cluster instead of stopping it, when auto-stopping '
'(i.e., autodown rather than autostop).')
@click.option('--yes',
'-y',
is_flag=True,
Expand All @@ -1530,7 +1532,7 @@ def autostop(
all: Optional[bool], # pylint: disable=redefined-builtin
idle_minutes: Optional[int],
cancel: bool, # pylint: disable=redefined-outer-name
terminate: bool,
down: bool, # pylint: disable=redefined-outer-name
yes: bool,
):
"""Schedule or cancel auto-stopping for cluster(s).
Expand All @@ -1540,6 +1542,8 @@ def autostop(

``--idle-minutes`` is the number of minutes of idleness (no pending/running
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
jobs) after which the cluster will be stopped automatically.
Scheduling autostop twice on the same cluster will overwrite the previous
autostop schedule.

``--cancel`` will cancel the autostopping. If no autostop was scheduled
for the cluster, this has no effect.
Expand All @@ -1566,11 +1570,11 @@ def autostop(
idle_minutes = -1
elif idle_minutes is None:
idle_minutes = 5
_terminate_or_stop_clusters(clusters,
apply_to_all=all,
terminate=terminate,
no_confirm=yes,
idle_minutes_to_autostop=idle_minutes)
_down_or_stop_clusters(clusters,
apply_to_all=all,
down=down,
no_confirm=yes,
idle_minutes_to_autostop=idle_minutes)


@cli.command(cls=_DocumentedCodeCommand)
Expand Down Expand Up @@ -1605,13 +1609,14 @@ def autostop(
'running ``sky launch -d ...`` and then ``sky autostop -i <minutes>``'
'. If not set, the cluster will not be auto-stopped.'))
@click.option(
'--terminate',
'--down',
default=False,
is_flag=True,
required=False,
help=(
'Terminate the cluster after execution. If --idle-minutes-to-autostop '
'is set, the cluster will be torn down after the idle time.'),
help=
('Tear down the cluster after execution (successfully or abnormally). If '
'--idle-minutes-to-autostop is set, the cluster will be torn down after '
'the idle time.'),
)
@click.option(
'--retry-until-up',
Expand All @@ -1623,9 +1628,13 @@ def autostop(
'if we fail to start the cluster due to unavailability errors.'))
@usage_lib.entrypoint
# pylint: disable=redefined-builtin
def start(clusters: Tuple[str], all: bool, yes: bool,
idle_minutes_to_autostop: Optional[int], terminate: bool,
retry_until_up: bool):
def start(
clusters: Tuple[str],
all: bool,
yes: bool,
idle_minutes_to_autostop: Optional[int],
down: bool, # pylint: disable=redefined-outer-name
retry_until_up: bool):
"""Restart cluster(s).

If a cluster is previously stopped (status is STOPPED) or failed in
Expand Down Expand Up @@ -1653,6 +1662,9 @@ def start(clusters: Tuple[str], all: bool, yes: bool,
sky start -a

"""
if down and idle_minutes_to_autostop is None:
raise click.UsageError(
'--idle-minutes-to-autostop must be set if --down is set.')
to_start = []

if not clusters and not all:
Expand Down Expand Up @@ -1744,7 +1756,7 @@ def start(clusters: Tuple[str], all: bool, yes: bool,
core.start(name,
idle_minutes_to_autostop,
retry_until_up,
terminate=terminate)
down=down)
except exceptions.NotSupportedError as e:
click.echo(str(e))
click.secho(f'Cluster {name} started.', fg='green')
Expand Down Expand Up @@ -1809,37 +1821,37 @@ def down(
sky down -a

"""
_terminate_or_stop_clusters(clusters,
apply_to_all=all,
terminate=True,
no_confirm=yes,
purge=purge)
_down_or_stop_clusters(clusters,
apply_to_all=all,
down=True,
no_confirm=yes,
purge=purge)


def _terminate_or_stop_clusters(
def _down_or_stop_clusters(
names: Tuple[str],
apply_to_all: Optional[bool],
terminate: bool,
down: bool, # pylint: disable=redefined-outer-name
no_confirm: bool,
purge: bool = False,
idle_minutes_to_autostop: Optional[int] = None) -> None:
"""Terminates or (auto-)stops a cluster (or all clusters).
"""Tears down or (auto-)stops a cluster (or all clusters).

Reserved clusters (spot controller) can only be terminated if the cluster
name is explicitly and uniquely specified (not via glob) and purge is set
to True.
"""
command = 'down' if terminate else 'stop'
command = 'down' if down else 'stop'
if not names and apply_to_all is None:
raise click.UsageError(
f'sky {command} requires either a cluster name (see `sky status`) '
'or --all.')

operation = 'Terminating' if terminate else 'Stopping'
operation = 'Terminating' if down else 'Stopping'
if idle_minutes_to_autostop is not None:
verb = 'Scheduling' if idle_minutes_to_autostop >= 0 else 'Cancelling'
down_str = ' (terminate)' if terminate else ''
operation = f'{verb} auto-stop{down_str} on'
option_str = 'down' if down else 'stop'
operation = f'{verb} auto-{option_str} on'

if len(names) > 0:
reserved_clusters = [
Expand All @@ -1851,7 +1863,7 @@ def _terminate_or_stop_clusters(
name for name in _get_glob_clusters(names)
if name not in backend_utils.SKY_RESERVED_CLUSTER_NAMES
]
if not terminate:
if not down:
local_clusters = onprem_utils.check_and_get_local_clusters()
# Local clusters are allowed to `sky down`, but not
# `sky start/stop`. `sky down` unregisters the local cluster
Expand All @@ -1868,7 +1880,7 @@ def _terminate_or_stop_clusters(
if not purge:
msg = (f'{operation} reserved cluster(s) '
f'{reserved_clusters_str} is not supported.')
if terminate:
if down:
msg += (
'\nPlease specify --purge (-p) to force-terminate the '
'reserved cluster(s).')
Expand Down Expand Up @@ -1930,11 +1942,11 @@ def _terminate_or_stop_clusters(
f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]',
total=len(clusters))

def _terminate_or_stop(name: str):
def _down_or_stop(name: str):
success_progress = False
if idle_minutes_to_autostop is not None:
try:
core.autostop(name, idle_minutes_to_autostop, terminate)
core.autostop(name, idle_minutes_to_autostop, down)
except (exceptions.NotSupportedError,
exceptions.ClusterNotUpError) as e:
message = str(e)
Expand All @@ -1952,7 +1964,7 @@ def _terminate_or_stop(name: str):
f'{colorama.Style.RESET_ALL}')
else:
try:
if terminate:
if down:
core.down(name, purge=purge)
else:
core.stop(name, purge=purge)
Expand All @@ -1967,7 +1979,7 @@ def _terminate_or_stop(name: str):
message = (
f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
f'{colorama.Style.RESET_ALL}')
if not terminate:
if not down:
message += ('\n To restart the cluster, run: '
f'{colorama.Style.BRIGHT}sky start {name}'
f'{colorama.Style.RESET_ALL}')
Expand All @@ -1979,7 +1991,7 @@ def _terminate_or_stop(name: str):
progress.start()

with progress:
subprocess_utils.run_in_parallel(_terminate_or_stop, clusters)
subprocess_utils.run_in_parallel(_down_or_stop, clusters)
progress.live.transient = False
# Make sure the progress bar does not mess up the terminal.
progress.refresh()
Expand Down Expand Up @@ -3212,7 +3224,7 @@ def benchmark_down(
clusters_to_exclude: List[str],
yes: bool,
) -> None:
"""Terminate all clusters belonging to a benchmark."""
"""Tear down all clusters belonging to a benchmark."""
record = benchmark_state.get_benchmark_from_name(benchmark)
if record is None:
raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
Expand All @@ -3226,10 +3238,10 @@ def benchmark_down(
continue
to_stop.append(cluster)

_terminate_or_stop_clusters(to_stop,
apply_to_all=False,
terminate=True,
no_confirm=yes)
_down_or_stop_clusters(to_stop,
apply_to_all=False,
down=True,
no_confirm=yes)


@bench.command('delete', cls=_DocumentedCodeCommand)
Expand Down
Loading