Skip to content

Commit

Permalink
Fixes ray dashboard hanging problem (#1088) (#1109)
Browse files: browse the repository at this point in the history
* patch job_manager

* fix cloud API
  • Loading branch information
Michaelvll authored Aug 21, 2022
1 parent fcfe289 commit c224819
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 7 deletions.
5 changes: 1 addition & 4 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,7 @@ def is_same_cloud(self, other):
return isinstance(other, Azure)

@classmethod
def get_default_instance_type(cls,
accelerators: Optional[Dict[str, int]] = None
) -> str:
del accelerators
def get_default_instance_type(cls) -> str:
# 8 vCpus, 32 GB RAM. Prev-gen (as of 2021) general purpose.
return 'Standard_D8_v4'

Expand Down
4 changes: 1 addition & 3 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,7 @@ def get_accelerators_from_instance_type(
raise NotImplementedError

@classmethod
def get_default_instance_type(cls,
accelerators: Optional[Dict[str, int]] = None
) -> str:
def get_default_instance_type(cls) -> str:
raise NotImplementedError

@classmethod
Expand Down
3 changes: 3 additions & 0 deletions sky/skylet/ray_patches/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ def patch() -> None:
from ray.dashboard.modules.job import cli
_run_patch(cli.__file__, _to_absolute('cli.py.patch'))

from ray.dashboard.modules.job import job_manager
_run_patch(job_manager.__file__, _to_absolute('job_manager.py.patch'))

from ray.autoscaler._private import autoscaler
_run_patch(autoscaler.__file__, _to_absolute('autoscaler.py.patch'))

Expand Down
9 changes: 9 additions & 0 deletions sky/skylet/ray_patches/job_manager.py.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
0a1,4
> # Adapted from https://github.com/ray-project/ray/blob/ray-1.13.0/dashboard/modules/job/job_manager.py
> # Fixes a leak of the _monitor_job thread: `await job_supervisor.ping.remote()`
> # does not raise an exception after the job_supervisor has exited, which causes the dashboard to hang.
>
334c338
< await job_supervisor.ping.remote()
---
> ray.get(job_supervisor.ping.remote())

0 comments on commit c224819

Please sign in to comment.