forked from hojonathanho/diffusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrapper.py
160 lines (138 loc) · 5.17 KB
/
wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from pprint import pprint as pp
from contextlib import contextmanager
import sys
import os
import re
import six
from six.moves.urllib.error import URLError
from tensorflow.python import framework
from tensorflow.python.client import session
from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver
from tensorflow.contrib.cluster_resolver import TPUClusterResolver as BaseTPUClusterResolver
from tensorflow.python.eager.context import LogicalDevice
from tensorflow.python.framework import errors
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test
from tensorflow.python.training import server_lib
from tensorflow.python.util import compat
mock = test.mock
def reroute(addr, host=None):
    """Map a TPU worker endpoint (10.x.y.z:8470) to an equivalent port on `host`.

    Addresses that do not look like a TPU endpoint, or calls with no `host`,
    pass through unchanged. A grpc:// scheme is preserved around the rewrite.
    """
    if host is None or host is False:
        return addr
    scheme = 'grpc://'
    if addr.startswith(scheme):
        # Strip the scheme, reroute the bare endpoint, then put it back.
        return scheme + reroute(addr[len(scheme):], host=host)
    looks_like_tpu = re.match('[0-9]+[.][0-9]+[.][0-9]+[.][0-9]+[:]8470', addr)
    if not looks_like_tpu or not addr.endswith(':8470'):
        return addr
    first, second, third, fourth = [int(part) for part in addr.split(':')[0].split('.')]
    if first != 10:
        return addr
    if second in (48, 49):
        assert fourth == 2
        port = second * 1000 + third
    elif 2 <= second < 66 and third == 0:
        port = second * 1000 + fourth
    else:
        return addr
    return '%s:%s' % (host, port)
class TPUClusterResolver(BaseTPUClusterResolver):
    """Cluster resolver that reroutes TPU worker addresses through a proxy host
    and optionally restricts the worker list to a window of nodes.

    host / node_count / node_offset fall back to the TPU_HOST, TPU_NODE_COUNT
    and TPU_NODE_OFFSET environment variables when not given explicitly.
    """

    def __init__(self, *args, host=None, node_count=None, node_offset=None, **kws):
        super(TPUClusterResolver, self).__init__(*args, **kws)
        self._host = _tpu_host() if host is None else host
        if node_count is None and 'TPU_NODE_COUNT' in os.environ:
            node_count = int(os.environ['TPU_NODE_COUNT'])
        self._node_count = node_count
        if node_offset is None and 'TPU_NODE_OFFSET' in os.environ:
            node_offset = int(os.environ['TPU_NODE_OFFSET'])
        self._node_offset = node_offset

    def master(self, *args, **kws):
        """Return the master address, rerouted through the proxy host."""
        addr = super(TPUClusterResolver, self).master(*args, **kws)
        return reroute(addr, host=self._host)

    def cluster_spec(self):
        """Return the cluster spec with every address rerouted and the worker
        list trimmed to the configured node window."""
        base = super(TPUClusterResolver, self).cluster_spec()
        rerouted = {}
        for job, addrs in base.as_dict().items():
            rerouted[job] = [reroute(addr, host=self._host) for addr in addrs]
        count = self._node_count or len(rerouted['worker'])
        offset = self._node_offset or 0
        # Always keep worker 0, then take (count - 1) workers starting just
        # past the offset.
        rerouted['worker'] = (
            [rerouted['worker'][0]]
            + rerouted['worker'][offset + 1:offset + count]
        )
        spec = server_lib.ClusterSpec(rerouted)
        print(spec.as_cluster_def())
        return spec
# Keep a reference to the unpatched TPUClusterResolver.master so mock_master
# below can still call the real implementation after the class is patched.
_master = resolver.TPUClusterResolver.master
def _tpu_host():
return os.environ.get('TPU_HOST', '10.255.128.3')
def mock_master(cls, *args, **kws):
    """Patched TPUClusterResolver.master: reroute the real master address
    through the proxy host.

    Uses _tpu_host() (TPU_HOST with the module default) instead of reading
    os.environ['TPU_HOST'] directly, which raised KeyError when the variable
    was unset — inconsistent with the TPUClusterResolver subclass above.
    """
    addr = _master(cls, *args, **kws)
    return reroute(addr, host=_tpu_host())
# Unpatched TPUClusterResolver.cluster_spec, captured before patching so the
# replacement below can call through to the real implementation.
_cluster_spec = resolver.TPUClusterResolver.cluster_spec
def cluster_spec(cls, *args, **kws):
    """Patched TPUClusterResolver.cluster_spec: reroute every job address
    through the proxy host.

    Uses _tpu_host() (TPU_HOST with the module default) instead of reading
    os.environ['TPU_HOST'] per address, which both repeated the lookup inside
    the loop and raised KeyError when the variable was unset.
    """
    spec = _cluster_spec(cls, *args, **kws)
    host = _tpu_host()  # hoisted: one lookup for all jobs
    rerouted = {job: [reroute(addr, host=host) for addr in addrs]
                for job, addrs in spec.as_dict().items()}
    return server_lib.ClusterSpec(rerouted)
# Unpatched TPUClusterResolver._fetch_cloud_tpu_metadata, captured so the
# retrying replacement below can delegate to the real implementation.
__fetch_cloud_tpu_metadata = resolver.TPUClusterResolver._fetch_cloud_tpu_metadata
def _fetch_cloud_tpu_metadata(cls, *args, **kws):
    """Patched TPUClusterResolver._fetch_cloud_tpu_metadata: retry forever
    while the metadata endpoint refuses connections; any other error
    propagates immediately.

    Fixes: `raise e` replaced with bare `raise` so the original traceback is
    preserved, and the `import time` is done once instead of on every retry.
    """
    import time  # local import kept to match the original's lazy-import style
    while True:
        try:
            return __fetch_cloud_tpu_metadata(cls, *args, **kws)
        except Exception as e:
            if '[Errno 111] Connection refused' not in str(e):
                raise  # unrelated failure: re-raise with original traceback
            # Endpoint not reachable yet (e.g. proxy still starting) — retry.
            time.sleep(1.0)
@contextmanager
def patch_tensorflow():
    """Context manager that swaps TPUClusterResolver's master, cluster_spec
    and _fetch_cloud_tpu_metadata for the rerouting replacements above; all
    three patches are reverted on exit."""
    with mock.patch.object(resolver.TPUClusterResolver, 'master', mock_master), \
         mock.patch.object(resolver.TPUClusterResolver, 'cluster_spec', cluster_spec), \
         mock.patch.object(resolver.TPUClusterResolver, '_fetch_cloud_tpu_metadata', _fetch_cloud_tpu_metadata):
        yield
def patch_tensorflow_interactive():
    """Enter the patch_tensorflow() context without a with-block, so an
    interactive session stays patched; the returned context manager can be
    exited manually via its __exit__."""
    ctx = patch_tensorflow()
    ctx.__enter__()
    return ctx
if __name__ == '__main__':
    # Apply the TPU rerouting patches for the rest of the process.
    _tf_patch = patch_tensorflow_interactive()
    if len(sys.argv) <= 1:
        # No script argument: set up an interactive TPU session.
        from tensorflow.core.protobuf import config_pb2
        import tensorflow as tf
        import numpy as np
        session_config = config_pb2.ConfigProto(allow_soft_placement=True, isolate_session_state=True)
        master = None
        res = None
        cluster_spec = None
        cluster_def = None
        if 'TPU_NAME' in os.environ:
            # Resolve the named TPU (through the subclass above, so addresses
            # are rerouted) and attach its cluster definition to the session.
            res = resolver.TPUClusterResolver(os.environ['TPU_NAME'])
            master = res.get_master()
            cluster_spec = res.cluster_spec()
            if cluster_spec:
                cluster_def = cluster_spec.as_cluster_def()
                session_config.cluster_def.CopyFrom(cluster_def)
        graph = tf.Graph()
        # master is None when TPU_NAME is unset, i.e. a local session.
        sess = tf.compat.v1.InteractiveSession(master, graph=graph, config=session_config)
        devices = sess.list_devices()
        num_cores = len([x for x in devices if ':TPU:' in x.name])
        print(cluster_def)
        # NOTE(review): '%s' is not interpolated here — print receives three
        # separate arguments; likely meant print('ip: %s' % master, num_cores).
        print('ip: %s', master, num_cores)
        # Convenience aliases for interactive use.
        r = sess.run
        from tensorflow.python.tpu import tpu as tpu_ops
        from tensorflow.compiler.tf2xla.python import xla
        from tensorflow.compiler.tf2xla.ops import gen_xla_ops
    else:
        # Script argument given: run it under the patched environment, with
        # sys.argv shifted so the script sees itself as argv[0].
        filename = sys.argv[1]
        sys.argv = sys.argv[1:]
        with open(filename) as f:
            source = f.read()
        code = compile(source, filename, 'exec')
        exec(code, globals(), globals())