forked from ybz79/semits
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
65 lines (54 loc) · 1.77 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import codecs
import random
import re
import os
def select_by_index(path, index):
    """Copy selected lines of a text file into ``path + 'sample'``.

    Each selected line has its surrounding whitespace stripped and is written
    followed by a single newline. Lines are emitted in the order the positions
    appear in *index*, so positions may repeat.

    Args:
        path: Path of the input text file. The output file name is the same
            path with the suffix ``sample`` appended; it is overwritten if it
            already exists.
        index: Iterable of integer positions into the list of input lines
            (0-based; negative values count from the end, as for any list).

    Raises:
        IndexError: If a position falls outside the lines read from *path*.
    """
    # Context managers close both handles (and flush the output) even when a
    # bad position raises part-way through the copy.
    with codecs.open(path) as source:
        lines = source.readlines()
    with codecs.open(path + 'sample', mode='w') as target:
        for position in index:
            target.write(lines[position].strip() + '\n')
def double(matched):
    """Return the match's ``value`` group with each " , " collapsed to ",".

    Intended as a replacement callback for ``re.sub``: *matched* must be a
    match object whose pattern defines a named group ``value``.
    """
    spaced_text = matched.group('value')
    comma_gap = re.compile(" , ")
    return comma_gap.sub(",", spaced_text)
def double2(matched):
    """Return the match's ``value`` group with each " . " collapsed to ".".

    Intended as a replacement callback for ``re.sub``: *matched* must be a
    match object whose pattern defines a named group ``value``.
    """
    value = matched.group('value')
    # The dot is escaped so that only a literal period between two spaces is
    # joined; an unescaped "." would match any character between spaces.
    return re.sub(r" \. ", ".", value)
# Ordered (compiled pattern, replacement) rules applied by clean().
# Order matters: " - - " must become " -- " before the generic " - " rule
# runs, otherwise the double dash would be glued to its neighbours.
# Periods are escaped so they match a literal "." only, and raw strings keep
# "\d" / "\." from being treated as (invalid) string escape sequences.
_CLEAN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"' '", "''"),
        (r"` `", "''"),
        (r" - - ", " -- "),
        (r" ' s ", " 's "),
        (r" ' ve ", " 've "),
        (r" ' d ", " 'd "),
        (r" ' ll ", " 'll "),
        (r" - ", "-"),
        (r" n ' t ", " n't "),
        (r" ' m ", " 'm "),
        (r" u \. s \. ", " u.s. "),
        (r" h \. w \. ", " h.w. "),
        (r" st \. ", " st. "),
        (r" mrs \. ", " mrs. "),
        (r"< sep >", "<sep>"),
        # Re-join numbers split around a comma or a decimal point.
        (r"(\d+) , (\d+)", r"\1,\2"),
        (r"(\d+) \. (\d+)", r"\1.\2"),
    )
)


def clean(sent):
    """Re-join tokens that were split apart by spaces in *sent*.

    Applies a fixed, ordered list of substitutions: paired quotes, double
    dashes, the clitics ``'s 've 'd 'll n't 'm``, hyphens, the abbreviations
    ``u.s.``, ``h.w.``, ``st.`` and ``mrs.``, the ``<sep>`` marker, and digit
    groups separated by `` , `` or `` . ``.

    Most rules require a space on both sides of the token sequence, so an
    occurrence at the very start or end of the string is left unchanged.

    Args:
        sent: The sentence text to normalise.

    Returns:
        The sentence with all rules applied in order.
    """
    for pattern, replacement in _CLEAN_RULES:
        sent = pattern.sub(replacement, sent)
    return sent
def random_chose(paths):
    """Write a cleaned copy of every existing file listed in *paths*.

    For each path that exists, every line is stripped of surrounding
    whitespace, passed through ``clean`` and written, followed by a newline,
    to ``path + 'smooth'`` (overwriting any existing file). Paths that do not
    exist are skipped silently.

    Despite its name, this function performs no random selection.

    Args:
        paths: Iterable of input file paths.
    """
    for path in paths:
        if not os.path.exists(path):
            continue
        # Context managers close both handles (and flush the output) even if
        # cleaning or writing a line raises.
        with codecs.open(path) as source:
            with codecs.open(path + 'smooth', mode='w') as target:
                for line in source:
                    target.write(clean(line.strip()) + '\n')
if __name__ == '__main__':
    # Script entry point: process the single hard-coded wiki result file.
    random_chose(['./final_result/wiki/untswikismooth'])