-
Notifications
You must be signed in to change notification settings - Fork 0
/
stemmer.py
171 lines (135 loc) · 6.86 KB
/
stemmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
import DBModel
import util
import nltk
import os
import re
from config import config as cfg
# Current working directory, captured once when this module is imported.
# NOTE(review): no function in this file reads `cwd` -- confirm whether
# another module imports it before removing.
cwd = os.getcwd()
def stem_system(system_name):
    """Stem the descriptions of a system's rows and store the result.

    Each selected row's description is passed through
    ``util.strip_autogen_info``, split into sentences and tokens, cleaned
    (digits and occurrences of ``system_name`` removed, tokens not starting
    with an ASCII letter dropped, remaining tokens split by
    ``regex_match_reduce``) and POS-tagged.  Only tokens whose tag contains
    ``'V'`` or ``'NN'`` are kept; those are lemmatized, normalised through
    ``reduce_according_to_phrase`` / ``get_most_likely_synonym``, Porter
    stemmed and mapped through ``find_system_synonym_from_stem``.

    Rows that yield four stems or fewer are skipped.  When
    ``cfg.clustering_mode == 'label'`` the rows come from
    ``cfg.labelling_dataset`` instead of ``system_name``, rows whose status
    is 'New', 'Expired' or 'In Progress' are ignored, at most 1000 rows are
    collected and extra columns are copied to the output.

    Args:
        system_name: name of the system; used to select the rows (outside
            label mode), stripped from every token, and used as the key
            under which the stemmed rows are written.

    Returns:
        None.  The collected rows are handed to
        ``DBModel.Stemmed_Keyword...overwrite_system_rows``.
    """
    list_of_stem_dicts = []
    banned_words = []
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("[stemmer] Stemming {}.".format(system_name))
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    # Shared across all rows so synonyms resolve to the first form seen.
    vocab = set()
    # Load the stopword list once; the original re-read it for every token.
    english_stopwords = frozenset(stopwords.words('english'))
    label_mode = cfg.clustering_mode == 'label'

    if label_mode:
        candidates = [row for row in DBModel.Terse_PreProcessed_Keyword.get_db_ref_by_system(cfg.labelling_dataset).random(cfg.labelling_dataset)]
        skipped_statuses = ('New', 'Expired', 'In Progress')
        selection = [row for row in candidates
                     if row.status not in skipped_statuses]
    else:
        selection = DBModel.Terse_PreProcessed_Keyword.get_db_ref_by_system(system_name).select()

    for row in selection:
        # Label mode is capped at 1000 stemmed rows.
        if label_mode and len(list_of_stem_dicts) == 1000:
            break
        stripped_description = util.strip_autogen_info(row.description)
        stems = []
        # Tokenize into sentences first for better POS tagging
        for sentence in nltk.sent_tokenize(stripped_description):
            regex_proc_words = []
            # Handle special cases of words with symbols and capitals,
            # while preserving capitalization and order.
            for word in nltk.word_tokenize(sentence):
                word = re.sub(r'\d+', '', word)
                word = word.replace(system_name, '')
                if not re.match('[A-Za-z]', word):
                    continue
                split_word_list = regex_match_reduce(word)
                if split_word_list:
                    regex_proc_words.extend(split_word_list)
                else:
                    regex_proc_words.append(word)

            # Keeps nouns and verbs
            for word, tag in nltk.pos_tag(regex_proc_words):
                # NOTE(review): this test runs before lowercasing, so a
                # capitalised token only matches a stopword entry with the
                # same capitalisation -- confirm that is intended.
                if word in english_stopwords:
                    continue
                if len(word) < 3:
                    continue
                if '\'' in word:
                    continue
                # The verb test comes first, exactly as in the original
                # if/elif chain; any other tag is discarded.
                if 'V' in tag:
                    pos = 'v'
                elif 'NN' in tag:
                    pos = 'n'
                else:
                    continue
                word = word.lower()
                word = wordnet_lemmatizer.lemmatize(word, pos=pos)
                word = reduce_according_to_phrase(word)
                word = get_most_likely_synonym(word, vocab, pos)
                stem = porter_stemmer.stem(word)
                stem = find_system_synonym_from_stem(stem)
                if stem not in banned_words and len(stem) > 2:
                    stems.append(stem)

        # Descriptions with four stems or fewer are not stored.
        if len(stems) <= 4:
            continue

        stem_dict = {'system': system_name,
                     'description': u' '.join(stems).encode('utf-8'),
                     'classification': row.classification}
        if label_mode:
            stem_dict['title'] = row.title
            stem_dict['status'] = row.status
            stem_dict['issue_number'] = row.issue_number
            stem_dict['target'] = row.target
        list_of_stem_dicts.append(stem_dict)

    DBModel.Stemmed_Keyword.get_db_ref_by_system(system_name).overwrite_system_rows(system_name, list_of_stem_dicts)
    print("[stemmer] : Stemmed " + system_name + ".")
# Zero or more capitals followed by one or more lowercase letters,
# e.g. "HBaseClient" -> "HBase", "Client".  Compiled once at import time.
_WORD_PIECE_RE = re.compile(r'[A-Z]*[a-z]+')


def regex_match_reduce(word):
    """Split a token into its ``[A-Z]*[a-z]+`` pieces.

    Args:
        word: the token to split.

    Returns:
        A list of the non-overlapping matches in order of appearance, or an
        empty list when the token contains no lowercase letter (for example
        an all-caps acronym), in which case the caller keeps the token as is.
    """
    # findall already yields [] when nothing matches, so one scan suffices.
    return _WORD_PIECE_RE.findall(word)
# Stem -> canonical term.  Built once at import time instead of on every call.
_SYSTEM_SYNONYMS = {
    'oom': 'outofmemory', 'tdd': 'testdrivendevelopment',
    'hdf': 'hdfs', 'except': 'exception', 'flum': 'flume', 'lifecyc': 'lifecycle',
    'interfac': 'interface', 'improv': 'improve', 'stabl': 'stable', 'cach': 'cache', 'nod': 'node',
    'npe': 'exception', 'mapred': 'mapreduce', 'zoo': 'zookeeper', 'hfile': 'file',
    'cfg': 'config', 'mapr': 'mapreduce', 'synchron': 'sync', 'err': 'error', 'holder': 'hold',
    'agre': 'agree', 'algo': 'algorithm', 'amoun': 'amount', 'analyz': 'analysi', 'assig': 'assign',
    'branc': 'branch', 'broken': 'break', 'buf': 'buffer', 'callabl': 'call', 'additiv': 'add',
    'dir': 'directori', 'increas': 'increment', 'indic': 'index', 'integr': 'integrate', 'int': 'integ',
    'lose': 'loss', 'mini': 'min', 'partit': 'partition', 'multi': 'multipl', 'regress': 'regression',
    'replic': 'replica', 'secondari': 'second', 'startup': 'start', 'statu': 'state', 'storag': 'store',
    'transact': 'transfer', 'translat': 'transfer', 'verifi': 'verif', 'shut': 'shutdown', 'mem': 'memori',
}


def find_system_synonym_from_stem(s1):
    """Map a stem to its canonical replacement, if one is defined.

    Args:
        s1: the stem to look up (exact, case-sensitive match).

    Returns:
        The replacement listed in the synonym table, or ``s1`` unchanged
        when the table has no entry for it.
    """
    # Single hashed lookup; the original scanned .keys() and then indexed.
    return _SYSTEM_SYNONYMS.get(s1, s1)
def reduce_according_to_phrase(w1):
    """Collapse a word onto the first known phrase it contains.

    The phrases are tried in the order listed below and matched as plain
    substrings, so the earliest listed phrase wins ('filesystem' is tried
    before 'system').  Because the match is a substring test, unrelated
    words can collapse too, e.g. 'little' contains 'ttl'.

    Args:
        w1: the word to reduce.

    Returns:
        The first phrase found inside ``w1``, or ``w1`` itself when none of
        the phrases occurs in it.
    """
    known_phrases = (
        'outofmemory', 'zip', 'lifecyc', 'stabl', 'batch',
        'properti', 'recov', 'mapred', 'yarn', 'runtim',
        'heartbeat', 'stress', 'hadoop', 'buffer', 'agre',
        'cassandra', 'filesystem', 'ttl', 'drop', 'wait', 'visit', 'accumul', 'admin',
        'assign', 'amoun', 'directori', 'system', 'job', 'launch', 'sink', 'block',
    )
    return next((candidate for candidate in known_phrases if candidate in w1), w1)
def get_most_likely_synonym(w1, vocabulary, pos):
    """Replace a word by an already-seen WordNet synonym when one exists.

    Args:
        w1: the word to normalise.
        vocabulary: set of words seen so far; ``w1`` is added to it when no
            replacement is found.
        pos: part-of-speech code compared against the second dot-separated
            field of each synset name (the caller passes 'v' or 'n').

    Returns:
        ``w1`` if it is already in ``vocabulary``; otherwise the head word
        of the first matching-``pos`` synset of ``w1`` that is in
        ``vocabulary``; otherwise ``w1`` (after caching it).
    """
    if w1 in vocabulary:
        return w1

    for synset in wn.synsets(w1):
        # Synset names are split on '.' into head word, POS code, ...
        name_fields = synset.name().split('.')
        if name_fields[1] == pos and name_fields[0] in vocabulary:
            return name_fields[0]

    # Cache word into vocabulary
    vocabulary.add(w1)
    return w1