lstm_rnn.py
import numpy as np
import sys
import os
from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
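# Binary classifier over (head, tail) text pairs: both halves are embedded
# with pre-trained GloVe vectors, each encoded by an LSTM, and the two
# encodings are concatenated and fed through dense layers to a sigmoid output.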
head_shape = 5       # padded length of the "head" sequences
tail_shape = 10      # padded length of the "tail" sequences
top_words = 100000   # vocabulary cap for the tokenizer
EMBEDDING_DIM = 100
GLOVE_DIR = '../../glove6B'  # directory holding glove.6B.100d.txt
def createEmbeddingMatrix(file_path):
    # Load the pre-trained GloVe vectors into a word -> vector lookup.
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    # Parse the input file: each '||'-separated line yields a head text,
    # a tail text and, in the fourth field, an integer label.
    text = []
    head = []
    tail = []
    labels = []
    with open(file_path, 'r') as f:
        for l in f:
            t = l.split('||')
            text.append(l)
            head.append(t[0].strip())
            tail.append(t[1].strip())
            labels.append(int(t[3].strip()))
    # Fit the tokenizer on the full text, then index and pad each half.
    tokenizer = Tokenizer(num_words=top_words)
    tokenizer.fit_on_texts(text)
    sequences_head = tokenizer.texts_to_sequences(head)
    sequences_tail = tokenizer.texts_to_sequences(tail)
    head_data = pad_sequences(sequences_head, maxlen=head_shape)
    tail_data = pad_sequences(sequences_tail, maxlen=tail_shape)
    word_index = tokenizer.word_index
    print("Total words found:", len(word_index))
    # Row i of the embedding matrix holds the GloVe vector for word index i;
    # rows for words missing from GloVe stay all-zero.
    num_words = min(top_words, len(word_index))
    embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= top_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return tokenizer, embedding_matrix, head_data, tail_data, labels, word_index
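# Inferred input format (the parsing above uses fields 0, 1 and 3; the third
# field is ignored). Example line, with purely hypothetical data:
#
#   some head text || some tail text || unused-field || 1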
def model_LSTM_Classifier(head_data, tail_data, labels, embedding_matrix):
    main_input_head = Input(shape=(head_shape,), dtype='int32')
    main_input_tail = Input(shape=(tail_shape,), dtype='int32')
    # One frozen embedding layer, initialised from the GloVe matrix, is shared
    # by both inputs; input_length is left unset so the same layer can handle
    # the differing head and tail sequence lengths.
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=False)
    x_head = embedding_layer(main_input_head)
    x_tail = embedding_layer(main_input_tail)
    # Encode each half with its own LSTM, then merge the two encodings.
    lstm_head = LSTM(100)(x_head)
    lstm_tail = LSTM(100)(x_tail)
    combined_input = concatenate([lstm_head, lstm_tail])
    combined_input = Dense(50, activation='relu')(combined_input)
    main_output = Dense(1, activation='sigmoid')(combined_input)
    model = Model(inputs=[main_input_head, main_input_tail], outputs=[main_output])
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', loss_weights=[1.])
    model.fit([head_data, tail_data], [labels],
              epochs=50, batch_size=32, verbose=1, validation_split=0.2,
              shuffle=True)
    return model
if __name__ == "__main__":
    file_path = sys.argv[1]
    tokenizer, embedding_matrix, head_data, tail_data, labels, word_index = createEmbeddingMatrix(file_path)
    labels = np.asarray(labels)  # Keras expects array-like targets
    model = model_LSTM_Classifier(head_data, tail_data, labels, embedding_matrix)
    predictions = model.predict([head_data, tail_data])
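# Usage sketch (the file name below is a placeholder, not from the source):
#
#   python lstm_rnn.py pairs.txt
#
# fit() above already trains with a 20% validation split; predict() returns
# one sigmoid probability in [0, 1] per input pair.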