-
Notifications
You must be signed in to change notification settings - Fork 3
/
dataset.py
164 lines (130 loc) · 6.31 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from collections import Counter
from collections import defaultdict

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np
def cosine_similarity(v1, v2):
    """
    Return the cosine similarity of two vectors.

    The result is the dot product of the vectors divided by the product of
    their Euclidean norms.
    Adapted from: http://danushka.net/lect/dm/Numpy-basics.html

    :param v1: first vector
    :param v2: second vector
    :return: cosine similarity of ``v1`` and ``v2``
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product
class DataSet:
    """
    Annotation data set with positive and negative token examples.

    The annotation file maps every instance to the tokens that were used to
    describe it. From that mapping the class derives, in both directions
    (instance -> tokens and token -> instances):

    * positives: tokens that describe an instance at least
      ``minimum_token_count`` times
    * negatives: positive tokens of *dissimilar* instances, where two
      instances are dissimilar when the cosine similarity of their doc2vec
      vectors is at or below ``cutoff``
    """

    def __init__(self, annotation_file, minimum_token_count=2, cutoff=0.005):
        """
        Read the annotations and compute the positive and negative examples.

        :param str annotation_file: File path to the annotation file
        :param int minimum_token_count: Minimum number of times a token has to
            describe an instance to count as a positive example
        :param float cutoff: Maximum cosine similarity at which two instances
            are treated as dissimilar
        """
        self._data = None
        self._pos_instances_to_tokens = None
        self._neg_instances_to_tokens = None
        self._pos_tokens_to_instances = None
        self._neg_tokens_to_instances = None

        # Order matters: negatives are derived from the positives
        self._read_annotations(annotation_file)
        self._extract_and_set_positives(minimum_token_count)
        self._extract_and_set_negatives(cutoff)

    @property
    def data(self):
        """Dict mapping each instance to the list of tokens describing it."""
        return self._data

    @property
    def pos_instances_to_tokens(self):
        """Dict mapping each instance to its positive tokens."""
        return self._pos_instances_to_tokens

    @property
    def pos_tokens_to_instances(self):
        """Dict mapping each token to the instances it is positive for."""
        return self._pos_tokens_to_instances

    @property
    def neg_instances_to_tokens(self):
        """Dict mapping each instance to its negative tokens."""
        return self._neg_instances_to_tokens

    @property
    def neg_tokens_to_instances(self):
        """Dict mapping each token to the instances it is negative for."""
        return self._neg_tokens_to_instances

    @property
    def tokens(self):
        """List of the unique tokens over all instances (arbitrary order)."""
        unique_tokens = set()
        for instance_tokens in self.data.values():
            unique_tokens.update(instance_tokens)
        return list(unique_tokens)

    @staticmethod
    def _invert_mapping(keys_to_values):
        """Invert a ``key -> [values]`` dict into a ``value -> [keys]`` dict."""
        values_to_keys = defaultdict(list)
        for key, values in keys_to_values.items():
            for value in values:
                values_to_keys[value].append(key)
        return dict(values_to_keys)

    def _read_annotations(self, annotation_file):
        """
        Parse the annotation file into a python dict

        {
            instance1: [tokens1],
            instance2: [tokens2],
            ...
        }

        Each non-blank line has the form ``instance,token token token``.
        Blank lines are skipped, and only the first comma separates the
        instance from its description, so a comma inside the description
        stays part of the tokens.

        "default" annotation_file
        conf_files/UW_english/UW_AMT_description_documents_per_image_nopreproc_stop_raw.conf

        :param str annotation_file: File path to amazon mechanical turk annotation file
        :raises ValueError: if a non-blank line contains no comma
        """
        data = {}
        with open(annotation_file, 'r') as annotations:
            for line_number, line in enumerate(annotations, start=1):
                line = line.strip()
                if not line:
                    continue
                instance, separator, tokens = line.partition(',')
                if not separator:
                    raise ValueError(
                        "Malformed annotation on line {}: expected "
                        "'instance,tokens' but got {!r}".format(line_number, line)
                    )
                data[instance] = tokens.split(' ')
        self._data = data

    def _extract_and_set_positives(self, minimum_token_count):
        """
        Finds positive token examples of the instances in the annotation set.

        A positive example is a token that has described an instance at least
        a minimum number of times. Instances without any positive token are
        left out of the resulting mapping.

        Sets the relevant instance variables

        PAT TODO: In Nisha's paper this was done with tf-idf which this is no longer doing
        should it go back to tf-idf?

        :param int minimum_token_count: This controls the minimum number of times
            a token has to appear in descriptions for an instance before the instance
            is deemed to be a positive example of this token
        """
        instances_to_positive = {}
        for instance, tokens in self.data.items():
            # Count each instance's own tokens once instead of scanning the
            # whole vocabulary for every instance
            token_counts = Counter(tokens)
            positives = [
                token for token, count in token_counts.items()
                if count >= minimum_token_count
            ]
            if positives:
                instances_to_positive[instance] = positives
        self._pos_instances_to_tokens = instances_to_positive

        # Invert the dict to map tokens -> instances
        self._pos_tokens_to_instances = self._invert_mapping(instances_to_positive)

    def _extract_and_set_negatives(self, cutoff):
        """
        Finds negative token examples of the instances in the annotation set.

        Cosine similarity between doc2vec vectors tells how similar two
        instances are to each other, so it can be used to find instances that
        are dissimilar. The positive tokens of those dissimilar instances,
        minus the positive tokens of the instance itself, become the negative
        tokens of the instance.

        Only instances that have at least one positive token are considered.

        Sets the relevant instance variables

        :param float cutoff: Maximum cosine similarity at which two instances
            are treated as dissimilar
        """
        tagged_documents = [
            TaggedDocument(tokens, [instance])
            for instance, tokens in self.data.items()
        ]

        # Train doc2vec model for computing similarities between instances
        # negative set to 0 so no negative sampling will be used while training document model
        # NOTE(review): with negative=0 and gensim's default hs=0 there may be no
        # training objective left, which would leave the document vectors at
        # their initial values - confirm against the gensim documentation.
        model = Doc2Vec(min_count=2, negative=0, workers=8)
        print('building vocab...')
        model.build_vocab(tagged_documents)
        print('training model...')
        model.train(tagged_documents, total_examples=model.corpus_count, epochs=10)
        print('done training model')

        # gensim 4 renamed `docvecs` to `dv`; fall back for older releases
        doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

        positives = self.pos_instances_to_tokens
        instances_to_negative = {}
        for instance1, own_tokens in positives.items():
            docvec1 = doc_vectors[instance1]
            own_positive_tokens = set(own_tokens)
            negative_tokens = set()
            for instance2, other_tokens in positives.items():
                if instance2 == instance1:
                    # An instance's own positives are filtered out below, so
                    # comparing it with itself can never add a token
                    continue
                if cosine_similarity(docvec1, doc_vectors[instance2]) <= cutoff:
                    # Skip tokens that are already positives of the instance.
                    # This is to avoid things like water_bottle and shampoo being
                    # negatives of each other, but bottle still shows up in the
                    # negative tokens of water_bottle because bottle shows up
                    # sometimes in the shampoo examples.
                    negative_tokens.update(
                        token for token in other_tokens
                        if token not in own_positive_tokens
                    )
            instances_to_negative[instance1] = list(negative_tokens)
        self._neg_instances_to_tokens = instances_to_negative

        # Invert the dict to map tokens -> instances
        self._neg_tokens_to_instances = self._invert_mapping(instances_to_negative)