# split_docs_by_co_occur.py
import json
import random
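
# Splits holder-target annotation examples into those where the holder and target
# co-occur in at least one sentence and those where they never do. Co-occurrence
# counts are recomputed against the Stanford sentence boundaries, each pair gets a
# document-level classification from the sentence-level labels, and the two groups
# are written to separate directories before being shuffled back into the feature files.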
# Input / output directories
confirm_filepath = "./data/stanford_label_sent_boundary/"  # sentence boundaries and sentence-level labels
read_filepath = "./data/new_annot/feature/"  # annotated feature files (read here, rewritten at the end)
write_filepath_yes = "./data/has_co_occurs/"  # examples whose holder and target share a sentence
write_filepath_no = "./data/no_co_occurs/"  # examples whose holder and target never share a sentence

# Per-split file names: feature files to read, boundary files to confirm against, files to write
read_files = ["new_train.json", "acl_dev_eval_new.json", "acl_dev_tune_new.json", "acl_test_new.json", "mpqa_new.json"]
confirm_files = ["train_all.json", "acl_dev_eval.json", "acl_dev_tune.json", "acl_test.json", "acl_mpqa_eval.json"]
write_files = ["new_train.json", "acl_dev_eval_new.json", "acl_dev_tune_new.json", "acl_test_new.json", "mpqa_new.json"]

# Combined training sets handled in the second pass
read_combine_files = ["E_train.json", "F_train.json", "G_train.json", "H_train.json"]
combine_files = ["new_train.json", "acl_dev_tune_new.json"]
write_combine_files = ["E_train.json", "F_train.json", "G_train.json", "H_train.json"]

def make_key(holder_inds, target_inds):
    key = ""
    key += str(holder_inds) + "; "
    key += str(target_inds)
    return key

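# First pass: for each data split, attach sentence boundaries and sentence-level
# labels, recount in how many sentences the holder and target co-occur, derive a
# document-level classification for the pair, and write co-occurring and
# non-co-occurring examples to separate files.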
for i in range(len(read_files)):
    read_file = read_files[i]
    confirm_file = confirm_files[i]
    write_file = write_files[i]
    print(read_file)

    # Getting sentence boundaries
    doc_to_sentbounds = {}
    doc_to_sentseq = {}
    with open(confirm_filepath + confirm_file, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            if annot["docid"] not in doc_to_sentbounds:
                doc_to_sentbounds[annot["docid"]] = annot["sent_boundaries"]
                doc_to_sentseq[annot["docid"]] = annot["sent_seq"]

    all_data = []
    doc_to_ht_to_co_occurs = {}
    doc_to_names_to_co_occurs = {}
    # Getting co-occurring & non-co-occurring pairs
    # Adding sentence boundaries
    # Adding sentence classifications
    co_occur_exs = []
    no_co_occur_exs = []
    with open(read_filepath + read_file, "r", encoding="latin1") as rf:
        for line in rf:
            annot = json.loads(line)
            # Getting co-occurrences
            annot["co_occurrences"] = int(annot["co_occurrences"])
            if annot["docid"] not in doc_to_ht_to_co_occurs:
                doc_to_ht_to_co_occurs[annot["docid"]] = {}
                doc_to_names_to_co_occurs[annot["docid"]] = {}
            ht_key = make_key(annot["holder_index"], annot["target_index"])
            ht_name_key = str(annot["holder_index"]) + "; " + str(annot["target_index"])
            doc_to_ht_to_co_occurs[annot["docid"]][ht_key] = annot["co_occurrences"]
            doc_to_names_to_co_occurs[annot["docid"]][ht_name_key] = annot["co_occurrences"]
            # Adding sentence boundaries
            annot["sent_boundaries"] = doc_to_sentbounds[annot["docid"]]
            # Adding sentence classifications
            annot["sent_seq"] = doc_to_sentseq[annot["docid"]]
            all_data.append(annot)

    inconsistencies = 0
    num_not_found = 0
    # Confirming co-occurrence of pairs--many inconsistencies... okay then...
    # Iterate through all data and get co-occurrences
    # Also get sentence RNN classification
    for annot in all_data:
        # ht_key = make_key(annot["holder_index"], annot["target_index"])
        # ht_name_key = str(annot["holder_index"]) + "; " + str(annot["target_index"])
        co_occurs_prev = annot["co_occurrences"]

        # Mark which sentences contain a holder mention
        has_holder = [False for i in range(len(annot["sent_boundaries"]))]
        sent_ind = 0
        for holder_bounds in sorted(annot["holder_index"], key=lambda x: x[0]):
            for ind in holder_bounds:
                while sent_ind < len(annot["sent_boundaries"]) - 1 and ind >= annot["sent_boundaries"][sent_ind + 1]:
                    sent_ind += 1
                has_holder[sent_ind] = True

        # Mark which sentences contain a target mention
        has_target = [False for i in range(len(annot["sent_boundaries"]))]
        sent_ind = 0
        for target_bounds in sorted(annot["target_index"], key=lambda x: x[0]):
            for ind in target_bounds:
                while sent_ind < len(annot["sent_boundaries"]) - 1 and ind >= annot["sent_boundaries"][sent_ind + 1]:
                    sent_ind += 1
                has_target[sent_ind] = True

        has_both = [has_holder[i] and has_target[i] for i in range(len(has_holder))]
        count_has_both = sum(has_both)
        if co_occurs_prev != count_has_both:
            inconsistencies += 1
            print(annot["sent_boundaries"])
            print("inconsistent: " + str(count_has_both) + " " + str(has_both) + " " + str(co_occurs_prev))
            print()
        # Update co-occurrences based on new sentence boundaries
        annot["co_occurrences"] = count_has_both

        # Get doc ht classification based on sentence features
        classify = 1
        counts = [0, 0, 0]
        for i in range(len(has_both)):
            if has_both[i]:
                if annot["sent_seq"][i] > 2:
                    counts[2] += 1
                elif annot["sent_seq"][i] == 2:
                    counts[1] += 1
                else:
                    counts[0] += 1
        # At least 1 positive = classify as positive
        if counts[2] >= 1:
            classify = 2
        elif counts[0] >= 1:  # At least 1 negative & no positive = classify as negative
            classify = 0
        else:  # No negative or positive (all neutral or no co-occurs)
            classify = 1
        annot["classify"] = classify

        if annot["co_occurrences"] > 0:
            co_occur_exs.append(annot)
        else:
            no_co_occur_exs.append(annot)

    print(num_not_found)
    print(inconsistencies)
    print()
    print(len(all_data))
    print(len(co_occur_exs))
    print(len(no_co_occur_exs))
    assert len(all_data) == len(co_occur_exs) + len(no_co_occur_exs)

    # Write co-occurring examples to doc
    with open(write_filepath_yes + write_file, "w", encoding="latin1") as wf:
        for annot in co_occur_exs:
            json.dump(annot, wf)
            wf.write("\n")

    # Write non-co-occurring examples to doc
    with open(write_filepath_no + write_file, "w", encoding="latin1") as wf:
        for annot in no_co_occur_exs:
            json.dump(annot, wf)
            wf.write("\n")

# Create maps for combined training datasets from written-to files
doc_to_ht_to_annot = {}
doc_to_sentbounds = {}
doc_to_sentseq = {}
for combine_file in combine_files:
    print(" " + str(combine_file))
    with open(write_filepath_yes + combine_file, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            if annot["docid"] not in doc_to_ht_to_annot:
                doc_to_ht_to_annot[annot["docid"]] = {}
                doc_to_sentbounds[annot["docid"]] = annot["sent_boundaries"]
                doc_to_sentseq[annot["docid"]] = annot["sent_seq"]
            ht_key = make_key(annot["holder_index"], annot["target_index"])
            doc_to_ht_to_annot[annot["docid"]][ht_key] = annot
    with open(write_filepath_no + combine_file, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            if annot["docid"] not in doc_to_ht_to_annot:
                doc_to_ht_to_annot[annot["docid"]] = {}
                doc_to_sentbounds[annot["docid"]] = annot["sent_boundaries"]
                doc_to_sentseq[annot["docid"]] = annot["sent_seq"]
            ht_key = make_key(annot["holder_index"], annot["target_index"])
            doc_to_ht_to_annot[annot["docid"]][ht_key] = annot

print(" train_all.json")
with open(confirm_filepath + "train_all.json", "r", encoding="latin1") as cf:
    for line in cf:
        annot = json.loads(line)
        if annot["docid"] not in doc_to_sentbounds:
            doc_to_sentbounds[annot["docid"]] = annot["sent_boundaries"]
            doc_to_sentseq[annot["docid"]] = annot["sent_seq"]

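# doc_to_ht_to_annot now maps docid -> "holder_index; target_index" key -> the annotation
# written in the first pass, so the combined E/F/G/H files can be checked for consistency
# against what was already written for new_train / acl_dev_tune.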
# Get the combined training datasets
for i in range(len(read_combine_files)):
    read_file = read_combine_files[i]
    write_file = write_combine_files[i]
    print(read_file)

    all_data = []
    with open(read_filepath + read_file, "r", encoding="latin1") as rf:
        for line in rf:
            annot = json.loads(line)
            annot["sent_boundaries"] = doc_to_sentbounds[annot["docid"]]
            annot["sent_seq"] = doc_to_sentseq[annot["docid"]]
            all_data.append(annot)

    write_co_occur = []
    write_no_co_occur = []
    inconsistencies = 0
    num_not_found = 0
    # Confirming co-occurrence of pairs--many inconsistencies... okay then...
    # Iterate through all data and get co-occurrences
    # Also get sentence RNN classification
    for annot in all_data:
        # ht_key = make_key(annot["holder_index"], annot["target_index"])
        # ht_name_key = str(annot["holder_index"]) + "; " + str(annot["target_index"])
        co_occurs_prev = annot["co_occurrences"]

        # Mark which sentences contain a holder mention
        has_holder = [False for i in range(len(annot["sent_boundaries"]))]
        sent_ind = 0
        for holder_bounds in sorted(annot["holder_index"], key=lambda x: x[0]):
            for ind in holder_bounds:
                while sent_ind < len(annot["sent_boundaries"]) - 1 and ind >= annot["sent_boundaries"][sent_ind + 1]:
                    sent_ind += 1
                has_holder[sent_ind] = True

        # Mark which sentences contain a target mention
        has_target = [False for i in range(len(annot["sent_boundaries"]))]
        sent_ind = 0
        for target_bounds in sorted(annot["target_index"], key=lambda x: x[0]):
            for ind in target_bounds:
                while sent_ind < len(annot["sent_boundaries"]) - 1 and ind >= annot["sent_boundaries"][sent_ind + 1]:
                    sent_ind += 1
                has_target[sent_ind] = True

        has_both = [has_holder[i] and has_target[i] for i in range(len(has_holder))]
        count_has_both = sum(has_both)
        if int(co_occurs_prev) != int(count_has_both):
            inconsistencies += 1
            print(annot["holder_index"])
            print(annot["target_index"])
            print(annot["sent_seq"])
            print(annot["sent_boundaries"])
            print("inconsistent: " + str(count_has_both) + " " + str(has_both) + " " + str(co_occurs_prev))
            print()
        # Update co-occurrences based on new sentence boundaries
        annot["co_occurrences"] = count_has_both

        # Get doc ht classification based on sentence features
        classify = 1
        counts = [0, 0, 0]
        for i in range(len(has_both)):
            if has_both[i]:
                if annot["sent_seq"][i] > 2:
                    counts[2] += 1
                elif annot["sent_seq"][i] == 2:
                    counts[1] += 1
                else:
                    counts[0] += 1
        # At least 1 positive = classify as positive
        if counts[2] >= 1:
            classify = 2
        elif counts[0] >= 1:  # At least 1 negative & no positive = classify as negative
            classify = 0
        else:  # No negative or positive (all neutral or no co-occurs)
            classify = 1
        annot["classify"] = classify

        if annot["co_occurrences"] > 0:
            write_co_occur.append(annot)
        else:
            write_no_co_occur.append(annot)

        # Check if existed previously, and check against the existent label if it does
        ht_key = make_key(annot["holder_index"], annot["target_index"])
        if annot["docid"] in doc_to_ht_to_annot and ht_key in doc_to_ht_to_annot[annot["docid"]]:
            assert doc_to_ht_to_annot[annot["docid"]][ht_key].keys() == annot.keys()
            assert doc_to_ht_to_annot[annot["docid"]][ht_key]["sent_boundaries"] == annot["sent_boundaries"]
            assert doc_to_ht_to_annot[annot["docid"]][ht_key]["sent_seq"] == annot["sent_seq"]
            assert annot["co_occurrences"] == doc_to_ht_to_annot[annot["docid"]][ht_key]["co_occurrences"]
            # assert annot["classify"] == doc_to_ht_to_annot[annot["docid"]][ht_key]["classify"]
            print(annot["label"])
            print(doc_to_ht_to_annot[annot["docid"]][ht_key]["label"])
            if annot["label"] != doc_to_ht_to_annot[annot["docid"]][ht_key]["label"]:
                if annot["label"] == 1:
                    annot["label"] = doc_to_ht_to_annot[annot["docid"]][ht_key]["label"]
            annot["token"] = doc_to_ht_to_annot[annot["docid"]][ht_key]["token"]  # apparently these tokens are right...
            annot["holder_rank"] = doc_to_ht_to_annot[annot["docid"]][ht_key]["holder_rank"]
            annot["target_rank"] = doc_to_ht_to_annot[annot["docid"]][ht_key]["target_rank"]
            annot["holder"] = doc_to_ht_to_annot[annot["docid"]][ht_key]["holder"]
            annot["target"] = doc_to_ht_to_annot[annot["docid"]][ht_key]["target"]
            assert annot["holder_target"] == doc_to_ht_to_annot[annot["docid"]][ht_key]["holder_target"]
            assert annot["polarity"] == doc_to_ht_to_annot[annot["docid"]][ht_key]["polarity"]
            # print(str(annot["docid"]) + " " + str(ht_key))
            assert doc_to_ht_to_annot[annot["docid"]][ht_key] == annot

    print(len(all_data))
    print(len(write_co_occur))
    print(len(write_no_co_occur))
    assert len(all_data) == len(write_co_occur) + len(write_no_co_occur)

    print("Writing...")
    # Write co-occurring examples to doc
    with open(write_filepath_yes + write_file, "w", encoding="latin1") as wf:
        for annot in write_co_occur:
            json.dump(annot, wf)
            wf.write("\n")

    # Write non-co-occurring examples to doc
    with open(write_filepath_no + write_file, "w", encoding="latin1") as wf:
        for annot in write_no_co_occur:
            json.dump(annot, wf)
            wf.write("\n")

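# Finally, merge the co-occurring and non-co-occurring splits back together, shuffle
# them, and overwrite the original feature files so they carry the updated
# co-occurrence counts, sentence boundaries, and classifications.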
for filename in read_files:
    print(filename)
    all_data = []
    with open(write_filepath_yes + filename, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            all_data.append(annot)
    with open(write_filepath_no + filename, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            all_data.append(annot)
    random.shuffle(all_data)
    with open(read_filepath + filename, "w", encoding="latin1") as wf:
        for annot in all_data:
            json.dump(annot, wf)
            wf.write("\n")

for filename in read_combine_files:
    print(filename)
    all_data = []
    with open(write_filepath_yes + filename, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            all_data.append(annot)
    with open(write_filepath_no + filename, "r", encoding="latin1") as cf:
        for line in cf:
            annot = json.loads(line)
            all_data.append(annot)
    random.shuffle(all_data)
    with open(read_filepath + filename, "w", encoding="latin1") as wf:
        for annot in all_data:
            json.dump(annot, wf)
            wf.write("\n")