-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate_text.py
220 lines (196 loc) · 12 KB
/
annotate_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import csv
import re
import time
from collections import Counter
from math import ceil
from statistics import mean
from timeit import default_timer as timer
from util_funcs import pickle_obj, unpickle_obj, get_most_recent_file
def get_gold_names(path: str) -> list[str]:
    """Read the CSV file at the specified path and return a list containing the name from each cell as an element.

    The cells of all rows are flattened into one list, in reading order. The file is decoded as UTF-8 explicitly so
    that the result does not depend on the platform's default encoding."""
    with open(path, newline='', encoding='utf-8') as csv_file:
        return [name for row in csv.reader(csv_file) for name in row]
def get_shahnameh_text() -> str:
    """Read the file containing the Shahnameh text and return the content as a string.

    The path 'data/shahnameh_raw.txt' is relative to the current working directory. The file is decoded as UTF-8
    explicitly so that the result does not depend on the platform's default encoding."""
    # Shahnameh text from www.ganjoor.org
    with open('data/shahnameh_raw.txt', 'r', encoding='utf-8') as shahnameh_file:
        return shahnameh_file.read()
def get_verses() -> list[str]:
    """Return a list containing all verses from the Shahnameh text – excluding titles and tables of contents –
    pickle the list object, and save the verses to a text file.

    Both output files carry the same timestamp in their names; the text file is written as UTF-8."""
    print(f'\n{time.strftime("%H:%M:%S")}: Extracting verses...')
    # remove titles: a single line that is preceded and followed by three newlines
    text_wo_titles = re.sub(r'\n\n\n.+\n\n\n', '', get_shahnameh_text())
    # exclude other non-verse lines: "بخش <number>" headings, lines containing a colon, and empty chunks
    verses = [line for line in text_wo_titles.split('\n\n')
              if not re.search(r'بخش \d+', line) and ":" not in line and line]
    print(f'{time.strftime("%H:%M:%S")}: Extracted {len(verses)} verses.\n')
    # compute the timestamp once so that the pickle and the text file always get matching names
    timestamp = time.strftime("%d%m%Y_%H%M")
    pickle_obj(verses, f'pickles/all_verses_{timestamp}.pickle')
    with open(f'data/all_verses_{timestamp}.txt', 'w', encoding='utf-8') as verses_txt_file:
        verses_txt_file.write('\n'.join(verses))
    return verses
def get_couplets() -> list[str]:
    """
    Assemble the couplets from the verses, return them in a list, pickle the list object,
    and save the couplets to a text file.

    The verses are loaded from the most recent 'all_verses' pickle. Each couplet is the verse at an even position
    joined by a single space with the verse that follows it. If the number of verses is odd, the last verse has no
    partner and is left out."""
    print(f'\n{time.strftime("%H:%M:%S")}: Assembling couplets from verses...')
    verses = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='all_verses'))
    # Pair by position rather than via verses.index(): index() returns the FIRST occurrence of a verse, so any
    # verse that appears more than once in the text would be paired with the wrong neighbour (or skipped).
    couplets = [f'{first} {second}' for first, second in zip(verses[::2], verses[1::2])]
    print(f'{time.strftime("%H:%M:%S")}: Assembled {len(couplets)} couplets.\n')
    # compute the timestamp once so that the pickle and the text file always get matching names
    timestamp = time.strftime("%d%m%Y_%H%M")
    pickle_obj(couplets, f'pickles/all_couplets_{timestamp}.pickle')
    with open(f'data/all_couplets_{timestamp}.txt', 'w', encoding='utf-8') as couplets_txt_file:
        couplets_txt_file.write('\n'.join(couplets))
    return couplets
def get_gold_couplets() -> list[str]:
    """
    Return a list of all couplets that contain a name from one of the gold standard lists
    and pickle the list object.

    Duplicate couplets are removed; the remaining couplets keep the order in which they first appear in the
    'all_couplets' pickle, so repeated runs produce the same output. The couplets are also written to a UTF-8
    text file."""
    print(f'\n{time.strftime("%H:%M:%S")}: Extracting gold couplets...')
    location_names = get_gold_names(get_most_recent_file(dir_path='data/gold_lists', prefix='locations_gold_list'))
    person_names = get_gold_names(get_most_recent_file(dir_path='data/gold_lists', prefix='persons_gold_list'))
    all_couplets = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='all_couplets'))
    # A name only counts as a match when it is surrounded by spaces, i.e. when it is a whole token.
    # NOTE(review): this means a name that is the very first or very last word of a couplet is not matched
    # – confirm whether that is intended.
    padded_names = [f' {name} ' for name in location_names + person_names]
    # dict.fromkeys() removes duplicates like a set would, but keeps the original order
    gold_couplets = [*dict.fromkeys(couplet for couplet in all_couplets
                                    if any(padded_name in couplet for padded_name in padded_names))]
    print(f'{time.strftime("%H:%M:%S")}: Extracted {len(gold_couplets)} gold couplets.\n')
    # compute the timestamp once so that the pickle and the text file always get matching names
    timestamp = time.strftime("%d%m%Y_%H%M")
    pickle_obj(gold_couplets, f'pickles/gold_couplets_{timestamp}.pickle')
    with open(f'data/gold_couplets_{timestamp}.txt', 'w', encoding='utf-8') as gold_couplets_file:
        gold_couplets_file.write('\n'.join(gold_couplets))
    return gold_couplets
def replace_spaces_in_names() -> tuple[list[str], list[str], list[str]]:
    """
    Replace the spaces in all names with § in preparation for BIO annotation, return a tuple containing the three
    lists with the modified gold standard location names, person names and couplets, and pickle the list objects.

    A multi-word name is only replaced inside a couplet where it occurs surrounded by spaces. Names without an
    inner space are passed through unchanged."""
    print(f'\n{time.strftime("%H:%M:%S")}: Replacing spaces in gold standard names with §...')
    gold_couplets = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='gold_couplets'))
    location_names = get_gold_names(get_most_recent_file(dir_path='data/gold_lists', prefix='locations_gold_list'))
    person_names = get_gold_names(get_most_recent_file(dir_path='data/gold_lists', prefix='persons_gold_list'))
    # map each original name to its §-joined form (identical to the original for single-word names)
    modified_locations = {name: (name.replace(' ', '§') if ' ' in name.strip() else name) for name in location_names}
    modified_persons = {name: (name.replace(' ', '§') if ' ' in name.strip() else name) for name in person_names}
    # Only names that actually received a § need to be substituted in the couplets. Build this lookup once
    # instead of merging and filtering both dictionaries again for every couplet.
    names_with_spaces = {original_name: modified_name
                         for original_name, modified_name in {**modified_locations, **modified_persons}.items()
                         if '§' in modified_name}
    # n counts (couplet, name) pairs in which a replacement took place
    n = 0
    modified_gold_couplets = []
    for gold_couplet in gold_couplets:
        for original_name, modified_name in names_with_spaces.items():
            if f' {original_name} ' in gold_couplet:
                n += 1
                gold_couplet = gold_couplet.replace(f' {original_name} ', f' {modified_name} ')
        modified_gold_couplets.append(gold_couplet)
    print(f'{time.strftime("%H:%M:%S")}: Found and modified {n} occurrences of names with spaces.\n')
    modified_location_names = [*modified_locations.values()]
    modified_person_names = [*modified_persons.values()]
    # compute the timestamp once so that the three pickles always get matching names
    timestamp = time.strftime("%d%m%Y_%H%M")
    pickle_obj(modified_gold_couplets, f'pickles/modified_gold_couplets_{timestamp}.pickle')
    pickle_obj(modified_location_names, f'pickles/modified_location_names_{timestamp}.pickle')
    pickle_obj(modified_person_names, f'pickles/modified_person_names_{timestamp}.pickle')
    return modified_location_names, modified_person_names, modified_gold_couplets
def get_names_for_testing(name_tokens: list[str]) -> tuple[list[str], list[str], list[str]]:
    """
    From the given list of name tokens, select five with a high frequency, five with an average frequency and five with
    a low frequency and return the names in three separate lists.

    The distinct names are ranked by frequency (most frequent first). From the 20 most frequent names, from a window
    of up to 20 names around the first name whose frequency is within 1 of the rounded-up mean frequency, and from
    the 20 least frequent names, every fourth name is taken. Fewer than five names are returned per list when the
    respective selection holds fewer than 20 names. The middle list is empty if no name has a frequency within 1
    of the mean. For an empty input, three empty lists are returned."""
    if not name_tokens:
        # statistics.mean() raises on empty data, and there is nothing to select from anyway
        return [], [], []
    tokens_counts_sorted = Counter(name_tokens).most_common()
    token_average_freq = ceil(mean([count for _, count in tokens_counts_sorted]))
    near_average = (token_average_freq - 1, token_average_freq, token_average_freq + 1)
    twenty_from_middle = []
    for i, (_, count) in enumerate(tokens_counts_sorted):
        if count in near_average:
            # Clamp the window start at 0: a negative start would make the slice count from the END of the
            # list, which yields an empty or unrelated selection whenever the match is among the first ten.
            twenty_from_middle = tokens_counts_sorted[max(i - 10, 0):i + 10]
            break
    # [3::4] picks the 4th, 8th, 12th, 16th and 20th entry of each selection
    five_from_middle = [name for name, _ in twenty_from_middle][3::4]
    five_from_top = [name for name, _ in tokens_counts_sorted[:20]][3::4]
    five_from_bottom = [name for name, _ in tokens_counts_sorted[-20:]][3::4]
    return five_from_top, five_from_middle, five_from_bottom
def save_excluded_names_to_csv(names_tuple: tuple[list[str], list[str], list[str]], path: str) -> None:
    """Write the lists of the given tuple to a UTF-8 CSV file at the specified path, one list per row."""
    with open(path, 'w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(names_tuple)
def save_names_to_exclude() -> list[str]:
    """
    Create a list containing 15 person and 15 location entity names that should not be tagged as entities in the
    training data for testing purposes, pickle and return the list.

    The names are chosen by get_names_for_testing() from the name tokens that occur in the most recent modified
    gold couplets. Fewer names are returned if that function selects fewer. The selected names are also saved to
    two CSV files, one for persons and one for locations."""
    print(f'\n{time.strftime("%H:%M:%S")}: Getting names to exclude...')
    modified_gold_couplets = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='modified_gold_couplets'))
    # sets make the per-token membership tests below constant-time instead of scanning the whole list each time
    modified_location_names = set(unpickle_obj(get_most_recent_file(dir_path='pickles',
                                                                   prefix='modified_location_names')))
    modified_person_names = set(unpickle_obj(get_most_recent_file(dir_path='pickles',
                                                                 prefix='modified_person_names')))
    per_tokens = []
    loc_tokens = []
    for gold_couplet in modified_gold_couplets:
        for word in gold_couplet.split(' '):
            # a token that is on both lists is counted as a location
            if word in modified_location_names:
                loc_tokens.append(word)
            elif word in modified_person_names:
                per_tokens.append(word)
    per_names_for_testing = get_names_for_testing(per_tokens)
    loc_names_for_testing = get_names_for_testing(loc_tokens)
    # compute the timestamp once so that all output files always get matching names
    timestamp = time.strftime("%d%m%Y_%H%M")
    save_excluded_names_to_csv(per_names_for_testing, f'data/testing/excluded_per_names_{timestamp}.csv')
    save_excluded_names_to_csv(loc_names_for_testing, f'data/testing/excluded_loc_names_{timestamp}.csv')
    # flatten the high / average / low frequency lists of both entity types into one list
    names_for_testing = [name for t in per_names_for_testing + loc_names_for_testing for name in t]
    print(f'{time.strftime("%H:%M:%S")}: Saving {len(names_for_testing)} names to exclude from training data:\n'
          f'person names: {per_names_for_testing}\n'
          f'location names: {loc_names_for_testing}\n')
    pickle_obj(names_for_testing, f'pickles/names_for_testing_{timestamp}.pickle')
    return names_for_testing
def _bio_tag_name(name: str, entity: str) -> list[tuple[str, str]]:
    """Split a name at § and label its first part 'B-<entity>' and every further part 'I-<entity>'."""
    first_part, *other_parts = name.split('§')
    return [(first_part, f'B-{entity}'), *((part, f'I-{entity}') for part in other_parts)]


def tag_gold_couplets() -> list[list[tuple[str, str]]]:
    """
    Tag the couplets containing names from the gold standard lists according to the BIO notation, return a list
    containing the tagged couplets, each couplet being a list of tuples containing a token and its label,
    and pickle the list object.

    Names listed in the most recent 'names_for_testing' pickle are treated like ordinary words and get the
    label 'O'. A token that is both a location and a person name is tagged as a location."""
    print(f'\n{time.strftime("%H:%M:%S")}: Tagging couplets...')
    modified_gold_couplets = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='modified_gold_couplets'))
    modified_location_names = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='modified_location_names'))
    modified_person_names = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='modified_person_names'))
    names_for_testing = set(unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='names_for_testing')))
    # sets make the per-token membership tests below constant-time instead of scanning the whole list each time
    location_names_shortened = {name for name in modified_location_names if name not in names_for_testing}
    person_names_shortened = {name for name in modified_person_names if name not in names_for_testing}
    tagged_couplets = []
    for gold_couplet in modified_gold_couplets:
        tagged_couplet = []
        for word in gold_couplet.split(' '):
            if word in location_names_shortened:
                tagged_couplet.extend(_bio_tag_name(word, 'LOC'))
            elif word in person_names_shortened:
                tagged_couplet.extend(_bio_tag_name(word, 'PER'))
            else:
                tagged_couplet.append((word, 'O'))
        tagged_couplets.append(tagged_couplet)
    print(f'{time.strftime("%H:%M:%S")}: Tagged {len(tagged_couplets)} couplets.\n')
    pickle_obj(tagged_couplets, f'pickles/tagged_couplets_final_{time.strftime("%d%m%Y_%H%M")}.pickle')
    return tagged_couplets
def save_tagged_data() -> None:
    """Write the tagged data to a text file according to the BIO notation.

    The most recent 'tagged_couplets_final' pickle is written as UTF-8 with one token per line, in the form
    '<token><TAB><label>'. Each couplet is followed by an empty line."""
    tagged_couplets = unpickle_obj(get_most_recent_file(dir_path='pickles', prefix='tagged_couplets_final'))
    file_path = f'data/tagged/tagged_data_final_{time.strftime("%d%m%Y_%H%M")}.txt'
    # explicit UTF-8 so that writing the Persian tokens does not depend on the platform's default encoding
    with open(file_path, 'w', encoding='utf-8') as tagged_data_file:
        for couplet in tagged_couplets:
            for word in couplet:
                tagged_data_file.write(f'{word[0]}\t{word[1]}\n')
            # an empty line separates the couplets
            tagged_data_file.write('\n')
    print(f'{time.strftime("%H:%M:%S")}: Saved tagged data as "{file_path}".\n')
if __name__ == '__main__':
    print('\nRunning annotate_text.py...\n')
    started_at = timer()
    # The steps run in this order. If you have already run a step and thus pickled its results,
    # comment out the respective entry.
    pipeline_steps = (
        get_verses,
        get_couplets,
        get_gold_couplets,
        replace_spaces_in_names,
        # TODO: check how many entity tokens / occurrences were excluded because of this
        save_names_to_exclude,
        tag_gold_couplets,
        save_tagged_data,
    )
    for run_step in pipeline_steps:
        run_step()
    elapsed_minutes = round((timer() - started_at) / 60, 2)
    print(f'\n{time.strftime("%d/%m/%Y %H:%M:%S")}: Done!\n\nElapsed time: {elapsed_minutes} minutes.\n')