AnotReader.py
import nltk
from lxml import etree
from nltk.probability import FreqDist

doc = etree.parse('bc3corpus.1.0/annotation.xml')
thread_dict = {}
names = []
# Map each thread name to {annotator desc -> [summary sentences]}.
for thread in doc.xpath('//thread'):
    for name in thread.xpath('name'):
        thread_dict[name.text] = {}
        names.append(name.text)
    # Walk each annotation so its desc is paired with its own summary
    # sentences rather than with every summary sentence in the thread.
    for annotation in thread.xpath('annotation'):
        for desc in annotation.xpath('desc'):
            thread_dict[name.text].setdefault(desc.text, [])
            for t in annotation.xpath('summary/sent'):
                thread_dict[name.text][desc.text].append(t.text)
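
# The element layout assumed by the XPath queries above is sketched below.
# It is inferred from the script itself, not from the BC3 documentation, so
# treat the exact tag names as an assumption. The helper is a small,
# self-contained sanity check and is never called by the main script.
def _demo_assumed_schema():
    sample = etree.fromstring(
        b"<root><thread>"
        b"<name>Example thread</name>"
        b"<annotation><desc>annotator1</desc>"
        b"<summary><sent>First sentence.</sent><sent>Second one.</sent></summary>"
        b"</annotation>"
        b"</thread></root>"
    )
    for thread in sample.xpath('//thread'):
        print(thread.findtext('name'))
        for annotation in thread.xpath('annotation'):
            print(annotation.findtext('desc'),
                  [s.text for s in annotation.xpath('summary/sent')])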
# Collapse every annotator's summary sentences into one string per thread.
subject_map = {}
for y, name in enumerate(names, start=1):
    parts = []
    for x in thread_dict[name]:
        try:
            parts.append(','.join(thread_dict[name][x]))
        except TypeError:
            # An empty <sent> element yields None, which join() rejects.
            print("Exception occurred due to {0}".format(thread_dict[name][x]))
    subject_map[name] = ','.join(parts)

    print("\nBasic analysis for thread number {0}".format(y))
    text1 = subject_map[name]
    # Tokenize the text and wrap the tokens in an NLTK Text object.
    tok_text1 = nltk.word_tokenize(text1)
    nlp_text = nltk.Text(tok_text1)
    print("The number of unique words in the input is {0}".format(len(set(nlp_text))))
    # Build a frequency distribution; hapaxes are words that occur exactly once.
    fdist_text1 = FreqDist(nlp_text)
    hapax_text = fdist_text1.hapaxes()
    relwords = [w for w in set(hapax_text) if len(w) > 2 and w.isalpha()]
    print("The relevant words are (most stop words removed):")
    print(relwords)
    # Analyse how often each word length occurs among the relevant words.
    len_relwords = [len(w) for w in set(relwords)]
    len_dist = FreqDist(len_relwords)
    print("The counts of the most frequently occurring relevant word lengths are:")
    print(list(len_dist.items()))
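
# Note: nltk.word_tokenize() relies on the "punkt" tokenizer models. If they
# are not already installed locally (an assumption about the environment, not
# part of the original script), a one-time download makes the script run:
#
#     import nltk
#     nltk.download('punkt')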