BasicStats.py
import nltk
from nltk.probability import FreqDist
from lxml import etree


def read_xml():
    """Parse the BC3 corpus and collect a basic feature set per sentence."""
    thread_dict = {}
    s_features = {}
    doc = etree.parse('bc3corpus.1.0/corpus.xml')
    for thread in doc.xpath('//thread'):
        for name in thread.xpath('name'):
            l = 0
            n = name.text
            thread_dict[n] = {}
            s_features[n] = {}
            #names.append(name.text)
            for docs in thread.xpath('DOC'):
                for subject in docs.xpath('Subject'):
                    thread_dict[n].setdefault(subject.text, [])
                #for Recv in docs.xpath('From'):
                #    thread_dict[n][subject.text].append(Recv.text)
                #for to in docs.xpath('To'):
                #    thread_dict[n][subject.text].append(to.text)
                for t in docs.xpath('Text/Sent'):
                    s = t.text
                    # `subject` here is the last <Subject> of this DOC,
                    # left over from the loop above
                    thread_dict[n][subject.text].append(s)
                    s_features[n][s] = []
                    # Feature 0: position of the sentence in the thread
                    s_features[n][s].append(l)
                    # Feature 1: sentence length
                    s_features[n][s].append(len(s))
                    # Feature 2: 1 if the sentence is a question, else 0
                    if '?' in s:
                        s_features[n][s].append(1)
                    else:
                        s_features[n][s].append(0)
                    l += 1
            print(thread_dict[name.text][subject.text])
            #s_features[name.text] = basic
            print("The basic feature set is {0}".format(s_features[n]))
            count = l
            print("The total number of sentences is")
            print(count)
            print("\n")
    # Note: count holds the sentence count of the last thread only
    return thread_dict, s_features, count
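# Illustrative sketch (values are made up, not taken from the corpus) of the
# structures read_xml() builds: thread_dict maps a thread name to
# {subject: [sentences]}, and s_features maps a thread name to
# {sentence: [thread_line_num, sentence_length, is_question]}, e.g.
#
#   thread_dict = {'thread_01': {'Re: meeting time': ['Can we move it?',
#                                                     'Fine by me.']}}
#   s_features  = {'thread_01': {'Can we move it?': [0, 15, 1],
#                                'Fine by me.': [1, 11, 0]}}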
def print_Stats(t_dict):
    """Print basic NLTK statistics for every thread in t_dict."""
    subject_map = {}
    y = 0
    for name in t_dict:
        y += 1
        # Join the sentences of every subject into one string per thread
        parts = []
        for x in t_dict[name]:
            try:
                parts.append(','.join(t_dict[name][x]))
            except TypeError:
                print("Exception occurred due to {0}".format(t_dict[name][x]))
        subject_map[name] = ','.join(parts)
        print("The total content of thread number {0} is".format(y))
        print(subject_map[name])
        print("\nBasic analysis for thread number {0}".format(y))
        text1 = subject_map[name]
        # Tokenize the text
        tok_text1 = nltk.word_tokenize(text1)
        print("The tokenized text is {0}".format(tok_text1))
        # Wrap the token list in an NLTK Text object
        nlp_text = nltk.Text(tok_text1)
        #print(nlp_text.concordance('severe'))
        #print(nlp_text.similar('useful'))
        #print(nlp_text.common_contexts(["useful", "very"]))
        # Print the nltk-generated abstract (Text.generate needs NLTK >= 3.5)
        print("The nltk generated abstract is")
        print(nlp_text.generate())
        print("Number of words in the input: {0}".format(len(nlp_text)))
        print("Number of unique words in the input: {0}".format(len(set(nlp_text))))
        # Frequency distribution of the text
        fdist_text1 = FreqDist(nlp_text)
        vocab_text1 = fdist_text1.keys()  # vocabulary, currently unused
        print("The hapaxes in the given text are {0}".format(fdist_text1.hapaxes()))
        #fdist_text1.plot(20, cumulative=True)
        # Basic set of relevant words: alphabetic tokens longer than two characters
        relwords = [w for w in set(nlp_text) if len(w) > 2 and w.isalpha()]
        print("The relevant words are (most stop words removed):")
        print(relwords)
        print("The number of relevant words is {0}".format(len(relwords)))
        # Frequency distribution of relevant-word lengths
        len_relwords = [len(w) for w in set(relwords)]
        len_dist = FreqDist(len_relwords)
        print("The counts of the most frequently occurring relevant-word lengths are:")
        print(list(len_dist.items()))
        #fdist_text1.plot(40, cumulative=True)
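

# A minimal driver sketch, assuming the BC3 corpus is unpacked at the path
# that read_xml() hard-codes ('bc3corpus.1.0/corpus.xml') and that NLTK's
# 'punkt' tokenizer data is available (nltk.download('punkt')).
if __name__ == '__main__':
    threads, features, last_count = read_xml()
    print_Stats(threads)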