-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTokenize.py
121 lines (104 loc) · 5.95 KB
/
Tokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
@author: sourabh garg
"""
import itertools
import re
import words_tagging
from cltk.stem.sanskrit.indian_syllabifier import Syllabifier
from cltk.corpus.sanskrit.alphabet import *
lang='hindi'
h = Syllabifier(lang)
VOWELS=[INDEPENDENT_VOWELS_SIMPLE,
INDEPENDENT_VOWELS_DIPTHONGS,
INDEPENDENT_VOWELS]
VOWELS= list(itertools.chain(*VOWELS))
CONSONANTS=[CONSONANT_GUTTURALS,
CONSONANT_PALATALS,
CONSONANT_CEREBRALS,
CONSONANT_DENTALS,
CONSONANT_LABIALS,
SEMIVOWEL_CONSONANT,
SIBILANT_CONSONANT,
SONANT_ASPIRATE ]
CONSONANTS= list(itertools.chain(*CONSONANTS))
CONSONANT_HALANTA = [x+'्' for x in CONSONANTS]
CONS_TO_CONS=dict(zip(CONSONANT_HALANTA, CONSONANTS))
matraa_to_vowel={'':'अ' , 'ा':'आ' ,'ि':'इ', 'ी':'ई', 'ु':'उ', 'ू':'ऊ','े':'ए' , 'ै':'ऐ' , 'ो':'ओ', 'ौ':'औ', 'ृ':'ऋ', 'ं':'अं','ः':'अः','ँ':'अँ' , 'ां':'आं'}
vowel_to_matraa={'अ':'','आ' : 'ा','इ':'ि','ई':'ी','उ':'ु','ऊ':'ू','ए':'े','ऐ':'ै','ओ':'ो','औ':'ौ','ऋ':'ृ','अं':'ं','अः':'ः','अँ':'ँ','आं': 'ां'}
ch_to_stem={'अ':'a_stem_','इ':'i_stem_','उ':'u_stem_','आ':'a_stem_','ई':'ii_stem_',
'ऊ':'uu_stem_','ओ':'o_stem_','औ':'oo_stem_','ऋ':'r_stem_','द्':'d_stem_',
'अन्':'an_stem_','त्':'t_stem_','स्':'s_stem_','च्':'c_stem_','ज्':'j_stem_',
'ध्':'dh_stem_','श्':'sh_stem_','ह्':'h_stem_','इन्':'en_stem_','अस्':'as_stem_'}
def complete_tokenize(word):
'''
Complete Tokenization of a noun word into smallest possible unit (character)
for e.g. विद्यार्थी = 'व्'+'इ'+'द्'+'य्'+'आ'+'र्'+'थ्'+'ई'
'''
tokened_list=[]
word_list = h.orthographic_syllabify(word)
for syallable in word_list:
syallable=list(itertools.chain(*syallable))
if len(syallable)==1:
if syallable[0] in CONSONANTS:
tokened_list.append(syallable[0]+'्')
tokened_list.append('अ')
elif syallable[0] in VOWELS:
tokened_list.append(syallable[0])
elif syallable[0] in matraa_to_vowel.keys():
tokened_list.append(matraa_to_vowel[syallable[0]])
else:
for index in range(len(syallable)):
token=syallable[index]
if token in CONSONANTS and index==len(syallable)-1:
tokened_list.append(token+'्')
tokened_list.append('अ')
elif token in CONSONANTS:
tokened_list.append(token+'्')
elif token in matraa_to_vowel.keys():
tokened_list.append(matraa_to_vowel[token])
return tokened_list
def join(list_char):
''' It joins the complete tokenized word'''
word=''
length=len(list_char)
for index in range(length):
if index==0 and list_char[index] in vowel_to_matraa.keys():
word=word+list_char[index]
elif list_char[index] in CONS_TO_CONS.keys() and index==length-1:
word=word+list_char[index]
elif list_char[index] in CONS_TO_CONS.keys() and list_char [index+1] in vowel_to_matraa.keys():
word=word+CONS_TO_CONS[list_char[index]]
elif list_char[index] in CONS_TO_CONS.keys():
word=word+list_char[index]
elif list_char[index] in vowel_to_matraa.keys():
word=word+vowel_to_matraa[list_char[index]]
else:
word=word+list_char[index]
return word
def stem_class(word):
lis=complete_tokenize(word)
length=len(lis)
ch=lis[length-1]
ch2=lis[length-2]
if join([ch2,ch]) in ch_to_stem.keys():
return ch_to_stem[join([ch2,ch])]
elif ch in ch_to_stem.keys():
return ch_to_stem[ch]
def gender(word):
for index in range(len(words_tagging.all_noun)):
if word in words_tagging.all_noun[index]:
stem_t=words_tagging.dict_noun[index]
gen = re.sub(r"^[^_]*_[^_]*_", "", stem_t)
return gen
if __name__ == '__main__':
'''
Illustration with some difficult words'''
words=['क्लेश','नित्यः','यत्न','स्त्रोत','चक्कर','पुख्ता', 'दुग्गल', 'घग्घर', 'घ्राण','पड्क','सच्चा','स्वच्छ','सज्जा','ज्योतिष','विज्ञान','वञ्चिंत','मिट्टी','गट्ठर','अकाट्य','ट्रस्ट','बुड्ढा','ड्रेस','घण्टा',"कण्ठ",
'उत्कंठा','कुत्ता','मत्था','उत्फुल्ल','उत्पत्ति','नृत्य','जिद्दी','सिद्धि','कर्तृणी','विद्यार्थी','द्रवित','द्वारा','द्वेष','आँध्र','संध्या','सिंह','सींग','महन्त','मन्दाकिनी','धन्धा','प्रसन्न','उन्मूलन','अन्य','कन्हैया',
'गुप्त','ठप्पा','प्यास','प्रणाम','हफ्ता','जब्त','कब्ज','ब्रिटिश','भ्लेच्छ','सहस्र','सहस्त्र','ह्रस्व','आर्य','चक्र','क्षत्रिय','दण्डिन्','राम','नृप','ग्राम्','पुत्र','सूर्य','चन्द्र','शिष्य्','अनार']
for i in words:
st=complete_tokenize(i)
print(i,st,join(st))
print(stem_class('अम्बु'))
print(gender('ज्ञातृ'))
print(join(['भ्', 'ल्', 'ए', 'च्', 'छ्']))