-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractor.py
50 lines (42 loc) · 1.48 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import string
def init_skip_symbol():
    """Initialise the module-level character tables.

    Sets two globals:

    * ``skip_symbol`` -- the ten ASCII digits followed by
      ``string.ascii_letters``.
    * ``erase_symbol`` -- a string of punctuation and symbol characters,
      including several non-ASCII ones.
    """
    global skip_symbol
    global erase_symbol
    skip_symbol = f"0123456789{string.ascii_letters}"
    # The backslash is spelled "\\" so the literal does not rely on the
    # invalid escape sequence "\<"; the resulting string is unchanged.
    erase_symbol = "\n,./;'[]\\<>?:\"{}|_+(*&^%$#@!)(),。.?」 ⋯「!"
def split_to_word(sentense:str,word_len : int):
    """Cut ``sentense`` into words of ``word_len`` consecutive units.

    A unit is either a maximal run of characters found in the module-level
    ``skip_symbol`` string, or one single character that is not in it.
    Units are gathered left to right; every time ``word_len`` of them have
    accumulated they are concatenated into one word.  Units still pending
    at the end of the input are emitted as a final, possibly shorter, word.

    Returns an empty list when ``sentense`` is empty or contains fewer
    characters than ``word_len``.
    """
    size = len(sentense)
    if size == 0 or size < word_len:
        return []

    words = []
    pending = []  # units gathered for the word currently being built
    run = ""      # unbroken run of skip_symbol characters seen so far

    def flush(force: bool = False):
        # Emit the pending units as one word once enough have been
        # gathered, or unconditionally when forced.
        if not pending:
            return
        if force or len(pending) >= word_len:
            words.append("".join(pending))
            pending.clear()

    for ch in sentense:
        if ch in skip_symbol:
            run += ch
        else:
            if run:
                # A run just ended: it counts as a single unit.
                pending.append(run)
                run = ""
                flush()
            pending.append(ch)
            flush()

    if run:
        pending.append(run)
    flush(True)
    return words
def extract_gram(sentense : str,word_len : int):
    """Collect the distinct words ``split_to_word`` yields over suffixes.

    Every position of ``sentense`` is tried as a starting point, except
    positions that fall inside a run of ``skip_symbol`` characters (both
    the previously processed character and the current one are in
    ``skip_symbol``).  For each remaining position the suffix starting
    there is passed to ``split_to_word`` and any word not seen before is
    appended to the result.

    Returns the unique words in first-seen order.
    """
    words = []
    seen = set()  # O(1) duplicate check instead of scanning ``words``
    # ``None`` rather than "": the empty string is a substring of every
    # string, so `"" in skip_symbol` is True and would wrongly treat the
    # first character as being in the middle of a run.
    last_char = None
    for char_pos, char in enumerate(sentense):
        if (
            last_char is not None
            and last_char in skip_symbol
            and char in skip_symbol
        ):
            continue
        for word in split_to_word(sentense[char_pos:], word_len):
            if word not in seen:
                seen.add(word)
                words.append(word)
        last_char = char
    return words
# Populate the skip_symbol / erase_symbol globals when the module is loaded.
init_skip_symbol()