import numpy as np

# Characters treated as token boundaries: punctuation, digits, and a few
# non-ASCII symbols. A space is also a boundary but is never emitted as a token.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"


def split(text):
    """Split text into word tokens and single-character delimiter tokens."""
    o = []
    t = ""
    for i in text + " ":  # the trailing space flushes the last pending token
        if i in s:
            if t != "":
                o.append(t)
                t = ""
            if i != " ":  # keep delimiter characters as tokens, drop plain spaces
                o.append(i)
        else:
            t += i
    return o

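
# Rough sketch of the expected behaviour (illustrative example, not from the original):
# split("Hello, worlds!") -> ["Hello", ",", "worlds", "!"]
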

def tokenize_2str(text: str):
    """Tokenize text into strings, splitting a trailing "es" into stem + <es>."""
    o = []
    for i in split(text):
        if i[-2:] == "es":
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o

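
# Rough sketch of the expected behaviour (illustrative example, not from the original):
# tokenize_2str("boxes!") -> ["box", "<es>", "!"]
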

# Vocabulary: index -> token and token -> index, preloaded with the special tokens.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}


def fit_on_text(text: str):
    """Add every previously unseen token from text to the vocabulary."""
    global ind2text
    global text2ind
    tokens = tokenize_2str(text)
    for i in tokens:
        if i not in text2ind:  # skip tokens already in the vocabulary
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1


def fit_on_texts(texts):
    for text in texts:
        fit_on_text(text)


def tokenize(text: str):
    """Convert text to an array of vocab indices; unknown tokens map to <UNK>."""
    o = []
    for i in tokenize_2str(text):
        if i in text2ind:  # known token: use its index
            o.append(text2ind[i])
        else:
            o.append(text2ind["<UNK>"])
    return np.array(o)

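
# Minimal usage sketch (assumed workflow, not part of the original module):
# build the vocabulary first, then convert new text to index arrays.
if __name__ == "__main__":
    fit_on_texts(["the boxes are here", "more boxes arrive"])
    print(tokenize("the boxes vanish"))  # unseen "vanish" maps to <UNK> (index 1)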