import numpy as np

# Characters treated as token boundaries: punctuation, digits, and a few
# non-ASCII symbols. A space is also a boundary but is never emitted as a token.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"


def split(text):
    """Split text into word tokens and single-character delimiter tokens."""
    o = []
    t = ""
    for i in text + " ":  # the trailing space flushes the last pending token
        if i in s:
            if t != "":
                o.append(t)
                t = ""
            if i != " ":  # keep delimiter characters as tokens, drop plain spaces
                o.append(i)
        else:
            t += i
    return o

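
# Rough sketch of the expected behaviour (illustrative example, not from the original):
# split("Hello, worlds!") -> ["Hello", ",", "worlds", "!"]
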

def tokenize_2str(text: str):
    """Tokenize text into strings, splitting a trailing "es" into stem + <es>."""
    o = []
    for i in split(text):
        if i[-2:] == "es":
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o

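
# Rough sketch of the expected behaviour (illustrative example, not from the original):
# tokenize_2str("boxes!") -> ["box", "<es>", "!"]
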

# Vocabulary: index -> token and token -> index, preloaded with the special tokens.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}


def fit_on_text(text: str):
    """Add every previously unseen token from text to the vocabulary."""
    global ind2text
    global text2ind
    tokens = tokenize_2str(text)
    for i in tokens:
        if i not in text2ind:  # skip tokens already in the vocabulary
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1


def fit_on_texts(texts):
    for text in texts:
        fit_on_text(text)


def tokenize(text: str):
    """Convert text to an array of vocab indices; unknown tokens map to <UNK>."""
    o = []
    for i in tokenize_2str(text):
        if i in text2ind:  # known token: use its index
            o.append(text2ind[i])
        else:
            o.append(text2ind["<UNK>"])
    return np.array(o)

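
# Minimal usage sketch (assumed workflow, not part of the original module):
# build the vocabulary first, then convert new text to index arrays.
if __name__ == "__main__":
    fit_on_texts(["the boxes are here", "more boxes arrive"])
    print(tokenize("the boxes vanish"))  # unseen "vanish" maps to <UNK> (index 1)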