Spaces:
Runtime error
Runtime error
Robert Jones
committed on
Commit
·
79f947d
1
Parent(s):
0e49377
Add SongBloom files directly - remove git clone dependency
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- SongBloom/.DS_Store +0 -0
- SongBloom/g2p/__init__.py +0 -0
- SongBloom/g2p/__pycache__/__init__.cpython-39.pyc +0 -0
- SongBloom/g2p/__pycache__/lyric_common.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/__init__.py +106 -0
- SongBloom/g2p/cn_zh_g2p/__pycache__/__init__.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/__pycache__/chinese.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/__pycache__/english.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/__pycache__/symbols.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/__pycache__/tone_sandhi.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/chinese.py +173 -0
- SongBloom/g2p/cn_zh_g2p/cmudict-fast.rep +0 -0
- SongBloom/g2p/cn_zh_g2p/cmudict.rep +0 -0
- SongBloom/g2p/cn_zh_g2p/engdict-hot.rep +2 -0
- SongBloom/g2p/cn_zh_g2p/english.py +362 -0
- SongBloom/g2p/cn_zh_g2p/nltk_data/corpora/cmudict/README +76 -0
- SongBloom/g2p/cn_zh_g2p/nltk_data/corpora/cmudict/cmudict +0 -0
- SongBloom/g2p/cn_zh_g2p/opencpop-strict.txt +429 -0
- SongBloom/g2p/cn_zh_g2p/symbols.py +401 -0
- SongBloom/g2p/cn_zh_g2p/tone_sandhi.py +806 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/README.md +16 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__init__.py +14 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/__init__.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/char_convert.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/chronology.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/constants.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/num.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/phonecode.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/quantifier.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/text_normlization.cpython-39.pyc +0 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/char_convert.py +46 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/chronology.py +134 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/constants.py +62 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/num.py +282 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/phonecode.py +63 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/quantifier.py +63 -0
- SongBloom/g2p/cn_zh_g2p/zh_normalization/text_normlization.py +165 -0
- SongBloom/g2p/lyric_common.py +81 -0
- SongBloom/g2p/pinyin/__init__.py +430 -0
- SongBloom/g2p/pinyin/__pycache__/__init__.cpython-39.pyc +0 -0
- SongBloom/g2p/pinyin/__pycache__/pinyin.cpython-39.pyc +0 -0
- SongBloom/g2p/pinyin/__pycache__/symbols.cpython-39.pyc +0 -0
- SongBloom/g2p/pinyin/pinyin.py +137 -0
- SongBloom/g2p/pinyin/symbols.py +71 -0
- SongBloom/models/__pycache__/transformer.cpython-39.pyc +0 -0
- SongBloom/models/base/__pycache__/sample.cpython-39.pyc +0 -0
- SongBloom/models/base/__pycache__/utils.cpython-39.pyc +0 -0
- SongBloom/models/base/sample.py +57 -0
- SongBloom/models/base/utils.py +57 -0
- SongBloom/models/musicgen/__init__.py +0 -0
SongBloom/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
SongBloom/g2p/__init__.py
ADDED
|
File without changes
|
SongBloom/g2p/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (145 Bytes). View file
|
|
|
SongBloom/g2p/__pycache__/lyric_common.cpython-39.pyc
ADDED
|
Binary file (1.84 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/__init__.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from . import chinese, english # , japanese 暂时干掉看看
|
| 2 |
+
from .symbols import *
|
| 3 |
+
import yaml
|
| 4 |
+
language_module_map = {"zh": chinese, "en": english} #, "ja": japanese
|
| 5 |
+
|
| 6 |
+
def is_chinese(uchar):
    """Return True if *uchar* is a CJK unified ideograph (U+4E00..U+9FA5).

    Fix: the original if/else returned literal booleans; a chained
    comparison expresses the range test directly.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
|
| 14 |
+
# def split_text(text):
|
| 15 |
+
# chinese_pattern = r'[\u4e00-\u9fa5][\u4e00-\u9fa5\ \,\.\!\?\,\。]+'
|
| 16 |
+
# english_pattern = r'[a-zA-Z][a-zA-Z\'\ \,\.\!\?]+'
|
| 17 |
+
|
| 18 |
+
# chinese_text = re.findall(chinese_pattern, text)
|
| 19 |
+
# print(chinese_text)
|
| 20 |
+
# english_text = re.findall(english_pattern, text)
|
| 21 |
+
|
| 22 |
+
# return chinese_text, english_text
|
| 23 |
+
|
| 24 |
+
def split_text(text):
    """Split *text* into runs of Chinese / English / punctuation segments.

    Returns (segments, langs) where langs[i] is 'zh', 'en', or 'sp'
    (single punctuation character) for segments[i].
    """
    # Normalize punctuation first, using the zh module's replacement table.
    pattern = re.compile("|".join(re.escape(p) for p in chinese.rep_map.keys()))
    text = pattern.sub(lambda x: chinese.rep_map[x.group()], text)

    result = []
    lang = []
    buffer = ""
    chinese_pattern = r'[\u4e00-\u9fa5]'
    special_pattern = r'[\,\.\!\?\…\-]'
    # TODO check this logic
    for char in text:
        if re.match(special_pattern, char):
            # Punctuation ends the current run; classify it by its first char.
            if buffer:
                if not re.match(chinese_pattern, buffer[0]):
                    result.append(buffer)
                    lang.append('en')
                else:
                    result.append(buffer)
                    lang.append("zh")
            # NOTE(review): source formatting was lost in this dump; this assumes the
            # punctuation char is emitted even when buffer is empty — confirm upstream.
            result.append(char)
            lang.append('sp')
            buffer = ""

        elif re.match(chinese_pattern, char):
            # Language boundary en -> zh: flush the English run.
            if buffer and not re.match(chinese_pattern, buffer[-1]):
                result.append(buffer)
                buffer = ""
                lang.append('en')
            buffer += char
        else:
            # Language boundary zh -> en: flush the Chinese run.
            if buffer and re.match(chinese_pattern, buffer[-1]):
                result.append(buffer)
                buffer = ""
                lang.append("zh")
            buffer += char

    # Flush the trailing run, classified by its final character.
    if buffer:
        result.append(buffer)
        lang.append("zh" if re.match(chinese_pattern, buffer[-1]) else 'en')

    return result, lang
|
| 66 |
+
|
| 67 |
+
def mixed_language_to_phoneme(text):
    """Convert mixed zh/en text into a flat phoneme list plus word->phone counts."""
    segments, langs = split_text(text)
    phones = []
    word2ph = []
    for segment, language in zip(segments, langs):
        seg_phones, seg_word2ph, _ = language_to_phoneme(segment, language)
        phones.extend(seg_phones)
        # 'sp'/'en' segments yield word2ph=None; treat that as an empty mapping.
        word2ph.extend(seg_word2ph or [])
    return phones, word2ph
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def language_to_phoneme(text, language):
    """Convert a single-language segment to (phones, word2ph, norm_text).

    language: 'zh', 'en', or 'sp' (a punctuation segment, passed through
    untouched as a one-element phone list with word2ph=None).
    """
    if language == 'sp':
        return [text], None, text
    language_module = language_module_map[language]
    norm_text = language_module.text_normalize(text)
    if language == "zh":
        phones, word2ph = language_module.g2p(norm_text)
        # zh g2p guarantees a per-character phone count that tiles the phones.
        assert len(phones) == sum(word2ph)
        assert len(norm_text) == len(word2ph)
    else:
        try:
            phones = language_module.g2p(norm_text)
        except Exception:
            # Fix: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit. Keep the best-effort fallback.
            phones = [norm_text]
        word2ph = None

    # for ph in phones:
    #     assert ph in symbols, ph
    return phones, word2ph, norm_text
|
| 99 |
+
|
| 100 |
+
def gen_vocabs():
    """Dump the global phoneme symbol list to ./vocab.yaml.

    Fix: the original passed open(...) directly to yaml.dump and never
    closed the handle; a context manager guarantees the flush/close.
    """
    with open('./vocab.yaml', 'w') as f:
        yaml.dump(symbols, f)
|
| 102 |
+
|
| 103 |
+
class G2P_Mix():
    """Callable front-end: mixed zh/en text in, space-joined phoneme string out."""

    def __call__(self, text):
        phones, _ = mixed_language_to_phoneme(text)
        return ' '.join(phones)
|
SongBloom/g2p/cn_zh_g2p/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (2.77 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/__pycache__/chinese.cpython-39.pyc
ADDED
|
Binary file (4.18 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/__pycache__/english.cpython-39.pyc
ADDED
|
Binary file (6.81 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/__pycache__/symbols.cpython-39.pyc
ADDED
|
Binary file (2.29 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/__pycache__/tone_sandhi.cpython-39.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/chinese.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pdb
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import cn2an
|
| 6 |
+
from pypinyin import lazy_pinyin, Style
|
| 7 |
+
|
| 8 |
+
from .symbols import punctuation
|
| 9 |
+
from .tone_sandhi import ToneSandhi
|
| 10 |
+
from .zh_normalization.text_normlization import TextNormalizer
|
| 11 |
+
|
| 12 |
+
# Convert Arabic numerals embedded in text to Chinese numerals (e.g. "3" -> "三").
normalizer = lambda x: cn2an.transform(x, "an2cn")

current_file_path = os.path.dirname(__file__)
# Map "pinyin" -> "initial final" using the opencpop-strict table shipped
# next to this module (tab-separated, one entry per line).
pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}

import jieba_fast.posseg as psg
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Punctuation normalization table: fullwidth / CJK marks -> the ASCII
# punctuation set understood by the rest of the pipeline.
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "~": "…",
}

# Shared tone-sandhi rewriter used by _g2p().
tone_modifier = ToneSandhi()
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def replace_punctuation(text):
    """Map CJK punctuation to ASCII equivalents and drop unsupported characters."""
    # Replace characters pypinyin mishandles before any other processing.
    text = text.replace("嗯", "恩").replace("呣", "母")
    punct_re = re.compile("|".join(re.escape(key) for key in rep_map))
    normalized = punct_re.sub(lambda m: rep_map[m.group()], text)
    # Keep only CJK ideographs and the allowed punctuation set.
    drop_class = r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+"
    return re.sub(drop_class, "", normalized)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def g2p(text):
    """Split Chinese *text* at punctuation and convert each sentence to phonemes."""
    boundary = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [part for part in re.split(boundary, text) if part.strip() != ""]
    return _g2p(sentences)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _get_initials_finals(word):
    """Return parallel lists of pinyin initials and tone-suffixed finals for *word*.

    Fix: the original zipped the two lazy_pinyin results only to copy them
    element-by-element into fresh lists; lazy_pinyin already returns one
    entry per character in both styles, so return them directly.
    """
    initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    return initials, finals
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _g2p(segments):
    """Core zh G2P: sentences -> (flat phone list, per-character phone counts)."""
    phones_list = []
    word2ph = []
    for seg in segments:
        pinyins = []
        # Replace all English words in the sentence (zh path handles CJK only).
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        # Merge words first so tone sandhi sees the right units.
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)

            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])
        #
        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                # pypinyin emits punctuation identically as initial and final.
                assert c in punctuation
                phone = [c]
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # Syllable with an initial consonant: restore contracted finals.
                    v_rep_map = {
                        "uei": "ui",
                        "iou": "iu",
                        "uen": "un",
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # Bare-final syllable: rewrite to the standalone spelling.
                    pinyin_rep_map = {
                        "ing": "ying",
                        "i": "yi",
                        "in": "yin",
                        "u": "wu",
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            "v": "yu",
                            "e": "e",
                            "i": "y",
                            "u": "w",
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                # Table maps each pinyin syllable to "initial final"; re-attach the tone.
                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
                new_v = new_v + tone
                phone = [new_c, new_v]
                word2ph.append(len(phone))

            phones_list += phone
    return phones_list, word2ph
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def text_normalize(text):
    """Normalize Chinese text (numbers, dates, units, ...) and unify punctuation.

    Uses PaddleSpeech's zh_normalization frontend:
    https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    """
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    # Fix: was string `+=` inside the loop; join builds the result in one pass.
    return "".join(replace_punctuation(sentence) for sentence in sentences)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
if __name__ == "__main__":
    # Manual smoke test; each reassignment overrides the previous sample,
    # so only the last string is actually converted.
    text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
    text = "呣呣呣~就是…大人的鼹鼠党吧?"
    text = "你好"
    text = text_normalize(text)
    print(g2p(text))


# # Example usage
# text = "这是一个示例文本:,你好!这是一个测试..."
# print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
|
SongBloom/g2p/cn_zh_g2p/cmudict-fast.rep
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SongBloom/g2p/cn_zh_g2p/cmudict.rep
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SongBloom/g2p/cn_zh_g2p/engdict-hot.rep
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
|
| 2 |
+
JSON JH EY1 S AH0 N
|
SongBloom/g2p/cn_zh_g2p/english.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import wordsegment
|
| 5 |
+
from g2p_en import G2p
|
| 6 |
+
|
| 7 |
+
from string import punctuation
|
| 8 |
+
|
| 9 |
+
from .symbols import symbols
|
| 10 |
+
|
| 11 |
+
import unicodedata
|
| 12 |
+
from builtins import str as unicode
|
| 13 |
+
from g2p_en.expand import normalize_numbers
|
| 14 |
+
from nltk.tokenize import TweetTokenizer
|
| 15 |
+
word_tokenize = TweetTokenizer().tokenize
|
| 16 |
+
from nltk import pos_tag
|
| 17 |
+
|
| 18 |
+
current_file_path = os.path.dirname(__file__)
# Data files shipped alongside this module.
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")            # main CMU dictionary
CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")  # fast-speech variants
CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")    # manual overrides
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")      # pickled merged dict
NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle") # proper-name dict

# ARPAbet phone inventory (with stress digits) accepted downstream.
arpa = {
    "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B", "G", "AY0",
    "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2", "DH", "IY0", "EY1", "IH0",
    "K", "N", "W", "IY2", "T", "AA1", "ER1", "EH2", "OY0", "UH2", "UW1", "Z",
    "AW2", "AW1", "V", "UW2", "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH",
    "AE0", "IH2", "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
    "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L", "SH",
}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def replace_phs(phs):
    """Keep phones present in *symbols*, remapping a few; warn and drop the rest."""
    substitutions = {"'": "-"}
    kept = []
    for ph in phs:
        if ph in symbols:
            kept.append(ph)
        elif ph in substitutions.keys():
            kept.append(substitutions[ph])
        else:
            print("ph not in symbols: ", ph)
    return kept
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def read_dict():
    """Parse cmudict.rep into {word: [[phone, ...] per syllable]}.

    Entries start at line 49; earlier lines are the file's header.
    """
    g2p_dict = {}
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= start_line:
                line = line.strip()
                # NOTE(review): assumes a two-space word/pronunciation separator;
                # the source dump collapsed whitespace — confirm against cmudict.rep.
                word_split = line.split("  ")
                word = word_split[0].lower()

                # Pronunciation is syllable groups separated by " - ".
                syllable_split = word_split[1].split(" - ")
                g2p_dict[word] = []
                for syllable in syllable_split:
                    phone_split = syllable.split(" ")
                    g2p_dict[word].append(phone_split)

            line_index = line_index + 1
            line = f.readline()

    return g2p_dict
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def read_dict_new():
    """Build {word: [[phones]]} from cmudict.rep (from line 57) plus cmudict-fast.rep extras."""
    g2p_dict = {}
    with open(CMU_DICT_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= 57:
                line = line.strip()
                # NOTE(review): assumes a two-space separator; the source dump
                # collapsed whitespace — confirm against cmudict.rep.
                word_split = line.split("  ")
                word = word_split[0].lower()
                g2p_dict[word] = [word_split[1].split(" ")]

            line_index = line_index + 1
            line = f.readline()

    with open(CMU_DICT_FAST_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= 0:
                line = line.strip()
                word_split = line.split(" ")
                word = word_split[0].lower()
                # Fast-speech entries never override the main dictionary.
                if word not in g2p_dict:
                    g2p_dict[word] = [word_split[1:]]

            line_index = line_index + 1
            line = f.readline()

    return g2p_dict
|
| 167 |
+
|
| 168 |
+
def hot_reload_hot(g2p_dict):
    """Apply engdict-hot.rep overrides to *g2p_dict* in place and return it."""
    with open(CMU_DICT_HOT_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= 0:
                line = line.strip()
                word_split = line.split(" ")
                word = word_split[0].lower()
                # Custom pronunciations overwrite the dictionary unconditionally.
                g2p_dict[word] = [word_split[1:]]

            line_index = line_index + 1
            line = f.readline()

    return g2p_dict
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def cache_dict(g2p_dict, file_path):
    """Pickle *g2p_dict* to *file_path* so later runs can skip re-parsing."""
    with open(file_path, "wb") as cache_file:
        pickle.dump(g2p_dict, cache_file)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def get_dict():
    """Load the merged CMU dict (building and caching it on first use),
    then apply the hot overrides on top."""
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH, "rb") as cache_file:
            g2p_dict = pickle.load(cache_file)
    else:
        g2p_dict = read_dict_new()
        cache_dict(g2p_dict, CACHE_PATH)

    return hot_reload_hot(g2p_dict)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def get_namedict():
    """Load the pickled proper-name pronunciation dict; empty dict when no cache exists."""
    if not os.path.exists(NAMECACHE_PATH):
        return {}
    with open(NAMECACHE_PATH, "rb") as cache_file:
        return pickle.load(cache_file)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def text_normalize(text):
    """Normalize English text: unify punctuation, expand numbers, strip accents."""
    # todo: eng text normalize
    # Map CJK / fullwidth punctuation onto the forms g2p_en understands.
    punct_subs = {
        "[;::,;]": ",",
        '["’]': "'",
        "。": ".",
        "!": "!",
        "?": "?",
    }
    for pattern, replacement in punct_subs.items():
        text = re.sub(pattern, replacement, text)

    # Mirror g2p_en's own preprocessing, keeping uppercase intact.
    text = unicode(text)
    text = normalize_numbers(text)
    # Strip combining accent marks via NFD decomposition.
    text = "".join(
        char for char in unicodedata.normalize('NFD', text)
        if unicodedata.category(char) != 'Mn'
    )
    text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
    text = re.sub(r"(?i)i\.e\.", "that is", text)
    text = re.sub(r"(?i)e\.g\.", "for example", text)

    return text
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
class en_G2p(G2p):
    """g2p_en G2p subclass with a refreshed CMU dict, a name dict, and better OOV handling."""

    def __init__(self):
        super().__init__()
        # Initialize the word segmenter (used for compound OOV words).
        wordsegment.load()

        # Swap in the extended dictionary and add the proper-name dictionary.
        self.cmu = get_dict()
        self.namedict = get_namedict()

        # Remove a few abbreviations whose dictionary pronunciations are wrong.
        for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
            del self.cmu[word.lower()]

        # Correct heteronym entries.
        self.homograph2features["read"] = (['R', 'IY1', 'D'], ['R', 'EH1', 'D'], 'VBP')
        self.homograph2features["complex"] = (['K', 'AH0', 'M', 'P', 'L', 'EH1', 'K', 'S'], ['K', 'AA1', 'M', 'P', 'L', 'EH0', 'K', 'S'], 'JJ')


    def __call__(self, text):
        # tokenization
        words = word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for o_word, pos in tokens:
            # Reproduce g2p_en's lowercasing behaviour.
            word = o_word.lower()

            # No letters at all (pure punctuation/number token): pass through.
            if re.search("[a-z]", word) is None:
                pron = [word]
            # Handle single letters first.
            elif len(word) == 1:
                # Standalone "A" pronunciation fix; the original-cased
                # o_word is needed to detect the uppercase form.
                if o_word == "A":
                    pron = ['EY1']
                else:
                    pron = self.cmu[word][0]
            # g2p_en's stock heteronym handling.
            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                # pos1 longer than pos only happens for "read".
                elif len(pos) < len(pos1) and pos == pos1[:len(pos)]:
                    pron = pron1
                else:
                    pron = pron2
            else:
                # Recursive dictionary lookup / model prediction.
                pron = self.qryword(o_word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]


    def qryword(self, o_word):
        word = o_word.lower()

        # Dictionary lookup, except for single letters.
        if len(word) > 1 and word in self.cmu:  # lookup CMU dict
            return self.cmu[word][0]

        # Title-cased words fall back to the proper-name dictionary.
        if o_word.istitle() and word in self.namedict:
            return self.namedict[word][0]

        # OOV words of length <= 3 are spelled out letter by letter.
        if len(word) <= 3:
            phones = []
            for w in word:
                # Standalone "a" fix; no uppercase possible at this point.
                if w == "a":
                    phones.extend(['EY1'])
                else:
                    phones.extend(self.cmu[w][0])
            return phones

        # Try to split off a possessive 's.
        if re.match(r"^([a-z]+)('s)$", word):
            phones = self.qryword(word[:-2])[:]
            # Voiceless consonant endings P T K F TH HH: 's sounds as ['S'].
            if phones[-1] in ['P', 'T', 'K', 'F', 'TH', 'HH']:
                phones.extend(['S'])
            # Sibilant endings S Z SH ZH CH JH: 's sounds as ['IH1', 'Z'] or ['AH0', 'Z'].
            elif phones[-1] in ['S', 'Z', 'SH', 'ZH', 'CH', 'JH']:
                phones.extend(['AH0', 'Z'])
            # Voiced consonants B D G DH V M N NG L R W Y: 's sounds as ['Z'].
            # AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
            # ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2 vowel endings: 's sounds as ['Z'].
            else:
                phones.extend(['Z'])
            return phones

        # Try word segmentation, to handle compounds.
        comps = wordsegment.segment(word.lower())

        # Unsegmentable words go to the neural predictor.
        if len(comps)==1:
            return self.predict(word)

        # Segmentable compounds are handled recursively.
        return [phone for comp in comps for phone in self.qryword(comp)]
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# Module-level singleton: en_G2p() loads all dictionaries once at import time.
_g2p = en_G2p()
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def g2p(text):
    """Run the whole text through en_G2p, dropping tokens with no ARPA symbol."""
    raw_phones = _g2p(text)
    dropped = [" ", "<pad>", "UW", "</s>", "<s>"]
    phones = ["UNK" if ph == "<unk>" else ph for ph in raw_phones if ph not in dropped]

    return replace_phs(phones)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
if __name__ == "__main__":
    # Manual smoke test: plain word, abbreviation expansion, punctuation handling.
    print(g2p("hello"))
    print(g2p(text_normalize("e.g. I used openai's AI tool to draw a picture.")))
    print(g2p(text_normalize("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")))
|
SongBloom/g2p/cn_zh_g2p/nltk_data/corpora/cmudict/README
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]
|
| 2 |
+
|
| 3 |
+
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
| 4 |
+
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a
|
| 5 |
+
|
| 6 |
+
Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.
|
| 7 |
+
|
| 8 |
+
File Format: Each line consists of an uppercased word,
|
| 9 |
+
a counter (for alternative pronunciations), and a transcription.
|
| 10 |
+
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
|
| 11 |
+
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L
|
| 12 |
+
|
| 13 |
+
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
| 14 |
+
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
| 15 |
+
three or more pronunciations. Many of these are fast-speech variants.
|
| 16 |
+
|
| 17 |
+
Phonemes: There are 39 phonemes, as shown below:
|
| 18 |
+
|
| 19 |
+
Phoneme Example Translation Phoneme Example Translation
|
| 20 |
+
------- ------- ----------- ------- ------- -----------
|
| 21 |
+
AA odd AA D AE at AE T
|
| 22 |
+
AH hut HH AH T AO ought AO T
|
| 23 |
+
AW cow K AW AY hide HH AY D
|
| 24 |
+
B be B IY CH cheese CH IY Z
|
| 25 |
+
D dee D IY DH thee DH IY
|
| 26 |
+
EH Ed EH D ER hurt HH ER T
|
| 27 |
+
EY ate EY T F fee F IY
|
| 28 |
+
G green G R IY N HH he HH IY
|
| 29 |
+
IH it IH T IY eat IY T
|
| 30 |
+
JH gee JH IY K key K IY
|
| 31 |
+
L lee L IY M me M IY
|
| 32 |
+
N knee N IY NG ping P IH NG
|
| 33 |
+
OW oat OW T OY toy T OY
|
| 34 |
+
P pee P IY R read R IY D
|
| 35 |
+
S sea S IY SH she SH IY
|
| 36 |
+
T tea T IY TH theta TH EY T AH
|
| 37 |
+
UH hood HH UH D UW two T UW
|
| 38 |
+
V vee V IY W we W IY
|
| 39 |
+
Y yield Y IY L D Z zee Z IY
|
| 40 |
+
ZH seizure S IY ZH ER
|
| 41 |
+
|
| 42 |
+
(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
|
| 43 |
+
are contiguous, and not separated by FIRE'S 1.)
|
| 44 |
+
|
| 45 |
+
Redistribution and use in source and binary forms, with or without
|
| 46 |
+
modification, are permitted provided that the following conditions
|
| 47 |
+
are met:
|
| 48 |
+
|
| 49 |
+
1. Redistributions of source code must retain the above copyright
|
| 50 |
+
notice, this list of conditions and the following disclaimer.
|
| 51 |
+
The contents of this file are deemed to be source code.
|
| 52 |
+
|
| 53 |
+
2. Redistributions in binary form must reproduce the above copyright
|
| 54 |
+
notice, this list of conditions and the following disclaimer in
|
| 55 |
+
the documentation and/or other materials provided with the
|
| 56 |
+
distribution.
|
| 57 |
+
|
| 58 |
+
This work was supported in part by funding from the Defense Advanced
|
| 59 |
+
Research Projects Agency, the Office of Naval Research and the National
|
| 60 |
+
Science Foundation of the United States of America, and by member
|
| 61 |
+
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
|
| 62 |
+
the contributions of many volunteers to the expansion and improvement of
|
| 63 |
+
this dictionary.
|
| 64 |
+
|
| 65 |
+
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
| 66 |
+
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
| 67 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 68 |
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
| 69 |
+
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 70 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 71 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 72 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 73 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 74 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 75 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 76 |
+
|
SongBloom/g2p/cn_zh_g2p/nltk_data/corpora/cmudict/cmudict
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
SongBloom/g2p/cn_zh_g2p/opencpop-strict.txt
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
a AA a
|
| 2 |
+
ai AA ai
|
| 3 |
+
an AA an
|
| 4 |
+
ang AA ang
|
| 5 |
+
ao AA ao
|
| 6 |
+
ba b a
|
| 7 |
+
bai b ai
|
| 8 |
+
ban b an
|
| 9 |
+
bang b ang
|
| 10 |
+
bao b ao
|
| 11 |
+
bei b ei
|
| 12 |
+
ben b en
|
| 13 |
+
beng b eng
|
| 14 |
+
bi b i
|
| 15 |
+
bian b ian
|
| 16 |
+
biao b iao
|
| 17 |
+
bie b ie
|
| 18 |
+
bin b in
|
| 19 |
+
bing b ing
|
| 20 |
+
bo b o
|
| 21 |
+
bu b u
|
| 22 |
+
ca c a
|
| 23 |
+
cai c ai
|
| 24 |
+
can c an
|
| 25 |
+
cang c ang
|
| 26 |
+
cao c ao
|
| 27 |
+
ce c e
|
| 28 |
+
cei c ei
|
| 29 |
+
cen c en
|
| 30 |
+
ceng c eng
|
| 31 |
+
cha ch a
|
| 32 |
+
chai ch ai
|
| 33 |
+
chan ch an
|
| 34 |
+
chang ch ang
|
| 35 |
+
chao ch ao
|
| 36 |
+
che ch e
|
| 37 |
+
chen ch en
|
| 38 |
+
cheng ch eng
|
| 39 |
+
chi ch ir
|
| 40 |
+
chong ch ong
|
| 41 |
+
chou ch ou
|
| 42 |
+
chu ch u
|
| 43 |
+
chua ch ua
|
| 44 |
+
chuai ch uai
|
| 45 |
+
chuan ch uan
|
| 46 |
+
chuang ch uang
|
| 47 |
+
chui ch ui
|
| 48 |
+
chun ch un
|
| 49 |
+
chuo ch uo
|
| 50 |
+
ci c i0
|
| 51 |
+
cong c ong
|
| 52 |
+
cou c ou
|
| 53 |
+
cu c u
|
| 54 |
+
cuan c uan
|
| 55 |
+
cui c ui
|
| 56 |
+
cun c un
|
| 57 |
+
cuo c uo
|
| 58 |
+
da d a
|
| 59 |
+
dai d ai
|
| 60 |
+
dan d an
|
| 61 |
+
dang d ang
|
| 62 |
+
dao d ao
|
| 63 |
+
de d e
|
| 64 |
+
dei d ei
|
| 65 |
+
den d en
|
| 66 |
+
deng d eng
|
| 67 |
+
di d i
|
| 68 |
+
dia d ia
|
| 69 |
+
dian d ian
|
| 70 |
+
diao d iao
|
| 71 |
+
die d ie
|
| 72 |
+
ding d ing
|
| 73 |
+
diu d iu
|
| 74 |
+
dong d ong
|
| 75 |
+
dou d ou
|
| 76 |
+
du d u
|
| 77 |
+
duan d uan
|
| 78 |
+
dui d ui
|
| 79 |
+
dun d un
|
| 80 |
+
duo d uo
|
| 81 |
+
e EE e
|
| 82 |
+
ei EE ei
|
| 83 |
+
en EE en
|
| 84 |
+
eng EE eng
|
| 85 |
+
er EE er
|
| 86 |
+
fa f a
|
| 87 |
+
fan f an
|
| 88 |
+
fang f ang
|
| 89 |
+
fei f ei
|
| 90 |
+
fen f en
|
| 91 |
+
feng f eng
|
| 92 |
+
fo f o
|
| 93 |
+
fou f ou
|
| 94 |
+
fu f u
|
| 95 |
+
ga g a
|
| 96 |
+
gai g ai
|
| 97 |
+
gan g an
|
| 98 |
+
gang g ang
|
| 99 |
+
gao g ao
|
| 100 |
+
ge g e
|
| 101 |
+
gei g ei
|
| 102 |
+
gen g en
|
| 103 |
+
geng g eng
|
| 104 |
+
gong g ong
|
| 105 |
+
gou g ou
|
| 106 |
+
gu g u
|
| 107 |
+
gua g ua
|
| 108 |
+
guai g uai
|
| 109 |
+
guan g uan
|
| 110 |
+
guang g uang
|
| 111 |
+
gui g ui
|
| 112 |
+
gun g un
|
| 113 |
+
guo g uo
|
| 114 |
+
ha h a
|
| 115 |
+
hai h ai
|
| 116 |
+
han h an
|
| 117 |
+
hang h ang
|
| 118 |
+
hao h ao
|
| 119 |
+
he h e
|
| 120 |
+
hei h ei
|
| 121 |
+
hen h en
|
| 122 |
+
heng h eng
|
| 123 |
+
hong h ong
|
| 124 |
+
hou h ou
|
| 125 |
+
hu h u
|
| 126 |
+
hua h ua
|
| 127 |
+
huai h uai
|
| 128 |
+
huan h uan
|
| 129 |
+
huang h uang
|
| 130 |
+
hui h ui
|
| 131 |
+
hun h un
|
| 132 |
+
huo h uo
|
| 133 |
+
ji j i
|
| 134 |
+
jia j ia
|
| 135 |
+
jian j ian
|
| 136 |
+
jiang j iang
|
| 137 |
+
jiao j iao
|
| 138 |
+
jie j ie
|
| 139 |
+
jin j in
|
| 140 |
+
jing j ing
|
| 141 |
+
jiong j iong
|
| 142 |
+
jiu j iu
|
| 143 |
+
ju j v
|
| 144 |
+
jv j v
|
| 145 |
+
juan j van
|
| 146 |
+
jvan j van
|
| 147 |
+
jue j ve
|
| 148 |
+
jve j ve
|
| 149 |
+
jun j vn
|
| 150 |
+
jvn j vn
|
| 151 |
+
ka k a
|
| 152 |
+
kai k ai
|
| 153 |
+
kan k an
|
| 154 |
+
kang k ang
|
| 155 |
+
kao k ao
|
| 156 |
+
ke k e
|
| 157 |
+
kei k ei
|
| 158 |
+
ken k en
|
| 159 |
+
keng k eng
|
| 160 |
+
kong k ong
|
| 161 |
+
kou k ou
|
| 162 |
+
ku k u
|
| 163 |
+
kua k ua
|
| 164 |
+
kuai k uai
|
| 165 |
+
kuan k uan
|
| 166 |
+
kuang k uang
|
| 167 |
+
kui k ui
|
| 168 |
+
kun k un
|
| 169 |
+
kuo k uo
|
| 170 |
+
la l a
|
| 171 |
+
lai l ai
|
| 172 |
+
lan l an
|
| 173 |
+
lang l ang
|
| 174 |
+
lao l ao
|
| 175 |
+
le l e
|
| 176 |
+
lei l ei
|
| 177 |
+
leng l eng
|
| 178 |
+
li l i
|
| 179 |
+
lia l ia
|
| 180 |
+
lian l ian
|
| 181 |
+
liang l iang
|
| 182 |
+
liao l iao
|
| 183 |
+
lie l ie
|
| 184 |
+
lin l in
|
| 185 |
+
ling l ing
|
| 186 |
+
liu l iu
|
| 187 |
+
lo l o
|
| 188 |
+
long l ong
|
| 189 |
+
lou l ou
|
| 190 |
+
lu l u
|
| 191 |
+
luan l uan
|
| 192 |
+
lun l un
|
| 193 |
+
luo l uo
|
| 194 |
+
lv l v
|
| 195 |
+
lve l ve
|
| 196 |
+
ma m a
|
| 197 |
+
mai m ai
|
| 198 |
+
man m an
|
| 199 |
+
mang m ang
|
| 200 |
+
mao m ao
|
| 201 |
+
me m e
|
| 202 |
+
mei m ei
|
| 203 |
+
men m en
|
| 204 |
+
meng m eng
|
| 205 |
+
mi m i
|
| 206 |
+
mian m ian
|
| 207 |
+
miao m iao
|
| 208 |
+
mie m ie
|
| 209 |
+
min m in
|
| 210 |
+
ming m ing
|
| 211 |
+
miu m iu
|
| 212 |
+
mo m o
|
| 213 |
+
mou m ou
|
| 214 |
+
mu m u
|
| 215 |
+
na n a
|
| 216 |
+
nai n ai
|
| 217 |
+
nan n an
|
| 218 |
+
nang n ang
|
| 219 |
+
nao n ao
|
| 220 |
+
ne n e
|
| 221 |
+
nei n ei
|
| 222 |
+
nen n en
|
| 223 |
+
neng n eng
|
| 224 |
+
ni n i
|
| 225 |
+
nian n ian
|
| 226 |
+
niang n iang
|
| 227 |
+
niao n iao
|
| 228 |
+
nie n ie
|
| 229 |
+
nin n in
|
| 230 |
+
ning n ing
|
| 231 |
+
niu n iu
|
| 232 |
+
nong n ong
|
| 233 |
+
nou n ou
|
| 234 |
+
nu n u
|
| 235 |
+
nuan n uan
|
| 236 |
+
nun n un
|
| 237 |
+
nuo n uo
|
| 238 |
+
nv n v
|
| 239 |
+
nve n ve
|
| 240 |
+
o OO o
|
| 241 |
+
ou OO ou
|
| 242 |
+
pa p a
|
| 243 |
+
pai p ai
|
| 244 |
+
pan p an
|
| 245 |
+
pang p ang
|
| 246 |
+
pao p ao
|
| 247 |
+
pei p ei
|
| 248 |
+
pen p en
|
| 249 |
+
peng p eng
|
| 250 |
+
pi p i
|
| 251 |
+
pian p ian
|
| 252 |
+
piao p iao
|
| 253 |
+
pie p ie
|
| 254 |
+
pin p in
|
| 255 |
+
ping p ing
|
| 256 |
+
po p o
|
| 257 |
+
pou p ou
|
| 258 |
+
pu p u
|
| 259 |
+
qi q i
|
| 260 |
+
qia q ia
|
| 261 |
+
qian q ian
|
| 262 |
+
qiang q iang
|
| 263 |
+
qiao q iao
|
| 264 |
+
qie q ie
|
| 265 |
+
qin q in
|
| 266 |
+
qing q ing
|
| 267 |
+
qiong q iong
|
| 268 |
+
qiu q iu
|
| 269 |
+
qu q v
|
| 270 |
+
qv q v
|
| 271 |
+
quan q van
|
| 272 |
+
qvan q van
|
| 273 |
+
que q ve
|
| 274 |
+
qve q ve
|
| 275 |
+
qun q vn
|
| 276 |
+
qvn q vn
|
| 277 |
+
ran r an
|
| 278 |
+
rang r ang
|
| 279 |
+
rao r ao
|
| 280 |
+
re r e
|
| 281 |
+
ren r en
|
| 282 |
+
reng r eng
|
| 283 |
+
ri r ir
|
| 284 |
+
rong r ong
|
| 285 |
+
rou r ou
|
| 286 |
+
ru r u
|
| 287 |
+
rua r ua
|
| 288 |
+
ruan r uan
|
| 289 |
+
rui r ui
|
| 290 |
+
run r un
|
| 291 |
+
ruo r uo
|
| 292 |
+
sa s a
|
| 293 |
+
sai s ai
|
| 294 |
+
san s an
|
| 295 |
+
sang s ang
|
| 296 |
+
sao s ao
|
| 297 |
+
se s e
|
| 298 |
+
sen s en
|
| 299 |
+
seng s eng
|
| 300 |
+
sha sh a
|
| 301 |
+
shai sh ai
|
| 302 |
+
shan sh an
|
| 303 |
+
shang sh ang
|
| 304 |
+
shao sh ao
|
| 305 |
+
she sh e
|
| 306 |
+
shei sh ei
|
| 307 |
+
shen sh en
|
| 308 |
+
sheng sh eng
|
| 309 |
+
shi sh ir
|
| 310 |
+
shou sh ou
|
| 311 |
+
shu sh u
|
| 312 |
+
shua sh ua
|
| 313 |
+
shuai sh uai
|
| 314 |
+
shuan sh uan
|
| 315 |
+
shuang sh uang
|
| 316 |
+
shui sh ui
|
| 317 |
+
shun sh un
|
| 318 |
+
shuo sh uo
|
| 319 |
+
si s i0
|
| 320 |
+
song s ong
|
| 321 |
+
sou s ou
|
| 322 |
+
su s u
|
| 323 |
+
suan s uan
|
| 324 |
+
sui s ui
|
| 325 |
+
sun s un
|
| 326 |
+
suo s uo
|
| 327 |
+
ta t a
|
| 328 |
+
tai t ai
|
| 329 |
+
tan t an
|
| 330 |
+
tang t ang
|
| 331 |
+
tao t ao
|
| 332 |
+
te t e
|
| 333 |
+
tei t ei
|
| 334 |
+
teng t eng
|
| 335 |
+
ti t i
|
| 336 |
+
tian t ian
|
| 337 |
+
tiao t iao
|
| 338 |
+
tie t ie
|
| 339 |
+
ting t ing
|
| 340 |
+
tong t ong
|
| 341 |
+
tou t ou
|
| 342 |
+
tu t u
|
| 343 |
+
tuan t uan
|
| 344 |
+
tui t ui
|
| 345 |
+
tun t un
|
| 346 |
+
tuo t uo
|
| 347 |
+
wa w a
|
| 348 |
+
wai w ai
|
| 349 |
+
wan w an
|
| 350 |
+
wang w ang
|
| 351 |
+
wei w ei
|
| 352 |
+
wen w en
|
| 353 |
+
weng w eng
|
| 354 |
+
wo w o
|
| 355 |
+
wu w u
|
| 356 |
+
xi x i
|
| 357 |
+
xia x ia
|
| 358 |
+
xian x ian
|
| 359 |
+
xiang x iang
|
| 360 |
+
xiao x iao
|
| 361 |
+
xie x ie
|
| 362 |
+
xin x in
|
| 363 |
+
xing x ing
|
| 364 |
+
xiong x iong
|
| 365 |
+
xiu x iu
|
| 366 |
+
xu x v
|
| 367 |
+
xv x v
|
| 368 |
+
xuan x van
|
| 369 |
+
xvan x van
|
| 370 |
+
xue x ve
|
| 371 |
+
xve x ve
|
| 372 |
+
xun x vn
|
| 373 |
+
xvn x vn
|
| 374 |
+
ya y a
|
| 375 |
+
yan y En
|
| 376 |
+
yang y ang
|
| 377 |
+
yao y ao
|
| 378 |
+
ye y E
|
| 379 |
+
yi y i
|
| 380 |
+
yin y in
|
| 381 |
+
ying y ing
|
| 382 |
+
yo y o
|
| 383 |
+
yong y ong
|
| 384 |
+
you y ou
|
| 385 |
+
yu y v
|
| 386 |
+
yv y v
|
| 387 |
+
yuan y van
|
| 388 |
+
yvan y van
|
| 389 |
+
yue y ve
|
| 390 |
+
yve y ve
|
| 391 |
+
yun y vn
|
| 392 |
+
yvn y vn
|
| 393 |
+
za z a
|
| 394 |
+
zai z ai
|
| 395 |
+
zan z an
|
| 396 |
+
zang z ang
|
| 397 |
+
zao z ao
|
| 398 |
+
ze z e
|
| 399 |
+
zei z ei
|
| 400 |
+
zen z en
|
| 401 |
+
zeng z eng
|
| 402 |
+
zha zh a
|
| 403 |
+
zhai zh ai
|
| 404 |
+
zhan zh an
|
| 405 |
+
zhang zh ang
|
| 406 |
+
zhao zh ao
|
| 407 |
+
zhe zh e
|
| 408 |
+
zhei zh ei
|
| 409 |
+
zhen zh en
|
| 410 |
+
zheng zh eng
|
| 411 |
+
zhi zh ir
|
| 412 |
+
zhong zh ong
|
| 413 |
+
zhou zh ou
|
| 414 |
+
zhu zh u
|
| 415 |
+
zhua zh ua
|
| 416 |
+
zhuai zh uai
|
| 417 |
+
zhuan zh uan
|
| 418 |
+
zhuang zh uang
|
| 419 |
+
zhui zh ui
|
| 420 |
+
zhun zh un
|
| 421 |
+
zhuo zh uo
|
| 422 |
+
zi z i0
|
| 423 |
+
zong z ong
|
| 424 |
+
zou z ou
|
| 425 |
+
zu z u
|
| 426 |
+
zuan z uan
|
| 427 |
+
zui z ui
|
| 428 |
+
zun z un
|
| 429 |
+
zuo z uo
|
SongBloom/g2p/cn_zh_g2p/symbols.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿
|
| 4 |
+
punctuation = ["!", "?", "…", ",", "."] # @是SP停顿
|
| 5 |
+
punctuation.append("-")
|
| 6 |
+
pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
|
| 7 |
+
# pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"]
|
| 8 |
+
pad = "_"
|
| 9 |
+
|
| 10 |
+
c = [
|
| 11 |
+
"AA",
|
| 12 |
+
"EE",
|
| 13 |
+
"OO",
|
| 14 |
+
"b",
|
| 15 |
+
"c",
|
| 16 |
+
"ch",
|
| 17 |
+
"d",
|
| 18 |
+
"f",
|
| 19 |
+
"g",
|
| 20 |
+
"h",
|
| 21 |
+
"j",
|
| 22 |
+
"k",
|
| 23 |
+
"l",
|
| 24 |
+
"m",
|
| 25 |
+
"n",
|
| 26 |
+
"p",
|
| 27 |
+
"q",
|
| 28 |
+
"r",
|
| 29 |
+
"s",
|
| 30 |
+
"sh",
|
| 31 |
+
"t",
|
| 32 |
+
"w",
|
| 33 |
+
"x",
|
| 34 |
+
"y",
|
| 35 |
+
"z",
|
| 36 |
+
"zh",
|
| 37 |
+
]
|
| 38 |
+
v = [
|
| 39 |
+
"E1",
|
| 40 |
+
"En1",
|
| 41 |
+
"a1",
|
| 42 |
+
"ai1",
|
| 43 |
+
"an1",
|
| 44 |
+
"ang1",
|
| 45 |
+
"ao1",
|
| 46 |
+
"e1",
|
| 47 |
+
"ei1",
|
| 48 |
+
"en1",
|
| 49 |
+
"eng1",
|
| 50 |
+
"er1",
|
| 51 |
+
"i1",
|
| 52 |
+
"i01",
|
| 53 |
+
"ia1",
|
| 54 |
+
"ian1",
|
| 55 |
+
"iang1",
|
| 56 |
+
"iao1",
|
| 57 |
+
"ie1",
|
| 58 |
+
"in1",
|
| 59 |
+
"ing1",
|
| 60 |
+
"iong1",
|
| 61 |
+
"ir1",
|
| 62 |
+
"iu1",
|
| 63 |
+
"o1",
|
| 64 |
+
"ong1",
|
| 65 |
+
"ou1",
|
| 66 |
+
"u1",
|
| 67 |
+
"ua1",
|
| 68 |
+
"uai1",
|
| 69 |
+
"uan1",
|
| 70 |
+
"uang1",
|
| 71 |
+
"ui1",
|
| 72 |
+
"un1",
|
| 73 |
+
"uo1",
|
| 74 |
+
"v1",
|
| 75 |
+
"van1",
|
| 76 |
+
"ve1",
|
| 77 |
+
"vn1",
|
| 78 |
+
"E2",
|
| 79 |
+
"En2",
|
| 80 |
+
"a2",
|
| 81 |
+
"ai2",
|
| 82 |
+
"an2",
|
| 83 |
+
"ang2",
|
| 84 |
+
"ao2",
|
| 85 |
+
"e2",
|
| 86 |
+
"ei2",
|
| 87 |
+
"en2",
|
| 88 |
+
"eng2",
|
| 89 |
+
"er2",
|
| 90 |
+
"i2",
|
| 91 |
+
"i02",
|
| 92 |
+
"ia2",
|
| 93 |
+
"ian2",
|
| 94 |
+
"iang2",
|
| 95 |
+
"iao2",
|
| 96 |
+
"ie2",
|
| 97 |
+
"in2",
|
| 98 |
+
"ing2",
|
| 99 |
+
"iong2",
|
| 100 |
+
"ir2",
|
| 101 |
+
"iu2",
|
| 102 |
+
"o2",
|
| 103 |
+
"ong2",
|
| 104 |
+
"ou2",
|
| 105 |
+
"u2",
|
| 106 |
+
"ua2",
|
| 107 |
+
"uai2",
|
| 108 |
+
"uan2",
|
| 109 |
+
"uang2",
|
| 110 |
+
"ui2",
|
| 111 |
+
"un2",
|
| 112 |
+
"uo2",
|
| 113 |
+
"v2",
|
| 114 |
+
"van2",
|
| 115 |
+
"ve2",
|
| 116 |
+
"vn2",
|
| 117 |
+
"E3",
|
| 118 |
+
"En3",
|
| 119 |
+
"a3",
|
| 120 |
+
"ai3",
|
| 121 |
+
"an3",
|
| 122 |
+
"ang3",
|
| 123 |
+
"ao3",
|
| 124 |
+
"e3",
|
| 125 |
+
"ei3",
|
| 126 |
+
"en3",
|
| 127 |
+
"eng3",
|
| 128 |
+
"er3",
|
| 129 |
+
"i3",
|
| 130 |
+
"i03",
|
| 131 |
+
"ia3",
|
| 132 |
+
"ian3",
|
| 133 |
+
"iang3",
|
| 134 |
+
"iao3",
|
| 135 |
+
"ie3",
|
| 136 |
+
"in3",
|
| 137 |
+
"ing3",
|
| 138 |
+
"iong3",
|
| 139 |
+
"ir3",
|
| 140 |
+
"iu3",
|
| 141 |
+
"o3",
|
| 142 |
+
"ong3",
|
| 143 |
+
"ou3",
|
| 144 |
+
"u3",
|
| 145 |
+
"ua3",
|
| 146 |
+
"uai3",
|
| 147 |
+
"uan3",
|
| 148 |
+
"uang3",
|
| 149 |
+
"ui3",
|
| 150 |
+
"un3",
|
| 151 |
+
"uo3",
|
| 152 |
+
"v3",
|
| 153 |
+
"van3",
|
| 154 |
+
"ve3",
|
| 155 |
+
"vn3",
|
| 156 |
+
"E4",
|
| 157 |
+
"En4",
|
| 158 |
+
"a4",
|
| 159 |
+
"ai4",
|
| 160 |
+
"an4",
|
| 161 |
+
"ang4",
|
| 162 |
+
"ao4",
|
| 163 |
+
"e4",
|
| 164 |
+
"ei4",
|
| 165 |
+
"en4",
|
| 166 |
+
"eng4",
|
| 167 |
+
"er4",
|
| 168 |
+
"i4",
|
| 169 |
+
"i04",
|
| 170 |
+
"ia4",
|
| 171 |
+
"ian4",
|
| 172 |
+
"iang4",
|
| 173 |
+
"iao4",
|
| 174 |
+
"ie4",
|
| 175 |
+
"in4",
|
| 176 |
+
"ing4",
|
| 177 |
+
"iong4",
|
| 178 |
+
"ir4",
|
| 179 |
+
"iu4",
|
| 180 |
+
"o4",
|
| 181 |
+
"ong4",
|
| 182 |
+
"ou4",
|
| 183 |
+
"u4",
|
| 184 |
+
"ua4",
|
| 185 |
+
"uai4",
|
| 186 |
+
"uan4",
|
| 187 |
+
"uang4",
|
| 188 |
+
"ui4",
|
| 189 |
+
"un4",
|
| 190 |
+
"uo4",
|
| 191 |
+
"v4",
|
| 192 |
+
"van4",
|
| 193 |
+
"ve4",
|
| 194 |
+
"vn4",
|
| 195 |
+
"E5",
|
| 196 |
+
"En5",
|
| 197 |
+
"a5",
|
| 198 |
+
"ai5",
|
| 199 |
+
"an5",
|
| 200 |
+
"ang5",
|
| 201 |
+
"ao5",
|
| 202 |
+
"e5",
|
| 203 |
+
"ei5",
|
| 204 |
+
"en5",
|
| 205 |
+
"eng5",
|
| 206 |
+
"er5",
|
| 207 |
+
"i5",
|
| 208 |
+
"i05",
|
| 209 |
+
"ia5",
|
| 210 |
+
"ian5",
|
| 211 |
+
"iang5",
|
| 212 |
+
"iao5",
|
| 213 |
+
"ie5",
|
| 214 |
+
"in5",
|
| 215 |
+
"ing5",
|
| 216 |
+
"iong5",
|
| 217 |
+
"ir5",
|
| 218 |
+
"iu5",
|
| 219 |
+
"o5",
|
| 220 |
+
"ong5",
|
| 221 |
+
"ou5",
|
| 222 |
+
"u5",
|
| 223 |
+
"ua5",
|
| 224 |
+
"uai5",
|
| 225 |
+
"uan5",
|
| 226 |
+
"uang5",
|
| 227 |
+
"ui5",
|
| 228 |
+
"un5",
|
| 229 |
+
"uo5",
|
| 230 |
+
"v5",
|
| 231 |
+
"van5",
|
| 232 |
+
"ve5",
|
| 233 |
+
"vn5",
|
| 234 |
+
]
|
| 235 |
+
|
| 236 |
+
v_without_tone = [
|
| 237 |
+
"E",
|
| 238 |
+
"En",
|
| 239 |
+
"a",
|
| 240 |
+
"ai",
|
| 241 |
+
"an",
|
| 242 |
+
"ang",
|
| 243 |
+
"ao",
|
| 244 |
+
"e",
|
| 245 |
+
"ei",
|
| 246 |
+
"en",
|
| 247 |
+
"eng",
|
| 248 |
+
"er",
|
| 249 |
+
"i",
|
| 250 |
+
"i0",
|
| 251 |
+
"ia",
|
| 252 |
+
"ian",
|
| 253 |
+
"iang",
|
| 254 |
+
"iao",
|
| 255 |
+
"ie",
|
| 256 |
+
"in",
|
| 257 |
+
"ing",
|
| 258 |
+
"iong",
|
| 259 |
+
"ir",
|
| 260 |
+
"iu",
|
| 261 |
+
"o",
|
| 262 |
+
"ong",
|
| 263 |
+
"ou",
|
| 264 |
+
"u",
|
| 265 |
+
"ua",
|
| 266 |
+
"uai",
|
| 267 |
+
"uan",
|
| 268 |
+
"uang",
|
| 269 |
+
"ui",
|
| 270 |
+
"un",
|
| 271 |
+
"uo",
|
| 272 |
+
"v",
|
| 273 |
+
"van",
|
| 274 |
+
"ve",
|
| 275 |
+
"vn",
|
| 276 |
+
]
|
| 277 |
+
|
| 278 |
+
# japanese
|
| 279 |
+
ja_symbols = [
|
| 280 |
+
"I",
|
| 281 |
+
"N",
|
| 282 |
+
"U",
|
| 283 |
+
"a",
|
| 284 |
+
"b",
|
| 285 |
+
"by",
|
| 286 |
+
"ch",
|
| 287 |
+
"cl",
|
| 288 |
+
"d",
|
| 289 |
+
"dy",
|
| 290 |
+
"e",
|
| 291 |
+
"f",
|
| 292 |
+
"g",
|
| 293 |
+
"gy",
|
| 294 |
+
"h",
|
| 295 |
+
"hy",
|
| 296 |
+
"i",
|
| 297 |
+
"j",
|
| 298 |
+
"k",
|
| 299 |
+
"ky",
|
| 300 |
+
"m",
|
| 301 |
+
"my",
|
| 302 |
+
"n",
|
| 303 |
+
"ny",
|
| 304 |
+
"o",
|
| 305 |
+
"p",
|
| 306 |
+
"py",
|
| 307 |
+
"r",
|
| 308 |
+
"ry",
|
| 309 |
+
"s",
|
| 310 |
+
"sh",
|
| 311 |
+
"t",
|
| 312 |
+
"ts",
|
| 313 |
+
"u",
|
| 314 |
+
"v",
|
| 315 |
+
"w",
|
| 316 |
+
"y",
|
| 317 |
+
"z",
|
| 318 |
+
# "[", #上升调型
|
| 319 |
+
# "]", #下降调型
|
| 320 |
+
# "$", #结束符
|
| 321 |
+
# "^", #开始符
|
| 322 |
+
]
|
| 323 |
+
|
| 324 |
+
arpa = {
|
| 325 |
+
"AH0",
|
| 326 |
+
"S",
|
| 327 |
+
"AH1",
|
| 328 |
+
"EY2",
|
| 329 |
+
"AE2",
|
| 330 |
+
"EH0",
|
| 331 |
+
"OW2",
|
| 332 |
+
"UH0",
|
| 333 |
+
"NG",
|
| 334 |
+
"B",
|
| 335 |
+
"G",
|
| 336 |
+
"AY0",
|
| 337 |
+
"M",
|
| 338 |
+
"AA0",
|
| 339 |
+
"F",
|
| 340 |
+
"AO0",
|
| 341 |
+
"ER2",
|
| 342 |
+
"UH1",
|
| 343 |
+
"IY1",
|
| 344 |
+
"AH2",
|
| 345 |
+
"DH",
|
| 346 |
+
"IY0",
|
| 347 |
+
"EY1",
|
| 348 |
+
"IH0",
|
| 349 |
+
"K",
|
| 350 |
+
"N",
|
| 351 |
+
"W",
|
| 352 |
+
"IY2",
|
| 353 |
+
"T",
|
| 354 |
+
"AA1",
|
| 355 |
+
"ER1",
|
| 356 |
+
"EH2",
|
| 357 |
+
"OY0",
|
| 358 |
+
"UH2",
|
| 359 |
+
"UW1",
|
| 360 |
+
"Z",
|
| 361 |
+
"AW2",
|
| 362 |
+
"AW1",
|
| 363 |
+
"V",
|
| 364 |
+
"UW2",
|
| 365 |
+
"AA2",
|
| 366 |
+
"ER",
|
| 367 |
+
"AW0",
|
| 368 |
+
"UW0",
|
| 369 |
+
"R",
|
| 370 |
+
"OW1",
|
| 371 |
+
"EH1",
|
| 372 |
+
"ZH",
|
| 373 |
+
"AE0",
|
| 374 |
+
"IH2",
|
| 375 |
+
"IH",
|
| 376 |
+
"Y",
|
| 377 |
+
"JH",
|
| 378 |
+
"P",
|
| 379 |
+
"AY1",
|
| 380 |
+
"EY0",
|
| 381 |
+
"OY2",
|
| 382 |
+
"TH",
|
| 383 |
+
"HH",
|
| 384 |
+
"D",
|
| 385 |
+
"ER0",
|
| 386 |
+
"CH",
|
| 387 |
+
"AO1",
|
| 388 |
+
"AE1",
|
| 389 |
+
"AO2",
|
| 390 |
+
"OY1",
|
| 391 |
+
"AY2",
|
| 392 |
+
"IH1",
|
| 393 |
+
"OW0",
|
| 394 |
+
"L",
|
| 395 |
+
"SH",
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
| 399 |
+
symbols = sorted(set(symbols))
|
| 400 |
+
if __name__ == "__main__":
|
| 401 |
+
print(len(symbols))
|
SongBloom/g2p/cn_zh_g2p/tone_sandhi.py
ADDED
|
@@ -0,0 +1,806 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
from typing import List
|
| 15 |
+
from typing import Tuple
|
| 16 |
+
|
| 17 |
+
import jieba_fast as jieba
|
| 18 |
+
from pypinyin import lazy_pinyin
|
| 19 |
+
from pypinyin import Style
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ToneSandhi:
|
| 23 |
+
def __init__(self):
|
| 24 |
+
self.must_neural_tone_words = {
|
| 25 |
+
"麻烦",
|
| 26 |
+
"麻利",
|
| 27 |
+
"鸳鸯",
|
| 28 |
+
"高粱",
|
| 29 |
+
"骨头",
|
| 30 |
+
"骆驼",
|
| 31 |
+
"马虎",
|
| 32 |
+
"首饰",
|
| 33 |
+
"馒头",
|
| 34 |
+
"馄饨",
|
| 35 |
+
"风筝",
|
| 36 |
+
"难为",
|
| 37 |
+
"队伍",
|
| 38 |
+
"阔气",
|
| 39 |
+
"闺女",
|
| 40 |
+
"门道",
|
| 41 |
+
"锄头",
|
| 42 |
+
"铺盖",
|
| 43 |
+
"铃铛",
|
| 44 |
+
"铁匠",
|
| 45 |
+
"钥匙",
|
| 46 |
+
"里脊",
|
| 47 |
+
"里头",
|
| 48 |
+
"部分",
|
| 49 |
+
"那么",
|
| 50 |
+
"道士",
|
| 51 |
+
"造化",
|
| 52 |
+
"迷糊",
|
| 53 |
+
"连累",
|
| 54 |
+
"这么",
|
| 55 |
+
"这个",
|
| 56 |
+
"运气",
|
| 57 |
+
"过去",
|
| 58 |
+
"软和",
|
| 59 |
+
"转悠",
|
| 60 |
+
"踏实",
|
| 61 |
+
"跳蚤",
|
| 62 |
+
"跟头",
|
| 63 |
+
"趔趄",
|
| 64 |
+
"财主",
|
| 65 |
+
"豆腐",
|
| 66 |
+
"讲究",
|
| 67 |
+
"记性",
|
| 68 |
+
"记号",
|
| 69 |
+
"认识",
|
| 70 |
+
"规矩",
|
| 71 |
+
"见识",
|
| 72 |
+
"裁缝",
|
| 73 |
+
"补丁",
|
| 74 |
+
"衣裳",
|
| 75 |
+
"衣服",
|
| 76 |
+
"衙门",
|
| 77 |
+
"街坊",
|
| 78 |
+
"行李",
|
| 79 |
+
"行当",
|
| 80 |
+
"蛤蟆",
|
| 81 |
+
"蘑菇",
|
| 82 |
+
"薄荷",
|
| 83 |
+
"葫芦",
|
| 84 |
+
"葡萄",
|
| 85 |
+
"萝卜",
|
| 86 |
+
"荸荠",
|
| 87 |
+
"苗条",
|
| 88 |
+
"苗头",
|
| 89 |
+
"苍蝇",
|
| 90 |
+
"芝麻",
|
| 91 |
+
"舒服",
|
| 92 |
+
"舒坦",
|
| 93 |
+
"舌头",
|
| 94 |
+
"自在",
|
| 95 |
+
"膏药",
|
| 96 |
+
"脾气",
|
| 97 |
+
"脑袋",
|
| 98 |
+
"脊梁",
|
| 99 |
+
"能耐",
|
| 100 |
+
"胳膊",
|
| 101 |
+
"胭脂",
|
| 102 |
+
"胡萝",
|
| 103 |
+
"胡琴",
|
| 104 |
+
"胡同",
|
| 105 |
+
"聪明",
|
| 106 |
+
"耽误",
|
| 107 |
+
"耽搁",
|
| 108 |
+
"耷拉",
|
| 109 |
+
"耳朵",
|
| 110 |
+
"老爷",
|
| 111 |
+
"老实",
|
| 112 |
+
"老婆",
|
| 113 |
+
"老头",
|
| 114 |
+
"老太",
|
| 115 |
+
"翻腾",
|
| 116 |
+
"罗嗦",
|
| 117 |
+
"罐头",
|
| 118 |
+
"编辑",
|
| 119 |
+
"结实",
|
| 120 |
+
"红火",
|
| 121 |
+
"累赘",
|
| 122 |
+
"糨糊",
|
| 123 |
+
"糊涂",
|
| 124 |
+
"精神",
|
| 125 |
+
"粮食",
|
| 126 |
+
"簸箕",
|
| 127 |
+
"篱笆",
|
| 128 |
+
"算计",
|
| 129 |
+
"算盘",
|
| 130 |
+
"答应",
|
| 131 |
+
"笤帚",
|
| 132 |
+
"笑语",
|
| 133 |
+
"笑话",
|
| 134 |
+
"窟窿",
|
| 135 |
+
"窝囊",
|
| 136 |
+
"窗户",
|
| 137 |
+
"稳当",
|
| 138 |
+
"稀罕",
|
| 139 |
+
"称呼",
|
| 140 |
+
"秧歌",
|
| 141 |
+
"秀气",
|
| 142 |
+
"秀才",
|
| 143 |
+
"福气",
|
| 144 |
+
"祖宗",
|
| 145 |
+
"砚台",
|
| 146 |
+
"码头",
|
| 147 |
+
"石榴",
|
| 148 |
+
"石头",
|
| 149 |
+
"石匠",
|
| 150 |
+
"知识",
|
| 151 |
+
"眼睛",
|
| 152 |
+
"眯缝",
|
| 153 |
+
"眨巴",
|
| 154 |
+
"眉毛",
|
| 155 |
+
"相声",
|
| 156 |
+
"盘算",
|
| 157 |
+
"白净",
|
| 158 |
+
"痢疾",
|
| 159 |
+
"痛快",
|
| 160 |
+
"疟疾",
|
| 161 |
+
"疙瘩",
|
| 162 |
+
"疏忽",
|
| 163 |
+
"畜生",
|
| 164 |
+
"生意",
|
| 165 |
+
"甘蔗",
|
| 166 |
+
"琵琶",
|
| 167 |
+
"琢磨",
|
| 168 |
+
"琉璃",
|
| 169 |
+
"玻璃",
|
| 170 |
+
"玫瑰",
|
| 171 |
+
"玄乎",
|
| 172 |
+
"狐狸",
|
| 173 |
+
"状元",
|
| 174 |
+
"特务",
|
| 175 |
+
"牲口",
|
| 176 |
+
"牙碜",
|
| 177 |
+
"牌楼",
|
| 178 |
+
"爽快",
|
| 179 |
+
"爱人",
|
| 180 |
+
"热闹",
|
| 181 |
+
"烧饼",
|
| 182 |
+
"烟筒",
|
| 183 |
+
"烂糊",
|
| 184 |
+
"点心",
|
| 185 |
+
"炊帚",
|
| 186 |
+
"灯笼",
|
| 187 |
+
"火候",
|
| 188 |
+
"漂亮",
|
| 189 |
+
"滑溜",
|
| 190 |
+
"溜达",
|
| 191 |
+
"温和",
|
| 192 |
+
"清楚",
|
| 193 |
+
"消息",
|
| 194 |
+
"浪头",
|
| 195 |
+
"活泼",
|
| 196 |
+
"比方",
|
| 197 |
+
"正经",
|
| 198 |
+
"欺负",
|
| 199 |
+
"模糊",
|
| 200 |
+
"槟榔",
|
| 201 |
+
"棺材",
|
| 202 |
+
"棒槌",
|
| 203 |
+
"棉花",
|
| 204 |
+
"核桃",
|
| 205 |
+
"栅栏",
|
| 206 |
+
"柴火",
|
| 207 |
+
"架势",
|
| 208 |
+
"枕头",
|
| 209 |
+
"���杷",
|
| 210 |
+
"机灵",
|
| 211 |
+
"本事",
|
| 212 |
+
"木头",
|
| 213 |
+
"木匠",
|
| 214 |
+
"朋友",
|
| 215 |
+
"月饼",
|
| 216 |
+
"月亮",
|
| 217 |
+
"暖和",
|
| 218 |
+
"明白",
|
| 219 |
+
"时候",
|
| 220 |
+
"新鲜",
|
| 221 |
+
"故事",
|
| 222 |
+
"收拾",
|
| 223 |
+
"收成",
|
| 224 |
+
"提防",
|
| 225 |
+
"挖苦",
|
| 226 |
+
"挑剔",
|
| 227 |
+
"指甲",
|
| 228 |
+
"指头",
|
| 229 |
+
"拾掇",
|
| 230 |
+
"拳头",
|
| 231 |
+
"拨弄",
|
| 232 |
+
"招牌",
|
| 233 |
+
"招呼",
|
| 234 |
+
"抬举",
|
| 235 |
+
"护士",
|
| 236 |
+
"折腾",
|
| 237 |
+
"扫帚",
|
| 238 |
+
"打量",
|
| 239 |
+
"打算",
|
| 240 |
+
"打点",
|
| 241 |
+
"打扮",
|
| 242 |
+
"打听",
|
| 243 |
+
"打发",
|
| 244 |
+
"扎实",
|
| 245 |
+
"扁担",
|
| 246 |
+
"戒指",
|
| 247 |
+
"懒得",
|
| 248 |
+
"意识",
|
| 249 |
+
"意思",
|
| 250 |
+
"情形",
|
| 251 |
+
"悟性",
|
| 252 |
+
"怪物",
|
| 253 |
+
"思量",
|
| 254 |
+
"怎么",
|
| 255 |
+
"念头",
|
| 256 |
+
"念叨",
|
| 257 |
+
"快活",
|
| 258 |
+
"忙活",
|
| 259 |
+
"志气",
|
| 260 |
+
"心思",
|
| 261 |
+
"得罪",
|
| 262 |
+
"张罗",
|
| 263 |
+
"弟兄",
|
| 264 |
+
"开通",
|
| 265 |
+
"应酬",
|
| 266 |
+
"庄稼",
|
| 267 |
+
"干事",
|
| 268 |
+
"帮手",
|
| 269 |
+
"帐篷",
|
| 270 |
+
"希罕",
|
| 271 |
+
"师父",
|
| 272 |
+
"师傅",
|
| 273 |
+
"巴结",
|
| 274 |
+
"巴掌",
|
| 275 |
+
"差事",
|
| 276 |
+
"工夫",
|
| 277 |
+
"岁数",
|
| 278 |
+
"屁股",
|
| 279 |
+
"尾巴",
|
| 280 |
+
"少爷",
|
| 281 |
+
"小气",
|
| 282 |
+
"小伙",
|
| 283 |
+
"将就",
|
| 284 |
+
"对头",
|
| 285 |
+
"对付",
|
| 286 |
+
"寡妇",
|
| 287 |
+
"家伙",
|
| 288 |
+
"客气",
|
| 289 |
+
"实在",
|
| 290 |
+
"官司",
|
| 291 |
+
"学问",
|
| 292 |
+
"学生",
|
| 293 |
+
"字号",
|
| 294 |
+
"嫁妆",
|
| 295 |
+
"媳妇",
|
| 296 |
+
"媒人",
|
| 297 |
+
"婆家",
|
| 298 |
+
"娘家",
|
| 299 |
+
"委屈",
|
| 300 |
+
"姑娘",
|
| 301 |
+
"姐夫",
|
| 302 |
+
"妯娌",
|
| 303 |
+
"妥当",
|
| 304 |
+
"妖精",
|
| 305 |
+
"奴才",
|
| 306 |
+
"女婿",
|
| 307 |
+
"头发",
|
| 308 |
+
"太阳",
|
| 309 |
+
"大爷",
|
| 310 |
+
"大方",
|
| 311 |
+
"大意",
|
| 312 |
+
"大夫",
|
| 313 |
+
"多少",
|
| 314 |
+
"多么",
|
| 315 |
+
"外甥",
|
| 316 |
+
"壮实",
|
| 317 |
+
"地道",
|
| 318 |
+
"地方",
|
| 319 |
+
"在乎",
|
| 320 |
+
"困难",
|
| 321 |
+
"嘴巴",
|
| 322 |
+
"嘱咐",
|
| 323 |
+
"嘟囔",
|
| 324 |
+
"嘀咕",
|
| 325 |
+
"喜欢",
|
| 326 |
+
"喇嘛",
|
| 327 |
+
"喇叭",
|
| 328 |
+
"商量",
|
| 329 |
+
"唾沫",
|
| 330 |
+
"哑巴",
|
| 331 |
+
"哈欠",
|
| 332 |
+
"哆嗦",
|
| 333 |
+
"咳嗽",
|
| 334 |
+
"和尚",
|
| 335 |
+
"告诉",
|
| 336 |
+
"告示",
|
| 337 |
+
"含糊",
|
| 338 |
+
"吓唬",
|
| 339 |
+
"后头",
|
| 340 |
+
"名字",
|
| 341 |
+
"名堂",
|
| 342 |
+
"合同",
|
| 343 |
+
"吆喝",
|
| 344 |
+
"叫唤",
|
| 345 |
+
"口袋",
|
| 346 |
+
"厚道",
|
| 347 |
+
"厉害",
|
| 348 |
+
"千斤",
|
| 349 |
+
"包袱",
|
| 350 |
+
"包涵",
|
| 351 |
+
"匀称",
|
| 352 |
+
"勤快",
|
| 353 |
+
"动静",
|
| 354 |
+
"动弹",
|
| 355 |
+
"功夫",
|
| 356 |
+
"力气",
|
| 357 |
+
"前头",
|
| 358 |
+
"刺猬",
|
| 359 |
+
"刺激",
|
| 360 |
+
"别扭",
|
| 361 |
+
"利落",
|
| 362 |
+
"利索",
|
| 363 |
+
"利害",
|
| 364 |
+
"分析",
|
| 365 |
+
"出息",
|
| 366 |
+
"凑合",
|
| 367 |
+
"凉快",
|
| 368 |
+
"冷战",
|
| 369 |
+
"冤枉",
|
| 370 |
+
"冒失",
|
| 371 |
+
"养活",
|
| 372 |
+
"关系",
|
| 373 |
+
"先生",
|
| 374 |
+
"兄弟",
|
| 375 |
+
"便宜",
|
| 376 |
+
"使唤",
|
| 377 |
+
"佩服",
|
| 378 |
+
"作坊",
|
| 379 |
+
"体面",
|
| 380 |
+
"位置",
|
| 381 |
+
"似的",
|
| 382 |
+
"伙计",
|
| 383 |
+
"休息",
|
| 384 |
+
"什么",
|
| 385 |
+
"人家",
|
| 386 |
+
"亲戚",
|
| 387 |
+
"亲家",
|
| 388 |
+
"交情",
|
| 389 |
+
"云彩",
|
| 390 |
+
"事情",
|
| 391 |
+
"买卖",
|
| 392 |
+
"主意",
|
| 393 |
+
"丫头",
|
| 394 |
+
"丧气",
|
| 395 |
+
"两口",
|
| 396 |
+
"东西",
|
| 397 |
+
"东家",
|
| 398 |
+
"世故",
|
| 399 |
+
"不由",
|
| 400 |
+
"不在",
|
| 401 |
+
"下水",
|
| 402 |
+
"下巴",
|
| 403 |
+
"上头",
|
| 404 |
+
"上司",
|
| 405 |
+
"丈夫",
|
| 406 |
+
"丈人",
|
| 407 |
+
"一辈",
|
| 408 |
+
"那个",
|
| 409 |
+
"菩萨",
|
| 410 |
+
"父亲",
|
| 411 |
+
"母亲",
|
| 412 |
+
"咕噜",
|
| 413 |
+
"邋遢",
|
| 414 |
+
"费用",
|
| 415 |
+
"冤家",
|
| 416 |
+
"甜头",
|
| 417 |
+
"介绍",
|
| 418 |
+
"荒唐",
|
| 419 |
+
"大人",
|
| 420 |
+
"泥鳅",
|
| 421 |
+
"幸福",
|
| 422 |
+
"熟悉",
|
| 423 |
+
"计划",
|
| 424 |
+
"扑腾",
|
| 425 |
+
"蜡烛",
|
| 426 |
+
"姥爷",
|
| 427 |
+
"照顾",
|
| 428 |
+
"喉咙",
|
| 429 |
+
"吉他",
|
| 430 |
+
"弄堂",
|
| 431 |
+
"蚂蚱",
|
| 432 |
+
"凤凰",
|
| 433 |
+
"拖沓",
|
| 434 |
+
"寒碜",
|
| 435 |
+
"糟蹋",
|
| 436 |
+
"倒腾",
|
| 437 |
+
"报复",
|
| 438 |
+
"逻辑",
|
| 439 |
+
"盘缠",
|
| 440 |
+
"喽啰",
|
| 441 |
+
"牢骚",
|
| 442 |
+
"咖喱",
|
| 443 |
+
"扫把",
|
| 444 |
+
"惦记",
|
| 445 |
+
}
|
| 446 |
+
self.must_not_neural_tone_words = {
|
| 447 |
+
"男子",
|
| 448 |
+
"女子",
|
| 449 |
+
"分子",
|
| 450 |
+
"原子",
|
| 451 |
+
"量子",
|
| 452 |
+
"莲子",
|
| 453 |
+
"石子",
|
| 454 |
+
"瓜子",
|
| 455 |
+
"电子",
|
| 456 |
+
"人人",
|
| 457 |
+
"虎虎",
|
| 458 |
+
"幺幺",
|
| 459 |
+
"干嘛",
|
| 460 |
+
"学子",
|
| 461 |
+
"哈哈",
|
| 462 |
+
"数数",
|
| 463 |
+
"袅袅",
|
| 464 |
+
"局地",
|
| 465 |
+
"以下",
|
| 466 |
+
"娃哈哈",
|
| 467 |
+
"花花草草",
|
| 468 |
+
"留得",
|
| 469 |
+
"耕地",
|
| 470 |
+
"想想",
|
| 471 |
+
"熙熙",
|
| 472 |
+
"攘攘",
|
| 473 |
+
"卵子",
|
| 474 |
+
"死死",
|
| 475 |
+
"冉冉",
|
| 476 |
+
"恳恳",
|
| 477 |
+
"佼佼",
|
| 478 |
+
"吵吵",
|
| 479 |
+
"打打",
|
| 480 |
+
"考考",
|
| 481 |
+
"整整",
|
| 482 |
+
"莘莘",
|
| 483 |
+
"落地",
|
| 484 |
+
"算子",
|
| 485 |
+
"家家户户",
|
| 486 |
+
"青青",
|
| 487 |
+
}
|
| 488 |
+
self.punc = ":,;。?!“”‘’':,;.?!"
|
| 489 |
+
|
| 490 |
+
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
|
| 491 |
+
# e.g.
|
| 492 |
+
# word: "家里"
|
| 493 |
+
# pos: "s"
|
| 494 |
+
# finals: ['ia1', 'i3']
|
| 495 |
+
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Rewrite syllables that should carry the neutral tone to tone "5".

    Args:
        word: the segmented Chinese word.
        pos: its jieba part-of-speech tag (see the comment above this method).
        finals: pinyin finals with tone digits, e.g. ['ia1', 'i3'].

    Returns:
        The finals list with neutral-tone syllables ending in "5".
        NOTE: *finals* is also mutated in place by most branches.
    """
    # Reduplicated n./v./a. words take a neutral second syllable,
    # e.g. 奶奶, 试试, 旺旺 — unless blacklisted in must_not_neural_tone_words.
    for j, item in enumerate(word):
        if (
            j - 1 >= 0
            and item == word[j - 1]
            and pos[0] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[j] = finals[j][:-1] + "5"
    ge_idx = word.find("个")
    # Sentence-final modal particles (吧, 呢, 哈, ...) are neutral.
    if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
        finals[-1] = finals[-1][:-1] + "5"
    # Structural particles 的/地/得 are neutral.
    elif len(word) >= 1 and word[-1] in "的地得":
        finals[-1] = finals[-1][:-1] + "5"
    # Aspect particles, e.g. 走了, 看着, 去过.
    elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
        finals[-1] = finals[-1][:-1] + "5"
    # Suffixes 们/子 on pronouns and nouns, e.g. 我们, 桌子.
    elif (
        len(word) > 1
        and word[-1] in "们子"
        and pos in {"r", "n"}
        and word not in self.must_not_neural_tone_words
    ):
        finals[-1] = finals[-1][:-1] + "5"
    # Locative suffixes, e.g. 桌上, 地下, 家里.
    elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
        finals[-1] = finals[-1][:-1] + "5"
    # Directional complements, e.g. 上来, 下去.
    elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
        finals[-1] = finals[-1][:-1] + "5"
    # "个" used as a measure word (e.g. 三个, 每个) is neutral.
    elif (
        ge_idx >= 1
        and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
    ) or word == "个":
        finals[ge_idx] = finals[ge_idx][:-1] + "5"
    else:
        # Fall back to the whole-word neutral-tone lexicon.
        if (
            word in self.must_neural_tone_words
            or word[-2:] in self.must_neural_tone_words
        ):
            finals[-1] = finals[-1][:-1] + "5"

    # Also check each jieba sub-word against the neutral-tone lexicon.
    word_list = self._split_word(word)
    finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
    for i, word in enumerate(word_list):
        # conventional neutral tone in Chinese
        if (
            word in self.must_neural_tone_words
            or word[-2:] in self.must_neural_tone_words
        ):
            finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
    finals = sum(finals_list, [])
    return finals
|
| 550 |
+
|
| 551 |
+
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
|
| 552 |
+
# e.g. 看不懂
|
| 553 |
+
if len(word) == 3 and word[1] == "不":
|
| 554 |
+
finals[1] = finals[1][:-1] + "5"
|
| 555 |
+
else:
|
| 556 |
+
for i, char in enumerate(word):
|
| 557 |
+
# "不" before tone4 should be bu2, e.g. 不怕
|
| 558 |
+
if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
|
| 559 |
+
finals[i] = finals[i][:-1] + "2"
|
| 560 |
+
return finals
|
| 561 |
+
|
| 562 |
+
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
|
| 563 |
+
# "一" in number sequences, e.g. 一零零, 二一零
|
| 564 |
+
if word.find("一") != -1 and all(
|
| 565 |
+
[item.isnumeric() for item in word if item != "一"]
|
| 566 |
+
):
|
| 567 |
+
return finals
|
| 568 |
+
# "一" between reduplication words shold be yi5, e.g. 看一看
|
| 569 |
+
elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
|
| 570 |
+
finals[1] = finals[1][:-1] + "5"
|
| 571 |
+
# when "一" is ordinal word, it should be yi1
|
| 572 |
+
elif word.startswith("第一"):
|
| 573 |
+
finals[1] = finals[1][:-1] + "1"
|
| 574 |
+
else:
|
| 575 |
+
for i, char in enumerate(word):
|
| 576 |
+
if char == "一" and i + 1 < len(word):
|
| 577 |
+
# "一" before tone4 should be yi2, e.g. 一段
|
| 578 |
+
if finals[i + 1][-1] == "4":
|
| 579 |
+
finals[i] = finals[i][:-1] + "2"
|
| 580 |
+
# "一" before non-tone4 should be yi4, e.g. 一天
|
| 581 |
+
else:
|
| 582 |
+
# "一" 后面如果是标点,还读一声
|
| 583 |
+
if word[i + 1] not in self.punc:
|
| 584 |
+
finals[i] = finals[i][:-1] + "4"
|
| 585 |
+
return finals
|
| 586 |
+
|
| 587 |
+
def _split_word(self, word: str) -> List[str]:
    """Split *word* into two parts around jieba's shortest sub-word.

    The shortest candidate from ``jieba.cut_for_search`` is taken as one
    half; whether it is the prefix or the suffix of *word* decides the
    order of the returned pair.
    """
    shortest = min(jieba.cut_for_search(word), key=len)
    if word.find(shortest) == 0:
        # Shortest piece is a prefix: [prefix, remainder].
        return [shortest, word[len(shortest):]]
    # Otherwise treat it as a suffix: [remainder, suffix].
    return [word[: -len(shortest)], shortest]
|
| 599 |
+
|
| 600 |
+
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
    """Apply third-tone sandhi: in a run of tone-3 syllables, all but the
    last are raised to tone 2.

    Args:
        word: the Chinese word being processed.
        finals: pinyin finals with tone digits; mutated in place for the
            2- and 3-character branches.

    Returns:
        The finals list with third-tone sandhi applied.
    """
    if len(word) == 2 and self._all_tone_three(finals):
        finals[0] = finals[0][:-1] + "2"
    elif len(word) == 3:
        # Sandhi inside a 3-character word depends on how jieba splits it.
        word_list = self._split_word(word)
        if self._all_tone_three(finals):
            # disyllabic + monosyllabic, e.g. 蒙古/包
            if len(word_list[0]) == 2:
                finals[0] = finals[0][:-1] + "2"
                finals[1] = finals[1][:-1] + "2"
            # monosyllabic + disyllabic, e.g. 纸/老虎
            elif len(word_list[0]) == 1:
                finals[1] = finals[1][:-1] + "2"
        else:
            finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
            if len(finals_list) == 2:
                for i, sub in enumerate(finals_list):
                    # Fully tone-3 disyllabic part, e.g. 所有/人.
                    if self._all_tone_three(sub) and len(sub) == 2:
                        finals_list[i][0] = finals_list[i][0][:-1] + "2"
                    # Tone 3 meeting tone 3 across the split, e.g. 好/喜欢.
                    elif (
                        i == 1
                        and not self._all_tone_three(sub)
                        and finals_list[i][0][-1] == "3"
                        and finals_list[0][-1][-1] == "3"
                    ):
                        finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
                finals = sum(finals_list, [])
    # split idiom into two words whose length is 2
    elif len(word) == 4:
        finals_list = [finals[:2], finals[2:]]
        finals = []
        for sub in finals_list:
            if self._all_tone_three(sub):
                sub[0] = sub[0][:-1] + "2"
            finals += sub

    return finals
|
| 639 |
+
|
| 640 |
+
def _all_tone_three(self, finals: List[str]) -> bool:
|
| 641 |
+
return all(x[-1] == "3" for x in finals)
|
| 642 |
+
|
| 643 |
+
# merge "不" and the word behind it
|
| 644 |
+
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
|
| 645 |
+
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
| 646 |
+
new_seg = []
|
| 647 |
+
last_word = ""
|
| 648 |
+
for word, pos in seg:
|
| 649 |
+
if last_word == "不":
|
| 650 |
+
word = last_word + word
|
| 651 |
+
if word != "不":
|
| 652 |
+
new_seg.append((word, pos))
|
| 653 |
+
last_word = word[:]
|
| 654 |
+
if last_word == "不":
|
| 655 |
+
new_seg.append((last_word, "d"))
|
| 656 |
+
last_word = ""
|
| 657 |
+
return new_seg
|
| 658 |
+
|
| 659 |
+
# function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
|
| 660 |
+
# function 2: merge single "一" and the word behind it
|
| 661 |
+
# if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
|
| 662 |
+
# e.g.
|
| 663 |
+
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
|
| 664 |
+
# output seg: [['听一听', 'v']]
|
| 665 |
+
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
| 666 |
+
new_seg = []
|
| 667 |
+
# function 1
|
| 668 |
+
for i, (word, pos) in enumerate(seg):
|
| 669 |
+
if (
|
| 670 |
+
i - 1 >= 0
|
| 671 |
+
and word == "一"
|
| 672 |
+
and i + 1 < len(seg)
|
| 673 |
+
and seg[i - 1][0] == seg[i + 1][0]
|
| 674 |
+
and seg[i - 1][1] == "v"
|
| 675 |
+
and seg[i + 1][1] == "v"
|
| 676 |
+
):
|
| 677 |
+
new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
|
| 678 |
+
else:
|
| 679 |
+
if (
|
| 680 |
+
i - 2 >= 0
|
| 681 |
+
and seg[i - 1][0] == "一"
|
| 682 |
+
and seg[i - 2][0] == word
|
| 683 |
+
and pos == "v"
|
| 684 |
+
):
|
| 685 |
+
continue
|
| 686 |
+
else:
|
| 687 |
+
new_seg.append([word, pos])
|
| 688 |
+
seg = new_seg
|
| 689 |
+
new_seg = []
|
| 690 |
+
# function 2
|
| 691 |
+
for i, (word, pos) in enumerate(seg):
|
| 692 |
+
if new_seg and new_seg[-1][0] == "一":
|
| 693 |
+
new_seg[-1][0] = new_seg[-1][0] + word
|
| 694 |
+
else:
|
| 695 |
+
new_seg.append([word, pos])
|
| 696 |
+
return new_seg
|
| 697 |
+
|
| 698 |
+
# the first and the second words are all_tone_three
|
| 699 |
+
def _merge_continuous_three_tones(
    self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merge adjacent words that are BOTH entirely tone 3, so third-tone
    sandhi can later be applied across the former word boundary.

    Args:
        seg: list of (word, pos) pairs from jieba.
    Returns:
        A new list of [word, pos] entries with qualifying neighbours merged.
    """
    new_seg = []
    # Per-word finals (tone digits only are inspected below).
    sub_finals_list = [
        lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for (word, pos) in seg
    ]
    assert len(sub_finals_list) == len(seg)
    # merge_last[i] records that seg[i] was folded into its predecessor,
    # preventing chains of more than two words from collapsing together.
    merge_last = [False] * len(seg)
    for i, (word, pos) in enumerate(seg):
        if (
            i - 1 >= 0
            and self._all_tone_three(sub_finals_list[i - 1])
            and self._all_tone_three(sub_finals_list[i])
            and not merge_last[i - 1]
        ):
            # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
            if (
                not self._is_reduplication(seg[i - 1][0])
                and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
            ):
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                merge_last[i] = True
            else:
                new_seg.append([word, pos])
        else:
            new_seg.append([word, pos])

    return new_seg
|
| 729 |
+
|
| 730 |
+
def _is_reduplication(self, word: str) -> bool:
|
| 731 |
+
return len(word) == 2 and word[0] == word[1]
|
| 732 |
+
|
| 733 |
+
# the last char of first word and the first char of second word is tone_three
|
| 734 |
+
def _merge_continuous_three_tones_2(
    self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
    """Merge adjacent words where the LAST syllable of the first and the
    FIRST syllable of the second are both tone 3 (a weaker condition than
    `_merge_continuous_three_tones`, which requires every syllable).

    Args:
        seg: list of (word, pos) pairs from jieba.
    Returns:
        A new list of [word, pos] entries with qualifying neighbours merged.
    """
    new_seg = []
    # Per-word finals (only the boundary tone digits are inspected below).
    sub_finals_list = [
        lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        for (word, pos) in seg
    ]
    assert len(sub_finals_list) == len(seg)
    # merge_last[i] records that seg[i] was folded into its predecessor.
    merge_last = [False] * len(seg)
    for i, (word, pos) in enumerate(seg):
        if (
            i - 1 >= 0
            and sub_finals_list[i - 1][-1][-1] == "3"
            and sub_finals_list[i][0][-1] == "3"
            and not merge_last[i - 1]
        ):
            # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
            if (
                not self._is_reduplication(seg[i - 1][0])
                and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
            ):
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                merge_last[i] = True
            else:
                new_seg.append([word, pos])
        else:
            new_seg.append([word, pos])
    return new_seg
|
| 763 |
+
|
| 764 |
+
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
| 765 |
+
new_seg = []
|
| 766 |
+
for i, (word, pos) in enumerate(seg):
|
| 767 |
+
if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
|
| 768 |
+
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
|
| 769 |
+
else:
|
| 770 |
+
new_seg.append([word, pos])
|
| 771 |
+
return new_seg
|
| 772 |
+
|
| 773 |
+
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
| 774 |
+
new_seg = []
|
| 775 |
+
for i, (word, pos) in enumerate(seg):
|
| 776 |
+
if new_seg and word == new_seg[-1][0]:
|
| 777 |
+
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
|
| 778 |
+
else:
|
| 779 |
+
new_seg.append([word, pos])
|
| 780 |
+
return new_seg
|
| 781 |
+
|
| 782 |
+
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Run all segment-merging passes that must happen before tone
    modification (`modified_tone`).

    The pinyin/jieba-backed passes are best-effort: a failure is logged
    and the unmerged segmentation is kept.

    Args:
        seg: list of (word, pos) pairs from jieba.
    Returns:
        The merged segmentation.
    """
    seg = self._merge_bu(seg)
    try:
        seg = self._merge_yi(seg)
    # BUG FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
    # narrowed to Exception while keeping the best-effort fallback.
    except Exception:
        print("_merge_yi failed")
    seg = self._merge_reduplication(seg)
    try:
        seg = self._merge_continuous_three_tones(seg)
    except Exception:
        print("_merge_continuous_three_tones failed")
    try:
        seg = self._merge_continuous_three_tones_2(seg)
    except Exception:
        print("_merge_continuous_three_tones_2 failed")

    seg = self._merge_er(seg)
    return seg
|
| 800 |
+
|
| 801 |
+
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
    """Apply every tone-sandhi rule to *finals* in the canonical order:
    不 -> 一 -> neutral tone -> third tone.

    Args:
        word: the Chinese word.
        pos: its jieba part-of-speech tag.
        finals: pinyin finals with tone digits.
    Returns:
        The finals after all sandhi passes.
    """
    return self._three_sandhi(
        word,
        self._neural_sandhi(
            pos=pos,
            word=word,
            finals=self._yi_sandhi(word, self._bu_sandhi(word, finals)),
        ),
    )
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Supported NSW (Non-Standard-Word) Normalization
|
| 2 |
+
|
| 3 |
+
|NSW type|raw|normalized|
|
| 4 |
+
|:--|:-|:-|
|
| 5 |
+
|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
|
| 6 |
+
|cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
|
| 7 |
+
|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
|
| 8 |
+
|date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
|
| 9 |
+
|time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
|
| 10 |
+
|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
|
| 11 |
+
|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
|
| 12 |
+
|percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
|
| 13 |
+
|money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
|
| 14 |
+
|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
|
| 15 |
+
## References
|
| 16 |
+
[Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
from .text_normlization import *
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/char_convert.cpython-39.pyc
ADDED
|
Binary file (65.9 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/chronology.cpython-39.pyc
ADDED
|
Binary file (2.33 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/constants.cpython-39.pyc
ADDED
|
Binary file (1.46 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/num.cpython-39.pyc
ADDED
|
Binary file (6.9 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/phonecode.cpython-39.pyc
ADDED
|
Binary file (1.42 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/quantifier.cpython-39.pyc
ADDED
|
Binary file (1.22 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/__pycache__/text_normlization.cpython-39.pyc
ADDED
|
Binary file (5.37 kB). View file
|
|
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/char_convert.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters.
|
| 16 |
+
"""
|
| 17 |
+
simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁���稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩
郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱
藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢��尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝
昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎���蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙
窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓��
鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤'
|
| 18 |
+
|
| 19 |
+
traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販
郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨��倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純
皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢���鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕
昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙��舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷
蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒���踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴
鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤'
|
| 20 |
+
|
| 21 |
+
# Sanity check: the two parallel tables must align one-to-one, otherwise the
# character maps built below would be silently wrong or raise IndexError.
# (The original compared simplified_charcters against itself — a tautology.)
assert len(simplified_charcters) == len(traditional_characters)
|
| 22 |
+
|
| 23 |
+
# Bidirectional character maps built from the two aligned tables above.
s2t_dict = {}
t2s_dict = {}
for idx, simp_char in enumerate(simplified_charcters):
    trad_char = traditional_characters[idx]
    s2t_dict[simp_char] = trad_char
    t2s_dict[trad_char] = simp_char
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def tranditional_to_simplified(text: str) -> str:
    """Convert traditional Chinese characters in *text* to simplified.

    Characters without a mapping are passed through unchanged.
    """
    converted = [t2s_dict.get(char, char) for char in text]
    return "".join(converted)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def simplified_to_traditional(text: str) -> str:
    """Convert simplified Chinese characters in *text* to traditional.

    Characters without a mapping are passed through unchanged.
    """
    converted = [s2t_dict.get(char, char) for char in text]
    return "".join(converted)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
    # Round-trip demo: traditional -> simplified -> traditional.
    demo = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點"
    print(demo)
    simplified = tranditional_to_simplified(demo)
    print(simplified)
    print(simplified_to_traditional(simplified))
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/chronology.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
from .num import DIGITS
|
| 17 |
+
from .num import num2str
|
| 18 |
+
from .num import verbalize_cardinal
|
| 19 |
+
from .num import verbalize_digit
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _time_num2str(num_string: str) -> str:
    """Verbalize a zero-padded time component, e.g. '05' -> '零五'."""
    verbalized = num2str(num_string.lstrip('0'))
    # A leading zero in a clock reading is spoken explicitly.
    if num_string.startswith('0'):
        verbalized = DIGITS['0'] + verbalized
    return verbalized
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# 时刻表达式
|
| 31 |
+
RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
|
| 32 |
+
r':([0-5][0-9])'
|
| 33 |
+
r'(:([0-5][0-9]))?')
|
| 34 |
+
|
| 35 |
+
# 时间范围,如8:30-12:30
|
| 36 |
+
RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
|
| 37 |
+
r':([0-5][0-9])'
|
| 38 |
+
r'(:([0-5][0-9]))?'
|
| 39 |
+
r'(~|-)'
|
| 40 |
+
r'([0-1]?[0-9]|2[0-3])'
|
| 41 |
+
r':([0-5][0-9])'
|
| 42 |
+
r'(:([0-5][0-9]))?')
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def replace_time(match) -> str:
    """Verbalize a time match (HH:MM[:SS], optionally a HH:MM-HH:MM range).

    Args:
        match (re.Match): match of RE_TIME or RE_TIME_RANGE
    Returns:
        str: e.g. '8:30' -> '八点半'
    """
    # RE_TIME yields 4 groups, RE_TIME_RANGE yields 9, so the group count
    # distinguishes a single time from a range.
    is_range = len(match.groups()) > 5

    hour = match.group(1)
    minute = match.group(2)
    second = match.group(4)

    if is_range:
        hour_2 = match.group(6)
        minute_2 = match.group(7)
        second_2 = match.group(9)

    result = f"{num2str(hour)}点"
    if minute.lstrip('0'):
        if int(minute) == 30:
            result += "半"
        else:
            result += f"{_time_num2str(minute)}分"
    if second and second.lstrip('0'):
        result += f"{_time_num2str(second)}秒"

    if is_range:
        result += "至"
        result += f"{num2str(hour_2)}点"
        if minute_2.lstrip('0'):
            # BUG FIX: the original tested int(minute) here, so the end
            # time's minutes were abbreviated to '半' based on the START
            # time's minutes (e.g. '8:30-9:45' became '…九点半').
            if int(minute_2) == 30:
                result += "半"
            else:
                result += f"{_time_num2str(minute_2)}分"
        if second_2 and second_2.lstrip('0'):
            result += f"{_time_num2str(second_2)}秒"

    return result
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
RE_DATE = re.compile(r'(\d{4}|\d{2})年'
|
| 88 |
+
r'((0?[1-9]|1[0-2])月)?'
|
| 89 |
+
r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def replace_date(match) -> str:
    """Verbalize a Chinese date match such as '2021年8月15日'.

    Args:
        match (re.Match): match of RE_DATE
    Returns:
        str
    """
    year, month, day = match.group(1), match.group(3), match.group(5)
    parts = []
    if year:
        # Years are read digit by digit ('2021' -> '二零二一').
        parts.append(f"{verbalize_digit(year)}年")
    if month:
        parts.append(f"{verbalize_cardinal(month)}月")
    if day:
        # group(9) preserves the original day suffix ('日' or '号').
        parts.append(f"{verbalize_cardinal(day)}{match.group(9)}")
    return "".join(parts)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
|
| 113 |
+
RE_DATE2 = re.compile(
|
| 114 |
+
r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def replace_date2(match) -> str:
    """Verbalize a YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD style date match.

    Args:
        match (re.Match): match of RE_DATE2
    Returns:
        str
    """
    year, month, day = match.group(1), match.group(3), match.group(4)
    pieces = []
    if year:
        # Years are read digit by digit ('2021' -> '二零二一').
        pieces.append(f"{verbalize_digit(year)}年")
    if month:
        pieces.append(f"{verbalize_cardinal(month)}月")
    if day:
        pieces.append(f"{verbalize_cardinal(day)}日")
    return "".join(pieces)
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/constants.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import re
|
| 15 |
+
import string
|
| 16 |
+
|
| 17 |
+
from pypinyin.constants import SUPPORT_UCS4
|
| 18 |
+
|
| 19 |
+
# 全角半角转换
|
| 20 |
+
# 英文字符全角 -> 半角映射表 (num: 52)
|
| 21 |
+
F2H_ASCII_LETTERS = {
|
| 22 |
+
ord(char) + 65248: ord(char)
|
| 23 |
+
for char in string.ascii_letters
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
# 英文字符半角 -> 全角映射表
|
| 27 |
+
H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
|
| 28 |
+
|
| 29 |
+
# 数字字符全角 -> 半角映射表 (num: 10)
|
| 30 |
+
F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
|
| 31 |
+
# 数字字符半角 -> 全角映射表
|
| 32 |
+
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
|
| 33 |
+
|
| 34 |
+
# 标点符号全角 -> 半角映射表 (num: 32)
|
| 35 |
+
F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
|
| 36 |
+
# 标点符号半角 -> 全角映射表
|
| 37 |
+
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
|
| 38 |
+
|
| 39 |
+
# 空格 (num: 1)
|
| 40 |
+
F2H_SPACE = {'\u3000': ' '}
|
| 41 |
+
H2F_SPACE = {' ': '\u3000'}
|
| 42 |
+
|
| 43 |
+
# 非"有拼音的汉字"的字符串,可用于NSW提取
|
| 44 |
+
if SUPPORT_UCS4:
|
| 45 |
+
RE_NSW = re.compile(r'(?:[^'
|
| 46 |
+
r'\u3007' # 〇
|
| 47 |
+
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
|
| 48 |
+
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
|
| 49 |
+
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
|
| 50 |
+
r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
|
| 51 |
+
r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
|
| 52 |
+
r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
|
| 53 |
+
r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
|
| 54 |
+
r'])+')
|
| 55 |
+
else:
|
| 56 |
+
RE_NSW = re.compile( # pragma: no cover
|
| 57 |
+
r'(?:[^'
|
| 58 |
+
r'\u3007' # 〇
|
| 59 |
+
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
|
| 60 |
+
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
|
| 61 |
+
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
|
| 62 |
+
r'])+')
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/num.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
"""
|
| 15 |
+
Rules to verbalize numbers into Chinese characters.
|
| 16 |
+
https://zh.wikipedia.org/wiki/中文数字#現代中文
|
| 17 |
+
"""
|
| 18 |
+
import re
|
| 19 |
+
from collections import OrderedDict
|
| 20 |
+
from typing import List
|
| 21 |
+
|
| 22 |
+
DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
|
| 23 |
+
UNITS = OrderedDict({
|
| 24 |
+
1: '十',
|
| 25 |
+
2: '百',
|
| 26 |
+
3: '千',
|
| 27 |
+
4: '万',
|
| 28 |
+
8: '亿',
|
| 29 |
+
})
|
| 30 |
+
|
| 31 |
+
COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
|
| 32 |
+
|
| 33 |
+
# 分数表达式
|
| 34 |
+
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def replace_frac(match) -> str:
    """Verbalize a fraction match, e.g. '-3/4' -> '负四分之三'.

    Args:
        match (re.Match): match of RE_FRAC
    Returns:
        str
    """
    sign_text = "负" if match.group(1) else ""
    numerator_text = num2str(match.group(2))
    denominator_text = num2str(match.group(3))
    # Chinese reads the denominator first: 'X分之Y' means Y/X.
    return f"{sign_text}{denominator_text}分之{numerator_text}"
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# 百分数表达式
|
| 55 |
+
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def replace_percentage(match) -> str:
    """Verbalize a percentage match, e.g. '-12.5%' -> '负百分之十二点五'.

    Args:
        match (re.Match): match of RE_PERCENTAGE
    Returns:
        str
    """
    sign_text = "负" if match.group(1) else ""
    value_text = num2str(match.group(2))
    return f"{sign_text}百分之{value_text}"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# 整数表达式
|
| 74 |
+
# 带负号的整数 -10
|
| 75 |
+
RE_INTEGER = re.compile(r'(-)' r'(\d+)')
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def replace_negative_num(match) -> str:
    """Verbalize a signed integer match, e.g. '-10' -> '负十'.

    Args:
        match (re.Match): match of RE_INTEGER
    Returns:
        str
    """
    sign_text = "负" if match.group(1) else ""
    return f"{sign_text}{num2str(match.group(2))}"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# 编号-无符号整形
|
| 94 |
+
# 00078
|
| 95 |
+
RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def replace_default_num(match):
    """Verbalize a bare code-like number digit by digit ('1' read as '幺').

    Args:
        match (re.Match): match of RE_DEFAULT_NUM
    Returns:
        str
    """
    return verbalize_digit(match.group(0), alt_one=True)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# 加减乘除
|
| 110 |
+
RE_ASMD = re.compile(
|
| 111 |
+
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
|
| 112 |
+
asmd_map = {
|
| 113 |
+
'+': '加',
|
| 114 |
+
'-': '减',
|
| 115 |
+
'×': '乘',
|
| 116 |
+
'÷': '除',
|
| 117 |
+
'=': '等于'
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def replace_asmd(match) -> str:
    """Spell out the operator of an arithmetic-expression match in Chinese.

    Args:
        match (re.Match): match of RE_ASMD
    Returns:
        str: both operands unchanged, operator replaced via asmd_map
    """
    left_operand = match.group(1)
    operator_word = asmd_map[match.group(8)]
    right_operand = match.group(9)
    return left_operand + operator_word + right_operand
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# 数字表达式
|
| 133 |
+
# 纯小数
|
| 134 |
+
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
|
| 135 |
+
# 正整数 + 量词
|
| 136 |
+
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
|
| 137 |
+
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def replace_positive_quantifier(match) -> str:
    """Verbalize a number followed by a classifier, e.g. '3个' -> '三个'.

    Args:
        match (re.Match): match of RE_POSITIVE_QUANTIFIERS
    Returns:
        str
    """
    number = match.group(1)
    approx = match.group(2)
    # A trailing '+' is read as '多' ("or more").
    if approx == "+":
        approx = "多"
    approx = approx if approx else ""
    quantifier = match.group(3)
    return f"{num2str(number)}{approx}{quantifier}"
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def replace_number(match) -> str:
    """Verbalize a signed integer/decimal match of RE_NUMBER.

    Args:
        match (re.Match)
    Returns:
        str
    """
    pure_decimal = match.group(5)
    if pure_decimal:
        # Bare '.xx' form; the sign group does not apply here.
        return num2str(pure_decimal)
    sign_text = "负" if match.group(1) else ""
    return f"{sign_text}{num2str(match.group(2))}"
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# 范围表达式
|
| 178 |
+
# match.group(1) and match.group(8) are copy from RE_NUMBER
|
| 179 |
+
|
| 180 |
+
RE_RANGE = re.compile(
|
| 181 |
+
r"""
|
| 182 |
+
(?<![\d\+\-\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
|
| 183 |
+
((-?)((\d+)(\.\d+)?)) # 匹配范围起始的负数或正数(整数或小数)
|
| 184 |
+
[-~] # 匹配范围分隔符
|
| 185 |
+
((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
|
| 186 |
+
(?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
|
| 187 |
+
""", re.VERBOSE)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def replace_range(match) -> str:
    """Verbalize a numeric range, e.g. '5-10' -> '五到十'.

    Args:
        match (re.Match): match of RE_RANGE
    Returns:
        str
    """
    start_text = RE_NUMBER.sub(replace_number, match.group(1))
    end_text = RE_NUMBER.sub(replace_number, match.group(6))
    return f"{start_text}到{end_text}"
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
# ~至表达式
|
| 205 |
+
RE_TO_RANGE = re.compile(
|
| 206 |
+
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')
|
| 207 |
+
|
| 208 |
+
def replace_to_range(match) -> str:
    """Rewrite the '~' between two measurements as '至' (kept otherwise intact).

    Args:
        match (re.Match): match of RE_TO_RANGE
    Returns:
        str
    """
    return match.group(0).replace('~', '至')
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
    """Recursively convert a digit string into Chinese cardinal symbols.

    Splits the number at the largest applicable unit (十/百/千/万/亿) and
    recurses on both halves, e.g. '10023' -> ['一', '万', '零', '二', '十', '三'].

    Args:
        value_string: non-negative integer as a string of ASCII digits.
        use_zero: emit a single '零' where leading zeros were stripped.
    Returns:
        List[str]: digit/unit symbols (empty list for an all-zero input).
    """
    stripped = value_string.lstrip('0')
    if len(stripped) == 0:
        # All zeros: contributes nothing at this recursion level.
        return []
    elif len(stripped) == 1:
        if use_zero and len(stripped) < len(value_string):
            # Leading zero(s) were stripped; read them as one '零'.
            return [DIGITS['0'], DIGITS[stripped]]
        else:
            return [DIGITS[stripped]]
    else:
        # Largest unit whose power of ten is strictly below the digit count.
        largest_unit = next(
            power for power in reversed(UNITS.keys()) if power < len(stripped))
        first_part = value_string[:-largest_unit]
        second_part = value_string[-largest_unit:]
        return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
            second_part)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def verbalize_cardinal(value_string: str) -> str:
    """Read a digit string as a Chinese cardinal, e.g. '15' -> '十五'."""
    if not value_string:
        return ''

    # '000' and '0' are both read as a single '零'.
    digits_only = value_string.lstrip('0')
    if not digits_only:
        return DIGITS['0']

    symbols = _get_value(digits_only)
    # A leading '一十…' is conventionally abbreviated to '十…'.
    starts_with_one_ten = (
        len(symbols) >= 2 and symbols[0] == DIGITS['1'] and
        symbols[1] == UNITS[1])
    if starts_with_one_ten:
        symbols = symbols[1:]
    return ''.join(symbols)
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string digit by digit, e.g. '2021' -> '二零二一'.

    Args:
        value_string: string of ASCII digits.
        alt_one: read '1' as '幺' (phone/ID convention) instead of '一'.
    Returns:
        str
    """
    spoken = ''.join(DIGITS[digit] for digit in value_string)
    if alt_one:
        spoken = spoken.replace("一", "幺")
    return spoken
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def num2str(value_string: str) -> str:
    """Verbalize a (possibly decimal) number string into Chinese.

    Args:
        value_string: e.g. '3.20' -> '三点二', '0.5' -> '零点五'.
    Returns:
        str
    Raises:
        ValueError: if the string contains more than one decimal point.
    """
    integer_decimal = value_string.split('.')
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ''
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        # Fixed a stray '$' that the original left in this f-string message
        # ("'${value_string}'" printed a literal dollar sign).
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )

    result = verbalize_cardinal(integer)

    # Trailing zeros after the point are not read: '3.20' -> '三点二'.
    decimal = decimal.rstrip('0')
    if decimal:
        # '.22' is verbalized as '零点二二'.
        result = result if result else "零"
        result += '点' + verbalize_digit(decimal)
    return result
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/phonecode.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
from .num import verbalize_digit
|
| 17 |
+
|
| 18 |
+
# Normalization of landline / mobile phone numbers.
# Mobile number prefixes (source: http://www.jihaoba.com/news/show/13680):
# China Mobile: 139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
# China Unicom: 130、131、132、156、155、186、185、176
# China Telecom: 133、153、189、180、181、177
# Mobile numbers: optional "+86"/"86 " country prefix, then 11 digits
# starting with 1; the lookarounds forbid adjacent digits so a longer
# digit run is never partially matched.
RE_MOBILE_PHONE = re.compile(
    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
# Landlines: optional area code (010 / 02x / 0xxx) with optional dash,
# then a 7-8 digit local number.
RE_TELEPHONE = re.compile(
    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")

# Nationwide unified service numbers start with 400 (400-xxx-xxxx).
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def phone2str(phone_string: str, mobile=True) -> str:
    """Verbalize a phone number digit by digit, in Chinese.

    Groups are joined with ',' so a pause is rendered between them.
    Digits are read with ``alt_one=True`` (1 read as '幺', the usual
    telephone pronunciation).

    Args:
        phone_string: The matched phone-number text.
        mobile: If True, strip a leading '+' and split on whitespace
            (mobile format, e.g. "+86 138..."); otherwise split on '-'
            (landline / 400-number format).
    Returns:
        str: The spoken form of the number.
    """
    # Both branches only differ in how the number is split into groups;
    # the per-group verbalization is shared (original duplicated it).
    if mobile:
        parts = phone_string.strip('+').split()
    else:
        parts = phone_string.split('-')
    return ','.join(verbalize_digit(part, alt_one=True) for part in parts)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def replace_phone(match) -> str:
    """Verbalize a landline / 400-number regex match.

    Args:
        match (re.Match)
    Returns:
        str
    """
    number = match.group(0)
    return phone2str(number, mobile=False)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def replace_mobile(match) -> str:
    """Verbalize a mobile-phone-number regex match.

    Args:
        match (re.Match)
    Returns:
        str
    """
    number = match.group(0)
    return phone2str(number)
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/quantifier.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import re
|
| 15 |
+
|
| 16 |
+
from .num import num2str
|
| 17 |
+
|
| 18 |
+
# Temperature expression; the temperature context changes how the minus
# sign is read:
# -3°C -> 零下三度 ("three degrees below zero")
# Groups: (1) optional '-', (2) numeric value, (4) unit spelling.
RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
# Unit abbreviations -> spoken Chinese forms. Superscript and ASCII
# spellings (cm²/cm2, m³/m3) are both listed so either input form works.
measure_dict = {
    "cm2": "平方厘米",
    "cm²": "平方厘米",
    "cm3": "立方厘米",
    "cm³": "立方厘米",
    "cm": "厘米",
    "db": "分贝",
    "ds": "毫秒",
    "kg": "千克",
    "km": "千米",
    "m2": "平方米",
    "m²": "平方米",
    "m³": "立方米",
    "m3": "立方米",
    "ml": "毫升",
    "m": "米",
    "mm": "毫米",
    "s": "秒"
}
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def replace_temperature(match) -> str:
    """Verbalize a temperature expression matched by RE_TEMPERATURE.

    Args:
        match (re.Match)
    Returns:
        str
    """
    minus, value, unit = match.group(1), match.group(2), match.group(3)
    # A leading '-' is read as "below zero" in the temperature context.
    prefix: str = "零下" if minus else ""
    spoken_value: str = num2str(value)
    # Every unit spelling except the explicit "摄氏度" is read as plain "度".
    spoken_unit: str = "摄氏度" if unit == "摄氏度" else "度"
    return f"{prefix}{spoken_value}{spoken_unit}"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def replace_measure(sentence) -> str:
    """Replace unit abbreviations in *sentence* with spoken Chinese forms.

    Longer notations are substituted first: with the original in-order
    scan, "mm" could never match because the shorter "m" entry (which
    precedes it in ``measure_dict``) consumed its characters first, so
    "5mm" was read "5米米" instead of "5毫米".
    """
    for q_notation in sorted(measure_dict, key=len, reverse=True):
        if q_notation in sentence:
            sentence = sentence.replace(q_notation, measure_dict[q_notation])
    return sentence
|
SongBloom/g2p/cn_zh_g2p/zh_normalization/text_normlization.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import re
|
| 15 |
+
from typing import List
|
| 16 |
+
|
| 17 |
+
from .char_convert import tranditional_to_simplified
|
| 18 |
+
from .chronology import RE_DATE
|
| 19 |
+
from .chronology import RE_DATE2
|
| 20 |
+
from .chronology import RE_TIME
|
| 21 |
+
from .chronology import RE_TIME_RANGE
|
| 22 |
+
from .chronology import replace_date
|
| 23 |
+
from .chronology import replace_date2
|
| 24 |
+
from .chronology import replace_time
|
| 25 |
+
from .constants import F2H_ASCII_LETTERS
|
| 26 |
+
from .constants import F2H_DIGITS
|
| 27 |
+
from .constants import F2H_SPACE
|
| 28 |
+
from .num import RE_DECIMAL_NUM
|
| 29 |
+
from .num import RE_DEFAULT_NUM
|
| 30 |
+
from .num import RE_FRAC
|
| 31 |
+
from .num import RE_INTEGER
|
| 32 |
+
from .num import RE_NUMBER
|
| 33 |
+
from .num import RE_PERCENTAGE
|
| 34 |
+
from .num import RE_POSITIVE_QUANTIFIERS
|
| 35 |
+
from .num import RE_RANGE
|
| 36 |
+
from .num import RE_TO_RANGE
|
| 37 |
+
from .num import RE_ASMD
|
| 38 |
+
from .num import replace_default_num
|
| 39 |
+
from .num import replace_frac
|
| 40 |
+
from .num import replace_negative_num
|
| 41 |
+
from .num import replace_number
|
| 42 |
+
from .num import replace_percentage
|
| 43 |
+
from .num import replace_positive_quantifier
|
| 44 |
+
from .num import replace_range
|
| 45 |
+
from .num import replace_to_range
|
| 46 |
+
from .num import replace_asmd
|
| 47 |
+
from .phonecode import RE_MOBILE_PHONE
|
| 48 |
+
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
|
| 49 |
+
from .phonecode import RE_TELEPHONE
|
| 50 |
+
from .phonecode import replace_mobile
|
| 51 |
+
from .phonecode import replace_phone
|
| 52 |
+
from .quantifier import RE_TEMPERATURE
|
| 53 |
+
from .quantifier import replace_measure
|
| 54 |
+
from .quantifier import replace_temperature
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class TextNormalizer():
    """Rule-based Chinese text normalizer.

    Splits input text into sentences and rewrites non-standard words
    (dates, times, phone numbers, temperatures, units, fractions,
    percentages, ranges, arithmetic, bare numbers, special symbols)
    into their spoken Chinese form.
    """

    def __init__(self):
        # Sentence-splitting punctuation, optionally followed by a closing
        # quotation mark that should stay attached to the sentence.
        self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')

    def _split(self, text: str, lang="zh") -> List[str]:
        """Split long text into sentences with sentence-splitting punctuations.
        Args:
            text (str): The input text.
        Returns:
            List[str]: Sentences.
        """
        # Only for pure Chinese here
        if lang == "zh":
            text = text.replace(" ", "")
            # Filter out special characters (brackets, quotes, etc.).
            text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text)
        # Insert a newline after every splitting punctuation, then split on it.
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
        text = text.strip()
        sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
        return sentences

    def _post_replace(self, sentence: str) -> str:
        """Final character-level cleanups applied after NSW verbalization."""
        # '/' is read as "per" (每).
        sentence = sentence.replace('/', '每')
        # sentence = sentence.replace('~', '至')
        # sentence = sentence.replace('~', '至')
        # Circled digits -> plain Chinese numerals.
        sentence = sentence.replace('①', '一')
        sentence = sentence.replace('②', '二')
        sentence = sentence.replace('③', '三')
        sentence = sentence.replace('④', '四')
        sentence = sentence.replace('⑤', '五')
        sentence = sentence.replace('⑥', '六')
        sentence = sentence.replace('⑦', '七')
        sentence = sentence.replace('⑧', '八')
        sentence = sentence.replace('⑨', '九')
        sentence = sentence.replace('⑩', '十')
        # Greek letters (upper and lower case) -> Chinese transliterations.
        sentence = sentence.replace('α', '阿尔法')
        sentence = sentence.replace('β', '贝塔')
        sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
        sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
        sentence = sentence.replace('ε', '艾普西龙')
        sentence = sentence.replace('ζ', '捷塔')
        sentence = sentence.replace('η', '依塔')
        sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
        sentence = sentence.replace('ι', '艾欧塔')
        sentence = sentence.replace('κ', '喀帕')
        sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
        sentence = sentence.replace('μ', '缪')
        sentence = sentence.replace('ν', '拗')
        sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
        sentence = sentence.replace('ο', '欧米克伦')
        sentence = sentence.replace('π', '派').replace('Π', '派')
        sentence = sentence.replace('ρ', '肉')
        sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
            'σ', '西格玛')
        sentence = sentence.replace('τ', '套')
        sentence = sentence.replace('υ', '宇普西龙')
        sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
        sentence = sentence.replace('χ', '器')
        sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
        sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
        # re filter special characters, have one more character "-" than line 68
        sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence)
        return sentence

    def normalize_sentence(self, sentence: str) -> str:
        """Normalize one sentence.

        NOTE: the substitution order below is significant — composite
        patterns (dates, time ranges, phones) must be verbalized before
        the generic number rules can consume their digits.
        """
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
        # Full-width ASCII letters/digits/space -> half-width.
        sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
            F2H_DIGITS).translate(F2H_SPACE)

        # number related NSW verbalization
        sentence = RE_DATE.sub(replace_date, sentence)
        sentence = RE_DATE2.sub(replace_date2, sentence)

        # range first
        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
        sentence = RE_TIME.sub(replace_time, sentence)

        # Treat the tilde '~' as "to" (至) in numeric ranges.
        sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
        sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
        sentence = replace_measure(sentence)
        sentence = RE_FRAC.sub(replace_frac, sentence)
        sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
        sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)

        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)

        sentence = RE_RANGE.sub(replace_range, sentence)

        # Arithmetic (+,-,*,/): loop until no match remains, since one
        # substitution can expose another nested expression.
        while RE_ASMD.search(sentence):
            sentence = RE_ASMD.sub(replace_asmd, sentence)

        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
                                               sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)
        sentence = self._post_replace(sentence)

        return sentence

    def normalize(self, text: str) -> List[str]:
        """Split *text* into sentences and normalize each one."""
        sentences = self._split(text)
        sentences = [self.normalize_sentence(sent) for sent in sentences]
        return sentences
|
SongBloom/g2p/lyric_common.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, sys
|
| 2 |
+
|
| 3 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 4 |
+
from pinyin.pinyin import G2P_PinYin
|
| 5 |
+
from cn_zh_g2p import G2P_Mix, symbols
|
| 6 |
+
|
| 7 |
+
# Shared grapheme-to-phoneme processors, keyed by output representation.
# NOTE(review): G2P_PinYin()/G2P_Mix() are instantiated at import time,
# so any setup work they do runs on module import — confirm intended.
key2processor = {
    'pinyin': G2P_PinYin(),
    'phoneme': G2P_Mix(),
}

# Section tags that must carry lyric text (see is_struct_legal).
valid_struct_type = ['[chorus]', '[verse]', '[bridge]']
# Tags marking the beginning of a song.
start_struct_type = ['[intro]', '[start]']
# Tags marking the end of a song.
end_struct_type = ['[outro]', '[end]']
# Purely instrumental connector sections.
conn_struct_type = ['[inst]', '[solo]', '[break]']

# Integer class ids for canonical section labels.
LABELS = {
    '[intro]': 0,
    '[outro]': 1,
    '[bridge]': 2,
    '[inst]': 3,
    '[verse]': 4,
    '[chorus]': 5,
    '[silence]': 6,
}

# Digit -> [Chinese reading, English reading].
NUMBERS = {
    '0': ['零', 'zero'],
    '1': ['一', 'one'],
    '2': ['二', 'two'],
    '3': ['三', 'three'],
    '4': ['四', 'four'],
    '5': ['五', 'five'],
    '6': ['六', 'six'],
    '7': ['七', 'seven'],
    '8': ['八', 'eight'],
    '9': ['九', 'nine']
}
|
| 40 |
+
def detect_structure(structure):
    """Map a raw section name onto its canonical structure name.

    'verse'/'chorus'/'silence' map to themselves; start aliases map to
    'intro', end aliases to 'outro', instrumental aliases to 'inst', and
    'bridge' to 'bridge'. Unknown names yield None.
    """
    canonical = {
        'verse': 'verse',
        'chorus': 'chorus',
        'silence': 'silence',
        'start': 'intro',
        'intro': 'intro',
        'outro': 'outro',
        'end': 'outro',
        'solo': 'inst',
        'inst': 'inst',
        'break': 'inst',
        'bridge': 'bridge',
    }
    return canonical.get(structure)
|
| 57 |
+
|
| 58 |
+
def merge_structure(start_time, end_time, structure, lyric):
    """Merge consecutive segments that share the same structure label.

    All four lists are edited in place, index-aligned. When two adjacent
    segments have the same label, the earlier one's end time is extended;
    non-vocal duplicates (not verse/chorus/bridge) are then dropped, while
    vocal duplicates keep their separate lyric entries.
    """
    idx = 1
    while idx < len(start_time):
        if structure[idx] != structure[idx - 1]:
            idx += 1
            continue
        # Same label as the previous segment: extend the previous segment.
        end_time[idx - 1] = end_time[idx]
        if structure[idx] in ("verse", "chorus", "bridge"):
            # Vocal sections keep both entries (separate lyrics).
            idx += 1
        else:
            # Non-vocal duplicate: remove it entirely; idx now points at
            # the next element, so do not advance.
            del start_time[idx]
            del end_time[idx]
            del structure[idx]
            del lyric[idx]

    return start_time, end_time, structure, lyric
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def is_struct_legal(struct, text):
    """Return True iff the tag/lyric pairing is consistent.

    Vocal tags (in valid_struct_type) must carry lyric text; every other
    tag must carry none.
    """
    needs_lyrics = struct in valid_struct_type
    has_lyrics = text != ""
    return needs_lyrics == has_lyrics
|
SongBloom/g2p/pinyin/__init__.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .symbols import symbols
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
pinyin_dict = {
|
| 7 |
+
"a": ("^", "a"),
|
| 8 |
+
"ai": ("^", "ai"),
|
| 9 |
+
"an": ("^", "an"),
|
| 10 |
+
"ang": ("^", "ang"),
|
| 11 |
+
"ao": ("^", "ao"),
|
| 12 |
+
"ba": ("b", "a"),
|
| 13 |
+
"bai": ("b", "ai"),
|
| 14 |
+
"ban": ("b", "an"),
|
| 15 |
+
"bang": ("b", "ang"),
|
| 16 |
+
"bao": ("b", "ao"),
|
| 17 |
+
"be": ("b", "e"),
|
| 18 |
+
"bei": ("b", "ei"),
|
| 19 |
+
"ben": ("b", "en"),
|
| 20 |
+
"beng": ("b", "eng"),
|
| 21 |
+
"bi": ("b", "i"),
|
| 22 |
+
"bian": ("b", "ian"),
|
| 23 |
+
"biao": ("b", "iao"),
|
| 24 |
+
"bie": ("b", "ie"),
|
| 25 |
+
"bin": ("b", "in"),
|
| 26 |
+
"bing": ("b", "ing"),
|
| 27 |
+
"bo": ("b", "o"),
|
| 28 |
+
"bu": ("b", "u"),
|
| 29 |
+
"ca": ("c", "a"),
|
| 30 |
+
"cai": ("c", "ai"),
|
| 31 |
+
"can": ("c", "an"),
|
| 32 |
+
"cang": ("c", "ang"),
|
| 33 |
+
"cao": ("c", "ao"),
|
| 34 |
+
"ce": ("c", "e"),
|
| 35 |
+
"cen": ("c", "en"),
|
| 36 |
+
"ceng": ("c", "eng"),
|
| 37 |
+
"cha": ("ch", "a"),
|
| 38 |
+
"chai": ("ch", "ai"),
|
| 39 |
+
"chan": ("ch", "an"),
|
| 40 |
+
"chang": ("ch", "ang"),
|
| 41 |
+
"chao": ("ch", "ao"),
|
| 42 |
+
"che": ("ch", "e"),
|
| 43 |
+
"chen": ("ch", "en"),
|
| 44 |
+
"cheng": ("ch", "eng"),
|
| 45 |
+
"chi": ("ch", "iii"),
|
| 46 |
+
"chong": ("ch", "ong"),
|
| 47 |
+
"chou": ("ch", "ou"),
|
| 48 |
+
"chu": ("ch", "u"),
|
| 49 |
+
"chua": ("ch", "ua"),
|
| 50 |
+
"chuai": ("ch", "uai"),
|
| 51 |
+
"chuan": ("ch", "uan"),
|
| 52 |
+
"chuang": ("ch", "uang"),
|
| 53 |
+
"chui": ("ch", "uei"),
|
| 54 |
+
"chun": ("ch", "uen"),
|
| 55 |
+
"chuo": ("ch", "uo"),
|
| 56 |
+
"ci": ("c", "ii"),
|
| 57 |
+
"cong": ("c", "ong"),
|
| 58 |
+
"cou": ("c", "ou"),
|
| 59 |
+
"cu": ("c", "u"),
|
| 60 |
+
"cuan": ("c", "uan"),
|
| 61 |
+
"cui": ("c", "uei"),
|
| 62 |
+
"cun": ("c", "uen"),
|
| 63 |
+
"cuo": ("c", "uo"),
|
| 64 |
+
"da": ("d", "a"),
|
| 65 |
+
"dai": ("d", "ai"),
|
| 66 |
+
"dan": ("d", "an"),
|
| 67 |
+
"dang": ("d", "ang"),
|
| 68 |
+
"dao": ("d", "ao"),
|
| 69 |
+
"de": ("d", "e"),
|
| 70 |
+
"dei": ("d", "ei"),
|
| 71 |
+
"den": ("d", "en"),
|
| 72 |
+
"deng": ("d", "eng"),
|
| 73 |
+
"di": ("d", "i"),
|
| 74 |
+
"dia": ("d", "ia"),
|
| 75 |
+
"dian": ("d", "ian"),
|
| 76 |
+
"diao": ("d", "iao"),
|
| 77 |
+
"die": ("d", "ie"),
|
| 78 |
+
"ding": ("d", "ing"),
|
| 79 |
+
"diu": ("d", "iou"),
|
| 80 |
+
"dong": ("d", "ong"),
|
| 81 |
+
"dou": ("d", "ou"),
|
| 82 |
+
"du": ("d", "u"),
|
| 83 |
+
"duan": ("d", "uan"),
|
| 84 |
+
"dui": ("d", "uei"),
|
| 85 |
+
"dun": ("d", "uen"),
|
| 86 |
+
"duo": ("d", "uo"),
|
| 87 |
+
"e": ("^", "e"),
|
| 88 |
+
"ei": ("^", "ei"),
|
| 89 |
+
"en": ("^", "en"),
|
| 90 |
+
"ng": ("^", "en"),
|
| 91 |
+
"eng": ("^", "eng"),
|
| 92 |
+
"er": ("^", "er"),
|
| 93 |
+
"fa": ("f", "a"),
|
| 94 |
+
"fan": ("f", "an"),
|
| 95 |
+
"fang": ("f", "ang"),
|
| 96 |
+
"fei": ("f", "ei"),
|
| 97 |
+
"fen": ("f", "en"),
|
| 98 |
+
"feng": ("f", "eng"),
|
| 99 |
+
"fo": ("f", "o"),
|
| 100 |
+
"fou": ("f", "ou"),
|
| 101 |
+
"fu": ("f", "u"),
|
| 102 |
+
"ga": ("g", "a"),
|
| 103 |
+
"gai": ("g", "ai"),
|
| 104 |
+
"gan": ("g", "an"),
|
| 105 |
+
"gang": ("g", "ang"),
|
| 106 |
+
"gao": ("g", "ao"),
|
| 107 |
+
"ge": ("g", "e"),
|
| 108 |
+
"gei": ("g", "ei"),
|
| 109 |
+
"gen": ("g", "en"),
|
| 110 |
+
"geng": ("g", "eng"),
|
| 111 |
+
"gong": ("g", "ong"),
|
| 112 |
+
"gou": ("g", "ou"),
|
| 113 |
+
"gu": ("g", "u"),
|
| 114 |
+
"gua": ("g", "ua"),
|
| 115 |
+
"guai": ("g", "uai"),
|
| 116 |
+
"guan": ("g", "uan"),
|
| 117 |
+
"guang": ("g", "uang"),
|
| 118 |
+
"gui": ("g", "uei"),
|
| 119 |
+
"gun": ("g", "uen"),
|
| 120 |
+
"guo": ("g", "uo"),
|
| 121 |
+
"ha": ("h", "a"),
|
| 122 |
+
"hai": ("h", "ai"),
|
| 123 |
+
"han": ("h", "an"),
|
| 124 |
+
"hang": ("h", "ang"),
|
| 125 |
+
"hao": ("h", "ao"),
|
| 126 |
+
"he": ("h", "e"),
|
| 127 |
+
"hei": ("h", "ei"),
|
| 128 |
+
"hen": ("h", "en"),
|
| 129 |
+
"heng": ("h", "eng"),
|
| 130 |
+
"hong": ("h", "ong"),
|
| 131 |
+
"hou": ("h", "ou"),
|
| 132 |
+
"hu": ("h", "u"),
|
| 133 |
+
"hua": ("h", "ua"),
|
| 134 |
+
"huai": ("h", "uai"),
|
| 135 |
+
"huan": ("h", "uan"),
|
| 136 |
+
"huang": ("h", "uang"),
|
| 137 |
+
"hui": ("h", "uei"),
|
| 138 |
+
"hun": ("h", "uen"),
|
| 139 |
+
"huo": ("h", "uo"),
|
| 140 |
+
"ji": ("j", "i"),
|
| 141 |
+
"jia": ("j", "ia"),
|
| 142 |
+
"jian": ("j", "ian"),
|
| 143 |
+
"jiang": ("j", "iang"),
|
| 144 |
+
"jiao": ("j", "iao"),
|
| 145 |
+
"jie": ("j", "ie"),
|
| 146 |
+
"jin": ("j", "in"),
|
| 147 |
+
"jing": ("j", "ing"),
|
| 148 |
+
"jiong": ("j", "iong"),
|
| 149 |
+
"jiu": ("j", "iou"),
|
| 150 |
+
"ju": ("j", "v"),
|
| 151 |
+
"juan": ("j", "van"),
|
| 152 |
+
"jue": ("j", "ve"),
|
| 153 |
+
"jun": ("j", "vn"),
|
| 154 |
+
"ka": ("k", "a"),
|
| 155 |
+
"kai": ("k", "ai"),
|
| 156 |
+
"kan": ("k", "an"),
|
| 157 |
+
"kang": ("k", "ang"),
|
| 158 |
+
"kao": ("k", "ao"),
|
| 159 |
+
"ke": ("k", "e"),
|
| 160 |
+
"kei": ("k", "ei"),
|
| 161 |
+
"ken": ("k", "en"),
|
| 162 |
+
"keng": ("k", "eng"),
|
| 163 |
+
"kong": ("k", "ong"),
|
| 164 |
+
"kou": ("k", "ou"),
|
| 165 |
+
"ku": ("k", "u"),
|
| 166 |
+
"kua": ("k", "ua"),
|
| 167 |
+
"kuai": ("k", "uai"),
|
| 168 |
+
"kuan": ("k", "uan"),
|
| 169 |
+
"kuang": ("k", "uang"),
|
| 170 |
+
"kui": ("k", "uei"),
|
| 171 |
+
"kun": ("k", "uen"),
|
| 172 |
+
"kuo": ("k", "uo"),
|
| 173 |
+
"la": ("l", "a"),
|
| 174 |
+
"lai": ("l", "ai"),
|
| 175 |
+
"lan": ("l", "an"),
|
| 176 |
+
"lang": ("l", "ang"),
|
| 177 |
+
"lao": ("l", "ao"),
|
| 178 |
+
"le": ("l", "e"),
|
| 179 |
+
"lei": ("l", "ei"),
|
| 180 |
+
"leng": ("l", "eng"),
|
| 181 |
+
"li": ("l", "i"),
|
| 182 |
+
"lia": ("l", "ia"),
|
| 183 |
+
"lian": ("l", "ian"),
|
| 184 |
+
"liang": ("l", "iang"),
|
| 185 |
+
"liao": ("l", "iao"),
|
| 186 |
+
"lie": ("l", "ie"),
|
| 187 |
+
"lin": ("l", "in"),
|
| 188 |
+
"ling": ("l", "ing"),
|
| 189 |
+
"liu": ("l", "iou"),
|
| 190 |
+
"lo": ("l", "o"),
|
| 191 |
+
"long": ("l", "ong"),
|
| 192 |
+
"lou": ("l", "ou"),
|
| 193 |
+
"lu": ("l", "u"),
|
| 194 |
+
"lv": ("l", "v"),
|
| 195 |
+
"luan": ("l", "uan"),
|
| 196 |
+
"lve": ("l", "ve"),
|
| 197 |
+
"lue": ("l", "ve"),
|
| 198 |
+
"lun": ("l", "uen"),
|
| 199 |
+
"luo": ("l", "uo"),
|
| 200 |
+
"ma": ("m", "a"),
|
| 201 |
+
"mai": ("m", "ai"),
|
| 202 |
+
"man": ("m", "an"),
|
| 203 |
+
"mang": ("m", "ang"),
|
| 204 |
+
"mao": ("m", "ao"),
|
| 205 |
+
"me": ("m", "e"),
|
| 206 |
+
"mei": ("m", "ei"),
|
| 207 |
+
"men": ("m", "en"),
|
| 208 |
+
"meng": ("m", "eng"),
|
| 209 |
+
"mi": ("m", "i"),
|
| 210 |
+
"mian": ("m", "ian"),
|
| 211 |
+
"miao": ("m", "iao"),
|
| 212 |
+
"mie": ("m", "ie"),
|
| 213 |
+
"min": ("m", "in"),
|
| 214 |
+
"ming": ("m", "ing"),
|
| 215 |
+
"miu": ("m", "iou"),
|
| 216 |
+
"mo": ("m", "o"),
|
| 217 |
+
"mou": ("m", "ou"),
|
| 218 |
+
"mu": ("m", "u"),
|
| 219 |
+
"na": ("n", "a"),
|
| 220 |
+
"nai": ("n", "ai"),
|
| 221 |
+
"nan": ("n", "an"),
|
| 222 |
+
"nang": ("n", "ang"),
|
| 223 |
+
"nao": ("n", "ao"),
|
| 224 |
+
"ne": ("n", "e"),
|
| 225 |
+
"nei": ("n", "ei"),
|
| 226 |
+
"nen": ("n", "en"),
|
| 227 |
+
"neng": ("n", "eng"),
|
| 228 |
+
"ni": ("n", "i"),
|
| 229 |
+
"nia": ("n", "ia"),
|
| 230 |
+
"nian": ("n", "ian"),
|
| 231 |
+
"niang": ("n", "iang"),
|
| 232 |
+
"niao": ("n", "iao"),
|
| 233 |
+
"nie": ("n", "ie"),
|
| 234 |
+
"nin": ("n", "in"),
|
| 235 |
+
"ning": ("n", "ing"),
|
| 236 |
+
"niu": ("n", "iou"),
|
| 237 |
+
"nong": ("n", "ong"),
|
| 238 |
+
"nou": ("n", "ou"),
|
| 239 |
+
"nu": ("n", "u"),
|
| 240 |
+
"nv": ("n", "v"),
|
| 241 |
+
"nuan": ("n", "uan"),
|
| 242 |
+
"nve": ("n", "ve"),
|
| 243 |
+
"nue": ("n", "ve"),
|
| 244 |
+
"nuo": ("n", "uo"),
|
| 245 |
+
"o": ("^", "o"),
|
| 246 |
+
"ou": ("^", "ou"),
|
| 247 |
+
"pa": ("p", "a"),
|
| 248 |
+
"pai": ("p", "ai"),
|
| 249 |
+
"pan": ("p", "an"),
|
| 250 |
+
"pang": ("p", "ang"),
|
| 251 |
+
"pao": ("p", "ao"),
|
| 252 |
+
"pe": ("p", "e"),
|
| 253 |
+
"pei": ("p", "ei"),
|
| 254 |
+
"pen": ("p", "en"),
|
| 255 |
+
"peng": ("p", "eng"),
|
| 256 |
+
"pi": ("p", "i"),
|
| 257 |
+
"pian": ("p", "ian"),
|
| 258 |
+
"piao": ("p", "iao"),
|
| 259 |
+
"pie": ("p", "ie"),
|
| 260 |
+
"pin": ("p", "in"),
|
| 261 |
+
"ping": ("p", "ing"),
|
| 262 |
+
"po": ("p", "o"),
|
| 263 |
+
"pou": ("p", "ou"),
|
| 264 |
+
"pu": ("p", "u"),
|
| 265 |
+
"qi": ("q", "i"),
|
| 266 |
+
"qia": ("q", "ia"),
|
| 267 |
+
"qian": ("q", "ian"),
|
| 268 |
+
"qiang": ("q", "iang"),
|
| 269 |
+
"qiao": ("q", "iao"),
|
| 270 |
+
"qie": ("q", "ie"),
|
| 271 |
+
"qin": ("q", "in"),
|
| 272 |
+
"qing": ("q", "ing"),
|
| 273 |
+
"qiong": ("q", "iong"),
|
| 274 |
+
"qiu": ("q", "iou"),
|
| 275 |
+
"qu": ("q", "v"),
|
| 276 |
+
"quan": ("q", "van"),
|
| 277 |
+
"que": ("q", "ve"),
|
| 278 |
+
"qun": ("q", "vn"),
|
| 279 |
+
"ran": ("r", "an"),
|
| 280 |
+
"rang": ("r", "ang"),
|
| 281 |
+
"rao": ("r", "ao"),
|
| 282 |
+
"re": ("r", "e"),
|
| 283 |
+
"ren": ("r", "en"),
|
| 284 |
+
"reng": ("r", "eng"),
|
| 285 |
+
"ri": ("r", "iii"),
|
| 286 |
+
"rong": ("r", "ong"),
|
| 287 |
+
"rou": ("r", "ou"),
|
| 288 |
+
"ru": ("r", "u"),
|
| 289 |
+
"rua": ("r", "ua"),
|
| 290 |
+
"ruan": ("r", "uan"),
|
| 291 |
+
"rui": ("r", "uei"),
|
| 292 |
+
"run": ("r", "uen"),
|
| 293 |
+
"ruo": ("r", "uo"),
|
| 294 |
+
"sa": ("s", "a"),
|
| 295 |
+
"sai": ("s", "ai"),
|
| 296 |
+
"san": ("s", "an"),
|
| 297 |
+
"sang": ("s", "ang"),
|
| 298 |
+
"sao": ("s", "ao"),
|
| 299 |
+
"se": ("s", "e"),
|
| 300 |
+
"sen": ("s", "en"),
|
| 301 |
+
"seng": ("s", "eng"),
|
| 302 |
+
"sha": ("sh", "a"),
|
| 303 |
+
"shai": ("sh", "ai"),
|
| 304 |
+
"shan": ("sh", "an"),
|
| 305 |
+
"shang": ("sh", "ang"),
|
| 306 |
+
"shao": ("sh", "ao"),
|
| 307 |
+
"she": ("sh", "e"),
|
| 308 |
+
"shei": ("sh", "ei"),
|
| 309 |
+
"shen": ("sh", "en"),
|
| 310 |
+
"sheng": ("sh", "eng"),
|
| 311 |
+
"shi": ("sh", "iii"),
|
| 312 |
+
"shou": ("sh", "ou"),
|
| 313 |
+
"shu": ("sh", "u"),
|
| 314 |
+
"shua": ("sh", "ua"),
|
| 315 |
+
"shuai": ("sh", "uai"),
|
| 316 |
+
"shuan": ("sh", "uan"),
|
| 317 |
+
"shuang": ("sh", "uang"),
|
| 318 |
+
"shui": ("sh", "uei"),
|
| 319 |
+
"shun": ("sh", "uen"),
|
| 320 |
+
"shuo": ("sh", "uo"),
|
| 321 |
+
"si": ("s", "ii"),
|
| 322 |
+
"song": ("s", "ong"),
|
| 323 |
+
"sou": ("s", "ou"),
|
| 324 |
+
"su": ("s", "u"),
|
| 325 |
+
"suan": ("s", "uan"),
|
| 326 |
+
"sui": ("s", "uei"),
|
| 327 |
+
"sun": ("s", "uen"),
|
| 328 |
+
"suo": ("s", "uo"),
|
| 329 |
+
"ta": ("t", "a"),
|
| 330 |
+
"tai": ("t", "ai"),
|
| 331 |
+
"tan": ("t", "an"),
|
| 332 |
+
"tang": ("t", "ang"),
|
| 333 |
+
"tao": ("t", "ao"),
|
| 334 |
+
"te": ("t", "e"),
|
| 335 |
+
"tei": ("t", "ei"),
|
| 336 |
+
"teng": ("t", "eng"),
|
| 337 |
+
"ti": ("t", "i"),
|
| 338 |
+
"tian": ("t", "ian"),
|
| 339 |
+
"tiao": ("t", "iao"),
|
| 340 |
+
"tie": ("t", "ie"),
|
| 341 |
+
"ting": ("t", "ing"),
|
| 342 |
+
"tong": ("t", "ong"),
|
| 343 |
+
"tou": ("t", "ou"),
|
| 344 |
+
"tu": ("t", "u"),
|
| 345 |
+
"tuan": ("t", "uan"),
|
| 346 |
+
"tui": ("t", "uei"),
|
| 347 |
+
"tun": ("t", "uen"),
|
| 348 |
+
"tuo": ("t", "uo"),
|
| 349 |
+
"wa": ("^", "ua"),
|
| 350 |
+
"wai": ("^", "uai"),
|
| 351 |
+
"wan": ("^", "uan"),
|
| 352 |
+
"wang": ("^", "uang"),
|
| 353 |
+
"wei": ("^", "uei"),
|
| 354 |
+
"wen": ("^", "uen"),
|
| 355 |
+
"weng": ("^", "ueng"),
|
| 356 |
+
"wo": ("^", "uo"),
|
| 357 |
+
"wu": ("^", "u"),
|
| 358 |
+
"xi": ("x", "i"),
|
| 359 |
+
"xia": ("x", "ia"),
|
| 360 |
+
"xian": ("x", "ian"),
|
| 361 |
+
"xiang": ("x", "iang"),
|
| 362 |
+
"xiao": ("x", "iao"),
|
| 363 |
+
"xie": ("x", "ie"),
|
| 364 |
+
"xin": ("x", "in"),
|
| 365 |
+
"xing": ("x", "ing"),
|
| 366 |
+
"xiong": ("x", "iong"),
|
| 367 |
+
"xiu": ("x", "iou"),
|
| 368 |
+
"xu": ("x", "v"),
|
| 369 |
+
"xuan": ("x", "van"),
|
| 370 |
+
"xue": ("x", "ve"),
|
| 371 |
+
"xun": ("x", "vn"),
|
| 372 |
+
"ya": ("^", "ia"),
|
| 373 |
+
"yan": ("^", "ian"),
|
| 374 |
+
"yang": ("^", "iang"),
|
| 375 |
+
"yao": ("^", "iao"),
|
| 376 |
+
"ye": ("^", "ie"),
|
| 377 |
+
"yi": ("^", "i"),
|
| 378 |
+
"yin": ("^", "in"),
|
| 379 |
+
"ying": ("^", "ing"),
|
| 380 |
+
"yo": ("^", "iou"),
|
| 381 |
+
"yong": ("^", "iong"),
|
| 382 |
+
"you": ("^", "iou"),
|
| 383 |
+
"yu": ("^", "v"),
|
| 384 |
+
"yuan": ("^", "van"),
|
| 385 |
+
"yue": ("^", "ve"),
|
| 386 |
+
"yun": ("^", "vn"),
|
| 387 |
+
"za": ("z", "a"),
|
| 388 |
+
"zai": ("z", "ai"),
|
| 389 |
+
"zan": ("z", "an"),
|
| 390 |
+
"zang": ("z", "ang"),
|
| 391 |
+
"zao": ("z", "ao"),
|
| 392 |
+
"ze": ("z", "e"),
|
| 393 |
+
"zei": ("z", "ei"),
|
| 394 |
+
"zen": ("z", "en"),
|
| 395 |
+
"zeng": ("z", "eng"),
|
| 396 |
+
"zha": ("zh", "a"),
|
| 397 |
+
"zhai": ("zh", "ai"),
|
| 398 |
+
"zhan": ("zh", "an"),
|
| 399 |
+
"zhang": ("zh", "ang"),
|
| 400 |
+
"zhao": ("zh", "ao"),
|
| 401 |
+
"zhe": ("zh", "e"),
|
| 402 |
+
"zhei": ("zh", "ei"),
|
| 403 |
+
"zhen": ("zh", "en"),
|
| 404 |
+
"zheng": ("zh", "eng"),
|
| 405 |
+
"zhi": ("zh", "iii"),
|
| 406 |
+
"zhong": ("zh", "ong"),
|
| 407 |
+
"zhou": ("zh", "ou"),
|
| 408 |
+
"zhu": ("zh", "u"),
|
| 409 |
+
"zhua": ("zh", "ua"),
|
| 410 |
+
"zhuai": ("zh", "uai"),
|
| 411 |
+
"zhuan": ("zh", "uan"),
|
| 412 |
+
"zhuang": ("zh", "uang"),
|
| 413 |
+
"zhui": ("zh", "uei"),
|
| 414 |
+
"zhun": ("zh", "uen"),
|
| 415 |
+
"zhuo": ("zh", "uo"),
|
| 416 |
+
"zi": ("z", "ii"),
|
| 417 |
+
"zong": ("z", "ong"),
|
| 418 |
+
"zou": ("z", "ou"),
|
| 419 |
+
"zu": ("z", "u"),
|
| 420 |
+
"zuan": ("z", "uan"),
|
| 421 |
+
"zui": ("z", "uei"),
|
| 422 |
+
"zun": ("z", "uen"),
|
| 423 |
+
"zuo": ("z", "uo"),
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def gen_vocabs():
    """Dump the pinyin token vocabulary to ``./vocab.yaml``.

    One ``<syllable+tone>`` token is emitted per (pinyin syllable, tone)
    pair, with tones 1-5, e.g. ``<ma1>`` ... ``<ma5>``.
    """
    import yaml

    vocab = [f"<{c}{i}>" for c in pinyin_dict for i in range(1, 6)]
    # Use a context manager so the file handle is closed even if dump()
    # raises (the original left the handle open).
    with open('./vocab.yaml', 'w') as f:
        yaml.dump(vocab, f)
|
SongBloom/g2p/pinyin/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (9.71 kB). View file
|
|
|
SongBloom/g2p/pinyin/__pycache__/pinyin.cpython-39.pyc
ADDED
|
Binary file (3.25 kB). View file
|
|
|
SongBloom/g2p/pinyin/__pycache__/symbols.cpython-39.pyc
ADDED
|
Binary file (673 Bytes). View file
|
|
|
SongBloom/g2p/pinyin/pinyin.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
from pypinyin import Style
|
| 4 |
+
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
|
| 5 |
+
from pypinyin.converter import DefaultConverter
|
| 6 |
+
from pypinyin.core import Pinyin
|
| 7 |
+
|
| 8 |
+
from . import pinyin_dict
|
| 9 |
+
import torch
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter; the mixin makes neutral tones carry an explicit
    "5" suffix (matching the ``neutral_tone_with_five=True`` usage below)."""
    pass
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def is_chinese(uchar):
    """Return True if *uchar* is a CJK Unified Ideograph (U+4E00..U+9FA5)."""
    # Chained comparison instead of an if/else returning boolean literals.
    return '\u4e00' <= uchar <= '\u9fa5'
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def clean_chinese(text: str):
    """Keep only Chinese characters from *text*, collapsing each run of
    non-Chinese characters into a single ',' separator.

    Leading/trailing commas are stripped from the result.
    """
    kept = []
    for ch in text.strip():
        if is_chinese(ch):
            kept.append(ch)
        # Separator only after at least two kept chars and when the last
        # kept char is Chinese — avoids consecutive commas. (The `> 1`
        # check means no comma ever follows a single leading character;
        # preserved from the original.)
        elif len(kept) > 1 and is_chinese(kept[-1]):
            kept.append(',')
    return ''.join(kept).strip(',')
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class G2P_PinYin():
    """Grapheme-to-phoneme frontend converting Chinese text to pinyin tokens.

    Runs of Chinese characters are converted to tone3-style pinyin
    (e.g. ``ni3``) wrapped in angle brackets (``<ni3>``); every other
    character is passed through unchanged.
    """

    def __init__(self):
        super(G2P_PinYin, self).__init__()
        # Converter that spells neutral tones with an explicit "5".
        self.pinyin_parser = Pinyin(MyConverter())

    def get_phoneme4pinyin(self, pinyins):
        """Split each pinyin (syllable + tone digit) into (initial, final+tone).

        Args:
            pinyins: iterable of tone3 pinyin strings, e.g. ``["ni3", "hao3"]``.
        Returns:
            (phonemes, count_phone): flat phoneme list, plus per recognised
            syllable the number of phonemes it contributed (always 2).
            Syllables not found in ``pinyin_dict`` are silently skipped.
        """
        result = []
        count_phone = []
        for pinyin in pinyins:
            # Last char is the tone digit; the rest is the toneless syllable.
            if pinyin[:-1] in pinyin_dict:
                tone = pinyin[-1]
                initial, final = pinyin_dict[pinyin[:-1]]
                result += [initial, final + tone]
                count_phone.append(2)
        return result, count_phone

    def chinese_to_phonemes(self, text):
        """Replace every maximal run of Chinese characters in *text* with
        space-separated ``<pinyin>`` tokens; other characters are kept as-is."""
        pieces = []
        run = []  # current consecutive run of Chinese characters

        def flush():
            # Convert the buffered run to " <py1> <py2> ... " and clear it.
            if run:
                pinyins = self.correct_pinyin_tone3(''.join(run))
                pieces.append(' ' + ' '.join(f"<{p}>" for p in pinyins) + ' ')
                run.clear()

        for ch in text:  # `ch`, not `chr` — avoid shadowing the builtin
            if is_chinese(ch):
                run.append(ch)
            else:
                flush()
                pieces.append(ch)
        flush()
        return ''.join(pieces)

    def correct_pinyin_tone3(self, text):
        """Return tone3 pinyin for *text*, applying third-tone sandhi:
        a 3-tone syllable immediately followed by another 3-tone syllable
        is changed to tone 2."""
        pinyin_list = [
            p[0]
            for p in self.pinyin_parser.pinyin(text,
                                               style=Style.TONE3,
                                               strict=False,
                                               neutral_tone_with_five=True)
        ]
        if len(pinyin_list) >= 2:
            for i in range(1, len(pinyin_list)):
                try:
                    if re.findall(r'\d', pinyin_list[i - 1])[0] == '3' and \
                            re.findall(r'\d', pinyin_list[i])[0] == '3':
                        pinyin_list[i - 1] = pinyin_list[i - 1].replace(
                            '3', '2')
                except IndexError:
                    # Entry without a tone digit (e.g. punctuation): skip.
                    pass
        return pinyin_list

    def __call__(self, text):
        """Convenience alias for :meth:`chinese_to_phonemes`."""
        return self.chinese_to_phonemes(text)
|
SongBloom/g2p/pinyin/symbols.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prosodic pause / boundary markers.
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

# Pinyin initials ("^" marks a null initial).
_initials = "^ b c ch d f g h j k l m n p q r s sh t x z zh".split()

# Tone digits (5 = neutral tone).
_tones = ["1", "2", "3", "4", "5"]

# Pinyin finals ("ii"/"iii" are the apical vowels of zi/zhi; "v" = ü).
_finals = (
    "a ai an ang ao e ei en eng er "
    "i ia ian iang iao ie ii iii in ing "
    "iong iou o ong ou u ua uai uan uang "
    "uei uen ueng uo v van ve vn"
).split()

# Full symbol inventory: pauses, bare initials, and every final+tone pair.
symbols = _pause + _initials + [f + t for f in _finals for t in _tones]
|
SongBloom/models/__pycache__/transformer.cpython-39.pyc
ADDED
|
Binary file (21.9 kB). View file
|
|
|
SongBloom/models/base/__pycache__/sample.cpython-39.pyc
ADDED
|
Binary file (2.5 kB). View file
|
|
|
SongBloom/models/base/__pycache__/utils.cpython-39.pyc
ADDED
|
Binary file (2.52 kB). View file
|
|
|
SongBloom/models/base/sample.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
    """torch.multinomial generalised to any number of leading dimensions;
    candidates live on the last dimension.

    Args:
        input (torch.Tensor): Probabilities, candidates on the last dim.
        num_samples (int): Number of indices to draw per distribution.
        replacement (bool): Draw with replacement when True.
    Keyword args:
        generator (torch.Generator): Optional PRNG for reproducible draws.
    Returns:
        torch.Tensor: Same leading shape as ``input``, with ``num_samples``
            sampled indices on the last dimension.
    """
    # Flatten all leading dims into one batch dim, sample, then restore.
    flat = input.reshape(-1, input.shape[-1])
    drawn = torch.multinomial(flat, num_samples=num_samples,
                              replacement=replacement, generator=generator)
    return drawn.reshape(*input.shape[:-1], -1)
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
    """Sample next token from the top K values along the last dimension.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on
            the last dimension.
        k (int): The k in "top-k".
    Returns:
        torch.Tensor: Sampled token indices (last dimension of size 1).
    """
    top_k_value, _ = torch.topk(probs, k, dim=-1)
    # Smallest value admitted into the top-k (ties with it are kept too).
    min_value_top_k = top_k_value[..., [-1]]
    # Work on a new tensor so the caller's `probs` is not mutated (the
    # original zeroed and renormalised the input in place).
    filtered = probs * (probs >= min_value_top_k).float()
    filtered = filtered / filtered.sum(dim=-1, keepdim=True)
    next_token = multinomial(filtered, num_samples=1)
    return next_token
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
    """Sample next token from the top-P (nucleus) probability mass along the
    last dimension of the input probs tensor.

    Args:
        probs (torch.Tensor): Input probabilities with token candidates on
            the last dimension.
        p (int): The p in "top-p".
    Returns:
        torch.Tensor: Sampled token indices (last dimension of size 1).
    """
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # Keep a token iff the cumulative mass *before* it is still <= p.
    keep = ~(cumulative - sorted_probs > p)
    sorted_probs *= keep.float()
    sorted_probs.div_(sorted_probs.sum(dim=-1, keepdim=True))
    picked = multinomial(sorted_probs, num_samples=1)
    # Map positions in the sorted order back to original token indices.
    return torch.gather(sorted_idx, -1, picked)
|
SongBloom/models/base/utils.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import typing as tp
|
| 4 |
+
|
| 5 |
+
def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
    """Convert a tensor of sequence lengths to a boolean padding mask
    (useful when working on padded sequences).
    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]

    Args:
        lengths (torch.Tensor): 1-D tensor of sequence lengths.
        max_len (int): Optionally force the mask width. Defaults to None
            (use the batch max). NOTE: a falsy value such as 0 is treated
            the same as None — preserved from the original truthiness check.
    Returns:
        torch.Tensor: Bool mask, True on valid positions, False on padding.
    """
    assert lengths.dim() == 1, "Length shape should be 1 dimensional."
    final_length = lengths.max().item() if not max_len else max_len
    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
    # Build the position row directly on the target device instead of
    # allocating on CPU and copying over (saves a host->device transfer).
    positions = torch.arange(final_length, device=lengths.device)
    return positions[None, :] < lengths[:, None]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
|
| 22 |
+
dtype: torch.dtype = torch.float32) -> torch.Tensor:
|
| 23 |
+
"""Create sinusoidal positional embedding, with shape `[B, T, C]`.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
positions (torch.Tensor): LongTensor of positions.
|
| 27 |
+
dim (int): Dimension of the embedding.
|
| 28 |
+
max_period (float): Maximum period of the cosine/sine functions.
|
| 29 |
+
dtype (torch.dtype or str): dtype to use to generate the embedding.
|
| 30 |
+
Returns:
|
| 31 |
+
torch.Tensor: Sinusoidal positional embedding.
|
| 32 |
+
"""
|
| 33 |
+
# We aim for BTC format
|
| 34 |
+
assert dim % 2 == 0
|
| 35 |
+
half_dim = dim // 2
|
| 36 |
+
positions = positions.to(dtype)
|
| 37 |
+
adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
|
| 38 |
+
max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype) # avoid sync point
|
| 39 |
+
phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
|
| 40 |
+
# phase = phase.to(torch.bfloat16)
|
| 41 |
+
return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
    """Create a normalization module for a transformer encoder layer.

    Args:
        norm_type (str): Normalization method; only 'layer_norm' is supported.
        dim (int): Dimension of the normalized layer.
        **kwargs (dict): Additional parameters forwarded to the norm layer.
    Returns:
        nn.Module: Normalization module.
    Raises:
        ValueError: If *norm_type* is not recognised.
    """
    # Guard clause: reject unknown types up front.
    if norm_type != 'layer_norm':
        raise ValueError(f"Unknown norm type: {norm_type}")
    return nn.LayerNorm(dim, eps=1e-5, **kwargs)
|
SongBloom/models/musicgen/__init__.py
ADDED
|
File without changes
|