allow hyphens and single quotes between words
This commit is contained in:
@@ -245,9 +245,7 @@ class Tokenizer:
|
|||||||
|
|
||||||
keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
|
keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
|
||||||
"""
|
"""
|
||||||
|
symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
|
||||||
result = set()
|
|
||||||
symbols = list("'\"#()*+-/:;<=>@[\\]^_`{|}~「」『』")
|
|
||||||
symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
|
symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
|
||||||
|
|
||||||
# symbols that may be a single token or multiple tokens depending on the tokenizer.
|
# symbols that may be a single token or multiple tokens depending on the tokenizer.
|
||||||
@@ -257,6 +255,8 @@ class Tokenizer:
|
|||||||
miscellaneous = set("♩♪♫♬♭♮♯")
|
miscellaneous = set("♩♪♫♬♭♮♯")
|
||||||
assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
|
assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
|
||||||
|
|
||||||
|
# allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
|
||||||
|
result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
|
||||||
for symbol in symbols + list(miscellaneous):
|
for symbol in symbols + list(miscellaneous):
|
||||||
for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
|
for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
|
||||||
if len(tokens) == 1 or symbol in miscellaneous:
|
if len(tokens) == 1 or symbol in miscellaneous:
|
||||||
|
Reference in New Issue
Block a user