Update encoder.py to work on Windows

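This fixes https://github.com/openai/gpt-2/issues/26: on Windows, `open()` without an explicit encoding falls back to the cp1252 codec, which cannot decode `vocab.bpe`. The file is now opened with `encoding="utf-8"`.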

```
  File "C:\Users\James Pollack\Desktop\gpt-2\src\encoder.py", line 112, in get_encoder
    bpe_data = f.read()
  File "C:\Anaconda\envs\gpt-2\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 66951: character maps to <undefined>
```
This commit is contained in:
James B. Pollack
2019-02-15 13:14:08 -06:00
committed by Jeff Wu
parent ebd5894d22
commit 2d0b62225c

View File

@@ -108,7 +108,7 @@ class Encoder:
def get_encoder(model_name): def get_encoder(model_name):
with open(os.path.join('models', model_name, 'encoder.json'), 'r') as f: with open(os.path.join('models', model_name, 'encoder.json'), 'r') as f:
encoder = json.load(f) encoder = json.load(f)
with open(os.path.join('models', model_name, 'vocab.bpe'), 'r') as f: with open(os.path.join('models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
bpe_data = f.read() bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
return Encoder( return Encoder(