From 2d0b62225ce6e5360f633a168f33913cca951e74 Mon Sep 17 00:00:00 2001 From: "James B. Pollack" Date: Fri, 15 Feb 2019 13:14:08 -0600 Subject: [PATCH] Update encoder.py to work on windows This fixes https://github.com/openai/gpt-2/issues/26 ``` File "C:\Users\James Pollack\Desktop\gpt-2\src\encoder.py", line 112, in get_encoder bpe_data = f.read() File "C:\Anaconda\envs\gpt-2\lib\encodings\cp1252.py", line 23, in decode return codecs.charmap_decode(input,self.errors,decoding_table)[0] UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 66951: character maps to <undefined> ``` --- src/encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoder.py b/src/encoder.py index 5068cc6..03e0ce2 100644 --- a/src/encoder.py +++ b/src/encoder.py @@ -108,7 +108,7 @@ class Encoder: def get_encoder(model_name): with open(os.path.join('models', model_name, 'encoder.json'), 'r') as f: encoder = json.load(f) - with open(os.path.join('models', model_name, 'vocab.bpe'), 'r') as f: + with open(os.path.join('models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f: bpe_data = f.read() bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] return Encoder(