fix #1 - UTF-8 decode problems when using en_US.

Apparently the en_US dictionary in Ubuntu cannot be decoded as UTF-8,
but works fine with ISO-8859-1. As a workaround, this adds another
configuration and command-line option: encoding. If left empty, the
encoding is autodetected from the environment; otherwise it should be
the name of an encoding that Python should use to read the dictionary.
This commit is contained in:
Fredrik Eriksson 2020-05-02 10:31:03 +02:00
parent b34c41281a
commit 6795454af1
Signed by: feffe
GPG Key ID: 18524638BE25530A
2 changed files with 11 additions and 2 deletions

View File

@ -22,6 +22,9 @@ def main():
parser.add_argument( parser.add_argument(
'--myspell-dir', '-i', '--myspell-dir', '-i',
help='Directory containing myspell dictionaries') help='Directory containing myspell dictionaries')
parser.add_argument(
'--encoding', '-e',
help="Character encoding of the directory")
parser.add_argument( parser.add_argument(
'--lang', '-l', '--lang', '-l',

View File

@ -4,6 +4,7 @@ import math
import os import os
import sys import sys
import subprocess import subprocess
import locale
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
import ConfigParser as configparser import ConfigParser as configparser
@ -36,6 +37,7 @@ def update_config(
word_min_char=2, word_min_char=2,
word_max_char=0, word_max_char=0,
unmunch_bin='', unmunch_bin='',
encoding='',
words=4, words=4,
capitalize='random', capitalize='random',
@ -65,6 +67,7 @@ def update_config(
set_if_defined(conf, 'dictionary', 'word_min_char', word_min_char) set_if_defined(conf, 'dictionary', 'word_min_char', word_min_char)
set_if_defined(conf, 'dictionary', 'word_max_char', word_max_char) set_if_defined(conf, 'dictionary', 'word_max_char', word_max_char)
set_if_defined(conf, 'dictionary', 'unmunch_bin', unmunch_bin) set_if_defined(conf, 'dictionary', 'unmunch_bin', unmunch_bin)
set_if_defined(conf, 'dictionary', 'encoding', encoding)
if not conf.has_section('passwords'): if not conf.has_section('passwords'):
conf.add_section('passwords') conf.add_section('passwords')
@ -93,6 +96,9 @@ def _read_dictionary(conf):
dict_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.dic'.format(conf.get('dictionary', 'lang'))) dict_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.dic'.format(conf.get('dictionary', 'lang')))
aff_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.aff'.format(conf.get('dictionary', 'lang'))) aff_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.aff'.format(conf.get('dictionary', 'lang')))
unmunch_bin = conf.get('dictionary', 'unmunch_bin') unmunch_bin = conf.get('dictionary', 'unmunch_bin')
encoding = conf.get('dictionary', 'encoding')
if not encoding:
encoding = locale.getpreferredencoding(False)
words = set() words = set()
chars = 0 chars = 0
if os.path.exists(aff_file) and unmunch_bin: if os.path.exists(aff_file) and unmunch_bin:
@ -106,7 +112,7 @@ def _read_dictionary(conf):
if proc.returncode != 0: if proc.returncode != 0:
raise DictReadError('Unmunching dictionaries failed') raise DictReadError('Unmunching dictionaries failed')
for word in out.splitlines(): for word in out.splitlines():
save = word.strip().decode('utf-8') save = word.strip().decode(encoding)
if not save: if not save:
continue continue
first_char = save[:1] first_char = save[:1]
@ -122,7 +128,7 @@ def _read_dictionary(conf):
words.add(save) words.add(save)
chars += len(save) chars += len(save)
else: else:
with open(dict_file, 'r') as f: with open(dict_file, encoding=encoding, mode='r') as f:
for line in f: for line in f:
if not line: if not line:
continue continue