fix #1 - UTF-8 decode problems when using en_US.
Apparently the en_US dictionary shipped with Ubuntu cannot be decoded as UTF-8, but it decodes fine as ISO-8859-1. As a workaround, this commit adds a new setting, `encoding`, available both in the configuration file and as a command-line option. If it is left empty, the encoding is detected from the environment (the locale's preferred encoding, via `locale.getpreferredencoding(False)`); otherwise it should be the name of an encoding that Python should use to read the dictionary.
This commit is contained in:
parent
b34c41281a
commit
6795454af1
@ -22,6 +22,9 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--myspell-dir', '-i',
|
'--myspell-dir', '-i',
|
||||||
help='Directory containing myspell dictionaries')
|
help='Directory containing myspell dictionaries')
|
||||||
|
parser.add_argument(
|
||||||
|
'--encoding', '-e',
|
||||||
|
help="Character encoding of the directory")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--lang', '-l',
|
'--lang', '-l',
|
||||||
|
@ -4,6 +4,7 @@ import math
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import locale
|
||||||
|
|
||||||
if sys.version_info[0] < 3:
|
if sys.version_info[0] < 3:
|
||||||
import ConfigParser as configparser
|
import ConfigParser as configparser
|
||||||
@ -36,6 +37,7 @@ def update_config(
|
|||||||
word_min_char=2,
|
word_min_char=2,
|
||||||
word_max_char=0,
|
word_max_char=0,
|
||||||
unmunch_bin='',
|
unmunch_bin='',
|
||||||
|
encoding='',
|
||||||
|
|
||||||
words=4,
|
words=4,
|
||||||
capitalize='random',
|
capitalize='random',
|
||||||
@ -65,6 +67,7 @@ def update_config(
|
|||||||
set_if_defined(conf, 'dictionary', 'word_min_char', word_min_char)
|
set_if_defined(conf, 'dictionary', 'word_min_char', word_min_char)
|
||||||
set_if_defined(conf, 'dictionary', 'word_max_char', word_max_char)
|
set_if_defined(conf, 'dictionary', 'word_max_char', word_max_char)
|
||||||
set_if_defined(conf, 'dictionary', 'unmunch_bin', unmunch_bin)
|
set_if_defined(conf, 'dictionary', 'unmunch_bin', unmunch_bin)
|
||||||
|
set_if_defined(conf, 'dictionary', 'encoding', encoding)
|
||||||
|
|
||||||
if not conf.has_section('passwords'):
|
if not conf.has_section('passwords'):
|
||||||
conf.add_section('passwords')
|
conf.add_section('passwords')
|
||||||
@ -93,6 +96,9 @@ def _read_dictionary(conf):
|
|||||||
dict_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.dic'.format(conf.get('dictionary', 'lang')))
|
dict_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.dic'.format(conf.get('dictionary', 'lang')))
|
||||||
aff_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.aff'.format(conf.get('dictionary', 'lang')))
|
aff_file = os.path.join(conf.get('dictionary', 'myspell_dir'), '{}.aff'.format(conf.get('dictionary', 'lang')))
|
||||||
unmunch_bin = conf.get('dictionary', 'unmunch_bin')
|
unmunch_bin = conf.get('dictionary', 'unmunch_bin')
|
||||||
|
encoding = conf.get('dictionary', 'encoding')
|
||||||
|
if not encoding:
|
||||||
|
encoding = locale.getpreferredencoding(False)
|
||||||
words = set()
|
words = set()
|
||||||
chars = 0
|
chars = 0
|
||||||
if os.path.exists(aff_file) and unmunch_bin:
|
if os.path.exists(aff_file) and unmunch_bin:
|
||||||
@ -106,7 +112,7 @@ def _read_dictionary(conf):
|
|||||||
if proc.returncode != 0:
|
if proc.returncode != 0:
|
||||||
raise DictReadError('Unmunching dictionaries failed')
|
raise DictReadError('Unmunching dictionaries failed')
|
||||||
for word in out.splitlines():
|
for word in out.splitlines():
|
||||||
save = word.strip().decode('utf-8')
|
save = word.strip().decode(encoding)
|
||||||
if not save:
|
if not save:
|
||||||
continue
|
continue
|
||||||
first_char = save[:1]
|
first_char = save[:1]
|
||||||
@ -122,7 +128,7 @@ def _read_dictionary(conf):
|
|||||||
words.add(save)
|
words.add(save)
|
||||||
chars += len(save)
|
chars += len(save)
|
||||||
else:
|
else:
|
||||||
with open(dict_file, 'r') as f:
|
with open(dict_file, encoding=encoding, mode='r') as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
Loading…
Reference in New Issue
Block a user