manuskript/manuskript/functions/spellchecker.py
TheJackiMonster 41e59d71c1
Fix newline changes to read universally
Signed-off-by: TheJackiMonster <thejackimonster@gmail.com>
2023-12-12 16:18:43 +01:00

701 lines
21 KiB
Python

#!/usr/bin/env python
# --!-- coding: utf8 --!--
import os, gzip, json, glob, re, string
from PyQt5.QtCore import QLocale
from collections import OrderedDict
from manuskript.functions import writablePath
try:
import enchant
except ImportError:
enchant = None
try:
import spellchecker as pyspellchecker
except ImportError:
pyspellchecker = None
# Oldest symspellpy release this module works with
# (SymSpellDictionary.removeWord relies on delete_dictionary_entry).
SYMSPELLPY_MIN_VERSION = "6.3.8"


def _version_key(version):
    """Return the numeric components of *version* as a comparable tuple.

    Only runs of digits are kept, so "6.3.10" becomes (6, 3, 10) and compares
    greater than (6, 3, 8), which a plain string comparison would get wrong.
    """
    return tuple(int(part) for part in re.findall(r"\d+", str(version)))


try:
    import symspellpy
    # The comparison is done locally instead of through
    # distutils.version.LooseVersion: distutils no longer ships with
    # Python 3.12+, and importing it here made the ImportError handler below
    # disable symspellpy even when a suitable version was installed.
    if _version_key(symspellpy.__version__) < _version_key(SYMSPELLPY_MIN_VERSION):
        symspellpy = None
except ImportError:
    symspellpy = None
# True when the older "language_check" package is used instead of
# "language_tool_python"; the two expose differently named attributes.
use_language_check = False
try:
    try:
        import language_tool_python as languagetool
    except Exception:
        # Preferred package unavailable (or failed to import): try the
        # older one. The flag is only set once that import succeeded.
        import language_check as languagetool
        use_language_check = True
except Exception:
    # Neither package could be imported; LanguageTool support is disabled.
    # "Exception" rather than a bare except so that KeyboardInterrupt and
    # SystemExit are not swallowed during import.
    languagetool = None
class Spellchecker:
    """Registry and factory for the spellchecking backends.

    Backends are classes providing the static interface used below
    (getLibraryName, getLibraryRequirement, getLibraryURL, isInstalled,
    availableDictionaries, getDefaultDictionary) and a constructor taking a
    dictionary name. They are consulted in registration order.
    """

    # Instantiated dictionaries, keyed by "<library>:<dictionary>"
    dictionaries = {}
    # In order of priority
    implementations = []

    def __init__(self):
        pass

    @staticmethod
    def registerImplementation(impl):
        """Add the backend class *impl* after the already registered ones."""
        Spellchecker.implementations.append(impl)

    @staticmethod
    def isInstalled():
        """Return True if at least one registered backend is installed."""
        return any(impl.isInstalled() for impl in Spellchecker.implementations)

    @staticmethod
    def supportedLibraries():
        """Return an OrderedDict mapping each library name to its requirement."""
        return OrderedDict(
            (impl.getLibraryName(), impl.getLibraryRequirement())
            for impl in Spellchecker.implementations
        )

    @staticmethod
    def availableLibraries():
        """Return the names of the installed libraries, in priority order."""
        return [
            impl.getLibraryName()
            for impl in Spellchecker.implementations
            if impl.isInstalled()
        ]

    @staticmethod
    def availableDictionaries():
        """Return an OrderedDict: installed library name -> its dictionaries."""
        return OrderedDict(
            (impl.getLibraryName(), impl.availableDictionaries())
            for impl in Spellchecker.implementations
            if impl.isInstalled()
        )

    @staticmethod
    def normalizeDictName(lib, dictionary):
        """Return the qualified "<lib>:<dictionary>" name."""
        return "{}:{}".format(lib, dictionary)

    @staticmethod
    def getDefaultDictionary():
        """Return the qualified default dictionary of the first backend that
        reports one, or None if no backend does."""
        for impl in Spellchecker.implementations:
            default = impl.getDefaultDictionary()
            if default:
                return Spellchecker.normalizeDictName(impl.getLibraryName(), default)
        return None

    @staticmethod
    def getLibraryURL(lib=None):
        """Return the URL of library *lib* (None if unknown), or a dict of all
        library URLs when *lib* is not given."""
        urls = {
            impl.getLibraryName(): impl.getLibraryURL()
            for impl in Spellchecker.implementations
        }
        if lib:
            return urls.get(lib, None)
        return urls

    @staticmethod
    def getDictionary(dictionary):
        """Return the (cached) dictionary object for *dictionary*.

        *dictionary* is either a qualified "<library>:<name>" string or a
        bare name, which is attributed to the first registered backend. A
        false value selects the default dictionary. Returns None when no
        dictionary can be provided.
        """
        if not dictionary:
            dictionary = Spellchecker.getDefaultDictionary()
            if not dictionary:
                return None

        values = dictionary.split(":", 1)
        if len(values) == 1:
            if not Spellchecker.implementations:
                # No backend to attribute a bare name to.
                return None
            lib = Spellchecker.implementations[0].getLibraryName()
            name = dictionary
            dictionary = Spellchecker.normalizeDictName(lib, name)
        else:
            lib, name = values

        try:
            d = Spellchecker.dictionaries.get(dictionary, None)
            if d is None:
                for impl in Spellchecker.implementations:
                    if impl.isInstalled() and lib == impl.getLibraryName():
                        d = impl(name)
                        Spellchecker.dictionaries[dictionary] = d
                        break
            return d
        except Exception:
            # Creating a dictionary is best effort: callers get None, but
            # the reason is logged instead of being dropped silently.
            import logging
            logging.getLogger(__name__).warning(
                "Could not load dictionary %r", dictionary, exc_info=True
            )
        return None
class BasicMatch:
    """A flagged span of text, given as a half-open [start, end) index range."""

    def __init__(self, startIndex, endIndex):
        self.start, self.end = startIndex, endIndex
        # Issue category; 'misspelling' unless the creator overrides it
        self.locqualityissuetype = 'misspelling'
        # Suggested replacement strings
        self.replacements = []
        # Message describing the issue
        self.msg = ''

    def getWord(self, text):
        """Return the part of *text* covered by this match."""
        span = slice(self.start, self.end)
        return text[span]
class BasicDictionary:
    """Base class of the spellchecker backends.

    It keeps a custom word list (lower-cased words) in a gzip-compressed
    JSON file and implements word-by-word checking on top of isMisspelled()
    and getSuggestions(), which subclasses have to provide together with
    the static library-description methods.
    """

    def __init__(self, name):
        """Select dictionary *name* (or the default one if false) and load
        the custom word list from disk."""
        self._lang = name
        if not self._lang:
            self._lang = self.getDefaultDictionary()

        self._customDict = set()
        customPath = self.getCustomDictionaryPath()
        try:
            with gzip.open(customPath, 'rt', encoding='utf-8') as f:
                self._customDict = set(json.loads(f.read()))
            # Custom words are fed to the backend object by the subclasses
            # once they have created it; nothing of the kind exists yet at
            # this point, so it must not be touched here.
        except Exception:
            # If error loading the file, overwrite with empty dictionary
            self._customDict = set()
            self._saveCustomDict()

    @property
    def name(self):
        """Name of the selected dictionary."""
        return self._lang

    @staticmethod
    def getLibraryName():
        """Return the display name of the backing library (abstract)."""
        raise NotImplementedError

    @staticmethod
    def getLibraryRequirement():
        """Return a version requirement string, or None if there is none."""
        return None

    @staticmethod
    def getLibraryURL():
        """Return a URL describing the backing library (abstract)."""
        raise NotImplementedError

    @staticmethod
    def isInstalled():
        """Return whether the backing library can be used (abstract)."""
        raise NotImplementedError

    @staticmethod
    def getDefaultDictionary():
        """Return the default dictionary name (abstract)."""
        raise NotImplementedError

    @staticmethod
    def availableDictionaries():
        """Return the list of usable dictionary names (abstract)."""
        raise NotImplementedError

    def checkText(self, text):
        """Return a list of BasicMatch objects for the misspelled words of
        *text* that are not custom words."""
        # Based on http://john.nachtimwald.com/2009/08/22/qplaintextedit-with-in-line-spell-check/
        WORDS = r'(?iu)((?:[^_\W]|\')+)[^A-Za-z0-9\']'
        # (?iu) means case insensitive and Unicode
        # ((?:[^_\W]|\')+) means words exclude underscores but include apostrophes
        # [^A-Za-z0-9\'] used with above hack to prevent spellcheck while typing word
        #
        # See also https://stackoverflow.com/questions/2062169/regex-w-in-utf-8

        # The two patterns below do not depend on the word, so they are
        # built once instead of once per misspelled word.
        punctuation = string.punctuation.replace('-', '')
        FALSE_POSITIVE = r'^[^\w]|([^{}])$'.format(punctuation)
        # ^[^\w] checks that it doesn't start with a word character
        # ([\p{P}'])$ checks it doesn't end with punctuation characters
        apostrophe_WORDS = r'(?iu)\b(?<=[\s\'"(])((?:[a-zA-Z]|\')+)(?=\b)'
        # \b(?<=[\s\'"(]) looks for nonword characters and starts grouping after
        # (?=\b) looks for the word boundary
        # ((?:[a-zA-Z]|\')+) greedily matches for letters and apostrophes

        matches = []
        for word_object in re.finditer(WORDS, text):
            word = word_object.group(1)
            mispelled = self.isMisspelled(word)
            if not mispelled:
                continue

            # In order to prevent apostrophes causing false positives and keep
            # the same functionality otherwise, check that the word doesn't
            # have any additional punctuation on it.
            if re.match(FALSE_POSITIVE, word):
                # NOTE(review): re.match() anchors at index 0, where the
                # look-behind of apostrophe_WORDS has nothing to look at, so
                # this seems to always yield None and the word is then
                # treated as correct - confirm that this is intended.
                temp = re.match(apostrophe_WORDS, word)
                mispelled = self.isMisspelled(temp.group(1)) if temp else False

            if mispelled and not self.isCustomWord(word):
                matches.append(BasicMatch(
                    word_object.start(1), word_object.end(1)
                ))
        return matches

    def isMisspelled(self, word):
        """Return True if *word* is not known to the dictionary (abstract)."""
        raise NotImplementedError

    def getSuggestions(self, word):
        """Return replacement candidates for *word* (abstract)."""
        raise NotImplementedError

    def findSuggestions(self, text, start, end):
        """Return a one-element list with a BasicMatch (including
        replacements) if text[start:end] is a misspelled, non-custom word,
        and an empty list otherwise."""
        if start < end:
            word = text[start:end]
            if self.isMisspelled(word) and not self.isCustomWord(word):
                match = BasicMatch(start, end)
                match.replacements = self.getSuggestions(word)
                return [match]
        return []

    def isCustomWord(self, word):
        """Return True if *word* (case-insensitively) is a custom word."""
        return word.lower() in self._customDict

    def addWord(self, word):
        """Add *word* (lower-cased) to the custom words and save them."""
        word = word.lower()
        if word not in self._customDict:
            self._customDict.add(word)
            self._saveCustomDict()

    def removeWord(self, word):
        """Remove *word* (lower-cased) from the custom words and save them."""
        word = word.lower()
        if word in self._customDict:
            self._customDict.remove(word)
            self._saveCustomDict()

    @classmethod
    def getResourcesPath(cls):
        """Return the per-library dictionary directory below writablePath(),
        creating it if necessary."""
        path = os.path.join(writablePath(), "resources", "dictionaries", cls.getLibraryName())
        # exist_ok avoids failing when the directory appears between an
        # existence check and its creation.
        os.makedirs(path, exist_ok=True)
        return path

    def getCustomDictionaryPath(self):
        """Return the path of the custom word list of this dictionary."""
        return os.path.join(self.getResourcesPath(), "{}.json.gz".format(self._lang))

    def _saveCustomDict(self):
        """Write the custom words to disk as gzip-compressed JSON."""
        customPath = self.getCustomDictionaryPath()
        # Same encoding as used for reading in __init__.
        with gzip.open(customPath, "wt", encoding="utf-8", newline="\n") as f:
            f.write(json.dumps(list(self._customDict)))
class EnchantDictionary(BasicDictionary):
    """Spellchecker backend using PyEnchant.

    Custom words are stored in a plain-text personal word list that is
    handed to enchant.DictWithPWL instead of the JSON list of the base class.
    """

    def __init__(self, name):
        """Open dictionary *name*, or the default one if *name* is false or
        unknown to enchant."""
        # BasicDictionary.__init__ is not called here: this class does not
        # use the base class' custom word set.
        self._lang = name
        if not (self._lang and enchant.dict_exists(self._lang)):
            self._lang = self.getDefaultDictionary()

        self._dict = enchant.DictWithPWL(self._lang, self.getCustomDictionaryPath())

    @staticmethod
    def getLibraryName():
        return "PyEnchant"

    @staticmethod
    def getLibraryURL():
        return "https://pypi.org/project/pyenchant/"

    @staticmethod
    def isInstalled():
        """Return True if the enchant module could be imported."""
        return enchant is not None

    @staticmethod
    def availableDictionaries():
        """Return the dictionary tags reported by enchant.list_dicts()."""
        if EnchantDictionary.isInstalled():
            return [str(entry[0]) for entry in enchant.list_dicts()]
        return []

    @staticmethod
    def getDefaultDictionary():
        """Return the default dictionary name, or None if unavailable.

        Order of preference: enchant's default language (if a dictionary
        exists for it), the system locale name, the first available
        dictionary.
        """
        if not EnchantDictionary.isInstalled():
            return None

        default_locale = enchant.get_default_language()
        if default_locale and not enchant.dict_exists(default_locale):
            default_locale = None

        if not default_locale:
            default_locale = QLocale.system().name()

        if not default_locale:
            # This is a static method, so the class has to be named
            # explicitly; there is no "self" to go through.
            available = EnchantDictionary.availableDictionaries()
            default_locale = available[0] if available else None

        return default_locale

    def isMisspelled(self, word):
        return not self._dict.check(word)

    def getSuggestions(self, word):
        return self._dict.suggest(word)

    def isCustomWord(self, word):
        return self._dict.is_added(word)

    def addWord(self, word):
        self._dict.add(word)

    def removeWord(self, word):
        self._dict.remove(word)

    def getCustomDictionaryPath(self):
        """Return the path of the personal word list of this dictionary."""
        return os.path.join(self.getResourcesPath(), "{}.txt".format(self.name))
class PySpellcheckerDictionary(BasicDictionary):
    """Spellchecker backend using the pyspellchecker package."""

    def __init__(self, name):
        """Load dictionary *name* and add the custom words to it."""
        BasicDictionary.__init__(self, name)

        self._dict = pyspellchecker.SpellChecker(self.name)
        self._dict.word_frequency.load_words(self._customDict)

    @staticmethod
    def getLibraryName():
        return "pyspellchecker"

    @staticmethod
    def getLibraryURL():
        return "https://pyspellchecker.readthedocs.io/en/latest/"

    @staticmethod
    def isInstalled():
        """Return True if the pyspellchecker module could be imported."""
        return pyspellchecker is not None

    @staticmethod
    def availableDictionaries():
        """Return the names of the "*.json.gz" dictionaries found in the
        resources directory of the pyspellchecker package."""
        if not PySpellcheckerDictionary.isInstalled():
            return []

        suffix = ".json.gz"
        pattern = os.path.join(pyspellchecker.__path__[0], "resources", "*" + suffix)
        return [os.path.basename(path)[:-len(suffix)] for path in glob.glob(pattern)]

    @staticmethod
    def getDefaultDictionary():
        """Return the first two characters of the system locale name,
        falling back to "en"; None if the library is not installed."""
        if not PySpellcheckerDictionary.isInstalled():
            return None

        default_locale = QLocale.system().name()
        if default_locale:
            default_locale = default_locale[0:2]

        # "not" instead of a comparison with None, so that an empty locale
        # name falls back to English as well.
        if not default_locale:
            default_locale = "en"

        return default_locale

    def isMisspelled(self, word):
        return len(self._dict.unknown([word])) > 0

    def getSuggestions(self, word):
        """Return the correction candidates for *word*, without *word*
        itself. The result is empty when there are none."""
        candidates = self._dict.candidates(word)
        if not candidates:
            # candidates() may return None; callers expect a collection.
            return set()
        if word in candidates:
            candidates.remove(word)
        return candidates

    def addWord(self, word):
        BasicDictionary.addWord(self, word)
        self._dict.word_frequency.add(word.lower())

    def removeWord(self, word):
        BasicDictionary.removeWord(self, word)
        self._dict.word_frequency.remove(word.lower())
class SymSpellDictionary(BasicDictionary):
    """Spellchecker backend using symspellpy.

    Word frequencies are taken from the dictionaries shipped with
    pyspellchecker (if installed) and cached as a pickle next to the custom
    word list.
    """

    # Frequency count given to custom words
    CUSTOM_COUNT = 1
    # Value passed to the symspellpy.SymSpell constructor
    DISTANCE = 2

    def __init__(self, name):
        """Load dictionary *name* from the cache, or build the cache from
        the matching pyspellchecker dictionary, then add the custom words."""
        BasicDictionary.__init__(self, name)

        self._dict = symspellpy.SymSpell(self.DISTANCE)

        cachePath = self.getCachedDictionaryPath()
        try:
            # NOTE(review): load_pickle presumably unpickles the cache file;
            # that is only safe as long as nobody else can write to it.
            if not self._dict.load_pickle(cachePath, False):
                raise Exception("Can't load cached dictionary. " +
                                "File might be corrupted or incompatible with installed symspellpy version")
        except Exception:
            # No usable cache: rebuild it from the pyspellchecker resources.
            if pyspellchecker:
                path = os.path.join(pyspellchecker.__path__[0], "resources", "{}.json.gz".format(self.name))
                if os.path.exists(path):
                    with gzip.open(path, 'rt', encoding='utf-8') as f:
                        data = json.loads(f.read())

                    for key in data:
                        self._dict.create_dictionary_entry(key, data[key])
                    self._dict.save_pickle(cachePath, False)

        for word in self._customDict:
            self._dict.create_dictionary_entry(word, self.CUSTOM_COUNT)

    def getCachedDictionaryPath(self):
        """Return the path of the pickled dictionary cache."""
        return os.path.join(self.getResourcesPath(), "{}.sym".format(self.name))

    @staticmethod
    def getLibraryName():
        return "symspellpy"

    @staticmethod
    def getLibraryRequirement():
        return ">= " + SYMSPELLPY_MIN_VERSION

    @staticmethod
    def getLibraryURL():
        return "https://github.com/mammothb/symspellpy"

    @staticmethod
    def isInstalled():
        """Return True if a usable symspellpy module was imported."""
        return symspellpy is not None

    @classmethod
    def availableDictionaries(cls):
        """Return the cached dictionaries followed by the pyspellchecker
        dictionaries that are not cached yet."""
        if not SymSpellDictionary.isInstalled():
            return []

        suffix = ".sym"
        cached = glob.glob(os.path.join(cls.getResourcesPath(), "*" + suffix))
        dictionaries = [os.path.basename(path)[:-len(suffix)] for path in cached]
        for sp_dict in PySpellcheckerDictionary.availableDictionaries():
            if sp_dict not in dictionaries:
                dictionaries.append(sp_dict)
        return dictionaries

    @staticmethod
    def getDefaultDictionary():
        """Return the pyspellchecker default, or None if not installed."""
        if not SymSpellDictionary.isInstalled():
            return None

        return PySpellcheckerDictionary.getDefaultDictionary()

    def isMisspelled(self, word):
        """Return False if *word* is found (distance 0) lower-cased or as
        written, True otherwise."""
        suggestions = self._dict.lookup(word.lower(), symspellpy.Verbosity.TOP)
        if len(suggestions) > 0 and suggestions[0].distance == 0:
            return False

        # Try the word as is, since a dictionary might have uppercase letter as part
        # of its spelling ("I'm" or "January" for example)
        suggestions = self._dict.lookup(word, symspellpy.Verbosity.TOP)
        if len(suggestions) > 0 and suggestions[0].distance == 0:
            return False

        return True

    def getSuggestions(self, word):
        """Return the suggested terms for *word*, without duplicates, using
        the capitalisation of *word* (all upper case / first letter upper
        case / unchanged)."""
        upper = word.isupper()
        # Slices instead of indexing so that empty strings cannot raise.
        upper1 = word[:1].isupper()

        suggestions = self._dict.lookup_compound(word, 2)
        suggestions.extend(self._dict.lookup(word, symspellpy.Verbosity.CLOSEST))

        candidates = []
        for sug in suggestions:
            if upper:
                term = sug.term.upper()
            elif upper1:
                term = sug.term[:1].upper() + sug.term[1:]
            else:
                term = sug.term

            if sug.distance > 0 and term not in candidates:
                candidates.append(term)

        return candidates

    def addWord(self, word):
        BasicDictionary.addWord(self, word)
        self._dict.create_dictionary_entry(word.lower(), self.CUSTOM_COUNT)

    def removeWord(self, word):
        BasicDictionary.removeWord(self, word)
        # Since 6.3.8
        # Lower-cased to match the key under which addWord stored the entry.
        self._dict.delete_dictionary_entry(word.lower())
def get_languagetool_match_errorLength(match):
    """Return the error length of a LanguageTool match.

    The value is read from ``errorlength`` when ``use_language_check`` is
    true and from ``errorLength`` otherwise.
    """
    attribute = "errorlength" if use_language_check else "errorLength"
    return getattr(match, attribute)
def get_languagetool_match_ruleIssueType(match):
    """Return the issue type of a LanguageTool match.

    The value is read from ``locqualityissuetype`` when
    ``use_language_check`` is true and from ``ruleIssueType`` otherwise.
    """
    attribute = "locqualityissuetype" if use_language_check else "ruleIssueType"
    return getattr(match, attribute)
def get_languagetool_match_message(match):
    """Return the message of a LanguageTool match.

    The value is read from ``msg`` when ``use_language_check`` is true and
    from ``message`` otherwise.
    """
    attribute = "msg" if use_language_check else "message"
    return getattr(match, attribute)
class LanguageToolCache:
    """Holds the matches LanguageTool produced for one text, as BasicMatch
    objects, together with the length of that text."""

    def __init__(self, tool, text):
        """Check *text* with *tool* and store the resulting matches."""
        self._length = len(text)
        self._matches = self._buildMatches(tool, text)

    def getMatches(self):
        """Return the stored list of BasicMatch objects."""
        return self._matches

    def _buildMatches(self, tool, text):
        """Run tool.check(text) and convert each result to a BasicMatch."""
        matches = []

        for match in tool.check(text):
            start = match.offset
            end = start + get_languagetool_match_errorLength(match)

            basic_match = BasicMatch(start, end)
            basic_match.locqualityissuetype = get_languagetool_match_ruleIssueType(match)
            basic_match.replacements = match.replacements
            basic_match.msg = get_languagetool_match_message(match)

            matches.append(basic_match)

        return matches

    def update(self, tool, text):
        """Re-check *text* if its length differs from the stored one."""
        if len(text) != self._length:
            self._matches = self._buildMatches(tool, text)
            # Remember the new length; otherwise every following call with
            # the same text would trigger another full check.
            self._length = len(text)
def get_languagetool_languages(tool):
    """Return the languages known to LanguageTool.

    They come from the module-level ``get_languages()`` when
    ``use_language_check`` is true; otherwise *tool* is asked through its
    ``_get_languages()`` method.
    """
    if not use_language_check:
        return tool._get_languages()
    return languagetool.get_languages()
def get_languagetool_locale_language():
    """Return the locale language as reported by the LanguageTool binding.

    The function lives in the package itself when ``use_language_check`` is
    true and in its ``utils`` submodule otherwise.
    """
    provider = languagetool if use_language_check else languagetool.utils
    return provider.get_locale_language()
class LanguageToolDictionary(BasicDictionary):
    """Spellchecker backend using LanguageTool.

    Unlike the word-based backends it checks whole texts, so its matches
    are not limited to misspellings.
    """

    # Shared LanguageTool instance, created on first use by getTool()
    _tool = None

    def __init__(self, name):
        """Create a LanguageTool instance for language *name*, or for the
        default language if *name* is false or not supported."""
        BasicDictionary.__init__(self, name)

        if not (self._lang and self._lang in get_languagetool_languages(self.getTool())):
            self._lang = self.getDefaultDictionary()

        self.tool = languagetool.LanguageTool(self._lang)
        # hash(text) -> LanguageToolCache
        # NOTE(review): entries are never removed, so this grows with every
        # distinct text that is checked.
        self._cache = {}

    @staticmethod
    def getTool():
        """Return the shared LanguageTool instance, creating it on first
        use; None if it cannot be created."""
        if LanguageToolDictionary._tool is None:
            try:
                LanguageToolDictionary._tool = languagetool.LanguageTool()
            except:
                return None

        return LanguageToolDictionary._tool

    @staticmethod
    def getLibraryName():
        return "LanguageTool"

    @staticmethod
    def getLibraryURL():
        if use_language_check:
            return "https://pypi.org/project/language-check/"
        else:
            return "https://pypi.org/project/language-tool-python/"

    @staticmethod
    def isInstalled():
        """Return True if the binding was imported, a tool instance could be
        created and running "java -version" succeeds."""
        if (languagetool is not None) and (LanguageToolDictionary.getTool() is not None):
            # This check, if Java is installed, is necessary to
            # make sure LanguageTool can be run without problems.
            #
            return (os.system('java -version') == 0)

        return False

    @staticmethod
    def availableDictionaries():
        """Return the sorted list of languages supported by LanguageTool."""
        if LanguageToolDictionary.isInstalled():
            tool = LanguageToolDictionary.getTool()
            languages = list(get_languagetool_languages(tool))
            languages.sort()

            return languages
        return []

    @staticmethod
    def getDefaultDictionary():
        """Return the default language, or None if unavailable.

        Order of preference: the locale language reported by the binding
        (if supported), the system locale name, the first available
        language.
        """
        if not LanguageToolDictionary.isInstalled():
            return None

        default_locale = get_languagetool_locale_language()
        tool = LanguageToolDictionary.getTool()

        if default_locale and default_locale not in get_languagetool_languages(tool):
            default_locale = None

        if not default_locale:
            default_locale = QLocale.system().name()

        if not default_locale:
            # This is a static method, so the class has to be named
            # explicitly; there is no "self" to go through.
            available = LanguageToolDictionary.availableDictionaries()
            default_locale = available[0] if available else None

        return default_locale

    def checkText(self, text):
        """Return the matches for *text*, leaving out misspellings of custom
        words. Results are cached per text."""
        matches = []

        if len(text) == 0:
            return matches

        textId = hash(text)
        cacheEntry = None

        if textId not in self._cache:
            cacheEntry = LanguageToolCache(self.tool, text)
            self._cache[textId] = cacheEntry
        else:
            cacheEntry = self._cache[textId]
            # Different texts may share a hash; update() re-checks when the
            # length of the text differs from the cached one.
            cacheEntry.update(self.tool, text)

        for match in cacheEntry.getMatches():
            word = match.getWord(text)

            if not (match.locqualityissuetype == 'misspelling' and self.isCustomWord(word)):
                matches.append(match)

        return matches

    def isMisspelled(self, word):
        """Return True if checking *word* yields a 'misspelling' match and
        *word* is not a custom word."""
        if self.isCustomWord(word):
            return False

        for match in self.checkText(word):
            if match.locqualityissuetype == 'misspelling':
                return True

        return False

    def getSuggestions(self, word):
        """Return the replacements of all matches found for *word*."""
        suggestions = []

        for match in self.checkText(word):
            suggestions += match.replacements

        return suggestions

    def findSuggestions(self, text, start, end):
        """Return the matches of *text* that contain position *start* (when
        start == end) or overlap the range start..end."""
        matches = []
        checked = self.checkText(text)

        if start == end:
            # Check for containing area:
            for match in checked:
                if (start >= match.start and start <= match.end):
                    matches.append(match)
        else:
            # Check for overlapping area:
            for match in checked:
                if (match.end > start and match.start < end):
                    matches.append(match)

        return matches
# Register the implementations in order of priority
for _implementation in (
    EnchantDictionary,
    SymSpellDictionary,
    PySpellcheckerDictionary,
    LanguageToolDictionary,
):
    Spellchecker.registerImplementation(_implementation)
# Do not leave the loop variable behind as a module attribute.
del _implementation