mirror of
https://github.com/daniel-j/fimfic2epub.git
synced 2024-09-29 08:41:29 +13:00
fix/clean up sentence tokenizer in kepubify
This commit is contained in:
parent
ce3e7022b6
commit
ecde5e437a
1 changed file with 2 additions and 6 deletions
|
@ -38,7 +38,7 @@ function createSpan (paragraph, segment) {
|
||||||
|
|
||||||
function textToSentences (text) {
|
function textToSentences (text) {
|
||||||
const tokenSentences = text
|
const tokenSentences = text
|
||||||
.replace('\0', '')
|
.replace(/\0+/g, '')
|
||||||
.replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
|
.replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
|
||||||
.normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
|
.normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
|
||||||
.replace(/(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi, '$1')
|
.replace(/(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi, '$1')
|
||||||
|
@ -54,11 +54,7 @@ function textToSentences (text) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokenSentences.map((sentence, i) => {
|
return tokenSentences
|
||||||
// const span = createSpan(state.paragraph, state.segment++)
|
|
||||||
// span.text = sentence
|
|
||||||
return sentence
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Makes text nodes of .text and .tail as children
|
// Makes text nodes of .text and .tail as children
|
||||||
|
|
Loading…
Reference in a new issue