fimfic2epub/src/utils.js

272 lines
11 KiB
JavaScript
Raw Permalink Normal View History

2018-03-13 23:10:04 +13:00
import htmlToTextModule from 'html-to-text'
2020-10-26 23:13:16 +13:00
import urlRegexSafe from 'url-regex-safe'
2018-03-13 23:10:04 +13:00
import matchWords from 'match-words'
import syllable from 'syllable'
import typogr from 'typogr'
import { unicode } from './constants'
2018-03-13 23:10:04 +13:00
export function replaceAsync (str, re, callback) {
// http://es5.github.io/#x15.5.4.11
str = String(str)
2019-10-08 22:31:42 +13:00
const parts = []
let i = 0
if (Object.prototype.toString.call(re) === '[object RegExp]') {
if (re.global) { re.lastIndex = i }
let m
while ((m = re.exec(str))) {
2019-10-08 22:31:42 +13:00
const args = m.concat([m.index, m.input])
parts.push(str.slice(i, m.index), callback.apply(null, args))
i = re.lastIndex
if (!re.global) { break } // for non-global regexes only take the first match
if (m[0].length === 0) { re.lastIndex++ }
}
} else {
re = String(re)
i = str.indexOf(re)
parts.push(str.slice(0, i), callback(re, i, str))
i += re.length
}
parts.push(str.slice(i))
return Promise.all(parts).then(function (strings) {
return strings.join('')
})
}
2018-03-13 10:05:57 +13:00
2018-03-15 23:33:52 +13:00
export function sleep (ms) {
return new Promise((resolve) => setTimeout(resolve, ms))
}
export function htmlToText (html, options = {}) {
options = Object.assign({
2018-03-13 23:10:04 +13:00
wordwrap: false,
ignoreImage: true,
ignoreHref: true
2018-03-15 23:33:52 +13:00
}, options)
return htmlToTextModule.fromString(html, options)
}
2018-03-13 23:10:04 +13:00
export function htmlWordCount (html) {
2018-03-15 23:33:52 +13:00
html = html.replace(/<pre>.*?<\/pre>/g, '') // Ignore codeblocks
2018-03-13 23:10:04 +13:00
let text = htmlToText(html)
2020-10-26 23:17:59 +13:00
text = text.replace(urlRegexSafe({ tlds: [] }), '') // Remove urls
2018-03-13 23:10:04 +13:00
let count = 0
try {
count = matchWords(text).length
} catch (err) { count = 0 }
return count
}
export async function readingEase (text, wakeupInterval = Infinity, progresscb) {
const result = {
sentences: 0, words: 0, syllables: 0, grade: NaN, ease: NaN
}
// sentence tokenizer by Darkentor
const tokenSentences = text
.replace('\0', '')
.replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
2018-03-20 23:49:49 +13:00
.normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
.replace(/(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi, '$1')
.replace(/(((^|\w).*?[^\w\s,]+)(?=\s+\W*[A-Z])|:|;)/g, '$1\0')
.split(/\s*\0\s*/)
2018-03-20 23:49:49 +13:00
if (!/[a-z]/i.test(text)) {
return null
}
await sleep(0)
if (typeof progresscb === 'function') {
progresscb(0)
}
await sleep(0)
const counts = { syllables: 0, words: 0 }
let lastTime = Date.now()
for (let i = 0; i < tokenSentences.length; i++) {
2019-10-08 22:31:42 +13:00
const now = Date.now()
if (lastTime + wakeupInterval < now) {
lastTime = now
if (typeof progresscb === 'function') {
progresscb(i / tokenSentences.length)
}
await sleep(0)
}
const sentence = tokenSentences[i]
// strip all punctuation and numbers from the sentence
const words = sentence
.replace(/[^\w\s]|_/g, '')
.replace(/\s+/g, ' ')
.split(' ')
.filter(letter => letter)
counts.syllables += words.reduce((total, word) => total + syllable(word), 0)
counts.words += words.length
}
const { words, syllables } = counts
const sentences = tokenSentences.length
const grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
const ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
tokenSentences.length = 0
if (!ease) {
return null
}
Object.assign(result, {
sentences, words, syllables, grade, ease
})
if (typeof progresscb === 'function') {
progresscb(1)
}
return result
2018-03-13 23:10:04 +13:00
}
export function typogrify (content) {
content = typogr(content.replace(/&quot;/ig, '"').replace(/\.\.\.+/ig, '...')).chain().smartypants().ord().value()
content = content.replace(/&nbsp;/ig, unicode.NO_BREAK_SPACE) // non-breaking space
content = content.replace(/&#8217;/ig, '').replace(/&#8216;/ig, '') // curly single quotation marks
content = content.replace(/&#8220;/ig, '“').replace(/&#8221;/ig, '”') // curly double quotation marks
content = content.replace(/&#8230;/ig, '…') // ellipsis
content = content.replace(/&#8211;/ig, '').replace(/&#8212;/ig, '—') // en and em dash
/*
2018-03-26 19:51:43 +13:00
* Some of the following is from Standard Ebooks typogrify tool:
* https://github.com/standardebooks/tools/blob/master/typogrify
*/
content = content.replace(/[]”<\/p>/ig, '' + unicode.HAIR_SPACE + '”</p>')
let inSkippedTag = false
let closeMatch
const reSkipTags = /<(\/)?(style|pre|code|kbd|script|math|title)[^>]*>/i
2019-10-08 19:37:27 +13:00
content = typogr.tokenize(content).map(({ type, txt }) => {
if (type === 'tag') {
closeMatch = reSkipTags.exec(txt)
if (closeMatch && closeMatch[1] === undefined) {
inSkippedTag = true
} else {
inSkippedTag = false
}
} else if (!inSkippedTag) {
// Remove spaces between en and em dashes
// Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
txt = txt.replace(/([^.\s])\s*([–—])\s*/g, '$1$2')
// First, remove stray word joiners
txt = txt.replace(new RegExp(unicode.WORD_JOINER, 'g'), '')
// Some older texts use the ,— construct; remove that archaichism
txt = txt.replace(/,—/g, '—')
// Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
// txt = txt.replace(new RegExp('([^\\s' + unicode.WORD_JOINER + unicode.NO_BREAK_SPACE + unicode.HAIR_SPACE + '])([—⸻])', 'ig'), '$1' + unicode.WORD_JOINER + '$2')
// Add en dashes between numbers
txt = txt.replace(/([0-9]+)-([0-9]+)/g, '$1$2')
// Add a word joiner on both sides of en dashes
txt = txt.replace(new RegExp(unicode.WORD_JOINER + '?' + unicode.WORD_JOINER + '?', 'g'), unicode.WORD_JOINER + '' + unicode.WORD_JOINER)
// Replace Mr., Mrs., and other abbreviations, and include a non-breaking space
txt = txt.replace(/\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+/g, '$1.' + unicode.NO_BREAK_SPACE)
txt = txt.replace(/\bNo\.\s+([0-9]+)/g, 'No.' + unicode.NO_BREAK_SPACE + '$1')
// Fix common abbreviatons
txt = txt.replace(/(\s)([an])(\s)/ig, '$1$2$3')
// Years
// txt = txt.replace(/([0-9]{2,}[^a-zA-Z0-9])/ig, '$1')
txt = txt.replace(/([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]was|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on)\b/g, '$1')
// txt = txt.replace(/\be\b/g, 'e')
// txt = txt.replace(/\b([Ee])r\b/g, '$1r')
txt = txt.replace(/\b([Ee])re\b/g, '$1re')
// txt = txt.replace(/\b([Aa])ppen\b/g, '$1ppen')
txt = txt.replace(/\b([Aa])ven\b/g, '$1ven') // 'aven't
// nth (as in nth degree)
txt = txt.replace(/\bn-?th\b/g, '<i>n</i>th')
// Remove double spaces that use NO_BREAK_SPACE for spacing
txt = txt.replace(new RegExp(unicode.NO_BREAK_SPACE + '[' + unicode.NO_BREAK_SPACE + ' ]+', 'g'), ' ')
txt = txt.replace(new RegExp(' [' + unicode.NO_BREAK_SPACE + ' ]+', 'g'), ' ')
// Put spacing next to close quotes
txt = txt.replace(new RegExp('“[\\s' + unicode.NO_BREAK_SPACE + ']*', 'ig'), '“' + unicode.HAIR_SPACE + '')
txt = txt.replace(new RegExp('[\\s' + unicode.NO_BREAK_SPACE + ']*”', 'ig'), '' + unicode.HAIR_SPACE + '”')
txt = txt.replace(new RegExp('“[\\s' + unicode.NO_BREAK_SPACE + ']*', 'ig'), '“' + unicode.HAIR_SPACE + '')
txt = txt.replace(new RegExp('[\\s' + unicode.NO_BREAK_SPACE + ']*“', 'ig'), '' + unicode.HAIR_SPACE + '“')
// We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” e said.
txt = txt.replace(new RegExp('”[\\s' + unicode.NO_BREAK_SPACE + ']*([^a-zA-Z])', 'ig'), '”' + unicode.HAIR_SPACE + '$1')
// Fix ellipses spacing
txt = txt.replace(/\s*\.\s*\.\s*\.\s*/ig, '…')
txt = txt.replace(new RegExp('[\\s' + unicode.NO_BREAK_SPACE + ']?…[\\s' + unicode.NO_BREAK_SPACE + ']?\\.', 'ig'), '.' + unicode.HAIR_SPACE + '…')
txt = txt.replace(new RegExp('[\\s' + unicode.NO_BREAK_SPACE + ']?…[\\s' + unicode.NO_BREAK_SPACE + ']?', 'ig'), unicode.HAIR_SPACE + '… ')
// Add non-breaking spaces between amounts with an abbreviated unit. E.g. 8 oz., 10 lbs.
2018-03-26 19:51:43 +13:00
txt = txt.replace(/([0-9])\s+([a-z]{1,3}\.)/ig, '$1' + unicode.NO_BREAK_SPACE + '$2')
// Add non-breaking spaces between Arabic numbers and AM/PM
2018-03-26 19:51:43 +13:00
txt = txt.replace(/([0-9])\s+([ap])\.?m\./ig, '$1' + unicode.NO_BREAK_SPACE + '$2.m.')
// Fractions
2018-03-26 19:51:43 +13:00
txt = txt.replace(/1\/4/g, '¼')
txt = txt.replace(/1\/2/g, '½')
txt = txt.replace(/3\/4/g, '¾')
txt = txt.replace(/1\/3/g, '⅓')
txt = txt.replace(/2\/3/g, '⅔')
txt = txt.replace(/1\/5/g, '⅕')
txt = txt.replace(/2\/5/g, '⅖')
txt = txt.replace(/3\/5/g, '⅗')
txt = txt.replace(/4\/5/g, '⅘')
txt = txt.replace(/1\/6/g, '⅙')
txt = txt.replace(/5\/6/g, '⅚')
txt = txt.replace(/1\/8/g, '⅛')
txt = txt.replace(/3\/8/g, '⅜')
txt = txt.replace(/5\/8/g, '⅝')
txt = txt.replace(/7\/8/g, '⅞')
// Remove spaces between whole numbers and fractions
2018-03-26 19:51:43 +13:00
txt = txt.replace(/([0-9,]+)\s+([¼½¾⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞])/g, '$1$2')
// Use the Unicode Minus glyph (U+2212) for negative numbers
2018-03-26 19:51:43 +13:00
txt = txt.replace(/([\s])-([0-9,]+)/g, '$1$2')
txt = txt.replace(new RegExp(unicode.NO_BREAK_SPACE, 'ig'), '&#160;') // non-breaking space entity
}
return txt
}).join('')
// content = content.replace(new RegExp('<p([^>]*?)>' + unicode.HAIR_SPACE + '…', 'ig'), '<p$1>…')
// Remove spaces between opening tags and ellipses
// content = content.replace(new RegExp('(<[a-z0-9]+[^<]+?>)[\\s' + unicode.NO_BREAK_SPACE + ']+?…', 'ig'), '$1…')
// Remove spaces between closing tags and ellipses
// content = content.replace(new RegExp('…[\\s' + unicode.NO_BREAK_SPACE + ']?(</[a-z0-9]+>)', 'ig'), '…$1')
// content = content.replace(new RegExp('…[\\s' + unicode.NO_BREAK_SPACE + ']+([\\)”’])', 'ig'), '…$1')
// content = content.replace(new RegExp('([\\(“‘])[\\s' + unicode.NO_BREAK_SPACE + ']+…', 'ig'), '$1…')
// content = content.replace(new RegExp('…[\\s' + unicode.NO_BREAK_SPACE + ']?([\\!\\?\\.\\;\\,])', 'ig'), '…' + unicode.HAIR_SPACE + '$1')
// content = content.replace(new RegExp('([\\!\\?\\.\\;”’])[\\s' + unicode.NO_BREAK_SPACE + ']?…', 'ig'), '$1' + unicode.HAIR_SPACE + '…')
// content = content.replace(new RegExp('\\,[\\s' + unicode.NO_BREAK_SPACE + ']?…', 'ig'), ',' + unicode.HAIR_SPACE + '…')
2018-07-24 22:25:15 +12:00
content = content.replace(new RegExp(unicode.NO_BREAK_SPACE, 'g'), '&#160;')
content = content.replace(new RegExp(unicode.HAIR_SPACE, 'g'), '&#8202;')
content = content.replace(new RegExp(unicode.WORD_JOINER, 'g'), '&#8288;')
return content
}