fimfic2epub/src/utils.js

272 lines
11 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import htmlToTextModule from 'html-to-text'
import urlRegexSafe from 'url-regex-safe'
import matchWords from 'match-words'
import syllable from 'syllable'
import typogr from 'typogr'
import { unicode } from './constants'
/**
 * Asynchronous counterpart of String.prototype.replace.
 *
 * Every match is handed to `callback`, which may return a plain value or a
 * promise. All callbacks are started up front (in match order) and their
 * results are awaited together before the pieces are joined.
 *
 * @param {*} str - Input; coerced with String().
 * @param {RegExp|*} re - A RegExp (global or not), or any other value, which
 *   is coerced to a string and replaced at its first occurrence only.
 * @param {Function} callback - Called as (match, ...groups, index, input) for
 *   a RegExp and as (pattern, index, input) for a string pattern.
 * @returns {Promise<string>} The input with each match replaced.
 */
export function replaceAsync (str, re, callback) {
  // http://es5.github.io/#x15.5.4.11
  str = String(str)
  const parts = []
  let tail = 0 // index in str where the not-yet-emitted remainder begins
  if (Object.prototype.toString.call(re) === '[object RegExp]') {
    if (re.global) { re.lastIndex = 0 }
    let m
    while ((m = re.exec(str))) {
      parts.push(str.slice(tail, m.index), callback(...m, m.index, m.input))
      // Derive the end of the match from the match itself: exec() only
      // advances lastIndex for global/sticky regexes, so relying on it made
      // non-global patterns append the whole input after the replacement.
      tail = m.index + m[0].length
      if (!re.global) { break } // for non-global regexes only take the first match
      if (m[0].length === 0) { re.lastIndex++ } // step past an empty match so the loop terminates
    }
  } else {
    const needle = String(re)
    const at = str.indexOf(needle)
    // Like String.prototype.replace, leave the input untouched when the
    // pattern does not occur (and do not invoke the callback).
    if (at !== -1) {
      parts.push(str.slice(0, at), callback(needle, at, str))
      tail = at + needle.length
    }
  }
  parts.push(str.slice(tail))
  return Promise.all(parts).then(function (strings) {
    return strings.join('')
  })
}
/**
 * Promise-based timer.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<undefined>} Settles once the timer fires.
 */
export function sleep (ms) {
  return new Promise(function (wake) {
    setTimeout(wake, ms)
  })
}
/**
 * Convert HTML to plain text through html-to-text's `fromString`.
 *
 * @param {string} html - Markup to convert.
 * @param {Object} [options] - Extra html-to-text options; these win over the
 *   defaults set here (no word wrapping, images and link targets ignored).
 * @returns {*} Whatever `htmlToTextModule.fromString` returns.
 */
export function htmlToText (html, options = {}) {
  const defaults = {
    wordwrap: false,
    ignoreImage: true,
    ignoreHref: true
  }
  const merged = Object.assign({}, defaults, options)
  return htmlToTextModule.fromString(html, merged)
}
/**
 * Count the words in a piece of HTML, leaving out code blocks and URLs.
 *
 * @param {string} html - Markup to count words in.
 * @returns {number} Number of words found; 0 when none could be matched.
 */
export function htmlWordCount (html) {
  // Ignore codeblocks. `[\s\S]` is used instead of `.` because `.` stops at
  // line breaks, which left ordinary multi-line <pre> blocks in the count;
  // `[^>]*` also covers <pre> tags that carry attributes.
  html = html.replace(/<pre\b[^>]*>[\s\S]*?<\/pre>/gi, '')
  let text = htmlToText(html)
  text = text.replace(urlRegexSafe({ tlds: [] }), '') // Remove urls
  let count = 0
  try {
    count = matchWords(text).length
  } catch (err) {
    // Best effort: if matchWords yields nothing usable (or throws), report zero words
    count = 0
  }
  return count
}
/**
 * Compute readability figures for a text: the Flesch reading-ease score and
 * the Flesch–Kincaid grade level, together with the raw counts behind them.
 *
 * The work is done sentence by sentence; when more than `wakeupInterval`
 * milliseconds have passed since the last pause, progress is reported and the
 * function yields via `sleep(0)` before continuing.
 *
 * @param {string} text - Plain text to analyse.
 * @param {number} [wakeupInterval=Infinity] - Milliseconds between pauses;
 *   with the default the loop never pauses.
 * @param {Function} [progresscb] - Optional; called with a fraction in [0, 1].
 * @returns {Promise<?{sentences: number, words: number, syllables: number, grade: number, ease: number}>}
 *   The figures, or null when the text has no ASCII letter or no finite score
 *   can be computed.
 */
export async function readingEase (text, wakeupInterval = Infinity, progresscb) {
  // Without a single ASCII letter there is nothing to score, so bail out
  // before the comparatively expensive tokenizing below.
  if (!/[a-z]/i.test(text)) {
    return null
  }
  const report = (fraction) => {
    if (typeof progresscb === 'function') {
      progresscb(fraction)
    }
  }
  // sentence tokenizer by Darkentor
  const tokenSentences = text
    .replace(/\0/g, '') // NUL is the sentence delimiter below, so every pre-existing one must go (a string pattern only removed the first)
    .replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
    .normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
    // Drop the period after titles and abbreviations so it is not taken for a sentence end
    .replace(/(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi, '$1')
    // Mark sentence ends (punctuation followed by a capitalised word, or any colon/semicolon) with NUL
    .replace(/(((^|\w).*?[^\w\s,]+)(?=\s+\W*[A-Z])|:|;)/g, '$1\0')
    .split(/\s*\0\s*/)
  await sleep(0)
  report(0)
  await sleep(0)
  let words = 0
  let syllables = 0
  let lastTime = Date.now()
  for (let i = 0; i < tokenSentences.length; i++) {
    const now = Date.now()
    if (lastTime + wakeupInterval < now) {
      lastTime = now
      report(i / tokenSentences.length)
      await sleep(0)
    }
    // strip punctuation and underscores (digits are word characters and stay), then split on spaces
    const sentenceWords = tokenSentences[i]
      .replace(/[^\w\s]|_/g, '')
      .replace(/\s+/g, ' ')
      .split(' ')
      .filter(Boolean)
    for (const word of sentenceWords) {
      syllables += syllable(word)
    }
    words += sentenceWords.length
  }
  const sentences = tokenSentences.length
  const grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
  const ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
  // Only give up when no score could be computed (e.g. 0/0 -> NaN); a score of
  // exactly 0 is a valid result and must not be mistaken for failure.
  if (!Number.isFinite(ease)) {
    return null
  }
  report(1)
  return { sentences, words, syllables, grade, ease }
}
/**
 * Apply typographic clean-up to a chunk of HTML.
 *
 * The markup is first run through typogr (smartypants + ordinals); the
 * entities it produces are turned into real characters, a series of text rules
 * (mostly taken from the Standard Ebooks typogrify tool) is applied to every
 * text token outside style/pre/code/kbd/script/math/title tags, and finally
 * no-break spaces, hair spaces and word joiners are written out as numeric
 * entities.
 *
 * Quotes, dashes, the ellipsis and the minus sign are written as \u escapes on
 * purpose: in the previous revision several of them were missing from the
 * source, which turned rules into no-ops or made them delete or insert text.
 *
 * @param {string} content - HTML to process.
 * @returns {string} The processed HTML.
 */
export function typogrify (content) {
  const { NO_BREAK_SPACE: NBSP, HAIR_SPACE: HAIR, WORD_JOINER: WJ } = unicode
  const LSQ = '\u2018' // left single quotation mark
  const RSQ = '\u2019' // right single quotation mark / apostrophe
  const LDQ = '\u201c' // left double quotation mark
  const RDQ = '\u201d' // right double quotation mark
  const EN_DASH = '\u2013'
  const EM_DASH = '\u2014'
  const ELLIPSIS = '\u2026'
  const MINUS = '\u2212'
  const SP = '[\\s' + NBSP + ']' // regex source: one whitespace or no-break space character

  // Patterns that depend on the constants above; built once instead of once per token
  const reWordJoiner = new RegExp(WJ, 'g')
  const reEnDash = new RegExp(WJ + '?' + EN_DASH + WJ + '?', 'g')
  const reNbspRun = new RegExp(NBSP + '[' + NBSP + ' ]+', 'g')
  const reSpaceRun = new RegExp(' [' + NBSP + ' ]+', 'g')
  const reOpenDoubleOpenSingle = new RegExp(LDQ + SP + '*' + LSQ, 'ig')
  const reCloseSingleCloseDouble = new RegExp(RSQ + SP + '*' + RDQ, 'ig')
  const reOpenDoubleCloseSingle = new RegExp(LDQ + SP + '*' + RSQ, 'ig')
  const reOpenSingleOpenDouble = new RegExp(LSQ + SP + '*' + LDQ, 'ig')
  const reCloseDoubleOpenSingle = new RegExp(RDQ + SP + '*' + LSQ + '([^a-zA-Z])', 'ig')
  const reEllipsisPeriod = new RegExp(SP + '?' + ELLIPSIS + SP + '?\\.', 'ig')
  const reEllipsis = new RegExp(SP + '?' + ELLIPSIS + SP + '?', 'ig')

  // Applied strictly in this order; '1/2/3' style overlaps depend on it
  const fractions = [
    ['1/4', '¼'], ['1/2', '½'], ['3/4', '¾'], ['1/3', '⅓'], ['2/3', '⅔'],
    ['1/5', '⅕'], ['2/5', '⅖'], ['3/5', '⅗'], ['4/5', '⅘'],
    ['1/6', '⅙'], ['5/6', '⅚'],
    ['1/8', '⅛'], ['3/8', '⅜'], ['5/8', '⅝'], ['7/8', '⅞']
  ]

  // Typographic rules for one text token (never called for tags or skipped text)
  const fixText = (txt) => {
    // Remove spaces between en and em dashes
    // Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
    txt = txt.replace(/([^.\s])\s*([\u2013\u2014])\s*/g, '$1$2')
    // First, remove stray word joiners
    txt = txt.replace(reWordJoiner, '')
    // Some older texts use the ,— construct; remove that archaism
    txt = txt.replace(/,\u2014/g, EM_DASH)
    // Add en dashes between numbers (10-20 becomes 10–20)
    txt = txt.replace(/([0-9]+)-([0-9]+)/g, '$1' + EN_DASH + '$2')
    // Add a word joiner on both sides of en dashes
    txt = txt.replace(reEnDash, WJ + EN_DASH + WJ)
    // Replace Mr., Mrs., and other abbreviations, and include a non-breaking space
    txt = txt.replace(/\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+/g, '$1.' + NBSP)
    txt = txt.replace(/\bNo\.\s+([0-9]+)/g, 'No.' + NBSP + '$1')
    // Fix common abbreviations: ‘n’ / ‘a’ between spaces take apostrophes on both sides
    txt = txt.replace(/(\s)\u2018([an])\u2019(\s)/ig, '$1' + RSQ + '$2' + RSQ + '$3')
    // Leading elisions (’tis, ’twas, ’em, ...) take an apostrophe, not an opening quote
    txt = txt.replace(/\u2018([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]was|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on)\b/g, RSQ + '$1')
    // NOTE(review): the leading \b means the next two rules only fire when the
    // quote directly follows a word character; kept as in the original, confirm intent.
    txt = txt.replace(/\b\u2018([Ee])re\b/g, RSQ + '$1re')
    txt = txt.replace(/\b\u2018([Aa])ven\b/g, RSQ + '$1ven') // 'aven't
    // nth (as in nth degree)
    txt = txt.replace(/\bn-?th\b/g, '<i>n</i>th')
    // Remove double spaces that use NO_BREAK_SPACE for spacing
    txt = txt.replace(reNbspRun, ' ')
    txt = txt.replace(reSpaceRun, ' ')
    // Put a hair space between directly adjacent single and double quotes
    txt = txt.replace(reOpenDoubleOpenSingle, LDQ + HAIR + LSQ)
    txt = txt.replace(reCloseSingleCloseDouble, RSQ + HAIR + RDQ)
    txt = txt.replace(reOpenDoubleCloseSingle, LDQ + HAIR + RSQ)
    txt = txt.replace(reOpenSingleOpenDouble, LSQ + HAIR + LDQ)
    // We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” ‘e said.
    txt = txt.replace(reCloseDoubleOpenSingle, RDQ + HAIR + LSQ + '$1')
    // Fix ellipses spacing
    txt = txt.replace(/\s*\.\s*\.\s*\.\s*/ig, ELLIPSIS)
    txt = txt.replace(reEllipsisPeriod, '.' + HAIR + ELLIPSIS)
    txt = txt.replace(reEllipsis, HAIR + ELLIPSIS + ' ')
    // Add non-breaking spaces between amounts with an abbreviated unit. E.g. 8 oz., 10 lbs.
    txt = txt.replace(/([0-9])\s+([a-z]{1,3}\.)/ig, '$1' + NBSP + '$2')
    // Add non-breaking spaces between Arabic numbers and AM/PM
    txt = txt.replace(/([0-9])\s+([ap])\.?m\./ig, '$1' + NBSP + '$2.m.')
    // Fractions
    for (const [ascii, glyph] of fractions) {
      txt = txt.split(ascii).join(glyph)
    }
    // Remove spaces between whole numbers and fractions
    txt = txt.replace(/([0-9,]+)\s+([¼½¾⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞])/g, '$1$2')
    // Use the Unicode Minus glyph (U+2212) for negative numbers
    txt = txt.replace(/([\s])-([0-9,]+)/g, '$1' + MINUS + '$2')
    return txt
  }

  content = typogr(content.replace(/&quot;/ig, '"').replace(/\.\.\.+/ig, '...')).chain().smartypants().ord().value()
  content = content.replace(/&nbsp;/ig, NBSP) // non-breaking space
  content = content.replace(/&#8217;/ig, RSQ).replace(/&#8216;/ig, LSQ) // curly single quotation marks
  content = content.replace(/&#8220;/ig, LDQ).replace(/&#8221;/ig, RDQ) // curly double quotation marks
  content = content.replace(/&#8230;/ig, ELLIPSIS) // ellipsis
  content = content.replace(/&#8211;/ig, EN_DASH).replace(/&#8212;/ig, EM_DASH) // en and em dash
  /*
   * Some of the following is from Standard Ebooks typogrify tool:
   * https://github.com/standardebooks/tools/blob/master/typogrify
   */
  // A single quote directly before a paragraph-final closing double quote is a closing one
  content = content.replace(/[\u2018\u2019]\u201d<\/p>/ig, RSQ + HAIR + RDQ + '</p>')

  const reSkipTags = /<(\/)?(style|pre|code|kbd|script|math|title)[^>]*>/i
  let inSkippedTag = false
  content = typogr.tokenize(content).map(({ type, txt }) => {
    if (type === 'tag') {
      // Skipping starts at an opening skip tag and ends at the next tag of any kind
      const skipMatch = reSkipTags.exec(txt)
      inSkippedTag = Boolean(skipMatch) && skipMatch[1] === undefined
      return txt
    }
    return inSkippedTag ? txt : fixText(txt)
  }).join('')

  // Write the invisible characters out as numeric entities
  content = content.replace(new RegExp(NBSP, 'g'), '&#160;')
  content = content.replace(new RegExp(HAIR, 'g'), '&#8202;')
  content = content.replace(reWordJoiner, '&#8288;')
  return content
}