Better reading ease method, async with thread wakeups, to prevent thread blocking

This commit is contained in:
daniel-j 2018-03-14 09:33:46 +01:00
parent c328997748
commit 37b8a6671c
5 changed files with 108 additions and 33 deletions

View file

@ -12,9 +12,9 @@ const args = require('commander')
.option('-e, --no-external', 'Don\'t embed external resources, such as images (breaks EPUB spec)')
.option('-n, --no-notes', 'Don\'t include author notes')
.option('-i, --notes-index', 'Create an index with all author notes at the end of the ebook')
.option('-p, --paragraphs <style>', 'Select a paragraph style [spaced|indented|indentedall|both]', 'spaced')
.option('-j, --join-subjects', 'Join subjects to a single value')
.option('-r, --reading-ease', 'Calculate Flesch reading ease (slow for long stories)')
.option('-p, --paragraphs <style>', 'Select a paragraph style <spaced|indented|indentedall|both>', 'spaced')
.option('-j, --join-subjects', 'Join dc:subjects to a single value')
.option('-r, --reading-ease [wakeup-interval]', 'Calculate Flesch reading ease, optional thread wakeup interval in ms (default: 50)')
.option('-C, --cover <url>', 'Set cover image url')
.parse(process.argv)
@ -40,7 +40,8 @@ const ffc = new FimFic2Epub(STORY_ID, {
includeExternal: !!args.external,
paragraphStyle: args.paragraphs,
joinSubjects: !!args.joinSubjects,
calculateReadingEase: !!args.readingEase
calculateReadingEase: !!args.readingEase,
readingEaseWakeupInterval: typeof args.readingEase === 'boolean' ? 50 : +args.readingEase
})
ffc.coverUrl = args.cover

View file

@ -1,6 +1,6 @@
{
"name": "fimfic2epub",
"version": "1.7.9",
"version": "1.7.11",
"description": "Tool to generate improved EPUB ebooks from Fimfiction stories",
"author": "djazz",
"repository": {
@ -36,9 +36,9 @@
"mithril-node-render": "^2.2.0",
"node-png": "^0.4.3",
"pretty-data": "^0.40.0",
"reading-level": "0.0.7",
"request": "^2.85.0",
"sanitize-filename": "^1.6.0",
"syllable": "^3.0.0",
"twemoji": "^2.5.0",
"zero-fill": "^2.2.3"
},

View file

@ -98,7 +98,8 @@ class FimFic2Epub extends Emitter {
includeExternal: true,
paragraphStyle: 'spaced',
joinSubjects: false,
calculateReadingEase: false
calculateReadingEase: false,
readingEaseWakeupInterval: isNode ? 50 : 200 // lower for node, to not slow down thread
}
this.options = Object.assign(this.defaultOptions, options)
@ -130,6 +131,7 @@ class FimFic2Epub extends Emitter {
this.coverFilename = ''
this.coverType = ''
this.coverImageDimensions = {width: 0, height: 0}
this.readingEase = null
this.hasRemoteResources = {
titlePage: false
@ -383,19 +385,31 @@ class FimFic2Epub extends Emitter {
this.notesHtml[i] = html
})
}
chain = chain.then(() => {
if (!ch.realWordCount) {
ch.realWordCount = utils.htmlWordCount(chapter.content)
}
if (this.options.calculateReadingEase && !ch.readingEase) {
let text = utils.htmlToText(chapter.content)
text = text.replace(/\s+/g, ' ').trim()
ch.readingEase = utils.readingEase(text)
}
this.progress(0, (i + 1) / this.chapters.length, 'Processed chapter ' + (i + 1) + ' / ' + this.chapters.length)
}).then(() => new Promise((resolve) => setTimeout(resolve, 20)))
chain = chain
.then(() => {
if (!ch.realWordCount) {
ch.realWordCount = utils.htmlWordCount(chapter.content)
}
this.progress(0, (i + 1) / this.chapters.length, 'Processed chapter ' + (i + 1) + ' / ' + this.chapters.length)
})
.then(() => new Promise((resolve) => setTimeout(resolve, 0)))
}
// After the per-chapter steps queued on `chain`, compute one reading-ease
// result for the whole story (all chapter texts joined) and cache it on
// this.readingEase so it is not recalculated when already present.
chain = chain.then(async () => {
  if (this.options.calculateReadingEase && !this.readingEase) {
    const content = this.chapters.reduce((str, ch) => {
      return str + utils.htmlToText(ch.content) + '\n\n'
    }, '')
    this.progress(0, 0, 'Calculating Flesch reading ease...')
    this.readingEase = await utils.readingEase(
      // The option is declared as `readingEaseWakeupInterval`; reading any
      // other key yields undefined, which makes utils.readingEase fall back
      // to its default interval (Infinity) and never yield during the loop.
      content, this.options.readingEaseWakeupInterval,
      (progress) => {
        this.progress(0, progress, 'Calculating Flesch reading ease ' + Math.round(progress * 100) + '%')
      }
    )
  }
})
return chain
}

View file

@ -346,15 +346,6 @@ function calcWordCount (chapters) {
}
return count
}
/**
 * Mean of the per-chapter `readingEase.ease` values, rounded to two decimals.
 *
 * @param {Array<{readingEase: {ease: number}}>} chapters
 * @returns {number} rounded average; NaN when `chapters` is empty
 */
function calcReadingEase (chapters) {
  const total = chapters.reduce((sum, chapter) => sum + chapter.readingEase.ease, 0)
  const mean = total / chapters.length
  return Math.round(mean * 100) / 100
}
export function createTitlePage (ffc) {
const tokenContent = '%%HTML_CONTENT_' + Math.random() + '%%'
@ -402,7 +393,7 @@ export function createTitlePage (ffc) {
ffc.storyInfo.publishDate && infoBox('First Published', prettyDate(new Date(ffc.storyInfo.publishDate * 1000))),
infoBox('Last Modified', prettyDate(new Date(ffc.storyInfo.date_modified * 1000))),
infoBox('Word Count', calcWordCount(ffc.storyInfo.chapters).toLocaleString('en-GB')),
ffc.options.calculateReadingEase ? infoBox('Reading Ease', calcReadingEase(ffc.storyInfo.chapters).toLocaleString('en-GB')) : null
ffc.options.calculateReadingEase && ffc.readingEase ? infoBox('Reading Ease', (Math.round(ffc.readingEase.ease * 100) / 100).toLocaleString('en-GB')) : null
]),
// m('hr'),
m('section.tags', [

View file

@ -1,7 +1,7 @@
import htmlToTextModule from 'html-to-text'
import matchWords from 'match-words'
import { readingLevel } from 'reading-level'
import syllable from 'syllable'
export function replaceAsync (str, re, callback) {
// http://es5.github.io/#x15.5.4.11
@ -72,6 +72,10 @@ export function htmlToText (html) {
})
}
/**
 * Promise-based timer: the returned promise resolves (with no value) when a
 * `setTimeout` of `ms` milliseconds fires.
 *
 * @param {number} ms delay handed to setTimeout
 * @returns {Promise<void>}
 */
export function sleep (ms) {
  const schedule = (wake) => {
    setTimeout(wake, ms)
  }
  return new Promise(schedule)
}
export function htmlWordCount (html) {
let text = htmlToText(html)
@ -82,8 +86,73 @@ export function htmlWordCount (html) {
return count
}
export function readingEase (text) {
const result = readingLevel(text, 'full')
const ease = 206.835 - 1.015 * (result.words / result.sentences) - 84.6 * (result.syllables / result.words)
return {ease, gradeLevel: result.unrounded}
/**
 * Compute readability statistics for a text without monopolising the thread:
 * the sentence loop yields via `sleep(0)` whenever more than `wakeupInterval`
 * milliseconds have passed since the last yield.
 *
 * `ease` uses the Flesch reading ease formula and `grade` the Flesch-Kincaid
 * grade level formula, both derived from the sentence, word and syllable
 * counts gathered here.
 *
 * @param {string} text plain text to analyse
 * @param {number} [wakeupInterval=Infinity] milliseconds between yields;
 *   with the default the loop never yields
 * @param {function(number): void} [progresscb] called with a fraction in
 *   the range 0..1 as work progresses
 * @returns {Promise<?{sentences: number, words: number, syllables: number, grade: number, ease: number}>}
 *   the statistics, or null when the text has no letters or no score can be
 *   computed
 */
export async function readingEase (text, wakeupInterval = Infinity, progresscb) {
  if (typeof text !== 'string' || !/[a-z]/i.test(text)) {
    return null
  }
  const report = typeof progresscb === 'function' ? progresscb : () => {}

  await sleep(0)

  // sentence tokenizer by Darkentor
  const tokenSentences = text
    // NUL is used below as the sentence separator, so every pre-existing one
    // must go (a string pattern would only remove the first occurrence)
    .replace(/\0/g, '')
    .replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
    // Drop the full stop after titles/abbreviations so they do not end a
    // sentence. The leading \b keeps ordinary words that merely end in one of
    // these letter groups ("garden.", "then.") from losing their full stop.
    .replace(/\b(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg)\./gi, '$1')
    // Mark a boundary after ':' / ';' and after punctuation that is followed
    // by whitespace and an upper-case letter
    .replace(/(((^|\w).*?[^\w\s,]+)(?=\s+\W*[A-Z])|:|;)/g, '$1\0')
    .split(/\s*\0\s*/)

  report(0)
  await sleep(0)

  let sentences = 0
  let words = 0
  let syllables = 0
  let lastTime = Date.now()

  for (let i = 0; i < tokenSentences.length; i++) {
    const now = Date.now()
    if (lastTime + wakeupInterval < now) {
      lastTime = now
      report(i / tokenSentences.length)
      await sleep(0)
    }

    // strip punctuation and underscores, then split on whitespace
    const sentenceWords = tokenSentences[i]
      .replace(/[^\w\s]|_/g, '')
      .replace(/\s+/g, ' ')
      .split(' ')
      .filter(word => word)

    // A token without any word (e.g. the empty piece after a trailing ':' or
    // ';') is not a sentence and must not dilute the words-per-sentence ratio
    if (sentenceWords.length === 0) {
      continue
    }

    sentences++
    words += sentenceWords.length
    syllables += sentenceWords.reduce((total, word) => total + syllable(word), 0)
  }

  const grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
  const ease = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

  // the tokens are no longer needed; empty the array (presumably to release
  // the memory of long stories early)
  tokenSentences.length = 0

  // 0 / 0 yields NaN when nothing was counted; an ease of exactly 0 is a
  // legitimate score, so test for finiteness instead of truthiness
  if (!Number.isFinite(ease)) {
    return null
  }

  report(1)
  return { sentences, words, syllables, grade, ease }
}