From c66c73494c6669513ada66fc6f90119981faa5cf Mon Sep 17 00:00:00 2001
From: daniel-j
Date: Mon, 26 Mar 2018 16:50:39 +0200
Subject: [PATCH] Add kepubify, to be continued

---
 package.json          |   1 +
 src/kepubify.js       | 145 ++++++++++++++++++++++++++++++++++++++++++
 test/test-kepubify.js |   7 ++
 3 files changed, 153 insertions(+)
 create mode 100644 src/kepubify.js
 create mode 100644 test/test-kepubify.js

diff --git a/package.json b/package.json
index d005f67..14343ea 100644
--- a/package.json
+++ b/package.json
@@ -25,6 +25,7 @@
     "commander": "^2.15.1",
     "crc-32": "^1.2.0",
     "detect-node": "^2.0.3",
+    "elementtree": "^0.1.7",
     "escape-string-regexp": "^1.0.5",
     "file-type": "^7.2.0",
     "fonteditor-core": "^1.0.2",
diff --git a/src/kepubify.js b/src/kepubify.js
new file mode 100644
index 0000000..f885327
--- /dev/null
+++ b/src/kepubify.js
@@ -0,0 +1,145 @@
+
+import et from 'elementtree'
+
+const specialTags = /^(img|pre)$/i
+// Titles and abbreviations whose trailing dot does not end a sentence. The
+// leading \b stops the short ones (st, co, ing, ...) from matching word endings.
+const abbreviationRe = /\b(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi
+// Sentence boundary: text up to closing punctuation that is followed by
+// whitespace and an ASCII capital letter, or any colon or semicolon.
+const sentenceRe = /(((^|\w).*?[^\w\s,]+)(?=\s+\W*[A-Z])|:|;)/g
+// Span ids have the form kobo.<paragraph>.<segment>.
+// TODO: paragraphCounter is never advanced yet, so only the segment part varies.
+let paragraphCounter = 0
+let segmentCounter = 0
+
+/**
+ * Convert a document to Kobo's kepub markup: the children of <body> are
+ * wrapped in book-inner/book-columns divs and text is split into one
+ * koboSpan per sentence.
+ *
+ * @param {string} html - Well-formed markup whose root element has a <body> child.
+ * @returns {string} The transformed document, serialised by elementtree.
+ * @throws {TypeError} If no <body> element is found.
+ */
+export default function kepubify (html) {
+  const tree = et.parse(html)
+  const body = tree.find('./body')
+  if (!body) throw new TypeError('kepubify: document has no <body> element')
+
+  // Restart numbering so a document's ids do not depend on earlier calls.
+  paragraphCounter = 0
+  segmentCounter = 0
+
+  addDivs(body)
+  // Walk a copy: addSpansToNode splices spans into the live child array.
+  body.getchildren().slice().forEach((child) => addSpansToNode(child, body))
+  return tree.write()
+}
+
+/**
+ * Move all child elements of `body` into a new
+ * div.book-inner > div.book-columns wrapper appended to `body`.
+ *
+ * @param {Element} body - Element whose children are re-parented.
+ */
+function addDivs (body) {
+  const bookInner = et.Element('div', {class: 'book-inner'})
+  const bookColumns = et.SubElement(bookInner, 'div', {class: 'book-columns'})
+
+  // Detach every child in one call; removing them one by one while iterating
+  // the same live array skips every other child.
+  const children = body.getchildren()
+  children.splice(0, children.length).forEach((child) => bookColumns.append(child))
+  body.append(bookInner)
+}
+
+/**
+ * Build an empty <span class="koboSpan" id="kobo.P.S"> element.
+ *
+ * @param {number} paragraph - Paragraph part of the id.
+ * @param {number} segment - Segment part of the id.
+ * @returns {Element} The new span.
+ */
+function createSpan (paragraph, segment) {
+  return et.Element('span', {
+    class: 'koboSpan',
+    id: 'kobo.' + paragraph + '.' + segment
+  })
+}
+
+/**
+ * Create a span for the current counters and advance the segment counter,
+ * so no two spans of a document share an id.
+ *
+ * @returns {Element} The new span.
+ */
+function nextSpan () {
+  const span = createSpan(paragraphCounter, segmentCounter)
+  segmentCounter += 1
+  return span
+}
+
+/**
+ * Split `text` into sentences and wrap each one in a koboSpan.
+ *
+ * FIXME: the spans receive the normalised text, so collapsed whitespace,
+ * stripped diacritics and dropped abbreviation dots end up in the output.
+ *
+ * @param {Element} node - Unused; kept so existing call sites stay valid.
+ * @param {string} text - Text content to split.
+ * @returns {Element[]} One span per non-empty sentence, in reading order.
+ */
+function addSpans (node, text) {
+  const sentences = text
+    .replace(/\0/g, '') // NUL marks the split points below, so drop any already present
+    .replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
+    .normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
+    .replace(abbreviationRe, '$1')
+    .replace(sentenceRe, '$1\0')
+    .split('\0') // split on the marker only, keeping the space between sentences
+
+  return sentences
+    .filter((sentence) => sentence)
+    .map((sentence) => {
+      const span = nextSpan()
+      span.text = sentence
+      return span
+    })
+}
+
+/**
+ * Recursively replace the text inside and after `node` with koboSpans.
+ *
+ * @param {Element} node - Element to process.
+ * @param {Element} parent - Element whose child list contains `node`.
+ */
+function addSpansToNode (node, parent) {
+  const nodePosition = parent.getchildren().indexOf(node)
+
+  if (node.tag.match(specialTags)) {
+    // Wrap the element untouched: re-flowing the text of a <pre> would
+    // destroy its whitespace.
+    const span = nextSpan()
+    span.append(node)
+    parent.getchildren().splice(nodePosition, 1, span)
+  } else {
+    // Snapshot the children first; the spans inserted below and by the
+    // recursive calls would otherwise be visited (or hide later siblings).
+    const children = node.getchildren().slice()
+    if (node.text) {
+      addSpans(node, node.text).forEach((span, i) => {
+        node.getchildren().splice(i, 0, span)
+      })
+      node.text = null
+    }
+    children.forEach((child) => addSpansToNode(child, node))
+  }
+
+  if (node.tail) {
+    addSpans(node, node.tail).forEach((span, i) => {
+      parent.getchildren().splice(nodePosition + 1 + i, 0, span)
+    })
+    node.tail = null
+  }
+}
diff --git a/test/test-kepubify.js b/test/test-kepubify.js
new file mode 100644
index 0000000..ff2c574
--- /dev/null
+++ b/test/test-kepubify.js
@@ -0,0 +1,7 @@
+require('babel-register')
+// use a mock DOM so we can run mithril on the server
+require('mithril/test-utils/browserMock')(global)
+
+const kepubify = require('../src/kepubify').default
+
+console.log(kepubify(`<html><body><p>Some text. Woo or not. Here is another sentence.</p></body></html>`))