From c66c73494c6669513ada66fc6f90119981faa5cf Mon Sep 17 00:00:00 2001
From: daniel-j <daniel.j.hede@gmail.com>
Date: Mon, 26 Mar 2018 16:50:39 +0200
Subject: [PATCH] Add kepubify, to be continued

---
 package.json          |  1 +
 src/kepubify.js       | 78 +++++++++++++++++++++++++++++++++++++++++++
 test/test-kepubify.js |  7 ++++
 3 files changed, 86 insertions(+)
 create mode 100644 src/kepubify.js
 create mode 100644 test/test-kepubify.js
diff --git a/package.json b/package.json
index d005f67..14343ea 100644
--- a/package.json
+++ b/package.json
@@ -25,6 +25,7 @@
     "commander": "^2.15.1",
     "crc-32": "^1.2.0",
     "detect-node": "^2.0.3",
+    "elementtree": "^0.1.7",
     "escape-string-regexp": "^1.0.5",
     "file-type": "^7.2.0",
     "fonteditor-core": "^1.0.2",
diff --git a/src/kepubify.js b/src/kepubify.js
new file mode 100644
index 0000000..f885327
--- /dev/null
+++ b/src/kepubify.js
@@ -0,0 +1,78 @@
+
+import et from 'elementtree'
+
+export default function kepubify (html) { // takes markup as a string, returns whatever the parsed tree's write() produces
+  const doc = et.parse(html)
+  const bodyEl = doc.find('./body') // result of looking up './body' on the parsed tree
+  addDivs(bodyEl) // first nest the body's children inside the wrapper divs
+  bodyEl.getchildren().forEach((el) => addSpansToNode(el, bodyEl)) // then add spans to everything below body
+  return doc.write()
+}
+
+const specialTags = /^(img|pre)$/i // tags that addSpansToNode wraps whole in a span instead of descending into them
+const sentenceRe = /(((^|\w).*?[^\w\s,]+)(?=\s+\W*[A-Z])|:|;)/g // sentence boundary: text ending in punctuation that is followed by whitespace and a capital A-Z, or a single ':' / ';'
+let paragraph_counter = 0 // handed to createSpan as the first number of a span id
+let segment_counter = 0 // handed to createSpan as the second number of a span id
+
+/** Moves every child of `body` into a new div.book-columns nested in a new div.book-inner, then appends div.book-inner to `body`. Mutates `body`; returns nothing. */
+function addDivs (body) {
+  const bookInner = et.Element('div', {class: 'book-inner'})
+  const bookColumns = et.SubElement(bookInner, 'div', {class: 'book-columns'})
+
+  // splice(0) empties the live child array in one call and returns what it removed;
+  // splicing one child at a time inside forEach shifted the indexes and skipped every other child.
+  body.getchildren().splice(0).forEach((child) => bookColumns.append(child))
+  body.append(bookInner)
+}
+
+/** Returns a new, empty span Element with class "koboSpan" and id "kobo.<paragraph>.<segment>". */
+function createSpan (paragraph, segment) {
+  return et.Element('span', {
+    class: 'koboSpan',
+    id: `kobo.${paragraph}.${segment}`
+  })
+}
+
+/** Splits `text` into sentences and returns an array holding one koboSpan Element per non-empty piece (`node` is unused). */
+function addSpans (node, text) {
+  const tokenSentences = text
+    .replace(/\0/g, '') // drop every NUL (a string pattern only removes the first one); NUL marks sentence ends below
+    .replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
+    .normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
+    .replace(/\b(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi, '$1') // \b: whole-word abbreviations only, not word endings such as "garden."
+    .replace(sentenceRe, '$1\0')
+    .split('\0') // split on the marker alone so the space between two sentences is kept
+  // NOTE(review): the diacritic and abbreviation steps above still change the text that ends up inside the spans.
+  return tokenSentences.filter(Boolean).map((sentence) => {
+    const span = createSpan(paragraph_counter, segment_counter++) // advance so the span ids are not all identical
+    span.text = sentence
+    return span
+  })
+}
+
+/**
+ * Recursively rewrites `node` (a child of `parent`) so that its text and tail end up in koboSpan elements.
+ * img/pre nodes are wrapped whole in a span and left untouched inside. Mutates both elements; returns nothing.
+ */
+function addSpansToNode (node, parent) {
+  const siblings = parent.getchildren()
+  const nodePosition = siblings.indexOf(node)
+  if (specialTags.test(node.tag)) {
+    const span = createSpan(paragraph_counter, segment_counter++) // advance so each wrapper id is distinct
+    span.append(node)
+    siblings.splice(nodePosition, 1, span)
+  } else {
+    // Snapshot first: text/tail spans are inserted into the live child array, and iterating
+    // that array directly re-visited the inserted spans and skipped later siblings.
+    const children = node.getchildren().slice()
+    if (node.text) {
+      node.getchildren().unshift(...addSpans(node, node.text))
+      node.text = null
+    }
+    children.forEach((child) => addSpansToNode(child, node))
+  }
+  if (node.tail) {
+    siblings.splice(nodePosition + 1, 0, ...addSpans(node, node.tail)) // tail spans go right after the node (or its wrapper)
+    node.tail = null
+  }
+}
diff --git a/test/test-kepubify.js b/test/test-kepubify.js
new file mode 100644
index 0000000..ff2c574
--- /dev/null
+++ b/test/test-kepubify.js
@@ -0,0 +1,7 @@
+require('babel-register') // loaded before the source module, which uses import/export syntax — presumably so it is transpiled on require
+// use a mock DOM so we can run mithril on the server
+require('mithril/test-utils/browserMock')(global) // NOTE(review): src/kepubify.js as shown imports only elementtree — confirm this mock is needed here
+
+const kepubify = require('../src/kepubify').default // .default: kepubify is the module's default export
+
+console.log(kepubify(`<html><body><p>Some text. Woo or not. Here is <img /> another sentence.</p></body></html>`)) // prints the result only; nothing is asserted