fimfic2epub/src/kepubify.js


import et from 'elementtree'

// Todo: Fix missing tails outside of <body></body>

export default function kepubify (html) {
  const tree = et.parse(html)
  const body = tree.find('./body')
  addDivs(body)
  const state = {paragraph: 0, segment: 0}
  body.getchildren().forEach((child) => {
    fixupTree(child, body)
    addSpansToNode(child, body, state)
  })
  return '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>\n' + tree.write({
    xml_declaration: false
  })
}

const specialTags = /^(img|pre|svg)$/i
const paragraphTags = /^(p|ol|ul)$/i

function addDivs (body) {
  const bookInner = et.Element('div', {class: 'book-inner'})
  const bookColumns = et.SubElement(bookInner, 'div', {class: 'book-columns'})

  bookColumns._children = body.getchildren()
  body._children = [bookInner]
}

function createSpan (paragraph, segment) {
  const span = et.Element('span', {
    class: 'koboSpan',
    id: 'kobo.' + paragraph + '.' + segment
  })
  return span
}

function textToSentences (text) {
  const tokenSentences = text
    .replace('\0', '')
    .replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space
    .normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]|[A-Z] don't account for them
    .replace(/(mr|mrs|dr|ms|prof|rev|col|cmdr|flt|lt|brgdr|hon|wng|capt|rt|revd|gen|cdre|admrl|herr|hr|frau|alderman|alhaji|brig|cdr|cik|consul|datin|dato|datuk|seri|dhr|dipl|ing|dott|sa|dra|drs|en|encik|eng|eur|exma|sra|exmo|sr|lieut|fr|fraulein|fru|graaf|gravin|grp|hajah|haji|hajim|hra|ir|lcda|lic|maj|mlle|mme|mstr|nti|sri|rva|sig|na|ra|sqn|ldr|srta|wg|co|esq|inc|iou|ltd|mdlle|messers|messrs|mlles|mm|mmes|mt|p\.s|pvt|st|viz)\./gi, '$1')
    .replace(/(((^|\w).*?[^\w\s,]+)(?=\s+\W*[A-Z])|:|;)/g, '$1\0')
    .split(/\s*\0/)

  for (let i = 0; i < tokenSentences.length; i++) {
    let s = tokenSentences[i]
    if (s.trim().length === 0) {
      if (i - 1 >= 0) tokenSentences[i - 1] += s
      tokenSentences.splice(i, 1)
      i--
    }
  }

  return tokenSentences.map((sentence, i) => {
    // const span = createSpan(state.paragraph, state.segment++)
    // span.text = sentence
    return sentence
  })
}

// Makes text nodes of .text and .tail as children
function fixupTree (node, parent) {
  if (node.tag !== '#') {
    if (node.text && !node.tag.match(specialTags)) {
      let el = et.Element('#')
      el.text = node.text
      node._children.unshift(el)
      delete node.text
    }
    if (node.tail) {
      let el = et.Element('#')
      el.text = node.tail
      let pos = parent._children.indexOf(node) + 1
      parent._children.splice(pos, 0, el)
      delete node.tail
    }
  }
  node._children.slice(0).forEach((child) => {
    fixupTree(child, node)
  })
}

function addSpansToNode (node, parent, state) {
  // text node
  if (node.tag === '#') {
    state.segment++

    let sentences = textToSentences(node.text)
    let pos

    sentences.forEach((sentence) => {
      let span = createSpan(state.paragraph, state.segment++)
      span.text = sentence

      // insert the span before the text node
      pos = parent._children.indexOf(node)
      parent._children.splice(pos, 0, span)
    })

    // remove the text node
    pos = parent._children.indexOf(node)
    parent._children.splice(pos, 1)
  }

  if (node.tag.match(paragraphTags)) {
    state.segment = 0
    state.paragraph++
  }

  node.getchildren().slice(0).forEach((child) => {
    addSpansToNode(child, node, state)
  })
}
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00
			`import et from 'elementtree'`

More kepubify stuff 2018-03-27 07:32:02 +13:00			`// Todo: Fix missing tails outside of <body></body>`

Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`export default function kepubify (html) {`
			`const tree = et.parse(html)`
			`const body = tree.find('./body')`
			`addDivs(body)`
More kepubify stuff 2018-03-27 07:32:02 +13:00			`const state = {paragraph: 0, segment: 0}`
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`body.getchildren().forEach((child) => {`
			`fixupTree(child, body)`
			`addSpansToNode(child, body, state)`
			`})`
More kepubify stuff 2018-03-27 07:32:02 +13:00			`return '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>\n' + tree.write({`
			`xml_declaration: false`
			`})`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`}`

More kepubify stuff 2018-03-27 07:32:02 +13:00			`const specialTags = /^(img\|pre\|svg)$/i`
			`const paragraphTags = /^(p\|ol\|ul)$/i`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00
			`function addDivs (body) {`
			`const bookInner = et.Element('div', {class: 'book-inner'})`
			`const bookColumns = et.SubElement(bookInner, 'div', {class: 'book-columns'})`

More kepubify stuff 2018-03-27 07:32:02 +13:00			`bookColumns._children = body.getchildren()`
			`body._children = [bookInner]`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`}`

			`function createSpan (paragraph, segment) {`
			`const span = et.Element('span', {`
			`class: 'koboSpan',`
			`id: 'kobo.' + paragraph + '.' + segment`
			`})`
			`return span`
			`}`

Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`function textToSentences (text) {`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`const tokenSentences = text`
			`.replace('\0', '')`
			`.replace(/\s+/g, ' ') // Replace all whitespace (including newlines) with a single space`
			`.normalize('NFD').replace(/[\u0300-\u036f]/g, '') // strip diacritics since JS's \w group and explicit [a-z]\|[A-Z] don't account for them`
			`.replace(/(mr\|mrs\|dr\|ms\|prof\|rev\|col\|cmdr\|flt\|lt\|brgdr\|hon\|wng\|capt\|rt\|revd\|gen\|cdre\|admrl\|herr\|hr\|frau\|alderman\|alhaji\|brig\|cdr\|cik\|consul\|datin\|dato\|datuk\|seri\|dhr\|dipl\|ing\|dott\|sa\|dra\|drs\|en\|encik\|eng\|eur\|exma\|sra\|exmo\|sr\|lieut\|fr\|fraulein\|fru\|graaf\|gravin\|grp\|hajah\|haji\|hajim\|hra\|ir\|lcda\|lic\|maj\|mlle\|mme\|mstr\|nti\|sri\|rva\|sig\|na\|ra\|sqn\|ldr\|srta\|wg\|co\|esq\|inc\|iou\|ltd\|mdlle\|messers\|messrs\|mlles\|mm\|mmes\|mt\|p\.s\|pvt\|st\|viz)\./gi, '$1')`
			`.replace(/(((^\|\w).?[^\w\s,]+)(?=\s+\W[A-Z])\|:\|;)/g, '$1\0')`
More kepubify stuff 2018-03-27 07:32:02 +13:00			`.split(/\s*\0/)`

			`for (let i = 0; i < tokenSentences.length; i++) {`
			`let s = tokenSentences[i]`
			`if (s.trim().length === 0) {`
lint and bump to 1.7.31 2018-03-27 08:52:04 +13:00			`if (i - 1 >= 0) tokenSentences[i - 1] += s`
More kepubify stuff 2018-03-27 07:32:02 +13:00			`tokenSentences.splice(i, 1)`
			`i--`
			`}`
			`}`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00
			`return tokenSentences.map((sentence, i) => {`
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`// const span = createSpan(state.paragraph, state.segment++)`
			`// span.text = sentence`
			`return sentence`
kepubify fix and tag fix 2018-03-27 08:48:28 +13:00			`})`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`}`

Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`// Makes text nodes of .text and .tail as children`
			`function fixupTree (node, parent) {`
			`if (node.tag !== '#') {`
			`if (node.text && !node.tag.match(specialTags)) {`
			`let el = et.Element('#')`
			`el.text = node.text`
			`node._children.unshift(el)`
			`delete node.text`
			`}`
			`if (node.tail) {`
			`let el = et.Element('#')`
			`el.text = node.tail`
			`let pos = parent._children.indexOf(node) + 1`
			`parent._children.splice(pos, 0, el)`
			`delete node.tail`
			`}`
More kepubify stuff 2018-03-27 07:32:02 +13:00			`}`
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`node._children.slice(0).forEach((child) => {`
			`fixupTree(child, node)`
			`})`
			`}`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`function addSpansToNode (node, parent, state) {`
			`// text node`
			`if (node.tag === '#') {`
			`state.segment++`
More kepubify stuff 2018-03-27 07:32:02 +13:00
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`let sentences = textToSentences(node.text)`
			`let pos`
More kepubify stuff 2018-03-27 07:32:02 +13:00
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`sentences.forEach((sentence) => {`
			`let span = createSpan(state.paragraph, state.segment++)`
			`span.text = sentence`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00			`// insert the span before the text node`
			`pos = parent._children.indexOf(node)`
			`parent._children.splice(pos, 0, span)`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`})`
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00
			`// remove the text node`
			`pos = parent._children.indexOf(node)`
			`parent._children.splice(pos, 1)`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`}`
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00
			`if (node.tag.match(paragraphTags)) {`
			`state.segment = 0`
			`state.paragraph++`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`}`
Fix kepub issue, bump version and update deps 2018-05-17 21:36:04 +12:00
			`node.getchildren().slice(0).forEach((child) => {`
			`addSpansToNode(child, node, state)`
			`})`
Add kepubify, to be continued 2018-03-27 03:50:39 +13:00			`}`