fimfic2epub/src/cleanMarkup.js


import m from 'mithril'
import { XmlEntities } from 'html-entities'
import twemoji from 'twemoji'
import render from 'mithril-node-render'

import fetch from './fetch'
import { youtubeKey } from './constants'
import { replaceAsync } from './utils'

const entities = new XmlEntities()

/**
 * Cleans up a fragment of story HTML: normalizes unicode, swaps emoji for
 * twemoji SVG images, rewrites a handful of tags/entities, makes site-relative
 * links absolute and turns YouTube/SoundCloud embed links into static markup.
 *
 * @param {string} html - The HTML fragment to clean. Falsy input yields ''.
 * @returns {Promise<string>} The cleaned HTML.
 */
export async function cleanMarkup (html) {
  if (!html) {
    return ''
  }

  html = html.normalize('NFC') // normalize unicode

  html = twemoji.parse(html, { ext: '.svg', folder: 'svg' })

  // replace HTML entities with decimal entities
  /* eslint-disable no-control-regex */
  html = html.replace(/\xA0/g, '&#160;')
  html = html.replace(/&nbsp;/ig, '&#160;')
  html = html.replace(/&emsp;/ig, '&#8195;')
  html = html.replace(/[\u000C\u007F]/g, '') // remove invalid token (formfeed and u007F)
  /* eslint-enable no-control-regex */

  // fix some tags
  html = html.replace(/<u>/ig, '<span style="text-decoration: underline">')
  html = html.replace(/<\/u>/ig, '</span>')
  html = html.replace(/<s>/ig, '<span style="text-decoration: line-through">')
  html = html.replace(/<\/s>/ig, '</span>')
  html = html.replace(/<span style="font-variant-caps:small-caps">/ig, '<span class="smcp">')

  html = html.replace(/<p>\s*/ig, '<p>')
  html = html.replace(/\s*<\/p>/ig, '</p>')

  // NOTE: fixParagraphIndent is currently not applied here.

  html = fixDoubleSpacing(html)

  // fix floating blockquote tags
  // split/join replaces EVERY occurrence; String.replace with a string
  // pattern would only convert the first blockquote of each kind.
  const floatingBlockquotes = [
    ['<blockquote style="margin: 10px 0px; box-sizing:border-box; -moz-box-sizing:border-box;margin-right:25px; padding: 15px;background-color: #F7F7F7;border: 1px solid #AAA;width: 50%;float:left;box-shadow: 5px 5px 0px #EEE;">', '<blockquote class="left_insert">'],
    ['<blockquote style="margin: 10px 0px; box-sizing:border-box; -moz-box-sizing:border-box;margin-left:25px; padding: 15px;background-color: #F7F7F7;border: 1px solid #AAA;width: 50%;float:right;box-shadow: 5px 5px 0px #EEE;">', '<blockquote class="right_insert">']
  ]
  for (const [inlineStyled, classed] of floatingBlockquotes) {
    html = html.split(inlineStyled).join(classed)
  }

  // add alt attributes to images that don't have them
  const imageEmbed = /<img src="(.*?)" \/>/g
  html = await replaceAsync(html, imageEmbed, (match, src) => render(m('img', { src: entities.decode(src), alt: 'Image' }), { strict: true }))

  // Fix links pointing to pages on fimfiction
  // Example: <a href="/user/djazz" rel="nofollow">djazz</a>
  const matchLink = /(<a .?href=")(.+?)(".+?>)/g
  html = html.replace(matchLink, (match, head, url, tail) => {
    // Only site-relative paths ("/...") are prefixed; protocol-relative
    // ("//..."), absolute and fragment links are passed through unchanged.
    const isSiteRelative = url.substring(0, 1) === '/' && url.substring(0, 2) !== '//'
    return head + (isSiteRelative ? 'https://fimfiction.net' + url : url) + tail
  })

  const cache = new Map() // video id -> API snippet (null until fetched)
  const query = new Map() // decoded query string following "v=" -> video id

  const matchYouTube = /<p><a class="embed" href="https:\/\/www\.youtube\.com\/watch\?v=(.*?)">.*?<\/a><\/p>/g
  for (let ma; (ma = matchYouTube.exec(html));) {
    // The id is everything up to the first "&"; an empty "v=" has no id
    // and is skipped so that embed is simply left as it is.
    const idMatch = ma[1].match(/^[^&]+/)
    if (!idMatch) {
      continue
    }
    cache.set(idMatch[0], null)
    query.set(entities.decode(ma[1]), idMatch[0])
  }

  const matchSoundCloud = /<p><a class="embed" href="(https:\/\/soundcloud\.com\/.*?)">.*?<\/a><\/p>/g
  html = await replaceAsync(html, matchSoundCloud, (match, url) => {
    return render(m('.soundcloud.leftalign', [
      'SoundCloud: ', m('a', { href: entities.decode(url), rel: 'nofollow' }, url.replace('https://soundcloud.com/', '').replace(/[-_]/g, ' ').replace('/', ' - ').replace(/ {2}/g, ' '))
    ]), { strict: true })
  })

  if (cache.size === 0) {
    return html
  }
  return getYoutubeInfo([...cache.keys()])

  /**
   * Looks up the snippets of the given video ids and replaces every embed
   * whose video was returned. Embeds of videos missing from the response
   * are left untouched.
   */
  async function getYoutubeInfo (ids) {
    const idList = ids.map((id) => encodeURIComponent(id)).join(',')
    const raw = await fetch('https://www.googleapis.com/youtube/v3/videos?id=' + idList + '&part=snippet&maxResults=50&key=' + youtubeKey)

    let items = []
    try {
      const parsed = JSON.parse(raw)
      if (parsed && Array.isArray(parsed.items)) {
        items = parsed.items
      }
    } catch (e) {
      // Best effort: an unparsable response just means no embed is replaced.
    }

    for (const video of items) {
      if (video) {
        cache.set(video.id, video.snippet)
      }
    }

    // Always run the replacement: replaceYouTube returns the original markup
    // for videos without data, so a partial response still upgrades the rest.
    return replaceAsync(html, matchYouTube, replaceYouTube)
  }

  /**
   * Replacement callback for matchYouTube: builds a linked thumbnail figure
   * for a known video, or resolves to the unchanged match otherwise.
   */
  function replaceYouTube (match, queryString) {
    const decodedQuery = entities.decode(queryString)
    const youtubeId = query.get(decodedQuery)
    const data = cache.get(youtubeId)
    if (!data) {
      return Promise.resolve(match)
    }

    const thumbnails = data.thumbnails || {}
    const bestThumbnail = thumbnails.standard || thumbnails.high || thumbnails.medium || thumbnails.default
    // Fall back to the static thumbnail URL when the snippet carries none.
    const thumbnail = bestThumbnail && bestThumbnail.url
      ? bestThumbnail.url
      : 'https://img.youtube.com/vi/' + youtubeId + '/hqdefault.jpg'
    const title = data.title || 'Youtube Video'
    const caption = title + ' on YouTube'
    const youtubeUrl = 'https://youtube.com/watch?v=' + decodedQuery

    return render(m('figure.youtube', [
      m('a', { href: youtubeUrl, rel: 'nofollow' },
        m('img', { src: thumbnail, alt: title })
      ),
      m('figcaption', m('a', { href: youtubeUrl, rel: 'nofollow' }, caption))
    ]), { strict: true })
  }
}

/**
 * Collapses redundant whitespace in an HTML string.
 *
 * Runs of two or more whitespace characters become a single space. A tag
 * that has whitespace on both sides keeps only one space: before it for an
 * opening tag, after it for a closing tag.
 *
 * @param {string} html - The HTML to tidy.
 * @returns {string} The HTML with the spacing fixed.
 */
export function fixDoubleSpacing (html) {
  // from FimFictionConverter by Nyerguds
  const whitespaceRun = /\s\s+/g
  // push spaces to the closed side of tags
  const spacedOpeningTag = /\s+(<[a-z][^>]*>)\s+/g
  const spacedClosingTag = /\s+(<\/[a-z][^>]*>)\s+/g

  return html
    .replace(whitespaceRun, ' ')
    .replace(spacedOpeningTag, ' $1')
    .replace(spacedClosingTag, '$1 ')
}

/**
 * Converts paragraphs that were indented by hand with leading whitespace
 * into paragraphs carrying the "indented" class, then strips any leading
 * whitespace that is left at the start of a paragraph.
 *
 * @param {string} html - The HTML to process.
 * @param {number} [fixIndent=2] - Minimum number of leading whitespace
 *   characters that marks a paragraph as indented. Anything that is not a
 *   positive integer disables the fix and returns the input unchanged.
 * @returns {string} The processed HTML.
 */
export function fixParagraphIndent (html, fixIndent = 2) {
  // from FimFictionConverter by Nyerguds
  if (!Number.isInteger(fixIndent) || fixIndent <= 0) {
    return html
  }

  // only trigger indenting when finding as many whitespace characters in a row as indicated by the fixIndent setting.
  const leadingIndent = '\\s{' + fixIndent + '}\\s*'

  // Add indented class, with the search keeping into account that there could be opening tags behind the p tag.
  html = html.replace(new RegExp('<p>((<([^>]+)>)*)' + leadingIndent, 'g'), '<p class="indented">$1')
  html = html.replace(new RegExp('<p class="(((?!indented)[^>])*)">((<([^>]+)>)*)' + leadingIndent, 'g'), '<p class="indented $1">$3')

  // Cleanup of remaining start whitespace in paragraphs.
  // The whitespace class must be written \s in a regex literal (\\s would
  // match a literal backslash), and the tag name must be exactly "p" so
  // that e.g. <pre> content keeps its whitespace.
  html = html.replace(/<p(\s[^>]*)?>((<[^>]+>)*)\s+/g, '<p$1>$2')

  return html
}
moved code, fixed bugs, added title page. can run with node 2016-06-28 09:19:01 +12:00
			`import m from 'mithril'`
Fix image embeds 2017-06-07 21:13:31 +12:00			`import { XmlEntities } from 'html-entities'`
Use HTML download instead of fetching chapters separately Added twemoji Fixed progress bar 2017-06-12 23:53:17 +12:00			`import twemoji from 'twemoji'`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`import render from 'mithril-node-render'`
moved code, fixed bugs, added title page. can run with node 2016-06-28 09:19:01 +12:00
youtube api, pretty title page 2016-06-28 23:59:39 +12:00			`import fetch from './fetch'`
Fixes for Fimfiction 4.0 update. Drop Tidy, add paragraph customiztion 2017-06-07 08:15:05 +12:00			`import { youtubeKey } from './constants'`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`import { replaceAsync } from './utils'`
moved code, fixed bugs, added title page. can run with node 2016-06-28 09:19:01 +12:00
Fix image embeds 2017-06-07 21:13:31 +12:00			`const entities = new XmlEntities()`

Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`export async function cleanMarkup (html) {`
button to chapter comments, calculate word count (unused atm), clean up cleanup code 2016-08-20 02:51:40 +12:00			`if (!html) {`
			`return Promise.resolve('')`
			`}`

Fix for arabic etc, remove DELETE control character, add alt attributes, less sections 2018-05-09 07:11:50 +12:00			`html = html.normalize('NFC') // normalize unicode`

lint 2019-10-08 19:37:27 +13:00			`html = twemoji.parse(html, { ext: '.svg', folder: 'svg' })`
small fixes 2018-03-13 10:08:52 +13:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`// replace HTML entities with decimal entities`
fix an issue with an invalid control character (formfeed) 2020-03-04 21:10:37 +13:00			`/* eslint-disable no-control-regex */`
Fix for arabic etc, remove DELETE control character, add alt attributes, less sections 2018-05-09 07:11:50 +12:00			`html = html.replace(/\xA0/g, ' ')`
make small caps more backwards-compatible 2018-03-23 23:16:32 +13:00			`html = html.replace(/ /ig, ' ')`
			`html = html.replace(/&emsp;/ig, ' ')`
fix an issue with an invalid control character (formfeed) 2020-03-04 21:10:37 +13:00			`html = html.replace(/[\u000C\u007F]/g, '') // remove invalid token (formfeed and u007F)`
			`/* eslint-enable no-control-regex */`
Fixes for Fimfiction 4.0 update. Drop Tidy, add paragraph customiztion 2017-06-07 08:15:05 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`// fix some tags`
make small caps more backwards-compatible 2018-03-23 23:16:32 +13:00			`html = html.replace(/<u>/ig, '<span style="text-decoration: underline">')`
			`html = html.replace(/<\/u>/ig, '</span>')`
			`html = html.replace(/<s>/ig, '<span style="text-decoration: line-through">')`
			`html = html.replace(/<\/s>/ig, '</span>')`
			`html = html.replace(/<span style="font-variant-caps:small-caps">/ig, '<span class="smcp">')`

			`html = html.replace(/<p>\s*/ig, '<p>')`
			`html = html.replace(/\s*<\/p>/ig, '</p>')`
Fixes for Fimfiction 4.0 update. Drop Tidy, add paragraph customiztion 2017-06-07 08:15:05 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`// html = fixParagraphIndent(html)`
Fix youtube embeds 2017-06-07 17:46:57 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`html = fixDoubleSpacing(html)`
Fix youtube embeds 2017-06-07 17:46:57 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`// fix floating blockquote tags`
			`html = html.replace('<blockquote style="margin: 10px 0px; box-sizing:border-box; -moz-box-sizing:border-box;margin-right:25px; padding: 15px;background-color: #F7F7F7;border: 1px solid #AAA;width: 50%;float:left;box-shadow: 5px 5px 0px #EEE;">', '<blockquote class="left_insert">')`
			`html = html.replace('<blockquote style="margin: 10px 0px; box-sizing:border-box; -moz-box-sizing:border-box;margin-left:25px; padding: 15px;background-color: #F7F7F7;border: 1px solid #AAA;width: 50%;float:right;box-shadow: 5px 5px 0px #EEE;">', '<blockquote class="right_insert">')`
almost ready... 2016-08-24 02:32:55 +12:00
Fix for arabic etc, remove DELETE control character, add alt attributes, less sections 2018-05-09 07:11:50 +12:00			`// add alt attributes to images that don't have them`
lint 2019-10-08 22:31:42 +13:00			`const imageEmbed = /<img src="(.*?)" \/>/g`
lint 2019-10-08 19:37:27 +13:00			`html = await replaceAsync(html, imageEmbed, (match, src) => render(m('img', { src: entities.decode(src), alt: 'Image' }), { strict: true }))`
Fix image embeds 2017-06-07 21:13:31 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`// Fix links pointing to pages on fimfiction`
			`// Example: <a href="/user/djazz" rel="nofollow">djazz</a>`
lint 2019-10-08 22:31:42 +13:00			`const matchLink = /(<a .?href=")(.+?)(".+?>)/g`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`html = html.replace(matchLink, (match, head, url, tail) => {`
fix bad urls containing & on fimfic site, bump version 2018-10-10 20:31:39 +13:00			`if (url.substring(0, 1) !== '#' && url.substring(0, 2) !== '//' && url.substring(0, 4) !== 'http' && url.substring(0, 1) === '/') {`
			`url = 'https://fimfiction.net' + url`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`}`
Fix internal Fimfiction links 2016-08-22 21:42:33 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`return head + url + tail`
			`})`
Fix internal Fimfiction links 2016-08-22 21:42:33 +12:00
Fix critical bug in cleanMarkup 2018-03-26 18:49:15 +13:00			`const cache = new Map()`
			`const query = new Map()`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`let completeCount = 0`
youtube api, pretty title page 2016-06-28 23:59:39 +12:00
lint 2019-10-08 22:31:42 +13:00			`const matchYouTube = /<p><a class="embed" href="https:\/\/www\.youtube\.com\/watch\?v=(.?)">.?<\/a><\/p>/g`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`for (let ma; (ma = matchYouTube.exec(html));) {`
lint 2019-10-08 22:31:42 +13:00			`const youtubeId = ma[1].match(/^[^&]+/)[0]`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`cache.set(youtubeId, null)`
Fix critical bug in cleanMarkup 2018-03-26 18:49:15 +13:00			`query.set(entities.decode(ma[1]), youtubeId)`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`}`
almost ready... 2016-08-24 02:32:55 +12:00
lint 2019-10-08 22:31:42 +13:00			`const matchSoundCloud = /<p><a class="embed" href="(https:\/\/soundcloud\.com\/.?)">.?<\/a><\/p>/g`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`html = await replaceAsync(html, matchSoundCloud, (match, url) => {`
			`return render(m('.soundcloud.leftalign', [`
lint 2019-10-08 19:37:27 +13:00			`'SoundCloud: ', m('a', { href: entities.decode(url), rel: 'nofollow' }, url.replace('https://soundcloud.com/', '').replace(/[-_]/g, ' ').replace('/', ' - ').replace(/ {2}/g, ' '))`
			`]), { strict: true })`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`})`
Support for story embeds 2017-06-07 20:41:58 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`if (cache.size === 0) {`
			`return html`
			`} else {`
			`return getYoutubeInfo([...cache.keys()])`
			`}`
almost ready... 2016-08-24 02:32:55 +12:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`async function getYoutubeInfo (ids) {`
			`return fetch('https://www.googleapis.com/youtube/v3/videos?id=' + ids + '&part=snippet&maxResults=50&key=' + youtubeKey).then(async (raw) => {`
			`let data = []`
			`try {`
			`data = JSON.parse(raw).items`
small fixes 2018-03-13 10:08:52 +13:00			`} catch (e) {}`
			`if (!data) {`
			`data = []`
			`}`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`data.forEach((video) => {`
			`cache.set(video.id, video.snippet)`
			`completeCount++`
button to chapter comments, calculate word count (unused atm), clean up cleanup code 2016-08-20 02:51:40 +12:00			`})`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`if (completeCount === cache.size \|\| data.length === 0) {`
			`html = await replaceAsync(html, matchYouTube, replaceYouTube)`
almost ready... 2016-08-24 02:32:55 +12:00			`}`
Fix critical bug in cleanMarkup 2018-03-26 18:49:15 +13:00			`return html`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`})`
			`}`
Don't prettify html content, it breaks whitespace. 2017-05-26 08:11:44 +12:00
Fix critical bug in cleanMarkup 2018-03-26 18:49:15 +13:00			`function replaceYouTube (match, queryString) {`
			`queryString = entities.decode(queryString)`
lint 2019-10-08 22:31:42 +13:00			`const youtubeId = query.get(queryString)`
small fixes 2018-03-13 10:08:52 +13:00			`let thumbnail = 'https://img.youtube.com/vi/' + youtubeId + '/hqdefault.jpg'`
lint 2019-10-08 22:31:42 +13:00			`const youtubeUrl = 'https://youtube.com/watch?v=' + queryString`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`let title = 'Youtube Video'`
			`let caption = ''`
lint 2019-10-08 22:31:42 +13:00			`const data = cache.get(youtubeId)`
Fix critical bug in cleanMarkup 2018-03-26 18:49:15 +13:00
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`if (data) {`
			`thumbnail = (data.thumbnails.standard \|\| data.thumbnails.high \|\| data.thumbnails.medium \|\| data.thumbnails.default).url`
			`title = data.title`
			`caption = data.title + ' on YouTube'`
			`} else {`
Fix critical bug in cleanMarkup 2018-03-26 18:49:15 +13:00			`return Promise.resolve(match)`
button to chapter comments, calculate word count (unused atm), clean up cleanup code 2016-08-20 02:51:40 +12:00			`}`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`return render(m('figure.youtube', [`
lint 2019-10-08 19:37:27 +13:00			`m('a', { href: youtubeUrl, rel: 'nofollow' },`
			`m('img', { src: thumbnail, alt: title })`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`),`
lint 2019-10-08 19:37:27 +13:00			`m('figcaption', m('a', { href: youtubeUrl, rel: 'nofollow' }, caption))`
			`]), { strict: true })`
Upgrade mithril, bugfixes, UTF8 character support 2017-10-19 02:02:07 +13:00			`}`
moved code, fixed bugs, added title page. can run with node 2016-06-28 09:19:01 +12:00			`}`

			`export function fixDoubleSpacing (html) {`
			`// from FimFictionConverter by Nyerguds`
			`html = html.replace(/\s\s+/g, ' ')`
			`// push spaces to the closed side of tags`
			`html = html.replace(/\s+(<[a-z][^>]*>)\s+/g, ' $1')`
			`html = html.replace(/\s+(<\/[a-z][^>]*>)\s+/g, '$1 ')`
			`return html`
			`}`
autofix bad indentation 2016-08-11 08:26:14 +12:00
			`export function fixParagraphIndent (html) {`
Fix internal Fimfiction links 2016-08-22 21:42:33 +12:00			`// from FimFictionConverter by Nyerguds`
lint 2019-10-08 22:31:42 +13:00			`const fixIndent = 2`
autofix bad indentation 2016-08-11 08:26:14 +12:00			`if (fixIndent > 0) {`
			`// only trigger indenting when finding as many whitespace characters in a row as indicated by the FixIndent setting.`

			`// Add indented class, with the search keeping into account that there could be opening tags behind the p tag.`
			`html = html.replace(new RegExp('<p>((<([^>]+)>))\\s{' + fixIndent + '}\\s', 'g'), '<p class="indented">$1')`
			`html = html.replace(new RegExp('<p class="(((?!indented)[^>]))">((<([^>]+)>))\\s{' + fixIndent + '}\\s*', 'g'), '<p class="indented $1">$3')`

			`// Cleanup of remaining start whitespace in already indented paragraphs:`
			`html = html.replace(/<p([^>])>((<[^>]+>))\\s+/g, '<p$1>$2')`
			`}`
			`return html`
			`}`