fimfic2epub/src/cleanMarkup.js

157 lines
6.2 KiB
JavaScript
Raw Normal View History

import m from 'mithril'
2021-05-25 11:42:43 +12:00
import { decode } from 'html-entities'
import twemoji from 'twemoji'
import render from 'mithril-node-render'
2020-10-19 23:43:48 +13:00
import fetchRemote from './fetchRemote'
import { youtubeKey } from './constants'
import { replaceAsync } from './utils'
/**
 * Clean up a piece of story HTML so it can be embedded in the generated book.
 *
 * The passes run in this order:
 *  1. Unicode NFC normalisation and `twemoji.parse` (SVG assets).
 *  2. Named / raw non-breaking and em spaces become decimal entities, and
 *     form feed / DEL characters are dropped.
 *  3. `<u>`, `<s>` and small-caps spans are rewritten, paragraph edges are
 *     trimmed and whitespace is collapsed via `fixDoubleSpacing`.
 *  4. Floating blockquotes get a class instead of their inline style.
 *  5. `<img src="..." />` tags are re-rendered with an `alt` attribute.
 *  6. Site-relative links are made absolute (https://fimfiction.net/...).
 *  7. SoundCloud embeds become plain links; YouTube embeds become a
 *     thumbnail figure when the YouTube API returns a snippet for them.
 *
 * @param {string} html Markup to clean. Any falsy value yields ''.
 * @returns {Promise<string>} The cleaned markup.
 */
export async function cleanMarkup (html) {
  if (!html) {
    return ''
  }

  html = html.normalize('NFC') // normalize unicode

  html = twemoji.parse(html, { ext: '.svg', folder: 'svg' })

  // replace HTML entities with decimal entities
  // This has to happen before fixDoubleSpacing(): `\s` also matches U+00A0,
  // so raw non-breaking spaces would otherwise be collapsed like ordinary ones.
  /* eslint-disable no-control-regex */
  html = html.replace(/\xA0/g, '&#160;')
  html = html.replace(/&nbsp;/ig, '&#160;')
  html = html.replace(/&emsp;/ig, '&#8195;')
  html = html.replace(/[\u000C\u007F]/g, '') // remove invalid token (formfeed and u007F)
  /* eslint-enable no-control-regex */

  // fix some tags
  html = html.replace(/<u>/ig, '<span style="text-decoration: underline">')
  html = html.replace(/<\/u>/ig, '</span>')
  html = html.replace(/<s>/ig, '<span style="text-decoration: line-through">')
  html = html.replace(/<\/s>/ig, '</span>')
  html = html.replace(/<span style="font-variant-caps:small-caps">/ig, '<span class="smcp">')

  html = html.replace(/<p>\s*/ig, '<p>')
  html = html.replace(/\s*<\/p>/ig, '</p>')

  // html = fixParagraphIndent(html)

  html = fixDoubleSpacing(html)

  // fix floating blockquote tags
  // split/join replaces EVERY occurrence; String#replace with a string
  // pattern would only convert the first blockquote of each kind.
  const leftInsert = '<blockquote style="margin: 10px 0px; box-sizing:border-box; -moz-box-sizing:border-box;margin-right:25px; padding: 15px;background-color: #F7F7F7;border: 1px solid #AAA;width: 50%;float:left;box-shadow: 5px 5px 0px #EEE;">'
  const rightInsert = '<blockquote style="margin: 10px 0px; box-sizing:border-box; -moz-box-sizing:border-box;margin-left:25px; padding: 15px;background-color: #F7F7F7;border: 1px solid #AAA;width: 50%;float:right;box-shadow: 5px 5px 0px #EEE;">'
  html = html.split(leftInsert).join('<blockquote class="left_insert">')
  html = html.split(rightInsert).join('<blockquote class="right_insert">')

  // add alt attributes to images that don't have them
  const imageEmbed = /<img src="(.*?)" \/>/g
  html = await replaceAsync(html, imageEmbed, (match, src) => render(m('img', { src: decode(src, { level: 'xml' }), alt: 'Image' }), { strict: true }))

  // Fix links pointing to pages on fimfiction
  // Example: <a href="/user/djazz" rel="nofollow">djazz</a>
  const matchLink = /(<a .?href=")(.+?)(".+?>)/g
  html = html.replace(matchLink, (match, head, url, tail) => {
    // site-relative ("/path") but not protocol-relative ("//host/path")
    if (url.startsWith('/') && !url.startsWith('//')) {
      url = 'https://fimfiction.net' + url
    }
    return head + url + tail
  })

  // cache: video id -> API snippet (null until fetched)
  // query: decoded query string of the embed -> video id
  const cache = new Map()
  const query = new Map()

  const matchYouTube = /<p><a class="embed" href="https:\/\/www\.youtube\.com\/watch\?v=(.*?)">.*?<\/a><\/p>/g
  for (let ma; (ma = matchYouTube.exec(html));) {
    // the id is everything up to the first "&"; an empty `v=` has no id
    const idMatch = ma[1].match(/^[^&]+/)
    if (!idMatch) {
      continue
    }
    const youtubeId = idMatch[0]
    cache.set(youtubeId, null)
    query.set(decode(ma[1], { level: 'xml' }), youtubeId)
  }

  const matchSoundCloud = /<p><a class="embed" href="(https:\/\/soundcloud\.com\/.*?)">.*?<\/a><\/p>/g
  html = await replaceAsync(html, matchSoundCloud, (match, url) => {
    return render(m('.soundcloud.leftalign', [
      'SoundCloud: ', m('a', { href: decode(url, { level: 'xml' }), rel: 'nofollow' }, url.replace('https://soundcloud.com/', '').replace(/[-_]/g, ' ').replace('/', ' - ').replace(/ {2}/g, ' '))
    ]), { strict: true })
  })

  if (cache.size === 0) {
    return html
  }
  return getYoutubeInfo([...cache.keys()])

  /**
   * Fetch the snippets for the given video ids, store them in `cache` and
   * return the markup with every resolvable YouTube embed replaced.
   * A rejection from fetchRemote is propagated to the caller.
   */
  async function getYoutubeInfo (ids) {
    const idList = ids.map((id) => encodeURIComponent(id)).join(',')
    const raw = await fetchRemote('https://www.googleapis.com/youtube/v3/videos?id=' + idList + '&part=snippet&maxResults=50&key=' + youtubeKey)

    let data = []
    try {
      data = JSON.parse(raw).items
    } catch (e) {
      console.error('Error parsing Youtube API response:', e)
    }
    if (!Array.isArray(data)) {
      data = []
    }

    for (const video of data) {
      console.log('Adding Youtube video ' + video.id + ' to cache')
      cache.set(video.id, video.snippet)
    }

    // Always run the replacement: replaceYouTube leaves embeds without a
    // snippet untouched, so one missing video must not block the others.
    return replaceAsync(html, matchYouTube, replaceYouTube)
  }

  /**
   * Replacement callback for `matchYouTube`: renders a linked thumbnail
   * figure, or resolves to the original match when no snippet is cached.
   */
  function replaceYouTube (match, queryString) {
    queryString = decode(queryString, { level: 'xml' })
    const youtubeId = query.get(queryString)
    const data = cache.get(youtubeId)
    if (!data) {
      return Promise.resolve(match)
    }

    const youtubeUrl = 'https://youtube.com/watch?v=' + queryString
    const thumbs = data.thumbnails || {}
    const best = thumbs.standard || thumbs.high || thumbs.medium || thumbs.default
    // fall back to the predictable thumbnail URL when the snippet has none
    const thumbnail = best ? best.url : 'https://img.youtube.com/vi/' + youtubeId + '/hqdefault.jpg'
    const title = data.title || 'Youtube Video'
    const caption = title + ' on YouTube'

    return render(m('figure.youtube', [
      m('a', { href: youtubeUrl, rel: 'nofollow' },
        m('img', { src: thumbnail, alt: title })
      ),
      m('figcaption', m('a', { href: youtubeUrl, rel: 'nofollow' }, caption))
    ]), { strict: true })
  }
}
/**
 * Tidy up whitespace in a piece of markup.
 *
 * Runs of two or more whitespace characters are collapsed into one space.
 * Whitespace on both sides of an opening tag is reduced to a single space
 * before the tag, and whitespace on both sides of a closing tag is reduced
 * to a single space after the tag. Only lowercase tag names are matched.
 *
 * @param {string} html Markup to tidy.
 * @returns {string} The markup with redundant whitespace removed.
 */
export function fixDoubleSpacing (html) {
  // from FimFictionConverter by Nyerguds
  const passes = [
    // collapse whitespace runs
    [/\s\s+/g, ' '],
    // push spaces to the closed side of tags
    [/\s+(<[a-z][^>]*>)\s+/g, ' $1'],
    [/\s+(<\/[a-z][^>]*>)\s+/g, '$1 ']
  ]
  return passes.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    html
  )
}
2016-08-11 08:26:14 +12:00
/**
 * Convert paragraphs that start with whitespace indentation into paragraphs
 * carrying an `indented` class, and strip whatever leading whitespace is left.
 *
 * @param {string} html Markup to process.
 * @param {number} [fixIndent=2] Minimum number of leading whitespace
 *   characters that marks a paragraph as indented. Values that are not a
 *   positive integer disable the function and return `html` unchanged.
 * @returns {string} The processed markup.
 */
export function fixParagraphIndent (html, fixIndent = 2) {
  // from FimFictionConverter by Nyerguds
  if (!Number.isInteger(fixIndent) || fixIndent <= 0) {
    return html
  }

  // only trigger indenting when finding as many whitespace characters in a row as indicated by the fixIndent setting.
  // Add indented class, with the search keeping into account that there could be opening tags behind the p tag.
  const leadingSpace = '\\s{' + fixIndent + '}\\s*'
  html = html.replace(new RegExp('<p>((<([^>]+)>)*)' + leadingSpace, 'g'), '<p class="indented">$1')
  html = html.replace(new RegExp('<p class="(((?!indented)[^>])*)">((<([^>]+)>)*)' + leadingSpace, 'g'), '<p class="indented $1">$3')

  // Cleanup of remaining start whitespace in already indented paragraphs.
  // In a regex literal whitespace is `\s` (a doubled backslash would match a
  // literal "\" followed by "s"). `(\s[^>]*)?` keeps the pattern from also
  // matching other tags whose name starts with "p", such as <pre>.
  html = html.replace(/<p(\s[^>]*)?>((<[^>]+>)*)\s+/g, '<p$1>$2')

  return html
}