Use HTML download instead of fetching chapters separately

Added twemoji
Fixed progress bar
This commit is contained in:
daniel-j 2017-06-12 13:53:17 +02:00
parent d780668cfb
commit dac3a01a1b
5 changed files with 84 additions and 23 deletions

View file

@ -4,7 +4,7 @@
"name": "fimfic2epub",
"short_name": "fimfic2epub",
"description": "Improved EPUB exporter for Fimfiction",
"version": "1.6.3",
"version": "1.6.4",
"icons": {
"128": "icon-128.png"

View file

@ -1,6 +1,6 @@
{
"name": "fimfic2epub",
"version": "1.6.3",
"version": "1.6.4",
"description": "Tool to generate improved EPUB ebooks from Fimfiction stories",
"author": "djazz",
"repository": {
@ -26,12 +26,14 @@
"html-entities": "^1.2.0",
"html-to-text": "^2.1.3",
"image-size": "^0.5.0",
"is-svg": "^2.1.0",
"jszip": "^3.1.2",
"match-words": "^0.1.0",
"mithril": "^0.2.5",
"pretty-data": "^0.40.0",
"request": "^2.74.0",
"sanitize-filename": "^1.6.0",
"twemoji": "^2.3.0",
"zero-fill": "^2.2.3"
},
"devDependencies": {

View file

@ -7,6 +7,7 @@ import sanitize from 'sanitize-filename'
import URL from 'url'
import isNode from 'detect-node'
import fileType from 'file-type'
import isSvg from 'is-svg'
import sizeOf from 'image-size'
import Emitter from 'es6-event-emitter'
@ -22,6 +23,8 @@ import { containerXml } from './constants'
const entities = new XmlEntities()
const trimWhitespace = /^\s*(<br\s*\/?\s*>)+|(<br\s*\/?\s*>)+\s*$/ig
class FimFic2Epub extends Emitter {
static getStoryId (id) {
@ -197,6 +200,58 @@ class FimFic2Epub extends Emitter {
this.chaptersWithNotes.length = 0
this.progress(0, 0, 'Fetching chapters...')
let chapterCount = this.storyInfo.chapters.length
// Download the whole story as a single HTML document and split it into
// chapters locally, instead of requesting every chapter separately.
let url = 'https://www.fimfiction.net/story/download/' + this.storyInfo.id + '/html'

this.pcache.chapters = fetch(url).then((html) => {
  // `p` is a sequential chain: each chapter is cleaned, then recorded, before
  // the next one starts, so the reported progress matches the finished work.
  let p = Promise.resolve()
  // Capture group 1 is everything between the chapter's </header> and </article>.
  let matchChapter = /<article class="chapter">[\s\S]*?<\/header>([\s\S]*?)<\/article>/g

  for (let ma, i = 0; (ma = matchChapter.exec(html)); i++) {
    let chapterContent = ma[1].replace(/<footer>[\s\S]*?<\/footer>/g, '').trim()

    // The author's note is an <aside>; when it is the very first thing in the
    // chapter body, remember that it comes before the chapter text.
    let authorNotesPos = chapterContent.indexOf('<aside ')
    let notesContent = ''
    let notesFirst = authorNotesPos === 0

    if (authorNotesPos !== -1) {
      // Cut the note out of the chapter body and keep its inner markup,
      // without the note's own heading.
      chapterContent = chapterContent.replace(/<aside class="authors-note">([\s\S]*?)<\/aside>/, (match, content) => {
        content = content.replace(/<header><h1>.*?<\/h1><\/header>/, '')
        notesContent = content.trim().replace(trimWhitespace, '')
        return ''
      })
    }

    chapterContent = chapterContent.trim().replace(trimWhitespace, '')
    let chapter = {content: chapterContent, notes: notesContent, notesFirst}

    // NOTE: .then() must be given a callback. Passing the promise returned by
    // cleanMarkup() directly is silently ignored by .then(), which let the
    // chain continue (progress, word count, final resolve) before the markup
    // had actually been cleaned and detached any cleanMarkup rejection from it.
    p = p.then(() => cleanMarkup(chapter.content)).then((content) => {
      chapter.content = content
    })

    if (notesContent) {
      p = p.then(() => cleanMarkup(chapter.notes)).then((notes) => {
        chapter.notes = notes
      })
    }

    p = p.then(() => {
      this.progress(0, (i + 1) / chapterCount, 'Processed chapter ' + (i + 1) + ' / ' + chapterCount)
      if (chapter.notes) {
        this.hasAuthorNotes = true
        this.chaptersWithNotes.push(i)
      }
      this.chapters[i] = chapter
      // Word count is taken from the cleaned chapter text.
      let ch = this.storyInfo.chapters[i]
      ch.realWordCount = htmlWordCount(chapter.content)
    })
  }

  return p
}).then(() => {
  // Clear the cached promise once all chapters have been processed.
  this.pcache.chapters = null
})
/*
this.pcache.chapters = new Promise((resolve, reject) => {
let chapters = this.storyInfo.chapters
let chapterCount = this.storyInfo.chapters.length
@ -251,6 +306,7 @@ class FimFic2Epub extends Emitter {
}).then(() => {
this.pcache.chapters = null
})
*/
return this.pcache.chapters
}
@ -284,6 +340,15 @@ class FimFic2Epub extends Emitter {
fetchRemote(url, 'arraybuffer').then((data) => {
r.dest = null
let info = fileType(isNode ? data : new Uint8Array(data))
// fileType() returned nothing for this data, so fall back to a text-based check.
if (!info) {
// file-type doesn't support SVG, extra check:
// decode the downloaded bytes as UTF-8 text and let is-svg decide
if (isSvg(Buffer.from(data).toString('utf8'))) {
// build the same {mime, ext} fields that are read from `info` below
info = {
mime: 'image/svg+xml',
ext: 'svg'
}
}
}
if (info) {
let type = info.mime
r.type = type
@ -649,7 +714,7 @@ class FimFic2Epub extends Emitter {
let chapterPos = html.indexOf('<div class="bbcode">')
let chapter = html.substring(chapterPos + 20)
let pos = chapter.indexOf('\t\t</div>\n\t</div>')
let pos = chapter.indexOf('\t\t</div>\n\t</div>\t\t\n\t\t\t\t\t</div>\n')
chapter = chapter.substring(0, pos).trim()

View file

@ -1,6 +1,7 @@
import m from 'mithril'
import { XmlEntities } from 'html-entities'
import twemoji from 'twemoji'
import render from './lib/mithril-node-render'
import fetch from './fetch'
@ -14,6 +15,8 @@ export function cleanMarkup (html) {
}
return new Promise((resolve, reject) => {
html = twemoji.parse(html, {ext: '.svg', folder: 'svg'})
html = html.replace(/(<img class="emoji" draggable="false" alt=".*?" src=".*?")>/g, '$1/>')
// replace HTML entities with decimal entities
html = html.replace(/&nbsp;/g, '&#160;')
html = html.replace(/&emsp;/g, '&#8195;')
@ -58,28 +61,16 @@ export function cleanMarkup (html) {
let cache = new Map()
let completeCount = 0
let matchYoutube = /<div class="embed-container" data-original-src="(.*?)" data-src="(.*?)" data-id="(.*?)" data-origin="(.*?)">(.+?)<\/div><\/div><\/div>/g
for (let ma; (ma = matchYoutube.exec(html));) {
if (ma[4] === 'YouTube') {
let youtubeId = ma[3]
cache.set(youtubeId, null)
}
let matchYouTube = /<p><a class="embed" href="https:\/\/www\.youtube\.com\/watch\?v=(.*?)">.*?<\/a><\/p>/g
for (let ma; (ma = matchYouTube.exec(html));) {
let youtubeId = ma[1]
cache.set(youtubeId, null)
}
let matchSoundCloud = /<div data-controller="oembed" class="oembed" data-url="(.*?)" .+?<\/div>/g
let matchSoundCloud = /<p><a class="embed" href="(https:\/\/soundcloud\.com\/.*?)">.*?<\/a><\/p>/g
html = html.replace(matchSoundCloud, (match, url) => {
return render(m('.soundcloud.leftalign', [
'SoundCloud song ', m('a', {href: entities.decode(url), rel: 'nofollow'}, url.replace('https://soundcloud.com', ''))
]))
})
// Story embed
let matchStoryEmbed = /<div style='[^']*?' class='bbcode__block'><div style="position:relative;" class="story-card-container".*?data-story-id="([^"]*?)"[\s\S]*?<a class="story_link" href="(.*?)" title=".*?">(.*?)<\/a>[\s\S]*?" class="story-card__author">(.*?)<\/a>[\s\S]*?<\/div><\/div>[\s\S]*?<\/div><\/div>/g
html = html.replace(matchStoryEmbed, (match, id, storyLink, storyTitle, author) => {
return render(m('.story', [
'Story: ',
m('a', {href: 'http://fimfiction.net' + entities.decode(storyLink), rel: 'nofollow'}, storyTitle),
' by ' + author
'SoundCloud: ', m('a', {href: entities.decode(url), rel: 'nofollow'}, url.replace('https://soundcloud.com/', '').replace(/[-_]/g, ' ').replace('/', ' - ').replace(/ {2}/g, ' '))
]))
})
@ -100,13 +91,13 @@ export function cleanMarkup (html) {
completeCount++
})
if (completeCount === cache.size || data.length === 0) {
html = html.replace(matchYoutube, replaceYoutube)
html = html.replace(matchYouTube, replaceYouTube)
continueParsing()
}
})
}
function replaceYoutube (match, origSrc, src, id, origin) {
function replaceYouTube (match, id) {
let youtubeId = id
let thumbnail = 'http://img.youtube.com/vi/' + youtubeId + '/hqdefault.jpg'
let youtubeUrl = 'https://youtube.com/watch?v=' + youtubeId

View file

@ -58,6 +58,9 @@ img {
max-width: 100%;
max-height: 100%;
}
/* Emoji images (class "emoji", as produced by twemoji): one text line high */
img.emoji {
height: 1em;
}
hr.old {
padding: 0;