manuskript/libs/odf/odf2moinmoin.py
2015-07-10 12:29:25 +02:00

580 lines
18 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (C) 2006-2008 Søren Roug, European Environment Agency
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# See http://trac.edgewall.org/wiki/WikiFormatting
#
# Contributor(s):
#
import sys, zipfile, xml.dom.minidom
from namespaces import nsdict
from elementtypes import *
# Qualified tag names whose content is dropped from the output; ODF2MoinMoin
# maps each of them to its do_nothing handler.
# NOTE: every literal needs its trailing comma -- adjacent string literals are
# silently concatenated ('draw:a' 'draw:g' would become 'draw:adraw:g').
IGNORED_TAGS = [
    'draw:a',
    'draw:g',
    'draw:line',
    'draw:object-ole',
    'office:annotation',
    'presentation:notes',
    'svg:desc',
] + [nsdict[item[0]] + ":" + item[1] for item in empty_elements]

# Qualified tag names that ODF2MoinMoin renders through its inline_markup
# handler.
INLINE_TAGS = [nsdict[item[0]] + ":" + item[1] for item in inline_elements]
class TextProps:
    """ Holds properties for a text style.

    All flags start out False.  The setters take the raw attribute values
    read from a style:text-properties element; an empty value (attribute
    not present) leaves the corresponding flag untouched.
    """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False
        self.underlined = False
        self.strikethrough = False
        self.superscript = False
        self.subscript = False

    def setItalic(self, value):
        """ Set the italic flag from a font-style value ("italic"/"normal"). """
        if value == "italic":
            self.italic = True
        elif value == "normal":
            self.italic = False

    def setBold(self, value):
        """ Set the bold flag from a font-weight value ("bold"/"normal"). """
        if value == "bold":
            self.bold = True
        elif value == "normal":
            self.bold = False

    def setFixed(self, value):
        """ Store whether the style uses a fixed-pitch font. """
        self.fixed = value

    def setUnderlined(self, value):
        """ Set the underline flag from an underline-style value.

        "none" switches the flag off, any other non-empty value switches it
        on (same on/off handling as setItalic and setBold).
        """
        if value:
            self.underlined = value != "none"

    def setStrikethrough(self, value):
        """ Set the strikethrough flag from a line-through-style value.

        "none" switches the flag off, any other non-empty value switches it
        on.
        """
        if value:
            self.strikethrough = value != "none"

    def setPosition(self, value):
        """ Set superscript/subscript from a text-position value.

        The first space-separated token is either "sub", "super" or a
        percentage.  A positive percentage raises the text (superscript),
        a negative one lowers it (subscript); shifts of at most 10% in
        either direction are ignored.  Unparsable values are ignored.
        """
        if value is None or value == '':
            return
        textpos = value.split(' ')[0]
        percent = textpos.find('%')
        if percent == -1:
            if textpos == "sub":
                self.superscript = False
                self.subscript = True
            elif textpos == "super":
                self.superscript = True
                self.subscript = False
            return
        try:
            # float() rather than int(): fractional percentages are legal.
            shift = float(textpos[:percent])
        except ValueError:
            return
        if shift > 10:
            self.superscript = True
            self.subscript = False
        elif shift < -10:
            self.superscript = False
            self.subscript = True

    def __str__(self):
        return "[italic=%s, bold=%s, fixed=%s]" % (str(self.italic),
                                                   str(self.bold),
                                                   str(self.fixed))
class ParagraphProps:
    """ Plain record of the paragraph-style flags used by the converter.

    Fresh instances describe an ordinary paragraph: no heading level,
    not a title, not code, not indented, not a block quote.
    """

    def __init__(self):
        self.headingLevel = 0
        self.indented = 0
        self.title = False
        self.code = False
        self.blockquote = False

    def setHeading(self, level):
        """ Store the heading level. """
        self.headingLevel = level

    def setTitle(self, value):
        """ Store whether the style is the document title style. """
        self.title = value

    def setCode(self, value):
        """ Store whether paragraphs of this style are code. """
        self.code = value

    def setIndented(self, value):
        """ Store whether paragraphs of this style are indented. """
        self.indented = value

    def __str__(self):
        summary = (str(self.blockquote), self.headingLevel, str(self.code))
        return "[bq=%s, h=%d, code=%s]" % summary
class ListProperties:
    """ Record describing a list style; only tracks ordered vs. unordered. """

    def __init__(self):
        # Lists are treated as unordered until setOrdered says otherwise.
        self.ordered = False

    def setOrdered(self, value):
        """ Store whether the list is an ordered list. """
        self.ordered = value
class ODF2MoinMoin(object):
    """ Converts an ODF document into wiki markup text.

    The constructor reads styles.xml and content.xml from the package at
    *filepath* and collects the font, text, paragraph and list style
    information; toString() then renders the document body.
    """

    def __init__(self, filepath):
        """ Set up the style tables and tag handlers, then load *filepath*. """
        self.footnotes = []
        self.footnoteCounter = 0
        self.textStyles = {"Standard": TextProps()}
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.listStyles = {}
        self.fixedFonts = []
        self.hasTitle = 0
        self.lastsegment = None

        # Tags: qualified tag name -> handler taking the DOM node and
        # returning the markup string for it.
        self.elements = {
            'draw:page': self.textToString,
            'draw:frame': self.textToString,
            'draw:image': self.draw_image,
            'draw:text-box': self.textToString,
            'text:a': self.text_a,
            'text:note': self.text_note,
        }
        for tag in IGNORED_TAGS:
            self.elements[tag] = self.do_nothing
        for tag in INLINE_TAGS:
            self.elements[tag] = self.inline_markup
        # Assigned last so they win over any generic handler set above.
        self.elements['text:line-break'] = self.text_line_break
        self.elements['text:s'] = self.text_s
        self.elements['text:tab'] = self.text_tab

        self.load(filepath)

    @staticmethod
    def _elementChildren(node):
        """ Return the child nodes of *node* that are elements.

        Text and comment nodes have no tagName, so callers that dispatch on
        tagName must not see them.
        """
        return [child for child in node.childNodes
                if child.nodeType == xml.dom.Node.ELEMENT_NODE]

    def processFontDeclarations(self, fontDecl):
        """ Extracts necessary font information from a font-declaration
            element: records the names of all fixed-pitch font faces.
        """
        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
            if fontFace.getAttribute("style:font-pitch") == "fixed":
                self.fixedFonts.append(fontFace.getAttribute("style:name"))

    def extractTextProperties(self, style, parent=None):
        """ Extracts text properties from a style element.

        When *parent* names a text style that has already been processed,
        the result starts from a copy of that style's properties and the
        attributes present on *style* override them.  Returns a new
        TextProps; the parent's stored object is never modified.
        """
        textProps = TextProps()

        if parent:
            parentProp = self.textStyles.get(parent, None)
            if parentProp:
                # Copy the values instead of aliasing the parent object,
                # otherwise the setters below would alter the parent style.
                vars(textProps).update(vars(parentProp))

        textPropEl = style.getElementsByTagName("style:text-properties")
        if not textPropEl:
            return textProps
        textPropEl = textPropEl[0]

        textProps.setItalic(textPropEl.getAttribute("fo:font-style"))
        textProps.setBold(textPropEl.getAttribute("fo:font-weight"))
        textProps.setUnderlined(
            textPropEl.getAttribute("style:text-underline-style"))
        textProps.setStrikethrough(
            textPropEl.getAttribute("style:text-line-through-style"))
        textProps.setPosition(textPropEl.getAttribute("style:text-position"))

        fontName = textPropEl.getAttribute("style:font-name")
        if fontName:
            # An explicit font overrides whatever was inherited.
            textProps.setFixed(fontName in self.fixedFonts)

        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """ Extracts paragraph properties from a style element.

        Detects heading styles ("Heading_20_<n>"), the "Title" style, a
        positive left margin (indentation) and fixed-pitch text (code).
        *parent* is forwarded to the text-property lookup so a fixed-pitch
        font inherited from the parent style also marks the paragraph as
        code.
        """
        paraProps = ParagraphProps()

        name = style.getAttribute("style:name")

        if name.startswith("Heading_20_"):
            try:
                paraProps.setHeading(int(name[11:]))
            except ValueError:
                # Name suffix is not a number: leave the heading level at 0.
                pass

        if name == "Title":
            paraProps.setTitle(True)

        paraPropEl = style.getElementsByTagName("style:paragraph-properties")
        if paraPropEl:
            leftMargin = paraPropEl[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # The last two characters are taken to be the unit.
                    if float(leftMargin[:-2]) > 0.01:
                        paraProps.setIndented(True)
                except ValueError:
                    # Unparsable margin: treat the paragraph as not indented.
                    pass

        textProps = self.extractTextProperties(style, parent)
        if textProps.fixed:
            paraProps.setCode(True)

        return paraProps

    def processStyles(self, styleElements):
        """ Runs through "style" elements extracting necessary information.

        Text styles are stored in self.textStyles; paragraph styles are
        stored in both self.paragraphStyles and self.textStyles.  The
        "Standard" style keeps its defaults.
        """
        for style in styleElements:
            name = style.getAttribute("style:name")
            if name == "Standard":
                continue

            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")

            if family == "text":
                self.textStyles[name] = self.extractTextProperties(style, parent)
            elif family == "paragraph":
                self.paragraphStyles[name] = \
                    self.extractParagraphProperties(style, parent)
                self.textStyles[name] = self.extractTextProperties(style, parent)

    def processListStyles(self, listStyleElements):
        """ Records, per list style name, whether the list is ordered.

        A style counts as ordered when it has at least one
        text:list-level-style-number child element.
        """
        for style in listStyleElements:
            name = style.getAttribute("style:name")

            prop = ListProperties()
            numbered = [el for el in self._elementChildren(style)
                        if el.tagName == "text:list-level-style-number"]
            if numbered:
                prop.setOrdered(True)

            self.listStyles[name] = prop

    def load(self, filepath):
        """ Loads an ODT file.

        Processes the fonts and styles of styles.xml first, then those of
        content.xml, and keeps the parsed content.xml in self.content.
        """
        # Context manager so the archive is closed even if parsing fails.
        with zipfile.ZipFile(filepath) as archive:
            styles_doc = xml.dom.minidom.parseString(archive.read("styles.xml"))
            self.content = xml.dom.minidom.parseString(
                archive.read("content.xml"))

        for doc in (styles_doc, self.content):
            fontfacedecls = doc.getElementsByTagName("office:font-face-decls")
            if fontfacedecls:
                self.processFontDeclarations(fontfacedecls[0])
            self.processStyles(doc.getElementsByTagName("style:style"))
            self.processListStyles(doc.getElementsByTagName("text:list-style"))

    def compressCodeBlocks(self, text):
        """ Hook for removing extra blank lines from code blocks.

        Currently a no-op: *text* is returned unchanged.
        """
        return text

    #-----------------------------------
    def do_nothing(self, node):
        """ Handler for ignored tags: contributes no output. """
        return ''

    def draw_image(self, node):
        """ Render an image reference.

        Links starting with './' (sub-objects, which aren't supported) are
        emitted as the bare link; otherwise a leading 'Pictures/' is
        stripped and the link is wrapped in an Image macro.
        """
        link = node.getAttribute("xlink:href")
        if link and link.startswith('./'):
            return "%s\n" % link
        if link and link.startswith('Pictures/'):
            link = link[9:]
        return "[[Image(%s)]]\n" % link

    def text_a(self, node):
        """ Render a hyperlink; the label is omitted when it equals the URL. """
        text = self.textToString(node)
        link = node.getAttribute("xlink:href")
        if link.strip() == text.strip():
            return "[%s] " % link.strip()
        return "[%s %s] " % (link.strip(), text.strip())

    def text_line_break(self, node):
        """ Render an explicit line break. """
        return "[[BR]]"

    def text_note(self, node):
        """ Render a note as "^<citation>^" and queue its body.

        The (citation, body text) pair is appended to self.footnotes, which
        toString() emits at the end of the document.
        """
        cite = (node.getElementsByTagName("text:note-citation")[0]
                .childNodes[0].nodeValue)
        body = (node.getElementsByTagName("text:note-body")[0]
                .childNodes[0])
        self.footnotes.append((cite, self.textToString(body)))
        return "^%s^" % cite

    def text_s(self, node):
        """ Render a run of spaces; text:c gives the count (default one). """
        try:
            num = int(node.getAttribute("text:c"))
        except ValueError:
            # Attribute missing or not a number.
            return " "
        return " " * num

    def text_tab(self, node):
        """ Render a tab stop. """
        return " "

    def inline_markup(self, node):
        """ Render *node*'s content wrapped in the markup of its text style.

        Whitespace-only content yields ''.  Fixed-pitch styles are wrapped
        in backquotes and get no other markup.
        """
        text = self.textToString(node)

        if not text.strip():
            return ''  # don't apply styles to white space

        styleName = node.getAttribute("text:style-name")
        style = self.textStyles.get(styleName, TextProps())

        if style.fixed:
            return "`" + text + "`"

        mark = []
        if style.italic:
            mark.append("''")
        if style.bold:
            mark.append("'''")
        if style.underlined:
            mark.append("__")
        if style.strikethrough:
            mark.append("~~")
        if style.superscript:
            mark.append("^")
        if style.subscript:
            mark.append(",,")

        # Close the marks in the reverse of the order they were opened.
        opening = ''.join(mark)
        closing = ''.join(reversed(mark))
        return "%s%s%s" % (opening, text, closing)

    #-----------------------------------
    def listToString(self, listElement, indent = 0):
        """ Render a text:list element, one output line per item.

        Nested lists are rendered recursively with the indent increased by
        three.  Non-element child nodes are skipped.
        """
        self.lastsegment = listElement.tagName
        buffer = []

        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName, ListProperties())

        for item in self._elementChildren(listElement):
            buffer.append(" " * indent)
            if props.ordered:
                buffer.append(" 1. ")
            else:
                buffer.append(" * ")

            for subitem in self._elementChildren(item):
                if subitem.tagName not in ("text:p", "text:h", "text:list"):
                    continue
                if subitem.tagName == "text:list":
                    buffer.append("\n")
                    buffer.append(self.listToString(subitem, indent + 3))
                else:
                    buffer.append(self.paragraphToString(subitem, indent + 3))
                self.lastsegment = subitem.tagName

            self.lastsegment = item.tagName
            buffer.append("\n")

        return ''.join(buffer)

    def tableToString(self, tableElement):
        """ MoinMoin uses || to delimit table cells.

        Header-row groups are rendered recursively; every table row starts
        on a new line.  Non-element child nodes are skipped.
        """
        self.lastsegment = tableElement.tagName
        buffer = []

        for item in self._elementChildren(tableElement):
            self.lastsegment = item.tagName
            if item.tagName == "table:table-header-rows":
                buffer.append(self.tableToString(item))
            if item.tagName == "table:table-row":
                buffer.append("\n||")
                for cell in self._elementChildren(item):
                    buffer.append(self.inline_markup(cell))
                    buffer.append("||")
                    self.lastsegment = cell.tagName

        return ''.join(buffer)

    def toString(self):
        """ Converts the document to a string.

        The per-conversion state (footnotes, title flag, last segment) is
        reset first, so repeated calls return the same result.
        """
        self.footnotes = []
        self.footnoteCounter = 0
        self.hasTitle = 0
        self.lastsegment = None

        bodyElement = self.content.getElementsByTagName("office:body")[0]
        # First element child of office:body holds the document content.
        container = self._elementChildren(bodyElement)[0]

        handled = ("draw:page", "text:p", "text:h", "text:section",
                   "text:list", "table:table")
        buffer = []

        for paragraph in self._elementChildren(container):
            if paragraph.tagName not in handled:
                continue
            if paragraph.tagName == "text:list":
                text = self.listToString(paragraph)
            elif paragraph.tagName == "text:section":
                text = self.textToString(paragraph)
            elif paragraph.tagName == "table:table":
                text = self.tableToString(paragraph)
            else:
                text = self.paragraphToString(paragraph)
            if text:
                buffer.append(text)

        if self.footnotes:
            buffer.append("----")
            for cite, noteText in self.footnotes:
                buffer.append("%s: %s" % (cite, noteText))

        buffer.append("")  # makes the joined text end with a newline
        return self.compressCodeBlocks('\n'.join(buffer))

    def textToString(self, element):
        """ Render the children of *element*.

        Text nodes are copied verbatim; elements are dispatched to the
        paragraph/list renderers or to the handler registered in
        self.elements.  Tags without a handler appear as " {tag} ".
        """
        buffer = []

        for node in element.childNodes:
            if node.nodeType == xml.dom.Node.TEXT_NODE:
                buffer.append(node.nodeValue)
            elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
                tag = node.tagName

                if tag in ("draw:text-box", "draw:frame"):
                    buffer.append(self.textToString(node))
                elif tag in ("text:p", "text:h"):
                    text = self.paragraphToString(node)
                    if text:
                        buffer.append(text)
                elif tag == "text:list":
                    buffer.append(self.listToString(node))
                else:
                    method = self.elements.get(tag)
                    if method:
                        buffer.append(method(node))
                    else:
                        buffer.append(" {" + tag + "} ")

        return ''.join(buffer)

    def paragraphToString(self, paragraph, indent = 0):
        """ Render a text:p / text:h element according to its style.

        Produces a title line, a heading (level taken from
        text:outline-level, shifted by one once a title has been seen), a
        code block, or a plain / indented paragraph.
        """
        style_name = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(style_name, ParagraphProps())
        text = self.inline_markup(paragraph)

        if not paraProps.code:
            text = text.strip()

        # Separate consecutive plain paragraphs with an extra newline.
        if paragraph.tagName == "text:p" and self.lastsegment == "text:p":
            text = "\n" + text

        self.lastsegment = paragraph.tagName

        if paraProps.title:
            self.hasTitle = 1
            return "= " + text + " =\n"

        outlinelevel = paragraph.getAttribute("text:outline-level")

        if outlinelevel:
            level = int(outlinelevel)
            if self.hasTitle:
                level += 1
            if level >= 1:
                return "=" * level + " " + text + " " + "=" * level + "\n"
        elif paraProps.code:
            return "{{{\n" + text + "\n}}}\n"

        if paraProps.indented:
            return self.wrapParagraph(text, indent = indent, blockquote = True)
        return self.wrapParagraph(text, indent = indent)

    def wrapParagraph(self, text, indent = 0, blockquote=False):
        """ Return *text*, prefixed with one space when *blockquote* is set.

        No line wrapping is performed; *indent* is accepted for
        compatibility with callers but does not affect the result.
        """
        if blockquote:
            return " " + text
        return text