#!/usr/bin/env python2
r"""
db2vim [options] file.xml

SHORT OPTIONS

-d  Prints some debugging information on stderr.

-s  If given, db2vim operates in a 'strict' conversion mode, i.e., any
    element which does not have a handler defined for it will be completely
    ignored, including all its children. Otherwise, db2vim will recurse
    into an unknown tag and process any of its children it recognizes.
    Since db2vim always recognizes text nodes, not using this option has
    the effect that all text will be printed out, even if somewhat
    incorrectly.

LONG OPTIONS

--help
    Prints this usage message and exits.

--prefix=PREFIX
    This is a string like "ls_" which will be prepended to the section
    numbers. Defaults to 'ls_' if unsupplied.
"""
# NOTE: the module docstring above doubles as the --help text (it is what
# usage() prints), so keep it user-facing.

import xml.dom.minidom
import getopt
import string
import re
import sys

# Okay. so I import *. Shoot me.
from textutils import *
from domutils import *

# define a bunch of constants for formatting.
TEXT_WIDTH = 80     # default width passed to the element handlers
BLOCK_QUOTE = 4     # indentation applied to <blockquote> content
COL_SPACE = 2       # padding added to every table cell width

# a bunch of globals used in creating the Table of contents.
#
# TOC_HASH['section 1.1 label'] = 'ls_1_1'
#
# LEVEL_HASH['section 1.1 label'] = 1
#       (top level article has level 0)
#
# TITLE_HASH['section 1.1 label'] = 'Title of section 1.1'
#
# FILENAME = the name of the file being processed with the last extension
#            changed to .txt
#
# TOC_PREFIX = 'ls_' (the prefix used to create the section labels).
TOC_HASH = {}
LEVEL_HASH = {}
TITLE_HASH = {}
FILENAME = ''
TOC_PREFIX = ''

# ANCHOR_HASH['some id'] = TOC_PREFIX + 'a_' + <letters>   (sections, anchors
#       and notes that carry an id attribute)
#
# URL_HASH['http://...'] = TOC_PREFIX + 'u_' + <number>    (one per distinct
#       ulink url)
ANCHOR_HASH = {}
URL_HASH = {}

# STDERR for printing debugging info.
DEBUG = 0
STDERR = sys.stderr
STRICT = 0
# Running count of the anchor labels handed out so far.  It lives in a
# one-element dict so the functions below can bump it in place without a
# ``global`` statement.
NUM_ANCHORS = {0: 1}

# Digits used by encodeTo52(): 0-25 map to 'a'-'z', 26-51 map to 'A'-'Z'.
# A unicode literal keeps the result type the same as the unichr() calls
# this table replaces.
_BASE52_DIGITS = u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'


###############################################################################
# Miscellaneous utility functions
###############################################################################
def encodeTo52(num):
    """
    Encode the non-negative integer `num` using letters only.

    Values below 52 become a single letter (a-z, then A-Z); larger values
    are encoded recursively as base-52 digits, most significant first.

    Raises ValueError for negative input.
    """
    if num < 0:
        raise ValueError("encodeTo52 needs a non-negative number: %r" % num)
    if num < 52:
        return _BASE52_DIGITS[num]
    return encodeTo52(num // 52) + encodeTo52(num % 52)


def makeTocHash(rootElement, width, prefix='', level=0):
    """
    Recursively number the <section> children of `rootElement`.

    For every section this fills TOC_HASH (id -> label such as 'ls_1_2'),
    LEVEL_HASH (id -> nesting level), TITLE_HASH (id -> formatted title)
    and ANCHOR_HASH (id -> short anchor label).  A section without an
    ``id`` attribute gets its generated label assigned as its id.

    `width` is the text width used to format the title; it shrinks by 5
    for every nesting level.  `prefix` is the numbering of the enclosing
    sections ('1_' for children of section 1).
    """
    lastLabelUsed = 0
    for section in rootElement.getChildrenByTagName('section'):
        title = section.getChildrenByTagName('title')[0]
        titleText = handleElement(title, width)
        lastLabelUsed += 1
        thisLabel = TOC_PREFIX + prefix + str(lastLabelUsed)

        sectionid = section.getAttribute('id')
        if not sectionid:
            section.setAttribute('id', thisLabel)
            sectionid = thisLabel

        NUM_ANCHORS[0] += 1
        # The +52 offset guarantees at least two letters in the label.
        ANCHOR_HASH[sectionid] = TOC_PREFIX + 'a_' + encodeTo52(
            NUM_ANCHORS[0] + 52)

        TOC_HASH[sectionid] = thisLabel
        LEVEL_HASH[sectionid] = level
        TITLE_HASH[sectionid] = titleText

        if section.getChildrenByTagName('section'):
            makeTocHash(section, width - 5,
                        prefix=prefix + str(lastLabelUsed) + '_',
                        level=level + 1)


def makeAnchorHash(rootElement):
    """
    Assign a short anchor label to every <anchor> and <note> below
    `rootElement` that carries an ``id`` attribute.

    Elements without an id are skipped.  An id which is already known
    (as an anchor or as a section) triggers a warning on stderr and is
    then overwritten with the new label.
    """
    anchors = rootElement.getElementsByTagName('anchor') + \
        rootElement.getElementsByTagName('note')
    for anchor in anchors:
        anchorid = anchor.getAttribute('id')
        if not anchorid:
            continue

        NUM_ANCHORS[0] += 1
        if anchorid in ANCHOR_HASH or anchorid in TOC_HASH:
            sys.stderr.write(
                "Warning: anchor [%s] multiply defined\n" % anchorid)

        ANCHOR_HASH[anchorid] = TOC_PREFIX + 'a_' + encodeTo52(
            NUM_ANCHORS[0] + 52)


def makeURLHash(rootElement):
    """
    Give every distinct, non-empty ``url`` attribute of the <ulink>
    elements below `rootElement` a label of the form
    TOC_PREFIX + 'u_' + N, numbered in document order, and record it in
    URL_HASH.
    """
    numURLs = 0
    for ulink in rootElement.getElementsByTagName('ulink'):
        url = ulink.getAttribute('url')
        if not url or url in URL_HASH:
            continue
        numURLs += 1
        URL_HASH[url] = TOC_PREFIX + 'u_' + str(numURLs)


def makeTOC(node, width, maxlevel=1):
    """
    Return the table of contents for the <section> children of `node`.

    Each section whose level (from LEVEL_HASH) is at most `maxlevel`
    contributes one '|label| title' line; sections shallower than
    `maxlevel` additionally contribute the TOC of their sub-sections,
    joined on through VertCatString.  Trailing whitespace of the result
    is collapsed to a single newline.
    """
    retText = ""
    for section in node.getChildrenByTagName('section'):
        sectionid = section.getAttribute('id')
        thisLabel = TOC_HASH.get(sectionid, '')
        titleText = TITLE_HASH.get(sectionid, '')
        # Sections missing from the hash get a level deep enough to be
        # left out of the TOC.
        level = LEVEL_HASH.get(sectionid, 10)

        if level <= maxlevel:
            retText += '|' + thisLabel + '| ' + titleText + '\n'

        if level < maxlevel and section.getChildrenByTagName('section'):
            # Propagate maxlevel; otherwise a caller asking for a deeper
            # TOC silently falls back to the default in the recursion.
            childText = makeTOC(section, width - 5, maxlevel=maxlevel)
            retText += VertCatString(" ", 4, childText) + '\n'

    retText = re.sub(r'\s+$', r'\n', retText)
    return retText


###############################################################################
# Generalized function for handling dom elements.
###############################################################################
def IsInlineTag(self):
    """
    Return 1 if this node is a text node or an element listed in
    inlineTags, 0 otherwise.

    Only element nodes have a ``tagName``; comments, processing
    instructions and the like are reported as not inline instead of
    raising AttributeError.
    """
    if self.nodeType == self.TEXT_NODE:
        return 1
    if self.nodeType == self.ELEMENT_NODE and inlineTags.get(self.tagName, 0):
        return 1
    return 0


def getChildrenByTagName(self, name):
    """
    extension to the xml.dom.minidom.Element class.  returns the list of
    direct child elements of this Element whose node name is `name`
    (unlike getElementsByTagName, which searches all descendants).
    """
    return [child for child in self.childNodes
            if child.nodeType == child.ELEMENT_NODE
            and child.nodeName == name]


xml.dom.minidom.Element.getChildrenByTagName = getChildrenByTagName


def handleElement(rootElement, width=TEXT_WIDTH):
    """
    Generalized function to handle an Element node in a DOM tree.

    Walks the children of `rootElement` and concatenates their text:

    * a non-inline element with an entry in handlerMaps is passed to its
      handler;
    * a <?vimhelp ...?> processing instruction is passed to the handler
      named by its data, if there is one;
    * a run of consecutive inline nodes (text and inline elements) is
      collected into one string and passed through IndentParagraphs;
    * anything else is recursed into, unless STRICT is set, in which
      case it is dropped.

    `width` is forwarded to every handler and to IndentParagraphs.
    """
    retText = ""
    child = rootElement.firstChild
    while child is not None:

        printerr('node type = %d' % child.nodeType)
        if child.nodeType == child.ELEMENT_NODE:
            printerr('processing [%s]' % child.tagName)

        isinline = IsInlineTag(child)

        # if the child is an Element and if a handler exists, then call it.
        if not isinline \
                and child.nodeType == child.ELEMENT_NODE \
                and child.tagName in handlerMaps:
            # offset the child text by the current indentation value
            printerr('making recursive call to known child.')
            retText += handlerMaps[child.tagName](child, width)
            child = child.nextSibling

        elif not isinline \
                and child.nodeType == child.PROCESSING_INSTRUCTION_NODE \
                and child.target == 'vimhelp':

            if child.data in handlerMaps:
                retText += handlerMaps[child.data](child, width)

            child = child.nextSibling

        # if its a text node or an inline element node, collect consecutive
        # text nodes into a single paragraph and indent it.
        elif isinline:

            text = ""
            while child is not None and IsInlineTag(child):
                if child.nodeType == child.TEXT_NODE:
                    text += child.data
                elif child.nodeType == child.ELEMENT_NODE:
                    if child.tagName in handlerMaps:
                        text += handlerMaps[child.tagName](child, width)
                    else:
                        text += GetText(child.childNodes)
                child = child.nextSibling

            retText += IndentParagraphs(text, width)

        # If we cannot understand _anything_ about the element, then just
        # handle its children hoping we have something to gather from
        # there.
        elif not STRICT:
            printerr('making recursive call for unkown child')
            retText += handleElement(child, width)
            child = child.nextSibling

        else:
            child = child.nextSibling

    return retText


###############################################################################
# Functions for handling various xml tags
###############################################################################
def handleArticleInfo(articleinfo, width):
    """
    Format the <articleinfo> header: title, file tag, authors, optional
    abstract and the top-level table of contents.

    As a side effect this builds the TOC, anchor and URL hashes for the
    whole article (the parent of `articleinfo`), which the other handlers
    rely on.  Exits with status 1 if the article has no <title>.
    """
    makeTocHash(articleinfo.parentNode, width)
    makeAnchorHash(articleinfo.parentNode)
    makeURLHash(articleinfo.parentNode)

    # getChildrenByTagName returns a (possibly empty) list, never None.
    title = articleinfo.getChildrenByTagName('title')
    if not title:
        # stdout carries the generated document, so complain on stderr.
        STDERR.write("Article should have a title!\n")
        sys.exit(1)

    name = GetText(title[0].childNodes)

    authorText = ''
    for author in articleinfo.getChildrenByTagName('author'):
        firstname = ''
        surname = ''
        # Reset per author so a missing <email> neither raises nor reuses
        # the previous author's address.
        email = ''
        if author.getElementsByTagName('firstname'):
            firstname = GetTextFromElementNode(author, 'firstname')[0]
        if author.getChildrenByTagName('surname'):
            surname = GetTextFromElementNode(author, 'surname')[0]
        if author.getElementsByTagName('email'):
            email = GetTextFromElementNode(author, 'email')[0]

        authorText += firstname + ' ' + surname
        if email:
            authorText += ' <' + email + '>'
        authorText += '\n'

    abstractText = ''
    abstract = articleinfo.getChildrenByTagName('abstract')
    if abstract:
        abstractText = '\n\n' + CenterText('Abstract\n========', width)
        abstractText += handleElement(abstract[0], width) + '\n'

    retText = CenterText(name + '\n*' + FILENAME + '*\n' + authorText, width)
    retText += abstractText

    toc = makeTOC(articleinfo.parentNode, width)
    return retText + '\n' + RightJustify('*' + FILENAME + '-toc*', width) + \
        '\n' + toc


def handleOption(option, width):
    """
    Format an <option> element: one right-justified '*name*' tag line per
    <name>, followed by the names set beside the formatted <desc> text.
    """
    retText = ""
    names = GetTextFromElementNode(option, "name")

    for name in names:
        retText += ("*" + name + "*").rjust(width) + "\n"

    nameTexts = ""
    maxNameLen = -1
    for name in names:
        maxNameLen = max(maxNameLen, len(name + " "))
        nameTexts += name + " \n"

    desc = option.getChildrenByTagName("desc")[0]
    # The description only gets the width left over beside the names.
    descText = handleElement(desc, width=width - maxNameLen)

    retText += VertCatString(nameTexts + " ", None, descText)
    return retText + "\n"


def handleOptionDefault(default, width):
    """
    Format the default of an option as '<type>\\t(<extra>)', joining
    multiple <type> / <extra> texts with newlines.
    """
    typeText = "\n".join(GetTextFromElementNode(default, "type"))
    extraText = "\n".join(GetTextFromElementNode(default, "extra"))
    return typeText + "\t(" + extraText + ")"


def handleTableRoot(root, width):
    """
    Format a <table> / <informaltable>.

    The rows of the first <tgroup>'s <thead> and <tbody> are laid out
    with common column widths; every header line gets a trailing '~'.
    Returns '' when the table has no <tgroup> or no rows at all.
    """
    tgroups = root.getChildrenByTagName('tgroup')
    if not tgroups:
        return ''
    tgroup = tgroups[0]

    rows = []
    numHeadRows = 0
    theads = tgroup.getChildrenByTagName('thead')
    if theads:
        rows = theads[0].getChildrenByTagName('row')
        numHeadRows = len(rows)

    tbodies = tgroup.getChildrenByTagName('tbody')
    if tbodies:
        rows += tbodies[0].getChildrenByTagName('row')

    if not rows:
        return ''

    widths, text = calculateColumnWidthsDoublePass(rows, width)

    headText = text[0:numHeadRows]
    bodyText = text[numHeadRows:]

    headTable = FormatTable(headText, ROW_SPACE=1, COL_SPACE=COL_SPACE,
                            justify=0, widths=widths)
    if headTable:
        # Append '~' to the end of every line of the header.
        headTable = re.sub(r'\n|$', r'\g<0>~', headTable)
    bodyTable = FormatTable(bodyText, ROW_SPACE=1, COL_SPACE=COL_SPACE,
                            justify=0, widths=widths)

    return headTable + '\n' + re.sub(r'\n+$', '', bodyTable) + '\n\n'


def calculateColumnWidths(rows, alloc_widths):
    """
    Format every <entry> of `rows` and measure the resulting columns.

    `alloc_widths` lists the width each column may use; a one-element
    list is taken as the width for every column.  The caller's list is
    not modified.

    Returns (widths, text): `widths` maps the column index to the widest
    cell in that column (including COL_SPACE padding) and `text` holds,
    per row, the list of formatted cell strings.
    """
    widths = {}
    text = []
    # Work on a copy: the single-width expansion below must not leak into
    # the list owned by the caller.
    alloc_widths = list(alloc_widths)
    for row in rows:
        cols = row.getChildrenByTagName("entry")
        if len(alloc_widths) == 1:
            alloc_widths = alloc_widths * len(cols)

        colwidths = []
        rowtext = []
        for col, width in zip(cols, alloc_widths):
            coltext = handleElement(col, width)

            rowtext.append(coltext)
            # This is the 'width' of the current cell including the
            # whitespace padding.
            colwidths.append(max(map(len, coltext.split("\n"))) + COL_SPACE)

        text.append(rowtext)

        # update the widths of the columns by finding the maximum
        # width of all cells in this column.
        for i, colwidth in enumerate(colwidths):
            widths[i] = max(colwidth, widths.get(i, -1))

    return widths, text


def calculateColumnWidthsDoublePass(rows, width):
    """
    Lay out `rows` so that the table fits into `width` where possible.

    A first pass lets every cell use the full width.  If the resulting
    columns add up to more than `width`, the columns wider than an equal
    share are restricted to an equal split of whatever the narrow
    columns leave over, and the cells are formatted a second time.

    Returns the same (widths, text) pair as calculateColumnWidths.
    """
    maxwidths, text = calculateColumnWidths(rows, [width])
    if sum(maxwidths.values()) <= width:
        return maxwidths, text

    # now find out how many columns exceed the maximum permitted width.
    # nlarge: number of columns which are too wide.
    # remainingWidth: width which these large columns can share.
    nlarge = 0
    remainingWidth = width
    equalShare = width // len(maxwidths)
    for colwidth in maxwidths.values():
        if colwidth > equalShare:
            nlarge += 1
        else:
            remainingWidth -= colwidth

    # newmaxwidth: width which each of the large columns is allowed.
    newmaxwidth = remainingWidth // max(nlarge, 1)

    # Walk the columns by index so the allocation order does not depend
    # on dictionary ordering.
    newcolwidths = [min(maxwidths[col], newmaxwidth)
                    for col in sorted(maxwidths)]

    # make another run and this time ask each cell to restrict itself to
    # newmaxwidth as calculated above.
    newmaxwidths, newtext = calculateColumnWidths(rows, newcolwidths)

    return newmaxwidths, newtext


def handleCode(code, width):
    """
    Wrap the raw text of a code listing in '&codebegin;' / '&codeend;'
    markers; the script's final pass turns these into the output markup.
    `width` is unused.
    """
    retText = GetText(code.childNodes)
    return " &codebegin;\n" + VertCatString(" ", 4, retText) + "&codeend;"


def handleList(list, width, marker=0):
    """
    Format a list element, one decorated item per entry.

    <simplelist> uses undecorated <member> children, <orderedlist> uses
    <listitem> children numbered '1. ', '2. ', ...; any other tag uses
    <member> children prefixed with '- '.

    The parameter name `list` shadows the builtin and `marker` is unused;
    both are kept so existing callers keep working.
    """
    if list.tagName == 'simplelist':
        child = 'member'
        decoration = ''
    elif list.tagName == 'orderedlist':
        child = 'listitem'
    else:
        child = 'member'
        decoration = '- '

    retText = ""
    items = list.getChildrenByTagName(child)

    for number, item in enumerate(items, 1):
        if list.tagName == 'orderedlist':
            decoration = str(number) + '. '

        itemText = handleElement(item, width - len(decoration))
        itemText = VertCatString(decoration, None, itemText)

        retText += '\n' + re.sub(r'\s+$', '', itemText) + "\n"

    return retText


def handleNote(note, width):
    """
    Format a <note> as text set beside a 'NOTE: ' label.

    An optional <title> is removed from the DOM and printed underlined
    above the text.  If the note has an id, its '*id* *anchor*' tags are
    put right-justified above the note.
    """
    title = None
    name = None
    titles = note.getChildrenByTagName('title')
    if titles:
        title = titles[0]
        name = GetText(title.childNodes)
        # Take the title out so handleElement below does not print it again.
        note.removeChild(title)

    noteid = ''
    idAttr = note.getAttribute('id')
    if idAttr:
        noteTagText = '*' + idAttr + '* '
        noteTagText += '*' + ANCHOR_HASH[idAttr] + '*'
        noteTagText = IndentParagraphs(noteTagText, width // 2)
        noteid = RightJustify(noteTagText, width) + '\n'

    noteText = handleElement(note, width - len("NOTE: "))
    if title is not None:
        noteText = name + '\n' + ('-' * len(name)) + '\n' + noteText

    noteText = noteid + VertCatString("NOTE: ", None, noteText)

    return noteText + "\n"


def handleParagraph(paragraph, width):
    """
    Format a <para>: its content without leading or trailing newlines,
    followed by one blank line.
    """
    partext = handleElement(paragraph, width).strip('\n')
    return partext + "\n\n"


def handleFormalParagraph(formalparagraph, width):
    """
    Format a <formalpara>: like a paragraph, but an optional <title> is
    removed from the DOM and printed underlined above the text.
    """
    title = None
    name = None
    titles = formalparagraph.getChildrenByTagName('title')
    if titles:
        title = titles[0]
        name = GetText(title.childNodes)
        formalparagraph.removeChild(title)

    partext = handleElement(formalparagraph, width).strip('\n')
    if title is not None:
        partext = name + '\n' + ('-' * len(name)) + '\n' + partext
    return partext + "\n\n"


def handleBlockQuote(block, width):
    """
    Format a <blockquote>: its content is formatted BLOCK_QUOTE columns
    narrower and set beside BLOCK_QUOTE spaces.
    """
    text = handleElement(block, width - BLOCK_QUOTE)
    text = VertCatString(" " * BLOCK_QUOTE, BLOCK_QUOTE, text)

    return text + "\n"


def handleLink(link, width):
    """
    Format a <link> as its text followed by ' [|anchor|]', where the
    anchor is looked up from the ``linkend`` attribute.  An unknown
    linkend yields an empty anchor and a warning on stderr.
    """
    linkend = link.getAttribute('linkend')
    if linkend not in ANCHOR_HASH:
        STDERR.write(
            "Warning: Link ID [%s] not found in TOC\n" % linkend)
    text = handleElement(link, width)
    anchorpt = ANCHOR_HASH.get(linkend)
    if not anchorpt:
        anchorpt = ''

    return text + ' [|' + anchorpt + '|]'


def handleAnchor(anchor, width):
    """
    Format an <anchor> as a right-justified '*id* *anchor*' tag line.

    An anchor without an id gets no entry in ANCHOR_HASH (makeAnchorHash
    skips it), so it produces no output instead of raising KeyError.
    """
    anchorid = anchor.getAttribute('id')
    if not anchorid:
        return ''

    anchorText = '*' + anchorid + '* '
    anchorText += '*' + ANCHOR_HASH[anchorid] + '*'
    return RightJustify(anchorText, width) + "\n"


def handleSection(section, width):
    """
    Format a <section>: a delimiter line, the title with its help tags
    beside it, the TOC of its sub-sections and finally the body.

    Level 0 sections are delimited by '=' and level 1 sections by '-';
    deeper sections get an empty delimiter line.  The <title> child is
    removed from the DOM while the section is processed.
    """
    title = section.getChildrenByTagName('title')[0]
    name = handleElement(title, width)

    sectionid = section.getAttribute('id')
    tagsformatted = ''
    if sectionid in TOC_HASH:
        tagsformatted = '*%s* ' % TOC_HASH[sectionid]

    if sectionid in ANCHOR_HASH:
        tagsformatted += '*%s* ' % ANCHOR_HASH[sectionid]

    # Only list the id itself when it differs from the generated label.
    if sectionid and sectionid in TOC_HASH and \
            sectionid != TOC_HASH[sectionid]:
        tagsformatted += '*%s*' % sectionid

    # try to indent to a width of 30
    tagsformatted = RightJustify(IndentParagraphs(tagsformatted, 30), 0)
    tagswidth = TextWidth(tagsformatted)

    # width(name) + nspaces + width(tags) = TEXT_WIDTH
    if len(tagsformatted) > 2:
        header = VertCatString(name, TEXT_WIDTH - tagswidth, tagsformatted)
    else:
        header = name

    # The title has been dealt with; keep it out of the body text.
    section.removeChild(title)
    text = handleElement(section, width)

    thislevel = LEVEL_HASH.get(sectionid, -1)
    if thislevel == 0:
        delim = '='
        newlines = '\n\n'
    elif thislevel == 1:
        delim = '-'
        newlines = '\n'
    else:
        delim = ''
        newlines = '\n'

    thisTOC = ''
    if thislevel <= 1:
        thisTOC = makeTOC(section, width, maxlevel=1)

    return "\n" + (delim * TEXT_WIDTH) + \
        "\n" + header + newlines + thisTOC + newlines + re.sub(
            r'\n+$', '', text) + "\n"


def handleUlink(ulink, width):
    """
    Format a <ulink> as its text followed by ' |label|', where the label
    comes from URL_HASH.  An empty ``url`` attribute yields just the text
    and a warning on stderr.
    """
    url = ulink.getAttribute('url')
    # Honour the caller's width, the same way handleLink does.
    text = handleElement(ulink, width)
    # URL_HASH is created at the very beginning
    if url:
        return text + ' |%s|' % URL_HASH[url]
    else:
        STDERR.write(
            "Warning: url attribute empty for [%s]\n" % text)
        return text


def handleIndexTerm(indexterm, width):
    """Index terms produce no output."""
    return ''


def handleEmphasis(emphasis, width):
    """Format <emphasis> as its text wrapped in underscores."""
    return '_' + GetText(emphasis.childNodes) + '_'


###############################################################################
# A dictionary for mapping xml tags to functions.
###############################################################################
# Every handler is called as handler(node, width) and returns the formatted
# text for that node.
handlerMaps = {
    'articleinfo': handleArticleInfo,
    'table': handleTableRoot,
    'informaltable': handleTableRoot,
    'code': handleCode,
    'programlisting': handleCode,
    'list': handleList,
    'simplelist': handleList,
    'orderedlist': handleList,
    'para': handleParagraph,
    'formalpara': handleFormalParagraph,
    'note': handleNote,
    'link': handleLink,
    'anchor': handleAnchor,
    'section': handleSection,
    'blockquote': handleBlockQuote,
    'ulink': handleUlink,
    'emphasis': handleEmphasis,
    'indexterm': handleIndexTerm
}

# Tags whose text is merged into the surrounding paragraph instead of
# being formatted as a block of their own.
inlineTags = {'tag': 1, 'literal': 1, 'link': 1,
              'ulink': 1, 'citetitle': 1, 'indexterm': 1,
              'emphasis': 1, 'filename': 1}


# helper functions for usage() and printerr()
def usage():
    """Print the module docstring, which is the usage message."""
    print(__doc__)


def printerr(statement):
    """Write `statement` to STDERR, but only when DEBUG is set."""
    if DEBUG:
        STDERR.write(statement + "\n")


def replaceComment(matchobj):
    """
    Substitution callback turning one '&codebegin;' ... '&codeend;' span
    into the final output.

    Group 1 is the leading run of '<' and spaces of the line carrying
    '&codebegin;', group 2 is the rest of that line and group 3 is the
    code.  The line gets ' >' appended, and the code is followed by a
    line that starts with '<' (presumably the Vim help example markers
    -- confirm against the Vim help file format).
    """
    initspace = matchobj.group(1)
    firstsent = matchobj.group(2)
    code = matchobj.group(3)

    if len(initspace) > 0:
        if initspace[0] == '<':
            lastspace = initspace
        else:
            # Swap one column of indentation for the '<' marker.
            lastspace = '<' + initspace[:-1]
    else:
        lastspace = initspace

    return '\n' + initspace + firstsent + ' >\n' + code + '\n' + lastspace


if __name__ == "__main__":
    option = {}
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ds', ['prefix=', 'help'])
        for oa, ov in opts:
            option[oa] = ov

    except getopt.GetoptError:
        STDERR.write("Usage error: db2vim --help for usage\n")
        sys.exit(1)

    if '--help' in option:
        usage()
        sys.exit(0)

    # These are module globals read by the handlers above.
    TOC_PREFIX = option.get('--prefix', 'ls_')
    DEBUG = '-d' in option
    STRICT = '-s' in option

    if len(args) != 1:
        STDERR.write("Usage error: db2vim --help for usage\n")
        sys.exit(1)

    fileName = args[0]
    FILENAME = re.sub(r'\.\w+$', r'.txt', fileName)

    # Stop right here when the file cannot be read; carrying on would only
    # fail later with a confusing NameError.  The message goes to stderr
    # because stdout carries the generated document.
    try:
        fp = open(fileName, 'rb')
    except IOError as err:
        STDERR.write("Error opening xml file: %s\n" % err)
        sys.exit(1)

    try:
        dom = xml.dom.minidom.parse(fp)
    finally:
        fp.close()

    modeline = r'''
================================================================================
About this file

This file was created automatically from its XML variant using db2vim. db2vim is
a python script which understands a very limited subset of the Docbook XML 4.2
DTD and outputs a plain text file in vim help format.

db2vim can be obtained via anonymous CVS from sourceforge.net. Use

cvs -d:pserver:anonymous@cvs.vim-latex.sf.net:/cvsroot/vim-latex co db2vim

Or you can visit the web-interface to sourceforge CVS at:
http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/vim-latex/db2vim/

================================================================================'''

    pattern = re.compile(
        r'\n([< ]*)([^\n]+)&codebegin;\n(.*?)&codeend;', re.DOTALL)

    processedDoc = handleElement(dom.documentElement)
    while '&codebegin;' in processedDoc:
        replaced = pattern.sub(replaceComment, processedDoc)
        if replaced == processedDoc:
            # A marker the pattern cannot match (e.g. one on the very
            # first line) would otherwise keep this loop spinning forever.
            break
        processedDoc = replaced

    urlsection = r"""
================================================================================
URLs used in this file

"""
    # Labels end in '_<number>'; order by that number so that u_10 comes
    # after u_2 instead of after u_1.
    labels = sorted(
        ((label, url) for url, label in URL_HASH.items()),
        key=lambda pair: int(pair[0].rsplit('_', 1)[1]))
    for label, url in labels:
        urlsection += '*%s* : %s\n' % (label, url)

    processedDoc = processedDoc + urlsection + modeline
    encodedDoc = processedDoc.encode('iso-8859-1')
    # Emit the encoded bytes untouched: Python 3 offers a binary stream as
    # sys.stdout.buffer, on Python 2 sys.stdout itself accepts them.
    getattr(sys.stdout, 'buffer', sys.stdout).write(encodedDoc + b'\n')

# vim:et:sts=4