#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert LaTeX source into TeXML by emitting SAX events.

The input is tokenised line by line with a regular expression and a small
stack machine turns the tokens into ``cmd``, ``env``, ``parm``, ``opt``,
``group`` and ``math`` elements in the TeXML namespace.

The code is written so that it runs unchanged on Python 2.6+ and Python 3.
"""

import re
import sys
import os
try:
    # Not used directly by the visible code; kept from the original file.
    # Guarded because the module does not exist on every platform.
    import readline
except ImportError:
    readline = None
from xml.sax.saxutils import XMLGenerator
import xml.sax.xmlreader


def create_attrs(attrs):
    """Wrap a ``{(namespace, local_name): value}`` dict for SAX.

    The qualified name of every attribute is its local name, i.e. the
    attributes are emitted without a prefix.
    """
    qnames = dict((key, key[1]) for key in attrs)
    return xml.sax.xmlreader.AttributesNSImpl(attrs, qnames)


class LatexParseError(Exception):
    """Raised when the token stream does not fit the parser state.

    A snapshot of the reader's ``verbatim_mode`` flag and of its stack is
    stored at construction time so the error can be reported later.
    """

    def __init__(self, message, reader):
        Exception.__init__(self, message)
        self.verbatim_mode = reader.verbatim_mode
        self.stack = list(reader.stack)
        self.message = message

    def __repr__(self):
        return u"%s: %d %s" % (self.message, self.verbatim_mode, self.stack)

    # ``print(e)`` goes through __str__; keep it showing the full diagnostic
    # instead of the bare message that Exception.__str__ would give.
    __str__ = __repr__


class LatexReader:
    """Incremental LaTeX tokenizer that writes TeXML to a SAX handler.

    Parser state lives in ``self.stack`` whose entries are short markers:

    * ``u'\\0'``            bottom-of-stack sentinel
    * ``u'\\\\name'``       command whose ``cmd`` element is still open
    * ``u'\\\\begin'``      pseudo command pushed right after an environment
                            is opened (popped without closing a ``cmd``)
    * ``u'{'`` / ``u'['``   open ``parm``/``group`` or ``opt`` element
    * ``u'e<'`` / ``u'e>'`` ``\\begin`` / ``\\end`` seen, ``{`` not yet seen
    * ``u'E<name'`` / ``u'E>name'``  environment name being collected
    * ``u'@name'``          open environment
    * ``u'm'``, ``u'$'``    inline math opened by ``$``
    * ``u't'`` / ``u'T'``   text command inside math, before/after its ``{``
    """

    # group 1: control sequence, group 2: single delimiter, group 3: text
    latex_re = re.compile(
        r"""(\\[a-zA-Z]+|\\[^a-zA-Z])|([\${}\[\]])|([^\\\${}\[\]]+)""")
    # Explicit ASCII whitespace so the meaning is the same on Python 2 and 3.
    blank_re = re.compile(r"""^[ \t\n\r\f\v]*$""")
    texml_ns = u'http://getfo.sourceforge.net/texml/ns1'
    math_text_commands = frozenset([
        u'\\textrm', u'\\textsl', u'\\textit', u'\\texttt', u'\\textbf',
        u'\\textnormal', u'\\text', u'\\hbox', u'\\mbox'
    ])
    math_mode_environments = frozenset([
        u'displaymath', u'align', u'align*'
    ])

    def __init__(self, f, encoding='us-ascii', out=None):
        """Start the output document and open the root ``TeXML`` element.

        f        -- object with ``readline()``; may yield bytes or text
        encoding -- used to decode lines when ``f`` yields bytes
        out      -- SAX content handler; defaults to an ``XMLGenerator``
                    writing us-ascii to ``sys.stdout``
        """
        if out is None:
            out = XMLGenerator(sys.stdout, 'us-ascii')
        self.file = f
        self.encoding = encoding
        self.out = out
        self.out.startDocument()
        self.out.startPrefixMapping(None, self.texml_ns)
        self.out.startElementNS((self.texml_ns, u'TeXML'), u'TeXML', {})
        self.out.characters(u"\n")
        self.stack = [u'\0']
        self.verbatim_mode = False
        self.line = None
        self.pos = 0

    def close(self):
        """Close the root element and finish the output document."""
        self.out.endElementNS((self.texml_ns, u'TeXML'), u'TeXML')
        self.out.characters(u"\n")
        self.out.endPrefixMapping(None)
        self.out.endDocument()

    def next_token(self):
        """Return the next token, or ``None`` at end of input.

        A whitespace-only line is returned as the string ``u"\\n"``; every
        other token is a match object of ``latex_re``.
        """
        while True:
            if self.line is None:
                line = self.file.readline()
                if isinstance(line, bytes):
                    line = line.decode(self.encoding)
                self.line = line
                self.pos = 0
                if self.line == u"":
                    self.line = None
                    return None
                if self.blank_re.match(self.line):
                    self.line = None
                    return u"\n"
            else:
                m = self.latex_re.match(self.line, self.pos)
                if m is None:
                    # Nothing more to match on this line: fetch the next one.
                    self.line = None
                else:
                    self.pos = m.end()
                    return m

    def process_token(self, m):
        """Consume one token returned by ``next_token``.

        Raises ``LatexParseError`` for an unexpected ``$`` or ``\\]`` in
        verbatim (math) mode and for a mismatched ``\\end{...}``.
        """
        if self.verbatim_mode:
            self._process_verbatim(m)
        elif m == u"\n":
            # A blank line outside math is a paragraph break.
            self._start(u'cmd', {u'name': u'par'})
            self._end(u'cmd')
            self.out.characters(u"\n")
        elif m.group(1) is not None:
            self._process_command(m.group(1))
        elif m.group(2) is not None:
            self._process_delimiter(m.group(2))
        else:
            self._process_text(m.group(0))

    def parse(self):
        """Process tokens until the input is exhausted."""
        while True:
            m = self.next_token()
            if m is None:
                return
            self.process_token(m)

    # -- private helpers ---------------------------------------------------

    def _start(self, gi, attrs=None):
        """Open element ``gi``; ``attrs`` maps unprefixed names to values."""
        ns_attrs = dict(((None, key), value)
                        for key, value in (attrs or {}).items())
        self.out.startElementNS((self.texml_ns, gi), gi,
                                create_attrs(ns_attrs))

    def _end(self, gi):
        """Close element ``gi``."""
        self.out.endElementNS((self.texml_ns, gi), gi)

    def _close_pending_command(self):
        """Pop a command marker from the stack top, if there is one.

        A real command gets its ``cmd`` element closed; the ``\\begin``
        pseudo command is only popped.  If that uncovers a math-mode
        environment, verbatim mode is switched on for the following tokens.
        """
        top = self.stack[-1]
        if top[0] != u'\\':
            return
        if top != u'\\begin':
            self._end(u'cmd')
        self.stack.pop()
        below = self.stack[-1]
        if below[0] == u'@' and below[1:] in self.math_mode_environments:
            self.verbatim_mode = True

    def _process_verbatim(self, m):
        """Handle one token while inside math (verbatim) mode."""
        if m == u"\n":
            self.out.characters(u"\n")
        elif m.group(2) is not None:
            self._process_verbatim_delimiter(m.group(2))
        elif m.group(1) is not None:
            self._process_verbatim_command(m.group(1))
        else:
            self.out.characters(m.group(0))

    def _process_verbatim_delimiter(self, ch):
        """Handle ``{ } $ [ ]`` in verbatim mode."""
        top = self.stack[-1]
        if ch == u'{':
            if top == u't':
                # The brace belongs to a text command such as \text{...}.
                self.stack[-1] = u'T'
            else:
                self.stack.append(ch)
            self.out.characters(ch)
        elif ch == u'}':
            self.stack.pop()
            self.out.characters(ch)
        elif ch == u'$':
            if top == ch:
                self.stack.pop()
                if self.stack[-1] == u'm':
                    # Closing the ``$`` that opened this math element.
                    self.stack.pop()
                    self.verbatim_mode = False
                    self._end(u'TeXML')
                    self._end(u'math')
                else:
                    self.out.characters(ch)
            elif top == u'T':
                # ``$`` nested inside a text command: copy it through.
                self.stack.append(ch)
                self.out.characters(ch)
            else:
                raise LatexParseError(u"unexpected $", self)
        else:
            self.out.characters(ch)

    def _process_verbatim_command(self, cmd):
        """Handle a control sequence in verbatim mode."""
        if cmd in self.math_text_commands:
            self.stack.append(u't')
            self.out.characters(cmd)
        elif cmd == u'\\end' and self.stack[-1][0] == u'@':
            self.stack.append(u'e>')
            self.verbatim_mode = False
        elif cmd == u'\\]':
            if self.stack[-1] == u'@displaymath':
                self.stack.pop()
                self.verbatim_mode = False
                self._end(u'env')
            else:
                # The original call omitted the reader argument, which made
                # this line raise TypeError instead of the parse error.
                raise LatexParseError(u'unexpected \\]', self)
        else:
            self.out.characters(cmd)

    def _process_command(self, cmd):
        """Handle a control sequence outside verbatim mode."""
        self._close_pending_command()
        if cmd == u'\\begin':
            self.stack.append(u'e<')
        elif cmd == u'\\end':
            self.stack.append(u'e>')
        elif cmd == u'\\[':
            self.stack.append(u'@displaymath')
            self.verbatim_mode = True
            self._start(u'env', {u'name': u'displaymath'})
        else:
            self._start(u'cmd', {u'name': cmd[1:]})
            self.stack.append(cmd)

    def _process_delimiter(self, ch):
        """Handle ``{ } $ [ ]`` outside verbatim mode."""
        if ch == u'{':
            top = self.stack[-1]
            if top[0] == u'e':
                # Brace after \begin or \end: start collecting the name.
                self.stack[-1] = u'E' + top[1]
            else:
                gi = u'parm' if top[0] == u'\\' else u'group'
                self._start(gi)
                self.stack.append(ch)
            return
        if ch == u'[':
            if self.stack[-1][0] == u'\\':
                self._start(u'opt')
                self.stack.append(ch)
            else:
                self.out.characters(ch)
            return

        self._close_pending_command()
        if ch == u'}':
            self._close_brace()
        elif ch == u'$':
            self.stack.append(u'm')
            self.stack.append(ch)
            self.verbatim_mode = True
            self._start(u'math')
            self._start(u'TeXML', {u'escape': u"0"})
        elif ch == u']':
            if self.stack[-1] == u'[':
                self._end(u'opt')
                self.stack.pop()
            else:
                self.out.characters(ch)

    def _close_brace(self):
        """Handle ``}``: end a group/parm or finish ``\\begin``/``\\end``."""
        top = self.stack[-1]
        if top[0] != u'E':
            self.stack.pop()
            gi = u'parm' if self.stack[-1][0] == u'\\' else u'group'
            self._end(gi)
            return

        env_name = top[2:]
        if top[1] == u'<':
            self._start(u'env', {u'name': env_name})
            self.stack.pop()
            self.stack.append(u'@' + env_name)
            self.stack.append(u'\\begin')
            return

        self.stack.pop()
        self._end(u'env')
        if self.stack[-1][0] != u'@':
            raise LatexParseError(u"unexpected end", self)
        old_env_name = self.stack.pop()[1:]
        if old_env_name != env_name:
            raise LatexParseError(u"error in environment name", self)

    def _process_text(self, text):
        """Handle plain text outside verbatim mode."""
        if self.stack[-1][0] == u'E':
            # Still inside \begin{...} / \end{...}: this is the name.
            self.stack[-1] = self.stack[-1] + text
        else:
            self._close_pending_command()
            self.out.characters(text)


def debug(filename):
    """Open a PyGTK window that steps through ``filename`` token by token.

    Each click on the button reads one token, appends it to the LaTeX view,
    processes it (the XML written so far appears in the second view) and
    shows the parser stack, prefixed with ``V`` while in verbatim mode.
    Returns the started monitor thread.
    """
    import threading
    import pygtk
    pygtk.require('2.0')
    import gtk
    gtk.threads_init()

    class TextViewWriterStream:
        """Minimal writable stream that appends to a ``gtk.TextView``."""

        def __init__(self, textview=None):
            self.textview = textview

        def write(self, text):
            text_buffer = self.textview.get_buffer()
            text_buffer.insert(text_buffer.get_end_iter(), text)

    class LatexReaderMonitor(threading.Thread):
        """Thread running the GTK main loop for the debugger window."""

        def process_once(self, widget=None, data=None):
            t = self.reader.next_token()
            if t is None:
                return
            text_buffer = self.latexview.get_buffer()
            if t == u"\n":
                text_buffer.insert(text_buffer.get_end_iter(), "\n")
            else:
                text_buffer.insert(text_buffer.get_end_iter(), t.group(0))
            self.reader.process_token(t)
            if self.reader.verbatim_mode == False:
                self.stackview.set_text(u" ".join(self.reader.stack[1:]))
            else:
                self.stackview.set_text(
                    u" ".join([u'V'] + self.reader.stack[1:]))

        def delete_event(self, widget, event, data=None):
            return False

        def destroy(self, widget, data=None):
            gtk.main_quit()

        def attach(self, reader):
            self.reader = reader

        def __init__(self):
            threading.Thread.__init__(self)
            self.reader = None
            self._xml_writer_stream = None
            self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
            self.window.connect("delete_event", self.delete_event)
            self.window.connect("destroy", self.destroy)
            box1 = gtk.VBox()
            box2 = gtk.HBox()
            box1.pack_start(box2, True, True, 0)
            self.window.add(box1)

            button1 = gtk.Button(u"處理下一個的 _token")
            box2.pack_end(button1, True, True, 0)
            button1.connect("clicked", self.process_once)
            button1.show()

            self.stackview = gtk.Label("")
            box2.pack_end(self.stackview, True, True, 0)
            self.stackview.set_justify(gtk.JUSTIFY_LEFT)
            self.stackview.set_line_wrap(True)
            self.stackview.show()

            latexviewscroll = gtk.ScrolledWindow()
            self.latexview = gtk.TextView()
            self.latexview.set_editable(False)
            latexviewscroll.add_with_viewport(self.latexview)
            box1.pack_start(latexviewscroll, True, True, 0)
            self.latexview.show()
            latexviewscroll.show()

            xmlviewscroll = gtk.ScrolledWindow()
            self.xmlview = gtk.TextView()
            self.xmlview.set_editable(False)
            xmlviewscroll.add_with_viewport(self.xmlview)
            box1.pack_start(xmlviewscroll, True, True, 0)
            self.xmlview.show()
            xmlviewscroll.show()

            box2.show()
            box1.show()
            self.window.show()

        def xml_writer_stream(self):
            """Return the (lazily created) XML writer feeding the XML view."""
            if self._xml_writer_stream is None:
                self._xml_writer_stream = XMLGenerator(
                    TextViewWriterStream(self.xmlview), "utf-8")
            return self._xml_writer_stream

        def run(self):
            gtk.main()

    mon = LatexReaderMonitor()
    # The file stays open on purpose: it is read lazily, one token per click.
    reader = LatexReader(open(filename), out=mon.xml_writer_stream())
    mon.attach(reader)
    mon.start()
    return mon


def main():
    """Convert LaTeX from stdin to TeXML on stdout; print parse errors."""
    try:
        reader = LatexReader(sys.stdin)
        reader.parse()
        reader.close()
    except LatexParseError as e:
        print(e)


if __name__ == "__main__":
    main()