After exporting pages from Confluence, the next step was to convert them from the Confluence wiki format to the MediaWiki format. The differences are sometimes quite amusing - in a link whose displayed name differs from the page it links to, Confluence puts the name first and the page second (`[name|page]`), while MediaWiki puts the page first and the name second (`[[page|name]]`).

Not really my best code - I think I was writing Python in a very PHPesque way. But it functioned sufficiently to convert our pages.

#!/usr/bin/env python

from cStringIO import StringIO
import os.path
import codecs
import re

def append_page(all_files, name, filenames):
    """Record directory entries; used as the visitor for ``os.path.walk``.

    Args:
        all_files: List that is extended in place with the collected names.
        name: Second argument supplied by the walker; not used here.
        filenames: Iterable of entries to record.

    Each entry is stored as its ``'%s'`` rendering, without any directory
    prefix.
    """
    all_files.extend(['%s' % (entry,) for entry in filenames])

# Raw strings keep the regex backslashes out of Python's own escape handling.
link_re = re.compile(r"\[([^\]]+)\]")
linktext_re = re.compile(r"\[([^\|\]]+)\|([^\]]+)\]")
bold_re = re.compile(r"\*([^ *]+)\*")
italic_re = re.compile(r"\b_([^ _]+)_\b")

# Confluence heading prefix ("h1." .. "h4."), optionally indented.  Anchored to
# the start of the line so that text such as "graph1.png" is left alone.
_heading_re = re.compile(r"^(\s*)h([1-4])\.")

# A link target containing one of these is an external URL, not a page name.
_EXTERNAL_SCHEMES = ('http:', 'https:', 'ftp:')

_NOFORMAT = '{noformat}'


def _convert_heading(line):
    """Turn a leading Confluence ``hN.`` marker into MediaWiki ``=`` markers."""
    match = _heading_re.match(line)
    if not match:
        return line
    # h1 becomes "==", h2 becomes "===", and so on: one more "=" than N.
    marker = "=" * (int(match.group(2)) + 1)
    return match.group(1) + marker + line[match.end():] + " " + marker


def _convert_link(match):
    """Rewrite one bracketed Confluence link (a ``link_re`` match)."""
    named = linktext_re.match(match.group(0))
    if named:
        # Confluence writes [text|target]; MediaWiki wants the target first.
        text, target = named.group(1), named.group(2)
    else:
        text, target = None, match.group(1)

    if any(scheme in target for scheme in _EXTERNAL_SCHEMES):
        # External links use single brackets and a space before the label.
        if text is None:
            return "[%s]" % (target,)
        return "[%s %s]" % (target, text)

    if text is None:
        return "[[%s]]" % (target,)
    return "[[%s|%s]]" % (target, text)


def _convert_inline(text):
    """Convert links, bold and italic markup in text outside ``{noformat}``."""
    text = link_re.sub(_convert_link, text)
    text = bold_re.sub(r"'''\1'''", text)
    return italic_re.sub(r"''\1''", text)


def mangle(contents):
    """Convert Confluence wiki markup to MediaWiki markup.

    Handled constructs:

    * ``h1.`` .. ``h4.`` at the start of a line become ``==`` .. ``=====``
      headings.
    * ``[page]`` and ``[text|page]`` become ``[[page]]`` and
      ``[[page|text]]``; targets containing ``http:``, ``https:`` or ``ftp:``
      become ``[url]`` / ``[url text]``.  Each link is classified on its own.
    * ``*word*`` becomes ``'''word'''`` and ``_word_`` becomes ``''word''``.
    * ``{noformat}`` markers become alternating ``<pre>`` / ``</pre>`` tags,
      and text between an opening and closing marker is copied unchanged.

    Args:
        contents: The page text; lines are separated by ``"\\n"``.

    Returns:
        The converted text.  Every input line is emitted followed by a
        newline, so the result always ends with ``"\\n"``.
    """
    in_noformat = False
    converted = []
    for line in contents.split("\n"):
        if not in_noformat:
            line = _convert_heading(line)

        # Splitting on the marker lets each occurrence toggle the state
        # individually, even when a block opens and closes on one line.
        pieces = []
        for index, segment in enumerate(line.split(_NOFORMAT)):
            if index:
                pieces.append('</pre>' if in_noformat else '<pre>')
                in_noformat = not in_noformat
            if not in_noformat:
                segment = _convert_inline(segment)
            pieces.append(segment)

        converted.append("".join(pieces) + "\n")

    # Joining on a unicode literal keeps the result a text (unicode) string
    # on Python 2, matching the previous decode-on-return behaviour.
    return u"".join(converted)

files = []
os.path.walk('orig-pages', append_page, files)

print files

for f in files:
    contents = codecs.open('orig-pages/%s' % (f,), 'r', 'utf-8').read()
    contents = mangle(contents)
    codecs.open('conv-pages/%s' % (f,), 'w', 'utf-8').write(contents)