Código fuente de MediaBook

From Software libre para los países en desarrollo

Jump to: navigation, search

Este es el código fuente de una versión preliminar de MediaBook. A diferencia de lo que señala la impronta de este trabajo, el código de este programa (y, por consiguiente, este listado particular) se distribuye bajo los términos de la licencia pública GNU GPL, versión 2.

Para ejecutarlo:

  • instalar Python 2 (el código usa sintaxis y módulos propios de Python 2, como la sentencia print)
  • instalar httplib2
  • depositar el código en un archivo de texto
  • ejecutar el archivo con Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# MediaBook
# copyright 2006 Manuel Amador (Rudd-O) [[http://rudd-o.com/]]
# Reuses the magnificent Boom! standard
# laid out at http://alistapart.com/articles/boom
# version 0.0.1alpha
# distributed under the GNU GPL, version 2

import sys
import os
import urllib
#from xml.dom.ext.reader import Sax2
import xml.dom
xml.dom.getDOMImplementation("minidom")
import xml.dom.Element
import time
import re
import new
import httplib2
from sets import Set
# resources

# FIXME option to make self-contained file with data: URIs
# FIXME make the META header generation, at least for AUTHOR, easier
# maybe we should leave this particular concern to an external tool such as htmlxmlutils?

# FIXME the title should be detected from the title page if present
# FIXME audit because the trailing slash MATTERS A LOT

def walkTree(node):
	"""Yield node, then each of its descendants, depth-first in document order.

	Objects that have no childNodes attribute are yielded and treated as leaves.
	"""
	yield node
	for child in getattr(node,"childNodes",()):
		for descendant in walkTree(child):
			yield descendant

# manipulation functions

# Single httplib2 client shared by every request; it is handed the path
# ~/.mediabook.cache (presumably its on-disk cache directory -- see httplib2 docs).
httpobject = httplib2.Http(os.path.expanduser("~/.mediabook.cache"))
def fetch_url(url):
	"""Request url through the shared client and return the response body.

	Prints a progress line before the request.
	Raises Exception when the reported status is not 200, 301, 302 or 304.
	"""
	print("Requesting %s" % url)
	response,body = httpobject.request(url)
	status = response["status"]
	if int(status) not in (200,301,304,302):
		raise Exception("Unexpected status %s while retrieving %s" % (status,url))
	return body

def parse_document(string):
	"""Parse the XML text in string with minidom and return the Document."""
	return xml.dom.minidom.parseString(string)

def get_document(url):
	"""Fetch url and return its contents parsed as an XML Document.

	Raises Exception naming the URL (and the underlying parser error) when
	the retrieved contents cannot be parsed. Errors raised while fetching
	propagate unchanged from fetch_url.
	"""
	contents = fetch_url(url)
	try:
		return parse_document(contents)
	except Exception:
		# sys.exc_info() keeps this valid on old and new interpreters alike;
		# the parser's own message is kept so the failure can be diagnosed.
		reason = sys.exc_info()[1]
		raise Exception("XML parsing error in URL %s: %s" % (url,reason))


def firstChildElement(e):
	"""Return the first child of e that is an element node.

	Raises IndexError when e has no element children.
	"""
	elements = []
	for child in e.childNodes:
		if child.nodeType == 1:  # 1 == ELEMENT_NODE
			elements.append(child)
	return elements[0]
def firstChildTextNode(e):
	"""Return the first child of e that is a text node.

	Raises IndexError when e has no text-node children.
	"""
	texts = []
	for child in e.childNodes:
		if child.nodeType == 3:  # 3 == TEXT_NODE
			texts.append(child)
	return texts[0]
def nextSiblingElement(e):
	"""Return the nearest following sibling of e that is an element node.

	Returns None when no element follows e, including the case where e has
	no following sibling at all.
	"""
	sibling = e.nextSibling
	while sibling is not None:
		if sibling.nodeType == 1:  # 1 == ELEMENT_NODE
			return sibling
		sibling = sibling.nextSibling
	return None
def replaceNode(old,new):
	"""Put new in the place old occupies under old's parent; return new."""
	parent = old.parentNode
	parent.replaceChild(new,old)
	return new
def morphElement(old,newTagName):
	"""Re-create old as an element named newTagName and return the new element.

	Attributes are copied and the children are moved over. When old has a
	parent, the new element takes old's place in the tree.
	"""
	replacement = old.ownerDocument.createElement(newTagName)
	for name,value in old.attributes.items():
		replacement.setAttribute(name,value)
	# Walk a snapshot of the child list: the children are being moved out of old.
	for child in list(old.childNodes):
		replacement.appendChild(child)
	if old.parentNode:
		replaceNode(old,replacement)
	return replacement
def removeNode(node):
	"""Detach node from its parent. Returns nothing."""
	parent = node.parentNode
	parent.removeChild(node)
def heightDifference(start,ancestor):
	"""Count the parentNode steps needed to climb from start up to ancestor.

	Returns 0 when start is ancestor itself.
	"""
	steps = 0
	current = start
	while current != ancestor:
		current = current.parentNode
		steps += 1
	return steps
def genid(prefix):
	"""Generate an endless sequence of unicode ids: prefix1, prefix2, prefix3, ..."""
	counter = 1
	while True:
		yield unicode(prefix) + unicode(counter)
		counter += 1
def transformtotext(node):
	"""Return the values of every text node at or below node, joined by single spaces."""
	pieces = []
	for candidate in walkTree(node):
		if candidate.nodeType == 3:  # 3 == TEXT_NODE
			pieces.append(candidate.nodeValue)
	return " ".join(pieces)

# Matches any single character that is not a word character (\w).
# Raw string so the backslash reaches the regex engine untouched.
sanitizer = re.compile(r"[^\w]")
def sanitize_anchor(anchortext):
	"""Return anchortext with every non-word character replaced by a hyphen."""
	return sanitizer.sub("-",anchortext)
def gethtmlheadingnames():
	"""Return the HTML heading tag names h1 through h6, in that order."""
	return [ "h" + str(number) for number in range(1,7) ]
def addClass(node,clas):
	"""Add clas to node's class attribute.

	A non-empty existing value is kept and clas is appended after a space;
	otherwise the attribute is set to clas alone.
	"""
	existing = node.getAttribute("class")
	if existing:
		clas = existing + u" " + clas
	node.setAttribute("class",clas)
def insertAfter(newnode,afterwhich):
	"""Insert newnode as the sibling immediately following afterwhich."""
	parent = afterwhich.parentNode
	successor = afterwhich.nextSibling
	if successor:
		parent.insertBefore(newnode,successor)
	else:
		# afterwhich is the last child, so the new node simply goes at the end.
		parent.appendChild(newnode)


# Names of the book parts. Each name is the id of a <div> on the front page
# and is resolved through get_key_element(); parts that are absent are skipped
# by the callers. storage() emits them in the order front matter, content,
# end matter.
# Front matter parts.
frontmattersections = ["frontcover","halftitlepage","titlepage","imprint","preface","dedication","thanks","toc"]
# Content parts; these are the ones sectionize() is run on.
contentsections = ["bookcontents","appendices"]
# End matter parts.
endmattersections = ["references","glossary","colophon","backcover"]
# Parts whose headings remove_moles() marks with the "no-toc" class, which gen_toc() skips.
notocsections = ["frontcover","halftitlepage","titlepage","backcover"] # FIXME check which ones have to be notocized

def bootstrap(url):
	"""Fetch and parse the front page at url and give it a get_key_element method.

	get_key_element(element) returns the first <div> whose id attribute equals
	element, and raises IndexError when there is no such <div>.
	"""
	def get_key_element(self,element):
		matches = [ div for div in self.getElementsByTagName("div")
			if div.getAttribute("id") == element ]
		return matches[0]

	document = get_document(url)
	# NOTE(review): the method is stored in the class dictionary already bound
	# to this document, so presumably every instance of the class resolves
	# get_key_element against this one document -- confirm that only a single
	# front page is ever bootstrapped.
	bound = new.instancemethod(get_key_element,document,document.__class__)
	document.__class__.__dict__["get_key_element"] = bound

	return document

def sectionize(group,mode):
	"""Turn the <ol>/<li> outline under group into nested, classed <div>s.

	mode is "bookcontents" or "appendices" and selects the class names given
	to each nesting level. Raises ValueError for any other mode (only noticed
	when group contains at least one <li>).
	"""
	def class_for(depth):
		if mode == "appendices":
			if depth == 1: return "appendix"
			return "subappendix-level-"+str(depth)
		if mode == "bookcontents":
			if depth == 1: return "chapter"
			if depth == 2: return "section"
			return "subsection-level-"+str(depth)
		raise ValueError("invalid mode %s passed" % mode)

	# The distance to group is halved; presumably because every outline level
	# contributes two elements (an <ol> and an <li>) to the path.
	for item in group.getElementsByTagName("li"):
		depth = heightDifference(item,group) // 2
		item.setAttribute("class",class_for(depth))
	for item in group.getElementsByTagName("li"):
		morphElement(item,"div")
	# Lift the children of every <ol> in front of it, then drop the emptied lists.
	for ordered in group.getElementsByTagName("ol"):
		for child in list(ordered.childNodes):
			ordered.parentNode.insertBefore(child,ordered)
	for ordered in group.getElementsByTagName("ol"):
		removeNode(ordered)

def subdocument_integration(front_page,url):
	"""Replace the links in each book part with the content of the page they point to.

	For every part present on the front page, everything except li/ol/a/div
	elements and the direct children of links is removed first. Each link
	with an href is then replaced by the <div id="content"> of the linked
	document, which gets class "subdocument" and a sourceurl attribute
	holding the link's href. Content parts integrate every link; front and
	end matter parts integrate only their first link and receive the part
	name as class.
	"""
	def nodeGood(node):
		# True for nodes that are to be thrown away during the cleanup.
		if node.nodeName in ["li","ol","a","div"]: return False
		if node.parentNode.nodeName == "a": return False
		return True

	def cleanup(container):
		# Collect first, remove afterwards: the tree must not change under the walk.
		doomed = [ candidate for candidate in walkTree(container) if nodeGood(candidate) ]
		for candidate in doomed:
			removeNode(candidate)

	def links_in(container):
		return [ anchor for anchor in container.getElementsByTagName("a")
			if anchor.getAttribute("href") ]

	def load_contents(link):
		# Fetch the linked page (fragment stripped) and prepare its content <div>.
		href = link.getAttribute("href")
		subdocument = get_document(urllib.basejoin(url,href.split("#")[0]))
		candidates = [ div for div in subdocument.getElementsByTagName("div")
			if div.getAttribute("id") == "content" ]
		contents = candidates[0]
		contents.setAttribute("sourceurl",link.getAttribute("href"))
		contents.setAttribute("class","subdocument")
		contents.removeAttribute("id")
		return contents

	for name in contentsections:
		try: group = front_page.get_key_element(name)
		except IndexError: continue
		cleanup(group)
		for link in links_in(group):
			replaceNode(link,load_contents(link))

	for name in frontmattersections + endmattersections:
		# FIXME define as resource!
		try: page = front_page.get_key_element(name)
		except IndexError: continue
		cleanup(page)
		link = links_in(page)[0]
		contents = load_contents(link)
		page.setAttribute("class",name)
		replaceNode(link,contents)
		
def anchor_fixup(front_page,url):
	"""Namespace anchors and ids, fold bare anchors into their successors, absolutize hrefs.

	Three passes over the document:
	1. every named <a>, plus every non-<a> element with an id inside a book
	   part, gets name and id set to a sanitized "<sourceurl>---<old name>",
	   where sourceurl is read from the element's parent;
	2. every <a> without href hands its id to the next sibling element and is
	   removed (Exception is raised if that sibling already has an id);
	3. every <a> with href keeps the old value in originalhref and gets an
	   href resolved against url.
	"""
	def jumpids(node):
		target = nextSiblingElement(node)
		if target.getAttribute("id"):
			print(node)
			print(target)
			raise Exception("already had an id %s" % target.getAttribute("id"))
		target.setAttribute("id",node.getAttribute("id"))
		removeNode(node)

	def namespace_anchor(anchor):
		namespace = anchor.parentNode.getAttribute("sourceurl")
		original_anchor_name = anchor.getAttribute("name") or anchor.getAttribute("id")
		newanchor = sanitize_anchor(namespace+u"---"+original_anchor_name)
		anchor.setAttribute("name",newanchor)
		anchor.setAttribute("id",newanchor)

	def makeintoabsolutehref(tag):
		newlink = urllib.basejoin(url,tag.getAttribute("href"))
		tag.setAttribute("originalhref",tag.getAttribute("href"))
		tag.setAttribute("href",newlink)

	def carries_id(candidate):
		return candidate.nodeName != "a" and hasattr(candidate,"getAttribute") and candidate.getAttribute("id")

	tobemapped = [ anchor for anchor in front_page.getElementsByTagName("a")
		if anchor.getAttribute("name") ]
	# Front and end matter first, content parts afterwards.
	for section in frontmattersections + endmattersections + contentsections:
		try:
			for child in front_page.get_key_element(section).childNodes:
				tobemapped.extend([ found for found in walkTree(child) if carries_id(found) ])
		except IndexError: continue

	for target in tobemapped:
		namespace_anchor(target)
	bare = [ plain for plain in front_page.getElementsByTagName("a")
		if not plain.getAttribute("href") ]
	for plain in bare:
		jumpids(plain)
	# Queried only now, after the bare anchors are gone.
	linked = [ link for link in front_page.getElementsByTagName("a")
		if link.getAttribute("href") ]
	for link in linked:
		makeintoabsolutehref(link)
	
def fix_hyperlinks(front_page,url):
	"""Point links that target integrated subdocuments at in-document fragments.

	Expects hrefs already absolutized and originalhref recorded (see
	anchor_fixup). For every <a> with an href:
	- if the original href was a bare "#fragment", the link is rewritten to
	  the namespaced fragment of the subdocument that contains the link;
	- if the href (without fragment) is the URL of an integrated subdocument,
	  the link is rewritten to "#<namespace>---<fragment or top>", sanitized;
	- any other href is left pointing to its absolute URL.
	Links that end up as fragments get the class "fragmentref" appended.
	"""
	def getsourceurl(node):
		# Climb until an ancestor-or-self carries sourceurl. Running off the top
		# of the tree raises (the node reached there has no getAttribute).
		while not node.getAttribute("sourceurl"):
			node = node.parentNode
		return node.getAttribute("sourceurl")

	def buildsourceurlcache():
		# Dictionary used as a set: membership is tested once per link.
		known = {}
		for node in walkTree(front_page):
			if hasattr(node,"getAttribute") and node.getAttribute("sourceurl"):
				known[urllib.basejoin(url,node.getAttribute("sourceurl").split("#")[0])] = True
		return known

	def mark_fragmentref(tag):
		tag.setAttribute("class",(tag.getAttribute("class") or u"") + u" fragmentref")

	cache = buildsourceurlcache()

	def processahref(tag):
		href = tag.getAttribute("href")
		if not href: return
		originalhref = tag.getAttribute("originalhref")

		# startswith() rather than [0]: a link without originalhref must not crash.
		if originalhref.startswith("#"):
			# we deal with this special case here.  Since now every ID is namespaced...
			try: namespace = getsourceurl(tag)
			except Exception:
				print("Skipping URL: No source URL can be found for URL %s found in document" % href)
				return
			fragment = sanitize_anchor(namespace+u"---"+href.split("#")[1])
			mark_fragmentref(tag)
			tag.setAttribute("href",u"#"+fragment)
			return

		if "#" in href:
			uri,fragment = href.split("#",1)
		else:
			uri,fragment = href,None

		if uri in cache:
			attr = ""
			namespace = originalhref.split("#")[0]
			if fragment is None: original_anchor_name = "top"
			else: original_anchor_name = fragment
			fragment = sanitize_anchor(namespace+u"---"+original_anchor_name)
		else:
			attr = uri

		if fragment is not None: attr = attr + "#" + fragment
		tag.setAttribute("href",attr)
		if attr.startswith("#"):
			mark_fragmentref(tag)

	for tag in front_page.getElementsByTagName("a"):
		processahref(tag)
	
def remove_moles(front_page):
	"""Strip wiki page furniture from the document and reshape thumbnails.

	Removes a fixed list of elements selected by id or class, unwraps
	<div id="bodyContent">, turns thumbnail captions into leading
	<p class="caption"> paragraphs, renames the "thumb" class to "figure"
	(unwrapping the figure's first child), marks the headings of the
	notocsections with "no-toc" and drops the <h1> of imprint and dedication.
	"""
	def remove_matching(tagname,attribute,values):
		# Select first, remove afterwards, so removals cannot disturb the selection.
		doomed = [ candidate for candidate in front_page.getElementsByTagName(tagname)
			if candidate.getAttribute(attribute) in values ]
		for candidate in doomed:
			removeNode(candidate)

	remove_matching("div","id",["contentSub","jump-to-nav"])
	remove_matching("table","id",["toc"])
	remove_matching("ul","id",["filetoc"])

	histories = [ found for found in walkTree(front_page)
		if hasattr(found,"getAttribute") and found.getAttribute("id") in ["filehistory"] ]
	for history in histories:
		# Drop up to seven nodes following the marker; stop early when the
		# siblings run out instead of failing on a missing node.
		for unused in range(7):
			if history.nextSibling is None: break
			removeNode(history.nextSibling)
		removeNode(history)

	remove_matching("h3","id",["siteSub"])
	# editsection in the line below disabled: "editsection",
	remove_matching("div","class",["visualClear","printfooter","magnify"])

	bodycontents = [ wrapper for wrapper in front_page.getElementsByTagName("div")
		if wrapper.getAttribute("id") == "bodyContent" ]
	for bodycontent in bodycontents:
		for snippet in list(bodycontent.childNodes):
			bodycontent.parentNode.insertBefore(snippet,bodycontent)
	for bodycontent in bodycontents:
		removeNode(bodycontent)

	def morphthumbcaption(node):
		element = morphElement(node,"p")
		element.setAttribute("class","caption")
		container = element.parentNode
		# Moving a node in front of itself fails in minidom (the node is detached
		# before the reference child is looked up), so skip it when already first.
		if container.firstChild is not element:
			container.insertBefore(element,container.firstChild)
	captions = [ caption for caption in front_page.getElementsByTagName("div")
		if caption.getAttribute("class") == "thumbcaption" ]
	for caption in captions:
		morphthumbcaption(caption)

	def morphfigure(node):
		newlist = []
		for i in node.getAttribute("class").split(" "):
			if i == "thumb": newlist.append("figure")
			else: newlist.append(i)
		node.setAttribute("class"," ".join(newlist))
		# Pull the first child's children up to the end of the figure, then drop it.
		inner = node.childNodes[0]
		for child in list(inner.childNodes):
			node.appendChild(child)
		removeNode(inner)
	figures = [ figure for figure in front_page.getElementsByTagName("div")
		if figure.getAttribute("class") in ["thumb tnone","thumb tleft","thumb tright"] ]
	for figure in figures:
		morphfigure(figure)

	for keyelement in notocsections:
		try: section = front_page.get_key_element(keyelement)
		except IndexError: continue
		for tagname in gethtmlheadingnames():
			for heading in section.getElementsByTagName(tagname):
				addClass(heading,"no-toc")
	for keyelement in ["imprint","dedication"]: #FIXME instead of doing this hard, let's do it with CSS
		try: section = front_page.get_key_element(keyelement)
		except IndexError: continue
		for heading in section.getElementsByTagName("h1"):
			removeNode(heading)
		
def remove_duplicate_ids(front_page):
	"""Remove the id attribute from every element whose id already occurred earlier.

	The document is walked in document order; the first element carrying a
	given non-empty id keeps it, later ones lose theirs.
	"""
	seen = {}  # used as a set: constant-time membership instead of a list scan
	for node in walkTree(front_page):
		if not hasattr(node,"getAttribute"): continue
		identifier = node.getAttribute("id")
		if not identifier: continue
		if identifier in seen:
			#print "Warning: ID %s already exists in the document, removing"%identifier
			node.removeAttribute("id")
		else:
			seen[identifier] = True

def demotion(front_page):
	"""Demote the headings of each subdocument under "bookcontents" by its nesting depth.

	A subdocument two levels below the "bookcontents" element keeps its
	headings; each additional level pushes them down one heading level.
	Raises IndexError when the front page has no "bookcontents" element.
	"""
	toc = front_page.get_key_element("bookcontents")

	def demote_headers_in_node(node,steps):
		# Work from h6 up to h1 so that a heading that has just been demoted
		# is not picked up again by a later pass.
		deepest_first = gethtmlheadingnames()
		deepest_first.reverse()
		for position,oldtagname in enumerate(deepest_first):
			oldheadings = node.getElementsByTagName(oldtagname)
			# Headings that would have to go deeper than h6 are left alone.
			if position-steps < 0 or not oldheadings: continue
			newtagname = deepest_first[position-steps]
			for oldheading in oldheadings:
				morphElement(oldheading,newtagname)

	for node in toc.getElementsByTagName("div"):
		if node.getAttribute("class") != "subdocument": continue
		# NOTE(review): a negative level is truthy too and would promote
		# headings -- presumably subdocuments never sit that shallow; confirm.
		demotion_level = heightDifference(node,toc) - 2
		if demotion_level:
			demote_headers_in_node(node,demotion_level)

def footnotes_and_citations(front_page):
	"""Attach <span class="footnote"> elements to citations and external links.

	Within each book part:
	- every <cite> gets a footnote appended holding its title and the href of
	  its first link (whichever exist), separated by <br/>. The texts are
	  remembered per href, so a later <cite> with the same link in the same
	  part reuses them. The first link of a footnoted <cite> is marked with
	  footnoted="1".
	- every other <a> whose href does not start with "#" gets a footnote with
	  the href inserted right after it, wiki edit links excepted.
	"""
	# FIXME (from the original author): use that to generate the list of
	# references and the page numbers where they are shown
	def get_link(fromcite):
		anchors = fromcite.getElementsByTagName("a")
		if anchors: return anchors[0].getAttribute("href")
		return None

	def make_footnote(document,texts):
		span = document.createElement("span")
		span.setAttribute("class","footnote")
		for position,text in enumerate(texts):
			# Line breaks go between the texts only, never after the last one.
			if position: span.appendChild(document.createElement("br"))
			span.appendChild(document.createTextNode(text))
		return span

	def add_footnote_after_cite(citenode,cite_cache):
		link = get_link(citenode)
		list_of_texts = None
		if link: list_of_texts = cite_cache.get(link)
		if not list_of_texts:
			list_of_texts = [ text for text in (citenode.getAttribute("title"),link) if text ]
		if not list_of_texts: return
		citenode.appendChild(make_footnote(citenode.ownerDocument,list_of_texts))
		anchors = citenode.getElementsByTagName("a")
		if anchors: anchors[0].setAttribute("footnoted","1")
		if link: cite_cache[link] = list_of_texts

	def add_footnote_after_hyperlink(anode):
		href = anode.getAttribute("href")
		if "&action=edit" in href: return # we exclude wiki edit links!
		insertAfter(make_footnote(anode.ownerDocument,[href]),anode)

	def wants_footnote(anode):
		href = anode.getAttribute("href")
		return href and href[0] != "#" and not anode.getAttribute("footnoted")

	for a in frontmattersections + contentsections + endmattersections:
		try: key = front_page.get_key_element(a)
		except IndexError: continue
		cite_cache = {} # url that points to a list of texts that go in the footnote
		for citenode in key.getElementsByTagName("cite"):
			add_footnote_after_cite(citenode,cite_cache)
		# Selected only after the citations are done, so their links are already marked.
		pending = [ anode for anode in key.getElementsByTagName("a") if wants_footnote(anode) ]
		for anode in pending:
			add_footnote_after_hyperlink(anode)

def definitions_abbreviations_acronyms(front_page):
	"""Build the glossary from the <abbr>, <acronym> and <dfn> elements of the book.

	Does nothing when the front page has no "glossary" part. Otherwise every
	distinct term (the text of such an element) becomes a <dt>/<dd> pair in a
	<dl> appended to the glossary's subdocument <div>. The definition text is
	the title attribute of the first occurrence that has one. Each occurrence
	gets a generated id ("dfn1", "dfn2", ...) and a "(ref)" link to it in the
	<dt>. Terms are sorted case-insensitively.
	Raises IndexError when the glossary part has no subdocument <div>.
	"""
	#FIXME  make it so unmarked text gets automatic definitions.  Should this be a job of MediaWiki?
	try: glossarypage = front_page.get_key_element("glossary")
	except IndexError: return

	gendfnid = genid("dfn").next
	dl = glossarypage.ownerDocument.createElement("dl")
	nodes = []
	for a in frontmattersections + contentsections + endmattersections:
		try: key = front_page.get_key_element(a)
		except IndexError: continue
		for tagname in ["abbr","acronym","dfn"]:
			nodes.extend(key.getElementsByTagName(tagname))

	term2definitionnode = {}
	term2sourcenodes = {}
	for node in nodes:
		term = transformtotext(node)
		term2sourcenodes.setdefault(term,[]).append(node)
		if node.getAttribute("title") and term not in term2definitionnode:
			term2definitionnode[term] = node

	def alphasort(listorset):
		# Sort on the lowercased UTF-8 form, original spelling as tie-breaker.
		lowerterms = [ (item.lower().encode("utf-8"),item) for item in listorset ]
		lowerterms.sort()
		return [ pair[1] for pair in lowerterms ]

	for term in alphasort(term2sourcenodes.keys()):
		definitionnode = term2definitionnode.get(term)
		if definitionnode is None:
			print("definition for term %s missing in list of definitions" % term)
		dt = front_page.createElement("dt")
		dd = front_page.createElement("dd")

		# fill the DT out with the term
		if definitionnode:
			for child in definitionnode.childNodes: dt.appendChild(child.cloneNode(True))
			dd.appendChild(front_page.createTextNode(definitionnode.getAttribute("title")))
		else:
			dt.appendChild(front_page.createTextNode("(no node found for term %s)"%term))
			dd.appendChild(front_page.createTextNode("(no node found for term %s)"%term))
		for refnode in term2sourcenodes[term]:
			dfnid = gendfnid()
			refnode.setAttribute("id",dfnid)
			dt.appendChild(front_page.createTextNode(" "))
			pointertoabove = front_page.createElement("a")
			pointertoabove.setAttribute("href",u"#" + dfnid)
			pointertoabove.setAttribute("class","glossaryref")
			pointertoabove.appendChild(front_page.createTextNode("(ref)"))
			dt.appendChild(pointertoabove)

		dl.appendChild(dt)
		dl.appendChild(dd)

	contents = [ div for div in glossarypage.getElementsByTagName("div")
		if div.getAttribute("class") == "subdocument" ][0]
	contents.appendChild(dl)

def gen_ref_list(front_page):
	"""Build the list of references from the <cite> elements of the book.

	Does nothing when the front page has no "references" part. Otherwise each
	distinct URL cited (the href of a <cite>'s first linked <a>) becomes an
	<li> in a <ul> appended to the references subdocument <div>. The entry
	shows the title of the first <cite> for that URL that has one, the URL,
	and one "(ref)" link per citing <cite>, which receives a generated id
	("citation1", ...). Entries appear in order of first citation.
	Raises IndexError when the references part has no subdocument <div>.
	"""
	try: referencespage = front_page.get_key_element("references")
	except IndexError: return

	gencitationid = genid("citation").next

	def geturl(node):
		for anchor in node.getElementsByTagName("a"):
			if anchor.getAttribute("href"): return anchor.getAttribute("href")
		return None

	ul = referencespage.ownerDocument.createElement("ul")
	nodes = []
	for a in frontmattersections + contentsections + endmattersections:
		try: key = front_page.get_key_element(a)
		except IndexError: continue
		nodes.extend(key.getElementsByTagName("cite"))

	url2definitionnode = {}
	url2sourcenodes = {}
	urls = []  # distinct URLs in order of first citation, for a stable listing
	for node in nodes:
		url = geturl(node)
		if not url: continue
		if url not in url2sourcenodes:
			url2sourcenodes[url] = []
			urls.append(url)
		url2sourcenodes[url].append(node)
		if node.getAttribute("title") and url not in url2definitionnode:
			url2definitionnode[url] = node

	for url in urls:
		definitionnode = url2definitionnode.get(url)
		if definitionnode is None:
			print("citation for url %s missing in citation database" % url)
		li = front_page.createElement("li")
		if definitionnode:
			li.appendChild(front_page.createTextNode(definitionnode.getAttribute("title")))
		else:
			li.appendChild(front_page.createTextNode("(no reference description found in any CITE title)"))
			print("Warning: no reference found for %s" % url)
		li.appendChild(front_page.createElement("br"))
		li.appendChild(front_page.createTextNode(url))
		li.appendChild(front_page.createElement("br"))
		for refnode in url2sourcenodes[url]:
			citationid = gencitationid()
			refnode.setAttribute("id",citationid)
			li.appendChild(front_page.createTextNode(" "))
			pointertoabove = front_page.createElement("a")
			pointertoabove.setAttribute("href",u"#" + citationid)
			pointertoabove.setAttribute("class","citationref")
			pointertoabove.appendChild(front_page.createTextNode("(ref)"))
			li.appendChild(pointertoabove)
		ul.appendChild(li)

	contents = [ div for div in referencespage.getElementsByTagName("div")
		if div.getAttribute("class") == "subdocument" ][0]
	contents.appendChild(ul)

def gen_toc(front_page):
	"""Build the table of contents as a nested <ul class="toc"> inside the "toc" part.

	Does nothing when the front page has no "toc" part. Every h1-h6 found in
	the other book parts (in part order, document order within a part) that
	is not marked "no-toc" becomes an <li> holding a link. The link's href
	is the id of the first anchor or heading inside the entry that has one.
	Heading levels map to list nesting. h1 entries are additionally classed
	chapter/appendix/frontmatter/endmatter according to the class of the
	heading's grandparent. The list is attached once, after all parts have
	been processed.
	Raises IndexError when the toc part has no subdocument <div>.
	"""
	try: tocpage  = front_page.get_key_element("toc")
	except IndexError: return
	level = {"h1":1,"h2":2,"h3":3,"h4":4,"h5":5,"h6":6}
	# Wrappers stripped from a TOC entry: anchors first, then the headings.
	unwrapped_tags = ["a","h1","h2","h3","h4","h5","h6"]
	root = front_page.createElement("ul")
	root.setAttribute("class","toc")

	def gentocelement(bodyelement):
		link = bodyelement.ownerDocument.createElement("a")
		link.appendChild(bodyelement.cloneNode(True))
		for tag in unwrapped_tags:
			for node in link.getElementsByTagName(tag):
				# The first id met becomes the target of the TOC link.
				if not link.getAttribute("href") and node.getAttribute("id"):
					link.setAttribute("href",u"#"+node.getAttribute("id"))
				# Unwrap: keep the children, drop the element itself.
				for child in list(node.childNodes): node.parentNode.insertBefore(child,node)
				removeNode(node)
		li = bodyelement.ownerDocument.createElement("li")
		li.appendChild(link)
		return li

	def istagandinclass(node,tagname,clas):
		if node.nodeName != tagname: return False
		return clas in node.parentNode.parentNode.getAttribute("class").split(" ")
	def inanyclass(node,classes):
		for clas in classes:
			if istagandinclass(node,"h1",clas): return True
		return False
	# Checked in this order; a later match overrides an earlier one.
	title_classes = [
		("chapter",lambda node: istagandinclass(node,"h1","chapter")),
		("appendix",lambda node: istagandinclass(node,"h1","appendix")),
		("frontmatter",lambda node: inanyclass(node,frontmattersections)),
		("endmatter",lambda node: inanyclass(node,endmattersections)),
	]

	sections = [ name for name in frontmattersections + contentsections + endmattersections
		if name != "toc" ]
	current_container = root
	oldtoclevel = 1
	for section in sections:
		try: page = front_page.get_key_element(section)
		except IndexError: continue
		for node in walkTree(page):
			if node.nodeType != 1 or node.nodeName not in level: continue
			# we skip the no-tocs marked ones
			if "no-toc" in node.getAttribute("class").split(" "): continue
			# if we reach here, it's a title!
			toclevel = level[node.nodeName]
			if toclevel < oldtoclevel:
				# Each level up leaves a <ul> and the <li> that wraps it.
				for unused in range(oldtoclevel - toclevel):
					current_container = current_container.parentNode.parentNode
			elif toclevel > oldtoclevel:
				for unused in range(toclevel - oldtoclevel):
					li = front_page.createElement("li")
					current_container.appendChild(li)
					ul = front_page.createElement("ul")
					li.appendChild(ul)
					current_container = ul
			element = gentocelement(node)
			for clas,matches in title_classes:
				if matches(node): element.setAttribute("class",clas)
			current_container.appendChild(element)
			oldtoclevel = toclevel

	contents = [ div for div in tocpage.getElementsByTagName("div")
		if div.getAttribute("class") == "subdocument" ][0]
	contents.appendChild(root)
	
def download_images(front_page,url,folder):
	"""Download every <img> of the document into folder/images and relink it.

	The src of each image is resolved against url and fetched. For URLs under
	"images/thumb" the URL of the containing directory, with "images/thumb/"
	replaced by "images/", is fetched instead. The data is stored under the
	URL-unquoted basename of the original src, and src is rewritten to
	"images/<basename>". The images directory is created when missing.
	"""
	images_dir = os.path.join(folder,"images")
	if not os.path.isdir(images_dir):
		os.makedirs(images_dir)
	# FIXME audit because the trailing slash in URL MATTERS A LOT
	thumb_prefix = urllib.basejoin(url,"images/thumb")
	for image in front_page.getElementsByTagName("img"):
		src = image.getAttribute("src")
		filename = os.path.basename(src)
		imgurl = urllib.basejoin(url,src)
		if imgurl.startswith(thumb_prefix):
			imgurl = re.sub("images/thumb/","images/",os.path.dirname(imgurl))
		contents = fetch_url(imgurl)
		# NOTE(review): unquote_plus also turns "+" into a space -- presumably
		# harmless for the file names served here; confirm.
		target = os.path.join(images_dir,urllib.unquote_plus(filename.encode("utf-8")))
		output = open(target,"wb")
		try:
			output.write(contents)
		finally:
			# Close explicitly so the data is flushed and the handle released.
			output.close()
		image.setAttribute("src","images/"+filename)

def storage(front_page,save_file,title,extra_head_markup):
	"""Assemble the final XHTML document and write it to save_file as UTF-8.

	A skeleton page is built with extra_head_markup placed inside <head> and
	a <title> holding title. The book parts present on front_page are then
	moved into its <body> in the order front matter, content, end matter.
	"""
	carcass = \
"""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
%s
</head>
<body>
</body>
</html>"""%extra_head_markup

	#FIXME at some point, this would require more than the TOC
	target_document = parse_document(carcass)
	title_element = target_document.createElement("title")
	title_element.appendChild(target_document.createTextNode(title))
	target_document.getElementsByTagName("head")[0].appendChild(title_element)
	body = target_document.getElementsByTagName("body")[0]
	for a in frontmattersections + contentsections + endmattersections:
		try: page = front_page.get_key_element(a)
		except IndexError: continue
		body.appendChild(page)
	serialized = target_document.toxml("UTF-8")
	# toxml() was asked for an encoding, so the result is encoded data:
	# write it in binary mode and make sure the file gets closed.
	output = open(save_file,"wb")
	try:
		output.write(serialized)
	finally:
		output.close()

def gen_mediawiki_book(url,folder,title,extra_head):
	"""Run the whole conversion: front page at url in, folder/index.html out.

	The content parts are sectionized first. The remaining steps then run in
	a fixed order, each announced by a short progress label on standard output.
	"""
	front_page = bootstrap(url)
	for name in contentsections:
		try: sectionize(front_page.get_key_element(name),name)
		except IndexError: continue

	save_file = os.path.join(folder,"index.html")
	# (progress label, step, arguments) -- the label is printed before the step runs.
	pipeline = [
		("a",subdocument_integration,(front_page,url)),
		("b",remove_moles,(front_page,)),
		("c",anchor_fixup,(front_page,url)),
		("d",remove_duplicate_ids,(front_page,)),
		("e",fix_hyperlinks,(front_page,url)),
		("f",demotion,(front_page,)),
		("g",download_images,(front_page,url,folder)),
		("i",gen_toc,(front_page,)),
		("j",definitions_abbreviations_acronyms,(front_page,)),
		("k",footnotes_and_citations,(front_page,)),
		("l",gen_ref_list,(front_page,)),
		("msunia",storage,(front_page,save_file,title,extra_head)),
	]
	for label,step,arguments in pipeline:
		print(label)
		step(*arguments)

def main():
	"""Convert the book published at software-libre.rudd-o.com into the "book" folder."""
	source_url = "http://software-libre.rudd-o.com/"
	output_folder = "book"
	book_title = u"Software libre para los países en desarrollo"
	stylesheets = '<link rel="stylesheet" href="book.css"/><link rel="stylesheet" href="bookextra.css"/>'
	gen_mediawiki_book(source_url,output_folder,book_title,stylesheets)


if __name__ == "__main__":
	main()
Personal tools