#! /usr/bin/env python
'''XML Canonicalization

This module generates canonical XML, as defined in
    http://www.w3.org/TR/xml-c14n

It is limited in that it can only canonicalize an element and all its
children; general document subsets are not supported.
'''

_copyright = '''Copyright 2001, Zolera Systems Inc. All Rights Reserved.
Distributed under the terms of the Python 2.0 Copyright or later.'''
from xml.dom import Node
from xml.ns import XMLNS
import re
import cStringIO
StringIO = cStringIO
import StringIO
def _attrs(E):
    '''Return the attribute collection of node E, or an empty list when
    E.attributes is None or otherwise false.'''
    found = E.attributes
    if found:
        return found
    return []
def _children(E):
    '''Return the child-node collection of node E, or an empty list when
    E.childNodes is None or otherwise false.'''
    kids = E.childNodes
    if kids:
        return kids
    return []
def _sorter(n1, n2):
    '''Sorting predicate for non-NS attributes.

    Orders two attribute nodes by namespaceURI first and localName
    second.  A value of None sorts before any string, so attributes
    without a namespace come first.

    Returns -1, 0 or 1, like the classic cmp() protocol.
    '''
    def _key(attr):
        # Pair every value with an "is set" flag: the flag decides the
        # None-versus-string case, so None is never ordered against a
        # string directly.
        uri, local = attr.namespaceURI, attr.localName
        return ((uri is not None, uri), (local is not None, local))

    k1, k2 = _key(n1), _key(n2)
    # Explicit three-way comparison instead of the cmp() builtin.
    return (k1 > k2) - (k1 < k2)
def _sorter_ns(n1, n2):
    '''Sorting predicate for NS attributes; "xmlns" always comes first.

    Compares two namespace-declaration attribute nodes by localName,
    except that the name "xmlns" sorts ahead of every other name.

    Returns -1, 0 or 1, like the classic cmp() protocol.
    '''
    name1, name2 = n1.localName, n2.localName
    # Equal names must compare equal -- including two "xmlns" names --
    # so that the predicate is a consistent ordering.
    if name1 == name2: return 0
    if name1 == 'xmlns': return -1
    if name2 == 'xmlns': return 1
    # The "is set" flag keeps None from being ordered against a string.
    k1, k2 = (name1 is not None, name1), (name2 is not None, name2)
    return (k1 > k2) - (k1 < k2)
class _implementation:
    '''Implementation class for C14N.

    Creating an instance runs the canonicalization: the constructor
    walks the element it is given and hands every piece of output text
    to the write callable.  Node handlers are looked up in the
    class-level "handlers" table, keyed by DOM node type.
    '''

    # Handlers for each node, by node type.
    handlers = {}

    # pattern/replacement list for whitespace stripping.
    repats = (
        ( re.compile(r'[ \t]+'), ' ' ),
        ( re.compile(r'[\r\n]+'), '\n' ),
    )

    def __init__(self, node, write, nsdict=None, stripspace=0, nocomments=1):
        '''Create and run the implementation.

        node -- the DOM element to canonicalize
        write -- callable invoked with each chunk of output text
        nsdict -- prefix:uri dictionary assumed to be in scope around
            node; when None or empty, only "xml" and "xmlns" are assumed
        stripspace -- if true, collapse whitespace runs in text nodes
        nocomments -- if true, comment nodes produce no output

        Raises TypeError if node is not an element node.
        '''
        if node.nodeType != Node.ELEMENT_NODE:
            raise TypeError('Non-element node')
        self.write, self.stripspace, self.nocomments = \
                write, stripspace, nocomments

        if not nsdict:
            nsdict = { 'xml': XMLNS.XML, 'xmlns': XMLNS.BASE }
        self.ns_stack = [ nsdict ]

        # Collect the initial list of xml:XXX attributes.
        xmlattrs = []
        for a in _attrs(node):
            if a.namespaceURI == XMLNS.XML:
                n = a.localName
                xmlattrs.append(n)

        # Walk up and get all xml:XXX attributes we inherit.  The
        # nearest ancestor wins: a name that has already been seen is
        # not collected again from an ancestor further up.
        parent, inherited = node.parentNode, []
        while parent:
            if parent.nodeType != Node.ELEMENT_NODE: break
            for a in _attrs(parent):
                if a.namespaceURI != XMLNS.XML: continue
                n = a.localName
                if n not in xmlattrs:
                    xmlattrs.append(n)
                    inherited.append(a)
            parent = parent.parentNode

        self._do_element(node, inherited)

    def _do_text(self, node):
        '''Process a text node.

        Writes node.data with "&", "<", ">" and carriage return
        replaced by character references.  With stripspace set, runs of
        blanks/tabs and of line breaks are collapsed afterwards.
        Nothing is written when the result is empty.
        '''
        # "&" must be replaced first so that the references produced by
        # the later replacements are not escaped a second time.
        s = node.data \
                .replace("&", "&amp;") \
                .replace("<", "&lt;") \
                .replace(">", "&gt;") \
                .replace("\015", "&#xD;")
        if self.stripspace:
            for pat,repl in _implementation.repats: s = pat.sub(repl, s)
        if s: self.write(s)
    handlers[Node.TEXT_NODE] =_do_text
    handlers[Node.CDATA_SECTION_NODE] =_do_text

    def _do_pi(self, node):
        '''Process a PI node.  Since we start with an element, we're
        never a child of the root, so we never write leading or trailing
        #xA.'''
        W = self.write
        W('<?')
        W(node.nodeName)
        s = node.data
        if s:
            W(' ')
            W(s)
        W('?>')
    handlers[Node.PROCESSING_INSTRUCTION_NODE] =_do_pi

    def _do_comment(self, node):
        '''Process a comment node.  Since we start with an element, we're
        never a child of the root, so we never write leading or trailing
        #xA.  Writes nothing when comments are being dropped.'''
        if self.nocomments: return
        W = self.write
        W('<!--')
        W(node.data)
        W('-->')
    handlers[Node.COMMENT_NODE] =_do_comment

    def _do_attr(self, n, value):
        '''Process an attribute.

        n -- the attribute name as it is to be written
        value -- the attribute value; "&", "<", the double quote, tab,
            line feed and carriage return are written as references
        '''
        W = self.write
        W(' ')
        W(n)
        W('="')
        # "&" first, for the same reason as in _do_text.
        s = value \
            .replace("&", "&amp;") \
            .replace("<", "&lt;") \
            .replace('"', '&quot;') \
            .replace('\011', '&#x9;') \
            .replace('\012', '&#xA;') \
            .replace('\015', '&#xD;')
        W(s)
        W('"')

    def _do_element(self, node, initialattrlist=None):
        '''Process an element (and its children).

        node -- the element to write
        initialattrlist -- extra attribute nodes to write on this
            element in addition to its own non-namespace attributes
            (the constructor passes inherited xml:XXX attributes here)
        '''
        name = node.nodeName
        W = self.write
        W('<')
        W(name)

        # Get parent namespace, make a copy for us to inherit.
        parent_ns = self.ns_stack[-1]
        my_ns = parent_ns.copy()

        # Divide attributes into NS definitions and others.
        nsnodes, others = [], list(initialattrlist or [])
        for a in _attrs(node):
            if a.namespaceURI == XMLNS.BASE:
                nsnodes.append(a)
            else:
                others.append(a)

        # Namespace attributes: update dictionary; if not already
        # in parent, output it.
        nsnodes.sort(_sorter_ns)
        for a in nsnodes:
            # Some DOMs seem to rename "xmlns='xxx'" strangely
            n = a.nodeName
            if n == "xmlns:":
                key, n = "", "xmlns"
            else:
                key = a.localName

            v = my_ns[key] = a.nodeValue
            pval = parent_ns.get(key, None)

            if n == "xmlns" and v in [ '', XMLNS.BASE ] \
            and pval in [ '', XMLNS.BASE ]:
                # Default namespace set to default value.
                pass
            elif v != pval:
                self._do_attr(n, v)

        # Other attributes: sort and output.
        others.sort(_sorter)
        for a in others: self._do_attr(a.nodeName, a.value)
        W('>')

        # Push our namespace dictionary, recurse, pop the dictionary.
        self.ns_stack.append(my_ns)
        for c in _children(node):
            # XXX Ignore unknown node types?  For now a node type with
            # no registered handler raises KeyError.
            _implementation.handlers[c.nodeType](self, c)
        self.ns_stack.pop()
        W('</%s>' % (name,))
    handlers[Node.ELEMENT_NODE] =_do_element
def Canonicalize(node, output=None, **kw):
    '''Canonicalize a DOM element node and everything underneath it.
    Return the text; if output is specified then output.write will
    be called to output the text and None will be returned
    Keyword parameters:
        stripspace -- remove extra (almost all) whitespace from text nodes
        nsdict -- a dictionary of prefix:uri namespace entries assumed
            to exist in the surrounding context
        comments -- keep comments if non-zero (default is zero)
    Raises TypeError if node is not an element node.
    '''
    # Test for None explicitly: a caller-supplied output object must be
    # used even if it happens to evaluate as false.
    if output is None:
        s = StringIO.StringIO()
        write = s.write
    else:
        write = output.write

    # Constructing the implementation object performs the whole run.
    _implementation(node,
        write,
        nsdict=kw.get('nsdict', {}),
        stripspace=kw.get('stripspace', 0),
        nocomments=kw.get('comments', 0) == 0,
    )
    if output is None: return s.getvalue()
if __name__ == '__main__':
    # Self-test: parse a small SOAP-style document and print the
    # canonical form of each grandchild element of the document three
    # times -- with whitespace stripping and an assumed "spare"
    # namespace, with defaults, and with comments kept.
    text = '''<SOAP-ENV:Envelope xml:lang='en'
      xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
      xmlns:SOAP-ENC="http://schemas.xmlsoap.org/soap/encoding/"
      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xmlns:xsd="http://www.w3.org/2001/XMLSchemaZ" xmlns:spare='foo'
      SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/">
        <SOAP-ENV:Body xmlns='test-uri'><?MYPI spenser?>
            <zzz xsd:foo='xsdfoo' xsi:a='xsi:a'/>
            <SOAP-ENC:byte>44</SOAP-ENC:byte>       <!-- 1 -->
            <Name xml:lang='en-GB'>This is the name</Name>Some
content here on two lines.
            <n2><![CDATA[<greeting>Hello</greeting>]]></n2> <!-- 3 -->
            <n3 href='z&amp;zz' xsi:type='SOAP-ENC:string'>
                        more content.  indented    </n3>
            <a2:n4 xmlns:a2='urn:foo'><![CDATA[ 12rich salz]]></a2:n4>

            <nv xmlns:q='qqq' xmlns='ttt'>The value of n3</nv>
        </SOAP-ENV:Body>
    </SOAP-ENV:Envelope>'''

    print(_copyright)
    from xml.dom.ext.reader import PyExpat
    reader = PyExpat.Reader()
    dom = reader.fromString(text)
    for e in _children(dom):
        if e.nodeType != Node.ELEMENT_NODE: continue
        for ee in _children(e):
            if ee.nodeType != Node.ELEMENT_NODE: continue
            print('\n' + '=' * 60)
            print(Canonicalize(ee, nsdict={'spare':'foo'}, stripspace=1))
            print('-' * 60)
            print(Canonicalize(ee, stripspace=0))
            print('-' * 60)
            print(Canonicalize(ee, comments=1))
            print('=' * 60)