Imagine that you have Nokogiri nodes representing the <a> elements in the following two documents:
<r xmlns:x="foo"><a foo="bar" jim="jam" x:oh="no"><x:b>Hello</x:b></a></r>
<r xmlns:i="foo"><a jim="jam" i:oh="no" foo="bar"><i:b>Hello</i:b></a></r>
The two are equivalent from a DOM standpoint. I’d like to detect this efficiently, but Nokogiri::XML::Node#== just checks object equality. Since Nokogiri 1.5.0 does not yet have support for canonicalization, I can’t just serialize the nodes and compare the strings.
What’s the fastest way to compare two nodes to ensure that their names, attributes, and contents are canonically equivalent?
Answers may rely on features only available in Ruby 1.9.2+, if desired.
Test Cases
# Baseline document that the other fixtures are compared against.
ORIG1 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# Same as ORIG1 except that the attributes on <a2b> are written in the
# opposite order (jim before foo).
ORIG2 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b jim='jam' foo='bar'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# Text-content variants of ORIG1.
# NOTEXT: <a3c> is empty instead of containing the text "foo".
NOTEXT = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c/></a3>
</a>"
# EXTRATEXT1: <a3c> contains "foobar" instead of "foo".
EXTRATEXT1 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foobar</a3c></a3>
</a>"
# EXTRATEXT2: <a3b> contains the text "hi" where ORIG1 has an empty element.
EXTRATEXT2 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b>hi</a3b><a3c>foo</a3c></a3>
</a>"
# Element-structure variants of ORIG1.
# MISSINGNODE: <a3> has no <a3c> child.
MISSINGNODE = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/></a3>
</a>"
# EXTRANODE: <a3> has an additional trailing <a3d/> child.
EXTRANODE = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c><a3d/></a3>
</a>"
# SWAPNODE: the first child of <a3> is named <a3x/> instead of <a3a/>.
SWAPNODE = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3x/><a3b/><a3c>foo</a3c></a3>
</a>"
# Attribute variants of ORIG1.
# MISSINGATTRIB: <a2b> lacks the foo='bar' attribute.
MISSINGATTRIB = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# EXTRAATTRIB1: <a2b> carries an additional kits='meow' attribute.
EXTRAATTRIB1 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam' kits='meow'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# EXTRAATTRIB2: <a1c>, which has no attributes in ORIG1, carries kits='meow'.
EXTRAATTRIB2 = "<a>
<a1><a1a/><a1b/><a1c kits='meow'/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# SWAPATTRIB1: the jim attribute on <a2b> has the value 'zzz' instead of 'jam'.
SWAPATTRIB1 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='zzz'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# SWAPATTRIB2: the attribute holding 'jam' on <a2b> is named zzz instead of jim.
SWAPATTRIB2 = "<a>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' zzz='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# Namespace variants of ORIG1. The NAMESPACE1* fixtures put a namespace prefix
# on the <a3a> element; the NAMESPACE2* fixtures put it on the jim attribute.
# NAMESPACE1: prefix x bound to 'foo', used on <x:a3a/>.
NAMESPACE1 = "<a xmlns:x='foo'>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><x:a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# NAMESPACE1B: same as NAMESPACE1 but the prefix is z (still bound to 'foo').
NAMESPACE1B = "<a xmlns:z='foo'>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><z:a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# NAMESPACE1C: same as NAMESPACE1 but prefix x is bound to 'bar' instead of 'foo'.
NAMESPACE1C = "<a xmlns:x='bar'>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' jim='jam'/><a2c/></a2>
<a3><x:a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# NAMESPACE2: prefix x bound to 'foo', used on the x:jim attribute of <a2b>.
NAMESPACE2 = "<a xmlns:x='foo'>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' x:jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# NAMESPACE2B: same as NAMESPACE2 but the prefix is z (still bound to 'foo').
NAMESPACE2B= "<a xmlns:z='foo'>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' z:jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
# NAMESPACE2C: same as NAMESPACE2 but prefix x is bound to 'bar' instead of 'foo'.
NAMESPACE2C= "<a xmlns:x='bar'>
<a1><a1a/><a1b/><a1c/></a1>
<a2><a2a/><a2b foo='bar' x:jim='jam'/><a2c/></a2>
<a3><a3a/><a3b/><a3c>foo</a3c></a3>
</a>"
require 'nokogiri'
require 'minitest/autorun'
# Test suite pinning down when two parsed XML nodes count as equivalent under
# `=~`. Every comparison is made in both directions where the fixtures allow,
# so the relation is also checked for symmetry.
#
# NOTE(review): assumes the implementation under test defines `=~` for
# Nokogiri nodes; that definition is not part of this class.
# NOTE(review): MiniTest::Unit::TestCase is the older Minitest base-class
# name; newer Minitest releases use Minitest::Test. Confirm against the
# Minitest version in use.
class NodeEquivalence < MiniTest::Unit::TestCase
  # Parses the baseline document once per test.
  def setup
    @o1 = root_of(ORIG1)
  end

  # Attribute order alone must not affect equivalence.
  def test_equivalence
    reordered = root_of(ORIG2)

    assert(@o1 =~ reordered, "Equivalent nodes should be equivalent")
    assert(reordered =~ @o1, "Equivalent nodes should be equivalent")
  end

  # Missing, altered, or additional text content must be detected.
  def test_textnodes
    without_text = root_of(NOTEXT)
    longer_text  = root_of(EXTRATEXT1)
    added_text   = root_of(EXTRATEXT2)

    refute(@o1 =~ without_text, "Notice missing text node child")
    refute(without_text =~ @o1, "Notice missing text node child")
    refute(@o1 =~ longer_text, "Notice different text in text node")
    refute(longer_text =~ @o1, "Notice different text in text node")
    refute(@o1 =~ added_text, "Notice extra text node")
    refute(added_text =~ @o1, "Notice extra text node")
  end

  # Missing, additional, or renamed child elements must be detected.
  def test_nodes
    fewer_nodes  = root_of(MISSINGNODE)
    more_nodes   = root_of(EXTRANODE)
    renamed_node = root_of(SWAPNODE)

    refute(@o1 =~ fewer_nodes, "Notice missing node")
    refute(fewer_nodes =~ @o1, "Notice missing node")
    refute(@o1 =~ more_nodes, "Notice extra node")
    refute(more_nodes =~ @o1, "Notice extra node")
    refute(@o1 =~ renamed_node, "Notice renamed node")
    refute(renamed_node =~ @o1, "Notice renamed node")
  end

  # Missing, additional, or changed attributes (name or value) must be detected.
  def test_attributes
    fewer_attrs   = root_of(MISSINGATTRIB)
    more_attrs    = root_of(EXTRAATTRIB1)
    new_attr      = root_of(EXTRAATTRIB2)
    changed_value = root_of(SWAPATTRIB1)
    changed_name  = root_of(SWAPATTRIB2)

    refute(@o1 =~ fewer_attrs, "Notice missing attribute")
    refute(fewer_attrs =~ @o1, "Notice missing attribute")
    refute(@o1 =~ more_attrs, "Notice extra attribute")
    refute(more_attrs =~ @o1, "Notice extra attribute")
    refute(@o1 =~ new_attr, "Notice new attribute")
    refute(new_attr =~ @o1, "Notice new attribute")
    refute(@o1 =~ changed_value, "Notice changed attribute value")
    refute(changed_value =~ @o1, "Notice changed attribute value")
    refute(@o1 =~ changed_name, "Notice changed attribute name")
    refute(changed_name =~ @o1, "Notice changed attribute name")
  end

  # Namespace URIs matter for both elements and attributes; the prefix used
  # to refer to a URI does not.
  def test_namespaces
    node_ns        = root_of(NAMESPACE1)
    attr_ns        = root_of(NAMESPACE2)
    node_ns_prefix = root_of(NAMESPACE1B)
    attr_ns_prefix = root_of(NAMESPACE2B)
    node_ns_href   = root_of(NAMESPACE1C)
    attr_ns_href   = root_of(NAMESPACE2C)

    refute(@o1 =~ node_ns, "Notice added node namespace")
    refute(node_ns =~ @o1, "Notice removed node namespace")
    refute(@o1 =~ attr_ns, "Notice added attribute namespace")
    refute(attr_ns =~ @o1, "Notice removed attribute namespace")
    assert(node_ns =~ node_ns_prefix, "Different namespace names on nodes don't matter")
    assert(attr_ns =~ attr_ns_prefix, "Different namespace names on attributes don't matter")
    refute(node_ns =~ node_ns_href, "Notice different namespace hrefs on nodes")
    refute(attr_ns =~ attr_ns_href, "Notice different namespace hrefs on attributes")
  end

  private

  # Parses +xml+ with the noblanks parse option and returns the root element.
  def root_of(xml)
    Nokogiri::XML(xml, &:noblanks).root
  end
end
Here’s my current implementation. It is now (it previously was not) namespace aware:
Here’s my benchmark code (using the constants from the test cases above):