Skip to content

XML Parsing

#!/usr/bin/env perl
# Demonstrates basic XML::Twig navigation on the XML document stored in the
# __DATA__ section of this script: root lookup, child access, attribute
# reads and XPath queries.
use strict;
use warnings 'all';
use XML::Twig;

# Parse the XML held in the __DATA__ section.
my $twig = XML::Twig->parse( \*DATA );

# We can use the 'root' method to find the root of the XML.
my $root = $twig->root;

# first_child finds the first child element matching a value.
my $title = $root->first_child('title');

# text reads the text of the element.
my $title_text = $title->text;
print "Title is: ", $title_text, "\n";

# The above could be combined:
print $twig->root->first_child_text('title'), "\n";

# You can use the 'children' method to iterate multiple items:
my $list = $twig->root->first_child('list');

# children can optionally take an element 'tag' - otherwise it just returns
# all of them.
foreach my $element ( $list->children ) {
    # The 'att' method reads an attribute. Elements without an 'id'
    # attribute (such as <notanitem>) fall back to 'none here'.
    print "Element with ID: ", ( $element->att('id') // 'none here' ),
        " is ", $element->text, "\n";
}

# And if we need to do something more complicated, we can use 'xpath'.
# get_xpath or findnodes do the same thing:
# return a list of matches, or if you specify a second numeric argument,
# just that numbered match.
# xpath syntax is fairly extensive, but in this one - we search:
#   anywhere in the tree: //
#   nodes called 'item'
#   with an id attribute [@id]
#   and with that id attribute equal to "1000".
# By specifying '0' we say 'return just the first match'.
print "Item 1000 is: ", $twig->get_xpath( '//item[@id="1000"]', 0 )->text, "\n";

# This combines quite well with `map` to e.g. do the same thing on multiple
# items. The trailing "\n" terminates the last line of output.
print "All IDs:\n",
    join( "\n", map { $_->att('id') } $twig->get_xpath('//item') ), "\n";

# Note how this also finds the item under 'summary', because of //
__DATA__
<?xml version="1.0" encoding="utf-8"?>
<root>
<title>some sample xml</title>
<first key="value" key2="value2">
<second>Some text</second>
</first>
<third>
<fourth key3="value">Text here too</fourth>
</third>
<list>
<item id="1">Item1</item>
<item id="2">Item2</item>
<item id="3">Item3</item>
<item id="66">Item66</item>
<item id="88">Item88</item>
<item id="100">Item100</item>
<item id="1000">Item1000</item>
<notanitem>Not an item at all really.</notanitem>
</list>
<summary>
<item id="no_id">Test</item>
</summary>
</root>

XML::Rabbit makes it easy to consume XML files. You declare, using XPath syntax, what you are looking for in the XML, and XML::Rabbit returns objects according to that definition.

Definition:

# Root class of the mapping: exposes every node matched by './book' as a
# Bookstore::Book object through the 'books' accessor.
package Bookstore;
use XML::Rabbit::Root;

has_xpath_object_list(
    books => './book' => 'Bookstore::Book',
);

finalize_class();
# One <book> entry. Each accessor below is bound to an XPath expression
# that is evaluated relative to the book node.
package Bookstore::Book;
use XML::Rabbit;

# Plain values: the 'id' attribute and the simple child elements.
has_xpath_value( bookid       => './@id' );
has_xpath_value( author       => './author' );
has_xpath_value( title        => './title' );
has_xpath_value( genre        => './genre' );
has_xpath_value( price        => './price' );
has_xpath_value( publish_date => './publish_date' );
has_xpath_value( description  => './description' );

# Nested object built from the <purchase_data> child.
has_xpath_object(
    purchase_data => './purchase_data' => 'Bookstore::Purchase',
);

finalize_class();
# The <purchase_data> node of a book: purchase price and purchase date.
package Bookstore::Purchase;
use XML::Rabbit;

has_xpath_value( price => './price' );
has_xpath_value( date  => './date' );

finalize_class();

XML Consumption:

use strict;
use warnings;
use utf8;

package Library;
use feature qw(say);
use Carp;
use autodie;

# Load the catalogue from sample.xml and print a short record for each book.
say "Showing data information";
my $bookstore = Bookstore->new( file => './sample.xml' );

for my $book ( @{ $bookstore->books } ) {
    my $id = $book->bookid;
    say "ID: " . $id;

    my $title = $book->title;
    say "Title: " . $title;

    # The extra "\n" leaves a blank line between consecutive records.
    my $author = $book->author;
    say "Author: " . $author, "\n";
}

Notes:

Please be careful with the following:

  • The first class has to use `XML::Rabbit::Root`. It will place you inside the main tag of the XML document. In our case it will place us inside `<catalog>`.
  • Nested classes that are optional need to be accessed inside a try/catch (or `eval` / `$@` check) block. Optional fields will simply return `null`. For example, for `purchase_data` the loop would be:
  • foreach my $book( @{$bookstore->books} ) {
    say "ID: " . $book->bookid;
    say "Title: " . $book->title;
    say "Author: " . $book->author;
    try {
    say "Purchase price: ". $book->purchase_data->price, "\n";
    } catch {
    say "No purchase price available\n";
    }
    }

    sample.xml

    <?xml version="1.0"?>
    <catalog>
    <book id="bk101">
    <author>Gambardella, Matthew</author>
    <title>XML Developer's Guide</title>
    <genre>Computer</genre>
    <price>44.95</price>
    <publish_date>2000-10-01</publish_date>
    <description>An in-depth look at creating applications
    with XML.</description>
    </book>
    <book id="bk102">
    <author>Ralls, Kim</author>
    <title>Midnight Rain</title>
    <genre>Fantasy</genre>
    <price>5.95</price>
    <publish_date>2000-12-16</publish_date>
    <description>A former architect battles corporate zombies,
    an evil sorceress, and her own childhood to become queen
    of the world.</description>
    </book>
    <book id="bk103">
    <author>Corets, Eva</author>
    <title>Maeve Ascendant</title>
    <genre>Fantasy</genre>
    <price>5.95</price>
    <publish_date>2000-11-17</publish_date>
    <description>After the collapse of a nanotechnology
    society in England, the young survivors lay the
    foundation for a new society.</description>
    </book>
    <book id="bk104">
    <author>Corets, Eva</author>
    <title>Oberon's Legacy</title>
    <genre>Fantasy</genre>
    <price>5.95</price>
    <publish_date>2001-03-10</publish_date>
    <description>In post-apocalypse England, the mysterious
    agent known only as Oberon helps to create a new life
    for the inhabitants of London. Sequel to Maeve
    Ascendant.</description>
    <purchase_data>
    <date>2001-12-21</date>
    <price>20</price>
    </purchase_data>
    </book>
    </catalog>
    # Reads 'sample.xml' (the catalog document given in the XML::Rabbit
    # example) with XML::LibXML and prints the first <title> plus the price
    # of the book with id 'bk104'.
    # Module requirements (1.70 and above for use of load_xml)
    use XML::LibXML '1.70';
    # let's be a good perl dev
    use strict;
    use warnings 'all';

    # Where we are retrieving the XML from
    my $file = 'sample.xml';

    # Load the XML from the file. load_xml is called as a class method, so
    # no separate parser object has to be created first.
    my $dom = XML::LibXML->load_xml(
        location => $file
    );

    # get the docroot
    my $root = $dom->getDocumentElement;

    # if the root element has children
    if ( $root->hasChildNodes ) {
        # getElementsByLocalName returns a node list of all elements whose
        # localname matches 'title', and we want the first occurrence
        # (via get_node(1)).
        my $titles = $root->getElementsByLocalName('title');

        # In scalar context the call above always yields a NodeList object,
        # even when nothing matched, so test the extracted node instead.
        my $node = $titles->get_node(1);
        if ( defined $node ) {
            # Get the text of the target node
            my $title_text = $node->textContent;
            print "The first node with name 'title' contains: $title_text\n";
        }

        # The above calls can be combined, but is possibly prone to errors
        # (if the getElementsByLocalName() failed to match a node).
        #
        # my $title_text = $root->getElementsByLocalName('title')->get_node(1)->textContent;
    }

    # Using Xpath, get the price of the book with id 'bk104'
    #
    # Set our xpath
    my $xpath = q!/catalog/book[@id='bk104']/price!;

    # Evaluate the xpath once; in list context findnodes returns the matching
    # nodes, so an empty result simply leaves $price_node undefined.
    my ($price_node) = $root->findnodes($xpath);
    if ( defined $price_node ) {
        # pull in the text of that node
        my $match_text = $price_node->textContent;
        print "The price of the book with id bk104 is: $match_text\n";
    }