# XML Parsing

# Parsing with XML::Twig

#!/usr/bin/env perl

use strict;
use warnings 'all';

use XML::Twig;

# Parse the XML document embedded after __DATA__.
my $twig = XML::Twig->parse( \*DATA );

# We can use the 'root' method to find the root of the XML.
my $root = $twig->root;

# first_child finds the first child element matching a value.
my $title = $root->first_child('title');

# text reads the text of the element.
my $title_text = $title->text;

print "Title is: ", $title_text, "\n";

# The above could be combined:
print $twig->root->first_child_text('title'), "\n";

## You can use the 'children' method to iterate over multiple items:
my $list = $twig->root->first_child('list');

# children can optionally take an element 'tag' - otherwise it just returns all of them.
foreach my $element ( $list->children ) {

    # The 'att' method reads an attribute. The // fallback covers elements
    # that carry no 'id' attribute (such as <notanitem>).
    print "Element with ID: ", $element->att('id') // 'none here', " is ", $element->text,
      "\n";
}

# And if we need to do something more complicated, we can use 'xpath'.
# get_xpath or findnodes do the same thing:
# return a list of matches, or if you specify a second numeric argument, just that numbered match.

# xpath syntax is fairly extensive, but in this one - we search:
#   anywhere in the tree: //
#   nodes called 'item'
#   with an id attribute [@id]
#   and with that id attribute equal to "1000".
# By specifying '0' we say 'return just the first match'.

print "Item 1000 is: ", $twig->get_xpath( '//item[@id="1000"]', 0 )->text, "\n";

# This combines quite well with `map` to e.g. do the same thing on multiple items.
# The trailing "\n" terminates the last line of output.
print "All IDs:\n", join( "\n", map { $_->att('id') } $twig->get_xpath('//item') ), "\n";

# Note how this also finds the item under 'summary', because of //

__DATA__
<?xml version="1.0" encoding="utf-8"?>
<root>
  <title>some sample xml</title>
  <first key="value" key2="value2">
    <second>Some text</second>
  </first>
  <third>
    <fourth key3="value">Text here too</fourth>
  </third>
  <list>
     <item id="1">Item1</item>
     <item id="2">Item2</item>
     <item id="3">Item3</item>
     <item id="66">Item66</item>
     <item id="88">Item88</item>
     <item id="100">Item100</item>
     <item id="1000">Item1000</item>
     <notanitem>Not an item at all really.</notanitem>
  </list>
  <summary>
     <item id="no_id">Test</item>
  </summary>
</root>

# Consuming XML with XML::Rabbit

With XML::Rabbit it is possible to consume XML files easily. You declare, using XPath syntax, what you are looking for in the XML, and XML::Rabbit returns objects according to that definition.

Definition:

# Root class of the mapping: 'books' yields one Bookstore::Book object for
# every node matched by the './book' XPath.
package Bookstore;
use XML::Rabbit::Root;

has_xpath_object_list(
    books => './book' => 'Bookstore::Book',
);

finalize_class();
  
# Mapping for a single book node: one accessor per XPath below.
package Bookstore::Book;
use XML::Rabbit;

# 'bookid' comes from the id attribute; the rest are child elements.
has_xpath_value( bookid       => './@id' );
has_xpath_value( author       => './author' );
has_xpath_value( title        => './title' );
has_xpath_value( genre        => './genre' );
has_xpath_value( price        => './price' );
has_xpath_value( publish_date => './publish_date' );
has_xpath_value( description  => './description' );

# Nested node mapped onto its own class.
has_xpath_object(
    purchase_data => './purchase_data' => 'Bookstore::Purchase',
);

finalize_class();

# Mapping for a purchase_data node: exposes its price and date children.
package Bookstore::Purchase;
use XML::Rabbit;

has_xpath_value( price => './price' );
has_xpath_value( date  => './date' );

finalize_class();

XML Consumption:

use strict;
use warnings;
use utf8;
  
package Library;
use feature qw(say);
use Carp;
use autodie;
 
say "Showing data information";
my $bookstore = Bookstore->new( file => './sample.xml' );

# Print the identifying fields of every book found in the file.
for my $book ( @{ $bookstore->books } ) {
    my $id     = $book->bookid;
    my $title  = $book->title;
    my $author = $book->author;

    say "ID: $id";
    say "Title: $title";

    # The extra "\n" (on top of the one say appends) leaves a blank line
    # between records.
    say "Author: $author", "\n";
}

Notes:

Please be careful with the following:

  • The first class has to use `XML::Rabbit::Root`. It will place you inside the main tag of the XML document. In our case it will place us inside `<catalog>`.
  • Nested classes may be optional. Such classes need to be accessed inside a try/catch (or `eval` / `$@` check) block. Optional fields, by contrast, simply return an empty value. For example, for `purchase_data` the loop would be:
  • foreach my $book ( @{ $bookstore->books } ) {
        say "ID: " . $book->bookid;
        say "Title: " . $book->title;
        say "Author: " . $book->author;
        # purchase_data is optional, so the lookup is wrapped in a core
        # eval block: a book without it must not abort the loop.
        # The trailing 1 is only reached when nothing inside the block died.
        my $has_purchase = eval {
            say "Purchase price: " . $book->purchase_data->price, "\n";
            1;
        };
        say "No purchase price available\n" if !$has_purchase;
    }
    
    

    sample.xml

    <?xml version="1.0"?>
    <catalog>
       <book id="bk101">
          <author>Gambardella, Matthew</author>
          <title>XML Developer's Guide</title>
          <genre>Computer</genre>
          <price>44.95</price>
          <publish_date>2000-10-01</publish_date>
          <description>An in-depth look at creating applications 
          with XML.</description>
       </book>
       <book id="bk102">
          <author>Ralls, Kim</author>
          <title>Midnight Rain</title>
          <genre>Fantasy</genre>
          <price>5.95</price>
          <publish_date>2000-12-16</publish_date>
          <description>A former architect battles corporate zombies, 
          an evil sorceress, and her own childhood to become queen 
          of the world.</description>
       </book>
       <book id="bk103">
          <author>Corets, Eva</author>
          <title>Maeve Ascendant</title>
          <genre>Fantasy</genre>
          <price>5.95</price>
          <publish_date>2000-11-17</publish_date>
          <description>After the collapse of a nanotechnology 
          society in England, the young survivors lay the 
          foundation for a new society.</description>
       </book>
       <book id="bk104">
          <author>Corets, Eva</author>
          <title>Oberon's Legacy</title>
          <genre>Fantasy</genre>
          <price>5.95</price>
          <publish_date>2001-03-10</publish_date>
          <description>In post-apocalypse England, the mysterious 
          agent known only as Oberon helps to create a new life 
          for the inhabitants of London. Sequel to Maeve 
          Ascendant.</description>
          <purchase_data>
            <date>2001-12-21</date>
            <price>20</price>
          </purchase_data>
       </book>
    </catalog>
    
    

    # Parsing with XML::LibXML

    # This uses the 'sample.xml' given in the XML::Rabbit example.
    
    # Module requirements (1.70 and above for use of load_xml)
    use XML::LibXML '1.70';
    
    # let's be a good perl dev
    use strict;
    use warnings 'all';
    
    # Create the LibXML parser object
    my $xml = XML::LibXML->new();

    # Where we are retrieving the XML from
    my $file = 'sample.xml';

    # Load the XML from the file, using the parser created above
    my $dom = $xml->load_xml( location => $file );

    # get the docroot
    my $root = $dom->getDocumentElement;

    # if the document has children
    if ( $root->hasChildNodes ) {

        # In scalar context getElementsByLocalName returns a node list of all
        # elements whose localname matches 'title'. That list object is
        # defined even when it is empty, so the node itself is tested below.
        my $title = $root->getElementsByLocalName('title');

        # Get the first matched node out of the node list (positions are
        # 1-based); this is undef when nothing matched.
        my $node = $title->get_node(1);

        if ( defined $node ) {

            # Get the text of the target node
            my $title_text = $node->textContent;

            print "The first node with name 'title' contains: $title_text\n";
        }

        # The above calls can be combined, but the chain dies if
        # getElementsByLocalName() failed to match a node:
        #
        # my $title_text = $root->getElementsByLocalName('title')->get_node(1)->textContent;
    }

    # Using XPath, get the price of the book with id 'bk104'
    #

    # Set our xpath
    my $xpath = q!/catalog/book[@id='bk104']/price!;

    # In list context findnodes returns the matching nodes; keep the first.
    # $price_node stays undef when the xpath matches nothing, so a single
    # evaluation replaces the separate exists() + find() calls.
    my ($price_node) = $root->findnodes($xpath);

    if ( defined $price_node ) {

        # pull in the text of that node
        my $match_text = $price_node->textContent;

        print "The price of the book with id bk104 is: $match_text\n";
    }