The sources on this page are placed in the Public Domain.

Sample XML (sample.xml):

<data prefix="Message: ">
  <useful>Hello</useful>
  <boring>
    <totally/>
    <unnecessary>tag</unnecessary>
  </boring>
  <useful>cruel</useful>
  <meta_info> </meta_info>
  <useful>world!</useful>
  <useful>rest</useful>
</data>

The parsing script itself; it reads the XML listed above as a file:

require 'rubygems'
require 'xml'
 
# Debug helper: when the global $debug flag is set, writes one line to $stderr
# describing the node the reader +p+ currently points at (depth, node type,
# name, empty-element flag and value).  Does nothing otherwise.  Returns nil.
def stats(p)
  return unless $debug

  fields = [p.depth, p.node_type, p.name, p.empty_element?, p.value]
  $stderr.puts format("%d %d %p %p %p", *fields)
end
# Monkey-patch LibXML facilities for convenience
# The reader is re-opened so that +read+ and +next+ transparently skip
# whitespace and comment nodes, and a few cursor-inspection helpers are added.
# The library's own methods stay reachable as +super_read+ and +super_next+.
class LibXML::XML::Reader
  # Keep the library's own +read+ reachable.  The guard keeps a second load of
  # this file from aliasing super_read to the already-patched method, which
  # would make skipping_read call itself forever.
  alias_method :super_read, :read unless method_defined?(:super_read)

  # Advances to the next node that is not whitespace or a comment and prints
  # debug stats.  Returns whatever the last underlying read returned, so the
  # usual `while reader.read` idiom keeps working with the patched +read+.
  def skipping_read
    result = super_read
    skipped = raw_read_whitespace
    # Print stats
    stats self
    skipped.nil? ? result : skipped
  end
  alias_method :read, :skipping_read

  # While the current node is whitespace or a comment, keeps reading with the
  # library's own +read+.  Returns the result of the last read performed, or
  # nil when the current node was already meaningful and nothing was read.
  def raw_read_whitespace
    last_result = nil
    while node_type == TYPE_WHITESPACE || node_type == TYPE_SIGNIFICANT_WHITESPACE || node_type == TYPE_COMMENT
      # Read one more
      last_result = super_read
    end
    last_result
  end

  # If the peeked data is whitespace, read until we encounter a meaningful tag.
  # Prints debug stats; returns the same value as raw_read_whitespace.
  def read_whitespace
    skipped = raw_read_whitespace
    # Print stats
    stats self
    skipped
  end

  # Keep the library's own +next+ reachable (same reload guard as for +read+).
  alias_method :super_next, :next unless method_defined?(:super_next)

  # Calls the library's +next+ and then skips whitespace and comments.
  # Returns the result of the last underlying call that moved the cursor.
  def skipping_next
    result = super_next
    skipped = read_whitespace
    skipped.nil? ? result : skipped
  end
  alias_method :next, :skipping_next

  # Returns if this stream points to an end of a tag
  def end_of?(tag)
    node_type == TYPE_END_ELEMENT && name == tag
  end

  # Returns if this stream points to a start of a tag
  def start_of?(tag)
    node_type == TYPE_ELEMENT && name == tag
  end

  # If current node is a start of one of these tags, returns its name; otherwise, returns nil
  def start_of_these?(*tags)
    return nil if node_type != TYPE_ELEMENT
    tags.flatten.include?(name) ? name : nil
  end

  # Returns XML::Node of the current tag and moves the cursor with skipping_next.
  # NOTE(review): the libxml-ruby docs say an expanded node is only valid until
  # the next read; confirm whether callers must copy the node before using it.
  def consume
    node = expand
    skipping_next
    node
  end

  # Consumes the content of the tag peeked, the tag is expected in form <tag>content</tag>.
  # Returns the text inside the tag, '' for <tag></tag>, or nil for an empty
  # element (<tag/>).  In every case the cursor is then advanced with the
  # whitespace-skipping +read+.
  def consume_contents
    if empty_element?
      # <tag/> has neither content nor an end tag.  Advance with the skipping
      # read so that the cursor does not stop on the whitespace that follows.
      read
      return nil
    end

    # We should use super_read here, because the contents of a tag may be
    # whitespace--and we don't want to miss it this time!
    super_read
    if node_type == TYPE_END_ELEMENT
      # There was no text (e.g. <a></a>): we are already on the end tag
      contents = ''
    else
      contents = value
      # Move from the text node to the end tag
      read
    end
    # Step over the end tag
    read
    contents
  end
end
 
# Raised by UsefulFactory#yield when the prefix, the delimiter or the items
# needed to build a message are still missing.  Derives from StandardError
# (not Exception) so that a plain `rescue` catches it as well.
class NotEnoughData < StandardError
end
 
# Collects the text of <useful> tags and prints it as one line, prefixed and
# joined with a delimiter, as soon as the prefix, the delimiter and at least
# one item are all known.
class UsefulFactory
  # kv - options hash; the keys :prefix and :delimiter are read, both optional.
  def initialize(kv = {})
    re_initialize(kv)
    # Ruby note: variables that start with @ are instance variables
    @useful_items = []
  end

  # Updates the prefix and/or the delimiter.  A key that is missing (or whose
  # value is nil/false) leaves the current setting untouched.
  def re_initialize(kv = {})
    @prefix = kv[:prefix] if kv[:prefix]
    @delimiter = kv[:delimiter] if kv[:delimiter]
  end

  # Reads the contents of the <useful> tag +docstream+ points at, stores them
  # and prints the accumulated message if there is already enough data.
  # Raises RuntimeError when the stream is not at the start of a <useful> tag.
  # Returns nil.
  def consume_tag_if_applicable(docstream)
    fail 'expected the start of a <useful> tag' unless docstream.start_of? 'useful'

    # We are at the beginning of a useful tag
    @useful_items << docstream.consume_contents
    # If we want to yield data as soon as they appear in full form, we do this.
    # Otherwise, this begin-rescue sequence may be omitted
    begin
      # #yield prints the message itself; wrapping it in another puts would
      # print its return value (an empty array) as an extra blank line.
      self.yield
    rescue NotEnoughData
      # Prefix or delimiter not known yet: keep accumulating
    end
    nil
  end

  # Prints prefix + items joined by the delimiter to standard output and
  # empties the item cache.  Returns the new, empty item list.
  # Raises NotEnoughData if the prefix, the delimiter or the items are missing.
  def yield
    raise NotEnoughData if @prefix.nil? || @delimiter.nil? || @useful_items.empty?
    # If there was enough data, then do the job with the stuff we've accumulated, and clear the cache
    puts @prefix + @useful_items.join(@delimiter)
    @useful_items = []
  end
end
 
 
# Open the sample document with the streaming reader
docstream = XML::Reader.file 'sample.xml'

# Advance until the cursor is on the opening <data> tag
docstream.read until docstream.start_of?('data')

# The prefix comes from an attribute of <data>; the delimiter arrives later in <meta_info>
useful_factory = UsefulFactory.new(:prefix => docstream['prefix'])

# Step inside <data>: the cursor is now on its first child, or on </data>
docstream.read

until docstream.end_of?('data')
  # Every iteration must start on the opening tag of a child of <data>
  fail unless docstream.node_type == XML::Reader::TYPE_ELEMENT

  case docstream.name
  when 'useful'
    # The factory reads the tag contents and advances the cursor itself
    useful_factory.consume_tag_if_applicable(docstream)
  when 'meta_info'
    # consume_contents returns the text of the tag and advances the cursor.
    # The text is handed over explicitly so it could also be passed to other factories.
    useful_factory.re_initialize(:delimiter => docstream.consume_contents)
  else
    # Not interesting: jump over this child
    docstream.next
  end
  # The cursor is on the next child of <data>, or on </data>
end

# We reached </data>: flush whatever is still cached, if there is enough data
begin
  useful_factory.yield
rescue NotEnoughData
end

Output:

Message: Hello cruel world!
Message: rest