]> git.pld-linux.org Git - packages/ruby-DelimScanner.git/commitdiff
- added
authoraredridel <aredridel@pld-linux.org>
Sun, 27 Jun 2004 07:32:48 +0000 (07:32 +0000)
committercvs2git <feedback@pld-linux.org>
Sun, 24 Jun 2012 12:13:13 +0000 (12:13 +0000)
Changed files:
    DelimScanner.rb?dl=1 -> 1.1

DelimScanner.rb?dl=1 [new file with mode: 0644]

diff --git a/DelimScanner.rb?dl=1 b/DelimScanner.rb?dl=1
new file mode 100644 (file)
index 0000000..b36aede
--- /dev/null
@@ -0,0 +1,1259 @@
+#!/usr/bin/ruby
+# 
+# A derivative of StringScanner that can scan for delimited constructs in
+# addition to regular expressions. It is a loose port of the Text::Balanced
+# module for Perl by Damian Conway <damian@cs.monash.edu.au>.
+# 
+# == Synopsis
+# 
+#   se = DelimScanner::new( myString )
+# 
+# == Authors
+# 
+# * Michael Granger <ged@FaerieMUD.org>
+# 
+# Copyright (c) 2002, 2003 The FaerieMUD Consortium. Most rights reserved.
+# 
+# This work is licensed under the Creative Commons Attribution License. To view
+# a copy of this license, visit http://creativecommons.org/licenses/by/1.0 or
+# send a letter to Creative Commons, 559 Nathan Abbott Way, Stanford, California
+# 94305, USA.
+# 
+# == Version
+#
+#  $Id$
+# 
+
+require 'strscan'
+require 'forwardable'
+
+### Add some stuff to the String class to allow easy transformation to Regexp
+### and in-place interpolation.
+class String
+       def to_re( casefold=false, extended=false )
+               return Regexp::new( self.dup )
+       end
+
+       ### Ideas for String-interpolation stuff courtesy of Hal E. Fulton
+       ### <hal9000@hypermetrics.com> via ruby-talk
+
+    def interpolate( scope )
+        unless scope.is_a?( Binding )
+            raise TypeError, "Argument to interpolate must be a Binding, not "\
+                "a #{scope.class.name}"
+        end
+
+               # $stderr.puts ">>> Interpolating '#{self}'..."
+
+        copy = self.gsub( /"/, %q:\": )
+        eval( '"' + copy + '"', scope )
+    end
+
+end
+
+
+### A derivative of StringScanner that can scan for delimited constructs in
+### addition to regular expressions.
+class DelimScanner
+
+       ### Scanner exception classes
+       class MatchFailure < RuntimeError ; end
+       class DelimiterError < RuntimeError ; end
+       
+
+       extend Forwardable
+       StringScanner.must_C_version
+
+
+       ### Class constants
+       # Numeric part of the RCS revision keyword. When the keyword has not been
+       # expanded there are no digits to find, so fall back to '0' rather than
+       # indexing into a nil match (which would abort loading of this file).
+       Version = %q{$Revision$}[ /[\d\.]+/ ] || '0'
+       Rcsid = %q$Id$
+
+       # Pattern to match a valid XML name
+       XmlName = '[a-zA-Z_:][a-zA-Z0-9:.-]*'
+
+
+       ### Namespace module for DelimString constants
+       module Default
+               # This module is mixed into DelimScanner (via include), so the
+               # constants below can be referenced unqualified from its methods,
+               # e.g. as the default argument of scanCodeblock.
+
+               # The list of default opening => closing codeblock delimiters to use for
+               # scanCodeblock.
+               CodeblockDelimiters = {
+                       '{'             => '}',
+                       'begin' => 'end',
+                       'do'    => 'end',
+               }
+
+               # Default scanMultiple operations and their arguments
+               # NOTE(review): as written this is an Array holding a single Hash
+               # (method name => argument list); confirm that is the shape the
+               # consumer expects.
+               MultipleFunctions = [
+                       :scanVariable   => [],
+                       :scanQuotelike  => [],
+                       :scanCodeblock  => [],
+               ]
+
+       end
+       include Default
+
+
+       ### Define delegating methods that cast their argument to a Regexp from a
+       ### String. This allows the scanner's scanning methods to be called with
+       ### Strings in addition to Regexps. This was mostly stolen from
+       ### forwardable.rb.
+       def self.def_casting_delegators( *methods )
+               methods.each {|methodName|
+                       class_eval( <<-EOF, "(--def_casting_delegators--)", 1 )
+                               def #{methodName}( pattern )
+                                       pattern = pattern.to_s.to_re unless pattern.is_a?( Regexp )
+                                       @scanner.#{methodName}( pattern )
+                               end
+                       EOF
+               }
+       end
+
+       
+       ### Create a new DelimScanner object for the specified <tt>string</tt>. If
+       ### <tt>dup</tt> is <tt>true</tt>, a duplicate of the target string will be
+       ### used instead of the one given. The target string will be frozen after
+       ### the scanner is created.
+       def initialize( string, dup=true )
+               @scanner        = StringScanner::new( string, dup )
+               @matchError = nil
+               @debugLevel     = 0
+       end
+
+
+       
+       ######
+       public
+       ######
+
+       # Here, some delegation trickery is done to make a DelimScanner behave like
+       # a StringScanner. Some methods are directly delegated, while some are
+       # delegated via a method which casts its argument to a Regexp first so some
+       # scanner methods can be called with Strings as well as Regexps.
+
+       # A list of delegated methods that need casting.
+       # Names of the scanner methods whose single argument is a pattern; each of
+       # these gets a casting delegator instead of a plain one.
+       NeedCastingDelegators = :scan, :skip, :match?, :check,
+               :scan_until, :skip_until, :exist?, :check_until
+
+       # Delegate all StringScanner instance methods to the associated scanner
+       # object, except those that need a casting delegator, which uses an indirect
+       # delegation method.
+       # NOTE(review): the subtraction below compares against String names
+       # (id2name). If instance_methods returns Symbols on the running Ruby,
+       # nothing is removed here and the casting delegators defined afterwards
+       # simply replace the plain ones -- confirm this is acceptable.
+       def_delegators :@scanner,
+               *( StringScanner.instance_methods - NeedCastingDelegators.collect {|sym| sym.id2name} )
+
+       # Must come after def_delegators so the casting versions are the ones
+       # left defined.
+       def_casting_delegators( *NeedCastingDelegators )
+
+
+       
+       # The last match error encountered by the scanner
+       attr_accessor :matchError
+       protected :matchError=  ;       # ; is to work around a ruby-mode indent bug
+       
+       # Debugging level
+       attr_accessor :debugLevel
+
+
+       
+       ### Returns <tt>true</tt> if the scanner has encountered a match error.
+       def matchError?
+               return ! @matchError.nil?
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring delimited by the
+       ### specified <tt>delimiters</tt>, skipping the specified <tt>prefix</tt>
+       ### and any character escaped by the specified <tt>escape</tt>
+       ### character/s. If matched, advances the scan pointer and returns a Hash
+       ### with the following key/value pairs on success:
+       ### 
+       ### [<tt>:match</tt>]
+       ###   The text of the match, including delimiters.
+       ### [<tt>:prefix</tt>]
+       ###   The matched prefix, if any.
+       ###
+       ### If the match fails, returns nil.
+       def scanDelimited( delimiters="'\"`", prefix='\\s*', escape='\\' )
+               delimiters      ||= "'\"`"
+               prefix          ||= '\\s*'
+               escape          ||= '\\'
+
+               debugMsg( 1, "Scanning for delimited text: delim = (%s), prefix=(%s), escape=(%s)",
+                                 delimiters, prefix, escape )
+               self.matchError = nil
+
+               # Try to match the prefix first to get the length
+               unless (( prefixLength = self.match?(prefix.to_re) ))
+                       self.matchError = "Failed to match prefix '%s' at offset %d" %
+                               [ prefix, self.pointer ]
+                       return nil
+               end
+                       
+               # Now build a delimited pattern with the specified parameters.
+               delimPattern = makeDelimPattern( delimiters, escape, prefix )
+               debugMsg( 2, "Delimiter pattern is %s" % delimPattern.inspect )
+
+               # Fail if no match
+               unless (( matchedString = self.scan(delimPattern) ))
+                       self.matchError = "No delimited string found."
+                       return nil
+               end
+
+               return {
+                       :match  => matchedString[prefixLength .. -1],
+                       :prefix => matchedString[0..prefixLength-1],
+               }
+       end
+
+
+       ### Match using the #scanDelimited method, but only return the match or nil.
+       def extractDelimited( *args )
+               rval = scanDelimited( *args ) or return nil
+               return rval[:match]
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring delimited by the
+       ### specified <tt>delimiters</tt>, skipping the specified <tt>prefix</tt>
+       ### and any character escaped by the specified <tt>escape</tt>
+       ### character/s. If matched, advances the scan pointer and returns the
+       ### length of the matched string; if it fails the match, returns nil.
+       def skipDelimited( delimiters="'\"`", prefix='\\s*', escape='\\' )
+               delimiters      ||= "'\"`"
+               prefix          ||= '\\s*'
+               escape          ||= '\\'
+
+               self.matchError = nil
+               return self.skip( makeDelimPattern(delimiters, escape, prefix) )
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring delimited by
+       ### balanced <tt>delimiters</tt> of the type specified, after skipping the
+       ### specified <tt>prefix</tt>. On a successful match, this method advances
+       ### the scan pointer and returns a Hash with the following key/value pairs:
+       ### 
+       ### [<tt>:match</tt>]
+       ###   The text of the match, including the delimiting brackets.
+       ### [<tt>:prefix</tt>]
+       ###   The matched prefix, if any.
+       ###
+       ### On failure, returns nil.
+       def scanBracketed( delimiters="{([<", prefix='\s*' )
+               # Explicit nil arguments select the defaults too.
+               delimiters      ||= "{([<"
+               prefix          ||= '\s*'
+
+               prefix = prefix.to_re unless prefix.kind_of?( Regexp )
+
+               debugMsg( 1, "Scanning for bracketed text: delimiters = (%s), prefix = (%s)",
+                                 delimiters, prefix )
+
+               self.matchError = nil
+
+               # Split the left-delimiters (brackets) from the quote delimiters.
+               # qdel becomes an alternation of the quote characters present in the
+               # delimiter string (or nil if there are none); a 'q' in the delimiter
+               # string turns on recognition of quotelike constructs.
+               ldel = delimiters.dup
+               qdel = ldel.squeeze.split(//).find_all {|char| char =~ /["'`]/ }.join('|')
+               qdel = nil if qdel.empty?
+               quotelike = true if ldel =~ /q/
+
+               # Change all instances of delimiters to the left-hand versions, and
+               # strip away anything but bracketing delimiters
+               ldel = ldel.tr( '[](){}<>', '[[(({{<<' ).gsub(/[^#{Regexp.quote('[\\](){}<>')}]+/, '').squeeze
+
+               ### Now build the right-delim equivalent of the left delim string
+               # tr! returns nil when it changed nothing, i.e. when no bracket
+               # character survived the stripping above.
+               rdel = ldel.dup
+               unless rdel.tr!( '[({<', '])}>' )
+                       raise DelimiterError, "Did not find a suitable bracket in delimiter: '#{delimiters}'"
+               end
+
+               # Build regexps from both bracketing delimiter strings
+               ldel = ldel.split(//).collect {|ch| Regexp.quote(ch)}.join('|')
+               rdel = rdel.split(//).collect {|ch| Regexp.quote(ch)}.join('|')
+
+               depth = self.scanDepth
+               result = nil
+               startPos = self.pointer
+
+               # On any failure the scan pointer is put back where it started. A
+               # MatchFailure is recorded as the match error and turned into a nil
+               # result; any other error is re-raised to the caller.
+               begin
+                       result = matchBracketed( prefix, ldel, qdel, quotelike, rdel )
+               rescue MatchFailure => e
+                       debugMsg( depth + 1, "Match error: %s" % e.message )
+                       self.matchError = e.message
+                       self.pointer = startPos
+                       result = nil
+               rescue => e
+                       self.pointer = startPos
+                       Kernel::raise
+               end
+
+               return result
+       end
+
+
+       ### Match using the #scanBracketed method, but only return the match or nil.
+       def extractBracketed( *args )
+               rval = scanBracketed( *args ) or return nil
+               return rval[:match]
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring with
+       ### #scanBracketed. On a successful match, this method advances the scan
+       ### pointer and returns the length of the match, including the delimiters
+       ### and any prefix that was skipped. On failure, returns nil.
+       def skipBracketed( *args )
+               startPos = self.pointer
+
+               match = scanBracketed( *args )
+
+               return nil unless match
+               return match.length + prefix.length
+       ensure
+               debugMsg( 2, "Resetting scan pointer." )
+               self.pointer = startPos
+       end
+
+
+       ### Extracts and segments text from the scan pointer forward that occurs
+       ### between (balanced) specified tags, after skipping the specified
+       ### <tt>prefix</tt>. If the opentag argument is <tt>nil</tt>, a pattern which
+       ### will match any standard HTML/XML tag will be used. If the
+       ### <tt>closetag</tt> argument is <tt>nil</tt>, a pattern is created which
+       ### prepends a <tt>/</tt> character to the matched opening tag, after any
+       ### bracketing characters. The <tt>options</tt> argument is a Hash of one or
+       ### more options which govern the matching operation. They are described in
+       ### more detail in the Description section of 'lib/DelimScanner.rb'. On a
+       ### successful match, this method advances the scan pointer and returns a
+       ### Hash with the following key/value pairs:
+       ### 
+       ### [<tt>:match</tt>]
+       ###   The text of the match, including the delimiting tags.
+       ### [<tt>:prefix</tt>]
+       ###   The matched prefix, if any.
+       ###
+       ### On failure, returns nil.
+       def scanTagged( opentag=nil, closetag=nil, prefix='\s*', options={} )
+               prefix ||= '\s*'
+
+               ldel = opentag || %Q,<\\w+(?:#{ makeDelimPattern(%q:'":) }|[^>])*>,
+               rdel = closetag
+               raise ArgumentError, "Options argument must be a hash" unless options.kind_of?( Hash )
+
+               failmode        = options[:fail]
+               bad             = if options[:reject].is_a?( Array ) then
+                                         options[:reject].join("|")
+                                 else
+                                         (options[:reject] || '')
+                                 end
+               ignore  = if options[:ignore].is_a?( Array ) then
+                                         options[:ignore].join("|")
+                                 else
+                                         (options[:ignore] || '')
+                                 end
+
+               self.matchError = nil
+               result                  = nil
+               startPos                = self.pointer
+
+               depth = self.scanDepth
+
+               begin
+                       result = matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
+               rescue MatchFailure => e
+                       debugMsg( depth + 1, "Match error: %s" % e.message )
+                       self.matchError = e.message
+                       self.pointer = startPos
+                       result = nil
+               rescue => e
+                       self.pointer = startPos
+                       Kernel::raise
+               end
+
+               return result
+       end
+
+
+       ### Match using the #scanTagged method, but only return the match or nil.
+       def extractTagged( *args )
+               rval = scanTagged( *args ) or return nil
+               return rval[:match]
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring with
+       ### #scanTagged. On a successful match, this method advances the scan
+       ### pointer and returns the length of the match, including any delimiters
+       ### and any prefix that was skipped. On failure, returns nil.
+       def skipTagged( *args )
+               startPos = self.pointer
+
+               match = scanTagged( *args )
+
+               return nil unless match
+               return match.length + prefix.length
+       ensure
+               debugMsg( 2, "Resetting scan pointer." )
+               self.pointer = startPos
+       end
+
+
+       # :NOTE:
+       # Since the extract_quotelike function isn't documented at all in
+       # Text::Balanced, I'm only guessing this is correct...
+
+       ### Starting from the scan pointer, try to match any one of the various Ruby
+       ### quotes and quotelike operators after skipping the specified
+       ### <tt>prefix</tt>.  Nested backslashed delimiters, embedded balanced
+       ### bracket delimiters (for the quotelike operators), and trailing modifiers
+       ### are all caught. If <tt>matchRawRegex</tt> is <tt>true</tt>, inline
+       ### regexen (eg., <tt>/pattern/</tt>) are matched as well. Advances the scan
+       ### pointer and returns a Hash with the following key/value pairs on
+       ### success:
+       ### 
+       ### [<tt>:match</tt>]
+       ###   The entire text of the match.
+       ### [<tt>:prefix</tt>]
+       ###   The matched prefix, if any.
+       ### [<tt>:quoteOp</tt>]
+       ###   The name of the quotelike operator (if any) (eg., '%Q', '%r', etc).
+       ### [<tt>:leftDelim</tt>]
+       ###   The left delimiter of the first block of the operation.
+       ### [<tt>:delimText</tt>]
+       ###   The text of the first block of the operation.
+       ### [<tt>:rightDelim</tt>]
+       ###   The right delimiter of the first block of the operation.
+       ### [<tt>:modifiers</tt>]
+       ###   The trailing modifiers on the operation (if any).
+       ### 
+       ### On failure, returns nil.
+       def scanQuotelike( prefix='\s*', matchRawRegex=true )
+
+               self.matchError = nil
+               result                  = nil
+               startPos                = self.pointer
+
+               depth = self.scanDepth
+
+               begin
+                       result = matchQuotelike( prefix, matchRawRegex )
+               rescue MatchFailure => e
+                       debugMsg( depth + 1, "Match error: %s" % e.message )
+                       self.matchError = e.message
+                       self.pointer = startPos
+                       result = nil
+               rescue => e
+                       self.pointer = startPos
+                       Kernel::raise
+               end
+
+               return result
+       end
+
+       
+       ### Match using the #scanQuotelike method, but only return the match or nil.
+       def extractQuotelike( *args )
+               rval = scanQuotelike( *args ) or return nil
+               return rval[:match]
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring with
+       ### #scanQuotelike. On a successful match, this method advances the scan
+       ### pointer and returns the length of the match, including any delimiters
+       ### and any prefix that was skipped. On failure, returns nil.
+       def skipQuotelike( *args )
+               startPos = self.pointer
+
+               match = scanQuotelike( *args )
+
+               return nil unless match
+               return match.length + prefix.length
+       ensure
+               debugMsg( 2, "Resetting scan pointer." )
+               self.pointer = startPos
+       end
+
+
+       ### Starting from the scan pointer, try to match a Ruby variable after
+       ### skipping the specified prefix.
+       def scanVariable( prefix='\s*' )
+               self.matchError = nil
+               result                  = nil
+               startPos                = self.pointer
+
+               depth = self.scanDepth
+
+               begin
+                       result = matchVariable( prefix )
+               rescue MatchFailure => e
+                       debugMsg( depth + 1, "Match error: %s" % e.message )
+                       self.matchError = e.message
+                       self.pointer = startPos
+                       result = nil
+               rescue => e
+                       self.pointer = startPos
+                       Kernel::raise
+               end
+
+               return result
+       end
+
+
+       ### Match using the #scanVariable method, but only return the match or nil.
+       def extractVariable( *args )
+               rval = scanVariable( *args ) or return nil
+               return rval[:match]
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring with
+       ### #scanVariable. On a successful match, this method advances the scan
+       ### pointer and returns the length of the match, including any delimiters
+       ### and any prefix that was skipped. On failure, returns nil.
+       def skipVariable( *args )
+               startPos = self.pointer
+
+               match = scanVariable( *args )
+
+               return nil unless match
+               return match.length + prefix.length
+       ensure
+               debugMsg( 2, "Resetting scan pointer." )
+               self.pointer = startPos
+       end
+
+
+       ### Starting from the scan pointer, and skipping the specified
+       ### <tt>prefix</tt>, try to recognize and match a balanced bracket-,
+       ### do/end-, or begin/end-delimited substring that may contain unbalanced
+       ### delimiters inside quotes or quotelike operations.
+       def scanCodeblock( innerDelim=CodeblockDelimiters, prefix='\s*', outerDelim=innerDelim )
+               self.matchError = nil
+               result                  = nil
+               startPos                = self.pointer
+
+               prefix                  ||= '\s*'
+               innerDelim              ||= CodeblockDelimiters
+               outerDelim              ||= innerDelim
+
+               depth = caller(1).find_all {|frame|
+                       frame =~ /in `scan(Variable|Tagged|Codeblock|Bracketed|Quotelike)'/
+               }.length
+
+               begin
+                       debugMsg 3, "Calling matchCodeBlock( %s, %s, %s )",
+                               prefix.inspect, innerDelim.inspect, outerDelim.inspect
+                       result = matchCodeblock( prefix, innerDelim, outerDelim )
+               rescue MatchFailure => e
+                       debugMsg( depth + 1, "Match error: %s" % e.message )
+                       self.matchError = e.message
+                       self.pointer = startPos
+                       result = nil
+               rescue => e
+                       self.pointer = startPos
+                       Kernel::raise
+               end
+
+               return result
+       end
+
+
+       ### Match using the #scanCodeblock method, but only return the match or nil.
+       def extractCodeblock( *args )
+               rval = scanCodeblock( *args ) or return nil
+               return rval[:match]
+       end
+
+
+       ### Starting at the scan pointer, try to match a substring with
+       ### #scanCodeblock. On a successful match, this method advances the scan
+       ### pointer and returns the length of the match, including any delimiters
+       ### and any prefix that was skipped. On failure, returns nil.
+       def skipCodeblock( *args )
+               startPos = self.pointer
+
+               match = scanCodeblock( *args )
+
+               return nil unless match
+               return match.length + prefix.length
+       ensure
+               debugMsg( 2, "Resetting scan pointer." )
+               self.pointer = startPos
+       end
+
+
+
+
+       #########
+       protected
+       #########
+
+       ### Scan the string from the scan pointer forward, skipping the specified
+       ### <tt>prefix</tt> and trying to match a string delimited by bracketing
+       ### delimiters <tt>ldel</tt> and <tt>rdel</tt> (Regexp objects), and quoting
+       ### delimiters <tt>qdel</tt> (Regexp). If <tt>quotelike</tt> is
+       ### <tt>true</tt>, Ruby quotelike constructs will also be honored.
+       def matchBracketed( prefix, ldel, qdel, quotelike, rdel )
+               # Returns a Hash with the bracketed text (:match) and the skipped
+               # prefix (:prefix), leaving the scan pointer just past the closing
+               # bracket. Raises MatchFailure if the prefix or opening bracket is
+               # missing, or on an unbalanced/mismatched bracket or unterminated
+               # embedded quote. The scan pointer is not restored here on failure;
+               # scanBracketed does that when it rescues the error.
+               startPos = self.pointer
+               debugMsg( 2, "matchBracketed starting at pos = %d: prefix = %s, "\
+                                "ldel = %s, qdel = %s, quotelike = %s, rdel = %s",
+                                startPos, prefix.inspect, ldel.inspect, qdel.inspect, quotelike.inspect,
+                                rdel.inspect )
+
+               # Test for the prefix, failing if not found
+               raise MatchFailure, "Did not find prefix: #{prefix.inspect}" unless 
+                       self.skip( prefix )
+
+               # Mark this position as the left-delimiter pointer
+               ldelpos = self.pointer
+               debugMsg( 3, "Found prefix. Left delim pointer at %d", ldelpos )
+               
+               # Match opening delimiter or fail
+               unless (( delim = self.scan(ldel) ))
+                       raise MatchFailure, "Did not find opening bracket after prefix: '%s' (%d)" %
+                               [ self.string[startPos..ldelpos].chomp, ldelpos ]
+               end
+
+               # A stack to keep track of nested delimiters
+               nesting = [ delim ]
+               debugMsg( 3, "Found opening bracket. Nesting = %s", nesting.inspect )
+               
+               # Consume the rest of the string one token at a time until the
+               # nesting stack empties or the input runs out.
+               while self.rest?
+
+                       debugMsg( 5, "Starting scan loop. Nesting = %s", nesting.inspect )
+
+                       # Skip anything that's backslashed
+                       if self.skip( /\\./ )
+                               debugMsg( 4, "Skipping backslashed literal at offset %d: '%s'",
+                                                 self.pointer - 2, self.string[ self.pointer - 2, 2 ].chomp )
+                               next
+                       end
+
+                       # Opening bracket (left delimiter)
+                       if self.scan(ldel)
+                               delim = self.matched
+                               debugMsg( 4, "Found opening delim %s at offset %d",
+                                                 delim.inspect, self.pointer - 1 )
+                               nesting.push delim
+
+                       # Closing bracket (right delimiter)
+                       elsif self.scan(rdel)
+                               delim = self.matched
+
+                               debugMsg( 4, "Found closing delim %s at offset %d",
+                                                 delim.inspect, self.pointer - 1 )
+
+                               # :TODO: When is this code reached?
+                               if nesting.empty?
+                                       raise MatchFailure, "Unmatched closing bracket '%s' at offset %d" %
+                                               [ delim, self.pointer - 1 ]
+                               end
+
+                               # Figure out what the complement of the bracket next off the
+                               # stack should be.
+                               expected = nesting.pop.tr( '({[<', ')}]>' )
+                               debugMsg( 4, "Got a '%s' bracket off nesting stack", expected )
+
+                               # Check for mismatched brackets
+                               if expected != delim
+                                       raise MatchFailure, "Mismatched closing bracket at offset %d: "\
+                                               "Expected '%s', but found '%s' instead." %
+                                               [ self.pointer - 1, expected, delim ]
+                               end
+
+                               # If we've found the closing delimiter, stop scanning
+                               if nesting.empty?
+                                       debugMsg( 4, "Finished with scan: nesting stack empty." )
+                                       break
+                               end
+
+                       # Quoted chunk (quoted delimiter)
+                       elsif qdel && self.scan(qdel)
+                               match = self.matched
+
+                               # Consume up to and including the matching close quote,
+                               # stepping over backslash-escaped characters on the way.
+                               if self. scan( /[^\\#{match}]*(?:\\.[^\\#{match}]*)*(#{Regexp::quote(match)})/ )
+                                       debugMsg( 4, "Skipping quoted chunk. Scan pointer now at offset %d", self.pointer )
+                                       next
+                               end
+
+                               raise MatchFailure, "Unmatched embedded quote (%s) at offset %d" %
+                                       [ match, self.pointer - 1 ]
+
+                       # Embedded quotelike
+                       elsif quotelike && self.scanQuotelike
+                               debugMsg( 4, "Matched a quotelike. Scan pointer now at offset %d", self.pointer )
+                               next
+
+                       # Skip word characters, or a single non-word character
+                       else
+                               self.skip( /(?:[a-zA-Z0-9]+|.)/m )
+                               debugMsg 5, "Skipping '%s' at offset %d." %
+                                       [ self.matched, self.pointer ]
+                       end
+
+               end
+
+               # If there's one or more brackets left on the delimiter stack, we're
+               # missing a closing delim.
+               unless nesting.empty?
+                       raise MatchFailure, "Unmatched opening bracket(s): %s.. at offset %d" %
+                               [ nesting.join('..'), self.pointer ]
+               end
+
+               # The match runs from the opening bracket to the character just before
+               # the scan pointer; the prefix is whatever preceded the opening bracket.
+               rval = {
+                       :match  => self.string[ ldelpos .. (self.pointer - 1) ],
+                       :prefix => self.string[ startPos, (ldelpos-startPos) ],
+               }
+               debugMsg 1, "matchBracketed succeeded: %s" % rval.inspect
+               return rval
+       end
+
+
+       ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
+       ### try to match text bracketed by the given left and right tag-delimiters
+       ### (<tt>ldel</tt> and <tt>rdel</tt>). 
+       ###
+       ### The +prefix+ pattern is skipped first, then +ldel+ must match at the
+       ### scan pointer. If +rdel+ is +nil+, the closing tag is derived from the
+       ### matched opening tag with #makeClosingTag; otherwise back-references
+       ### (<tt>$1</tt>..<tt>$9</tt>) in +rdel+ are rewritten to <tt>self[n]</tt>
+       ### and the result is interpolated in this method's binding. +bad+ and
+       ### +ignore+ are optional patterns that are tried inside the tagged text.
+       ### A +failmode+ of <tt>:para</tt> or <tt>:max</tt> accepts text which has
+       ### no closing tag.
+       ###
+       ### Returns a Hash with the matched text under <tt>:match</tt> and the
+       ### skipped prefix under <tt>:prefix</tt>. Raises MatchFailure if the
+       ### prefix, the opening tag or (outside the tolerant failmodes) the closing
+       ### tag cannot be found.
+       def matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
+               # Normalise a non-nil failmode to a Symbol so it can be compared
+               # against :para and :max below.
+               failmode = failmode.to_s.intern if failmode
+               startPos = self.pointer
+               debugMsg 2, "matchTagged starting at pos = %d: prefix = %s, "\
+                                "ldel = %s, rdel = %s, failmode = %s, bad = %s, ignore = %s",
+                                startPos, prefix.inspect, ldel.inspect, rdel.inspect,
+                                failmode.inspect, bad.inspect, ignore.inspect
+
+               # rdelspec is the pattern actually used to find the closing tag; the
+               # *Pos variables record offsets into the scanned string as they are
+               # discovered.
+               rdelspec = ''
+               openTagPos, textPos, paraPos, closeTagPos, endPos = ([nil] * 5)
+               match = nil
+
+               # Look for the prefix
+               raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
+                       self.skip( prefix )
+
+               openTagPos = self.pointer
+               debugMsg 3, "Found prefix. Pointer now at offset %d" % self.pointer
+
+               # Look for the opening delimiter
+               unless (( match = self.scan(ldel) ))
+                       raise MatchFailure, "Did not find opening tag %s at offset %d" % 
+                               [ ldel.inspect, self.pointer ]
+               end
+
+               textPos = self.pointer
+               debugMsg 3, "Found left delimiter '%s': offset now %d" % [ match, textPos ]
+
+               # Make a right delim out of the tag we found if none was specified
+               if rdel.nil?
+                       rdelspec = makeClosingTag( match )
+                       debugMsg 3, "Generated right-delimiting tag: %s" % rdelspec.inspect
+               else
+                       # Make the regexp-related globals from the match: an unescaped
+                       # $N in rdel becomes self[N] and is then evaluated by
+                       # String#interpolate against this method's binding.
+                       rdelspec = rdel.gsub( /(\A|[^\\])\$([1-9])/, '\1self[\2]' ).interpolate( binding )
+                       debugMsg 3, "Right delimiter (after interpolation) is: %s" % rdelspec.inspect
+               end
+
+               # Process until we reach the end of the string or find a closing tag
+               while self.rest? && closeTagPos.nil?
+
+                       # Skip backslashed characters
+                       if (( self.skip( /^\\./ ) ))
+                               debugMsg 4, "Skipping backslashed literal at offset %d" % self.pointer
+                               next
+
+                       # Match paragraphs-break for fail == :para. Only the first
+                       # paragraph break seen is remembered (||=).
+                       elsif (( matchlength = self.skip( /^(\n[ \t]*\n)/ ) ))
+                               paraPos ||= self.pointer - matchlength
+                               debugMsg 4, "Found paragraph position at offset %d" % paraPos
+                               
+                       # Match closing tag; setting closeTagPos ends the loop
+                       elsif (( matchlength = self.skip( rdelspec ) ))
+                               closeTagPos = self.pointer - matchlength
+                               debugMsg 3, "Found closing tag at offset %d" % closeTagPos
+
+                       # If we're ignoring anything, try to match and move beyond it
+                       elsif ignore && !ignore.empty? && self.skip(ignore)
+                               debugMsg 3, "Skipping ignored text '%s' at offset %d" %
+                                       [ self.matched, self.pointer - self.matched_size ]
+                               next
+
+                       # If there's a "bad" pattern, try to match it, shorting the
+                       # outer loop if it matches in para or max mode, or failing with
+                       # a match error if not.
+                       # NOTE(review): the message below reports +match+ (the value
+                       # last assigned from scanning ldel), not the text that matched
+                       # +bad+ -- confirm whether self.matched was intended.
+                       elsif bad && !bad.empty? && self.match?( bad )
+                               if failmode == :para || failmode == :max
+                                       break
+                               else
+                                       raise MatchFailure, "Found invalid nested tag '%s' at offset %d" %
+                                               [ match, self.pointer ]
+                               end
+
+                       # If there's another opening tag, make a recursive call to
+                       # ourselves to move the cursor beyond it. The tag is scanned
+                       # only to detect it and is then unscanned so that the recursive
+                       # call sees it again.
+                       # NOTE(review): matchTagged as written either returns a Hash or
+                       # raises, so the body of this +unless+ looks unreachable via a
+                       # normal return -- confirm the intended failure signalling.
+                       elsif (( match = self.scan( ldel ) ))
+                               tag = match
+                               self.unscan
+
+                               unless self.matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
+                                       break if failmode == :para || failmode == :max
+
+                                       raise MatchFailure, "Found unbalanced nested tag '%s' at offset %d" %
+                                               [ tag, self.pointer ]
+                               end
+
+                       # Nothing recognised here: step over a single character
+                       else 
+                               self.pointer += 1
+                               debugMsg 5, "Advanced scan pointer to offset %d" % self.pointer
+                       end
+               end
+
+               # If the closing hasn't been found, then it's a "short" match, which is
+               # okay if the failmode indicates we don't care. Otherwise, it's an error.
+               unless closeTagPos
+                       debugMsg 3, "No close tag position found. "
+                       
+                       if failmode == :max || failmode == :para
+                               # From here on closeTagPos is only used by the debug message
+                               closeTagPos = self.pointer - 1
+                               debugMsg 4, "Failmode %s tolerates no closing tag. Close tag position set to %d" %
+                                       [ failmode.inspect, closeTagPos ]
+
+                               # Sync the scan pointer and the paragraph marker if it's set.
+                               if failmode == :para && paraPos
+                                       self.pointer = paraPos + 1
+                               end
+                       else
+                               raise MatchFailure, "No closing tag found."
+                       end
+               end
+
+               # :match runs from the opening tag up to (not including) the current
+               # scan pointer; :prefix is whatever was skipped before the tag.
+               rval = {
+                       :match  => self.string[ openTagPos .. (self.pointer - 1) ],
+                       :prefix => self.string[ startPos, (openTagPos-startPos) ],
+               }
+               debugMsg 1, "matchTagged succeeded: %s" % rval.inspect
+               return rval
+       end
+
+
+       ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
+       ### try to match text inside a Ruby quotelike construct. If
+       ### <tt>matchRawRegex</tt> is <tt>true</tt>, the regex construct
+       ### <tt>/pattern/</tt> is also matched.
+       def matchQuotelike( prefix, matchRawRegex )
+               startPos = self.pointer
+               debugMsg 2, "matchQuotelike starting at pos = %d: prefix = %s, "\
+                       "matchRawRegex = %s",
+                       startPos, prefix.inspect, matchRawRegex.inspect
+
+               # Init position markers
+               rval = oppos = preldpos = ldpos = strpos = rdpos = modpos = nil
+
+               # Look for the prefix
+               raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
+                       self.skip( prefix )
+               oppos = self.pointer
+               
+               # Peek at the next character
+               # If the initial quote is a simple quote, our job is easy
+               if self.check(/^["`']/) || ( matchRawRegex && self.check(%r:/:) )
+                       initial = self.matched
+
+                       # Build the pattern for matching the simple string
+                       pattern = "%s [^\\%s]* (\\.[^\\%s]*)* %s" %
+                               [ Regexp.quote(initial),
+                                 initial, initial,
+                                 Regexp.quote(initial) ]
+                       debugMsg 2, "Matching simple quote at offset %d with /%s/" % 
+                               [ self.pointer, pattern ]
+
+                       # Search for it, raising an exception if it's not found
+                       unless self.scan( /#{pattern}/xism )
+                               raise MatchFailure,
+                                       "Did not find closing delimiter to match '%s' at '%s...' (offset %d)" %
+                                       [ initial, self.string[ oppos, 20 ].chomp, self.pointer ]
+                       end
+
+                       modpos = self.pointer
+                       rdpos = modpos - 1
+
+                       # If we're matching a regex, look for any trailing modifiers
+                       if initial == '/'
+                               pattern = if RUBY_VERSION >= "1.7.3" then /[imoxs]*/ else /[imox]*/ end
+                               self.scan( pattern )
+                       end
+
+                       rval = {
+                               :prefix         => self.string[ startPos, (oppos-startPos) ],
+                               :match          => self.string[ oppos .. (self.pointer - 1) ],
+                               :leftDelim      => self.string[ oppos, 1 ],
+                               :delimText      => self.string[ (oppos+1) .. (rdpos-1) ],
+                               :rightDelim     => self.string[ rdpos, 1 ],
+                               :modifiers      => self.string[ modpos, (self.pointer-modpos) ],
+                       }
+
+
+               # If it's one of the fancy quotelike operators, our job is somewhat
+               # complicated (though nothing like Perl's, thank the Goddess)
+               elsif self.scan( %r:%[rwqQx]?(?=\S): )
+                       op = self.matched
+                       debugMsg 2, "Matching a real quotelike ('%s') at offset %d" % 
+                               [ op, self.pointer ]
+                       modifiers = nil
+
+                       ldpos = self.pointer
+                       strpos = ldpos + 1
+
+                       # Peek ahead to see what the delimiter is
+                       ldel = self.check( /\S/ )
+                       
+                       # If it's a bracketing character, just use matchBracketed
+                       if ldel =~ /[[(<{]/
+                               rdel = ldel.tr( '[({<', '])}>' )
+                               debugMsg 4, "Left delim is a bracket: %s; looking for compliment: %s" %
+                                       [ ldel, rdel ]
+                               self.matchBracketed( '', Regexp::quote(ldel), nil, nil, Regexp::quote(rdel) )
+                       else
+                               debugMsg 4, "Left delim isn't a bracket: '#{ldel}'; looking for closing instance"
+                               self.scan( /#{ldel}[^\\#{ldel}]*(\\.[^\\#{ldel}]*)*#{ldel}/ ) or
+                                       raise MatchFailure,
+                                       "Can't find a closing delimiter '%s' at '%s...' (offset %d)" %
+                                       [ ldel, self.rest[0,20].chomp, self.pointer ]
+                       end
+                       rdelpos = self.pointer - 1
+
+                       # Match modifiers for Regexp quote
+                       if op == '%r'
+                               pattern = if RUBY_VERSION >= "1.7.3" then /[imoxs]*/ else /[imox]*/ end
+                               modifiers = self.scan( pattern ) || ''
+                       end
+
+                       rval = {
+                               :prefix         => self.string[ startPos, (oppos-startPos) ],
+                               :match          => self.string[ oppos .. (self.pointer - 1) ],
+                               :quoteOp        => op,
+                               :leftDelim      => self.string[ ldpos, 1 ],
+                               :delimText      => self.string[ strpos, (rdelpos-strpos) ],
+                               :rightDelim     => self.string[ rdelpos, 1 ],
+                               :modifiers      => modifiers,
+                       }
+
+               # If it's a here-doc, things get even hairier.
+               elsif self.scan( %r:<<(-)?: )
+                       debugMsg 2, "Matching a here-document at offset %d" % self.pointer
+                       op = self.matched
+
+                       # If there was a dash, start with optional whitespace
+                       indent = self[1] ? '\s*' : ''
+                       ldpos = self.pointer
+                       label = ''
+
+                       # Plain identifier
+                       if self.scan( /[A-Za-z_]\w*/ )
+                               label = self.matched
+                               debugMsg 3, "Setting heredoc terminator to bare identifier '%s'" % label
+
+                       # Quoted string
+                       elsif self.scan( / ' ([^'\\]* (?:\\.[^'\\]*)*) ' /sx ) ||
+                                 self.scan( / " ([^"\\]* (?:\\.[^"\\]*)*) " /sx ) ||
+                                 self.scan( / ` ([^`\\]* (?:\\.[^`\\]*)*) ` /sx )
+                               label = self[1]
+                               debugMsg 3, "Setting heredoc terminator to quoted identifier '%s'" % label
+
+                       # Ruby, unlike Perl, requires a terminal, even if it's only an empty
+                       # string
+                       else
+                               raise MatchFailure,
+                                       "Missing heredoc terminator before end of line at "\
+                                       "'%s...' (offset %d)" %
+                                       [ self.rest[0,20].chomp, self.pointer ]
+                       end
+                       extrapos = self.pointer
+
+                       # Advance to the beginning of the string
+                       self.skip( /.*\n/ )
+                       strpos = self.pointer
+                       debugMsg 3, "Scanning until /\\n#{indent}#{label}\\n/m"
+
+                       # Match to the label
+                       unless self.scan_until( /\n#{indent}#{label}\n/m )
+                               raise MatchFailure,
+                                       "Couldn't find heredoc terminator '%s' after '%s...' (offset %d)" %
+                                       [ label, self.rest[0,20].chomp, self.pointer ]
+                       end
+
+                       rdpos = self.pointer - self.matched_size
+
+                       rval = {
+                               :prefix         => self.string[ startPos, (oppos-startPos) ],
+                               :match          => self.string[ oppos .. (self.pointer - 1) ],
+                               :quoteOp        => op,
+                               :leftDelim      => self.string[ ldpos, (extrapos-ldpos) ],
+                               :delimText      => self.string[ strpos, (rdpos-strpos) ],
+                               :rightDelim     => self.string[ rdpos, (self.pointer-rdpos) ],
+                       }
+
+               else
+                       raise MatchFailure,
+                               "No quotelike operator found after prefix at '%s...'" %
+                                       self.rest[0,20].chomp
+               end
+
+               
+               debugMsg 1, "matchQuotelike succeeded: %s" % rval.inspect
+               return rval
+       end
+
+
+       ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
+       ### try to match text that is a valid Ruby variable or identifier, ...?
+       def matchVariable( prefix )
+               startPos = self.pointer
+               debugMsg 2, "matchVariable starting at pos = %d: prefix = %s",
+                                startPos, prefix.inspect
+
+               # Look for the prefix
+               raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
+                       self.skip( prefix )
+
+               varPos = self.pointer
+
+               # If the variable matched is a predefined global, no need to look for an
+               # identifier
+               unless self.scan( %r~\$(?:[!@/\\,;.<>$?:_\~&`'+]|-\w|\d+)~ )
+
+                       debugMsg 2, "Not a predefined global at '%s...' (offset %d)" %
+                               [ self.rest[0,20].chomp, self.pointer ]
+                       
+                       # Look for a valid identifier
+                       unless self.scan( /\*?(?:[$@]|::)?(?:[a-z_]\w*(?:::\s*))*[_a-z]\w*/is )
+                               raise MatchFailure, "No variable found: Bad identifier (offset %d)" % self.pointer
+                       end
+               end
+
+               debugMsg 2, "Matched '%s' at offset %d" % [ self.matched, self.pointer ]
+
+               # Match methodchain with trailing codeblock
+               while self.rest?
+                       # Match a regular chained method
+                       next if scanCodeblock( {"("=>")", "do"=>"end", "begin"=>"end", "{"=>"}"},
+                                                                  /\s*(?:\.|::)\s*[a-zA-Z_]\w+\s*/ )
+
+                       # Match a trailing block or an element ref
+                       next if scanCodeblock( nil, /\s*/, {'{' => '}', '[' => ']'} )
+
+                       # This matched a dereferencer in Perl, which doesn't have any
+                       # equivalent in Ruby.
+                       #next if scanVariable( '\s*(\.|::)\s*' )
+
+                       # Match a method call without parens (?)
+                       next if self.scan( '\s*(\.|::)\s*\w+(?![{([])' )
+
+                       break
+               end
+
+               rval = {
+                       :match  => self.string[ varPos .. (self.pointer - 1) ],
+                       :prefix => self.string[ startPos, (varPos-startPos) ],
+               }
+               debugMsg 1, "matchVariable succeeded: %s" % rval.inspect
+               return rval
+       end
+
+
+       ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
+       ### try to match text inside a Ruby code block construct which must be
+       ### delimited by the specified <tt>outerDelimPairs</tt>. It may optionally
+       ### contain sub-blocks delimited with the given <tt>innerDelimPairs</tt>.
+       def matchCodeblock( prefix, innerDelimPairs, outerDelimPairs )
+               startPos = self.pointer
+               debugMsg 2, "Starting matchCodeblock at offset %d (%s)", startPos, self.rest.inspect
+
+               # Look for the prefix
+               raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
+                       self.skip( prefix )
+               codePos = self.pointer
+               debugMsg 3, "Skipped prefix '%s' to offset %d" %
+                       [ self.matched, codePos ]
+
+               # Build a regexp for the outer delimiters
+               ldelimOuter = "(" + outerDelimPairs.keys  .uniq.collect {|delim| Regexp::quote(delim)}.join('|') + ")"
+               rdelimOuter = "(" + outerDelimPairs.values.uniq.collect {|delim| Regexp::quote(delim)}.join('|') + ")"
+               debugMsg 4, "Using /%s/ as the outer delim regex" % ldelimOuter
+
+               unless self.scan( ldelimOuter )
+                       raise MatchFailure, %q:Did not find opening bracket at "%s..." offset %d: %
+                               [ self.rest[0,20].chomp, codePos ]
+               end
+
+               # Look up the corresponding outer delimiter
+               closingDelim = outerDelimPairs[self.matched] or
+                       raise DelimiterError, "Could not find closing delimiter for '%s'" %
+                               self.matched
+               
+               debugMsg 3, "Scanning for closing delim '#{closingDelim}'"
+               matched = ''
+               patvalid = true
+
+               # Scan until the end of the text or until an explicit break
+               while self.rest?
+                       debugMsg 5, "Scanning from offset %d (%s)", self.pointer, self.rest.inspect
+                       matched = ''
+
+                       # Skip comments
+                       debugMsg 5, "Trying to match a comment"
+                       if self.scan( /\s*#.*/ )
+                               debugMsg 4, "Skipping comment '%s' to offset %d" % 
+                                       [ self.matched, self.pointer ]
+                               next
+                       end
+
+                       # Look for (any) closing delimiter
+                       debugMsg 5, "Trying to match a closing outer delimiter with /\s*(#{rdelimOuter})/"
+                       if self.scan( /\s*(#{rdelimOuter})/ )
+                               debugMsg 4, "Found a right delimiter '#{self.matched}'"
+
+                               # If it's the delimiter we're looking for, stop the scan
+                               if self.matched.strip == closingDelim
+                                       matched = self.matched
+                                       debugMsg 3, "Found the closing delimiter we've been looking for (#{matched.inspect})."
+                                       break
+
+                               # Otherwise, it's an error, as we've apparently seen a closing
+                               # delimiter without a corresponding opening one.
+                               else
+                                       raise MatchFailure,
+                                               %q:Mismatched closing bracket at "%s..." (offset %s). Expected '%s': %
+                                               [ self.rest[0,20], self.pointer, closingDelim ]
+                               end
+                       end
+
+                       # Try to match a variable or a quoted phrase
+                       debugMsg 5, "Trying to match either a variable or quotelike"
+                       if self.scanVariable( '\s*' ) || self.scanQuotelike( '\s*', patvalid )
+                               debugMsg 3, "Matched either a variable or quotelike. Offset now %d" % self.pointer
+                               patvalid = false
+                               next
+                       end
+
+                       # Match some operators
+                       # :TODO: This hasn't really been ruby-ified
+                       debugMsg 5, "Trying to match an operator"
+                       if self.scan( %r:\s*([-+*x/%^&|.]=?
+                                       | [!=]~
+                                       | =(?!>)
+                                       | (\*\*|&&|\|\||<<|>>)=?
+                                       | split|grep|map|return
+                                       ):x )
+                               debugMsg 3, "Skipped miscellaneous operator '%s' to offset %d." %
+                                       [ self.matched, self.pointer ]
+                               patvalid = true
+                               next
+                       end
+
+                       # Try to match an embedded codeblock
+                       debugMsg 5, "Trying to match an embedded codeblock with delim pairs: %s",
+                               innerDelimPairs.inspect
+                       if self.scanCodeblock( innerDelimPairs )
+                               debugMsg 3, "Skipped inner codeblock to offset %d." % self.pointer
+                               patvalid = true
+                               next
+                       end
+
+                       # Try to match a stray outer-left delimiter
+                       debugMsg 5, "Trying to match a stray outer-left delimiter (#{ldelimOuter})"
+                       if self.match?( ldelimOuter )
+                               raise MatchFailure, "Improperly nested codeblock at offset %d: %s... " %
+                                       [ self.pointer, self.rest[0,20] ]
+                       end
+
+                       patvalid = false
+                       self.scan( /\s*(\w+|[-=>]>|.|\Z)/m )
+                       debugMsg 3, "Skipped '%s' to offset %d" %
+                               [ self.matched, self.pointer ]
+               end
+
+
+               unless matched
+                       raise MatchFailure, "No match found for opening bracket"
+               end
+
+               rval = {
+                       :match  => self.string[codePos .. (self.pointer - 1)],
+                       :prefix => self.string[startPos, (codePos-startPos)]
+               }
+               debugMsg 1, "matchCodeblock succeeded: %s" % rval.inspect
+               return rval
+       end
+
+
+       ### Attempt to derive and return the number of scan methods traversed up to
+       ### this point by examining the call stack.
+       def scanDepth
+               return caller(2).find_all {|frame|
+                       frame =~ /in `scan(Variable|Tagged|Codeblock|Bracketed|Quotelike)'/
+               }.length
+       end
+
+
+       #######
+       private
+       #######
+
+       ### Print the specified <tt>message</tt> to STDERR if the scanner's
+       ### debugging level is greater than or equal to <tt>level</tt>.
+       def debugMsg( level, msgFormat, *args )
+               return unless level.nonzero? && self.debugLevel >= level
+               msg = if args.empty? then msgFormat else format(msgFormat, *args) end
+               $stderr.puts( (" " * (level-1) * 2) + msg )
+       end
+
+
+       ### Given a series of one or more bracket characters (eg., '<', '[', '{',
+       ### etc.), return the brackets reversed in order and direction.
+       def revbracket( bracket )
+               return bracket.to_s.reverse.tr( '<[{(', '>]})' )
+       end
+
+
+       ### Given an opening <tt>tag</tt> of the sort matched by #scanTagged,
+       ### construct and return a closing tag.
+       def makeClosingTag( tag )
+               debugMsg 3, "Making a closing tag for '%s'" % tag
+
+               closingTag = tag.gsub( /^([[(<{]+)(#{XmlName}).*/ ) {
+                       Regexp.quote( "#{$1}/#{$2}" + revbracket($1) )
+               }
+
+               raise MatchFailure, "Unable to construct closing tag to match: #{tag}" unless closingTag
+               return closingTag
+       end
+
+
+       ### Make and return a new Regexp which matches substrings bounded by the
+       ### specified +delimiters+, not counting those which have been escaped with
+       ### the escape characters in +escapes+.
+       def makeDelimPattern( delimiters, escapes='\\', prefix='\\s*' )
+               delimiters = delimiters.to_s
+               escapes = escapes.to_s
+               
+               raise DelimiterError, "Illegal delimiter '#{delimiter}'" unless delimiters =~ /\S/
+
+               # Pad the escapes string to the same length as the delimiters
+               escapes.concat( escapes[-1,1] * (delimiters.length - escapes.length) )
+               patParts = []
+               
+               # Escape each delimiter and a corresponding escape character, and then
+               # build a pattern part from them
+               delimiters.length.times do |i|
+                       del = Regexp.escape( delimiters[i, 1] )
+                       esc = Regexp.escape( escapes[i, 1] )
+
+                       if del == esc then
+                               patParts.push "#{del}(?:[^#{del}]*(?:(?:#{del}#{del})[^#{del}]*)*)#{del}"
+                       else
+                               patParts.push "#{del}(?:[^#{esc}#{del}]*(?:#{esc}.[^#{esc}#{del}]*)*)#{del}";
+                       end
+               end
+
+               # Join all the parts together and return one big pattern
+               return Regexp::new( "#{prefix}(?:#{patParts.join("|")})" )
+       end
+
+end # class StringExtractor
+
This page took 0.091989 seconds and 4 git commands to generate.