3 # A derivative of StringScanner that can scan for delimited constructs in
4 # addition to regular expressions. It is a loose port of the Text::Balanced
5 # module for Perl by Damian Conway <damian@cs.monash.edu.au>.
9 # se = DelimScanner::new( myString )
13 # * Michael Granger <ged@FaerieMUD.org>
15 # Copyright (c) 2002, 2003 The FaerieMUD Consortium. Most rights reserved.
17 # This work is licensed under the Creative Commons Attribution License. To view
18 # a copy of this license, visit http://creativecommons.org/licenses/by/1.0 or
19 # send a letter to Creative Commons, 559 Nathan Abbott Way, Stanford, California
30 ### Add some stuff to the String class to allow easy transformation to Regexp
31 ### and in-place interpolation.
### Build a Regexp from the receiver's text.
###
### [<tt>casefold</tt>]
###   When true, the pattern is compiled with Regexp::IGNORECASE.
### [<tt>extended</tt>]
###   When true, the pattern is compiled with Regexp::EXTENDED.
###
### Both flags default to false, which yields the same Regexp as before;
### previously they were accepted but silently ignored. The receiver is
### duplicated first so the Regexp never shares the original String.
def to_re( casefold=false, extended=false )
  options = 0
  options |= Regexp::IGNORECASE if casefold
  options |= Regexp::EXTENDED if extended
  return Regexp::new( self.dup, options )
end
37 ### Ideas for String-interpolation stuff courtesy of Hal E. Fulton
38 ### <hal9000@hypermetrics.com> via ruby-talk
### Evaluate the receiver as the body of a double-quoted String literal in the
### context of +scope+ and return the result. Raises a TypeError unless
### +scope+ is a Binding.
### NOTE(review): the receiver's text is handed to eval, so any embedded
### expressions run as code -- only use this on trusted strings.
40 def interpolate( scope )
41 unless scope.is_a?( Binding )
42 raise TypeError, "Argument to interpolate must be a Binding, not "\
43 "a #{scope.class.name}"
46 # $stderr.puts ">>> Interpolating '#{self}'..."
# Backslash-escape double quotes so they cannot terminate the literal that
# is assembled on the next line.
48 copy = self.gsub( /"/, %q:\": )
# Wrap the escaped text in double quotes and evaluate it in the given scope.
49 eval( '"' + copy + '"', scope )
55 ### A derivative of StringScanner that can scan for delimited constructs in
56 ### addition to regular expressions.
59 ### Scanner exception classes
# Raised by the match* helpers when a scan cannot succeed; the scan* methods
# rescue it and store its message as the match error.
60 class MatchFailure < RuntimeError ; end
# Raised when a delimiter specification is unusable, e.g. when scanBracketed
# finds no bracket character in the delimiters it was given.
61 class DelimiterError < RuntimeError ; end
# NOTE(review): presumably a check that the C implementation of
# StringScanner is loaded -- confirm against the strscan version in use.
65 StringScanner.must_C_version
# Version string: the first run of digits and dots found in the revision
# keyword.
# NOTE(review): if the keyword is left unexpanded (as it reads here) there
# is nothing to capture, the match is nil and the [1] would raise -- confirm
# the keyword is expanded on checkout.
69 Version = /([\d\.]+)/.match( %q{$Revision$} )[1]
72 # Pattern to match a valid XML name
# Kept as a String pattern fragment rather than a Regexp: a letter,
# underscore or colon followed by any number of letters, digits, colons,
# dots or hyphens.
73 XmlName = '[a-zA-Z_:][a-zA-Z0-9:.-]*'
76 ### Namespace module for DelimString constants
79 # The list of default opening => closing codeblock delimiters to use for
81 CodeblockDelimiters = {
87 # Default scanMultiple operations and their arguments
98 ### Define delegating methods that cast their argument to a Regexp from a
99 ### String. This allows the scanner's scanning methods to be called with
100 ### Strings in addition to Regexps. This was mostly stolen from
### For each method name in +methods+, class_eval a one-argument instance
### method of the same name. The generated method converts a non-Regexp
### +pattern+ with <tt>to_s.to_re</tt> and then forwards it to the method of
### the same name on <tt>@scanner</tt>. The label and line number passed to
### class_eval are what the generated code is reported under in backtraces.
102 def self.def_casting_delegators( *methods )
103 methods.each {|methodName|
104 class_eval( <<-EOF, "(--def_casting_delegators--)", 1 )
105 def #{methodName}( pattern )
106 pattern = pattern.to_s.to_re unless pattern.is_a?( Regexp )
107 @scanner.#{methodName}( pattern )
114 ### Create a new DelimScanner object for the specified <tt>string</tt>. If
115 ### <tt>dup</tt> is <tt>true</tt>, a duplicate of the target string will be
116 ### used instead of the one given. The target string will be frozen after
117 ### the scanner is created.
### Wrap a new StringScanner, built from +string+ and the +dup+ flag, in
### <tt>@scanner</tt>; the delegated scanning methods all forward to it.
### NOTE(review): confirm the installed strscan still honours the second
### argument to StringScanner::new.
118 def initialize( string, dup=true )
119 @scanner = StringScanner::new( string, dup )
130 # Here, some delegation trickery is done to make a DelimScanner behave like
131 # a StringScanner. Some methods are directly delegated, while some are
132 # delegated via a method which casts its argument to a Regexp first so some
133 # scanner methods can be called with Strings as well as Regexps.
135 # A list of delegated methods that need casting.
# Scanner methods that take a pattern argument; these get a delegator that
# converts String patterns to Regexps before forwarding.
136 NeedCastingDelegators = :scan, :skip, :match?, :check,
137 :scan_until, :skip_until, :exist?, :check_until
139 # Delegate all StringScanner instance methods to the associated scanner
140 # object, except those that need a casting delegator, which uses an indirect
# NOTE(review): the names subtracted here are Strings (id2name); if
# instance_methods returns Symbols on the Ruby in use, nothing is removed
# and the casting methods are also delegated directly before being
# redefined below -- confirm.
142 def_delegators :@scanner,
143 *( StringScanner.instance_methods - NeedCastingDelegators.collect {|sym| sym.id2name} )
# Define the String-to-Regexp casting versions of the pattern-taking methods.
145 def_casting_delegators( *NeedCastingDelegators )
149 # The last match error encountered by the scanner
150 attr_accessor :matchError
# Only the writer is made protected; the reader stays public.
151 protected :matchError= ; # ; is to work around a ruby-mode indent bug
# Read/write debugging level for this scanner.
154 attr_accessor :debugLevel
158 ### Returns <tt>true</tt> if the scanner has encountered a match error.
160 return ! @matchError.nil?
164 ### Starting at the scan pointer, try to match a substring delimited by the
165 ### specified <tt>delimiters</tt>, skipping the specified <tt>prefix</tt>
166 ### and any character escaped by the specified <tt>escape</tt>
167 ### character/s. If matched, advances the scan pointer and returns a Hash
168 ### with the following key/value pairs on success:
170 ### [<tt>:match</tt>]
171 ### The text of the match, including delimiters.
172 ### [<tt>:prefix</tt>]
173 ### The matched prefix, if any.
175 ### If the match fails, returns nil.
### Scan for text enclosed by one of the +delimiters+ characters after an
### optional +prefix+, treating +escape+ as the escape character. Clears the
### match error first and records a message in it when the prefix or the
### delimited text cannot be matched.
176 def scanDelimited( delimiters="'\"`", prefix='\\s*', escape='\\' )
# An explicit nil for +delimiters+ falls back to the default quote set.
177 delimiters ||= "'\"`"
181 debugMsg( 1, "Scanning for delimited text: delim = (%s), prefix=(%s), escape=(%s)",
182 delimiters, prefix, escape )
183 self.matchError = nil
185 # Try to match the prefix first to get the length
186 unless (( prefixLength = self.match?(prefix.to_re) ))
187 self.matchError = "Failed to match prefix '%s' at offset %d" %
188 [ prefix, self.pointer ]
192 # Now build a delimited pattern with the specified parameters.
193 delimPattern = makeDelimPattern( delimiters, escape, prefix )
194 debugMsg( 2, "Delimiter pattern is %s" % delimPattern.inspect )
# This scan consumes prefix and delimited text together; the two are
# separated again below using the prefix length measured earlier.
197 unless (( matchedString = self.scan(delimPattern) ))
198 self.matchError = "No delimited string found."
203 :match => matchedString[prefixLength .. -1],
# NOTE(review): when the prefix matched zero characters this range is
# 0..-1, which selects the whole matched string rather than an empty
# prefix -- confirm whether that is intended.
204 :prefix => matchedString[0..prefixLength-1],
209 ### Match using the #scanDelimited method, but only return the match or nil.
### Convenience wrapper around #scanDelimited: forwards every argument
### unchanged and gives up with nil as soon as the scan reports no match.
210 def extractDelimited( *args )
211 rval = scanDelimited( *args ) or return nil
216 ### Starting at the scan pointer, try to match a substring delimited by the
217 ### specified <tt>delimiters</tt>, skipping the specified <tt>prefix</tt>
218 ### and any character escaped by the specified <tt>escape</tt>
219 ### character/s. If matched, advances the scan pointer and returns the
220 ### length of the matched string; if it fails the match, returns nil.
### Skip over delimited text: clears the match error, builds the delimiter
### pattern with makeDelimPattern and returns whatever #skip returns for it.
221 def skipDelimited( delimiters="'\"`", prefix='\\s*', escape='\\' )
# An explicit nil for +delimiters+ falls back to the default quote set.
222 delimiters ||= "'\"`"
226 self.matchError = nil
227 return self.skip( makeDelimPattern(delimiters, escape, prefix) )
231 ### Starting at the scan pointer, try to match a substring delimited by
232 ### balanced <tt>delimiters</tt> of the type specified, after skipping the
233 ### specified <tt>prefix</tt>. On a successful match, this method advances
234 ### the scan pointer and returns a Hash with the following key/value pairs:
236 ### [<tt>:match</tt>]
237 ### The text of the match, including the delimiting brackets.
238 ### [<tt>:prefix</tt>]
239 ### The matched prefix, if any.
241 ### On failure, returns nil.
### Normalise the +delimiters+ specification into left-bracket, right-bracket
### and quote alternations, then hand the work to #matchBracketed. A
### MatchFailure from the matcher is recorded as the match error and the
### scan pointer is put back where it started. Raises DelimiterError when
### +delimiters+ contains no bracket character.
242 def scanBracketed( delimiters="{([<", prefix='\s*' )
# An explicit nil for +delimiters+ falls back to the default bracket set.
243 delimiters ||= "{([<"
# Accept the prefix as either a Regexp or a pattern String.
246 prefix = prefix.to_re unless prefix.kind_of?( Regexp )
248 debugMsg( 1, "Scanning for bracketed text: delimiters = (%s), prefix = (%s)",
251 self.matchError = nil
253 # Split the left-delimiters (brackets) from the quote delimiters.
254 ldel = delimiters.dup
# Quote characters found in the specification become a '|'-joined
# alternation, or nil when there are none.
255 qdel = ldel.squeeze.split(//).find_all {|char| char =~ /["'`]/ }.join('|')
256 qdel = nil if qdel.empty?
# A 'q' in the specification turns on quotelike handling in the matcher.
257 quotelike = true if ldel =~ /q/
259 # Change all instances of delimiters to the left-hand versions, and
260 # strip away anything but bracketing delimiters
261 ldel = ldel.tr( '[](){}<>', '[[(({{<<' ).gsub(/[^#{Regexp.quote('[\\](){}<>')}]+/, '').squeeze
263 ### Now build the right-delim equivalent of the left delim string
# tr! returns nil when it changes nothing, i.e. when there was no left
# bracket to translate.
265 unless rdel.tr!( '[({<', '])}>' )
266 raise DelimiterError, "Did not find a suitable bracket in delimiter: '#{delimiters}'"
269 # Build regexps from both bracketing delimiter strings
# Each becomes a '|'-joined alternation of regexp-escaped single characters.
270 ldel = ldel.split(//).collect {|ch| Regexp.quote(ch)}.join('|')
271 rdel = rdel.split(//).collect {|ch| Regexp.quote(ch)}.join('|')
273 depth = self.scanDepth
# Remember the starting offset so a failed match can be rolled back.
275 startPos = self.pointer
278 result = matchBracketed( prefix, ldel, qdel, quotelike, rdel )
279 rescue MatchFailure => e
280 debugMsg( depth + 1, "Match error: %s" % e.message )
281 self.matchError = e.message
282 self.pointer = startPos
285 self.pointer = startPos
293 ### Match using the #scanBracketed method, but only return the match or nil.
### Convenience wrapper around #scanBracketed: forwards every argument
### unchanged and gives up with nil as soon as the scan reports no match.
294 def extractBracketed( *args )
295 rval = scanBracketed( *args ) or return nil
300 ### Starting at the scan pointer, try to match a substring with
301 ### #scanBracketed. On a successful match, this method advances the scan
302 ### pointer and returns the length of the match, including the delimiters
303 ### and any prefix that was skipped. On failure, returns nil.
### Try a #scanBracketed with the given arguments and return a length
### computed from its result, or nil when the scan does not match.
304 def skipBracketed( *args )
# Remember where the scan began; the pointer is set back to it below.
305 startPos = self.pointer
307 match = scanBracketed( *args )
309 return nil unless match
# NOTE(review): no local or parameter named `prefix` is defined in this
# method, and #scanBracketed is documented as returning a Hash (whose
# length is its number of keys) -- confirm whether this was meant to be
# match[:match].length + match[:prefix].length.
310 return match.length + prefix.length
312 debugMsg( 2, "Resetting scan pointer." )
313 self.pointer = startPos
317 ### Extracts and segments text from the scan pointer forward that occurs
318 ### between (balanced) specified tags, after skipping the specified
319 ### <tt>prefix</tt>. If the opentag argument is <tt>nil</tt>, a pattern which
320 ### will match any standard HTML/XML tag will be used. If the
321 ### <tt>closetag</tt> argument is <tt>nil</tt>, a pattern is created which
322 ### prepends a <tt>/</tt> character to the matched opening tag, after any
323 ### bracketing characters. The <tt>options</tt> argument is a Hash of one or
324 ### more options which govern the matching operation. They are described in
325 ### more detail in the Description section of 'lib/DelimScanner.rb'. On a
326 ### successful match, this method advances the scan pointer and returns an
328 ### [<tt>:match</tt>]
329 ### The text of the match, including the delimiting tags.
330 ### [<tt>:prefix</tt>]
331 ### The matched prefix, if any.
333 ### On failure, returns nil.
### Prepare tag patterns and options, then hand the work to #matchTagged.
### Raises ArgumentError unless +options+ is a Hash. A MatchFailure from the
### matcher is recorded as the match error and the scan pointer is put back
### where it started.
334 def scanTagged( opentag=nil, closetag=nil, prefix='\s*', options={} )
# Without an explicit opening tag, use a pattern for any tag: '<', a word,
# then quoted attribute text or any non-'>' characters, then '>'.
337 ldel = opentag || %Q,<\\w+(?:#{ makeDelimPattern(%q:'":) }|[^>])*>,
339 raise ArgumentError, "Options argument must be a hash" unless options.kind_of?( Hash )
341 failmode = options[:fail]
# :reject and :ignore may each be a single pattern or an Array of patterns;
# an Array is joined into one alternation, and a missing option becomes ''.
342 bad = if options[:reject].is_a?( Array ) then
343 options[:reject].join("|")
345 (options[:reject] || '')
347 ignore = if options[:ignore].is_a?( Array ) then
348 options[:ignore].join("|")
350 (options[:ignore] || '')
353 self.matchError = nil
# Remember the starting offset so a failed match can be rolled back.
355 startPos = self.pointer
357 depth = self.scanDepth
360 result = matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
361 rescue MatchFailure => e
362 debugMsg( depth + 1, "Match error: %s" % e.message )
363 self.matchError = e.message
364 self.pointer = startPos
367 self.pointer = startPos
375 ### Match using the #scanTagged method, but only return the match or nil.
### Convenience wrapper around #scanTagged: forwards every argument
### unchanged and gives up with nil as soon as the scan reports no match.
376 def extractTagged( *args )
377 rval = scanTagged( *args ) or return nil
382 ### Starting at the scan pointer, try to match a substring with
383 ### #scanTagged. On a successful match, this method advances the scan
384 ### pointer and returns the length of the match, including any delimiters
385 ### and any prefix that was skipped. On failure, returns nil.
### Try a #scanTagged with the given arguments and return a length computed
### from its result, or nil when the scan does not match.
386 def skipTagged( *args )
# Remember where the scan began; the pointer is set back to it below.
387 startPos = self.pointer
389 match = scanTagged( *args )
391 return nil unless match
# NOTE(review): no local or parameter named `prefix` is defined in this
# method, and #scanTagged is documented as returning a Hash (whose length
# is its number of keys) -- confirm whether this was meant to be
# match[:match].length + match[:prefix].length.
392 return match.length + prefix.length
394 debugMsg( 2, "Resetting scan pointer." )
395 self.pointer = startPos
400 # Since the extract_quotelike function isn't documented at all in
401 # Text::Balanced, I'm only guessing this is correct...
403 ### Starting from the scan pointer, try to match any one of the various Ruby
404 ### quotes and quotelike operators after skipping the specified
405 ### <tt>prefix</tt>. Nested backslashed delimiters, embedded balanced
406 ### bracket delimiters (for the quotelike operators), and trailing modifiers
407 ### are all caught. If <tt>matchRawRegex</tt> is <tt>true</tt>, inline
408 ### regexen (eg., <tt>/pattern/</tt>) are matched as well. Advances the scan
409 ### pointer and returns a Hash with the following key/value pairs on
412 ### [<tt>:match</tt>]
413 ### The entire text of the match.
414 ### [<tt>:prefix</tt>]
415 ### The matched prefix, if any.
416 ### [<tt>:quoteOp</tt>]
417 ### The name of the quotelike operator (if any) (eg., '%Q', '%r', etc).
418 ### [<tt>:leftDelim</tt>]
419 ### The left delimiter of the first block of the operation.
420 ### [<tt>:delimText</tt>]
421 ### The text of the first block of the operation.
422 ### [<tt>:rightDelim</tt>]
423 ### The right delimiter of the first block of the operation.
424 ### [<tt>:modifiers</tt>]
425 ### The trailing modifiers on the operation (if any).
427 ### On failure, returns nil.
### Clear the match error and hand the work to #matchQuotelike. A
### MatchFailure from the matcher is recorded as the match error and the
### scan pointer is put back where it started.
428 def scanQuotelike( prefix='\s*', matchRawRegex=true )
430 self.matchError = nil
# Remember the starting offset so a failed match can be rolled back.
432 startPos = self.pointer
434 depth = self.scanDepth
437 result = matchQuotelike( prefix, matchRawRegex )
438 rescue MatchFailure => e
439 debugMsg( depth + 1, "Match error: %s" % e.message )
440 self.matchError = e.message
441 self.pointer = startPos
444 self.pointer = startPos
452 ### Match using the #scanQuotelike method, but only return the match or nil.
### Convenience wrapper around #scanQuotelike: forwards every argument
### unchanged and gives up with nil as soon as the scan reports no match.
453 def extractQuotelike( *args )
454 rval = scanQuotelike( *args ) or return nil
459 ### Starting at the scan pointer, try to match a substring with
460 ### #scanQuotelike. On a successful match, this method advances the scan
461 ### pointer and returns the length of the match, including any delimiters
462 ### and any prefix that was skipped. On failure, returns nil.
### Try a #scanQuotelike with the given arguments and return a length
### computed from its result, or nil when the scan does not match.
463 def skipQuotelike( *args )
# Remember where the scan began; the pointer is set back to it below.
464 startPos = self.pointer
466 match = scanQuotelike( *args )
468 return nil unless match
# NOTE(review): no local or parameter named `prefix` is defined in this
# method, and #scanQuotelike is documented as returning a Hash (whose
# length is its number of keys) -- confirm whether this was meant to be
# match[:match].length + match[:prefix].length.
469 return match.length + prefix.length
471 debugMsg( 2, "Resetting scan pointer." )
472 self.pointer = startPos
476 ### Starting from the scan pointer, try to match a Ruby variable after
477 ### skipping the specified prefix.
### Clear the match error and hand the work to #matchVariable. A
### MatchFailure from the matcher is recorded as the match error and the
### scan pointer is put back where it started.
478 def scanVariable( prefix='\s*' )
479 self.matchError = nil
# Remember the starting offset so a failed match can be rolled back.
481 startPos = self.pointer
483 depth = self.scanDepth
486 result = matchVariable( prefix )
487 rescue MatchFailure => e
488 debugMsg( depth + 1, "Match error: %s" % e.message )
489 self.matchError = e.message
490 self.pointer = startPos
493 self.pointer = startPos
501 ### Match using the #scanVariable method, but only return the match or nil.
### Convenience wrapper around #scanVariable: forwards every argument
### unchanged and gives up with nil as soon as the scan reports no match.
502 def extractVariable( *args )
503 rval = scanVariable( *args ) or return nil
508 ### Starting at the scan pointer, try to match a substring with
509 ### #scanVariable. On a successful match, this method advances the scan
510 ### pointer and returns the length of the match, including any delimiters
511 ### and any prefix that was skipped. On failure, returns nil.
### Try a #scanVariable with the given arguments and return a length
### computed from its result, or nil when the scan does not match.
512 def skipVariable( *args )
# Remember where the scan began; the pointer is set back to it below.
513 startPos = self.pointer
515 match = scanVariable( *args )
517 return nil unless match
# NOTE(review): no local or parameter named `prefix` is defined in this
# method, and #matchVariable builds a Hash result (whose length is its
# number of keys) -- confirm whether this was meant to be
# match[:match].length + match[:prefix].length.
518 return match.length + prefix.length
520 debugMsg( 2, "Resetting scan pointer." )
521 self.pointer = startPos
525 ### Starting from the scan pointer, and skipping the specified
526 ### <tt>prefix</tt>, try to recognize and match a balanced bracket-,
527 ### do/end-, or begin/end-delimited substring that may contain unbalanced
528 ### delimiters inside quotes or quotelike operations.
### Clear the match error, settle the delimiter tables and hand the work to
### #matchCodeblock. A MatchFailure from the matcher is recorded as the match
### error and the scan pointer is put back where it started.
529 def scanCodeblock( innerDelim=CodeblockDelimiters, prefix='\s*', outerDelim=innerDelim )
530 self.matchError = nil
# Remember the starting offset so a failed match can be rolled back.
532 startPos = self.pointer
# An explicit nil selects the defaults: the standard delimiter table for the
# inner pairs, and the inner pairs for the outer ones.
535 innerDelim ||= CodeblockDelimiters
536 outerDelim ||= innerDelim
# Collect the call-stack frames that belong to the scan* entry points.
# NOTE(review): the pattern expects frames formatted as in `name'; confirm
# this still matches the backtrace format of the Ruby version in use.
538 depth = caller(1).find_all {|frame|
539 frame =~ /in `scan(Variable|Tagged|Codeblock|Bracketed|Quotelike)'/
543 debugMsg 3, "Calling matchCodeBlock( %s, %s, %s )",
544 prefix.inspect, innerDelim.inspect, outerDelim.inspect
545 result = matchCodeblock( prefix, innerDelim, outerDelim )
546 rescue MatchFailure => e
547 debugMsg( depth + 1, "Match error: %s" % e.message )
548 self.matchError = e.message
549 self.pointer = startPos
552 self.pointer = startPos
560 ### Match using the #scanCodeblock method, but only return the match or nil.
### Convenience wrapper around #scanCodeblock: forwards every argument
### unchanged and gives up with nil as soon as the scan reports no match.
561 def extractCodeblock( *args )
562 rval = scanCodeblock( *args ) or return nil
567 ### Starting at the scan pointer, try to match a substring with
568 ### #scanCodeblock. On a successful match, this method advances the scan
569 ### pointer and returns the length of the match, including any delimiters
570 ### and any prefix that was skipped. On failure, returns nil.
### Try a #scanCodeblock with the given arguments and return a length
### computed from its result, or nil when the scan does not match.
571 def skipCodeblock( *args )
# Remember where the scan began; the pointer is set back to it below.
572 startPos = self.pointer
574 match = scanCodeblock( *args )
576 return nil unless match
# NOTE(review): no local or parameter named `prefix` is defined in this
# method, and #matchCodeblock builds a Hash result (whose length is its
# number of keys) -- confirm whether this was meant to be
# match[:match].length + match[:prefix].length.
577 return match.length + prefix.length
579 debugMsg( 2, "Resetting scan pointer." )
580 self.pointer = startPos
590 ### Scan the string from the scan pointer forward, skipping the specified
591 ### <tt>prefix</tt> and trying to match a string delimited by bracketing
592 ### delimiters <tt>ldel</tt> and <tt>rdel</tt> (Regexp objects), and quoting
593 ### delimiters <tt>qdel</tt> (Regexp). If <tt>quotelike</tt> is
594 ### <tt>true</tt>, Ruby quotelike constructs will also be honored.
### Worker for #scanBracketed. After the prefix and an opening bracket, walks
### the text while tracking open brackets on a stack, skipping backslashed
### characters, quoted chunks (when +qdel+ is given) and quotelike constructs
### (when +quotelike+ is set). Raises MatchFailure for a missing prefix or
### opening bracket, a mismatched or unmatched bracket, or an unterminated
### embedded quote. The result carries the bracketed text under :match and
### the skipped prefix under :prefix.
595 def matchBracketed( prefix, ldel, qdel, quotelike, rdel )
596 startPos = self.pointer
597 debugMsg( 2, "matchBracketed starting at pos = %d: prefix = %s, "\
598 "ldel = %s, qdel = %s, quotelike = %s, rdel = %s",
599 startPos, prefix.inspect, ldel.inspect, qdel.inspect, quotelike.inspect,
602 # Test for the prefix, failing if not found
603 raise MatchFailure, "Did not find prefix: #{prefix.inspect}" unless
606 # Mark this position as the left-delimiter pointer
607 ldelpos = self.pointer
608 debugMsg( 3, "Found prefix. Left delim pointer at %d", ldelpos )
610 # Match opening delimiter or fail
611 unless (( delim = self.scan(ldel) ))
612 raise MatchFailure, "Did not find opening bracket after prefix: '%s' (%d)" %
613 [ self.string[startPos..ldelpos].chomp, ldelpos ]
616 # A stack to keep track of nested delimiters
618 debugMsg( 3, "Found opening bracket. Nesting = %s", nesting.inspect )
622 debugMsg( 5, "Starting scan loop. Nesting = %s", nesting.inspect )
624 # Skip anything that's backslashed
625 if self.skip( /\\./ )
626 debugMsg( 4, "Skipping backslashed literal at offset %d: '%s'",
627 self.pointer - 2, self.string[ self.pointer - 2, 2 ].chomp )
631 # Opening bracket (left delimiter)
634 debugMsg( 4, "Found opening delim %s at offset %d",
635 delim.inspect, self.pointer - 1 )
638 # Closing bracket (right delimiter)
639 elsif self.scan(rdel)
642 debugMsg( 4, "Found closing delim %s at offset %d",
643 delim.inspect, self.pointer - 1 )
645 # :TODO: When is this code reached?
647 raise MatchFailure, "Unmatched closing bracket '%s' at offset %d" %
648 [ delim, self.pointer - 1 ]
651 # Figure out what the compliment of the bracket next off the
# The opening bracket popped from the stack is translated to the closing
# bracket that should appear here.
653 expected = nesting.pop.tr( '({[<', ')}]>' )
654 debugMsg( 4, "Got a '%s' bracket off nesting stack", expected )
656 # Check for mismatched brackets
658 raise MatchFailure, "Mismatched closing bracket at offset %d: "\
659 "Expected '%s', but found '%s' instead." %
660 [ self.pointer - 1, expected, delim ]
663 # If we've found the closing delimiter, stop scanning
665 debugMsg( 4, "Finished with scan: nesting stack empty." )
669 # Quoted chunk (quoted delimiter)
670 elsif qdel && self.scan(qdel)
# Consume up to the next unescaped occurrence of the same quote character.
673 if self. scan( /[^\\#{match}]*(?:\\.[^\\#{match}]*)*(#{Regexp::quote(match)})/ )
674 debugMsg( 4, "Skipping quoted chunk. Scan pointer now at offset %d", self.pointer )
678 raise MatchFailure, "Unmatched embedded quote (%s) at offset %d" %
679 [ match, self.pointer - 1 ]
# Quotelike constructs are skipped with #scanQuotelike's default arguments.
682 elsif quotelike && self.scanQuotelike
683 debugMsg( 4, "Matched a quotelike. Scan pointer now at offset %d", self.pointer )
686 # Skip word characters, or a single non-word character
688 self.skip( /(?:[a-zA-Z0-9]+|.)/m )
689 debugMsg 5, "Skipping '%s' at offset %d." %
690 [ self.matched, self.pointer ]
695 # If there's one or more brackets left on the delimiter stack, we're
696 # missing a closing delim.
697 unless nesting.empty?
698 raise MatchFailure, "Unmatched opening bracket(s): %s.. at offset %d" %
699 [ nesting.join('..'), self.pointer ]
# :match runs from the opening bracket to the character before the scan
# pointer; :prefix is whatever lay between the start and that bracket.
703 :match => self.string[ ldelpos .. (self.pointer - 1) ],
704 :prefix => self.string[ startPos, (ldelpos-startPos) ],
706 debugMsg 1, "matchBracketed succeeded: %s" % rval.inspect
711 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
712 ### try to match text bracketed by the given left and right tag-delimiters
713 ### (<tt>ldel</tt> and <tt>rdel</tt>).
### Worker for #scanTagged. After the prefix and an opening tag matching
### +ldel+, scans forward for the closing tag, skipping backslashed
### characters and +ignore+ matches, recursing into nested opening tags and
### reacting to +bad+ matches according to +failmode+. Raises MatchFailure
### for a missing prefix or opening tag, an invalid or unbalanced nested tag,
### or a missing closing tag when +failmode+ is neither :max nor :para. The
### result carries the tagged text under :match and the skipped prefix under
### :prefix.
714 def matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
# Normalise the fail mode to a Symbol so it can be given as String or Symbol.
715 failmode = failmode.to_s.intern if failmode
716 startPos = self.pointer
717 debugMsg 2, "matchTagged starting at pos = %d: prefix = %s, "\
718 "ldel = %s, rdel = %s, failmode = %s, bad = %s, ignore = %s",
719 startPos, prefix.inspect, ldel.inspect, rdel.inspect,
720 failmode.inspect, bad.inspect, ignore.inspect
# Position markers; all start out unset.
723 openTagPos, textPos, paraPos, closeTagPos, endPos = ([nil] * 5)
726 # Look for the prefix
727 raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
730 openTagPos = self.pointer
731 debugMsg 3, "Found prefix. Pointer now at offset %d" % self.pointer
733 # Look for the opening delimiter
734 unless (( match = self.scan(ldel) ))
735 raise MatchFailure, "Did not find opening tag %s at offset %d" %
736 [ ldel.inspect, self.pointer ]
739 textPos = self.pointer
740 debugMsg 3, "Found left delimiter '%s': offset now %d" % [ match, textPos ]
742 # Make a right delim out of the tag we found if none was specified
744 rdelspec = makeClosingTag( match )
745 debugMsg 3, "Generated right-delimiting tag: %s" % rdelspec.inspect
747 # Make the regexp-related globals from the match
# Unescaped $1..$9 references in +rdel+ are rewritten to self[N] and the
# result is passed through String#interpolate with the current binding.
# NOTE(review): the rewritten text is a bare self[N], not #{self[N]};
# check whether the interpolation step actually expands it.
748 rdelspec = rdel.gsub( /(\A|[^\\])\$([1-9])/, '\1self[\2]' ).interpolate( binding )
749 debugMsg 3, "Right delimiter (after interpolation) is: %s" % rdelspec.inspect
752 # Process until we reach the end of the string or find a closing tag
753 while self.rest? && closeTagPos.nil?
755 # Skip backslashed characters
756 if (( self.skip( /^\\./ ) ))
757 debugMsg 4, "Skipping backslashed literal at offset %d" % self.pointer
760 # Match paragraphs-break for fail == :para
761 elsif (( matchlength = self.skip( /^(\n[ \t]*\n)/ ) ))
# Only the first paragraph break seen is remembered.
762 paraPos ||= self.pointer - matchlength
763 debugMsg 4, "Found paragraph position at offset %d" % paraPos
766 elsif (( matchlength = self.skip( rdelspec ) ))
# The closing tag starts where the pointer was before the skip.
767 closeTagPos = self.pointer - matchlength
768 debugMsg 3, "Found closing tag at offset %d" % closeTagPos
770 # If we're ignoring anything, try to match and move beyond it
771 elsif ignore && !ignore.empty? && self.skip(ignore)
772 debugMsg 3, "Skipping ignored text '%s' at offset %d" %
773 [ self.matched, self.pointer - self.matched_size ]
776 # If there's a "bad" pattern, try to match it, shorting the
777 # outer loop if it matches in para or max mode, or failing with
778 # a match error if not.
779 elsif bad && !bad.empty? && self.match?( bad )
780 if failmode == :para || failmode == :max
783 raise MatchFailure, "Found invalid nested tag '%s' at offset %d" %
784 [ match, self.pointer ]
787 # If there's another opening tag, make a recursive call to
788 # ourselves to move the cursor beyond it
789 elsif (( match = self.scan( ldel ) ))
793 unless self.matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
794 break if failmode == :para || failmode == :max
796 raise MatchFailure, "Found unbalanced nested tag '%s' at offset %d" %
797 [ tag, self.pointer ]
802 debugMsg 5, "Advanced scan pointer to offset %d" % self.pointer
806 # If the closing hasn't been found, then it's a "short" match, which is
807 # okay if the failmode indicates we don't care. Otherwise, it's an error.
809 debugMsg 3, "No close tag position found. "
811 if failmode == :max || failmode == :para
812 closeTagPos = self.pointer - 1
813 debugMsg 4, "Failmode %s tolerates no closing tag. Close tag position set to %d" %
814 [ failmode.inspect, closeTagPos ]
816 # Sync the scan pointer and the paragraph marker if it's set.
817 if failmode == :para && paraPos
818 self.pointer = paraPos + 1
821 raise MatchFailure, "No closing tag found."
# :match runs from the opening tag to the character before the scan
# pointer; :prefix is whatever lay between the start and that tag.
826 :match => self.string[ openTagPos .. (self.pointer - 1) ],
827 :prefix => self.string[ startPos, (openTagPos-startPos) ],
829 debugMsg 1, "matchTagged succeeded: %s" % rval.inspect
834 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
835 ### try to match text inside a Ruby quotelike construct. If
836 ### <tt>matchRawRegex</tt> is <tt>true</tt>, the regex construct
837 ### <tt>/pattern/</tt> is also matched.
### Worker for #scanQuotelike. After the prefix, recognises one of three
### forms: a simple quote (", ` or ', plus / when +matchRawRegex+ is set), a
### %-style quotelike operator, or a here-document. Raises MatchFailure when
### the prefix, a closing delimiter or a heredoc terminator cannot be found,
### or when none of the three forms is present. The result describes the
### match with :prefix, :match, :leftDelim, :delimText and :rightDelim, plus
### :modifiers for the first two forms.
838 def matchQuotelike( prefix, matchRawRegex )
839 startPos = self.pointer
840 debugMsg 2, "matchQuotelike starting at pos = %d: prefix = %s, "\
841 "matchRawRegex = %s",
842 startPos, prefix.inspect, matchRawRegex.inspect
844 # Init position markers
845 rval = oppos = preldpos = ldpos = strpos = rdpos = modpos = nil
847 # Look for the prefix
848 raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
852 # Peek at the next character
853 # If the initial quote is a simple quote, our job is easy
854 if self.check(/^["`']/) || ( matchRawRegex && self.check(%r:/:) )
# check does not advance the pointer; it only records which quote was seen.
855 initial = self.matched
857 # Build the pattern for matching the simple string
858 pattern = "%s [^\\%s]* (\\.[^\\%s]*)* %s" %
859 [ Regexp.quote(initial),
861 Regexp.quote(initial) ]
862 debugMsg 2, "Matching simple quote at offset %d with /%s/" %
863 [ self.pointer, pattern ]
865 # Search for it, raising an exception if it's not found
# The x flag makes the spaces in the pattern built above insignificant.
866 unless self.scan( /#{pattern}/xism )
868 "Did not find closing delimiter to match '%s' at '%s...' (offset %d)" %
869 [ initial, self.string[ oppos, 20 ].chomp, self.pointer ]
872 modpos = self.pointer
875 # If we're matching a regex, look for any trailing modifiers
877 pattern = if RUBY_VERSION >= "1.7.3" then /[imoxs]*/ else /[imox]*/ end
882 :prefix => self.string[ startPos, (oppos-startPos) ],
883 :match => self.string[ oppos .. (self.pointer - 1) ],
884 :leftDelim => self.string[ oppos, 1 ],
885 :delimText => self.string[ (oppos+1) .. (rdpos-1) ],
886 :rightDelim => self.string[ rdpos, 1 ],
887 :modifiers => self.string[ modpos, (self.pointer-modpos) ],
891 # If it's one of the fancy quotelike operators, our job is somewhat
892 # complicated (though nothing like Perl's, thank the Goddess)
893 elsif self.scan( %r:%[rwqQx]?(?=\S): )
895 debugMsg 2, "Matching a real quotelike ('%s') at offset %d" %
902 # Peek ahead to see what the delimiter is
903 ldel = self.check( /\S/ )
905 # If it's a bracketing character, just use matchBracketed
907 rdel = ldel.tr( '[({<', '])}>' )
908 debugMsg 4, "Left delim is a bracket: %s; looking for compliment: %s" %
# Empty prefix, no quote delimiters and no quotelike handling inside.
910 self.matchBracketed( '', Regexp::quote(ldel), nil, nil, Regexp::quote(rdel) )
912 debugMsg 4, "Left delim isn't a bracket: '#{ldel}'; looking for closing instance"
# NOTE(review): the delimiter is interpolated into this pattern without
# Regexp.quote, so a delimiter that is a regexp metacharacter may not
# match as intended -- confirm.
913 self.scan( /#{ldel}[^\\#{ldel}]*(\\.[^\\#{ldel}]*)*#{ldel}/ ) or
915 "Can't find a closing delimiter '%s' at '%s...' (offset %d)" %
916 [ ldel, self.rest[0,20].chomp, self.pointer ]
918 rdelpos = self.pointer - 1
920 # Match modifiers for Regexp quote
922 pattern = if RUBY_VERSION >= "1.7.3" then /[imoxs]*/ else /[imox]*/ end
923 modifiers = self.scan( pattern ) || ''
927 :prefix => self.string[ startPos, (oppos-startPos) ],
928 :match => self.string[ oppos .. (self.pointer - 1) ],
930 :leftDelim => self.string[ ldpos, 1 ],
931 :delimText => self.string[ strpos, (rdelpos-strpos) ],
932 :rightDelim => self.string[ rdelpos, 1 ],
933 :modifiers => modifiers,
936 # If it's a here-doc, things get even hairier.
937 elsif self.scan( %r:<<(-)?: )
938 debugMsg 2, "Matching a here-document at offset %d" % self.pointer
941 # If there was a dash, start with optional whitespace
942 indent = self[1] ? '\s*' : ''
# The terminator is either a bare identifier or a quoted label.
947 if self.scan( /[A-Za-z_]\w*/ )
949 debugMsg 3, "Setting heredoc terminator to bare identifier '%s'" % label
952 elsif self.scan( / ' ([^'\\]* (?:\\.[^'\\]*)*) ' /sx ) ||
953 self.scan( / " ([^"\\]* (?:\\.[^"\\]*)*) " /sx ) ||
954 self.scan( / ` ([^`\\]* (?:\\.[^`\\]*)*) ` /sx )
956 debugMsg 3, "Setting heredoc terminator to quoted identifier '%s'" % label
958 # Ruby, unlike Perl, requires a terminal, even if it's only an empty
962 "Missing heredoc terminator before end of line at "\
963 "'%s...' (offset %d)" %
964 [ self.rest[0,20].chomp, self.pointer ]
966 extrapos = self.pointer
968 # Advance to the beginning of the string
970 strpos = self.pointer
971 debugMsg 3, "Scanning until /\\n#{indent}#{label}\\n/m"
# The terminator must sit on a line of its own, after optional indentation
# when the <<- form was used.
974 unless self.scan_until( /\n#{indent}#{label}\n/m )
976 "Couldn't find heredoc terminator '%s' after '%s...' (offset %d)" %
977 [ label, self.rest[0,20].chomp, self.pointer ]
# The terminator begins where the scan_until match began.
980 rdpos = self.pointer - self.matched_size
983 :prefix => self.string[ startPos, (oppos-startPos) ],
984 :match => self.string[ oppos .. (self.pointer - 1) ],
986 :leftDelim => self.string[ ldpos, (extrapos-ldpos) ],
987 :delimText => self.string[ strpos, (rdpos-strpos) ],
988 :rightDelim => self.string[ rdpos, (self.pointer-rdpos) ],
993 "No quotelike operator found after prefix at '%s...'" %
994 self.rest[0,20].chomp
998 debugMsg 1, "matchQuotelike succeeded: %s" % rval.inspect
1003 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
1004 ### try to match text that is a valid Ruby variable or identifier, ...?
### Worker for #scanVariable. After the prefix, matches either a predefined
### global or an identifier, then keeps consuming chained method calls,
### trailing blocks and element references. Raises MatchFailure when the
### prefix or an identifier cannot be found. The result carries the matched
### text under :match and the skipped prefix under :prefix.
1005 def matchVariable( prefix )
1006 startPos = self.pointer
1007 debugMsg 2, "matchVariable starting at pos = %d: prefix = %s",
1008 startPos, prefix.inspect
1010 # Look for the prefix
1011 raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
# The variable itself starts here, after the prefix.
1014 varPos = self.pointer
1016 # If the variable matched is a predefined global, no need to look for an
# Matches '$' followed by one punctuation character, a dash and a word
# character, or a run of digits.
1018 unless self.scan( %r~\$(?:[!@/\\,;.<>$?:_\~&`'+]|-\w|\d+)~ )
1020 debugMsg 2, "Not a predefined global at '%s...' (offset %d)" %
1021 [ self.rest[0,20].chomp, self.pointer ]
1023 # Look for a valid identifier
1024 unless self.scan( /\*?(?:[$@]|::)?(?:[a-z_]\w*(?:::\s*))*[_a-z]\w*/is )
1025 raise MatchFailure, "No variable found: Bad identifier (offset %d)" % self.pointer
1029 debugMsg 2, "Matched '%s' at offset %d" % [ self.matched, self.pointer ]
1031 # Match methodchain with trailing codeblock
1033 # Match a regular chained method
# The chained method name is passed as the prefix of the codeblock scan.
# NOTE(review): the name pattern needs at least two characters
# ([a-zA-Z_]\w+), so a one-letter method name does not match here --
# confirm that is intended.
1034 next if scanCodeblock( {"("=>")", "do"=>"end", "begin"=>"end", "{"=>"}"},
1035 /\s*(?:\.|::)\s*[a-zA-Z_]\w+\s*/ )
1037 # Match a trailing block or an element ref
1038 next if scanCodeblock( nil, /\s*/, {'{' => '}', '[' => ']'} )
1040 # This matched a dereferencer in Perl, which doesn't have any
1041 # equivalent in Ruby.
1042 #next if scanVariable( '\s*(\.|::)\s*' )
1044 # Match a method call without parens (?)
# The pattern is given as a String; the casting delegator for #scan turns
# it into a Regexp.
1045 next if self.scan( '\s*(\.|::)\s*\w+(?![{([])' )
# :match runs from the start of the variable to the character before the
# scan pointer; :prefix is whatever lay before the variable.
1051 :match => self.string[ varPos .. (self.pointer - 1) ],
1052 :prefix => self.string[ startPos, (varPos-startPos) ],
1054 debugMsg 1, "matchVariable succeeded: %s" % rval.inspect
1059 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
1060 ### try to match text inside a Ruby code block construct which must be
1061 ### delimited by the specified <tt>outerDelimPairs</tt>. It may optionally
1062 ### contain sub-blocks delimited with the given <tt>innerDelimPairs</tt>.
###
### +outerDelimPairs+ is used as an opening => closing delimiter mapping: its
### keys and values are quoted and OR-ed together into the left and right
### outer-delimiter patterns, and the opener that actually matched is looked
### up in it to find the one closer that ends the scan. +innerDelimPairs+ is
### only handed on to #scanCodeblock for embedded blocks.
###
### The result is built with (at least) a <tt>:match</tt> entry (the text from
### the opening delimiter up to the current scan pointer) and a
### <tt>:prefix</tt> entry (the skipped prefix text).
###
### Raises MatchFailure when the prefix or the opening delimiter cannot be
### found, when a stray outer opening delimiter is seen inside the block, or
### when no match is found for the opening bracket; raises DelimiterError when
### the matched opener has no entry in +outerDelimPairs+.
###
### NOTE(review): the embedded line numbering of this listing skips in several
### places inside this method (e.g. directly after the trailing `unless` of
### the prefix check, around the scan loop header, and inside the operator
### regexp), so some statements are not visible here -- confirm any change
### against the complete source.
1063 def matchCodeblock( prefix, innerDelimPairs, outerDelimPairs )
# Remember where we started so the skipped prefix can be reported later.
1064 startPos = self.pointer
1065 debugMsg 2, "Starting matchCodeblock at offset %d (%s)", startPos, self.rest.inspect
1067 # Look for the prefix
1068 raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
# codePos marks the first character after the prefix, i.e. where the opening
# delimiter is expected.
1070 codePos = self.pointer
1071 debugMsg 3, "Skipped prefix '%s' to offset %d" %
1072 [ self.matched, codePos ]
1074 # Build a regexp for the outer delimiters
# Both are plain Strings holding regexp source: "(a|b|...)" with every
# delimiter quoted so it is matched literally.
1075 ldelimOuter = "(" + outerDelimPairs.keys .uniq.collect {|delim| Regexp::quote(delim)}.join('|') + ")"
1076 rdelimOuter = "(" + outerDelimPairs.values.uniq.collect {|delim| Regexp::quote(delim)}.join('|') + ")"
1077 debugMsg 4, "Using /%s/ as the outer delim regex" % ldelimOuter
# The pattern is passed as a String; per the file's header comments the
# scanning methods of this class accept Strings as well as Regexps.
1079 unless self.scan( ldelimOuter )
1080 raise MatchFailure, %q:Did not find opening bracket at "%s..." offset %d: %
1081 [ self.rest[0,20].chomp, codePos ]
1084 # Look up the corresponding outer delimiter
# `or` binds more loosely than `=`, so the assignment happens first and the
# raise only fires when the lookup produced nil/false.
1085 closingDelim = outerDelimPairs[self.matched] or
1086 raise DelimiterError, "Could not find closing delimiter for '%s'" %
1089 debugMsg 3, "Scanning for closing delim '#{closingDelim}'"
1093 # Scan until the end of the text or until an explicit break
1095 debugMsg 5, "Scanning from offset %d (%s)", self.pointer, self.rest.inspect
1099 debugMsg 5, "Trying to match a comment"
# A comment runs from '#' to the end of the line ('.' does not cross a
# newline without the /m flag).
1100 if self.scan( /\s*#.*/ )
1101 debugMsg 4, "Skipping comment '%s' to offset %d" %
1102 [ self.matched, self.pointer ]
1106 # Look for (any) closing delimiter
# NOTE(review): inside a double-quoted string "\s" is the escape for a space,
# so this debug message prints " *(...)" rather than the "\s*" of the regexp
# actually used on the next line.
1107 debugMsg 5, "Trying to match a closing outer delimiter with /\s*(#{rdelimOuter})/"
1108 if self.scan( /\s*(#{rdelimOuter})/ )
1109 debugMsg 4, "Found a right delimiter '#{self.matched}'"
1111 # If it's the delimiter we're looking for, stop the scan
# The scan pattern allows leading whitespace, hence the strip before
# comparing with the expected closer.
1112 if self.matched.strip == closingDelim
1113 matched = self.matched
1114 debugMsg 3, "Found the closing delimiter we've been looking for (#{matched.inspect})."
1117 # Otherwise, it's an error, as we've apparently seen a closing
1118 # delimiter without a corresponding opening one.
1121 %q:Mismatched closing bracket at "%s..." (offset %s). Expected '%s': %
1122 [ self.rest[0,20], self.pointer, closingDelim ]
1126 # Try to match a variable or a quoted phrase
# NOTE(review): patvalid is assigned on lines not visible in this listing;
# it is passed through to scanQuotelike as its second argument.
1127 debugMsg 5, "Trying to match either a variable or quotelike"
1128 if self.scanVariable( '\s*' ) || self.scanQuotelike( '\s*', patvalid )
1129 debugMsg 3, "Matched either a variable or quotelike. Offset now %d" % self.pointer
1134 # Match some operators
1135 # :TODO: This hasn't really been ruby-ified
1136 debugMsg 5, "Trying to match an operator"
1137 if self.scan( %r:\s*([-+*x/%^&|.]=?
1140 | (\*\*|&&|\|\||<<|>>)=?
1141 | split|grep|map|return
1143 debugMsg 3, "Skipped miscellaneous operator '%s' to offset %d." %
1144 [ self.matched, self.pointer ]
1149 # Try to match an embedded codeblock
1150 debugMsg 5, "Trying to match an embedded codeblock with delim pairs: %s",
1151 innerDelimPairs.inspect
1152 if self.scanCodeblock( innerDelimPairs )
1153 debugMsg 3, "Skipped inner codeblock to offset %d." % self.pointer
1158 # Try to match a stray outer-left delimiter
# match? only tests for the delimiter at the scan pointer; an outer opener
# that was not consumed as an inner codeblock above is reported as bad
# nesting.
1159 debugMsg 5, "Trying to match a stray outer-left delimiter (#{ldelimOuter})"
1160 if self.match?( ldelimOuter )
1161 raise MatchFailure, "Improperly nested codeblock at offset %d: %s... " %
1162 [ self.pointer, self.rest[0,20] ]
# Fallback when nothing above matched: skip optional whitespace plus one
# word, a '->'/'=>'/'>>' arrow, any single character (including a newline,
# because of /m), or the end of the string.
1166 self.scan( /\s*(\w+|[-=>]>|.|\Z)/m )
1167 debugMsg 3, "Skipped '%s' to offset %d" %
1168 [ self.matched, self.pointer ]
1173 raise MatchFailure, "No match found for opening bracket"
# Inclusive range from the opening delimiter through the last character
# consumed by the scan.
1177 :match => self.string[codePos .. (self.pointer - 1)],
1178 :prefix => self.string[startPos, (codePos-startPos)]
1180 debugMsg 1, "matchCodeblock succeeded: %s" % rval.inspect
1185 ### Attempt to derive and return the number of scan methods traversed up to
1186 ### this point by examining the call stack.
###
### NOTE(review): the `def` line and the closing lines of this method are not
### present in this listing (the embedded numbering skips them), so the
### method name and how the collected frames are turned into a number cannot
### be confirmed from here.
# caller(2) leaves out the two innermost stack frames; the remaining frames
# are filtered down to those whose label names one of the scan* entry points.
1188 return caller(2).find_all {|frame|
# NOTE(review): this pattern expects backtrace labels of the form
# "in `scanFoo'" (backtick opening quote, bare method name). Ruby 3.4 changed
# the label format to "in 'Class#method'", which this would no longer match
# -- confirm against the Ruby version in use.
1189 frame =~ /in `scan(Variable|Tagged|Codeblock|Bracketed|Quotelike)'/
### Print the specified <tt>message</tt> to STDERR if the scanner's
### debugging level is greater than or equal to <tt>level</tt>.
###
### +level+ is the verbosity of the message and also sets its indent (two
### spaces for every level above 1). Messages with a level of zero or less
### are never printed.
### +msgFormat+ is written as-is when no +args+ are given (so it may safely
### contain '%' characters); otherwise it is used as a Kernel#format template
### for +args+.
def debugMsg( level, msgFormat, *args )
  # Non-positive levels are never shown. Checking for > 0 (rather than just
  # non-zero) also keeps the indent width below from going negative, which
  # String#* rejects with an ArgumentError.
  return unless level > 0 && self.debugLevel >= level

  msg = args.empty? ? msgFormat : format( msgFormat, *args )
  $stderr.puts( (" " * ((level - 1) * 2)) + msg )
end
### Turn a run of opening brackets into the matching run of closing brackets.
### The characters of +bracket+ (converted with #to_s) are reversed in order,
### and each '<', '[', '{' or '(' is then replaced by '>', ']', '}' or ')'
### respectively. Any other character is passed through unchanged.
def revbracket( bracket )
  mirrored = bracket.to_s.reverse
  return mirrored.tr( '<[{(', '>]})' )
end
### Given an opening <tt>tag</tt> of the sort matched by #scanTagged,
### construct and return a closing tag.
###
### The tag must start with one or more opening brackets ('[', '(', '<' or
### '{') followed by an XML name. The closing tag is made of those brackets,
### a '/', the name, and the brackets mirrored by #revbracket; it is returned
### passed through Regexp.quote so it can be used as regexp source.
###
### Raises MatchFailure if +tag+ does not have that shape.
def makeClosingTag( tag )
  debugMsg 3, "Making a closing tag for '%s'" % tag

  # The '[' inside the character class is escaped: an unescaped '[' there
  # opens a nested character class in Ruby's regexp engine and the pattern
  # would not compile.
  parts = /\A([\[(<{]+)(#{XmlName})/.match( tag ) or
    raise MatchFailure, "Unable to construct closing tag to match: #{tag}"

  brackets, name = parts[1], parts[2]
  return Regexp.quote( "#{brackets}/#{name}" + revbracket(brackets) )
end
### Make and return a new Regexp which matches substrings bounded by the
### specified +delimiters+, not counting those which have been escaped with
### the escape characters in +escapes+.
###
### +delimiters+ is a string of delimiter characters; each one produces an
### alternative that matches text opened and closed by that character.
### +escapes+ is a string of escape characters, matched up with the
### delimiters by position. If it is shorter than +delimiters+ its last
### character is used for the remaining delimiters; if it is empty, a
### backslash is used. When a delimiter is its own escape character, a
### doubled delimiter counts as an escaped one.
### +prefix+ is regexp source prepended to the pattern (optional leading
### whitespace by default).
###
### Neither argument is modified. Raises DelimiterError if +delimiters+
### contains no non-whitespace character.
def makeDelimPattern( delimiters, escapes='\\', prefix='\\s*' )
  delimiters = delimiters.to_s
  escapes = escapes.to_s

  raise DelimiterError, "Illegal delimiter '#{delimiters}'" unless delimiters =~ /\S/

  # With no escape characters at all there is nothing to pad with, so fall
  # back to the default backslash.
  escapes = '\\' if escapes.empty?

  # Pad the escapes string to the same length as the delimiters. This builds
  # a new string instead of appending in place, so the caller's string is
  # left alone, and is skipped when there are already enough escapes.
  shortfall = delimiters.length - escapes.length
  escapes += escapes[-1,1] * shortfall if shortfall > 0

  # Escape each delimiter and a corresponding escape character, and then
  # build a pattern part from them
  patParts = []
  delimiters.length.times do |i|
    del = Regexp.escape( delimiters[i, 1] )
    esc = Regexp.escape( escapes[i, 1] )

    if del == esc
      # The delimiter escapes itself by being doubled
      patParts.push "#{del}(?:[^#{del}]*(?:(?:#{del}#{del})[^#{del}]*)*)#{del}"
    else
      # The escape character protects whatever single character follows it
      patParts.push "#{del}(?:[^#{esc}#{del}]*(?:#{esc}.[^#{esc}#{del}]*)*)#{del}"
    end
  end

  # Join all the parts together and return one big pattern
  return Regexp::new( "#{prefix}(?:#{patParts.join("|")})" )
end
1258 end # class StringExtractor