]> git.pld-linux.org Git - packages/ruby-DelimScanner.git/blame - DelimScanner.rb?dl=1
- change %%ruby_mod_ver_requires_eq to %%{?ruby_mod_ver_requires_eq} -
[packages/ruby-DelimScanner.git] / DelimScanner.rb?dl=1
CommitLineData
4c45dbaf 1#!/usr/bin/ruby
2#
3# A derivative of StringScanner that can scan for delimited constructs in
4# addition to regular expressions. It is a loose port of the Text::Balanced
5# module for Perl by Damian Conway <damian@cs.monash.edu.au>.
6#
7# == Synopsis
8#
9# se = DelimScanner::new( myString )
10#
11# == Authors
12#
13# * Michael Granger <ged@FaerieMUD.org>
14#
15# Copyright (c) 2002, 2003 The FaerieMUD Consortium. Most rights reserved.
16#
17# This work is licensed under the Creative Commons Attribution License. To view
18# a copy of this license, visit http://creativecommons.org/licenses/by/1.0 or
19# send a letter to Creative Commons, 559 Nathan Abbott Way, Stanford, California
20# 94305, USA.
21#
22# == Version
23#
24# $Id$
25#
26
27require 'strscan'
28require 'forwardable'
29
30### Add some stuff to the String class to allow easy transformation to Regexp
31### and in-place interpolation.
class String

  ### Compile the receiver into a Regexp. If <tt>casefold</tt> is true the
  ### resulting pattern ignores case; if <tt>extended</tt> is true it is
  ### compiled in extended (whitespace-insensitive) mode. Both default to
  ### <tt>false</tt>, which yields a plain pattern.
  def to_re( casefold=false, extended=false )
    options = 0
    options |= Regexp::IGNORECASE if casefold
    options |= Regexp::EXTENDED if extended
    return Regexp::new( self.dup, options )
  end

  ### Ideas for String-interpolation stuff courtesy of Hal E. Fulton
  ### <hal9000@hypermetrics.com> via ruby-talk

  ### Evaluate the receiver as if it were the body of a double-quoted string
  ### literal in the context of the given Binding (<tt>scope</tt>), and return
  ### the result. Raises a TypeError if <tt>scope</tt> is not a Binding.
  def interpolate( scope )
    unless scope.is_a?( Binding )
      raise TypeError, "Argument to interpolate must be a Binding, not "\
        "a #{scope.class.name}"
    end

    # Escape bare double quotes so they cannot terminate the generated
    # literal, but step over existing backslash escapes so that an
    # already-escaped quote is not turned into an escaped backslash followed
    # by a bare quote.
    copy = self.gsub( /\\.|"/m ) {|chunk| chunk == '"' ? '\\"' : chunk }

    # NOTE(review): this eval()s the receiver; it must never be called on
    # text that comes from an untrusted source.
    eval( '"' + copy + '"', scope )
  end

end
53
54
55### A derivative of StringScanner that can scan for delimited constructs in
56### addition to regular expressions.
57class DelimScanner
58
59 ### Scanner exception classes
# Raised by the match* methods when a scan cannot succeed; the scan*
# wrappers rescue it and record its message as the match error.
class MatchFailure < RuntimeError
end

# Raised when a delimiter specification contains no usable bracket.
class DelimiterError < RuntimeError
end
62
63
64 extend Forwardable
65 StringScanner.must_C_version
66
67
68 ### Class constants
# Version number taken from the RCS Revision keyword. When the keyword has
# not been expanded (e.g. the file is kept in a VCS without keyword
# substitution) there is nothing to match, so fall back to a placeholder
# instead of calling [] on nil while the class is being loaded.
Version = ( /([\d\.]+)/.match( %q{$Revision$} ) || [] )[1] || '0.0'
70 Rcsid = %q$Id$
71
72 # Pattern to match a valid XML name
73 XmlName = '[a-zA-Z_:][a-zA-Z0-9:.-]*'
74
75
76 ### Namespace module for DelimString constants
module Default

  # Opening => closing delimiter pairs that scanCodeblock uses when the
  # caller does not supply its own.
  CodeblockDelimiters = {
    '{'     => '}',
    'begin' => 'end',
    'do'    => 'end',
  }

  # Default scanMultiple operations, each mapped to its argument list. This
  # is a one-element Array holding a single Hash.
  MultipleFunctions = [
    {
      :scanVariable  => [],
      :scanQuotelike => [],
      :scanCodeblock => [],
    },
  ]

end
95 include Default
96
97
98 ### Define delegating methods that cast their argument to a Regexp from a
99 ### String. This allows the scanner's scanning methods to be called with
100 ### Strings in addition to Regexps. This was mostly stolen from
101 ### forwardable.rb.
def self.def_casting_delegators( *methods )
  # For every requested name, generate an instance method that coerces a
  # non-Regexp argument to a Regexp (via String#to_re) and then forwards the
  # call to the wrapped @scanner.
  methods.each do |name|
    source = <<-EOF
      def #{name}( pattern )
        pattern = pattern.to_s.to_re unless pattern.is_a?( Regexp )
        @scanner.#{name}( pattern )
      end
    EOF
    class_eval( source, "(--def_casting_delegators--)", 1 )
  end
end
112
113
114 ### Create a new DelimScanner object for the specified <tt>string</tt>. If
115 ### <tt>dup</tt> is <tt>true</tt>, a duplicate of the target string will be
116 ### used instead of the one given. The target string will be frozen after
117 ### the scanner is created.
def initialize( string, dup=true )
  # Current StringScanner implementations document their second positional
  # argument as obsolete and unused, so make the copy here to keep the
  # documented contract: with dup true the scanner works on its own copy of
  # the target string, with dup false it scans the caller's object.
  target = dup ? string.dup : string

  @scanner = StringScanner::new( target )
  @matchError = nil
  @debugLevel = 0
end
123
124
125
126 ######
127 public
128 ######
129
130 # Here, some delegation trickery is done to make a DelimScanner behave like
131 # a StringScanner. Some methods are directly delegated, while some are
132 # delegated via a method which casts its argument to a Regexp first so some
133 # scanner methods can be called with Strings as well as Regexps.
134
135 # A list of delegated methods that need casting.
NeedCastingDelegators = :scan, :skip, :match?, :check,
  :scan_until, :skip_until, :exist?, :check_until

# Delegate the methods StringScanner itself defines to the associated
# scanner object, except those that need a casting delegator, which uses an
# indirect delegation method.
#
# instance_methods(false) is used so that methods inherited from Object
# (class, send, object_id, ...) are not forwarded to the scanner, and the
# names are normalized to Symbols because instance_methods returns Strings
# on old interpreters and Symbols on current ones.
def_delegators :@scanner,
  *( StringScanner.instance_methods(false).collect {|meth| meth.to_sym} - NeedCastingDelegators )

def_casting_delegators( *NeedCastingDelegators )
146
147
148
149 # The last match error encountered by the scanner
150 attr_accessor :matchError
151 protected :matchError= ; # ; is to work around a ruby-mode indent bug
152
153 # Debugging level
154 attr_accessor :debugLevel
155
156
157
158 ### Returns <tt>true</tt> if the scanner has encountered a match error.
def matchError?
  # True exactly when a match error has been recorded.
  @matchError.nil? ? false : true
end
162
163
164 ### Starting at the scan pointer, try to match a substring delimited by the
165 ### specified <tt>delimiters</tt>, skipping the specified <tt>prefix</tt>
166 ### and any character escaped by the specified <tt>escape</tt>
167 ### character/s. If matched, advances the scan pointer and returns a Hash
168 ### with the following key/value pairs on success:
169 ###
170 ### [<tt>:match</tt>]
171 ### The text of the match, including delimiters.
172 ### [<tt>:prefix</tt>]
173 ### The matched prefix, if any.
174 ###
175 ### If the match fails, returns nil.
def scanDelimited( delimiters="'\"`", prefix='\\s*', escape='\\' )
  # An explicit nil for any argument selects that argument's default.
  delimiters ||= "'\"`"
  prefix ||= '\\s*'
  escape ||= '\\'

  debugMsg( 1, "Scanning for delimited text: delim = (%s), prefix=(%s), escape=(%s)",
    delimiters, prefix, escape )
  self.matchError = nil

  # Try to match the prefix first to get the length
  unless (( prefixLength = self.match?(prefix.to_re) ))
    self.matchError = "Failed to match prefix '%s' at offset %d" %
      [ prefix, self.pointer ]
    return nil
  end

  # Now build a delimited pattern with the specified parameters.
  delimPattern = makeDelimPattern( delimiters, escape, prefix )
  debugMsg( 2, "Delimiter pattern is %s" % delimPattern.inspect )

  # Fail if no match
  unless (( matchedString = self.scan(delimPattern) ))
    self.matchError = "No delimited string found."
    return nil
  end

  # Slice with (start, length) rather than a 0..n-1 range: when the prefix
  # is empty the range would be 0..-1, i.e. the entire matched string.
  return {
    :match  => matchedString[ prefixLength .. -1 ],
    :prefix => matchedString[ 0, prefixLength ],
  }
end
207
208
209 ### Match using the #scanDelimited method, but only return the match or nil.
def extractDelimited( *args )
  # Delegate to scanDelimited and keep only the matched text.
  found = scanDelimited( *args )
  return found ? found[:match] : nil
end
214
215
216 ### Starting at the scan pointer, try to match a substring delimited by the
217 ### specified <tt>delimiters</tt>, skipping the specified <tt>prefix</tt>
218 ### and any character escaped by the specified <tt>escape</tt>
219 ### character/s. If matched, advances the scan pointer and returns the
220 ### length of the matched string; if it fails the match, returns nil.
def skipDelimited( delimiters="'\"`", prefix='\\s*', escape='\\' )
  # Substitute the defaults for any argument passed as nil.
  delimiters = delimiters || "'\"`"
  prefix = prefix || '\\s*'
  escape = escape || '\\'

  # Clear any previous error, then skip over the delimited text and hand
  # back whatever the underlying skip returns.
  self.matchError = nil
  pattern = makeDelimPattern( delimiters, escape, prefix )
  return self.skip( pattern )
end
229
230
231 ### Starting at the scan pointer, try to match a substring delimited by
232 ### balanced <tt>delimiters</tt> of the type specified, after skipping the
233 ### specified <tt>prefix</tt>. On a successful match, this method advances
234 ### the scan pointer and returns a Hash with the following key/value pairs:
235 ###
236 ### [<tt>:match</tt>]
237 ### The text of the match, including the delimiting brackets.
238 ### [<tt>:prefix</tt>]
239 ### The matched prefix, if any.
240 ###
241 ### On failure, returns nil.
def scanBracketed( delimiters="{([<", prefix='\s*' )
  delimiters = "{([<" unless delimiters
  prefix = '\s*' unless prefix
  prefix = prefix.to_re unless prefix.kind_of?( Regexp )

  debugMsg( 1, "Scanning for bracketed text: delimiters = (%s), prefix = (%s)",
    delimiters, prefix )

  self.matchError = nil

  # Pull the quote characters out of the delimiter spec, and note whether
  # quotelike constructs (flagged by a 'q') should be honored.
  opening = delimiters.dup
  quotes = opening.squeeze.split( // ).select {|char| char =~ /["'`]/ }.join( '|' )
  quotes = nil if quotes.empty?
  quotelike = true if opening =~ /q/

  # Normalize every bracket to its left-hand form and drop everything that
  # is not a bracket.
  opening = opening.tr( '[](){}<>', '[[(({{<<' ).
    gsub( /[^#{Regexp.quote('[\\](){}<>')}]+/, '' ).squeeze

  # Derive the matching right-hand brackets; tr! returns nil when nothing
  # was translated, which means no bracket was present.
  closing = opening.dup
  if closing.tr!( '[({<', '])}>' ).nil?
    raise DelimiterError, "Did not find a suitable bracket in delimiter: '#{delimiters}'"
  end

  # Turn both bracket lists into alternation pattern sources
  opening = opening.split( // ).collect {|ch| Regexp.quote(ch) }.join( '|' )
  closing = closing.split( // ).collect {|ch| Regexp.quote(ch) }.join( '|' )

  depth = self.scanDepth
  origin = self.pointer

  begin
    return matchBracketed( prefix, opening, quotes, quotelike, closing )
  rescue MatchFailure => err
    # A failed match is reported through matchError and a nil return, with
    # the scan pointer put back where it started.
    debugMsg( depth + 1, "Match error: %s" % err.message )
    self.matchError = err.message
    self.pointer = origin
    return nil
  rescue
    # Anything else is re-raised after restoring the scan pointer.
    self.pointer = origin
    Kernel::raise
  end
end
291
292
293 ### Match using the #scanBracketed method, but only return the match or nil.
def extractBracketed( *args )
  # Delegate to scanBracketed and keep only the matched text.
  found = scanBracketed( *args )
  return found ? found[:match] : nil
end
298
299
300 ### Starting at the scan pointer, try to match a substring with
301 ### #scanBracketed. On a successful match, this method advances the scan
302 ### pointer and returns the length of the match, including the delimiters
303 ### and any prefix that was skipped. On failure, returns nil.
def skipBracketed( *args )
  # scanBracketed advances the scan pointer on success and restores it on
  # failure, so the pointer must not be touched again here.
  match = scanBracketed( *args ) or return nil

  # The result is a Hash; the skipped length is the prefix plus the
  # bracketed text.
  return match[:prefix].length + match[:match].length
end
315
316
317 ### Extracts and segments text from the scan pointer forward that occurs
318 ### between (balanced) specified tags, after skipping the specified
319 ### <tt>prefix</tt>. If the opentag argument is <tt>nil</tt>, a pattern which
320 ### will match any standard HTML/XML tag will be used. If the
321 ### <tt>closetag</tt> argument is <tt>nil</tt>, a pattern is created which
322 ### prepends a <tt>/</tt> character to the matched opening tag, after any
323 ### bracketing characters. The <tt>options</tt> argument is a Hash of one or
324 ### more options which govern the matching operation. They are described in
325 ### more detail in the Description section of 'lib/DelimScanner.rb'. On a
326 ### successful match, this method advances the scan pointer and returns a Hash:
327 ###
328 ### [<tt>:match</tt>]
329 ### The text of the match, including the delimiting tags.
330 ### [<tt>:prefix</tt>]
331 ### The matched prefix, if any.
332 ###
333 ### On failure, returns nil.
def scanTagged( opentag=nil, closetag=nil, prefix='\s*', options={} )
  prefix = '\s*' unless prefix

  # Without an explicit opening tag, fall back to a pattern for any
  # standard HTML/XML tag. A nil closing tag is resolved by matchTagged.
  ldel = opentag || %Q,<\\w+(?:#{ makeDelimPattern(%q:'":) }|[^>])*>,
  rdel = closetag
  unless options.kind_of?( Hash )
    raise ArgumentError, "Options argument must be a hash"
  end

  failmode = options[:fail]

  # :reject and :ignore may each be given as an Array of alternatives (joined
  # with '|'), a single value, or omitted (empty string).
  rejected = options[:reject]
  bad = rejected.is_a?( Array ) ? rejected.join( "|" ) : ( rejected || '' )
  ignored = options[:ignore]
  ignore = ignored.is_a?( Array ) ? ignored.join( "|" ) : ( ignored || '' )

  self.matchError = nil
  origin = self.pointer
  depth = self.scanDepth

  begin
    return matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
  rescue MatchFailure => err
    # A failed match is reported through matchError and a nil return, with
    # the scan pointer put back where it started.
    debugMsg( depth + 1, "Match error: %s" % err.message )
    self.matchError = err.message
    self.pointer = origin
    return nil
  rescue
    # Anything else is re-raised after restoring the scan pointer.
    self.pointer = origin
    Kernel::raise
  end
end
373
374
375 ### Match using the #scanTagged method, but only return the match or nil.
def extractTagged( *args )
  # Delegate to scanTagged and keep only the matched text.
  found = scanTagged( *args )
  return found ? found[:match] : nil
end
380
381
382 ### Starting at the scan pointer, try to match a substring with
383 ### #scanTagged. On a successful match, this method advances the scan
384 ### pointer and returns the length of the match, including any delimiters
385 ### and any prefix that was skipped. On failure, returns nil.
def skipTagged( *args )
  # scanTagged advances the scan pointer on success and restores it on
  # failure, so the pointer must not be touched again here.
  match = scanTagged( *args ) or return nil

  # The result is a Hash; the skipped length is the prefix plus the tagged
  # text.
  return match[:prefix].length + match[:match].length
end
397
398
399 # :NOTE:
400 # Since the extract_quotelike function isn't documented at all in
401 # Text::Balanced, I'm only guessing this is correct...
402
403 ### Starting from the scan pointer, try to match any one of the various Ruby
404 ### quotes and quotelike operators after skipping the specified
405 ### <tt>prefix</tt>. Nested backslashed delimiters, embedded balanced
406 ### bracket delimiters (for the quotelike operators), and trailing modifiers
407 ### are all caught. If <tt>matchRawRegex</tt> is <tt>true</tt>, inline
408 ### regexen (eg., <tt>/pattern/</tt>) are matched as well. Advances the scan
409 ### pointer and returns a Hash with the following key/value pairs on
410 ### success:
411 ###
412 ### [<tt>:match</tt>]
413 ### The entire text of the match.
414 ### [<tt>:prefix</tt>]
415 ### The matched prefix, if any.
416 ### [<tt>:quoteOp</tt>]
417 ### The name of the quotelike operator (if any) (eg., '%Q', '%r', etc).
418 ### [<tt>:leftDelim</tt>]
419 ### The left delimiter of the first block of the operation.
420 ### [<tt>:delimText</tt>]
421 ### The text of the first block of the operation.
422 ### [<tt>:rightDelim</tt>]
423 ### The right delimiter of the first block of the operation.
424 ### [<tt>:modifiers</tt>]
425 ### The trailing modifiers on the operation (if any).
426 ###
427 ### On failure, returns nil.
def scanQuotelike( prefix='\s*', matchRawRegex=true )
  self.matchError = nil
  origin = self.pointer
  depth = self.scanDepth

  begin
    return matchQuotelike( prefix, matchRawRegex )
  rescue MatchFailure => err
    # A failed match is reported through matchError and a nil return, with
    # the scan pointer put back where it started.
    debugMsg( depth + 1, "Match error: %s" % err.message )
    self.matchError = err.message
    self.pointer = origin
    return nil
  rescue
    # Anything else is re-raised after restoring the scan pointer.
    self.pointer = origin
    Kernel::raise
  end
end
450
451
452 ### Match using the #scanQuotelike method, but only return the match or nil.
def extractQuotelike( *args )
  # Delegate to scanQuotelike and keep only the matched text.
  found = scanQuotelike( *args )
  return found ? found[:match] : nil
end
457
458
459 ### Starting at the scan pointer, try to match a substring with
460 ### #scanQuotelike. On a successful match, this method advances the scan
461 ### pointer and returns the length of the match, including any delimiters
462 ### and any prefix that was skipped. On failure, returns nil.
def skipQuotelike( *args )
  # scanQuotelike advances the scan pointer on success and restores it on
  # failure, so the pointer must not be touched again here.
  match = scanQuotelike( *args ) or return nil

  # The result is a Hash; the skipped length is the prefix plus the
  # quotelike text.
  return match[:prefix].length + match[:match].length
end
474
475
476 ### Starting from the scan pointer, try to match a Ruby variable after
477 ### skipping the specified prefix.
def scanVariable( prefix='\s*' )
  self.matchError = nil
  origin = self.pointer
  depth = self.scanDepth

  begin
    return matchVariable( prefix )
  rescue MatchFailure => err
    # A failed match is reported through matchError and a nil return, with
    # the scan pointer put back where it started.
    debugMsg( depth + 1, "Match error: %s" % err.message )
    self.matchError = err.message
    self.pointer = origin
    return nil
  rescue
    # Anything else is re-raised after restoring the scan pointer.
    self.pointer = origin
    Kernel::raise
  end
end
499
500
501 ### Match using the #scanVariable method, but only return the match or nil.
def extractVariable( *args )
  # Delegate to scanVariable and keep only the matched text.
  found = scanVariable( *args )
  return found ? found[:match] : nil
end
506
507
508 ### Starting at the scan pointer, try to match a substring with
509 ### #scanVariable. On a successful match, this method advances the scan
510 ### pointer and returns the length of the match, including any delimiters
511 ### and any prefix that was skipped. On failure, returns nil.
def skipVariable( *args )
  # scanVariable advances the scan pointer on success and restores it on
  # failure, so the pointer must not be touched again here.
  match = scanVariable( *args ) or return nil

  # The result is a Hash; the skipped length is the prefix plus the
  # variable text.
  return match[:prefix].length + match[:match].length
end
523
524
525 ### Starting from the scan pointer, and skipping the specified
526 ### <tt>prefix</tt>, try to recognize and match a balanced bracket-,
527 ### do/end-, or begin/end-delimited substring that may contain unbalanced
528 ### delimiters inside quotes or quotelike operations.
def scanCodeblock( innerDelim=CodeblockDelimiters, prefix='\s*', outerDelim=innerDelim )
  self.matchError = nil
  origin = self.pointer

  # Substitute the defaults for any argument passed as nil.
  prefix = '\s*' unless prefix
  innerDelim = CodeblockDelimiters unless innerDelim
  outerDelim = innerDelim unless outerDelim

  # Nesting depth = number of scan* frames already on the call stack; only
  # used to pick the debugging level of the failure message.
  depth = caller( 1 ).grep(
    /in `scan(Variable|Tagged|Codeblock|Bracketed|Quotelike)'/
  ).length

  begin
    debugMsg 3, "Calling matchCodeBlock( %s, %s, %s )",
      prefix.inspect, innerDelim.inspect, outerDelim.inspect
    return matchCodeblock( prefix, innerDelim, outerDelim )
  rescue MatchFailure => err
    # A failed match is reported through matchError and a nil return, with
    # the scan pointer put back where it started.
    debugMsg( depth + 1, "Match error: %s" % err.message )
    self.matchError = err.message
    self.pointer = origin
    return nil
  rescue
    # Anything else is re-raised after restoring the scan pointer.
    self.pointer = origin
    Kernel::raise
  end
end
558
559
560 ### Match using the #scanCodeblock method, but only return the match or nil.
def extractCodeblock( *args )
  # Delegate to scanCodeblock and keep only the matched text.
  found = scanCodeblock( *args )
  return found ? found[:match] : nil
end
565
566
567 ### Starting at the scan pointer, try to match a substring with
568 ### #scanCodeblock. On a successful match, this method advances the scan
569 ### pointer and returns the length of the match, including any delimiters
570 ### and any prefix that was skipped. On failure, returns nil.
def skipCodeblock( *args )
  # scanCodeblock advances the scan pointer on success and restores it on
  # failure, so the pointer must not be touched again here.
  match = scanCodeblock( *args ) or return nil

  # The result is a Hash; the skipped length is the prefix plus the
  # codeblock text.
  return match[:prefix].length + match[:match].length
end
582
583
584
585
586 #########
587 protected
588 #########
589
590 ### Scan the string from the scan pointer forward, skipping the specified
591 ### <tt>prefix</tt> and trying to match a string delimited by bracketing
592 ### delimiters <tt>ldel</tt> and <tt>rdel</tt> (Regexp objects), and quoting
593 ### delimiters <tt>qdel</tt> (Regexp). If <tt>quotelike</tt> is
594 ### <tt>true</tt>, Ruby quotelike constructs will also be honored.
def matchBracketed( prefix, ldel, qdel, quotelike, rdel )
  origin = self.pointer
  debugMsg( 2, "matchBracketed starting at pos = %d: prefix = %s, "\
    "ldel = %s, qdel = %s, quotelike = %s, rdel = %s",
    origin, prefix.inspect, ldel.inspect, qdel.inspect, quotelike.inspect,
    rdel.inspect )

  # The prefix is mandatory (though it may match the empty string)
  unless self.skip( prefix )
    raise MatchFailure, "Did not find prefix: #{prefix.inspect}"
  end

  # Remember where the bracketed text itself begins
  openPos = self.pointer
  debugMsg( 3, "Found prefix. Left delim pointer at %d", openPos )

  # There has to be an opening bracket right after the prefix
  bracket = self.scan( ldel )
  unless bracket
    raise MatchFailure, "Did not find opening bracket after prefix: '%s' (%d)" %
      [ self.string[origin..openPos].chomp, openPos ]
  end

  # Stack of the opening brackets that are still waiting to be closed
  stack = [ bracket ]
  debugMsg( 3, "Found opening bracket. Nesting = %s", stack.inspect )

  while self.rest?

    debugMsg( 5, "Starting scan loop. Nesting = %s", stack.inspect )

    # Backslash-escaped characters never count as delimiters
    if self.skip( /\\./ )
      debugMsg( 4, "Skipping backslashed literal at offset %d: '%s'",
        self.pointer - 2, self.string[ self.pointer - 2, 2 ].chomp )
      next
    end

    # Another opening bracket: one level deeper
    if self.scan( ldel )
      bracket = self.matched
      debugMsg( 4, "Found opening delim %s at offset %d",
        bracket.inspect, self.pointer - 1 )
      stack.push( bracket )
      next
    end

    # A closing bracket: it has to pair up with the top of the stack
    if self.scan( rdel )
      bracket = self.matched

      debugMsg( 4, "Found closing delim %s at offset %d",
        bracket.inspect, self.pointer - 1 )

      # :TODO: When is this code reached?
      if stack.empty?
        raise MatchFailure, "Unmatched closing bracket '%s' at offset %d" %
          [ bracket, self.pointer - 1 ]
      end

      # Work out the complement of the most recently opened bracket
      expected = stack.pop.tr( '({[<', ')}]>' )
      debugMsg( 4, "Got a '%s' bracket off nesting stack", expected )

      unless expected == bracket
        raise MatchFailure, "Mismatched closing bracket at offset %d: "\
          "Expected '%s', but found '%s' instead." %
          [ self.pointer - 1, expected, bracket ]
      end

      # Once the outermost bracket has been closed the match is complete
      if stack.empty?
        debugMsg( 4, "Finished with scan: nesting stack empty." )
        break
      end
      next
    end

    # A quoted chunk: brackets inside it are ignored, so step over the
    # whole thing up to the closing quote
    if qdel && self.scan( qdel )
      quote = self.matched

      if self.scan( /[^\\#{quote}]*(?:\\.[^\\#{quote}]*)*(#{Regexp::quote(quote)})/ )
        debugMsg( 4, "Skipping quoted chunk. Scan pointer now at offset %d", self.pointer )
        next
      end

      raise MatchFailure, "Unmatched embedded quote (%s) at offset %d" %
        [ quote, self.pointer - 1 ]
    end

    # An embedded quotelike construct
    if quotelike && self.scanQuotelike
      debugMsg( 4, "Matched a quotelike. Scan pointer now at offset %d", self.pointer )
      next
    end

    # Anything else: consume a run of alphanumerics, or one other character
    self.skip( /(?:[a-zA-Z0-9]+|.)/m )
    debugMsg 5, "Skipping '%s' at offset %d." %
      [ self.matched, self.pointer ]

  end

  # Brackets left on the stack were never closed
  unless stack.empty?
    raise MatchFailure, "Unmatched opening bracket(s): %s.. at offset %d" %
      [ stack.join('..'), self.pointer ]
  end

  result = {
    :match  => self.string[ openPos .. (self.pointer - 1) ],
    :prefix => self.string[ origin, (openPos - origin) ],
  }
  debugMsg 1, "matchBracketed succeeded: %s" % result.inspect
  return result
end
709
710
711 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
712 ### try to match text bracketed by the given left and right tag-delimiters
713 ### (<tt>ldel</tt> and <tt>rdel</tt>).
def matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
  # NOTE: the local variable names in this method are deliberately left
  # alone: the method's binding is handed to String#interpolate below, so
  # a caller-supplied closing-tag spec can see them.
  failmode = failmode.to_s.intern if failmode
  startPos = self.pointer
  debugMsg( 2, "matchTagged starting at pos = %d: prefix = %s, "\
    "ldel = %s, rdel = %s, failmode = %s, bad = %s, ignore = %s",
    startPos, prefix.inspect, ldel.inspect, rdel.inspect,
    failmode.inspect, bad.inspect, ignore.inspect )

  rdelspec = ''
  openTagPos, textPos, paraPos, closeTagPos, endPos = ([nil] * 5)
  match = nil

  # The prefix is mandatory (though it may match the empty string)
  unless self.skip( prefix )
    raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/"
  end

  openTagPos = self.pointer
  debugMsg( 3, "Found prefix. Pointer now at offset %d" % self.pointer )

  # The opening tag has to come right after the prefix
  unless (( match = self.scan(ldel) ))
    raise MatchFailure, "Did not find opening tag %s at offset %d" %
      [ ldel.inspect, self.pointer ]
  end

  textPos = self.pointer
  debugMsg( 3, "Found left delimiter '%s': offset now %d" % [ match, textPos ] )

  if rdel.nil?
    # No closing tag given: derive one from the opening tag just matched
    rdelspec = makeClosingTag( match )
    debugMsg( 3, "Generated right-delimiting tag: %s" % rdelspec.inspect )
  else
    # Rewrite $1..$9 references into lookups on the scanner's last match,
    # then interpolate the spec in the context of this method
    rdelspec = rdel.gsub( /(\A|[^\\])\$([1-9])/, '\1self[\2]' ).interpolate( binding )
    debugMsg( 3, "Right delimiter (after interpolation) is: %s" % rdelspec.inspect )
  end

  # Walk forward until the text runs out or the closing tag turns up
  while self.rest? && closeTagPos.nil?

    # Backslash-escaped characters are stepped over
    if (( self.skip( /^\\./ ) ))
      debugMsg( 4, "Skipping backslashed literal at offset %d" % self.pointer )
      next

    # A paragraph break; only the first one is remembered (for :para mode)
    elsif (( matchlength = self.skip( /^(\n[ \t]*\n)/ ) ))
      paraPos ||= self.pointer - matchlength
      debugMsg( 4, "Found paragraph position at offset %d" % paraPos )

    # The closing tag
    elsif (( matchlength = self.skip( rdelspec ) ))
      closeTagPos = self.pointer - matchlength
      debugMsg( 3, "Found closing tag at offset %d" % closeTagPos )

    # Text the caller asked to have ignored
    elsif ignore && !ignore.empty? && self.skip(ignore)
      debugMsg( 3, "Skipping ignored text '%s' at offset %d" %
        [ self.matched, self.pointer - self.matched_size ] )
      next

    # Text the caller asked to have rejected: stop scanning in :para or
    # :max mode, fail otherwise
    elsif bad && !bad.empty? && self.match?( bad )
      if failmode == :para || failmode == :max
        break
      else
        raise MatchFailure, "Found invalid nested tag '%s' at offset %d" %
          [ match, self.pointer ]
      end

    # A nested opening tag: back up and recurse to move past the whole
    # nested element
    elsif (( match = self.scan( ldel ) ))
      tag = match
      self.unscan

      unless self.matchTagged( prefix, ldel, rdel, failmode, bad, ignore )
        break if failmode == :para || failmode == :max

        raise MatchFailure, "Found unbalanced nested tag '%s' at offset %d" %
          [ tag, self.pointer ]
      end

    # Nothing special here: advance by one
    else
      self.pointer += 1
      debugMsg( 5, "Advanced scan pointer to offset %d" % self.pointer )
    end
  end

  # Running out of text without a closing tag is a "short" match, which is
  # acceptable in :max and :para modes and an error otherwise.
  unless closeTagPos
    debugMsg( 3, "No close tag position found. " )

    if failmode == :max || failmode == :para
      closeTagPos = self.pointer - 1
      debugMsg( 4, "Failmode %s tolerates no closing tag. Close tag position set to %d" %
        [ failmode.inspect, closeTagPos ] )

      # In :para mode, pull the scan pointer back to the paragraph break
      # if one was seen.
      if failmode == :para && paraPos
        self.pointer = paraPos + 1
      end
    else
      raise MatchFailure, "No closing tag found."
    end
  end

  rval = {
    :match  => self.string[ openTagPos .. (self.pointer - 1) ],
    :prefix => self.string[ startPos, (openTagPos-startPos) ],
  }
  debugMsg( 1, "matchTagged succeeded: %s" % rval.inspect )
  return rval
end
832
833
834 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
835 ### try to match text inside a Ruby quotelike construct. If
836 ### <tt>matchRawRegex</tt> is <tt>true</tt>, the regex construct
837 ### <tt>/pattern/</tt> is also matched.
def matchQuotelike( prefix, matchRawRegex )
  startPos = self.pointer
  debugMsg 2, "matchQuotelike starting at pos = %d: prefix = %s, "\
    "matchRawRegex = %s",
    startPos, prefix.inspect, matchRawRegex.inspect

  # Init position markers
  rval = oppos = ldpos = strpos = rdpos = modpos = nil

  # Look for the prefix
  raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
    self.skip( prefix )
  oppos = self.pointer

  # Peek at the next character
  # If the initial quote is a simple quote, our job is easy
  if self.check(/^["`']/) || ( matchRawRegex && self.check(%r:/:) )
    initial = self.matched
    delim = Regexp.quote( initial )

    # Build the pattern for matching the simple string: the delimiter,
    # any run of characters that are neither a backslash nor the
    # delimiter, interleaved with backslash-escaped characters, and the
    # closing delimiter. It is built as a Regexp literal so the
    # backslashes reach the regex engine intact and an escaped delimiter
    # does not end the string.
    pattern = /#{delim}[^\\#{delim}]*(?:\\.[^\\#{delim}]*)*#{delim}/m
    debugMsg 2, "Matching simple quote at offset %d with /%s/" %
      [ self.pointer, pattern.source ]

    # Search for it, raising an exception if it's not found
    unless self.scan( pattern )
      raise MatchFailure,
        "Did not find closing delimiter to match '%s' at '%s...' (offset %d)" %
        [ initial, self.string[ oppos, 20 ].chomp, self.pointer ]
    end

    modpos = self.pointer
    rdpos = modpos - 1

    # If we're matching a regex, look for any trailing modifiers
    if initial == '/'
      pattern = if RUBY_VERSION >= "1.7.3" then /[imoxs]*/ else /[imox]*/ end
      self.scan( pattern )
    end

    rval = {
      :prefix     => self.string[ startPos, (oppos-startPos) ],
      :match      => self.string[ oppos .. (self.pointer - 1) ],
      :leftDelim  => self.string[ oppos, 1 ],
      :delimText  => self.string[ (oppos+1) .. (rdpos-1) ],
      :rightDelim => self.string[ rdpos, 1 ],
      :modifiers  => self.string[ modpos, (self.pointer-modpos) ],
    }


  # If it's one of the fancy quotelike operators, our job is somewhat
  # complicated (though nothing like Perl's, thank the Goddess)
  elsif self.scan( %r:%[rwqQx]?(?=\S): )
    op = self.matched
    debugMsg 2, "Matching a real quotelike ('%s') at offset %d" %
      [ op, self.pointer ]
    modifiers = nil

    ldpos = self.pointer
    strpos = ldpos + 1

    # Peek ahead to see what the delimiter is
    ldel = self.check( /\S/ )

    # If it's a bracketing character, just use matchBracketed. The '[' is
    # escaped because an unescaped one opens a nested character class.
    if ldel =~ /[\[(<{]/
      rdel = ldel.tr( '[({<', '])}>' )
      debugMsg 4, "Left delim is a bracket: %s; looking for compliment: %s" %
        [ ldel, rdel ]
      self.matchBracketed( '', Regexp::quote(ldel), nil, nil, Regexp::quote(rdel) )
    else
      debugMsg 4, "Left delim isn't a bracket: '#{ldel}'; looking for closing instance"

      # Quote the delimiter so that one which happens to be a regex
      # metacharacter (|, ., +, ...) is matched literally.
      delim = Regexp.quote( ldel )
      self.scan( /#{delim}[^\\#{delim}]*(\\.[^\\#{delim}]*)*#{delim}/ ) or
        raise MatchFailure,
          "Can't find a closing delimiter '%s' at '%s...' (offset %d)" %
          [ ldel, self.rest[0,20].chomp, self.pointer ]
    end
    rdelpos = self.pointer - 1

    # Match modifiers for Regexp quote
    if op == '%r'
      pattern = if RUBY_VERSION >= "1.7.3" then /[imoxs]*/ else /[imox]*/ end
      modifiers = self.scan( pattern ) || ''
    end

    rval = {
      :prefix     => self.string[ startPos, (oppos-startPos) ],
      :match      => self.string[ oppos .. (self.pointer - 1) ],
      :quoteOp    => op,
      :leftDelim  => self.string[ ldpos, 1 ],
      :delimText  => self.string[ strpos, (rdelpos-strpos) ],
      :rightDelim => self.string[ rdelpos, 1 ],
      :modifiers  => modifiers,
    }

  # If it's a here-doc, things get even hairier.
  elsif self.scan( %r:<<(-)?: )
    debugMsg 2, "Matching a here-document at offset %d" % self.pointer
    op = self.matched

    # If there was a dash, start with optional whitespace
    indent = self[1] ? '\s*' : ''
    ldpos = self.pointer
    label = ''

    # Plain identifier
    if self.scan( /[A-Za-z_]\w*/ )
      label = self.matched
      debugMsg 3, "Setting heredoc terminator to bare identifier '%s'" % label

    # Quoted string. (The 'm' flag makes '.' match a newline; the 's' flag
    # used previously selects an encoding in Ruby, not dot-all mode.)
    elsif self.scan( / ' ([^'\\]* (?:\\.[^'\\]*)*) ' /mx ) ||
        self.scan( / " ([^"\\]* (?:\\.[^"\\]*)*) " /mx ) ||
        self.scan( / ` ([^`\\]* (?:\\.[^`\\]*)*) ` /mx )
      label = self[1]
      debugMsg 3, "Setting heredoc terminator to quoted identifier '%s'" % label

    # Ruby, unlike Perl, requires a terminal, even if it's only an empty
    # string
    else
      raise MatchFailure,
        "Missing heredoc terminator before end of line at "\
        "'%s...' (offset %d)" %
        [ self.rest[0,20].chomp, self.pointer ]
    end
    extrapos = self.pointer

    # Advance to the beginning of the string
    self.skip( /.*\n/ )
    strpos = self.pointer
    debugMsg 3, "Scanning until /\\n#{indent}#{label}\\n/m"

    # Match to the label. The label is literal text, so it is quoted;
    # the indent is already a pattern fragment and is used as-is.
    unless self.scan_until( /\n#{indent}#{Regexp.quote(label)}\n/m )
      raise MatchFailure,
        "Couldn't find heredoc terminator '%s' after '%s...' (offset %d)" %
        [ label, self.rest[0,20].chomp, self.pointer ]
    end

    rdpos = self.pointer - self.matched_size

    rval = {
      :prefix     => self.string[ startPos, (oppos-startPos) ],
      :match      => self.string[ oppos .. (self.pointer - 1) ],
      :quoteOp    => op,
      :leftDelim  => self.string[ ldpos, (extrapos-ldpos) ],
      :delimText  => self.string[ strpos, (rdpos-strpos) ],
      :rightDelim => self.string[ rdpos, (self.pointer-rdpos) ],
    }

  else
    raise MatchFailure,
      "No quotelike operator found after prefix at '%s...'" %
      self.rest[0,20].chomp
  end


  debugMsg 1, "matchQuotelike succeeded: %s" % rval.inspect
  return rval
end
1001
1002
1003 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
1004 ### try to match text that is a valid Ruby variable or identifier, ...?
### Returns a Hash with two keys: <tt>:prefix</tt> (the text skipped by
### +prefix+) and <tt>:match</tt> (everything from the end of the prefix up to
### the final scan pointer, i.e. the variable plus any trailing method chain,
### block, or element reference). Raises MatchFailure if the prefix cannot be
### skipped or if no predefined global or identifier follows it.
def matchVariable( prefix )
  startPos = self.pointer
  debugMsg 2, "matchVariable starting at pos = %d: prefix = %s",
    startPos, prefix.inspect

  # Look for the prefix
  raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
    self.skip( prefix )

  varPos = self.pointer

  # If the variable matched is a predefined global, no need to look for an
  # identifier
  unless self.scan( %r~\$(?:[!@/\\,;.<>$?:_\~&`'+]|-\w|\d+)~ )

    debugMsg 2, "Not a predefined global at '%s...' (offset %d)" %
      [ self.rest[0,20].chomp, self.pointer ]

    # Look for a valid identifier: optional splat, optional sigil or
    # leading '::', any number of 'Namespace::' qualifiers, then the name.
    unless self.scan( /\*?(?:[$@]|::)?(?:[a-z_]\w*(?:::\s*))*[_a-z]\w*/is )
      raise MatchFailure, "No variable found: Bad identifier (offset %d)" % self.pointer
    end
  end

  debugMsg 2, "Matched '%s' at offset %d" % [ self.matched, self.pointer ]

  # Match methodchain with trailing codeblock. Each alternative that
  # succeeds restarts the loop; the loop ends at the first position where
  # none of them applies, or when the input is exhausted.
  while self.rest?
    # Match a regular chained method
    next if scanCodeblock( {"("=>")", "do"=>"end", "begin"=>"end", "{"=>"}"},
                           /\s*(?:\.|::)\s*[a-zA-Z_]\w+\s*/ )

    # Match a trailing block or an element ref
    next if scanCodeblock( nil, /\s*/, {'{' => '}', '[' => ']'} )

    # This matched a dereferencer in Perl, which doesn't have any
    # equivalent in Ruby.
    #next if scanVariable( '\s*(\.|::)\s*' )

    # Match a method call without parens (?)
    # The '[' inside the character class must be escaped: an unescaped
    # '[' opens a nested character class in Ruby 1.9+ regexps, which makes
    # this pattern fail to compile.
    next if self.scan( '\s*(\.|::)\s*\w+(?![{(\[])' )

    break
  end

  rval = {
    :match  => self.string[ varPos .. (self.pointer - 1) ],
    :prefix => self.string[ startPos, (varPos-startPos) ],
  }
  debugMsg 1, "matchVariable succeeded: %s" % rval.inspect
  return rval
end
1057
1058
1059 ### Starting from the scan pointer, skip the specified <tt>prefix</tt>, and
1060 ### try to match text inside a Ruby code block construct which must be
1061 ### delimited by the specified <tt>outerDelimPairs</tt>. It may optionally
1062 ### contain sub-blocks delimited with the given <tt>innerDelimPairs</tt>.
### +innerDelimPairs+ and +outerDelimPairs+ are Hashes mapping an opening
### delimiter String to its closing delimiter String. Returns a Hash with
### the keys <tt>:prefix</tt> (the text skipped by +prefix+) and
### <tt>:match</tt> (the text from the opening outer delimiter through the
### matching closing delimiter). Raises MatchFailure if the prefix or the
### opening delimiter is missing, if a mismatched or stray delimiter is
### seen, or if the input ends before the closing delimiter is found; raises
### DelimiterError if the opening delimiter has no closing counterpart in
### +outerDelimPairs+.
def matchCodeblock( prefix, innerDelimPairs, outerDelimPairs )
  startPos = self.pointer
  debugMsg 2, "Starting matchCodeblock at offset %d (%s)", startPos, self.rest.inspect

  # Look for the prefix
  raise MatchFailure, "Did not find prefix: /#{prefix.inspect}/" unless
    self.skip( prefix )
  codePos = self.pointer
  debugMsg 3, "Skipped prefix '%s' to offset %d" %
    [ self.matched, codePos ]

  # Build a regexp for the outer delimiters
  ldelimOuter = "(" + outerDelimPairs.keys.uniq.collect {|delim| Regexp::quote(delim)}.join('|') + ")"
  rdelimOuter = "(" + outerDelimPairs.values.uniq.collect {|delim| Regexp::quote(delim)}.join('|') + ")"
  debugMsg 4, "Using /%s/ as the outer delim regex" % ldelimOuter

  unless self.scan( ldelimOuter )
    raise MatchFailure, %q:Did not find opening bracket at "%s..." offset %d: %
      [ self.rest[0,20].chomp, codePos ]
  end

  # Look up the corresponding outer delimiter
  closingDelim = outerDelimPairs[self.matched] or
    raise DelimiterError, "Could not find closing delimiter for '%s'" %
      self.matched

  debugMsg 3, "Scanning for closing delim '#{closingDelim}'"
  matched = ''

  # Tracks whether the previous token leaves us where a pattern (regexp
  # literal) could legally start; passed through to scanQuotelike.
  patvalid = true

  # Scan until the end of the text or until an explicit break
  while self.rest?
    debugMsg 5, "Scanning from offset %d (%s)", self.pointer, self.rest.inspect
    matched = ''

    # Skip comments
    debugMsg 5, "Trying to match a comment"
    if self.scan( /\s*#.*/ )
      debugMsg 4, "Skipping comment '%s' to offset %d" %
        [ self.matched, self.pointer ]
      next
    end

    # Look for (any) closing delimiter
    debugMsg 5, "Trying to match a closing outer delimiter with /\\s*(#{rdelimOuter})/"
    if self.scan( /\s*(#{rdelimOuter})/ )
      debugMsg 4, "Found a right delimiter '#{self.matched}'"

      # If it's the delimiter we're looking for, stop the scan
      if self.matched.strip == closingDelim
        matched = self.matched
        debugMsg 3, "Found the closing delimiter we've been looking for (#{matched.inspect})."
        break

      # Otherwise, it's an error, as we've apparently seen a closing
      # delimiter without a corresponding opening one.
      else
        raise MatchFailure,
          %q:Mismatched closing bracket at "%s..." (offset %s). Expected '%s': %
          [ self.rest[0,20], self.pointer, closingDelim ]
      end
    end

    # Try to match a variable or a quoted phrase
    debugMsg 5, "Trying to match either a variable or quotelike"
    if self.scanVariable( '\s*' ) || self.scanQuotelike( '\s*', patvalid )
      debugMsg 3, "Matched either a variable or quotelike. Offset now %d" % self.pointer
      patvalid = false
      next
    end

    # Match some operators
    # :TODO: This hasn't really been ruby-ified
    debugMsg 5, "Trying to match an operator"
    if self.scan( %r:\s*([-+*x/%^&|.]=?
                        | [!=]~
                        | =(?!>)
                        | (\*\*|&&|\|\||<<|>>)=?
                        | split|grep|map|return
                        ):x )
      debugMsg 3, "Skipped miscellaneous operator '%s' to offset %d." %
        [ self.matched, self.pointer ]
      patvalid = true
      next
    end

    # Try to match an embedded codeblock
    debugMsg 5, "Trying to match an embedded codeblock with delim pairs: %s",
      innerDelimPairs.inspect
    if self.scanCodeblock( innerDelimPairs )
      debugMsg 3, "Skipped inner codeblock to offset %d." % self.pointer
      patvalid = true
      next
    end

    # Try to match a stray outer-left delimiter
    debugMsg 5, "Trying to match a stray outer-left delimiter (#{ldelimOuter})"
    if self.match?( ldelimOuter )
      raise MatchFailure, "Improperly nested codeblock at offset %d: %s... " %
        [ self.pointer, self.rest[0,20] ]
    end

    # Nothing else applied: consume one word, arrow, or single character
    # and keep going.
    patvalid = false
    self.scan( /\s*(\w+|[-=>]>|.|\Z)/m )
    debugMsg 3, "Skipped '%s' to offset %d" %
      [ self.matched, self.pointer ]
  end

  # +matched+ is only non-empty when the loop ended on the closing
  # delimiter. An empty String is truthy in Ruby, so it has to be tested
  # with #empty? to detect input that ran out before the block was closed.
  if matched.empty?
    raise MatchFailure, "No match found for opening bracket"
  end

  rval = {
    :match  => self.string[codePos .. (self.pointer - 1)],
    :prefix => self.string[startPos, (codePos-startPos)]
  }
  debugMsg 1, "matchCodeblock succeeded: %s" % rval.inspect
  return rval
end
1183
1184
1185 ### Attempt to derive and return the number of scan methods traversed up to
1186 ### this point by examining the call stack.
### The count skips this method's own frame and that of its immediate
### caller (<tt>caller(2)</tt>). A frame is counted when its label names one
### of the scan* methods, in either backtrace format: the older
### "in `scanVariable'" or the Ruby 3.4+ "in 'DelimScanner#scanVariable'".
### Block frames ("block in ...") are not counted in either format.
def scanDepth
  return caller(2).find_all {|frame|
    frame =~ /in [`'](?:[\w:]+[#.])?scan(Variable|Tagged|Codeblock|Bracketed|Quotelike)'/
  }.length
end
1192
1193
1194 #######
1195 private
1196 #######
1197
1198 ### Print the specified <tt>message</tt> to STDERR if the scanner's
1199 ### debugging level is greater than or equal to <tt>level</tt>.
### Level 0 messages are never printed. When extra +args+ are given,
### +msgFormat+ is treated as a format string for them; otherwise it is
### printed verbatim. Output is indented two spaces per level above 1.
def debugMsg( level, msgFormat, *args )
  return unless level.nonzero?
  return unless self.debugLevel >= level

  text =
    if args.empty?
      msgFormat
    else
      format( msgFormat, *args )
    end
  indent = (" " * (level-1)) * 2
  $stderr.puts( indent + text )
end
1205
1206
1207 ### Given a series of one or more bracket characters (eg., '<', '[', '{',
1208 ### etc.), return the brackets reversed in order and direction.
### The argument is converted with #to_s first; characters that are not
### opening brackets are kept as they are (only their order is reversed).
def revbracket( bracket )
  backwards = bracket.to_s.reverse
  mirrored  = backwards.tr( '<[{(', '>]})' )
  return mirrored
end
1212
1213
1214 ### Given an opening <tt>tag</tt> of the sort matched by #scanTagged,
1215 ### construct and return a closing tag.
### The opening tag must start with one or more bracket characters followed
### by a name matching XmlName; the result is the Regexp-quoted text of the
### same brackets, a '/', the name, and the reversed brackets (e.g. '<b ...>'
### yields '</b>'). Raises MatchFailure if +tag+ does not have that shape.
### The +tag+ argument itself is not modified.
def makeClosingTag( tag )
  debugMsg 3, "Making a closing tag for '%s'" % tag

  # String#gsub always returns a String, so it cannot signal "no match".
  # Use gsub! on a copy instead: it returns nil when nothing was replaced.
  # The '[' in the character class is escaped because an unescaped '[' opens
  # a nested character class in Ruby 1.9+ regexps.
  closingTag = tag.dup
  substituted = closingTag.gsub!( /^([\[(<{]+)(#{XmlName}).*/ ) {
    Regexp.quote( "#{$1}/#{$2}" + revbracket($1) )
  }

  raise MatchFailure, "Unable to construct closing tag to match: #{tag}" unless substituted
  return closingTag
end
1226
1227
1228 ### Make and return a new Regexp which matches substrings bounded by the
1229 ### specified +delimiters+, not counting those which have been escaped with
1230 ### the escape characters in +escapes+.
### Each character of +delimiters+ is paired with the character at the same
### index in +escapes+. If +escapes+ is shorter, its last character is
### repeated; if it is empty, a backslash is used; extra escape characters
### are ignored. When a delimiter is its own escape character, a doubled
### delimiter inside the text counts as escaped. +prefix+ is a regexp source
### fragment placed in front of the alternatives. The +escapes+ argument is
### not modified. Raises DelimiterError if +delimiters+ contains no
### non-whitespace character.
def makeDelimPattern( delimiters, escapes='\\', prefix='\\s*' )
  delimiters = delimiters.to_s
  escapes = escapes.to_s
  escapes = '\\' if escapes.empty?

  raise DelimiterError, "Illegal delimiter '#{delimiters}'" unless delimiters =~ /\S/

  # Pad the escapes string to the same length as the delimiters. Build a
  # new String rather than appending in place, so the caller's argument is
  # left alone, and never ask for a negative repeat count.
  padding = [ delimiters.length - escapes.length, 0 ].max
  escapes = escapes + (escapes[-1,1] * padding)

  # Escape each delimiter and a corresponding escape character, and then
  # build a pattern part from them
  patParts = (0...delimiters.length).collect do |i|
    del = Regexp.escape( delimiters[i, 1] )
    esc = Regexp.escape( escapes[i, 1] )

    if del == esc
      # Delimiter escapes itself by being doubled
      "#{del}(?:[^#{del}]*(?:(?:#{del}#{del})[^#{del}]*)*)#{del}"
    else
      # Escape character followed by any single character
      "#{del}(?:[^#{esc}#{del}]*(?:#{esc}.[^#{esc}#{del}]*)*)#{del}"
    end
  end

  # Join all the parts together and return one big pattern
  return Regexp::new( "#{prefix}(?:#{patParts.join("|")})" )
end
1257
1258end # class StringExtractor
1259
This page took 3.718434 seconds and 4 git commands to generate.