regexp.sa
Generated by gen_html_sa_files from ICSI. Contact gomes@icsi.berkeley.edu for details
-------------------------> GNU Sather - sourcefile <-------------------------
-- Copyright (C) 2000 by K Hopper, University of Waikato, New Zealand --
-- This file is part of the GNU Sather library. It is free software; you may --
-- redistribute and/or modify it under the terms of the GNU Library General --
-- Public License (LGPL) as published by the Free Software Foundation; --
-- either version 2 of the license, or (at your option) any later version. --
-- This library is distributed in the hope that it will be useful, but --
-- WITHOUT ANY WARRANTY without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See Doc/LGPL for more details. --
-- The license text is also available from: Free Software Foundation, Inc., --
-- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --
--------------> Please email comments to <bug-sather@gnu.org> <--------------
immutable class ERE_LEX_TOKENS < $ENUMS{ERE_LEX_TOKENS}
immutable class ERE_LEX_TOKENS < $ENUMS{ERE_LEX_TOKENS} is
-- This class provides a simple mapping facility from characters to
-- lexical tokens.
--
-- The associated resource file contains only a single character per
-- line (after the first which contains the count of remaining lines).
-- Version 1.0 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
include ENUM{ERE_LEX_TOKENS} ;
private const val_count : CARD := 18 ;
-- The next routines provide the enumeration itself.
Left_Bracket : SAME is return enum(1) end ;
Right_Bracket : SAME is return enum(2) end ;
Left_Brace : SAME is return enum(3) end ;
Right_Brace : SAME is return enum(4) end ;
Left_Parenthesis : SAME is return enum(5) end ;
Right_Parenthesis : SAME is return enum(6) end ;
Left_Anchor : SAME is return enum(7) end ;
Right_Anchor : SAME is return enum(8) end ;
Minus_Sign : SAME is return enum(9) end ;
Plus_Sign : SAME is return enum(10) end ;
Fullstop : SAME is return enum(11) end ;
Vertical_Line : SAME is return enum(12) end ;
Asterisk : SAME is return enum(13) end ;
Question_Mark : SAME is return enum(14) end ;
Escape : SAME is return enum(15) end ;
Colon : SAME is return enum(16) end ;
Equals_Mark : SAME is return enum(17) end ;
Comma : SAME is return enum(18) end ;
token( ch : CHAR ) : SAME is
-- This routine returns the token value corresponding to the given
-- character -- or void if there is no match.
return create(ch.str);
end ;
end ; -- ERE_LEX_TOKENS
immutable class ERE_ERROR_KINDS < $ENUMS{ERE_ERROR_KINDS}
immutable class ERE_ERROR_KINDS < $ENUMS{ERE_ERROR_KINDS} is
-- This class provides all of the different kinds of error which may
-- may be detected when parsing regular expressions.
-- Version 1.0 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
include ENUM{ERE_ERROR_KINDS} ;
const val_count : CARD := 11 ;
-- The next routines provide the enumeration itself.
Heading : SAME is return enum(1) end ;
Trailing_Chars : SAME is return enum(2) end ;
Unexpected_Char : SAME is return enum(3) end ;
Missing_Brace : SAME is return enum(4) end ;
Missing_Dup_No : SAME is return enum(5) end ;
Missing_Right_Paren : SAME is return enum(6) end ;
Missing_Right_Bracket : SAME is return enum(7) end ;
Escape_Last : SAME is return enum(8) end ;
Empty_Brackets : SAME is return enum(9) end ;
Unexpected_Termination : SAME is return enum(10) end ;
Invalid_Class : SAME is return enum(11) end ;
end ; -- ERE_ERROR_KINDS
class ERE_ERRORS
class ERE_ERRORS is
-- This class provides all of the error reporting (exception raising)
-- needed when creating extended regular expressions. It only has the single
-- feature - error.
-- Version 1.0 May 97. Copyright K Hopper, U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
error( index : ERE_ERROR_KINDS, val : $FMT )
pre ~index.is_nil and (index /= ERE_ERROR_KINDS::Heading)
post false -- Well! an exception is actually raised!!
is
-- This routine raises an exception when an error is found during
-- parsing the extended regular expression.
raise FMT::create(ERE_ERROR_KINDS::Heading.str + index.str,val).str
end ;
end ; -- ERE_ERRORS
class ONE_CHAR_ERE
class ONE_CHAR_ERE is
-- This class embodies the concept of a single 'character' Extended
-- Regular Expression which may indeed correspond to a single character,
-- or be a bracketed expression from which one 'character' must be 'equal'
-- to the current source stream character for a match to succeed.
-- NOTE The implementation provided in this class is a strictly conforming
-- implementation as specified in ISO/IEC 9945-2 and does NOT, therefore,
-- support range expressions!
-- Version 1.0 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
private const Any_Char : CHAR := CHAR::create(
CONTROL_CODES::PADDING_CHARACTER,LIBCHARS::default) ;
private attr char : CHAR ;
private attr bracketed : BOOL ; -- ie NOT a single char.
private attr matching : BOOL ; -- ie NOT inverse matching
-- The following components are non-void only if bracketed is true!
-- The bracketed expression may be a list of either characters, including
-- char_names as defined in ISO/IEC 14652, or character classes (see class
-- ERE_CLASSES).
private attr literal : FLIST{CHAR} ;
private attr chclass : CHAR_CLASS ;
private bracketed_expression(
str_index : STR_CURSOR
) : SAME is
-- This private routine handles the contents of a bracketed regular
-- expression during creation of a one char ERE.
str_index.advance ; -- over left anchor/bracket!
res : SAME := self ; -- usually
recursive : BOOL := false ;
case ERE_LEX_TOKENS::token(str_index.item)
when ERE_LEX_TOKENS::Left_Bracket then -- this could be a class!
recursive := true ;
res := bracketed_expression(str_index) ;
when ERE_LEX_TOKENS::Right_Bracket then
ERE_ERRORS::error(ERE_ERROR_KINDS::Empty_Brackets,str_index.index)
when ERE_LEX_TOKENS::Left_Anchor then -- here it signifies non-matching
matching := false ;
-- NOTE The caret is left in the stream where it will
-- be skipped by the following recursive call.
recursive := true ;
res := bracketed_expression(str_index) ;
when ERE_LEX_TOKENS::Colon then
str_index.advance ;
loc_ch : CHAR := ERE_LEX_TOKENS::Colon.str[0] ;
name : STR := str_index.get_upto_char(loc_ch) ;
str_index.advance ; -- over colon
recursive := false ;
chclass := CHAR_CLASS::create(name) ;
if chclass.is_nil then
ERE_ERRORS::error(ERE_ERROR_KINDS::Invalid_Class,name)
end
when ERE_LEX_TOKENS::Fullstop then
str_index.advance ;
loc_ch : CHAR := ERE_LEX_TOKENS::Fullstop.str[0] ;
name : STR := str_index.get_upto_char(loc_ch) ;
str_index.advance ; -- over the second full stop!
recursive := false ;
char := CHAR::build(name.binstr.cursor,CULTURE::default.sather_lib)
when ERE_LEX_TOKENS::Equals_Mark then
else -- ordinary bracketed chars
literal := FLIST{CHAR}::create ;
loop -- over chars in brackets
if ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Escape then
str_index.advance ;
if str_index.is_done then
str_index.retract ;
ERE_ERRORS::error(
ERE_ERROR_KINDS::Escape_Last,str_index.index)
end
end ;
literal := literal.push(str_index.get_item) ;
if str_index.is_done then
ERE_ERRORS::error(ERE_ERROR_KINDS::Missing_Right_Bracket,
str_index.index)
else
if ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Bracket then
str_index.advance ;
break!
end
end
end ;
return res
end ;
if str_index.is_done then
ERE_ERRORS::error(
ERE_ERROR_KINDS::Missing_Right_Bracket,str_index.index) ;
return res -- never reached -- error raised!!
else
if ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Bracket then
str_index.advance
end ;
return res
end
end ;
create(
str_index : STR_CURSOR
) : SAME is
-- This routine creates a single matching expression, either literal or
-- bracketed.
me : SAME := new ;
me.matching := true ; -- assumption!
case ERE_LEX_TOKENS::token(str_index.item)
when ERE_LEX_TOKENS::Escape then
str_index.advance ; -- over the escape char
me.char := str_index.get_item
when ERE_LEX_TOKENS::Fullstop then
str_index.advance ; -- over the full stop
me.char := Any_Char -- the 'wild card' char!
when ERE_LEX_TOKENS::Left_Bracket then -- it's a bracketed expression
me.bracketed := true ;
return me.bracketed_expression(str_index) ;
else -- just one ordinary char
me.char := str_index.get_item ;
end ;
return me
end ;
matches_any : BOOL is
-- This predicate returns true if and only if this object matches any
-- character!
return char = Any_Char
end ;
matches(
str_index : STR_CURSOR
) : BOOL is
-- This predicate returns true if and only if the stream item at the
-- current position matches the expression self.
if str_index.is_done then -- nothing to check!!
return false
end ;
if ~chclass.is_nil then
res : BOOL := chclass.contains(str_index.item) ;
if res then
str_index.advance
end ;
return res
elsif bracketed then
if matching = literal.contains(str_index.item) then
str_index.advance ;
return true
else
return false
end
else -- just a single char match!
if (str_index.item = char)
or (char = Any_Char) then
str_index.advance ;
return true
else
return false
end
end
end ;
end ; -- ONE_CHAR_ERE
class ONE_ERE
class ONE_ERE is
-- This class embodies the concept of a single extended regular
-- expression which may either be a single character expression, a
-- parenthesised expression or a duplicated expression of either.
-- Version 1.0 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
private attr single : ONE_CHAR_ERE ; -- This only if not parenthesised
private attr parenthesised : REG_EXP ; -- parenthesised form
private const -- private enum for dup kinds
None, Any_Num, One_Plus, Zero_One,
Exact_Count, At_Least, Range ;
private attr dup_kind : CARD ; -- as needed.
private attr low,
high : CARD ;
private const -- a private enum for variant
One_Char, Parenthesised, Duplicated ;
private const Number_Base : CARD := 10 ;
private attr variant : CARD ;
private duplicate_expression(
str_index : STR_CURSOR
) is
-- This private routine handles the duploication expression which may
-- be found in the place of the ordinary single character duplication
-- symbols.
str_index.advance ; -- over left brace!
if str_index.is_done then
ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Termination,
str_index.item)
end ;
variant := Duplicated ;
if CHAR_CLASS::Digit.contains(str_index.item) then
low := 0 ;
loop -- needed - comma MAY be ignored
if CHAR_CLASS::Digit.contains(str_index.item) then
low := low * Number_Base +
LIBCHARS::default.card(str_index.get_item)
else
break!
end
end ;
dup_kind := Exact_Count -- so far anyway!
else
ERE_ERRORS::error(ERE_ERROR_KINDS::Missing_Dup_No,str_index.item)
end ;
case ERE_LEX_TOKENS::token(str_index.item)
when ERE_LEX_TOKENS::Comma then -- another number perhaps?
str_index.advance ;
if str_index.is_done then
ERE_ERRORS::error(
ERE_ERROR_KINDS::Unexpected_Termination,str_index.item)
end ;
dup_kind := At_Least ;
if ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Brace then
str_index.advance
else -- might be a second number!
if CHAR_CLASS::Digit.contains(str_index.item) then
high := str_index.card ;
dup_kind := Range
else
ERE_ERRORS::error(ERE_ERROR_KINDS::Missing_Dup_No,
str_index.item)
end ;
if str_index.is_done then
ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Termination,
str_index.item)
elsif ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Brace then
str_index.advance
else
ERE_ERRORS::error(
ERE_ERROR_KINDS::Missing_Brace,str_index.item)
end
end
when ERE_LEX_TOKENS::Right_Brace then -- just a single number in braces
str_index.advance
else
ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Char,str_index.item)
end
end ;
create(
str_index : STR_CURSOR
) : SAME is
-- This routine creates a single extended regular expression from
-- the source stream provided, otherwise, if the stream is invalid, raises
-- a string exception.
me : SAME := new ;
if ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Left_Parenthesis then
str_index.advance ; -- over the parenthesis
me.parenthesised := REG_EXP::make_reg_exp(str_index,true) ;
if ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Parenthesis then
me.variant := Parenthesised ;
str_index.advance
else
ERE_ERRORS::error(
ERE_ERROR_KINDS::Missing_Right_Paren,str_index.index)
end
elsif ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Vertical_Line then
return me
else
me.single := ONE_CHAR_ERE::create(str_index)
end ;
if str_index.is_done then
return me
end ;
case ERE_LEX_TOKENS::token(str_index.item) -- duplication tokens/exprn
when ERE_LEX_TOKENS::Asterisk then
me.dup_kind := Any_Num ;
str_index.advance
when ERE_LEX_TOKENS::Plus_Sign then
me.dup_kind := One_Plus ;
str_index.advance
when ERE_LEX_TOKENS::Question_Mark then
me.dup_kind := Zero_One ;
str_index.advance
when ERE_LEX_TOKENS::Left_Brace then
me.duplicate_expression(str_index)
else -- Nothing to do
end ;
return me
end ;
matches(
str_index : STR_CURSOR
) : BOOL is
-- This predicate returns true if and only if the stream provided matches
-- the expression contained in this object, setting the cursor to the next
-- character beyond the end of the matched string portion. If false is
-- returned then str_index is not changed.
cnt : CARD := 0 ;
matched : BOOL ;
start_index : CARD ; -- for backtrack when paren fails
loop
if void(single) then
start_index := str_index.index ;
matched := parenthesised.matches(str_index) ;
if ~matched then
str_index.set_index(start_index)
end
else
matched := single.matches(str_index)
end ;
if matched then
cnt := cnt + 1
end ;
success : BOOL ;
case dup_kind
when None then
return matched
when Any_Num then
success := true
when One_Plus then
success := (cnt > 0)
when Zero_One then
success := (cnt < 2)
when Exact_Count then
success := (cnt = low)
when At_Least then
success := (cnt >= low)
when Range then
success := ((cnt >= low)
and (cnt <= high))
else
end ;
if str_index.is_done
or ~matched then
return success
end
end
end ;
end ; -- ONE_ERE
class NON_ANCHORED_ERE
class NON_ANCHORED_ERE is
-- This class embodies the concept of a single extended regular
-- expression which is not anchored. It may contain a list of single
-- expressions.
-- Version 1.0 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
private attr expr : FLIST{ONE_ERE} ;
create(
str_index : STR_CURSOR,
paren : BOOL
) : SAME is
-- This routine creates a (list of) non-anchored extended regular
-- expressions from the source stream provided, otherwise, if the stream
-- is invalid, raises a string exception.
--
-- Note that this routine is only entered if the input stream is
-- non-empty.
me : SAME := new ;
me.expr := FLIST{ONE_ERE}::create ;
loop
case ERE_LEX_TOKENS::token(str_index.item)
when ERE_LEX_TOKENS::Vertical_Line then
break!
when ERE_LEX_TOKENS::Right_Anchor then
break!
when ERE_LEX_TOKENS::Right_Parenthesis then
if paren then
break!
end
else
end ;
me.expr := me.expr.push(ONE_ERE::create(str_index)) ;
if str_index.is_done then
break!
end
end ;
return me
end ;
matches(
str_index : STR_CURSOR
) : BOOL is
-- This predicate returns true if and only if the stream provided matches
-- the list of expressions contained in this object, setting the cursor to
-- the character beyond the end of the matched string portion. If false is
-- returned then str_index is not changed,
loop
loc_index : CARD := 0.upto!(expr.size - 1) ;
if ~expr[loc_index].matches(str_index) then
return false
end
end ;
return true
end ;
end ; -- NON_ANCHORED_ERE
class EXT_REG_EXP
class EXT_REG_EXP is
-- This class embodies the concept of a single extended regular
-- expression which may or may not be anchored. The class parameter is
-- expected to be a text string type.
-- Version 1.0 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 30 May 97 kh Original for Extended Regular_Expressions
private attr expr : NON_ANCHORED_ERE ; -- MAY be void!
-- One or both of the following may be true if this expression is
-- anchored!
private attr start : BOOL ;
private attr finish : BOOL ;
create(str_index : STR_CURSOR, paren : BOOL ) : SAME is
-- This routine creates a (list of) non-anchored extended regular
-- expressions from the source stream provided, otherwise, if the stream
-- is invalid, raises a string exception. If paren is true then detection of
-- it in the stream (unescaped) terminates an expression.
me : SAME := new ;
itm::=str_index.item;
tok::=ERE_LEX_TOKENS::token(itm);
tok2::=ERE_LEX_TOKENS::Left_Anchor;
if tok = tok2 then
me.start := true ;
str_index.advance;
end ;
if str_index.is_done then
ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Termination,str_index.item)
elsif ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Anchor then
me.finish := true ; -- empty expression
str_index.advance;
else
-- there IS an expression
me.expr := NON_ANCHORED_ERE::create(str_index,paren) ;
if str_index.is_done then me.finish := false
elsif (ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Anchor) then
me.finish := true ;
str_index.advance ;
if ~(str_index.is_done
or (ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Vertical_Line)
)
then
ERE_ERRORS::error(ERE_ERROR_KINDS::Trailing_Chars, str_index.index)
end
end
end ;
return me
end ;
matches(str_index : STR_CURSOR ) : BOOL is
-- This predicate returns true if and only if the stream provided matches
-- the list of expressions contained in this object, setting the cursor to
-- the character beyond the end of the matched string portion. If false is
-- returned then str_index is not changed,
if start then
if ~(str_index.index = 0) then -- No match
return false
end
end ;
if str_index.is_done then -- empty string -- OK?
return void(expr) -- Yes if expr also void!
else
if void(expr)
or expr.matches(str_index) then
if finish then
return str_index.is_done -- Anchored at end?
else
return true
end
else
return false
end
end
end ;
end ; -- EXT_REG_EXP
class REG_EXP < $BINARY
class REG_EXP < $BINARY is
-- This class provides POSIX-style string pattern matching, used, for
-- example, when matching file names in path searching. It provides a match
-- operation to find out if a test string matches the extended regular
-- expression.
--
-- An extended regular expression consists of one or more of the
-- following kinds of components :--
--
-- a. Collating Elements -- any single character that is not
-- a meta-character
-- b. Duplication count -- a numeric constant
-- c. Meta-characters -- ^ - ]
-- d. Left Anchor -- ^ meaning the beginning of the expression
-- e. Ordinary characters
-- f. Quoted characters -- escaped (using a reverse solidus) ^.*[$\
-- g. Right Anchor -- $ meaning the end of the expression.
-- h. Special Characters -- .\[^$*()|?{+
-- The syntax of extended regular expressions is defined in the POSIX
-- standard ISO/IEC 9945-2:1995 to which reference should be made in
-- understanding the following code.
-- Version 1.3 May 97. Copyright K Hopper,U of Waikato
-- Development History
-- -------------------
-- Date Who By Detail
-- ---- ------ ------
-- 10 Jun 96 kh Original
-- 19 Feb 97 kh Additions for string/char portability.
-- 10 Apr 97 kh Modified for INT to CARD, etc
-- 29 May 97 kh Parameterised for internationalisation,
-- and extended to Extended Regular Expressions.
-- 9 Dec 99 kh Revised matching to use pattern lib.
include BINARY ;
private attr pattern : STR ; -- retained for output to file
private attr expr : FLIST{EXT_REG_EXP} ; -- all alternatives!
make_reg_exp( str_index : STR_CURSOR, recursive_call : BOOL ) : SAME is
-- This routine creates a new regular expression. Its primary purpose
-- is to handle expression alternatives and recursive calling! Note that
-- it is not necessary to provide a value for pattern in this routine.
res : SAME := new ;
res.expr := FLIST{EXT_REG_EXP}::create ;
loop
exp::=EXT_REG_EXP::create(str_index,recursive_call);
res.expr := res.expr.push(exp) ;
if str_index.is_done
or (ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Right_Parenthesis) then
break!
end ;
if (ERE_LEX_TOKENS::token(str_index.item) =
ERE_LEX_TOKENS::Vertical_Line) then
str_index.advance
end
end ;
return res
end ;
create( inout str : STR ) : SAME is
-- This routine creates a new regular expression pattern object unless
-- an error has been detected, when void is returned and the parameter set
-- to contain the error message.
protect
me : SAME := make_reg_exp(str.cursor,false) ;
me.pattern := str ;
return me
when STR then
str := exception.str ;
return void
end -- protect!
end ;
build( index : BIN_CURSOR ) : SAME
pre ~void(index) and ~index.is_done
post true
is
-- This routine creates a new regular expression from the binary string
-- cursor at the current position.
pattern : STR := index.get_sized.str ;
if void(pattern) then return void; end; -- nothing there
return create(inout pattern)
end ;
binstr : BINSTR
pre ~void(self)
post (result.size = (pattern.size + 1))
is
-- This routine returns a binary string representation of self suitable
-- for filing and re-building!
loc_res : BINSTR := pattern.binstr ;
return BINSTR::create + OCTET::create(loc_res.size) + loc_res
end ;
matches( test_cursor : STR_CURSOR ) : BOOL is
-- This predicate returns true iff the regular expression matches
-- the test cursor, starting at the current buffer position.
start_index : CARD ;
loop -- over all alternatives!
if test_cursor.is_done then return false
else start_index := test_cursor.index
end ;
if expr.elt!.matches(test_cursor) then return true
else test_cursor.set_index(start_index) ; -- try another alternative
end
end ;
return false
end ;
matches( test_str : STR ) : BOOL is
-- This predicate converts the test string to be the same encodings
-- as the expression pattern if needed, before creating a string cursor and
-- invoking the above matching routine.
str_index : STR_CURSOR ;
if pattern.index_lib = test_str.index_lib then str_index := test_str.cursor
else str_index := test_str.convert(pattern.index_lib).cursor
end ;
return matches(str_index)
end ;
end ; -- REG_EXP