UTF7.sa


Generated by gen_html_sa_files from ICSI. Contact gomes@icsi.berkeley.edu for details
 
------------------------->  GNU Sather - sourcefile  <-------------------------
-- Copyright (C) 2000 by K Hopper, University of Waikato, New Zealand        --
-- This file is part of the GNU Sather library. It is free software; you may --
-- redistribute  and/or modify it under the terms of the GNU Library General --
-- Public  License (LGPL)  as published  by the  Free  Software  Foundation; --
-- either version 2 of the license, or (at your option) any later version.   --
-- This  library  is distributed  in the  hope that it will  be  useful, but --
-- WITHOUT ANY WARRANTY without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See Doc/LGPL for more details.       --
-- The license text is also available from:  Free Software Foundation, Inc., --
-- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA                     --
-------------->  Please email comments to <bug-sather@gnu.org>  <--------------


class UTF7 < $IS_EQ, $IMMUTABLE

class UTF7 < $IS_EQ, $IMMUTABLE is -- This class provides for conversion of encoding sequences (ie text -- strings in some encoding) to a sequence of one or more octets in which -- a clear octet is only found when the original encoding was void. -- The algorithm provided is designed to achieve the translation given -- in the following table in which v signifies a value bit of the original -- code. -- Bits Hex Min Hex Max Octet Sequence in Binary -- -- 7 00000000 0000007f 0vvvvvvv -- 11 00000080 000007FF 110vvvvv 10vvvvvv ** needed for 8-bit!! -- 16 00000800 0000FFFF 1110vvvv 10vvvvvv 10vvvvvv -- 21 00010000 001FFFFF 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv -- 26 00200000 03FFFFFF 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv -- 31 04000000 7FFFFFFF 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv -- -- The original code value is just the concatenation of the v bits in -- the multiple octet encoding. When there are multiple ways to encode a -- value, for example UCS 0, only the shortest encoding is legal. -- Version 1.3 Apr 2001. Copyright K Hopper, U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 11 Dec 96 kh Original -- 30 Oct 98 kh Added pre/post conditions -- 26 Mar 99 kh Revised for restructured text library -- 27 Apr 01 kh Changed hex_str to str, eight_bit_str to -- tgt_str for uniformity. include AREF{OCTET} ; include COMPARABLE ; include BINARY ; private const code_mask : OCTET := OCTET::create(0xA0) ; private const code_value : OCTET := OCTET::create(0x80) ; private const Six_Bits : CARD := 6 ; private const Six_Bits_Val : CARD := 0x40 ; private const Six_Bit_Mask : OCTET := OCTET::create(0x3F) ; private const Valid_Bit_Limit : CARD := 5 ; private const Seven_Bit_Max : CARD := 0x7F ; private const Eleven_Bit_Max : CARD := 0x7FF ; private const Sixteen_Bit_Max : CARD := 0xFFFF ; private const Twenty_One_Bit_Max : CARD := 0x1FFFFF ; private const Twenty_Six_Bit_Max : CARD := 0x3FFFFFF ; private const Thirty_One_Bit_Max : CARD := 0x7FFFFFFF ; private const lengths : ARRAY{OCTET} := | OCTET::create(0), OCTET::create(0xC0), OCTET::create(0xE0), OCTET::create(0xF0), OCTET::create(0xF8), OCTET::create(0xFC) | ; private count( oct : OCTET ) : CARD pre true post (result = 0) or ((result > 1) and (result <= 6)) is -- This routine returns the number of following code octets in a single -- encoding representation or CARD::nil if invalid. res : CARD := oct.highest(clearbit) ; case res when 0, CARD::nil then -- Oops! invalid! return CARD::nil when 7 then return 0 else return OCTET::Octet_Bits - res - 1 end end ; private code_size( num : CARD ) : CARD pre (num < Thirty_One_Bit_Max) and (num > 0) post (result > 1) and (result <= 6) is -- This private routine returns the number of octets needed to encode -- the value contained in num. if (num > Thirty_One_Bit_Max) then SYS_ERROR::create.error(SYS::str_for_tp(SYS::tp(self)), SYS_EXCEPT::Bad_Value,num.str) ; return void -- to keep compiler happy elsif num <= Seven_Bit_Max then return 1 elsif num <= Eleven_Bit_Max then return 2 elsif num <= Sixteen_Bit_Max then return 3 elsif num <= Twenty_One_Bit_Max then return 4 elsif num <= Twenty_Six_Bit_Max then return 5 else return 6 end end ; create( num : CARD ) : SAME is -- This routine creates a valid UTF7 encoding corresponding to the given -- numeric value treated as a bit pattern. length : CARD := code_size(num) ; me : SAME := new(length) ; length := length - 1 ; if length = 0 then me[0] := OCTET::create(num) else val : CARD := num ; loop index : CARD := length.downto!(1) ; code : OCTET := OCTET::create(val % Six_Bits_Val) ; val := val / Six_Bits_Val ; me[index] := code.bit_or(code_value) end ; me[0] := lengths[length].bit_or(OCTET::create(val)) end ; return me end ; private build_list( octets : BINSTR, chunk_size : CARD ) : SAME pre ~void(octets) and (chunk_size > 0) post ~void(result) is -- This private routine returns the UTF7 value of the binary encoding -- which is processed in chunk_size 'bites'. res : FLIST{OCTET} := FLIST{OCTET}::create ; loop numval : CARD := CARD::create(octets.chunk!(chunk_size)) ; bits : QUADBITS := QUADBITS::create(numval) ; if numval <= Seven_Bit_Max then res := res.push(OCTET::create(numval)) else loc_list : FLIST{OCTET} := FLIST{OCTET}::create ; octet_cnt : CARD ; -- of six-bit octets! if numval <= Eleven_Bit_Max then octet_cnt := 1 elsif numval <= Sixteen_Bit_Max then octet_cnt := 2 elsif numval <= Twenty_One_Bit_Max then octet_cnt := 3 elsif numval <= Twenty_Six_Bit_Max then octet_cnt := 4 else octet_cnt := 5 end ; loop octet_cnt.times! ; loc_val : OCTET := bits.octet.bit_and(Six_Bit_Mask) ; loc_list := loc_list.push(loc_val.bit_or(code_value)) ; bits := bits.right(Six_Bits) end ; loc_list := loc_list.push(lengths[octet_cnt].bit_or( OCTET::create(bits.card))) ; loop (octet_cnt + 1).times! ; res := res.push(loc_list.pop) end end end ; me : SAME := new(res.size) ; loop me.aset!(res.elt!) end ; return me end ; create( rn : RUNE ) : SAME is -- This routine creates a new encoding from the given rune, provided -- that there is a valid translation, otherwise void is returned! return build_list(rn.binstr,rn.lib.my_size) end ; create( rns : RUNES ) : SAME is -- This routine converts the given binary string which is expected to be -- a character encoding sequence for conversion into a valid UTF7 code. if void(rns) then return void else return build_list(rns.binstr,rns.index_lib.my_size) end end ; build( octets : BIN_CURSOR, lib : LIBCHARS ) : SAME pre ~void(octets) and octets.remaining % lib.my_size = 0 and ~void(lib) post ~void(result) is -- This routine converts the indicated binary string into a UTF7 coding -- sequence, using the given repertoire and encoding. return build_list(octets.get_remainder,lib.my_size) end ; build( octets : BIN_CURSOR ) : SAME pre ~void(octets) and octets.remaining % LIBCHARS::default.my_size = 0 post ~void(result) is -- This routine converts the indicated binary string into a UTF7 coding -- sequence, using the default repertoire and encoding. return build(octets,LIBCHARS::default) end ; private length( first_octet : OCTET ) : CARD is -- This private routine returns the length in octets of the code which -- begins with the given octet. If this is not a valid leading octet then -- CARD::nil is returned. res : CARD := first_octet.highest(clearbit) ; if res /= CARD::nil then res := res + 1 end ; return res end ; is_eq( other : SAME ) : BOOL is -- This predicate returns true if and only if all components of this -- coding are equal to all those of other. loop if aelt! /= other.aelt! then return false end end ; return true end ; is_singleton : BOOL is -- This predicate returns true if and only if self is the transformed -- coding of a single character/rune. return asize > length([0]) end ; card : CARD pre ~void(self) and (asize = length([0])) post true is -- This routine returns the encoded bits of self as a numeric value -- -- providing that self is merely a single code. loc_lgth : CARD := length([0]) - 1 ; res : CARD := (lengths[loc_lgth].bit_invert.bit_and([0])).card ; loop index : CARD := 1.upto!(loc_lgth) ; res := res * Six_Bits_Val + [index].bit_and(Six_Bit_Mask).card end ; return res end ; tgt_str : STR pre ~void(self) post result.asize = asize is -- This routine is the one which turns a UTF7 string into a locally -- defined default character string using eight, sixteen or thirty-two bit -- code spaces as defined in the environment. res : STR := STR::create(asize,LIBCHARS::default) ; loop index : CARD := 0.upto!(asize - 1) ; loc_val : CARD := aelt!.card ; res[index] := CHAR::create(CHAR_CODE::create(loc_val, LIBCHARS::default)) end ; return res end ; binstr : BINSTR pre ~void(self) post (result.size <= asize) is -- This routine returns the value of self as a decoded binary string, -- whose individual codes occupy the given number of octets -- for conversion -- as desired into characters, etc. res : BINSTR := BINSTR::create ; octet_cnt : CARD ; to_do : CARD ; loc_val : OCTET ; bits_so_far : CARD ; started : BOOL := false ; loop oct : OCTET := aelt! ; if ~started then started := true ; octet_cnt := count(oct) ; to_do := octet_cnt - 1 ; -- one has already been 'read' if octet_cnt = 0 then loc_val := oct ; bits_so_far := OCTET::Octet_Bits else loc_val := oct.bit_xor(lengths[octet_cnt]) ; bits_so_far := Valid_Bit_Limit + 1 - octet_cnt end else to_do := to_do - 1 ; if (bits_so_far % OCTET::Octet_Bits) = 0 then res := res + loc_val ; loc_val := OCTET::null end ; loc_val := loc_val.left(Six_Bits).bit_or( oct.bit_and(Six_Bit_Mask)) ; if to_do = 0 then res := res + loc_val ; started := false end end end ; return res end ; runes( lib : LIBCHARS ) : RUNES pre true post (void(self) and void(result)) or ~void(result) is -- This routine returns a string consisting of the individual codes of -- self treated as a character string in the given repertoire and encoding. res : RUNES := RUNES::create(lib) ; loop res := res + rune!(lib) end ; return res end ; runes : RUNES pre ~void(self) post ~void(result) is -- This routine returns a string consisting of the individual codes of -- self treated as a character string in the default repertoire and encoding. return runes(LIBCHARS::default) end ; rune!( once lib : LIBCHARS ) : RUNE pre ~void(lib) and ~void(self) post ~void(result) is -- This iter yields successive octets converted to character form in -- the given repertoire and encoding. cursor : BIN_CURSOR := binstr.cursor ; loop yield RUNE::build(cursor.get_upto(lib.my_size).cursor,lib) ; if cursor.is_done then quit end end end ; rune! : RUNE pre true post ~void(result) is -- This iter yields successive octets converted to character form in -- the current repertoire and encoding. loop yield rune!(LIBCHARS::default) end end ; code! : SAME pre true post ~void(result) is -- This iter yields successive individual UTF codes in self if void(self) then quit end ; started : BOOL := false ; res : SAME ; loc_cnt : CARD ; index : CARD ; loop oct : OCTET := aelt! ; if ~started then loc_cnt := count(oct) ; res := new(loc_cnt + 1) ; index := 0 ; started := true end ; res[index] := oct ; if index = loc_cnt then yield res ; started := false else index := index - 1 end end end ; str( lib : LIBCHARS ) : STR pre ~void(lib) and ~void(self) post true is -- This routine produces a character string from the octets of which -- the value is encoded as space separated hexadecimal numbers in the -- given repertoire and encoding. res : STR := STR::create ; loop res := res + lib.Space.char.str(lib).separate!(aelt!.hex_str(lib)) end ; return res end ; str : STR pre ~void(self) post true is -- This routine produces a character string from the octets of which -- the value is encoded as space separated hexadecimal numbers in the -- default repertoire and encoding. return str(LIBCHARS::default) end ; end ; -- UTF7