File : unicode-ces-utf32.ads


-----------------------------------------------------------------------

--                XML/Ada - An XML suite for Ada95                   --

--                                                                   --

--                       Copyright (C) 2001                          --

--                            ACT-Europe                             --

--                       Author: Emmanuel Briot                      --

--                                                                   --

-- This library is free software; you can redistribute it and/or     --

-- modify it under the terms of the GNU General Public               --

-- License as published by the Free Software Foundation; either      --

-- version 2 of the License, or (at your option) any later version.  --

--                                                                   --

-- This library is distributed in the hope that it will be useful,   --

-- but WITHOUT ANY WARRANTY; without even the implied warranty of    --

-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU --

-- General Public License for more details.                          --

--                                                                   --

-- You should have received a copy of the GNU General Public         --

-- License along with this library; if not, write to the             --

-- Free Software Foundation, Inc., 59 Temple Place - Suite 330,      --

-- Boston, MA 02111-1307, USA.                                       --

--                                                                   --

-- As a special exception, if other files instantiate generics from  --

-- this unit, or you link this unit with other files to produce an   --

-- executable, this  unit  does not  by itself cause  the resulting  --

-- executable to be covered by the GNU General Public License. This  --

-- exception does not however invalidate any other reasons why the   --

-- executable file  might be covered by the  GNU Public License.     --

-----------------------------------------------------------------------


--  This package provides support for Utf32-encoding.

--

--  The main advantage of this encoding is that each character is coded on

--  the same number of bytes, i.e. 4 bytes. It is thus very easy and fast to

--  traverse a byte sequence and get each character.

--

--  On the other hand, this also means that strings are much bigger than what

--  they should be (when using standard ASCII characters, for instance, the

--  byte sequence is four times as big as it needs to be).

--  This encoding is also dependent on specific byte-ordering. You should thus

--  always convert your Utf32 strings to little-endian before usage (see

--  To_Unicode_LE below).


with Unchecked_Deallocation;
with Unicode.CCS;

package Unicode.CES.Utf32 is

   -----------
   -- Types --
   -----------
   --  Note: Utf32_String, Utf32_LE_String and Utf32_BE_String are all
   --  subtypes of String, so the compiler treats them as one and the same
   --  type. The distinct names only document which byte order a value is
   --  expected to have; mixing them up is not detected at compile time.

   subtype Utf32_String is String;
   type Utf32_String_Access is access Utf32_String;
   --  A UTF32-encoded string. Byte-order is unspecified

   subtype Utf32_LE_String is Utf32_String;
   type Utf32_LE_String_Access is access Utf32_LE_String;
   --  A Utf32-encoded, little-endian string.

   subtype Utf32_BE_String is Utf32_String;
   --  A Utf32-encoded, big-endian string.
   --  No access type is declared for this subtype.

   Utf32_Char_Width : constant := 4;
   --  Number of bytes used to represent every character in Utf32

   -------------------------------------------
   -- Conversion to and from byte sequences --
   -------------------------------------------

   function Encode (Char : Unicode_Char) return Utf32_LE_String;
   --  Return the byte sequence representing Char in the Utf32 character
   --  encoding form.
   --  The character is encoded in little-endian byte order.
   --  See Encode_BE for the big-endian counterpart.

   function Read (Str : Utf32_LE_String; Index : Positive) return Unicode_Char;
   --  Return the character starting at location Index in Str.
   --  Str is expected to be in little-endian byte order; use Read_BE for a
   --  big-endian string.

   function Encode_BE (Char : Unicode_Char) return Utf32_BE_String;
   --  Return the byte sequence representing Char in the Utf32 character
   --  encoding form.
   --  The character is encoded in big-endian byte order.

   function Read_BE (Str : Utf32_BE_String; Index : Positive)
      return Unicode_Char;
   --  Same as Read, but when Str is in big-endian order

   function Width (Char : Unicode_Char) return Natural;
   --  Return the number of bytes occupied by the Utf32 representation of Char
   --  NOTE(review): Utf32 being a fixed-width encoding, this is presumably
   --  Utf32_Char_Width whatever Char is -- confirm against the package body.

   function Length (Str : Utf32_String) return Natural;
   --  Return the number of characters in Str

   ---------------------------
   -- Byte order conversion --
   ---------------------------

   function To_Unicode_LE
     (Str   : Utf32_String;
      Cs    : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
      Order : Byte_Order := Default_Byte_Order) return Utf32_LE_String;
   --  Convert a string from any byte-order, any character set (Cs) to a
   --  Unicode little-endian byte sequence.
   --  Cs defaults to Unicode.CCS.Unicode_Character_Set.
   --  Order is the order in which bytes are coded in Str, and defaults to
   --  Default_Byte_Order. It is silently overridden if Str has a BOM
   --  (byte-order-marker) at the beginning that specifies an explicit order.
   --  The BOM is removed from the resulting string.
   --  Invalid_Encoding is raised if there is a BOM that indicates an
   --  encoding other than Utf32.

   function To_CS
     (Str   : Utf32_LE_String;
      Cs    : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
      Order : Byte_Order := Default_Byte_Order) return Utf32_String;
   --  Convert a Unicode, little-endian string to a string with any byte-order
   --  (Order) and a new character set (Cs).
   --  Cs defaults to Unicode.CCS.Unicode_Character_Set and Order to
   --  Default_Byte_Order.

   ---------------------
   -- Encoding Scheme --
   ---------------------
   --  The two constants below bundle the subprograms of this package into
   --  Encoding_Scheme records, one per byte order. Encoding_Scheme and
   --  Encode_Function are not declared in this unit; they are inherited from
   --  an ancestor package.
   --  Both schemes share Width and Length; only Read and Encode depend on the
   --  byte order.
   --  NOTE(review): the explicit Encode_Function' qualification on the Encode
   --  component presumably helps resolve the 'Access attribute -- confirm
   --  before removing it.

   Utf32_LE_Encoding : constant Encoding_Scheme :=
     (Read   => Read'Access,
      Width  => Width'Access,
      Encode => Encode_Function' (Encode'Access),
      Length => Length'Access);
   --  Encoding scheme for little-endian Utf32 (Read / Encode)

   Utf32_BE_Encoding : constant Encoding_Scheme :=
     (Read   => Read_BE'Access,
      Width  => Width'Access,
      Encode => Encode_Function' (Encode_BE'Access),
      Length => Length'Access);
   --  Encoding scheme for big-endian Utf32 (Read_BE / Encode_BE)

   ------------------
   -- Deallocation --
   ------------------

   procedure Free is new Unchecked_Deallocation
     (Utf32_String, Utf32_String_Access);
   procedure Free is new Unchecked_Deallocation
     (Utf32_LE_String, Utf32_LE_String_Access);
   --  Free the memory occupied by a utf32-encoded string.
   --  One overloading is provided for each access type declared above
   --  (Utf32_String_Access and Utf32_LE_String_Access).

private
   pragma Inline (Width);
   --  Request that calls to Width be expanded inline
end Unicode.CES.Utf32;