File : unicode-ces-utf32.ads
-----------------------------------------------------------------------
-- XML/Ada - An XML suite for Ada95 --
-- --
-- Copyright (C) 2001 --
-- ACT-Europe --
-- Author: Emmanuel Briot --
-- --
-- This library is free software; you can redistribute it and/or --
-- modify it under the terms of the GNU General Public --
-- License as published by the Free Software Foundation; either --
-- version 2 of the License, or (at your option) any later version. --
-- --
-- This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY; without even the implied warranty of --
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU --
-- General Public License for more details. --
-- --
-- You should have received a copy of the GNU General Public --
-- License along with this library; if not, write to the --
-- Free Software Foundation, Inc., 59 Temple Place - Suite 330, --
-- Boston, MA 02111-1307, USA. --
-- --
-- As a special exception, if other files instantiate generics from --
-- this unit, or you link this unit with other files to produce an --
-- executable, this unit does not by itself cause the resulting --
-- executable to be covered by the GNU General Public License. This --
-- exception does not however invalidate any other reasons why the --
-- executable file might be covered by the GNU Public License. --
-----------------------------------------------------------------------
-- This package provides support for Utf32-encoding.
--
-- The main advantage to this encoding is that each character is coded on
-- the same number of bytes, ie 4 bytes. It is thus very easy and fast to
-- traverse a byte sequence and get each character.
--
-- On the other hand, this also means that strings are much bigger than what
-- they should be (when using standard ASCII character, for instance, the
-- byte sequence is four times as big as it needs to be).
-- This encoding is also dependent on specific byte-ordering. You should thus
-- always convert your Utf32 strings to little-endian before usage (see
-- To_Utf32_LE below).
with Unchecked_Deallocation;
with Unicode.CCS;
package Unicode.CES.Utf32 is
-----------
-- Types --
-----------
subtype Utf32_String is String;
type Utf32_String_Access is access Utf32_String;
-- A UTF32-encoded string. Byte-order is unspecified
subtype Utf32_LE_String is Utf32_String;
type Utf32_LE_String_Access is access Utf32_LE_String;
-- A Utf32-encoded, little-endian string.
subtype Utf32_BE_String is Utf32_String;
-- A Utf32-encoded, big-endian string.
Utf32_Char_Width : constant := 4;
-- Number of bytes used to represent every character in Utf32
-------------------------------------------
-- Conversion to and from byte sequences --
-------------------------------------------
function Encode (Char : Unicode_Char) return Utf32_LE_String;
-- Return the byte sequence representing Char in the Utf32 character
-- encoding form.
-- The character is encoded in little-endian byte order.
function Read (Str : Utf32_LE_String; Index : Positive) return Unicode_Char;
-- Return the character starting at location Index in Str
function Encode_BE (Char : Unicode_Char) return Utf32_BE_String;
-- Return the byte sequence representing Char in the Utf32 character
-- encoding form.
-- The character is encoded in big-endian byte order.
function Read_BE (Str : Utf32_BE_String; Index : Positive)
return Unicode_Char;
-- Same as Read, but when Str is in big-endian order
function Width (Char : Unicode_Char) return Natural;
-- Return the number of bytes occupied by the Utf32 representation of Char
function Length (Str : Utf32_String) return Natural;
-- Return the number of characters in Str
---------------------------
-- Byte order conversion --
---------------------------
function To_Unicode_LE
(Str : Utf32_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf32_LE_String;
-- Convert a string from any byte-order, any character set (CS) to
-- Unicode little-endian byte sequence
-- Order is the order in which bytes are coded in Str. This is silently
-- overriden in case Str as a BOM (byte-order-marker) at the beginning
-- that specifies an explicit order.
-- The BOM is removed from the resulting string
-- Invalid_Encoding is raised if there is a BOM that indicates an
-- encoding other than Utf32.
function To_CS
(Str : Utf32_LE_String;
Cs : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
Order : Byte_Order := Default_Byte_Order) return Utf32_String;
-- Convert a Unicode, little-endian string to a string with any byte-order
-- and a new character set.
---------------------
-- Encoding Scheme --
---------------------
Utf32_LE_Encoding : constant Encoding_Scheme :=
(Read => Read'Access,
Width => Width'Access,
Encode => Encode_Function' (Encode'Access),
Length => Length'Access);
Utf32_BE_Encoding : constant Encoding_Scheme :=
(Read => Read_BE'Access,
Width => Width'Access,
Encode => Encode_Function' (Encode_BE'Access),
Length => Length'Access);
------------------
-- Deallocation --
------------------
procedure Free is new Unchecked_Deallocation
(Utf32_String, Utf32_String_Access);
procedure Free is new Unchecked_Deallocation
(Utf32_LE_String, Utf32_LE_String_Access);
-- Free the memory occupied by a utf32-encoded string
private
pragma Inline (Width);
end Unicode.CES.Utf32;