File : unicode-ces.ads
-----------------------------------------------------------------------
-- XML/Ada - An XML suite for Ada95 --
-- --
-- Copyright (C) 2001 --
-- ACT-Europe --
-- Author: Emmanuel Briot --
-- --
-- This library is free software; you can redistribute it and/or --
-- modify it under the terms of the GNU General Public --
-- License as published by the Free Software Foundation; either --
-- version 2 of the License, or (at your option) any later version. --
-- --
-- This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY; without even the implied warranty of --
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU --
-- General Public License for more details. --
-- --
-- You should have received a copy of the GNU General Public --
-- License along with this library; if not, write to the --
-- Free Software Foundation, Inc., 59 Temple Place - Suite 330, --
-- Boston, MA 02111-1307, USA. --
-- --
-- As a special exception, if other files instantiate generics from --
-- this unit, or you link this unit with other files to produce an --
-- executable, this unit does not by itself cause the resulting --
-- executable to be covered by the GNU General Public License. This --
-- exception does not however invalidate any other reasons why the --
-- executable file might be covered by the GNU Public License. --
-----------------------------------------------------------------------
-- This is the root of the hierarchy that provides different encoding
-- schemes.
-- Each of the child package provides function to convert to and from
-- Utf32-encoded string, which thus acts as the central encoding scheme.
-- In some cases, the string can be preceded by a BOM (byte-order mark),
-- that indicates the encoding used and the byte-order used for the
-- encoding. This BOM is always optional, and can never be mixed up with
-- other characters.
with Unchecked_Deallocation;
package Unicode.CES is
-------------------
-- Byte sequence --
-------------------
subtype Byte_Sequence is String;
type Byte_Sequence_Access is access all Byte_Sequence;
-- A sequence of bytes. The encoding is unknown.
procedure Free is new Unchecked_Deallocation
(Byte_Sequence, Byte_Sequence_Access);
-------------------------
-- Byte order handling --
-------------------------
type Byte_Order is (High_Byte_First, Low_Byte_First);
-- Order of bytes in word machines.
Default_Byte_Order : constant Byte_Order := Low_Byte_First;
------------------------------
-- Byte-order mark handling --
------------------------------
type Bom_Type is
(Utf8_All, -- Utf8-encoding
Utf16_LE, -- Utf16 little-endian encoding
Utf16_BE, -- Utf16 big-endian encoding
Utf32_LE, -- Utf32 little-endian encoding
Utf32_BE, -- Utf32 big-endian encoding
Unknown);
-- the type of encoding used for a string, that can be deduced from the
-- BOM.
subtype Bom_Type_Utf16 is Bom_Type range Utf16_LE .. Utf16_BE;
subtype Bom_Type_Utf32 is Bom_Type range Utf32_LE .. Utf32_BE;
procedure Read_Bom (Str : String; Len : out Natural; BOM : out Bom_Type);
-- Read the optional Byte-Order-Mark at the beginning of the byte
-- sequence Str.
-- Len will contain the number of characters that made up that BOM, and
-- that should be ignored when reading Str.
-----------------------
-- Parsing functions --
-----------------------
-- All the packages in this hierarchy must provide at least two functions
-- to read from a string (whatever its encoding).
--
-- These can be used to completly parse a string.
-- J := Str'First;
-- while J <= Str'Last loop
-- C := Read (Str, J); -- Read the character
-- J := J + Width (C); -- Move to the next
-- end loop;
type Read_Function is access function
(Str : Byte_Sequence; Index : Positive) return Unicode.Unicode_Char;
-- This function returns the character at position Index in the byte
-- sequence Str.
type Width_Function is access
function (Char : Unicode.Unicode_Char) return Natural;
-- This function returns the number of bytes that encode Char in the
-- specific encoding scheme.
type Encode_Function is access
function (Char : Unicode.Unicode_Char) return Byte_Sequence;
-- This function converts Char to the appropriate byte sequence that
-- represents it in the specific encoding
type Length_Function is access
function (Str : Byte_Sequence) return Natural;
type Encoding_Scheme is record
Read : Read_Function;
Width : Width_Function;
Encode : Encode_Function;
Length : Length_Function;
end record;
--------------------
-- Misc functions --
--------------------
function Index_From_Offset
(Str : Byte_Sequence; Offset : Natural; Encoding : Encoding_Scheme)
return Integer;
-- return the index in Str matching starting at Offset.
-- (-1) is returned if Offset is invalid (outside of the range of Str).
----------------
-- Exceptions --
----------------
Invalid_Encoding : exception;
-- Raised whener the byte sequence associated with a given encoding
-- scheme is not valid.
end Unicode.CES;