File : unicode-ces-utf8.adb


-----------------------------------------------------------------------

--                XML/Ada - An XML suite for Ada95                   --

--                                                                   --

--                       Copyright (C) 2001                          --

--                            ACT-Europe                             --

--                       Author: Emmanuel Briot                      --

--                                                                   --

-- This library is free software; you can redistribute it and/or     --

-- modify it under the terms of the GNU General Public               --

-- License as published by the Free Software Foundation; either      --

-- version 2 of the License, or (at your option) any later version.  --

--                                                                   --

-- This library is distributed in the hope that it will be useful,   --

-- but WITHOUT ANY WARRANTY; without even the implied warranty of    --

-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU --

-- General Public License for more details.                          --

--                                                                   --

-- You should have received a copy of the GNU General Public         --

-- License along with this library; if not, write to the             --

-- Free Software Foundation, Inc., 59 Temple Place - Suite 330,      --

-- Boston, MA 02111-1307, USA.                                       --

--                                                                   --

-- As a special exception, if other files instantiate generics from  --

-- this unit, or you link this unit with other files to produce an   --

-- executable, this  unit  does not  by itself cause  the resulting  --

-- executable to be covered by the GNU General Public License. This  --

-- exception does not however invalidate any other reasons why the   --

-- executable file  might be covered by the  GNU Public License.     --

-----------------------------------------------------------------------


with Unicode.CES.Utf32; use Unicode.CES.Utf32;
with Unicode.CCS;       use Unicode.CCS;

package body Unicode.CES.Utf8 is

   function Compute_Mask (Char : Unicode_Char) return Unicode_Char;
   pragma Inline (Compute_Mask);
   --  Return the mask to be used for the encoding of the first byte in the

   --  sequence representing Char.


   function Internal_Convert
     (Str     : Utf8_String;
      Convert : Unicode.CCS.Conversion_Function := Identity'Access;
      Order   : Byte_Order := Default_Byte_Order) return Utf8_String;
   --  Internal function used to convert character sets


   ------------

   -- Encode --

   ------------


   function Encode (Char : Unicode_Char) return String is
      Len    : constant Natural := Width (Char);
      Mask   : constant Unicode_Char := Compute_Mask (Char);
      Val    : Unicode_Char := Char;
      Output : String (1 .. Len);
   begin
      for J in reverse 2 .. Len loop
         Output (J) := Character'Val ((Val and 16#3f#) or 16#80#);
         Val := Val / (2 ** 6);
      end loop;
      Output (1) := Character'Val (Val or Mask);
      return Output;
   end Encode;

   ----------

   -- Read --

   ----------


   function Read (Str : Utf8_String; Index : Positive) return Unicode_Char is
      Mask : Unicode_Char;
      Len  : Natural;
      Val  : Unicode_Char;
      C    : Unicode_Char := Character'Pos (Str (Index));
   begin
      --  Compute the length of the encoding given what was in the first byte

      if C < 128 then
         Len := 1;
         Mask := 16#7f#;
      elsif (C and 16#E0#) = 16#C0# then
         Len := 2;
         Mask := 16#1f#;
      elsif (C and 16#F0#) = 16#E0# then
         Len := 3;
         Mask := 16#0f#;
      elsif (C and 16#F8#) = 16#F0# then
         Len := 4;
         Mask := 16#07#;
      elsif (C and 16#FC#) = 16#F8# then
         Len := 5;
         Mask := 16#03#;
      elsif (C and 16#FE#) = 16#FC# then
         Len := 6;
         Mask := 16#01#;
      else
         raise Invalid_Encoding;
      end if;

      Val := C and Mask;
      for Count in 1 .. Len - 1 loop
         C := Character'Pos (Str (Index + Count));
         if (C and 16#C0#) /= 16#80# then
            raise Invalid_Encoding;
         end if;
         Val := Val * (2 ** 6);
         Val := Val or (C and 16#3f#);
      end loop;
      return Val;
   end Read;

   ------------

   -- Length --

   ------------


   function Length (Str : Utf8_String) return Natural is
      Pos : Natural := Str'First;
      Length : Natural := 0;
   begin
      while Pos <= Str'Last loop
         Pos := Pos + Width (Read (Str, Pos));
         Length := Length + 1;
      end loop;
      return Length;
   end Length;

   ------------------

   -- Compute_Mask --

   ------------------


   function Compute_Mask (Char : Unicode_Char) return Unicode_Char is
   begin
      if Char < 16#80# then
         return 16#0#;
      elsif Char < 16#800# then
         return 16#C0#;
      elsif Char < 16#10000# then
         return 16#E0#;
      elsif Char < 16#200000# then
         return 16#F0#;
      elsif Char < 16#4000000# then
         return 16#F8#;
      else
         return 16#FC#;
      end if;
   end Compute_Mask;

   -----------

   -- Width --

   -----------


   function Width (Char : Unicode_Char) return Natural is
   begin
      if Char < 16#80# then
         return 1;
      elsif Char < 16#800# then
         return 2;
      elsif Char < 16#10000# then
         return 3;
      elsif Char < 16#200000# then
         return 4;
      elsif Char < 16#4000000# then
         return 5;
      else
         return 6;
      end if;
   end Width;

   ----------------

   -- From_Utf32 --

   ----------------


   function From_Utf32 (Str : Unicode.CES.Utf32.Utf32_String)
      return Utf8_String
   is
      --  Allocate worst case

      Result : Utf8_String (1 .. (Str'Length / Utf32_Char_Width) * 6);
      J : Positive := Str'First;
      R_Index : Positive := Result'First;
      C : Unicode_Char;
   begin
      while J <= Str'Last loop
         C := Unicode.CES.Utf32.Read (Str, J);
         declare
            Tmp : constant String := Encode (C);
         begin
            Result (R_Index .. R_Index + Tmp'Length - 1) := Tmp;
            R_Index := R_Index + Tmp'Length;
         end;
         J := J + Unicode.CES.Utf32.Width (C);
      end loop;

      return Result (1 .. R_Index - 1);
   end From_Utf32;

   --------------

   -- To_Utf32 --

   --------------


   function To_Utf32 (Str : Utf8_String)
      return Unicode.CES.Utf32.Utf32_LE_String
   is
      --  Allocate worst case

      Result : Utf32_LE_String (1 .. Str'Length * Utf32_Char_Width);
      J : Positive := Str'First;
      R_Index : Positive := 1;
      C : Unicode_Char;
   begin
      while J <= Str'Last loop
         C := Read (Str, J);
         Result (R_Index .. R_Index + Utf32_Char_Width - 1) :=
           Unicode.CES.Utf32.Encode (C);
         R_Index := R_Index + Utf32_Char_Width;
         J := J + Width (C);
      end loop;
      return Result (1 .. R_Index - 1);
   end To_Utf32;

   ----------------------

   -- Internal_Convert --

   ----------------------


   function Internal_Convert
     (Str     : Utf8_String;
      Convert : Unicode.CCS.Conversion_Function := Identity'Access;
      Order   : Byte_Order := Default_Byte_Order) return Utf8_String
   is
      Offset : Natural := 0;
      BOM : Bom_Type;
      C : Unicode_Char;
      J : Natural;
   begin
      Read_Bom (Str, Offset, BOM);

      case BOM is
         when Utf8_All | Unknown => null;
         when others             => raise Invalid_Encoding;
      end case;

      if Convert = Identity'Access then
         return Str (Str'First + Offset .. Str'Last);
      else
         declare
            --  Allocate worst case for the string

            Result : Utf8_String (1 .. Str'Length * 6);
            R_Index : Natural := Result'First;
         begin
            J := Str'First + Offset;
            while J <= Str'Last loop
               C := Read (Str, J);
               declare
                  Tmp : constant String := Encode (Convert (C));
               begin
                  Result (R_Index .. R_Index + Tmp'Length - 1) := Tmp;
                  R_Index := R_Index + Tmp'Length;
               end;
               J := J + Width (C);
            end loop;
            return Result (1 .. R_Index - 1);
         end;
      end if;
   end Internal_Convert;

   -------------------

   -- To_Unicode_LE --

   -------------------


   function To_Unicode_LE
     (Str   : Utf8_String;
      Cs    : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
      Order : Byte_Order := Default_Byte_Order) return Utf8_String is
   begin
      return Internal_Convert (Str, Cs.To_Unicode, Order);
   end To_Unicode_LE;

   -----------

   -- To_CS --

   -----------


   function To_CS
     (Str   : Utf8_String;
      Cs    : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
      Order : Byte_Order := Default_Byte_Order) return Utf8_String is
   begin
      return Internal_Convert (Str, Cs.To_CS, Order);
   end To_CS;
end Unicode.CES.Utf8;