File : unicode-ces-utf16.adb


-----------------------------------------------------------------------

--                XML/Ada - An XML suite for Ada95                   --

--                                                                   --

--                       Copyright (C) 2001                          --

--                            ACT-Europe                             --

--                       Author: Emmanuel Briot                      --

--                                                                   --

-- This library is free software; you can redistribute it and/or     --

-- modify it under the terms of the GNU General Public               --

-- License as published by the Free Software Foundation; either      --

-- version 2 of the License, or (at your option) any later version.  --

--                                                                   --

-- This library is distributed in the hope that it will be useful,   --

-- but WITHOUT ANY WARRANTY; without even the implied warranty of    --

-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU --

-- General Public License for more details.                          --

--                                                                   --

-- You should have received a copy of the GNU General Public         --

-- License along with this library; if not, write to the             --

-- Free Software Foundation, Inc., 59 Temple Place - Suite 330,      --

-- Boston, MA 02111-1307, USA.                                       --

--                                                                   --

-- As a special exception, if other files instantiate generics from  --

-- this unit, or you link this unit with other files to produce an   --

-- executable, this  unit  does not  by itself cause  the resulting  --

-- executable to be covered by the GNU General Public License. This  --

-- exception does not however invalidate any other reasons why the   --

-- executable file  might be covered by the  GNU Public License.     --

-----------------------------------------------------------------------


with Unicode.CES.Utf32;  use Unicode.CES.Utf32;
with Unicode.CCS;        use Unicode.CCS;

package body Unicode.CES.Utf16 is

   ------------

   -- Encode --

   ------------


   function Encode (Char : Unicode_Char) return Utf16_LE_String is
      C, D : Unicode_Char;
   begin
      if Char < 16#10000# then
         C := Char and 16#00FF#;
         D := (Char and 16#FF00#) / (2 ** 8);
         return Character'Val (C) & Character'Val (D);
      else
         C := 16#D800#
           + ((Char - 16#10000#) and 2#11111111110000000000#) / (2 ** 10);
         D := 16#DC00#
           + ((Char - 16#10000#) and 2#1111111111#);
         return Character'Val (C and 16#00FF#)
           & Character'Val ((C and 16#FF00#) / (2 ** 8))
           & Character'Val (D and 16#00FF#)
           & Character'Val ((D and 16#FF00#) / (2 ** 8));
      end if;
   end Encode;

   ---------------

   -- Encode_BE --

   ---------------


   function Encode_BE (Char : Unicode_Char) return Utf16_BE_String is
      C, D : Unicode_Char;
   begin
      if Char < 16#10000# then
         C := Char and 16#00FF#;
         D := (Char and 16#FF00#) / (2 ** 8);
         return Character'Val (D) & Character'Val (C);
      else
         C := 16#D800#
           + ((Char - 16#10000#) and 2#11111111110000000000#) / (2 ** 10);
         D := 16#DC00#
           + ((Char - 16#10000#) and 2#1111111111#);
         return Character'Val ((C and 16#FF00#) / (2 ** 8))
           & Character'Val (C and 16#00FF#)
           & Character'Val ((D and 16#FF00#) / (2 ** 8))
           & Character'Val (D and 16#00FF#);
      end if;
   end Encode_BE;

   ----------

   -- Read --

   ----------


   function Read (Str : Utf16_LE_String; Index : Positive) return Unicode_Char
   is
      C, D : Unicode_Char;
   begin
      C := Character'Pos (Str (Index + 1)) * 256 + Character'Pos (Str (Index));

      --  High surrogate value

      if C in 16#D800# .. 16#DBFF# then
         D := Character'Pos (Str (Index + 3)) * 256
           + Character'Pos (Str (Index + 2));

         --  Not a low surrogate ?

         if not (D in 16#DC00# .. 16#DFFF#) then
            raise Invalid_Encoding;
         end if;

         C := C and 2#1111111111#;
         D := D and 2#1111111111#;
         C := C * 2#10000000000# + D + 16#10000#;
      end if;

      return C;
   end Read;

   -------------

   -- Read_BE --

   -------------


   function Read_BE
     (Str : Utf16_BE_String; Index : Positive) return Unicode_Char
   is
      C, D : Unicode_Char;
   begin
      C := Character'Pos (Str (Index)) * 256 + Character'Pos (Str (Index + 1));

      --  High surrogate value

      if C in 16#D800# .. 16#DBFF# then
         D := Character'Pos (Str (Index + 2)) * 256
           + Character'Pos (Str (Index + 3));

         --  Not a low surrogate ?

         if not (D in 16#DC00# .. 16#DFFF#) then
            raise Invalid_Encoding;
         end if;

         C := C and 2#1111111111#;
         D := D and 2#1111111111#;
         C := C * 2#10000000000# + D + 16#10000#;
      end if;

      return C;
   end Read_BE;

   -----------

   -- Width --

   -----------


   function Width (Char : Unicode_Char) return Natural is
   begin
      if Char >= 16#10000# then
         return 4;
      else
         return 2;
      end if;
   end Width;

   ------------

   -- Length --

   ------------


   function Length (Str : Utf16_String) return Natural is
      Pos : Natural := Str'First;
      Len : Natural := 0;
   begin
      while Pos <= Str'Last loop
         Pos := Pos + Width (Read (Str, Pos));
         Len := Len + 1;
      end loop;
      return Len;
   end Length;

   ----------------

   -- From_Utf32 --

   ----------------


   function From_Utf32
     (Str : Unicode.CES.Utf32.Utf32_LE_String)
      return Utf16_LE_String
   is
      Result : Utf16_LE_String (1 .. (Str'Length / Utf32_Char_Width) * 4);
      J : Positive := Str'First;
      R_Index : Positive := Result'First;
      C : Unicode_Char;
   begin
      while J <= Str'Last loop
         C := Unicode.CES.Utf32.Read (Str, J);
         declare
            Tmp : constant String := Encode (C);
         begin
            Result (R_Index .. R_Index + Tmp'Length - 1) := Tmp;
            R_Index := R_Index + Tmp'Length;
         end;
         J := J + Unicode.CES.Utf32.Width (C);
      end loop;
      return Result (1 .. R_Index - 1);
   end From_Utf32;

   --------------

   -- To_Utf32 --

   --------------


   function To_Utf32
     (Str : Utf16_LE_String)
      return Unicode.CES.Utf32.Utf32_LE_String
   is
      Result : Utf32_LE_String (1 .. (Str'Length / 2) * Utf32_Char_Width);
      J : Positive := Str'First;
      R_Index : Positive := 1;
      C : Unicode_Char;
   begin
      while J <= Str'Last loop
         C := Read (Str, J);
         Result (R_Index .. R_Index + Utf32_Char_Width - 1) :=
           Unicode.CES.Utf32.Encode (C);
         R_Index := R_Index + Utf32_Char_Width;
         J := J + Width (C);
      end loop;
      return Result (1 .. R_Index - 1);
   end To_Utf32;

   -------------------

   -- To_Unicode_LE --

   -------------------

   --  ??? Note: this assumes that the original character and its

   --  conversion are encoded on the same length, which is always

   --  right so far with Unicode.


   function To_Unicode_LE
     (Str   : Utf16_String;
      Cs    : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
      Order : Byte_Order := Default_Byte_Order) return Utf16_LE_String
   is
      BOM    : Bom_Type;
      Offset : Natural := 0;
      O      : Byte_Order := Order;
      J      : Positive := Str'First;
      S      : Utf16_LE_String (1 .. Str'Length);
      C      : Unicode_Char;

   begin
      Read_Bom (Str, Offset, BOM);

      case BOM is
         when Utf16_LE => O := Low_Byte_First;
         when Utf16_BE => O := High_Byte_First;
         when Unknown  => null;
         when others   => raise Invalid_Encoding;
      end case;

      if O = Low_Byte_First then
         if Cs.To_Unicode = Identity'Access then
            return Str (Str'First + Offset .. Str'Last);
         else
            J := J + Offset;
            while J <= Str'Last loop
               C := Cs.To_Unicode (Read (Str, J));
               S (J .. J + Width (C) - 1) := Encode (C);
               J := J + Width (C);
            end loop;
            return S (S'First + Offset .. S'Last);
         end if;
      else
         J := J + Offset;
         if Cs.To_Unicode = Identity'Access then
            while J <= Str'Last loop
               S (J + 1) := Str (J);
               S (J)     := Str (J + 1);
               J := J + 2;
            end loop;
         else
            while J <= Str'Last loop
               C := Cs.To_Unicode (Read_BE (Str, J));
               S (J .. J + Width (C) - 1) := Encode (C);
               J := J + Width (C);
            end loop;
            return S (S'First + Offset .. S'Last);
         end if;
         return S (S'First + Offset .. S'Last);
      end if;
   end To_Unicode_LE;

   -----------

   -- To_CS --

   -----------


   function To_CS
     (Str   : Utf16_LE_String;
      Cs    : Unicode.CCS.Character_Set := Unicode.CCS.Unicode_Character_Set;
      Order : Byte_Order := Default_Byte_Order) return Utf16_String
   is
      Offset : Natural := 0;
      J      : Positive := Str'First;
      S      : Utf16_LE_String (1 .. Str'Length);
      C      : Unicode_Char;
   begin
      if Order = Low_Byte_First then
         if Cs.To_CS = Identity'Access then
            return Str (Str'First + Offset .. Str'Last);
         else
            J := J + Offset;
            while J <= Str'Last loop
               C := Cs.To_CS (Read (Str, J));
               S (J .. J + Width (C) - 1) := Encode (C);
               J := J + Width (C);
            end loop;
            return S (S'First + Offset .. S'Last);
         end if;
      else
         J := J + Offset;
         if Cs.To_CS = Identity'Access then
            while J <= Str'Last loop
               S (J + 1) := Str (J);
               S (J)     := Str (J + 1);
               J := J + 2;
            end loop;
         else
            while J <= Str'Last loop
               C := Cs.To_CS (Read (Str, J));
               S (J .. J + Width (C) - 1) := Encode_BE (C);
               J := J + Width (C);
            end loop;
            return S (S'First + Offset .. S'Last);
         end if;
         return S (S'First + Offset .. S'Last);
      end if;
   end To_CS;

end Unicode.CES.Utf16;