File : sax-readers.ads
with Input_Sources;
with Sax.Locators;
with Sax.Exceptions;
with Sax.Attributes;
with Sax.Models;
with Unicode;
with Unicode.CES;
with GNAT.Spitbol;
package Sax.Readers is
type Reader is tagged private;
procedure Parse
(Parser : in out Reader;
Input : in out Input_Sources.Input_Source'Class);
-- Parse an XML stream, and calls the appropriate SAX callbacks for each
-- event.
-- This is not re-entrant: you can not call Parse with the same Parser
-- argument in one of the SAX callbacks. This has undefined behavior.
function Get_Feature (Parser : Reader; Name : String) return Boolean;
-- lookup the value of a feature
-- Name is a fully qualified URI.
-- All XML_Readers must recognize the two features Namespace_Feature
-- and Namespace_Prefix_Feature
procedure Set_Feature
(Parser : in out Reader; Name : String; Value : Boolean);
-- Set the state of a feature
-------------------------
-- Recognized features --
-------------------------
-- The two strings below reference the two default features that are
-- recognized by all parsers.
Namespace_Feature : constant String :=
"http://www.xml.org/sax/features/namespace";
-- Controls general namespace processing. If it is true (the default),
-- namespace URIs will be used in events.
-- In fact, this is only given for full compatibility with the SAX
-- standard. As authorized in the standard, this parser will always
-- report URIs to the Start_Element and End_Element callbacks.
--
-- Default is True.
Namespace_Prefixes_Feature : constant String :=
"http://www.xml.org/sax/features/namespace-prefixes";
-- Controls the reporting of qNames and namespace attributes (xmlns*) to
-- the application.
-- When this is False (the default), qNames may optionaly be reported,
-- and namespace attributes must not be reported.
-- Summary of the above two features:
-- 1: Namespace names
-- 2: Start/endPrefixMapping
-- 3: qNames
-- 4: xmlns* attributes
-- namespaces namespace-prefixes 1 2 3 4
-- true false YES YES unknown NO
-- true true YES YES YES YES
-- false false (ILLEGAL COMBINATION)
-- false true unknown unknown YES YES
--
-- Default is False.
Validation_Feature : constant String :=
"http://www.xml.org/sax/features/validation";
-- If True (not the default), a number of additional tests are performed
-- while parsing the document, most notably that the document matches
-- the DTD (internal and external subset).
External_General_Entities_Feature : constant String :=
"http://xml.org/sax/features/external-general-entities";
-- If True, include all external general text entities.
-- If False, these are not included, and will be reported with
-- Content_Handlers.Skipped_Entity.
--
-- Default is True
External_Parameter_Entities_Feature : constant String :=
"http://xml.org/sax/features/external-parameter-entities";
-- If True, include all external parameter entities, including the
-- external DTD subset. Parameter entities are the ones defined in DTDs
-- and whose name starts with '%'
Parameter_Entities_Feature : constant String :=
"http://xml.org/sax/features/lexical-handler/parameter-entities";
-- True if the SAX parser will reports parameter entities through its
-- Lexical_Handler.
-------------------
-- Error handler --
-------------------
-- The following functions are defined in the Error_Handler interface
-- in the SAX standard.
procedure Warning
(Handler : in out Reader;
Except : Sax.Exceptions.Sax_Parse_Exception'Class);
-- Receive notification of a warning.
-- This method is used to report conditions that are not errors or fatal
-- errors.
-- The SAX parser must continue to provide normal parsing events after
-- invoking this method.
-- Default action is to do nothing.
procedure Error
(Handler : in out Reader;
Except : Sax.Exceptions.Sax_Parse_Exception'Class);
-- Receive notification of a recoverable error.
-- For example, a validating parser would use this callback to report the
-- violation of a validity constraint. The default behaviour is to take no
-- Action.
-- The SAX parser must continue to provide normal parsing events after
-- invoking this method. If the application cannot do so, then the parser
-- should report a fatal error.
-- Default action is to do nothing.
procedure Fatal_Error
(Handler : in out Reader;
Except : Sax.Exceptions.Sax_Parse_Exception'Class);
-- Receive notification of a non-recoverable error.
-- For example, a parser would use this callback to report the violation
-- of a well-Formedness constraint.
-- The application must assume that the document is unusable after the
-- parser has invoked this method. Thus, a Program_Error will be raised
-- if your callback returns. You should always raise an exception.
-- Default action is to raise an exception Fatal_Error;
----------------------
-- Content Handlers --
----------------------
-- The following functions are defined in the Content_Handler interface
-- in the SAX standard.
-- The default for all the subprograms below is to do nothing, unless
-- otherwise specified.
procedure Set_Document_Locator
(Handler : in out Reader;
Loc : access Sax.Locators.Locator'Class);
-- Receive an object for locating the origin of SAX document events.
-- SAX parsers are strongly encouraged but not required to give this
-- information. This callback will always be called before any other.
procedure Start_Document (Handler : in out Reader);
-- Receive notification of the beginning of a document.
-- This callback is called only once by the parser, before any other
-- function in this interface except Set_Document_Locator.
procedure End_Document (Handler : in out Reader);
-- Receive notification of the end of a document.
-- This callback will be called only once once it has reached the end of
-- the input stream. It won't be called if a Fatal_Error is raised, it is
-- your responsability to call the callback yourself in this case.
procedure Start_Prefix_Mapping
(Handler : in out Reader;
Prefix : Unicode.CES.Byte_Sequence;
URI : Unicode.CES.Byte_Sequence);
-- Begin the scope of a prefix-URI mapping.
-- This callback is not necessarily for normal namespace processing, since
-- the SAX parser will automatically substitute prefixes for elements and
-- attributes if XML_Readers.Namespace_Feature is set to True.
-- However, there are cases where the automatic replacement can not be
-- safely done, and in this case this callback is invoked.
-- It is not garanteed that calls to End_Prefix_Mapping will occur in the
-- same order (or the reverse one) as Start_Prefix_Mapping.
procedure End_Prefix_Mapping
(Handler : in out Reader;
Prefix : Unicode.CES.Byte_Sequence);
-- End the scope of a prefix-URI mapping.
-- This will always occur after the corresponding End_Element event.
procedure Start_Element
(Handler : in out Reader;
Namespace_URI : Unicode.CES.Byte_Sequence := "";
Local_Name : Unicode.CES.Byte_Sequence := "";
Qname : Unicode.CES.Byte_Sequence := "";
Atts : Sax.Attributes.Attributes'Class);
-- Receive notification of the beginning of an element.
-- There will always be a matching call to End_Element, even for empty
-- elements.
-- Up to three name components can be given for each element, depending
-- on the value of the XML_Reader features.
-- - Namespace_URI and Local_Name are required when Namespace_Feature is
-- True, but are optional if False. If one is specified, both must be.
-- - Qname (qualified name) is required if Namespace_Prefixes_Feature is
-- True, and optional if False. This is basically of the form "Ns:Name"
-- The attribute list will only contain attributes with explicit values. It
-- will contain attributes used for namespace declaration (xmlns*) only if
-- Namespace_Prefixes_Feature is True.
procedure End_Element
(Handler : in out Reader;
Namespace_URI : Unicode.CES.Byte_Sequence := "";
Local_Name : Unicode.CES.Byte_Sequence := "";
Qname : Unicode.CES.Byte_Sequence := "");
-- Receive notification of the end of an element.
procedure Characters
(Handler : in out Reader;
Ch : Unicode.CES.Byte_Sequence);
-- Receives notification of character data.
-- XML parsers may return all contiguous character data in a single chunk,
-- or they may split them into several chunks. However, all of the
-- characters in any single event must come from the same external entity
-- so that the Locator provides useful information
--
-- Note that some parsers will report (and validating parsers must) report
-- whitespace in element content using the Ignorable_Whitespace event.
procedure Ignorable_Whitespace
(Handler : in out Reader;
Ch : Unicode.CES.Byte_Sequence);
-- Receive notification of ignorable whitespace in element content (ie
-- for elements whose xml:space attribute is not set to 'preserve', see
-- XML specifications 2.10)
-- Validating Parsers must use this method to report each chunk of
-- whitespace in element content.
-- SAX parsers may return all contiguous whitespace in a single chunk, or
-- they may split it into several chunks.
procedure Processing_Instruction
(Handler : in out Reader;
Target : Unicode.CES.Byte_Sequence;
Data : Unicode.CES.Byte_Sequence);
-- Receive notification of a processing instruction.
-- A SAX parser must never report an XML declaration (<?xml..?>, 2.8 in
-- XML specifications) or a text declaration (<?xml?>, 4.3.1 in XML
-- specifications) using this method.
procedure Skipped_Entity
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence);
-- Receive notification of a skipped entity.
-- The Parser will invoke this method once for each entity
-- skipped. Non-validating processors may skip entities if they have not
-- seen the declarations (because, for example, the entity was declared in
-- an external DTD subset). All processors may skip external Entities,
-- depending on the value of External_General_Entities_Feature and
-- External_Parameter_Entities_Feature.
--
-- Name is the name of the skipped entity. If it is a parameter entity,
-- the name will begin with '%', and if it is the external DTD subset,
-- it will be the string "[dtd]".
------------------
-- DTD Handlers --
------------------
-- The following functions are defined in the DTD_Handler interface
-- in the SAX standard.
procedure Unparsed_Entity_Decl
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence;
System_Id : Unicode.CES.Byte_Sequence;
Notation_Name : Unicode.CES.Byte_Sequence);
-- Receive notification of an unparsed entity declaration event.
-- This is for entities like "<!ENTITY foo SYSTEM ".." NDATA gif>"
procedure Notation_Decl
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence;
Public_Id : Unicode.CES.Byte_Sequence;
System_Id : Unicode.CES.Byte_Sequence);
-- Receive notification of a notation declaration event.
-- At least one of publicId and systemId must be non-null. If a system
-- identifier is present, and it is a URL, the SAX parser must resolve it
-- fully before passing it to the application through this event.
-- There is no guarantee that the notation declaration will be reported
-- before any unparsed entities that use it.
---------------------
-- Entity Resolver --
---------------------
-- The following functions are defined in the Entity_Resolver interface
-- in the SAX standard.
---------------------
-- Lexical Handler --
---------------------
-- The following functions are defined in the Lexical_Handler interface
-- in the extended SAX standard. This is not part of the standard itself,
-- but rather part of the extension for it.
-- Note that the SAX standard indicates that such extended handlers should
-- be set through properties, but this is not necessary in this
-- implementation where you simply have to override the following
-- subprograms.
procedure Comment
(Handler : in out Reader; Ch : Unicode.CES.Byte_Sequence);
-- Report an XML comment anywhere in the document.
-- Default behavior is to do nothing.
procedure Start_Cdata (Handler : in out Reader);
-- Report the start of a CData section.
-- The content of the section is reported through the usual Characters
-- event, this only acts as the boundary.
procedure End_Cdata (Handler : in out Reader);
-- Report the end of a CData section
procedure Start_Entity
(Handler : in out Reader; Name : Unicode.CES.Byte_Sequence);
-- Report the beginning of some internal and external XML entities.
-- Check the feature Parameter_Entities_Feature to know if the handler
-- will report these events.
procedure End_Entity
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence);
-- Report the end of an entity
procedure Start_DTD
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence;
Public_Id : Unicode.CES.Byte_Sequence := "";
System_Id : Unicode.CES.Byte_Sequence := "");
-- Report the start of DTD declarations, if any.
-- All events reported to a Decl_Handler are reported between a Start_DTD
-- and an End_DTD event.
-- Public_Id and System_Id might be the empty string if none was declared.
-- The events following Start_DTD (and before the matching End_DTD) are
-- assumed to be part of the internal subset of the DTD, unless they
-- appear between a Start_Entity and End_Entity events (with "[dtd]" for
-- the name).
procedure End_DTD (Handler : in out Reader);
-- Report the end of a DTD section
------------------
-- Decl Handler --
------------------
-- The following functions are defined in the Decl_Handler interface
-- in the extended SAX standard. This is not part of the standard itself,
-- but rather part of the extension for it.
procedure Internal_Entity_Decl
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence;
Value : Unicode.CES.Byte_Sequence);
-- Report an internal entity declaration.
-- This is for <!ENTITY...> notations in the DTD, where the value is
-- specified directly as a string.
-- Only the effective (first) declaration for each entity will be reported.
-- All parameter entities in the value will be expanded, but general
-- entities will not.
-- For Parameter entities, Name will start with '%'
procedure External_Entity_Decl
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence;
Public_Id : Unicode.CES.Byte_Sequence;
System_Id : Unicode.CES.Byte_Sequence);
-- Report a parsed external entity declaration, ie when their value is
-- not defined as a string.
procedure Element_Decl
(Handler : in out Reader;
Name : Unicode.CES.Byte_Sequence;
Model : Sax.Models.Element_Model_Ptr);
-- Report an element type declaration.
-- The content model will consist of the string "EMPTY", the string "ANY",
-- or a parenthesised group, optionally followed by an occurrence
-- indicator. The model will be normalized so that all parameter entities
-- are fully resolved and all whitespace is removed,and will include the
-- enclosing parentheses.
-- In addition to the SAX standard, the parsed_model parameter is used to
-- manipulate directly a pre-parsed formed of Model. You shouldn't keep
-- a reference to the pointer, since the memory is freed as soon as your
-- callback ends.
procedure Attribute_Decl
(Handler : in out Reader;
Ename : Unicode.CES.Byte_Sequence;
Aname : Unicode.CES.Byte_Sequence;
Typ : Sax.Attributes.Attribute_Type;
Content : Sax.Models.Element_Model_Ptr;
Value_Default : Sax.Attributes.Default_Declaration;
Value : Unicode.CES.Byte_Sequence);
-- Report an attribute type declaration.
-- Only the first declaration for an attribute will be reported.
-- If Typ is Notation or Enumeration, then Content will contain the
-- description model for the attribute. Otherwise Content is null.
-- Content might be freed when returning from this call.
-- Value_Default represents the attribute default requirements
-- ("#IMPLIED", "#REQUIRED", or "#FIXED").
-- Value is a string representing the attribute's default value, or ""
-- if there is none
XML_Fatal_Error : exception;
private
Max_Buffer_Length : constant := 10000;
-- Length of internal buffer.
-- This is also the maximum length of tag names.
Entities_Table_Size : constant := 50;
-- Size of the hash-table used to store entities.
-- This is not a hard limit on the number of entities that can be defined.
-- However, if this number is too small with regards to the number of
-- entities, there will be conflicts in the hash-table that will slow
-- down the lookup.
Default_Atts_Table_Size : constant := 50;
-- Size of the hash-table used to store the default attributes
--------------
-- Entities --
--------------
-- We need to memorize all the declared entities, so as to do the
-- substitution ourselves.
type Entity_Entry is record
Value : Unicode.CES.Byte_Sequence_Access;
External : Boolean;
Already_Read : Boolean := False;
-- True if the value of the entity was already read. This is used to
-- detect entities referencing themselves.
end record;
Null_Entity : constant Entity_Entry := (null, False, False);
function Entity_Img (A : Entity_Entry) return String;
package Entity_Table is new GNAT.Spitbol.Table
(Value_Type => Entity_Entry,
Null_Value => Null_Entity,
Img => Entity_Img);
type Entity_Input_Source;
type Entity_Input_Source_Access is access Entity_Input_Source;
type Entity_Input_Source is record
External : Boolean;
Next : Entity_Input_Source_Access;
Name : Unicode.CES.Byte_Sequence_Access;
-- Name of the entity
Handle_Strings : Boolean := True;
-- True if " and ' should be recognized as special characters.
-- This is used so that a string started in one stream isn't terminated
-- in another entity or stream.
Id : Natural;
-- Uniq ID for each input source
Input : Input_Sources.Input_Source_Access;
Save_Loc : Sax.Locators.Locator_Impl;
end record;
type Parser_State is record
Name : String (1 .. 3);
-- Name of the state (debugging purposes)
Ignore_Special : Boolean := False;
-- True if special characters should be ignored (as is the case in
-- strings). ??? Could be ignored, duplicates Greater_Special,
-- Less_Special, ..
Detect_End_Of_PI : Boolean := False;
-- Whether ?> should be reported as end of PI
Greater_Special : Boolean := False;
-- Whether > is considered a special character
Less_Special : Boolean := False;
-- Should be true if < should be reported separately. Note that in that
-- case it won't even be associated with the following character if
-- it is '!', '?',...
Expand_Param_Entities : Boolean := False;
-- True if %...; param entities should be recognized, as is the case in
-- the DTD
Expand_Entities : Boolean := True;
-- True if &...; should be recognized
Expand_Character_Ref : Boolean := True;
-- True if character references &#...; should be recognized
In_DTD : Boolean := False;
-- True if we are parsing the DTD, and '['. ']' and '<!' should be
-- recognized as special tags
Recognize_External : Boolean := False;
-- True if PUBLIC, SYSTEM and NDATA should be recognized as special
-- tokens
In_Attlist : Boolean := False;
-- True if we are in an <!ATTLIST, and we should recognize special
-- keywords like ID, NMTOKEN,...
Handle_Strings : Boolean := False;
-- True if " and ' should be recognized as special characters
-- ??? Should be merged with a In_String field, that would also replace
-- Ignore_Special.
In_Tag : Boolean := False;
-- True if = and : should be recognized as special characters
Report_Parenthesis : Boolean := False;
-- True if Opening_Parenthesis should be reported separately
end record;
type XML_NS_Record;
type XML_NS is access XML_NS_Record;
type XML_NS_Record is record
Prefix : Unicode.CES.Byte_Sequence_Access;
URI : Unicode.CES.Byte_Sequence_Access;
Next : XML_NS;
end record;
type Element;
type Element_Access is access Element;
type Element is record
NS : Unicode.CES.Byte_Sequence_Access;
Name : Unicode.CES.Byte_Sequence_Access;
Parent : Element_Access;
Start_Id : Natural;
-- Id of the Input source for the start tag. End tag must end on the
-- same entity.
Namespaces : XML_NS;
-- Namespaces defined for that element and its children
end record;
type Attributes_Ptr is access all Sax.Attributes.Attributes'Class;
function Attributes_Img (A : Attributes_Ptr) return String;
package Attributes_Table is new GNAT.Spitbol.Table
(Value_Type => Attributes_Ptr,
Null_Value => null,
Img => Attributes_Img);
package Notations_Table is new GNAT.Spitbol.Table
(Value_Type => Boolean,
Null_Value => False,
Img => Boolean'Image);
-- For notations, we simply store whether they have been defined or not,
-- and then only for validating parsers
type Reader is tagged record
Buffer_Length : Natural := 0;
Buffer : Unicode.CES.Byte_Sequence (1 .. Max_Buffer_Length);
Last_Read : Unicode.Unicode_Char;
State : Parser_State;
Locator : Sax.Locators.Locator_Impl_Access;
Current_Node : Element_Access;
Inputs : Entity_Input_Source_Access;
-- Entities and parameter entities are processed inline (if we
-- temporarily substitute the input stream with the replacement text
-- for the entity).
-- When Inputs is null, the characters are read from the input stream
-- given in the call to Parser.
Close_Inputs : Entity_Input_Source_Access;
-- List of entities to be closed at the next call to Next_Token
In_External_Entity : Boolean;
-- Whether we are parsing an external entity
Previous_Char_Was_CR : Boolean;
-- True if the previous character read from the stream was a
-- Carriage_Return (needed since XML parsers must convert these to
-- one single Line_Feed).
Default_Atts : Attributes_Table.Table (Default_Atts_Table_Size);
-- This table contains the list of default attributes defined for
-- each element in the DTD. Index is the name of the elements
-- ??? Should be freed
Notations : Notations_Table.Table (Default_Atts_Table_Size);
-- List of notations defined in the XML document. This is left empty
-- if the parser isn't configured to do validation.
Entities : Entity_Table.Table (Entities_Table_Size);
-- ??? Entities should be freed on destruction
DTD_Name : Unicode.CES.Byte_Sequence_Access;
-- Name of the DTD, and also name of the root element (in case we have
-- a validating parser). This is left to null for non-validating
-- parsers.
Ignore_State_Special : Boolean;
-- If True, ignore the State.Ignore_Special flag in the next call
-- to Next_Token. This is used for handling of special characters
-- withing strings.
Default_Namespaces : XML_NS;
-- All the namespaces defined by default
Num_Toplevel_Elements : Natural;
-- Number of elements at the toplevel
Element_Id : Natural := 0;
-- Id of the current element. All elements created will have a
-- different Id
Feature_Namespace : Boolean := True;
Feature_Namespace_Prefixes : Boolean := False;
Feature_External_General_Entities : Boolean := True;
Feature_External_Parameter_Entities : Boolean := True;
Feature_Validation : Boolean := False;
end record;
end Sax.Readers;