utf8rewind  1.5.0
System library for processing UTF-8 encoded text
utf8rewind.h File Reference

Public interface for UTF-8 functions. More...

Go to the source code of this file.

Macros

#define UTF8_VERSION_MAKE(_major, _minor, _bugfix)   ((_major) * 10000) + ((_minor) * 100) + (_bugfix)
 Macro for creating a version number from a major, minor and bugfix number. More...
 
#define UTF8_VERSION_MAJOR   1
 The major version number of this release. More...
 
#define UTF8_VERSION_MINOR   5
 The minor version number of this release. More...
 
#define UTF8_VERSION_BUGFIX   0
 The bugfix version number of this release. More...
 
#define UTF8_VERSION   UTF8_VERSION_MAKE(UTF8_VERSION_MAJOR, UTF8_VERSION_MINOR, UTF8_VERSION_BUGFIX)
 The version number as an integer. More...
 
#define UTF8_VERSION_STRING   "1.5.0"
 The version number as a string. More...
 
#define UTF8_VERSION_GUARD(_major, _minor, _bugfix)   (UTF8_VERSION >= UTF8_VERSION_MAKE(_major, _minor, _bugfix))
 Check if feature is supported by the current release. More...
 
#define UTF8_ERR_NONE   (0)
 No errors. More...
 
#define UTF8_ERR_INVALID_DATA   (-1)
 Input data is invalid. More...
 
#define UTF8_ERR_INVALID_FLAG   (-2)
 Input flag is invalid. More...
 
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-3)
 Not enough space in buffer to store result. More...
 
#define UTF8_ERR_OVERLAPPING_PARAMETERS   (-4)
 Input and output buffers overlap in memory. More...
 
#define UTF8_ERR_INVALID_LOCALE   (-5)
 Invalid locale specified. More...
 
#define UTF8_LOCALE_DEFAULT   0
 Used for text unaffected by changes in locale. More...
 
#define UTF8_LOCALE_LITHUANIAN   1
 Changes behavior of the case mapping implementation when processing specific code points. For more information, see here: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt. More...
 
#define UTF8_LOCALE_TURKISH_AND_AZERI_LATIN   2
 Changes behavior of the case mapping implementation when processing specific code points. For more information, see here: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt. More...
 
#define UTF8_LOCALE_MAXIMUM   3
 Terminal value for locales. Valid locales do not exceed this value. More...
 
#define UTF8_NORMALIZE_COMPOSE   0x00000001
 Normalize input to Normalization Form C (NFC). More...
 
#define UTF8_NORMALIZE_DECOMPOSE   0x00000002
 Normalize input to Normalization Form D (NFD). More...
 
#define UTF8_NORMALIZE_COMPATIBILITY   0x00000004
 Changes Normalization Form from NFC to NFKC or from NFD to NFKD. More...
 
#define UTF8_NORMALIZATION_RESULT_YES   (0)
 Text is stable and does not need to be normalized. More...
 
#define UTF8_NORMALIZATION_RESULT_MAYBE   (1)
 Text is unstable, but normalization may be skipped. More...
 
#define UTF8_NORMALIZATION_RESULT_NO   (2)
 Text is unstable and must be normalized. More...
 
#define UTF8_CATEGORY_LETTER_UPPERCASE   0x00000001
 Uppercase letter code points, Lu in the Unicode database. More...
 
#define UTF8_CATEGORY_LETTER_LOWERCASE   0x00000002
 Lowercase letter code points, Ll in the Unicode database. More...
 
#define UTF8_CATEGORY_LETTER_TITLECASE   0x00000004
 Titlecase letter code points, Lt in the Unicode database. More...
 
#define UTF8_CATEGORY_LETTER_MODIFIER   0x00000008
 Modifier letter code points, Lm in the Unicode database. More...
 
#define UTF8_CATEGORY_LETTER_OTHER   0x00000010
 Other letter code points, Lo in the Unicode database. More...
 
#define UTF8_CATEGORY_LETTER
 Combined flag for all letter categories. More...
 
#define UTF8_CATEGORY_CASE_MAPPED
 Combined flag for all letter categories with case mapping. More...
 
#define UTF8_CATEGORY_MARK_NON_SPACING   0x00000020
 Non-spacing mark code points, Mn in the Unicode database. More...
 
#define UTF8_CATEGORY_MARK_SPACING   0x00000040
 Spacing mark code points, Mc in the Unicode database. More...
 
#define UTF8_CATEGORY_MARK_ENCLOSING   0x00000080
 Enclosing mark code points, Me in the Unicode database. More...
 
#define UTF8_CATEGORY_MARK
 Combined flag for all mark categories. More...
 
#define UTF8_CATEGORY_NUMBER_DECIMAL   0x00000100
 Decimal number code points, Nd in the Unicode database. More...
 
#define UTF8_CATEGORY_NUMBER_LETTER   0x00000200
 Letter number code points, Nl in the Unicode database. More...
 
#define UTF8_CATEGORY_NUMBER_OTHER   0x00000400
 Other number code points, No in the Unicode database. More...
 
#define UTF8_CATEGORY_NUMBER
 Combined flag for all number categories. More...
 
#define UTF8_CATEGORY_PUNCTUATION_CONNECTOR   0x00000800
 Connector punctuation category, Pc in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION_DASH   0x00001000
 Dash punctuation category, Pd in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION_OPEN   0x00002000
 Open punctuation category, Ps in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION_CLOSE   0x00004000
 Close punctuation category, Pe in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION_INITIAL   0x00008000
 Initial punctuation category, Pi in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION_FINAL   0x00010000
 Final punctuation category, Pf in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION_OTHER   0x00020000
 Other punctuation category, Po in the Unicode database. More...
 
#define UTF8_CATEGORY_PUNCTUATION
 Combined flag for all punctuation categories. More...
 
#define UTF8_CATEGORY_SYMBOL_MATH   0x00040000
 Math symbol category, Sm in the Unicode database. More...
 
#define UTF8_CATEGORY_SYMBOL_CURRENCY   0x00080000
 Currency symbol category, Sc in the Unicode database. More...
 
#define UTF8_CATEGORY_SYMBOL_MODIFIER   0x00100000
 Modifier symbol category, Sk in the Unicode database. More...
 
#define UTF8_CATEGORY_SYMBOL_OTHER   0x00200000
 Other symbol category, So in the Unicode database. More...
 
#define UTF8_CATEGORY_SYMBOL
 Combined flag for all symbol categories. More...
 
#define UTF8_CATEGORY_SEPARATOR_SPACE   0x00400000
 Space separator category, Zs in the Unicode database. More...
 
#define UTF8_CATEGORY_SEPARATOR_LINE   0x00800000
 Line separator category, Zl in the Unicode database. More...
 
#define UTF8_CATEGORY_SEPARATOR_PARAGRAPH   0x01000000
 Paragraph separator category, Zp in the Unicode database. More...
 
#define UTF8_CATEGORY_SEPARATOR
 Combined flag for all separator categories. More...
 
#define UTF8_CATEGORY_CONTROL   0x02000000
 Control category, Cc in the Unicode database. More...
 
#define UTF8_CATEGORY_FORMAT   0x04000000
 Format category, Cf in the Unicode database. More...
 
#define UTF8_CATEGORY_SURROGATE   0x08000000
 Surrogate category, Cs in the Unicode database. More...
 
#define UTF8_CATEGORY_PRIVATE_USE   0x10000000
 Private use category, Co in the Unicode database. More...
 
#define UTF8_CATEGORY_UNASSIGNED   0x20000000
 Unassigned category, Cn in the Unicode database. More...
 
#define UTF8_CATEGORY_COMPATIBILITY   0x40000000
 Flag used for maintaining backwards compatibility with POSIX functions, not found in the Unicode database. More...
 
#define UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER   0x80000000
 Flag used for checking only the general category of code points at the start of a grapheme cluster. More...
 
#define UTF8_CATEGORY_ISCNTRL
 Flag used for maintaining backwards compatibility with POSIX iscntrl function. More...
 
#define UTF8_CATEGORY_ISPRINT
 Flag used for maintaining backwards compatibility with POSIX isprint function. More...
 
#define UTF8_CATEGORY_ISSPACE
 Flag used for maintaining backwards compatibility with POSIX isspace function. More...
 
#define UTF8_CATEGORY_ISBLANK
 Flag used for maintaining backwards compatibility with POSIX isblank function. More...
 
#define UTF8_CATEGORY_ISGRAPH
 Flag used for maintaining backwards compatibility with POSIX isgraph function. More...
 
#define UTF8_CATEGORY_ISPUNCT
 Flag used for maintaining backwards compatibility with POSIX ispunct function. More...
 
#define UTF8_CATEGORY_ISALNUM
 Flag used for maintaining backwards compatibility with POSIX isalnum function. More...
 
#define UTF8_CATEGORY_ISALPHA
 Flag used for maintaining backwards compatibility with POSIX isalpha function. More...
 
#define UTF8_CATEGORY_ISUPPER
 Flag used for maintaining backwards compatibility with POSIX isupper function. More...
 
#define UTF8_CATEGORY_ISLOWER
 Flag used for maintaining backwards compatibility with POSIX islower function. More...
 
#define UTF8_CATEGORY_ISDIGIT
 Flag used for maintaining backwards compatibility with POSIX isdigit function. More...
 
#define UTF8_CATEGORY_ISXDIGIT
 Flag used for maintaining backwards compatibility with POSIX isxdigit function. More...
 
#define UTF8_WCHAR_SIZE   (2)
 Specifies the size of the wchar_t type. On Windows this is two bytes; on POSIX systems it is four. If not specified on the command line, the compiler tries to automatically determine the size of the wchar_t type based on the environment. More...
 
#define UTF8_WCHAR_UTF16   (1)
 The wchar_t type is treated as UTF-16 (two-byte variable-length encoding). More...
 
#define UTF8_API
 Calling convention for public functions. More...
 

Typedefs

typedef uint16_t utf16_t
 UTF-16 encoded code point. More...
 
typedef uint32_t unicode_t
 UTF-32 encoded code point. More...
 

Functions

UTF8_API size_t utf8len (const char *text)
 Get the length in code points of a UTF-8 encoded string. More...
 
UTF8_API size_t utf16toutf8 (const utf16_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a UTF-16 encoded string to a UTF-8 encoded string. More...
 
UTF8_API size_t utf32toutf8 (const unicode_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a UTF-32 encoded string to a UTF-8 encoded string. More...
 
UTF8_API size_t widetoutf8 (const wchar_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a wide string to a UTF-8 encoded string. More...
 
UTF8_API size_t utf8toutf16 (const char *input, size_t inputSize, utf16_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a UTF-16 encoded string. More...
 
UTF8_API size_t utf8toutf32 (const char *input, size_t inputSize, unicode_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a UTF-32 encoded string. More...
 
UTF8_API size_t utf8towide (const char *input, size_t inputSize, wchar_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a wide string. More...
 
UTF8_API const char * utf8seek (const char *text, size_t textSize, const char *textStart, off_t offset, int direction)
 Seek into a UTF-8 encoded string. More...
 
UTF8_API size_t utf8envlocale ()
 Returns the environment's locale as an enum value. More...
 
UTF8_API size_t utf8toupper (const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
 Convert UTF-8 encoded text to uppercase. More...
 
UTF8_API size_t utf8tolower (const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
 Convert UTF-8 encoded text to lowercase. More...
 
UTF8_API size_t utf8totitle (const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
 Convert UTF-8 encoded text to titlecase. More...
 
UTF8_API size_t utf8casefold (const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
 Remove case distinction from UTF-8 encoded text. More...
 
UTF8_API uint8_t utf8isnormalized (const char *input, size_t inputSize, size_t flags, size_t *offset)
 Check if a string is stable in the specified Unicode Normalization Form. More...
 
UTF8_API size_t utf8normalize (const char *input, size_t inputSize, char *target, size_t targetSize, size_t flags, int32_t *errors)
 Normalize a string to the specified Unicode Normalization Form. More...
 
UTF8_API size_t utf8iscategory (const char *input, size_t inputSize, size_t flags)
 Check if the input string conforms to the category specified by the flags. More...
 

Detailed Description

Public interface for UTF-8 functions.

utf8rewind is a system library written in C designed to extend the default string handling functions with support for UTF-8 encoded text.