utf8rewind  1.5.1
System library for processing UTF-8 encoded text
utf8rewind.h
Go to the documentation of this file.
1 /*
2  Copyright (C) 2014-2016 Quinten Lansu
3 
4  Permission is hereby granted, free of charge, to any person
5  obtaining a copy of this software and associated documentation
6  files (the "Software"), to deal in the Software without
7  restriction, including without limitation the rights to use,
8  copy, modify, merge, publish, distribute, sublicense, and/or
9  sell copies of the Software, and to permit persons to whom the
10  Software is furnished to do so, subject to the following
11  conditions:
12 
13  The above copyright notice and this permission notice shall be
14  included in all copies or substantial portions of the Software.
15 
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  OTHER DEALINGS IN THE SOFTWARE.
24 */
25 
26 #ifndef _UTF8REWIND_H_
27 #define _UTF8REWIND_H_
28 
65 #include <locale.h>
66 #include <stddef.h>
67 #include <stdio.h>
68 #include <stdint.h>
69 #include <string.h>
70 #include <wchar.h>
71 
/**
 * Pack a three-part version number into a single integer:
 * major * 10000 + minor * 100 + bugfix.
 *
 * The whole expansion is parenthesized so the result stays one operand when
 * the macro is used inside a larger expression (for example when it is
 * multiplied, subtracted from another value, or compared).
 */
#define UTF8_VERSION_MAKE(_major, _minor, _bugfix) \
    (((_major) * 10000) + ((_minor) * 100) + (_bugfix))

/** Major component of this header's version. */
#define UTF8_VERSION_MAJOR 1

/** Minor component of this header's version. */
#define UTF8_VERSION_MINOR 5

/** Bugfix component of this header's version. */
#define UTF8_VERSION_BUGFIX 1

/** This header's version, packed with UTF8_VERSION_MAKE (10501 for 1.5.1). */
#define UTF8_VERSION \
    UTF8_VERSION_MAKE(UTF8_VERSION_MAJOR, UTF8_VERSION_MINOR, UTF8_VERSION_BUGFIX)

/** This header's version as a string literal. */
#define UTF8_VERSION_STRING "1.5.1"

/**
 * Evaluates to non-zero when this header's version is at least
 * _major._minor._bugfix. Usable in #if directives.
 */
#define UTF8_VERSION_GUARD(_major, _minor, _bugfix) \
    (UTF8_VERSION >= UTF8_VERSION_MAKE(_major, _minor, _bugfix))
122 
/*
 * Error codes.
 *
 * UTF8_ERR_NONE is zero; every other code is a distinct negative value.
 * NOTE(review): presumably these are the values written through the
 * `errors` out-parameter of the functions declared in this header; confirm
 * against the implementation.
 */

/** No error. */
#define UTF8_ERR_NONE                   (0)

/** Error code -1 (named for invalid data). */
#define UTF8_ERR_INVALID_DATA           (-1)

/** Error code -2 (named for an invalid flag). */
#define UTF8_ERR_INVALID_FLAG           (-2)

/** Error code -3 (named for insufficient space). */
#define UTF8_ERR_NOT_ENOUGH_SPACE       (-3)

/** Error code -4 (named for overlapping parameters). */
#define UTF8_ERR_OVERLAPPING_PARAMETERS (-4)

/** Error code -5 (named for an invalid locale). */
#define UTF8_ERR_INVALID_LOCALE         (-5)
167 
/*
 * Locale identifiers, numbered consecutively from zero.
 *
 * NOTE(review): presumably the values accepted by the `locale` parameter of
 * the case-mapping functions declared in this header; confirm against the
 * implementation.
 */

/** Locale identifier 0. */
#define UTF8_LOCALE_DEFAULT                 0

/** Locale identifier 1. */
#define UTF8_LOCALE_LITHUANIAN              1

/** Locale identifier 2. */
#define UTF8_LOCALE_TURKISH_AND_AZERI_LATIN 2

/** One past the largest locale identifier defined above. */
#define UTF8_LOCALE_MAXIMUM                 3
204 
/*
 * Normalization flags. Each flag occupies its own bit, so they can be
 * combined with bitwise OR.
 *
 * NOTE(review): presumably passed as the `flags` argument of
 * utf8normalize() and utf8isnormalized(); confirm against the
 * implementation.
 */

/** Normalization flag, bit 0. */
#define UTF8_NORMALIZE_COMPOSE       0x00000001

/** Normalization flag, bit 1. */
#define UTF8_NORMALIZE_DECOMPOSE     0x00000002

/** Normalization flag, bit 2. */
#define UTF8_NORMALIZE_COMPATIBILITY 0x00000004

/*
 * Normalization check results: three distinct values (yes / maybe / no).
 *
 * NOTE(review): presumably the return values of utf8isnormalized(); confirm
 * against the implementation.
 */

/** Result value 0. */
#define UTF8_NORMALIZATION_RESULT_YES   (0)

/** Result value 1. */
#define UTF8_NORMALIZATION_RESULT_MAYBE (1)

/** Result value 2. */
#define UTF8_NORMALIZATION_RESULT_NO    (2)
249 
/*
 * Category flags.
 *
 * Every single-category flag below occupies its own bit of a 32-bit mask;
 * together they use all 32 bits. The group macros (UTF8_CATEGORY_LETTER,
 * UTF8_CATEGORY_MARK, ...) are the bitwise OR of the flags in that group.
 *
 * NOTE(review): presumably passed as the `flags` argument of
 * utf8iscategory(); confirm against the implementation.
 */

/* Letters: bits 0-4. */
#define UTF8_CATEGORY_LETTER_UPPERCASE 0x00000001
#define UTF8_CATEGORY_LETTER_LOWERCASE 0x00000002
#define UTF8_CATEGORY_LETTER_TITLECASE 0x00000004
#define UTF8_CATEGORY_LETTER_MODIFIER  0x00000008
#define UTF8_CATEGORY_LETTER_OTHER     0x00000010

/** All five letter flags. */
#define UTF8_CATEGORY_LETTER \
    (UTF8_CATEGORY_LETTER_UPPERCASE | UTF8_CATEGORY_LETTER_LOWERCASE | \
    UTF8_CATEGORY_LETTER_TITLECASE | UTF8_CATEGORY_LETTER_MODIFIER | \
    UTF8_CATEGORY_LETTER_OTHER)

/** The uppercase, lowercase and titlecase letter flags. */
#define UTF8_CATEGORY_CASE_MAPPED \
    (UTF8_CATEGORY_LETTER_UPPERCASE | UTF8_CATEGORY_LETTER_LOWERCASE | \
    UTF8_CATEGORY_LETTER_TITLECASE)

/* Marks: bits 5-7. */
#define UTF8_CATEGORY_MARK_NON_SPACING 0x00000020
#define UTF8_CATEGORY_MARK_SPACING     0x00000040
#define UTF8_CATEGORY_MARK_ENCLOSING   0x00000080

/** All three mark flags. */
#define UTF8_CATEGORY_MARK \
    (UTF8_CATEGORY_MARK_NON_SPACING | UTF8_CATEGORY_MARK_SPACING | \
    UTF8_CATEGORY_MARK_ENCLOSING)

/* Numbers: bits 8-10. */
#define UTF8_CATEGORY_NUMBER_DECIMAL 0x00000100
#define UTF8_CATEGORY_NUMBER_LETTER  0x00000200
#define UTF8_CATEGORY_NUMBER_OTHER   0x00000400

/** All three number flags. */
#define UTF8_CATEGORY_NUMBER \
    (UTF8_CATEGORY_NUMBER_DECIMAL | UTF8_CATEGORY_NUMBER_LETTER | \
    UTF8_CATEGORY_NUMBER_OTHER)

/* Punctuation: bits 11-17. */
#define UTF8_CATEGORY_PUNCTUATION_CONNECTOR 0x00000800
#define UTF8_CATEGORY_PUNCTUATION_DASH      0x00001000
#define UTF8_CATEGORY_PUNCTUATION_OPEN      0x00002000
#define UTF8_CATEGORY_PUNCTUATION_CLOSE     0x00004000
#define UTF8_CATEGORY_PUNCTUATION_INITIAL   0x00008000
#define UTF8_CATEGORY_PUNCTUATION_FINAL     0x00010000
#define UTF8_CATEGORY_PUNCTUATION_OTHER     0x00020000

/** All seven punctuation flags. */
#define UTF8_CATEGORY_PUNCTUATION \
    (UTF8_CATEGORY_PUNCTUATION_CONNECTOR | UTF8_CATEGORY_PUNCTUATION_DASH | \
    UTF8_CATEGORY_PUNCTUATION_OPEN | UTF8_CATEGORY_PUNCTUATION_CLOSE | \
    UTF8_CATEGORY_PUNCTUATION_INITIAL | UTF8_CATEGORY_PUNCTUATION_FINAL | \
    UTF8_CATEGORY_PUNCTUATION_OTHER)

/* Symbols: bits 18-21. */
#define UTF8_CATEGORY_SYMBOL_MATH     0x00040000
#define UTF8_CATEGORY_SYMBOL_CURRENCY 0x00080000
#define UTF8_CATEGORY_SYMBOL_MODIFIER 0x00100000
#define UTF8_CATEGORY_SYMBOL_OTHER    0x00200000

/** All four symbol flags. */
#define UTF8_CATEGORY_SYMBOL \
    (UTF8_CATEGORY_SYMBOL_MATH | UTF8_CATEGORY_SYMBOL_CURRENCY | \
    UTF8_CATEGORY_SYMBOL_MODIFIER | UTF8_CATEGORY_SYMBOL_OTHER)

/* Separators: bits 22-24. */
#define UTF8_CATEGORY_SEPARATOR_SPACE     0x00400000
#define UTF8_CATEGORY_SEPARATOR_LINE      0x00800000
#define UTF8_CATEGORY_SEPARATOR_PARAGRAPH 0x01000000

/** All three separator flags. */
#define UTF8_CATEGORY_SEPARATOR \
    (UTF8_CATEGORY_SEPARATOR_SPACE | UTF8_CATEGORY_SEPARATOR_LINE | \
    UTF8_CATEGORY_SEPARATOR_PARAGRAPH)

/* Remaining single-bit categories: bits 25-29. */
#define UTF8_CATEGORY_CONTROL     0x02000000
#define UTF8_CATEGORY_FORMAT      0x04000000
#define UTF8_CATEGORY_SURROGATE   0x08000000
#define UTF8_CATEGORY_PRIVATE_USE 0x10000000
#define UTF8_CATEGORY_UNASSIGNED  0x20000000

/**
 * Bit 30. Included in every UTF8_CATEGORY_IS* mask below.
 */
#define UTF8_CATEGORY_COMPATIBILITY 0x40000000

/**
 * Bit 31. The value does not fit in a 32-bit int, so on such platforms the
 * literal has an unsigned type.
 */
#define UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER 0x80000000

/*
 * Masks named after the <ctype.h> classification functions. Each one is
 * UTF8_CATEGORY_COMPATIBILITY combined with the category flags listed.
 */

/** Compatibility + control. */
#define UTF8_CATEGORY_ISCNTRL \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_CONTROL)

/** Compatibility + letter, number, punctuation, symbol and separator. */
#define UTF8_CATEGORY_ISPRINT \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER | \
    UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL | \
    UTF8_CATEGORY_SEPARATOR)

/** Compatibility + space separator. */
#define UTF8_CATEGORY_ISSPACE \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_SEPARATOR_SPACE)

/**
 * Compatibility + space separator + private use.
 * NOTE(review): the private-use bit also appears in UTF8_CATEGORY_ISXDIGIT,
 * so its presence here looks deliberate; confirm against the implementation.
 */
#define UTF8_CATEGORY_ISBLANK \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_SEPARATOR_SPACE | UTF8_CATEGORY_PRIVATE_USE)

/** Compatibility + letter, number, punctuation and symbol. */
#define UTF8_CATEGORY_ISGRAPH \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER | \
    UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL)

/** Compatibility + punctuation and symbol. */
#define UTF8_CATEGORY_ISPUNCT \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL)

/** Compatibility + letter and number. */
#define UTF8_CATEGORY_ISALNUM \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER)

/** Compatibility + letter. */
#define UTF8_CATEGORY_ISALPHA \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_LETTER)

/** Compatibility + uppercase letter. */
#define UTF8_CATEGORY_ISUPPER \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_LETTER_UPPERCASE)

/** Compatibility + lowercase letter. */
#define UTF8_CATEGORY_ISLOWER \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_LETTER_LOWERCASE)

/** Compatibility + number. */
#define UTF8_CATEGORY_ISDIGIT \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_NUMBER)

/**
 * Compatibility + number + private use.
 * NOTE(review): see the note on UTF8_CATEGORY_ISBLANK about the private-use
 * bit; confirm against the implementation.
 */
#define UTF8_CATEGORY_ISXDIGIT \
    (UTF8_CATEGORY_COMPATIBILITY | \
    UTF8_CATEGORY_NUMBER | UTF8_CATEGORY_PRIVATE_USE)
622 
/*
 * UTF8_WCHAR_SIZE: width of wchar_t in bytes, either 4 or 2.
 *
 * The #ifndef lets a build define UTF8_WCHAR_SIZE up front; otherwise it is
 * detected. The size is taken to be 4 when the compiler reports
 * __SIZEOF_WCHAR_T__ == 4 or when either WCHAR_MAX or __WCHAR_MAX__ exceeds
 * UINT16_MAX, and 2 in every other case. Macros that are not defined
 * evaluate to 0 inside #if, so a missing probe simply does not match.
 */
#ifndef UTF8_WCHAR_SIZE
    #if (__SIZEOF_WCHAR_T__ == 4) || (WCHAR_MAX > UINT16_MAX) || (__WCHAR_MAX__ > UINT16_MAX)
        #define UTF8_WCHAR_SIZE (4)
    #else
        #define UTF8_WCHAR_SIZE (2)
    #endif
#endif

/*
 * Exactly one of UTF8_WCHAR_UTF32 / UTF8_WCHAR_UTF16 is defined, depending
 * on UTF8_WCHAR_SIZE. Any size other than 4 or 2 stops compilation.
 */
#if (UTF8_WCHAR_SIZE == 4)
    /** Defined to 1 when wchar_t is 4 bytes wide. */
    #define UTF8_WCHAR_UTF32 (1)
#elif (UTF8_WCHAR_SIZE == 2)
    /** Defined to 1 when wchar_t is 2 bytes wide. */
    #define UTF8_WCHAR_UTF16 (1)
#else
    #error Invalid size for wchar_t type.
#endif
665 
/*
 * UTF8_API: prefix placed in front of every public function declaration in
 * this header.
 *
 * Unless the build has already defined it, it expands to `extern "C"` when
 * compiled as C++ and to nothing when compiled as C.
 */
#ifndef UTF8_API
    #ifdef __cplusplus
        #define UTF8_API extern "C"
    #else
        #define UTF8_API
    #endif
#endif
678 
/** UTF-16 encoded code point (unsigned 16-bit value). */
typedef uint16_t utf16_t;

/** UTF-32 encoded code point (unsigned 32-bit value). */
typedef uint32_t unicode_t;
699 
726 UTF8_API size_t utf8len(const char* text);
727 
772 UTF8_API size_t utf16toutf8(const utf16_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
773 
830 UTF8_API size_t utf32toutf8(const unicode_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
831 
893 UTF8_API size_t widetoutf8(const wchar_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
894 
944 UTF8_API size_t utf8toutf16(const char* input, size_t inputSize, utf16_t* target, size_t targetSize, int32_t* errors);
945 
990 UTF8_API size_t utf8toutf32(const char* input, size_t inputSize, unicode_t* target, size_t targetSize, int32_t* errors);
991 
1064 UTF8_API size_t utf8towide(const char* input, size_t inputSize, wchar_t* target, size_t targetSize, int32_t* errors);
1065 
1119 UTF8_API const char* utf8seek(const char* text, size_t textSize, const char* textStart, off_t offset, int direction);
1120 
1175 UTF8_API size_t utf8envlocale();
1176 
1267 UTF8_API size_t utf8toupper(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors);
1268 
1363 UTF8_API size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors);
1364 
1446 UTF8_API size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors);
1447 
1551 UTF8_API size_t utf8casefold(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors);
1552 
1636 UTF8_API uint8_t utf8isnormalized(const char* input, size_t inputSize, size_t flags, size_t* offset);
1637 
1769 UTF8_API size_t utf8normalize(const char* input, size_t inputSize, char* target, size_t targetSize, size_t flags, int32_t* errors);
1770 
1864 UTF8_API size_t utf8iscategory(const char* input, size_t inputSize, size_t flags);
1865 
1870 #endif /* _UTF8REWIND_H_ */
UTF8_API size_t utf8totitle(const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
Convert UTF-8 encoded text to titlecase.
UTF8_API size_t utf8iscategory(const char *input, size_t inputSize, size_t flags)
Check if the input string conforms to the category specified by the flags.
UTF8_API const char * utf8seek(const char *text, size_t textSize, const char *textStart, off_t offset, int direction)
Seek into a UTF-8 encoded string.
UTF8_API size_t utf8envlocale()
Returns the environment's locale as an enum value.
UTF8_API size_t utf8towide(const char *input, size_t inputSize, wchar_t *target, size_t targetSize, int32_t *errors)
Convert a UTF-8 encoded string to a wide string.
#define UTF8_API
Calling convention for public functions.
Definition: utf8rewind.h:675
UTF8_API size_t utf8toutf16(const char *input, size_t inputSize, utf16_t *target, size_t targetSize, int32_t *errors)
Convert a UTF-8 encoded string to a UTF-16 encoded string.
uint16_t utf16_t
UTF-16 encoded code point.
Definition: utf8rewind.h:692
UTF8_API size_t utf8toupper(const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
Convert UTF-8 encoded text to uppercase.
UTF8_API size_t utf32toutf8(const unicode_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert a UTF-32 encoded string to a UTF-8 encoded string.
UTF8_API size_t utf8len(const char *text)
Get the length in code points of a UTF-8 encoded string.
UTF8_API size_t utf8tolower(const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
Convert UTF-8 encoded text to lowercase.
UTF8_API size_t widetoutf8(const wchar_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert a wide string to a UTF-8 encoded string.
UTF8_API size_t utf16toutf8(const utf16_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert a UTF-16 encoded string to a UTF-8 encoded string.
UTF8_API size_t utf8casefold(const char *input, size_t inputSize, char *target, size_t targetSize, size_t locale, int32_t *errors)
Remove case distinction from UTF-8 encoded text.
UTF8_API uint8_t utf8isnormalized(const char *input, size_t inputSize, size_t flags, size_t *offset)
Check if a string is stable in the specified Unicode Normalization Form.
UTF8_API size_t utf8normalize(const char *input, size_t inputSize, char *target, size_t targetSize, size_t flags, int32_t *errors)
Normalize a string to the specified Unicode Normalization Form.
UTF8_API size_t utf8toutf32(const char *input, size_t inputSize, unicode_t *target, size_t targetSize, int32_t *errors)
Convert a UTF-8 encoded string to a UTF-32 encoded string.
uint32_t unicode_t
UTF-32 encoded code point.
Definition: utf8rewind.h:698