utf8rewind  1.4.1
System library for processing UTF-8 encoded text
utf8rewind.h
Go to the documentation of this file.
1 /*
2  Copyright (C) 2014-2016 Quinten Lansu
3 
4  Permission is hereby granted, free of charge, to any person
5  obtaining a copy of this software and associated documentation
6  files (the "Software"), to deal in the Software without
7  restriction, including without limitation the rights to use,
8  copy, modify, merge, publish, distribute, sublicense, and/or
9  sell copies of the Software, and to permit persons to whom the
10  Software is furnished to do so, subject to the following
11  conditions:
12 
13  The above copyright notice and this permission notice shall be
14  included in all copies or substantial portions of the Software.
15 
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  OTHER DEALINGS IN THE SOFTWARE.
24 */
25 
26 #ifndef _UTF8REWIND_H_
27 #define _UTF8REWIND_H_
28 
#include <locale.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>

/* off_t (used by utf8seek) is a POSIX type; ISO C headers need not declare it. */
#include <sys/types.h>
40 
/**
 * Pack a (major, minor, bugfix) triple into one comparable integer.
 *
 * The value is major * 10000 + minor * 100 + bugfix, so 1.4.1 becomes 10401.
 * The entire expansion is wrapped in parentheses so that it keeps its value
 * when combined with a neighbouring operator, e.g.
 * `2 * UTF8_VERSION_MAKE(1, 4, 1)` or `!UTF8_VERSION`.
 */
#define UTF8_VERSION_MAKE(_major, _minor, _bugfix) \
    (((_major) * 10000) + ((_minor) * 100) + (_bugfix))

/** Major component of the library version. */
#define UTF8_VERSION_MAJOR 1

/** Minor component of the library version. */
#define UTF8_VERSION_MINOR 4

/** Bugfix component of the library version. */
#define UTF8_VERSION_BUGFIX 1

/** Library version packed with UTF8_VERSION_MAKE. */
#define UTF8_VERSION \
    UTF8_VERSION_MAKE(UTF8_VERSION_MAJOR, UTF8_VERSION_MINOR, UTF8_VERSION_BUGFIX)

/** Library version as a string literal. */
#define UTF8_VERSION_STRING "1.4.1"

/**
 * Non-zero when the library version is at least the given version.
 * Usable both in `#if` directives and in ordinary expressions.
 */
#define UTF8_VERSION_GUARD(_major, _minor, _bugfix) \
    (UTF8_VERSION >= UTF8_VERSION_MAKE(_major, _minor, _bugfix))
92 
/*
 * Status codes. Zero means success; every failure code is negative.
 * NOTE(review): presumably these are the values written through the
 * `errors` out-parameter of the conversion functions - confirm against
 * the implementation.
 */

/** No error. */
#define UTF8_ERR_NONE                    (0)

/** Named for invalid input data. */
#define UTF8_ERR_INVALID_DATA            (-1)

/** Named for an invalid flag argument. */
#define UTF8_ERR_INVALID_FLAG            (-2)

/** Named for a target buffer that is too small. */
#define UTF8_ERR_NOT_ENOUGH_SPACE        (-3)

/** Named for parameters that overlap. */
#define UTF8_ERR_OVERLAPPING_PARAMETERS  (-4)
132 
/*
 * Size of wchar_t in bytes (2 or 4). May be predefined by the build to
 * override detection.
 *
 * The 16-bit limit is spelled as the literal 0xFFFF instead of UINT16_MAX:
 * some C++ toolchains only expose UINT16_MAX when __STDC_LIMIT_MACROS is
 * defined, and an undefined identifier evaluates to 0 inside #if, which
 * would make `WCHAR_MAX > UINT16_MAX` true on platforms with a 16-bit
 * wchar_t. Each probe is also guarded with defined() so the test never
 * depends on an undefined macro.
 */
#ifndef UTF8_WCHAR_SIZE
    #if (defined(__SIZEOF_WCHAR_T__) && (__SIZEOF_WCHAR_T__ == 4)) || \
        (defined(WCHAR_MAX) && (WCHAR_MAX > 0xFFFF)) || \
        (defined(__WCHAR_MAX__) && (__WCHAR_MAX__ > 0xFFFF))
        #define UTF8_WCHAR_SIZE (4)
    #else
        #define UTF8_WCHAR_SIZE (2)
    #endif
#endif

#if (UTF8_WCHAR_SIZE == 4)
    /** Defined when wchar_t is 4 bytes wide (wide text treated as UTF-32). */
    #define UTF8_WCHAR_UTF32 (1)
#elif (UTF8_WCHAR_SIZE == 2)
    /** Defined when wchar_t is 2 bytes wide (wide text treated as UTF-16). */
    #define UTF8_WCHAR_UTF16 (1)
#else
    #error Invalid size for wchar_t type.
#endif
175 
/*
 * Prefix applied to every public function declaration.
 * Left untouched if the build already defines it; otherwise it requests
 * C linkage when compiled as C++ and expands to nothing in C.
 */
#if !defined(UTF8_API)
    #if defined(__cplusplus)
        #define UTF8_API extern "C"
    #else
        #define UTF8_API
    #endif
#endif
188 
/** One 16-bit unit of UTF-16 encoded text. */
typedef uint16_t utf16_t;

/** One UTF-32 encoded code point. */
typedef uint32_t unicode_t;
204 
222 UTF8_API size_t utf8len(const char* text);
223 
269 UTF8_API size_t utf16toutf8(const utf16_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
270 
328 UTF8_API size_t utf32toutf8(const unicode_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
329 
392 UTF8_API size_t widetoutf8(const wchar_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
393 
440 UTF8_API size_t utf8toutf16(const char* input, size_t inputSize, utf16_t* target, size_t targetSize, int32_t* errors);
441 
486 UTF8_API size_t utf8toutf32(const char* input, size_t inputSize, unicode_t* target, size_t targetSize, int32_t* errors);
487 
560 UTF8_API size_t utf8towide(const char* input, size_t inputSize, wchar_t* target, size_t targetSize, int32_t* errors);
561 
615 UTF8_API const char* utf8seek(const char* text, size_t textSize, const char* textStart, off_t offset, int direction);
616 
704 UTF8_API size_t utf8toupper(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
705 
798 UTF8_API size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
799 
886 UTF8_API size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
887 
996 UTF8_API size_t utf8casefold(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
997 
/*
 * Normalization flag bits. Each occupies its own bit so they can be OR-ed.
 * NOTE(review): presumably passed as `flags` to utf8normalize and
 * utf8isnormalized - confirm against the implementation.
 */

/** Bit 0: composition. */
#define UTF8_NORMALIZE_COMPOSE          0x00000001

/** Bit 1: decomposition. */
#define UTF8_NORMALIZE_DECOMPOSE        0x00000002

/** Bit 2: compatibility mode. */
#define UTF8_NORMALIZE_COMPATIBILITY    0x00000004

/*
 * Normalization check results.
 * NOTE(review): presumably the values returned by utf8isnormalized - confirm.
 */

/** Result value "yes". */
#define UTF8_NORMALIZATION_RESULT_YES   (0)

/** Result value "maybe". */
#define UTF8_NORMALIZATION_RESULT_MAYBE (1)

/** Result value "no". */
#define UTF8_NORMALIZATION_RESULT_NO    (2)
1039 
1127 UTF8_API uint8_t utf8isnormalized(const char* input, size_t inputSize, size_t flags, size_t* offset);
1128 
1260 UTF8_API size_t utf8normalize(const char* input, size_t inputSize, char* target, size_t targetSize, size_t flags, int32_t* errors);
1261 
/* Letter categories: bits 0-4 of the category mask. */

/** Bit 0: uppercase letters. */
#define UTF8_CATEGORY_LETTER_UPPERCASE  0x00000001

/** Bit 1: lowercase letters. */
#define UTF8_CATEGORY_LETTER_LOWERCASE  0x00000002

/** Bit 2: titlecase letters. */
#define UTF8_CATEGORY_LETTER_TITLECASE  0x00000004

/** Bit 3: modifier letters. */
#define UTF8_CATEGORY_LETTER_MODIFIER   0x00000008

/** Bit 4: other letters. */
#define UTF8_CATEGORY_LETTER_OTHER      0x00000010

/** Union of all five letter bits. */
#define UTF8_CATEGORY_LETTER \
    (UTF8_CATEGORY_LETTER_UPPERCASE | \
     UTF8_CATEGORY_LETTER_LOWERCASE | \
     UTF8_CATEGORY_LETTER_TITLECASE | \
     UTF8_CATEGORY_LETTER_MODIFIER | \
     UTF8_CATEGORY_LETTER_OTHER)

/** Union of the uppercase, lowercase and titlecase letter bits. */
#define UTF8_CATEGORY_CASE_MAPPED \
    (UTF8_CATEGORY_LETTER_UPPERCASE | \
     UTF8_CATEGORY_LETTER_LOWERCASE | \
     UTF8_CATEGORY_LETTER_TITLECASE)
1315 
/* Mark categories: bits 5-7 of the category mask. */

/** Bit 5: non-spacing marks. */
#define UTF8_CATEGORY_MARK_NON_SPACING  0x00000020

/** Bit 6: spacing marks. */
#define UTF8_CATEGORY_MARK_SPACING      0x00000040

/** Bit 7: enclosing marks. */
#define UTF8_CATEGORY_MARK_ENCLOSING    0x00000080

/** Union of all three mark bits. */
#define UTF8_CATEGORY_MARK \
    (UTF8_CATEGORY_MARK_NON_SPACING | \
     UTF8_CATEGORY_MARK_SPACING | \
     UTF8_CATEGORY_MARK_ENCLOSING)
1341 
/* Number categories: bits 8-10 of the category mask. */

/** Bit 8: decimal numbers. */
#define UTF8_CATEGORY_NUMBER_DECIMAL  0x00000100

/** Bit 9: letter numbers. */
#define UTF8_CATEGORY_NUMBER_LETTER   0x00000200

/** Bit 10: other numbers. */
#define UTF8_CATEGORY_NUMBER_OTHER    0x00000400

/** Union of all three number bits. */
#define UTF8_CATEGORY_NUMBER \
    (UTF8_CATEGORY_NUMBER_DECIMAL | \
     UTF8_CATEGORY_NUMBER_LETTER | \
     UTF8_CATEGORY_NUMBER_OTHER)
1367 
/* Punctuation categories: bits 11-17 of the category mask. */

/** Bit 11: connector punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_CONNECTOR  0x00000800

/** Bit 12: dash punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_DASH       0x00001000

/** Bit 13: opening punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_OPEN       0x00002000

/** Bit 14: closing punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_CLOSE      0x00004000

/** Bit 15: initial punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_INITIAL    0x00008000

/** Bit 16: final punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_FINAL      0x00010000

/** Bit 17: other punctuation. */
#define UTF8_CATEGORY_PUNCTUATION_OTHER      0x00020000

/** Union of all seven punctuation bits. */
#define UTF8_CATEGORY_PUNCTUATION \
    (UTF8_CATEGORY_PUNCTUATION_CONNECTOR | \
     UTF8_CATEGORY_PUNCTUATION_DASH | \
     UTF8_CATEGORY_PUNCTUATION_OPEN | \
     UTF8_CATEGORY_PUNCTUATION_CLOSE | \
     UTF8_CATEGORY_PUNCTUATION_INITIAL | \
     UTF8_CATEGORY_PUNCTUATION_FINAL | \
     UTF8_CATEGORY_PUNCTUATION_OTHER)
1419 
/* Symbol categories: bits 18-21 of the category mask. */

/** Bit 18: math symbols. */
#define UTF8_CATEGORY_SYMBOL_MATH      0x00040000

/** Bit 19: currency symbols. */
#define UTF8_CATEGORY_SYMBOL_CURRENCY  0x00080000

/** Bit 20: modifier symbols. */
#define UTF8_CATEGORY_SYMBOL_MODIFIER  0x00100000

/** Bit 21: other symbols. */
#define UTF8_CATEGORY_SYMBOL_OTHER     0x00200000

/** Union of all four symbol bits. */
#define UTF8_CATEGORY_SYMBOL \
    (UTF8_CATEGORY_SYMBOL_MATH | \
     UTF8_CATEGORY_SYMBOL_CURRENCY | \
     UTF8_CATEGORY_SYMBOL_MODIFIER | \
     UTF8_CATEGORY_SYMBOL_OTHER)
1451 
/* Separator categories: bits 22-24 of the category mask. */

/** Bit 22: space separators. */
#define UTF8_CATEGORY_SEPARATOR_SPACE      0x00400000

/** Bit 23: line separators. */
#define UTF8_CATEGORY_SEPARATOR_LINE       0x00800000

/** Bit 24: paragraph separators. */
#define UTF8_CATEGORY_SEPARATOR_PARAGRAPH  0x01000000

/** Union of all three separator bits. */
#define UTF8_CATEGORY_SEPARATOR \
    (UTF8_CATEGORY_SEPARATOR_SPACE | \
     UTF8_CATEGORY_SEPARATOR_LINE | \
     UTF8_CATEGORY_SEPARATOR_PARAGRAPH)
1477 
/* Remaining single-bit categories: bits 25-31 of the category mask. */

/** Bit 25: control characters. */
#define UTF8_CATEGORY_CONTROL                  0x02000000

/** Bit 26: format characters. */
#define UTF8_CATEGORY_FORMAT                   0x04000000

/** Bit 27: surrogates. */
#define UTF8_CATEGORY_SURROGATE                0x08000000

/** Bit 28: private use. */
#define UTF8_CATEGORY_PRIVATE_USE              0x10000000

/** Bit 29: unassigned. */
#define UTF8_CATEGORY_UNASSIGNED               0x20000000

/**
 * Bit 30: compatibility marker; OR-ed into every UTF8_CATEGORY_IS* mask.
 */
#define UTF8_CATEGORY_COMPATIBILITY            0x40000000

/**
 * Bit 31: "ignore grapheme cluster" marker.
 * This constant does not fit in a 32-bit int, so it has an unsigned type.
 */
#define UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER  0x80000000
1521 
/*
 * Composite masks named after the <ctype.h> classification functions.
 * Each one is UTF8_CATEGORY_COMPATIBILITY OR-ed with the category bits
 * listed in its definition.
 * NOTE(review): presumably meant to be passed as `flags` to utf8iscategory
 * to mimic the matching isXXX() function - confirm against the
 * implementation.
 */

/** Compatibility bit plus the control category. */
#define UTF8_CATEGORY_ISCNTRL \
    (UTF8_CATEGORY_COMPATIBILITY | UTF8_CATEGORY_CONTROL)

/** Compatibility bit plus letters, numbers, punctuation, symbols and separators. */
#define UTF8_CATEGORY_ISPRINT \
    (UTF8_CATEGORY_COMPATIBILITY | \
     UTF8_CATEGORY_LETTER | \
     UTF8_CATEGORY_NUMBER | \
     UTF8_CATEGORY_PUNCTUATION | \
     UTF8_CATEGORY_SYMBOL | \
     UTF8_CATEGORY_SEPARATOR)

/** Compatibility bit plus the space separator category. */
#define UTF8_CATEGORY_ISSPACE \
    (UTF8_CATEGORY_COMPATIBILITY | UTF8_CATEGORY_SEPARATOR_SPACE)

/**
 * Compatibility bit plus space separators and the private-use bit.
 * NOTE(review): the private-use bit is presumably a marker interpreted by
 * the implementation in compatibility mode - confirm.
 */
#define UTF8_CATEGORY_ISBLANK \
    (UTF8_CATEGORY_COMPATIBILITY | \
     UTF8_CATEGORY_SEPARATOR_SPACE | \
     UTF8_CATEGORY_PRIVATE_USE)

/** Compatibility bit plus letters, numbers, punctuation and symbols. */
#define UTF8_CATEGORY_ISGRAPH \
    (UTF8_CATEGORY_COMPATIBILITY | \
     UTF8_CATEGORY_LETTER | \
     UTF8_CATEGORY_NUMBER | \
     UTF8_CATEGORY_PUNCTUATION | \
     UTF8_CATEGORY_SYMBOL)

/** Compatibility bit plus punctuation and symbols. */
#define UTF8_CATEGORY_ISPUNCT \
    (UTF8_CATEGORY_COMPATIBILITY | \
     UTF8_CATEGORY_PUNCTUATION | \
     UTF8_CATEGORY_SYMBOL)

/** Compatibility bit plus letters and numbers. */
#define UTF8_CATEGORY_ISALNUM \
    (UTF8_CATEGORY_COMPATIBILITY | \
     UTF8_CATEGORY_LETTER | \
     UTF8_CATEGORY_NUMBER)

/** Compatibility bit plus letters. */
#define UTF8_CATEGORY_ISALPHA \
    (UTF8_CATEGORY_COMPATIBILITY | UTF8_CATEGORY_LETTER)

/** Compatibility bit plus uppercase letters. */
#define UTF8_CATEGORY_ISUPPER \
    (UTF8_CATEGORY_COMPATIBILITY | UTF8_CATEGORY_LETTER_UPPERCASE)

/** Compatibility bit plus lowercase letters. */
#define UTF8_CATEGORY_ISLOWER \
    (UTF8_CATEGORY_COMPATIBILITY | UTF8_CATEGORY_LETTER_LOWERCASE)

/** Compatibility bit plus numbers. */
#define UTF8_CATEGORY_ISDIGIT \
    (UTF8_CATEGORY_COMPATIBILITY | UTF8_CATEGORY_NUMBER)

/**
 * Compatibility bit plus numbers and the private-use bit.
 * NOTE(review): the private-use bit is presumably a marker interpreted by
 * the implementation in compatibility mode - confirm.
 */
#define UTF8_CATEGORY_ISXDIGIT \
    (UTF8_CATEGORY_COMPATIBILITY | \
     UTF8_CATEGORY_NUMBER | \
     UTF8_CATEGORY_PRIVATE_USE)
1632 
1728 UTF8_API size_t utf8iscategory(const char* input, size_t inputSize, size_t flags);
1729 
1730 #endif /* _UTF8REWIND_H_ */
uint32_t unicode_t
UTF-32 encoded code point.
Definition: utf8rewind.h:203
UTF8_API size_t utf8totitle(const char *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert UTF-8 encoded text to titlecase.
#define UTF8_API
Calling convention for public functions.
Definition: utf8rewind.h:185
UTF8_API size_t utf8normalize(const char *input, size_t inputSize, char *target, size_t targetSize, size_t flags, int32_t *errors)
Normalize a string to the specified Unicode Normalization Form.
UTF8_API size_t utf8iscategory(const char *input, size_t inputSize, size_t flags)
Check if the input string conforms to the category specified by the flags.
UTF8_API size_t utf8toutf32(const char *input, size_t inputSize, unicode_t *target, size_t targetSize, int32_t *errors)
Convert a UTF-8 encoded string to a UTF-32 encoded string.
UTF8_API size_t utf8toupper(const char *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert UTF-8 encoded text to uppercase.
uint16_t utf16_t
UTF-16 encoded code unit.
Definition: utf8rewind.h:197
UTF8_API size_t utf8casefold(const char *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Remove case distinction from UTF-8 encoded text.
UTF8_API size_t utf16toutf8(const utf16_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert a UTF-16 encoded string to a UTF-8 encoded string.
UTF8_API size_t utf8toutf16(const char *input, size_t inputSize, utf16_t *target, size_t targetSize, int32_t *errors)
Convert a UTF-8 encoded string to a UTF-16 encoded string.
UTF8_API size_t utf8towide(const char *input, size_t inputSize, wchar_t *target, size_t targetSize, int32_t *errors)
Convert a UTF-8 encoded string to a wide string.
UTF8_API uint8_t utf8isnormalized(const char *input, size_t inputSize, size_t flags, size_t *offset)
Check if a string is stable in the specified Unicode Normalization Form.
UTF8_API size_t utf8tolower(const char *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert UTF-8 encoded text to lowercase.
UTF8_API size_t utf8len(const char *text)
Get the length in code points of a UTF-8 encoded string.
UTF8_API size_t utf32toutf8(const unicode_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert a UTF-32 encoded string to a UTF-8 encoded string.
UTF8_API size_t widetoutf8(const wchar_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
Convert a wide string to a UTF-8 encoded string.
UTF8_API const char * utf8seek(const char *text, size_t textSize, const char *textStart, off_t offset, int direction)
Seek into a UTF-8 encoded string.