|
size_t | utf8len (const char *text) |
| Get the length in codepoints of a UTF-8 encoded string. More...
|
|
size_t | utf16toutf8 (const utf16_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors) |
| Convert a UTF-16 encoded string to a UTF-8 encoded string. More...
|
|
size_t | utf32toutf8 (const unicode_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors) |
| Convert a UTF-32 encoded string to a UTF-8 encoded string. More...
|
|
size_t | widetoutf8 (const wchar_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors) |
| Convert a wide string to a UTF-8 encoded string. More...
|
|
size_t | utf8toutf16 (const char *input, size_t inputSize, utf16_t *target, size_t targetSize, int32_t *errors) |
| Convert a UTF-8 encoded string to a UTF-16 encoded string. More...
|
|
size_t | utf8toutf32 (const char *input, size_t inputSize, unicode_t *target, size_t targetSize, int32_t *errors) |
| Convert a UTF-8 encoded string to a UTF-32 encoded string. More...
|
|
size_t | utf8towide (const char *input, size_t inputSize, wchar_t *target, size_t targetSize, int32_t *errors) |
| Convert a UTF-8 encoded string to a wide string. More...
|
|
const char * | utf8seek (const char *text, const char *textStart, off_t offset, int direction) |
| Seek into a UTF-8 encoded string. More...
|
|
Functions for working with UTF-8 encoded text.
const char* utf8seek |
( |
const char * |
text, |
|
|
const char * |
textStart, |
|
|
off_t |
offset, |
|
|
int |
direction |
|
) |
| |
Seek into a UTF-8 encoded string.
Working with UTF-8 encoded strings can be tricky due to the nature of the variable-length encoding. Because one character no longer equals one byte, it can be difficult to skip around in a UTF-8 encoded string without decoding the codepoints.
This function provides an interface similar to fseek in order to enable skipping to another part of the string.
- Note
- textStart must come before text in memory when seeking from the current or end position.
Example:
const char* text = "Press \xE0\x80\x13 to continue.";
const char fixed[1024] = { 0 };
const char* commandStart;
const char* commandEnd;
commandStart = strstr(text, "\xE0\x80\x13");
if (commandStart == 0)
{
return 0;
}
strncpy(fixed, text, commandStart - text);
strcat(fixed, "ENTER");
commandEnd =
utf8seek(commandStart, text, 1, SEEK_CUR);
if (commandEnd != commandStart)
{
strcat(fixed, commandEnd);
}
- Parameters
-
[in] | text | Input string. |
[in] | textStart | Start of input string. |
[in] | offset | Requested offset in codepoints. |
[in] | direction | Direction to seek in.
SEEK_SET Offset is from the start of the string.
SEEK_CUR Offset is from the current position of the string.
SEEK_END Offset is from the end of the string.
|
- Returns
- Pointer to the new position in the string, or the unchanged text pointer on error.
size_t utf8towide |
( |
const char * |
input, |
|
|
size_t |
inputSize, |
|
|
wchar_t * |
target, |
|
|
size_t |
targetSize, |
|
|
int32_t * |
errors |
|
) |
| |
Convert a UTF-8 encoded string to a wide string.
Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded. This function takes a UTF-8 encoded string as input and automatically calls the correct conversion function.
This allows for a cross-platform treatment of wide text and is preferable to using the UTF-16 or UTF-32 versions directly.
Erroneous byte sequences such as missing bytes, illegal bytes or overlong encodings of codepoints are converted to the replacement character U+FFFD.
- Note
- Codepoints outside the Basic Multilingual Plane (BMP) are converted to surrogate pairs when using UTF-16. This means that strings containing codepoints outside the BMP that were converted on a platform with UTF-32 wide strings are not compatible with platforms that use UTF-16 wide strings.
- Hence, it is preferable to keep all data as UTF-8 and only convert to wide strings when required by a third-party interface.
Example:
const char* input = "Bj\xC3\xB6rn Zonderland";
size_t input_size = strlen(input);
wchar_t* output = 0;
size_t output_size = 0;
size_t result = 0;
int32_t errors = 0;
output_size =
utf8towide(input, input_size, 0, 0, &errors);
if (errors == 0)
{
output = (wchar_t*)malloc(output_size);
memset(output, 0, output_size);
utf8towide(input, input_size, output, output_size, &errors);
if (errors == 0)
{
Player_SetName(output);
}
free(output);
}
- Parameters
-
[in] | input | UTF-8 encoded string. |
[in] | inputSize | Size of the input in bytes. |
[out] | target | Output buffer for the result. |
[in] | targetSize | Size of the output buffer in bytes. |
[out] | errors | Output for errors. |
- Returns
- Number of bytes written, or the number of bytes needed for the output if the target buffer is specified as NULL.
- Return values
-
- See also
- widetoutf8
- utf8toutf16
- utf8toutf32