utf8rewind
 All Files Functions Typedefs Macros Groups Pages
utf8rewind.h File Reference

Functions for working with UTF-8 encoded text. More...

Go to the source code of this file.

Macros

#define UTF8_ERR_INVALID_DATA   (-1)
 
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-2)
 
#define UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR   (-3)
 
#define UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR   (-4)
 
#define UTF8_WCHAR_SIZE   (2)
 
#define UTF8_WCHAR_UTF16   (1)
 

Typedefs

typedef uint16_t utf16_t
 
typedef uint32_t unicode_t
 

Functions

size_t utf8len (const char *text)
 Get the length in codepoints of a UTF-8 encoded string. More...
 
size_t utf16toutf8 (const utf16_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a UTF-16 encoded string to a UTF-8 encoded string. More...
 
size_t utf32toutf8 (const unicode_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a UTF-32 encoded string to a UTF-8 encoded string. More...
 
size_t widetoutf8 (const wchar_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a wide string to a UTF-8 encoded string. More...
 
size_t utf8toutf16 (const char *input, size_t inputSize, utf16_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a UTF-16 encoded string. More...
 
size_t utf8toutf32 (const char *input, size_t inputSize, unicode_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a UTF-32 encoded string. More...
 
size_t utf8towide (const char *input, size_t inputSize, wchar_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a wide string. More...
 
const char * utf8seek (const char *text, const char *textStart, off_t offset, int direction)
 Seek into a UTF-8 encoded string. More...
 

Detailed Description

Functions for working with UTF-8 encoded text.

Macro Definition Documentation

#define UTF8_ERR_INVALID_DATA   (-1)
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-2)
#define UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR   (-3)
#define UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR   (-4)

Typedef Documentation

typedef uint32_t unicode_t

Unicode codepoint.

typedef uint16_t utf16_t

UTF-16 encoded codepoint.

Function Documentation

size_t utf16toutf8 ( const utf16_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-16 encoded string to a UTF-8 encoded string.

Note
This function should only be called directly if you are positive that you're working with UTF-16 encoded text. If you're working with wide strings, take a look at widetoutf8() instead.

Example:

int8_t Player_SetName(const utf16_t* name, size_t nameSize)
{
int32_t errors = 0;
char converted_name[256] = { 0 };
utf16toutf8(name, nameSize, converted_name, 256, &errors);
if (errors != 0)
{
return 0;
}
return Player_SetName(converted_name);
}
Parameters
[in] input: UTF-16 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for encoding.
UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR: High surrogate pair was not matched.
UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR: Low surrogate pair was not matched.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf32toutf8
widetoutf8
size_t utf32toutf8 ( const unicode_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-32 encoded string to a UTF-8 encoded string.

Note
This function should only be called directly if you are positive that you're working with UTF-32 encoded text. If you're working with wide strings, take a look at widetoutf8() instead.

Example:

int8_t Database_ExecuteQuery(const unicode_t* query, size_t querySize)
{
int32_t errors = 0;
char* converted = 0;
int8_t result = 0;
size_t converted_size = utf32toutf8(query, querySize, 0, 0, &errors);
if (errors != 0)
{
goto cleanup;
}
converted = (char*)malloc(converted_size + 1);
memset(converted, 0, converted_size + 1);
utf32toutf8(query, querySize, converted, converted_size, &errors);
if (errors != 0)
{
goto cleanup;
}
result = Database_ExecuteQuery(converted);
cleanup:
if (converted != 0)
{
free(converted);
converted = 0;
}
return result;
}
Parameters
[in] input: UTF-32 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for encoding.
UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR: High surrogate pair was not matched.
UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR: Low surrogate pair was not matched.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf16toutf8
widetoutf8
size_t utf8len ( const char *  text)

Get the length in codepoints of a UTF-8 encoded string.

Example:

int8_t CheckPassword(const char* password)
{
size_t length = utf8len(password);
return (length == utf8len("hunter2"));
}
Parameters
[in] text: UTF-8 encoded string.
Returns
Length in codepoints.
const char* utf8seek ( const char *  text,
const char *  textStart,
off_t  offset,
int  direction 
)

Seek into a UTF-8 encoded string.

Working with UTF-8 encoded strings can be tricky due to the nature of the variable-length encoding. Because one character no longer equals one byte, it can be difficult to skip around in a UTF-8 encoded string without decoding the codepoints.

This function provides an interface similar to fseek in order to enable skipping to another part of the string.

Note
textStart must come before text in memory when seeking from the current or end position.

Example:

const char* text = "Press \xE0\x80\x13 to continue.";
char fixed[1024] = { 0 };
const char* commandStart;
const char* commandEnd;
commandStart = strstr(text, "\xE0\x80\x13");
if (commandStart == 0)
{
return 0;
}
strncpy(fixed, text, commandStart - text);
strcat(fixed, "ENTER");
commandEnd = utf8seek(commandStart, text, 1, SEEK_CUR);
if (commandEnd != commandStart)
{
strcat(fixed, commandEnd);
}
Parameters
[in] text: Input string.
[in] textStart: Start of input string.
[in] offset: Requested offset in codepoints.
[in] direction: Direction to seek in.
  • SEEK_SET Offset is from the start of the string.
  • SEEK_CUR Offset is from the current position of the string.
  • SEEK_END Offset is from the end of the string.
Returns
Pointer to the changed position in the string, or the unchanged input pointer on error.
size_t utf8toutf16 ( const char *  input,
size_t  inputSize,
utf16_t *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-8 encoded string to a UTF-16 encoded string.

Note
This function should only be called directly if you are positive that you must convert to UTF-16, independent of platform. If you're working with wide strings, take a look at utf8towide() instead.

Erroneous byte sequences such as missing bytes, illegal bytes or overlong encodings of codepoints are converted to the replacement character U+FFFD.

Example:

void Font_DrawText(int x, int y, const char* text)
{
int32_t errors = 0;
utf16_t converted[256] = { 0 };
size_t converted_size = utf8toutf16(text, strlen(text), converted, 256 * sizeof(utf16_t), &errors);
if (errors == 0)
{
Legacy_DrawText(g_FontCurrent, x, y, (unsigned short*)converted, converted_size);
}
}
Parameters
[in] input: UTF-8 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for decoding.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf8towide
utf8toutf32
size_t utf8toutf32 ( const char *  input,
size_t  inputSize,
unicode_t *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-8 encoded string to a UTF-32 encoded string.

Note
This function should only be called directly if you are positive that you must convert to UTF-32, independent of platform. If you're working with wide strings, take a look at utf8towide() instead.

Erroneous byte sequences such as missing bytes, illegal bytes or overlong encodings of codepoints are converted to the replacement character U+FFFD.

Example:

void TextField_AddCharacter(const char* encoded)
{
int32_t errors = 0;
unicode_t codepoint = 0;
utf8toutf32(encoded, strlen(encoded), &codepoint, sizeof(unicode_t), &errors);
if (errors == 0)
{
TextField_AddCodepoint(codepoint);
}
}
Parameters
[in] input: UTF-8 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for decoding.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf8towide
utf8toutf16
size_t utf8towide ( const char *  input,
size_t  inputSize,
wchar_t *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-8 encoded string to a wide string.

Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded. This function takes a UTF-8 encoded string as input and automatically calls the correct conversion function.

This allows for a cross-platform treatment of wide text and is preferable to using the UTF-16 or UTF-32 versions directly.

Erroneous byte sequences such as missing bytes, illegal bytes or overlong encodings of codepoints are converted to the replacement character U+FFFD.

Note
Codepoints outside the Basic Multilingual Plane (BMP) are converted to surrogate pairs when using UTF-16. This means that strings containing characters outside the BMP converted on a platform with UTF-32 wide strings are not compatible with platforms with UTF-16 wide strings.
Hence, it is preferable to keep all data as UTF-8 and only
convert to wide strings when required by a third-party interface.

Example:

const char* input = "Bj\xC3\xB6rn Zonderland";
size_t input_size = strlen(input);
wchar_t* output = 0;
size_t output_size = 0;
size_t result = 0;
int32_t errors = 0;
output_size = utf8towide(input, input_size, 0, 0, &errors);
if (errors == 0)
{
output = (wchar_t*)malloc(output_size);
memset(output, 0, output_size);
utf8towide(input, input_size, output, output_size, &errors);
if (errors == 0)
{
Player_SetName(output);
}
free(output);
}
Parameters
[in] input: UTF-8 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for decoding.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
widetoutf8
utf8toutf16
utf8toutf32
size_t widetoutf8 ( const wchar_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a wide string to a UTF-8 encoded string.

Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded. This function takes a wide string as input and automatically calls the correct conversion function.

This allows for a cross-platform treatment of wide text and is preferable to using the UTF-16 or UTF-32 versions directly.

Example:

const wchar_t* input = L"textures/\xD803\xDC11.png";
size_t input_size = wcslen(input) * sizeof(wchar_t);
size_t output_size = 0;
char* output = 0;
size_t result = 0;
int32_t errors = 0;
result = widetoutf8(input, input_size, 0, 0, &errors);
if (errors == 0)
{
output_size = result + 1;
output = (char*)malloc(output_size);
memset(output, 0, output_size);
widetoutf8(input, wcslen(input) * sizeof(wchar_t), output, output_size, &errors);
if (errors == 0)
{
Texture_Load(output);
}
free(output);
}
Parameters
[in] input: Wide-encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for encoding.
UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR: High surrogate pair was not matched.
UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR: Low surrogate pair was not matched.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf8towide
utf16toutf8
utf32toutf8