utf8rewind
 All Files Functions Typedefs Macros Groups Pages
utf8rewind.h File Reference

Functions for working with UTF-8 encoded text. More...

Go to the source code of this file.

Macros

#define UTF8_ERR_INVALID_CHARACTER   (-1)
 
#define UTF8_ERR_INVALID_DATA   (-2)
 
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-3)
 
#define UTF8_ERR_OUT_OF_RANGE   (-4)
 
#define UTF8_ERR_UNHANDLED_SURROGATE_PAIR   (-5)
 
#define UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR   (-6)
 
#define UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR   (-7)
 
#define UTF8_WCHAR_SIZE   (2)
 
#define UTF8_WCHAR_UTF16   (1)
 

Typedefs

typedef uint32_t unicode_t
 
typedef uint16_t ucs2_t
 
typedef uint16_t utf16_t
 

Functions

int8_t utf8charvalid (char encodedCharacter)
 Check if a character is valid according to UTF-8 encoding. More...
 
size_t utf8charlen (char encodedCharacter)
 Returns the length in bytes of the encoded character. More...
 
size_t utf8len (const char *text)
 Get the length in codepoints of a UTF-8 encoded string. More...
 
size_t utf16toutf8 (const utf16_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a UTF-16 encoded string to a UTF-8 encoded string. More...
 
size_t utf32toutf8 (const unicode_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a UTF-32 encoded string to a UTF-8 encoded string. More...
 
size_t widetoutf8 (const wchar_t *input, size_t inputSize, char *target, size_t targetSize, int32_t *errors)
 Convert a wide string to a UTF-8 encoded string. More...
 
size_t utf8toutf16 (const char *input, size_t inputSize, utf16_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a UTF-16 encoded string. More...
 
size_t utf8toutf32 (const char *input, size_t inputSize, unicode_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a UTF-32 encoded string. More...
 
size_t utf8towide (const char *input, size_t inputSize, wchar_t *target, size_t targetSize, int32_t *errors)
 Convert a UTF-8 encoded string to a wide string. More...
 
const char * utf8seek (const char *text, const char *textStart, off_t offset, int direction)
 Seek into a UTF-8 encoded string. More...
 

Detailed Description

Functions for working with UTF-8 encoded text.

Macro Definition Documentation

#define UTF8_ERR_INVALID_CHARACTER   (-1)
#define UTF8_ERR_INVALID_DATA   (-2)
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-3)
#define UTF8_ERR_OUT_OF_RANGE   (-4)
#define UTF8_ERR_UNHANDLED_SURROGATE_PAIR   (-5)
#define UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR   (-6)
#define UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR   (-7)

Typedef Documentation

typedef uint16_t ucs2_t

UCS-2 encoded codepoint.

typedef uint32_t unicode_t

Unicode codepoint.

typedef uint16_t utf16_t

UTF-16 encoded codepoint.

Function Documentation

size_t utf16toutf8 ( const utf16_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-16 encoded string to a UTF-8 encoded string.

Note
This function should only be called directly if you are positive that you're working with UTF-16 encoded text. If you're working with wide strings, take a look at widetoutf8() instead.
Warning
Conversion does not add a null-terminator. You must set it yourself, either by clearing the string beforehand or by setting the last byte(s) to zero.

Example:

/*
 * Example: store a player name that arrives as UTF-16 encoded text.
 *
 * The name is converted to UTF-8 in a fixed-size stack buffer and then
 * passed on in its converted form.
 *
 * name      UTF-16 encoded player name.
 * nameSize  Size of name in bytes.
 *
 * Returns 0 when the conversion reports an error, otherwise the result of
 * forwarding the converted name.
 */
int8_t Player_SetName(const utf16_t* name, size_t nameSize)
{
    int32_t errors = 0;
    char converted_name[256] = { 0 };

    /*
     * utf16toutf8() does not add a null-terminator. Keeping the last byte
     * of the zero-initialized buffer out of its reach guarantees that the
     * converted text is terminated even when it fills the buffer.
     */
    utf16toutf8(name, nameSize, converted_name, sizeof(converted_name) - 1, &errors);
    if (errors != 0)
    {
        return 0;
    }

    /* NOTE(review): single-argument call with UTF-8 text; presumably a
       separate overload taking const char* declared elsewhere - confirm. */
    return Player_SetName(converted_name);
}
Parameters
[in] input: UTF-16 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for encoding.
UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR: High surrogate pair was not matched.
UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR: Low surrogate pair was not matched.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
UTF8_ERR_INVALID_CHARACTER: Codepoint could not be encoded.
See also
utf32toutf8
widetoutf8
size_t utf32toutf8 ( const unicode_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-32 encoded string to a UTF-8 encoded string.

Note
This function should only be called directly if you are positive that you're working with UTF-32 encoded text. If you're working with wide strings, take a look at widetoutf8() instead.
Warning
Conversion does not add a null-terminator. You must set it yourself, either by clearing the string beforehand or by setting the last byte(s) to zero.

Example:

/*
 * Example: run a query that arrives as UTF-32 encoded text.
 *
 * The required UTF-8 size is queried first, a buffer of that size plus one
 * terminating byte is allocated, and the converted query is passed on.
 *
 * query      UTF-32 encoded query text.
 * querySize  Size of query in bytes.
 *
 * Returns 0 when a conversion reports an error or the allocation fails,
 * otherwise the result of forwarding the converted query.
 */
int8_t Database_ExecuteQuery(const unicode_t* query, size_t querySize)
{
    int32_t errors = 0;
    char* converted = 0;
    int8_t result = 0;
    size_t converted_size = 0;

    /* First pass without a target buffer: only ask how many bytes are needed. */
    converted_size = utf32toutf8(query, querySize, 0, 0, &errors);
    if (errors != 0)
    {
        goto cleanup;
    }

    /* One extra byte, because the conversion does not add a null-terminator. */
    converted = (char*)malloc(converted_size + 1);
    if (converted == 0)
    {
        /* Allocation failed; memset() on a null pointer would be undefined. */
        goto cleanup;
    }
    memset(converted, 0, converted_size + 1);

    /* Second pass: the size passed excludes the terminating byte, so it stays zero. */
    utf32toutf8(query, querySize, converted, converted_size, &errors);
    if (errors != 0)
    {
        goto cleanup;
    }

    /* NOTE(review): single-argument call with UTF-8 text; presumably a
       separate overload taking const char* declared elsewhere - confirm. */
    result = Database_ExecuteQuery(converted);

cleanup:
    /* free() accepts a null pointer, so no guard is needed. */
    free(converted);
    return result;
}
Parameters
[in] input: UTF-32 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for encoding.
UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR: High surrogate pair was not matched.
UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR: Low surrogate pair was not matched.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
UTF8_ERR_INVALID_CHARACTER: Codepoint could not be encoded.
See also
utf16toutf8
widetoutf8
size_t utf8charlen ( char  encodedCharacter)

Returns the length in bytes of the encoded character.

A UTF-8 encoded codepoint must start with a special byte. This byte indicates how many bytes are used to encode the codepoint, up to a maximum of 4.

This function can be used to determine the number of bytes used to encode a codepoint.

Parameters
[in] encodedCharacter: Byte to check.
Returns
Amount of bytes used for encoding or SIZE_MAX on error.
int8_t utf8charvalid ( char  encodedCharacter)

Check if a character is valid according to UTF-8 encoding.

Parameters
[in] encodedCharacter: Byte to check.
Returns
1 on success or 0 on failure.
size_t utf8len ( const char *  text)

Get the length in codepoints of a UTF-8 encoded string.

Example:

/*
 * Example: compare the codepoint length of a password against the
 * codepoint length of the literal "hunter2".
 *
 * password  UTF-8 encoded string.
 *
 * Returns 1 when both lengths are equal, 0 otherwise.
 */
int8_t CheckPassword(const char* password)
{
    const size_t actual = utf8len(password);
    const size_t expected = utf8len("hunter2");

    if (actual == expected)
    {
        return 1;
    }
    return 0;
}
Parameters
[in] text: UTF-8 encoded string.
Returns
Length in codepoints or SIZE_MAX on error.
const char* utf8seek ( const char *  text,
const char *  textStart,
off_t  offset,
int  direction 
)

Seek into a UTF-8 encoded string.

Working with UTF-8 encoded strings can be tricky due to the nature of the variable-length encoding. Because one character no longer equals one byte, it can be difficult to skip around in a UTF-8 encoded string without decoding the codepoints.

This function provides an interface similar to fseek in order to enable skipping to another part of the string.

Note
textStart must come before text in memory when seeking from the current or end position.

Example:

/* Example: replace a three-byte placeholder sequence in a string with "ENTER". */
const char* text = "Press \xE0\x80\x13 to continue.";
/* Must be writable: strncpy() and strcat() below store into this buffer. */
char fixed[1024] = { 0 };
const char* commandStart;
const char* commandEnd;

commandStart = strstr(text, "\xE0\x80\x13");
if (commandStart == 0)
{
    return 0;
}

/* Copy everything in front of the placeholder; the buffer was zeroed, so
   the copied prefix is already null-terminated for strcat(). */
strncpy(fixed, text, commandStart - text);
strcat(fixed, "ENTER");

/* Move one codepoint forward from the placeholder to reach the remaining text. */
commandEnd = utf8seek(commandStart, text, 1, SEEK_CUR);
if (commandEnd != commandStart)
{
    strcat(fixed, commandEnd);
}
Parameters
[in] text: Input string.
[in] textStart: Start of input string.
[in] offset: Requested offset in codepoints.
[in] direction: Direction to seek in.
  • SEEK_SET Offset is from the start of the string.
  • SEEK_CUR Offset is from the current position of the string.
  • SEEK_END Offset is from the end of the string.
Returns
Pointer to the new position in the string, or the unchanged text pointer on error.
size_t utf8toutf16 ( const char *  input,
size_t  inputSize,
utf16_t *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-8 encoded string to a UTF-16 encoded string.

Note
This function should only be called directly if you are positive that you must convert to UTF-16, independent of platform. If you're working with wide strings, take a look at utf8towide() instead.
Warning
Conversion does not add a null-terminator. You must set it yourself, either by clearing the string beforehand or by setting the last byte(s) to zero.

Example:

/*
 * Example: draw UTF-8 encoded text through an interface that takes
 * 16-bit characters.
 *
 * x, y  Values passed through to Legacy_DrawText().
 * text  UTF-8 encoded, null-terminated string to draw.
 *
 * Nothing is drawn when the conversion reports an error.
 */
void Font_DrawText(int x, int y, const char* text)
{
    int32_t errors = 0;
    utf16_t converted[256] = { 0 };

    /* Convert the text parameter itself; the size arguments are in bytes. */
    size_t converted_size = utf8toutf16(text, strlen(text), converted, sizeof(converted), &errors);
    if (errors == 0)
    {
        Legacy_DrawText(g_FontCurrent, x, y, (unsigned short*)converted, converted_size);
    }
}
Parameters
[in] input: UTF-8 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for decoding.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf8towide
utf8toutf32
size_t utf8toutf32 ( const char *  input,
size_t  inputSize,
unicode_t *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-8 encoded string to a UTF-32 encoded string.

Note
This function should only be called directly if you are positive that you must convert to UTF-32, independent of platform. If you're working with wide strings, take a look at utf8towide() instead.
Warning
Conversion does not add a null-terminator. You must set it yourself, either by clearing the string beforehand or by setting the last byte(s) to zero.

Example:

/*
 * Example: decode UTF-8 encoded input into a single codepoint and add it
 * to a text field.
 *
 * encoded  UTF-8 encoded, null-terminated input.
 *
 * Nothing is added when the conversion reports an error.
 */
void TextField_AddCharacter(const char* encoded)
{
    unicode_t decoded = 0;
    int32_t status = 0;

    /* The target has room for exactly one codepoint. */
    utf8toutf32(encoded, strlen(encoded), &decoded, sizeof(decoded), &status);
    if (status != 0)
    {
        return;
    }

    TextField_AddCodepoint(decoded);
}
Parameters
[in] input: UTF-8 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for decoding.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
utf8towide
utf8toutf16
size_t utf8towide ( const char *  input,
size_t  inputSize,
wchar_t *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a UTF-8 encoded string to a wide string.

Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded. This function takes a UTF-8 encoded string as input and automatically calls the correct conversion function.

This allows for a cross-platform treatment of wide text and is preferable to using the UTF-16 or UTF-32 versions directly.

Note
Codepoints outside the Basic Multilingual Plane (BMP) are converted to surrogate pairs when using UTF-16. This means that strings containing characters outside the BMP converted on a platform with UTF-32 wide strings are not compatible with platforms with UTF-16 wide strings.
Hence, it is preferable to keep all data as UTF-8 and only convert to wide strings when required by a third-party interface.
Warning
Conversion does not add a null-terminator. You must set it yourself, either by clearing the string beforehand or by setting the last byte(s) to zero.

Example:

const char* input = "Bj\xC3\xB6rn Zonderland";
size_t input_size = strlen(input);
wchar_t* output = 0;
size_t output_size = 0;
size_t result = 0;
int32_t errors = 0;
output_size = utf8towide(input, input_size, 0, 0, &errors);
if (errors == 0)
{
output = (wchar_t*)malloc(output_size);
memset(output, 0, output_size);
utf8towide(input, input_size, output, output_size, &errors);
if (errors == 0)
{
Player_SetName(output);
}
free(output);
}
Parameters
[in] input: UTF-8 encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for decoding.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
See also
widetoutf8
utf8toutf16
utf8toutf32
size_t widetoutf8 ( const wchar_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize,
int32_t *  errors 
)

Convert a wide string to a UTF-8 encoded string.

Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded. This function takes a wide string as input and automatically calls the correct conversion function.

This allows for a cross-platform treatment of wide text and is preferable to using the UTF-16 or UTF-32 versions directly.

Warning
Conversion does not add a null-terminator. You must set it yourself, either by clearing the string beforehand or by setting the last byte(s) to zero.

Example:

/* Example: convert a wide-string path to UTF-8 and load a texture from it. */
const wchar_t* input = L"textures/\xD803\xDC11.png";
size_t input_size = wcslen(input) * sizeof(wchar_t);
size_t output_size = 0;
char* output = 0;
size_t result = 0;
int32_t errors = 0;

/* First pass without a target buffer: only ask how many bytes are needed. */
result = widetoutf8(input, input_size, 0, 0, &errors);
if (errors == 0)
{
    /* One extra byte, because the conversion does not add a null-terminator. */
    output_size = result + 1;
    output = (char*)malloc(output_size);
    if (output != 0)
    {
        memset(output, 0, output_size);

        /* Reuse the input size computed above instead of measuring again. */
        widetoutf8(input, input_size, output, output_size, &errors);
        if (errors == 0)
        {
            Texture_Load(output);
        }
        free(output);
    }
}
Parameters
[in] input: Wide-encoded string.
[in] inputSize: Size of the input in bytes.
[out] target: Output buffer for the result.
[in] targetSize: Size of the output buffer in bytes.
[out] errors: Output for errors.
Returns
Bytes written or amount of bytes needed for output if target buffer is specified as NULL.
Return values
UTF8_ERR_INVALID_DATA: Input does not contain enough bytes for encoding.
UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR: High surrogate pair was not matched.
UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR: Low surrogate pair was not matched.
UTF8_ERR_NOT_ENOUGH_SPACE: Target buffer could not contain result.
UTF8_ERR_INVALID_CHARACTER: Codepoint could not be encoded.
See also
utf8towide
utf16toutf8
utf32toutf8