utf8rewind
 All Files Functions Typedefs Macros Pages
utf8rewind.h File Reference

Functions for working with UTF-8 encoded text. More...

Go to the source code of this file.

Macros

#define UTF8_ERR_INVALID_CHARACTER   (-1)
 
#define UTF8_ERR_INVALID_DATA   (-2)
 
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-3)
 
#define UTF8_ERR_OUT_OF_RANGE   (-4)
 
#define UTF8_ERR_UNHANDLED_SURROGATE_PAIR   (-5)
 
#define UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR   (-6)
 
#define UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR   (-7)
 

Typedefs

typedef unsigned int unicode_t
 
typedef unsigned short ucs2_t
 
typedef unsigned short utf16_t
 

Functions

int utf8charvalid (char encodedCharacter)
 Check if a character is valid according to UTF-8 encoding. More...
 
int utf8charlen (char encodedCharacter)
 Returns the length in bytes of the encoded character. More...
 
int utf8len (const char *text)
 Get the length in codepoints of a UTF-8 encoded string. More...
 
int utf8encode (unicode_t codepoint, char *target, size_t targetSize)
 Encode a Unicode codepoint to UTF-8. More...
 
int utf8convertucs2 (ucs2_t codepoint, char *target, size_t targetSize)
 Convert a UCS-2 codepoint to UTF-8. More...
 
int wctoutf8 (const wchar_t *input, size_t inputSize, char *target, size_t targetSize)
 Convert a UTF-16 encoded string to UTF-8. More...
 
int utf8decode (const char *text, unicode_t *result)
 Decode a UTF-8 encoded codepoint to a Unicode codepoint. More...
 
int utf8towc (const char *input, size_t inputSize, wchar_t *target, size_t targetSize)
 Convert a UTF-8 encoded string to UTF-16. More...
 
const char * utf8seek (const char *text, const char *textStart, off_t offset, int direction)
 Seek into a UTF-8 encoded string. More...
 

Detailed Description

Functions for working with UTF-8 encoded text.

Macro Definition Documentation

#define UTF8_ERR_INVALID_CHARACTER   (-1)
#define UTF8_ERR_INVALID_DATA   (-2)
#define UTF8_ERR_NOT_ENOUGH_SPACE   (-3)
#define UTF8_ERR_OUT_OF_RANGE   (-4)
#define UTF8_ERR_UNHANDLED_SURROGATE_PAIR   (-5)
#define UTF8_ERR_UNMATCHED_HIGH_SURROGATE_PAIR   (-6)
#define UTF8_ERR_UNMATCHED_LOW_SURROGATE_PAIR   (-7)

Typedef Documentation

typedef unsigned short ucs2_t

UCS-2 encoded codepoint.

typedef unsigned int unicode_t

Unicode codepoint.

typedef unsigned short utf16_t

UTF-16 encoded codepoint.

Function Documentation

int utf8charlen ( char  encodedCharacter)

Returns the length in bytes of the encoded character.

A UTF-8 encoded codepoint must start with a special byte. This byte indicates how many bytes are used to encode the codepoint, up to a maximum of 6.

This function can be used to determine the amount of bytes used to encode a codepoint.

Parameters
encodedCharacter: Character to check.
Returns
Number of bytes used to encode the codepoint or an error code.
int utf8charvalid ( char  encodedCharacter)

Check if a character is valid according to UTF-8 encoding.

Parameters
encodedCharacter: Character to check.
Returns
1 on success or 0 on failure.
int utf8convertucs2 ( ucs2_t  codepoint,
char *  target,
size_t  targetSize 
)

Convert a UCS-2 codepoint to UTF-8.

UCS-2 encoding is similar to UTF-16 encoding, except that it does not use surrogate pairs to encode values beyond U+FFFF.

This encoding was standard on Microsoft Windows XP. Newer versions of Windows use UTF-16 instead.

If 0 is specified as the target buffer, this function returns the number of bytes needed to store the codepoint.

Note
Surrogate pairs cannot be converted using this function. Use wctoutf8() instead.

Example:

ucs2_t input[] = { 0x3041, 0x304B, 0x3060, 0x3074 };
const size_t input_size = sizeof(input) / sizeof(ucs2_t);
const size_t text_size = 128;
char text[text_size] = { 0 };
char* dst = text;
size_t i;
int offset;
for (i = 0; i < input_size; ++i)
{
offset = utf8convertucs2(input[i], dst, text_size);
if (offset <= 0)
{
return 0;
}
dst += offset;
}
Parameters
codepoint: UCS-2 encoded codepoint.
target: String to write the result to.
targetSize: Amount of bytes remaining in the string.
Returns
Amount of bytes written or an error code.
See also
wctoutf8
utf8encode
int utf8decode ( const char *  text,
unicode_t *  result 
)

Decode a UTF-8 encoded codepoint to a Unicode codepoint.

The result of this function can be used to offset the input string in order to decode all characters in a string.

Example:

const char* input = "Name: Bj\xC3\xB6rn Zonderland";
const char* src = input;
unicode_t codepoint;
int offset;
int i;
FontBatch_Start();
for (i = 0; i < utf8len(input); ++i)
{
offset = utf8decode(src, &codepoint);
if (offset <= 0)
{
break;
}
Font_AddCharacter(codepoint);
src += offset;
}
FontBatch_End();
FontBatch_Draw(100, 100);
Parameters
text: Input string.
result: Pointer to the Unicode codepoint to write the result to.
Returns
Input offset in bytes or an error code.
See also
utf8encode
int utf8encode ( unicode_t  codepoint,
char *  target,
size_t  targetSize 
)

Encode a Unicode codepoint to UTF-8.

Unicode codepoints must be in the range U+0000 to U+10FFFF; however, the range U+D800 to U+DFFF is reserved for surrogate pairs and cannot be encoded.

Example:

char result[128];
char* dst;
memset(result, 0, 128);
strcat(result, "STARG");
dst = result + strlen(result);
utf8encode(0x1402, dst, 128 - strlen(result));
strcat(result, "TE");
Parameters
codepoint: Unicode codepoint.
target: String to write the result to.
targetSize: Amount of bytes remaining in the string.
Returns
Amount of bytes written or an error code.
See also
wctoutf8
utf8convertucs2
int utf8len ( const char *  text)

Get the length in codepoints of a UTF-8 encoded string.

Example:

int CheckPassword(const char* password)
{
int length = utf8len(password);
return (length == utf8len("hunter2"));
}
Parameters
text: UTF-8 encoded string.
Returns
Length in codepoints or an error code.
const char* utf8seek ( const char *  text,
const char *  textStart,
off_t  offset,
int  direction 
)

Seek into a UTF-8 encoded string.

Working with UTF-8 encoded strings can be tricky due to the nature of the variable-length encoding. Because one character no longer equals one byte, it can be difficult to skip around in a UTF-8 encoded string without decoding the codepoints.

This function provides an interface similar to fseek in order to enable skipping to another part of the string.

Example:

const char* text = "Input: <LEFT ARROW>";
const char* input = utf8seek(text, text, utf8len("Input: "), SEEK_SET);

Directions:

  • SEEK_SET Offset is from the start of the string.
  • SEEK_CUR Offset is from the current position of the string.
  • SEEK_END Offset is from the end of the string.
Note
textStart must come before text in memory when seeking from the current or end position.
Parameters
text: Input string.
textStart: Start of input string.
offset: Requested offset in codepoints.
direction: Direction to seek in.
Returns
Changed string or no change on error.
int utf8towc ( const char *  input,
size_t  inputSize,
wchar_t *  target,
size_t  targetSize 
)

Convert a UTF-8 encoded string to UTF-16.

Example:

const char* input = "Bj\xC3\xB6rn Zonderland";
size_t output_size = (strlen(input) + 1) * sizeof(wchar_t);
wchar_t* output = (wchar_t*)malloc(output_size);
int result = 0;
memset(output, 0, output_size);
result = utf8towc(input, strlen(input), output, output_size);
if (result > 0)
{
Player_SetName(output);
}
Parameters
input: UTF-8 encoded string.
inputSize: Size of the input in bytes.
target: String to write the result to.
targetSize: Amount of bytes remaining in the string.
Returns
Amount of bytes written or an error code.
See also
wctoutf8
utf8decode
int wctoutf8 ( const wchar_t *  input,
size_t  inputSize,
char *  target,
size_t  targetSize 
)

Convert a UTF-16 encoded string to UTF-8.

UTF-16 encoded text consists of either two or four bytes per encoded codepoint. A codepoint may consist of a high and low surrogate pair, which allows the encoding of the full range of Unicode characters that would otherwise not fit in a single 16-bit integer.

If 0 is specified as the target buffer, this function returns the number of bytes needed to store the string.

Example:

const wchar_t* input = L"textures/\xD803\xDC11.png";
size_t output_size = 0;
char* output = 0;
int result = 0;
result = wctoutf8(input, wcslen(input) * sizeof(wchar_t), 0, 0);
if (result > 0)
{
output_size = (size_t)result + 1;
output = (char*)malloc(output_size);
memset(output, 0, output_size);
result = wctoutf8(input, wcslen(input) * sizeof(wchar_t), output, output_size);
if (result > 0)
{
Texture_Load(output);
}
free(output);
}
Parameters
input: UTF-16 encoded string.
inputSize: Size of the input in bytes.
target: String to write the result to.
targetSize: Amount of bytes remaining in the string.
Returns
Amount of bytes written or an error code.
See also
utf8towc