UTF-8 handling sample
strncpy() is a terrible function:
1. If there is insufficient space, the resulting string will not be nul terminated.
2. If there is enough space, the remainder is filled with NULs. This can be painful if the target string is very big.
Even if the characters stay in the ASCII range (0x7f and below), the resulting string will not be what you want. In the UTF-8 case it might not be NUL-terminated, and it might end in an invalid (truncated) UTF-8 sequence.
Best advice is to avoid strncpy().
EDIT: ad 1):
#include <stdio.h>
#include <string.h>
int main (void)
{
char buff [4];
strncpy (buff, "hello world!\n", sizeof buff );
printf("%s\n", buff );
return 0;
}
Agreed, the buffer will not be overrun. But the result is still unwanted. strncpy() solves only part of the problem. It is misleading and unwanted.
UPDATE (2012-10-31): Since this is a nasty problem, I decided to hack together my own version, mimicking the ugly strncpy() behavior. The return value, though, is the number of characters copied.
#include <stdio.h>
#include <string.h>
/*
 * UTF-8 aware variant of strncpy(): copies from src to dst, with todo as the
 * limit on how many bytes are processed.
 * NOTE(review): per the accompanying text the return value is the number of
 * characters copied; the definition is only partly visible here -- confirm.
 */
size_t utf8ncpy(char *dst, char *src, size_t todo);
/*
 * Sequence length announced by the byte ch: 0 if len is 0, a negative value
 * if ch is not usable as the start of a sequence.
 */
static int cnt_utf8(unsigned ch, size_t len);
/*
 * Classify the byte `ch` as the start of a UTF-8 sequence and report how
 * many bytes that sequence occupies.
 *
 * ch   The byte to inspect. Only the low 8 bits are examined, so a plain
 *      (possibly sign-extended) char may be passed directly.
 * len  Number of source bytes available, counting from `ch` itself.
 *
 * Returns:
 *   0      when len is 0 (end of input);
 *   1      for an ASCII byte (0x00..0x7f);
 *   2..6   for a multibyte lead byte whose whole sequence fits in len bytes
 *          (5 and 6 are the historical long forms, kept for compatibility);
 *   -1     for a byte that cannot start a sequence (continuation byte,
 *          0xfe, 0xff) or whose announced sequence would extend past the
 *          len available bytes.
 */
static int cnt_utf8(unsigned ch, size_t len)
{
    int need;

    if (!len) {
        return 0;
    }

    /* Drop any sign-extension bits so only the actual byte value is tested. */
    ch &= 0xffu;

    if ((ch & 0x80) == 0x00) {
        need = 1;
    } else if ((ch & 0xe0) == 0xc0) {
        need = 2;
    } else if ((ch & 0xf0) == 0xe0) {
        need = 3;
    } else if ((ch & 0xf8) == 0xf0) {
        need = 4;
    } else if ((ch & 0xfc) == 0xf8) {
        need = 5;
    } else if ((ch & 0xfe) == 0xfc) {
        need = 6;
    } else {
        return -1; /* Default (Not in the spec) */
    }

    /*
     * A lead byte that promises more bytes than remain is a truncated
     * sequence. Reporting its nominal length would make the caller step
     * past the end of the source, so classify it as a bad byte instead.
     */
    if ((size_t)need > len) {
        return -1;
    }
    return need;
}
size_t utf8ncpy(char *dst, char *src, size_t todo)
{
size_t done, idx, chunk, srclen;
srclen = strlen(src);
for(done=idx=0; idx < srclen; idx+=chunk) {
int ret;
for (chunk=0; done+chunk < todo; chunk++) {
ret = cnt_utf8( src[idx+chunk], srclen - (idx+chunk) );
if (ret ==1) continue; /* Normal character: collect it into chunk */
if (ret < 0) continue; /* Bad stuff: treat as normal char */
if (ret ==0) break; /* EOF */
if (!chunk) chunk = ret;/* an UTF8 multibyte character */
else ret = 1; /* we allready collected a number (chunk) of normal characters */
break;