318 lines
7.6 KiB
C
318 lines
7.6 KiB
C
/*
|
|
WDL - wdlutf8.h
|
|
Copyright (C) 2005 and later, Cockos Incorporated
|
|
|
|
This software is provided 'as-is', without any express or implied
|
|
warranty. In no event will the authors be held liable for any damages
|
|
arising from the use of this software.
|
|
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
including commercial applications, and to alter it and redistribute it
|
|
freely, subject to the following restrictions:
|
|
|
|
1. The origin of this software must not be misrepresented; you must not
|
|
claim that you wrote the original software. If you use this software
|
|
in a product, an acknowledgment in the product documentation would be
|
|
appreciated but is not required.
|
|
2. Altered source versions must be plainly marked as such, and must not be
|
|
misrepresented as being the original software.
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
|
|
*/
|
|
|
|
#ifndef _WDLUTF8_H_
|
|
#define _WDLUTF8_H_
|
|
|
|
/* todo: handle overlongs?
|
|
* todo: handle multi-byte (make WideStr support UTF-16)
|
|
*/
|
|
|
|
#include "wdltypes.h"
|
|
|
|
// WDL_WCHAR is the wide character type taken by WDL_MBtoWideStr() and
// WDL_WideToMBStr(). A definition made before this point is left untouched.
#if !defined(WDL_WCHAR)
  #if defined(_WIN32)
    #define WDL_WCHAR WCHAR
  #else
    // wchar_t is often 4 bytes on macOS/linux! beware dragons!
    #define WDL_WCHAR wchar_t
  #endif
#endif
|
|
|
|
|
|
// Decodes the UTF-8 sequence that starts at rd.
//   rd:   NUL-terminated input; the terminator is never read past, because every
//         byte is checked to be a continuation byte before the next one is read.
//   cOut: receives the code point; may be NULL if only the size is wanted.
// Returns the number of bytes consumed (1..6).
// If the input is not valid UTF-8 (stray continuation byte, truncated sequence,
// overlong encoding), returns 1 and sets cOut to the first byte (as unsigned char).
// Obsolete 5 and 6 byte sequences are skipped as a unit and reported as '_'.
// Note: surrogates and values above U+10FFFF (up to 0x1FFFFF) are not rejected.
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
{
  const unsigned char *p = (const unsigned char *)rd;
  const unsigned char b0 = *p;
  unsigned char b1,b2,b3;

  if (cOut) *cOut = b0;
  if (b0 < 0x80)
  {
    return 1;
  }

  // 0x80..0xBF are continuation bytes and can never begin a sequence. without this
  // test they would fall into the 2-byte case below and slip past the overlong check
  if (b0 < 0xC0) return 1;

  if (((b1=p[1])&0xC0) != 0x80) return 1;

  if (b0 < 0xE0)
  {
    if (!(b0&0x1E)) return 1; // detect overlong (0xC0, 0xC1)
    if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
    return 2;
  }

  if (((b2=p[2])&0xC0) != 0x80) return 1;

  if (b0 < 0xF0)
  {
    if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong (value below 0x800)

    if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3f);
    return 3;
  }

  if (((b3=p[3])&0xC0) != 0x80) return 1;

  if (b0 < 0xF8)
  {
    if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong (value below 0x10000)

    if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
    return 4;
  }

  // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  // skip them and return _
  if ((p[4]&0xC0) != 0x80) return 1;
  if (b0 < 0xFC)
  {
    if (cOut) *cOut = '_';
    return 5;
  }

  if ((p[5]&0xC0) != 0x80) return 1;
  if (cOut) *cOut = '_';
  return 6;
}
|
|
|
|
|
|
// Encodes the code point c as UTF-8 into dest. Does NOT nul terminate.
//   c:        value to encode; 0..0x1FFFFF is accepted
//   dest:     output buffer, only written when the whole sequence fits
//   dest_len: space available in dest, in bytes
// Returns the number of bytes written (1..4), 0 if dest_len is too small,
// or -1 if c is negative or needs more than 4 bytes.
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
{
  int need;

  if (c < 0 || c >= 0x200000) return -1; // out of range character

  // how many bytes does this value take?
  if (c < 0x80) need = 1;
  else if (c < 0x800) need = 2;
  else if (c < 0x10000) need = 3;
  else need = 4;

  if (dest_len < need) return 0;

  switch (need)
  {
    case 1:
      dest[0] = (char)c;
    break;
    case 2:
      dest[0] = (char)(0xC0 | (c >> 6));
      dest[1] = (char)(0x80 | (c & 0x3F));
    break;
    case 3:
      dest[0] = (char)(0xE0 | (c >> 12));
      dest[1] = (char)(0x80 | ((c >> 6) & 0x3F));
      dest[2] = (char)(0x80 | (c & 0x3F));
    break;
    default:
      dest[0] = (char)(0xF0 | (c >> 18));
      dest[1] = (char)(0x80 | ((c >> 12) & 0x3F));
      dest[2] = (char)(0x80 | ((c >> 6) & 0x3F));
      dest[3] = (char)(0x80 | (c & 0x3F));
    break;
  }

  return need;
}
|
|
|
|
|
|
// invalid UTF-8 are now treated as ANSI characters for this function
|
|
static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
|
|
{
|
|
WDL_WCHAR *w = dest, *dest_endp = dest+(size_t)destlenbytes/sizeof(WDL_WCHAR)-1;
|
|
if (!dest || destlenbytes < 1) return 0;
|
|
|
|
if (src) for (; *src && w < dest_endp; )
|
|
{
|
|
int c,sz=wdl_utf8_parsechar(src,&c);
|
|
*w++ = c;
|
|
src+=sz;
|
|
}
|
|
*w=0;
|
|
return (int)(w-dest);
|
|
}
|
|
|
|
|
|
// Writes the code point c to dest as a nul terminated UTF-8 string.
//   dest:    output buffer
//   c:       code point; negative values are encoded as 0
//   destlen: size of dest in bytes, including room for the terminator
// Returns the number of bytes written before the terminator.
// Unlike wdl_utf8_makechar, errors (value out of range, or sequence does not fit)
// produce the string "_" and return 1. If destlen is below 2 nothing but an
// optional terminator is written and 0 is returned.
static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char* dest, int c, int destlen)
{
  int len;

  if (destlen < 2)
  {
    // room for the terminator at most
    if (destlen == 1) *dest = 0;
    return 0;
  }

  if (c < 0) c = 0;

  // keep one byte back for the terminator
  len = wdl_utf8_makechar(c, dest, destlen-1);
  if (len < 1)
  {
    // either insufficient space or out of range character
    dest[0] = '_';
    len = 1;
  }
  dest[len] = 0;
  return len;
}
|
|
|
|
// Converts the 0 terminated wide string src to UTF-8 in dest.
//   dest:         output buffer; always nul terminated when anything is written
//   src:          input string; NULL is treated as an empty string
//   destlenbytes: size of dest in bytes, including room for the terminator
// Returns the number of bytes written, not counting the terminator (0 if dest is
// NULL or destlenbytes < 1). Characters that wdl_utf8_makechar rejects as out of
// range are skipped; conversion stops at the first character that does not fit.
static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
{
  char *wr, *limit;

  if (!dest || destlenbytes < 1) return 0;

  wr = dest;
  limit = dest + destlenbytes - 1; // last byte is reserved for the terminator

  if (src)
  {
    for (; *src && wr < limit; src++)
    {
      const int n = wdl_utf8_makechar(*src, wr, (int)(limit - wr));
      if (n == 0) break; // out of space
      if (n > 0) wr += n;
    }
  }

  *wr = 0;
  return (int)(wr - dest);
}
|
|
|
|
// Classifies the NUL-terminated string str:
//   returns 1  if it contains at least one multi-byte UTF-8 sequence and nothing invalid
//   returns 0  if it is pure 7-bit ASCII (or str is NULL)
//   returns -1 if 8-bit bytes occur that do not parse as UTF-8
// Only lead bytes 0xC2..0xF7 are handed to wdl_utf8_parsechar; any other byte
// >= 0x80 is rejected immediately.
static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
{
  const unsigned char *rd = (const unsigned char *)str;
  int found = 0;

  if (!rd) return 0;

  while (*rd)
  {
    if (*rd < 0x80)
    {
      rd++; // plain ASCII
      continue;
    }

    if (*rd < 0xC2 || *rd > 0xF7) return -1; // cannot start a sequence

    {
      const int len = wdl_utf8_parsechar((const char *)rd, NULL);
      if (len < 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
      rd += len;
      found = 1;
    }
  }

  return found;
}
|
|
|
|
|
|
// Returns the byte offset in str of the character with index charpos.
// Characters are measured with wdl_utf8_parsechar. If str has fewer than charpos
// characters the offset of the terminator is returned; charpos <= 0 yields 0.
static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
{
  const char *rd = str;
  int remaining;

  for (remaining = charpos; remaining > 0 && *rd; remaining--)
  {
    rd += wdl_utf8_parsechar(rd, NULL);
  }

  return (int)(rd - str);
}
|
|
// Returns how many characters of str start before byte offset bytepos.
// Characters are measured with wdl_utf8_parsechar; counting also stops at the
// terminator, so a bytepos past the end gives the total character count.
static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
{
  int count = 0;
  int off = 0;

  for (;;)
  {
    if (off >= bytepos || !str[off]) break;
    off += wdl_utf8_parsechar(str+off, NULL);
    count++;
  }

  return count;
}
|
|
|
|
// length of the NUL-terminated string rd in characters (not bytes): passes a byte limit
// of 0x7fffffff so that WDL_utf8_bytepos_to_charpos() only stops at the terminator
#define WDL_utf8_get_charlen(rd) WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff)
|
|
|
|
// Changes the case of the single UTF-8 character at p, in place.
//   p:     points at the first byte of a character in a NUL-terminated string
//   upper: 1 to convert to uppercase, -1 to convert to lowercase
// Handles ASCII and the 2-byte characters u+0c0 to u+17f. The encoded length never
// changes, so characters whose other case has a different length, or that have no
// single-character counterpart, are left alone (u+0df, u+130, u+131, u+149, u+17f).
// Anything else, including a lead byte that is not followed by a continuation byte,
// is left unmodified.
static void WDL_STATICFUNC_UNUSED wdl_utf8_set_char_case(char *p, int upper) // upper 1 or -1 only
{
  const unsigned char c1 = (unsigned char)*p;
  WDL_ASSERT(upper == 1 || upper == -1);
  if (c1 >= 'a' && c1 <= 'z')
  {
    if (upper>0) *p += 'A'-'a';
  }
  else if (c1 >= 'A' && c1 <= 'Z')
  {
    if (upper<0) *p -= 'A'-'a';
  }
  else if (c1 >= 0x80)
  {
    // low 6 bits of the second byte. if p[1] is not a continuation byte this wraps
    // or exceeds 0x3f, and every branch below then rejects it
    const unsigned char cc = (unsigned char)p[1] - 0x80;
    switch (c1)
    {
      case 0xc3: // u+0c0 to u+0ff as 0..0x3f
        if (cc == 0x3f)
        {
          // u+0ff pairs with u+178, which is encoded with lead byte 0xc5
          if (upper>0)
          {
            p[0] = (char)0xc5;
            p[1] = (char)0xb8;
          }
        }
        else if (cc < 0x3f && (cc&~0x20) != 0x17 && cc != 0x1f)
        {
          // all values except u+0d7 and u+0f7 (not letters) and u+0df (no
          // single-character uppercase): bit 0x20 selects lowercase
          if (upper>0) p[1] &= ~0x20;
          else p[1] |= 0x20;
        }
      break;
      case 0xc4: // u+100 to u+13f
        if (cc <= 0x37)
        {
          // u+100 to u+137 low bit is lowercase.
          // u+130 and u+131 are adjacent but are not each other's case (their
          // counterparts are the ASCII i/I), so they are not converted
          if (cc != 0x30 && cc != 0x31)
          {
            if (upper>0) p[1] &= ~1;
            else p[1] |= 1;
          }
        }
        // u+138 is not cased
        else if (cc >= 0x39 && cc < 0x3f)
        {
          // u+139 to u+13e, odd is uppercase
          if ((cc & 1) != (upper>0)) p[1] -= upper;
        }
        else if (cc == 0x3f && upper<0) // u+13f convert to u+140
        {
          p[0]++;
          p[1] -= 0x3f;
        }
      break;
      case 0xc5: // u+140 to u+17f
        // u+149 and u+17f are not converted
        if (cc == 0 && upper>0) // u+140 -> u+13f
        {
          p[0]--;
          p[1] |= 0x3f;
        }
        else if (cc >= 0xa && cc <= 0x37) // u+14a to u+177 low bit is lowercase
        {
          if (upper>0) p[1] &= ~1;
          else p[1] |= 1;
        }
        else if ((cc > 0 && cc <= 8) || (cc >= 0x39 && cc <= 0x3e))
        {
          // u+141 to u+148 and u+179 to u+17e have odd=uppercase
          if ((cc & 1) != (upper>0)) p[1] -= upper;
        }
        else if (cc == 0x38 && upper<0) // u+178 -> u+0ff
        {
          p[0] = (char)0xc3;
          p[1] = (char)0xbf;
        }
      break;
      default:
      break;
    }
  }
}
|
|
|
|
|
|
#endif
|