// tlib/oversampling/WDL/wdlutf8.h

/*
WDL - wdlutf8.h
Copyright (C) 2005 and later, Cockos Incorporated

This software is provided 'as-is', without any express or implied
warranty.  In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

*/

#ifndef _WDLUTF8_H_
#define _WDLUTF8_H_

/* todo: handle overlongs?
 * todo: handle multi-byte (make WideStr support UTF-16)
 */

#include "wdltypes.h"

#ifndef WDL_WCHAR
  #ifdef _WIN32
    #define WDL_WCHAR WCHAR
  #else
    // this is often 4 bytes on macOS/linux! beware dragons!
    #define WDL_WCHAR wchar_t
  #endif
#endif


// Parses one UTF-8 encoded character starting at rd.
//
// rd:   first byte of the character, inside a NUL-terminated string. Each
//       following byte is only read after the previous one was found to be a
//       continuation byte (and therefore not NUL), so the terminator is never
//       read past.
// cOut: receives the code point. May be NULL if you only want the size.
//
// returns the size in bytes (1..6).
// if invalid UTF-8 (stray continuation byte, truncated sequence, overlong
// encoding), returns 1 and sets cOut to the first byte (as unsigned char).
// 5 and 6 byte sequences are consumed as a unit with cOut set to '_'.
// note: if *rd is NUL this returns 1 with cOut set to 0, callers must test
// for the end of the string themselves.
static int WDL_STATICFUNC_UNUSED wdl_utf8_parsechar(const char *rd, int *cOut)
{
  const unsigned char *p = (const unsigned char *)rd;
  const unsigned char b0 = *p;
  unsigned char b1,b2,b3;

  if (cOut) *cOut = b0; // result for ASCII and for every invalid case below
  if (b0 < 0x80)
  {
    return 1;
  }

  // 0x80..0xBF are continuation bytes and can not start a character.
  // 0xC0 and 0xC1 could only start an overlong encoding of an ASCII character.
  if (b0 < 0xC2) return 1;

  if (((b1=p[1])&0xC0) != 0x80) return 1;

  if (b0 < 0xE0)
  {
    if (cOut) *cOut = ((b0&0x1F)<<6)|(b1&0x3F);
    return 2;
  }

  if (((b2=p[2])&0xC0) != 0x80) return 1;

  if (b0 < 0xF0)
  {
    if (!(b0&0xF) && !(b1&0x20)) return 1; // detect overlong (value would fit in 2 bytes)

    if (cOut) *cOut = ((b0&0x0F)<<12)|((b1&0x3F)<<6)|(b2&0x3f);
    return 3;
  }

  if (((b3=p[3])&0xC0) != 0x80) return 1;

  if (b0 < 0xF8)
  {
    if (!(b0&0x7) && !(b1&0x30)) return 1; // detect overlong (value would fit in 3 bytes)

    if (cOut) *cOut = ((b0&7)<<18)|((b1&0x3F)<<12)|((b2&0x3F)<<6)|(b3&0x3F);
    return 4;
  }

  // UTF-8 does not actually support 5-6 byte sequences as of 2003 (RFC-3629)
  // skip them and return _
  if ((p[4]&0xC0) != 0x80) return 1;
  if (b0 < 0xFC)
  {
    if (cOut) *cOut = '_';
    return 5;
  }

  if ((p[5]&0xC0) != 0x80) return 1;
  if (cOut) *cOut = '_';
  return 6;
}


// encodes code point c as UTF-8 into dest, returns the number of bytes
// written (1..4). does NOT nul terminate.
// returns 0 (writing nothing) if dest_len is too small for the encoding,
// -1 if c is out of range (negative, or 0x200000 and above)
static int WDL_STATICFUNC_UNUSED wdl_utf8_makechar(int c, char *dest, int dest_len)
{
  int nbytes, i;
  int lead; // marker bits of the first byte for the chosen length

  if (c < 0 || c >= 0x200000) return -1;

  if (c < 0x80)         { nbytes = 1; lead = 0x00; }
  else if (c < 0x800)   { nbytes = 2; lead = 0xC0; }
  else if (c < 0x10000) { nbytes = 3; lead = 0xE0; }
  else                  { nbytes = 4; lead = 0xF0; }

  if (dest_len < nbytes) return 0;

  // continuation bytes carry 6 payload bits each, fill them in back to front
  for (i = nbytes-1; i > 0; i--)
  {
    dest[i] = (char)(0x80|(c&0x3F));
    c >>= 6;
  }
  // whatever is left of c fits below the marker bits of the first byte
  dest[0] = (char)(lead|c);
  return nbytes;
}


// converts the UTF-8 string src to a nul terminated wide string in dest.
//
// dest:         output buffer
// src:          nul terminated UTF-8 string, may be NULL (treated as empty)
// destlenbytes: size of dest in BYTES (not in characters)
//
// returns the number of WDL_WCHARs written, not counting the terminator.
// returns 0 without touching dest if dest is NULL or destlenbytes can not even
// hold the terminator. output is truncated if dest is too small.
// invalid UTF-8 are now treated as ANSI characters for this function
// note: each code point is stored with a plain assignment, so values that do
// not fit in WDL_WCHAR (when it is narrower than int) are truncated.
static int WDL_STATICFUNC_UNUSED WDL_MBtoWideStr(WDL_WCHAR *dest, const char *src, int destlenbytes)
{
  WDL_WCHAR *w, *dest_endp;

  // the terminator is a whole WDL_WCHAR, a buffer smaller than that can not be
  // written to at all. validate before doing any pointer arithmetic on dest.
  if (!dest || destlenbytes < (int)sizeof(WDL_WCHAR)) return 0;

  w = dest;
  dest_endp = dest + (size_t)destlenbytes/sizeof(WDL_WCHAR) - 1; // last slot is reserved for the terminator

  if (src) while (*src && w < dest_endp)
  {
    int c;
    const int sz = wdl_utf8_parsechar(src,&c);
    *w++ = (WDL_WCHAR)c;
    src += sz;
  }
  *w=0;
  return (int)(w-dest);
}


// nul terminating variant of wdl_utf8_makechar: encodes c into dest and
// appends a terminator. destlen is the full size of dest including the
// terminator.
// returns the length of the encoded character, not counting the terminator.
// if c can not be encoded (out of range, or not enough room) writes "_" and
// returns 1. negative values for c are treated as 0.
// with destlen below 2 nothing can be encoded: returns 0, leaving an empty
// string in dest when destlen is exactly 1.
static int WDL_STATICFUNC_UNUSED WDL_MakeUTFChar(char* dest, int c, int destlen)
{
  int n;

  if (destlen < 2)
  {
    if (destlen == 1) *dest = 0;
    return 0;
  }

  if (c < 0) c = 0;

  // keep one byte back for the terminator
  n = wdl_utf8_makechar(c, dest, destlen-1);
  if (n >= 1)
  {
    dest[n] = 0;
    return n;
  }

  // insufficient space or out of range character
  dest[0] = '_';
  dest[1] = 0;
  return 1;
}

// converts the nul terminated wide string src to UTF-8 in dest, always nul
// terminating the result.
// destlenbytes is the size of dest in bytes. src may be NULL (treated as empty).
// returns the number of bytes written, not counting the terminator, or 0 if
// dest is NULL or destlenbytes is less than 1.
// conversion stops at the first character that no longer fits. characters that
// wdl_utf8_makechar rejects as out of range are skipped.
static int WDL_STATICFUNC_UNUSED WDL_WideToMBStr(char *dest, const WDL_WCHAR *src, int destlenbytes)
{
  char *wr, *limit;

  if (!dest || destlenbytes < 1) return 0;

  wr = dest;
  limit = dest + destlenbytes - 1; // the last byte is reserved for the terminator

  if (src)
  {
    for (; *src && wr < limit; src++)
    {
      const int n = wdl_utf8_makechar(*src, wr, (int)(limit-wr));
      if (n == 0) break; // out of space
      if (n > 0) wr += n;
    }
  }

  *wr = 0;
  return (int)(wr-dest);
}

// classifies a nul terminated string:
// returns >0 if it contains at least one multi-byte UTF-8 character and
// nothing invalid, -1 if 8-bit chars occur that are not UTF-8, or 0 if it is
// pure ASCII (also for NULL and for the empty string)
static int WDL_STATICFUNC_UNUSED WDL_DetectUTF8(const char *str)
{
  const unsigned char *rd = (const unsigned char *)str;
  int found_multibyte = 0;

  if (!rd) return 0;

  while (*rd)
  {
    if (*rd < 0x80)
    {
      rd++; // plain ASCII
    }
    else if (*rd < 0xC2 || *rd > 0xF7)
    {
      return -1; // byte that can not start a 2 to 4 byte sequence
    }
    else
    {
      const int len = wdl_utf8_parsechar((const char *)rd, NULL);
      if (len < 2) return -1; // wdl_utf8_parsechar returns length=1 if it couldn't parse UTF-8 properly
      rd += len;
      found_multibyte = 1;
    }
  }
  return found_multibyte;
}


// returns the byte offset in str of the character with index charpos.
// stops at the end of the string, so the result is never past the terminator.
// a charpos of 0 or less gives 0. str must not be NULL.
static int WDL_STATICFUNC_UNUSED WDL_utf8_charpos_to_bytepos(const char *str, int charpos)
{
  int offs = 0, remaining;
  for (remaining = charpos; remaining > 0 && str[offs]; remaining--)
  {
    offs += wdl_utf8_parsechar(str+offs,NULL);
  }
  return offs;
}
// returns the number of characters that start before byte offset bytepos in
// str, stopping at the end of the string. a bytepos of 0 or less gives 0.
// str must not be NULL.
static int WDL_STATICFUNC_UNUSED WDL_utf8_bytepos_to_charpos(const char *str, int bytepos)
{
  int nchars = 0, offs;
  for (offs = 0; offs < bytepos && str[offs]; nchars++)
  {
    offs += wdl_utf8_parsechar(str+offs,NULL);
  }
  return nchars;
}

#define WDL_utf8_get_charlen(rd) WDL_utf8_bytepos_to_charpos((rd), 0x7fffffff)

// changes the case of the single UTF-8 character at p, in place.
//
// p:     first byte of a character inside a nul terminated string
// upper: 1 to convert to upper case, -1 to convert to lower case
//
// handles ASCII and the 2-byte characters u+0c0 to u+17f (lead bytes 0xc3 to
// 0xc5), everything else is left untouched. the encoded length never changes:
// characters whose other case is not a 2-byte character (u+0df, u+149, u+17f)
// are left alone. a second byte that is not a continuation byte is never
// modified.
static void WDL_STATICFUNC_UNUSED wdl_utf8_set_char_case(char *p, int upper) // upper 1 or -1 only
{
  const unsigned char c1 = (unsigned char)*p;
  WDL_ASSERT(upper == 1 || upper == -1);
  if (c1 >= 'a' && c1 <= 'z')
  {
    if (upper>0) *p += 'A'-'a';
  }
  else if (c1 >= 'A' && c1 <= 'Z')
  {
    if (upper<0) *p -= 'A'-'a';
  }
  else if (c1 >= 0xc3 && c1 <= 0xc5)
  {
    const unsigned char b1 = (unsigned char)p[1];
    unsigned char cc;

    // the second byte only belongs to this character if it is a continuation
    // byte. anything else (including the string's nul terminator after a
    // truncated sequence) must not be modified.
    if ((b1&0xC0) != 0x80) return;

    cc = (unsigned char)(b1 - 0x80); // low 6 bits of the code point, 0..0x3f
    switch (c1)
    {
      case 0xc3: // u+0c0 to u+0ff as 0..0x3f
        if (cc == 0x3f)
        {
          // u+0ff pairs with u+178 (not with u+0df, which is what clearing
          // the 0x20 bit would give)
          if (upper>0)
          {
            p[0] = (char)0xc5;
            p[1] = (char)0xb8;
          }
        }
        else if (cc != 0x1f && (cc&~0x20) != 0x17)
        {
          // all remaining values except u+0d7 and u+0f7 (not letters) and
          // u+0df (lowercase, no 2-byte uppercase form): bit 0x20 is lowercase
          if (upper>0) p[1] &= ~0x20;
          else p[1] |= 0x20;
        }
      break;
      case 0xc4: // u+100 to u+13f
        if (cc <= 0x37)
        {
          // u+100 to u+137 low bit is lowercase
          if (upper>0) p[1] &= ~1;
          else p[1] |= 1;
        }
        // u+138 is not cased
        else if (cc >= 0x39 && cc < 0x3f)
        {
          // u+139 to u+13e, odd is uppercase
          if ((cc & 1) != (upper>0)) p[1] -= upper;
        }
        else if (cc == 0x3f && upper<0) // u+13f convert to u+140
        {
          p[0]++;
          p[1] -= 0x3f;
        }
      break;
      case 0xc5: // u+140 to u+17f
        // u+149 and u+17f are not cased
        if (cc == 0 && upper>0) // u+140 -> u+13f
        {
          p[0]--;
          p[1] |= 0x3f;
        }
        else if (cc >= 0xa && cc <= 0x37) // u+14a to u+177 low bit is lowercase
        {
          if (upper>0) p[1] &= ~1;
          else p[1] |= 1;
        }
        else if (cc == 0x38) // u+178 is uppercase, its lowercase is u+0ff
        {
          if (upper<0)
          {
            p[0] = (char)0xc3;
            p[1] = (char)0xbf;
          }
        }
        else if ((cc > 0 && cc <= 8) || (cc >= 0x39 && cc <= 0x3e))
        {
          // u+141 to u+148 and u+179 to u+17e have odd=uppercase
          if ((cc & 1) != (upper>0)) p[1] -= upper;
        }
      break;
    }
  }
}


#endif