// tlib/oversampling/WDL/has_strings.h

#ifndef _WDL_HASSTRINGS_H_
#define _WDL_HASSTRINGS_H_

#ifndef WDL_HASSTRINGS_EXPORT
#define WDL_HASSTRINGS_EXPORT
#endif

// Rewinds str to the first byte of the UTF-8 character it points into.
// Continuation bytes (bit pattern 10xxxxxx) are stepped over backwards; the
// result is never moved before base, and *str is only read while str > base.
WDL_HASSTRINGS_EXPORT const char *hasStrings_rewutf8(const char *str, const char *base)
{
  for (;;)
  {
    if (str <= base) break; // hit the start of the buffer

    const unsigned char byte = *(const unsigned char *)str;
    if ((byte & 0xC0) != 0x80) break; // not a continuation byte: character start

    --str;
  }
  return str;
}
// Classifies the character at cptr for whole-word matching.
// Returns 0 if it is a word character, otherwise the byte length of the
// non-word character: 1 for any non-alphanumeric ASCII byte (including NUL),
// 3 for the UTF-8 apostrophes U+2018/U+2019 (E2 80 98 / E2 80 99).
// Every other byte >= 128 is treated as part of a word.
WDL_HASSTRINGS_EXPORT int hasStrings_isNonWordChar(const char *cptr)
{
  const unsigned char *bytes = (const unsigned char *)cptr;
  const unsigned char lead = bytes[0];

  if (lead >= 128)
  {
    // most UTF-8 characters are word characters, except the curly apostrophes;
    // the && chain stops at the first mismatch so a terminating NUL is never passed
    const int is_curly_apostrophe =
      lead == 0xE2 && bytes[1] == 0x80 && (bytes[2] & ~1) == 0x98;
    return is_curly_apostrophe ? 3 : 0;
  }

  // ASCII: only letters and digits count as word characters
  const int is_lower = lead >= 'a' && lead <= 'z';
  const int is_upper = lead >= 'A' && lead <= 'Z';
  const int is_digit = lead >= '0' && lead <= '9';

  return (is_lower || is_upper || is_digit) ? 0 : 1;
}

#include "utf8_extended.h"

// Compares the text at a against the first n bytes of the search string b.
// Bytes of a may differ from b by ASCII case, may be a two-byte accented Latin
// character whose base letter is the byte in b, may be a UTF-8 curly apostrophe
// where b has ', and may contain skippable sequences that b does not have.
// returns negative if does not match but more of a is available to search
// returns 0 if done searching without match
// returns >0 if matches (return bytelen of match)
// note that this assumes that b was preprocessed by WDL_makeSearchFilter and that strlen(b) >= n
WDL_HASSTRINGS_EXPORT int hasStrings_utf8cmp(const unsigned char * const a, const unsigned char *b, unsigned int n)
{
  int aidx=0; // bytes of a consumed so far (becomes the match length on success)
  while (n)
  {
    int ca=a[aidx], cb=b[0];
    // ca may be any character (including A-Z), utf8, cb will never be A-Z or NUL
    WDL_ASSERT(cb != 0 && !(cb >= 'A' && cb <= 'Z'));
    cb -= ca; // from here on cb is the difference b[0]-ca, 0 meaning an exact byte match
    // if ca is A, and cb is a, cb will be 'a'-'A'
    if (cb)
    {
      if (cb != 'a'-'A')
      {
        if (ca < 0xc3 || ca > 0xc5)
        {
          // a has U+2018/U+2019 (E2 80 98/99) where b has ': consume 3 bytes of a, 1 of b
          if (ca == 0xE2 && cb == ('\''-0xE2) && a[aidx+1] == 0x80 && (a[aidx+2]&~1) == 0x98)
          {
            aidx+=3;
            ++b;
            --n;
            continue;
          }
          // skippable sequence in a: step over it without consuming anything from b
          const int skipl = WDL_IS_UTF8_SKIPPABLE(ca,a[aidx+1]);
          if (skipl)
          {
            aidx += skipl;
            continue;
          }
          return -ca; // mismatch; this is 0 when ca is the terminating NUL of a
        }

        // ca is 0xc3..0xc5: lead byte of a two-byte sequence, ccf is its second byte
        const int ccf = a[++aidx];
        if (ccf < 0x80) return -ca; // not a continuation byte

        if (ca == 0xc3)
        {
          // latin-1 supplemental
          const int cc = ccf & ~0x20;
          switch (*b)
          {
#define SCAN(ch, CH) case ch: if (!WDL_IS_UTF8_BYTE2_LATIN1S_##CH(cc,ccf)) return -ca; break;
          SCAN('a',A)
          SCAN('c',C)
          SCAN('e',E)
          SCAN('i',I)
          SCAN('n',N)
          SCAN('o',O)
          SCAN('u',U)
          SCAN('y',Y)
          default: return -ca; break;
#undef SCAN
          }
        }
        else
        {
          // latin extended A
          switch (*b)
          {
#define SCAN(ch, CH) case ch: if (!WDL_IS_UTF8_EXT1A_##CH(ca,ccf)) return -ca; break;
          SCAN('a',A)
          SCAN('c',C)
          SCAN('d',D)
          SCAN('e',E)
          SCAN('g',G)
          SCAN('h',H)
          SCAN('i',I)
          SCAN('j',J)
          SCAN('k',K)
          SCAN('l',L)
          SCAN('n',N)
          SCAN('o',O)
          SCAN('r',R)
          SCAN('s',S)
          SCAN('t',T)
          SCAN('u',U)
          SCAN('w',W)
          SCAN('y',Y)
          SCAN('z',Z)
          default: return -ca; break;
#undef SCAN
          }
        }
      }
      // difference is exactly 'a'-'A': only a match if ca really is an uppercase ASCII letter
      else if (ca < 'A' || ca > 'Z') return -ca;
    }
    // matched one unit: advance past the (last) byte of a and one byte of b
    ++aidx;
    ++b;
    --n;
  }
  return aidx;
}

// Finds the next position in p where a match for a search string beginning
// with v could start. v is the first byte of a preprocessed search string.
// Returns a pointer to that position, or NULL if the end of p is reached.
//
//  - v == '\'' : stops at an ASCII apostrophe or at the lead byte of a UTF-8
//                U+2018/U+2019 apostrophe (E2 80 98 / E2 80 99)
//  - other v outside a-z : stops only at an identical byte
//  - v in a-z : stops at the letter in either case, and for letters that have
//                accented forms, at the lead byte of a matching two-byte
//                Latin-1 supplement / Latin Extended-A sequence
static const char *hasStrings_scan_for_char_match(const char *p, char v)
{
  if (v < 'a' || v > 'z')
  {
    if (v == '\'')
    {
      // '\'' is below 'a', so it has to be handled here rather than in the
      // letter switch below (where it could never be reached); otherwise
      // curly apostrophes would never be offered as a match start.
      for (;;)
      {
        const unsigned char c = *(const unsigned char *)p;
        if (!c) return NULL;
        if (c == '\'') return p;
        if (c == 0xE2 && ((const unsigned char*)p)[1] == 0x80 && (((const unsigned char*)p)[2]&~1) == 0x98)
          return p;
        p++;
      }
    }

    // any other non-letter: exact byte search
    for (;;)
    {
      char c = *p;
      if (!c) return NULL;
      if (c == v) return p;
      p++;
    }
  }

  switch (v)
  {
    // letters with both latin-1 supplement and latin extended A variants
#define SCAN(ch, CH) case (ch): for (;;) { \
      unsigned char c = *(const unsigned char *)p; \
      if (!c) return NULL; \
      if ((c|0x20) == (ch)) return p; \
      if (c >= 0xc3) { \
        if (c == 0xc3) { \
          const unsigned char ccf = ((const unsigned char*)p)[1]; \
          const unsigned char cc = ccf & ~0x20; \
          if (WDL_IS_UTF8_BYTE2_LATIN1S_##CH(cc,ccf)) return p; \
        } else { \
          if (WDL_IS_UTF8_EXT1A_##CH(c, ((const unsigned char*)p)[1])) return p; \
        } \
      } \
      p++; \
    }
    SCAN('a',A)
    SCAN('c',C)
    SCAN('e',E)
    SCAN('i',I)
    SCAN('n',N)
    SCAN('o',O)
    SCAN('u',U)
    SCAN('y',Y)
#undef SCAN

    // latin extended A only
#define SCAN(ch, CH) case (ch): for (;;) { \
      unsigned char c = *(const unsigned char *)p; \
      if (!c) return NULL; \
      if ((c|0x20) == (ch)) return p; \
      if (WDL_IS_UTF8_EXT1A_##CH(c, ((const unsigned char*)p)[1])) return p; \
      p++; \
    }

    SCAN('d',D)
    SCAN('g',G)
    SCAN('h',H)
    SCAN('j',J)
    SCAN('k',K)
    SCAN('l',L)
    SCAN('r',R)
    SCAN('s',S)
    SCAN('t',T)
    SCAN('w',W)
    SCAN('z',Z)
#undef SCAN

  }

  // remaining letters have no accented variants: plain case-insensitive search
  for (;;)
  {
    char c = *p;
    if (!c) return NULL;
    if ((c|0x20) == v) return p;
    p++;
  }
}

// Advances cptr past any run of sequences that WDL_IS_UTF8_SKIPPABLE reports
// as skippable (it yields the byte length to skip, or a value <= 0 to stop)
// and returns the first position that is not skippable.
WDL_HASSTRINGS_EXPORT const char *hasStrings_skipSkippable(const char *cptr)
{
  for (;;)
  {
    const int skip = WDL_IS_UTF8_SKIPPABLE(((unsigned char*)cptr)[0],((unsigned char*)cptr)[1]);
    if (skip <= 0) return cptr;
    cptr += skip;
  }
}

// Evaluates the search filter held in lp against a list of strings.
//
// name_list/name_list_size: strings to search; a term matches if it is found
//   in any one of them. A byte of value 1 inside a string acts as a field
//   separator for the ^, $ and whole-word anchors.
// lp: filter tokens, expected to have been prepared by WDL_makeSearchFilter.
//   A NULL lp or a filter with no tokens matches everything.
//
// Unquoted tokens "(" and ")" open/close a scope, "OR" separates alternatives,
// "NOT" inverts the next term or scope and "AND" is ignored. Any other token is
// a search term:
//   ^foo          -- must be at the start of the string or right after \1
//   foo$          -- must be at the end of the string or right before \1
//   " foo"/"foo " -- must be bounded by a non-word character (or start/end/\1)
//   a quoted term containing no space or tab is treated as a whole-word search
//
// Returns true if the filter matches.
WDL_HASSTRINGS_EXPORT bool WDL_hasStringsEx2(const char **name_list, int name_list_size, const LineParser *lp
#ifdef WDL_HASSTRINGS_EXTRA_PARAMETERS
   WDL_HASSTRINGS_EXTRA_PARAMETERS
#endif
    )
{
  if (!lp) return true;
  const int ntok = lp->getnumtokens();
  if (ntok<1) return true;

  char stack_[1024]; // &1=not bit, 0x10 = ignoring subscopes, &2= state when 0x10 set
  int stacktop = 0, stacktop_v;
#define TOP_OF_STACK stacktop_v
#define PUSH_STACK(x) do { if (stacktop < (int)sizeof(stack_) - 1) stack_[stacktop++] = stacktop_v&0xff; stacktop_v = (x); } while(0)
#define POP_STACK() (stacktop_v = stack_[--stacktop])
  TOP_OF_STACK = 0;

  // explicitly signed: plain char may be unsigned, in which case the -1 sentinel
  // would become 255 and the "> 0" tests below would treat it as a match
  signed char matched_local=-1; // -1 = first eval for scope, 0=did not pass scope, 1=OK, 2=ignore rest of scope
  for (int x = 0; x < ntok; x ++)
  {
    const char *n=lp->gettoken_str(x);

    if (n[0] == '(' && !n[1] && !lp->gettoken_quotingchar(x))
    {
      if (!(matched_local&1))
      {
        // enclosing scope is already decided (0 or 2): remember that state and skip the subscope
        TOP_OF_STACK |= matched_local | 0x10;
        matched_local=2; // ignore subscope
      }
      else
      {
        matched_local = -1; // new scope
      }

      PUSH_STACK(0);
    }
    else if (n[0] == ')' && !n[1] && stacktop && !lp->gettoken_quotingchar(x))
    {
      if (POP_STACK()&0x10)
      {
        // restore state
        matched_local = TOP_OF_STACK&2;
      }
      else
      {
        // result of the closed scope, inverted if a NOT preceded its "("
        matched_local = (matched_local != 0 ? 1 : 0) ^ (TOP_OF_STACK&1);
      }
      TOP_OF_STACK = 0;
    }
    else if (n[0] == 'O' && n[1] == 'R' && !n[2] && matched_local != 2 && !lp->gettoken_quotingchar(x))
    {
      // left side matched: ignore the rest of the scope; otherwise start over
      matched_local = (matched_local > 0) ? 2 : -1;
      TOP_OF_STACK = 0;
    }
    else if (matched_local&1) // matches 1, -1
    {
      int ln = (int)strlen(n);
      if (ln>0)
      {
        // ^foo -- string starts (or follows \1 separator with) foo
        // foo$ -- string ends with foo (or is immediately followed by \1 separator)
        // " foo ", "foo ", " foo" include end of string/start of string has whitespace
        int wc_left = 0; // 1=require \1 or start of string, 2=require space or \1 or start
        int wc_right = 0; // 1=require \1 or \0, 2 = require space or \1 or \0
        // perhaps wc_left/wc_right of 2 should also match non-alnum characters in addition to space?
        if (ln>1)
        {
          switch (*n)
          {
            case ' ':
              if (*++n != ' ') wc_left=2;
              // else { multiple characters of whitespace = literal whitespace search (two spaces requires a single space, etc) }

              ln--;
            break;
            case '^':
              ln--;
              n++;
              wc_left=1;
            break;
            // upper case being here implies it is almost certainly NOT/AND due to postprocessing in WDL_makeSearchFilter
            case 'N':
              if (WDL_NORMALLY(!strcmp(n,"NOT") && !lp->gettoken_quotingchar(x)))
              {
                TOP_OF_STACK^=1;
                continue;
              }
            break;
            case 'A':
              if (WDL_NORMALLY(!strcmp(n,"AND") && !lp->gettoken_quotingchar(x)))
              {
                // ignore unquoted uppercase AND
                continue;
              }
            break;
          }
        }
        if (ln>1)
        {
          switch (n[ln-1])
          {
            case ' ':
              if (n[--ln - 1] != ' ') wc_right=2;
              // else { multiple characters of whitespace = literal whitespace search (two spaces requires a single space, etc) }
            break;
            case '$':
              ln--;
              wc_right++;
            break;
          }
        }

        if (!wc_left && !wc_right && *n)
        {
          switch (lp->gettoken_quotingchar(x))
          {
            case '\'':
            case '"':
              { // if a quoted string has no whitespace in it, treat as whole word search
                const char *p = n;
                while (*p && *p != ' ' && *p != '\t') p++;
                if (!*p)
                {
                  wc_left=wc_right=2;
                }
              }
            break;
          }
        }

        bool matched = false;

#ifdef WDL_HASSTRINGS_PRE_MATCH
        if (!wc_left && !wc_right && WDL_HASSTRINGS_PRE_MATCH(n))
          matched = true;
        else
#endif
        for (int i = 0; i < name_list_size; i ++)
        {
          const char *name = name_list[i];
          const char *t = name;

          // true if the text following a match of SZ bytes at t satisfies wc_right:
          // no requirement, or followed by \0 or \1, or (wc_right>1) by a non-word
          // character once skippable sequences are stepped over
#define MATCH_RIGHT_CHECK_WORD(SZ) \
                (wc_right == 0 || \
                  ((const unsigned char*)(t))[SZ] < 2 || \
                  (wc_right > 1 && hasStrings_isNonWordChar(hasStrings_skipSkippable((t)+(SZ)))) \
                )

          {
            const char n0 = n[0];
            if (wc_left>0)
            {
              for (;;)
              {
                t = hasStrings_scan_for_char_match(t,n0);
                if (!t) break;
                // only try candidates at the start, after \1, or (wc_left>1) after a non-word character
                if (t==name || t[-1] == 1 || (wc_left>1 && hasStrings_isNonWordChar(hasStrings_rewutf8(t-1,name))))
                {
                  const int v = hasStrings_utf8cmp((const unsigned char *)t,(const unsigned char *)n,ln);
                  if (v>=0)
                  {
                    if (!v) break; // ran out of text, nothing more to search in this string
                    if (MATCH_RIGHT_CHECK_WORD(v)) { matched = true; break; }
                  }
                }
                t++;
              }
            }
            else
            {
              for (;;)
              {
                t = hasStrings_scan_for_char_match(t,n0);
                if (!t) break;
                const int v = hasStrings_utf8cmp((const unsigned char *)t,(const unsigned char *)n,ln);
                if (v>=0)
                {
                  if (!v) break; // ran out of text, nothing more to search in this string
                  if (MATCH_RIGHT_CHECK_WORD(v)) { matched = true; break; }
                }
                t++;
              }
            }
          }
#undef MATCH_RIGHT_CHECK_WORD
          if (matched) break;
        }

        // apply a pending NOT to this term's result, then clear it
        matched_local = (matched?1:0) ^ (TOP_OF_STACK&1);
        TOP_OF_STACK=0;
      }
    }
  }
  // close any scopes that were left open at the end of the filter
  while (stacktop > 0)
  {
    if (POP_STACK() & 0x10) matched_local=TOP_OF_STACK&2;
    else matched_local = (matched_local > 0 ? 1 : 0) ^ (TOP_OF_STACK&1);
  }

  return matched_local!=0;
#undef TOP_OF_STACK
#undef POP_STACK
#undef PUSH_STACK
}

#ifndef WDL_HASSTRINGS_EXTRA_PARAMETERS
// Single-string convenience wrapper: evaluates the filter in lp against name
// by calling WDL_hasStringsEx2 with a one-element list. Only available when
// WDL_HASSTRINGS_EXTRA_PARAMETERS is not defined.
WDL_HASSTRINGS_EXPORT bool WDL_hasStringsEx(const char *name, const LineParser *lp)
{
  return WDL_hasStringsEx2(&name,1,lp);
}

// Evaluates the filter in lp against the single string name. The body is
// identical to WDL_hasStringsEx: both forward to WDL_hasStringsEx2 with a
// one-element list.
WDL_HASSTRINGS_EXPORT bool WDL_hasStrings(const char *name, const LineParser *lp)
{
  return WDL_hasStringsEx2(&name,1,lp);
}
#endif

// Normalizes one search term from src into wr so it can be compared by
// hasStrings_utf8cmp:
//   - ASCII A-Z is lowercased
//   - two-byte 0xC3 sequences recognized by the WDL_IS_UTF8_BYTE2_LATIN1S_*
//     macros are replaced by their plain letter (a,c,e,i,n,o,u,y)
//   - U+2018/U+2019 (E2 80 98/99) become an ASCII apostrophe
//   - sequences reported by WDL_IS_UTF8_SKIPPABLE are dropped
// Everything else is copied through unchanged. The output is never longer
// than the input and each byte is read before its slot is written, which is
// what lets WDL_makeSearchFilter call this with wr == src.
// Returns a pointer to the terminating NUL written to wr.
WDL_HASSTRINGS_EXPORT char *WDL_hasstrings_preproc_searchitem(char *wr, const char *src)
{
  const unsigned char *rd = (const unsigned char *)src;

  for (;;)
  {
    unsigned char ch = *rd;
    if (!ch) break;
    rd++; // rd now points at the byte following ch

    if (ch == 0xC3)
    {
      const unsigned char second = *rd;
      const unsigned char second_folded = second & ~0x20;
      unsigned char plain = 0;

      if (WDL_IS_UTF8_BYTE2_LATIN1S_A(second_folded,second)) plain = 'a';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_C(second_folded,second)) plain = 'c';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_E(second_folded,second)) plain = 'e';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_I(second_folded,second)) plain = 'i';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_N(second_folded,second)) plain = 'n';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_O(second_folded,second)) plain = 'o';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_U(second_folded,second)) plain = 'u';
      else if (WDL_IS_UTF8_BYTE2_LATIN1S_Y(second_folded,second)) plain = 'y';

      if (plain)
      {
        // recognized: emit the plain letter and swallow the second byte
        ch = plain;
        rd++;
      }
    }
    else if (ch == 0xE2)
    {
      // convert u+2018/2019 to '
      if (rd[0] == 0x80 && (rd[1]&~1) == 0x98)
      {
        ch = '\'';
        rd += 2;
      }
    }
    else if (ch >= 'A' && ch <= 'Z')
    {
      ch += 'a'-'A';
    }
    else
    {
      const int skipl = WDL_IS_UTF8_SKIPPABLE(ch, *rd);
      if (skipl > 0)
      {
        // first byte was already consumed above; drop the remainder and emit nothing
        rd += skipl-1;
        continue;
      }
    }

    // we could also convert latin extended A characters to ascii here, but meh
    *wr++ = ch;
  }

  *wr=0;
  return wr;
}

// Parses the filter text flt into lp and normalizes each token in place for
// use with WDL_hasStringsEx2.
// A NULL flt is treated as an empty filter. If parsing fails and flt is not
// empty, the whole text is used as a single token. Unquoted NOT / AND / OR
// tokens are left untouched; every other token is run through
// WDL_hasstrings_preproc_searchitem.
// Returns false if lp is NULL, otherwise whether lp ended up with any tokens.
WDL_HASSTRINGS_EXPORT bool WDL_makeSearchFilter(const char *flt, LineParser *lp)
{
  if (WDL_NOT_NORMALLY(!lp)) return false;

  if (WDL_NOT_NORMALLY(!flt)) flt="";

#ifdef WDL_LINEPARSER_HAS_LINEPARSERINT
  const bool parse_failed = lp->parse_ex(flt,true,false,true) ? true : false; // allow unterminated quotes
#else
  const bool parse_failed = lp->parse_ex(flt,true,false) ? true : false;
#endif

  if (parse_failed && *flt)
  {
    lp->set_one_token(flt); // failed parsing search string, search as a single token
  }

  for (int idx = 0; idx < lp->getnumtokens(); idx ++)
  {
    char *tok = (char *)lp->gettoken_str(idx);

    // unquoted uppercase NOT/AND/OR are operators and must stay as they are
    const bool is_operator = !lp->gettoken_quotingchar(idx) &&
      (!strcmp(tok,"NOT") || !strcmp(tok,"AND") || !strcmp(tok,"OR"));
    if (is_operator) continue;

    WDL_hasstrings_preproc_searchitem(tok, tok);
  }

  return lp->getnumtokens()>0;
}

#endif