// Files
// tlib/oversampling/WDL/has_strings.h
// 2024-05-24 13:28:31 +02:00
//
// 524 lines
// 14 KiB
// C

#ifndef _WDL_HASSTRINGS_H_
#define _WDL_HASSTRINGS_H_
#ifndef WDL_HASSTRINGS_EXPORT
#define WDL_HASSTRINGS_EXPORT
#endif
// Steps str backwards over UTF-8 continuation bytes (bit pattern 10xxxxxx) until it
// points at a byte that is not a continuation byte, or until it reaches base.
// Never moves before base; returns the resulting pointer.
WDL_HASSTRINGS_EXPORT const char *hasStrings_rewutf8(const char *str, const char *base)
{
  for (;;)
  {
    if (str <= base) break; // cannot rewind any further
    if ((*(const unsigned char *)str & 0xC0) != 0x80) break; // not a continuation byte
    --str;
  }
  return str;
}
// Classifies the character starting at cptr for whole-word matching.
// Returns 0 if it is a word character, otherwise the byte length of the
// non-word character (1 for ASCII, 3 for the UTF-8 quotes U+2018/U+2019).
// Non-alphanumeric ASCII (including NUL) is treated like whitespace so that
// searches of the form " foo " work; all other UTF-8 is treated as word characters.
WDL_HASSTRINGS_EXPORT int hasStrings_isNonWordChar(const char *cptr)
{
  const unsigned char *bytes = (const unsigned char *)cptr;
  const unsigned char lead = bytes[0];

  if (lead >= 128)
  {
    // most UTF-8 characters are word characters, except E2 80 98 / E2 80 99
    if (lead == 0xE2 && bytes[1] == 0x80 && (bytes[2] & ~1) == 0x98) return 3;
    return 0;
  }

  const int is_lower = lead >= 'a' && lead <= 'z';
  const int is_upper = lead >= 'A' && lead <= 'Z';
  const int is_digit = lead >= '0' && lead <= '9';

  // non-alnum ascii are non-word chars
  return (is_lower || is_upper || is_digit) ? 0 : 1;
}
#include "utf8_extended.h"
// Compares the search term b (n bytes) against the text starting at a.
//  - ASCII A-Z in a matches the corresponding lowercase letter in b
//  - two-byte sequences in a with lead byte 0xC3..0xC5 match an ASCII letter in b
//    when the WDL_IS_UTF8_BYTE2_LATIN1S_* / WDL_IS_UTF8_EXT1A_* macros say so
//  - the UTF-8 quotes E2 80 98 / E2 80 99 in a match an ASCII apostrophe in b
//  - sequences for which WDL_IS_UTF8_SKIPPABLE is nonzero are stepped over in a
//
// returns negative if does not match but more of a is available to search
// returns 0 if done searching without match (the mismatching byte of a was NUL)
// returns >0 if matches (return bytelen of match, measured in a)
// note that this assumes that b was preprocessed by WDL_makeSearchFilter and that strlen(b) >= n
WDL_HASSTRINGS_EXPORT int hasStrings_utf8cmp(const unsigned char * const a, const unsigned char *b, unsigned int n)
{
  int apos = 0; // read offset into a
  while (n)
  {
    const int hay = a[apos];
    const int want = b[0];
    // hay may be any character (including A-Z), utf8; want will never be A-Z or NUL
    WDL_ASSERT(want != 0 && !(want >= 'A' && want <= 'Z'));
    const int diff = want - hay;

    if (diff == 'a'-'A')
    {
      // the bytes differ by exactly the ASCII case offset: only a match if hay is A-Z
      if (hay < 'A' || hay > 'Z') return -hay;
    }
    else if (diff != 0)
    {
      if (hay < 0xc3 || hay > 0xc5)
      {
        // UTF-8 quote in a versus ASCII apostrophe in b
        if (hay == 0xE2 && want == '\'' && a[apos+1] == 0x80 && (a[apos+2]&~1) == 0x98)
        {
          apos += 3;
          ++b;
          --n;
          continue;
        }

        // skippable sequence in a: step over it and retry the same byte of b
        const int skip_len = WDL_IS_UTF8_SKIPPABLE(hay,a[apos+1]);
        if (skip_len)
        {
          apos += skip_len;
          continue;
        }

        return -hay;
      }

      // lead byte 0xC3..0xC5: inspect the second byte of the sequence
      const int trail = a[++apos];
      if (trail < 0x80) return -hay;

      if (hay == 0xc3)
      {
        // latin-1 supplemental
        const int trail_nocase = trail & ~0x20;
        switch (*b)
        {
          #define HS_L1_CASE(ch, CH) case ch: if (!WDL_IS_UTF8_BYTE2_LATIN1S_##CH(trail_nocase,trail)) return -hay; break;
          HS_L1_CASE('a',A)
          HS_L1_CASE('c',C)
          HS_L1_CASE('e',E)
          HS_L1_CASE('i',I)
          HS_L1_CASE('n',N)
          HS_L1_CASE('o',O)
          HS_L1_CASE('u',U)
          HS_L1_CASE('y',Y)
          #undef HS_L1_CASE
          default: return -hay;
        }
      }
      else
      {
        // latin extended A
        switch (*b)
        {
          #define HS_EXTA_CASE(ch, CH) case ch: if (!WDL_IS_UTF8_EXT1A_##CH(hay,trail)) return -hay; break;
          HS_EXTA_CASE('a',A)
          HS_EXTA_CASE('c',C)
          HS_EXTA_CASE('d',D)
          HS_EXTA_CASE('e',E)
          HS_EXTA_CASE('g',G)
          HS_EXTA_CASE('h',H)
          HS_EXTA_CASE('i',I)
          HS_EXTA_CASE('j',J)
          HS_EXTA_CASE('k',K)
          HS_EXTA_CASE('l',L)
          HS_EXTA_CASE('n',N)
          HS_EXTA_CASE('o',O)
          HS_EXTA_CASE('r',R)
          HS_EXTA_CASE('s',S)
          HS_EXTA_CASE('t',T)
          HS_EXTA_CASE('u',U)
          HS_EXTA_CASE('w',W)
          HS_EXTA_CASE('y',Y)
          HS_EXTA_CASE('z',Z)
          #undef HS_EXTA_CASE
          default: return -hay;
        }
      }
    }

    // current byte (or two-byte sequence) matched: advance both sides
    ++apos;
    ++b;
    --n;
  }
  return apos;
}
// Finds the first position in the NUL-terminated string p at which a match for a
// search term beginning with v could start. Returns a pointer to that position,
// or NULL if there is none.
//
//  - v == '\'' : matches an ASCII apostrophe or the UTF-8 quotes E2 80 98 / E2 80 99
//  - v in a-z  : matches the letter in either case; for letters that have a SCAN case
//                below, also matches two-byte sequences accepted by the
//                WDL_IS_UTF8_BYTE2_LATIN1S_* / WDL_IS_UTF8_EXT1A_* macros
//  - otherwise : exact byte match
static const char *hasStrings_scan_for_char_match(const char *p, char v)
{
  // The apostrophe must be handled before the generic non-letter path: '\'' is below
  // 'a', so the exact-match loop would otherwise take it and UTF-8 quotes in the text
  // could never start a match.
  if (v == '\'')
  {
    for (;;)
    {
      const unsigned char c = *(const unsigned char *)p;
      if (!c) return NULL;
      if (c == '\'') return p;
      if (c == 0xE2 && ((const unsigned char*)p)[1] == 0x80 && (((const unsigned char*)p)[2]&~1) == 0x98)
        return p;
      p++;
    }
  }

  // anything that is not a lowercase ASCII letter: exact byte match only
  if (v < 'a' || v > 'z')
  {
    for (;;)
    {
      const char c = *p;
      if (!c) return NULL;
      if (c == v) return p;
      p++;
    }
  }

  switch (v)
  {
    // letters with both latin-1 supplemental (lead byte 0xC3) and latin extended A variants
    #define SCAN(ch, CH) case (ch): for (;;) { \
        unsigned char c = *(const unsigned char *)p; \
        if (!c) return NULL; \
        if ((c|0x20) == (ch)) return p; \
        if (c >= 0xc3) { \
          if (c == 0xc3) { \
            const unsigned char ccf = ((const unsigned char*)p)[1]; \
            const unsigned char cc = ccf & ~0x20; \
            if (WDL_IS_UTF8_BYTE2_LATIN1S_##CH(cc,ccf)) return p; \
          } else { \
            if (WDL_IS_UTF8_EXT1A_##CH(c, ((const unsigned char*)p)[1])) return p; \
          } \
        } \
        p++; \
      }
    SCAN('a',A)
    SCAN('c',C)
    SCAN('e',E)
    SCAN('i',I)
    SCAN('n',N)
    SCAN('o',O)
    SCAN('u',U)
    SCAN('y',Y)
    #undef SCAN

    // latin extended A only
    #define SCAN(ch, CH) case (ch): for (;;) { \
        unsigned char c = *(const unsigned char *)p; \
        if (!c) return NULL; \
        if ((c|0x20) == (ch)) return p; \
        if (WDL_IS_UTF8_EXT1A_##CH(c, ((const unsigned char*)p)[1])) return p; \
        p++; \
      }
    SCAN('d',D)
    SCAN('g',G)
    SCAN('h',H)
    SCAN('j',J)
    SCAN('k',K)
    SCAN('l',L)
    SCAN('r',R)
    SCAN('s',S)
    SCAN('t',T)
    SCAN('w',W)
    SCAN('z',Z)
    #undef SCAN
  }

  // remaining lowercase letters (no SCAN case above): ASCII case-insensitive match
  for (;;)
  {
    const char c = *p;
    if (!c) return NULL;
    if ((c|0x20) == v) return p;
    p++;
  }
}
// Advances cptr past every consecutive sequence for which WDL_IS_UTF8_SKIPPABLE
// yields a positive length, and returns the first position that is not skippable.
WDL_HASSTRINGS_EXPORT const char *hasStrings_skipSkippable(const char *cptr)
{
  for (;;)
  {
    const unsigned char *bytes = (const unsigned char *)cptr;
    const int step = WDL_IS_UTF8_SKIPPABLE(bytes[0],bytes[1]);
    if (step <= 0) break;
    cptr += step;
  }
  return cptr;
}
// Evaluates the search filter held in lp against the strings in name_list.
//
// name_list / name_list_size: candidate strings; a search term counts as matched as
//   soon as it is found in any one of them.
// lp: tokenized filter (see the note on hasStrings_utf8cmp: terms are expected to have
//   been preprocessed by WDL_makeSearchFilter).
//
// Returns true if the filter passes. A NULL lp or a filter with no tokens always passes.
//
// Token handling visible in this function:
//   unquoted "(" / ")"  open / close a sub-scope
//   unquoted "OR"       if the scope so far passed, the rest of it is ignored;
//                       otherwise evaluation of the scope starts over
//   unquoted "NOT"      inverts the result of the next term or scope
//   unquoted "AND"      ignored
//   anything else       a search term, with optional ^ / $ / leading or trailing
//                       space anchors (described at the term-handling code below)
// Once a term fails (matched_local == 0), further terms in the same scope are not
// evaluated until an OR or a scope change resets the state.
WDL_HASSTRINGS_EXPORT bool WDL_hasStringsEx2(const char **name_list, int name_list_size, const LineParser *lp
#ifdef WDL_HASSTRINGS_EXTRA_PARAMETERS
WDL_HASSTRINGS_EXTRA_PARAMETERS
#endif
)
{
if (!lp) return true;
const int ntok = lp->getnumtokens();
if (ntok<1) return true;
char stack_[1024]; // &1=not bit, 0x10 = ignoring subscopes, &2= state when 0x10 set
int stacktop = 0, stacktop_v;
// The current top-of-stack value lives in stacktop_v; stack_[] holds only the saved
// values of enclosing scopes.
#define TOP_OF_STACK stacktop_v
// When stack_ is full the save is skipped (nothing is written), but the top value is
// still replaced by x.
#define PUSH_STACK(x) do { if (stacktop < (int)sizeof(stack_) - 1) stack_[stacktop++] = stacktop_v&0xff; stacktop_v = (x); } while(0)
// Callers must ensure stacktop > 0 before popping.
#define POP_STACK() (stacktop_v = stack_[--stacktop])
TOP_OF_STACK = 0;
char matched_local=-1; // -1 = first eval for scope, 0=did not pass scope, 1=OK, 2=ignore rest of scope
for (int x = 0; x < ntok; x ++)
{
const char *n=lp->gettoken_str(x);
// unquoted "(": open a sub-scope
if (n[0] == '(' && !n[1] && !lp->gettoken_quotingchar(x))
{
if (!(matched_local&1))
{
// enclosing scope is already decided (0 or 2): remember that state in the
// saved value and skip everything inside the sub-scope
TOP_OF_STACK |= matched_local | 0x10;
matched_local=2; // ignore subscope
}
else
{
matched_local = -1; // new scope
}
PUSH_STACK(0);
}
// unquoted ")" with an open scope: close it
else if (n[0] == ')' && !n[1] && stacktop && !lp->gettoken_quotingchar(x))
{
if (POP_STACK()&0x10)
{
// restore state
matched_local = TOP_OF_STACK&2;
}
else
{
// scope result (anything other than 0 counts as passed), inverted if a NOT
// preceded the scope
matched_local = (matched_local != 0 ? 1 : 0) ^ (TOP_OF_STACK&1);
}
TOP_OF_STACK = 0;
}
// unquoted "OR" (not while ignoring the rest of the scope)
else if (n[0] == 'O' && n[1] == 'R' && !n[2] && matched_local != 2 && !lp->gettoken_quotingchar(x))
{
// passed so far -> ignore the rest of the scope; otherwise start evaluating afresh
matched_local = (matched_local > 0) ? 2 : -1;
TOP_OF_STACK = 0;
}
else if (matched_local&1) // matches 1, -1
{
int ln = (int)strlen(n);
if (ln>0)
{
// ^foo -- string starts (or follows \1 separator with) foo
// foo$ -- string ends with foo (or is immediately followed by \1 separator)
// " foo ", "foo ", " foo" include end of string/start of string has whitespace
int wc_left = 0; // 1=require \1 or start of string, 2=require space or \1 or start
int wc_right = 0; // 1=require \1 or \0, 2 = require space or \1 or \0
// perhaps wc_left/wc_right of 2 should also match non-alnum characters in addition to space?
if (ln>1)
{
// leading anchor / keyword handling; n and ln are adjusted to exclude the anchor
switch (*n)
{
case ' ':
if (*++n != ' ') wc_left=2;
// else { multiple characters of whitespace = literal whitespace search (two spaces requires a single space, etc) }
ln--;
break;
case '^':
ln--;
n++;
wc_left=1;
break;
// upper case being here implies it is almost certainly NOT/AND due to postprocessing in WDL_makeSearchFilter
case 'N':
if (WDL_NORMALLY(!strcmp(n,"NOT") && !lp->gettoken_quotingchar(x)))
{
// toggle the negation bit for the next term/scope and move to the next token
TOP_OF_STACK^=1;
continue;
}
break;
case 'A':
if (WDL_NORMALLY(!strcmp(n,"AND") && !lp->gettoken_quotingchar(x)))
{
// ignore unquoted uppercase AND
continue;
}
break;
}
}
if (ln>1)
{
// trailing anchor handling; ln is reduced to exclude the anchor
switch (n[ln-1])
{
case ' ':
if (n[--ln - 1] != ' ') wc_right=2;
// else { multiple characters of whitespace = literal whitespace search (two spaces requires a single space, etc) }
break;
case '$':
ln--;
wc_right++;
break;
}
}
if (!wc_left && !wc_right && *n)
{
switch (lp->gettoken_quotingchar(x))
{
case '\'':
case '"':
{ // if a quoted string has no whitespace in it, treat as whole word search
const char *p = n;
while (*p && *p != ' ' && *p != '\t') p++;
if (!*p)
{
wc_left=wc_right=2;
}
}
break;
}
}
bool matched = false;
// optional hook: when defined, an unanchored term accepted by the hook counts as
// matched without scanning name_list
#ifdef WDL_HASSTRINGS_PRE_MATCH
if (!wc_left && !wc_right && WDL_HASSTRINGS_PRE_MATCH(n))
matched = true;
else
#endif
for (int i = 0; i < name_list_size; i ++)
{
const char *name = name_list[i];
const char *t = name;
// true if the text following a match of SZ bytes at t satisfies wc_right:
// no constraint, or next byte is NUL or \1, or (wc_right > 1) the next
// non-skippable character is a non-word character
#define MATCH_RIGHT_CHECK_WORD(SZ) \
(wc_right == 0 || \
((const unsigned char*)(t))[SZ] < 2 || \
(wc_right > 1 && hasStrings_isNonWordChar(hasStrings_skipSkippable((t)+(SZ)))) \
)
// NOTE(review): MATCH_LEFT_SKIP_TO_WORD is defined and #undef'd here but not
// referenced anywhere in between
#define MATCH_LEFT_SKIP_TO_WORD() do { \
if (*(unsigned char*)t < 2) { t++; break; } \
if (wc_left>1) { const int l = hasStrings_isNonWordChar(t); if (l > 0) { t+=l; break; } } \
t++; \
} while (t[0])
{
const char n0 = n[0];
if (wc_left>0)
{
// left-anchored search: a candidate position only counts if it is at the
// start of name, follows a \1 byte, or (wc_left > 1) follows a non-word character
for (;;)
{
t = hasStrings_scan_for_char_match(t,n0);
if (!t) break;
if (t==name || t[-1] == 1 || (wc_left>1 && hasStrings_isNonWordChar(hasStrings_rewutf8(t-1,name))))
{
const int v = hasStrings_utf8cmp((const unsigned char *)t,(const unsigned char *)n,ln);
if (v>=0)
{
// v == 0: comparison ran into the end of name, nothing more to search
if (!v) break;
if (MATCH_RIGHT_CHECK_WORD(v)) { matched = true; break; }
}
}
t++;
}
}
else
{
// unanchored on the left: every candidate position is compared
for (;;)
{
t = hasStrings_scan_for_char_match(t,n0);
if (!t) break;
const int v = hasStrings_utf8cmp((const unsigned char *)t,(const unsigned char *)n,ln);
if (v>=0)
{
// v == 0: comparison ran into the end of name, nothing more to search
if (!v) break;
if (MATCH_RIGHT_CHECK_WORD(v)) { matched = true; break; }
}
t++;
}
}
}
#undef MATCH_RIGHT_CHECK_WORD
#undef MATCH_LEFT_SKIP_TO_WORD
if (matched) break;
}
// record the term result, inverted if a NOT preceded it, and clear the NOT bit
matched_local = (matched?1:0) ^ (TOP_OF_STACK&1);
TOP_OF_STACK=0;
}
}
}
// close any scopes left open at the end of the token list
// NOTE(review): this uses "matched_local > 0" where the explicit ")" handler above
// uses "matched_local != 0", so a scope in which nothing was evaluated (-1) is
// treated differently here — confirm this is intentional
while (stacktop > 0)
{
if (POP_STACK() & 0x10) matched_local=TOP_OF_STACK&2;
else matched_local = (matched_local > 0 ? 1 : 0) ^ (TOP_OF_STACK&1);
}
return matched_local!=0;
#undef TOP_OF_STACK
#undef POP_STACK
#undef PUSH_STACK
}
#ifndef WDL_HASSTRINGS_EXTRA_PARAMETERS
// Single-string convenience form: evaluates the filter lp against name alone by
// forwarding to WDL_hasStringsEx2 with a one-element list.
WDL_HASSTRINGS_EXPORT bool WDL_hasStringsEx(const char *name, const LineParser *lp)
{
  const char *single_entry = name;
  return WDL_hasStringsEx2(&single_entry, 1, lp);
}
// Evaluates the filter lp against the single string name by forwarding to
// WDL_hasStringsEx2 with a one-element list (same behavior as WDL_hasStringsEx).
WDL_HASSTRINGS_EXPORT bool WDL_hasStrings(const char *name, const LineParser *lp)
{
  const char *single_entry = name;
  return WDL_hasStringsEx2(&single_entry, 1, lp);
}
#endif
// Normalizes the search term src into wr and returns a pointer to the terminating
// NUL that was written. The output is never longer than the input, and
// WDL_makeSearchFilter calls this with wr == src (in-place).
//
// Transformations applied:
//  - ASCII A-Z is lowercased
//  - a 0xC3-led two-byte sequence accepted by one of the WDL_IS_UTF8_BYTE2_LATIN1S_*
//    macros is replaced by the corresponding plain ASCII letter
//  - the UTF-8 quotes E2 80 98 / E2 80 99 (u+2018/2019) become an ASCII apostrophe
//  - sequences for which WDL_IS_UTF8_SKIPPABLE is positive are dropped
//  - every other byte is copied unchanged
WDL_HASSTRINGS_EXPORT char *WDL_hasstrings_preproc_searchitem(char *wr, const char *src)
{
  const unsigned char *rd = (const unsigned char *)src;
  for (;;)
  {
    unsigned char c = *rd;
    if (!c) break;
    ++rd; // rd now points at the byte following c

    if (c >= 'A' && c <= 'Z')
    {
      c = (unsigned char)(c + ('a'-'A'));
    }
    else
    {
      switch (c)
      {
        case 0xC3:
        {
          const unsigned char ccf = *rd;
          const unsigned char cc = ccf & ~0x20;
          unsigned char folded = 0; // stays 0 when no macro accepts the sequence
          if (WDL_IS_UTF8_BYTE2_LATIN1S_A(cc,ccf)) folded = 'a';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_C(cc,ccf)) folded = 'c';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_E(cc,ccf)) folded = 'e';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_I(cc,ccf)) folded = 'i';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_N(cc,ccf)) folded = 'n';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_O(cc,ccf)) folded = 'o';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_U(cc,ccf)) folded = 'u';
          else if (WDL_IS_UTF8_BYTE2_LATIN1S_Y(cc,ccf)) folded = 'y';
          if (folded)
          {
            // emit the ASCII letter and consume the second byte of the sequence
            c = folded;
            ++rd;
          }
          break;
        }

        case 0xE2:
          // convert u+2018/2019 to '
          if (rd[0] == 0x80 && (rd[1]&~1) == 0x98)
          {
            c = '\'';
            rd += 2;
          }
          break;

        default:
        {
          const int skip_len = WDL_IS_UTF8_SKIPPABLE(c, *rd);
          if (skip_len > 0)
          {
            // first byte is already consumed; drop the remainder and emit nothing
            rd += skip_len-1;
            continue;
          }
          break;
        }
      }
    }

    // we could also convert latin extended A characters to ascii here, but meh
    *wr++ = c;
  }
  *wr = 0;
  return wr;
}
// Parses the filter string flt into lp and normalizes its tokens for use with
// WDL_hasStringsEx2.
//
// flt: filter text; NULL is treated as "".
// lp:  receives the tokens; must not be NULL (returns false if it is).
//
// If parse_ex() reports failure and flt is non-empty, the whole string is used as a
// single token. Every token is then run through WDL_hasstrings_preproc_searchitem in
// place, except unquoted NOT / AND / OR, which are left untouched.
// Returns true if lp ends up with at least one token.
WDL_HASSTRINGS_EXPORT bool WDL_makeSearchFilter(const char *flt, LineParser *lp)
{
  if (WDL_NOT_NORMALLY(!lp)) return false;
  if (WDL_NOT_NORMALLY(!flt)) flt="";

#ifdef WDL_LINEPARSER_HAS_LINEPARSERINT
  if (lp->parse_ex(flt,true,false,true) && *flt) // allow unterminated quotes
#else
  if (lp->parse_ex(flt,true,false) && *flt)
#endif
  {
    lp->set_one_token(flt); // failed parsing search string, search as a single token
  }

  int idx = 0;
  while (idx < lp->getnumtokens())
  {
    char *tok = (char *)lp->gettoken_str(idx);
    // quoted tokens are always normalized; unquoted ones only if they are not keywords
    if (lp->gettoken_quotingchar(idx) ||
        !(!strcmp(tok,"NOT") || !strcmp(tok,"AND") || !strcmp(tok,"OR")))
    {
      WDL_hasstrings_preproc_searchitem(tok, tok);
    }
    ++idx;
  }

  return lp->getnumtokens()>0;
}
#endif