#ifndef _WDL_HASSTRINGS_H_ #define _WDL_HASSTRINGS_H_ #ifndef WDL_HASSTRINGS_EXPORT #define WDL_HASSTRINGS_EXPORT #endif WDL_HASSTRINGS_EXPORT const char *hasStrings_rewutf8(const char *str, const char *base) { while (str > base && (*(unsigned char *)str & 0xC0) == 0x80) str--; return str; } WDL_HASSTRINGS_EXPORT int hasStrings_isNonWordChar(const char *cptr) { // treat non-alnum non-utf-8 as whitespace when searching for " foo " const unsigned char c = *(const unsigned char *)cptr; if (c < 128) { if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return 0; } return 1; // non-alnum ascii are non-word chars } // most UTF-8 characters are word characters if (c == 0xE2) { // except UTF-8 apostrophes if (((unsigned char*)cptr)[1] == 0x80 && (((unsigned char*)cptr)[2]&~1) == 0x98) return 3; } return 0; } #include "utf8_extended.h" // returns negative if does not match but more of a is available to search // returns 0 if done searching without match // returns >0 if matches (return bytelen of match) // note that this assumes that b was preprocessed by WDL_makeSearchFilter and that strlen(b) >= n WDL_HASSTRINGS_EXPORT int hasStrings_utf8cmp(const unsigned char * const a, const unsigned char *b, unsigned int n) { int aidx=0; while (n) { int ca=a[aidx], cb=b[0]; // ca may be any character (including A-Z), utf8, cb will never be A-Z or NUL WDL_ASSERT(cb != 0 && !(cb >= 'A' && cb <= 'Z')); cb -= ca; // if ca is A, and cb is a, cb will be 'a'-'A' if (cb) { if (cb != 'a'-'A') { if (ca < 0xc3 || ca > 0xc5) { if (ca == 0xE2 && cb == ('\''-0xE2) && a[aidx+1] == 0x80 && (a[aidx+2]&~1) == 0x98) { aidx+=3; ++b; --n; continue; } const int skipl = WDL_IS_UTF8_SKIPPABLE(ca,a[aidx+1]); if (skipl) { aidx += skipl; continue; } return -ca; } const int ccf = a[++aidx]; if (ccf < 0x80) return -ca; if (ca == 0xc3) { // latin-1 supplemental const int cc = ccf & ~0x20; switch (*b) { #define SCAN(ch, CH) case ch: if (!WDL_IS_UTF8_BYTE2_LATIN1S_##CH(cc,ccf)) return -ca; break; SCAN('a',A) SCAN('c',C) SCAN('e',E) SCAN('i',I) SCAN('n',N) SCAN('o',O) SCAN('u',U) SCAN('y',Y) default: return -ca; break; #undef SCAN } } else { // latin extended A switch (*b) { #define SCAN(ch, CH) case ch: if (!WDL_IS_UTF8_EXT1A_##CH(ca,ccf)) return -ca; break; SCAN('a',A) SCAN('c',C) SCAN('d',D) SCAN('e',E) SCAN('g',G) SCAN('h',H) SCAN('i',I) SCAN('j',J) SCAN('k',K) SCAN('l',L) SCAN('n',N) SCAN('o',O) SCAN('r',R) SCAN('s',S) SCAN('t',T) SCAN('u',U) SCAN('w',W) SCAN('y',Y) SCAN('z',Z) default: return -ca; break; #undef SCAN } } } else if (ca < 'A' || ca > 'Z') return -ca; } ++aidx; ++b; --n; } return aidx; } static const char *hasStrings_scan_for_char_match(const char *p, char v) { if (v < 'a' || v > 'z') for (;;) { char c = *p; if (!c) return NULL; if (c == v) return p; p++; } switch (v) { case '\'': for (;;) { unsigned char c = *(const unsigned char *)p; if (!c) return NULL; if (c == '\'') return p; if (c == 0xE2 && ((unsigned char*)p)[1] == 0x80 && (((unsigned char*)p)[2]&~1) == 0x98) return p; p++; } #define SCAN(ch, CH) case (ch): for (;;) { \ unsigned char c = *(const unsigned char *)p; \ if (!c) return NULL; \ if ((c|0x20) == (ch)) return p; \ if (c >= 0xc3) { \ if (c == 0xc3) { \ const unsigned char ccf = ((const unsigned char*)p)[1]; \ const unsigned char cc = ccf & ~0x20; \ if (WDL_IS_UTF8_BYTE2_LATIN1S_##CH(cc,ccf)) return p; \ } else { \ if (WDL_IS_UTF8_EXT1A_##CH(c, ((const unsigned char*)p)[1])) return p; \ } \ } \ p++; \ } SCAN('a',A) SCAN('c',C) SCAN('e',E) SCAN('i',I) SCAN('n',N) SCAN('o',O) SCAN('u',U) SCAN('y',Y) #undef SCAN // latin extended A only #define SCAN(ch, CH) case (ch): for (;;) { \ unsigned char c = *(const unsigned char *)p; \ if (!c) return NULL; \ if ((c|0x20) == (ch)) return p; \ if (WDL_IS_UTF8_EXT1A_##CH(c, ((const unsigned char*)p)[1])) return p; \ p++; \ } SCAN('d',D) SCAN('g',G) SCAN('h',H) SCAN('j',J) SCAN('k',K) SCAN('l',L) SCAN('r',R) SCAN('s',S) SCAN('t',T) SCAN('w',W) SCAN('z',Z) #undef SCAN } for (;;) { char c = *p; if (!c) return NULL; if ((c|0x20) == v) return p; p++; } } WDL_HASSTRINGS_EXPORT const char *hasStrings_skipSkippable(const char *cptr) { int skip; while ((skip=WDL_IS_UTF8_SKIPPABLE(((unsigned char*)cptr)[0],((unsigned char*)cptr)[1]))>0) cptr+=skip; return cptr; } WDL_HASSTRINGS_EXPORT bool WDL_hasStringsEx2(const char **name_list, int name_list_size, const LineParser *lp #ifdef WDL_HASSTRINGS_EXTRA_PARAMETERS WDL_HASSTRINGS_EXTRA_PARAMETERS #endif ) { if (!lp) return true; const int ntok = lp->getnumtokens(); if (ntok<1) return true; char stack_[1024]; // &1=not bit, 0x10 = ignoring subscopes, &2= state when 0x10 set int stacktop = 0, stacktop_v; #define TOP_OF_STACK stacktop_v #define PUSH_STACK(x) do { if (stacktop < (int)sizeof(stack_) - 1) stack_[stacktop++] = stacktop_v&0xff; stacktop_v = (x); } while(0) #define POP_STACK() (stacktop_v = stack_[--stacktop]) TOP_OF_STACK = 0; char matched_local=-1; // -1 = first eval for scope, 0=did not pass scope, 1=OK, 2=ignore rest of scope for (int x = 0; x < ntok; x ++) { const char *n=lp->gettoken_str(x); if (n[0] == '(' && !n[1] && !lp->gettoken_quotingchar(x)) { if (!(matched_local&1)) { TOP_OF_STACK |= matched_local | 0x10; matched_local=2; // ignore subscope } else { matched_local = -1; // new scope } PUSH_STACK(0); } else if (n[0] == ')' && !n[1] && stacktop && !lp->gettoken_quotingchar(x)) { if (POP_STACK()&0x10) { // restore state matched_local = TOP_OF_STACK&2; } else { matched_local = (matched_local != 0 ? 1 : 0) ^ (TOP_OF_STACK&1); } TOP_OF_STACK = 0; } else if (n[0] == 'O' && n[1] == 'R' && !n[2] && matched_local != 2 && !lp->gettoken_quotingchar(x)) { matched_local = (matched_local > 0) ? 2 : -1; TOP_OF_STACK = 0; } else if (matched_local&1) // matches 1, -1 { int ln = (int)strlen(n); if (ln>0) { // ^foo -- string starts (or follows \1 separator with) foo // foo$ -- string ends with foo (or is immediately followed by \1 separator) // " foo ", "foo ", " foo" include end of string/start of string has whitespace int wc_left = 0; // 1=require \1 or start of string, 2=require space or \1 or start int wc_right = 0; // 1=require \1 or \0, 2 = require space or \1 or \0 // perhaps wc_left/wc_right of 2 should also match non-alnum characters in addition to space? if (ln>1) { switch (*n) { case ' ': if (*++n != ' ') wc_left=2; // else { multiple characters of whitespace = literal whitespace search (two spaces requires a single space, etc) } ln--; break; case '^': ln--; n++; wc_left=1; break; // upper case being here implies it is almost certainly NOT/AND due to postprocessing in WDL_makeSearchFilter case 'N': if (WDL_NORMALLY(!strcmp(n,"NOT") && !lp->gettoken_quotingchar(x))) { TOP_OF_STACK^=1; continue; } break; case 'A': if (WDL_NORMALLY(!strcmp(n,"AND") && !lp->gettoken_quotingchar(x))) { // ignore unquoted uppercase AND continue; } break; } } if (ln>1) { switch (n[ln-1]) { case ' ': if (n[--ln - 1] != ' ') wc_right=2; // else { multiple characters of whitespace = literal whitespace search (two spaces requires a single space, etc) } break; case '$': ln--; wc_right++; break; } } if (!wc_left && !wc_right && *n) { switch (lp->gettoken_quotingchar(x)) { case '\'': case '"': { // if a quoted string has no whitespace in it, treat as whole word search const char *p = n; while (*p && *p != ' ' && *p != '\t') p++; if (!*p) { wc_left=wc_right=2; } } break; } } bool matched = false; #ifdef WDL_HASSTRINGS_PRE_MATCH if (!wc_left && !wc_right && WDL_HASSTRINGS_PRE_MATCH(n)) matched = true; else #endif for (int i = 0; i < name_list_size; i ++) { const char *name = name_list[i]; const char *t = name; #define MATCH_RIGHT_CHECK_WORD(SZ) \ (wc_right == 0 || \ ((const unsigned char*)(t))[SZ] < 2 || \ (wc_right > 1 && hasStrings_isNonWordChar(hasStrings_skipSkippable((t)+(SZ)))) \ ) #define MATCH_LEFT_SKIP_TO_WORD() do { \ if (*(unsigned char*)t < 2) { t++; break; } \ if (wc_left>1) { const int l = hasStrings_isNonWordChar(t); if (l > 0) { t+=l; break; } } \ t++; \ } while (t[0]) { const char n0 = n[0]; if (wc_left>0) { for (;;) { t = hasStrings_scan_for_char_match(t,n0); if (!t) break; if (t==name || t[-1] == 1 || (wc_left>1 && hasStrings_isNonWordChar(hasStrings_rewutf8(t-1,name)))) { const int v = hasStrings_utf8cmp((const unsigned char *)t,(const unsigned char *)n,ln); if (v>=0) { if (!v) break; if (MATCH_RIGHT_CHECK_WORD(v)) { matched = true; break; } } } t++; } } else { for (;;) { t = hasStrings_scan_for_char_match(t,n0); if (!t) break; const int v = hasStrings_utf8cmp((const unsigned char *)t,(const unsigned char *)n,ln); if (v>=0) { if (!v) break; if (MATCH_RIGHT_CHECK_WORD(v)) { matched = true; break; } } t++; } } } #undef MATCH_RIGHT_CHECK_WORD #undef MATCH_LEFT_SKIP_TO_WORD if (matched) break; } matched_local = (matched?1:0) ^ (TOP_OF_STACK&1); TOP_OF_STACK=0; } } } while (stacktop > 0) { if (POP_STACK() & 0x10) matched_local=TOP_OF_STACK&2; else matched_local = (matched_local > 0 ? 1 : 0) ^ (TOP_OF_STACK&1); } return matched_local!=0; #undef TOP_OF_STACK #undef POP_STACK #undef PUSH_STACK } #ifndef WDL_HASSTRINGS_EXTRA_PARAMETERS WDL_HASSTRINGS_EXPORT bool WDL_hasStringsEx(const char *name, const LineParser *lp) { return WDL_hasStringsEx2(&name,1,lp); } WDL_HASSTRINGS_EXPORT bool WDL_hasStrings(const char *name, const LineParser *lp) { return WDL_hasStringsEx2(&name,1,lp); } #endif WDL_HASSTRINGS_EXPORT char *WDL_hasstrings_preproc_searchitem(char *wr, const char *src) { while (*src) { unsigned char c = *(unsigned char*)src++; if (c >= 'A' && c <= 'Z') c+='a'-'A'; else if (c == 0xC3) { const unsigned char ccf = *(unsigned char*)src; const unsigned char cc = ccf & ~0x20; if (WDL_IS_UTF8_BYTE2_LATIN1S_A(cc,ccf)) c = 'a'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_C(cc,ccf)) c = 'c'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_E(cc,ccf)) c = 'e'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_I(cc,ccf)) c = 'i'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_N(cc,ccf)) c = 'n'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_O(cc,ccf)) c = 'o'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_U(cc,ccf)) c = 'u'; else if (WDL_IS_UTF8_BYTE2_LATIN1S_Y(cc,ccf)) c = 'y'; if (c != 0xC3) src++; } else if (c == 0xE2) { // convert u+2018/2019 to ' if (*(unsigned char*)src == 0x80 && (((unsigned char*)src)[1]&~1) == 0x98) { c = '\''; src+=2; } } else { const int skipl = WDL_IS_UTF8_SKIPPABLE(c, *(unsigned char*)src); if (skipl > 0) { src += skipl-1; continue; } } // we could also convert latin extended A characters to ascii here, but meh *wr++ = c; } *wr=0; return wr; } WDL_HASSTRINGS_EXPORT bool WDL_makeSearchFilter(const char *flt, LineParser *lp) { if (WDL_NOT_NORMALLY(!lp)) return false; if (WDL_NOT_NORMALLY(!flt)) flt=""; #ifdef WDL_LINEPARSER_HAS_LINEPARSERINT if (lp->parse_ex(flt,true,false,true)) // allow unterminated quotes #else if (lp->parse_ex(flt,true,false)) #endif { if (*flt) lp->set_one_token(flt); // failed parsing search string, search as a single token } for (int x = 0; x < lp->getnumtokens(); x ++) { char *p = (char *)lp->gettoken_str(x); if (lp->gettoken_quotingchar(x) || (strcmp(p,"NOT") && strcmp(p,"AND") && strcmp(p,"OR"))) { WDL_hasstrings_preproc_searchitem(p, p); } } return lp->getnumtokens()>0; } #endif