Atrinik Server 2.5
server/re-cmp.c
Go to the documentation of this file.
00001 /************************************************************************
00002 *            Atrinik, a Multiplayer Online Role Playing Game            *
00003 *                                                                       *
00004 *    Copyright (C) 2009-2011 Alex Tokar and Atrinik Development Team    *
00005 *                                                                       *
00006 * Fork from Daimonin (Massive Multiplayer Online Role Playing Game)     *
00007 * and Crossfire (Multiplayer game for X-windows).                       *
00008 *                                                                       *
00009 * This program is free software; you can redistribute it and/or modify  *
00010 * it under the terms of the GNU General Public License as published by  *
00011 * the Free Software Foundation; either version 2 of the License, or     *
00012 * (at your option) any later version.                                   *
00013 *                                                                       *
00014 * This program is distributed in the hope that it will be useful,       *
00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
00017 * GNU General Public License for more details.                          *
00018 *                                                                       *
00019 * You should have received a copy of the GNU General Public License     *
00020 * along with this program; if not, write to the Free Software           *
00021 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.             *
00022 *                                                                       *
00023 * The author can be reached at admin@atrinik.org                        *
00024 ************************************************************************/
00025 
00041 #include <stdio.h>
00042 #include <stdlib.h>
00043 #include <memory.h>
00044 #include <limits.h>
00045 #include <re-cmp.h>
00046 #include <ctype.h>
00047 
00048 const char *re_cmp(const char *, const char *);
00049 static int re_cmp_step(const char *, const char *, int, int);
00050 static void re_init(void);
00051 static int re_match_token(unsigned char, selection *);
00052 static const char *re_get_token(selection *, const char *);
00053 #ifdef DEBUG2
00054 static void re_dump_sel(selection *);
00055 #endif
00056 
00057 static int re_init_done = 0;
00058 static selection *re_token[RE_TOKEN_MAX];
00059 static const char *re_substr[RE_TOKEN_MAX];
00060 static unsigned int re_token_depth;
00061 
00069 const char *re_cmp(const char *str, const char *regexp)
00070 {
00071     const char *next_regexp;
00072     int once = 0;
00073     int matched = 0;
00074 
00075     if (re_init_done == 0)
00076     {
00077         re_init();
00078     }
00079 
00080 #ifdef SAFE_CHECKS
00081     if (regexp == NULL || str == NULL)
00082     {
00083         return NULL;
00084     }
00085 #endif
00086 
00087     if (*regexp == '^')
00088     {
00089         once = 1;
00090         ++regexp;
00091     }
00092 
00093     if (*regexp == '\0')
00094     {
00095         /* // or /^/ matches any string */
00096         return str;
00097     }
00098 
00099     next_regexp = re_get_token(re_token[0], regexp);
00100     re_token_depth = 0;
00101     re_substr[0] = next_regexp;
00102 
00103     while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
00104     {
00105         /* If '^' was present and we didn't get a match above, return
00106          * NULL now so things like '^hi$' won't match on 'thi'. */
00107         if (once)
00108         {
00109             return NULL;
00110         }
00111 
00112         str++;
00113     }
00114 
00115     if (matched && *next_regexp == 0)
00116     {
00117         return str;
00118     }
00119 
00120     /* Apologies for the nearly duplicated code below, hopefully it
00121      * speeds things up. */
00122     if (once)
00123     {
00124         switch (re_token[0]->repeat)
00125         {
00126             case rep_once:
00127                 if (matched == 0)
00128                 {
00129                     return NULL;
00130                 }
00131 
00132                 break;
00133 
00134             case rep_once_or_more:
00135                 if (matched == 0)
00136                 {
00137                     return NULL;
00138                 }
00139 
00140                 if (re_cmp_step(str + 1, regexp, 0, 1))
00141                 {
00142                     return str;
00143                 }
00144 
00145                 break;
00146 
00147             case rep_null_or_once:
00148                 if (matched == 0)
00149                 {
00150                     return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
00151                 }
00152 
00153                 break;
00154 
00155             case rep_null_or_more:
00156                 if (matched)
00157                 {
00158                     if (re_cmp_step(str + 1, regexp, 0, 1))
00159                     {
00160                         return str;
00161                     }
00162                 }
00163                 else
00164                 {
00165                     return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
00166                 }
00167 
00168                 break;
00169         }
00170 
00171         return re_cmp_step(str + 1, next_regexp, 1, 0) ? str : NULL;
00172     }
00173 
00174     if (matched)
00175     {
00176         switch (re_token[0]->repeat)
00177         {
00178             case rep_once:
00179             case rep_null_or_once:
00180                 break;
00181 
00182             case rep_once_or_more:
00183             case rep_null_or_more:
00184                 if (re_cmp_step(str + 1, regexp, 0, 1))
00185                 {
00186                     return str;
00187                 }
00188 
00189                 break;
00190         }
00191 
00192         /* The logic here is that re_match_token only sees if the one
00193          * letter matches. Thus, if the regex is like '@match dragon', and
00194          * the the user enters anything with 'd', re_match_token returns
00195          * true, but they really need to match the entire regexp, which
00196          * re_cmp_step will do. However, what happens is that there can
00197          * be a case where the string being match is something like
00198          * 'where is dragon'. In this case, the re_match_token matches
00199          * that first 'd', but the re_cmp_step below, fails because the
00200          * next character ('a') doesn't match the 'r'. So we call re_cmp
00201          * with the string after the first 'a', so that it should hopefully
00202          * match up properly. */
00203         if (re_cmp_step(str + 1, next_regexp, 1, 0))
00204         {
00205             return str;
00206         }
00207         else if (*(str + 1) != 0)
00208         {
00209             return re_cmp(str + 1, regexp);
00210         }
00211     }
00212 
00213     return NULL;
00214 }
00215 
00223 static int re_cmp_step(const char *str, const char *regexp, int slot, int matches)
00224 {
00225     const char *next_regexp;
00226     int matched;
00227 
00228     if (*regexp == '\0')
00229     {
00230         /* When we reach the end of the regexp, the match is a success */
00231         return 1;
00232     }
00233 
00234     /* This chunk of code makes sure that the regexp-tokenising happens
00235      * only once. We only tokenise as much as we need. */
00236     if ((unsigned int) slot > re_token_depth)
00237     {
00238         re_token_depth = slot;
00239 
00240         if (re_token[slot] == NULL)
00241         {
00242             re_token[slot] = (selection *) malloc(sizeof(selection));
00243         }
00244 
00245         next_regexp = re_get_token(re_token[slot], regexp);
00246 
00247         if (next_regexp == NULL)
00248         {
00249             /* Syntax error, what else can we do? */
00250             return 0;
00251         }
00252 
00253         re_substr[slot] = next_regexp;
00254     }
00255     else
00256     {
00257         next_regexp = re_substr[slot];
00258     }
00259 
00260     matched = re_match_token(*str, re_token[slot]);
00261 
00262     if (matched)
00263     {
00264         ++matches;
00265     }
00266 
00267     if (*str == '\0')
00268     {
00269         return (*next_regexp == '\0' || re_token[slot]->type == sel_end) && matched;
00270     }
00271 
00272     switch (re_token[slot]->repeat)
00273     {
00274         case rep_once:
00275             /* (matches == 1) => (matched == 1) */
00276             if (matches == 1)
00277             {
00278                 return re_cmp_step(str + 1, next_regexp, slot + 1, 0);
00279             }
00280 
00281             return 0;
00282 
00283         case rep_once_or_more:
00284             /* (matched == 1) => (matches >= 1) */
00285             if (matched)
00286             {
00287                 /* First check if the current token repeats more */
00288                 if (re_cmp_step(str + 1, regexp, slot, matches))
00289                 {
00290                     return 1;
00291                 }
00292 
00293                 return re_cmp_step(str + 1, next_regexp, slot + 1, 0);
00294             }
00295 
00296             return 0;
00297 
00298         case rep_null_or_once:
00299             /* We must go on to the next token, but should we advance str? */
00300             if (matches == 0)
00301             {
00302                 return re_cmp_step(str, next_regexp, slot + 1, 0);
00303             }
00304             else if (matches == 1)
00305             {
00306                 return re_cmp_step(str + 1, next_regexp, slot + 1, 0);
00307             }
00308 
00309             /* Not reached */
00310             return 0;
00311 
00312         case rep_null_or_more:
00313             if (matched)
00314             {
00315                 /* Look for further repeats, advance str */
00316                 if (re_cmp_step(str + 1, regexp, slot, matches))
00317                 {
00318                     return 1;
00319                 }
00320 
00321                 return re_cmp_step(str, next_regexp, slot + 1, 0);
00322             }
00323 
00324             return re_cmp_step(str, next_regexp, slot + 1, 0);
00325     }
00326 
00327     return 0;
00328 }
00329 
00332 static void re_init()
00333 {
00334     int i;
00335 
00336     re_token[0] = (selection *) malloc(sizeof(selection));
00337 
00338     for (i = 1; i < RE_TOKEN_MAX; i++)
00339     {
00340         re_token[i] = NULL;
00341     }
00342 
00343     re_init_done = 1;
00344 }
00345 
00351 static int re_match_token(unsigned char c, selection *sel)
00352 {
00353     switch (sel->type)
00354     {
00355         case sel_any:
00356             return 1;
00357 
00358         case sel_end:
00359             return (c == '\0');
00360 
00361         case sel_single:
00362             return (tolower(c) == tolower(sel->u.single));
00363 
00364         case sel_range:
00365             return (c >= sel->u.range.low && c <= sel->u.range.high);
00366 
00367         case sel_array:
00368             return (sel->u.array[c]);
00369 
00370         case sel_not_single:
00371             return (tolower(c) != tolower(sel->u.single));
00372 
00373         case sel_not_range:
00374             return (c < sel->u.range.low && c > sel->u.range.high);
00375     }
00376 
00377     return 0;
00378 }
00379 
00387 static const char *re_get_token(selection *sel, const char *regexp)
00388 {
00389 #ifdef SAFE_CHECKS
00390 #   define exit_if_null if (*regexp == '\0') return NULL
00391 #else
00392 #   define exit_if_null
00393 #endif
00394 
00395     int quoted = 0;
00396     unsigned char looking_at;
00397 
00398 #ifdef SAFE_CHECKS
00399     if (sel == NULL || regexp == NULL || *regexp == '\0')
00400     {
00401         return NULL;
00402     }
00403 #endif
00404 
00405     do
00406     {
00407         looking_at = *regexp++;
00408 
00409         switch (looking_at)
00410         {
00411             case '$':
00412                 if (quoted)
00413                 {
00414                     quoted = 0;
00415                     sel->type = sel_single;
00416                     sel->u.single = looking_at;
00417                 }
00418                 else
00419                 {
00420                     sel->type = sel_end;
00421                 }
00422 
00423                 break;
00424 
00425             case '.':
00426                 if (quoted)
00427                 {
00428                     quoted = 0;
00429                     sel->type = sel_single;
00430                     sel->u.single = looking_at;
00431                 }
00432                 else
00433                 {
00434                     sel->type = sel_any;
00435                 }
00436 
00437                 break;
00438 
00439             case '[':
00440                 /* The fun stuff... perhaps a little obfuscated since I
00441                  * don't trust the compiler to analyze liveness. */
00442                 if (quoted)
00443                 {
00444                     quoted = 0;
00445                     sel->type = sel_single;
00446                     sel->u.single = looking_at;
00447                 }
00448                 else
00449                 {
00450                     int neg = 0;
00451                     unsigned char first, last = 0;
00452 
00453                     exit_if_null;
00454                     looking_at = *regexp++;
00455 
00456                     if (looking_at == '^')
00457                     {
00458                         neg = 1;
00459                         exit_if_null;
00460                         looking_at = *regexp++;
00461                     }
00462 
00463                     first = looking_at;
00464                     exit_if_null;
00465 
00466                     looking_at = *regexp++;
00467 
00468                     if (looking_at == ']')
00469                     {
00470                         /* On the form [q] or [^q] */
00471                         sel->type = neg ? sel_not_single : sel_single;
00472                         sel->u.single = first;
00473                         break;
00474                     }
00475                     else if (looking_at == '-')
00476                     {
00477                         exit_if_null;
00478                         last = *regexp++;
00479 
00480                         if (last == ']')
00481                         {
00482                             /* On the form [A-] or [^A-]. Checking for
00483                              * [,-] and making it a range is probably not
00484                              * worth it :-) */
00485                             sel->type = sel_array;
00486                             memset(sel->u.array, neg, sizeof(sel->u.array));
00487                             sel->u.array[first] = sel->u.array['-'] = !neg;
00488                             break;
00489                         }
00490                         else
00491                         {
00492                             exit_if_null;
00493                             looking_at = *regexp++;
00494 
00495                             if (looking_at == ']')
00496                             {
00497                                 /* On the form [A-G] or [^A-G]. Note that [G-A]
00498                                  * is a syntax error. Fair enough, I think. */
00499 #ifdef SAFE_CHECKS
00500                                 if (first > last)
00501                                 {
00502                                     return NULL;
00503                                 }
00504 #endif
00505                                 sel->type = neg ? sel_not_range : sel_range;
00506                                 sel->u.range.low = first;
00507                                 sel->u.range.high = last;
00508                                 break;
00509                             }
00510                         }
00511                     }
00512 
00513                     {
00514                         /* The datastructure can only represent a RE this
00515                          * complex with an array. */
00516                         int i;
00517                         unsigned char previous;
00518 
00519                         sel->type = sel_array;
00520                         memset(sel->u.array, neg, sizeof(sel->u.array));
00521 
00522                         if (last)
00523                         {
00524                             /* It starts with a range */
00525 #ifdef SAFE_CHECKS
00526                             if (first > last)
00527                             {
00528                                 return NULL;
00529                             }
00530 #endif
00531                             for (i = first; i <= last; i++)
00532                             {
00533                                 sel->u.array[i] = !neg;
00534                             }
00535                         }
00536                         else
00537                         {
00538                             /* It begins with a "random" character */
00539                             sel->u.array[first] = !neg;
00540                         }
00541 
00542                         sel->u.array[looking_at] = !neg;
00543 
00544                         exit_if_null;
00545                         previous = looking_at;
00546                         looking_at = *regexp++;
00547 
00548                         /* Add more characters to the array until we reach
00549                          * ]. Quoting doesn't and shouldn't work in here.
00550                          * ("]" should be put first, and "-" last if they
00551                          * are needed inside this construct.)
00552                          * Look for ranges as we go along. */
00553                         while (looking_at != ']')
00554                         {
00555                             if (looking_at == '-')
00556                             {
00557                                 exit_if_null;
00558                                 looking_at = *regexp++;
00559 
00560                                 if (looking_at != ']')
00561                                 {
00562 #ifdef SAFE_CHECKS
00563                                     if (previous > looking_at)
00564                                     {
00565                                         return NULL;
00566                                     }
00567 #endif
00568                                     for (i = previous + 1; i < looking_at; i++)
00569                                     {
00570                                         /* previous has already been set and
00571                                          * looking_at is set below. */
00572                                         sel->u.array[i] = !neg;
00573                                     }
00574 
00575                                     exit_if_null;
00576                                 }
00577                                 else
00578                                 {
00579                                     sel->u.array['-'] = !neg;
00580                                     break;
00581                                 }
00582                             }
00583 
00584                             sel->u.array[looking_at] = !neg;
00585                             previous = looking_at;
00586                             exit_if_null;
00587                             looking_at = *regexp++;
00588                         }
00589                     }
00590                 }
00591 
00592                 break;
00593 
00594             case '\\':
00595                 if (quoted)
00596                 {
00597                     quoted = 0;
00598                     sel->type = sel_single;
00599                     sel->u.single = looking_at;
00600                 }
00601                 else
00602                 {
00603                     quoted = 1;
00604                 }
00605 
00606                 break;
00607 
00608             default:
00609                 quoted = 0;
00610                 sel->type = sel_single;
00611                 sel->u.single = looking_at;
00612                 break;
00613         }
00614     }
00615     while (quoted);
00616 
00617     if (*regexp == '*')
00618     {
00619         sel->repeat = rep_null_or_more;
00620         ++regexp;
00621     }
00622     else if (*regexp == '?')
00623     {
00624         sel->repeat = rep_null_or_once;
00625         ++regexp;
00626     }
00627     else if (*regexp == '+')
00628     {
00629         sel->repeat = rep_once_or_more;
00630         ++regexp;
00631     }
00632     else
00633     {
00634         sel->repeat = rep_once;
00635     }
00636 
00637     return regexp;
00638 }
00639 
00640 #ifdef DEBUG2
00641 
00645 static void re_dump_sel(selection *sel)
00646 {
00647     switch (sel->type)
00648     {
00649         case sel_any:
00650             printf(".");
00651             break;
00652 
00653         case sel_end:
00654             printf("$");
00655             break;
00656 
00657         case sel_single:
00658             printf("<%c>", sel->u.single);
00659             break;
00660 
00661         case sel_range:
00662             printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
00663             break;
00664 
00665         case sel_array:
00666         {
00667             int i;
00668 
00669             printf("[");
00670 
00671             for (i = 0; i < UCHAR_MAX; i++)
00672             {
00673                 if (sel->u.array[i])
00674                 {
00675                     printf("%c", i);
00676                 }
00677             }
00678 
00679             printf("]");
00680             break;
00681         }
00682 
00683         case sel_not_single:
00684             printf("[^%c]", sel->u.single);
00685             break;
00686 
00687         case sel_not_range:
00688             printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
00689             break;
00690 
00691         default:
00692             printf("<UNKNOWN TOKEN!>");
00693             break;
00694     }
00695 
00696     switch (sel->repeat)
00697     {
00698         case rep_once:
00699             break;
00700 
00701         case rep_null_or_once:
00702             printf("?");
00703             break;
00704 
00705         case rep_null_or_more:
00706             printf("*");
00707             break;
00708 
00709         case rep_once_or_more:
00710             printf("+");
00711             break;
00712 
00713         default:
00714             printf("<UNKNOWN REP-TOKEN!>");
00715             break;
00716     }
00717 }
00718 
00719 int main(int argc, char *argv[])
00720 {
00721     char *re, *m;
00722     selection sel;
00723 
00724     re = re_get_token(&sel, argv[1]);
00725 
00726     printf("'%s' -> '%s'\n", argv[1], re);
00727     re_dump_sel(&sel);
00728     printf("\n");
00729     m = re_cmp(argv[2], argv[1]);
00730 
00731     if (m)
00732     {
00733         printf("MATCH! -> '%s'\n", m);
00734     }
00735 
00736     return 0;
00737 }
00738 #endif