Atrinik Server  4.0
re-cmp.c
Go to the documentation of this file.
1 /*************************************************************************
2  * Atrinik, a Multiplayer Online Role Playing Game *
3  * *
4  * Copyright (C) 2009-2014 Alex Tokar and Atrinik Development Team *
5  * *
6  * Fork from Crossfire (Multiplayer game for X-windows). *
7  * *
8  * This program is free software; you can redistribute it and/or modify *
9  * it under the terms of the GNU General Public License as published by *
10  * the Free Software Foundation; either version 2 of the License, or *
11  * (at your option) any later version. *
12  * *
13  * This program is distributed in the hope that it will be useful, *
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16  * GNU General Public License for more details. *
17  * *
18  * You should have received a copy of the GNU General Public License *
19  * along with this program; if not, write to the Free Software *
20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *
21  * *
22  * The author can be reached at admin@atrinik.org *
23  ************************************************************************/
24 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <stdint.h>
44 #include <stdbool.h>
45 #include <memory.h>
46 #include <limits.h>
47 #include <re-cmp.h>
48 #include <ctype.h>
49 #include <string.h>
50 
51 const char *re_cmp(const char *, const char *);
52 static int re_cmp_step(const char *, const char *, int, int);
53 static void re_init(void);
54 static int re_match_token(unsigned char, selection *);
55 static const char *re_get_token(selection *, const char *);
56 #ifdef DEBUG2
57 static void re_dump_sel(selection *);
58 #endif
59 
60 static int re_init_done = 0;
61 static selection *re_token[RE_TOKEN_MAX];
62 static const char *re_substr[RE_TOKEN_MAX];
63 static unsigned int re_token_depth;
64 
76 const char *re_cmp(const char *str, const char *regexp)
77 {
78  const char *next_regexp;
79  int once = 0;
80  int matched = 0;
81 
82  if (re_init_done == 0) {
83  re_init();
84  }
85 
86 #ifdef SAFE_CHECKS
87 
88  if (regexp == NULL || str == NULL) {
89  return NULL;
90  }
91 #endif
92 
93  if (*regexp == '^') {
94  once = 1;
95  ++regexp;
96  }
97 
98  if (*regexp == '\0') {
99  /* // or /^/ matches any string */
100  return str;
101  }
102 
103  next_regexp = re_get_token(re_token[0], regexp);
104  re_token_depth = 0;
105  re_substr[0] = next_regexp;
106 
107  while (*str != '\0' && !(matched = re_match_token(*str, re_token[0]))) {
108  /* If '^' was present and we didn't get a match above, return
109  * NULL now so things like '^hi$' won't match on 'thi'. */
110  if (once) {
111  return NULL;
112  }
113 
114  str++;
115  }
116 
117  if (matched && *next_regexp == 0) {
118  return str;
119  }
120 
121  /* Apologies for the nearly duplicated code below, hopefully it
122  * speeds things up. */
123  if (once) {
124  switch (re_token[0]->repeat) {
125  case rep_once:
126 
127  if (matched == 0) {
128  return NULL;
129  }
130 
131  break;
132 
133  case rep_once_or_more:
134 
135  if (matched == 0) {
136  return NULL;
137  }
138 
139  if (re_cmp_step(str + 1, regexp, 0, 1)) {
140  return str;
141  }
142 
143  break;
144 
145  case rep_null_or_once:
146 
147  if (matched == 0) {
148  return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
149  }
150 
151  break;
152 
153  case rep_null_or_more:
154 
155  if (matched) {
156  if (re_cmp_step(str + 1, regexp, 0, 1)) {
157  return str;
158  }
159  } else {
160  return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
161  }
162 
163  break;
164  }
165 
166  return re_cmp_step(str + 1, next_regexp, 1, 0) ? str : NULL;
167  }
168 
169  if (matched) {
170  switch (re_token[0]->repeat) {
171  case rep_once:
172  case rep_null_or_once:
173  break;
174 
175  case rep_once_or_more:
176  case rep_null_or_more:
177 
178  if (re_cmp_step(str + 1, regexp, 0, 1)) {
179  return str;
180  }
181 
182  break;
183  }
184 
185  /* The logic here is that re_match_token only sees if the one
186  * letter matches. Thus, if the regex is like '@match dragon', and
187  * the the user enters anything with 'd', re_match_token returns
188  * true, but they really need to match the entire regexp, which
189  * re_cmp_step will do. However, what happens is that there can
190  * be a case where the string being match is something like
191  * 'where is dragon'. In this case, the re_match_token matches
192  * that first 'd', but the re_cmp_step below, fails because the
193  * next character ('a') doesn't match the 'r'. So we call re_cmp
194  * with the string after the first 'a', so that it should hopefully
195  * match up properly. */
196  if (re_cmp_step(str + 1, next_regexp, 1, 0)) {
197  return str;
198  } else if (*(str + 1) != 0) {
199  return re_cmp(str + 1, regexp);
200  }
201  }
202 
203  return NULL;
204 }
205 
219 static int re_cmp_step(const char *str, const char *regexp, int slot, int matches)
220 {
221  const char *next_regexp;
222  int matched;
223 
224  if (*regexp == '\0') {
225  /* When we reach the end of the regexp, the match is a success */
226  return 1;
227  }
228 
229  /* This chunk of code makes sure that the regexp-tokenising happens
230  * only once. We only tokenise as much as we need. */
231  if ((unsigned int) slot > re_token_depth) {
232  re_token_depth = slot;
233 
234  if (re_token[slot] == NULL) {
235  re_token[slot] = malloc(sizeof(selection));
236  }
237 
238  next_regexp = re_get_token(re_token[slot], regexp);
239 
240  if (next_regexp == NULL) {
241  /* Syntax error, what else can we do? */
242  return 0;
243  }
244 
245  re_substr[slot] = next_regexp;
246  } else {
247  next_regexp = re_substr[slot];
248  }
249 
250  matched = re_match_token(*str, re_token[slot]);
251 
252  if (matched) {
253  ++matches;
254  }
255 
256  if (*str == '\0') {
257  return (*next_regexp == '\0' || re_token[slot]->type == sel_end) && matched;
258  }
259 
260  switch (re_token[slot]->repeat) {
261  case rep_once:
262 
263  /* (matches == 1) => (matched == 1) */
264  if (matches == 1) {
265  return re_cmp_step(str + 1, next_regexp, slot + 1, 0);
266  }
267 
268  return 0;
269 
270  case rep_once_or_more:
271 
272  /* (matched == 1) => (matches >= 1) */
273  if (matched) {
274  /* First check if the current token repeats more */
275  if (re_cmp_step(str + 1, regexp, slot, matches)) {
276  return 1;
277  }
278 
279  return re_cmp_step(str + 1, next_regexp, slot + 1, 0);
280  }
281 
282  return 0;
283 
284  case rep_null_or_once:
285 
286  /* We must go on to the next token, but should we advance str? */
287  if (matches == 0) {
288  return re_cmp_step(str, next_regexp, slot + 1, 0);
289  } else if (matches == 1) {
290  return re_cmp_step(str + 1, next_regexp, slot + 1, 0);
291  }
292 
293  /* Not reached */
294  return 0;
295 
296  case rep_null_or_more:
297 
298  if (matched) {
299  /* Look for further repeats, advance str */
300  if (re_cmp_step(str + 1, regexp, slot, matches)) {
301  return 1;
302  }
303 
304  return re_cmp_step(str, next_regexp, slot + 1, 0);
305  }
306 
307  return re_cmp_step(str, next_regexp, slot + 1, 0);
308  }
309 
310  return 0;
311 }
312 
316 static void re_init(void)
317 {
318  int i;
319 
320  re_token[0] = malloc(sizeof(selection));
321 
322  for (i = 1; i < RE_TOKEN_MAX; i++) {
323  re_token[i] = NULL;
324  }
325 
326  re_init_done = 1;
327 }
328 
338 static int re_match_token(unsigned char c, selection *sel)
339 {
340  switch (sel->type) {
341  case sel_any:
342  return 1;
343 
344  case sel_end:
345  return (c == '\0');
346 
347  case sel_single:
348  return (tolower(c) == tolower(sel->u.single));
349 
350  case sel_range:
351  return (c >= sel->u.range.low && c <= sel->u.range.high);
352 
353  case sel_array:
354  return (sel->u.array[c]);
355 
356  case sel_not_single:
357  return (tolower(c) != tolower(sel->u.single));
358 
359  case sel_not_range:
360  return (c < sel->u.range.low && c > sel->u.range.high);
361  }
362 
363  return 0;
364 }
365 
376 static const char *re_get_token(selection *sel, const char *regexp)
377 {
378 #ifdef SAFE_CHECKS
379 #define exit_if_null if (*regexp == '\0') return NULL
380 #else
381 #define exit_if_null
382 #endif
383 
384  int quoted = 0;
385  unsigned char looking_at;
386 
387 #ifdef SAFE_CHECKS
388 
389  if (sel == NULL || regexp == NULL || *regexp == '\0') {
390  return NULL;
391  }
392 #endif
393 
394  do {
395  looking_at = *regexp++;
396 
397  switch (looking_at) {
398  case '$':
399 
400  if (quoted) {
401  quoted = 0;
402  sel->type = sel_single;
403  sel->u.single = looking_at;
404  } else {
405  sel->type = sel_end;
406  }
407 
408  break;
409 
410  case '.':
411 
412  if (quoted) {
413  quoted = 0;
414  sel->type = sel_single;
415  sel->u.single = looking_at;
416  } else {
417  sel->type = sel_any;
418  }
419 
420  break;
421 
422  case '[':
423 
424  /* The fun stuff... perhaps a little obfuscated since I
425  * don't trust the compiler to analyze liveness. */
426  if (quoted) {
427  quoted = 0;
428  sel->type = sel_single;
429  sel->u.single = looking_at;
430  } else {
431  int neg = 0;
432  unsigned char first, last = 0;
433 
434  exit_if_null;
435  looking_at = *regexp++;
436 
437  if (looking_at == '^') {
438  neg = 1;
439  exit_if_null;
440  looking_at = *regexp++;
441  }
442 
443  first = looking_at;
444  exit_if_null;
445 
446  looking_at = *regexp++;
447 
448  if (looking_at == ']') {
449  /* On the form [q] or [^q] */
450  sel->type = neg ? sel_not_single : sel_single;
451  sel->u.single = first;
452  break;
453  } else if (looking_at == '-') {
454  exit_if_null;
455  last = *regexp++;
456 
457  if (last == ']') {
458  /* On the form [A-] or [^A-]. Checking for
459  * [,-] and making it a range is probably not
460  * worth it :-) */
461  sel->type = sel_array;
462  memset(sel->u.array, neg, sizeof(sel->u.array));
463  sel->u.array[first] = sel->u.array['-'] = !neg;
464  break;
465  } else {
466  exit_if_null;
467  looking_at = *regexp++;
468 
469  if (looking_at == ']') {
470  /* On the form [A-G] or [^A-G]. Note that [G-A]
471  * is a syntax error. Fair enough, I think. */
472 #ifdef SAFE_CHECKS
473 
474  if (first > last) {
475  return NULL;
476  }
477 #endif
478  sel->type = neg ? sel_not_range : sel_range;
479  sel->u.range.low = first;
480  sel->u.range.high = last;
481  break;
482  }
483  }
484  }
485 
486  {
487  /* The datastructure can only represent a RE this
488  * complex with an array. */
489  int i;
490  unsigned char previous;
491 
492  sel->type = sel_array;
493  memset(sel->u.array, neg, sizeof(sel->u.array));
494 
495  if (last) {
496  /* It starts with a range */
497 #ifdef SAFE_CHECKS
498 
499  if (first > last) {
500  return NULL;
501  }
502 #endif
503  for (i = first; i < last + 1; i++) {
504  sel->u.array[i] = !neg;
505  }
506  } else {
507  /* It begins with a "random" character */
508  sel->u.array[first] = !neg;
509  }
510 
511  sel->u.array[looking_at] = !neg;
512 
513  exit_if_null;
514  previous = looking_at;
515  looking_at = *regexp++;
516 
517  /* Add more characters to the array until we reach
518  * ]. Quoting doesn't and shouldn't work in here.
519  * ("]" should be put first, and "-" last if they
520  * are needed inside this construct.)
521  * Look for ranges as we go along. */
522  while (looking_at != ']') {
523  if (looking_at == '-') {
524  exit_if_null;
525  looking_at = *regexp++;
526 
527  if (looking_at != ']') {
528 #ifdef SAFE_CHECKS
529 
530  if (previous > looking_at) {
531  return NULL;
532  }
533 #endif
534  for (i = previous + 1; i < looking_at; i++) {
535  /* previous has already been set and
536  * looking_at is set below. */
537  sel->u.array[i] = !neg;
538  }
539 
540  exit_if_null;
541  } else {
542  sel->u.array['-'] = !neg;
543  break;
544  }
545  }
546 
547  sel->u.array[looking_at] = !neg;
548  previous = looking_at;
549  exit_if_null;
550  looking_at = *regexp++;
551  }
552  }
553  }
554 
555  break;
556 
557  case '\\':
558 
559  if (quoted) {
560  quoted = 0;
561  sel->type = sel_single;
562  sel->u.single = looking_at;
563  } else {
564  quoted = 1;
565  }
566 
567  break;
568 
569  default:
570  quoted = 0;
571  sel->type = sel_single;
572  sel->u.single = looking_at;
573  break;
574  }
575  } while (quoted);
576 
577  if (*regexp == '*') {
578  sel->repeat = rep_null_or_more;
579  ++regexp;
580  } else if (*regexp == '?') {
581  sel->repeat = rep_null_or_once;
582  ++regexp;
583  } else if (*regexp == '+') {
584  sel->repeat = rep_once_or_more;
585  ++regexp;
586  } else {
587  sel->repeat = rep_once;
588  }
589 
590  return regexp;
591 }
592 
593 #ifdef DEBUG2
594 
600 static void re_dump_sel(selection *sel)
601 {
602  switch (sel->type) {
603  case sel_any:
604  printf(".");
605  break;
606 
607  case sel_end:
608  printf("$");
609  break;
610 
611  case sel_single:
612  printf("<%c>", sel->u.single);
613  break;
614 
615  case sel_range:
616  printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
617  break;
618 
619  case sel_array:
620  {
621  int i;
622 
623  printf("[");
624 
625  for (i = 0; i < UCHAR_MAX; i++) {
626  if (sel->u.array[i]) {
627  printf("%c", i);
628  }
629  }
630 
631  printf("]");
632  break;
633  }
634 
635  case sel_not_single:
636  printf("[^%c]", sel->u.single);
637  break;
638 
639  case sel_not_range:
640  printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
641  break;
642 
643  default:
644  printf("<UNKNOWN TOKEN!>");
645  break;
646  }
647 
648  switch (sel->repeat) {
649  case rep_once:
650  break;
651 
652  case rep_null_or_once:
653  printf("?");
654  break;
655 
656  case rep_null_or_more:
657  printf("*");
658  break;
659 
660  case rep_once_or_more:
661  printf("+");
662  break;
663 
664  default:
665  printf("<UNKNOWN REP-TOKEN!>");
666  break;
667  }
668 }
669 
670 int main(int argc, char *argv[])
671 {
672  char *re, *m;
673  selection sel;
674 
675  re = re_get_token(&sel, argv[1]);
676 
677  printf("'%s' -> '%s'\n", argv[1], re);
678  re_dump_sel(&sel);
679  printf("\n");
680  m = re_cmp(argv[2], argv[1]);
681 
682  if (m) {
683  printf("MATCH! -> '%s'\n", m);
684  }
685 
686  return 0;
687 }
688 #endif
uint8_t type
One of operation types.
Definition: sound_ambient.c:45
static void re_init(void)
Definition: re-cmp.c:316
static int re_cmp_step(const char *, const char *, int, int)
Definition: re-cmp.c:219
static const char * re_get_token(selection *, const char *)
Definition: re-cmp.c:376
static int re_match_token(unsigned char, selection *)
Definition: re-cmp.c:338
int main(int argc, char **argv)
Definition: main.c:588
time_t last
Last successful update.
Definition: metaserver.c:47
const char * re_cmp(const char *, const char *)
Definition: re-cmp.c:76