Git fork
at reftables-rust 3893 lines 112 kB view raw
1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20#pragma GCC diagnostic ignored "-Wunused-parameter" 21 22#if defined __TANDEM 23 /* This is currently duplicated from git-compat-utils.h */ 24# ifdef NO_INTPTR_T 25 typedef long intptr_t; 26 typedef unsigned long uintptr_t; 27# endif 28#endif 29 30static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 31 size_t length, reg_syntax_t syntax); 32static void re_compile_fastmap_iter (regex_t *bufp, 33 const re_dfastate_t *init_state, 34 char *fastmap); 35static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 36#ifdef RE_ENABLE_I18N 37static void free_charset (re_charset_t *cset); 38#endif /* RE_ENABLE_I18N */ 39static void free_workarea_compile (regex_t *preg); 40static reg_errcode_t create_initial_state (re_dfa_t *dfa); 41#ifdef RE_ENABLE_I18N 42static void optimize_utf8 (re_dfa_t *dfa); 43#endif 44static reg_errcode_t analyze (regex_t *preg); 45static reg_errcode_t preorder (bin_tree_t *root, 46 reg_errcode_t (fn (void *, bin_tree_t *)), 47 void *extra); 48static reg_errcode_t postorder (bin_tree_t *root, 49 reg_errcode_t (fn (void *, bin_tree_t *)), 50 void *extra); 51static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 52static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 53static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 54 bin_tree_t *node); 55static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 56static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 57static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 58static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); 59static int search_duplicated_node (const re_dfa_t *dfa, int org_node, 60 unsigned int constraint); 61static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 62static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 63 int node, int root); 64static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 65static int fetch_number (re_string_t *input, re_token_t *token, 66 reg_syntax_t syntax); 67static int peek_token (re_token_t *token, re_string_t *input, 68 reg_syntax_t syntax) internal_function; 69static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 70 reg_syntax_t syntax, reg_errcode_t *err); 71static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 72 re_token_t *token, reg_syntax_t syntax, 73 int nest, reg_errcode_t *err); 74static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 75 re_token_t *token, reg_syntax_t syntax, 76 int nest, reg_errcode_t *err); 77static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 78 re_token_t *token, reg_syntax_t syntax, 79 int nest, reg_errcode_t *err); 80static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 81 re_token_t *token, reg_syntax_t syntax, 82 int nest, reg_errcode_t *err); 83static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 84 re_dfa_t *dfa, re_token_t *token, 85 reg_syntax_t syntax, reg_errcode_t *err); 86static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 87 re_token_t *token, reg_syntax_t syntax, 88 reg_errcode_t *err); 89static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 90 re_string_t *regexp, 91 re_token_t *token, int token_len, 92 re_dfa_t *dfa, 93 reg_syntax_t syntax, 94 int accept_hyphen); 95static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 96 re_string_t *regexp, 97 re_token_t *token); 98#ifdef RE_ENABLE_I18N 99static reg_errcode_t build_equiv_class (bitset_t sbcset, 100 re_charset_t *mbcset, 101 int *equiv_class_alloc, 102 const unsigned char *name); 103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 104 bitset_t sbcset, 105 re_charset_t *mbcset, 106 int *char_class_alloc, 107 const char *class_name, 108 reg_syntax_t syntax); 109#else /* not RE_ENABLE_I18N */ 110static reg_errcode_t build_equiv_class (bitset_t sbcset, 111 const unsigned char *name); 112static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 113 bitset_t sbcset, 114 const char *class_name, 115 reg_syntax_t syntax); 116#endif /* not RE_ENABLE_I18N */ 117static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 118 RE_TRANSLATE_TYPE trans, 119 const char *class_name, 120 const char *extra, 121 int non_match, reg_errcode_t *err); 122static bin_tree_t *create_tree (re_dfa_t *dfa, 123 bin_tree_t *left, bin_tree_t *right, 124 re_token_type_t type); 125static bin_tree_t *create_token_tree (re_dfa_t *dfa, 126 bin_tree_t *left, bin_tree_t *right, 127 const re_token_t *token); 128static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 129static void free_token (re_token_t *node); 130static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 131static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 132 133/* This table gives an error message for each of the error codes listed 134 in regex.h. Obviously the order here has to be same as there. 135 POSIX doesn't require that we do anything for REG_NOERROR, 136 but why not be nice? */ 137 138const char __re_error_msgid[] attribute_hidden = 139 { 140#define REG_NOERROR_IDX 0 141 gettext_noop ("Success") /* REG_NOERROR */ 142 "\0" 143#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 144 gettext_noop ("No match") /* REG_NOMATCH */ 145 "\0" 146#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 147 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 148 "\0" 149#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 150 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 151 "\0" 152#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 153 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 154 "\0" 155#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 156 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 157 "\0" 158#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 159 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 160 "\0" 161#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 162 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 163 "\0" 164#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 165 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 166 "\0" 167#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 168 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 169 "\0" 170#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 171 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 172 "\0" 173#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 174 gettext_noop ("Invalid range end") /* REG_ERANGE */ 175 "\0" 176#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 177 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 178 "\0" 179#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 180 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 181 "\0" 182#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 183 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 184 "\0" 185#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 186 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 187 "\0" 188#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 189 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 190 }; 191 192const size_t __re_error_msgid_idx[] attribute_hidden = 193 { 194 REG_NOERROR_IDX, 195 REG_NOMATCH_IDX, 196 REG_BADPAT_IDX, 197 REG_ECOLLATE_IDX, 198 REG_ECTYPE_IDX, 199 REG_EESCAPE_IDX, 200 REG_ESUBREG_IDX, 201 REG_EBRACK_IDX, 202 REG_EPAREN_IDX, 203 REG_EBRACE_IDX, 204 REG_BADBR_IDX, 205 REG_ERANGE_IDX, 206 REG_ESPACE_IDX, 207 REG_BADRPT_IDX, 208 REG_EEND_IDX, 209 REG_ESIZE_IDX, 210 REG_ERPAREN_IDX 211 }; 212 213/* Entry points for GNU code. */ 214 215 216#ifdef ZOS_USS 217 218/* For ZOS USS we must define btowc */ 219 220wchar_t 221btowc (int c) 222{ 223 wchar_t wtmp[2]; 224 char tmp[2]; 225 226 tmp[0] = c; 227 tmp[1] = 0; 228 229 mbtowc (wtmp, tmp, 1); 230 return wtmp[0]; 231} 232#endif 233 234/* re_compile_pattern is the GNU regular expression compiler: it 235 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 236 Returns 0 if the pattern was valid, otherwise an error string. 237 238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 239 are set in BUFP on entry. */ 240 241const char * 242re_compile_pattern (const char *pattern, 243 size_t length, 244 struct re_pattern_buffer *bufp) 245{ 246 reg_errcode_t ret; 247 248 /* And GNU code determines whether or not to get register information 249 by passing null for the REGS argument to re_match, etc., not by 250 setting no_sub, unless RE_NO_SUB is set. */ 251 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 252 253 /* Match anchors at newline. */ 254 bufp->newline_anchor = 1; 255 256 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 257 258 if (!ret) 259 return NULL; 260 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 261} 262#ifdef _LIBC 263weak_alias (__re_compile_pattern, re_compile_pattern) 264#endif 265 266/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 267 also be assigned to arbitrarily: each pattern buffer stores its own 268 syntax, so it can be changed between regex compilations. */ 269/* This has no initializer because initialized variables in Emacs 270 become read-only after dumping. */ 271reg_syntax_t re_syntax_options; 272 273 274/* Specify the precise syntax of regexps for compilation. This provides 275 for compatibility for various utilities which historically have 276 different, incompatible syntaxes. 277 278 The argument SYNTAX is a bit mask comprised of the various bits 279 defined in regex.h. We return the old syntax. */ 280 281reg_syntax_t 282re_set_syntax (reg_syntax_t syntax) 283{ 284 reg_syntax_t ret = re_syntax_options; 285 286 re_syntax_options = syntax; 287 return ret; 288} 289#ifdef _LIBC 290weak_alias (__re_set_syntax, re_set_syntax) 291#endif 292 293int 294re_compile_fastmap (struct re_pattern_buffer *bufp) 295{ 296 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 297 char *fastmap = bufp->fastmap; 298 299 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 300 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 301 if (dfa->init_state != dfa->init_state_word) 302 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 303 if (dfa->init_state != dfa->init_state_nl) 304 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 305 if (dfa->init_state != dfa->init_state_begbuf) 306 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 307 bufp->fastmap_accurate = 1; 308 return 0; 309} 310#ifdef _LIBC 311weak_alias (__re_compile_fastmap, re_compile_fastmap) 312#endif 313 314static inline void 315__attribute ((always_inline)) 316re_set_fastmap (char *fastmap, int icase, int ch) 317{ 318 fastmap[ch] = 1; 319 if (icase) 320 fastmap[tolower (ch)] = 1; 321} 322 323/* Helper function for re_compile_fastmap. 324 Compile fastmap for the initial_state INIT_STATE. */ 325 326static void 327re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 328 char *fastmap) 329{ 330 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 331 int node_cnt; 332 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 333 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 334 { 335 int node = init_state->nodes.elems[node_cnt]; 336 re_token_type_t type = dfa->nodes[node].type; 337 338 if (type == CHARACTER) 339 { 340 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 341#ifdef RE_ENABLE_I18N 342 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 343 { 344 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p; 345 wchar_t wc; 346 mbstate_t state; 347 348 p = buf; 349 *p++ = dfa->nodes[node].opr.c; 350 while (++node < dfa->nodes_len 351 && dfa->nodes[node].type == CHARACTER 352 && dfa->nodes[node].mb_partial) 353 *p++ = dfa->nodes[node].opr.c; 354 memset (&state, '\0', sizeof (state)); 355 if (__mbrtowc (&wc, (const char *) buf, p - buf, 356 &state) == p - buf 357 && (__wcrtomb ((char *) buf, towlower (wc), &state) 358 != (size_t) -1)) 359 re_set_fastmap (fastmap, 0, buf[0]); 360 re_free (buf); 361 } 362#endif 363 } 364 else if (type == SIMPLE_BRACKET) 365 { 366 int i, ch; 367 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 368 { 369 int j; 370 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 371 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 372 if (w & ((bitset_word_t) 1 << j)) 373 re_set_fastmap (fastmap, icase, ch); 374 } 375 } 376#ifdef RE_ENABLE_I18N 377 else if (type == COMPLEX_BRACKET) 378 { 379 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 380 int i; 381 382# ifdef _LIBC 383 /* See if we have to try all bytes which start multiple collation 384 elements. 385 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 386 collation element, and don't catch 'b' since 'b' is 387 the only collation element which starts from 'b' (and 388 it is caught by SIMPLE_BRACKET). */ 389 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 390 && (cset->ncoll_syms || cset->nranges)) 391 { 392 const int32_t *table = (const int32_t *) 393 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 394 for (i = 0; i < SBC_MAX; ++i) 395 if (table[i] < 0) 396 re_set_fastmap (fastmap, icase, i); 397 } 398# endif /* _LIBC */ 399 400 /* See if we have to start the match at all multibyte characters, 401 i.e. where we would not find an invalid sequence. This only 402 applies to multibyte character sets; for single byte character 403 sets, the SIMPLE_BRACKET again suffices. */ 404 if (dfa->mb_cur_max > 1 405 && (cset->nchar_classes || cset->non_match || cset->nranges 406# ifdef _LIBC 407 || cset->nequiv_classes 408# endif /* _LIBC */ 409 )) 410 { 411 unsigned char c = 0; 412 do 413 { 414 mbstate_t mbs; 415 memset (&mbs, 0, sizeof (mbs)); 416 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 417 re_set_fastmap (fastmap, false, (int) c); 418 } 419 while (++c != 0); 420 } 421 422 else 423 { 424 /* ... Else catch all bytes which can start the mbchars. */ 425 for (i = 0; i < cset->nmbchars; ++i) 426 { 427 char buf[256]; 428 mbstate_t state; 429 memset (&state, '\0', sizeof (state)); 430 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 431 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 432 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 433 { 434 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 435 != (size_t) -1) 436 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 437 } 438 } 439 } 440 } 441#endif /* RE_ENABLE_I18N */ 442 else if (type == OP_PERIOD 443#ifdef RE_ENABLE_I18N 444 || type == OP_UTF8_PERIOD 445#endif /* RE_ENABLE_I18N */ 446 || type == END_OF_RE) 447 { 448 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 449 if (type == END_OF_RE) 450 bufp->can_be_null = 1; 451 return; 452 } 453 } 454} 455 456/* Entry point for POSIX code. */ 457/* regcomp takes a regular expression as a string and compiles it. 458 459 PREG is a regex_t *. We do not expect any fields to be initialized, 460 since POSIX says we shouldn't. Thus, we set 461 462 `buffer' to the compiled pattern; 463 `used' to the length of the compiled pattern; 464 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 465 REG_EXTENDED bit in CFLAGS is set; otherwise, to 466 RE_SYNTAX_POSIX_BASIC; 467 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 468 `fastmap' to an allocated space for the fastmap; 469 `fastmap_accurate' to zero; 470 `re_nsub' to the number of subexpressions in PATTERN. 471 472 PATTERN is the address of the pattern string. 473 474 CFLAGS is a series of bits which affect compilation. 475 476 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 477 use POSIX basic syntax. 478 479 If REG_NEWLINE is set, then . and [^...] don't match newline. 480 Also, regexec will try a match beginning after every newline. 481 482 If REG_ICASE is set, then we considers upper- and lowercase 483 versions of letters to be equivalent when matching. 484 485 If REG_NOSUB is set, then when PREG is passed to regexec, that 486 routine will report only success or failure, and nothing about the 487 registers. 488 489 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 490 the return codes and their meanings.) */ 491 492int 493regcomp (regex_t *__restrict preg, 494 const char *__restrict pattern, 495 int cflags) 496{ 497 reg_errcode_t ret; 498 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 499 : RE_SYNTAX_POSIX_BASIC); 500 501 preg->buffer = NULL; 502 preg->allocated = 0; 503 preg->used = 0; 504 505 /* Try to allocate space for the fastmap. */ 506 preg->fastmap = re_malloc (char, SBC_MAX); 507 if (BE (preg->fastmap == NULL, 0)) 508 return REG_ESPACE; 509 510 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 511 512 /* If REG_NEWLINE is set, newlines are treated differently. */ 513 if (cflags & REG_NEWLINE) 514 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 515 syntax &= ~RE_DOT_NEWLINE; 516 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 517 /* It also changes the matching behavior. */ 518 preg->newline_anchor = 1; 519 } 520 else 521 preg->newline_anchor = 0; 522 preg->no_sub = !!(cflags & REG_NOSUB); 523 preg->translate = NULL; 524 525 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 526 527 /* POSIX doesn't distinguish between an unmatched open-group and an 528 unmatched close-group: both are REG_EPAREN. */ 529 if (ret == REG_ERPAREN) 530 ret = REG_EPAREN; 531 532 /* We have already checked preg->fastmap != NULL. */ 533 if (BE (ret == REG_NOERROR, 1)) 534 /* Compute the fastmap now, since regexec cannot modify the pattern 535 buffer. This function never fails in this implementation. */ 536 (void) re_compile_fastmap (preg); 537 else 538 { 539 /* Some error occurred while compiling the expression. */ 540 re_free (preg->fastmap); 541 preg->fastmap = NULL; 542 } 543 544 return (int) ret; 545} 546#ifdef _LIBC 547weak_alias (__regcomp, regcomp) 548#endif 549 550/* Returns a message corresponding to an error code, ERRCODE, returned 551 from either regcomp or regexec. We don't use PREG here. */ 552 553size_t 554regerror(int errcode, const regex_t *__restrict preg, 555 char *__restrict errbuf, size_t errbuf_size) 556{ 557 const char *msg; 558 size_t msg_size; 559 560 if (BE (errcode < 0 561 || errcode >= (int) (sizeof (__re_error_msgid_idx) 562 / sizeof (__re_error_msgid_idx[0])), 0)) 563 /* Only error codes returned by the rest of the code should be passed 564 to this routine. If we are given anything else, or if other regex 565 code generates an invalid error code, then the program has a bug. 566 Dump core so we can fix it. */ 567 abort (); 568 569 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 570 571 msg_size = strlen (msg) + 1; /* Includes the null. */ 572 573 if (BE (errbuf_size != 0, 1)) 574 { 575 if (BE (msg_size > errbuf_size, 0)) 576 { 577 memcpy (errbuf, msg, errbuf_size - 1); 578 errbuf[errbuf_size - 1] = 0; 579 } 580 else 581 memcpy (errbuf, msg, msg_size); 582 } 583 584 return msg_size; 585} 586#ifdef _LIBC 587weak_alias (__regerror, regerror) 588#endif 589 590 591#ifdef RE_ENABLE_I18N 592/* This static array is used for the map to single-byte characters when 593 UTF-8 is used. Otherwise we would allocate memory just to initialize 594 it the same all the time. UTF-8 is the preferred encoding so this is 595 a worthwhile optimization. */ 596#if __GNUC__ >= 3 597static const bitset_t utf8_sb_map = { 598 /* Set the first 128 bits. */ 599 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 600}; 601#else /* ! (__GNUC__ >= 3) */ 602static bitset_t utf8_sb_map; 603#endif /* __GNUC__ >= 3 */ 604#endif /* RE_ENABLE_I18N */ 605 606 607static void 608free_dfa_content (re_dfa_t *dfa) 609{ 610 int i, j; 611 612 if (dfa->nodes) 613 for (i = 0; i < dfa->nodes_len; ++i) 614 free_token (dfa->nodes + i); 615 re_free (dfa->nexts); 616 for (i = 0; i < dfa->nodes_len; ++i) 617 { 618 if (dfa->eclosures != NULL) 619 re_node_set_free (dfa->eclosures + i); 620 if (dfa->inveclosures != NULL) 621 re_node_set_free (dfa->inveclosures + i); 622 if (dfa->edests != NULL) 623 re_node_set_free (dfa->edests + i); 624 } 625 re_free (dfa->edests); 626 re_free (dfa->eclosures); 627 re_free (dfa->inveclosures); 628 re_free (dfa->nodes); 629 630 if (dfa->state_table) 631 for (i = 0; i <= dfa->state_hash_mask; ++i) 632 { 633 struct re_state_table_entry *entry = dfa->state_table + i; 634 for (j = 0; j < entry->num; ++j) 635 { 636 re_dfastate_t *state = entry->array[j]; 637 free_state (state); 638 } 639 re_free (entry->array); 640 } 641 re_free (dfa->state_table); 642#ifdef RE_ENABLE_I18N 643 if (dfa->sb_char != utf8_sb_map) 644 re_free (dfa->sb_char); 645#endif 646 re_free (dfa->subexp_map); 647#ifdef DEBUG 648 re_free (dfa->re_str); 649#endif 650 651 re_free (dfa); 652} 653 654 655/* Free dynamically allocated space used by PREG. */ 656 657void 658regfree (regex_t *preg) 659{ 660 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 661 if (BE (dfa != NULL, 1)) 662 free_dfa_content (dfa); 663 preg->buffer = NULL; 664 preg->allocated = 0; 665 666 re_free (preg->fastmap); 667 preg->fastmap = NULL; 668 669 re_free (preg->translate); 670 preg->translate = NULL; 671} 672#ifdef _LIBC 673weak_alias (__regfree, regfree) 674#endif 675 676/* Entry points compatible with 4.2 BSD regex library. We don't define 677 them unless specifically requested. */ 678 679#if defined _REGEX_RE_COMP || defined _LIBC 680 681/* BSD has one and only one pattern buffer. */ 682static struct re_pattern_buffer re_comp_buf; 683 684char * 685# ifdef _LIBC 686/* Make these definitions weak in libc, so POSIX programs can redefine 687 these names if they don't use our functions, and still use 688 regcomp/regexec above without link errors. */ 689weak_function 690# endif 691re_comp (s) 692 const char *s; 693{ 694 reg_errcode_t ret; 695 char *fastmap; 696 697 if (!s) 698 { 699 if (!re_comp_buf.buffer) 700 return gettext ("No previous regular expression"); 701 return 0; 702 } 703 704 if (re_comp_buf.buffer) 705 { 706 fastmap = re_comp_buf.fastmap; 707 re_comp_buf.fastmap = NULL; 708 __regfree (&re_comp_buf); 709 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 710 re_comp_buf.fastmap = fastmap; 711 } 712 713 if (re_comp_buf.fastmap == NULL) 714 { 715 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 716 if (re_comp_buf.fastmap == NULL) 717 return (char *) gettext (__re_error_msgid 718 + __re_error_msgid_idx[(int) REG_ESPACE]); 719 } 720 721 /* Since `re_exec' always passes NULL for the `regs' argument, we 722 don't need to initialize the pattern buffer fields which affect it. */ 723 724 /* Match anchors at newlines. */ 725 re_comp_buf.newline_anchor = 1; 726 727 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 728 729 if (!ret) 730 return NULL; 731 732 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 733 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 734} 735 736#ifdef _LIBC 737libc_freeres_fn (free_mem) 738{ 739 __regfree (&re_comp_buf); 740} 741#endif 742 743#endif /* _REGEX_RE_COMP */ 744 745/* Internal entry point. 746 Compile the regular expression PATTERN, whose length is LENGTH. 747 SYNTAX indicate regular expression's syntax. */ 748 749static reg_errcode_t 750re_compile_internal (regex_t *preg, const char * pattern, size_t length, 751 reg_syntax_t syntax) 752{ 753 reg_errcode_t err = REG_NOERROR; 754 re_dfa_t *dfa; 755 re_string_t regexp; 756 757 /* Initialize the pattern buffer. */ 758 preg->fastmap_accurate = 0; 759 preg->syntax = syntax; 760 preg->not_bol = preg->not_eol = 0; 761 preg->used = 0; 762 preg->re_nsub = 0; 763 preg->can_be_null = 0; 764 preg->regs_allocated = REGS_UNALLOCATED; 765 766 /* Initialize the dfa. */ 767 dfa = (re_dfa_t *) preg->buffer; 768 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 769 { 770 /* If zero allocated, but buffer is non-null, try to realloc 771 enough space. This loses if buffer's address is bogus, but 772 that is the user's responsibility. If ->buffer is NULL this 773 is a simple allocation. */ 774 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 775 if (dfa == NULL) 776 return REG_ESPACE; 777 preg->allocated = sizeof (re_dfa_t); 778 preg->buffer = (unsigned char *) dfa; 779 } 780 preg->used = sizeof (re_dfa_t); 781 782 err = init_dfa (dfa, length); 783 if (BE (err != REG_NOERROR, 0)) 784 { 785 free_dfa_content (dfa); 786 preg->buffer = NULL; 787 preg->allocated = 0; 788 return err; 789 } 790#ifdef DEBUG 791 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 792 dfa->re_str = re_malloc (char, length + 1); 793 strncpy (dfa->re_str, pattern, length + 1); 794#endif 795 796 __libc_lock_init (dfa->lock); 797 798 err = re_string_construct (&regexp, pattern, length, preg->translate, 799 syntax & RE_ICASE, dfa); 800 if (BE (err != REG_NOERROR, 0)) 801 { 802 re_compile_internal_free_return: 803 free_workarea_compile (preg); 804 re_string_destruct (&regexp); 805 free_dfa_content (dfa); 806 preg->buffer = NULL; 807 preg->allocated = 0; 808 return err; 809 } 810 811 /* Parse the regular expression, and build a structure tree. */ 812 preg->re_nsub = 0; 813 dfa->str_tree = parse (&regexp, preg, syntax, &err); 814 if (BE (dfa->str_tree == NULL, 0)) 815 goto re_compile_internal_free_return; 816 817 /* Analyze the tree and create the nfa. */ 818 err = analyze (preg); 819 if (BE (err != REG_NOERROR, 0)) 820 goto re_compile_internal_free_return; 821 822#ifdef RE_ENABLE_I18N 823 /* If possible, do searching in single byte encoding to speed things up. */ 824 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 825 optimize_utf8 (dfa); 826#endif 827 828 /* Then create the initial state of the dfa. */ 829 err = create_initial_state (dfa); 830 831 /* Release work areas. */ 832 free_workarea_compile (preg); 833 re_string_destruct (&regexp); 834 835 if (BE (err != REG_NOERROR, 0)) 836 { 837 free_dfa_content (dfa); 838 preg->buffer = NULL; 839 preg->allocated = 0; 840 } 841 842 return err; 843} 844 845/* Initialize DFA. We use the length of the regular expression PAT_LEN 846 as the initial length of some arrays. */ 847 848static reg_errcode_t 849init_dfa (re_dfa_t *dfa, size_t pat_len) 850{ 851 unsigned int table_size; 852#ifndef _LIBC 853 const char *codeset_name; 854#endif 855 856 memset (dfa, '\0', sizeof (re_dfa_t)); 857 858 /* Force allocation of str_tree_storage the first time. */ 859 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 860 861 /* Avoid overflows. */ 862 if (pat_len == SIZE_MAX) 863 return REG_ESPACE; 864 865 dfa->nodes_alloc = pat_len + 1; 866 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 867 868 /* table_size = 2 ^ ceil(log pat_len) */ 869 for (table_size = 1; ; table_size <<= 1) 870 if (table_size > pat_len) 871 break; 872 873 dfa->state_table = calloc (table_size, sizeof (struct re_state_table_entry)); 874 dfa->state_hash_mask = table_size - 1; 875 876 dfa->mb_cur_max = MB_CUR_MAX; 877#ifdef _LIBC 878 if (dfa->mb_cur_max == 6 879 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 880 dfa->is_utf8 = 1; 881 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 882 != 0); 883#else 884# ifdef HAVE_LANGINFO_CODESET 885 codeset_name = nl_langinfo (CODESET); 886# else 887 codeset_name = getenv ("LC_ALL"); 888 if (codeset_name == NULL || codeset_name[0] == '\0') 889 codeset_name = getenv ("LC_CTYPE"); 890 if (codeset_name == NULL || codeset_name[0] == '\0') 891 codeset_name = getenv ("LANG"); 892 if (codeset_name == NULL) 893 codeset_name = ""; 894 else if (strchr (codeset_name, '.') != NULL) 895 codeset_name = strchr (codeset_name, '.') + 1; 896# endif 897 898 /* strcasecmp isn't a standard interface. brute force check */ 899#if 0 900 if (strcasecmp (codeset_name, "UTF-8") == 0 901 || strcasecmp (codeset_name, "UTF8") == 0) 902 dfa->is_utf8 = 1; 903#else 904 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u') 905 && (codeset_name[1] == 'T' || codeset_name[1] == 't') 906 && (codeset_name[2] == 'F' || codeset_name[2] == 'f') 907 && (codeset_name[3] == '-' 908 ? codeset_name[4] == '8' && codeset_name[5] == '\0' 909 : codeset_name[3] == '8' && codeset_name[4] == '\0')) 910 dfa->is_utf8 = 1; 911#endif 912 913 /* We check exhaustively in the loop below if this charset is a 914 superset of ASCII. */ 915 dfa->map_notascii = 0; 916#endif 917 918#ifdef RE_ENABLE_I18N 919 if (dfa->mb_cur_max > 1) 920 { 921 if (dfa->is_utf8) 922 { 923#if !defined(__GNUC__) || __GNUC__ < 3 924 static short utf8_sb_map_inited = 0; 925 926 if (! utf8_sb_map_inited) 927 { 928 int i; 929 930 utf8_sb_map_inited = 0; 931 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++) 932 utf8_sb_map[i] = BITSET_WORD_MAX; 933 } 934#endif 935 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 936 } 937 else 938 { 939 int i, j, ch; 940 941 dfa->sb_char = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t)); 942 if (BE (dfa->sb_char == NULL, 0)) 943 return REG_ESPACE; 944 945 /* Set the bits corresponding to single byte chars. */ 946 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 947 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 948 { 949 wint_t wch = __btowc (ch); 950 if (wch != WEOF) 951 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 952# ifndef _LIBC 953 if (isascii (ch) && wch != ch) 954 dfa->map_notascii = 1; 955# endif 956 } 957 } 958 } 959#endif 960 961 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 962 return REG_ESPACE; 963 return REG_NOERROR; 964} 965 966/* Initialize WORD_CHAR table, which indicate which character is 967 "word". In this case "word" means that it is the word construction 968 character used by some operators like "\<", "\>", etc. */ 969 970static void 971internal_function 972init_word_char (re_dfa_t *dfa) 973{ 974 int i, j, ch; 975 dfa->word_ops_used = 1; 976 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 977 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 978 if (isalnum (ch) || ch == '_') 979 dfa->word_char[i] |= (bitset_word_t) 1 << j; 980} 981 982/* Free the work area which are only used while compiling. */ 983 984static void 985free_workarea_compile (regex_t *preg) 986{ 987 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 988 bin_tree_storage_t *storage, *next; 989 for (storage = dfa->str_tree_storage; storage; storage = next) 990 { 991 next = storage->next; 992 re_free (storage); 993 } 994 dfa->str_tree_storage = NULL; 995 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 996 dfa->str_tree = NULL; 997 re_free (dfa->org_indices); 998 dfa->org_indices = NULL; 999} 1000 1001/* Create initial states for all contexts. */ 1002 1003static reg_errcode_t 1004create_initial_state (re_dfa_t *dfa) 1005{ 1006 int first, i; 1007 reg_errcode_t err; 1008 re_node_set init_nodes; 1009 1010 /* Initial states have the epsilon closure of the node which is 1011 the first node of the regular expression. */ 1012 first = dfa->str_tree->first->node_idx; 1013 dfa->init_node = first; 1014 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1015 if (BE (err != REG_NOERROR, 0)) 1016 return err; 1017 1018 /* The back-references which are in initial states can epsilon transit, 1019 since in this case all of the subexpressions can be null. 1020 Then we add epsilon closures of the nodes which are the next nodes of 1021 the back-references. */ 1022 if (dfa->nbackref > 0) 1023 for (i = 0; i < init_nodes.nelem; ++i) 1024 { 1025 int node_idx = init_nodes.elems[i]; 1026 re_token_type_t type = dfa->nodes[node_idx].type; 1027 1028 int clexp_idx; 1029 if (type != OP_BACK_REF) 1030 continue; 1031 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1032 { 1033 re_token_t *clexp_node; 1034 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1035 if (clexp_node->type == OP_CLOSE_SUBEXP 1036 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1037 break; 1038 } 1039 if (clexp_idx == init_nodes.nelem) 1040 continue; 1041 1042 if (type == OP_BACK_REF) 1043 { 1044 int dest_idx = dfa->edests[node_idx].elems[0]; 1045 if (!re_node_set_contains (&init_nodes, dest_idx)) 1046 { 1047 reg_errcode_t err = re_node_set_merge (&init_nodes, 1048 dfa->eclosures 1049 + dest_idx); 1050 if (err != REG_NOERROR) 1051 return err; 1052 i = 0; 1053 } 1054 } 1055 } 1056 1057 /* It must be the first time to invoke acquire_state. */ 1058 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1059 /* We don't check ERR here, since the initial state must not be NULL. */ 1060 if (BE (dfa->init_state == NULL, 0)) 1061 return err; 1062 if (dfa->init_state->has_constraint) 1063 { 1064 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1065 CONTEXT_WORD); 1066 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1067 CONTEXT_NEWLINE); 1068 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1069 &init_nodes, 1070 CONTEXT_NEWLINE 1071 | CONTEXT_BEGBUF); 1072 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1073 || dfa->init_state_begbuf == NULL, 0)) 1074 return err; 1075 } 1076 else 1077 dfa->init_state_word = dfa->init_state_nl 1078 = dfa->init_state_begbuf = dfa->init_state; 1079 1080 re_node_set_free (&init_nodes); 1081 return REG_NOERROR; 1082} 1083 1084#ifdef RE_ENABLE_I18N 1085/* If it is possible to do searching in single byte encoding instead of UTF-8 1086 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1087 DFA nodes where needed. */ 1088 1089static void 1090optimize_utf8 (re_dfa_t *dfa) 1091{ 1092 int node, i, mb_chars = 0, has_period = 0; 1093 1094 for (node = 0; node < dfa->nodes_len; ++node) 1095 switch (dfa->nodes[node].type) 1096 { 1097 case CHARACTER: 1098 if (dfa->nodes[node].opr.c >= 0x80) 1099 mb_chars = 1; 1100 break; 1101 case ANCHOR: 1102 switch (dfa->nodes[node].opr.ctx_type) 1103 { 1104 case LINE_FIRST: 1105 case LINE_LAST: 1106 case BUF_FIRST: 1107 case BUF_LAST: 1108 break; 1109 default: 1110 /* Word anchors etc. cannot be handled. It's okay to test 1111 opr.ctx_type since constraints (for all DFA nodes) are 1112 created by ORing one or more opr.ctx_type values. */ 1113 return; 1114 } 1115 break; 1116 case OP_PERIOD: 1117 has_period = 1; 1118 break; 1119 case OP_BACK_REF: 1120 case OP_ALT: 1121 case END_OF_RE: 1122 case OP_DUP_ASTERISK: 1123 case OP_OPEN_SUBEXP: 1124 case OP_CLOSE_SUBEXP: 1125 break; 1126 case COMPLEX_BRACKET: 1127 return; 1128 case SIMPLE_BRACKET: 1129 /* Just double check. The non-ASCII range starts at 0x80. */ 1130 assert (0x80 % BITSET_WORD_BITS == 0); 1131 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1132 if (dfa->nodes[node].opr.sbcset[i]) 1133 return; 1134 break; 1135 default: 1136 abort (); 1137 } 1138 1139 if (mb_chars || has_period) 1140 for (node = 0; node < dfa->nodes_len; ++node) 1141 { 1142 if (dfa->nodes[node].type == CHARACTER 1143 && dfa->nodes[node].opr.c >= 0x80) 1144 dfa->nodes[node].mb_partial = 0; 1145 else if (dfa->nodes[node].type == OP_PERIOD) 1146 dfa->nodes[node].type = OP_UTF8_PERIOD; 1147 } 1148 1149 /* The search can be in single byte locale. */ 1150 dfa->mb_cur_max = 1; 1151 dfa->is_utf8 = 0; 1152 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1153} 1154#endif 1155 1156/* Analyze the structure tree, and calculate "first", "next", "edest", 1157 "eclosure", and "inveclosure". */ 1158 1159static reg_errcode_t 1160analyze (regex_t *preg) 1161{ 1162 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1163 reg_errcode_t ret; 1164 1165 /* Allocate arrays. */ 1166 dfa->nexts = re_malloc (int, dfa->nodes_alloc); 1167 dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 1168 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1169 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1170 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1171 || dfa->eclosures == NULL, 0)) 1172 return REG_ESPACE; 1173 1174 dfa->subexp_map = re_malloc (int, preg->re_nsub); 1175 if (dfa->subexp_map != NULL) 1176 { 1177 int i; 1178 for (i = 0; i < preg->re_nsub; i++) 1179 dfa->subexp_map[i] = i; 1180 preorder (dfa->str_tree, optimize_subexps, dfa); 1181 for (i = 0; i < preg->re_nsub; i++) 1182 if (dfa->subexp_map[i] != i) 1183 break; 1184 if (i == preg->re_nsub) 1185 { 1186 free (dfa->subexp_map); 1187 dfa->subexp_map = NULL; 1188 } 1189 } 1190 1191 ret = postorder (dfa->str_tree, lower_subexps, preg); 1192 if (BE (ret != REG_NOERROR, 0)) 1193 return ret; 1194 ret = postorder (dfa->str_tree, calc_first, dfa); 1195 if (BE (ret != REG_NOERROR, 0)) 1196 return ret; 1197 preorder (dfa->str_tree, calc_next, dfa); 1198 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1199 if (BE (ret != REG_NOERROR, 0)) 1200 return ret; 1201 ret = calc_eclosure (dfa); 1202 if (BE (ret != REG_NOERROR, 0)) 1203 return ret; 1204 1205 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1206 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1207 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1208 || dfa->nbackref) 1209 { 1210 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1211 if (BE (dfa->inveclosures == NULL, 0)) 1212 return REG_ESPACE; 1213 ret = calc_inveclosure (dfa); 1214 } 1215 1216 return ret; 1217} 1218 1219/* Our parse trees are very unbalanced, so we cannot use a stack to 1220 implement parse tree visits. Instead, we use parent pointers and 1221 some hairy code in these two functions. */ 1222static reg_errcode_t 1223postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1224 void *extra) 1225{ 1226 bin_tree_t *node, *prev; 1227 1228 for (node = root; ; ) 1229 { 1230 /* Descend down the tree, preferably to the left (or to the right 1231 if that's the only child). */ 1232 while (node->left || node->right) 1233 if (node->left) 1234 node = node->left; 1235 else 1236 node = node->right; 1237 1238 do 1239 { 1240 reg_errcode_t err = fn (extra, node); 1241 if (BE (err != REG_NOERROR, 0)) 1242 return err; 1243 if (node->parent == NULL) 1244 return REG_NOERROR; 1245 prev = node; 1246 node = node->parent; 1247 } 1248 /* Go up while we have a node that is reached from the right. */ 1249 while (node->right == prev || node->right == NULL); 1250 node = node->right; 1251 } 1252} 1253 1254static reg_errcode_t 1255preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1256 void *extra) 1257{ 1258 bin_tree_t *node; 1259 1260 for (node = root; ; ) 1261 { 1262 reg_errcode_t err = fn (extra, node); 1263 if (BE (err != REG_NOERROR, 0)) 1264 return err; 1265 1266 /* Go to the left node, or up and to the right. */ 1267 if (node->left) 1268 node = node->left; 1269 else 1270 { 1271 bin_tree_t *prev = NULL; 1272 while (node->right == prev || node->right == NULL) 1273 { 1274 prev = node; 1275 node = node->parent; 1276 if (!node) 1277 return REG_NOERROR; 1278 } 1279 node = node->right; 1280 } 1281 } 1282} 1283 1284/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1285 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1286 backreferences as well. Requires a preorder visit. */ 1287static reg_errcode_t 1288optimize_subexps (void *extra, bin_tree_t *node) 1289{ 1290 re_dfa_t *dfa = (re_dfa_t *) extra; 1291 1292 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1293 { 1294 int idx = node->token.opr.idx; 1295 node->token.opr.idx = dfa->subexp_map[idx]; 1296 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1297 } 1298 1299 else if (node->token.type == SUBEXP 1300 && node->left && node->left->token.type == SUBEXP) 1301 { 1302 int other_idx = node->left->token.opr.idx; 1303 1304 node->left = node->left->left; 1305 if (node->left) 1306 node->left->parent = node; 1307 1308 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1309 if (other_idx < BITSET_WORD_BITS) 1310 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1311 } 1312 1313 return REG_NOERROR; 1314} 1315 1316/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1317 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1318static reg_errcode_t 1319lower_subexps (void *extra, bin_tree_t *node) 1320{ 1321 regex_t *preg = (regex_t *) extra; 1322 reg_errcode_t err = REG_NOERROR; 1323 1324 if (node->left && node->left->token.type == SUBEXP) 1325 { 1326 node->left = lower_subexp (&err, preg, node->left); 1327 if (node->left) 1328 node->left->parent = node; 1329 } 1330 if (node->right && node->right->token.type == SUBEXP) 1331 { 1332 node->right = lower_subexp (&err, preg, node->right); 1333 if (node->right) 1334 node->right->parent = node; 1335 } 1336 1337 return err; 1338} 1339 1340static bin_tree_t * 1341lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1342{ 1343 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1344 bin_tree_t *body = node->left; 1345 bin_tree_t *op, *cls, *tree1, *tree; 1346 1347 if (preg->no_sub 1348 /* We do not optimize empty subexpressions, because otherwise we may 1349 have bad CONCAT nodes with NULL children. This is obviously not 1350 very common, so we do not lose much. An example that triggers 1351 this case is the sed "script" /\(\)/x. */ 1352 && node->left != NULL 1353 && (node->token.opr.idx >= BITSET_WORD_BITS 1354 || !(dfa->used_bkref_map 1355 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1356 return node->left; 1357 1358 /* Convert the SUBEXP node to the concatenation of an 1359 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1360 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1361 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1362 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1363 tree = create_tree (dfa, op, tree1, CONCAT); 1364 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1365 { 1366 *err = REG_ESPACE; 1367 return NULL; 1368 } 1369 1370 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1371 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1372 return tree; 1373} 1374 1375/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1376 nodes. Requires a postorder visit. */ 1377static reg_errcode_t 1378calc_first (void *extra, bin_tree_t *node) 1379{ 1380 re_dfa_t *dfa = (re_dfa_t *) extra; 1381 if (node->token.type == CONCAT) 1382 { 1383 node->first = node->left->first; 1384 node->node_idx = node->left->node_idx; 1385 } 1386 else 1387 { 1388 node->first = node; 1389 node->node_idx = re_dfa_add_node (dfa, node->token); 1390 if (BE (node->node_idx == -1, 0)) 1391 return REG_ESPACE; 1392 if (node->token.type == ANCHOR) 1393 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1394 } 1395 return REG_NOERROR; 1396} 1397 1398/* Pass 2: compute NEXT on the tree. Preorder visit. */ 1399static reg_errcode_t 1400calc_next (void *extra, bin_tree_t *node) 1401{ 1402 switch (node->token.type) 1403 { 1404 case OP_DUP_ASTERISK: 1405 node->left->next = node; 1406 break; 1407 case CONCAT: 1408 node->left->next = node->right->first; 1409 node->right->next = node->next; 1410 break; 1411 default: 1412 if (node->left) 1413 node->left->next = node->next; 1414 if (node->right) 1415 node->right->next = node->next; 1416 break; 1417 } 1418 return REG_NOERROR; 1419} 1420 1421/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1422static reg_errcode_t 1423link_nfa_nodes (void *extra, bin_tree_t *node) 1424{ 1425 re_dfa_t *dfa = (re_dfa_t *) extra; 1426 int idx = node->node_idx; 1427 reg_errcode_t err = REG_NOERROR; 1428 1429 switch (node->token.type) 1430 { 1431 case CONCAT: 1432 break; 1433 1434 case END_OF_RE: 1435 assert (node->next == NULL); 1436 break; 1437 1438 case OP_DUP_ASTERISK: 1439 case OP_ALT: 1440 { 1441 int left, right; 1442 dfa->has_plural_match = 1; 1443 if (node->left != NULL) 1444 left = node->left->first->node_idx; 1445 else 1446 left = node->next->node_idx; 1447 if (node->right != NULL) 1448 right = node->right->first->node_idx; 1449 else 1450 right = node->next->node_idx; 1451 assert (left > -1); 1452 assert (right > -1); 1453 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1454 } 1455 break; 1456 1457 case ANCHOR: 1458 case OP_OPEN_SUBEXP: 1459 case OP_CLOSE_SUBEXP: 1460 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1461 break; 1462 1463 case OP_BACK_REF: 1464 dfa->nexts[idx] = node->next->node_idx; 1465 if (node->token.type == OP_BACK_REF) 1466 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1467 break; 1468 1469 default: 1470 assert (!IS_EPSILON_NODE (node->token.type)); 1471 dfa->nexts[idx] = node->next->node_idx; 1472 break; 1473 } 1474 1475 return err; 1476} 1477 1478/* Duplicate the epsilon closure of the node ROOT_NODE. 1479 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1480 to their own constraint. */ 1481 1482static reg_errcode_t 1483internal_function 1484duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, 1485 int root_node, unsigned int init_constraint) 1486{ 1487 int org_node, clone_node, ret; 1488 unsigned int constraint = init_constraint; 1489 for (org_node = top_org_node, clone_node = top_clone_node;;) 1490 { 1491 int org_dest, clone_dest; 1492 if (dfa->nodes[org_node].type == OP_BACK_REF) 1493 { 1494 /* If the back reference epsilon-transit, its destination must 1495 also have the constraint. Then duplicate the epsilon closure 1496 of the destination of the back reference, and store it in 1497 edests of the back reference. */ 1498 org_dest = dfa->nexts[org_node]; 1499 re_node_set_empty (dfa->edests + clone_node); 1500 clone_dest = duplicate_node (dfa, org_dest, constraint); 1501 if (BE (clone_dest == -1, 0)) 1502 return REG_ESPACE; 1503 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1504 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1505 if (BE (ret < 0, 0)) 1506 return REG_ESPACE; 1507 } 1508 else if (dfa->edests[org_node].nelem == 0) 1509 { 1510 /* In case of the node can't epsilon-transit, don't duplicate the 1511 destination and store the original destination as the 1512 destination of the node. */ 1513 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1514 break; 1515 } 1516 else if (dfa->edests[org_node].nelem == 1) 1517 { 1518 /* In case of the node can epsilon-transit, and it has only one 1519 destination. */ 1520 org_dest = dfa->edests[org_node].elems[0]; 1521 re_node_set_empty (dfa->edests + clone_node); 1522 /* If the node is root_node itself, it means the epsilon clsoure 1523 has a loop. Then tie it to the destination of the root_node. */ 1524 if (org_node == root_node && clone_node != org_node) 1525 { 1526 ret = re_node_set_insert (dfa->edests + clone_node, org_dest); 1527 if (BE (ret < 0, 0)) 1528 return REG_ESPACE; 1529 break; 1530 } 1531 /* In case of the node has another constraint, add it. */ 1532 constraint |= dfa->nodes[org_node].constraint; 1533 clone_dest = duplicate_node (dfa, org_dest, constraint); 1534 if (BE (clone_dest == -1, 0)) 1535 return REG_ESPACE; 1536 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1537 if (BE (ret < 0, 0)) 1538 return REG_ESPACE; 1539 } 1540 else /* dfa->edests[org_node].nelem == 2 */ 1541 { 1542 /* In case of the node can epsilon-transit, and it has two 1543 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1544 org_dest = dfa->edests[org_node].elems[0]; 1545 re_node_set_empty (dfa->edests + clone_node); 1546 /* Search for a duplicated node which satisfies the constraint. */ 1547 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1548 if (clone_dest == -1) 1549 { 1550 /* There is no such duplicated node, create a new one. */ 1551 reg_errcode_t err; 1552 clone_dest = duplicate_node (dfa, org_dest, constraint); 1553 if (BE (clone_dest == -1, 0)) 1554 return REG_ESPACE; 1555 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1556 if (BE (ret < 0, 0)) 1557 return REG_ESPACE; 1558 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1559 root_node, constraint); 1560 if (BE (err != REG_NOERROR, 0)) 1561 return err; 1562 } 1563 else 1564 { 1565 /* There is a duplicated node which satisfies the constraint, 1566 use it to avoid infinite loop. */ 1567 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1568 if (BE (ret < 0, 0)) 1569 return REG_ESPACE; 1570 } 1571 1572 org_dest = dfa->edests[org_node].elems[1]; 1573 clone_dest = duplicate_node (dfa, org_dest, constraint); 1574 if (BE (clone_dest == -1, 0)) 1575 return REG_ESPACE; 1576 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1577 if (BE (ret < 0, 0)) 1578 return REG_ESPACE; 1579 } 1580 org_node = org_dest; 1581 clone_node = clone_dest; 1582 } 1583 return REG_NOERROR; 1584} 1585 1586/* Search for a node which is duplicated from the node ORG_NODE, and 1587 satisfies the constraint CONSTRAINT. */ 1588 1589static int 1590search_duplicated_node (const re_dfa_t *dfa, int org_node, 1591 unsigned int constraint) 1592{ 1593 int idx; 1594 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1595 { 1596 if (org_node == dfa->org_indices[idx] 1597 && constraint == dfa->nodes[idx].constraint) 1598 return idx; /* Found. */ 1599 } 1600 return -1; /* Not found. */ 1601} 1602 1603/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1604 Return the index of the new node, or -1 if insufficient storage is 1605 available. */ 1606 1607static int 1608duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) 1609{ 1610 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1611 if (BE (dup_idx != -1, 1)) 1612 { 1613 dfa->nodes[dup_idx].constraint = constraint; 1614 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1615 dfa->nodes[dup_idx].duplicated = 1; 1616 1617 /* Store the index of the original node. */ 1618 dfa->org_indices[dup_idx] = org_idx; 1619 } 1620 return dup_idx; 1621} 1622 1623static reg_errcode_t 1624calc_inveclosure (re_dfa_t *dfa) 1625{ 1626 int src, idx, ret; 1627 for (idx = 0; idx < dfa->nodes_len; ++idx) 1628 re_node_set_init_empty (dfa->inveclosures + idx); 1629 1630 for (src = 0; src < dfa->nodes_len; ++src) 1631 { 1632 int *elems = dfa->eclosures[src].elems; 1633 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1634 { 1635 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1636 if (BE (ret == -1, 0)) 1637 return REG_ESPACE; 1638 } 1639 } 1640 1641 return REG_NOERROR; 1642} 1643 1644/* Calculate "eclosure" for all the node in DFA. */ 1645 1646static reg_errcode_t 1647calc_eclosure (re_dfa_t *dfa) 1648{ 1649 int node_idx, incomplete; 1650#ifdef DEBUG 1651 assert (dfa->nodes_len > 0); 1652#endif 1653 incomplete = 0; 1654 /* For each nodes, calculate epsilon closure. */ 1655 for (node_idx = 0; ; ++node_idx) 1656 { 1657 reg_errcode_t err; 1658 re_node_set eclosure_elem; 1659 if (node_idx == dfa->nodes_len) 1660 { 1661 if (!incomplete) 1662 break; 1663 incomplete = 0; 1664 node_idx = 0; 1665 } 1666 1667#ifdef DEBUG 1668 assert (dfa->eclosures[node_idx].nelem != -1); 1669#endif 1670 1671 /* If we have already calculated, skip it. */ 1672 if (dfa->eclosures[node_idx].nelem != 0) 1673 continue; 1674 /* Calculate epsilon closure of `node_idx'. */ 1675 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1676 if (BE (err != REG_NOERROR, 0)) 1677 return err; 1678 1679 if (dfa->eclosures[node_idx].nelem == 0) 1680 { 1681 incomplete = 1; 1682 re_node_set_free (&eclosure_elem); 1683 } 1684 } 1685 return REG_NOERROR; 1686} 1687 1688/* Calculate epsilon closure of NODE. */ 1689 1690static reg_errcode_t 1691calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) 1692{ 1693 reg_errcode_t err; 1694 int i; 1695 re_node_set eclosure; 1696 int ret; 1697 int incomplete = 0; 1698 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1699 if (BE (err != REG_NOERROR, 0)) 1700 return err; 1701 1702 /* This indicates that we are calculating this node now. 1703 We reference this value to avoid infinite loop. */ 1704 dfa->eclosures[node].nelem = -1; 1705 1706 /* If the current node has constraints, duplicate all nodes 1707 since they must inherit the constraints. */ 1708 if (dfa->nodes[node].constraint 1709 && dfa->edests[node].nelem 1710 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1711 { 1712 err = duplicate_node_closure (dfa, node, node, node, 1713 dfa->nodes[node].constraint); 1714 if (BE (err != REG_NOERROR, 0)) 1715 return err; 1716 } 1717 1718 /* Expand each epsilon destination nodes. */ 1719 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1720 for (i = 0; i < dfa->edests[node].nelem; ++i) 1721 { 1722 re_node_set eclosure_elem; 1723 int edest = dfa->edests[node].elems[i]; 1724 /* If calculating the epsilon closure of `edest' is in progress, 1725 return intermediate result. */ 1726 if (dfa->eclosures[edest].nelem == -1) 1727 { 1728 incomplete = 1; 1729 continue; 1730 } 1731 /* If we haven't calculated the epsilon closure of `edest' yet, 1732 calculate now. Otherwise use calculated epsilon closure. */ 1733 if (dfa->eclosures[edest].nelem == 0) 1734 { 1735 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1736 if (BE (err != REG_NOERROR, 0)) 1737 return err; 1738 } 1739 else 1740 eclosure_elem = dfa->eclosures[edest]; 1741 /* Merge the epsilon closure of `edest'. */ 1742 err = re_node_set_merge (&eclosure, &eclosure_elem); 1743 if (BE (err != REG_NOERROR, 0)) 1744 return err; 1745 /* If the epsilon closure of `edest' is incomplete, 1746 the epsilon closure of this node is also incomplete. */ 1747 if (dfa->eclosures[edest].nelem == 0) 1748 { 1749 incomplete = 1; 1750 re_node_set_free (&eclosure_elem); 1751 } 1752 } 1753 1754 /* An epsilon closure includes itself. */ 1755 ret = re_node_set_insert (&eclosure, node); 1756 if (BE (ret < 0, 0)) 1757 return REG_ESPACE; 1758 if (incomplete && !root) 1759 dfa->eclosures[node].nelem = 0; 1760 else 1761 dfa->eclosures[node] = eclosure; 1762 *new_set = eclosure; 1763 return REG_NOERROR; 1764} 1765 1766/* Functions for token which are used in the parser. */ 1767 1768/* Fetch a token from INPUT. 1769 We must not use this function inside bracket expressions. */ 1770 1771static void 1772internal_function 1773fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1774{ 1775 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1776} 1777 1778/* Peek a token from INPUT, and return the length of the token. 1779 We must not use this function inside bracket expressions. */ 1780 1781static int 1782internal_function 1783peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1784{ 1785 unsigned char c; 1786 1787 if (re_string_eoi (input)) 1788 { 1789 token->type = END_OF_RE; 1790 return 0; 1791 } 1792 1793 c = re_string_peek_byte (input, 0); 1794 token->opr.c = c; 1795 1796 token->word_char = 0; 1797#ifdef RE_ENABLE_I18N 1798 token->mb_partial = 0; 1799 if (input->mb_cur_max > 1 && 1800 !re_string_first_byte (input, re_string_cur_idx (input))) 1801 { 1802 token->type = CHARACTER; 1803 token->mb_partial = 1; 1804 return 1; 1805 } 1806#endif 1807 if (c == '\\') 1808 { 1809 unsigned char c2; 1810 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1811 { 1812 token->type = BACK_SLASH; 1813 return 1; 1814 } 1815 1816 c2 = re_string_peek_byte_case (input, 1); 1817 token->opr.c = c2; 1818 token->type = CHARACTER; 1819#ifdef RE_ENABLE_I18N 1820 if (input->mb_cur_max > 1) 1821 { 1822 wint_t wc = re_string_wchar_at (input, 1823 re_string_cur_idx (input) + 1); 1824 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1825 } 1826 else 1827#endif 1828 token->word_char = IS_WORD_CHAR (c2) != 0; 1829 1830 switch (c2) 1831 { 1832 case '|': 1833 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1834 token->type = OP_ALT; 1835 break; 1836 case '1': case '2': case '3': case '4': case '5': 1837 case '6': case '7': case '8': case '9': 1838 if (!(syntax & RE_NO_BK_REFS)) 1839 { 1840 token->type = OP_BACK_REF; 1841 token->opr.idx = c2 - '1'; 1842 } 1843 break; 1844 case '<': 1845 if (!(syntax & RE_NO_GNU_OPS)) 1846 { 1847 token->type = ANCHOR; 1848 token->opr.ctx_type = WORD_FIRST; 1849 } 1850 break; 1851 case '>': 1852 if (!(syntax & RE_NO_GNU_OPS)) 1853 { 1854 token->type = ANCHOR; 1855 token->opr.ctx_type = WORD_LAST; 1856 } 1857 break; 1858 case 'b': 1859 if (!(syntax & RE_NO_GNU_OPS)) 1860 { 1861 token->type = ANCHOR; 1862 token->opr.ctx_type = WORD_DELIM; 1863 } 1864 break; 1865 case 'B': 1866 if (!(syntax & RE_NO_GNU_OPS)) 1867 { 1868 token->type = ANCHOR; 1869 token->opr.ctx_type = NOT_WORD_DELIM; 1870 } 1871 break; 1872 case 'w': 1873 if (!(syntax & RE_NO_GNU_OPS)) 1874 token->type = OP_WORD; 1875 break; 1876 case 'W': 1877 if (!(syntax & RE_NO_GNU_OPS)) 1878 token->type = OP_NOTWORD; 1879 break; 1880 case 's': 1881 if (!(syntax & RE_NO_GNU_OPS)) 1882 token->type = OP_SPACE; 1883 break; 1884 case 'S': 1885 if (!(syntax & RE_NO_GNU_OPS)) 1886 token->type = OP_NOTSPACE; 1887 break; 1888 case '`': 1889 if (!(syntax & RE_NO_GNU_OPS)) 1890 { 1891 token->type = ANCHOR; 1892 token->opr.ctx_type = BUF_FIRST; 1893 } 1894 break; 1895 case '\'': 1896 if (!(syntax & RE_NO_GNU_OPS)) 1897 { 1898 token->type = ANCHOR; 1899 token->opr.ctx_type = BUF_LAST; 1900 } 1901 break; 1902 case '(': 1903 if (!(syntax & RE_NO_BK_PARENS)) 1904 token->type = OP_OPEN_SUBEXP; 1905 break; 1906 case ')': 1907 if (!(syntax & RE_NO_BK_PARENS)) 1908 token->type = OP_CLOSE_SUBEXP; 1909 break; 1910 case '+': 1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1912 token->type = OP_DUP_PLUS; 1913 break; 1914 case '?': 1915 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1916 token->type = OP_DUP_QUESTION; 1917 break; 1918 case '{': 1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1920 token->type = OP_OPEN_DUP_NUM; 1921 break; 1922 case '}': 1923 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1924 token->type = OP_CLOSE_DUP_NUM; 1925 break; 1926 default: 1927 break; 1928 } 1929 return 2; 1930 } 1931 1932 token->type = CHARACTER; 1933#ifdef RE_ENABLE_I18N 1934 if (input->mb_cur_max > 1) 1935 { 1936 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1937 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1938 } 1939 else 1940#endif 1941 token->word_char = IS_WORD_CHAR (token->opr.c); 1942 1943 switch (c) 1944 { 1945 case '\n': 1946 if (syntax & RE_NEWLINE_ALT) 1947 token->type = OP_ALT; 1948 break; 1949 case '|': 1950 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1951 token->type = OP_ALT; 1952 break; 1953 case '*': 1954 token->type = OP_DUP_ASTERISK; 1955 break; 1956 case '+': 1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1958 token->type = OP_DUP_PLUS; 1959 break; 1960 case '?': 1961 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1962 token->type = OP_DUP_QUESTION; 1963 break; 1964 case '{': 1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1966 token->type = OP_OPEN_DUP_NUM; 1967 break; 1968 case '}': 1969 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1970 token->type = OP_CLOSE_DUP_NUM; 1971 break; 1972 case '(': 1973 if (syntax & RE_NO_BK_PARENS) 1974 token->type = OP_OPEN_SUBEXP; 1975 break; 1976 case ')': 1977 if (syntax & RE_NO_BK_PARENS) 1978 token->type = OP_CLOSE_SUBEXP; 1979 break; 1980 case '[': 1981 token->type = OP_OPEN_BRACKET; 1982 break; 1983 case '.': 1984 token->type = OP_PERIOD; 1985 break; 1986 case '^': 1987 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1988 re_string_cur_idx (input) != 0) 1989 { 1990 char prev = re_string_peek_byte (input, -1); 1991 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1992 break; 1993 } 1994 token->type = ANCHOR; 1995 token->opr.ctx_type = LINE_FIRST; 1996 break; 1997 case '$': 1998 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1999 re_string_cur_idx (input) + 1 != re_string_length (input)) 2000 { 2001 re_token_t next; 2002 re_string_skip_bytes (input, 1); 2003 peek_token (&next, input, syntax); 2004 re_string_skip_bytes (input, -1); 2005 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2006 break; 2007 } 2008 token->type = ANCHOR; 2009 token->opr.ctx_type = LINE_LAST; 2010 break; 2011 default: 2012 break; 2013 } 2014 return 1; 2015} 2016 2017/* Peek a token from INPUT, and return the length of the token. 2018 We must not use this function out of bracket expressions. */ 2019 2020static int 2021internal_function 2022peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2023{ 2024 unsigned char c; 2025 if (re_string_eoi (input)) 2026 { 2027 token->type = END_OF_RE; 2028 return 0; 2029 } 2030 c = re_string_peek_byte (input, 0); 2031 token->opr.c = c; 2032 2033#ifdef RE_ENABLE_I18N 2034 if (input->mb_cur_max > 1 && 2035 !re_string_first_byte (input, re_string_cur_idx (input))) 2036 { 2037 token->type = CHARACTER; 2038 return 1; 2039 } 2040#endif /* RE_ENABLE_I18N */ 2041 2042 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2043 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2044 { 2045 /* In this case, '\' escape a character. */ 2046 unsigned char c2; 2047 re_string_skip_bytes (input, 1); 2048 c2 = re_string_peek_byte (input, 0); 2049 token->opr.c = c2; 2050 token->type = CHARACTER; 2051 return 1; 2052 } 2053 if (c == '[') /* '[' is a special char in a bracket exps. */ 2054 { 2055 unsigned char c2; 2056 int token_len; 2057 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2058 c2 = re_string_peek_byte (input, 1); 2059 else 2060 c2 = 0; 2061 token->opr.c = c2; 2062 token_len = 2; 2063 switch (c2) 2064 { 2065 case '.': 2066 token->type = OP_OPEN_COLL_ELEM; 2067 break; 2068 case '=': 2069 token->type = OP_OPEN_EQUIV_CLASS; 2070 break; 2071 case ':': 2072 if (syntax & RE_CHAR_CLASSES) 2073 { 2074 token->type = OP_OPEN_CHAR_CLASS; 2075 break; 2076 } 2077 /* else fall through. */ 2078 default: 2079 token->type = CHARACTER; 2080 token->opr.c = c; 2081 token_len = 1; 2082 break; 2083 } 2084 return token_len; 2085 } 2086 switch (c) 2087 { 2088 case '-': 2089 token->type = OP_CHARSET_RANGE; 2090 break; 2091 case ']': 2092 token->type = OP_CLOSE_BRACKET; 2093 break; 2094 case '^': 2095 token->type = OP_NON_MATCH_LIST; 2096 break; 2097 default: 2098 token->type = CHARACTER; 2099 } 2100 return 1; 2101} 2102 2103/* Functions for parser. */ 2104 2105/* Entry point of the parser. 2106 Parse the regular expression REGEXP and return the structure tree. 2107 If an error has occurred, ERR is set by error code, and return NULL. 2108 This function build the following tree, from regular expression <reg_exp>: 2109 CAT 2110 / \ 2111 / \ 2112 <reg_exp> EOR 2113 2114 CAT means concatenation. 2115 EOR means end of regular expression. */ 2116 2117static bin_tree_t * 2118parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2119 reg_errcode_t *err) 2120{ 2121 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2122 bin_tree_t *tree, *eor, *root; 2123 re_token_t current_token; 2124 dfa->syntax = syntax; 2125 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2126 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err); 2127 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2128 return NULL; 2129 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2130 if (tree != NULL) 2131 root = create_tree (dfa, tree, eor, CONCAT); 2132 else 2133 root = eor; 2134 if (BE (eor == NULL || root == NULL, 0)) 2135 { 2136 *err = REG_ESPACE; 2137 return NULL; 2138 } 2139 return root; 2140} 2141 2142/* This function build the following tree, from regular expression 2143 <branch1>|<branch2>: 2144 ALT 2145 / \ 2146 / \ 2147 <branch1> <branch2> 2148 2149 ALT means alternative, which represents the operator `|'. */ 2150 2151static bin_tree_t * 2152parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2153 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2154{ 2155 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2156 bin_tree_t *tree, *branch = NULL; 2157 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2158 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2159 return NULL; 2160 2161 while (token->type == OP_ALT) 2162 { 2163 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2164 if (token->type != OP_ALT && token->type != END_OF_RE 2165 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2166 { 2167 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2168 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2169 return NULL; 2170 } 2171 else 2172 branch = NULL; 2173 tree = create_tree (dfa, tree, branch, OP_ALT); 2174 if (BE (tree == NULL, 0)) 2175 { 2176 *err = REG_ESPACE; 2177 return NULL; 2178 } 2179 } 2180 return tree; 2181} 2182 2183/* This function build the following tree, from regular expression 2184 <exp1><exp2>: 2185 CAT 2186 / \ 2187 / \ 2188 <exp1> <exp2> 2189 2190 CAT means concatenation. */ 2191 2192static bin_tree_t * 2193parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2194 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2195{ 2196 bin_tree_t *tree, *exp; 2197 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2198 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2199 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2200 return NULL; 2201 2202 while (token->type != OP_ALT && token->type != END_OF_RE 2203 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2204 { 2205 exp = parse_expression (regexp, preg, token, syntax, nest, err); 2206 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2207 { 2208 return NULL; 2209 } 2210 if (tree != NULL && exp != NULL) 2211 { 2212 tree = create_tree (dfa, tree, exp, CONCAT); 2213 if (tree == NULL) 2214 { 2215 *err = REG_ESPACE; 2216 return NULL; 2217 } 2218 } 2219 else if (tree == NULL) 2220 tree = exp; 2221 /* Otherwise exp == NULL, we don't need to create new tree. */ 2222 } 2223 return tree; 2224} 2225 2226/* This function build the following tree, from regular expression a*: 2227 * 2228 | 2229 a 2230*/ 2231 2232static bin_tree_t * 2233parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2234 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2235{ 2236 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2237 bin_tree_t *tree; 2238 switch (token->type) 2239 { 2240 case CHARACTER: 2241 tree = create_token_tree (dfa, NULL, NULL, token); 2242 if (BE (tree == NULL, 0)) 2243 { 2244 *err = REG_ESPACE; 2245 return NULL; 2246 } 2247#ifdef RE_ENABLE_I18N 2248 if (dfa->mb_cur_max > 1) 2249 { 2250 while (!re_string_eoi (regexp) 2251 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2252 { 2253 bin_tree_t *mbc_remain; 2254 fetch_token (token, regexp, syntax); 2255 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2256 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2257 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2258 { 2259 *err = REG_ESPACE; 2260 return NULL; 2261 } 2262 } 2263 } 2264#endif 2265 break; 2266 case OP_OPEN_SUBEXP: 2267 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2268 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2269 return NULL; 2270 break; 2271 case OP_OPEN_BRACKET: 2272 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2273 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2274 return NULL; 2275 break; 2276 case OP_BACK_REF: 2277 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2278 { 2279 *err = REG_ESUBREG; 2280 return NULL; 2281 } 2282 dfa->used_bkref_map |= 1 << token->opr.idx; 2283 tree = create_token_tree (dfa, NULL, NULL, token); 2284 if (BE (tree == NULL, 0)) 2285 { 2286 *err = REG_ESPACE; 2287 return NULL; 2288 } 2289 ++dfa->nbackref; 2290 dfa->has_mb_node = 1; 2291 break; 2292 case OP_OPEN_DUP_NUM: 2293 if (syntax & RE_CONTEXT_INVALID_DUP) 2294 { 2295 *err = REG_BADRPT; 2296 return NULL; 2297 } 2298 /* FALLTHROUGH */ 2299 case OP_DUP_ASTERISK: 2300 case OP_DUP_PLUS: 2301 case OP_DUP_QUESTION: 2302 if (syntax & RE_CONTEXT_INVALID_OPS) 2303 { 2304 *err = REG_BADRPT; 2305 return NULL; 2306 } 2307 else if (syntax & RE_CONTEXT_INDEP_OPS) 2308 { 2309 fetch_token (token, regexp, syntax); 2310 return parse_expression (regexp, preg, token, syntax, nest, err); 2311 } 2312 /* else fall through */ 2313 case OP_CLOSE_SUBEXP: 2314 if ((token->type == OP_CLOSE_SUBEXP) && 2315 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2316 { 2317 *err = REG_ERPAREN; 2318 return NULL; 2319 } 2320 /* else fall through */ 2321 case OP_CLOSE_DUP_NUM: 2322 /* We treat it as a normal character. */ 2323 2324 /* Then we can these characters as normal characters. */ 2325 token->type = CHARACTER; 2326 /* mb_partial and word_char bits should be initialized already 2327 by peek_token. */ 2328 tree = create_token_tree (dfa, NULL, NULL, token); 2329 if (BE (tree == NULL, 0)) 2330 { 2331 *err = REG_ESPACE; 2332 return NULL; 2333 } 2334 break; 2335 case ANCHOR: 2336 if ((token->opr.ctx_type 2337 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2338 && dfa->word_ops_used == 0) 2339 init_word_char (dfa); 2340 if (token->opr.ctx_type == WORD_DELIM 2341 || token->opr.ctx_type == NOT_WORD_DELIM) 2342 { 2343 bin_tree_t *tree_first, *tree_last; 2344 if (token->opr.ctx_type == WORD_DELIM) 2345 { 2346 token->opr.ctx_type = WORD_FIRST; 2347 tree_first = create_token_tree (dfa, NULL, NULL, token); 2348 token->opr.ctx_type = WORD_LAST; 2349 } 2350 else 2351 { 2352 token->opr.ctx_type = INSIDE_WORD; 2353 tree_first = create_token_tree (dfa, NULL, NULL, token); 2354 token->opr.ctx_type = INSIDE_NOTWORD; 2355 } 2356 tree_last = create_token_tree (dfa, NULL, NULL, token); 2357 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2358 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2359 { 2360 *err = REG_ESPACE; 2361 return NULL; 2362 } 2363 } 2364 else 2365 { 2366 tree = create_token_tree (dfa, NULL, NULL, token); 2367 if (BE (tree == NULL, 0)) 2368 { 2369 *err = REG_ESPACE; 2370 return NULL; 2371 } 2372 } 2373 /* We must return here, since ANCHORs can't be followed 2374 by repetition operators. 2375 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2376 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2377 fetch_token (token, regexp, syntax); 2378 return tree; 2379 case OP_PERIOD: 2380 tree = create_token_tree (dfa, NULL, NULL, token); 2381 if (BE (tree == NULL, 0)) 2382 { 2383 *err = REG_ESPACE; 2384 return NULL; 2385 } 2386 if (dfa->mb_cur_max > 1) 2387 dfa->has_mb_node = 1; 2388 break; 2389 case OP_WORD: 2390 case OP_NOTWORD: 2391 tree = build_charclass_op (dfa, regexp->trans, 2392 "alnum", 2393 "_", 2394 token->type == OP_NOTWORD, err); 2395 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2396 return NULL; 2397 break; 2398 case OP_SPACE: 2399 case OP_NOTSPACE: 2400 tree = build_charclass_op (dfa, regexp->trans, 2401 "space", 2402 "", 2403 token->type == OP_NOTSPACE, err); 2404 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2405 return NULL; 2406 break; 2407 case OP_ALT: 2408 case END_OF_RE: 2409 return NULL; 2410 case BACK_SLASH: 2411 *err = REG_EESCAPE; 2412 return NULL; 2413 default: 2414 /* Must not happen? */ 2415#ifdef DEBUG 2416 assert (0); 2417#endif 2418 return NULL; 2419 } 2420 fetch_token (token, regexp, syntax); 2421 2422 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2423 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2424 { 2425 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2426 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2427 return NULL; 2428 /* In BRE consecutive duplications are not allowed. */ 2429 if ((syntax & RE_CONTEXT_INVALID_DUP) 2430 && (token->type == OP_DUP_ASTERISK 2431 || token->type == OP_OPEN_DUP_NUM)) 2432 { 2433 *err = REG_BADRPT; 2434 return NULL; 2435 } 2436 } 2437 2438 return tree; 2439} 2440 2441/* This function build the following tree, from regular expression 2442 (<reg_exp>): 2443 SUBEXP 2444 | 2445 <reg_exp> 2446*/ 2447 2448static bin_tree_t * 2449parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2450 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2451{ 2452 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2453 bin_tree_t *tree; 2454 size_t cur_nsub; 2455 cur_nsub = preg->re_nsub++; 2456 2457 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2458 2459 /* The subexpression may be a null string. */ 2460 if (token->type == OP_CLOSE_SUBEXP) 2461 tree = NULL; 2462 else 2463 { 2464 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2465 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2466 *err = REG_EPAREN; 2467 if (BE (*err != REG_NOERROR, 0)) 2468 return NULL; 2469 } 2470 2471 if (cur_nsub <= '9' - '1') 2472 dfa->completed_bkref_map |= 1 << cur_nsub; 2473 2474 tree = create_tree (dfa, tree, NULL, SUBEXP); 2475 if (BE (tree == NULL, 0)) 2476 { 2477 *err = REG_ESPACE; 2478 return NULL; 2479 } 2480 tree->token.opr.idx = cur_nsub; 2481 return tree; 2482} 2483 2484/* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2485 2486static bin_tree_t * 2487parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2488 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2489{ 2490 bin_tree_t *tree = NULL, *old_tree = NULL; 2491 int i, start, end, start_idx = re_string_cur_idx (regexp); 2492#ifndef RE_TOKEN_INIT_BUG 2493 re_token_t start_token = *token; 2494#else 2495 re_token_t start_token; 2496 2497 memcpy ((void *) &start_token, (void *) token, sizeof start_token); 2498#endif 2499 2500 if (token->type == OP_OPEN_DUP_NUM) 2501 { 2502 end = 0; 2503 start = fetch_number (regexp, token, syntax); 2504 if (start == -1) 2505 { 2506 if (token->type == CHARACTER && token->opr.c == ',') 2507 start = 0; /* We treat "{,m}" as "{0,m}". */ 2508 else 2509 { 2510 *err = REG_BADBR; /* <re>{} is invalid. */ 2511 return NULL; 2512 } 2513 } 2514 if (BE (start != -2, 1)) 2515 { 2516 /* We treat "{n}" as "{n,n}". */ 2517 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2518 : ((token->type == CHARACTER && token->opr.c == ',') 2519 ? fetch_number (regexp, token, syntax) : -2)); 2520 } 2521 if (BE (start == -2 || end == -2, 0)) 2522 { 2523 /* Invalid sequence. */ 2524 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2525 { 2526 if (token->type == END_OF_RE) 2527 *err = REG_EBRACE; 2528 else 2529 *err = REG_BADBR; 2530 2531 return NULL; 2532 } 2533 2534 /* If the syntax bit is set, rollback. */ 2535 re_string_set_index (regexp, start_idx); 2536 *token = start_token; 2537 token->type = CHARACTER; 2538 /* mb_partial and word_char bits should be already initialized by 2539 peek_token. */ 2540 return elem; 2541 } 2542 2543 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0)) 2544 { 2545 /* First number greater than second. */ 2546 *err = REG_BADBR; 2547 return NULL; 2548 } 2549 } 2550 else 2551 { 2552 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2553 end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2554 } 2555 2556 fetch_token (token, regexp, syntax); 2557 2558 if (BE (elem == NULL, 0)) 2559 return NULL; 2560 if (BE (start == 0 && end == 0, 0)) 2561 { 2562 postorder (elem, free_tree, NULL); 2563 return NULL; 2564 } 2565 2566 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2567 if (BE (start > 0, 0)) 2568 { 2569 tree = elem; 2570 for (i = 2; i <= start; ++i) 2571 { 2572 elem = duplicate_tree (elem, dfa); 2573 tree = create_tree (dfa, tree, elem, CONCAT); 2574 if (BE (elem == NULL || tree == NULL, 0)) 2575 goto parse_dup_op_espace; 2576 } 2577 2578 if (start == end) 2579 return tree; 2580 2581 /* Duplicate ELEM before it is marked optional. */ 2582 elem = duplicate_tree (elem, dfa); 2583 old_tree = tree; 2584 } 2585 else 2586 old_tree = NULL; 2587 2588 if (elem->token.type == SUBEXP) 2589 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx); 2590 2591 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2592 if (BE (tree == NULL, 0)) 2593 goto parse_dup_op_espace; 2594 2595 /* This loop is actually executed only when end != -1, 2596 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2597 already created the start+1-th copy. */ 2598 for (i = start + 2; i <= end; ++i) 2599 { 2600 elem = duplicate_tree (elem, dfa); 2601 tree = create_tree (dfa, tree, elem, CONCAT); 2602 if (BE (elem == NULL || tree == NULL, 0)) 2603 goto parse_dup_op_espace; 2604 2605 tree = create_tree (dfa, tree, NULL, OP_ALT); 2606 if (BE (tree == NULL, 0)) 2607 goto parse_dup_op_espace; 2608 } 2609 2610 if (old_tree) 2611 tree = create_tree (dfa, old_tree, tree, CONCAT); 2612 2613 return tree; 2614 2615 parse_dup_op_espace: 2616 *err = REG_ESPACE; 2617 return NULL; 2618} 2619 2620/* Size of the names for collating symbol/equivalence_class/character_class. 2621 I'm not sure, but maybe enough. */ 2622#define BRACKET_NAME_BUF_SIZE 32 2623 2624#ifndef _LIBC 2625 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2626 Build the range expression which starts from START_ELEM, and ends 2627 at END_ELEM. The result are written to MBCSET and SBCSET. 2628 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2629 mbcset->range_ends, is a pointer argument since we may 2630 update it. */ 2631 2632static reg_errcode_t 2633internal_function 2634# ifdef RE_ENABLE_I18N 2635build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2636 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2637# else /* not RE_ENABLE_I18N */ 2638build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, 2639 bracket_elem_t *end_elem) 2640# endif /* not RE_ENABLE_I18N */ 2641{ 2642 unsigned int start_ch, end_ch; 2643 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2644 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2645 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2646 0)) 2647 return REG_ERANGE; 2648 2649 /* We can handle no multi character collating elements without libc 2650 support. */ 2651 if (BE ((start_elem->type == COLL_SYM 2652 && strlen ((char *) start_elem->opr.name) > 1) 2653 || (end_elem->type == COLL_SYM 2654 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2655 return REG_ECOLLATE; 2656 2657# ifdef RE_ENABLE_I18N 2658 { 2659 wchar_t wc; 2660 wint_t start_wc; 2661 wint_t end_wc; 2662 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2663 2664 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2665 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2666 : 0)); 2667 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2668 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2669 : 0)); 2670#ifdef GAWK 2671 /* 2672 * Fedora Core 2, maybe others, have broken `btowc' that returns -1 2673 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are 2674 * unsigned, so we don't have sign extension problems. 2675 */ 2676 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2677 ? start_ch : start_elem->opr.wch); 2678 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2679 ? end_ch : end_elem->opr.wch); 2680#else 2681 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2682 ? __btowc (start_ch) : start_elem->opr.wch); 2683 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2684 ? __btowc (end_ch) : end_elem->opr.wch); 2685#endif 2686 if (start_wc == WEOF || end_wc == WEOF) 2687 return REG_ECOLLATE; 2688 cmp_buf[0] = start_wc; 2689 cmp_buf[4] = end_wc; 2690 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2691 return REG_ERANGE; 2692 2693 /* Got valid collation sequence values, add them as a new entry. 2694 However, for !_LIBC we have no collation elements: if the 2695 character set is single byte, the single byte character set 2696 that we build below suffices. parse_bracket_exp passes 2697 no MBCSET if dfa->mb_cur_max == 1. */ 2698 if (mbcset) 2699 { 2700 /* Check the space of the arrays. */ 2701 if (BE (*range_alloc == mbcset->nranges, 0)) 2702 { 2703 /* There is not enough space, need realloc. */ 2704 wchar_t *new_array_start, *new_array_end; 2705 int new_nranges; 2706 2707 /* +1 in case of mbcset->nranges is 0. */ 2708 new_nranges = 2 * mbcset->nranges + 1; 2709 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2710 are NULL if *range_alloc == 0. */ 2711 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2712 new_nranges); 2713 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2714 new_nranges); 2715 2716 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2717 return REG_ESPACE; 2718 2719 mbcset->range_starts = new_array_start; 2720 mbcset->range_ends = new_array_end; 2721 *range_alloc = new_nranges; 2722 } 2723 2724 mbcset->range_starts[mbcset->nranges] = start_wc; 2725 mbcset->range_ends[mbcset->nranges++] = end_wc; 2726 } 2727 2728 /* Build the table for single byte characters. */ 2729 for (wc = 0; wc < SBC_MAX; ++wc) 2730 { 2731 cmp_buf[2] = wc; 2732 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2733 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2734 bitset_set (sbcset, wc); 2735 } 2736 } 2737# else /* not RE_ENABLE_I18N */ 2738 { 2739 unsigned int ch; 2740 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2741 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2742 : 0)); 2743 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2744 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2745 : 0)); 2746 if (start_ch > end_ch) 2747 return REG_ERANGE; 2748 /* Build the table for single byte characters. */ 2749 for (ch = 0; ch < SBC_MAX; ++ch) 2750 if (start_ch <= ch && ch <= end_ch) 2751 bitset_set (sbcset, ch); 2752 } 2753# endif /* not RE_ENABLE_I18N */ 2754 return REG_NOERROR; 2755} 2756#endif /* not _LIBC */ 2757 2758#ifndef _LIBC 2759/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2760 Build the collating element which is represented by NAME. 2761 The result are written to MBCSET and SBCSET. 2762 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2763 pointer argument since we may update it. */ 2764 2765static reg_errcode_t 2766internal_function 2767# ifdef RE_ENABLE_I18N 2768build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2769 int *coll_sym_alloc, const unsigned char *name) 2770# else /* not RE_ENABLE_I18N */ 2771build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2772# endif /* not RE_ENABLE_I18N */ 2773{ 2774 size_t name_len = strlen ((const char *) name); 2775 if (BE (name_len != 1, 0)) 2776 return REG_ECOLLATE; 2777 else 2778 { 2779 bitset_set (sbcset, name[0]); 2780 return REG_NOERROR; 2781 } 2782} 2783#endif /* not _LIBC */ 2784 2785/* This function parse bracket expression like "[abc]", "[a-c]", 2786 "[[.a-a.]]" etc. */ 2787 2788static bin_tree_t * 2789parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2790 reg_syntax_t syntax, reg_errcode_t *err) 2791{ 2792#ifdef _LIBC 2793 const unsigned char *collseqmb; 2794 const char *collseqwc; 2795 uint32_t nrules; 2796 int32_t table_size; 2797 const int32_t *symb_table; 2798 const unsigned char *extra; 2799 2800 /* Local function for parse_bracket_exp used in _LIBC environment. 2801 Seek the collating symbol entry correspondings to NAME. 2802 Return the index of the symbol in the SYMB_TABLE. */ 2803 2804 auto inline int32_t 2805 __attribute ((always_inline)) 2806 seek_collating_symbol_entry (name, name_len) 2807 const unsigned char *name; 2808 size_t name_len; 2809 { 2810 int32_t hash = elem_hash ((const char *) name, name_len); 2811 int32_t elem = hash % table_size; 2812 if (symb_table[2 * elem] != 0) 2813 { 2814 int32_t second = hash % (table_size - 2) + 1; 2815 2816 do 2817 { 2818 /* First compare the hashing value. */ 2819 if (symb_table[2 * elem] == hash 2820 /* Compare the length of the name. */ 2821 && name_len == extra[symb_table[2 * elem + 1]] 2822 /* Compare the name. */ 2823 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2824 name_len) == 0) 2825 { 2826 /* Yep, this is the entry. */ 2827 break; 2828 } 2829 2830 /* Next entry. */ 2831 elem += second; 2832 } 2833 while (symb_table[2 * elem] != 0); 2834 } 2835 return elem; 2836 } 2837 2838 /* Local function for parse_bracket_exp used in _LIBC environment. 2839 Look up the collation sequence value of BR_ELEM. 2840 Return the value if succeeded, UINT_MAX otherwise. */ 2841 2842 auto inline unsigned int 2843 __attribute ((always_inline)) 2844 lookup_collation_sequence_value (br_elem) 2845 bracket_elem_t *br_elem; 2846 { 2847 if (br_elem->type == SB_CHAR) 2848 { 2849 /* 2850 if (MB_CUR_MAX == 1) 2851 */ 2852 if (nrules == 0) 2853 return collseqmb[br_elem->opr.ch]; 2854 else 2855 { 2856 wint_t wc = __btowc (br_elem->opr.ch); 2857 return __collseq_table_lookup (collseqwc, wc); 2858 } 2859 } 2860 else if (br_elem->type == MB_CHAR) 2861 { 2862 if (nrules != 0) 2863 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2864 } 2865 else if (br_elem->type == COLL_SYM) 2866 { 2867 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2868 if (nrules != 0) 2869 { 2870 int32_t elem, idx; 2871 elem = seek_collating_symbol_entry (br_elem->opr.name, 2872 sym_name_len); 2873 if (symb_table[2 * elem] != 0) 2874 { 2875 /* We found the entry. */ 2876 idx = symb_table[2 * elem + 1]; 2877 /* Skip the name of collating element name. */ 2878 idx += 1 + extra[idx]; 2879 /* Skip the byte sequence of the collating element. */ 2880 idx += 1 + extra[idx]; 2881 /* Adjust for the alignment. */ 2882 idx = (idx + 3) & ~3; 2883 /* Skip the multibyte collation sequence value. */ 2884 idx += sizeof (unsigned int); 2885 /* Skip the wide char sequence of the collating element. */ 2886 idx += sizeof (unsigned int) * 2887 (1 + *(unsigned int *) (extra + idx)); 2888 /* Return the collation sequence value. */ 2889 return *(unsigned int *) (extra + idx); 2890 } 2891 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2892 { 2893 /* No valid character. Match it as a single byte 2894 character. */ 2895 return collseqmb[br_elem->opr.name[0]]; 2896 } 2897 } 2898 else if (sym_name_len == 1) 2899 return collseqmb[br_elem->opr.name[0]]; 2900 } 2901 return UINT_MAX; 2902 } 2903 2904 /* Local function for parse_bracket_exp used in _LIBC environment. 2905 Build the range expression which starts from START_ELEM, and ends 2906 at END_ELEM. The result are written to MBCSET and SBCSET. 2907 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2908 mbcset->range_ends, is a pointer argument since we may 2909 update it. */ 2910 2911 auto inline reg_errcode_t 2912 __attribute ((always_inline)) 2913 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2914 re_charset_t *mbcset; 2915 int *range_alloc; 2916 bitset_t sbcset; 2917 bracket_elem_t *start_elem, *end_elem; 2918 { 2919 unsigned int ch; 2920 uint32_t start_collseq; 2921 uint32_t end_collseq; 2922 2923 /* Equivalence Classes and Character Classes can't be a range 2924 start/end. */ 2925 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2926 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2927 0)) 2928 return REG_ERANGE; 2929 2930 start_collseq = lookup_collation_sequence_value (start_elem); 2931 end_collseq = lookup_collation_sequence_value (end_elem); 2932 /* Check start/end collation sequence values. */ 2933 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2934 return REG_ECOLLATE; 2935 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2936 return REG_ERANGE; 2937 2938 /* Got valid collation sequence values, add them as a new entry. 2939 However, if we have no collation elements, and the character set 2940 is single byte, the single byte character set that we 2941 build below suffices. */ 2942 if (nrules > 0 || dfa->mb_cur_max > 1) 2943 { 2944 /* Check the space of the arrays. */ 2945 if (BE (*range_alloc == mbcset->nranges, 0)) 2946 { 2947 /* There is not enough space, need realloc. */ 2948 uint32_t *new_array_start; 2949 uint32_t *new_array_end; 2950 int new_nranges; 2951 2952 /* +1 in case of mbcset->nranges is 0. */ 2953 new_nranges = 2 * mbcset->nranges + 1; 2954 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2955 new_nranges); 2956 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2957 new_nranges); 2958 2959 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2960 return REG_ESPACE; 2961 2962 mbcset->range_starts = new_array_start; 2963 mbcset->range_ends = new_array_end; 2964 *range_alloc = new_nranges; 2965 } 2966 2967 mbcset->range_starts[mbcset->nranges] = start_collseq; 2968 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2969 } 2970 2971 /* Build the table for single byte characters. */ 2972 for (ch = 0; ch < SBC_MAX; ch++) 2973 { 2974 uint32_t ch_collseq; 2975 /* 2976 if (MB_CUR_MAX == 1) 2977 */ 2978 if (nrules == 0) 2979 ch_collseq = collseqmb[ch]; 2980 else 2981 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2982 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2983 bitset_set (sbcset, ch); 2984 } 2985 return REG_NOERROR; 2986 } 2987 2988 /* Local function for parse_bracket_exp used in _LIBC environment. 2989 Build the collating element which is represented by NAME. 2990 The result are written to MBCSET and SBCSET. 2991 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2992 pointer argument since we may update it. */ 2993 2994 auto inline reg_errcode_t 2995 __attribute ((always_inline)) 2996 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2997 re_charset_t *mbcset; 2998 int *coll_sym_alloc; 2999 bitset_t sbcset; 3000 const unsigned char *name; 3001 { 3002 int32_t elem, idx; 3003 size_t name_len = strlen ((const char *) name); 3004 if (nrules != 0) 3005 { 3006 elem = seek_collating_symbol_entry (name, name_len); 3007 if (symb_table[2 * elem] != 0) 3008 { 3009 /* We found the entry. */ 3010 idx = symb_table[2 * elem + 1]; 3011 /* Skip the name of collating element name. */ 3012 idx += 1 + extra[idx]; 3013 } 3014 else if (symb_table[2 * elem] == 0 && name_len == 1) 3015 { 3016 /* No valid character, treat it as a normal 3017 character. */ 3018 bitset_set (sbcset, name[0]); 3019 return REG_NOERROR; 3020 } 3021 else 3022 return REG_ECOLLATE; 3023 3024 /* Got valid collation sequence, add it as a new entry. */ 3025 /* Check the space of the arrays. */ 3026 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3027 { 3028 /* Not enough, realloc it. */ 3029 /* +1 in case of mbcset->ncoll_syms is 0. */ 3030 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3031 /* Use realloc since mbcset->coll_syms is NULL 3032 if *alloc == 0. */ 3033 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3034 new_coll_sym_alloc); 3035 if (BE (new_coll_syms == NULL, 0)) 3036 return REG_ESPACE; 3037 mbcset->coll_syms = new_coll_syms; 3038 *coll_sym_alloc = new_coll_sym_alloc; 3039 } 3040 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3041 return REG_NOERROR; 3042 } 3043 else 3044 { 3045 if (BE (name_len != 1, 0)) 3046 return REG_ECOLLATE; 3047 else 3048 { 3049 bitset_set (sbcset, name[0]); 3050 return REG_NOERROR; 3051 } 3052 } 3053 } 3054#endif 3055 3056 re_token_t br_token; 3057 re_bitset_ptr_t sbcset; 3058#ifdef RE_ENABLE_I18N 3059 re_charset_t *mbcset; 3060 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3061 int equiv_class_alloc = 0, char_class_alloc = 0; 3062#endif /* not RE_ENABLE_I18N */ 3063 int non_match = 0; 3064 bin_tree_t *work_tree; 3065 int token_len; 3066 int first_round = 1; 3067#ifdef _LIBC 3068 collseqmb = (const unsigned char *) 3069 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3070 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3071 if (nrules) 3072 { 3073 /* 3074 if (MB_CUR_MAX > 1) 3075 */ 3076 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3077 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3078 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3079 _NL_COLLATE_SYMB_TABLEMB); 3080 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3081 _NL_COLLATE_SYMB_EXTRAMB); 3082 } 3083#endif 3084 sbcset = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t)); 3085#ifdef RE_ENABLE_I18N 3086 mbcset = (re_charset_t *) calloc (1, sizeof (re_charset_t)); 3087#endif /* RE_ENABLE_I18N */ 3088#ifdef RE_ENABLE_I18N 3089 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3090#else 3091 if (BE (sbcset == NULL, 0)) 3092#endif /* RE_ENABLE_I18N */ 3093 { 3094 *err = REG_ESPACE; 3095 return NULL; 3096 } 3097 3098 token_len = peek_token_bracket (token, regexp, syntax); 3099 if (BE (token->type == END_OF_RE, 0)) 3100 { 3101 *err = REG_BADPAT; 3102 goto parse_bracket_exp_free_return; 3103 } 3104 if (token->type == OP_NON_MATCH_LIST) 3105 { 3106#ifdef RE_ENABLE_I18N 3107 mbcset->non_match = 1; 3108#endif /* not RE_ENABLE_I18N */ 3109 non_match = 1; 3110 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3111 bitset_set (sbcset, '\n'); 3112 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3113 token_len = peek_token_bracket (token, regexp, syntax); 3114 if (BE (token->type == END_OF_RE, 0)) 3115 { 3116 *err = REG_BADPAT; 3117 goto parse_bracket_exp_free_return; 3118 } 3119 } 3120 3121 /* We treat the first ']' as a normal character. */ 3122 if (token->type == OP_CLOSE_BRACKET) 3123 token->type = CHARACTER; 3124 3125 while (1) 3126 { 3127 bracket_elem_t start_elem, end_elem; 3128 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3129 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3130 reg_errcode_t ret; 3131 int token_len2 = 0, is_range_exp = 0; 3132 re_token_t token2; 3133 3134 start_elem.opr.name = start_name_buf; 3135 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3136 syntax, first_round); 3137 if (BE (ret != REG_NOERROR, 0)) 3138 { 3139 *err = ret; 3140 goto parse_bracket_exp_free_return; 3141 } 3142 first_round = 0; 3143 3144 /* Get information about the next token. We need it in any case. */ 3145 token_len = peek_token_bracket (token, regexp, syntax); 3146 3147 /* Do not check for ranges if we know they are not allowed. */ 3148 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3149 { 3150 if (BE (token->type == END_OF_RE, 0)) 3151 { 3152 *err = REG_EBRACK; 3153 goto parse_bracket_exp_free_return; 3154 } 3155 if (token->type == OP_CHARSET_RANGE) 3156 { 3157 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3158 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3159 if (BE (token2.type == END_OF_RE, 0)) 3160 { 3161 *err = REG_EBRACK; 3162 goto parse_bracket_exp_free_return; 3163 } 3164 if (token2.type == OP_CLOSE_BRACKET) 3165 { 3166 /* We treat the last '-' as a normal character. */ 3167 re_string_skip_bytes (regexp, -token_len); 3168 token->type = CHARACTER; 3169 } 3170 else 3171 is_range_exp = 1; 3172 } 3173 } 3174 3175 if (is_range_exp == 1) 3176 { 3177 end_elem.opr.name = end_name_buf; 3178 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3179 dfa, syntax, 1); 3180 if (BE (ret != REG_NOERROR, 0)) 3181 { 3182 *err = ret; 3183 goto parse_bracket_exp_free_return; 3184 } 3185 3186 token_len = peek_token_bracket (token, regexp, syntax); 3187 3188#ifdef _LIBC 3189 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3190 &start_elem, &end_elem); 3191#else 3192# ifdef RE_ENABLE_I18N 3193 *err = build_range_exp (sbcset, 3194 dfa->mb_cur_max > 1 ? mbcset : NULL, 3195 &range_alloc, &start_elem, &end_elem); 3196# else 3197 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3198# endif 3199#endif /* RE_ENABLE_I18N */ 3200 if (BE (*err != REG_NOERROR, 0)) 3201 goto parse_bracket_exp_free_return; 3202 } 3203 else 3204 { 3205 switch (start_elem.type) 3206 { 3207 case SB_CHAR: 3208 bitset_set (sbcset, start_elem.opr.ch); 3209 break; 3210#ifdef RE_ENABLE_I18N 3211 case MB_CHAR: 3212 /* Check whether the array has enough space. */ 3213 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3214 { 3215 wchar_t *new_mbchars; 3216 /* Not enough, realloc it. */ 3217 /* +1 in case of mbcset->nmbchars is 0. */ 3218 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3219 /* Use realloc since array is NULL if *alloc == 0. */ 3220 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3221 mbchar_alloc); 3222 if (BE (new_mbchars == NULL, 0)) 3223 goto parse_bracket_exp_espace; 3224 mbcset->mbchars = new_mbchars; 3225 } 3226 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3227 break; 3228#endif /* RE_ENABLE_I18N */ 3229 case EQUIV_CLASS: 3230 *err = build_equiv_class (sbcset, 3231#ifdef RE_ENABLE_I18N 3232 mbcset, &equiv_class_alloc, 3233#endif /* RE_ENABLE_I18N */ 3234 start_elem.opr.name); 3235 if (BE (*err != REG_NOERROR, 0)) 3236 goto parse_bracket_exp_free_return; 3237 break; 3238 case COLL_SYM: 3239 *err = build_collating_symbol (sbcset, 3240#ifdef RE_ENABLE_I18N 3241 mbcset, &coll_sym_alloc, 3242#endif /* RE_ENABLE_I18N */ 3243 start_elem.opr.name); 3244 if (BE (*err != REG_NOERROR, 0)) 3245 goto parse_bracket_exp_free_return; 3246 break; 3247 case CHAR_CLASS: 3248 *err = build_charclass (regexp->trans, sbcset, 3249#ifdef RE_ENABLE_I18N 3250 mbcset, &char_class_alloc, 3251#endif /* RE_ENABLE_I18N */ 3252 (const char *) start_elem.opr.name, syntax); 3253 if (BE (*err != REG_NOERROR, 0)) 3254 goto parse_bracket_exp_free_return; 3255 break; 3256 default: 3257 assert (0); 3258 break; 3259 } 3260 } 3261 if (BE (token->type == END_OF_RE, 0)) 3262 { 3263 *err = REG_EBRACK; 3264 goto parse_bracket_exp_free_return; 3265 } 3266 if (token->type == OP_CLOSE_BRACKET) 3267 break; 3268 } 3269 3270 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3271 3272 /* If it is non-matching list. */ 3273 if (non_match) 3274 bitset_not (sbcset); 3275 3276#ifdef RE_ENABLE_I18N 3277 /* Ensure only single byte characters are set. */ 3278 if (dfa->mb_cur_max > 1) 3279 bitset_mask (sbcset, dfa->sb_char); 3280 3281 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3282 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3283 || mbcset->non_match))) 3284 { 3285 bin_tree_t *mbc_tree; 3286 int sbc_idx; 3287 /* Build a tree for complex bracket. */ 3288 dfa->has_mb_node = 1; 3289 br_token.type = COMPLEX_BRACKET; 3290 br_token.opr.mbcset = mbcset; 3291 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3292 if (BE (mbc_tree == NULL, 0)) 3293 goto parse_bracket_exp_espace; 3294 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3295 if (sbcset[sbc_idx]) 3296 break; 3297 /* If there are no bits set in sbcset, there is no point 3298 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3299 if (sbc_idx < BITSET_WORDS) 3300 { 3301 /* Build a tree for simple bracket. */ 3302 br_token.type = SIMPLE_BRACKET; 3303 br_token.opr.sbcset = sbcset; 3304 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3305 if (BE (work_tree == NULL, 0)) 3306 goto parse_bracket_exp_espace; 3307 3308 /* Then join them by ALT node. */ 3309 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3310 if (BE (work_tree == NULL, 0)) 3311 goto parse_bracket_exp_espace; 3312 } 3313 else 3314 { 3315 re_free (sbcset); 3316 work_tree = mbc_tree; 3317 } 3318 } 3319 else 3320#endif /* not RE_ENABLE_I18N */ 3321 { 3322#ifdef RE_ENABLE_I18N 3323 free_charset (mbcset); 3324#endif 3325 /* Build a tree for simple bracket. */ 3326 br_token.type = SIMPLE_BRACKET; 3327 br_token.opr.sbcset = sbcset; 3328 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3329 if (BE (work_tree == NULL, 0)) 3330 goto parse_bracket_exp_espace; 3331 } 3332 return work_tree; 3333 3334 parse_bracket_exp_espace: 3335 *err = REG_ESPACE; 3336 parse_bracket_exp_free_return: 3337 re_free (sbcset); 3338#ifdef RE_ENABLE_I18N 3339 free_charset (mbcset); 3340#endif /* RE_ENABLE_I18N */ 3341 return NULL; 3342} 3343 3344/* Parse an element in the bracket expression. */ 3345 3346static reg_errcode_t 3347parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3348 re_token_t *token, int token_len, re_dfa_t *dfa, 3349 reg_syntax_t syntax, int accept_hyphen) 3350{ 3351#ifdef RE_ENABLE_I18N 3352 int cur_char_size; 3353 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3354 if (cur_char_size > 1) 3355 { 3356 elem->type = MB_CHAR; 3357 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3358 re_string_skip_bytes (regexp, cur_char_size); 3359 return REG_NOERROR; 3360 } 3361#endif /* RE_ENABLE_I18N */ 3362 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3363 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3364 || token->type == OP_OPEN_EQUIV_CLASS) 3365 return parse_bracket_symbol (elem, regexp, token); 3366 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3367 { 3368 /* A '-' must only appear as anything but a range indicator before 3369 the closing bracket. Everything else is an error. */ 3370 re_token_t token2; 3371 (void) peek_token_bracket (&token2, regexp, syntax); 3372 if (token2.type != OP_CLOSE_BRACKET) 3373 /* The actual error value is not standardized since this whole 3374 case is undefined. But ERANGE makes good sense. */ 3375 return REG_ERANGE; 3376 } 3377 elem->type = SB_CHAR; 3378 elem->opr.ch = token->opr.c; 3379 return REG_NOERROR; 3380} 3381 3382/* Parse a bracket symbol in the bracket expression. Bracket symbols are 3383 such as [:<character_class>:], [.<collating_element>.], and 3384 [=<equivalent_class>=]. */ 3385 3386static reg_errcode_t 3387parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3388 re_token_t *token) 3389{ 3390 unsigned char ch, delim = token->opr.c; 3391 int i = 0; 3392 if (re_string_eoi(regexp)) 3393 return REG_EBRACK; 3394 for (;; ++i) 3395 { 3396 if (i >= BRACKET_NAME_BUF_SIZE) 3397 return REG_EBRACK; 3398 if (token->type == OP_OPEN_CHAR_CLASS) 3399 ch = re_string_fetch_byte_case (regexp); 3400 else 3401 ch = re_string_fetch_byte (regexp); 3402 if (re_string_eoi(regexp)) 3403 return REG_EBRACK; 3404 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3405 break; 3406 elem->opr.name[i] = ch; 3407 } 3408 re_string_skip_bytes (regexp, 1); 3409 elem->opr.name[i] = '\0'; 3410 switch (token->type) 3411 { 3412 case OP_OPEN_COLL_ELEM: 3413 elem->type = COLL_SYM; 3414 break; 3415 case OP_OPEN_EQUIV_CLASS: 3416 elem->type = EQUIV_CLASS; 3417 break; 3418 case OP_OPEN_CHAR_CLASS: 3419 elem->type = CHAR_CLASS; 3420 break; 3421 default: 3422 break; 3423 } 3424 return REG_NOERROR; 3425} 3426 3427 /* Helper function for parse_bracket_exp. 3428 Build the equivalence class which is represented by NAME. 3429 The result are written to MBCSET and SBCSET. 3430 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3431 is a pointer argument since we may update it. */ 3432 3433static reg_errcode_t 3434#ifdef RE_ENABLE_I18N 3435build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3436 int *equiv_class_alloc, const unsigned char *name) 3437#else /* not RE_ENABLE_I18N */ 3438build_equiv_class (bitset_t sbcset, const unsigned char *name) 3439#endif /* not RE_ENABLE_I18N */ 3440{ 3441#ifdef _LIBC 3442 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3443 if (nrules != 0) 3444 { 3445 const int32_t *table, *indirect; 3446 const unsigned char *weights, *extra, *cp; 3447 unsigned char char_buf[2]; 3448 int32_t idx1, idx2; 3449 unsigned int ch; 3450 size_t len; 3451 /* This #include defines a local function! */ 3452# include <locale/weight.h> 3453 /* Calculate the index for equivalence class. */ 3454 cp = name; 3455 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3456 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3457 _NL_COLLATE_WEIGHTMB); 3458 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3459 _NL_COLLATE_EXTRAMB); 3460 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3461 _NL_COLLATE_INDIRECTMB); 3462 idx1 = findidx (&cp); 3463 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3464 /* This isn't a valid character. */ 3465 return REG_ECOLLATE; 3466 3467 /* Build single byte matching table for this equivalence class. */ 3468 char_buf[1] = (unsigned char) '\0'; 3469 len = weights[idx1 & 0xffffff]; 3470 for (ch = 0; ch < SBC_MAX; ++ch) 3471 { 3472 char_buf[0] = ch; 3473 cp = char_buf; 3474 idx2 = findidx (&cp); 3475/* 3476 idx2 = table[ch]; 3477*/ 3478 if (idx2 == 0) 3479 /* This isn't a valid character. */ 3480 continue; 3481 /* Compare only if the length matches and the collation rule 3482 index is the same. */ 3483 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3484 { 3485 int cnt = 0; 3486 3487 while (cnt <= len && 3488 weights[(idx1 & 0xffffff) + 1 + cnt] 3489 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3490 ++cnt; 3491 3492 if (cnt > len) 3493 bitset_set (sbcset, ch); 3494 } 3495 } 3496 /* Check whether the array has enough space. */ 3497 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3498 { 3499 /* Not enough, realloc it. */ 3500 /* +1 in case of mbcset->nequiv_classes is 0. */ 3501 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3502 /* Use realloc since the array is NULL if *alloc == 0. */ 3503 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3504 int32_t, 3505 new_equiv_class_alloc); 3506 if (BE (new_equiv_classes == NULL, 0)) 3507 return REG_ESPACE; 3508 mbcset->equiv_classes = new_equiv_classes; 3509 *equiv_class_alloc = new_equiv_class_alloc; 3510 } 3511 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3512 } 3513 else 3514#endif /* _LIBC */ 3515 { 3516 if (BE (strlen ((const char *) name) != 1, 0)) 3517 return REG_ECOLLATE; 3518 bitset_set (sbcset, *name); 3519 } 3520 return REG_NOERROR; 3521} 3522 3523 /* Helper function for parse_bracket_exp. 3524 Build the character class which is represented by NAME. 3525 The result are written to MBCSET and SBCSET. 3526 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3527 is a pointer argument since we may update it. */ 3528 3529static reg_errcode_t 3530#ifdef RE_ENABLE_I18N 3531build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3532 re_charset_t *mbcset, int *char_class_alloc, 3533 const char *class_name, reg_syntax_t syntax) 3534#else /* not RE_ENABLE_I18N */ 3535build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3536 const char *class_name, reg_syntax_t syntax) 3537#endif /* not RE_ENABLE_I18N */ 3538{ 3539 int i; 3540 3541 /* In case of REG_ICASE "upper" and "lower" match the both of 3542 upper and lower cases. */ 3543 if ((syntax & RE_ICASE) 3544 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0)) 3545 class_name = "alpha"; 3546 3547#ifdef RE_ENABLE_I18N 3548 /* Check the space of the arrays. */ 3549 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3550 { 3551 /* Not enough, realloc it. */ 3552 /* +1 in case of mbcset->nchar_classes is 0. */ 3553 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3554 /* Use realloc since array is NULL if *alloc == 0. */ 3555 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3556 new_char_class_alloc); 3557 if (BE (new_char_classes == NULL, 0)) 3558 return REG_ESPACE; 3559 mbcset->char_classes = new_char_classes; 3560 *char_class_alloc = new_char_class_alloc; 3561 } 3562 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name); 3563#endif /* RE_ENABLE_I18N */ 3564 3565#define BUILD_CHARCLASS_LOOP(ctype_func) \ 3566 do { \ 3567 if (BE (trans != NULL, 0)) \ 3568 { \ 3569 for (i = 0; i < SBC_MAX; ++i) \ 3570 if (ctype_func (i)) \ 3571 bitset_set (sbcset, trans[i]); \ 3572 } \ 3573 else \ 3574 { \ 3575 for (i = 0; i < SBC_MAX; ++i) \ 3576 if (ctype_func (i)) \ 3577 bitset_set (sbcset, i); \ 3578 } \ 3579 } while (0) 3580 3581 if (strcmp (class_name, "alnum") == 0) 3582 BUILD_CHARCLASS_LOOP (isalnum); 3583 else if (strcmp (class_name, "cntrl") == 0) 3584 BUILD_CHARCLASS_LOOP (iscntrl); 3585 else if (strcmp (class_name, "lower") == 0) 3586 BUILD_CHARCLASS_LOOP (islower); 3587 else if (strcmp (class_name, "space") == 0) 3588 BUILD_CHARCLASS_LOOP (isspace); 3589 else if (strcmp (class_name, "alpha") == 0) 3590 BUILD_CHARCLASS_LOOP (isalpha); 3591 else if (strcmp (class_name, "digit") == 0) 3592 BUILD_CHARCLASS_LOOP (isdigit); 3593 else if (strcmp (class_name, "print") == 0) 3594 BUILD_CHARCLASS_LOOP (isprint); 3595 else if (strcmp (class_name, "upper") == 0) 3596 BUILD_CHARCLASS_LOOP (isupper); 3597 else if (strcmp (class_name, "blank") == 0) 3598#ifndef GAWK 3599 BUILD_CHARCLASS_LOOP (isblank); 3600#else 3601 /* see comments above */ 3602 BUILD_CHARCLASS_LOOP (is_blank); 3603#endif 3604 else if (strcmp (class_name, "graph") == 0) 3605 BUILD_CHARCLASS_LOOP (isgraph); 3606 else if (strcmp (class_name, "punct") == 0) 3607 BUILD_CHARCLASS_LOOP (ispunct); 3608 else if (strcmp (class_name, "xdigit") == 0) 3609 BUILD_CHARCLASS_LOOP (isxdigit); 3610 else 3611 return REG_ECTYPE; 3612 3613 return REG_NOERROR; 3614} 3615 3616static bin_tree_t * 3617build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3618 const char *class_name, 3619 const char *extra, int non_match, 3620 reg_errcode_t *err) 3621{ 3622 re_bitset_ptr_t sbcset; 3623#ifdef RE_ENABLE_I18N 3624 re_charset_t *mbcset; 3625 int alloc = 0; 3626#endif /* not RE_ENABLE_I18N */ 3627 reg_errcode_t ret; 3628 re_token_t br_token; 3629 bin_tree_t *tree; 3630 3631 sbcset = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t)); 3632#ifdef RE_ENABLE_I18N 3633 mbcset = (re_charset_t *) calloc (1, sizeof (re_charset_t)); 3634#endif /* RE_ENABLE_I18N */ 3635 3636#ifdef RE_ENABLE_I18N 3637 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3638#else /* not RE_ENABLE_I18N */ 3639 if (BE (sbcset == NULL, 0)) 3640#endif /* not RE_ENABLE_I18N */ 3641 { 3642 *err = REG_ESPACE; 3643 return NULL; 3644 } 3645 3646 if (non_match) 3647 { 3648#ifdef RE_ENABLE_I18N 3649 mbcset->non_match = 1; 3650#endif /* not RE_ENABLE_I18N */ 3651 } 3652 3653 /* We don't care the syntax in this case. */ 3654 ret = build_charclass (trans, sbcset, 3655#ifdef RE_ENABLE_I18N 3656 mbcset, &alloc, 3657#endif /* RE_ENABLE_I18N */ 3658 class_name, 0); 3659 3660 if (BE (ret != REG_NOERROR, 0)) 3661 { 3662 re_free (sbcset); 3663#ifdef RE_ENABLE_I18N 3664 free_charset (mbcset); 3665#endif /* RE_ENABLE_I18N */ 3666 *err = ret; 3667 return NULL; 3668 } 3669 /* \w match '_' also. */ 3670 for (; *extra; extra++) 3671 bitset_set (sbcset, *extra); 3672 3673 /* If it is non-matching list. */ 3674 if (non_match) 3675 bitset_not (sbcset); 3676 3677#ifdef RE_ENABLE_I18N 3678 /* Ensure only single byte characters are set. */ 3679 if (dfa->mb_cur_max > 1) 3680 bitset_mask (sbcset, dfa->sb_char); 3681#endif 3682 3683 /* Build a tree for simple bracket. */ 3684 br_token.type = SIMPLE_BRACKET; 3685 br_token.opr.sbcset = sbcset; 3686 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3687 if (BE (tree == NULL, 0)) 3688 goto build_word_op_espace; 3689 3690#ifdef RE_ENABLE_I18N 3691 if (dfa->mb_cur_max > 1) 3692 { 3693 bin_tree_t *mbc_tree; 3694 /* Build a tree for complex bracket. */ 3695 br_token.type = COMPLEX_BRACKET; 3696 br_token.opr.mbcset = mbcset; 3697 dfa->has_mb_node = 1; 3698 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3699 if (BE (mbc_tree == NULL, 0)) 3700 goto build_word_op_espace; 3701 /* Then join them by ALT node. */ 3702 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3703 if (BE (mbc_tree != NULL, 1)) 3704 return tree; 3705 } 3706 else 3707 { 3708 free_charset (mbcset); 3709 return tree; 3710 } 3711#else /* not RE_ENABLE_I18N */ 3712 return tree; 3713#endif /* not RE_ENABLE_I18N */ 3714 3715 build_word_op_espace: 3716 re_free (sbcset); 3717#ifdef RE_ENABLE_I18N 3718 free_charset (mbcset); 3719#endif /* RE_ENABLE_I18N */ 3720 *err = REG_ESPACE; 3721 return NULL; 3722} 3723 3724/* This is intended for the expressions like "a{1,3}". 3725 Fetch a number from `input', and return the number. 3726 Return -1, if the number field is empty like "{,1}". 3727 Return -2, if an error has occurred. */ 3728 3729static int 3730fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3731{ 3732 int num = -1; 3733 unsigned char c; 3734 while (1) 3735 { 3736 fetch_token (token, input, syntax); 3737 c = token->opr.c; 3738 if (BE (token->type == END_OF_RE, 0)) 3739 return -2; 3740 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3741 break; 3742 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3743 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3744 num = (num > RE_DUP_MAX) ? -2 : num; 3745 } 3746 return num; 3747} 3748 3749#ifdef RE_ENABLE_I18N 3750static void 3751free_charset (re_charset_t *cset) 3752{ 3753 re_free (cset->mbchars); 3754# ifdef _LIBC 3755 re_free (cset->coll_syms); 3756 re_free (cset->equiv_classes); 3757 re_free (cset->range_starts); 3758 re_free (cset->range_ends); 3759# endif 3760 re_free (cset->char_classes); 3761 re_free (cset); 3762} 3763#endif /* RE_ENABLE_I18N */ 3764 3765/* Functions for binary tree operation. */ 3766 3767/* Create a tree node. */ 3768 3769static bin_tree_t * 3770create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3771 re_token_type_t type) 3772{ 3773 re_token_t t; 3774 t.type = type; 3775 return create_token_tree (dfa, left, right, &t); 3776} 3777 3778static bin_tree_t * 3779create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3780 const re_token_t *token) 3781{ 3782 bin_tree_t *tree; 3783 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3784 { 3785 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3786 3787 if (storage == NULL) 3788 return NULL; 3789 storage->next = dfa->str_tree_storage; 3790 dfa->str_tree_storage = storage; 3791 dfa->str_tree_storage_idx = 0; 3792 } 3793 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3794 3795 tree->parent = NULL; 3796 tree->left = left; 3797 tree->right = right; 3798 tree->token = *token; 3799 tree->token.duplicated = 0; 3800 tree->token.opt_subexp = 0; 3801 tree->first = NULL; 3802 tree->next = NULL; 3803 tree->node_idx = -1; 3804 3805 if (left != NULL) 3806 left->parent = tree; 3807 if (right != NULL) 3808 right->parent = tree; 3809 return tree; 3810} 3811 3812/* Mark the tree SRC as an optional subexpression. 3813 To be called from preorder or postorder. */ 3814 3815static reg_errcode_t 3816mark_opt_subexp (void *extra, bin_tree_t *node) 3817{ 3818 int idx = (int) (intptr_t) extra; 3819 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3820 node->token.opt_subexp = 1; 3821 3822 return REG_NOERROR; 3823} 3824 3825/* Free the allocated memory inside NODE. */ 3826 3827static void 3828free_token (re_token_t *node) 3829{ 3830#ifdef RE_ENABLE_I18N 3831 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3832 free_charset (node->opr.mbcset); 3833 else 3834#endif /* RE_ENABLE_I18N */ 3835 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3836 re_free (node->opr.sbcset); 3837} 3838 3839/* Worker function for tree walking. Free the allocated memory inside NODE 3840 and its children. */ 3841 3842static reg_errcode_t 3843free_tree (void *extra, bin_tree_t *node) 3844{ 3845 free_token (&node->token); 3846 return REG_NOERROR; 3847} 3848 3849 3850/* Duplicate the node SRC, and return new node. This is a preorder 3851 visit similar to the one implemented by the generic visitor, but 3852 we need more infrastructure to maintain two parallel trees --- so, 3853 it's easier to duplicate. */ 3854 3855static bin_tree_t * 3856duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3857{ 3858 const bin_tree_t *node; 3859 bin_tree_t *dup_root; 3860 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3861 3862 for (node = root; ; ) 3863 { 3864 /* Create a new tree and link it back to the current parent. */ 3865 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3866 if (*p_new == NULL) 3867 return NULL; 3868 (*p_new)->parent = dup_node; 3869 (*p_new)->token.duplicated = 1; 3870 dup_node = *p_new; 3871 3872 /* Go to the left node, or up and to the right. */ 3873 if (node->left) 3874 { 3875 node = node->left; 3876 p_new = &dup_node->left; 3877 } 3878 else 3879 { 3880 const bin_tree_t *prev = NULL; 3881 while (node->right == prev || node->right == NULL) 3882 { 3883 prev = node; 3884 node = node->parent; 3885 dup_node = dup_node->parent; 3886 if (!node) 3887 return dup_root; 3888 } 3889 node = node->right; 3890 p_new = &dup_node->right; 3891 } 3892 } 3893}