Git fork
at reftables-rust 4368 lines 128 kB view raw
1/* Extended regular expression matching and search library. 2 Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 21 int n) internal_function; 22static void match_ctx_clean (re_match_context_t *mctx) internal_function; 23static void match_ctx_free (re_match_context_t *cache) internal_function; 24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, 25 int str_idx, int from, int to) 26 internal_function; 27static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) 28 internal_function; 29static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, 30 int str_idx) internal_function; 31static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 32 int node, int str_idx) 33 internal_function; 34static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 35 re_dfastate_t **limited_sts, int last_node, 36 int last_str_idx) 37 internal_function; 38static reg_errcode_t re_search_internal (const regex_t *preg, 39 const char *string, int length, 40 int start, int range, int stop, 41 size_t nmatch, regmatch_t pmatch[], 42 int eflags); 43static int re_search_2_stub (struct re_pattern_buffer *bufp, 44 const char *string1, int length1, 45 const char *string2, int length2, 46 int start, int range, struct re_registers *regs, 47 int stop, int ret_len); 48static int re_search_stub (struct re_pattern_buffer *bufp, 49 const char *string, int length, int start, 50 int range, int stop, struct re_registers *regs, 51 int ret_len); 52static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 53 int nregs, int regs_allocated); 54static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx); 55static int check_matching (re_match_context_t *mctx, int fl_longest_match, 56 int *p_match_first) internal_function; 57static int check_halt_state_context (const re_match_context_t *mctx, 58 const re_dfastate_t *state, int idx) 59 internal_function; 60static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 61 regmatch_t *prev_idx_match, int cur_node, 62 int cur_idx, int nmatch) internal_function; 63static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 64 int str_idx, int dest_node, int nregs, 65 regmatch_t *regs, 66 re_node_set *eps_via_nodes) 67 internal_function; 68static reg_errcode_t set_regs (const regex_t *preg, 69 const re_match_context_t *mctx, 70 size_t nmatch, regmatch_t *pmatch, 71 int fl_backtrack) internal_function; 72static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) 73 internal_function; 74 75#ifdef RE_ENABLE_I18N 76static int sift_states_iter_mb (const re_match_context_t *mctx, 77 re_sift_context_t *sctx, 78 int node_idx, int str_idx, int max_str_idx) 79 internal_function; 80#endif /* RE_ENABLE_I18N */ 81static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, 82 re_sift_context_t *sctx) 83 internal_function; 84static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, 85 re_sift_context_t *sctx, int str_idx, 86 re_node_set *cur_dest) 87 internal_function; 88static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, 89 re_sift_context_t *sctx, 90 int str_idx, 91 re_node_set *dest_nodes) 92 internal_function; 93static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, 94 re_node_set *dest_nodes, 95 const re_node_set *candidates) 96 internal_function; 97static int check_dst_limits (const re_match_context_t *mctx, 98 re_node_set *limits, 99 int dst_node, int dst_idx, int src_node, 100 int src_idx) internal_function; 101static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, 102 int boundaries, int subexp_idx, 103 int from_node, int bkref_idx) 104 internal_function; 105static int check_dst_limits_calc_pos (const re_match_context_t *mctx, 106 int limit, int subexp_idx, 107 int node, int str_idx, 108 int bkref_idx) internal_function; 109static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, 110 re_node_set *dest_nodes, 111 const re_node_set *candidates, 112 re_node_set *limits, 113 struct re_backref_cache_entry *bkref_ents, 114 int str_idx) internal_function; 115static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, 116 re_sift_context_t *sctx, 117 int str_idx, const re_node_set *candidates) 118 internal_function; 119static reg_errcode_t merge_state_array (const re_dfa_t *dfa, 120 re_dfastate_t **dst, 121 re_dfastate_t **src, int num) 122 internal_function; 123static re_dfastate_t *find_recover_state (reg_errcode_t *err, 124 re_match_context_t *mctx) internal_function; 125static re_dfastate_t *transit_state (reg_errcode_t *err, 126 re_match_context_t *mctx, 127 re_dfastate_t *state) internal_function; 128static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 129 re_match_context_t *mctx, 130 re_dfastate_t *next_state) 131 internal_function; 132static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 133 re_node_set *cur_nodes, 134 int str_idx) internal_function; 135#if 0 136static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 137 re_match_context_t *mctx, 138 re_dfastate_t *pstate) 139 internal_function; 140#endif 141#ifdef RE_ENABLE_I18N 142static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 143 re_dfastate_t *pstate) 144 internal_function; 145#endif /* RE_ENABLE_I18N */ 146static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 147 const re_node_set *nodes) 148 internal_function; 149static reg_errcode_t get_subexp (re_match_context_t *mctx, 150 int bkref_node, int bkref_str_idx) 151 internal_function; 152static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 153 const re_sub_match_top_t *sub_top, 154 re_sub_match_last_t *sub_last, 155 int bkref_node, int bkref_str) 156 internal_function; 157static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 158 int subexp_idx, int type) internal_function; 159static reg_errcode_t check_arrival (re_match_context_t *mctx, 160 state_array_t *path, int top_node, 161 int top_str, int last_node, int last_str, 162 int type) internal_function; 163static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 164 int str_idx, 165 re_node_set *cur_nodes, 166 re_node_set *next_nodes) 167 internal_function; 168static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, 169 re_node_set *cur_nodes, 170 int ex_subexp, int type) 171 internal_function; 172static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, 173 re_node_set *dst_nodes, 174 int target, int ex_subexp, 175 int type) internal_function; 176static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 177 re_node_set *cur_nodes, int cur_str, 178 int subexp_num, int type) 179 internal_function; 180static int build_trtable (const re_dfa_t *dfa, 181 re_dfastate_t *state) internal_function; 182#ifdef RE_ENABLE_I18N 183static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, 184 const re_string_t *input, int idx) 185 internal_function; 186# ifdef _LIBC 187static unsigned int find_collation_sequence_value (const unsigned char *mbs, 188 size_t name_len) 189 internal_function; 190# endif /* _LIBC */ 191#endif /* RE_ENABLE_I18N */ 192static int group_nodes_into_DFAstates (const re_dfa_t *dfa, 193 const re_dfastate_t *state, 194 re_node_set *states_node, 195 bitset_t *states_ch) internal_function; 196static int check_node_accept (const re_match_context_t *mctx, 197 const re_token_t *node, int idx) 198 internal_function; 199static reg_errcode_t extend_buffers (re_match_context_t *mctx) 200 internal_function; 201 202/* Entry point for POSIX code. */ 203 204/* regexec searches for a given pattern, specified by PREG, in the 205 string STRING. 206 207 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 208 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 209 least NMATCH elements, and we set them to the offsets of the 210 corresponding matched substrings. 211 212 EFLAGS specifies `execution flags' which affect matching: if 213 REG_NOTBOL is set, then ^ does not match at the beginning of the 214 string; if REG_NOTEOL is set, then $ does not match at the end. 215 216 We return 0 if we find a match and REG_NOMATCH if not. */ 217 218int 219regexec ( 220 const regex_t *__restrict preg, 221 const char *__restrict string, 222 size_t nmatch, 223 regmatch_t pmatch[], 224 int eflags) 225{ 226 reg_errcode_t err; 227 int start, length; 228 229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 230 return REG_BADPAT; 231 232 if (eflags & REG_STARTEND) 233 { 234 start = pmatch[0].rm_so; 235 length = pmatch[0].rm_eo; 236 } 237 else 238 { 239 start = 0; 240 length = strlen (string); 241 } 242 243 __libc_lock_lock (dfa->lock); 244 if (preg->no_sub) 245 err = re_search_internal (preg, string, length, start, length - start, 246 length, 0, NULL, eflags); 247 else 248 err = re_search_internal (preg, string, length, start, length - start, 249 length, nmatch, pmatch, eflags); 250 __libc_lock_unlock (dfa->lock); 251 return err != REG_NOERROR; 252} 253 254#ifdef _LIBC 255# include <shlib-compat.h> 256versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 257 258# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 259__typeof__ (__regexec) __compat_regexec; 260 261int 262attribute_compat_text_section 263__compat_regexec (const regex_t *__restrict preg, 264 const char *__restrict string, size_t nmatch, 265 regmatch_t pmatch[], int eflags) 266{ 267 return regexec (preg, string, nmatch, pmatch, 268 eflags & (REG_NOTBOL | REG_NOTEOL)); 269} 270compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 271# endif 272#endif 273 274/* Entry points for GNU code. */ 275 276/* re_match, re_search, re_match_2, re_search_2 277 278 The former two functions operate on STRING with length LENGTH, 279 while the later two operate on concatenation of STRING1 and STRING2 280 with lengths LENGTH1 and LENGTH2, respectively. 281 282 re_match() matches the compiled pattern in BUFP against the string, 283 starting at index START. 284 285 re_search() first tries matching at index START, then it tries to match 286 starting from index START + 1, and so on. The last start position tried 287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 288 way as re_match().) 289 290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 291 the first STOP characters of the concatenation of the strings should be 292 concerned. 293 294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 295 and all groups are stored in REGS. (For the "_2" variants, the offsets are 296 computed relative to the concatenation, not relative to the individual 297 strings.) 298 299 On success, re_match* functions return the length of the match, re_search* 300 return the position of the start of the match. Return value -1 means no 301 match was found and -2 indicates an internal error. */ 302 303int 304re_match (struct re_pattern_buffer *bufp, 305 const char *string, 306 int length, 307 int start, 308 struct re_registers *regs) 309{ 310 return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 311} 312#ifdef _LIBC 313weak_alias (__re_match, re_match) 314#endif 315 316int 317re_search (struct re_pattern_buffer *bufp, 318 const char *string, 319 int length, int start, int range, 320 struct re_registers *regs) 321{ 322 return re_search_stub (bufp, string, length, start, range, length, regs, 0); 323} 324#ifdef _LIBC 325weak_alias (__re_search, re_search) 326#endif 327 328int 329re_match_2 (struct re_pattern_buffer *bufp, 330 const char *string1, int length1, 331 const char *string2, int length2, int start, 332 struct re_registers *regs, int stop) 333{ 334 return re_search_2_stub (bufp, string1, length1, string2, length2, 335 start, 0, regs, stop, 1); 336} 337#ifdef _LIBC 338weak_alias (__re_match_2, re_match_2) 339#endif 340 341int 342re_search_2 (struct re_pattern_buffer *bufp, 343 const char *string1, int length1, 344 const char *string2, int length2, int start, 345 int range, struct re_registers *regs, int stop) 346{ 347 return re_search_2_stub (bufp, string1, length1, string2, length2, 348 start, range, regs, stop, 0); 349} 350#ifdef _LIBC 351weak_alias (__re_search_2, re_search_2) 352#endif 353 354static int 355re_search_2_stub (struct re_pattern_buffer *bufp, 356 const char *string1, int length1, 357 const char *string2, int length2, int start, 358 int range, struct re_registers *regs, 359 int stop, int ret_len) 360{ 361 const char *str; 362 int rval; 363 int len = length1 + length2; 364 int free_str = 0; 365 366 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 367 return -2; 368 369 /* Concatenate the strings. */ 370 if (length2 > 0) 371 if (length1 > 0) 372 { 373 char *s = re_malloc (char, len); 374 375 if (BE (s == NULL, 0)) 376 return -2; 377 memcpy (s, string1, length1); 378 memcpy (s + length1, string2, length2); 379 str = s; 380 free_str = 1; 381 } 382 else 383 str = string2; 384 else 385 str = string1; 386 387 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len); 388 if (free_str) 389 re_free ((char *) str); 390 return rval; 391} 392 393/* The parameters have the same meaning as those of re_search. 394 Additional parameters: 395 If RET_LEN is nonzero the length of the match is returned (re_match style); 396 otherwise the position of the match is returned. */ 397 398static int 399re_search_stub (struct re_pattern_buffer *bufp, 400 const char *string, int length, int start, 401 int range, int stop, 402 struct re_registers *regs, int ret_len) 403{ 404 reg_errcode_t result; 405 regmatch_t *pmatch; 406 int nregs, rval; 407 int eflags = 0; 408 409 /* Check for out-of-range. */ 410 if (BE (start < 0 || start > length, 0)) 411 return -1; 412 if (BE (start + range > length, 0)) 413 range = length - start; 414 else if (BE (start + range < 0, 0)) 415 range = -start; 416 417 __libc_lock_lock (dfa->lock); 418 419 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 420 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 421 422 /* Compile fastmap if we haven't yet. */ 423 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 424 re_compile_fastmap (bufp); 425 426 if (BE (bufp->no_sub, 0)) 427 regs = NULL; 428 429 /* We need at least 1 register. */ 430 if (regs == NULL) 431 nregs = 1; 432 else if (BE (bufp->regs_allocated == REGS_FIXED && 433 regs->num_regs < bufp->re_nsub + 1, 0)) 434 { 435 nregs = regs->num_regs; 436 if (BE (nregs < 1, 0)) 437 { 438 /* Nothing can be copied to regs. */ 439 regs = NULL; 440 nregs = 1; 441 } 442 } 443 else 444 nregs = bufp->re_nsub + 1; 445 pmatch = re_malloc (regmatch_t, nregs); 446 if (BE (pmatch == NULL, 0)) 447 { 448 rval = -2; 449 goto out; 450 } 451 452 result = re_search_internal (bufp, string, length, start, range, stop, 453 nregs, pmatch, eflags); 454 455 rval = 0; 456 457 /* I hope we needn't fill their regs with -1's when no match was found. */ 458 if (result != REG_NOERROR) 459 rval = -1; 460 else if (regs != NULL) 461 { 462 /* If caller wants register contents data back, copy them. */ 463 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 464 bufp->regs_allocated); 465 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 466 rval = -2; 467 } 468 469 if (BE (rval == 0, 1)) 470 { 471 if (ret_len) 472 { 473 assert (pmatch[0].rm_so == start); 474 rval = pmatch[0].rm_eo - start; 475 } 476 else 477 rval = pmatch[0].rm_so; 478 } 479 re_free (pmatch); 480 out: 481 __libc_lock_unlock (dfa->lock); 482 return rval; 483} 484 485static unsigned 486re_copy_regs (struct re_registers *regs, 487 regmatch_t *pmatch, 488 int nregs, int regs_allocated) 489{ 490 int rval = REGS_REALLOCATE; 491 int i; 492 int need_regs = nregs + 1; 493 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 494 uses. */ 495 496 /* Have the register data arrays been allocated? */ 497 if (regs_allocated == REGS_UNALLOCATED) 498 { /* No. So allocate them with malloc. */ 499 regs->start = re_malloc (regoff_t, need_regs); 500 if (BE (regs->start == NULL, 0)) 501 return REGS_UNALLOCATED; 502 regs->end = re_malloc (regoff_t, need_regs); 503 if (BE (regs->end == NULL, 0)) 504 { 505 re_free (regs->start); 506 return REGS_UNALLOCATED; 507 } 508 regs->num_regs = need_regs; 509 } 510 else if (regs_allocated == REGS_REALLOCATE) 511 { /* Yes. If we need more elements than were already 512 allocated, reallocate them. If we need fewer, just 513 leave it alone. */ 514 if (BE (need_regs > regs->num_regs, 0)) 515 { 516 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 517 regoff_t *new_end; 518 if (BE (new_start == NULL, 0)) 519 return REGS_UNALLOCATED; 520 new_end = re_realloc (regs->end, regoff_t, need_regs); 521 if (BE (new_end == NULL, 0)) 522 { 523 re_free (new_start); 524 return REGS_UNALLOCATED; 525 } 526 regs->start = new_start; 527 regs->end = new_end; 528 regs->num_regs = need_regs; 529 } 530 } 531 else 532 { 533 assert (regs_allocated == REGS_FIXED); 534 /* This function may not be called with REGS_FIXED and nregs too big. */ 535 assert (regs->num_regs >= nregs); 536 rval = REGS_FIXED; 537 } 538 539 /* Copy the regs. */ 540 for (i = 0; i < nregs; ++i) 541 { 542 regs->start[i] = pmatch[i].rm_so; 543 regs->end[i] = pmatch[i].rm_eo; 544 } 545 for ( ; i < regs->num_regs; ++i) 546 regs->start[i] = regs->end[i] = -1; 547 548 return rval; 549} 550 551/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 552 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 553 this memory for recording register information. STARTS and ENDS 554 must be allocated using the malloc library routine, and must each 555 be at least NUM_REGS * sizeof (regoff_t) bytes long. 556 557 If NUM_REGS == 0, then subsequent matches should allocate their own 558 register data. 559 560 Unless this function is called, the first search or match using 561 PATTERN_BUFFER will allocate its own register data, without 562 freeing the old data. */ 563 564void 565re_set_registers (struct re_pattern_buffer *bufp, 566 struct re_registers *regs, 567 unsigned num_regs, 568 regoff_t *starts, 569 regoff_t *ends) 570{ 571 if (num_regs) 572 { 573 bufp->regs_allocated = REGS_REALLOCATE; 574 regs->num_regs = num_regs; 575 regs->start = starts; 576 regs->end = ends; 577 } 578 else 579 { 580 bufp->regs_allocated = REGS_UNALLOCATED; 581 regs->num_regs = 0; 582 regs->start = regs->end = (regoff_t *) 0; 583 } 584} 585#ifdef _LIBC 586weak_alias (__re_set_registers, re_set_registers) 587#endif 588 589/* Entry points compatible with 4.2 BSD regex library. We don't define 590 them unless specifically requested. */ 591 592#if defined _REGEX_RE_COMP || defined _LIBC 593int 594# ifdef _LIBC 595weak_function 596# endif 597re_exec (s) 598 const char *s; 599{ 600 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 601} 602#endif /* _REGEX_RE_COMP */ 603 604/* Internal entry point. */ 605 606/* Searches for a compiled pattern PREG in the string STRING, whose 607 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 608 mingings with regexec. START, and RANGE have the same meanings 609 with re_search. 610 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 611 otherwise return the error code. 612 Note: We assume front end functions already check ranges. 613 (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 614 615static reg_errcode_t 616re_search_internal (const regex_t *preg, 617 const char *string, 618 int length, int start, int range, int stop, 619 size_t nmatch, regmatch_t pmatch[], 620 int eflags) 621{ 622 reg_errcode_t err; 623 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 624 int left_lim, right_lim, incr; 625 int fl_longest_match, match_first, match_kind, match_last = -1; 626 int extra_nmatch; 627 int sb, ch; 628#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 629 re_match_context_t mctx = { .dfa = dfa }; 630#else 631 re_match_context_t mctx; 632#endif 633 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate 634 && range && !preg->can_be_null) ? preg->fastmap : NULL; 635 RE_TRANSLATE_TYPE t = preg->translate; 636 637#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 638 memset (&mctx, '\0', sizeof (re_match_context_t)); 639 mctx.dfa = dfa; 640#endif 641 642 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 643 nmatch -= extra_nmatch; 644 645 /* Check if the DFA haven't been compiled. */ 646 if (BE (preg->used == 0 || dfa->init_state == NULL 647 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 648 || dfa->init_state_begbuf == NULL, 0)) 649 return REG_NOMATCH; 650 651#ifdef DEBUG 652 /* We assume front-end functions already check them. */ 653 assert (start + range >= 0 && start + range <= length); 654#endif 655 656 /* If initial states with non-begbuf contexts have no elements, 657 the regex must be anchored. If preg->newline_anchor is set, 658 we'll never use init_state_nl, so do not check it. */ 659 if (dfa->init_state->nodes.nelem == 0 660 && dfa->init_state_word->nodes.nelem == 0 661 && (dfa->init_state_nl->nodes.nelem == 0 662 || !preg->newline_anchor)) 663 { 664 if (start != 0 && start + range != 0) 665 return REG_NOMATCH; 666 start = range = 0; 667 } 668 669 /* We must check the longest matching, if nmatch > 0. */ 670 fl_longest_match = (nmatch != 0 || dfa->nbackref); 671 672 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 673 preg->translate, preg->syntax & RE_ICASE, dfa); 674 if (BE (err != REG_NOERROR, 0)) 675 goto free_return; 676 mctx.input.stop = stop; 677 mctx.input.raw_stop = stop; 678 mctx.input.newline_anchor = preg->newline_anchor; 679 680 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 681 if (BE (err != REG_NOERROR, 0)) 682 goto free_return; 683 684 /* We will log all the DFA states through which the dfa pass, 685 if nmatch > 1, or this dfa has "multibyte node", which is a 686 back-reference or a node which can accept multibyte character or 687 multi character collating element. */ 688 if (nmatch > 1 || dfa->has_mb_node) 689 { 690 /* Avoid overflow. */ 691 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0)) 692 { 693 err = REG_ESPACE; 694 goto free_return; 695 } 696 697 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 698 if (BE (mctx.state_log == NULL, 0)) 699 { 700 err = REG_ESPACE; 701 goto free_return; 702 } 703 } 704 else 705 mctx.state_log = NULL; 706 707 match_first = start; 708 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 709 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 710 711 /* Check incrementally whether of not the input string match. */ 712 incr = (range < 0) ? -1 : 1; 713 left_lim = (range < 0) ? start + range : start; 714 right_lim = (range < 0) ? start : start + range; 715 sb = dfa->mb_cur_max == 1; 716 match_kind = 717 (fastmap 718 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 719 | (range >= 0 ? 2 : 0) 720 | (t != NULL ? 1 : 0)) 721 : 8); 722 723 for (;; match_first += incr) 724 { 725 err = REG_NOMATCH; 726 if (match_first < left_lim || right_lim < match_first) 727 goto free_return; 728 729 /* Advance as rapidly as possible through the string, until we 730 find a plausible place to start matching. This may be done 731 with varying efficiency, so there are various possibilities: 732 only the most common of them are specialized, in order to 733 save on code size. We use a switch statement for speed. */ 734 switch (match_kind) 735 { 736 case 8: 737 /* No fastmap. */ 738 break; 739 740 case 7: 741 /* Fastmap with single-byte translation, match forward. */ 742 while (BE (match_first < right_lim, 1) 743 && !fastmap[t[(unsigned char) string[match_first]]]) 744 ++match_first; 745 goto forward_match_found_start_or_reached_end; 746 747 case 6: 748 /* Fastmap without translation, match forward. */ 749 while (BE (match_first < right_lim, 1) 750 && !fastmap[(unsigned char) string[match_first]]) 751 ++match_first; 752 753 forward_match_found_start_or_reached_end: 754 if (BE (match_first == right_lim, 0)) 755 { 756 ch = match_first >= length 757 ? 0 : (unsigned char) string[match_first]; 758 if (!fastmap[t ? t[ch] : ch]) 759 goto free_return; 760 } 761 break; 762 763 case 4: 764 case 5: 765 /* Fastmap without multi-byte translation, match backwards. */ 766 while (match_first >= left_lim) 767 { 768 ch = match_first >= length 769 ? 0 : (unsigned char) string[match_first]; 770 if (fastmap[t ? t[ch] : ch]) 771 break; 772 --match_first; 773 } 774 if (match_first < left_lim) 775 goto free_return; 776 break; 777 778 default: 779 /* In this case, we can't determine easily the current byte, 780 since it might be a component byte of a multibyte 781 character. Then we use the constructed buffer instead. */ 782 for (;;) 783 { 784 /* If MATCH_FIRST is out of the valid range, reconstruct the 785 buffers. */ 786 unsigned int offset = match_first - mctx.input.raw_mbs_idx; 787 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) 788 { 789 err = re_string_reconstruct (&mctx.input, match_first, 790 eflags); 791 if (BE (err != REG_NOERROR, 0)) 792 goto free_return; 793 794 offset = match_first - mctx.input.raw_mbs_idx; 795 } 796 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 797 Note that MATCH_FIRST must not be smaller than 0. */ 798 ch = (match_first >= length 799 ? 0 : re_string_byte_at (&mctx.input, offset)); 800 if (fastmap[ch]) 801 break; 802 match_first += incr; 803 if (match_first < left_lim || match_first > right_lim) 804 { 805 err = REG_NOMATCH; 806 goto free_return; 807 } 808 } 809 break; 810 } 811 812 /* Reconstruct the buffers so that the matcher can assume that 813 the matching starts from the beginning of the buffer. */ 814 err = re_string_reconstruct (&mctx.input, match_first, eflags); 815 if (BE (err != REG_NOERROR, 0)) 816 goto free_return; 817 818#ifdef RE_ENABLE_I18N 819 /* Don't consider this char as a possible match start if it part, 820 yet isn't the head, of a multibyte character. */ 821 if (!sb && !re_string_first_byte (&mctx.input, 0)) 822 continue; 823#endif 824 825 /* It seems to be appropriate one, then use the matcher. */ 826 /* We assume that the matching starts from 0. */ 827 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 828 match_last = check_matching (&mctx, fl_longest_match, 829 range >= 0 ? &match_first : NULL); 830 if (match_last != -1) 831 { 832 if (BE (match_last == -2, 0)) 833 { 834 err = REG_ESPACE; 835 goto free_return; 836 } 837 else 838 { 839 mctx.match_last = match_last; 840 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 841 { 842 re_dfastate_t *pstate = mctx.state_log[match_last]; 843 mctx.last_node = check_halt_state_context (&mctx, pstate, 844 match_last); 845 } 846 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 847 || dfa->nbackref) 848 { 849 err = prune_impossible_nodes (&mctx); 850 if (err == REG_NOERROR) 851 break; 852 if (BE (err != REG_NOMATCH, 0)) 853 goto free_return; 854 match_last = -1; 855 } 856 else 857 break; /* We found a match. */ 858 } 859 } 860 861 match_ctx_clean (&mctx); 862 } 863 864#ifdef DEBUG 865 assert (match_last != -1); 866 assert (err == REG_NOERROR); 867#endif 868 869 /* Set pmatch[] if we need. */ 870 if (nmatch > 0) 871 { 872 int reg_idx; 873 874 /* Initialize registers. */ 875 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 876 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 877 878 /* Set the points where matching start/end. */ 879 pmatch[0].rm_so = 0; 880 pmatch[0].rm_eo = mctx.match_last; 881 882 if (!preg->no_sub && nmatch > 1) 883 { 884 err = set_regs (preg, &mctx, nmatch, pmatch, 885 dfa->has_plural_match && dfa->nbackref > 0); 886 if (BE (err != REG_NOERROR, 0)) 887 goto free_return; 888 } 889 890 /* At last, add the offset to the each registers, since we slided 891 the buffers so that we could assume that the matching starts 892 from 0. */ 893 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 894 if (pmatch[reg_idx].rm_so != -1) 895 { 896#ifdef RE_ENABLE_I18N 897 if (BE (mctx.input.offsets_needed != 0, 0)) 898 { 899 pmatch[reg_idx].rm_so = 900 (pmatch[reg_idx].rm_so == mctx.input.valid_len 901 ? mctx.input.valid_raw_len 902 : mctx.input.offsets[pmatch[reg_idx].rm_so]); 903 pmatch[reg_idx].rm_eo = 904 (pmatch[reg_idx].rm_eo == mctx.input.valid_len 905 ? mctx.input.valid_raw_len 906 : mctx.input.offsets[pmatch[reg_idx].rm_eo]); 907 } 908#else 909 assert (mctx.input.offsets_needed == 0); 910#endif 911 pmatch[reg_idx].rm_so += match_first; 912 pmatch[reg_idx].rm_eo += match_first; 913 } 914 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 915 { 916 pmatch[nmatch + reg_idx].rm_so = -1; 917 pmatch[nmatch + reg_idx].rm_eo = -1; 918 } 919 920 if (dfa->subexp_map) 921 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 922 if (dfa->subexp_map[reg_idx] != reg_idx) 923 { 924 pmatch[reg_idx + 1].rm_so 925 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 926 pmatch[reg_idx + 1].rm_eo 927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 928 } 929 } 930 931 free_return: 932 re_free (mctx.state_log); 933 if (dfa->nbackref) 934 match_ctx_free (&mctx); 935 re_string_destruct (&mctx.input); 936 return err; 937} 938 939static reg_errcode_t 940prune_impossible_nodes (re_match_context_t *mctx) 941{ 942 const re_dfa_t *const dfa = mctx->dfa; 943 int halt_node, match_last; 944 reg_errcode_t ret; 945 re_dfastate_t **sifted_states; 946 re_dfastate_t **lim_states = NULL; 947 re_sift_context_t sctx; 948#ifdef DEBUG 949 assert (mctx->state_log != NULL); 950#endif 951 match_last = mctx->match_last; 952 halt_node = mctx->last_node; 953 954 /* Avoid overflow. */ 955 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0)) 956 return REG_ESPACE; 957 958 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 959 if (BE (sifted_states == NULL, 0)) 960 { 961 ret = REG_ESPACE; 962 goto free_return; 963 } 964 if (dfa->nbackref) 965 { 966 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 967 if (BE (lim_states == NULL, 0)) 968 { 969 ret = REG_ESPACE; 970 goto free_return; 971 } 972 while (1) 973 { 974 memset (lim_states, '\0', 975 sizeof (re_dfastate_t *) * (match_last + 1)); 976 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 977 match_last); 978 ret = sift_states_backward (mctx, &sctx); 979 re_node_set_free (&sctx.limits); 980 if (BE (ret != REG_NOERROR, 0)) 981 goto free_return; 982 if (sifted_states[0] != NULL || lim_states[0] != NULL) 983 break; 984 do 985 { 986 --match_last; 987 if (match_last < 0) 988 { 989 ret = REG_NOMATCH; 990 goto free_return; 991 } 992 } while (mctx->state_log[match_last] == NULL 993 || !mctx->state_log[match_last]->halt); 994 halt_node = check_halt_state_context (mctx, 995 mctx->state_log[match_last], 996 match_last); 997 } 998 ret = merge_state_array (dfa, sifted_states, lim_states, 999 match_last + 1); 1000 re_free (lim_states); 1001 lim_states = NULL; 1002 if (BE (ret != REG_NOERROR, 0)) 1003 goto free_return; 1004 } 1005 else 1006 { 1007 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 1008 ret = sift_states_backward (mctx, &sctx); 1009 re_node_set_free (&sctx.limits); 1010 if (BE (ret != REG_NOERROR, 0)) 1011 goto free_return; 1012 if (sifted_states[0] == NULL) 1013 { 1014 ret = REG_NOMATCH; 1015 goto free_return; 1016 } 1017 } 1018 re_free (mctx->state_log); 1019 mctx->state_log = sifted_states; 1020 sifted_states = NULL; 1021 mctx->last_node = halt_node; 1022 mctx->match_last = match_last; 1023 ret = REG_NOERROR; 1024 free_return: 1025 re_free (sifted_states); 1026 re_free (lim_states); 1027 return ret; 1028} 1029 1030/* Acquire an initial state and return it. 1031 We must select appropriate initial state depending on the context, 1032 since initial states may have constraints like "\<", "^", etc.. */ 1033 1034static inline re_dfastate_t * 1035__attribute ((always_inline)) internal_function 1036acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, 1037 int idx) 1038{ 1039 const re_dfa_t *const dfa = mctx->dfa; 1040 if (dfa->init_state->has_constraint) 1041 { 1042 unsigned int context; 1043 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1044 if (IS_WORD_CONTEXT (context)) 1045 return dfa->init_state_word; 1046 else if (IS_ORDINARY_CONTEXT (context)) 1047 return dfa->init_state; 1048 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1049 return dfa->init_state_begbuf; 1050 else if (IS_NEWLINE_CONTEXT (context)) 1051 return dfa->init_state_nl; 1052 else if (IS_BEGBUF_CONTEXT (context)) 1053 { 1054 /* It is relatively rare case, then calculate on demand. */ 1055 return re_acquire_state_context (err, dfa, 1056 dfa->init_state->entrance_nodes, 1057 context); 1058 } 1059 else 1060 /* Must not happen? */ 1061 return dfa->init_state; 1062 } 1063 else 1064 return dfa->init_state; 1065} 1066 1067/* Check whether the regular expression match input string INPUT or not, 1068 and return the index where the matching end, return -1 if not match, 1069 or return -2 in case of an error. 1070 FL_LONGEST_MATCH means we want the POSIX longest matching. 1071 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1072 next place where we may want to try matching. 1073 Note that the matcher assume that the matching starts from the current 1074 index of the buffer. */ 1075 1076static int 1077internal_function 1078check_matching (re_match_context_t *mctx, int fl_longest_match, 1079 int *p_match_first) 1080{ 1081 const re_dfa_t *const dfa = mctx->dfa; 1082 reg_errcode_t err; 1083 int match = 0; 1084 int match_last = -1; 1085 int cur_str_idx = re_string_cur_idx (&mctx->input); 1086 re_dfastate_t *cur_state; 1087 int at_init_state = p_match_first != NULL; 1088 int next_start_idx = cur_str_idx; 1089 1090 err = REG_NOERROR; 1091 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1092 /* An initial state must not be NULL (invalid). */ 1093 if (BE (cur_state == NULL, 0)) 1094 { 1095 assert (err == REG_ESPACE); 1096 return -2; 1097 } 1098 1099 if (mctx->state_log != NULL) 1100 { 1101 mctx->state_log[cur_str_idx] = cur_state; 1102 1103 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1104 later. E.g. Processing back references. */ 1105 if (BE (dfa->nbackref, 0)) 1106 { 1107 at_init_state = 0; 1108 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1109 if (BE (err != REG_NOERROR, 0)) 1110 return err; 1111 1112 if (cur_state->has_backref) 1113 { 1114 err = transit_state_bkref (mctx, &cur_state->nodes); 1115 if (BE (err != REG_NOERROR, 0)) 1116 return err; 1117 } 1118 } 1119 } 1120 1121 /* If the RE accepts NULL string. */ 1122 if (BE (cur_state->halt, 0)) 1123 { 1124 if (!cur_state->has_constraint 1125 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1126 { 1127 if (!fl_longest_match) 1128 return cur_str_idx; 1129 else 1130 { 1131 match_last = cur_str_idx; 1132 match = 1; 1133 } 1134 } 1135 } 1136 1137 while (!re_string_eoi (&mctx->input)) 1138 { 1139 re_dfastate_t *old_state = cur_state; 1140 int next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1141 1142 if (BE (next_char_idx >= mctx->input.bufs_len, 0) 1143 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1144 && mctx->input.valid_len < mctx->input.len)) 1145 { 1146 err = extend_buffers (mctx); 1147 if (BE (err != REG_NOERROR, 0)) 1148 { 1149 assert (err == REG_ESPACE); 1150 return -2; 1151 } 1152 } 1153 1154 cur_state = transit_state (&err, mctx, cur_state); 1155 if (mctx->state_log != NULL) 1156 cur_state = merge_state_with_log (&err, mctx, cur_state); 1157 1158 if (cur_state == NULL) 1159 { 1160 /* Reached the invalid state or an error. Try to recover a valid 1161 state using the state log, if available and if we have not 1162 already found a valid (even if not the longest) match. */ 1163 if (BE (err != REG_NOERROR, 0)) 1164 return -2; 1165 1166 if (mctx->state_log == NULL 1167 || (match && !fl_longest_match) 1168 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1169 break; 1170 } 1171 1172 if (BE (at_init_state, 0)) 1173 { 1174 if (old_state == cur_state) 1175 next_start_idx = next_char_idx; 1176 else 1177 at_init_state = 0; 1178 } 1179 1180 if (cur_state->halt) 1181 { 1182 /* Reached a halt state. 1183 Check the halt state can satisfy the current context. */ 1184 if (!cur_state->has_constraint 1185 || check_halt_state_context (mctx, cur_state, 1186 re_string_cur_idx (&mctx->input))) 1187 { 1188 /* We found an appropriate halt state. */ 1189 match_last = re_string_cur_idx (&mctx->input); 1190 match = 1; 1191 1192 /* We found a match, do not modify match_first below. */ 1193 p_match_first = NULL; 1194 if (!fl_longest_match) 1195 break; 1196 } 1197 } 1198 } 1199 1200 if (p_match_first) 1201 *p_match_first += next_start_idx; 1202 1203 return match_last; 1204} 1205 1206/* Check NODE match the current context. */ 1207 1208static int 1209internal_function 1210check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context) 1211{ 1212 re_token_type_t type = dfa->nodes[node].type; 1213 unsigned int constraint = dfa->nodes[node].constraint; 1214 if (type != END_OF_RE) 1215 return 0; 1216 if (!constraint) 1217 return 1; 1218 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1219 return 0; 1220 return 1; 1221} 1222 1223/* Check the halt state STATE match the current context. 1224 Return 0 if not match, if the node, STATE has, is a halt node and 1225 match the context, return the node. */ 1226 1227static int 1228internal_function 1229check_halt_state_context (const re_match_context_t *mctx, 1230 const re_dfastate_t *state, int idx) 1231{ 1232 int i; 1233 unsigned int context; 1234#ifdef DEBUG 1235 assert (state->halt); 1236#endif 1237 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1238 for (i = 0; i < state->nodes.nelem; ++i) 1239 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1240 return state->nodes.elems[i]; 1241 return 0; 1242} 1243 1244/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1245 corresponding to the DFA). 1246 Return the destination node, and update EPS_VIA_NODES, return -1 in case 1247 of errors. */ 1248 1249static int 1250internal_function 1251proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs, 1252 int *pidx, int node, re_node_set *eps_via_nodes, 1253 struct re_fail_stack_t *fs) 1254{ 1255 const re_dfa_t *const dfa = mctx->dfa; 1256 int i, err; 1257 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1258 { 1259 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1260 re_node_set *edests = &dfa->edests[node]; 1261 int dest_node; 1262 err = re_node_set_insert (eps_via_nodes, node); 1263 if (BE (err < 0, 0)) 1264 return -2; 1265 /* Pick up a valid destination, or return -1 if none is found. */ 1266 for (dest_node = -1, i = 0; i < edests->nelem; ++i) 1267 { 1268 int candidate = edests->elems[i]; 1269 if (!re_node_set_contains (cur_nodes, candidate)) 1270 continue; 1271 if (dest_node == -1) 1272 dest_node = candidate; 1273 1274 else 1275 { 1276 /* In order to avoid infinite loop like "(a*)*", return the second 1277 epsilon-transition if the first was already considered. */ 1278 if (re_node_set_contains (eps_via_nodes, dest_node)) 1279 return candidate; 1280 1281 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1282 else if (fs != NULL 1283 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1284 eps_via_nodes)) 1285 return -2; 1286 1287 /* We know we are going to exit. */ 1288 break; 1289 } 1290 } 1291 return dest_node; 1292 } 1293 else 1294 { 1295 int naccepted = 0; 1296 re_token_type_t type = dfa->nodes[node].type; 1297 1298#ifdef RE_ENABLE_I18N 1299 if (dfa->nodes[node].accept_mb) 1300 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1301 else 1302#endif /* RE_ENABLE_I18N */ 1303 if (type == OP_BACK_REF) 1304 { 1305 int subexp_idx = dfa->nodes[node].opr.idx + 1; 1306 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1307 if (fs != NULL) 1308 { 1309 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1310 return -1; 1311 else if (naccepted) 1312 { 1313 char *buf = (char *) re_string_get_buffer (&mctx->input); 1314 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1315 naccepted) != 0) 1316 return -1; 1317 } 1318 } 1319 1320 if (naccepted == 0) 1321 { 1322 int dest_node; 1323 err = re_node_set_insert (eps_via_nodes, node); 1324 if (BE (err < 0, 0)) 1325 return -2; 1326 dest_node = dfa->edests[node].elems[0]; 1327 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1328 dest_node)) 1329 return dest_node; 1330 } 1331 } 1332 1333 if (naccepted != 0 1334 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1335 { 1336 int dest_node = dfa->nexts[node]; 1337 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1338 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1339 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1340 dest_node))) 1341 return -1; 1342 re_node_set_empty (eps_via_nodes); 1343 return dest_node; 1344 } 1345 } 1346 return -1; 1347} 1348 1349static reg_errcode_t 1350internal_function 1351push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node, 1352 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes) 1353{ 1354 reg_errcode_t err; 1355 int num = fs->num++; 1356 if (fs->num == fs->alloc) 1357 { 1358 struct re_fail_stack_ent_t *new_array; 1359 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1360 * fs->alloc * 2)); 1361 if (new_array == NULL) 1362 return REG_ESPACE; 1363 fs->alloc *= 2; 1364 fs->stack = new_array; 1365 } 1366 fs->stack[num].idx = str_idx; 1367 fs->stack[num].node = dest_node; 1368 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1369 if (fs->stack[num].regs == NULL) 1370 return REG_ESPACE; 1371 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1372 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1373 return err; 1374} 1375 1376static int 1377internal_function 1378pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, 1379 regmatch_t *regs, re_node_set *eps_via_nodes) 1380{ 1381 int num = --fs->num; 1382 assert (num >= 0); 1383 *pidx = fs->stack[num].idx; 1384 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1385 re_node_set_free (eps_via_nodes); 1386 re_free (fs->stack[num].regs); 1387 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1388 return fs->stack[num].node; 1389} 1390 1391/* Set the positions where the subexpressions are starts/ends to registers 1392 PMATCH. 1393 Note: We assume that pmatch[0] is already set, and 1394 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1395 1396static reg_errcode_t 1397internal_function 1398set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, 1399 regmatch_t *pmatch, int fl_backtrack) 1400{ 1401 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 1402 int idx, cur_node; 1403 re_node_set eps_via_nodes; 1404 struct re_fail_stack_t *fs; 1405 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1406 regmatch_t *prev_idx_match; 1407 int prev_idx_match_malloced = 0; 1408 1409#ifdef DEBUG 1410 assert (nmatch > 1); 1411 assert (mctx->state_log != NULL); 1412#endif 1413 if (fl_backtrack) 1414 { 1415 fs = &fs_body; 1416 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1417 if (fs->stack == NULL) 1418 return REG_ESPACE; 1419 } 1420 else 1421 fs = NULL; 1422 1423 cur_node = dfa->init_node; 1424 re_node_set_init_empty (&eps_via_nodes); 1425 1426#ifdef HAVE_ALLOCA 1427 if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) 1428 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); 1429 else 1430#endif 1431 { 1432 prev_idx_match = re_malloc (regmatch_t, nmatch); 1433 if (prev_idx_match == NULL) 1434 { 1435 free_fail_stack_return (fs); 1436 return REG_ESPACE; 1437 } 1438 prev_idx_match_malloced = 1; 1439 } 1440 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1441 1442 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1443 { 1444 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1445 1446 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1447 { 1448 int reg_idx; 1449 if (fs) 1450 { 1451 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1452 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1453 break; 1454 if (reg_idx == nmatch) 1455 { 1456 re_node_set_free (&eps_via_nodes); 1457 if (prev_idx_match_malloced) 1458 re_free (prev_idx_match); 1459 return free_fail_stack_return (fs); 1460 } 1461 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1462 &eps_via_nodes); 1463 } 1464 else 1465 { 1466 re_node_set_free (&eps_via_nodes); 1467 if (prev_idx_match_malloced) 1468 re_free (prev_idx_match); 1469 return REG_NOERROR; 1470 } 1471 } 1472 1473 /* Proceed to next node. */ 1474 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1475 &eps_via_nodes, fs); 1476 1477 if (BE (cur_node < 0, 0)) 1478 { 1479 if (BE (cur_node == -2, 0)) 1480 { 1481 re_node_set_free (&eps_via_nodes); 1482 if (prev_idx_match_malloced) 1483 re_free (prev_idx_match); 1484 free_fail_stack_return (fs); 1485 return REG_ESPACE; 1486 } 1487 if (fs) 1488 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1489 &eps_via_nodes); 1490 else 1491 { 1492 re_node_set_free (&eps_via_nodes); 1493 if (prev_idx_match_malloced) 1494 re_free (prev_idx_match); 1495 return REG_NOMATCH; 1496 } 1497 } 1498 } 1499 re_node_set_free (&eps_via_nodes); 1500 if (prev_idx_match_malloced) 1501 re_free (prev_idx_match); 1502 return free_fail_stack_return (fs); 1503} 1504 1505static reg_errcode_t 1506internal_function 1507free_fail_stack_return (struct re_fail_stack_t *fs) 1508{ 1509 if (fs) 1510 { 1511 int fs_idx; 1512 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1513 { 1514 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1515 re_free (fs->stack[fs_idx].regs); 1516 } 1517 re_free (fs->stack); 1518 } 1519 return REG_NOERROR; 1520} 1521 1522static void 1523internal_function 1524update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 1525 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch) 1526{ 1527 int type = dfa->nodes[cur_node].type; 1528 if (type == OP_OPEN_SUBEXP) 1529 { 1530 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1531 1532 /* We are at the first node of this sub expression. */ 1533 if (reg_num < nmatch) 1534 { 1535 pmatch[reg_num].rm_so = cur_idx; 1536 pmatch[reg_num].rm_eo = -1; 1537 } 1538 } 1539 else if (type == OP_CLOSE_SUBEXP) 1540 { 1541 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1542 if (reg_num < nmatch) 1543 { 1544 /* We are at the last node of this sub expression. */ 1545 if (pmatch[reg_num].rm_so < cur_idx) 1546 { 1547 pmatch[reg_num].rm_eo = cur_idx; 1548 /* This is a non-empty match or we are not inside an optional 1549 subexpression. Accept this right away. */ 1550 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1551 } 1552 else 1553 { 1554 if (dfa->nodes[cur_node].opt_subexp 1555 && prev_idx_match[reg_num].rm_so != -1) 1556 /* We transited through an empty match for an optional 1557 subexpression, like (a?)*, and this is not the subexp's 1558 first match. Copy back the old content of the registers 1559 so that matches of an inner subexpression are undone as 1560 well, like in ((a?))*. */ 1561 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1562 else 1563 /* We completed a subexpression, but it may be part of 1564 an optional one, so do not update PREV_IDX_MATCH. */ 1565 pmatch[reg_num].rm_eo = cur_idx; 1566 } 1567 } 1568 } 1569} 1570 1571/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1572 and sift the nodes in each states according to the following rules. 1573 Updated state_log will be wrote to STATE_LOG. 1574 1575 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1576 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1577 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1578 the LAST_NODE, we throw away the node `a'. 1579 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1580 string `s' and transit to `b': 1581 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1582 away the node `a'. 1583 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1584 thrown away, we throw away the node `a'. 1585 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1586 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1587 node `a'. 1588 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1589 we throw away the node `a'. */ 1590 1591#define STATE_NODE_CONTAINS(state,node) \ 1592 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1593 1594static reg_errcode_t 1595internal_function 1596sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) 1597{ 1598 reg_errcode_t err; 1599 int null_cnt = 0; 1600 int str_idx = sctx->last_str_idx; 1601 re_node_set cur_dest; 1602 1603#ifdef DEBUG 1604 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1605#endif 1606 1607 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1608 transit to the last_node and the last_node itself. */ 1609 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1610 if (BE (err != REG_NOERROR, 0)) 1611 return err; 1612 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1613 if (BE (err != REG_NOERROR, 0)) 1614 goto free_return; 1615 1616 /* Then check each states in the state_log. */ 1617 while (str_idx > 0) 1618 { 1619 /* Update counters. */ 1620 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1621 if (null_cnt > mctx->max_mb_elem_len) 1622 { 1623 memset (sctx->sifted_states, '\0', 1624 sizeof (re_dfastate_t *) * str_idx); 1625 re_node_set_free (&cur_dest); 1626 return REG_NOERROR; 1627 } 1628 re_node_set_empty (&cur_dest); 1629 --str_idx; 1630 1631 if (mctx->state_log[str_idx]) 1632 { 1633 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1634 if (BE (err != REG_NOERROR, 0)) 1635 goto free_return; 1636 } 1637 1638 /* Add all the nodes which satisfy the following conditions: 1639 - It can epsilon transit to a node in CUR_DEST. 1640 - It is in CUR_SRC. 1641 And update state_log. */ 1642 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1643 if (BE (err != REG_NOERROR, 0)) 1644 goto free_return; 1645 } 1646 err = REG_NOERROR; 1647 free_return: 1648 re_node_set_free (&cur_dest); 1649 return err; 1650} 1651 1652static reg_errcode_t 1653internal_function 1654build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, 1655 int str_idx, re_node_set *cur_dest) 1656{ 1657 const re_dfa_t *const dfa = mctx->dfa; 1658 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1659 int i; 1660 1661 /* Then build the next sifted state. 1662 We build the next sifted state on `cur_dest', and update 1663 `sifted_states[str_idx]' with `cur_dest'. 1664 Note: 1665 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1666 `cur_src' points the node_set of the old `state_log[str_idx]' 1667 (with the epsilon nodes pre-filtered out). */ 1668 for (i = 0; i < cur_src->nelem; i++) 1669 { 1670 int prev_node = cur_src->elems[i]; 1671 int naccepted = 0; 1672 int ret; 1673 1674#ifdef DEBUG 1675 re_token_type_t type = dfa->nodes[prev_node].type; 1676 assert (!IS_EPSILON_NODE (type)); 1677#endif 1678#ifdef RE_ENABLE_I18N 1679 /* If the node may accept `multi byte'. */ 1680 if (dfa->nodes[prev_node].accept_mb) 1681 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1682 str_idx, sctx->last_str_idx); 1683#endif /* RE_ENABLE_I18N */ 1684 1685 /* We don't check backreferences here. 1686 See update_cur_sifted_state(). */ 1687 if (!naccepted 1688 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1689 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1690 dfa->nexts[prev_node])) 1691 naccepted = 1; 1692 1693 if (naccepted == 0) 1694 continue; 1695 1696 if (sctx->limits.nelem) 1697 { 1698 int to_idx = str_idx + naccepted; 1699 if (check_dst_limits (mctx, &sctx->limits, 1700 dfa->nexts[prev_node], to_idx, 1701 prev_node, str_idx)) 1702 continue; 1703 } 1704 ret = re_node_set_insert (cur_dest, prev_node); 1705 if (BE (ret == -1, 0)) 1706 return REG_ESPACE; 1707 } 1708 1709 return REG_NOERROR; 1710} 1711 1712/* Helper functions. */ 1713 1714static reg_errcode_t 1715internal_function 1716clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx) 1717{ 1718 int top = mctx->state_log_top; 1719 1720 if (next_state_log_idx >= mctx->input.bufs_len 1721 || (next_state_log_idx >= mctx->input.valid_len 1722 && mctx->input.valid_len < mctx->input.len)) 1723 { 1724 reg_errcode_t err; 1725 err = extend_buffers (mctx); 1726 if (BE (err != REG_NOERROR, 0)) 1727 return err; 1728 } 1729 1730 if (top < next_state_log_idx) 1731 { 1732 memset (mctx->state_log + top + 1, '\0', 1733 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1734 mctx->state_log_top = next_state_log_idx; 1735 } 1736 return REG_NOERROR; 1737} 1738 1739static reg_errcode_t 1740internal_function 1741merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, 1742 re_dfastate_t **src, int num) 1743{ 1744 int st_idx; 1745 reg_errcode_t err; 1746 for (st_idx = 0; st_idx < num; ++st_idx) 1747 { 1748 if (dst[st_idx] == NULL) 1749 dst[st_idx] = src[st_idx]; 1750 else if (src[st_idx] != NULL) 1751 { 1752 re_node_set merged_set; 1753 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1754 &src[st_idx]->nodes); 1755 if (BE (err != REG_NOERROR, 0)) 1756 return err; 1757 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1758 re_node_set_free (&merged_set); 1759 if (BE (err != REG_NOERROR, 0)) 1760 return err; 1761 } 1762 } 1763 return REG_NOERROR; 1764} 1765 1766static reg_errcode_t 1767internal_function 1768update_cur_sifted_state (const re_match_context_t *mctx, 1769 re_sift_context_t *sctx, int str_idx, 1770 re_node_set *dest_nodes) 1771{ 1772 const re_dfa_t *const dfa = mctx->dfa; 1773 reg_errcode_t err = REG_NOERROR; 1774 const re_node_set *candidates; 1775 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1776 : &mctx->state_log[str_idx]->nodes); 1777 1778 if (dest_nodes->nelem == 0) 1779 sctx->sifted_states[str_idx] = NULL; 1780 else 1781 { 1782 if (candidates) 1783 { 1784 /* At first, add the nodes which can epsilon transit to a node in 1785 DEST_NODE. */ 1786 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1787 if (BE (err != REG_NOERROR, 0)) 1788 return err; 1789 1790 /* Then, check the limitations in the current sift_context. */ 1791 if (sctx->limits.nelem) 1792 { 1793 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1794 mctx->bkref_ents, str_idx); 1795 if (BE (err != REG_NOERROR, 0)) 1796 return err; 1797 } 1798 } 1799 1800 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1801 if (BE (err != REG_NOERROR, 0)) 1802 return err; 1803 } 1804 1805 if (candidates && mctx->state_log[str_idx]->has_backref) 1806 { 1807 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1808 if (BE (err != REG_NOERROR, 0)) 1809 return err; 1810 } 1811 return REG_NOERROR; 1812} 1813 1814static reg_errcode_t 1815internal_function 1816add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, 1817 const re_node_set *candidates) 1818{ 1819 reg_errcode_t err = REG_NOERROR; 1820 int i; 1821 1822 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1823 if (BE (err != REG_NOERROR, 0)) 1824 return err; 1825 1826 if (!state->inveclosure.alloc) 1827 { 1828 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1829 if (BE (err != REG_NOERROR, 0)) 1830 return REG_ESPACE; 1831 for (i = 0; i < dest_nodes->nelem; i++) 1832 { 1833 err = re_node_set_merge (&state->inveclosure, 1834 dfa->inveclosures + dest_nodes->elems[i]); 1835 if (BE (err != REG_NOERROR, 0)) 1836 return REG_ESPACE; 1837 } 1838 } 1839 return re_node_set_add_intersect (dest_nodes, candidates, 1840 &state->inveclosure); 1841} 1842 1843static reg_errcode_t 1844internal_function 1845sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes, 1846 const re_node_set *candidates) 1847{ 1848 int ecl_idx; 1849 reg_errcode_t err; 1850 re_node_set *inv_eclosure = dfa->inveclosures + node; 1851 re_node_set except_nodes; 1852 re_node_set_init_empty (&except_nodes); 1853 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1854 { 1855 int cur_node = inv_eclosure->elems[ecl_idx]; 1856 if (cur_node == node) 1857 continue; 1858 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1859 { 1860 int edst1 = dfa->edests[cur_node].elems[0]; 1861 int edst2 = ((dfa->edests[cur_node].nelem > 1) 1862 ? dfa->edests[cur_node].elems[1] : -1); 1863 if ((!re_node_set_contains (inv_eclosure, edst1) 1864 && re_node_set_contains (dest_nodes, edst1)) 1865 || (edst2 > 0 1866 && !re_node_set_contains (inv_eclosure, edst2) 1867 && re_node_set_contains (dest_nodes, edst2))) 1868 { 1869 err = re_node_set_add_intersect (&except_nodes, candidates, 1870 dfa->inveclosures + cur_node); 1871 if (BE (err != REG_NOERROR, 0)) 1872 { 1873 re_node_set_free (&except_nodes); 1874 return err; 1875 } 1876 } 1877 } 1878 } 1879 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1880 { 1881 int cur_node = inv_eclosure->elems[ecl_idx]; 1882 if (!re_node_set_contains (&except_nodes, cur_node)) 1883 { 1884 int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1885 re_node_set_remove_at (dest_nodes, idx); 1886 } 1887 } 1888 re_node_set_free (&except_nodes); 1889 return REG_NOERROR; 1890} 1891 1892static int 1893internal_function 1894check_dst_limits (const re_match_context_t *mctx, re_node_set *limits, 1895 int dst_node, int dst_idx, int src_node, int src_idx) 1896{ 1897 const re_dfa_t *const dfa = mctx->dfa; 1898 int lim_idx, src_pos, dst_pos; 1899 1900 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1901 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1902 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1903 { 1904 int subexp_idx; 1905 struct re_backref_cache_entry *ent; 1906 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1907 subexp_idx = dfa->nodes[ent->node].opr.idx; 1908 1909 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1910 subexp_idx, dst_node, dst_idx, 1911 dst_bkref_idx); 1912 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1913 subexp_idx, src_node, src_idx, 1914 src_bkref_idx); 1915 1916 /* In case of: 1917 <src> <dst> ( <subexp> ) 1918 ( <subexp> ) <src> <dst> 1919 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1920 if (src_pos == dst_pos) 1921 continue; /* This is unrelated limitation. */ 1922 else 1923 return 1; 1924 } 1925 return 0; 1926} 1927 1928static int 1929internal_function 1930check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, 1931 int subexp_idx, int from_node, int bkref_idx) 1932{ 1933 const re_dfa_t *const dfa = mctx->dfa; 1934 const re_node_set *eclosures = dfa->eclosures + from_node; 1935 int node_idx; 1936 1937 /* Else, we are on the boundary: examine the nodes on the epsilon 1938 closure. */ 1939 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1940 { 1941 int node = eclosures->elems[node_idx]; 1942 switch (dfa->nodes[node].type) 1943 { 1944 case OP_BACK_REF: 1945 if (bkref_idx != -1) 1946 { 1947 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1948 do 1949 { 1950 int dst, cpos; 1951 1952 if (ent->node != node) 1953 continue; 1954 1955 if (subexp_idx < BITSET_WORD_BITS 1956 && !(ent->eps_reachable_subexps_map 1957 & ((bitset_word_t) 1 << subexp_idx))) 1958 continue; 1959 1960 /* Recurse trying to reach the OP_OPEN_SUBEXP and 1961 OP_CLOSE_SUBEXP cases below. But, if the 1962 destination node is the same node as the source 1963 node, don't recurse because it would cause an 1964 infinite loop: a regex that exhibits this behavior 1965 is ()\1*\1* */ 1966 dst = dfa->edests[node].elems[0]; 1967 if (dst == from_node) 1968 { 1969 if (boundaries & 1) 1970 return -1; 1971 else /* if (boundaries & 2) */ 1972 return 0; 1973 } 1974 1975 cpos = 1976 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1977 dst, bkref_idx); 1978 if (cpos == -1 /* && (boundaries & 1) */) 1979 return -1; 1980 if (cpos == 0 && (boundaries & 2)) 1981 return 0; 1982 1983 if (subexp_idx < BITSET_WORD_BITS) 1984 ent->eps_reachable_subexps_map 1985 &= ~((bitset_word_t) 1 << subexp_idx); 1986 } 1987 while (ent++->more); 1988 } 1989 break; 1990 1991 case OP_OPEN_SUBEXP: 1992 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 1993 return -1; 1994 break; 1995 1996 case OP_CLOSE_SUBEXP: 1997 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 1998 return 0; 1999 break; 2000 2001 default: 2002 break; 2003 } 2004 } 2005 2006 return (boundaries & 2) ? 1 : 0; 2007} 2008 2009static int 2010internal_function 2011check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit, 2012 int subexp_idx, int from_node, int str_idx, 2013 int bkref_idx) 2014{ 2015 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 2016 int boundaries; 2017 2018 /* If we are outside the range of the subexpression, return -1 or 1. */ 2019 if (str_idx < lim->subexp_from) 2020 return -1; 2021 2022 if (lim->subexp_to < str_idx) 2023 return 1; 2024 2025 /* If we are within the subexpression, return 0. */ 2026 boundaries = (str_idx == lim->subexp_from); 2027 boundaries |= (str_idx == lim->subexp_to) << 1; 2028 if (boundaries == 0) 2029 return 0; 2030 2031 /* Else, examine epsilon closure. */ 2032 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2033 from_node, bkref_idx); 2034} 2035 2036/* Check the limitations of sub expressions LIMITS, and remove the nodes 2037 which are against limitations from DEST_NODES. */ 2038 2039static reg_errcode_t 2040internal_function 2041check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, 2042 const re_node_set *candidates, re_node_set *limits, 2043 struct re_backref_cache_entry *bkref_ents, int str_idx) 2044{ 2045 reg_errcode_t err; 2046 int node_idx, lim_idx; 2047 2048 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2049 { 2050 int subexp_idx; 2051 struct re_backref_cache_entry *ent; 2052 ent = bkref_ents + limits->elems[lim_idx]; 2053 2054 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2055 continue; /* This is unrelated limitation. */ 2056 2057 subexp_idx = dfa->nodes[ent->node].opr.idx; 2058 if (ent->subexp_to == str_idx) 2059 { 2060 int ops_node = -1; 2061 int cls_node = -1; 2062 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2063 { 2064 int node = dest_nodes->elems[node_idx]; 2065 re_token_type_t type = dfa->nodes[node].type; 2066 if (type == OP_OPEN_SUBEXP 2067 && subexp_idx == dfa->nodes[node].opr.idx) 2068 ops_node = node; 2069 else if (type == OP_CLOSE_SUBEXP 2070 && subexp_idx == dfa->nodes[node].opr.idx) 2071 cls_node = node; 2072 } 2073 2074 /* Check the limitation of the open subexpression. */ 2075 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2076 if (ops_node >= 0) 2077 { 2078 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2079 candidates); 2080 if (BE (err != REG_NOERROR, 0)) 2081 return err; 2082 } 2083 2084 /* Check the limitation of the close subexpression. */ 2085 if (cls_node >= 0) 2086 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2087 { 2088 int node = dest_nodes->elems[node_idx]; 2089 if (!re_node_set_contains (dfa->inveclosures + node, 2090 cls_node) 2091 && !re_node_set_contains (dfa->eclosures + node, 2092 cls_node)) 2093 { 2094 /* It is against this limitation. 2095 Remove it form the current sifted state. */ 2096 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2097 candidates); 2098 if (BE (err != REG_NOERROR, 0)) 2099 return err; 2100 --node_idx; 2101 } 2102 } 2103 } 2104 else /* (ent->subexp_to != str_idx) */ 2105 { 2106 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2107 { 2108 int node = dest_nodes->elems[node_idx]; 2109 re_token_type_t type = dfa->nodes[node].type; 2110 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2111 { 2112 if (subexp_idx != dfa->nodes[node].opr.idx) 2113 continue; 2114 /* It is against this limitation. 2115 Remove it form the current sifted state. */ 2116 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2117 candidates); 2118 if (BE (err != REG_NOERROR, 0)) 2119 return err; 2120 } 2121 } 2122 } 2123 } 2124 return REG_NOERROR; 2125} 2126 2127static reg_errcode_t 2128internal_function 2129sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, 2130 int str_idx, const re_node_set *candidates) 2131{ 2132 const re_dfa_t *const dfa = mctx->dfa; 2133 reg_errcode_t err; 2134 int node_idx, node; 2135 re_sift_context_t local_sctx; 2136 int first_idx = search_cur_bkref_entry (mctx, str_idx); 2137 2138 if (first_idx == -1) 2139 return REG_NOERROR; 2140 2141 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2142 2143 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2144 { 2145 int enabled_idx; 2146 re_token_type_t type; 2147 struct re_backref_cache_entry *entry; 2148 node = candidates->elems[node_idx]; 2149 type = dfa->nodes[node].type; 2150 /* Avoid infinite loop for the REs like "()\1+". */ 2151 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2152 continue; 2153 if (type != OP_BACK_REF) 2154 continue; 2155 2156 entry = mctx->bkref_ents + first_idx; 2157 enabled_idx = first_idx; 2158 do 2159 { 2160 int subexp_len; 2161 int to_idx; 2162 int dst_node; 2163 int ret; 2164 re_dfastate_t *cur_state; 2165 2166 if (entry->node != node) 2167 continue; 2168 subexp_len = entry->subexp_to - entry->subexp_from; 2169 to_idx = str_idx + subexp_len; 2170 dst_node = (subexp_len ? dfa->nexts[node] 2171 : dfa->edests[node].elems[0]); 2172 2173 if (to_idx > sctx->last_str_idx 2174 || sctx->sifted_states[to_idx] == NULL 2175 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2176 || check_dst_limits (mctx, &sctx->limits, node, 2177 str_idx, dst_node, to_idx)) 2178 continue; 2179 2180 if (local_sctx.sifted_states == NULL) 2181 { 2182 local_sctx = *sctx; 2183 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2184 if (BE (err != REG_NOERROR, 0)) 2185 goto free_return; 2186 } 2187 local_sctx.last_node = node; 2188 local_sctx.last_str_idx = str_idx; 2189 ret = re_node_set_insert (&local_sctx.limits, enabled_idx); 2190 if (BE (ret < 0, 0)) 2191 { 2192 err = REG_ESPACE; 2193 goto free_return; 2194 } 2195 cur_state = local_sctx.sifted_states[str_idx]; 2196 err = sift_states_backward (mctx, &local_sctx); 2197 if (BE (err != REG_NOERROR, 0)) 2198 goto free_return; 2199 if (sctx->limited_states != NULL) 2200 { 2201 err = merge_state_array (dfa, sctx->limited_states, 2202 local_sctx.sifted_states, 2203 str_idx + 1); 2204 if (BE (err != REG_NOERROR, 0)) 2205 goto free_return; 2206 } 2207 local_sctx.sifted_states[str_idx] = cur_state; 2208 re_node_set_remove (&local_sctx.limits, enabled_idx); 2209 2210 /* mctx->bkref_ents may have changed, reload the pointer. */ 2211 entry = mctx->bkref_ents + enabled_idx; 2212 } 2213 while ((void)enabled_idx++, entry++->more); 2214 } 2215 err = REG_NOERROR; 2216 free_return: 2217 if (local_sctx.sifted_states != NULL) 2218 { 2219 re_node_set_free (&local_sctx.limits); 2220 } 2221 2222 return err; 2223} 2224 2225 2226#ifdef RE_ENABLE_I18N 2227static int 2228internal_function 2229sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 2230 int node_idx, int str_idx, int max_str_idx) 2231{ 2232 const re_dfa_t *const dfa = mctx->dfa; 2233 int naccepted; 2234 /* Check the node can accept `multi byte'. */ 2235 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2236 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2237 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2238 dfa->nexts[node_idx])) 2239 /* The node can't accept the `multi byte', or the 2240 destination was already thrown away, then the node 2241 couldn't accept the current input `multi byte'. */ 2242 naccepted = 0; 2243 /* Otherwise, it is sure that the node could accept 2244 `naccepted' bytes input. */ 2245 return naccepted; 2246} 2247#endif /* RE_ENABLE_I18N */ 2248 2249 2250/* Functions for state transition. */ 2251 2252/* Return the next state to which the current state STATE will transit by 2253 accepting the current input byte, and update STATE_LOG if necessary. 2254 If STATE can accept a multibyte char/collating element/back reference 2255 update the destination of STATE_LOG. */ 2256 2257static re_dfastate_t * 2258internal_function 2259transit_state (reg_errcode_t *err, re_match_context_t *mctx, 2260 re_dfastate_t *state) 2261{ 2262 re_dfastate_t **trtable; 2263 unsigned char ch; 2264 2265#ifdef RE_ENABLE_I18N 2266 /* If the current state can accept multibyte. */ 2267 if (BE (state->accept_mb, 0)) 2268 { 2269 *err = transit_state_mb (mctx, state); 2270 if (BE (*err != REG_NOERROR, 0)) 2271 return NULL; 2272 } 2273#endif /* RE_ENABLE_I18N */ 2274 2275 /* Then decide the next state with the single byte. */ 2276#if 0 2277 if (0) 2278 /* don't use transition table */ 2279 return transit_state_sb (err, mctx, state); 2280#endif 2281 2282 /* Use transition table */ 2283 ch = re_string_fetch_byte (&mctx->input); 2284 for (;;) 2285 { 2286 trtable = state->trtable; 2287 if (BE (trtable != NULL, 1)) 2288 return trtable[ch]; 2289 2290 trtable = state->word_trtable; 2291 if (BE (trtable != NULL, 1)) 2292 { 2293 unsigned int context; 2294 context 2295 = re_string_context_at (&mctx->input, 2296 re_string_cur_idx (&mctx->input) - 1, 2297 mctx->eflags); 2298 if (IS_WORD_CONTEXT (context)) 2299 return trtable[ch + SBC_MAX]; 2300 else 2301 return trtable[ch]; 2302 } 2303 2304 if (!build_trtable (mctx->dfa, state)) 2305 { 2306 *err = REG_ESPACE; 2307 return NULL; 2308 } 2309 2310 /* Retry, we now have a transition table. */ 2311 } 2312} 2313 2314/* Update the state_log if we need */ 2315static re_dfastate_t * 2316internal_function 2317merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, 2318 re_dfastate_t *next_state) 2319{ 2320 const re_dfa_t *const dfa = mctx->dfa; 2321 int cur_idx = re_string_cur_idx (&mctx->input); 2322 2323 if (cur_idx > mctx->state_log_top) 2324 { 2325 mctx->state_log[cur_idx] = next_state; 2326 mctx->state_log_top = cur_idx; 2327 } 2328 else if (mctx->state_log[cur_idx] == NULL) 2329 { 2330 mctx->state_log[cur_idx] = next_state; 2331 } 2332 else 2333 { 2334 re_dfastate_t *pstate; 2335 unsigned int context; 2336 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2337 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2338 the destination of a multibyte char/collating element/ 2339 back reference. Then the next state is the union set of 2340 these destinations and the results of the transition table. */ 2341 pstate = mctx->state_log[cur_idx]; 2342 log_nodes = pstate->entrance_nodes; 2343 if (next_state != NULL) 2344 { 2345 table_nodes = next_state->entrance_nodes; 2346 *err = re_node_set_init_union (&next_nodes, table_nodes, 2347 log_nodes); 2348 if (BE (*err != REG_NOERROR, 0)) 2349 return NULL; 2350 } 2351 else 2352 next_nodes = *log_nodes; 2353 /* Note: We already add the nodes of the initial state, 2354 then we don't need to add them here. */ 2355 2356 context = re_string_context_at (&mctx->input, 2357 re_string_cur_idx (&mctx->input) - 1, 2358 mctx->eflags); 2359 next_state = mctx->state_log[cur_idx] 2360 = re_acquire_state_context (err, dfa, &next_nodes, context); 2361 /* We don't need to check errors here, since the return value of 2362 this function is next_state and ERR is already set. */ 2363 2364 if (table_nodes != NULL) 2365 re_node_set_free (&next_nodes); 2366 } 2367 2368 if (BE (dfa->nbackref, 0) && next_state != NULL) 2369 { 2370 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2371 later. We must check them here, since the back references in the 2372 next state might use them. */ 2373 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2374 cur_idx); 2375 if (BE (*err != REG_NOERROR, 0)) 2376 return NULL; 2377 2378 /* If the next state has back references. */ 2379 if (next_state->has_backref) 2380 { 2381 *err = transit_state_bkref (mctx, &next_state->nodes); 2382 if (BE (*err != REG_NOERROR, 0)) 2383 return NULL; 2384 next_state = mctx->state_log[cur_idx]; 2385 } 2386 } 2387 2388 return next_state; 2389} 2390 2391/* Skip bytes in the input that correspond to part of a 2392 multi-byte match, then look in the log for a state 2393 from which to restart matching. */ 2394static re_dfastate_t * 2395internal_function 2396find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) 2397{ 2398 re_dfastate_t *cur_state; 2399 do 2400 { 2401 int max = mctx->state_log_top; 2402 int cur_str_idx = re_string_cur_idx (&mctx->input); 2403 2404 do 2405 { 2406 if (++cur_str_idx > max) 2407 return NULL; 2408 re_string_skip_bytes (&mctx->input, 1); 2409 } 2410 while (mctx->state_log[cur_str_idx] == NULL); 2411 2412 cur_state = merge_state_with_log (err, mctx, NULL); 2413 } 2414 while (*err == REG_NOERROR && cur_state == NULL); 2415 return cur_state; 2416} 2417 2418/* Helper functions for transit_state. */ 2419 2420/* From the node set CUR_NODES, pick up the nodes whose types are 2421 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2422 expression. And register them to use them later for evaluating the 2423 corresponding back references. */ 2424 2425static reg_errcode_t 2426internal_function 2427check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, 2428 int str_idx) 2429{ 2430 const re_dfa_t *const dfa = mctx->dfa; 2431 int node_idx; 2432 reg_errcode_t err; 2433 2434 /* TODO: This isn't efficient. 2435 Because there might be more than one nodes whose types are 2436 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2437 nodes. 2438 E.g. RE: (a){2} */ 2439 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2440 { 2441 int node = cur_nodes->elems[node_idx]; 2442 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2443 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS 2444 && (dfa->used_bkref_map 2445 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) 2446 { 2447 err = match_ctx_add_subtop (mctx, node, str_idx); 2448 if (BE (err != REG_NOERROR, 0)) 2449 return err; 2450 } 2451 } 2452 return REG_NOERROR; 2453} 2454 2455#if 0 2456/* Return the next state to which the current state STATE will transit by 2457 accepting the current input byte. */ 2458 2459static re_dfastate_t * 2460transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, 2461 re_dfastate_t *state) 2462{ 2463 const re_dfa_t *const dfa = mctx->dfa; 2464 re_node_set next_nodes; 2465 re_dfastate_t *next_state; 2466 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2467 unsigned int context; 2468 2469 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2470 if (BE (*err != REG_NOERROR, 0)) 2471 return NULL; 2472 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2473 { 2474 int cur_node = state->nodes.elems[node_cnt]; 2475 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2476 { 2477 *err = re_node_set_merge (&next_nodes, 2478 dfa->eclosures + dfa->nexts[cur_node]); 2479 if (BE (*err != REG_NOERROR, 0)) 2480 { 2481 re_node_set_free (&next_nodes); 2482 return NULL; 2483 } 2484 } 2485 } 2486 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2487 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2488 /* We don't need to check errors here, since the return value of 2489 this function is next_state and ERR is already set. */ 2490 2491 re_node_set_free (&next_nodes); 2492 re_string_skip_bytes (&mctx->input, 1); 2493 return next_state; 2494} 2495#endif 2496 2497#ifdef RE_ENABLE_I18N 2498static reg_errcode_t 2499internal_function 2500transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) 2501{ 2502 const re_dfa_t *const dfa = mctx->dfa; 2503 reg_errcode_t err; 2504 int i; 2505 2506 for (i = 0; i < pstate->nodes.nelem; ++i) 2507 { 2508 re_node_set dest_nodes, *new_nodes; 2509 int cur_node_idx = pstate->nodes.elems[i]; 2510 int naccepted, dest_idx; 2511 unsigned int context; 2512 re_dfastate_t *dest_state; 2513 2514 if (!dfa->nodes[cur_node_idx].accept_mb) 2515 continue; 2516 2517 if (dfa->nodes[cur_node_idx].constraint) 2518 { 2519 context = re_string_context_at (&mctx->input, 2520 re_string_cur_idx (&mctx->input), 2521 mctx->eflags); 2522 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2523 context)) 2524 continue; 2525 } 2526 2527 /* How many bytes the node can accept? */ 2528 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2529 re_string_cur_idx (&mctx->input)); 2530 if (naccepted == 0) 2531 continue; 2532 2533 /* The node can accepts `naccepted' bytes. */ 2534 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2535 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2536 : mctx->max_mb_elem_len); 2537 err = clean_state_log_if_needed (mctx, dest_idx); 2538 if (BE (err != REG_NOERROR, 0)) 2539 return err; 2540#ifdef DEBUG 2541 assert (dfa->nexts[cur_node_idx] != -1); 2542#endif 2543 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2544 2545 dest_state = mctx->state_log[dest_idx]; 2546 if (dest_state == NULL) 2547 dest_nodes = *new_nodes; 2548 else 2549 { 2550 err = re_node_set_init_union (&dest_nodes, 2551 dest_state->entrance_nodes, new_nodes); 2552 if (BE (err != REG_NOERROR, 0)) 2553 return err; 2554 } 2555 context = re_string_context_at (&mctx->input, dest_idx - 1, 2556 mctx->eflags); 2557 mctx->state_log[dest_idx] 2558 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2559 if (dest_state != NULL) 2560 re_node_set_free (&dest_nodes); 2561 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2562 return err; 2563 } 2564 return REG_NOERROR; 2565} 2566#endif /* RE_ENABLE_I18N */ 2567 2568static reg_errcode_t 2569internal_function 2570transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) 2571{ 2572 const re_dfa_t *const dfa = mctx->dfa; 2573 reg_errcode_t err; 2574 int i; 2575 int cur_str_idx = re_string_cur_idx (&mctx->input); 2576 2577 for (i = 0; i < nodes->nelem; ++i) 2578 { 2579 int dest_str_idx, prev_nelem, bkc_idx; 2580 int node_idx = nodes->elems[i]; 2581 unsigned int context; 2582 const re_token_t *node = dfa->nodes + node_idx; 2583 re_node_set *new_dest_nodes; 2584 2585 /* Check whether `node' is a backreference or not. */ 2586 if (node->type != OP_BACK_REF) 2587 continue; 2588 2589 if (node->constraint) 2590 { 2591 context = re_string_context_at (&mctx->input, cur_str_idx, 2592 mctx->eflags); 2593 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2594 continue; 2595 } 2596 2597 /* `node' is a backreference. 2598 Check the substring which the substring matched. */ 2599 bkc_idx = mctx->nbkref_ents; 2600 err = get_subexp (mctx, node_idx, cur_str_idx); 2601 if (BE (err != REG_NOERROR, 0)) 2602 goto free_return; 2603 2604 /* And add the epsilon closures (which is `new_dest_nodes') of 2605 the backreference to appropriate state_log. */ 2606#ifdef DEBUG 2607 assert (dfa->nexts[node_idx] != -1); 2608#endif 2609 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2610 { 2611 int subexp_len; 2612 re_dfastate_t *dest_state; 2613 struct re_backref_cache_entry *bkref_ent; 2614 bkref_ent = mctx->bkref_ents + bkc_idx; 2615 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2616 continue; 2617 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2618 new_dest_nodes = (subexp_len == 0 2619 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2620 : dfa->eclosures + dfa->nexts[node_idx]); 2621 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2622 - bkref_ent->subexp_from); 2623 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2624 mctx->eflags); 2625 dest_state = mctx->state_log[dest_str_idx]; 2626 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2627 : mctx->state_log[cur_str_idx]->nodes.nelem); 2628 /* Add `new_dest_node' to state_log. */ 2629 if (dest_state == NULL) 2630 { 2631 mctx->state_log[dest_str_idx] 2632 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2633 context); 2634 if (BE (mctx->state_log[dest_str_idx] == NULL 2635 && err != REG_NOERROR, 0)) 2636 goto free_return; 2637 } 2638 else 2639 { 2640 re_node_set dest_nodes; 2641 err = re_node_set_init_union (&dest_nodes, 2642 dest_state->entrance_nodes, 2643 new_dest_nodes); 2644 if (BE (err != REG_NOERROR, 0)) 2645 { 2646 re_node_set_free (&dest_nodes); 2647 goto free_return; 2648 } 2649 mctx->state_log[dest_str_idx] 2650 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2651 re_node_set_free (&dest_nodes); 2652 if (BE (mctx->state_log[dest_str_idx] == NULL 2653 && err != REG_NOERROR, 0)) 2654 goto free_return; 2655 } 2656 /* We need to check recursively if the backreference can epsilon 2657 transit. */ 2658 if (subexp_len == 0 2659 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2660 { 2661 err = check_subexp_matching_top (mctx, new_dest_nodes, 2662 cur_str_idx); 2663 if (BE (err != REG_NOERROR, 0)) 2664 goto free_return; 2665 err = transit_state_bkref (mctx, new_dest_nodes); 2666 if (BE (err != REG_NOERROR, 0)) 2667 goto free_return; 2668 } 2669 } 2670 } 2671 err = REG_NOERROR; 2672 free_return: 2673 return err; 2674} 2675 2676/* Enumerate all the candidates which the backreference BKREF_NODE can match 2677 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2678 Note that we might collect inappropriate candidates here. 2679 However, the cost of checking them strictly here is too high, then we 2680 delay these checking for prune_impossible_nodes(). */ 2681 2682static reg_errcode_t 2683internal_function 2684get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx) 2685{ 2686 const re_dfa_t *const dfa = mctx->dfa; 2687 int subexp_num, sub_top_idx; 2688 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2689 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2690 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2691 if (cache_idx != -1) 2692 { 2693 const struct re_backref_cache_entry *entry 2694 = mctx->bkref_ents + cache_idx; 2695 do 2696 if (entry->node == bkref_node) 2697 return REG_NOERROR; /* We already checked it. */ 2698 while (entry++->more); 2699 } 2700 2701 subexp_num = dfa->nodes[bkref_node].opr.idx; 2702 2703 /* For each sub expression */ 2704 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2705 { 2706 reg_errcode_t err; 2707 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2708 re_sub_match_last_t *sub_last; 2709 int sub_last_idx, sl_str, bkref_str_off; 2710 2711 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2712 continue; /* It isn't related. */ 2713 2714 sl_str = sub_top->str_idx; 2715 bkref_str_off = bkref_str_idx; 2716 /* At first, check the last node of sub expressions we already 2717 evaluated. */ 2718 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2719 { 2720 int sl_str_diff; 2721 sub_last = sub_top->lasts[sub_last_idx]; 2722 sl_str_diff = sub_last->str_idx - sl_str; 2723 /* The matched string by the sub expression match with the substring 2724 at the back reference? */ 2725 if (sl_str_diff > 0) 2726 { 2727 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2728 { 2729 /* Not enough chars for a successful match. */ 2730 if (bkref_str_off + sl_str_diff > mctx->input.len) 2731 break; 2732 2733 err = clean_state_log_if_needed (mctx, 2734 bkref_str_off 2735 + sl_str_diff); 2736 if (BE (err != REG_NOERROR, 0)) 2737 return err; 2738 buf = (const char *) re_string_get_buffer (&mctx->input); 2739 } 2740 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2741 /* We don't need to search this sub expression any more. */ 2742 break; 2743 } 2744 bkref_str_off += sl_str_diff; 2745 sl_str += sl_str_diff; 2746 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2747 bkref_str_idx); 2748 2749 /* Reload buf, since the preceding call might have reallocated 2750 the buffer. */ 2751 buf = (const char *) re_string_get_buffer (&mctx->input); 2752 2753 if (err == REG_NOMATCH) 2754 continue; 2755 if (BE (err != REG_NOERROR, 0)) 2756 return err; 2757 } 2758 2759 if (sub_last_idx < sub_top->nlasts) 2760 continue; 2761 if (sub_last_idx > 0) 2762 ++sl_str; 2763 /* Then, search for the other last nodes of the sub expression. */ 2764 for (; sl_str <= bkref_str_idx; ++sl_str) 2765 { 2766 int cls_node, sl_str_off; 2767 const re_node_set *nodes; 2768 sl_str_off = sl_str - sub_top->str_idx; 2769 /* The matched string by the sub expression match with the substring 2770 at the back reference? */ 2771 if (sl_str_off > 0) 2772 { 2773 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2774 { 2775 /* If we are at the end of the input, we cannot match. */ 2776 if (bkref_str_off >= mctx->input.len) 2777 break; 2778 2779 err = extend_buffers (mctx); 2780 if (BE (err != REG_NOERROR, 0)) 2781 return err; 2782 2783 buf = (const char *) re_string_get_buffer (&mctx->input); 2784 } 2785 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2786 break; /* We don't need to search this sub expression 2787 any more. */ 2788 } 2789 if (mctx->state_log[sl_str] == NULL) 2790 continue; 2791 /* Does this state have a ')' of the sub expression? */ 2792 nodes = &mctx->state_log[sl_str]->nodes; 2793 cls_node = find_subexp_node (dfa, nodes, subexp_num, 2794 OP_CLOSE_SUBEXP); 2795 if (cls_node == -1) 2796 continue; /* No. */ 2797 if (sub_top->path == NULL) 2798 { 2799 sub_top->path = calloc (sl_str - sub_top->str_idx + 1, 2800 sizeof (state_array_t)); 2801 if (sub_top->path == NULL) 2802 return REG_ESPACE; 2803 } 2804 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2805 in the current context? */ 2806 err = check_arrival (mctx, sub_top->path, sub_top->node, 2807 sub_top->str_idx, cls_node, sl_str, 2808 OP_CLOSE_SUBEXP); 2809 if (err == REG_NOMATCH) 2810 continue; 2811 if (BE (err != REG_NOERROR, 0)) 2812 return err; 2813 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2814 if (BE (sub_last == NULL, 0)) 2815 return REG_ESPACE; 2816 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2817 bkref_str_idx); 2818 if (err == REG_NOMATCH) 2819 continue; 2820 } 2821 } 2822 return REG_NOERROR; 2823} 2824 2825/* Helper functions for get_subexp(). */ 2826 2827/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2828 If it can arrive, register the sub expression expressed with SUB_TOP 2829 and SUB_LAST. */ 2830 2831static reg_errcode_t 2832internal_function 2833get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, 2834 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str) 2835{ 2836 reg_errcode_t err; 2837 int to_idx; 2838 /* Can the subexpression arrive the back reference? */ 2839 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2840 sub_last->str_idx, bkref_node, bkref_str, 2841 OP_OPEN_SUBEXP); 2842 if (err != REG_NOERROR) 2843 return err; 2844 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2845 sub_last->str_idx); 2846 if (BE (err != REG_NOERROR, 0)) 2847 return err; 2848 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2849 return clean_state_log_if_needed (mctx, to_idx); 2850} 2851 2852/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2853 Search '(' if FL_OPEN, or search ')' otherwise. 2854 TODO: This function isn't efficient... 2855 Because there might be more than one nodes whose types are 2856 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2857 nodes. 2858 E.g. RE: (a){2} */ 2859 2860static int 2861internal_function 2862find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 2863 int subexp_idx, int type) 2864{ 2865 int cls_idx; 2866 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2867 { 2868 int cls_node = nodes->elems[cls_idx]; 2869 const re_token_t *node = dfa->nodes + cls_node; 2870 if (node->type == type 2871 && node->opr.idx == subexp_idx) 2872 return cls_node; 2873 } 2874 return -1; 2875} 2876 2877/* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2878 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2879 heavily reused. 2880 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2881 2882static reg_errcode_t 2883internal_function 2884check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node, 2885 int top_str, int last_node, int last_str, int type) 2886{ 2887 const re_dfa_t *const dfa = mctx->dfa; 2888 reg_errcode_t err = REG_NOERROR; 2889 int subexp_num, backup_cur_idx, str_idx, null_cnt; 2890 re_dfastate_t *cur_state = NULL; 2891 re_node_set *cur_nodes, next_nodes; 2892 re_dfastate_t **backup_state_log; 2893 unsigned int context; 2894 2895 subexp_num = dfa->nodes[top_node].opr.idx; 2896 /* Extend the buffer if we need. */ 2897 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2898 { 2899 re_dfastate_t **new_array; 2900 int old_alloc = path->alloc; 2901 path->alloc += last_str + mctx->max_mb_elem_len + 1; 2902 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2903 if (BE (new_array == NULL, 0)) 2904 { 2905 path->alloc = old_alloc; 2906 return REG_ESPACE; 2907 } 2908 path->array = new_array; 2909 memset (new_array + old_alloc, '\0', 2910 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2911 } 2912 2913 str_idx = path->next_idx ? path->next_idx : top_str; 2914 2915 /* Temporary modify MCTX. */ 2916 backup_state_log = mctx->state_log; 2917 backup_cur_idx = mctx->input.cur_idx; 2918 mctx->state_log = path->array; 2919 mctx->input.cur_idx = str_idx; 2920 2921 /* Setup initial node set. */ 2922 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2923 if (str_idx == top_str) 2924 { 2925 err = re_node_set_init_1 (&next_nodes, top_node); 2926 if (BE (err != REG_NOERROR, 0)) 2927 return err; 2928 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2929 if (BE (err != REG_NOERROR, 0)) 2930 { 2931 re_node_set_free (&next_nodes); 2932 return err; 2933 } 2934 } 2935 else 2936 { 2937 cur_state = mctx->state_log[str_idx]; 2938 if (cur_state && cur_state->has_backref) 2939 { 2940 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2941 if (BE (err != REG_NOERROR, 0)) 2942 return err; 2943 } 2944 else 2945 re_node_set_init_empty (&next_nodes); 2946 } 2947 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2948 { 2949 if (next_nodes.nelem) 2950 { 2951 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2952 subexp_num, type); 2953 if (BE (err != REG_NOERROR, 0)) 2954 { 2955 re_node_set_free (&next_nodes); 2956 return err; 2957 } 2958 } 2959 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2960 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2961 { 2962 re_node_set_free (&next_nodes); 2963 return err; 2964 } 2965 mctx->state_log[str_idx] = cur_state; 2966 } 2967 2968 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2969 { 2970 re_node_set_empty (&next_nodes); 2971 if (mctx->state_log[str_idx + 1]) 2972 { 2973 err = re_node_set_merge (&next_nodes, 2974 &mctx->state_log[str_idx + 1]->nodes); 2975 if (BE (err != REG_NOERROR, 0)) 2976 { 2977 re_node_set_free (&next_nodes); 2978 return err; 2979 } 2980 } 2981 if (cur_state) 2982 { 2983 err = check_arrival_add_next_nodes (mctx, str_idx, 2984 &cur_state->non_eps_nodes, 2985 &next_nodes); 2986 if (BE (err != REG_NOERROR, 0)) 2987 { 2988 re_node_set_free (&next_nodes); 2989 return err; 2990 } 2991 } 2992 ++str_idx; 2993 if (next_nodes.nelem) 2994 { 2995 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2996 if (BE (err != REG_NOERROR, 0)) 2997 { 2998 re_node_set_free (&next_nodes); 2999 return err; 3000 } 3001 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 3002 subexp_num, type); 3003 if (BE (err != REG_NOERROR, 0)) 3004 { 3005 re_node_set_free (&next_nodes); 3006 return err; 3007 } 3008 } 3009 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 3010 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3011 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3012 { 3013 re_node_set_free (&next_nodes); 3014 return err; 3015 } 3016 mctx->state_log[str_idx] = cur_state; 3017 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 3018 } 3019 re_node_set_free (&next_nodes); 3020 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 3021 : &mctx->state_log[last_str]->nodes); 3022 path->next_idx = str_idx; 3023 3024 /* Fix MCTX. */ 3025 mctx->state_log = backup_state_log; 3026 mctx->input.cur_idx = backup_cur_idx; 3027 3028 /* Then check the current node set has the node LAST_NODE. */ 3029 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 3030 return REG_NOERROR; 3031 3032 return REG_NOMATCH; 3033} 3034 3035/* Helper functions for check_arrival. */ 3036 3037/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3038 to NEXT_NODES. 3039 TODO: This function is similar to the functions transit_state*(), 3040 however this function has many additional works. 3041 Can't we unify them? */ 3042 3043static reg_errcode_t 3044internal_function 3045check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx, 3046 re_node_set *cur_nodes, re_node_set *next_nodes) 3047{ 3048 const re_dfa_t *const dfa = mctx->dfa; 3049 int result; 3050 int cur_idx; 3051#ifdef RE_ENABLE_I18N 3052 reg_errcode_t err = REG_NOERROR; 3053#endif 3054 re_node_set union_set; 3055 re_node_set_init_empty (&union_set); 3056 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3057 { 3058 int naccepted = 0; 3059 int cur_node = cur_nodes->elems[cur_idx]; 3060#ifdef DEBUG 3061 re_token_type_t type = dfa->nodes[cur_node].type; 3062 assert (!IS_EPSILON_NODE (type)); 3063#endif 3064#ifdef RE_ENABLE_I18N 3065 /* If the node may accept `multi byte'. */ 3066 if (dfa->nodes[cur_node].accept_mb) 3067 { 3068 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3069 str_idx); 3070 if (naccepted > 1) 3071 { 3072 re_dfastate_t *dest_state; 3073 int next_node = dfa->nexts[cur_node]; 3074 int next_idx = str_idx + naccepted; 3075 dest_state = mctx->state_log[next_idx]; 3076 re_node_set_empty (&union_set); 3077 if (dest_state) 3078 { 3079 err = re_node_set_merge (&union_set, &dest_state->nodes); 3080 if (BE (err != REG_NOERROR, 0)) 3081 { 3082 re_node_set_free (&union_set); 3083 return err; 3084 } 3085 } 3086 result = re_node_set_insert (&union_set, next_node); 3087 if (BE (result < 0, 0)) 3088 { 3089 re_node_set_free (&union_set); 3090 return REG_ESPACE; 3091 } 3092 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3093 &union_set); 3094 if (BE (mctx->state_log[next_idx] == NULL 3095 && err != REG_NOERROR, 0)) 3096 { 3097 re_node_set_free (&union_set); 3098 return err; 3099 } 3100 } 3101 } 3102#endif /* RE_ENABLE_I18N */ 3103 if (naccepted 3104 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3105 { 3106 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3107 if (BE (result < 0, 0)) 3108 { 3109 re_node_set_free (&union_set); 3110 return REG_ESPACE; 3111 } 3112 } 3113 } 3114 re_node_set_free (&union_set); 3115 return REG_NOERROR; 3116} 3117 3118/* For all the nodes in CUR_NODES, add the epsilon closures of them to 3119 CUR_NODES, however exclude the nodes which are: 3120 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3121 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3122*/ 3123 3124static reg_errcode_t 3125internal_function 3126check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, 3127 int ex_subexp, int type) 3128{ 3129 reg_errcode_t err; 3130 int idx, outside_node; 3131 re_node_set new_nodes; 3132#ifdef DEBUG 3133 assert (cur_nodes->nelem); 3134#endif 3135 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3136 if (BE (err != REG_NOERROR, 0)) 3137 return err; 3138 /* Create a new node set NEW_NODES with the nodes which are epsilon 3139 closures of the node in CUR_NODES. */ 3140 3141 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3142 { 3143 int cur_node = cur_nodes->elems[idx]; 3144 const re_node_set *eclosure = dfa->eclosures + cur_node; 3145 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3146 if (outside_node == -1) 3147 { 3148 /* There are no problematic nodes, just merge them. */ 3149 err = re_node_set_merge (&new_nodes, eclosure); 3150 if (BE (err != REG_NOERROR, 0)) 3151 { 3152 re_node_set_free (&new_nodes); 3153 return err; 3154 } 3155 } 3156 else 3157 { 3158 /* There are problematic nodes, re-calculate incrementally. */ 3159 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3160 ex_subexp, type); 3161 if (BE (err != REG_NOERROR, 0)) 3162 { 3163 re_node_set_free (&new_nodes); 3164 return err; 3165 } 3166 } 3167 } 3168 re_node_set_free (cur_nodes); 3169 *cur_nodes = new_nodes; 3170 return REG_NOERROR; 3171} 3172 3173/* Helper function for check_arrival_expand_ecl. 3174 Check incrementally the epsilon closure of TARGET, and if it isn't 3175 problematic append it to DST_NODES. */ 3176 3177static reg_errcode_t 3178internal_function 3179check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, 3180 int target, int ex_subexp, int type) 3181{ 3182 int cur_node; 3183 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3184 { 3185 int err; 3186 3187 if (dfa->nodes[cur_node].type == type 3188 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3189 { 3190 if (type == OP_CLOSE_SUBEXP) 3191 { 3192 err = re_node_set_insert (dst_nodes, cur_node); 3193 if (BE (err == -1, 0)) 3194 return REG_ESPACE; 3195 } 3196 break; 3197 } 3198 err = re_node_set_insert (dst_nodes, cur_node); 3199 if (BE (err == -1, 0)) 3200 return REG_ESPACE; 3201 if (dfa->edests[cur_node].nelem == 0) 3202 break; 3203 if (dfa->edests[cur_node].nelem == 2) 3204 { 3205 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3206 dfa->edests[cur_node].elems[1], 3207 ex_subexp, type); 3208 if (BE (err != REG_NOERROR, 0)) 3209 return err; 3210 } 3211 cur_node = dfa->edests[cur_node].elems[0]; 3212 } 3213 return REG_NOERROR; 3214} 3215 3216 3217/* For all the back references in the current state, calculate the 3218 destination of the back references by the appropriate entry 3219 in MCTX->BKREF_ENTS. */ 3220 3221static reg_errcode_t 3222internal_function 3223expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, 3224 int cur_str, int subexp_num, int type) 3225{ 3226 const re_dfa_t *const dfa = mctx->dfa; 3227 reg_errcode_t err; 3228 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3229 struct re_backref_cache_entry *ent; 3230 3231 if (cache_idx_start == -1) 3232 return REG_NOERROR; 3233 3234 restart: 3235 ent = mctx->bkref_ents + cache_idx_start; 3236 do 3237 { 3238 int to_idx, next_node; 3239 3240 /* Is this entry ENT is appropriate? */ 3241 if (!re_node_set_contains (cur_nodes, ent->node)) 3242 continue; /* No. */ 3243 3244 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3245 /* Calculate the destination of the back reference, and append it 3246 to MCTX->STATE_LOG. */ 3247 if (to_idx == cur_str) 3248 { 3249 /* The backreference did epsilon transit, we must re-check all the 3250 node in the current state. */ 3251 re_node_set new_dests; 3252 reg_errcode_t err2, err3; 3253 next_node = dfa->edests[ent->node].elems[0]; 3254 if (re_node_set_contains (cur_nodes, next_node)) 3255 continue; 3256 err = re_node_set_init_1 (&new_dests, next_node); 3257 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3258 err3 = re_node_set_merge (cur_nodes, &new_dests); 3259 re_node_set_free (&new_dests); 3260 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3261 || err3 != REG_NOERROR, 0)) 3262 { 3263 err = (err != REG_NOERROR ? err 3264 : (err2 != REG_NOERROR ? err2 : err3)); 3265 return err; 3266 } 3267 /* TODO: It is still inefficient... */ 3268 goto restart; 3269 } 3270 else 3271 { 3272 re_node_set union_set; 3273 next_node = dfa->nexts[ent->node]; 3274 if (mctx->state_log[to_idx]) 3275 { 3276 int ret; 3277 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3278 next_node)) 3279 continue; 3280 err = re_node_set_init_copy (&union_set, 3281 &mctx->state_log[to_idx]->nodes); 3282 ret = re_node_set_insert (&union_set, next_node); 3283 if (BE (err != REG_NOERROR || ret < 0, 0)) 3284 { 3285 re_node_set_free (&union_set); 3286 err = err != REG_NOERROR ? err : REG_ESPACE; 3287 return err; 3288 } 3289 } 3290 else 3291 { 3292 err = re_node_set_init_1 (&union_set, next_node); 3293 if (BE (err != REG_NOERROR, 0)) 3294 return err; 3295 } 3296 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3297 re_node_set_free (&union_set); 3298 if (BE (mctx->state_log[to_idx] == NULL 3299 && err != REG_NOERROR, 0)) 3300 return err; 3301 } 3302 } 3303 while (ent++->more); 3304 return REG_NOERROR; 3305} 3306 3307/* Build transition table for the state. 3308 Return 1 if succeeded, otherwise return NULL. */ 3309 3310static int 3311internal_function 3312build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) 3313{ 3314 reg_errcode_t err; 3315 int i, j, ch, need_word_trtable = 0; 3316 bitset_word_t elem, mask; 3317 bool dests_node_malloced = false; 3318 bool dest_states_malloced = false; 3319 int ndests; /* Number of the destination states from `state'. */ 3320 re_dfastate_t **trtable; 3321 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3322 re_node_set follows, *dests_node; 3323 bitset_t *dests_ch; 3324 bitset_t acceptable; 3325 3326 struct dests_alloc 3327 { 3328 re_node_set dests_node[SBC_MAX]; 3329 bitset_t dests_ch[SBC_MAX]; 3330 } *dests_alloc; 3331 3332 /* We build DFA states which corresponds to the destination nodes 3333 from `state'. `dests_node[i]' represents the nodes which i-th 3334 destination state contains, and `dests_ch[i]' represents the 3335 characters which i-th destination state accepts. */ 3336#ifdef HAVE_ALLOCA 3337 if (__libc_use_alloca (sizeof (struct dests_alloc))) 3338 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); 3339 else 3340#endif 3341 { 3342 dests_alloc = re_malloc (struct dests_alloc, 1); 3343 if (BE (dests_alloc == NULL, 0)) 3344 return 0; 3345 dests_node_malloced = true; 3346 } 3347 dests_node = dests_alloc->dests_node; 3348 dests_ch = dests_alloc->dests_ch; 3349 3350 /* Initialize transition table. */ 3351 state->word_trtable = state->trtable = NULL; 3352 3353 /* At first, group all nodes belonging to `state' into several 3354 destinations. */ 3355 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3356 if (BE (ndests <= 0, 0)) 3357 { 3358 if (dests_node_malloced) 3359 free (dests_alloc); 3360 /* Return 0 in case of an error, 1 otherwise. */ 3361 if (ndests == 0) 3362 { 3363 state->trtable = (re_dfastate_t **) 3364 calloc (SBC_MAX, sizeof (re_dfastate_t *)); 3365 return 1; 3366 } 3367 return 0; 3368 } 3369 3370 err = re_node_set_alloc (&follows, ndests + 1); 3371 if (BE (err != REG_NOERROR, 0)) 3372 goto out_free; 3373 3374 /* Avoid arithmetic overflow in size calculation. */ 3375 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX) 3376 / (3 * sizeof (re_dfastate_t *))) 3377 < ndests), 3378 0)) 3379 goto out_free; 3380 3381#ifdef HAVE_ALLOCA 3382 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX 3383 + ndests * 3 * sizeof (re_dfastate_t *))) 3384 dest_states = (re_dfastate_t **) 3385 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3386 else 3387#endif 3388 { 3389 dest_states = (re_dfastate_t **) 3390 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3391 if (BE (dest_states == NULL, 0)) 3392 { 3393out_free: 3394 if (dest_states_malloced) 3395 free (dest_states); 3396 re_node_set_free (&follows); 3397 for (i = 0; i < ndests; ++i) 3398 re_node_set_free (dests_node + i); 3399 if (dests_node_malloced) 3400 free (dests_alloc); 3401 return 0; 3402 } 3403 dest_states_malloced = true; 3404 } 3405 dest_states_word = dest_states + ndests; 3406 dest_states_nl = dest_states_word + ndests; 3407 bitset_empty (acceptable); 3408 3409 /* Then build the states for all destinations. */ 3410 for (i = 0; i < ndests; ++i) 3411 { 3412 int next_node; 3413 re_node_set_empty (&follows); 3414 /* Merge the follows of this destination states. */ 3415 for (j = 0; j < dests_node[i].nelem; ++j) 3416 { 3417 next_node = dfa->nexts[dests_node[i].elems[j]]; 3418 if (next_node != -1) 3419 { 3420 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3421 if (BE (err != REG_NOERROR, 0)) 3422 goto out_free; 3423 } 3424 } 3425 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3426 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3427 goto out_free; 3428 /* If the new state has context constraint, 3429 build appropriate states for these contexts. */ 3430 if (dest_states[i]->has_constraint) 3431 { 3432 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3433 CONTEXT_WORD); 3434 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3435 goto out_free; 3436 3437 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3438 need_word_trtable = 1; 3439 3440 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3441 CONTEXT_NEWLINE); 3442 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3443 goto out_free; 3444 } 3445 else 3446 { 3447 dest_states_word[i] = dest_states[i]; 3448 dest_states_nl[i] = dest_states[i]; 3449 } 3450 bitset_merge (acceptable, dests_ch[i]); 3451 } 3452 3453 if (!BE (need_word_trtable, 0)) 3454 { 3455 /* We don't care about whether the following character is a word 3456 character, or we are in a single-byte character set so we can 3457 discern by looking at the character code: allocate a 3458 256-entry transition table. */ 3459 trtable = state->trtable = 3460 (re_dfastate_t **) calloc (SBC_MAX, sizeof (re_dfastate_t *)); 3461 if (BE (trtable == NULL, 0)) 3462 goto out_free; 3463 3464 /* For all characters ch...: */ 3465 for (i = 0; i < BITSET_WORDS; ++i) 3466 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3467 elem; 3468 mask <<= 1, elem >>= 1, ++ch) 3469 if (BE (elem & 1, 0)) 3470 { 3471 /* There must be exactly one destination which accepts 3472 character ch. See group_nodes_into_DFAstates. */ 3473 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3474 ; 3475 3476 /* j-th destination accepts the word character ch. */ 3477 if (dfa->word_char[i] & mask) 3478 trtable[ch] = dest_states_word[j]; 3479 else 3480 trtable[ch] = dest_states[j]; 3481 } 3482 } 3483 else 3484 { 3485 /* We care about whether the following character is a word 3486 character, and we are in a multi-byte character set: discern 3487 by looking at the character code: build two 256-entry 3488 transition tables, one starting at trtable[0] and one 3489 starting at trtable[SBC_MAX]. */ 3490 trtable = state->word_trtable = 3491 (re_dfastate_t **) calloc (2 * SBC_MAX, sizeof (re_dfastate_t *)); 3492 if (BE (trtable == NULL, 0)) 3493 goto out_free; 3494 3495 /* For all characters ch...: */ 3496 for (i = 0; i < BITSET_WORDS; ++i) 3497 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3498 elem; 3499 mask <<= 1, elem >>= 1, ++ch) 3500 if (BE (elem & 1, 0)) 3501 { 3502 /* There must be exactly one destination which accepts 3503 character ch. See group_nodes_into_DFAstates. */ 3504 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3505 ; 3506 3507 /* j-th destination accepts the word character ch. */ 3508 trtable[ch] = dest_states[j]; 3509 trtable[ch + SBC_MAX] = dest_states_word[j]; 3510 } 3511 } 3512 3513 /* new line */ 3514 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3515 { 3516 /* The current state accepts newline character. */ 3517 for (j = 0; j < ndests; ++j) 3518 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3519 { 3520 /* k-th destination accepts newline character. */ 3521 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3522 if (need_word_trtable) 3523 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3524 /* There must be only one destination which accepts 3525 newline. See group_nodes_into_DFAstates. */ 3526 break; 3527 } 3528 } 3529 3530 if (dest_states_malloced) 3531 free (dest_states); 3532 3533 re_node_set_free (&follows); 3534 for (i = 0; i < ndests; ++i) 3535 re_node_set_free (dests_node + i); 3536 3537 if (dests_node_malloced) 3538 free (dests_alloc); 3539 3540 return 1; 3541} 3542 3543/* Group all nodes belonging to STATE into several destinations. 3544 Then for all destinations, set the nodes belonging to the destination 3545 to DESTS_NODE[i] and set the characters accepted by the destination 3546 to DEST_CH[i]. This function return the number of destinations. */ 3547 3548static int 3549internal_function 3550group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, 3551 re_node_set *dests_node, bitset_t *dests_ch) 3552{ 3553 reg_errcode_t err; 3554 int result; 3555 int i, j, k; 3556 int ndests; /* Number of the destinations from `state'. */ 3557 bitset_t accepts; /* Characters a node can accept. */ 3558 const re_node_set *cur_nodes = &state->nodes; 3559 bitset_empty (accepts); 3560 ndests = 0; 3561 3562 /* For all the nodes belonging to `state', */ 3563 for (i = 0; i < cur_nodes->nelem; ++i) 3564 { 3565 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3566 re_token_type_t type = node->type; 3567 unsigned int constraint = node->constraint; 3568 3569 /* Enumerate all single byte character this node can accept. */ 3570 if (type == CHARACTER) 3571 bitset_set (accepts, node->opr.c); 3572 else if (type == SIMPLE_BRACKET) 3573 { 3574 bitset_merge (accepts, node->opr.sbcset); 3575 } 3576 else if (type == OP_PERIOD) 3577 { 3578#ifdef RE_ENABLE_I18N 3579 if (dfa->mb_cur_max > 1) 3580 bitset_merge (accepts, dfa->sb_char); 3581 else 3582#endif 3583 bitset_set_all (accepts); 3584 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3585 bitset_clear (accepts, '\n'); 3586 if (dfa->syntax & RE_DOT_NOT_NULL) 3587 bitset_clear (accepts, '\0'); 3588 } 3589#ifdef RE_ENABLE_I18N 3590 else if (type == OP_UTF8_PERIOD) 3591 { 3592 memset (accepts, '\xff', sizeof (bitset_t) / 2); 3593 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3594 bitset_clear (accepts, '\n'); 3595 if (dfa->syntax & RE_DOT_NOT_NULL) 3596 bitset_clear (accepts, '\0'); 3597 } 3598#endif 3599 else 3600 continue; 3601 3602 /* Check the `accepts' and sift the characters which are not 3603 match it the context. */ 3604 if (constraint) 3605 { 3606 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3607 { 3608 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3609 bitset_empty (accepts); 3610 if (accepts_newline) 3611 bitset_set (accepts, NEWLINE_CHAR); 3612 else 3613 continue; 3614 } 3615 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3616 { 3617 bitset_empty (accepts); 3618 continue; 3619 } 3620 3621 if (constraint & NEXT_WORD_CONSTRAINT) 3622 { 3623 bitset_word_t any_set = 0; 3624 if (type == CHARACTER && !node->word_char) 3625 { 3626 bitset_empty (accepts); 3627 continue; 3628 } 3629#ifdef RE_ENABLE_I18N 3630 if (dfa->mb_cur_max > 1) 3631 for (j = 0; j < BITSET_WORDS; ++j) 3632 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3633 else 3634#endif 3635 for (j = 0; j < BITSET_WORDS; ++j) 3636 any_set |= (accepts[j] &= dfa->word_char[j]); 3637 if (!any_set) 3638 continue; 3639 } 3640 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3641 { 3642 bitset_word_t any_set = 0; 3643 if (type == CHARACTER && node->word_char) 3644 { 3645 bitset_empty (accepts); 3646 continue; 3647 } 3648#ifdef RE_ENABLE_I18N 3649 if (dfa->mb_cur_max > 1) 3650 for (j = 0; j < BITSET_WORDS; ++j) 3651 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3652 else 3653#endif 3654 for (j = 0; j < BITSET_WORDS; ++j) 3655 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3656 if (!any_set) 3657 continue; 3658 } 3659 } 3660 3661 /* Then divide `accepts' into DFA states, or create a new 3662 state. Above, we make sure that accepts is not empty. */ 3663 for (j = 0; j < ndests; ++j) 3664 { 3665 bitset_t intersec; /* Intersection sets, see below. */ 3666 bitset_t remains; 3667 /* Flags, see below. */ 3668 bitset_word_t has_intersec, not_subset, not_consumed; 3669 3670 /* Optimization, skip if this state doesn't accept the character. */ 3671 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3672 continue; 3673 3674 /* Enumerate the intersection set of this state and `accepts'. */ 3675 has_intersec = 0; 3676 for (k = 0; k < BITSET_WORDS; ++k) 3677 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3678 /* And skip if the intersection set is empty. */ 3679 if (!has_intersec) 3680 continue; 3681 3682 /* Then check if this state is a subset of `accepts'. */ 3683 not_subset = not_consumed = 0; 3684 for (k = 0; k < BITSET_WORDS; ++k) 3685 { 3686 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3687 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3688 } 3689 3690 /* If this state isn't a subset of `accepts', create a 3691 new group state, which has the `remains'. */ 3692 if (not_subset) 3693 { 3694 bitset_copy (dests_ch[ndests], remains); 3695 bitset_copy (dests_ch[j], intersec); 3696 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3697 if (BE (err != REG_NOERROR, 0)) 3698 goto error_return; 3699 ++ndests; 3700 } 3701 3702 /* Put the position in the current group. */ 3703 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3704 if (BE (result < 0, 0)) 3705 goto error_return; 3706 3707 /* If all characters are consumed, go to next node. */ 3708 if (!not_consumed) 3709 break; 3710 } 3711 /* Some characters remain, create a new group. */ 3712 if (j == ndests) 3713 { 3714 bitset_copy (dests_ch[ndests], accepts); 3715 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3716 if (BE (err != REG_NOERROR, 0)) 3717 goto error_return; 3718 ++ndests; 3719 bitset_empty (accepts); 3720 } 3721 } 3722 return ndests; 3723 error_return: 3724 for (j = 0; j < ndests; ++j) 3725 re_node_set_free (dests_node + j); 3726 return -1; 3727} 3728 3729#ifdef RE_ENABLE_I18N 3730/* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3731 Return the number of the bytes the node accepts. 3732 STR_IDX is the current index of the input string. 3733 3734 This function handles the nodes which can accept one character, or 3735 one collating element like '.', '[a-z]', opposite to the other nodes 3736 can only accept one byte. */ 3737 3738static int 3739internal_function 3740check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, 3741 const re_string_t *input, int str_idx) 3742{ 3743 const re_token_t *node = dfa->nodes + node_idx; 3744 int char_len, elem_len; 3745 int i; 3746 wint_t wc; 3747 3748 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3749 { 3750 unsigned char c = re_string_byte_at (input, str_idx), d; 3751 if (BE (c < 0xc2, 1)) 3752 return 0; 3753 3754 if (str_idx + 2 > input->len) 3755 return 0; 3756 3757 d = re_string_byte_at (input, str_idx + 1); 3758 if (c < 0xe0) 3759 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3760 else if (c < 0xf0) 3761 { 3762 char_len = 3; 3763 if (c == 0xe0 && d < 0xa0) 3764 return 0; 3765 } 3766 else if (c < 0xf8) 3767 { 3768 char_len = 4; 3769 if (c == 0xf0 && d < 0x90) 3770 return 0; 3771 } 3772 else if (c < 0xfc) 3773 { 3774 char_len = 5; 3775 if (c == 0xf8 && d < 0x88) 3776 return 0; 3777 } 3778 else if (c < 0xfe) 3779 { 3780 char_len = 6; 3781 if (c == 0xfc && d < 0x84) 3782 return 0; 3783 } 3784 else 3785 return 0; 3786 3787 if (str_idx + char_len > input->len) 3788 return 0; 3789 3790 for (i = 1; i < char_len; ++i) 3791 { 3792 d = re_string_byte_at (input, str_idx + i); 3793 if (d < 0x80 || d > 0xbf) 3794 return 0; 3795 } 3796 return char_len; 3797 } 3798 3799 char_len = re_string_char_size_at (input, str_idx); 3800 if (node->type == OP_PERIOD) 3801 { 3802 if (char_len <= 1) 3803 return 0; 3804 /* FIXME: I don't think this if is needed, as both '\n' 3805 and '\0' are char_len == 1. */ 3806 /* '.' accepts any one character except the following two cases. */ 3807 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3808 re_string_byte_at (input, str_idx) == '\n') || 3809 ((dfa->syntax & RE_DOT_NOT_NULL) && 3810 re_string_byte_at (input, str_idx) == '\0')) 3811 return 0; 3812 return char_len; 3813 } 3814 3815 elem_len = re_string_elem_size_at (input, str_idx); 3816 wc = __btowc(*(input->mbs+str_idx)); 3817 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX)) 3818 return 0; 3819 3820 if (node->type == COMPLEX_BRACKET) 3821 { 3822 const re_charset_t *cset = node->opr.mbcset; 3823# ifdef _LIBC 3824 const unsigned char *pin 3825 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3826 int j; 3827 uint32_t nrules; 3828# endif /* _LIBC */ 3829 int match_len = 0; 3830 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3831 ? re_string_wchar_at (input, str_idx) : 0); 3832 3833 /* match with multibyte character? */ 3834 for (i = 0; i < cset->nmbchars; ++i) 3835 if (wc == cset->mbchars[i]) 3836 { 3837 match_len = char_len; 3838 goto check_node_accept_bytes_match; 3839 } 3840 /* match with character_class? */ 3841 for (i = 0; i < cset->nchar_classes; ++i) 3842 { 3843 wctype_t wt = cset->char_classes[i]; 3844 if (__iswctype (wc, wt)) 3845 { 3846 match_len = char_len; 3847 goto check_node_accept_bytes_match; 3848 } 3849 } 3850 3851# ifdef _LIBC 3852 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3853 if (nrules != 0) 3854 { 3855 unsigned int in_collseq = 0; 3856 const int32_t *table, *indirect; 3857 const unsigned char *weights, *extra; 3858 const char *collseqwc; 3859 /* This #include defines a local function! */ 3860# include <locale/weight.h> 3861 3862 /* match with collating_symbol? */ 3863 if (cset->ncoll_syms) 3864 extra = (const unsigned char *) 3865 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3866 for (i = 0; i < cset->ncoll_syms; ++i) 3867 { 3868 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3869 /* Compare the length of input collating element and 3870 the length of current collating element. */ 3871 if (*coll_sym != elem_len) 3872 continue; 3873 /* Compare each bytes. */ 3874 for (j = 0; j < *coll_sym; j++) 3875 if (pin[j] != coll_sym[1 + j]) 3876 break; 3877 if (j == *coll_sym) 3878 { 3879 /* Match if every bytes is equal. */ 3880 match_len = j; 3881 goto check_node_accept_bytes_match; 3882 } 3883 } 3884 3885 if (cset->nranges) 3886 { 3887 if (elem_len <= char_len) 3888 { 3889 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3890 in_collseq = __collseq_table_lookup (collseqwc, wc); 3891 } 3892 else 3893 in_collseq = find_collation_sequence_value (pin, elem_len); 3894 } 3895 /* match with range expression? */ 3896 for (i = 0; i < cset->nranges; ++i) 3897 if (cset->range_starts[i] <= in_collseq 3898 && in_collseq <= cset->range_ends[i]) 3899 { 3900 match_len = elem_len; 3901 goto check_node_accept_bytes_match; 3902 } 3903 3904 /* match with equivalence_class? */ 3905 if (cset->nequiv_classes) 3906 { 3907 const unsigned char *cp = pin; 3908 table = (const int32_t *) 3909 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3910 weights = (const unsigned char *) 3911 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3912 extra = (const unsigned char *) 3913 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3914 indirect = (const int32_t *) 3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3916 int32_t idx = findidx (&cp); 3917 if (idx > 0) 3918 for (i = 0; i < cset->nequiv_classes; ++i) 3919 { 3920 int32_t equiv_class_idx = cset->equiv_classes[i]; 3921 size_t weight_len = weights[idx & 0xffffff]; 3922 if (weight_len == weights[equiv_class_idx & 0xffffff] 3923 && (idx >> 24) == (equiv_class_idx >> 24)) 3924 { 3925 int cnt = 0; 3926 3927 idx &= 0xffffff; 3928 equiv_class_idx &= 0xffffff; 3929 3930 while (cnt <= weight_len 3931 && (weights[equiv_class_idx + 1 + cnt] 3932 == weights[idx + 1 + cnt])) 3933 ++cnt; 3934 if (cnt > weight_len) 3935 { 3936 match_len = elem_len; 3937 goto check_node_accept_bytes_match; 3938 } 3939 } 3940 } 3941 } 3942 } 3943 else 3944# endif /* _LIBC */ 3945 { 3946 /* match with range expression? */ 3947#if __GNUC__ >= 2 3948 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3949#else 3950 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3951 cmp_buf[2] = wc; 3952#endif 3953 for (i = 0; i < cset->nranges; ++i) 3954 { 3955 cmp_buf[0] = cset->range_starts[i]; 3956 cmp_buf[4] = cset->range_ends[i]; 3957 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3958 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3959 { 3960 match_len = char_len; 3961 goto check_node_accept_bytes_match; 3962 } 3963 } 3964 } 3965 check_node_accept_bytes_match: 3966 if (!cset->non_match) 3967 return match_len; 3968 else 3969 { 3970 if (match_len > 0) 3971 return 0; 3972 else 3973 return (elem_len > char_len) ? elem_len : char_len; 3974 } 3975 } 3976 return 0; 3977} 3978 3979# ifdef _LIBC 3980static unsigned int 3981internal_function 3982find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) 3983{ 3984 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3985 if (nrules == 0) 3986 { 3987 if (mbs_len == 1) 3988 { 3989 /* No valid character. Match it as a single byte character. */ 3990 const unsigned char *collseq = (const unsigned char *) 3991 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3992 return collseq[mbs[0]]; 3993 } 3994 return UINT_MAX; 3995 } 3996 else 3997 { 3998 int32_t idx; 3999 const unsigned char *extra = (const unsigned char *) 4000 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4001 int32_t extrasize = (const unsigned char *) 4002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 4003 4004 for (idx = 0; idx < extrasize;) 4005 { 4006 int mbs_cnt, found = 0; 4007 int32_t elem_mbs_len; 4008 /* Skip the name of collating element name. */ 4009 idx = idx + extra[idx] + 1; 4010 elem_mbs_len = extra[idx++]; 4011 if (mbs_len == elem_mbs_len) 4012 { 4013 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 4014 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 4015 break; 4016 if (mbs_cnt == elem_mbs_len) 4017 /* Found the entry. */ 4018 found = 1; 4019 } 4020 /* Skip the byte sequence of the collating element. */ 4021 idx += elem_mbs_len; 4022 /* Adjust for the alignment. */ 4023 idx = (idx + 3) & ~3; 4024 /* Skip the collation sequence value. */ 4025 idx += sizeof (uint32_t); 4026 /* Skip the wide char sequence of the collating element. */ 4027 idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 4028 /* If we found the entry, return the sequence value. */ 4029 if (found) 4030 return *(uint32_t *) (extra + idx); 4031 /* Skip the collation sequence value. */ 4032 idx += sizeof (uint32_t); 4033 } 4034 return UINT_MAX; 4035 } 4036} 4037# endif /* _LIBC */ 4038#endif /* RE_ENABLE_I18N */ 4039 4040/* Check whether the node accepts the byte which is IDX-th 4041 byte of the INPUT. */ 4042 4043static int 4044internal_function 4045check_node_accept (const re_match_context_t *mctx, const re_token_t *node, 4046 int idx) 4047{ 4048 unsigned char ch; 4049 ch = re_string_byte_at (&mctx->input, idx); 4050 switch (node->type) 4051 { 4052 case CHARACTER: 4053 if (node->opr.c != ch) 4054 return 0; 4055 break; 4056 4057 case SIMPLE_BRACKET: 4058 if (!bitset_contain (node->opr.sbcset, ch)) 4059 return 0; 4060 break; 4061 4062#ifdef RE_ENABLE_I18N 4063 case OP_UTF8_PERIOD: 4064 if (ch >= 0x80) 4065 return 0; 4066 /* FALLTHROUGH */ 4067#endif 4068 case OP_PERIOD: 4069 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4070 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4071 return 0; 4072 break; 4073 4074 default: 4075 return 0; 4076 } 4077 4078 if (node->constraint) 4079 { 4080 /* The node has constraints. Check whether the current context 4081 satisfies the constraints. */ 4082 unsigned int context = re_string_context_at (&mctx->input, idx, 4083 mctx->eflags); 4084 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4085 return 0; 4086 } 4087 4088 return 1; 4089} 4090 4091/* Extend the buffers, if the buffers have run out. */ 4092 4093static reg_errcode_t 4094internal_function 4095extend_buffers (re_match_context_t *mctx) 4096{ 4097 reg_errcode_t ret; 4098 re_string_t *pstr = &mctx->input; 4099 4100 /* Avoid overflow. */ 4101 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0)) 4102 return REG_ESPACE; 4103 4104 /* Double the lengths of the buffers. */ 4105 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 4106 if (BE (ret != REG_NOERROR, 0)) 4107 return ret; 4108 4109 if (mctx->state_log != NULL) 4110 { 4111 /* And double the length of state_log. */ 4112 /* XXX We have no indication of the size of this buffer. If this 4113 allocation fail we have no indication that the state_log array 4114 does not have the right size. */ 4115 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4116 pstr->bufs_len + 1); 4117 if (BE (new_array == NULL, 0)) 4118 return REG_ESPACE; 4119 mctx->state_log = new_array; 4120 } 4121 4122 /* Then reconstruct the buffers. */ 4123 if (pstr->icase) 4124 { 4125#ifdef RE_ENABLE_I18N 4126 if (pstr->mb_cur_max > 1) 4127 { 4128 ret = build_wcs_upper_buffer (pstr); 4129 if (BE (ret != REG_NOERROR, 0)) 4130 return ret; 4131 } 4132 else 4133#endif /* RE_ENABLE_I18N */ 4134 build_upper_buffer (pstr); 4135 } 4136 else 4137 { 4138#ifdef RE_ENABLE_I18N 4139 if (pstr->mb_cur_max > 1) 4140 build_wcs_buffer (pstr); 4141 else 4142#endif /* RE_ENABLE_I18N */ 4143 { 4144 if (pstr->trans != NULL) 4145 re_string_translate_buffer (pstr); 4146 } 4147 } 4148 return REG_NOERROR; 4149} 4150 4151 4152/* Functions for matching context. */ 4153 4154/* Initialize MCTX. */ 4155 4156static reg_errcode_t 4157internal_function 4158match_ctx_init (re_match_context_t *mctx, int eflags, int n) 4159{ 4160 mctx->eflags = eflags; 4161 mctx->match_last = -1; 4162 if (n > 0) 4163 { 4164 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4165 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4166 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4167 return REG_ESPACE; 4168 } 4169 /* Already zero-ed by the caller. 4170 else 4171 mctx->bkref_ents = NULL; 4172 mctx->nbkref_ents = 0; 4173 mctx->nsub_tops = 0; */ 4174 mctx->abkref_ents = n; 4175 mctx->max_mb_elem_len = 1; 4176 mctx->asub_tops = n; 4177 return REG_NOERROR; 4178} 4179 4180/* Clean the entries which depend on the current input in MCTX. 4181 This function must be invoked when the matcher changes the start index 4182 of the input, or changes the input string. */ 4183 4184static void 4185internal_function 4186match_ctx_clean (re_match_context_t *mctx) 4187{ 4188 int st_idx; 4189 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4190 { 4191 int sl_idx; 4192 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4193 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4194 { 4195 re_sub_match_last_t *last = top->lasts[sl_idx]; 4196 re_free (last->path.array); 4197 re_free (last); 4198 } 4199 re_free (top->lasts); 4200 if (top->path) 4201 { 4202 re_free (top->path->array); 4203 re_free (top->path); 4204 } 4205 free (top); 4206 } 4207 4208 mctx->nsub_tops = 0; 4209 mctx->nbkref_ents = 0; 4210} 4211 4212/* Free all the memory associated with MCTX. */ 4213 4214static void 4215internal_function 4216match_ctx_free (re_match_context_t *mctx) 4217{ 4218 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4219 match_ctx_clean (mctx); 4220 re_free (mctx->sub_tops); 4221 re_free (mctx->bkref_ents); 4222} 4223 4224/* Add a new backreference entry to MCTX. 4225 Note that we assume that caller never call this function with duplicate 4226 entry, and call with STR_IDX which isn't smaller than any existing entry. 4227*/ 4228 4229static reg_errcode_t 4230internal_function 4231match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from, 4232 int to) 4233{ 4234 if (mctx->nbkref_ents >= mctx->abkref_ents) 4235 { 4236 struct re_backref_cache_entry* new_entry; 4237 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4238 mctx->abkref_ents * 2); 4239 if (BE (new_entry == NULL, 0)) 4240 { 4241 re_free (mctx->bkref_ents); 4242 return REG_ESPACE; 4243 } 4244 mctx->bkref_ents = new_entry; 4245 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4246 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4247 mctx->abkref_ents *= 2; 4248 } 4249 if (mctx->nbkref_ents > 0 4250 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4251 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4252 4253 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4254 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4255 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4256 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4257 4258 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4259 If bit N is clear, means that this entry won't epsilon-transition to 4260 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4261 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4262 such node. 4263 4264 A backreference does not epsilon-transition unless it is empty, so set 4265 to all zeros if FROM != TO. */ 4266 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4267 = (from == to ? ~0 : 0); 4268 4269 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4270 if (mctx->max_mb_elem_len < to - from) 4271 mctx->max_mb_elem_len = to - from; 4272 return REG_NOERROR; 4273} 4274 4275/* Search for the first entry which has the same str_idx, or -1 if none is 4276 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4277 4278static int 4279internal_function 4280search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) 4281{ 4282 int left, right, mid, last; 4283 last = right = mctx->nbkref_ents; 4284 for (left = 0; left < right;) 4285 { 4286 mid = left + (right - left) / 2; 4287 if (mctx->bkref_ents[mid].str_idx < str_idx) 4288 left = mid + 1; 4289 else 4290 right = mid; 4291 } 4292 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4293 return left; 4294 else 4295 return -1; 4296} 4297 4298/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4299 at STR_IDX. */ 4300 4301static reg_errcode_t 4302internal_function 4303match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx) 4304{ 4305#ifdef DEBUG 4306 assert (mctx->sub_tops != NULL); 4307 assert (mctx->asub_tops > 0); 4308#endif 4309 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4310 { 4311 int new_asub_tops = mctx->asub_tops * 2; 4312 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4313 re_sub_match_top_t *, 4314 new_asub_tops); 4315 if (BE (new_array == NULL, 0)) 4316 return REG_ESPACE; 4317 mctx->sub_tops = new_array; 4318 mctx->asub_tops = new_asub_tops; 4319 } 4320 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4321 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4322 return REG_ESPACE; 4323 mctx->sub_tops[mctx->nsub_tops]->node = node; 4324 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4325 return REG_NOERROR; 4326} 4327 4328/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4329 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4330 4331static re_sub_match_last_t * 4332internal_function 4333match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx) 4334{ 4335 re_sub_match_last_t *new_entry; 4336 if (BE (subtop->nlasts == subtop->alasts, 0)) 4337 { 4338 int new_alasts = 2 * subtop->alasts + 1; 4339 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4340 re_sub_match_last_t *, 4341 new_alasts); 4342 if (BE (new_array == NULL, 0)) 4343 return NULL; 4344 subtop->lasts = new_array; 4345 subtop->alasts = new_alasts; 4346 } 4347 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4348 if (BE (new_entry != NULL, 1)) 4349 { 4350 subtop->lasts[subtop->nlasts] = new_entry; 4351 new_entry->node = node; 4352 new_entry->str_idx = str_idx; 4353 ++subtop->nlasts; 4354 } 4355 return new_entry; 4356} 4357 4358static void 4359internal_function 4360sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 4361 re_dfastate_t **limited_sts, int last_node, int last_str_idx) 4362{ 4363 sctx->sifted_states = sifted_sts; 4364 sctx->limited_states = limited_sts; 4365 sctx->last_node = last_node; 4366 sctx->last_str_idx = last_str_idx; 4367 re_node_set_init_empty (&sctx->limits); 4368}