compat/regex/regcomp.c at reftables-rust

freshlybakedca.ke / git
fork atom
Git fork
fork atom
git / compat / regex / regcomp.c
at reftables-rust 3893 lines 112 kB view raw
wrap content
Jeff King compat: disable -Wunused-parameter in 3rd-party code 2y ago
4550c164
   1/* Extended regular expression matching and search library.
   2   Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
   3   This file is part of the GNU C Library.
   4   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6   The GNU C Library is free software; you can redistribute it and/or
   7   modify it under the terms of the GNU Lesser General Public
   8   License as published by the Free Software Foundation; either
   9   version 2.1 of the License, or (at your option) any later version.
  10
  11   The GNU C Library is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public
  17   License along with the GNU C Library; if not, see
  18   <http://www.gnu.org/licenses/>.  */
  19
  20#pragma GCC diagnostic ignored "-Wunused-parameter"
  21
  22#if defined __TANDEM
  23 /* This is currently duplicated from git-compat-utils.h */
  24# ifdef NO_INTPTR_T
  25 typedef long intptr_t;
  26 typedef unsigned long uintptr_t;
  27# endif
  28#endif
  29
  30static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  31					  size_t length, reg_syntax_t syntax);
  32static void re_compile_fastmap_iter (regex_t *bufp,
  33				     const re_dfastate_t *init_state,
  34				     char *fastmap);
  35static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  36#ifdef RE_ENABLE_I18N
  37static void free_charset (re_charset_t *cset);
  38#endif /* RE_ENABLE_I18N */
  39static void free_workarea_compile (regex_t *preg);
  40static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  41#ifdef RE_ENABLE_I18N
  42static void optimize_utf8 (re_dfa_t *dfa);
  43#endif
  44static reg_errcode_t analyze (regex_t *preg);
  45static reg_errcode_t preorder (bin_tree_t *root,
  46			       reg_errcode_t (fn (void *, bin_tree_t *)),
  47			       void *extra);
  48static reg_errcode_t postorder (bin_tree_t *root,
  49				reg_errcode_t (fn (void *, bin_tree_t *)),
  50				void *extra);
  51static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  52static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  53static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  54				 bin_tree_t *node);
  55static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  56static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  57static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  58static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  59static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  60				   unsigned int constraint);
  61static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  62static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  63					 int node, int root);
  64static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  65static int fetch_number (re_string_t *input, re_token_t *token,
  66			 reg_syntax_t syntax);
  67static int peek_token (re_token_t *token, re_string_t *input,
  68			reg_syntax_t syntax) internal_function;
  69static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  70			  reg_syntax_t syntax, reg_errcode_t *err);
  71static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  72				  re_token_t *token, reg_syntax_t syntax,
  73				  int nest, reg_errcode_t *err);
  74static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  75				 re_token_t *token, reg_syntax_t syntax,
  76				 int nest, reg_errcode_t *err);
  77static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  78				     re_token_t *token, reg_syntax_t syntax,
  79				     int nest, reg_errcode_t *err);
  80static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  81				  re_token_t *token, reg_syntax_t syntax,
  82				  int nest, reg_errcode_t *err);
  83static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  84				 re_dfa_t *dfa, re_token_t *token,
  85				 reg_syntax_t syntax, reg_errcode_t *err);
  86static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  87				      re_token_t *token, reg_syntax_t syntax,
  88				      reg_errcode_t *err);
  89static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  90					    re_string_t *regexp,
  91					    re_token_t *token, int token_len,
  92					    re_dfa_t *dfa,
  93					    reg_syntax_t syntax,
  94					    int accept_hyphen);
  95static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  96					  re_string_t *regexp,
  97					  re_token_t *token);
  98#ifdef RE_ENABLE_I18N
  99static reg_errcode_t build_equiv_class (bitset_t sbcset,
 100					re_charset_t *mbcset,
 101					int *equiv_class_alloc,
 102					const unsigned char *name);
 103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104				      bitset_t sbcset,
 105				      re_charset_t *mbcset,
 106				      int *char_class_alloc,
 107				      const char *class_name,
 108				      reg_syntax_t syntax);
 109#else  /* not RE_ENABLE_I18N */
 110static reg_errcode_t build_equiv_class (bitset_t sbcset,
 111					const unsigned char *name);
 112static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 113				      bitset_t sbcset,
 114				      const char *class_name,
 115				      reg_syntax_t syntax);
 116#endif /* not RE_ENABLE_I18N */
 117static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 118				       RE_TRANSLATE_TYPE trans,
 119				       const char *class_name,
 120				       const char *extra,
 121				       int non_match, reg_errcode_t *err);
 122static bin_tree_t *create_tree (re_dfa_t *dfa,
 123				bin_tree_t *left, bin_tree_t *right,
 124				re_token_type_t type);
 125static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 126				      bin_tree_t *left, bin_tree_t *right,
 127				      const re_token_t *token);
 128static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 129static void free_token (re_token_t *node);
 130static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 131static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 132
 133/* This table gives an error message for each of the error codes listed
 134   in regex.h.  Obviously the order here has to be same as there.
 135   POSIX doesn't require that we do anything for REG_NOERROR,
 136   but why not be nice?  */
 137
 138const char __re_error_msgid[] attribute_hidden =
 139  {
 140#define REG_NOERROR_IDX	0
 141    gettext_noop ("Success")	/* REG_NOERROR */
 142    "\0"
 143#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 144    gettext_noop ("No match")	/* REG_NOMATCH */
 145    "\0"
 146#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
 147    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 148    "\0"
 149#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 150    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 151    "\0"
 152#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 153    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 154    "\0"
 155#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
 156    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 157    "\0"
 158#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
 159    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 160    "\0"
 161#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
 162    gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
 163    "\0"
 164#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 165    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 166    "\0"
 167#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 168    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 169    "\0"
 170#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
 171    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 172    "\0"
 173#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 174    gettext_noop ("Invalid range end")	/* REG_ERANGE */
 175    "\0"
 176#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
 177    gettext_noop ("Memory exhausted") /* REG_ESPACE */
 178    "\0"
 179#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
 180    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 181    "\0"
 182#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 183    gettext_noop ("Premature end of regular expression") /* REG_EEND */
 184    "\0"
 185#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
 186    gettext_noop ("Regular expression too big") /* REG_ESIZE */
 187    "\0"
 188#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
 189    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 190  };
 191
 192const size_t __re_error_msgid_idx[] attribute_hidden =
 193  {
 194    REG_NOERROR_IDX,
 195    REG_NOMATCH_IDX,
 196    REG_BADPAT_IDX,
 197    REG_ECOLLATE_IDX,
 198    REG_ECTYPE_IDX,
 199    REG_EESCAPE_IDX,
 200    REG_ESUBREG_IDX,
 201    REG_EBRACK_IDX,
 202    REG_EPAREN_IDX,
 203    REG_EBRACE_IDX,
 204    REG_BADBR_IDX,
 205    REG_ERANGE_IDX,
 206    REG_ESPACE_IDX,
 207    REG_BADRPT_IDX,
 208    REG_EEND_IDX,
 209    REG_ESIZE_IDX,
 210    REG_ERPAREN_IDX
 211  };
 212
 213/* Entry points for GNU code.  */
 214
 215
 216#ifdef ZOS_USS
 217
 218/* For ZOS USS we must define btowc */
 219
 220wchar_t 
 221btowc (int c)
 222{
 223   wchar_t wtmp[2];
 224   char tmp[2];
 225
 226   tmp[0] = c;
 227   tmp[1] = 0;
 228
 229   mbtowc (wtmp, tmp, 1);
 230   return wtmp[0];
 231}
 232#endif
 233
 234/* re_compile_pattern is the GNU regular expression compiler: it
 235   compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 236   Returns 0 if the pattern was valid, otherwise an error string.
 237
 238   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 239   are set in BUFP on entry.  */
 240
 241const char *
 242re_compile_pattern (const char *pattern,
 243		    size_t length,
 244		    struct re_pattern_buffer *bufp)
 245{
 246  reg_errcode_t ret;
 247
 248  /* And GNU code determines whether or not to get register information
 249     by passing null for the REGS argument to re_match, etc., not by
 250     setting no_sub, unless RE_NO_SUB is set.  */
 251  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 252
 253  /* Match anchors at newline.  */
 254  bufp->newline_anchor = 1;
 255
 256  ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 257
 258  if (!ret)
 259    return NULL;
 260  return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 261}
 262#ifdef _LIBC
 263weak_alias (__re_compile_pattern, re_compile_pattern)
 264#endif
 265
 266/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 267   also be assigned to arbitrarily: each pattern buffer stores its own
 268   syntax, so it can be changed between regex compilations.  */
 269/* This has no initializer because initialized variables in Emacs
 270   become read-only after dumping.  */
 271reg_syntax_t re_syntax_options;
 272
 273
 274/* Specify the precise syntax of regexps for compilation.  This provides
 275   for compatibility for various utilities which historically have
 276   different, incompatible syntaxes.
 277
 278   The argument SYNTAX is a bit mask comprised of the various bits
 279   defined in regex.h.  We return the old syntax.  */
 280
 281reg_syntax_t
 282re_set_syntax (reg_syntax_t syntax)
 283{
 284  reg_syntax_t ret = re_syntax_options;
 285
 286  re_syntax_options = syntax;
 287  return ret;
 288}
 289#ifdef _LIBC
 290weak_alias (__re_set_syntax, re_set_syntax)
 291#endif
 292
 293int
 294re_compile_fastmap (struct re_pattern_buffer *bufp)
 295{
 296  re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 297  char *fastmap = bufp->fastmap;
 298
 299  memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 300  re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 301  if (dfa->init_state != dfa->init_state_word)
 302    re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 303  if (dfa->init_state != dfa->init_state_nl)
 304    re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 305  if (dfa->init_state != dfa->init_state_begbuf)
 306    re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 307  bufp->fastmap_accurate = 1;
 308  return 0;
 309}
 310#ifdef _LIBC
 311weak_alias (__re_compile_fastmap, re_compile_fastmap)
 312#endif
 313
 314static inline void
 315__attribute ((always_inline))
 316re_set_fastmap (char *fastmap, int icase, int ch)
 317{
 318  fastmap[ch] = 1;
 319  if (icase)
 320    fastmap[tolower (ch)] = 1;
 321}
 322
 323/* Helper function for re_compile_fastmap.
 324   Compile fastmap for the initial_state INIT_STATE.  */
 325
 326static void
 327re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 328			 char *fastmap)
 329{
 330  volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 331  int node_cnt;
 332  int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 333  for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 334    {
 335      int node = init_state->nodes.elems[node_cnt];
 336      re_token_type_t type = dfa->nodes[node].type;
 337
 338      if (type == CHARACTER)
 339	{
 340	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 341#ifdef RE_ENABLE_I18N
 342	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 343	    {
 344	      unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 345	      wchar_t wc;
 346	      mbstate_t state;
 347
 348	      p = buf;
 349	      *p++ = dfa->nodes[node].opr.c;
 350	      while (++node < dfa->nodes_len
 351		     && dfa->nodes[node].type == CHARACTER
 352		     && dfa->nodes[node].mb_partial)
 353		*p++ = dfa->nodes[node].opr.c;
 354	      memset (&state, '\0', sizeof (state));
 355	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
 356			     &state) == p - buf
 357		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
 358		      != (size_t) -1))
 359		re_set_fastmap (fastmap, 0, buf[0]);
 360	      re_free (buf);
 361	    }
 362#endif
 363	}
 364      else if (type == SIMPLE_BRACKET)
 365	{
 366	  int i, ch;
 367	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 368	    {
 369	      int j;
 370	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 371	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 372		if (w & ((bitset_word_t) 1 << j))
 373		  re_set_fastmap (fastmap, icase, ch);
 374	    }
 375	}
 376#ifdef RE_ENABLE_I18N
 377      else if (type == COMPLEX_BRACKET)
 378	{
 379	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380	  int i;
 381
 382# ifdef _LIBC
 383	  /* See if we have to try all bytes which start multiple collation
 384	     elements.
 385	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 386		  collation element, and don't catch 'b' since 'b' is
 387		  the only collation element which starts from 'b' (and
 388		  it is caught by SIMPLE_BRACKET).  */
 389	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 390		  && (cset->ncoll_syms || cset->nranges))
 391		{
 392		  const int32_t *table = (const int32_t *)
 393		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 394		  for (i = 0; i < SBC_MAX; ++i)
 395		    if (table[i] < 0)
 396		      re_set_fastmap (fastmap, icase, i);
 397		}
 398# endif /* _LIBC */
 399
 400	  /* See if we have to start the match at all multibyte characters,
 401	     i.e. where we would not find an invalid sequence.  This only
 402	     applies to multibyte character sets; for single byte character
 403	     sets, the SIMPLE_BRACKET again suffices.  */
 404	  if (dfa->mb_cur_max > 1
 405	      && (cset->nchar_classes || cset->non_match || cset->nranges
 406# ifdef _LIBC
 407		  || cset->nequiv_classes
 408# endif /* _LIBC */
 409		 ))
 410	    {
 411	      unsigned char c = 0;
 412	      do
 413		{
 414		  mbstate_t mbs;
 415		  memset (&mbs, 0, sizeof (mbs));
 416		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 417		    re_set_fastmap (fastmap, false, (int) c);
 418		}
 419	      while (++c != 0);
 420	    }
 421
 422	  else
 423	    {
 424	      /* ... Else catch all bytes which can start the mbchars.  */
 425	      for (i = 0; i < cset->nmbchars; ++i)
 426		{
 427		  char buf[256];
 428		  mbstate_t state;
 429		  memset (&state, '\0', sizeof (state));
 430		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 431		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 432		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 433		    {
 434		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 435			  != (size_t) -1)
 436			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 437		    }
 438		}
 439	    }
 440	}
 441#endif /* RE_ENABLE_I18N */
 442      else if (type == OP_PERIOD
 443#ifdef RE_ENABLE_I18N
 444	       || type == OP_UTF8_PERIOD
 445#endif /* RE_ENABLE_I18N */
 446	       || type == END_OF_RE)
 447	{
 448	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 449	  if (type == END_OF_RE)
 450	    bufp->can_be_null = 1;
 451	  return;
 452	}
 453    }
 454}
 455
 456/* Entry point for POSIX code.  */
 457/* regcomp takes a regular expression as a string and compiles it.
 458
 459   PREG is a regex_t *.  We do not expect any fields to be initialized,
 460   since POSIX says we shouldn't.  Thus, we set
 461
 462     `buffer' to the compiled pattern;
 463     `used' to the length of the compiled pattern;
 464     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 465       REG_EXTENDED bit in CFLAGS is set; otherwise, to
 466       RE_SYNTAX_POSIX_BASIC;
 467     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 468     `fastmap' to an allocated space for the fastmap;
 469     `fastmap_accurate' to zero;
 470     `re_nsub' to the number of subexpressions in PATTERN.
 471
 472   PATTERN is the address of the pattern string.
 473
 474   CFLAGS is a series of bits which affect compilation.
 475
 476     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 477     use POSIX basic syntax.
 478
 479     If REG_NEWLINE is set, then . and [^...] don't match newline.
 480     Also, regexec will try a match beginning after every newline.
 481
 482     If REG_ICASE is set, then we considers upper- and lowercase
 483     versions of letters to be equivalent when matching.
 484
 485     If REG_NOSUB is set, then when PREG is passed to regexec, that
 486     routine will report only success or failure, and nothing about the
 487     registers.
 488
 489   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 490   the return codes and their meanings.)  */
 491
 492int
 493regcomp (regex_t *__restrict preg,
 494	 const char *__restrict pattern,
 495	 int cflags)
 496{
 497  reg_errcode_t ret;
 498  reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 499			 : RE_SYNTAX_POSIX_BASIC);
 500
 501  preg->buffer = NULL;
 502  preg->allocated = 0;
 503  preg->used = 0;
 504
 505  /* Try to allocate space for the fastmap.  */
 506  preg->fastmap = re_malloc (char, SBC_MAX);
 507  if (BE (preg->fastmap == NULL, 0))
 508    return REG_ESPACE;
 509
 510  syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 511
 512  /* If REG_NEWLINE is set, newlines are treated differently.  */
 513  if (cflags & REG_NEWLINE)
 514    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 515      syntax &= ~RE_DOT_NEWLINE;
 516      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 517      /* It also changes the matching behavior.  */
 518      preg->newline_anchor = 1;
 519    }
 520  else
 521    preg->newline_anchor = 0;
 522  preg->no_sub = !!(cflags & REG_NOSUB);
 523  preg->translate = NULL;
 524
 525  ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 526
 527  /* POSIX doesn't distinguish between an unmatched open-group and an
 528     unmatched close-group: both are REG_EPAREN.  */
 529  if (ret == REG_ERPAREN)
 530    ret = REG_EPAREN;
 531
 532  /* We have already checked preg->fastmap != NULL.  */
 533  if (BE (ret == REG_NOERROR, 1))
 534    /* Compute the fastmap now, since regexec cannot modify the pattern
 535       buffer.  This function never fails in this implementation.  */
 536    (void) re_compile_fastmap (preg);
 537  else
 538    {
 539      /* Some error occurred while compiling the expression.  */
 540      re_free (preg->fastmap);
 541      preg->fastmap = NULL;
 542    }
 543
 544  return (int) ret;
 545}
 546#ifdef _LIBC
 547weak_alias (__regcomp, regcomp)
 548#endif
 549
 550/* Returns a message corresponding to an error code, ERRCODE, returned
 551   from either regcomp or regexec.   We don't use PREG here.  */
 552
 553size_t
 554regerror(int errcode, const regex_t *__restrict preg,
 555	 char *__restrict errbuf, size_t errbuf_size)
 556{
 557  const char *msg;
 558  size_t msg_size;
 559
 560  if (BE (errcode < 0
 561	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
 562			       / sizeof (__re_error_msgid_idx[0])), 0))
 563    /* Only error codes returned by the rest of the code should be passed
 564       to this routine.  If we are given anything else, or if other regex
 565       code generates an invalid error code, then the program has a bug.
 566       Dump core so we can fix it.  */
 567    abort ();
 568
 569  msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 570
 571  msg_size = strlen (msg) + 1; /* Includes the null.  */
 572
 573  if (BE (errbuf_size != 0, 1))
 574    {
 575      if (BE (msg_size > errbuf_size, 0))
 576	{
 577	  memcpy (errbuf, msg, errbuf_size - 1);
 578	  errbuf[errbuf_size - 1] = 0;
 579	}
 580      else
 581	memcpy (errbuf, msg, msg_size);
 582    }
 583
 584  return msg_size;
 585}
 586#ifdef _LIBC
 587weak_alias (__regerror, regerror)
 588#endif
 589
 590
 591#ifdef RE_ENABLE_I18N
 592/* This static array is used for the map to single-byte characters when
 593   UTF-8 is used.  Otherwise we would allocate memory just to initialize
 594   it the same all the time.  UTF-8 is the preferred encoding so this is
 595   a worthwhile optimization.  */
 596#if __GNUC__ >= 3
 597static const bitset_t utf8_sb_map = {
 598  /* Set the first 128 bits.  */
 599  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 600};
 601#else /* ! (__GNUC__ >= 3) */
 602static bitset_t utf8_sb_map;
 603#endif /* __GNUC__ >= 3 */
 604#endif /* RE_ENABLE_I18N */
 605
 606
 607static void
 608free_dfa_content (re_dfa_t *dfa)
 609{
 610  int i, j;
 611
 612  if (dfa->nodes)
 613    for (i = 0; i < dfa->nodes_len; ++i)
 614      free_token (dfa->nodes + i);
 615  re_free (dfa->nexts);
 616  for (i = 0; i < dfa->nodes_len; ++i)
 617    {
 618      if (dfa->eclosures != NULL)
 619	re_node_set_free (dfa->eclosures + i);
 620      if (dfa->inveclosures != NULL)
 621	re_node_set_free (dfa->inveclosures + i);
 622      if (dfa->edests != NULL)
 623	re_node_set_free (dfa->edests + i);
 624    }
 625  re_free (dfa->edests);
 626  re_free (dfa->eclosures);
 627  re_free (dfa->inveclosures);
 628  re_free (dfa->nodes);
 629
 630  if (dfa->state_table)
 631    for (i = 0; i <= dfa->state_hash_mask; ++i)
 632      {
 633	struct re_state_table_entry *entry = dfa->state_table + i;
 634	for (j = 0; j < entry->num; ++j)
 635	  {
 636	    re_dfastate_t *state = entry->array[j];
 637	    free_state (state);
 638	  }
 639	re_free (entry->array);
 640      }
 641  re_free (dfa->state_table);
 642#ifdef RE_ENABLE_I18N
 643  if (dfa->sb_char != utf8_sb_map)
 644    re_free (dfa->sb_char);
 645#endif
 646  re_free (dfa->subexp_map);
 647#ifdef DEBUG
 648  re_free (dfa->re_str);
 649#endif
 650
 651  re_free (dfa);
 652}
 653
 654
 655/* Free dynamically allocated space used by PREG.  */
 656
 657void
 658regfree (regex_t *preg)
 659{
 660  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 661  if (BE (dfa != NULL, 1))
 662    free_dfa_content (dfa);
 663  preg->buffer = NULL;
 664  preg->allocated = 0;
 665
 666  re_free (preg->fastmap);
 667  preg->fastmap = NULL;
 668
 669  re_free (preg->translate);
 670  preg->translate = NULL;
 671}
 672#ifdef _LIBC
 673weak_alias (__regfree, regfree)
 674#endif
 675
 676/* Entry points compatible with 4.2 BSD regex library.  We don't define
 677   them unless specifically requested.  */
 678
 679#if defined _REGEX_RE_COMP || defined _LIBC
 680
 681/* BSD has one and only one pattern buffer.  */
 682static struct re_pattern_buffer re_comp_buf;
 683
 684char *
 685# ifdef _LIBC
 686/* Make these definitions weak in libc, so POSIX programs can redefine
 687   these names if they don't use our functions, and still use
 688   regcomp/regexec above without link errors.  */
 689weak_function
 690# endif
 691re_comp (s)
 692     const char *s;
 693{
 694  reg_errcode_t ret;
 695  char *fastmap;
 696
 697  if (!s)
 698    {
 699      if (!re_comp_buf.buffer)
 700	return gettext ("No previous regular expression");
 701      return 0;
 702    }
 703
 704  if (re_comp_buf.buffer)
 705    {
 706      fastmap = re_comp_buf.fastmap;
 707      re_comp_buf.fastmap = NULL;
 708      __regfree (&re_comp_buf);
 709      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 710      re_comp_buf.fastmap = fastmap;
 711    }
 712
 713  if (re_comp_buf.fastmap == NULL)
 714    {
 715      re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 716      if (re_comp_buf.fastmap == NULL)
 717	return (char *) gettext (__re_error_msgid
 718				 + __re_error_msgid_idx[(int) REG_ESPACE]);
 719    }
 720
 721  /* Since `re_exec' always passes NULL for the `regs' argument, we
 722     don't need to initialize the pattern buffer fields which affect it.  */
 723
 724  /* Match anchors at newlines.  */
 725  re_comp_buf.newline_anchor = 1;
 726
 727  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 728
 729  if (!ret)
 730    return NULL;
 731
 732  /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 733  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 734}
 735
 736#ifdef _LIBC
 737libc_freeres_fn (free_mem)
 738{
 739  __regfree (&re_comp_buf);
 740}
 741#endif
 742
 743#endif /* _REGEX_RE_COMP */
 744
 745/* Internal entry point.
 746   Compile the regular expression PATTERN, whose length is LENGTH.
 747   SYNTAX indicate regular expression's syntax.  */
 748
 749static reg_errcode_t
 750re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 751		     reg_syntax_t syntax)
 752{
 753  reg_errcode_t err = REG_NOERROR;
 754  re_dfa_t *dfa;
 755  re_string_t regexp;
 756
 757  /* Initialize the pattern buffer.  */
 758  preg->fastmap_accurate = 0;
 759  preg->syntax = syntax;
 760  preg->not_bol = preg->not_eol = 0;
 761  preg->used = 0;
 762  preg->re_nsub = 0;
 763  preg->can_be_null = 0;
 764  preg->regs_allocated = REGS_UNALLOCATED;
 765
 766  /* Initialize the dfa.  */
 767  dfa = (re_dfa_t *) preg->buffer;
 768  if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 769    {
 770      /* If zero allocated, but buffer is non-null, try to realloc
 771	 enough space.  This loses if buffer's address is bogus, but
 772	 that is the user's responsibility.  If ->buffer is NULL this
 773	 is a simple allocation.  */
 774      dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 775      if (dfa == NULL)
 776	return REG_ESPACE;
 777      preg->allocated = sizeof (re_dfa_t);
 778      preg->buffer = (unsigned char *) dfa;
 779    }
 780  preg->used = sizeof (re_dfa_t);
 781
 782  err = init_dfa (dfa, length);
 783  if (BE (err != REG_NOERROR, 0))
 784    {
 785      free_dfa_content (dfa);
 786      preg->buffer = NULL;
 787      preg->allocated = 0;
 788      return err;
 789    }
 790#ifdef DEBUG
 791  /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 792  dfa->re_str = re_malloc (char, length + 1);
 793  strncpy (dfa->re_str, pattern, length + 1);
 794#endif
 795
 796  __libc_lock_init (dfa->lock);
 797
 798  err = re_string_construct (&regexp, pattern, length, preg->translate,
 799			     syntax & RE_ICASE, dfa);
 800  if (BE (err != REG_NOERROR, 0))
 801    {
 802    re_compile_internal_free_return:
 803      free_workarea_compile (preg);
 804      re_string_destruct (&regexp);
 805      free_dfa_content (dfa);
 806      preg->buffer = NULL;
 807      preg->allocated = 0;
 808      return err;
 809    }
 810
 811  /* Parse the regular expression, and build a structure tree.  */
 812  preg->re_nsub = 0;
 813  dfa->str_tree = parse (&regexp, preg, syntax, &err);
 814  if (BE (dfa->str_tree == NULL, 0))
 815    goto re_compile_internal_free_return;
 816
 817  /* Analyze the tree and create the nfa.  */
 818  err = analyze (preg);
 819  if (BE (err != REG_NOERROR, 0))
 820    goto re_compile_internal_free_return;
 821
 822#ifdef RE_ENABLE_I18N
 823  /* If possible, do searching in single byte encoding to speed things up.  */
 824  if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 825    optimize_utf8 (dfa);
 826#endif
 827
 828  /* Then create the initial state of the dfa.  */
 829  err = create_initial_state (dfa);
 830
 831  /* Release work areas.  */
 832  free_workarea_compile (preg);
 833  re_string_destruct (&regexp);
 834
 835  if (BE (err != REG_NOERROR, 0))
 836    {
 837      free_dfa_content (dfa);
 838      preg->buffer = NULL;
 839      preg->allocated = 0;
 840    }
 841
 842  return err;
 843}
 844
 845/* Initialize DFA.  We use the length of the regular expression PAT_LEN
 846   as the initial length of some arrays.  */
 847
 848static reg_errcode_t
 849init_dfa (re_dfa_t *dfa, size_t pat_len)
 850{
 851  unsigned int table_size;
 852#ifndef _LIBC
 853  const char *codeset_name;
 854#endif
 855
 856  memset (dfa, '\0', sizeof (re_dfa_t));
 857
 858  /* Force allocation of str_tree_storage the first time.  */
 859  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 860
 861  /* Avoid overflows.  */
 862  if (pat_len == SIZE_MAX)
 863    return REG_ESPACE;
 864
 865  dfa->nodes_alloc = pat_len + 1;
 866  dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 867
 868  /*  table_size = 2 ^ ceil(log pat_len) */
 869  for (table_size = 1; ; table_size <<= 1)
 870    if (table_size > pat_len)
 871      break;
 872
 873  dfa->state_table = calloc (table_size, sizeof (struct re_state_table_entry));
 874  dfa->state_hash_mask = table_size - 1;
 875
 876  dfa->mb_cur_max = MB_CUR_MAX;
 877#ifdef _LIBC
 878  if (dfa->mb_cur_max == 6
 879      && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 880    dfa->is_utf8 = 1;
 881  dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 882		       != 0);
 883#else
 884# ifdef HAVE_LANGINFO_CODESET
 885  codeset_name = nl_langinfo (CODESET);
 886# else
 887  codeset_name = getenv ("LC_ALL");
 888  if (codeset_name == NULL || codeset_name[0] == '\0')
 889    codeset_name = getenv ("LC_CTYPE");
 890  if (codeset_name == NULL || codeset_name[0] == '\0')
 891    codeset_name = getenv ("LANG");
 892  if (codeset_name == NULL)
 893    codeset_name = "";
 894  else if (strchr (codeset_name, '.') !=  NULL)
 895    codeset_name = strchr (codeset_name, '.') + 1;
 896# endif
 897
 898  /* strcasecmp isn't a standard interface. brute force check */
 899#if 0
 900  if (strcasecmp (codeset_name, "UTF-8") == 0
 901      || strcasecmp (codeset_name, "UTF8") == 0)
 902    dfa->is_utf8 = 1;
 903#else
 904  if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 905      && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 906      && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 907      && (codeset_name[3] == '-'
 908          ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 909          : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 910    dfa->is_utf8 = 1;
 911#endif
 912
 913  /* We check exhaustively in the loop below if this charset is a
 914     superset of ASCII.  */
 915  dfa->map_notascii = 0;
 916#endif
 917
 918#ifdef RE_ENABLE_I18N
 919  if (dfa->mb_cur_max > 1)
 920    {
 921      if (dfa->is_utf8)
 922        {
 923#if !defined(__GNUC__) || __GNUC__ < 3
 924	  static short utf8_sb_map_inited = 0;
 925
 926	  if (! utf8_sb_map_inited)
 927	    {
 928		int i;
 929
 930	  	utf8_sb_map_inited = 0;
 931		for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 932		  utf8_sb_map[i] = BITSET_WORD_MAX;
 933	    }
 934#endif
 935	  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 936	}
 937      else
 938	{
 939	  int i, j, ch;
 940
 941	  dfa->sb_char = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t));
 942	  if (BE (dfa->sb_char == NULL, 0))
 943	    return REG_ESPACE;
 944
 945	  /* Set the bits corresponding to single byte chars.  */
 946	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 947	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 948	      {
 949		wint_t wch = __btowc (ch);
 950		if (wch != WEOF)
 951		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 952# ifndef _LIBC
 953		if (isascii (ch) && wch != ch)
 954		  dfa->map_notascii = 1;
 955# endif
 956	      }
 957	}
 958    }
 959#endif
 960
 961  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 962    return REG_ESPACE;
 963  return REG_NOERROR;
 964}
 965
 966/* Initialize WORD_CHAR table, which indicate which character is
 967   "word".  In this case "word" means that it is the word construction
 968   character used by some operators like "\<", "\>", etc.  */
 969
 970static void
 971internal_function
 972init_word_char (re_dfa_t *dfa)
 973{
 974  int i, j, ch;
 975  dfa->word_ops_used = 1;
 976  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 977    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 978      if (isalnum (ch) || ch == '_')
 979	dfa->word_char[i] |= (bitset_word_t) 1 << j;
 980}
 981
 982/* Free the work area which are only used while compiling.  */
 983
 984static void
 985free_workarea_compile (regex_t *preg)
 986{
 987  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 988  bin_tree_storage_t *storage, *next;
 989  for (storage = dfa->str_tree_storage; storage; storage = next)
 990    {
 991      next = storage->next;
 992      re_free (storage);
 993    }
 994  dfa->str_tree_storage = NULL;
 995  dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 996  dfa->str_tree = NULL;
 997  re_free (dfa->org_indices);
 998  dfa->org_indices = NULL;
 999}
1000
1001/* Create initial states for all contexts.  */
1002
1003static reg_errcode_t
1004create_initial_state (re_dfa_t *dfa)
1005{
1006  int first, i;
1007  reg_errcode_t err;
1008  re_node_set init_nodes;
1009
1010  /* Initial states have the epsilon closure of the node which is
1011     the first node of the regular expression.  */
1012  first = dfa->str_tree->first->node_idx;
1013  dfa->init_node = first;
1014  err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1015  if (BE (err != REG_NOERROR, 0))
1016    return err;
1017
1018  /* The back-references which are in initial states can epsilon transit,
1019     since in this case all of the subexpressions can be null.
1020     Then we add epsilon closures of the nodes which are the next nodes of
1021     the back-references.  */
1022  if (dfa->nbackref > 0)
1023    for (i = 0; i < init_nodes.nelem; ++i)
1024      {
1025	int node_idx = init_nodes.elems[i];
1026	re_token_type_t type = dfa->nodes[node_idx].type;
1027
1028	int clexp_idx;
1029	if (type != OP_BACK_REF)
1030	  continue;
1031	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1032	  {
1033	    re_token_t *clexp_node;
1034	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1035	    if (clexp_node->type == OP_CLOSE_SUBEXP
1036		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1037	      break;
1038	  }
1039	if (clexp_idx == init_nodes.nelem)
1040	  continue;
1041
1042	if (type == OP_BACK_REF)
1043	  {
1044	    int dest_idx = dfa->edests[node_idx].elems[0];
1045	    if (!re_node_set_contains (&init_nodes, dest_idx))
1046	      {
1047		reg_errcode_t err = re_node_set_merge (&init_nodes,
1048						       dfa->eclosures
1049						       + dest_idx);
1050		if (err != REG_NOERROR)
1051		  return err;
1052		i = 0;
1053	      }
1054	  }
1055      }
1056
1057  /* It must be the first time to invoke acquire_state.  */
1058  dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1059  /* We don't check ERR here, since the initial state must not be NULL.  */
1060  if (BE (dfa->init_state == NULL, 0))
1061    return err;
1062  if (dfa->init_state->has_constraint)
1063    {
1064      dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1065						       CONTEXT_WORD);
1066      dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1067						     CONTEXT_NEWLINE);
1068      dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1069							 &init_nodes,
1070							 CONTEXT_NEWLINE
1071							 | CONTEXT_BEGBUF);
1072      if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1073	      || dfa->init_state_begbuf == NULL, 0))
1074	return err;
1075    }
1076  else
1077    dfa->init_state_word = dfa->init_state_nl
1078      = dfa->init_state_begbuf = dfa->init_state;
1079
1080  re_node_set_free (&init_nodes);
1081  return REG_NOERROR;
1082}
1083
1084#ifdef RE_ENABLE_I18N
1085/* If it is possible to do searching in single byte encoding instead of UTF-8
1086   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1087   DFA nodes where needed.  */
1088
1089static void
1090optimize_utf8 (re_dfa_t *dfa)
1091{
1092  int node, i, mb_chars = 0, has_period = 0;
1093
1094  for (node = 0; node < dfa->nodes_len; ++node)
1095    switch (dfa->nodes[node].type)
1096      {
1097      case CHARACTER:
1098	if (dfa->nodes[node].opr.c >= 0x80)
1099	  mb_chars = 1;
1100	break;
1101      case ANCHOR:
1102	switch (dfa->nodes[node].opr.ctx_type)
1103	  {
1104	  case LINE_FIRST:
1105	  case LINE_LAST:
1106	  case BUF_FIRST:
1107	  case BUF_LAST:
1108	    break;
1109	  default:
1110	    /* Word anchors etc. cannot be handled.  It's okay to test
1111	       opr.ctx_type since constraints (for all DFA nodes) are
1112	       created by ORing one or more opr.ctx_type values.  */
1113	    return;
1114	  }
1115	break;
1116      case OP_PERIOD:
1117	has_period = 1;
1118	break;
1119      case OP_BACK_REF:
1120      case OP_ALT:
1121      case END_OF_RE:
1122      case OP_DUP_ASTERISK:
1123      case OP_OPEN_SUBEXP:
1124      case OP_CLOSE_SUBEXP:
1125	break;
1126      case COMPLEX_BRACKET:
1127	return;
1128      case SIMPLE_BRACKET:
1129	/* Just double check.  The non-ASCII range starts at 0x80.  */
1130	assert (0x80 % BITSET_WORD_BITS == 0);
1131	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1132	  if (dfa->nodes[node].opr.sbcset[i])
1133	    return;
1134	break;
1135      default:
1136	abort ();
1137      }
1138
1139  if (mb_chars || has_period)
1140    for (node = 0; node < dfa->nodes_len; ++node)
1141      {
1142	if (dfa->nodes[node].type == CHARACTER
1143	    && dfa->nodes[node].opr.c >= 0x80)
1144	  dfa->nodes[node].mb_partial = 0;
1145	else if (dfa->nodes[node].type == OP_PERIOD)
1146	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1147      }
1148
1149  /* The search can be in single byte locale.  */
1150  dfa->mb_cur_max = 1;
1151  dfa->is_utf8 = 0;
1152  dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1153}
1154#endif
1155
1156/* Analyze the structure tree, and calculate "first", "next", "edest",
1157   "eclosure", and "inveclosure".  */
1158
1159static reg_errcode_t
1160analyze (regex_t *preg)
1161{
1162  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1163  reg_errcode_t ret;
1164
1165  /* Allocate arrays.  */
1166  dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1167  dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1168  dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1169  dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1170  if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1171	  || dfa->eclosures == NULL, 0))
1172    return REG_ESPACE;
1173
1174  dfa->subexp_map = re_malloc (int, preg->re_nsub);
1175  if (dfa->subexp_map != NULL)
1176    {
1177      int i;
1178      for (i = 0; i < preg->re_nsub; i++)
1179	dfa->subexp_map[i] = i;
1180      preorder (dfa->str_tree, optimize_subexps, dfa);
1181      for (i = 0; i < preg->re_nsub; i++)
1182	if (dfa->subexp_map[i] != i)
1183	  break;
1184      if (i == preg->re_nsub)
1185	{
1186	  free (dfa->subexp_map);
1187	  dfa->subexp_map = NULL;
1188	}
1189    }
1190
1191  ret = postorder (dfa->str_tree, lower_subexps, preg);
1192  if (BE (ret != REG_NOERROR, 0))
1193    return ret;
1194  ret = postorder (dfa->str_tree, calc_first, dfa);
1195  if (BE (ret != REG_NOERROR, 0))
1196    return ret;
1197  preorder (dfa->str_tree, calc_next, dfa);
1198  ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1199  if (BE (ret != REG_NOERROR, 0))
1200    return ret;
1201  ret = calc_eclosure (dfa);
1202  if (BE (ret != REG_NOERROR, 0))
1203    return ret;
1204
1205  /* We only need this during the prune_impossible_nodes pass in regexec.c;
1206     skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1207  if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1208      || dfa->nbackref)
1209    {
1210      dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1211      if (BE (dfa->inveclosures == NULL, 0))
1212	return REG_ESPACE;
1213      ret = calc_inveclosure (dfa);
1214    }
1215
1216  return ret;
1217}
1218
1219/* Our parse trees are very unbalanced, so we cannot use a stack to
1220   implement parse tree visits.  Instead, we use parent pointers and
1221   some hairy code in these two functions.  */
1222static reg_errcode_t
1223postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1224	   void *extra)
1225{
1226  bin_tree_t *node, *prev;
1227
1228  for (node = root; ; )
1229    {
1230      /* Descend down the tree, preferably to the left (or to the right
1231	 if that's the only child).  */
1232      while (node->left || node->right)
1233	if (node->left)
1234	  node = node->left;
1235	else
1236	  node = node->right;
1237
1238      do
1239	{
1240	  reg_errcode_t err = fn (extra, node);
1241	  if (BE (err != REG_NOERROR, 0))
1242	    return err;
1243	  if (node->parent == NULL)
1244	    return REG_NOERROR;
1245	  prev = node;
1246	  node = node->parent;
1247	}
1248      /* Go up while we have a node that is reached from the right.  */
1249      while (node->right == prev || node->right == NULL);
1250      node = node->right;
1251    }
1252}
1253
1254static reg_errcode_t
1255preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1256	  void *extra)
1257{
1258  bin_tree_t *node;
1259
1260  for (node = root; ; )
1261    {
1262      reg_errcode_t err = fn (extra, node);
1263      if (BE (err != REG_NOERROR, 0))
1264	return err;
1265
1266      /* Go to the left node, or up and to the right.  */
1267      if (node->left)
1268	node = node->left;
1269      else
1270	{
1271	  bin_tree_t *prev = NULL;
1272	  while (node->right == prev || node->right == NULL)
1273	    {
1274	      prev = node;
1275	      node = node->parent;
1276	      if (!node)
1277		return REG_NOERROR;
1278	    }
1279	  node = node->right;
1280	}
1281    }
1282}
1283
1284/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1285   re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1286   backreferences as well.  Requires a preorder visit.  */
1287static reg_errcode_t
1288optimize_subexps (void *extra, bin_tree_t *node)
1289{
1290  re_dfa_t *dfa = (re_dfa_t *) extra;
1291
1292  if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1293    {
1294      int idx = node->token.opr.idx;
1295      node->token.opr.idx = dfa->subexp_map[idx];
1296      dfa->used_bkref_map |= 1 << node->token.opr.idx;
1297    }
1298
1299  else if (node->token.type == SUBEXP
1300	   && node->left && node->left->token.type == SUBEXP)
1301    {
1302      int other_idx = node->left->token.opr.idx;
1303
1304      node->left = node->left->left;
1305      if (node->left)
1306	node->left->parent = node;
1307
1308      dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1309      if (other_idx < BITSET_WORD_BITS)
1310	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1311    }
1312
1313  return REG_NOERROR;
1314}
1315
1316/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1317   of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1318static reg_errcode_t
1319lower_subexps (void *extra, bin_tree_t *node)
1320{
1321  regex_t *preg = (regex_t *) extra;
1322  reg_errcode_t err = REG_NOERROR;
1323
1324  if (node->left && node->left->token.type == SUBEXP)
1325    {
1326      node->left = lower_subexp (&err, preg, node->left);
1327      if (node->left)
1328	node->left->parent = node;
1329    }
1330  if (node->right && node->right->token.type == SUBEXP)
1331    {
1332      node->right = lower_subexp (&err, preg, node->right);
1333      if (node->right)
1334	node->right->parent = node;
1335    }
1336
1337  return err;
1338}
1339
1340static bin_tree_t *
1341lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1342{
1343  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1344  bin_tree_t *body = node->left;
1345  bin_tree_t *op, *cls, *tree1, *tree;
1346
1347  if (preg->no_sub
1348      /* We do not optimize empty subexpressions, because otherwise we may
1349	 have bad CONCAT nodes with NULL children.  This is obviously not
1350	 very common, so we do not lose much.  An example that triggers
1351	 this case is the sed "script" /\(\)/x.  */
1352      && node->left != NULL
1353      && (node->token.opr.idx >= BITSET_WORD_BITS
1354	  || !(dfa->used_bkref_map
1355	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1356    return node->left;
1357
1358  /* Convert the SUBEXP node to the concatenation of an
1359     OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1360  op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1361  cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1362  tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1363  tree = create_tree (dfa, op, tree1, CONCAT);
1364  if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1365    {
1366      *err = REG_ESPACE;
1367      return NULL;
1368    }
1369
1370  op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1371  op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1372  return tree;
1373}
1374
1375/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1376   nodes.  Requires a postorder visit.  */
1377static reg_errcode_t
1378calc_first (void *extra, bin_tree_t *node)
1379{
1380  re_dfa_t *dfa = (re_dfa_t *) extra;
1381  if (node->token.type == CONCAT)
1382    {
1383      node->first = node->left->first;
1384      node->node_idx = node->left->node_idx;
1385    }
1386  else
1387    {
1388      node->first = node;
1389      node->node_idx = re_dfa_add_node (dfa, node->token);
1390      if (BE (node->node_idx == -1, 0))
1391	return REG_ESPACE;
1392      if (node->token.type == ANCHOR)
1393	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1394    }
1395  return REG_NOERROR;
1396}
1397
1398/* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1399static reg_errcode_t
1400calc_next (void *extra, bin_tree_t *node)
1401{
1402  switch (node->token.type)
1403    {
1404    case OP_DUP_ASTERISK:
1405      node->left->next = node;
1406      break;
1407    case CONCAT:
1408      node->left->next = node->right->first;
1409      node->right->next = node->next;
1410      break;
1411    default:
1412      if (node->left)
1413	node->left->next = node->next;
1414      if (node->right)
1415	node->right->next = node->next;
1416      break;
1417    }
1418  return REG_NOERROR;
1419}
1420
1421/* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1422static reg_errcode_t
1423link_nfa_nodes (void *extra, bin_tree_t *node)
1424{
1425  re_dfa_t *dfa = (re_dfa_t *) extra;
1426  int idx = node->node_idx;
1427  reg_errcode_t err = REG_NOERROR;
1428
1429  switch (node->token.type)
1430    {
1431    case CONCAT:
1432      break;
1433
1434    case END_OF_RE:
1435      assert (node->next == NULL);
1436      break;
1437
1438    case OP_DUP_ASTERISK:
1439    case OP_ALT:
1440      {
1441	int left, right;
1442	dfa->has_plural_match = 1;
1443	if (node->left != NULL)
1444	  left = node->left->first->node_idx;
1445	else
1446	  left = node->next->node_idx;
1447	if (node->right != NULL)
1448	  right = node->right->first->node_idx;
1449	else
1450	  right = node->next->node_idx;
1451	assert (left > -1);
1452	assert (right > -1);
1453	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1454      }
1455      break;
1456
1457    case ANCHOR:
1458    case OP_OPEN_SUBEXP:
1459    case OP_CLOSE_SUBEXP:
1460      err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1461      break;
1462
1463    case OP_BACK_REF:
1464      dfa->nexts[idx] = node->next->node_idx;
1465      if (node->token.type == OP_BACK_REF)
1466	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1467      break;
1468
1469    default:
1470      assert (!IS_EPSILON_NODE (node->token.type));
1471      dfa->nexts[idx] = node->next->node_idx;
1472      break;
1473    }
1474
1475  return err;
1476}
1477
1478/* Duplicate the epsilon closure of the node ROOT_NODE.
1479   Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1480   to their own constraint.  */
1481
1482static reg_errcode_t
1483internal_function
1484duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1485			int root_node, unsigned int init_constraint)
1486{
1487  int org_node, clone_node, ret;
1488  unsigned int constraint = init_constraint;
1489  for (org_node = top_org_node, clone_node = top_clone_node;;)
1490    {
1491      int org_dest, clone_dest;
1492      if (dfa->nodes[org_node].type == OP_BACK_REF)
1493	{
1494	  /* If the back reference epsilon-transit, its destination must
1495	     also have the constraint.  Then duplicate the epsilon closure
1496	     of the destination of the back reference, and store it in
1497	     edests of the back reference.  */
1498	  org_dest = dfa->nexts[org_node];
1499	  re_node_set_empty (dfa->edests + clone_node);
1500	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1501	  if (BE (clone_dest == -1, 0))
1502	    return REG_ESPACE;
1503	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1504	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1505	  if (BE (ret < 0, 0))
1506	    return REG_ESPACE;
1507	}
1508      else if (dfa->edests[org_node].nelem == 0)
1509	{
1510	  /* In case of the node can't epsilon-transit, don't duplicate the
1511	     destination and store the original destination as the
1512	     destination of the node.  */
1513	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1514	  break;
1515	}
1516      else if (dfa->edests[org_node].nelem == 1)
1517	{
1518	  /* In case of the node can epsilon-transit, and it has only one
1519	     destination.  */
1520	  org_dest = dfa->edests[org_node].elems[0];
1521	  re_node_set_empty (dfa->edests + clone_node);
1522	  /* If the node is root_node itself, it means the epsilon clsoure
1523	     has a loop.   Then tie it to the destination of the root_node.  */
1524	  if (org_node == root_node && clone_node != org_node)
1525	    {
1526	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1527	      if (BE (ret < 0, 0))
1528		return REG_ESPACE;
1529	      break;
1530	    }
1531	  /* In case of the node has another constraint, add it.  */
1532	  constraint |= dfa->nodes[org_node].constraint;
1533	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1534	  if (BE (clone_dest == -1, 0))
1535	    return REG_ESPACE;
1536	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1537	  if (BE (ret < 0, 0))
1538	    return REG_ESPACE;
1539	}
1540      else /* dfa->edests[org_node].nelem == 2 */
1541	{
1542	  /* In case of the node can epsilon-transit, and it has two
1543	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1544	  org_dest = dfa->edests[org_node].elems[0];
1545	  re_node_set_empty (dfa->edests + clone_node);
1546	  /* Search for a duplicated node which satisfies the constraint.  */
1547	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1548	  if (clone_dest == -1)
1549	    {
1550	      /* There is no such duplicated node, create a new one.  */
1551	      reg_errcode_t err;
1552	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1553	      if (BE (clone_dest == -1, 0))
1554		return REG_ESPACE;
1555	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556	      if (BE (ret < 0, 0))
1557		return REG_ESPACE;
1558	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1559					    root_node, constraint);
1560	      if (BE (err != REG_NOERROR, 0))
1561		return err;
1562	    }
1563	  else
1564	    {
1565	      /* There is a duplicated node which satisfies the constraint,
1566		 use it to avoid infinite loop.  */
1567	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568	      if (BE (ret < 0, 0))
1569		return REG_ESPACE;
1570	    }
1571
1572	  org_dest = dfa->edests[org_node].elems[1];
1573	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1574	  if (BE (clone_dest == -1, 0))
1575	    return REG_ESPACE;
1576	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1577	  if (BE (ret < 0, 0))
1578	    return REG_ESPACE;
1579	}
1580      org_node = org_dest;
1581      clone_node = clone_dest;
1582    }
1583  return REG_NOERROR;
1584}
1585
1586/* Search for a node which is duplicated from the node ORG_NODE, and
1587   satisfies the constraint CONSTRAINT.  */
1588
1589static int
1590search_duplicated_node (const re_dfa_t *dfa, int org_node,
1591			unsigned int constraint)
1592{
1593  int idx;
1594  for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1595    {
1596      if (org_node == dfa->org_indices[idx]
1597	  && constraint == dfa->nodes[idx].constraint)
1598	return idx; /* Found.  */
1599    }
1600  return -1; /* Not found.  */
1601}
1602
1603/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1604   Return the index of the new node, or -1 if insufficient storage is
1605   available.  */
1606
1607static int
1608duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1609{
1610  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1611  if (BE (dup_idx != -1, 1))
1612    {
1613      dfa->nodes[dup_idx].constraint = constraint;
1614      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1615      dfa->nodes[dup_idx].duplicated = 1;
1616
1617      /* Store the index of the original node.  */
1618      dfa->org_indices[dup_idx] = org_idx;
1619    }
1620  return dup_idx;
1621}
1622
1623static reg_errcode_t
1624calc_inveclosure (re_dfa_t *dfa)
1625{
1626  int src, idx, ret;
1627  for (idx = 0; idx < dfa->nodes_len; ++idx)
1628    re_node_set_init_empty (dfa->inveclosures + idx);
1629
1630  for (src = 0; src < dfa->nodes_len; ++src)
1631    {
1632      int *elems = dfa->eclosures[src].elems;
1633      for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1634	{
1635	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1636	  if (BE (ret == -1, 0))
1637	    return REG_ESPACE;
1638	}
1639    }
1640
1641  return REG_NOERROR;
1642}
1643
1644/* Calculate "eclosure" for all the node in DFA.  */
1645
1646static reg_errcode_t
1647calc_eclosure (re_dfa_t *dfa)
1648{
1649  int node_idx, incomplete;
1650#ifdef DEBUG
1651  assert (dfa->nodes_len > 0);
1652#endif
1653  incomplete = 0;
1654  /* For each nodes, calculate epsilon closure.  */
1655  for (node_idx = 0; ; ++node_idx)
1656    {
1657      reg_errcode_t err;
1658      re_node_set eclosure_elem;
1659      if (node_idx == dfa->nodes_len)
1660	{
1661	  if (!incomplete)
1662	    break;
1663	  incomplete = 0;
1664	  node_idx = 0;
1665	}
1666
1667#ifdef DEBUG
1668      assert (dfa->eclosures[node_idx].nelem != -1);
1669#endif
1670
1671      /* If we have already calculated, skip it.  */
1672      if (dfa->eclosures[node_idx].nelem != 0)
1673	continue;
1674      /* Calculate epsilon closure of `node_idx'.  */
1675      err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1676      if (BE (err != REG_NOERROR, 0))
1677	return err;
1678
1679      if (dfa->eclosures[node_idx].nelem == 0)
1680	{
1681	  incomplete = 1;
1682	  re_node_set_free (&eclosure_elem);
1683	}
1684    }
1685  return REG_NOERROR;
1686}
1687
1688/* Calculate epsilon closure of NODE.  */
1689
1690static reg_errcode_t
1691calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1692{
1693  reg_errcode_t err;
1694  int i;
1695  re_node_set eclosure;
1696  int ret;
1697  int incomplete = 0;
1698  err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1699  if (BE (err != REG_NOERROR, 0))
1700    return err;
1701
1702  /* This indicates that we are calculating this node now.
1703     We reference this value to avoid infinite loop.  */
1704  dfa->eclosures[node].nelem = -1;
1705
1706  /* If the current node has constraints, duplicate all nodes
1707     since they must inherit the constraints.  */
1708  if (dfa->nodes[node].constraint
1709      && dfa->edests[node].nelem
1710      && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1711    {
1712      err = duplicate_node_closure (dfa, node, node, node,
1713				    dfa->nodes[node].constraint);
1714      if (BE (err != REG_NOERROR, 0))
1715	return err;
1716    }
1717
1718  /* Expand each epsilon destination nodes.  */
1719  if (IS_EPSILON_NODE(dfa->nodes[node].type))
1720    for (i = 0; i < dfa->edests[node].nelem; ++i)
1721      {
1722	re_node_set eclosure_elem;
1723	int edest = dfa->edests[node].elems[i];
1724	/* If calculating the epsilon closure of `edest' is in progress,
1725	   return intermediate result.  */
1726	if (dfa->eclosures[edest].nelem == -1)
1727	  {
1728	    incomplete = 1;
1729	    continue;
1730	  }
1731	/* If we haven't calculated the epsilon closure of `edest' yet,
1732	   calculate now. Otherwise use calculated epsilon closure.  */
1733	if (dfa->eclosures[edest].nelem == 0)
1734	  {
1735	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1736	    if (BE (err != REG_NOERROR, 0))
1737	      return err;
1738	  }
1739	else
1740	  eclosure_elem = dfa->eclosures[edest];
1741	/* Merge the epsilon closure of `edest'.  */
1742	err = re_node_set_merge (&eclosure, &eclosure_elem);
1743	if (BE (err != REG_NOERROR, 0))
1744	  return err;
1745	/* If the epsilon closure of `edest' is incomplete,
1746	   the epsilon closure of this node is also incomplete.  */
1747	if (dfa->eclosures[edest].nelem == 0)
1748	  {
1749	    incomplete = 1;
1750	    re_node_set_free (&eclosure_elem);
1751	  }
1752      }
1753
1754  /* An epsilon closure includes itself.  */
1755  ret = re_node_set_insert (&eclosure, node);
1756  if (BE (ret < 0, 0))
1757    return REG_ESPACE;
1758  if (incomplete && !root)
1759    dfa->eclosures[node].nelem = 0;
1760  else
1761    dfa->eclosures[node] = eclosure;
1762  *new_set = eclosure;
1763  return REG_NOERROR;
1764}
1765
1766/* Functions for token which are used in the parser.  */
1767
1768/* Fetch a token from INPUT.
1769   We must not use this function inside bracket expressions.  */
1770
1771static void
1772internal_function
1773fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1774{
1775  re_string_skip_bytes (input, peek_token (result, input, syntax));
1776}
1777
1778/* Peek a token from INPUT, and return the length of the token.
1779   We must not use this function inside bracket expressions.  */
1780
1781static int
1782internal_function
1783peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1784{
1785  unsigned char c;
1786
1787  if (re_string_eoi (input))
1788    {
1789      token->type = END_OF_RE;
1790      return 0;
1791    }
1792
1793  c = re_string_peek_byte (input, 0);
1794  token->opr.c = c;
1795
1796  token->word_char = 0;
1797#ifdef RE_ENABLE_I18N
1798  token->mb_partial = 0;
1799  if (input->mb_cur_max > 1 &&
1800      !re_string_first_byte (input, re_string_cur_idx (input)))
1801    {
1802      token->type = CHARACTER;
1803      token->mb_partial = 1;
1804      return 1;
1805    }
1806#endif
1807  if (c == '\\')
1808    {
1809      unsigned char c2;
1810      if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1811	{
1812	  token->type = BACK_SLASH;
1813	  return 1;
1814	}
1815
1816      c2 = re_string_peek_byte_case (input, 1);
1817      token->opr.c = c2;
1818      token->type = CHARACTER;
1819#ifdef RE_ENABLE_I18N
1820      if (input->mb_cur_max > 1)
1821	{
1822	  wint_t wc = re_string_wchar_at (input,
1823					  re_string_cur_idx (input) + 1);
1824	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1825	}
1826      else
1827#endif
1828	token->word_char = IS_WORD_CHAR (c2) != 0;
1829
1830      switch (c2)
1831	{
1832	case '|':
1833	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1834	    token->type = OP_ALT;
1835	  break;
1836	case '1': case '2': case '3': case '4': case '5':
1837	case '6': case '7': case '8': case '9':
1838	  if (!(syntax & RE_NO_BK_REFS))
1839	    {
1840	      token->type = OP_BACK_REF;
1841	      token->opr.idx = c2 - '1';
1842	    }
1843	  break;
1844	case '<':
1845	  if (!(syntax & RE_NO_GNU_OPS))
1846	    {
1847	      token->type = ANCHOR;
1848	      token->opr.ctx_type = WORD_FIRST;
1849	    }
1850	  break;
1851	case '>':
1852	  if (!(syntax & RE_NO_GNU_OPS))
1853	    {
1854	      token->type = ANCHOR;
1855	      token->opr.ctx_type = WORD_LAST;
1856	    }
1857	  break;
1858	case 'b':
1859	  if (!(syntax & RE_NO_GNU_OPS))
1860	    {
1861	      token->type = ANCHOR;
1862	      token->opr.ctx_type = WORD_DELIM;
1863	    }
1864	  break;
1865	case 'B':
1866	  if (!(syntax & RE_NO_GNU_OPS))
1867	    {
1868	      token->type = ANCHOR;
1869	      token->opr.ctx_type = NOT_WORD_DELIM;
1870	    }
1871	  break;
1872	case 'w':
1873	  if (!(syntax & RE_NO_GNU_OPS))
1874	    token->type = OP_WORD;
1875	  break;
1876	case 'W':
1877	  if (!(syntax & RE_NO_GNU_OPS))
1878	    token->type = OP_NOTWORD;
1879	  break;
1880	case 's':
1881	  if (!(syntax & RE_NO_GNU_OPS))
1882	    token->type = OP_SPACE;
1883	  break;
1884	case 'S':
1885	  if (!(syntax & RE_NO_GNU_OPS))
1886	    token->type = OP_NOTSPACE;
1887	  break;
1888	case '`':
1889	  if (!(syntax & RE_NO_GNU_OPS))
1890	    {
1891	      token->type = ANCHOR;
1892	      token->opr.ctx_type = BUF_FIRST;
1893	    }
1894	  break;
1895	case '\'':
1896	  if (!(syntax & RE_NO_GNU_OPS))
1897	    {
1898	      token->type = ANCHOR;
1899	      token->opr.ctx_type = BUF_LAST;
1900	    }
1901	  break;
1902	case '(':
1903	  if (!(syntax & RE_NO_BK_PARENS))
1904	    token->type = OP_OPEN_SUBEXP;
1905	  break;
1906	case ')':
1907	  if (!(syntax & RE_NO_BK_PARENS))
1908	    token->type = OP_CLOSE_SUBEXP;
1909	  break;
1910	case '+':
1911	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912	    token->type = OP_DUP_PLUS;
1913	  break;
1914	case '?':
1915	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916	    token->type = OP_DUP_QUESTION;
1917	  break;
1918	case '{':
1919	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920	    token->type = OP_OPEN_DUP_NUM;
1921	  break;
1922	case '}':
1923	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924	    token->type = OP_CLOSE_DUP_NUM;
1925	  break;
1926	default:
1927	  break;
1928	}
1929      return 2;
1930    }
1931
1932  token->type = CHARACTER;
1933#ifdef RE_ENABLE_I18N
1934  if (input->mb_cur_max > 1)
1935    {
1936      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1937      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1938    }
1939  else
1940#endif
1941    token->word_char = IS_WORD_CHAR (token->opr.c);
1942
1943  switch (c)
1944    {
1945    case '\n':
1946      if (syntax & RE_NEWLINE_ALT)
1947	token->type = OP_ALT;
1948      break;
1949    case '|':
1950      if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1951	token->type = OP_ALT;
1952      break;
1953    case '*':
1954      token->type = OP_DUP_ASTERISK;
1955      break;
1956    case '+':
1957      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958	token->type = OP_DUP_PLUS;
1959      break;
1960    case '?':
1961      if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962	token->type = OP_DUP_QUESTION;
1963      break;
1964    case '{':
1965      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966	token->type = OP_OPEN_DUP_NUM;
1967      break;
1968    case '}':
1969      if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970	token->type = OP_CLOSE_DUP_NUM;
1971      break;
1972    case '(':
1973      if (syntax & RE_NO_BK_PARENS)
1974	token->type = OP_OPEN_SUBEXP;
1975      break;
1976    case ')':
1977      if (syntax & RE_NO_BK_PARENS)
1978	token->type = OP_CLOSE_SUBEXP;
1979      break;
1980    case '[':
1981      token->type = OP_OPEN_BRACKET;
1982      break;
1983    case '.':
1984      token->type = OP_PERIOD;
1985      break;
1986    case '^':
1987      if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1988	  re_string_cur_idx (input) != 0)
1989	{
1990	  char prev = re_string_peek_byte (input, -1);
1991	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1992	    break;
1993	}
1994      token->type = ANCHOR;
1995      token->opr.ctx_type = LINE_FIRST;
1996      break;
1997    case '$':
1998      if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1999	  re_string_cur_idx (input) + 1 != re_string_length (input))
2000	{
2001	  re_token_t next;
2002	  re_string_skip_bytes (input, 1);
2003	  peek_token (&next, input, syntax);
2004	  re_string_skip_bytes (input, -1);
2005	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006	    break;
2007	}
2008      token->type = ANCHOR;
2009      token->opr.ctx_type = LINE_LAST;
2010      break;
2011    default:
2012      break;
2013    }
2014  return 1;
2015}
2016
2017/* Peek a token from INPUT, and return the length of the token.
2018   We must not use this function out of bracket expressions.  */
2019
2020static int
2021internal_function
2022peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2023{
2024  unsigned char c;
2025  if (re_string_eoi (input))
2026    {
2027      token->type = END_OF_RE;
2028      return 0;
2029    }
2030  c = re_string_peek_byte (input, 0);
2031  token->opr.c = c;
2032
2033#ifdef RE_ENABLE_I18N
2034  if (input->mb_cur_max > 1 &&
2035      !re_string_first_byte (input, re_string_cur_idx (input)))
2036    {
2037      token->type = CHARACTER;
2038      return 1;
2039    }
2040#endif /* RE_ENABLE_I18N */
2041
2042  if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2043      && re_string_cur_idx (input) + 1 < re_string_length (input))
2044    {
2045      /* In this case, '\' escape a character.  */
2046      unsigned char c2;
2047      re_string_skip_bytes (input, 1);
2048      c2 = re_string_peek_byte (input, 0);
2049      token->opr.c = c2;
2050      token->type = CHARACTER;
2051      return 1;
2052    }
2053  if (c == '[') /* '[' is a special char in a bracket exps.  */
2054    {
2055      unsigned char c2;
2056      int token_len;
2057      if (re_string_cur_idx (input) + 1 < re_string_length (input))
2058	c2 = re_string_peek_byte (input, 1);
2059      else
2060	c2 = 0;
2061      token->opr.c = c2;
2062      token_len = 2;
2063      switch (c2)
2064	{
2065	case '.':
2066	  token->type = OP_OPEN_COLL_ELEM;
2067	  break;
2068	case '=':
2069	  token->type = OP_OPEN_EQUIV_CLASS;
2070	  break;
2071	case ':':
2072	  if (syntax & RE_CHAR_CLASSES)
2073	    {
2074	      token->type = OP_OPEN_CHAR_CLASS;
2075	      break;
2076	    }
2077	  /* else fall through.  */
2078	default:
2079	  token->type = CHARACTER;
2080	  token->opr.c = c;
2081	  token_len = 1;
2082	  break;
2083	}
2084      return token_len;
2085    }
2086  switch (c)
2087    {
2088    case '-':
2089      token->type = OP_CHARSET_RANGE;
2090      break;
2091    case ']':
2092      token->type = OP_CLOSE_BRACKET;
2093      break;
2094    case '^':
2095      token->type = OP_NON_MATCH_LIST;
2096      break;
2097    default:
2098      token->type = CHARACTER;
2099    }
2100  return 1;
2101}
2102
2103/* Functions for parser.  */
2104
2105/* Entry point of the parser.
2106   Parse the regular expression REGEXP and return the structure tree.
2107   If an error has occurred, ERR is set by error code, and return NULL.
2108   This function build the following tree, from regular expression <reg_exp>:
2109	   CAT
2110	   / \
2111	  /   \
2112   <reg_exp>  EOR
2113
2114   CAT means concatenation.
2115   EOR means end of regular expression.  */
2116
2117static bin_tree_t *
2118parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2119       reg_errcode_t *err)
2120{
2121  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2122  bin_tree_t *tree, *eor, *root;
2123  re_token_t current_token;
2124  dfa->syntax = syntax;
2125  fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126  tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2128    return NULL;
2129  eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130  if (tree != NULL)
2131    root = create_tree (dfa, tree, eor, CONCAT);
2132  else
2133    root = eor;
2134  if (BE (eor == NULL || root == NULL, 0))
2135    {
2136      *err = REG_ESPACE;
2137      return NULL;
2138    }
2139  return root;
2140}
2141
2142/* This function build the following tree, from regular expression
2143   <branch1>|<branch2>:
2144	   ALT
2145	   / \
2146	  /   \
2147   <branch1> <branch2>
2148
2149   ALT means alternative, which represents the operator `|'.  */
2150
2151static bin_tree_t *
2152parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2154{
2155  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2156  bin_tree_t *tree, *branch = NULL;
2157  tree = parse_branch (regexp, preg, token, syntax, nest, err);
2158  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2159    return NULL;
2160
2161  while (token->type == OP_ALT)
2162    {
2163      fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2164      if (token->type != OP_ALT && token->type != END_OF_RE
2165	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166	{
2167	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2168	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2169	    return NULL;
2170	}
2171      else
2172	branch = NULL;
2173      tree = create_tree (dfa, tree, branch, OP_ALT);
2174      if (BE (tree == NULL, 0))
2175	{
2176	  *err = REG_ESPACE;
2177	  return NULL;
2178	}
2179    }
2180  return tree;
2181}
2182
2183/* This function build the following tree, from regular expression
2184   <exp1><exp2>:
2185	CAT
2186	/ \
2187       /   \
2188   <exp1> <exp2>
2189
2190   CAT means concatenation.  */
2191
2192static bin_tree_t *
2193parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2194	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2195{
2196  bin_tree_t *tree, *exp;
2197  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2198  tree = parse_expression (regexp, preg, token, syntax, nest, err);
2199  if (BE (*err != REG_NOERROR && tree == NULL, 0))
2200    return NULL;
2201
2202  while (token->type != OP_ALT && token->type != END_OF_RE
2203	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2204    {
2205      exp = parse_expression (regexp, preg, token, syntax, nest, err);
2206      if (BE (*err != REG_NOERROR && exp == NULL, 0))
2207	{
2208	  return NULL;
2209	}
2210      if (tree != NULL && exp != NULL)
2211	{
2212	  tree = create_tree (dfa, tree, exp, CONCAT);
2213	  if (tree == NULL)
2214	    {
2215	      *err = REG_ESPACE;
2216	      return NULL;
2217	    }
2218	}
2219      else if (tree == NULL)
2220	tree = exp;
2221      /* Otherwise exp == NULL, we don't need to create new tree.  */
2222    }
2223  return tree;
2224}
2225
2226/* This function build the following tree, from regular expression a*:
2227	 *
2228	 |
2229	 a
2230*/
2231
2232static bin_tree_t *
2233parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2234		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2235{
2236  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2237  bin_tree_t *tree;
2238  switch (token->type)
2239    {
2240    case CHARACTER:
2241      tree = create_token_tree (dfa, NULL, NULL, token);
2242      if (BE (tree == NULL, 0))
2243	{
2244	  *err = REG_ESPACE;
2245	  return NULL;
2246	}
2247#ifdef RE_ENABLE_I18N
2248      if (dfa->mb_cur_max > 1)
2249	{
2250	  while (!re_string_eoi (regexp)
2251		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2252	    {
2253	      bin_tree_t *mbc_remain;
2254	      fetch_token (token, regexp, syntax);
2255	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2256	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2257	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2258		{
2259		  *err = REG_ESPACE;
2260		  return NULL;
2261		}
2262	    }
2263	}
2264#endif
2265      break;
2266    case OP_OPEN_SUBEXP:
2267      tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2268      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2269	return NULL;
2270      break;
2271    case OP_OPEN_BRACKET:
2272      tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2273      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2274	return NULL;
2275      break;
2276    case OP_BACK_REF:
2277      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2278	{
2279	  *err = REG_ESUBREG;
2280	  return NULL;
2281	}
2282      dfa->used_bkref_map |= 1 << token->opr.idx;
2283      tree = create_token_tree (dfa, NULL, NULL, token);
2284      if (BE (tree == NULL, 0))
2285	{
2286	  *err = REG_ESPACE;
2287	  return NULL;
2288	}
2289      ++dfa->nbackref;
2290      dfa->has_mb_node = 1;
2291      break;
2292    case OP_OPEN_DUP_NUM:
2293      if (syntax & RE_CONTEXT_INVALID_DUP)
2294	{
2295	  *err = REG_BADRPT;
2296	  return NULL;
2297	}
2298      /* FALLTHROUGH */
2299    case OP_DUP_ASTERISK:
2300    case OP_DUP_PLUS:
2301    case OP_DUP_QUESTION:
2302      if (syntax & RE_CONTEXT_INVALID_OPS)
2303	{
2304	  *err = REG_BADRPT;
2305	  return NULL;
2306	}
2307      else if (syntax & RE_CONTEXT_INDEP_OPS)
2308	{
2309	  fetch_token (token, regexp, syntax);
2310	  return parse_expression (regexp, preg, token, syntax, nest, err);
2311	}
2312      /* else fall through  */
2313    case OP_CLOSE_SUBEXP:
2314      if ((token->type == OP_CLOSE_SUBEXP) &&
2315	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2316	{
2317	  *err = REG_ERPAREN;
2318	  return NULL;
2319	}
2320      /* else fall through  */
2321    case OP_CLOSE_DUP_NUM:
2322      /* We treat it as a normal character.  */
2323
2324      /* Then we can these characters as normal characters.  */
2325      token->type = CHARACTER;
2326      /* mb_partial and word_char bits should be initialized already
2327	 by peek_token.  */
2328      tree = create_token_tree (dfa, NULL, NULL, token);
2329      if (BE (tree == NULL, 0))
2330	{
2331	  *err = REG_ESPACE;
2332	  return NULL;
2333	}
2334      break;
2335    case ANCHOR:
2336      if ((token->opr.ctx_type
2337	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2338	  && dfa->word_ops_used == 0)
2339	init_word_char (dfa);
2340      if (token->opr.ctx_type == WORD_DELIM
2341	  || token->opr.ctx_type == NOT_WORD_DELIM)
2342	{
2343	  bin_tree_t *tree_first, *tree_last;
2344	  if (token->opr.ctx_type == WORD_DELIM)
2345	    {
2346	      token->opr.ctx_type = WORD_FIRST;
2347	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2348	      token->opr.ctx_type = WORD_LAST;
2349	    }
2350	  else
2351	    {
2352	      token->opr.ctx_type = INSIDE_WORD;
2353	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2354	      token->opr.ctx_type = INSIDE_NOTWORD;
2355	    }
2356	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2357	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2358	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2359	    {
2360	      *err = REG_ESPACE;
2361	      return NULL;
2362	    }
2363	}
2364      else
2365	{
2366	  tree = create_token_tree (dfa, NULL, NULL, token);
2367	  if (BE (tree == NULL, 0))
2368	    {
2369	      *err = REG_ESPACE;
2370	      return NULL;
2371	    }
2372	}
2373      /* We must return here, since ANCHORs can't be followed
2374	 by repetition operators.
2375	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2376	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2377      fetch_token (token, regexp, syntax);
2378      return tree;
2379    case OP_PERIOD:
2380      tree = create_token_tree (dfa, NULL, NULL, token);
2381      if (BE (tree == NULL, 0))
2382	{
2383	  *err = REG_ESPACE;
2384	  return NULL;
2385	}
2386      if (dfa->mb_cur_max > 1)
2387	dfa->has_mb_node = 1;
2388      break;
2389    case OP_WORD:
2390    case OP_NOTWORD:
2391      tree = build_charclass_op (dfa, regexp->trans,
2392				 "alnum",
2393				 "_",
2394				 token->type == OP_NOTWORD, err);
2395      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396	return NULL;
2397      break;
2398    case OP_SPACE:
2399    case OP_NOTSPACE:
2400      tree = build_charclass_op (dfa, regexp->trans,
2401				 "space",
2402				 "",
2403				 token->type == OP_NOTSPACE, err);
2404      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2405	return NULL;
2406      break;
2407    case OP_ALT:
2408    case END_OF_RE:
2409      return NULL;
2410    case BACK_SLASH:
2411      *err = REG_EESCAPE;
2412      return NULL;
2413    default:
2414      /* Must not happen?  */
2415#ifdef DEBUG
2416      assert (0);
2417#endif
2418      return NULL;
2419    }
2420  fetch_token (token, regexp, syntax);
2421
2422  while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2423	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2424    {
2425      tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2426      if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427	return NULL;
2428      /* In BRE consecutive duplications are not allowed.  */
2429      if ((syntax & RE_CONTEXT_INVALID_DUP)
2430	  && (token->type == OP_DUP_ASTERISK
2431	      || token->type == OP_OPEN_DUP_NUM))
2432	{
2433	  *err = REG_BADRPT;
2434	  return NULL;
2435	}
2436    }
2437
2438  return tree;
2439}
2440
2441/* This function build the following tree, from regular expression
2442   (<reg_exp>):
2443	 SUBEXP
2444	    |
2445	<reg_exp>
2446*/
2447
2448static bin_tree_t *
2449parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2450	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2451{
2452  re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2453  bin_tree_t *tree;
2454  size_t cur_nsub;
2455  cur_nsub = preg->re_nsub++;
2456
2457  fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2458
2459  /* The subexpression may be a null string.  */
2460  if (token->type == OP_CLOSE_SUBEXP)
2461    tree = NULL;
2462  else
2463    {
2464      tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2465      if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2466	*err = REG_EPAREN;
2467      if (BE (*err != REG_NOERROR, 0))
2468	return NULL;
2469    }
2470
2471  if (cur_nsub <= '9' - '1')
2472    dfa->completed_bkref_map |= 1 << cur_nsub;
2473
2474  tree = create_tree (dfa, tree, NULL, SUBEXP);
2475  if (BE (tree == NULL, 0))
2476    {
2477      *err = REG_ESPACE;
2478      return NULL;
2479    }
2480  tree->token.opr.idx = cur_nsub;
2481  return tree;
2482}
2483
2484/* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2485
2486static bin_tree_t *
2487parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2488	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2489{
2490  bin_tree_t *tree = NULL, *old_tree = NULL;
2491  int i, start, end, start_idx = re_string_cur_idx (regexp);
2492#ifndef RE_TOKEN_INIT_BUG
2493  re_token_t start_token = *token;
2494#else
2495  re_token_t start_token;
2496
2497  memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2498#endif
2499
2500  if (token->type == OP_OPEN_DUP_NUM)
2501    {
2502      end = 0;
2503      start = fetch_number (regexp, token, syntax);
2504      if (start == -1)
2505	{
2506	  if (token->type == CHARACTER && token->opr.c == ',')
2507	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2508	  else
2509	    {
2510	      *err = REG_BADBR; /* <re>{} is invalid.  */
2511	      return NULL;
2512	    }
2513	}
2514      if (BE (start != -2, 1))
2515	{
2516	  /* We treat "{n}" as "{n,n}".  */
2517	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2518		 : ((token->type == CHARACTER && token->opr.c == ',')
2519		    ? fetch_number (regexp, token, syntax) : -2));
2520	}
2521      if (BE (start == -2 || end == -2, 0))
2522	{
2523	  /* Invalid sequence.  */
2524	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2525	    {
2526	      if (token->type == END_OF_RE)
2527		*err = REG_EBRACE;
2528	      else
2529		*err = REG_BADBR;
2530
2531	      return NULL;
2532	    }
2533
2534	  /* If the syntax bit is set, rollback.  */
2535	  re_string_set_index (regexp, start_idx);
2536	  *token = start_token;
2537	  token->type = CHARACTER;
2538	  /* mb_partial and word_char bits should be already initialized by
2539	     peek_token.  */
2540	  return elem;
2541	}
2542
2543      if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2544	{
2545	  /* First number greater than second.  */
2546	  *err = REG_BADBR;
2547	  return NULL;
2548	}
2549    }
2550  else
2551    {
2552      start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2553      end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554    }
2555
2556  fetch_token (token, regexp, syntax);
2557
2558  if (BE (elem == NULL, 0))
2559    return NULL;
2560  if (BE (start == 0 && end == 0, 0))
2561    {
2562      postorder (elem, free_tree, NULL);
2563      return NULL;
2564    }
2565
2566  /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2567  if (BE (start > 0, 0))
2568    {
2569      tree = elem;
2570      for (i = 2; i <= start; ++i)
2571	{
2572	  elem = duplicate_tree (elem, dfa);
2573	  tree = create_tree (dfa, tree, elem, CONCAT);
2574	  if (BE (elem == NULL || tree == NULL, 0))
2575	    goto parse_dup_op_espace;
2576	}
2577
2578      if (start == end)
2579	return tree;
2580
2581      /* Duplicate ELEM before it is marked optional.  */
2582      elem = duplicate_tree (elem, dfa);
2583      old_tree = tree;
2584    }
2585  else
2586    old_tree = NULL;
2587
2588  if (elem->token.type == SUBEXP)
2589    postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2590
2591  tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2592  if (BE (tree == NULL, 0))
2593    goto parse_dup_op_espace;
2594
2595  /* This loop is actually executed only when end != -1,
2596     to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2597     already created the start+1-th copy.  */
2598  for (i = start + 2; i <= end; ++i)
2599    {
2600      elem = duplicate_tree (elem, dfa);
2601      tree = create_tree (dfa, tree, elem, CONCAT);
2602      if (BE (elem == NULL || tree == NULL, 0))
2603	goto parse_dup_op_espace;
2604
2605      tree = create_tree (dfa, tree, NULL, OP_ALT);
2606      if (BE (tree == NULL, 0))
2607	goto parse_dup_op_espace;
2608    }
2609
2610  if (old_tree)
2611    tree = create_tree (dfa, old_tree, tree, CONCAT);
2612
2613  return tree;
2614
2615 parse_dup_op_espace:
2616  *err = REG_ESPACE;
2617  return NULL;
2618}
2619
2620/* Size of the names for collating symbol/equivalence_class/character_class.
2621   I'm not sure, but maybe enough.  */
2622#define BRACKET_NAME_BUF_SIZE 32
2623
2624#ifndef _LIBC
2625  /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2626     Build the range expression which starts from START_ELEM, and ends
2627     at END_ELEM.  The result are written to MBCSET and SBCSET.
2628     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2629     mbcset->range_ends, is a pointer argument since we may
2630     update it.  */
2631
2632static reg_errcode_t
2633internal_function
2634# ifdef RE_ENABLE_I18N
2635build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2636		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2637# else /* not RE_ENABLE_I18N */
2638build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2639		 bracket_elem_t *end_elem)
2640# endif /* not RE_ENABLE_I18N */
2641{
2642  unsigned int start_ch, end_ch;
2643  /* Equivalence Classes and Character Classes can't be a range start/end.  */
2644  if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2645	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2646	  0))
2647    return REG_ERANGE;
2648
2649  /* We can handle no multi character collating elements without libc
2650     support.  */
2651  if (BE ((start_elem->type == COLL_SYM
2652	   && strlen ((char *) start_elem->opr.name) > 1)
2653	  || (end_elem->type == COLL_SYM
2654	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2655    return REG_ECOLLATE;
2656
2657# ifdef RE_ENABLE_I18N
2658  {
2659    wchar_t wc;
2660    wint_t start_wc;
2661    wint_t end_wc;
2662    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2663
2664    start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2665		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2666		   : 0));
2667    end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2668	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2669		 : 0));
2670#ifdef GAWK
2671    /*
2672     * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2673     * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2674     * unsigned, so we don't have sign extension problems.
2675     */
2676    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2677		? start_ch : start_elem->opr.wch);
2678    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2679	      ? end_ch : end_elem->opr.wch);
2680#else
2681    start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2682		? __btowc (start_ch) : start_elem->opr.wch);
2683    end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2684	      ? __btowc (end_ch) : end_elem->opr.wch);
2685#endif
2686    if (start_wc == WEOF || end_wc == WEOF)
2687      return REG_ECOLLATE;
2688    cmp_buf[0] = start_wc;
2689    cmp_buf[4] = end_wc;
2690    if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2691      return REG_ERANGE;
2692
2693    /* Got valid collation sequence values, add them as a new entry.
2694       However, for !_LIBC we have no collation elements: if the
2695       character set is single byte, the single byte character set
2696       that we build below suffices.  parse_bracket_exp passes
2697       no MBCSET if dfa->mb_cur_max == 1.  */
2698    if (mbcset)
2699      {
2700	/* Check the space of the arrays.  */
2701	if (BE (*range_alloc == mbcset->nranges, 0))
2702	  {
2703	    /* There is not enough space, need realloc.  */
2704	    wchar_t *new_array_start, *new_array_end;
2705	    int new_nranges;
2706
2707	    /* +1 in case of mbcset->nranges is 0.  */
2708	    new_nranges = 2 * mbcset->nranges + 1;
2709	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2710	       are NULL if *range_alloc == 0.  */
2711	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2712					  new_nranges);
2713	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2714					new_nranges);
2715
2716	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2717	      return REG_ESPACE;
2718
2719	    mbcset->range_starts = new_array_start;
2720	    mbcset->range_ends = new_array_end;
2721	    *range_alloc = new_nranges;
2722	  }
2723
2724	mbcset->range_starts[mbcset->nranges] = start_wc;
2725	mbcset->range_ends[mbcset->nranges++] = end_wc;
2726      }
2727
2728    /* Build the table for single byte characters.  */
2729    for (wc = 0; wc < SBC_MAX; ++wc)
2730      {
2731	cmp_buf[2] = wc;
2732	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2733	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2734	  bitset_set (sbcset, wc);
2735      }
2736  }
2737# else /* not RE_ENABLE_I18N */
2738  {
2739    unsigned int ch;
2740    start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2741		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2742		   : 0));
2743    end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2744	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2745		 : 0));
2746    if (start_ch > end_ch)
2747      return REG_ERANGE;
2748    /* Build the table for single byte characters.  */
2749    for (ch = 0; ch < SBC_MAX; ++ch)
2750      if (start_ch <= ch  && ch <= end_ch)
2751	bitset_set (sbcset, ch);
2752  }
2753# endif /* not RE_ENABLE_I18N */
2754  return REG_NOERROR;
2755}
2756#endif /* not _LIBC */
2757
2758#ifndef _LIBC
2759/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2760   Build the collating element which is represented by NAME.
2761   The result are written to MBCSET and SBCSET.
2762   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2763   pointer argument since we may update it.  */
2764
2765static reg_errcode_t
2766internal_function
2767# ifdef RE_ENABLE_I18N
2768build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2769			int *coll_sym_alloc, const unsigned char *name)
2770# else /* not RE_ENABLE_I18N */
2771build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2772# endif /* not RE_ENABLE_I18N */
2773{
2774  size_t name_len = strlen ((const char *) name);
2775  if (BE (name_len != 1, 0))
2776    return REG_ECOLLATE;
2777  else
2778    {
2779      bitset_set (sbcset, name[0]);
2780      return REG_NOERROR;
2781    }
2782}
2783#endif /* not _LIBC */
2784
2785/* This function parse bracket expression like "[abc]", "[a-c]",
2786   "[[.a-a.]]" etc.  */
2787
2788static bin_tree_t *
2789parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2790		   reg_syntax_t syntax, reg_errcode_t *err)
2791{
2792#ifdef _LIBC
2793  const unsigned char *collseqmb;
2794  const char *collseqwc;
2795  uint32_t nrules;
2796  int32_t table_size;
2797  const int32_t *symb_table;
2798  const unsigned char *extra;
2799
2800  /* Local function for parse_bracket_exp used in _LIBC environment.
2801     Seek the collating symbol entry correspondings to NAME.
2802     Return the index of the symbol in the SYMB_TABLE.  */
2803
2804  auto inline int32_t
2805  __attribute ((always_inline))
2806  seek_collating_symbol_entry (name, name_len)
2807	 const unsigned char *name;
2808	 size_t name_len;
2809    {
2810      int32_t hash = elem_hash ((const char *) name, name_len);
2811      int32_t elem = hash % table_size;
2812      if (symb_table[2 * elem] != 0)
2813	{
2814	  int32_t second = hash % (table_size - 2) + 1;
2815
2816	  do
2817	    {
2818	      /* First compare the hashing value.  */
2819	      if (symb_table[2 * elem] == hash
2820		  /* Compare the length of the name.  */
2821		  && name_len == extra[symb_table[2 * elem + 1]]
2822		  /* Compare the name.  */
2823		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2824			     name_len) == 0)
2825		{
2826		  /* Yep, this is the entry.  */
2827		  break;
2828		}
2829
2830	      /* Next entry.  */
2831	      elem += second;
2832	    }
2833	  while (symb_table[2 * elem] != 0);
2834	}
2835      return elem;
2836    }
2837
2838  /* Local function for parse_bracket_exp used in _LIBC environment.
2839     Look up the collation sequence value of BR_ELEM.
2840     Return the value if succeeded, UINT_MAX otherwise.  */
2841
2842  auto inline unsigned int
2843  __attribute ((always_inline))
2844  lookup_collation_sequence_value (br_elem)
2845	 bracket_elem_t *br_elem;
2846    {
2847      if (br_elem->type == SB_CHAR)
2848	{
2849	  /*
2850	  if (MB_CUR_MAX == 1)
2851	  */
2852	  if (nrules == 0)
2853	    return collseqmb[br_elem->opr.ch];
2854	  else
2855	    {
2856	      wint_t wc = __btowc (br_elem->opr.ch);
2857	      return __collseq_table_lookup (collseqwc, wc);
2858	    }
2859	}
2860      else if (br_elem->type == MB_CHAR)
2861	{
2862	  if (nrules != 0)
2863	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2864	}
2865      else if (br_elem->type == COLL_SYM)
2866	{
2867	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2868	  if (nrules != 0)
2869	    {
2870	      int32_t elem, idx;
2871	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2872						  sym_name_len);
2873	      if (symb_table[2 * elem] != 0)
2874		{
2875		  /* We found the entry.  */
2876		  idx = symb_table[2 * elem + 1];
2877		  /* Skip the name of collating element name.  */
2878		  idx += 1 + extra[idx];
2879		  /* Skip the byte sequence of the collating element.  */
2880		  idx += 1 + extra[idx];
2881		  /* Adjust for the alignment.  */
2882		  idx = (idx + 3) & ~3;
2883		  /* Skip the multibyte collation sequence value.  */
2884		  idx += sizeof (unsigned int);
2885		  /* Skip the wide char sequence of the collating element.  */
2886		  idx += sizeof (unsigned int) *
2887		    (1 + *(unsigned int *) (extra + idx));
2888		  /* Return the collation sequence value.  */
2889		  return *(unsigned int *) (extra + idx);
2890		}
2891	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2892		{
2893		  /* No valid character.  Match it as a single byte
2894		     character.  */
2895		  return collseqmb[br_elem->opr.name[0]];
2896		}
2897	    }
2898	  else if (sym_name_len == 1)
2899	    return collseqmb[br_elem->opr.name[0]];
2900	}
2901      return UINT_MAX;
2902    }
2903
2904  /* Local function for parse_bracket_exp used in _LIBC environment.
2905     Build the range expression which starts from START_ELEM, and ends
2906     at END_ELEM.  The result are written to MBCSET and SBCSET.
2907     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2908     mbcset->range_ends, is a pointer argument since we may
2909     update it.  */
2910
2911  auto inline reg_errcode_t
2912  __attribute ((always_inline))
2913  build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2914	 re_charset_t *mbcset;
2915	 int *range_alloc;
2916	 bitset_t sbcset;
2917	 bracket_elem_t *start_elem, *end_elem;
2918    {
2919      unsigned int ch;
2920      uint32_t start_collseq;
2921      uint32_t end_collseq;
2922
2923      /* Equivalence Classes and Character Classes can't be a range
2924	 start/end.  */
2925      if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2926	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2927	      0))
2928	return REG_ERANGE;
2929
2930      start_collseq = lookup_collation_sequence_value (start_elem);
2931      end_collseq = lookup_collation_sequence_value (end_elem);
2932      /* Check start/end collation sequence values.  */
2933      if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2934	return REG_ECOLLATE;
2935      if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2936	return REG_ERANGE;
2937
2938      /* Got valid collation sequence values, add them as a new entry.
2939	 However, if we have no collation elements, and the character set
2940	 is single byte, the single byte character set that we
2941	 build below suffices. */
2942      if (nrules > 0 || dfa->mb_cur_max > 1)
2943	{
2944	  /* Check the space of the arrays.  */
2945	  if (BE (*range_alloc == mbcset->nranges, 0))
2946	    {
2947	      /* There is not enough space, need realloc.  */
2948	      uint32_t *new_array_start;
2949	      uint32_t *new_array_end;
2950	      int new_nranges;
2951
2952	      /* +1 in case of mbcset->nranges is 0.  */
2953	      new_nranges = 2 * mbcset->nranges + 1;
2954	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2955					    new_nranges);
2956	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2957					  new_nranges);
2958
2959	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2960		return REG_ESPACE;
2961
2962	      mbcset->range_starts = new_array_start;
2963	      mbcset->range_ends = new_array_end;
2964	      *range_alloc = new_nranges;
2965	    }
2966
2967	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2968	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2969	}
2970
2971      /* Build the table for single byte characters.  */
2972      for (ch = 0; ch < SBC_MAX; ch++)
2973	{
2974	  uint32_t ch_collseq;
2975	  /*
2976	  if (MB_CUR_MAX == 1)
2977	  */
2978	  if (nrules == 0)
2979	    ch_collseq = collseqmb[ch];
2980	  else
2981	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2982	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2983	    bitset_set (sbcset, ch);
2984	}
2985      return REG_NOERROR;
2986    }
2987
2988  /* Local function for parse_bracket_exp used in _LIBC environment.
2989     Build the collating element which is represented by NAME.
2990     The result are written to MBCSET and SBCSET.
2991     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2992     pointer argument since we may update it.  */
2993
2994  auto inline reg_errcode_t
2995  __attribute ((always_inline))
2996  build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2997	 re_charset_t *mbcset;
2998	 int *coll_sym_alloc;
2999	 bitset_t sbcset;
3000	 const unsigned char *name;
3001    {
3002      int32_t elem, idx;
3003      size_t name_len = strlen ((const char *) name);
3004      if (nrules != 0)
3005	{
3006	  elem = seek_collating_symbol_entry (name, name_len);
3007	  if (symb_table[2 * elem] != 0)
3008	    {
3009	      /* We found the entry.  */
3010	      idx = symb_table[2 * elem + 1];
3011	      /* Skip the name of collating element name.  */
3012	      idx += 1 + extra[idx];
3013	    }
3014	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3015	    {
3016	      /* No valid character, treat it as a normal
3017		 character.  */
3018	      bitset_set (sbcset, name[0]);
3019	      return REG_NOERROR;
3020	    }
3021	  else
3022	    return REG_ECOLLATE;
3023
3024	  /* Got valid collation sequence, add it as a new entry.  */
3025	  /* Check the space of the arrays.  */
3026	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3027	    {
3028	      /* Not enough, realloc it.  */
3029	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3030	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3031	      /* Use realloc since mbcset->coll_syms is NULL
3032		 if *alloc == 0.  */
3033	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3034						   new_coll_sym_alloc);
3035	      if (BE (new_coll_syms == NULL, 0))
3036		return REG_ESPACE;
3037	      mbcset->coll_syms = new_coll_syms;
3038	      *coll_sym_alloc = new_coll_sym_alloc;
3039	    }
3040	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3041	  return REG_NOERROR;
3042	}
3043      else
3044	{
3045	  if (BE (name_len != 1, 0))
3046	    return REG_ECOLLATE;
3047	  else
3048	    {
3049	      bitset_set (sbcset, name[0]);
3050	      return REG_NOERROR;
3051	    }
3052	}
3053    }
3054#endif
3055
3056  re_token_t br_token;
3057  re_bitset_ptr_t sbcset;
3058#ifdef RE_ENABLE_I18N
3059  re_charset_t *mbcset;
3060  int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3061  int equiv_class_alloc = 0, char_class_alloc = 0;
3062#endif /* not RE_ENABLE_I18N */
3063  int non_match = 0;
3064  bin_tree_t *work_tree;
3065  int token_len;
3066  int first_round = 1;
3067#ifdef _LIBC
3068  collseqmb = (const unsigned char *)
3069    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3070  nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3071  if (nrules)
3072    {
3073      /*
3074      if (MB_CUR_MAX > 1)
3075      */
3076      collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3077      table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3078      symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3079						  _NL_COLLATE_SYMB_TABLEMB);
3080      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3081						   _NL_COLLATE_SYMB_EXTRAMB);
3082    }
3083#endif
3084  sbcset = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t));
3085#ifdef RE_ENABLE_I18N
3086  mbcset = (re_charset_t *) calloc (1, sizeof (re_charset_t));
3087#endif /* RE_ENABLE_I18N */
3088#ifdef RE_ENABLE_I18N
3089  if (BE (sbcset == NULL || mbcset == NULL, 0))
3090#else
3091  if (BE (sbcset == NULL, 0))
3092#endif /* RE_ENABLE_I18N */
3093    {
3094      *err = REG_ESPACE;
3095      return NULL;
3096    }
3097
3098  token_len = peek_token_bracket (token, regexp, syntax);
3099  if (BE (token->type == END_OF_RE, 0))
3100    {
3101      *err = REG_BADPAT;
3102      goto parse_bracket_exp_free_return;
3103    }
3104  if (token->type == OP_NON_MATCH_LIST)
3105    {
3106#ifdef RE_ENABLE_I18N
3107      mbcset->non_match = 1;
3108#endif /* not RE_ENABLE_I18N */
3109      non_match = 1;
3110      if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3111	bitset_set (sbcset, '\n');
3112      re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3113      token_len = peek_token_bracket (token, regexp, syntax);
3114      if (BE (token->type == END_OF_RE, 0))
3115	{
3116	  *err = REG_BADPAT;
3117	  goto parse_bracket_exp_free_return;
3118	}
3119    }
3120
3121  /* We treat the first ']' as a normal character.  */
3122  if (token->type == OP_CLOSE_BRACKET)
3123    token->type = CHARACTER;
3124
3125  while (1)
3126    {
3127      bracket_elem_t start_elem, end_elem;
3128      unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3129      unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3130      reg_errcode_t ret;
3131      int token_len2 = 0, is_range_exp = 0;
3132      re_token_t token2;
3133
3134      start_elem.opr.name = start_name_buf;
3135      ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3136				   syntax, first_round);
3137      if (BE (ret != REG_NOERROR, 0))
3138	{
3139	  *err = ret;
3140	  goto parse_bracket_exp_free_return;
3141	}
3142      first_round = 0;
3143
3144      /* Get information about the next token.  We need it in any case.  */
3145      token_len = peek_token_bracket (token, regexp, syntax);
3146
3147      /* Do not check for ranges if we know they are not allowed.  */
3148      if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3149	{
3150	  if (BE (token->type == END_OF_RE, 0))
3151	    {
3152	      *err = REG_EBRACK;
3153	      goto parse_bracket_exp_free_return;
3154	    }
3155	  if (token->type == OP_CHARSET_RANGE)
3156	    {
3157	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3158	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3159	      if (BE (token2.type == END_OF_RE, 0))
3160		{
3161		  *err = REG_EBRACK;
3162		  goto parse_bracket_exp_free_return;
3163		}
3164	      if (token2.type == OP_CLOSE_BRACKET)
3165		{
3166		  /* We treat the last '-' as a normal character.  */
3167		  re_string_skip_bytes (regexp, -token_len);
3168		  token->type = CHARACTER;
3169		}
3170	      else
3171		is_range_exp = 1;
3172	    }
3173	}
3174
3175      if (is_range_exp == 1)
3176	{
3177	  end_elem.opr.name = end_name_buf;
3178	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3179				       dfa, syntax, 1);
3180	  if (BE (ret != REG_NOERROR, 0))
3181	    {
3182	      *err = ret;
3183	      goto parse_bracket_exp_free_return;
3184	    }
3185
3186	  token_len = peek_token_bracket (token, regexp, syntax);
3187
3188#ifdef _LIBC
3189	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3190				  &start_elem, &end_elem);
3191#else
3192# ifdef RE_ENABLE_I18N
3193	  *err = build_range_exp (sbcset,
3194				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3195				  &range_alloc, &start_elem, &end_elem);
3196# else
3197	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3198# endif
3199#endif /* RE_ENABLE_I18N */
3200	  if (BE (*err != REG_NOERROR, 0))
3201	    goto parse_bracket_exp_free_return;
3202	}
3203      else
3204	{
3205	  switch (start_elem.type)
3206	    {
3207	    case SB_CHAR:
3208	      bitset_set (sbcset, start_elem.opr.ch);
3209	      break;
3210#ifdef RE_ENABLE_I18N
3211	    case MB_CHAR:
3212	      /* Check whether the array has enough space.  */
3213	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3214		{
3215		  wchar_t *new_mbchars;
3216		  /* Not enough, realloc it.  */
3217		  /* +1 in case of mbcset->nmbchars is 0.  */
3218		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3219		  /* Use realloc since array is NULL if *alloc == 0.  */
3220		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3221					    mbchar_alloc);
3222		  if (BE (new_mbchars == NULL, 0))
3223		    goto parse_bracket_exp_espace;
3224		  mbcset->mbchars = new_mbchars;
3225		}
3226	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3227	      break;
3228#endif /* RE_ENABLE_I18N */
3229	    case EQUIV_CLASS:
3230	      *err = build_equiv_class (sbcset,
3231#ifdef RE_ENABLE_I18N
3232					mbcset, &equiv_class_alloc,
3233#endif /* RE_ENABLE_I18N */
3234					start_elem.opr.name);
3235	      if (BE (*err != REG_NOERROR, 0))
3236		goto parse_bracket_exp_free_return;
3237	      break;
3238	    case COLL_SYM:
3239	      *err = build_collating_symbol (sbcset,
3240#ifdef RE_ENABLE_I18N
3241					     mbcset, &coll_sym_alloc,
3242#endif /* RE_ENABLE_I18N */
3243					     start_elem.opr.name);
3244	      if (BE (*err != REG_NOERROR, 0))
3245		goto parse_bracket_exp_free_return;
3246	      break;
3247	    case CHAR_CLASS:
3248	      *err = build_charclass (regexp->trans, sbcset,
3249#ifdef RE_ENABLE_I18N
3250				      mbcset, &char_class_alloc,
3251#endif /* RE_ENABLE_I18N */
3252				      (const char *) start_elem.opr.name, syntax);
3253	      if (BE (*err != REG_NOERROR, 0))
3254	       goto parse_bracket_exp_free_return;
3255	      break;
3256	    default:
3257	      assert (0);
3258	      break;
3259	    }
3260	}
3261      if (BE (token->type == END_OF_RE, 0))
3262	{
3263	  *err = REG_EBRACK;
3264	  goto parse_bracket_exp_free_return;
3265	}
3266      if (token->type == OP_CLOSE_BRACKET)
3267	break;
3268    }
3269
3270  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3271
3272  /* If it is non-matching list.  */
3273  if (non_match)
3274    bitset_not (sbcset);
3275
3276#ifdef RE_ENABLE_I18N
3277  /* Ensure only single byte characters are set.  */
3278  if (dfa->mb_cur_max > 1)
3279    bitset_mask (sbcset, dfa->sb_char);
3280
3281  if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3282      || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3283						     || mbcset->non_match)))
3284    {
3285      bin_tree_t *mbc_tree;
3286      int sbc_idx;
3287      /* Build a tree for complex bracket.  */
3288      dfa->has_mb_node = 1;
3289      br_token.type = COMPLEX_BRACKET;
3290      br_token.opr.mbcset = mbcset;
3291      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3292      if (BE (mbc_tree == NULL, 0))
3293	goto parse_bracket_exp_espace;
3294      for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3295	if (sbcset[sbc_idx])
3296	  break;
3297      /* If there are no bits set in sbcset, there is no point
3298	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3299      if (sbc_idx < BITSET_WORDS)
3300	{
3301	  /* Build a tree for simple bracket.  */
3302	  br_token.type = SIMPLE_BRACKET;
3303	  br_token.opr.sbcset = sbcset;
3304	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3305	  if (BE (work_tree == NULL, 0))
3306	    goto parse_bracket_exp_espace;
3307
3308	  /* Then join them by ALT node.  */
3309	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3310	  if (BE (work_tree == NULL, 0))
3311	    goto parse_bracket_exp_espace;
3312	}
3313      else
3314	{
3315	  re_free (sbcset);
3316	  work_tree = mbc_tree;
3317	}
3318    }
3319  else
3320#endif /* not RE_ENABLE_I18N */
3321    {
3322#ifdef RE_ENABLE_I18N
3323      free_charset (mbcset);
3324#endif
3325      /* Build a tree for simple bracket.  */
3326      br_token.type = SIMPLE_BRACKET;
3327      br_token.opr.sbcset = sbcset;
3328      work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3329      if (BE (work_tree == NULL, 0))
3330	goto parse_bracket_exp_espace;
3331    }
3332  return work_tree;
3333
3334 parse_bracket_exp_espace:
3335  *err = REG_ESPACE;
3336 parse_bracket_exp_free_return:
3337  re_free (sbcset);
3338#ifdef RE_ENABLE_I18N
3339  free_charset (mbcset);
3340#endif /* RE_ENABLE_I18N */
3341  return NULL;
3342}
3343
3344/* Parse an element in the bracket expression.  */
3345
3346static reg_errcode_t
3347parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3348		       re_token_t *token, int token_len, re_dfa_t *dfa,
3349		       reg_syntax_t syntax, int accept_hyphen)
3350{
3351#ifdef RE_ENABLE_I18N
3352  int cur_char_size;
3353  cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3354  if (cur_char_size > 1)
3355    {
3356      elem->type = MB_CHAR;
3357      elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3358      re_string_skip_bytes (regexp, cur_char_size);
3359      return REG_NOERROR;
3360    }
3361#endif /* RE_ENABLE_I18N */
3362  re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3363  if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3364      || token->type == OP_OPEN_EQUIV_CLASS)
3365    return parse_bracket_symbol (elem, regexp, token);
3366  if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3367    {
3368      /* A '-' must only appear as anything but a range indicator before
3369	 the closing bracket.  Everything else is an error.  */
3370      re_token_t token2;
3371      (void) peek_token_bracket (&token2, regexp, syntax);
3372      if (token2.type != OP_CLOSE_BRACKET)
3373	/* The actual error value is not standardized since this whole
3374	   case is undefined.  But ERANGE makes good sense.  */
3375	return REG_ERANGE;
3376    }
3377  elem->type = SB_CHAR;
3378  elem->opr.ch = token->opr.c;
3379  return REG_NOERROR;
3380}
3381
3382/* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3383   such as [:<character_class>:], [.<collating_element>.], and
3384   [=<equivalent_class>=].  */
3385
3386static reg_errcode_t
3387parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3388		      re_token_t *token)
3389{
3390  unsigned char ch, delim = token->opr.c;
3391  int i = 0;
3392  if (re_string_eoi(regexp))
3393    return REG_EBRACK;
3394  for (;; ++i)
3395    {
3396      if (i >= BRACKET_NAME_BUF_SIZE)
3397	return REG_EBRACK;
3398      if (token->type == OP_OPEN_CHAR_CLASS)
3399	ch = re_string_fetch_byte_case (regexp);
3400      else
3401	ch = re_string_fetch_byte (regexp);
3402      if (re_string_eoi(regexp))
3403	return REG_EBRACK;
3404      if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3405	break;
3406      elem->opr.name[i] = ch;
3407    }
3408  re_string_skip_bytes (regexp, 1);
3409  elem->opr.name[i] = '\0';
3410  switch (token->type)
3411    {
3412    case OP_OPEN_COLL_ELEM:
3413      elem->type = COLL_SYM;
3414      break;
3415    case OP_OPEN_EQUIV_CLASS:
3416      elem->type = EQUIV_CLASS;
3417      break;
3418    case OP_OPEN_CHAR_CLASS:
3419      elem->type = CHAR_CLASS;
3420      break;
3421    default:
3422      break;
3423    }
3424  return REG_NOERROR;
3425}
3426
3427  /* Helper function for parse_bracket_exp.
3428     Build the equivalence class which is represented by NAME.
3429     The result are written to MBCSET and SBCSET.
3430     EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3431     is a pointer argument since we may update it.  */
3432
3433static reg_errcode_t
3434#ifdef RE_ENABLE_I18N
3435build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3436		   int *equiv_class_alloc, const unsigned char *name)
3437#else /* not RE_ENABLE_I18N */
3438build_equiv_class (bitset_t sbcset, const unsigned char *name)
3439#endif /* not RE_ENABLE_I18N */
3440{
3441#ifdef _LIBC
3442  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3443  if (nrules != 0)
3444    {
3445      const int32_t *table, *indirect;
3446      const unsigned char *weights, *extra, *cp;
3447      unsigned char char_buf[2];
3448      int32_t idx1, idx2;
3449      unsigned int ch;
3450      size_t len;
3451      /* This #include defines a local function!  */
3452# include <locale/weight.h>
3453      /* Calculate the index for equivalence class.  */
3454      cp = name;
3455      table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3456      weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3457					       _NL_COLLATE_WEIGHTMB);
3458      extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3459						   _NL_COLLATE_EXTRAMB);
3460      indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3461						_NL_COLLATE_INDIRECTMB);
3462      idx1 = findidx (&cp);
3463      if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3464	/* This isn't a valid character.  */
3465	return REG_ECOLLATE;
3466
3467      /* Build single byte matching table for this equivalence class.  */
3468      char_buf[1] = (unsigned char) '\0';
3469      len = weights[idx1 & 0xffffff];
3470      for (ch = 0; ch < SBC_MAX; ++ch)
3471	{
3472	  char_buf[0] = ch;
3473	  cp = char_buf;
3474	  idx2 = findidx (&cp);
3475/*
3476	  idx2 = table[ch];
3477*/
3478	  if (idx2 == 0)
3479	    /* This isn't a valid character.  */
3480	    continue;
3481	  /* Compare only if the length matches and the collation rule
3482	     index is the same.  */
3483	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3484	    {
3485	      int cnt = 0;
3486
3487	      while (cnt <= len &&
3488		     weights[(idx1 & 0xffffff) + 1 + cnt]
3489		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3490		++cnt;
3491
3492	      if (cnt > len)
3493		bitset_set (sbcset, ch);
3494	    }
3495	}
3496      /* Check whether the array has enough space.  */
3497      if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3498	{
3499	  /* Not enough, realloc it.  */
3500	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3501	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3502	  /* Use realloc since the array is NULL if *alloc == 0.  */
3503	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3504						   int32_t,
3505						   new_equiv_class_alloc);
3506	  if (BE (new_equiv_classes == NULL, 0))
3507	    return REG_ESPACE;
3508	  mbcset->equiv_classes = new_equiv_classes;
3509	  *equiv_class_alloc = new_equiv_class_alloc;
3510	}
3511      mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3512    }
3513  else
3514#endif /* _LIBC */
3515    {
3516      if (BE (strlen ((const char *) name) != 1, 0))
3517	return REG_ECOLLATE;
3518      bitset_set (sbcset, *name);
3519    }
3520  return REG_NOERROR;
3521}
3522
3523  /* Helper function for parse_bracket_exp.
3524     Build the character class which is represented by NAME.
3525     The result are written to MBCSET and SBCSET.
3526     CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3527     is a pointer argument since we may update it.  */
3528
3529static reg_errcode_t
3530#ifdef RE_ENABLE_I18N
3531build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3532		 re_charset_t *mbcset, int *char_class_alloc,
3533		 const char *class_name, reg_syntax_t syntax)
3534#else /* not RE_ENABLE_I18N */
3535build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3536		 const char *class_name, reg_syntax_t syntax)
3537#endif /* not RE_ENABLE_I18N */
3538{
3539  int i;
3540
3541  /* In case of REG_ICASE "upper" and "lower" match the both of
3542     upper and lower cases.  */
3543  if ((syntax & RE_ICASE)
3544      && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3545    class_name = "alpha";
3546
3547#ifdef RE_ENABLE_I18N
3548  /* Check the space of the arrays.  */
3549  if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3550    {
3551      /* Not enough, realloc it.  */
3552      /* +1 in case of mbcset->nchar_classes is 0.  */
3553      int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3554      /* Use realloc since array is NULL if *alloc == 0.  */
3555      wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3556					       new_char_class_alloc);
3557      if (BE (new_char_classes == NULL, 0))
3558	return REG_ESPACE;
3559      mbcset->char_classes = new_char_classes;
3560      *char_class_alloc = new_char_class_alloc;
3561    }
3562  mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3563#endif /* RE_ENABLE_I18N */
3564
3565#define BUILD_CHARCLASS_LOOP(ctype_func)	\
3566  do {						\
3567    if (BE (trans != NULL, 0))			\
3568      {						\
3569	for (i = 0; i < SBC_MAX; ++i)		\
3570  	  if (ctype_func (i))			\
3571	    bitset_set (sbcset, trans[i]);	\
3572      }						\
3573    else					\
3574      {						\
3575	for (i = 0; i < SBC_MAX; ++i)		\
3576  	  if (ctype_func (i))			\
3577	    bitset_set (sbcset, i);		\
3578      }						\
3579  } while (0)
3580
3581  if (strcmp (class_name, "alnum") == 0)
3582    BUILD_CHARCLASS_LOOP (isalnum);
3583  else if (strcmp (class_name, "cntrl") == 0)
3584    BUILD_CHARCLASS_LOOP (iscntrl);
3585  else if (strcmp (class_name, "lower") == 0)
3586    BUILD_CHARCLASS_LOOP (islower);
3587  else if (strcmp (class_name, "space") == 0)
3588    BUILD_CHARCLASS_LOOP (isspace);
3589  else if (strcmp (class_name, "alpha") == 0)
3590    BUILD_CHARCLASS_LOOP (isalpha);
3591  else if (strcmp (class_name, "digit") == 0)
3592    BUILD_CHARCLASS_LOOP (isdigit);
3593  else if (strcmp (class_name, "print") == 0)
3594    BUILD_CHARCLASS_LOOP (isprint);
3595  else if (strcmp (class_name, "upper") == 0)
3596    BUILD_CHARCLASS_LOOP (isupper);
3597  else if (strcmp (class_name, "blank") == 0)
3598#ifndef GAWK
3599    BUILD_CHARCLASS_LOOP (isblank);
3600#else
3601    /* see comments above */
3602    BUILD_CHARCLASS_LOOP (is_blank);
3603#endif
3604  else if (strcmp (class_name, "graph") == 0)
3605    BUILD_CHARCLASS_LOOP (isgraph);
3606  else if (strcmp (class_name, "punct") == 0)
3607    BUILD_CHARCLASS_LOOP (ispunct);
3608  else if (strcmp (class_name, "xdigit") == 0)
3609    BUILD_CHARCLASS_LOOP (isxdigit);
3610  else
3611    return REG_ECTYPE;
3612
3613  return REG_NOERROR;
3614}
3615
3616static bin_tree_t *
3617build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3618		    const char *class_name,
3619		    const char *extra, int non_match,
3620		    reg_errcode_t *err)
3621{
3622  re_bitset_ptr_t sbcset;
3623#ifdef RE_ENABLE_I18N
3624  re_charset_t *mbcset;
3625  int alloc = 0;
3626#endif /* not RE_ENABLE_I18N */
3627  reg_errcode_t ret;
3628  re_token_t br_token;
3629  bin_tree_t *tree;
3630
3631  sbcset = (re_bitset_ptr_t) calloc (1, sizeof (bitset_t));
3632#ifdef RE_ENABLE_I18N
3633  mbcset = (re_charset_t *) calloc (1, sizeof (re_charset_t));
3634#endif /* RE_ENABLE_I18N */
3635
3636#ifdef RE_ENABLE_I18N
3637  if (BE (sbcset == NULL || mbcset == NULL, 0))
3638#else /* not RE_ENABLE_I18N */
3639  if (BE (sbcset == NULL, 0))
3640#endif /* not RE_ENABLE_I18N */
3641    {
3642      *err = REG_ESPACE;
3643      return NULL;
3644    }
3645
3646  if (non_match)
3647    {
3648#ifdef RE_ENABLE_I18N
3649      mbcset->non_match = 1;
3650#endif /* not RE_ENABLE_I18N */
3651    }
3652
3653  /* We don't care the syntax in this case.  */
3654  ret = build_charclass (trans, sbcset,
3655#ifdef RE_ENABLE_I18N
3656			 mbcset, &alloc,
3657#endif /* RE_ENABLE_I18N */
3658			 class_name, 0);
3659
3660  if (BE (ret != REG_NOERROR, 0))
3661    {
3662      re_free (sbcset);
3663#ifdef RE_ENABLE_I18N
3664      free_charset (mbcset);
3665#endif /* RE_ENABLE_I18N */
3666      *err = ret;
3667      return NULL;
3668    }
3669  /* \w match '_' also.  */
3670  for (; *extra; extra++)
3671    bitset_set (sbcset, *extra);
3672
3673  /* If it is non-matching list.  */
3674  if (non_match)
3675    bitset_not (sbcset);
3676
3677#ifdef RE_ENABLE_I18N
3678  /* Ensure only single byte characters are set.  */
3679  if (dfa->mb_cur_max > 1)
3680    bitset_mask (sbcset, dfa->sb_char);
3681#endif
3682
3683  /* Build a tree for simple bracket.  */
3684  br_token.type = SIMPLE_BRACKET;
3685  br_token.opr.sbcset = sbcset;
3686  tree = create_token_tree (dfa, NULL, NULL, &br_token);
3687  if (BE (tree == NULL, 0))
3688    goto build_word_op_espace;
3689
3690#ifdef RE_ENABLE_I18N
3691  if (dfa->mb_cur_max > 1)
3692    {
3693      bin_tree_t *mbc_tree;
3694      /* Build a tree for complex bracket.  */
3695      br_token.type = COMPLEX_BRACKET;
3696      br_token.opr.mbcset = mbcset;
3697      dfa->has_mb_node = 1;
3698      mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3699      if (BE (mbc_tree == NULL, 0))
3700	goto build_word_op_espace;
3701      /* Then join them by ALT node.  */
3702      tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3703      if (BE (mbc_tree != NULL, 1))
3704	return tree;
3705    }
3706  else
3707    {
3708      free_charset (mbcset);
3709      return tree;
3710    }
3711#else /* not RE_ENABLE_I18N */
3712  return tree;
3713#endif /* not RE_ENABLE_I18N */
3714
3715 build_word_op_espace:
3716  re_free (sbcset);
3717#ifdef RE_ENABLE_I18N
3718  free_charset (mbcset);
3719#endif /* RE_ENABLE_I18N */
3720  *err = REG_ESPACE;
3721  return NULL;
3722}
3723
3724/* This is intended for the expressions like "a{1,3}".
3725   Fetch a number from `input', and return the number.
3726   Return -1, if the number field is empty like "{,1}".
3727   Return -2, if an error has occurred.  */
3728
3729static int
3730fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3731{
3732  int num = -1;
3733  unsigned char c;
3734  while (1)
3735    {
3736      fetch_token (token, input, syntax);
3737      c = token->opr.c;
3738      if (BE (token->type == END_OF_RE, 0))
3739	return -2;
3740      if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3741	break;
3742      num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3743	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3744      num = (num > RE_DUP_MAX) ? -2 : num;
3745    }
3746  return num;
3747}
3748
3749#ifdef RE_ENABLE_I18N
3750static void
3751free_charset (re_charset_t *cset)
3752{
3753  re_free (cset->mbchars);
3754# ifdef _LIBC
3755  re_free (cset->coll_syms);
3756  re_free (cset->equiv_classes);
3757  re_free (cset->range_starts);
3758  re_free (cset->range_ends);
3759# endif
3760  re_free (cset->char_classes);
3761  re_free (cset);
3762}
3763#endif /* RE_ENABLE_I18N */
3764
3765/* Functions for binary tree operation.  */
3766
3767/* Create a tree node.  */
3768
3769static bin_tree_t *
3770create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3771	     re_token_type_t type)
3772{
3773  re_token_t t;
3774  t.type = type;
3775  return create_token_tree (dfa, left, right, &t);
3776}
3777
3778static bin_tree_t *
3779create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3780		   const re_token_t *token)
3781{
3782  bin_tree_t *tree;
3783  if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3784    {
3785      bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3786
3787      if (storage == NULL)
3788	return NULL;
3789      storage->next = dfa->str_tree_storage;
3790      dfa->str_tree_storage = storage;
3791      dfa->str_tree_storage_idx = 0;
3792    }
3793  tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3794
3795  tree->parent = NULL;
3796  tree->left = left;
3797  tree->right = right;
3798  tree->token = *token;
3799  tree->token.duplicated = 0;
3800  tree->token.opt_subexp = 0;
3801  tree->first = NULL;
3802  tree->next = NULL;
3803  tree->node_idx = -1;
3804
3805  if (left != NULL)
3806    left->parent = tree;
3807  if (right != NULL)
3808    right->parent = tree;
3809  return tree;
3810}
3811
3812/* Mark the tree SRC as an optional subexpression.
3813   To be called from preorder or postorder.  */
3814
3815static reg_errcode_t
3816mark_opt_subexp (void *extra, bin_tree_t *node)
3817{
3818  int idx = (int) (intptr_t) extra;
3819  if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3820    node->token.opt_subexp = 1;
3821
3822  return REG_NOERROR;
3823}
3824
3825/* Free the allocated memory inside NODE. */
3826
3827static void
3828free_token (re_token_t *node)
3829{
3830#ifdef RE_ENABLE_I18N
3831  if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3832    free_charset (node->opr.mbcset);
3833  else
3834#endif /* RE_ENABLE_I18N */
3835    if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3836      re_free (node->opr.sbcset);
3837}
3838
3839/* Worker function for tree walking.  Free the allocated memory inside NODE
3840   and its children. */
3841
3842static reg_errcode_t
3843free_tree (void *extra, bin_tree_t *node)
3844{
3845  free_token (&node->token);
3846  return REG_NOERROR;
3847}
3848
3849
3850/* Duplicate the node SRC, and return new node.  This is a preorder
3851   visit similar to the one implemented by the generic visitor, but
3852   we need more infrastructure to maintain two parallel trees --- so,
3853   it's easier to duplicate.  */
3854
3855static bin_tree_t *
3856duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3857{
3858  const bin_tree_t *node;
3859  bin_tree_t *dup_root;
3860  bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3861
3862  for (node = root; ; )
3863    {
3864      /* Create a new tree and link it back to the current parent.  */
3865      *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3866      if (*p_new == NULL)
3867	return NULL;
3868      (*p_new)->parent = dup_node;
3869      (*p_new)->token.duplicated = 1;
3870      dup_node = *p_new;
3871
3872      /* Go to the left node, or up and to the right.  */
3873      if (node->left)
3874	{
3875	  node = node->left;
3876	  p_new = &dup_node->left;
3877	}
3878      else
3879	{
3880	  const bin_tree_t *prev = NULL;
3881	  while (node->right == prev || node->right == NULL)
3882	    {
3883	      prev = node;
3884	      node = node->parent;
3885	      dup_node = dup_node->parent;
3886	      if (!node)
3887		return dup_root;
3888	    }
3889	  node = node->right;
3890	  p_new = &dup_node->right;
3891	}
3892    }
3893}