1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 #ifndef _REGEX_INTERNAL_H
22 #define _REGEX_INTERNAL_H 1
24 /* Number of bits in a byte. */
26 /* Number of single byte character. */
29 #define COLL_ELEM_LEN_MAX 8
31 /* The character which represents newline. */
32 #define NEWLINE_CHAR '\n'
34 /* Rename to standard API for using out of glibc. */
36 # define __wctype wctype
37 # define __iswctype iswctype
38 # define __btowc btowc
39 # define __mempcpy memcpy
40 # define attribute_hidden
41 #endif /* not _LIBC */
43 extern const char __re_error_msgid[] attribute_hidden;
44 extern const size_t __re_error_msgid_idx[] attribute_hidden;
46 /* Number of bits in an unsinged int. */
47 #define UINT_BITS (sizeof (unsigned int) * BYTE_BITS)
48 /* Number of unsigned int in an bit_set. */
49 #define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)
50 typedef unsigned int bitset[BITSET_UINTS];
51 typedef unsigned int *re_bitset_ptr_t;
53 #define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS)
54 #define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS))
55 #define bitset_contain(set,i) (set[i / UINT_BITS] & (1 << i % UINT_BITS))
56 #define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS)
57 #define bitset_set_all(set) \
58 memset (set, 255, sizeof (unsigned int) * BITSET_UINTS)
59 #define bitset_copy(dest,src) \
60 memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS)
61 static inline void bitset_not (bitset set);
62 static inline void bitset_merge (bitset dest, const bitset src);
64 static inline void bitset_not_merge (bitset dest, const bitset src);
67 #define PREV_WORD_CONSTRAINT 0x0001
68 #define PREV_NOTWORD_CONSTRAINT 0x0002
69 #define NEXT_WORD_CONSTRAINT 0x0004
70 #define NEXT_NOTWORD_CONSTRAINT 0x0008
71 #define PREV_NEWLINE_CONSTRAINT 0x0010
72 #define NEXT_NEWLINE_CONSTRAINT 0x0020
73 #define PREV_BEGBUF_CONSTRAINT 0x0040
74 #define NEXT_ENDBUF_CONSTRAINT 0x0080
75 #define DUMMY_CONSTRAINT 0x0100
79 INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
80 WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
81 WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
82 LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
83 LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
84 BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
85 BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
86 WORD_DELIM = DUMMY_CONSTRAINT
100 /* Token type, these are used only by token. */
110 OP_CLOSE_EQUIV_CLASS,
117 /* Tree type, these are used only by tree. */
122 #ifdef RE_ENABLE_I18N
124 #endif /* RE_ENABLE_I18N */
126 /* Node type, These are used by token, node, tree. */
144 #ifdef RE_ENABLE_I18N
147 /* Multibyte characters. */
150 /* Collating symbols. */
155 /* Equivalence classes. */
157 int32_t *equiv_classes;
160 /* Range expressions. */
162 uint32_t *range_starts;
163 uint32_t *range_ends;
164 # else /* not _LIBC */
165 wchar_t *range_starts;
167 # endif /* not _LIBC */
169 /* Character classes. */
170 wctype_t *char_classes;
172 /* If this character set is the non-matching list. */
173 unsigned int non_match : 1;
175 /* # of multibyte characters. */
178 /* # of collating symbols. */
181 /* # of equivalence classes. */
184 /* # of range expressions. */
187 /* # of character classes. */
190 #endif /* RE_ENABLE_I18N */
196 unsigned char c; /* for CHARACTER */
197 re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */
198 #ifdef RE_ENABLE_I18N
199 re_charset_t *mbcset; /* for COMPLEX_BRACKET */
200 #endif /* RE_ENABLE_I18N */
201 int idx; /* for BACK_REF */
202 re_context_type ctx_type; /* for ANCHOR */
205 int entity; /* for OP_CONTEXT_NODE, index of the entity */
206 re_node_set *bkref_eclosure;
210 re_token_type_t type : 8;
212 re_token_type_t type;
214 unsigned int constraint : 10; /* context constraint */
215 unsigned int duplicated : 1;
216 #ifdef RE_ENABLE_I18N
217 unsigned int mb_partial : 1;
221 #define IS_EPSILON_NODE(type) \
222 ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS \
223 || (type) == OP_DUP_QUESTION || (type) == ANCHOR \
224 || (type) == OP_OPEN_SUBEXP || (type) == OP_CLOSE_SUBEXP)
226 #define ACCEPT_MB_NODE(type) \
227 ((type) == COMPLEX_BRACKET || (type) == OP_PERIOD)
231 /* Indicate the raw buffer which is the original string passed as an
232 argument of regexec(), re_search(), etc.. */
233 const unsigned char *raw_mbs;
234 /* Store the multibyte string. In case of "case insensitive mode" like
235 REG_ICASE, upper cases of the string are stored, otherwise MBS points
236 the same address that RAW_MBS points. */
238 /* Store the case sensitive multibyte string. In case of
239 "case insensitive mode", the original string are stored,
240 otherwise MBS_CASE points the same address that MBS points. */
241 unsigned char *mbs_case;
242 #ifdef RE_ENABLE_I18N
243 /* Store the wide character string which is corresponding to MBS. */
247 /* Index in RAW_MBS. Each character mbs[i] corresponds to
248 raw_mbs[raw_mbs_idx + i]. */
250 /* The length of the valid characters in the buffers. */
252 /* The length of the buffers MBS, MBS_CASE, and WCS. */
254 /* The index in MBS, which is updated by re_string_fetch_byte. */
256 /* This is length_of_RAW_MBS - RAW_MBS_IDX. */
258 /* End of the buffer may be shorter than its length in the cases such
259 as re_match_2, re_search_2. Then, we use STOP for end of the buffer
263 /* The context of mbs[0]. We store the context independently, since
264 the context of mbs[0] may be different from raw_mbs[0], which is
265 the beginning of the input string. */
266 unsigned int tip_context;
267 /* The translation passed as a part of an argument of re_compile_pattern. */
268 RE_TRANSLATE_TYPE trans;
269 /* 1 if REG_ICASE. */
270 unsigned int icase : 1;
272 typedef struct re_string_t re_string_t;
273 /* In case of REG_ICASE, we allocate the buffer dynamically for mbs. */
274 #define MBS_ALLOCATED(pstr) (pstr->icase)
275 /* In case that we need translation, we allocate the buffer dynamically
276 for mbs_case. Note that mbs == mbs_case if not REG_ICASE. */
277 #define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)
280 static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str,
281 int len, int init_len,
282 RE_TRANSLATE_TYPE trans, int icase);
283 static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str,
284 int len, RE_TRANSLATE_TYPE trans,
286 static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
287 int eflags, int newline);
288 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
290 #ifdef RE_ENABLE_I18N
291 static void build_wcs_buffer (re_string_t *pstr);
292 static void build_wcs_upper_buffer (re_string_t *pstr);
293 #endif /* RE_ENABLE_I18N */
294 static void build_upper_buffer (re_string_t *pstr);
295 static void re_string_translate_buffer (re_string_t *pstr);
296 static void re_string_destruct (re_string_t *pstr);
297 #ifdef RE_ENABLE_I18N
298 static int re_string_elem_size_at (const re_string_t *pstr, int idx);
299 static inline int re_string_char_size_at (const re_string_t *pstr, int idx);
300 static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx);
301 #endif /* RE_ENABLE_I18N */
302 static unsigned int re_string_context_at (const re_string_t *input, int idx,
303 int eflags, int newline_anchor);
304 #define re_string_peek_byte(pstr, offset) \
305 ((pstr)->mbs[(pstr)->cur_idx + offset])
306 #define re_string_peek_byte_case(pstr, offset) \
307 ((pstr)->mbs_case[(pstr)->cur_idx + offset])
308 #define re_string_fetch_byte(pstr) \
309 ((pstr)->mbs[(pstr)->cur_idx++])
310 #define re_string_fetch_byte_case(pstr) \
311 ((pstr)->mbs_case[(pstr)->cur_idx++])
312 #define re_string_first_byte(pstr, idx) \
313 ((idx) == (pstr)->len || (pstr)->wcs[idx] != WEOF)
314 #define re_string_is_single_byte_char(pstr, idx) \
315 ((pstr)->wcs[idx] != WEOF && ((pstr)->len == (idx) \
316 || (pstr)->wcs[(idx) + 1] != WEOF))
317 #define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
318 #define re_string_cur_idx(pstr) ((pstr)->cur_idx)
319 #define re_string_get_buffer(pstr) ((pstr)->mbs)
320 #define re_string_length(pstr) ((pstr)->len)
321 #define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
322 #define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
323 #define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
325 #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
326 #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
327 #define re_free(p) free (p)
331 struct bin_tree_t *parent;
332 struct bin_tree_t *left;
333 struct bin_tree_t *right;
335 /* `node_idx' is the index in dfa->nodes, if `type' == 0.
336 Otherwise `type' indicate the type of this node. */
337 re_token_type_t type;
342 re_node_set eclosure;
344 typedef struct bin_tree_t bin_tree_t;
347 #define CONTEXT_WORD 1
348 #define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
349 #define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
350 #define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
352 #define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
353 #define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
354 #define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
355 #define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
356 #define IS_ORDINARY_CONTEXT(c) ((c) == 0)
358 #define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
359 #define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
361 #define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
362 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
363 || ((constraint & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
364 || ((constraint & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
365 || ((constraint & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))
367 #define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
368 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
369 || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
370 || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
371 || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
377 re_node_set *entrance_nodes;
378 struct re_dfastate_t **trtable;
379 struct re_dfastate_t **trtable_search;
380 /* If this state is a special state.
381 A state is a special state if the state is the halt state, or
383 unsigned int context : 2;
384 unsigned int halt : 1;
385 /* If this state can accept `multi byte'.
386 Note that we refer to multibyte characters, and multi character
387 collating elements as `multi byte'. */
388 unsigned int accept_mb : 1;
389 /* If this state has backreference node(s). */
390 unsigned int has_backref : 1;
391 unsigned int has_constraint : 1;
393 typedef struct re_dfastate_t re_dfastate_t;
397 /* start <= node < end */
402 struct re_state_table_entry
406 re_dfastate_t **array;
409 struct re_backref_cache_entry
420 /* EFLAGS of the argument of regexec. */
422 /* Where the matching ends. */
425 /* The string object corresponding to the input string. */
427 /* The state log used by the matcher. */
428 re_dfastate_t **state_log;
430 /* Back reference cache. */
433 struct re_backref_cache_entry *bkref_ents;
435 } re_match_context_t;
442 re_dfastate_t **sifted_states;
443 re_dfastate_t **limited_states;
452 struct re_fail_stack_ent_t
457 re_node_set eps_via_nodes;
460 struct re_fail_stack_t
464 struct re_fail_stack_ent_t *stack;
469 re_bitset_ptr_t word_char;
471 /* number of subexpressions `re_nsub' is in regex_t. */
473 re_subexp_t *subexps;
478 bin_tree_t *str_tree;
482 re_node_set *eclosures;
483 re_node_set *inveclosures;
484 struct re_state_table_entry *state_table;
485 unsigned int state_hash_mask;
486 re_dfastate_t *init_state;
487 re_dfastate_t *init_state_word;
488 re_dfastate_t *init_state_nl;
489 re_dfastate_t *init_state_begbuf;
492 int nbackref; /* The number of backreference in this dfa. */
493 /* If this dfa has "multibyte node", which is a backreference or
494 a node which can accept multibyte character or multi character
495 collating element. */
499 unsigned int has_plural_match : 1;
500 unsigned int has_mb_node : 1;
502 typedef struct re_dfa_t re_dfa_t;
504 static reg_errcode_t re_node_set_alloc (re_node_set *set, int size);
505 static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem);
506 static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1,
508 #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
509 static reg_errcode_t re_node_set_init_copy (re_node_set *dest,
510 const re_node_set *src);
511 static reg_errcode_t re_node_set_add_intersect (re_node_set *dest,
512 const re_node_set *src1,
513 const re_node_set *src2);
514 static reg_errcode_t re_node_set_init_union (re_node_set *dest,
515 const re_node_set *src1,
516 const re_node_set *src2);
517 static reg_errcode_t re_node_set_merge (re_node_set *dest,
518 const re_node_set *src);
519 static int re_node_set_insert (re_node_set *set, int elem);
520 static int re_node_set_compare (const re_node_set *set1,
521 const re_node_set *set2);
522 static int re_node_set_contains (const re_node_set *set, int elem);
523 static void re_node_set_remove_at (re_node_set *set, int idx);
524 #define re_node_set_empty(p) ((p)->nelem = 0)
525 #define re_node_set_free(set) re_free ((set)->elems)
526 static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token, int mode);
527 static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa,
528 const re_node_set *nodes);
529 static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err,
531 const re_node_set *nodes,
532 unsigned int context);
546 bracket_elem_type type;
556 /* Inline functions for bitset operation. */
562 for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
563 set[bitset_i] = ~set[bitset_i];
567 bitset_merge (dest, src)
572 for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
573 dest[bitset_i] |= src[bitset_i];
578 bitset_not_merge (dest, src)
583 for (i = 0; i < BITSET_UINTS; ++i)
588 #ifdef RE_ENABLE_I18N
589 /* Inline functions for re_string. */
591 re_string_char_size_at (pstr, idx)
592 const re_string_t *pstr;
598 for (byte_idx = 1; idx + byte_idx < pstr->len; ++byte_idx)
599 if (pstr->wcs[idx + byte_idx] != WEOF)
605 re_string_wchar_at (pstr, idx)
606 const re_string_t *pstr;
610 return (wint_t) pstr->mbs[idx];
611 return (wint_t) pstr->wcs[idx];
615 re_string_elem_size_at (pstr, idx)
616 const re_string_t *pstr;
620 const unsigned char *p, *extra;
621 const int32_t *table, *indirect;
623 # include <locale/weight.h>
624 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
628 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
629 extra = (const unsigned char *)
630 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
631 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
632 _NL_COLLATE_INDIRECTMB);
635 return p - pstr->mbs - idx;
641 #endif /* RE_ENABLE_I18N */
643 #endif /* _REGEX_INTERNAL_H */