5 /* Written August 1992 by Mike Haertel. */
8 +# define _GNU_SOURCE 1
16 #include <sys/types.h>
18 #include "mbsupport.h"
23 +#ifdef HAVE_LANGINFO_CODESET
24 +# include <langinfo.h>
27 #define NCHAR (UCHAR_MAX + 1)
30 error (2, 0, _("memory exhausted"));
33 +/* UTF-8 encoding allows some optimizations that we can't otherwise
34 + assume in a multibyte encoding. */
35 +static int using_utf8;
40 +#ifdef HAVE_LANGINFO_CODESET
41 + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
47 /* DFA compiled regexp. */
48 static struct dfa dfa;
51 #endif /* !FGREP_PROGRAM */
54 -/* This function allocate the array which correspond to "buf".
55 - Then this check multibyte string and mark on the positions which
56 - are not single byte character nor the first byte of a multibyte
57 - character. Caller must free the array. */
59 -check_multibyte_string(char const *buf, size_t size)
61 - char *mb_properties = xmalloc(size);
62 - mbstate_t cur_state;
66 - memset(&cur_state, 0, sizeof(mbstate_t));
67 - memset(mb_properties, 0, sizeof(char)*size);
69 - for (i = 0; i < size ;)
72 - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
74 - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
76 - /* An invalid sequence, or a truncated multibyte character.
77 - We treat it as a single byte character. */
80 - else if (match_icase)
82 - if (iswupper((wint_t)wc))
84 - wc = towlower((wint_t)wc);
85 - wcrtomb(buf + i, wc, &cur_state);
88 - mb_properties[i] = mbclen;
92 - return mb_properties;
94 -#endif /* MBS_SUPPORT */
96 #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
101 char const *motif = pattern;
106 syntax_bits |= RE_ICASE;
107 #@@ -303,47 +282,78 @@ hunk6
108 @@ -303,20 +282,9 @@ hunk6
109 struct kwsmatch kwsm;
112 - char *mb_properties = NULL;
113 - if (MB_CUR_MAX > 1)
117 - char *case_buf = xmalloc(size);
118 - memcpy(case_buf, buf, size);
120 - start_ptr = case_buf + (start_ptr - buf);
124 - mb_properties = check_multibyte_string(buf, size);
126 + int mb_cur_max = MB_CUR_MAX;
128 + memset (&mbs, '\0', sizeof (mbstate_t));
129 #endif /* MBS_SUPPORT */
132 @@ -329,21 +282,63 @@ hunk6
135 /* Find a possible match using the KWset matcher. */
136 - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
138 + size_t bytes_left = 0;
139 +#endif /* MBS_SUPPORT */
142 + /* kwsexec doesn't work with match_icase and multibyte input. */
143 + if (match_icase && mb_cur_max > 1)
147 +#endif /* MBS_SUPPORT */
148 + offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
149 if (offset == (size_t) -1)
153 + if (mb_cur_max > 1 && !using_utf8)
155 + bytes_left = offset;
158 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
159 + if (mlen == (size_t) -1 || mlen == 0)
161 + /* Invalid sequence or null byte: treat as single-byte. */
162 + memset (&mbs, '\0', sizeof (mbstate_t));
168 + if (mlen == (size_t) -2)
169 + /* Offset points inside multibyte character:
174 + bytes_left -= mlen;
178 +#endif /* MBS_SUPPORT */
180 /* Narrow down to the line containing the candidate, and
181 run it through DFA. */
182 end = memchr(beg, eol, buflim - beg);
185 - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
186 + if (mb_cur_max > 1 && bytes_left)
189 while (beg > buf && beg[-1] != eol)
191 - if (kwsm.index < kwset_exact_matches)
194 + !(match_icase && mb_cur_max > 1) &&
195 +#endif /* MBS_SUPPORT */
196 + (kwsm.index < kwset_exact_matches))
198 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
200 @@ -351,13 +363,47 @@
203 /* No good fixed strings; start with DFA. */
205 + size_t bytes_left = 0;
206 +#endif /* MBS_SUPPORT */
207 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
208 if (offset == (size_t) -1)
210 /* Narrow down to the line we've found. */
212 + if (mb_cur_max > 1 && !using_utf8)
214 + bytes_left = offset;
217 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
218 + if (mlen == (size_t) -1 || mlen == 0)
220 + /* Invalid sequence or null byte: treat as single-byte. */
221 + memset (&mbs, '\0', sizeof (mbstate_t));
227 + if (mlen == (size_t) -2)
228 + /* Offset points inside multibyte character:
233 + bytes_left -= mlen;
237 +#endif /* MBS_SUPPORT */
239 end = memchr (beg, eol, buflim - beg);
242 + if (mb_cur_max > 1 && bytes_left)
244 +#endif /* MBS_SUPPORT */
245 while (beg > buf && beg[-1] != eol)
248 @@ -475,24 +521,144 @@
253 - if (MB_CUR_MAX > 1)
258 - free(mb_properties);
260 -#endif /* MBS_SUPPORT */
263 #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
266 +static int f_i_multibyte; /* whether we're using the new -Fi MB method */
269 + wchar_t **patterns;
270 + size_t count, maxlen;
271 + unsigned char *match;
275 #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
276 COMPILE_FCT(Fcompile)
278 + int mb_cur_max = MB_CUR_MAX;
279 char const *beg, *lim, *err;
283 + /* Support -F -i for UTF-8 input. */
284 + if (match_icase && mb_cur_max > 1)
287 + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
288 + const char *patternend = pattern;
290 + kwset_t fimb_kwset = NULL;
291 + char *starts = NULL;
292 + wchar_t *wcbeg, *wclim;
293 + size_t allocated = 0;
295 + memset (&mbs, '\0', sizeof (mbs));
296 +# ifdef __GNU_LIBRARY__
297 + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
298 + if (patternend != pattern + size)
299 + wcsize = (size_t) -1;
302 + char *patterncopy = xmalloc (size + 1);
304 + memcpy (patterncopy, pattern, size);
305 + patterncopy[size] = '\0';
306 + patternend = patterncopy;
307 + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
308 + if (patternend != patterncopy + size)
309 + wcsize = (size_t) -1;
310 + free (patterncopy);
313 + if (wcsize + 2 <= 2)
319 + kwsfree (fimb_kwset);
320 + free (Fimb.patterns);
321 + Fimb.patterns = NULL;
325 + if (!(fimb_kwset = kwsalloc (NULL)))
326 + error (2, 0, _("memory exhausted"));
328 + starts = xmalloc (mb_cur_max * 3);
335 + if (Fimb.count >= allocated)
337 + if (allocated == 0)
341 + Fimb.patterns = xrealloc (Fimb.patterns,
342 + sizeof (wchar_t *) * allocated);
344 + Fimb.patterns[Fimb.count++] = wcbeg;
345 + for (wclim = wcbeg;
346 + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
347 + *wclim = towlower (*wclim);
349 + wclen = wclim - wcbeg;
350 + if (wclen > Fimb.maxlen)
351 + Fimb.maxlen = wclen;
356 + if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
360 + for (i = 0; i < (1 << wclen); i++)
365 + for (j = 0; j < wclen; ++j)
367 + wchar_t wc = wcbeg[j];
370 + wc = towupper (wc);
371 + if (wc == wcbeg[j])
374 + k = wctomb (p, wc);
379 + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
382 + if (wclim < wcpattern + wcsize)
386 + while (wcbeg < wcpattern + wcsize);
388 + kwset = fimb_kwset;
390 + Fimb.match = xmalloc (Fimb.count);
391 + if ((err = kwsprep (kwset)) != 0)
396 +#endif /* MBS_SUPPORT */
408 +Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
410 + size_t len, letter, i;
416 + assert (match_icase && f_i_multibyte == 1);
417 + assert (MB_CUR_MAX > 1);
419 + memset (&mbs, '\0', sizeof (mbs));
420 + memset (Fimb.match, '\1', Fimb.count);
423 + while (patterns_left && len <= size)
430 + c = mbrtowc (&wc, buf + len, size - len, &mbs);
434 + wc = towlower (wc);
442 + for (i = 0; i < Fimb.count; i++)
446 + if (Fimb.patterns[i][letter] == L'\0')
448 + /* Found a match. */
450 + if (!exact && !match_words)
454 + /* For -w or exact look for longest match. */
456 + Fimb.match[i] = '\0';
461 + if (Fimb.patterns[i][letter] == wc)
464 + Fimb.match[i] = '\0';
474 +#endif /* MBS_SUPPORT */
476 EXECUTE_FCT(Fexecute)
478 register char const *beg, *try, *end;
479 @@ -519,69 +755,256 @@
480 struct kwsmatch kwsmatch;
483 - char *mb_properties = NULL;
484 - if (MB_CUR_MAX > 1)
488 - char *case_buf = xmalloc(size);
489 - memcpy(case_buf, buf, size);
491 - start_ptr = case_buf + (start_ptr - buf);
494 - mb_properties = check_multibyte_string(buf, size);
496 + int mb_cur_max = MB_CUR_MAX;
498 + memset (&mbs, '\0', sizeof (mbstate_t));
499 + const char *last_char = NULL;
500 #endif /* MBS_SUPPORT */
502 for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
504 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
505 if (offset == (size_t) -1)
509 - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
510 - continue; /* It is a part of multibyte character. */
511 + if (mb_cur_max > 1 && !using_utf8)
513 + size_t bytes_left = offset;
516 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
519 + if (mlen == (size_t) -1 || mlen == 0)
521 + /* Invalid sequence or null byte: treat as single-byte. */
522 + memset (&mbs, '\0', sizeof (mbstate_t));
528 + if (mlen == (size_t) -2)
529 + /* Offset points inside multibyte character: no good. */
533 + bytes_left -= mlen;
540 #endif /* MBS_SUPPORT */
543 + /* For f_i_multibyte, the string at beg now matches the first 3 chars
544 + of one of the search strings (fewer if some search strings are shorter).
545 + See if this is a real match. */
547 + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL))
549 +#endif /* MBS_SUPPORT */
550 len = kwsmatch.size[0];
551 if (start_ptr && !match_words)
552 goto success_in_beg_and_len;
555 if (beg > buf && beg[-1] != eol)
558 if (beg + len < buf + size && beg[len] != eol)
563 else if (match_words)
564 - for (try = beg; len; )
566 - if (try > buf && WCHAR((unsigned char) try[-1]))
568 - if (try + len < buf + size && WCHAR((unsigned char) try[len]))
570 - offset = kwsexec (kwset, beg, --len, &kwsmatch);
571 - if (offset == (size_t) -1)
573 - try = beg + offset;
574 - len = kwsmatch.size[0];
576 - else if (!start_ptr)
579 - goto success_in_beg_and_len;
583 - } /* for (beg in buf) */
587 + int word_match = 0;
591 + if (mb_cur_max > 1)
601 + && (unsigned char) *s >= 0x80
602 + && (unsigned char) *s <= 0xbf)
607 + mr = mbtowc (&pwc, s, beg - s);
609 + memset (&mbs, '\0', sizeof (mbstate_t));
610 + else if ((iswalnum (pwc) || pwc == L'_')
611 + && mr == (int) (beg - s))
615 +#endif /* MBS_SUPPORT */
616 + if (WCHAR ((unsigned char) beg[-1]))
620 + if (mb_cur_max > 1)
628 + mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
631 + memset (&mbs, '\0', sizeof (mbstate_t));
634 + else if (!iswalnum (nwc) && nwc != L'_')
638 +#endif /* MBS_SUPPORT */
639 + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
643 + if (start_ptr == NULL)
644 + /* Return the whole line now that we know there's a word match. */
647 + /* Return just this word match. */
654 + /* Try a shorter length anchored at the same place. */
656 + offset = kwsexec (kwset, beg, len, &kwsmatch);
659 + goto next_char; /* Try a different anchor. */
662 + if (mb_cur_max > 1 && !using_utf8)
664 + size_t bytes_left = offset;
667 + size_t mlen = mbrlen (beg, bytes_left, &mbs);
670 + if (mlen == (size_t) -1 || mlen == 0)
672 + /* Invalid sequence or null byte: treat as single-byte. */
673 + memset (&mbs, '\0', sizeof (mbstate_t));
679 + if (mlen == (size_t) -2)
681 + /* Offset points inside multibyte character:
687 + bytes_left -= mlen;
692 + memset (&mbs, '\0', sizeof (mbstate_t));
693 + goto next_char; /* Try a different anchor. */
697 +#endif /* MBS_SUPPORT */
700 + /* The string at beg now matches the first 3 chars of one of
701 + the search strings (fewer if some search strings are
702 + shorter). See if this is a real match. */
704 + && Fimbexec (beg, len - offset, &kwsmatch.size[0],
705 + start_ptr == NULL))
707 +#endif /* MBS_SUPPORT */
708 + len = kwsmatch.size[0];
716 + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
718 + if (mb_cur_max > 1)
722 + unsigned char c = *beg;
739 + size_t l = mbrlen (beg, buf + size - beg, &mbs);
745 + memset (&mbs, '\0', sizeof (mbstate_t));
748 +#endif /* MBS_SUPPORT */
755 + if (mb_cur_max > 1 && !using_utf8)
758 + while (end < buf + size)
760 + size_t mlen = mbrlen (end, buf + size - end, &mbs);
761 + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
763 + memset (&mbs, '\0', sizeof (mbstate_t));
766 + if (mlen == 1 && *end == eol)
773 + #endif /* MBS_SUPPORT */
774 end = memchr (beg + len, eol, (buf + size) - (beg + len));
776 while (buf < beg && beg[-1] != eol)
777 @@ -591,15 +1016,6 @@
782 - if (MB_CUR_MAX > 1)
787 - free(mb_properties);
789 -#endif /* MBS_SUPPORT */
792 #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */