]>
Commit | Line | Data |
---|---|---|
e0c0fa61 AM |
1 | --- src/search.c.orig |
2 | +++ src/search.c | |
3 | @@ -18,10 +18,15 @@ | |
4 | ||
5 | /* Written August 1992 by Mike Haertel. */ | |
6 | ||
7 | +#ifndef _GNU_SOURCE | |
8 | +# define _GNU_SOURCE 1 | |
9 | +#endif | |
10 | #ifdef HAVE_CONFIG_H | |
11 | # include <config.h> | |
12 | #endif | |
13 | ||
14 | +#include <assert.h> | |
15 | + | |
16 | #include <sys/types.h> | |
17 | ||
18 | #include "mbsupport.h" | |
19 | @@ -43,6 +48,9 @@ | |
20 | #ifdef HAVE_LIBPCRE | |
21 | # include <pcre.h> | |
22 | #endif | |
23 | +#ifdef HAVE_LANGINFO_CODESET | |
24 | +# include <langinfo.h> | |
25 | +#endif | |
26 | ||
27 | #define NCHAR (UCHAR_MAX + 1) | |
28 | ||
29 | @@ -68,6 +76,19 @@ | |
30 | error (2, 0, _("memory exhausted")); | |
31 | } | |
32 | ||
33 | +/* Nonzero once check_utf8 () has seen a UTF-8 codeset. UTF-8 allows some | |
34 | + optimizations that we can't otherwise assume in a multibyte encoding. */ | |
35 | +static int using_utf8; | |
36 | +/* Set using_utf8 when nl_langinfo (CODESET) is exactly "UTF-8"; does nothing without HAVE_LANGINFO_CODESET. */ | |
37 | +void | |
38 | +check_utf8 (void) | |
39 | +{ | |
40 | +#ifdef HAVE_LANGINFO_CODESET | |
41 | + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) | |
42 | + using_utf8 = 1; | |
43 | +#endif | |
44 | +} | |
45 | + | |
46 | #ifndef FGREP_PROGRAM | |
47 | /* DFA compiled regexp. */ | |
48 | static struct dfa dfa; | |
49 | @@ -134,49 +155,6 @@ | |
50 | } | |
51 | #endif /* !FGREP_PROGRAM */ | |
52 | ||
53 | -#ifdef MBS_SUPPORT | |
54 | -/* This function allocate the array which correspond to "buf". | |
55 | - Then this check multibyte string and mark on the positions which | |
56 | - are not single byte character nor the first byte of a multibyte | |
57 | - character. Caller must free the array. */ | |
58 | -static char* | |
59 | -check_multibyte_string(char const *buf, size_t size) | |
60 | -{ | |
61 | - char *mb_properties = xmalloc(size); | |
62 | - mbstate_t cur_state; | |
63 | - wchar_t wc; | |
64 | - int i; | |
65 | - | |
66 | - memset(&cur_state, 0, sizeof(mbstate_t)); | |
67 | - memset(mb_properties, 0, sizeof(char)*size); | |
68 | - | |
69 | - for (i = 0; i < size ;) | |
70 | - { | |
71 | - size_t mbclen; | |
72 | - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); | |
73 | - | |
74 | - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) | |
75 | - { | |
76 | - /* An invalid sequence, or a truncated multibyte character. | |
77 | - We treat it as a single byte character. */ | |
78 | - mbclen = 1; | |
79 | - } | |
80 | - else if (match_icase) | |
81 | - { | |
82 | - if (iswupper((wint_t)wc)) | |
83 | - { | |
84 | - wc = towlower((wint_t)wc); | |
85 | - wcrtomb(buf + i, wc, &cur_state); | |
86 | - } | |
87 | - } | |
88 | - mb_properties[i] = mbclen; | |
89 | - i += mbclen; | |
90 | - } | |
91 | - | |
92 | - return mb_properties; | |
93 | -} | |
94 | -#endif /* MBS_SUPPORT */ | |
95 | - | |
96 | #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) | |
97 | #ifdef EGREP_PROGRAM | |
98 | COMPILE_FCT(Ecompile) | |
99 | @@ -193,6 +171,7 @@ | |
100 | size_t total = size; | |
101 | char const *motif = pattern; | |
102 | ||
103 | + check_utf8 (); | |
104 | #if 0 | |
105 | if (match_icase) | |
106 | syntax_bits |= RE_ICASE; | |
107 | #@@ -303,47 +282,78 @@ hunk6 | |
108 | @@ -303,20 +282,9 @@ hunk6 | |
109 | struct kwsmatch kwsm; | |
110 | size_t i, ret_val; | |
111 | #ifdef MBS_SUPPORT | |
112 | - char *mb_properties = NULL; | |
113 | - if (MB_CUR_MAX > 1) | |
114 | - { | |
115 | - if (match_icase) | |
116 | - { | |
117 | - char *case_buf = xmalloc(size); | |
118 | - memcpy(case_buf, buf, size); | |
119 | - if (start_ptr) | |
120 | - start_ptr = case_buf + (start_ptr - buf); | |
121 | - buf = case_buf; | |
122 | - } | |
123 | - if (kwset) | |
124 | - mb_properties = check_multibyte_string(buf, size); | |
125 | - } | |
126 | + int mb_cur_max = MB_CUR_MAX; | |
127 | + mbstate_t mbs; | |
128 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
129 | #endif /* MBS_SUPPORT */ | |
130 | ||
131 | buflim = buf + size; | |
132 | @@ -329,21 +282,63 @@ hunk6 | |
133 | if (kwset) | |
134 | { | |
135 | /* Find a possible match using the KWset matcher. */ | |
136 | - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | |
137 | +#ifdef MBS_SUPPORT | |
138 | + size_t bytes_left = 0; | |
139 | +#endif /* MBS_SUPPORT */ | |
140 | + size_t offset; | |
141 | +#ifdef MBS_SUPPORT | |
142 | + /* kwsexec doesn't work with match_icase and multibyte input. */ | |
143 | + if (match_icase && mb_cur_max > 1) | |
144 | + /* Avoid kwset */ | |
145 | + offset = 0; | |
146 | + else | |
147 | +#endif /* MBS_SUPPORT */ | |
148 | + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); | |
149 | if (offset == (size_t) -1) | |
150 | - goto failure; | |
151 | + return (size_t)-1; | |
152 | +#ifdef MBS_SUPPORT | |
153 | + if (mb_cur_max > 1 && !using_utf8) | |
154 | + { | |
155 | + bytes_left = offset; | |
156 | + while (bytes_left) | |
157 | + { | |
158 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
159 | + if (mlen == (size_t) -1 || mlen == 0) | |
160 | + { | |
161 | + /* Invalid sequence or null character: treat as single-byte. */ | |
162 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
163 | + beg++; | |
164 | + bytes_left--; | |
165 | + continue; | |
166 | + } | |
167 | + | |
168 | + if (mlen == (size_t) -2) | |
169 | + /* Offset points inside multibyte character: | |
170 | + * no good. */ | |
171 | + break; | |
172 | + | |
173 | + beg += mlen; | |
174 | + bytes_left -= mlen; | |
175 | + } | |
176 | + } | |
177 | + else | |
178 | +#endif /* MBS_SUPPORT */ | |
179 | beg += offset; | |
180 | /* Narrow down to the line containing the candidate, and | |
181 | run it through DFA. */ | |
182 | end = memchr(beg, eol, buflim - beg); | |
183 | end++; | |
184 | #ifdef MBS_SUPPORT | |
185 | - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) | |
186 | + if (mb_cur_max > 1 && bytes_left) | |
187 | continue; | |
188 | #endif | |
189 | while (beg > buf && beg[-1] != eol) | |
190 | --beg; | |
191 | - if (kwsm.index < kwset_exact_matches) | |
192 | + if ( | |
193 | +#ifdef MBS_SUPPORT | |
194 | + !(match_icase && mb_cur_max > 1) && | |
195 | +#endif /* MBS_SUPPORT */ | |
196 | + (kwsm.index < kwset_exact_matches)) | |
197 | goto success; | |
198 | if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) | |
199 | continue; | |
200 | @@ -351,13 +363,47 @@ | |
201 | else | |
202 | { | |
203 | /* No good fixed strings; start with DFA. */ | |
204 | +#ifdef MBS_SUPPORT | |
205 | + size_t bytes_left = 0; | |
206 | +#endif /* MBS_SUPPORT */ | |
207 | size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); | |
208 | if (offset == (size_t) -1) | |
209 | break; | |
210 | /* Narrow down to the line we've found. */ | |
211 | +#ifdef MBS_SUPPORT | |
212 | + if (mb_cur_max > 1 && !using_utf8) | |
213 | + { | |
214 | + bytes_left = offset; | |
215 | + while (bytes_left) | |
216 | + { | |
217 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
218 | + if (mlen == (size_t) -1 || mlen == 0) | |
219 | + { | |
220 | + /* Invalid sequence or null character: treat as single-byte. */ | |
221 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
222 | + beg++; | |
223 | + bytes_left--; | |
224 | + continue; | |
225 | + } | |
226 | + | |
227 | + if (mlen == (size_t) -2) | |
228 | + /* Offset points inside multibyte character: | |
229 | + * no good. */ | |
230 | + break; | |
231 | + | |
232 | + beg += mlen; | |
233 | + bytes_left -= mlen; | |
234 | + } | |
235 | + } | |
236 | + else | |
237 | +#endif /* MBS_SUPPORT */ | |
238 | beg += offset; | |
239 | end = memchr (beg, eol, buflim - beg); | |
240 | end++; | |
241 | +#ifdef MBS_SUPPORT | |
242 | + if (mb_cur_max > 1 && bytes_left) | |
243 | + continue; | |
244 | +#endif /* MBS_SUPPORT */ | |
245 | while (beg > buf && beg[-1] != eol) | |
246 | --beg; | |
247 | } | |
248 | @@ -475,24 +521,144 @@ | |
249 | *match_size = len; | |
250 | ret_val = beg - buf; | |
251 | out: | |
252 | -#ifdef MBS_SUPPORT | |
253 | - if (MB_CUR_MAX > 1) | |
254 | - { | |
255 | - if (match_icase) | |
256 | - free((char*)buf); | |
257 | - if (mb_properties) | |
258 | - free(mb_properties); | |
259 | - } | |
260 | -#endif /* MBS_SUPPORT */ | |
261 | return ret_val; | |
262 | } | |
263 | #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */ | |
264 | ||
265 | +#ifdef MBS_SUPPORT | |
266 | +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ | |
267 | +static struct | |
268 | +{ | |
269 | + wchar_t **patterns; /* lower-cased, L'\0'-terminated wide search strings (set by Fcompile) */ | |
270 | + size_t count, maxlen; /* number of patterns; longest pattern length in wide characters */ | |
271 | + unsigned char *match; /* COUNT flags: nonzero while pattern i is still a candidate in Fimbexec */ | |
272 | +} Fimb; | |
273 | +#endif | |
274 | + | |
275 | #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) | |
276 | COMPILE_FCT(Fcompile) | |
277 | { | |
278 | + int mb_cur_max = MB_CUR_MAX; | |
279 | char const *beg, *lim, *err; | |
280 | ||
281 | + check_utf8 (); | |
282 | +#ifdef MBS_SUPPORT | |
283 | + /* Support -F -i for multibyte (e.g. UTF-8) input. */ | |
284 | + if (match_icase && mb_cur_max > 1) | |
285 | + { | |
286 | + mbstate_t mbs; | |
287 | + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); | |
288 | + const char *patternend = pattern; | |
289 | + size_t wcsize; | |
290 | + kwset_t fimb_kwset = NULL; | |
291 | + char *starts = NULL; | |
292 | + wchar_t *wcbeg, *wclim; | |
293 | + size_t allocated = 0; | |
294 | + | |
295 | + memset (&mbs, '\0', sizeof (mbs)); | |
296 | +# ifdef __GNU_LIBRARY__ | |
297 | + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); | |
298 | + if (patternend != pattern + size) | |
299 | + wcsize = (size_t) -1; | |
300 | +# else | |
301 | + { | |
302 | + char *patterncopy = xmalloc (size + 1); | |
303 | + | |
304 | + memcpy (patterncopy, pattern, size); | |
305 | + patterncopy[size] = '\0'; | |
306 | + patternend = patterncopy; | |
307 | + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); | |
308 | + if (patternend != patterncopy + size) | |
309 | + wcsize = (size_t) -1; | |
310 | + free (patterncopy); | |
311 | + } | |
312 | +# endif | |
313 | + if (wcsize + 2 <= 2) | |
314 | + { | |
315 | +fimb_fail: | |
316 | + free (wcpattern); | |
317 | + free (starts); | |
318 | + if (fimb_kwset) | |
319 | + kwsfree (fimb_kwset); | |
320 | + free (Fimb.patterns); | |
321 | + Fimb.patterns = NULL; | |
322 | + } | |
323 | + else | |
324 | + { | |
325 | + if (!(fimb_kwset = kwsalloc (NULL))) | |
326 | + error (2, 0, _("memory exhausted")); | |
327 | + | |
328 | + starts = xmalloc (mb_cur_max * 3); | |
329 | + wcbeg = wcpattern; | |
330 | + do | |
331 | + { | |
332 | + int i; | |
333 | + size_t wclen; | |
334 | + | |
335 | + if (Fimb.count >= allocated) | |
336 | + { | |
337 | + if (allocated == 0) | |
338 | + allocated = 128; | |
339 | + else | |
340 | + allocated *= 2; | |
341 | + Fimb.patterns = xrealloc (Fimb.patterns, | |
342 | + sizeof (wchar_t *) * allocated); | |
343 | + } | |
344 | + Fimb.patterns[Fimb.count++] = wcbeg; | |
345 | + for (wclim = wcbeg; | |
346 | + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) | |
347 | + *wclim = towlower (*wclim); | |
348 | + *wclim = L'\0'; | |
349 | + wclen = wclim - wcbeg; | |
350 | + if (wclen > Fimb.maxlen) | |
351 | + Fimb.maxlen = wclen; | |
352 | + if (wclen > 3) | |
353 | + wclen = 3; | |
354 | + if (wclen == 0) | |
355 | + { | |
356 | + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) | |
357 | + error (2, 0, err); | |
358 | + } | |
359 | + else | |
360 | + for (i = 0; i < (1 << wclen); i++) | |
361 | + { | |
362 | + char *p = starts; | |
363 | + int j, k; | |
364 | + | |
365 | + for (j = 0; j < wclen; ++j) | |
366 | + { | |
367 | + wchar_t wc = wcbeg[j]; | |
368 | + if (i & (1 << j)) | |
369 | + { | |
370 | + wc = towupper (wc); | |
371 | + if (wc == wcbeg[j]) | |
372 | + continue; | |
373 | + } | |
374 | + k = wctomb (p, wc); | |
375 | + if (k <= 0) | |
376 | + goto fimb_fail; | |
377 | + p += k; | |
378 | + } | |
379 | + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) | |
380 | + error (2, 0, err); | |
381 | + } | |
382 | + if (wclim < wcpattern + wcsize) | |
383 | + ++wclim; | |
384 | + wcbeg = wclim; | |
385 | + } | |
386 | + while (wcbeg < wcpattern + wcsize); | |
387 | + f_i_multibyte = 1; | |
388 | + kwset = fimb_kwset; | |
389 | + free (starts); | |
390 | + Fimb.match = xmalloc (Fimb.count); | |
391 | + if ((err = kwsprep (kwset)) != 0) | |
392 | + error (2, 0, err); | |
393 | + return; | |
394 | + } | |
395 | + } | |
396 | +#endif /* MBS_SUPPORT */ | |
397 | + | |
398 | + | |
399 | kwsinit (); | |
400 | beg = pattern; | |
401 | do | |
402 | @@ -511,6 +677,76 @@ | |
403 | error (2, 0, err); | |
404 | } | |
405 | ||
406 | +#ifdef MBS_SUPPORT | |
407 | +static int | |
408 | +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) | |
409 | +{ | |
410 | + size_t len, letter, i; /* LEN counts bytes of BUF consumed, LETTER counts wide characters */ | |
411 | + int ret = -1; | |
412 | + mbstate_t mbs; | |
413 | + wchar_t wc; | |
414 | + int patterns_left; | |
415 | + /* Return 0 and store the match length in bytes in *PLEN if a pattern in Fimb matches, ignoring case, at BUF; else -1. */ | |
416 | + assert (match_icase && f_i_multibyte == 1); | |
417 | + assert (MB_CUR_MAX > 1); | |
418 | + /* Every pattern starts out as a candidate. */ | |
419 | + memset (&mbs, '\0', sizeof (mbs)); | |
420 | + memset (Fimb.match, '\1', Fimb.count); | |
421 | + letter = len = 0; | |
422 | + patterns_left = 1; | |
423 | + while (patterns_left && len <= size) | |
424 | + { | |
425 | + size_t c; | |
426 | + | |
427 | + patterns_left = 0; | |
428 | + if (len < size) | |
429 | + { | |
430 | + c = mbrtowc (&wc, buf + len, size - len, &mbs); | |
431 | + if (c + 2 <= 2) /* 0, (size_t) -1 or (size_t) -2: undecodable here */ | |
432 | + c = 1, wc = L'\0'; /* matches no pattern letter, but patterns ending here still count */ | |
433 | + else | |
434 | + wc = towlower (wc); | |
435 | + } | |
436 | + else | |
437 | + { | |
438 | + c = 1; | |
439 | + wc = L'\0'; | |
440 | + } | |
441 | + /* Record candidates that end at this letter; drop those that disagree with WC. */ | |
442 | + for (i = 0; i < Fimb.count; i++) | |
443 | + { | |
444 | + if (Fimb.match[i]) | |
445 | + { | |
446 | + if (Fimb.patterns[i][letter] == L'\0') | |
447 | + { | |
448 | + /* Found a match. */ | |
449 | + *plen = len; | |
450 | + if (!exact && !match_words) | |
451 | + return 0; | |
452 | + else | |
453 | + { | |
454 | + /* For -w or exact look for longest match. */ | |
455 | + ret = 0; | |
456 | + Fimb.match[i] = '\0'; | |
457 | + continue; | |
458 | + } | |
459 | + } | |
460 | + | |
461 | + if (Fimb.patterns[i][letter] == wc) | |
462 | + patterns_left = 1; | |
463 | + else | |
464 | + Fimb.match[i] = '\0'; | |
465 | + } | |
466 | + } | |
467 | + | |
468 | + len += c; | |
469 | + letter++; | |
470 | + } | |
471 | + | |
472 | + return ret; | |
473 | +} | |
474 | +#endif /* MBS_SUPPORT */ | |
475 | + | |
476 | EXECUTE_FCT(Fexecute) | |
477 | { | |
478 | register char const *beg, *try, *end; | |
479 | @@ -519,69 +755,256 @@ | |
480 | struct kwsmatch kwsmatch; | |
481 | size_t ret_val; | |
482 | #ifdef MBS_SUPPORT | |
483 | - char *mb_properties = NULL; | |
484 | - if (MB_CUR_MAX > 1) | |
485 | - { | |
486 | - if (match_icase) | |
487 | - { | |
488 | - char *case_buf = xmalloc(size); | |
489 | - memcpy(case_buf, buf, size); | |
490 | - if (start_ptr) | |
491 | - start_ptr = case_buf + (start_ptr - buf); | |
492 | - buf = case_buf; | |
493 | - } | |
494 | - mb_properties = check_multibyte_string(buf, size); | |
495 | - } | |
496 | + int mb_cur_max = MB_CUR_MAX; | |
497 | + mbstate_t mbs; | |
498 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
499 | + const char *last_char = NULL; | |
500 | #endif /* MBS_SUPPORT */ | |
501 | ||
502 | for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) | |
503 | { | |
504 | size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); | |
505 | if (offset == (size_t) -1) | |
506 | - goto failure; | |
507 | + return offset; | |
508 | #ifdef MBS_SUPPORT | |
509 | - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) | |
510 | - continue; /* It is a part of multibyte character. */ | |
511 | + if (mb_cur_max > 1 && !using_utf8) | |
512 | + { | |
513 | + size_t bytes_left = offset; | |
514 | + while (bytes_left) | |
515 | + { | |
516 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
517 | + | |
518 | + last_char = beg; | |
519 | + if (mlen == (size_t) -1 || mlen == 0) | |
520 | + { | |
521 | + /* Invalid sequence or null character: treat as single-byte. */ | |
522 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
523 | + beg++; | |
524 | + bytes_left--; | |
525 | + continue; | |
526 | + } | |
527 | + | |
528 | + if (mlen == (size_t) -2) | |
529 | + /* Offset points inside multibyte character: no good. */ | |
530 | + break; | |
531 | + | |
532 | + beg += mlen; | |
533 | + bytes_left -= mlen; | |
534 | + } | |
535 | + | |
536 | + if (bytes_left) | |
537 | + continue; | |
538 | + } | |
539 | + else | |
540 | #endif /* MBS_SUPPORT */ | |
541 | beg += offset; | |
542 | +#ifdef MBS_SUPPORT | |
543 | + /* For f_i_multibyte, the string at beg now matches first 3 chars of | |
544 | + one of the search strings (less if there are shorter search strings). | |
545 | + See if this is a real match. */ | |
546 | + if (f_i_multibyte | |
547 | + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL)) | |
548 | + goto next_char; | |
549 | +#endif /* MBS_SUPPORT */ | |
550 | len = kwsmatch.size[0]; | |
551 | if (start_ptr && !match_words) | |
552 | goto success_in_beg_and_len; | |
553 | if (match_lines) | |
554 | { | |
555 | if (beg > buf && beg[-1] != eol) | |
556 | - continue; | |
557 | + goto next_char; | |
558 | if (beg + len < buf + size && beg[len] != eol) | |
559 | - continue; | |
560 | + goto next_char; | |
561 | goto success; | |
562 | } | |
563 | else if (match_words) | |
564 | - for (try = beg; len; ) | |
565 | - { | |
566 | - if (try > buf && WCHAR((unsigned char) try[-1])) | |
567 | - break; | |
568 | - if (try + len < buf + size && WCHAR((unsigned char) try[len])) | |
569 | - { | |
570 | - offset = kwsexec (kwset, beg, --len, &kwsmatch); | |
571 | - if (offset == (size_t) -1) | |
572 | - break; | |
573 | - try = beg + offset; | |
574 | - len = kwsmatch.size[0]; | |
575 | - } | |
576 | - else if (!start_ptr) | |
577 | - goto success; | |
578 | - else | |
579 | - goto success_in_beg_and_len; | |
580 | - } /* for (try) */ | |
581 | - else | |
582 | - goto success; | |
583 | - } /* for (beg in buf) */ | |
584 | + { | |
585 | + while (len) | |
586 | + { | |
587 | + int word_match = 0; | |
588 | + if (beg > buf) | |
589 | + { | |
590 | +#ifdef MBS_SUPPORT | |
591 | + if (mb_cur_max > 1) | |
592 | + { | |
593 | + const char *s; | |
594 | + int mr; | |
595 | + wchar_t pwc; | |
596 | + | |
597 | + if (using_utf8) | |
598 | + { | |
599 | + s = beg - 1; | |
600 | + while (s > buf | |
601 | + && (unsigned char) *s >= 0x80 | |
602 | + && (unsigned char) *s <= 0xbf) | |
603 | + --s; | |
604 | + } | |
605 | + else | |
606 | + s = last_char; | |
607 | + mr = mbtowc (&pwc, s, beg - s); | |
608 | + if (mr <= 0) | |
609 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
610 | + else if ((iswalnum (pwc) || pwc == L'_') | |
611 | + && mr == (int) (beg - s)) | |
612 | + goto next_char; | |
613 | + } | |
614 | + else | |
615 | +#endif /* MBS_SUPPORT */ | |
616 | + if (WCHAR ((unsigned char) beg[-1])) | |
617 | + goto next_char; | |
618 | + } | |
619 | +#ifdef MBS_SUPPORT | |
620 | + if (mb_cur_max > 1) | |
621 | + { | |
622 | + wchar_t nwc; | |
623 | + int mr; | |
624 | ||
625 | - failure: | |
626 | - ret_val = -1; | |
627 | - goto out; | |
628 | + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); | |
629 | + if (mr <= 0) | |
630 | + { | |
631 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
632 | + word_match = 1; | |
633 | + } | |
634 | + else if (!iswalnum (nwc) && nwc != L'_') | |
635 | + word_match = 1; | |
636 | + } | |
637 | + else | |
638 | +#endif /* MBS_SUPPORT */ | |
639 | + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) | |
640 | + word_match = 1; | |
641 | + if (word_match) | |
642 | + { | |
643 | + if (start_ptr == NULL) | |
644 | + /* Returns the whole line now we know there's a word match. */ | |
645 | + goto success; | |
646 | + else { | |
647 | + /* Returns just this word match. */ | |
648 | + *match_size = len; | |
649 | + return beg - buf; | |
650 | + } | |
651 | + } | |
652 | + if (len > 0) | |
653 | + { | |
654 | + /* Try a shorter length anchored at the same place. */ | |
655 | + --len; | |
656 | + offset = kwsexec (kwset, beg, len, &kwsmatch); | |
657 | + | |
658 | + if (offset == -1) | |
659 | + goto next_char; /* Try a different anchor. */ | |
660 | +#ifdef MBS_SUPPORT | |
661 | + | |
662 | + if (mb_cur_max > 1 && !using_utf8) | |
663 | + { | |
664 | + size_t bytes_left = offset; | |
665 | + while (bytes_left) | |
666 | + { | |
667 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); | |
668 | + | |
669 | + last_char = beg; | |
670 | + if (mlen == (size_t) -1 || mlen == 0) | |
671 | + { | |
672 | + /* Invalid sequence or null character: treat as single-byte. */ | |
673 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
674 | + beg++; | |
675 | + bytes_left--; | |
676 | + continue; | |
677 | + } | |
678 | + | |
679 | + if (mlen == (size_t) -2) | |
680 | + { | |
681 | + /* Offset points inside multibyte character: | |
682 | + * no good. */ | |
683 | + break; | |
684 | + } | |
685 | + | |
686 | + beg += mlen; | |
687 | + bytes_left -= mlen; | |
688 | + } | |
689 | + | |
690 | + if (bytes_left) | |
691 | + { | |
692 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
693 | + goto next_char; /* Try a different anchor. */ | |
694 | + } | |
695 | + } | |
696 | + else | |
697 | +#endif /* MBS_SUPPORT */ | |
698 | + beg += offset; | |
699 | +#ifdef MBS_SUPPORT | |
700 | + /* The string at beg now matches first 3 chars of one of | |
701 | + the search strings (less if there are shorter search | |
702 | + strings). See if this is a real match. */ | |
703 | + if (f_i_multibyte | |
704 | + && Fimbexec (beg, len - offset, &kwsmatch.size[0], | |
705 | + start_ptr == NULL)) | |
706 | + goto next_char; | |
707 | +#endif /* MBS_SUPPORT */ | |
708 | + len = kwsmatch.size[0]; | |
709 | + } | |
710 | + } | |
711 | + } | |
712 | + else | |
713 | + goto success; | |
714 | +next_char:; | |
715 | +#ifdef MBS_SUPPORT | |
716 | + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled | |
717 | + by ++beg above. */ | |
718 | + if (mb_cur_max > 1) | |
719 | + { | |
720 | + if (using_utf8) | |
721 | + { | |
722 | + unsigned char c = *beg; | |
723 | + if (c >= 0xc2) | |
724 | + { | |
725 | + if (c < 0xe0) | |
726 | + ++beg; | |
727 | + else if (c < 0xf0) | |
728 | + beg += 2; | |
729 | + else if (c < 0xf8) | |
730 | + beg += 3; | |
731 | + else if (c < 0xfc) | |
732 | + beg += 4; | |
733 | + else if (c < 0xfe) | |
734 | + beg += 5; | |
735 | + } | |
736 | + } | |
737 | + else | |
738 | + { | |
739 | + size_t l = mbrlen (beg, buf + size - beg, &mbs); | |
740 | + | |
741 | + last_char = beg; | |
742 | + if (l + 2 >= 2) | |
743 | + beg += l - 1; | |
744 | + else | |
745 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
746 | + } | |
747 | + } | |
748 | +#endif /* MBS_SUPPORT */ | |
749 | + } | |
750 | + | |
751 | + return -1; | |
752 | ||
753 | success: | |
754 | +#ifdef MBS_SUPPORT | |
755 | + if (mb_cur_max > 1 && !using_utf8) | |
756 | + { | |
757 | + end = beg + len; | |
758 | + while (end < buf + size) | |
759 | + { | |
760 | + size_t mlen = mbrlen (end, buf + size - end, &mbs); | |
761 | + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) | |
762 | + { | |
763 | + memset (&mbs, '\0', sizeof (mbstate_t)); | |
764 | + mlen = 1; | |
765 | + } | |
766 | + if (mlen == 1 && *end == eol) | |
767 | + break; | |
768 | + | |
769 | + end += mlen; | |
770 | + } | |
771 | + } | |
772 | + else | |
773 | + #endif /* MBS_SUPPORT */ | |
774 | end = memchr (beg + len, eol, (buf + size) - (beg + len)); | |
775 | end++; | |
776 | while (buf < beg && beg[-1] != eol) | |
777 | @@ -591,15 +1016,6 @@ | |
778 | *match_size = len; | |
779 | ret_val = beg - buf; | |
780 | out: | |
781 | -#ifdef MBS_SUPPORT | |
782 | - if (MB_CUR_MAX > 1) | |
783 | - { | |
784 | - if (match_icase) | |
785 | - free((char*)buf); | |
786 | - if (mb_properties) | |
787 | - free(mb_properties); | |
788 | - } | |
789 | -#endif /* MBS_SUPPORT */ | |
790 | return ret_val; | |
791 | } | |
792 | #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */ |