]>
Commit | Line | Data |
---|---|---|
fccf3f89 AM |
1 | From 8521001643bc6a28c760552824eaea5ecee0aa8c Mon Sep 17 00:00:00 2001 |
2 | From: Paul Eggert <eggert@cs.ucla.edu> | |
3 | Date: Thu, 31 Dec 2015 03:10:14 +0000 | |
4 | Subject: grep: be less picky about encoding errors | |
5 | ||
6 | This fixes a longstanding problem introduced in grep 2.21, | |
7 | which is overly picky about binary files. | |
8 | * NEWS: | |
9 | * doc/grep.texi (File and Directory Selection): Document this. | |
10 | * src/grep.c (input_textbin, textbin_is_binary, buffer_textbin) | |
11 | (file_textbin): | |
12 | Remove. All uses removed. | |
13 | (encoding_error_output): New static var. | |
14 | (buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls): | |
15 | New functions, which reuse bits | |
16 | and pieces of the removed functions. | |
17 | (lastout, print_line_head, print_line_middle, print_line_tail, prline) | |
18 | (prpending, prtext, grepbuf): | |
19 | Avoid use of const, now that we have | |
20 | functions that require modifying a sentinel. | |
21 | (print_line_head): New arg LEN. All uses changed. | |
22 | (print_line_head, print_line_tail): | |
23 | Return indicator whether the output line was printed. | |
24 | All uses changed. | |
25 | (print_line_middle): Exit early on encoding error. | |
26 | (grep): Use new method for determining whether file is binary. | |
27 | * src/grep.h (enum textbin, TEXTBIN_BINARY, TEXTBIN_UNKNOWN) | |
28 | (TEXTBIN_TEXT, input_textbin): Remove decls. All uses removed. | |
29 | * src/pcresearch.c (Pexecute): Remove multiline optimization, | |
30 | since the main program no longer checks for encoding errors on input. | |
31 | * tests/encoding-error: New file. | |
32 | * tests/Makefile.am (TESTS): Add it. | |
33 | --- | |
34 | diff --git a/doc/grep.texi b/doc/grep.texi | |
35 | index 76c7f46..58e7f48 100644 | |
36 | --- a/doc/grep.texi | |
37 | +++ b/doc/grep.texi | |
38 | @@ -596,13 +596,13 @@ If a file's allocation metadata, | |
39 | or if its data read before a line is selected for output, | |
40 | indicate that the file contains binary data, | |
41 | assume that the file is of type @var{type}. | |
42 | -Non-text bytes indicate binary data; these are either data bytes | |
43 | -improperly encoded for the current locale, or null bytes when the | |
44 | +Non-text bytes indicate binary data; these are either output bytes that are | |
45 | +improperly encoded for the current locale, or null input bytes when the | |
46 | @option{-z} (@option{--null-data}) option is not given (@pxref{Other | |
47 | Options}). | |
48 | ||
49 | -By default, @var{type} is @samp{binary}, | |
50 | -and @command{grep} normally outputs either | |
51 | +By default, @var{type} is @samp{binary}, and when @command{grep} | |
52 | +discovers that a file is binary it normally outputs either | |
53 | a one-line message saying that a binary file matches, | |
54 | or no message if there is no match. | |
55 | When processing binary data, @command{grep} may treat non-text bytes | |
56 | @@ -611,7 +611,8 @@ not match a null byte, as the null byte might be treated as a line | |
57 | terminator even without the @option{-z} (@option{--null-data}) option. | |
58 | ||
59 | If @var{type} is @samp{without-match}, | |
60 | -@command{grep} assumes that a binary file does not match; | |
61 | +when @command{grep} discovers that a file is binary | |
62 | +it assumes that the rest of the file does not match; | |
63 | this is equivalent to the @option{-I} option. | |
64 | ||
65 | If @var{type} is @samp{text}, | |
66 | diff --git a/src/grep.c b/src/grep.c | |
67 | index 19ba208..e059a46 100644 | |
68 | --- a/src/grep.c | |
69 | +++ b/src/grep.c | |
70 | @@ -377,7 +377,6 @@ bool match_icase; | |
71 | bool match_words; | |
72 | bool match_lines; | |
73 | char eolbyte; | |
74 | -enum textbin input_textbin; | |
75 | ||
76 | static char const *matcher; | |
77 | ||
78 | @@ -389,6 +388,10 @@ static bool omit_dot_slash; | |
79 | static bool errseen; | |
80 | static bool write_error_seen; | |
81 | ||
82 | +/* True if output from the current input file has been suppressed | |
83 | + because an output line had an encoding error. */ | |
84 | +static bool encoding_error_output; | |
85 | + | |
86 | enum directories_type | |
87 | { | |
88 | READ_DIRECTORIES = 2, | |
89 | @@ -481,12 +484,6 @@ clean_up_stdout (void) | |
90 | close_stdout (); | |
91 | } | |
92 | ||
93 | -static bool | |
94 | -textbin_is_binary (enum textbin textbin) | |
95 | -{ | |
96 | - return textbin < TEXTBIN_UNKNOWN; | |
97 | -} | |
98 | - | |
99 | /* The high-order bit of a byte. */ | |
100 | enum { HIBYTE = 0x80 }; | |
101 | ||
102 | @@ -551,58 +548,60 @@ skip_easy_bytes (char const *buf) | |
103 | return p; | |
104 | } | |
105 | ||
106 | -/* Return the text type of data in BUF, of size SIZE. | |
107 | +/* Return true if BUF, of size SIZE, has an encoding error. | |
108 | BUF must be followed by at least sizeof (uword) bytes, | |
109 | - which may be arbitrarily written to or read from. */ | |
110 | -static enum textbin | |
111 | -buffer_textbin (char *buf, size_t size) | |
112 | + the first of which may be modified. */ | |
113 | +static bool | |
114 | +buf_has_encoding_errors (char *buf, size_t size) | |
115 | { | |
116 | - if (eolbyte && memchr (buf, '\0', size)) | |
117 | - return TEXTBIN_BINARY; | |
118 | + if (MB_CUR_MAX <= 1) | |
119 | + return false; | |
120 | ||
121 | - if (1 < MB_CUR_MAX) | |
122 | - { | |
123 | - mbstate_t mbs = { 0 }; | |
124 | - size_t clen; | |
125 | - char const *p; | |
126 | + mbstate_t mbs = { 0 }; | |
127 | + size_t clen; | |
128 | ||
129 | - buf[size] = -1; | |
130 | - for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen) | |
131 | - { | |
132 | - clen = mbrlen (p, buf + size - p, &mbs); | |
133 | - if ((size_t) -2 <= clen) | |
134 | - return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY; | |
135 | - } | |
136 | + buf[size] = -1; | |
137 | + for (char const *p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen) | |
138 | + { | |
139 | + clen = mbrlen (p, buf + size - p, &mbs); | |
140 | + if ((size_t) -2 <= clen) | |
141 | + return true; | |
142 | } | |
143 | ||
144 | - return TEXTBIN_TEXT; | |
145 | + return false; | |
146 | } | |
147 | ||
148 | -/* Return the text type of a file. BUF, of size SIZE, is the initial | |
149 | - buffer read from the file with descriptor FD and status ST. | |
150 | - BUF must be followed by at least sizeof (uword) bytes, | |
151 | + | |
152 | +/* Return true if BUF, of size SIZE, has a null byte. | |
153 | + BUF must be followed by at least one byte, | |
154 | which may be arbitrarily written to or read from. */ | |
155 | -static enum textbin | |
156 | -file_textbin (char *buf, size_t size, int fd, struct stat const *st) | |
157 | +static bool | |
158 | +buf_has_nulls (char *buf, size_t size) | |
159 | { | |
160 | - enum textbin textbin = buffer_textbin (buf, size); | |
161 | - if (textbin_is_binary (textbin)) | |
162 | - return textbin; | |
163 | + buf[size] = 0; | |
164 | + return strlen (buf) != size; | |
165 | +} | |
166 | ||
167 | +/* Return true if a file is known to contain null bytes. | |
168 | + SIZE bytes have already been read from the file | |
169 | + with descriptor FD and status ST. */ | |
170 | +static bool | |
171 | +file_must_have_nulls (size_t size, int fd, struct stat const *st) | |
172 | +{ | |
173 | if (usable_st_size (st)) | |
174 | { | |
175 | if (st->st_size <= size) | |
176 | - return textbin == TEXTBIN_UNKNOWN ? TEXTBIN_BINARY : textbin; | |
177 | + return false; | |
178 | ||
179 | /* If the file has holes, it must contain a null byte somewhere. */ | |
180 | - if (SEEK_HOLE != SEEK_SET && eolbyte) | |
181 | + if (SEEK_HOLE != SEEK_SET) | |
182 | { | |
183 | off_t cur = size; | |
184 | if (O_BINARY || fd == STDIN_FILENO) | |
185 | { | |
186 | cur = lseek (fd, 0, SEEK_CUR); | |
187 | if (cur < 0) | |
188 | - return TEXTBIN_UNKNOWN; | |
189 | + return false; | |
190 | } | |
191 | ||
192 | /* Look for a hole after the current location. */ | |
193 | @@ -612,12 +611,12 @@ file_textbin (char *buf, size_t size, int fd, struct stat const *st) | |
194 | if (lseek (fd, cur, SEEK_SET) < 0) | |
195 | suppressible_error (filename, errno); | |
196 | if (hole_start < st->st_size) | |
197 | - return TEXTBIN_BINARY; | |
198 | + return true; | |
199 | } | |
200 | } | |
201 | } | |
202 | ||
203 | - return TEXTBIN_UNKNOWN; | |
204 | + return false; | |
205 | } | |
206 | ||
207 | /* Convert STR to a nonnegative integer, storing the result in *OUT. | |
208 | @@ -899,7 +898,7 @@ static char *label = NULL; /* Fake filename for stdin */ | |
209 | /* Internal variables to keep track of byte count, context, etc. */ | |
210 | static uintmax_t totalcc; /* Total character count before bufbeg. */ | |
211 | static char const *lastnl; /* Pointer after last newline counted. */ | |
212 | -static char const *lastout; /* Pointer after last character output; | |
213 | +static char *lastout; /* Pointer after last character output; | |
214 | NULL if no character has been output | |
215 | or if it's conceptually before bufbeg. */ | |
216 | static intmax_t outleft; /* Maximum number of lines to be output. */ | |
217 | @@ -971,10 +970,31 @@ print_offset (uintmax_t pos, int min_width, const char *color) | |
218 | pr_sgr_end_if (color); | |
219 | } | |
220 | ||
221 | -/* Print a whole line head (filename, line, byte). */ | |
222 | -static void | |
223 | -print_line_head (char const *beg, char const *lim, char sep) | |
224 | +/* Print a whole line head (filename, line, byte). The output data | |
225 | + starts at BEG and contains LEN bytes; it is followed by at least | |
226 | + sizeof (uword) bytes, the first of which may be temporarily modified. | |
227 | + The output data comes from what is perhaps a larger input line that | |
228 | + goes until LIM, where LIM[-1] is an end-of-line byte. Use SEP as | |
229 | + the separator on output. | |
230 | + | |
231 | + Return true unless the line was suppressed due to an encoding error. */ | |
232 | + | |
233 | +static bool | |
234 | +print_line_head (char *beg, size_t len, char const *lim, char sep) | |
235 | { | |
236 | + bool encoding_errors = false; | |
237 | + if (binary_files != TEXT_BINARY_FILES) | |
238 | + { | |
239 | + char ch = beg[len]; | |
240 | + encoding_errors = buf_has_encoding_errors (beg, len); | |
241 | + beg[len] = ch; | |
242 | + } | |
243 | + if (encoding_errors) | |
244 | + { | |
245 | + encoding_error_output = done_on_match = out_quiet = true; | |
246 | + return false; | |
247 | + } | |
248 | + | |
249 | bool pending_sep = false; | |
250 | ||
251 | if (out_file) | |
252 | @@ -1021,22 +1041,27 @@ print_line_head (char const *beg, char const *lim, char sep) | |
253 | ||
254 | print_sep (sep); | |
255 | } | |
256 | + | |
257 | + return true; | |
258 | } | |
259 | ||
260 | -static const char * | |
261 | -print_line_middle (const char *beg, const char *lim, | |
262 | +static char * | |
263 | +print_line_middle (char *beg, char *lim, | |
264 | const char *line_color, const char *match_color) | |
265 | { | |
266 | size_t match_size; | |
267 | size_t match_offset; | |
268 | - const char *cur = beg; | |
269 | - const char *mid = NULL; | |
270 | - | |
271 | - while (cur < lim | |
272 | - && ((match_offset = execute (beg, lim - beg, &match_size, cur)) | |
273 | - != (size_t) -1)) | |
274 | + char *cur = beg; | |
275 | + char *mid = NULL; | |
276 | + char *b; | |
277 | + | |
278 | + for (cur = beg; | |
279 | + (cur < lim | |
280 | + && ((match_offset = execute (beg, lim - beg, &match_size, cur)) | |
281 | + != (size_t) -1)); | |
282 | + cur = b + match_size) | |
283 | { | |
284 | - char const *b = beg + match_offset; | |
285 | + b = beg + match_offset; | |
286 | ||
287 | /* Avoid matching the empty line at the end of the buffer. */ | |
288 | if (b == lim) | |
289 | @@ -1056,8 +1081,11 @@ print_line_middle (const char *beg, const char *lim, | |
290 | /* This function is called on a matching line only, | |
291 | but is it selected or rejected/context? */ | |
292 | if (only_matching) | |
293 | - print_line_head (b, lim, (out_invert ? SEP_CHAR_REJECTED | |
294 | - : SEP_CHAR_SELECTED)); | |
295 | + { | |
296 | + char sep = out_invert ? SEP_CHAR_REJECTED : SEP_CHAR_SELECTED; | |
297 | + if (! print_line_head (b, match_size, lim, sep)) | |
298 | + return NULL; | |
299 | + } | |
300 | else | |
301 | { | |
302 | pr_sgr_start (line_color); | |
303 | @@ -1075,7 +1103,6 @@ print_line_middle (const char *beg, const char *lim, | |
304 | if (only_matching) | |
305 | fputs ("\n", stdout); | |
306 | } | |
307 | - cur = b + match_size; | |
308 | } | |
309 | ||
310 | if (only_matching) | |
311 | @@ -1086,8 +1113,8 @@ print_line_middle (const char *beg, const char *lim, | |
312 | return cur; | |
313 | } | |
314 | ||
315 | -static const char * | |
316 | -print_line_tail (const char *beg, const char *lim, const char *line_color) | |
317 | +static char * | |
318 | +print_line_tail (char *beg, const char *lim, const char *line_color) | |
319 | { | |
320 | size_t eol_size; | |
321 | size_t tail_size; | |
322 | @@ -1108,14 +1135,15 @@ print_line_tail (const char *beg, const char *lim, const char *line_color) | |
323 | } | |
324 | ||
325 | static void | |
326 | -prline (char const *beg, char const *lim, char sep) | |
327 | +prline (char *beg, char *lim, char sep) | |
328 | { | |
329 | bool matching; | |
330 | const char *line_color; | |
331 | const char *match_color; | |
332 | ||
333 | if (!only_matching) | |
334 | - print_line_head (beg, lim, sep); | |
335 | + if (! print_line_head (beg, lim - beg - 1, lim, sep)) | |
336 | + return; | |
337 | ||
338 | matching = (sep == SEP_CHAR_SELECTED) ^ out_invert; | |
339 | ||
340 | @@ -1135,7 +1163,11 @@ prline (char const *beg, char const *lim, char sep) | |
341 | { | |
342 | /* We already know that non-matching lines have no match (to colorize). */ | |
343 | if (matching && (only_matching || *match_color)) | |
344 | - beg = print_line_middle (beg, lim, line_color, match_color); | |
345 | + { | |
346 | + beg = print_line_middle (beg, lim, line_color, match_color); | |
347 | + if (! beg) | |
348 | + return; | |
349 | + } | |
350 | ||
351 | if (!only_matching && *line_color) | |
352 | { | |
353 | @@ -1169,7 +1201,7 @@ prpending (char const *lim) | |
354 | lastout = bufbeg; | |
355 | while (pending > 0 && lastout < lim) | |
356 | { | |
357 | - char const *nl = memchr (lastout, eolbyte, lim - lastout); | |
358 | + char *nl = memchr (lastout, eolbyte, lim - lastout); | |
359 | size_t match_size; | |
360 | --pending; | |
361 | if (outleft | |
362 | @@ -1184,7 +1216,7 @@ prpending (char const *lim) | |
363 | ||
364 | /* Output the lines between BEG and LIM. Deal with context. */ | |
365 | static void | |
366 | -prtext (char const *beg, char const *lim) | |
367 | +prtext (char *beg, char *lim) | |
368 | { | |
369 | static bool used; /* Avoid printing SEP_STR_GROUP before any output. */ | |
370 | char eol = eolbyte; | |
371 | @@ -1192,7 +1224,7 @@ prtext (char const *beg, char const *lim) | |
372 | if (!out_quiet && pending > 0) | |
373 | prpending (beg); | |
374 | ||
375 | - char const *p = beg; | |
376 | + char *p = beg; | |
377 | ||
378 | if (!out_quiet) | |
379 | { | |
380 | @@ -1218,7 +1250,7 @@ prtext (char const *beg, char const *lim) | |
381 | ||
382 | while (p < beg) | |
383 | { | |
384 | - char const *nl = memchr (p, eol, beg - p); | |
385 | + char *nl = memchr (p, eol, beg - p); | |
386 | nl++; | |
387 | prline (p, nl, SEP_CHAR_REJECTED); | |
388 | p = nl; | |
389 | @@ -1231,7 +1263,7 @@ prtext (char const *beg, char const *lim) | |
390 | /* One or more lines are output. */ | |
391 | for (n = 0; p < lim && n < outleft; n++) | |
392 | { | |
393 | - char const *nl = memchr (p, eol, lim - p); | |
394 | + char *nl = memchr (p, eol, lim - p); | |
395 | nl++; | |
396 | if (!out_quiet) | |
397 | prline (p, nl, SEP_CHAR_SELECTED); | |
398 | @@ -1278,13 +1310,12 @@ zap_nuls (char *p, char *lim, char eol) | |
399 | between matching lines if OUT_INVERT is true). Return a count of | |
400 | lines printed. Replace all NUL bytes with NUL_ZAPPER as we go. */ | |
401 | static intmax_t | |
402 | -grepbuf (char const *beg, char const *lim) | |
403 | +grepbuf (char *beg, char const *lim) | |
404 | { | |
405 | intmax_t outleft0 = outleft; | |
406 | - char const *p; | |
407 | - char const *endp; | |
408 | + char *endp; | |
409 | ||
410 | - for (p = beg; p < lim; p = endp) | |
411 | + for (char *p = beg; p < lim; p = endp) | |
412 | { | |
413 | size_t match_size; | |
414 | size_t match_offset = execute (p, lim - p, &match_size, NULL); | |
415 | @@ -1295,15 +1326,15 @@ grepbuf (char const *beg, char const *lim) | |
416 | match_offset = lim - p; | |
417 | match_size = 0; | |
418 | } | |
419 | - char const *b = p + match_offset; | |
420 | + char *b = p + match_offset; | |
421 | endp = b + match_size; | |
422 | /* Avoid matching the empty line at the end of the buffer. */ | |
423 | if (!out_invert && b == lim) | |
424 | break; | |
425 | if (!out_invert || p < b) | |
426 | { | |
427 | - char const *prbeg = out_invert ? p : b; | |
428 | - char const *prend = out_invert ? b : endp; | |
429 | + char *prbeg = out_invert ? p : b; | |
430 | + char *prend = out_invert ? b : endp; | |
431 | prtext (prbeg, prend); | |
432 | if (!outleft || done_on_match) | |
433 | { | |
434 | @@ -1324,7 +1355,6 @@ static intmax_t | |
435 | grep (int fd, struct stat const *st) | |
436 | { | |
437 | intmax_t nlines, i; | |
438 | - enum textbin textbin; | |
439 | size_t residue, save; | |
440 | char oldc; | |
441 | char *beg; | |
442 | @@ -1333,6 +1363,7 @@ grep (int fd, struct stat const *st) | |
443 | char nul_zapper = '\0'; | |
444 | bool done_on_match_0 = done_on_match; | |
445 | bool out_quiet_0 = out_quiet; | |
446 | + bool has_nulls = false; | |
447 | ||
448 | if (! reset (fd, st)) | |
449 | return 0; | |
450 | @@ -1344,6 +1375,7 @@ grep (int fd, struct stat const *st) | |
451 | after_last_match = 0; | |
452 | pending = 0; | |
453 | skip_nuls = skip_empty_lines && !eol; | |
454 | + encoding_error_output = false; | |
455 | seek_data_failed = false; | |
456 | ||
457 | nlines = 0; | |
458 | @@ -1356,26 +1388,20 @@ grep (int fd, struct stat const *st) | |
459 | return 0; | |
460 | } | |
461 | ||
462 | - if (binary_files == TEXT_BINARY_FILES) | |
463 | - textbin = TEXTBIN_TEXT; | |
464 | - else | |
465 | + for (bool firsttime = true; ; firsttime = false) | |
466 | { | |
467 | - textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st); | |
468 | - if (textbin_is_binary (textbin)) | |
469 | + if (!has_nulls && eol && binary_files != TEXT_BINARY_FILES | |
470 | + && (buf_has_nulls (bufbeg, buflim - bufbeg) | |
471 | + || (firsttime && file_must_have_nulls (buflim - bufbeg, fd, st)))) | |
472 | { | |
473 | + has_nulls = true; | |
474 | if (binary_files == WITHOUT_MATCH_BINARY_FILES) | |
475 | return 0; | |
476 | done_on_match = out_quiet = true; | |
477 | nul_zapper = eol; | |
478 | skip_nuls = skip_empty_lines; | |
479 | } | |
480 | - else if (execute != Pexecute) | |
481 | - textbin = TEXTBIN_TEXT; | |
482 | - } | |
483 | ||
484 | - for (;;) | |
485 | - { | |
486 | - input_textbin = textbin; | |
487 | lastnl = bufbeg; | |
488 | if (lastout) | |
489 | lastout = bufbeg; | |
490 | @@ -1426,13 +1452,8 @@ grep (int fd, struct stat const *st) | |
491 | } | |
492 | ||
493 | /* Detect whether leading context is adjacent to previous output. */ | |
494 | - if (lastout) | |
495 | - { | |
496 | - if (textbin == TEXTBIN_UNKNOWN) | |
497 | - textbin = TEXTBIN_TEXT; | |
498 | - if (beg != lastout) | |
499 | - lastout = 0; | |
500 | - } | |
501 | + if (beg != lastout) | |
502 | + lastout = 0; | |
503 | ||
504 | /* Handle some details and read more data to scan. */ | |
505 | save = residue + lim - beg; | |
506 | @@ -1445,22 +1466,6 @@ grep (int fd, struct stat const *st) | |
507 | suppressible_error (filename, errno); | |
508 | goto finish_grep; | |
509 | } | |
510 | - | |
511 | - /* If the file's textbin has not been determined yet, assume | |
512 | - it's binary if the next input buffer suggests so. */ | |
513 | - if (textbin == TEXTBIN_UNKNOWN) | |
514 | - { | |
515 | - enum textbin tb = buffer_textbin (bufbeg, buflim - bufbeg); | |
516 | - if (textbin_is_binary (tb)) | |
517 | - { | |
518 | - if (binary_files == WITHOUT_MATCH_BINARY_FILES) | |
519 | - return 0; | |
520 | - textbin = tb; | |
521 | - done_on_match = out_quiet = true; | |
522 | - nul_zapper = eol; | |
523 | - skip_nuls = skip_empty_lines; | |
524 | - } | |
525 | - } | |
526 | } | |
527 | if (residue) | |
528 | { | |
529 | @@ -1474,7 +1479,7 @@ grep (int fd, struct stat const *st) | |
530 | finish_grep: | |
531 | done_on_match = done_on_match_0; | |
532 | out_quiet = out_quiet_0; | |
533 | - if (textbin_is_binary (textbin) && !out_quiet && nlines != 0) | |
534 | + if ((has_nulls || encoding_error_output) && !out_quiet && nlines != 0) | |
535 | printf (_("Binary file %s matches\n"), filename); | |
536 | return nlines; | |
537 | } | |
538 | diff --git a/src/grep.h b/src/grep.h | |
539 | index 580eb11..2e4527c 100644 | |
540 | --- a/src/grep.h | |
541 | +++ b/src/grep.h | |
542 | @@ -29,22 +29,4 @@ extern bool match_words; /* -w */ | |
543 | extern bool match_lines; /* -x */ | |
544 | extern char eolbyte; /* -z */ | |
545 | ||
546 | -/* An enum textbin describes the file's type, inferred from data read | |
547 | - before the first line is selected for output. */ | |
548 | -enum textbin | |
549 | - { | |
550 | - /* Binary, as it contains null bytes and the -z option is not in effect, | |
551 | - or it contains encoding errors. */ | |
552 | - TEXTBIN_BINARY = -1, | |
553 | - | |
554 | - /* Not known yet. Only text has been seen so far. */ | |
555 | - TEXTBIN_UNKNOWN = 0, | |
556 | - | |
557 | - /* Text. */ | |
558 | - TEXTBIN_TEXT = 1 | |
559 | - }; | |
560 | - | |
561 | -/* Input file type. */ | |
562 | -extern enum textbin input_textbin; | |
563 | - | |
564 | #endif | |
565 | diff --git a/src/pcresearch.c b/src/pcresearch.c | |
566 | index dc68345..c403032 100644 | |
567 | --- a/src/pcresearch.c | |
568 | +++ b/src/pcresearch.c | |
569 | @@ -194,32 +194,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size, | |
570 | error. */ | |
571 | char const *subject = buf; | |
572 | ||
573 | - /* If the input type is unknown, the caller is still testing the | |
574 | - input, which means the current buffer cannot contain encoding | |
575 | - errors and a multiline search is typically more efficient. | |
576 | - Otherwise, a single-line search is typically faster, so that | |
577 | - pcre_exec doesn't waste time validating the entire input | |
578 | - buffer. */ | |
579 | - bool multiline = input_textbin == TEXTBIN_UNKNOWN; | |
580 | - | |
581 | for (; p < buf + size; p = line_start = line_end + 1) | |
582 | { | |
583 | - bool too_big; | |
584 | - | |
585 | - if (multiline) | |
586 | - { | |
587 | - size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1); | |
588 | - size_t scan_size = MIN (pcre_size_max + 1, buf + size - p); | |
589 | - line_end = memrchr (p, eolbyte, scan_size); | |
590 | - too_big = ! line_end; | |
591 | - } | |
592 | - else | |
593 | - { | |
594 | - line_end = memchr (p, eolbyte, buf + size - p); | |
595 | - too_big = INT_MAX < line_end - p; | |
596 | - } | |
597 | - | |
598 | - if (too_big) | |
599 | + /* A single-line search is typically faster, so that | |
600 | + pcre_exec doesn't waste time validating the entire input | |
601 | + buffer. */ | |
602 | + line_end = memchr (p, eolbyte, buf + size - p); | |
603 | + if (INT_MAX < line_end - p) | |
604 | error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); | |
605 | ||
606 | for (;;) | |
607 | @@ -247,27 +228,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size, | |
608 | int options = 0; | |
609 | if (!bol) | |
610 | options |= PCRE_NOTBOL; | |
611 | - if (multiline) | |
612 | - options |= PCRE_NO_UTF8_CHECK; | |
613 | ||
614 | e = jit_exec (subject, line_end - subject, search_offset, | |
615 | options, sub); | |
616 | if (e != PCRE_ERROR_BADUTF8) | |
617 | - { | |
618 | - if (0 < e && multiline && sub[1] - sub[0] != 0) | |
619 | - { | |
620 | - char const *nl = memchr (subject + sub[0], eolbyte, | |
621 | - sub[1] - sub[0]); | |
622 | - if (nl) | |
623 | - { | |
624 | - /* This match crosses a line boundary; reject it. */ | |
625 | - p = subject + sub[0]; | |
626 | - line_end = nl; | |
627 | - continue; | |
628 | - } | |
629 | - } | |
630 | - break; | |
631 | - } | |
632 | + break; | |
633 | int valid_bytes = sub[0]; | |
634 | ||
635 | /* Try to match the string before the encoding error. */ | |
636 | @@ -339,15 +304,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size, | |
637 | beg = matchbeg; | |
638 | end = matchend; | |
639 | } | |
640 | - else if (multiline) | |
641 | - { | |
642 | - char const *prev_nl = memrchr (line_start - 1, eolbyte, | |
643 | - matchbeg - (line_start - 1)); | |
644 | - char const *next_nl = memchr (matchend, eolbyte, | |
645 | - line_end + 1 - matchend); | |
646 | - beg = prev_nl + 1; | |
647 | - end = next_nl + 1; | |
648 | - } | |
649 | else | |
650 | { | |
651 | beg = line_start; | |
652 | diff --git a/tests/Makefile.am b/tests/Makefile.am | |
653 | index 37bb501..f1b8c43 100644 | |
654 | --- a/tests/Makefile.am | |
655 | +++ b/tests/Makefile.am | |
656 | @@ -70,6 +70,7 @@ TESTS = \ | |
657 | empty \ | |
658 | empty-line \ | |
659 | empty-line-mb \ | |
660 | + encoding-error \ | |
661 | epipe \ | |
662 | equiv-classes \ | |
663 | ere \ | |
664 | diff --git a/tests/encoding-error b/tests/encoding-error | |
665 | new file mode 100755 | |
666 | index 0000000..fe52de2 | |
667 | --- /dev/null | |
668 | +++ b/tests/encoding-error | |
669 | @@ -0,0 +1,41 @@ | |
670 | +#! /bin/sh | |
671 | +# Test grep's behavior on encoding errors. | |
672 | +# | |
673 | +# Copyright 2015 Free Software Foundation, Inc. | |
674 | +# | |
675 | +# Copying and distribution of this file, with or without modification, | |
676 | +# are permitted in any medium without royalty provided the copyright | |
677 | +# notice and this notice are preserved. | |
678 | + | |
679 | +. "${srcdir=.}/init.sh"; path_prepend_ ../src | |
680 | + | |
681 | +require_en_utf8_locale_ | |
682 | + | |
683 | +LC_ALL=en_US.UTF-8 | |
684 | +export LC_ALL | |
685 | + | |
686 | +printf 'Alfred Jones\n' > a || framework_failure_ | |
687 | +printf 'John Smith\n' >j || framework_failure_ | |
688 | +printf 'Pedro P\xe9rez\n' >p || framework_failure_ | |
689 | +cat a p j >in || framework_failure_ | |
690 | + | |
691 | +fail=0 | |
692 | + | |
693 | +grep '^A' in >out || fail=1 | |
694 | +compare a out || fail=1 | |
695 | + | |
696 | +grep '^P' in >out || fail=1 | |
697 | +printf 'Binary file in matches\n' >exp || framework_failure_ | |
698 | +compare exp out || fail=1 | |
699 | + | |
700 | +grep '^J' in >out || fail=1 | |
701 | +compare j out || fail=1 | |
702 | + | |
703 | +grep '^X' in >out | |
704 | +test $? = 1 || fail=1 | |
705 | +compare /dev/null out || fail=1 | |
706 | + | |
707 | +grep -a . in >out || fail=1 | |
708 | +compare in out | |
709 | + | |
710 | +Exit $fail | |
711 | -- | |
712 | cgit v0.9.0.2 | |
713 | From 40ed879db22d57516a31fefd1c39416974b74ec4 Mon Sep 17 00:00:00 2001 | |
714 | From: Paul Eggert <eggert@cs.ucla.edu> | |
715 | Date: Sat, 02 Jan 2016 05:16:12 +0000 | |
716 | Subject: grep: fix bug with invalid unibyte sequence | |
717 | ||
718 | This was introduced by the recent binary-data-detection changes. | |
719 | Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86 | |
720 | * src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove, | |
721 | replacing with ... | |
722 | (uword_max, unibyte_mask, initialize_unibyte_mask): ... this new | |
723 | constant, static var, and function. All uses changed. The | |
724 | unibyte_mask var generalizes the old local var hibyte_mask, which | |
725 | worked only for encodings where every byte with 0x80 turned off is | |
726 | a single-byte character. | |
727 | (buf_has_encoding_errors): Return false immediately if | |
728 | unibyte_mask is zero, not whether the current encoding is unibyte. | |
729 | The old test was incorrect in unibyte locales in which some bytes | |
730 | were encoding errors. | |
731 | * tests/pcre-z: Require UTF-8 locale, since the grep -z . test now | |
732 | needs this. Use printf \0 rather than tr. Port the 'grep -z .' | |
733 | test to platforms where the C locale says '\200' is an encoding | |
734 | error. Use cmp rather than compare, as the file is binary and | |
735 | so non-GNU diff might not work. | |
736 | * tests/unibyte-binary: New file. | |
737 | * tests/Makefile.am (TESTS): Add it. | |
738 | --- | |
739 | diff --git a/src/grep.c b/src/grep.c | |
740 | index 1207a76..a5f1fa2 100644 | |
741 | --- a/src/grep.c | |
742 | +++ b/src/grep.c | |
743 | @@ -484,21 +484,6 @@ clean_up_stdout (void) | |
744 | close_stdout (); | |
745 | } | |
746 | ||
747 | -/* The high-order bit of a byte. */ | |
748 | -enum { HIBYTE = 0x80 }; | |
749 | - | |
750 | -/* True if every byte with HIBYTE off is a single-byte character. | |
751 | - UTF-8 has this property. */ | |
752 | -static bool easy_encoding; | |
753 | - | |
754 | -static void | |
755 | -init_easy_encoding (void) | |
756 | -{ | |
757 | - easy_encoding = true; | |
758 | - for (int i = 0; i < HIBYTE; i++) | |
759 | - easy_encoding &= mbclen_cache[i] == 1; | |
760 | -} | |
761 | - | |
762 | /* A cast to TYPE of VAL. Use this when TYPE is a pointer type, VAL | |
763 | is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer | |
764 | the alignment and would otherwise complain about the cast. */ | |
765 | @@ -517,21 +502,33 @@ init_easy_encoding (void) | |
766 | /* An unsigned type suitable for fast matching. */ | |
767 | typedef uintmax_t uword; | |
768 | ||
769 | +/* All bytes that are not unibyte characters, ANDed together, and then | |
770 | + with the pattern repeated to fill a uword. For an encoding where | |
771 | + all bytes are unibyte characters, this is 0. For UTF-8, this is | |
772 | + 0x808080.... For encodings where unibyte characters have no useful | |
773 | + pattern, this is all 1s. The unsigned char C is a unibyte | |
774 | + character if C & UNIBYTE_MASK is zero. If the uword W is the | |
775 | + concatenation of bytes, the bytes are all unibyte characters | |
776 | + if W & UNIBYTE_MASK is zero. */ | |
777 | +static uword unibyte_mask; | |
778 | + | |
779 | +static void | |
780 | +initialize_unibyte_mask (void) | |
781 | +{ | |
782 | + unsigned char mask = UCHAR_MAX; | |
783 | + for (int i = 1; i <= UCHAR_MAX; i++) | |
784 | + if (mbclen_cache[i] != 1) | |
785 | + mask &= i; | |
786 | + uword uword_max = -1; | |
787 | + unibyte_mask = uword_max / UCHAR_MAX * mask; | |
788 | +} | |
789 | + | |
790 | /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel | |
791 | that is not easy, and return a pointer to the first non-easy byte. | |
792 | - In easy encodings, the easy bytes all have HIBYTE off. | |
793 | - In other encodings, no byte is easy. */ | |
794 | + The easy bytes all have UNIBYTE_MASK off. */ | |
795 | static char const * _GL_ATTRIBUTE_PURE | |
796 | skip_easy_bytes (char const *buf) | |
797 | { | |
798 | - if (!easy_encoding) | |
799 | - return buf; | |
800 | - | |
801 | - uword uword_max = -1; | |
802 | - | |
803 | - /* 0x8080..., extended to be wide enough for uword. */ | |
804 | - uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE; | |
805 | - | |
806 | /* Search a byte at a time until the pointer is aligned, then a | |
807 | uword at a time until a match is found, then a byte at a time to | |
808 | identify the exact byte. The uword search may go slightly past | |
809 | @@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf) | |
810 | char const *p; | |
811 | uword const *s; | |
812 | for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++) | |
813 | - if (*p & HIBYTE) | |
814 | + if (to_uchar (*p) & unibyte_mask) | |
815 | return p; | |
816 | - for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++) | |
817 | + for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++) | |
818 | continue; | |
819 | - for (p = (char const *) s; ! (*p & HIBYTE); p++) | |
820 | + for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++) | |
821 | continue; | |
822 | return p; | |
823 | } | |
824 | @@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf) | |
825 | static bool | |
826 | buf_has_encoding_errors (char *buf, size_t size) | |
827 | { | |
828 | - if (MB_CUR_MAX <= 1) | |
829 | + if (! unibyte_mask) | |
830 | return false; | |
831 | ||
832 | mbstate_t mbs = { 0 }; | |
833 | @@ -2592,7 +2589,7 @@ main (int argc, char **argv) | |
834 | usage (EXIT_TROUBLE); | |
835 | ||
836 | build_mbclen_cache (); | |
837 | - init_easy_encoding (); | |
838 | + initialize_unibyte_mask (); | |
839 | ||
840 | /* In a unibyte locale, switch from fgrep to grep if | |
841 | the pattern matches words (where grep is typically faster). | |
842 | diff --git a/tests/Makefile.am b/tests/Makefile.am | |
843 | index f349aa3..a38303c 100644 | |
844 | --- a/tests/Makefile.am | |
845 | +++ b/tests/Makefile.am | |
846 | @@ -133,6 +133,7 @@ TESTS = \ | |
847 | turkish-I-without-dot \ | |
848 | turkish-eyes \ | |
849 | two-files \ | |
850 | + unibyte-binary \ | |
851 | unibyte-bracket-expr \ | |
852 | unibyte-negated-circumflex \ | |
853 | utf8-bracket \ | |
854 | diff --git a/tests/pcre-z b/tests/pcre-z | |
855 | index 6bbde94..4ce9a93 100755 | |
856 | --- a/tests/pcre-z | |
857 | +++ b/tests/pcre-z | |
858 | @@ -2,10 +2,11 @@ | |
859 | # Test Perl regex with NUL-separated input | |
860 | . "${srcdir=.}/init.sh"; path_prepend_ ../src | |
861 | require_pcre_ | |
862 | +require_en_utf8_locale_ | |
863 | ||
864 | REGEX=a | |
865 | ||
866 | -printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in | |
867 | +printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_ | |
868 | ||
869 | grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.' | |
870 | compare /dev/null err || fail_ 'stderr not empty on grep -z.' | |
871 | @@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1 | |
872 | compare exp out || fail=1 | |
873 | compare /dev/null err || fail=1 | |
874 | ||
875 | -printf '\200\0' >in0 | |
876 | -LC_ALL=C grep -z . in0 >out || fail=1 | |
877 | -compare in0 out || fail=1 | |
878 | +printf '\303\200\0' >in0 # "À" followed by a NUL. | |
879 | +LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1 | |
880 | +cmp in0 out || fail=1 | |
881 | ||
882 | Exit $fail | |
883 | diff --git a/tests/unibyte-binary b/tests/unibyte-binary | |
884 | new file mode 100755 | |
885 | index 0000000..78735b8 | |
886 | --- /dev/null | |
887 | +++ b/tests/unibyte-binary | |
888 | @@ -0,0 +1,28 @@ | |
889 | +#!/bin/sh | |
890 | +# Test binary files in unibyte locales with encoding errors | |
891 | + | |
892 | +# Copyright 2016 Free Software Foundation, Inc. | |
893 | + | |
894 | +# This program is free software: you can redistribute it and/or modify | |
895 | +# it under the terms of the GNU General Public License as published by | |
896 | +# the Free Software Foundation, either version 3 of the License, or | |
897 | +# (at your option) any later version. | |
898 | + | |
899 | +# This program is distributed in the hope that it will be useful, | |
900 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
901 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
902 | +# GNU General Public License for more details. | |
903 | + | |
904 | +# You should have received a copy of the GNU General Public License | |
905 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
906 | + | |
907 | +. "${srcdir=.}/init.sh"; path_prepend_ ../src | |
908 | +require_unibyte_locale | |
909 | + | |
910 | +fail=0 | |
911 | + | |
912 | +printf 'a\n\200\nb\n' >in || framework_failure_ | |
913 | +printf 'a\nBinary file in matches\n' >exp || framework_failure_ | |
914 | +grep . in >out || fail=1 | |
915 | +compare exp out || fail=1 | |
916 | +Exit $fail | |
917 | -- | |
918 | cgit v0.9.0.2 |