git.pld-linux.org Git - packages/grep.git/blame - grep-enc-errors.patch
- rel 2; from upstream: be less picky about encoding errors
[packages/grep.git] / grep-enc-errors.patch
CommitLineData
fccf3f89
AM
1From 8521001643bc6a28c760552824eaea5ecee0aa8c Mon Sep 17 00:00:00 2001
2From: Paul Eggert <eggert@cs.ucla.edu>
3Date: Thu, 31 Dec 2015 03:10:14 +0000
4Subject: grep: be less picky about encoding errors
5
6This fixes a longstanding problem introduced in grep 2.21,
7which is overly picky about binary files.
8* NEWS:
9* doc/grep.texi (File and Directory Selection): Document this.
10* src/grep.c (input_textbin, textbin_is_binary, buffer_textbin)
11(file_textbin):
12Remove. All uses removed.
13(encoding_error_output): New static var.
14(buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls):
15New functions, which reuse bits
16and pieces of the removed functions.
17(lastout, print_line_head, print_line_middle, print_line_tail, prline)
18(prpending, prtext, grepbuf):
19Avoid use of const, now that we have
20functions that require modifying a sentinel.
21(print_line_head): New arg LEN. All uses changed.
22(print_line_head, print_line_tail):
23Return indicator whether the output line was printed.
24All uses changed.
25(print_line_middle): Exit early on encoding error.
26(grep): Use new method for determining whether file is binary.
27* src/grep.h (enum textbin, TEXTBIN_BINARY, TEXTBIN_UNKNOWN)
28(TEXTBIN_TEXT, input_textbin): Remove decls. All uses removed.
29* src/pcresearch.c (Pexecute): Remove multiline optimization,
30since the main program no longer checks for encoding errors on input.
31* tests/encoding-error: New file.
32* tests/Makefile.am (TESTS): Add it.
33---
34diff --git a/doc/grep.texi b/doc/grep.texi
35index 76c7f46..58e7f48 100644
36--- a/doc/grep.texi
37+++ b/doc/grep.texi
38@@ -596,13 +596,13 @@ If a file's allocation metadata,
39 or if its data read before a line is selected for output,
40 indicate that the file contains binary data,
41 assume that the file is of type @var{type}.
42-Non-text bytes indicate binary data; these are either data bytes
43-improperly encoded for the current locale, or null bytes when the
44+Non-text bytes indicate binary data; these are either output bytes that are
45+improperly encoded for the current locale, or null input bytes when the
46 @option{-z} (@option{--null-data}) option is not given (@pxref{Other
47 Options}).
48
49-By default, @var{type} is @samp{binary},
50-and @command{grep} normally outputs either
51+By default, @var{type} is @samp{binary}, and when @command{grep}
52+discovers that a file is binary it normally outputs either
53 a one-line message saying that a binary file matches,
54 or no message if there is no match.
55 When processing binary data, @command{grep} may treat non-text bytes
56@@ -611,7 +611,8 @@ not match a null byte, as the null byte might be treated as a line
57 terminator even without the @option{-z} (@option{--null-data}) option.
58
59 If @var{type} is @samp{without-match},
60-@command{grep} assumes that a binary file does not match;
61+when @command{grep} discovers that a file is binary
62+it assumes that the rest of the file does not match;
63 this is equivalent to the @option{-I} option.
64
65 If @var{type} is @samp{text},
66diff --git a/src/grep.c b/src/grep.c
67index 19ba208..e059a46 100644
68--- a/src/grep.c
69+++ b/src/grep.c
70@@ -377,7 +377,6 @@ bool match_icase;
71 bool match_words;
72 bool match_lines;
73 char eolbyte;
74-enum textbin input_textbin;
75
76 static char const *matcher;
77
78@@ -389,6 +388,10 @@ static bool omit_dot_slash;
79 static bool errseen;
80 static bool write_error_seen;
81
82+/* True if output from the current input file has been suppressed
83+ because an output line had an encoding error. */
84+static bool encoding_error_output;
85+
86 enum directories_type
87 {
88 READ_DIRECTORIES = 2,
89@@ -481,12 +484,6 @@ clean_up_stdout (void)
90 close_stdout ();
91 }
92
93-static bool
94-textbin_is_binary (enum textbin textbin)
95-{
96- return textbin < TEXTBIN_UNKNOWN;
97-}
98-
99 /* The high-order bit of a byte. */
100 enum { HIBYTE = 0x80 };
101
102@@ -551,58 +548,60 @@ skip_easy_bytes (char const *buf)
103 return p;
104 }
105
106-/* Return the text type of data in BUF, of size SIZE.
107+/* Return true if BUF, of size SIZE, has an encoding error.
108 BUF must be followed by at least sizeof (uword) bytes,
109- which may be arbitrarily written to or read from. */
110-static enum textbin
111-buffer_textbin (char *buf, size_t size)
112+ the first of which may be modified. */
113+static bool
114+buf_has_encoding_errors (char *buf, size_t size)
115 {
116- if (eolbyte && memchr (buf, '\0', size))
117- return TEXTBIN_BINARY;
118+ if (MB_CUR_MAX <= 1)
119+ return false;
120
121- if (1 < MB_CUR_MAX)
122- {
123- mbstate_t mbs = { 0 };
124- size_t clen;
125- char const *p;
126+ mbstate_t mbs = { 0 };
127+ size_t clen;
128
129- buf[size] = -1;
130- for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
131- {
132- clen = mbrlen (p, buf + size - p, &mbs);
133- if ((size_t) -2 <= clen)
134- return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
135- }
136+ buf[size] = -1;
137+ for (char const *p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
138+ {
139+ clen = mbrlen (p, buf + size - p, &mbs);
140+ if ((size_t) -2 <= clen)
141+ return true;
142 }
143
144- return TEXTBIN_TEXT;
145+ return false;
146 }
147
148-/* Return the text type of a file. BUF, of size SIZE, is the initial
149- buffer read from the file with descriptor FD and status ST.
150- BUF must be followed by at least sizeof (uword) bytes,
151+
152+/* Return true if BUF, of size SIZE, has a null byte.
153+ BUF must be followed by at least one byte,
154 which may be arbitrarily written to or read from. */
155-static enum textbin
156-file_textbin (char *buf, size_t size, int fd, struct stat const *st)
157+static bool
158+buf_has_nulls (char *buf, size_t size)
159 {
160- enum textbin textbin = buffer_textbin (buf, size);
161- if (textbin_is_binary (textbin))
162- return textbin;
163+ buf[size] = 0;
164+ return strlen (buf) != size;
165+}
166
167+/* Return true if a file is known to contain null bytes.
168+ SIZE bytes have already been read from the file
169+ with descriptor FD and status ST. */
170+static bool
171+file_must_have_nulls (size_t size, int fd, struct stat const *st)
172+{
173 if (usable_st_size (st))
174 {
175 if (st->st_size <= size)
176- return textbin == TEXTBIN_UNKNOWN ? TEXTBIN_BINARY : textbin;
177+ return false;
178
179 /* If the file has holes, it must contain a null byte somewhere. */
180- if (SEEK_HOLE != SEEK_SET && eolbyte)
181+ if (SEEK_HOLE != SEEK_SET)
182 {
183 off_t cur = size;
184 if (O_BINARY || fd == STDIN_FILENO)
185 {
186 cur = lseek (fd, 0, SEEK_CUR);
187 if (cur < 0)
188- return TEXTBIN_UNKNOWN;
189+ return false;
190 }
191
192 /* Look for a hole after the current location. */
193@@ -612,12 +611,12 @@ file_textbin (char *buf, size_t size, int fd, struct stat const *st)
194 if (lseek (fd, cur, SEEK_SET) < 0)
195 suppressible_error (filename, errno);
196 if (hole_start < st->st_size)
197- return TEXTBIN_BINARY;
198+ return true;
199 }
200 }
201 }
202
203- return TEXTBIN_UNKNOWN;
204+ return false;
205 }
206
207 /* Convert STR to a nonnegative integer, storing the result in *OUT.
208@@ -899,7 +898,7 @@ static char *label = NULL; /* Fake filename for stdin */
209 /* Internal variables to keep track of byte count, context, etc. */
210 static uintmax_t totalcc; /* Total character count before bufbeg. */
211 static char const *lastnl; /* Pointer after last newline counted. */
212-static char const *lastout; /* Pointer after last character output;
213+static char *lastout; /* Pointer after last character output;
214 NULL if no character has been output
215 or if it's conceptually before bufbeg. */
216 static intmax_t outleft; /* Maximum number of lines to be output. */
217@@ -971,10 +970,31 @@ print_offset (uintmax_t pos, int min_width, const char *color)
218 pr_sgr_end_if (color);
219 }
220
221-/* Print a whole line head (filename, line, byte). */
222-static void
223-print_line_head (char const *beg, char const *lim, char sep)
224+/* Print a whole line head (filename, line, byte). The output data
225+ starts at BEG and contains LEN bytes; it is followed by at least
226+ sizeof (uword) bytes, the first of which may be temporarily modified.
227+ The output data comes from what is perhaps a larger input line that
228+ goes until LIM, where LIM[-1] is an end-of-line byte. Use SEP as
229+ the separator on output.
230+
231+ Return true unless the line was suppressed due to an encoding error. */
232+
233+static bool
234+print_line_head (char *beg, size_t len, char const *lim, char sep)
235 {
236+ bool encoding_errors = false;
237+ if (binary_files != TEXT_BINARY_FILES)
238+ {
239+ char ch = beg[len];
240+ encoding_errors = buf_has_encoding_errors (beg, len);
241+ beg[len] = ch;
242+ }
243+ if (encoding_errors)
244+ {
245+ encoding_error_output = done_on_match = out_quiet = true;
246+ return false;
247+ }
248+
249 bool pending_sep = false;
250
251 if (out_file)
252@@ -1021,22 +1041,27 @@ print_line_head (char const *beg, char const *lim, char sep)
253
254 print_sep (sep);
255 }
256+
257+ return true;
258 }
259
260-static const char *
261-print_line_middle (const char *beg, const char *lim,
262+static char *
263+print_line_middle (char *beg, char *lim,
264 const char *line_color, const char *match_color)
265 {
266 size_t match_size;
267 size_t match_offset;
268- const char *cur = beg;
269- const char *mid = NULL;
270-
271- while (cur < lim
272- && ((match_offset = execute (beg, lim - beg, &match_size, cur))
273- != (size_t) -1))
274+ char *cur = beg;
275+ char *mid = NULL;
276+ char *b;
277+
278+ for (cur = beg;
279+ (cur < lim
280+ && ((match_offset = execute (beg, lim - beg, &match_size, cur))
281+ != (size_t) -1));
282+ cur = b + match_size)
283 {
284- char const *b = beg + match_offset;
285+ b = beg + match_offset;
286
287 /* Avoid matching the empty line at the end of the buffer. */
288 if (b == lim)
289@@ -1056,8 +1081,11 @@ print_line_middle (const char *beg, const char *lim,
290 /* This function is called on a matching line only,
291 but is it selected or rejected/context? */
292 if (only_matching)
293- print_line_head (b, lim, (out_invert ? SEP_CHAR_REJECTED
294- : SEP_CHAR_SELECTED));
295+ {
296+ char sep = out_invert ? SEP_CHAR_REJECTED : SEP_CHAR_SELECTED;
297+ if (! print_line_head (b, match_size, lim, sep))
298+ return NULL;
299+ }
300 else
301 {
302 pr_sgr_start (line_color);
303@@ -1075,7 +1103,6 @@ print_line_middle (const char *beg, const char *lim,
304 if (only_matching)
305 fputs ("\n", stdout);
306 }
307- cur = b + match_size;
308 }
309
310 if (only_matching)
311@@ -1086,8 +1113,8 @@ print_line_middle (const char *beg, const char *lim,
312 return cur;
313 }
314
315-static const char *
316-print_line_tail (const char *beg, const char *lim, const char *line_color)
317+static char *
318+print_line_tail (char *beg, const char *lim, const char *line_color)
319 {
320 size_t eol_size;
321 size_t tail_size;
322@@ -1108,14 +1135,15 @@ print_line_tail (const char *beg, const char *lim, const char *line_color)
323 }
324
325 static void
326-prline (char const *beg, char const *lim, char sep)
327+prline (char *beg, char *lim, char sep)
328 {
329 bool matching;
330 const char *line_color;
331 const char *match_color;
332
333 if (!only_matching)
334- print_line_head (beg, lim, sep);
335+ if (! print_line_head (beg, lim - beg - 1, lim, sep))
336+ return;
337
338 matching = (sep == SEP_CHAR_SELECTED) ^ out_invert;
339
340@@ -1135,7 +1163,11 @@ prline (char const *beg, char const *lim, char sep)
341 {
342 /* We already know that non-matching lines have no match (to colorize). */
343 if (matching && (only_matching || *match_color))
344- beg = print_line_middle (beg, lim, line_color, match_color);
345+ {
346+ beg = print_line_middle (beg, lim, line_color, match_color);
347+ if (! beg)
348+ return;
349+ }
350
351 if (!only_matching && *line_color)
352 {
353@@ -1169,7 +1201,7 @@ prpending (char const *lim)
354 lastout = bufbeg;
355 while (pending > 0 && lastout < lim)
356 {
357- char const *nl = memchr (lastout, eolbyte, lim - lastout);
358+ char *nl = memchr (lastout, eolbyte, lim - lastout);
359 size_t match_size;
360 --pending;
361 if (outleft
362@@ -1184,7 +1216,7 @@ prpending (char const *lim)
363
364 /* Output the lines between BEG and LIM. Deal with context. */
365 static void
366-prtext (char const *beg, char const *lim)
367+prtext (char *beg, char *lim)
368 {
369 static bool used; /* Avoid printing SEP_STR_GROUP before any output. */
370 char eol = eolbyte;
371@@ -1192,7 +1224,7 @@ prtext (char const *beg, char const *lim)
372 if (!out_quiet && pending > 0)
373 prpending (beg);
374
375- char const *p = beg;
376+ char *p = beg;
377
378 if (!out_quiet)
379 {
380@@ -1218,7 +1250,7 @@ prtext (char const *beg, char const *lim)
381
382 while (p < beg)
383 {
384- char const *nl = memchr (p, eol, beg - p);
385+ char *nl = memchr (p, eol, beg - p);
386 nl++;
387 prline (p, nl, SEP_CHAR_REJECTED);
388 p = nl;
389@@ -1231,7 +1263,7 @@ prtext (char const *beg, char const *lim)
390 /* One or more lines are output. */
391 for (n = 0; p < lim && n < outleft; n++)
392 {
393- char const *nl = memchr (p, eol, lim - p);
394+ char *nl = memchr (p, eol, lim - p);
395 nl++;
396 if (!out_quiet)
397 prline (p, nl, SEP_CHAR_SELECTED);
398@@ -1278,13 +1310,12 @@ zap_nuls (char *p, char *lim, char eol)
399 between matching lines if OUT_INVERT is true). Return a count of
400 lines printed. Replace all NUL bytes with NUL_ZAPPER as we go. */
401 static intmax_t
402-grepbuf (char const *beg, char const *lim)
403+grepbuf (char *beg, char const *lim)
404 {
405 intmax_t outleft0 = outleft;
406- char const *p;
407- char const *endp;
408+ char *endp;
409
410- for (p = beg; p < lim; p = endp)
411+ for (char *p = beg; p < lim; p = endp)
412 {
413 size_t match_size;
414 size_t match_offset = execute (p, lim - p, &match_size, NULL);
415@@ -1295,15 +1326,15 @@ grepbuf (char const *beg, char const *lim)
416 match_offset = lim - p;
417 match_size = 0;
418 }
419- char const *b = p + match_offset;
420+ char *b = p + match_offset;
421 endp = b + match_size;
422 /* Avoid matching the empty line at the end of the buffer. */
423 if (!out_invert && b == lim)
424 break;
425 if (!out_invert || p < b)
426 {
427- char const *prbeg = out_invert ? p : b;
428- char const *prend = out_invert ? b : endp;
429+ char *prbeg = out_invert ? p : b;
430+ char *prend = out_invert ? b : endp;
431 prtext (prbeg, prend);
432 if (!outleft || done_on_match)
433 {
434@@ -1324,7 +1355,6 @@ static intmax_t
435 grep (int fd, struct stat const *st)
436 {
437 intmax_t nlines, i;
438- enum textbin textbin;
439 size_t residue, save;
440 char oldc;
441 char *beg;
442@@ -1333,6 +1363,7 @@ grep (int fd, struct stat const *st)
443 char nul_zapper = '\0';
444 bool done_on_match_0 = done_on_match;
445 bool out_quiet_0 = out_quiet;
446+ bool has_nulls = false;
447
448 if (! reset (fd, st))
449 return 0;
450@@ -1344,6 +1375,7 @@ grep (int fd, struct stat const *st)
451 after_last_match = 0;
452 pending = 0;
453 skip_nuls = skip_empty_lines && !eol;
454+ encoding_error_output = false;
455 seek_data_failed = false;
456
457 nlines = 0;
458@@ -1356,26 +1388,20 @@ grep (int fd, struct stat const *st)
459 return 0;
460 }
461
462- if (binary_files == TEXT_BINARY_FILES)
463- textbin = TEXTBIN_TEXT;
464- else
465+ for (bool firsttime = true; ; firsttime = false)
466 {
467- textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
468- if (textbin_is_binary (textbin))
469+ if (!has_nulls && eol && binary_files != TEXT_BINARY_FILES
470+ && (buf_has_nulls (bufbeg, buflim - bufbeg)
471+ || (firsttime && file_must_have_nulls (buflim - bufbeg, fd, st))))
472 {
473+ has_nulls = true;
474 if (binary_files == WITHOUT_MATCH_BINARY_FILES)
475 return 0;
476 done_on_match = out_quiet = true;
477 nul_zapper = eol;
478 skip_nuls = skip_empty_lines;
479 }
480- else if (execute != Pexecute)
481- textbin = TEXTBIN_TEXT;
482- }
483
484- for (;;)
485- {
486- input_textbin = textbin;
487 lastnl = bufbeg;
488 if (lastout)
489 lastout = bufbeg;
490@@ -1426,13 +1452,8 @@ grep (int fd, struct stat const *st)
491 }
492
493 /* Detect whether leading context is adjacent to previous output. */
494- if (lastout)
495- {
496- if (textbin == TEXTBIN_UNKNOWN)
497- textbin = TEXTBIN_TEXT;
498- if (beg != lastout)
499- lastout = 0;
500- }
501+ if (beg != lastout)
502+ lastout = 0;
503
504 /* Handle some details and read more data to scan. */
505 save = residue + lim - beg;
506@@ -1445,22 +1466,6 @@ grep (int fd, struct stat const *st)
507 suppressible_error (filename, errno);
508 goto finish_grep;
509 }
510-
511- /* If the file's textbin has not been determined yet, assume
512- it's binary if the next input buffer suggests so. */
513- if (textbin == TEXTBIN_UNKNOWN)
514- {
515- enum textbin tb = buffer_textbin (bufbeg, buflim - bufbeg);
516- if (textbin_is_binary (tb))
517- {
518- if (binary_files == WITHOUT_MATCH_BINARY_FILES)
519- return 0;
520- textbin = tb;
521- done_on_match = out_quiet = true;
522- nul_zapper = eol;
523- skip_nuls = skip_empty_lines;
524- }
525- }
526 }
527 if (residue)
528 {
529@@ -1474,7 +1479,7 @@ grep (int fd, struct stat const *st)
530 finish_grep:
531 done_on_match = done_on_match_0;
532 out_quiet = out_quiet_0;
533- if (textbin_is_binary (textbin) && !out_quiet && nlines != 0)
534+ if ((has_nulls || encoding_error_output) && !out_quiet && nlines != 0)
535 printf (_("Binary file %s matches\n"), filename);
536 return nlines;
537 }
538diff --git a/src/grep.h b/src/grep.h
539index 580eb11..2e4527c 100644
540--- a/src/grep.h
541+++ b/src/grep.h
542@@ -29,22 +29,4 @@ extern bool match_words; /* -w */
543 extern bool match_lines; /* -x */
544 extern char eolbyte; /* -z */
545
546-/* An enum textbin describes the file's type, inferred from data read
547- before the first line is selected for output. */
548-enum textbin
549- {
550- /* Binary, as it contains null bytes and the -z option is not in effect,
551- or it contains encoding errors. */
552- TEXTBIN_BINARY = -1,
553-
554- /* Not known yet. Only text has been seen so far. */
555- TEXTBIN_UNKNOWN = 0,
556-
557- /* Text. */
558- TEXTBIN_TEXT = 1
559- };
560-
561-/* Input file type. */
562-extern enum textbin input_textbin;
563-
564 #endif
565diff --git a/src/pcresearch.c b/src/pcresearch.c
566index dc68345..c403032 100644
567--- a/src/pcresearch.c
568+++ b/src/pcresearch.c
569@@ -194,32 +194,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
570 error. */
571 char const *subject = buf;
572
573- /* If the input type is unknown, the caller is still testing the
574- input, which means the current buffer cannot contain encoding
575- errors and a multiline search is typically more efficient.
576- Otherwise, a single-line search is typically faster, so that
577- pcre_exec doesn't waste time validating the entire input
578- buffer. */
579- bool multiline = input_textbin == TEXTBIN_UNKNOWN;
580-
581 for (; p < buf + size; p = line_start = line_end + 1)
582 {
583- bool too_big;
584-
585- if (multiline)
586- {
587- size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
588- size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
589- line_end = memrchr (p, eolbyte, scan_size);
590- too_big = ! line_end;
591- }
592- else
593- {
594- line_end = memchr (p, eolbyte, buf + size - p);
595- too_big = INT_MAX < line_end - p;
596- }
597-
598- if (too_big)
599+ /* A single-line search is typically faster, so that
600+ pcre_exec doesn't waste time validating the entire input
601+ buffer. */
602+ line_end = memchr (p, eolbyte, buf + size - p);
603+ if (INT_MAX < line_end - p)
604 error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
605
606 for (;;)
607@@ -247,27 +228,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
608 int options = 0;
609 if (!bol)
610 options |= PCRE_NOTBOL;
611- if (multiline)
612- options |= PCRE_NO_UTF8_CHECK;
613
614 e = jit_exec (subject, line_end - subject, search_offset,
615 options, sub);
616 if (e != PCRE_ERROR_BADUTF8)
617- {
618- if (0 < e && multiline && sub[1] - sub[0] != 0)
619- {
620- char const *nl = memchr (subject + sub[0], eolbyte,
621- sub[1] - sub[0]);
622- if (nl)
623- {
624- /* This match crosses a line boundary; reject it. */
625- p = subject + sub[0];
626- line_end = nl;
627- continue;
628- }
629- }
630- break;
631- }
632+ break;
633 int valid_bytes = sub[0];
634
635 /* Try to match the string before the encoding error. */
636@@ -339,15 +304,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
637 beg = matchbeg;
638 end = matchend;
639 }
640- else if (multiline)
641- {
642- char const *prev_nl = memrchr (line_start - 1, eolbyte,
643- matchbeg - (line_start - 1));
644- char const *next_nl = memchr (matchend, eolbyte,
645- line_end + 1 - matchend);
646- beg = prev_nl + 1;
647- end = next_nl + 1;
648- }
649 else
650 {
651 beg = line_start;
652diff --git a/tests/Makefile.am b/tests/Makefile.am
653index 37bb501..f1b8c43 100644
654--- a/tests/Makefile.am
655+++ b/tests/Makefile.am
656@@ -70,6 +70,7 @@ TESTS = \
657 empty \
658 empty-line \
659 empty-line-mb \
660+ encoding-error \
661 epipe \
662 equiv-classes \
663 ere \
664diff --git a/tests/encoding-error b/tests/encoding-error
665new file mode 100755
666index 0000000..fe52de2
667--- a/dev/null
668+++ b/tests/encoding-error
669@@ -0,0 +1,41 @@
670+#! /bin/sh
671+# Test grep's behavior on encoding errors.
672+#
673+# Copyright 2015 Free Software Foundation, Inc.
674+#
675+# Copying and distribution of this file, with or without modification,
676+# are permitted in any medium without royalty provided the copyright
677+# notice and this notice are preserved.
678+
679+. "${srcdir=.}/init.sh"; path_prepend_ ../src
680+
681+require_en_utf8_locale_
682+
683+LC_ALL=en_US.UTF-8
684+export LC_ALL
685+
686+printf 'Alfred Jones\n' > a || framework_failure_
687+printf 'John Smith\n' >j || framework_failure_
688+printf 'Pedro P\xe9rez\n' >p || framework_failure_
689+cat a p j >in || framework_failure_
690+
691+fail=0
692+
693+grep '^A' in >out || fail=1
694+compare a out || fail=1
695+
696+grep '^P' in >out || fail=1
697+printf 'Binary file in matches\n' >exp || framework_failure_
698+compare exp out || fail=1
699+
700+grep '^J' in >out || fail=1
701+compare j out || fail=1
702+
703+grep '^X' in >out
704+test $? = 1 || fail=1
705+compare /dev/null out || fail=1
706+
707+grep -a . in >out || fail=1
708+compare in out
709+
710+Exit $fail
711--
712cgit v0.9.0.2
713From 40ed879db22d57516a31fefd1c39416974b74ec4 Mon Sep 17 00:00:00 2001
714From: Paul Eggert <eggert@cs.ucla.edu>
715Date: Sat, 02 Jan 2016 05:16:12 +0000
716Subject: grep: fix bug with invalid unibyte sequence
717
718This was introduced by the recent binary-data-detection changes.
719Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86
720* src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove,
721replacing with ...
722(uword_max, unibyte_mask, initialize_unibyte_mask): ... this new
723constant, static var, and function. All uses changed. The
724unibyte_mask var generalizes the old local var hibyte_mask, which
725worked only for encodings where every byte with 0x80 turned off is
726a single-byte character.
727(buf_has_encoding_errors): Return false immediately if
728unibyte_mask is zero, not whether the current encoding is unibyte.
729The old test was incorrect in unibyte locales in which some bytes
730were encoding errors.
731* tests/pcre-z: Require UTF-8 locale, since the grep -z . test now
732needs this. Use printf \0 rather than tr. Port the 'grep -z .'
733test to platforms where the C locale says '\200' is an encoding
734error. Use cmp rather than compare, as the file is binary and
735so non-GNU diff might not work.
736* tests/unibyte-binary: New file.
737* tests/Makefile.am (TESTS): Add it.
738---
739diff --git a/src/grep.c b/src/grep.c
740index 1207a76..a5f1fa2 100644
741--- a/src/grep.c
742+++ b/src/grep.c
743@@ -484,21 +484,6 @@ clean_up_stdout (void)
744 close_stdout ();
745 }
746
747-/* The high-order bit of a byte. */
748-enum { HIBYTE = 0x80 };
749-
750-/* True if every byte with HIBYTE off is a single-byte character.
751- UTF-8 has this property. */
752-static bool easy_encoding;
753-
754-static void
755-init_easy_encoding (void)
756-{
757- easy_encoding = true;
758- for (int i = 0; i < HIBYTE; i++)
759- easy_encoding &= mbclen_cache[i] == 1;
760-}
761-
762 /* A cast to TYPE of VAL. Use this when TYPE is a pointer type, VAL
763 is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
764 the alignment and would otherwise complain about the cast. */
765@@ -517,21 +502,33 @@ init_easy_encoding (void)
766 /* An unsigned type suitable for fast matching. */
767 typedef uintmax_t uword;
768
769+/* All bytes that are not unibyte characters, ANDed together, and then
770+ with the pattern repeated to fill a uword. For an encoding where
771+ all bytes are unibyte characters, this is 0. For UTF-8, this is
772+ 0x808080.... For encodings where unibyte characters have no useful
773+ pattern, this is all 1s. The unsigned char C is a unibyte
774+ character if C & UNIBYTE_MASK is zero. If the uword W is the
775+ concatenation of bytes, the bytes are all unibyte characters
776+ if W & UNIBYTE_MASK is zero. */
777+static uword unibyte_mask;
778+
779+static void
780+initialize_unibyte_mask (void)
781+{
782+ unsigned char mask = UCHAR_MAX;
783+ for (int i = 1; i <= UCHAR_MAX; i++)
784+ if (mbclen_cache[i] != 1)
785+ mask &= i;
786+ uword uword_max = -1;
787+ unibyte_mask = uword_max / UCHAR_MAX * mask;
788+}
789+
790 /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
791 that is not easy, and return a pointer to the first non-easy byte.
792- In easy encodings, the easy bytes all have HIBYTE off.
793- In other encodings, no byte is easy. */
794+ The easy bytes all have UNIBYTE_MASK off. */
795 static char const * _GL_ATTRIBUTE_PURE
796 skip_easy_bytes (char const *buf)
797 {
798- if (!easy_encoding)
799- return buf;
800-
801- uword uword_max = -1;
802-
803- /* 0x8080..., extended to be wide enough for uword. */
804- uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
805-
806 /* Search a byte at a time until the pointer is aligned, then a
807 uword at a time until a match is found, then a byte at a time to
808 identify the exact byte. The uword search may go slightly past
809@@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf)
810 char const *p;
811 uword const *s;
812 for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
813- if (*p & HIBYTE)
814+ if (to_uchar (*p) & unibyte_mask)
815 return p;
816- for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
817+ for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
818 continue;
819- for (p = (char const *) s; ! (*p & HIBYTE); p++)
820+ for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
821 continue;
822 return p;
823 }
824@@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf)
825 static bool
826 buf_has_encoding_errors (char *buf, size_t size)
827 {
828- if (MB_CUR_MAX <= 1)
829+ if (! unibyte_mask)
830 return false;
831
832 mbstate_t mbs = { 0 };
833@@ -2592,7 +2589,7 @@ main (int argc, char **argv)
834 usage (EXIT_TROUBLE);
835
836 build_mbclen_cache ();
837- init_easy_encoding ();
838+ initialize_unibyte_mask ();
839
840 /* In a unibyte locale, switch from fgrep to grep if
841 the pattern matches words (where grep is typically faster).
842diff --git a/tests/Makefile.am b/tests/Makefile.am
843index f349aa3..a38303c 100644
844--- a/tests/Makefile.am
845+++ b/tests/Makefile.am
846@@ -133,6 +133,7 @@ TESTS = \
847 turkish-I-without-dot \
848 turkish-eyes \
849 two-files \
850+ unibyte-binary \
851 unibyte-bracket-expr \
852 unibyte-negated-circumflex \
853 utf8-bracket \
854diff --git a/tests/pcre-z b/tests/pcre-z
855index 6bbde94..4ce9a93 100755
856--- a/tests/pcre-z
857+++ b/tests/pcre-z
858@@ -2,10 +2,11 @@
859 # Test Perl regex with NUL-separated input
860 . "${srcdir=.}/init.sh"; path_prepend_ ../src
861 require_pcre_
862+require_en_utf8_locale_
863
864 REGEX=a
865
866-printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
867+printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
868
869 grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
870 compare /dev/null err || fail_ 'stderr not empty on grep -z.'
871@@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
872 compare exp out || fail=1
873 compare /dev/null err || fail=1
874
875-printf '\200\0' >in0
876-LC_ALL=C grep -z . in0 >out || fail=1
877-compare in0 out || fail=1
878+printf '\303\200\0' >in0 # "À" followed by a NUL.
879+LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
880+cmp in0 out || fail=1
881
882 Exit $fail
883diff --git a/tests/unibyte-binary b/tests/unibyte-binary
884new file mode 100755
885index 0000000..78735b8
886--- a/dev/null
887+++ b/tests/unibyte-binary
888@@ -0,0 +1,28 @@
889+#!/bin/sh
890+# Test binary files in unibyte locales with encoding errors
891+
892+# Copyright 2016 Free Software Foundation, Inc.
893+
894+# This program is free software: you can redistribute it and/or modify
895+# it under the terms of the GNU General Public License as published by
896+# the Free Software Foundation, either version 3 of the License, or
897+# (at your option) any later version.
898+
899+# This program is distributed in the hope that it will be useful,
900+# but WITHOUT ANY WARRANTY; without even the implied warranty of
901+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
902+# GNU General Public License for more details.
903+
904+# You should have received a copy of the GNU General Public License
905+# along with this program. If not, see <http://www.gnu.org/licenses/>.
906+
907+. "${srcdir=.}/init.sh"; path_prepend_ ../src
908+require_unibyte_locale
909+
910+fail=0
911+
912+printf 'a\n\200\nb\n' >in || framework_failure_
913+printf 'a\nBinary file in matches\n' >exp || framework_failure_
914+grep . in >out || fail=1
915+compare exp out || fail=1
916+Exit $fail
917--
918cgit v0.9.0.2
This page took 0.198485 seconds and 4 git commands to generate.