grep-enc-errors.patch

   1 From 8521001643bc6a28c760552824eaea5ecee0aa8c Mon Sep 17 00:00:00 2001
   2 From: Paul Eggert <eggert@cs.ucla.edu>
   3 Date: Thu, 31 Dec 2015 03:10:14 +0000
   4 Subject: grep: be less picky about encoding errors
   5
   6 This fixes a longstanding problem introduced in grep 2.21,
   7 which is overly picky about binary files.
   8 * NEWS:
   9 * doc/grep.texi (File and Directory Selection): Document this.
  10 * src/grep.c (input_textbin, textbin_is_binary, buffer_textbin)
  11 (file_textbin):
  12 Remove.  All uses removed.
  13 (encoding_error_output): New static var.
  14 (buf_has_encoding_errors, buf_has_nulls, file_must_have_nulls):
  15 New functions, which reuse bits
  16 and pieces of the removed functions.
  17 (lastout, print_line_head, print_line_middle, print_line_tail, prline)
  18 (prpending, prtext, grepbuf):
  19 Avoid use of const, now that we have
  20 functions that require modifying a sentinel.
  21 (print_line_head): New arg LEN.  All uses changed.
  22 (print_line_head, print_line_tail):
  23 Return indicator whether the output line was printed.
  24 All uses changed.
  25 (print_line_middle): Exit early on encoding error.
  26 (grep): Use new method for determining whether file is binary.
  27 * src/grep.h (enum textbin, TEXTBIN_BINARY, TEXTBIN_UNKNOWN)
  28 (TEXTBIN_TEXT, input_textbin): Remove decls.  All uses removed.
  29 * src/pcresearch.c (Pexecute): Remove multiline optimization,
  30 since the main program no longer checks for encoding errors on input.
  31 * tests/encoding-error: New file.
  32 * tests/Makefile.am (TESTS): Add it.
  33 ---
  34 diff --git a/doc/grep.texi b/doc/grep.texi
  35 index 76c7f46..58e7f48 100644
  36 --- a/doc/grep.texi
  37 +++ b/doc/grep.texi
  38 @@ -596,13 +596,13 @@ If a file's allocation metadata,
  39  or if its data read before a line is selected for output,
  40  indicate that the file contains binary data,
  41  assume that the file is of type @var{type}.
  42 -Non-text bytes indicate binary data; these are either data bytes
  43 -improperly encoded for the current locale, or null bytes when the
  44 +Non-text bytes indicate binary data; these are either output bytes that are
  45 +improperly encoded for the current locale, or null input bytes when the
  46  @option{-z} (@option{--null-data}) option is not given (@pxref{Other
  47  Options}).
  48
  49 -By default, @var{type} is @samp{binary},
  50 -and @command{grep} normally outputs either
  51 +By default, @var{type} is @samp{binary}, and when @command{grep}
  52 +discovers that a file is binary it normally outputs either
  53  a one-line message saying that a binary file matches,
  54  or no message if there is no match.
  55  When processing binary data, @command{grep} may treat non-text bytes
  56 @@ -611,7 +611,8 @@ not match a null byte, as the null byte might be treated as a line
  57  terminator even without the @option{-z} (@option{--null-data}) option.
  58
  59  If @var{type} is @samp{without-match},
  60 -@command{grep} assumes that a binary file does not match;
  61 +when @command{grep} discovers that a file is binary
  62 +it assumes that the rest of the file does not match;
  63  this is equivalent to the @option{-I} option.
  64
  65  If @var{type} is @samp{text},
  66 diff --git a/src/grep.c b/src/grep.c
  67 index 19ba208..e059a46 100644
  68 --- a/src/grep.c
  69 +++ b/src/grep.c
  70 @@ -377,7 +377,6 @@ bool match_icase;
  71  bool match_words;
  72  bool match_lines;
  73  char eolbyte;
  74 -enum textbin input_textbin;
  75
  76  static char const *matcher;
  77
  78 @@ -389,6 +388,10 @@ static bool omit_dot_slash;
  79  static bool errseen;
  80  static bool write_error_seen;
  81
  82 +/* True if output from the current input file has been suppressed
  83 +   because an output line had an encoding error.  */
  84 +static bool encoding_error_output;
  85 +
  86  enum directories_type
  87    {
  88      READ_DIRECTORIES = 2,
  89 @@ -481,12 +484,6 @@ clean_up_stdout (void)
  90      close_stdout ();
  91  }
  92
  93 -static bool
  94 -textbin_is_binary (enum textbin textbin)
  95 -{
  96 -  return textbin < TEXTBIN_UNKNOWN;
  97 -}
  98 -
  99  /* The high-order bit of a byte.  */
 100  enum { HIBYTE = 0x80 };
 101
 102 @@ -551,58 +548,60 @@ skip_easy_bytes (char const *buf)
 103    return p;
 104  }
 105
 106 -/* Return the text type of data in BUF, of size SIZE.
 107 +/* Return true if BUF, of size SIZE, has an encoding error.
 108     BUF must be followed by at least sizeof (uword) bytes,
 109 -   which may be arbitrarily written to or read from.  */
 110 -static enum textbin
 111 -buffer_textbin (char *buf, size_t size)
 112 +   the first of which may be modified.  */
 113 +static bool
 114 +buf_has_encoding_errors (char *buf, size_t size)
 115  {
 116 -  if (eolbyte && memchr (buf, '\0', size))
 117 -    return TEXTBIN_BINARY;
 118 +  if (MB_CUR_MAX <= 1)
 119 +    return false;
 120
 121 -  if (1 < MB_CUR_MAX)
 122 -    {
 123 -      mbstate_t mbs = { 0 };
 124 -      size_t clen;
 125 -      char const *p;
 126 +  mbstate_t mbs = { 0 };
 127 +  size_t clen;
 128
 129 -      buf[size] = -1;
 130 -      for (p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
 131 -        {
 132 -          clen = mbrlen (p, buf + size - p, &mbs);
 133 -          if ((size_t) -2 <= clen)
 134 -            return clen == (size_t) -2 ? TEXTBIN_UNKNOWN : TEXTBIN_BINARY;
 135 -        }
 136 +  buf[size] = -1;
 137 +  for (char const *p = buf; (p = skip_easy_bytes (p)) < buf + size; p += clen)
 138 +    {
 139 +      clen = mbrlen (p, buf + size - p, &mbs);
 140 +      if ((size_t) -2 <= clen)
 141 +        return true;
 142      }
 143
 144 -  return TEXTBIN_TEXT;
 145 +  return false;
 146  }
 147
 148 -/* Return the text type of a file.  BUF, of size SIZE, is the initial
 149 -   buffer read from the file with descriptor FD and status ST.
 150 -   BUF must be followed by at least sizeof (uword) bytes,
 151 +
 152 +/* Return true if BUF, of size SIZE, has a null byte.
 153 +   BUF must be followed by at least one byte,
 154     which may be arbitrarily written to or read from.  */
 155 -static enum textbin
 156 -file_textbin (char *buf, size_t size, int fd, struct stat const *st)
 157 +static bool
 158 +buf_has_nulls (char *buf, size_t size)
 159  {
 160 -  enum textbin textbin = buffer_textbin (buf, size);
 161 -  if (textbin_is_binary (textbin))
 162 -    return textbin;
 163 +  buf[size] = 0;
 164 +  return strlen (buf) != size;
 165 +}
 166
 167 +/* Return true if a file is known to contain null bytes.
 168 +   SIZE bytes have already been read from the file
 169 +   with descriptor FD and status ST.  */
 170 +static bool
 171 +file_must_have_nulls (size_t size, int fd, struct stat const *st)
 172 +{
 173    if (usable_st_size (st))
 174      {
 175        if (st->st_size <= size)
 176 -        return textbin == TEXTBIN_UNKNOWN ? TEXTBIN_BINARY : textbin;
 177 +        return false;
 178
 179        /* If the file has holes, it must contain a null byte somewhere.  */
 180 -      if (SEEK_HOLE != SEEK_SET && eolbyte)
 181 +      if (SEEK_HOLE != SEEK_SET)
 182          {
 183            off_t cur = size;
 184            if (O_BINARY || fd == STDIN_FILENO)
 185              {
 186                cur = lseek (fd, 0, SEEK_CUR);
 187                if (cur < 0)
 188 -                return TEXTBIN_UNKNOWN;
 189 +                return false;
 190              }
 191
 192            /* Look for a hole after the current location.  */
 193 @@ -612,12 +611,12 @@ file_textbin (char *buf, size_t size, int fd, struct stat const *st)
 194                if (lseek (fd, cur, SEEK_SET) < 0)
 195                  suppressible_error (filename, errno);
 196                if (hole_start < st->st_size)
 197 -                return TEXTBIN_BINARY;
 198 +                return true;
 199              }
 200          }
 201      }
 202
 203 -  return TEXTBIN_UNKNOWN;
 204 +  return false;
 205  }
 206
 207  /* Convert STR to a nonnegative integer, storing the result in *OUT.
 208 @@ -899,7 +898,7 @@ static char *label = NULL;      /* Fake filename for stdin */
 209  /* Internal variables to keep track of byte count, context, etc. */
 210  static uintmax_t totalcc;      /* Total character count before bufbeg. */
 211  static char const *lastnl;     /* Pointer after last newline counted. */
 212 -static char const *lastout;    /* Pointer after last character output;
 213 +static char *lastout;          /* Pointer after last character output;
 214                                     NULL if no character has been output
 215                                     or if it's conceptually before bufbeg. */
 216  static intmax_t outleft;       /* Maximum number of lines to be output.  */
 217 @@ -971,10 +970,31 @@ print_offset (uintmax_t pos, int min_width, const char *color)
 218    pr_sgr_end_if (color);
 219  }
 220
 221 -/* Print a whole line head (filename, line, byte).  */
 222 -static void
 223 -print_line_head (char const *beg, char const *lim, char sep)
 224 +/* Print a whole line head (filename, line, byte).  The output data
 225 +   starts at BEG and contains LEN bytes; it is followed by at least
 226 +   sizeof (uword) bytes, the first of which may be temporarily modified.
 227 +   The output data comes from what is perhaps a larger input line that
 228 +   goes until LIM, where LIM[-1] is an end-of-line byte.  Use SEP as
 229 +   the separator on output.
 230 +
 231 +   Return true unless the line was suppressed due to an encoding error.  */
 232 +
 233 +static bool
 234 +print_line_head (char *beg, size_t len, char const *lim, char sep)
 235  {
 236 +  bool encoding_errors = false;
 237 +  if (binary_files != TEXT_BINARY_FILES)
 238 +    {
 239 +      char ch = beg[len];
 240 +      encoding_errors = buf_has_encoding_errors (beg, len);
 241 +      beg[len] = ch;
 242 +    }
 243 +  if (encoding_errors)
 244 +    {
 245 +      encoding_error_output = done_on_match = out_quiet = true;
 246 +      return false;
 247 +    }
 248 +
 249    bool pending_sep = false;
 250
 251    if (out_file)
 252 @@ -1021,22 +1041,27 @@ print_line_head (char const *beg, char const *lim, char sep)
 253
 254        print_sep (sep);
 255      }
 256 +
 257 +  return true;
 258  }
 259
 260 -static const char *
 261 -print_line_middle (const char *beg, const char *lim,
 262 +static char *
 263 +print_line_middle (char *beg, char *lim,
 264                     const char *line_color, const char *match_color)
 265  {
 266    size_t match_size;
 267    size_t match_offset;
 268 -  const char *cur = beg;
 269 -  const char *mid = NULL;
 270 -
 271 -  while (cur < lim
 272 -         && ((match_offset = execute (beg, lim - beg, &match_size, cur))
 273 -             != (size_t) -1))
 274 +  char *cur = beg;
 275 +  char *mid = NULL;
 276 +  char *b;
 277 +
 278 +  for (cur = beg;
 279 +       (cur < lim
 280 +        && ((match_offset = execute (beg, lim - beg, &match_size, cur))
 281 +            != (size_t) -1));
 282 +       cur = b + match_size)
 283      {
 284 -      char const *b = beg + match_offset;
 285 +      b = beg + match_offset;
 286
 287        /* Avoid matching the empty line at the end of the buffer. */
 288        if (b == lim)
 289 @@ -1056,8 +1081,11 @@ print_line_middle (const char *beg, const char *lim,
 290            /* This function is called on a matching line only,
 291               but is it selected or rejected/context?  */
 292            if (only_matching)
 293 -            print_line_head (b, lim, (out_invert ? SEP_CHAR_REJECTED
 294 -                                      : SEP_CHAR_SELECTED));
 295 +            {
 296 +              char sep = out_invert ? SEP_CHAR_REJECTED : SEP_CHAR_SELECTED;
 297 +              if (! print_line_head (b, match_size, lim, sep))
 298 +                return NULL;
 299 +            }
 300            else
 301              {
 302                pr_sgr_start (line_color);
 303 @@ -1075,7 +1103,6 @@ print_line_middle (const char *beg, const char *lim,
 304            if (only_matching)
 305              fputs ("\n", stdout);
 306          }
 307 -      cur = b + match_size;
 308      }
 309
 310    if (only_matching)
 311 @@ -1086,8 +1113,8 @@ print_line_middle (const char *beg, const char *lim,
 312    return cur;
 313  }
 314
 315 -static const char *
 316 -print_line_tail (const char *beg, const char *lim, const char *line_color)
 317 +static char *
 318 +print_line_tail (char *beg, const char *lim, const char *line_color)
 319  {
 320    size_t eol_size;
 321    size_t tail_size;
 322 @@ -1108,14 +1135,15 @@ print_line_tail (const char *beg, const char *lim, const char *line_color)
 323  }
 324
 325  static void
 326 -prline (char const *beg, char const *lim, char sep)
 327 +prline (char *beg, char *lim, char sep)
 328  {
 329    bool matching;
 330    const char *line_color;
 331    const char *match_color;
 332
 333    if (!only_matching)
 334 -    print_line_head (beg, lim, sep);
 335 +    if (! print_line_head (beg, lim - beg - 1, lim, sep))
 336 +      return;
 337
 338    matching = (sep == SEP_CHAR_SELECTED) ^ out_invert;
 339
 340 @@ -1135,7 +1163,11 @@ prline (char const *beg, char const *lim, char sep)
 341      {
 342        /* We already know that non-matching lines have no match (to colorize). */
 343        if (matching && (only_matching || *match_color))
 344 -        beg = print_line_middle (beg, lim, line_color, match_color);
 345 +        {
 346 +          beg = print_line_middle (beg, lim, line_color, match_color);
 347 +          if (! beg)
 348 +            return;
 349 +        }
 350
 351        if (!only_matching && *line_color)
 352          {
 353 @@ -1169,7 +1201,7 @@ prpending (char const *lim)
 354      lastout = bufbeg;
 355    while (pending > 0 && lastout < lim)
 356      {
 357 -      char const *nl = memchr (lastout, eolbyte, lim - lastout);
 358 +      char *nl = memchr (lastout, eolbyte, lim - lastout);
 359        size_t match_size;
 360        --pending;
 361        if (outleft
 362 @@ -1184,7 +1216,7 @@ prpending (char const *lim)
 363
 364  /* Output the lines between BEG and LIM.  Deal with context.  */
 365  static void
 366 -prtext (char const *beg, char const *lim)
 367 +prtext (char *beg, char *lim)
 368  {
 369    static bool used;    /* Avoid printing SEP_STR_GROUP before any output.  */
 370    char eol = eolbyte;
 371 @@ -1192,7 +1224,7 @@ prtext (char const *beg, char const *lim)
 372    if (!out_quiet && pending > 0)
 373      prpending (beg);
 374
 375 -  char const *p = beg;
 376 +  char *p = beg;
 377
 378    if (!out_quiet)
 379      {
 380 @@ -1218,7 +1250,7 @@ prtext (char const *beg, char const *lim)
 381
 382        while (p < beg)
 383          {
 384 -          char const *nl = memchr (p, eol, beg - p);
 385 +          char *nl = memchr (p, eol, beg - p);
 386            nl++;
 387            prline (p, nl, SEP_CHAR_REJECTED);
 388            p = nl;
 389 @@ -1231,7 +1263,7 @@ prtext (char const *beg, char const *lim)
 390        /* One or more lines are output.  */
 391        for (n = 0; p < lim && n < outleft; n++)
 392          {
 393 -          char const *nl = memchr (p, eol, lim - p);
 394 +          char *nl = memchr (p, eol, lim - p);
 395            nl++;
 396            if (!out_quiet)
 397              prline (p, nl, SEP_CHAR_SELECTED);
 398 @@ -1278,13 +1310,12 @@ zap_nuls (char *p, char *lim, char eol)
 399     between matching lines if OUT_INVERT is true).  Return a count of
 400     lines printed.  Replace all NUL bytes with NUL_ZAPPER as we go.  */
 401  static intmax_t
 402 -grepbuf (char const *beg, char const *lim)
 403 +grepbuf (char *beg, char const *lim)
 404  {
 405    intmax_t outleft0 = outleft;
 406 -  char const *p;
 407 -  char const *endp;
 408 +  char *endp;
 409
 410 -  for (p = beg; p < lim; p = endp)
 411 +  for (char *p = beg; p < lim; p = endp)
 412      {
 413        size_t match_size;
 414        size_t match_offset = execute (p, lim - p, &match_size, NULL);
 415 @@ -1295,15 +1326,15 @@ grepbuf (char const *beg, char const *lim)
 416            match_offset = lim - p;
 417            match_size = 0;
 418          }
 419 -      char const *b = p + match_offset;
 420 +      char *b = p + match_offset;
 421        endp = b + match_size;
 422        /* Avoid matching the empty line at the end of the buffer. */
 423        if (!out_invert && b == lim)
 424          break;
 425        if (!out_invert || p < b)
 426          {
 427 -          char const *prbeg = out_invert ? p : b;
 428 -          char const *prend = out_invert ? b : endp;
 429 +          char *prbeg = out_invert ? p : b;
 430 +          char *prend = out_invert ? b : endp;
 431            prtext (prbeg, prend);
 432            if (!outleft || done_on_match)
 433              {
 434 @@ -1324,7 +1355,6 @@ static intmax_t
 435  grep (int fd, struct stat const *st)
 436  {
 437    intmax_t nlines, i;
 438 -  enum textbin textbin;
 439    size_t residue, save;
 440    char oldc;
 441    char *beg;
 442 @@ -1333,6 +1363,7 @@ grep (int fd, struct stat const *st)
 443    char nul_zapper = '\0';
 444    bool done_on_match_0 = done_on_match;
 445    bool out_quiet_0 = out_quiet;
 446 +  bool has_nulls = false;
 447
 448    if (! reset (fd, st))
 449      return 0;
 450 @@ -1344,6 +1375,7 @@ grep (int fd, struct stat const *st)
 451    after_last_match = 0;
 452    pending = 0;
 453    skip_nuls = skip_empty_lines && !eol;
 454 +  encoding_error_output = false;
 455    seek_data_failed = false;
 456
 457    nlines = 0;
 458 @@ -1356,26 +1388,20 @@ grep (int fd, struct stat const *st)
 459        return 0;
 460      }
 461
 462 -  if (binary_files == TEXT_BINARY_FILES)
 463 -    textbin = TEXTBIN_TEXT;
 464 -  else
 465 +  for (bool firsttime = true; ; firsttime = false)
 466      {
 467 -      textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
 468 -      if (textbin_is_binary (textbin))
 469 +      if (!has_nulls && eol && binary_files != TEXT_BINARY_FILES
 470 +          && (buf_has_nulls (bufbeg, buflim - bufbeg)
 471 +              || (firsttime && file_must_have_nulls (buflim - bufbeg, fd, st))))
 472          {
 473 +          has_nulls = true;
 474            if (binary_files == WITHOUT_MATCH_BINARY_FILES)
 475              return 0;
 476            done_on_match = out_quiet = true;
 477            nul_zapper = eol;
 478            skip_nuls = skip_empty_lines;
 479          }
 480 -      else if (execute != Pexecute)
 481 -        textbin = TEXTBIN_TEXT;
 482 -    }
 483
 484 -  for (;;)
 485 -    {
 486 -      input_textbin = textbin;
 487        lastnl = bufbeg;
 488        if (lastout)
 489          lastout = bufbeg;
 490 @@ -1426,13 +1452,8 @@ grep (int fd, struct stat const *st)
 491          }
 492
 493        /* Detect whether leading context is adjacent to previous output.  */
 494 -      if (lastout)
 495 -        {
 496 -          if (textbin == TEXTBIN_UNKNOWN)
 497 -            textbin = TEXTBIN_TEXT;
 498 -          if (beg != lastout)
 499 -            lastout = 0;
 500 -        }
 501 +      if (beg != lastout)
 502 +        lastout = 0;
 503
 504        /* Handle some details and read more data to scan.  */
 505        save = residue + lim - beg;
 506 @@ -1445,22 +1466,6 @@ grep (int fd, struct stat const *st)
 507            suppressible_error (filename, errno);
 508            goto finish_grep;
 509          }
 510 -
 511 -      /* If the file's textbin has not been determined yet, assume
 512 -         it's binary if the next input buffer suggests so.  */
 513 -      if (textbin == TEXTBIN_UNKNOWN)
 514 -        {
 515 -          enum textbin tb = buffer_textbin (bufbeg, buflim - bufbeg);
 516 -          if (textbin_is_binary (tb))
 517 -            {
 518 -              if (binary_files == WITHOUT_MATCH_BINARY_FILES)
 519 -                return 0;
 520 -              textbin = tb;
 521 -              done_on_match = out_quiet = true;
 522 -              nul_zapper = eol;
 523 -              skip_nuls = skip_empty_lines;
 524 -            }
 525 -        }
 526      }
 527    if (residue)
 528      {
 529 @@ -1474,7 +1479,7 @@ grep (int fd, struct stat const *st)
 530   finish_grep:
 531    done_on_match = done_on_match_0;
 532    out_quiet = out_quiet_0;
 533 -  if (textbin_is_binary (textbin) && !out_quiet && nlines != 0)
 534 +  if ((has_nulls || encoding_error_output) && !out_quiet && nlines != 0)
 535      printf (_("Binary file %s matches\n"), filename);
 536    return nlines;
 537  }
 538 diff --git a/src/grep.h b/src/grep.h
 539 index 580eb11..2e4527c 100644
 540 --- a/src/grep.h
 541 +++ b/src/grep.h
 542 @@ -29,22 +29,4 @@ extern bool match_words;     /* -w */
 543  extern bool match_lines;       /* -x */
 544  extern char eolbyte;           /* -z */
 545
 546 -/* An enum textbin describes the file's type, inferred from data read
 547 -   before the first line is selected for output.  */
 548 -enum textbin
 549 -  {
 550 -    /* Binary, as it contains null bytes and the -z option is not in effect,
 551 -       or it contains encoding errors.  */
 552 -    TEXTBIN_BINARY = -1,
 553 -
 554 -    /* Not known yet.  Only text has been seen so far.  */
 555 -    TEXTBIN_UNKNOWN = 0,
 556 -
 557 -    /* Text.  */
 558 -    TEXTBIN_TEXT = 1
 559 -  };
 560 -
 561 -/* Input file type.  */
 562 -extern enum textbin input_textbin;
 563 -
 564  #endif
 565 diff --git a/src/pcresearch.c b/src/pcresearch.c
 566 index dc68345..c403032 100644
 567 --- a/src/pcresearch.c
 568 +++ b/src/pcresearch.c
 569 @@ -194,32 +194,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
 570       error.  */
 571    char const *subject = buf;
 572
 573 -  /* If the input type is unknown, the caller is still testing the
 574 -     input, which means the current buffer cannot contain encoding
 575 -     errors and a multiline search is typically more efficient.
 576 -     Otherwise, a single-line search is typically faster, so that
 577 -     pcre_exec doesn't waste time validating the entire input
 578 -     buffer.  */
 579 -  bool multiline = input_textbin == TEXTBIN_UNKNOWN;
 580 -
 581    for (; p < buf + size; p = line_start = line_end + 1)
 582      {
 583 -      bool too_big;
 584 -
 585 -      if (multiline)
 586 -        {
 587 -          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
 588 -          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
 589 -          line_end = memrchr (p, eolbyte, scan_size);
 590 -          too_big = ! line_end;
 591 -        }
 592 -      else
 593 -        {
 594 -          line_end = memchr (p, eolbyte, buf + size - p);
 595 -          too_big = INT_MAX < line_end - p;
 596 -        }
 597 -
 598 -      if (too_big)
 599 +      /* A single-line search is typically faster, so that
 600 +         pcre_exec doesn't waste time validating the entire input
 601 +         buffer.  */
 602 +      line_end = memchr (p, eolbyte, buf + size - p);
 603 +      if (INT_MAX < line_end - p)
 604          error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 605
 606        for (;;)
 607 @@ -247,27 +228,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
 608            int options = 0;
 609            if (!bol)
 610              options |= PCRE_NOTBOL;
 611 -          if (multiline)
 612 -            options |= PCRE_NO_UTF8_CHECK;
 613
 614            e = jit_exec (subject, line_end - subject, search_offset,
 615                          options, sub);
 616            if (e != PCRE_ERROR_BADUTF8)
 617 -            {
 618 -              if (0 < e && multiline && sub[1] - sub[0] != 0)
 619 -                {
 620 -                  char const *nl = memchr (subject + sub[0], eolbyte,
 621 -                                           sub[1] - sub[0]);
 622 -                  if (nl)
 623 -                    {
 624 -                      /* This match crosses a line boundary; reject it.  */
 625 -                      p = subject + sub[0];
 626 -                      line_end = nl;
 627 -                      continue;
 628 -                    }
 629 -                }
 630 -              break;
 631 -            }
 632 +            break;
 633            int valid_bytes = sub[0];
 634
 635            /* Try to match the string before the encoding error.  */
 636 @@ -339,15 +304,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
 637            beg = matchbeg;
 638            end = matchend;
 639          }
 640 -      else if (multiline)
 641 -        {
 642 -          char const *prev_nl = memrchr (line_start - 1, eolbyte,
 643 -                                         matchbeg - (line_start - 1));
 644 -          char const *next_nl = memchr (matchend, eolbyte,
 645 -                                        line_end + 1 - matchend);
 646 -          beg = prev_nl + 1;
 647 -          end = next_nl + 1;
 648 -        }
 649        else
 650          {
 651            beg = line_start;
 652 diff --git a/tests/Makefile.am b/tests/Makefile.am
 653 index 37bb501..f1b8c43 100644
 654 --- a/tests/Makefile.am
 655 +++ b/tests/Makefile.am
 656 @@ -70,6 +70,7 @@ TESTS =                                               \
 657    empty                                                \
 658    empty-line                                   \
 659    empty-line-mb                                        \
 660 +  encoding-error                               \
 661    epipe                                                \
 662    equiv-classes                                        \
 663    ere                                          \
 664 diff --git a/tests/encoding-error b/tests/encoding-error
 665 new file mode 100755
 666 index 0000000..fe52de2
 667 --- /dev/null
 668 +++ b/tests/encoding-error
 669 @@ -0,0 +1,41 @@
 670 +#! /bin/sh
 671 +# Test grep's behavior on encoding errors.
 672 +#
 673 +# Copyright 2015 Free Software Foundation, Inc.
 674 +#
 675 +# Copying and distribution of this file, with or without modification,
 676 +# are permitted in any medium without royalty provided the copyright
 677 +# notice and this notice are preserved.
 678 +
 679 +. "${srcdir=.}/init.sh"; path_prepend_ ../src
 680 +
 681 +require_en_utf8_locale_
 682 +
 683 +LC_ALL=en_US.UTF-8
 684 +export LC_ALL
 685 +
 686 +printf 'Alfred Jones\n' > a || framework_failure_
 687 +printf 'John Smith\n' >j || framework_failure_
 688 +printf 'Pedro P\xe9rez\n' >p || framework_failure_
 689 +cat a p j >in || framework_failure_
 690 +
 691 +fail=0
 692 +
 693 +grep '^A' in >out || fail=1
 694 +compare a out || fail=1
 695 +
 696 +grep '^P' in >out || fail=1
 697 +printf 'Binary file in matches\n' >exp || framework_failure_
 698 +compare exp out || fail=1
 699 +
 700 +grep '^J' in >out || fail=1
 701 +compare j out || fail=1
 702 +
 703 +grep '^X' in >out
 704 +test $? = 1 || fail=1
 705 +compare /dev/null out || fail=1
 706 +
 707 +grep -a . in >out || fail=1
 708 +compare in out
 709 +
 710 +Exit $fail
 711 --
 712 cgit v0.9.0.2
 713 From 40ed879db22d57516a31fefd1c39416974b74ec4 Mon Sep 17 00:00:00 2001
 714 From: Paul Eggert <eggert@cs.ucla.edu>
 715 Date: Sat, 02 Jan 2016 05:16:12 +0000
 716 Subject: grep: fix bug with invalid unibyte sequence
 717
 718 This was introduced by the recent binary-data-detection changes.
 719 Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86
 720 * src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove,
 721 replacing with ...
 722 (uword_max, unibyte_mask, initialize_unibyte_mask): ... this new
 723 constant, static var, and function.  All uses changed.  The
 724 unibyte_mask var generalizes the old local var hibyte_mask, which
 725 worked only for encodings where every byte with 0x80 turned off is
 726 a single-byte character.
 727 (buf_has_encoding_errors): Return false immediately if
 728 unibyte_mask is zero, rather than whenever the current encoding is unibyte.
 729 The old test was incorrect in unibyte locales in which some bytes
 730 were encoding errors.
 731 * tests/pcre-z: Require UTF-8 locale, since the grep -z . test now
 732 needs this.  Use printf \0 rather than tr.  Port the 'grep -z .'
 733 test to platforms where the C locale says '\200' is an encoding
 734 error.  Use cmp rather than compare, as the file is binary and
 735 so non-GNU diff might not work.
 736 * tests/unibyte-binary: New file.
 737 * tests/Makefile.am (TESTS): Add it.
 738 ---
 739 diff --git a/src/grep.c b/src/grep.c
 740 index 1207a76..a5f1fa2 100644
 741 --- a/src/grep.c
 742 +++ b/src/grep.c
 743 @@ -484,21 +484,6 @@ clean_up_stdout (void)
 744      close_stdout ();
 745  }
 746
 747 -/* The high-order bit of a byte.  */
 748 -enum { HIBYTE = 0x80 };
 749 -
 750 -/* True if every byte with HIBYTE off is a single-byte character.
 751 -   UTF-8 has this property.  */
 752 -static bool easy_encoding;
 753 -
 754 -static void
 755 -init_easy_encoding (void)
 756 -{
 757 -  easy_encoding = true;
 758 -  for (int i = 0; i < HIBYTE; i++)
 759 -    easy_encoding &= mbclen_cache[i] == 1;
 760 -}
 761 -
 762  /* A cast to TYPE of VAL.  Use this when TYPE is a pointer type, VAL
 763     is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer
 764     the alignment and would otherwise complain about the cast.  */
 765 @@ -517,21 +502,33 @@ init_easy_encoding (void)
 766  /* An unsigned type suitable for fast matching.  */
 767  typedef uintmax_t uword;
 768
 769 +/* All bytes that are not unibyte characters, ANDed together, and then
 770 +   with the pattern repeated to fill a uword.  For an encoding where
 771 +   all bytes are unibyte characters, this is 0.  For UTF-8, this is
 772 +   0x808080....  For encodings where unibyte characters have no useful
 773 +   pattern, this is all 1s.  The unsigned char C is a unibyte
 774 +   character if C & UNIBYTE_MASK is zero.  If the uword W is the
 775 +   concatenation of bytes, the bytes are all unibyte characters
 776 +   if W & UNIBYTE_MASK is zero.  */
 777 +static uword unibyte_mask;
 778 +
 779 +static void
 780 +initialize_unibyte_mask (void)
 781 +{
 782 +  unsigned char mask = UCHAR_MAX;
 783 +  for (int i = 1; i <= UCHAR_MAX; i++)
 784 +    if (mbclen_cache[i] != 1)
 785 +      mask &= i;
 786 +  uword uword_max = -1;
 787 +  unibyte_mask = uword_max / UCHAR_MAX * mask;
 788 +}
 789 +
 790  /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel
 791     that is not easy, and return a pointer to the first non-easy byte.
 792 -   In easy encodings, the easy bytes all have HIBYTE off.
 793 -   In other encodings, no byte is easy.  */
 794 +   The easy bytes all have UNIBYTE_MASK off.  */
 795  static char const * _GL_ATTRIBUTE_PURE
 796  skip_easy_bytes (char const *buf)
 797  {
 798 -  if (!easy_encoding)
 799 -    return buf;
 800 -
 801 -  uword uword_max = -1;
 802 -
 803 -  /* 0x8080..., extended to be wide enough for uword.  */
 804 -  uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE;
 805 -
 806    /* Search a byte at a time until the pointer is aligned, then a
 807       uword at a time until a match is found, then a byte at a time to
 808       identify the exact byte.  The uword search may go slightly past
 809 @@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf)
 810    char const *p;
 811    uword const *s;
 812    for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++)
 813 -    if (*p & HIBYTE)
 814 +    if (to_uchar (*p) & unibyte_mask)
 815        return p;
 816 -  for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++)
 817 +  for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++)
 818      continue;
 819 -  for (p = (char const *) s; ! (*p & HIBYTE); p++)
 820 +  for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++)
 821      continue;
 822    return p;
 823  }
 824 @@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf)
 825  static bool
 826  buf_has_encoding_errors (char *buf, size_t size)
 827  {
 828 -  if (MB_CUR_MAX <= 1)
 829 +  if (! unibyte_mask)
 830      return false;
 831
 832    mbstate_t mbs = { 0 };
 833 @@ -2592,7 +2589,7 @@ main (int argc, char **argv)
 834      usage (EXIT_TROUBLE);
 835
 836    build_mbclen_cache ();
 837 -  init_easy_encoding ();
 838 +  initialize_unibyte_mask ();
 839
 840    /* In a unibyte locale, switch from fgrep to grep if
 841       the pattern matches words (where grep is typically faster).
 842 diff --git a/tests/Makefile.am b/tests/Makefile.am
 843 index f349aa3..a38303c 100644
 844 --- a/tests/Makefile.am
 845 +++ b/tests/Makefile.am
 846 @@ -133,6 +133,7 @@ TESTS =                                             \
 847    turkish-I-without-dot                                \
 848    turkish-eyes                                 \
 849    two-files                                    \
 850 +  unibyte-binary                               \
 851    unibyte-bracket-expr                         \
 852    unibyte-negated-circumflex                   \
 853    utf8-bracket                                 \
 854 diff --git a/tests/pcre-z b/tests/pcre-z
 855 index 6bbde94..4ce9a93 100755
 856 --- a/tests/pcre-z
 857 +++ b/tests/pcre-z
 858 @@ -2,10 +2,11 @@
 859  # Test Perl regex with NUL-separated input
 860  . "${srcdir=.}/init.sh"; path_prepend_ ../src
 861  require_pcre_
 862 +require_en_utf8_locale_
 863
 864  REGEX=a
 865
 866 -printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in
 867 +printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_
 868
 869  grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.'
 870  compare /dev/null err || fail_ 'stderr not empty on grep -z.'
 871 @@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1
 872  compare exp out || fail=1
 873  compare /dev/null err || fail=1
 874
 875 -printf '\200\0' >in0
 876 -LC_ALL=C grep -z . in0 >out || fail=1
 877 -compare in0 out || fail=1
 878 +printf '\303\200\0' >in0 # "À" followed by a NUL.
 879 +LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1
 880 +cmp in0 out || fail=1
 881
 882  Exit $fail
 883 diff --git a/tests/unibyte-binary b/tests/unibyte-binary
 884 new file mode 100755
 885 index 0000000..78735b8
 886 --- /dev/null
 887 +++ b/tests/unibyte-binary
 888 @@ -0,0 +1,28 @@
 889 +#!/bin/sh
 890 +# Test binary files in unibyte locales with encoding errors
 891 +
 892 +# Copyright 2016 Free Software Foundation, Inc.
 893 +
 894 +# This program is free software: you can redistribute it and/or modify
 895 +# it under the terms of the GNU General Public License as published by
 896 +# the Free Software Foundation, either version 3 of the License, or
 897 +# (at your option) any later version.
 898 +
 899 +# This program is distributed in the hope that it will be useful,
 900 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
 901 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 902 +# GNU General Public License for more details.
 903 +
 904 +# You should have received a copy of the GNU General Public License
 905 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 906 +
 907 +. "${srcdir=.}/init.sh"; path_prepend_ ../src
 908 +require_unibyte_locale
 909 +
 910 +fail=0
 911 +
 912 +printf 'a\n\200\nb\n' >in || framework_failure_
 913 +printf 'a\nBinary file in matches\n' >exp || framework_failure_
 914 +grep . in >out || fail=1
 915 +compare exp out || fail=1
 916 +Exit $fail
 917 --
 918 cgit v0.9.0.2