7.1.310

   1 To: vim-dev@vim.org
   2 Subject: Patch 7.1.310
   3 Fcc: outbox
   4 From: Bram Moolenaar <Bram@moolenaar.net>
   5 Mime-Version: 1.0
   6 Content-Type: text/plain; charset=ISO-8859-1
   7 Content-Transfer-Encoding: 8bit
   8 ------------
   9
  10 Patch 7.1.310
  11 Problem:    An incomplete UTF-8 byte sequence at the end of the file is not
  12             detected.  Memory that wasn't written can be accessed.
  13 Solution:   Check that the last bytes in the buffer form a valid UTF-8
  14             character. (mostly by Ben Schmidt)
  15             Also fix the wrong line number being reported for the error.
  16 Files:      src/fileio.c
  17
  18
  19 *** ../vim-7.1.309/src/fileio.c Wed May  7 19:05:55 2008
  20 --- src/fileio.c        Wed Jun  4 18:28:48 2008
  21 ***************
  22 *** 1288,1299 ****
  23   #ifdef FEAT_MBYTE
  24                     else if (conv_restlen > 0)
  25                     {
  26 !                       /* Reached end-of-file but some trailing bytes could
  27 !                        * not be converted.  Truncated file? */
  28 !                       if (conv_error == 0)
  29 !                           conv_error = linecnt;
  30 !                       if (bad_char_behavior != BAD_DROP)
  31                         {
  32                             fio_flags = 0;      /* don't convert this */
  33   # ifdef USE_ICONV
  34                             if (iconv_fd != (iconv_t)-1)
  35 --- 1288,1336 ----
  36   #ifdef FEAT_MBYTE
  37                     else if (conv_restlen > 0)
  38                     {
  39 !                       /*
  40 !                        * Reached end-of-file but some trailing bytes could
  41 !                        * not be converted.  Truncated file?
  42 !                        */
  43 !
  44 !                       /* When we did a conversion report an error. */
  45 !                       if (fio_flags != 0
  46 ! # ifdef USE_ICONV
  47 !                               || iconv_fd != (iconv_t)-1
  48 ! # endif
  49 !                          )
  50                         {
  51 +                           if (conv_error == 0)
  52 +                               conv_error = curbuf->b_ml.ml_line_count
  53 +                                                               - linecnt + 1;
  54 +                       }
  55 +                       /* Remember the first linenr with an illegal byte */
  56 +                       else if (illegal_byte == 0)
  57 +                           illegal_byte = curbuf->b_ml.ml_line_count
  58 +                                                               - linecnt + 1;
  59 +                       if (bad_char_behavior == BAD_DROP)
  60 +                       {
  61 +                           *(ptr - conv_restlen) = NUL;
  62 +                           conv_restlen = 0;
  63 +                       }
  64 +                       else
  65 +                       {
  66 +                           /* Replace the trailing bytes with the replacement
  67 +                            * character if we were converting; if we weren't,
  68 +                            * leave the UTF8 checking code to do it, as it
  69 +                            * works slightly differently. */
  70 +                           if (bad_char_behavior != BAD_KEEP && (fio_flags != 0
  71 + # ifdef USE_ICONV
  72 +                                   || iconv_fd != (iconv_t)-1
  73 + # endif
  74 +                              ))
  75 +                           {
  76 +                               while (conv_restlen > 0)
  77 +                               {
  78 +                                   *(--ptr) = bad_char_behavior;
  79 +                                   --conv_restlen;
  80 +                               }
  81 +                           }
  82                             fio_flags = 0;      /* don't convert this */
  83   # ifdef USE_ICONV
  84                             if (iconv_fd != (iconv_t)-1)
  85 ***************
  86 *** 1302,1321 ****
  87                                 iconv_fd = (iconv_t)-1;
  88                             }
  89   # endif
  90 -                           if (bad_char_behavior == BAD_KEEP)
  91 -                           {
  92 -                               /* Keep the trailing bytes as-is. */
  93 -                               size = conv_restlen;
  94 -                               ptr -= conv_restlen;
  95 -                           }
  96 -                           else
  97 -                           {
  98 -                               /* Replace the trailing bytes with the
  99 -                                * replacement character. */
 100 -                               size = 1;
 101 -                               *--ptr = bad_char_behavior;
 102 -                           }
 103 -                           conv_restlen = 0;
 104                         }
 105                     }
 106   #endif
 107 --- 1339,1344 ----
 108 ***************
 109 *** 1397,1402 ****
 110 --- 1420,1430 ----
 111                     goto retry;
 112                 }
 113             }
 114 +
 115 +           /* Include not converted bytes. */
 116 +           ptr -= conv_restlen;
 117 +           size += conv_restlen;
 118 +           conv_restlen = 0;
 119   #endif
 120             /*
 121              * Break here for a read error or end-of-file.
 122 ***************
 123 *** 1406,1416 ****
 124
 125   #ifdef FEAT_MBYTE
 126
 127 -           /* Include not converted bytes. */
 128 -           ptr -= conv_restlen;
 129 -           size += conv_restlen;
 130 -           conv_restlen = 0;
 131 -
 132   # ifdef USE_ICONV
 133             if (iconv_fd != (iconv_t)-1)
 134             {
 135 --- 1434,1439 ----
 136 ***************
 137 *** 1872,1883 ****
 138                 size = (long)((ptr + real_size) - dest);
 139                 ptr = dest;
 140             }
 141 !           else if (enc_utf8 && conv_error == 0 && !curbuf->b_p_bin)
 142             {
 143 !               /* Reading UTF-8: Check if the bytes are valid UTF-8.
 144 !                * Need to start before "ptr" when part of the character was
 145 !                * read in the previous read() call. */
 146 !               for (p = ptr - utf_head_off(buffer, ptr); ; ++p)
 147                 {
 148                     int  todo = (int)((ptr + size) - p);
 149                     int  l;
 150 --- 1895,1906 ----
 151                 size = (long)((ptr + real_size) - dest);
 152                 ptr = dest;
 153             }
 154 !           else if (enc_utf8 && !curbuf->b_p_bin)
 155             {
 156 !               int  incomplete_tail = FALSE;
 157 !
 158 !               /* Reading UTF-8: Check if the bytes are valid UTF-8. */
 159 !               for (p = ptr; ; ++p)
 160                 {
 161                     int  todo = (int)((ptr + size) - p);
 162                     int  l;
 163 ***************
 164 *** 1891,1933 ****
 165                          * read() will get the next bytes, we'll check it
 166                          * then. */
 167                         l = utf_ptr2len_len(p, todo);
 168 !                       if (l > todo)
 169                         {
 170 !                           /* Incomplete byte sequence, the next read()
 171 !                            * should get them and check the bytes. */
 172 !                           p += todo;
 173 !                           break;
 174                         }
 175 !                       if (l == 1)
 176                         {
 177                             /* Illegal byte.  If we can try another encoding
 178 !                            * do that. */
 179 !                           if (can_retry)
 180                                 break;
 181 -
 182 -                           /* Remember the first linenr with an illegal byte */
 183 -                           if (illegal_byte == 0)
 184 -                               illegal_byte = readfile_linenr(linecnt, ptr, p);
 185   # ifdef USE_ICONV
 186                             /* When we did a conversion report an error. */
 187                             if (iconv_fd != (iconv_t)-1 && conv_error == 0)
 188                                 conv_error = readfile_linenr(linecnt, ptr, p);
 189   # endif
 190
 191                             /* Drop, keep or replace the bad byte. */
 192                             if (bad_char_behavior == BAD_DROP)
 193                             {
 194 !                               mch_memmove(p, p+1, todo - 1);
 195                                 --p;
 196                                 --size;
 197                             }
 198                             else if (bad_char_behavior != BAD_KEEP)
 199                                 *p = bad_char_behavior;
 200                         }
 201 !                       p += l - 1;
 202                     }
 203                 }
 204 !               if (p < ptr + size)
 205                 {
 206                     /* Detected a UTF-8 error. */
 207   rewind_retry:
 208 --- 1914,1969 ----
 209                          * read() will get the next bytes, we'll check it
 210                          * then. */
 211                         l = utf_ptr2len_len(p, todo);
 212 !                       if (l > todo && !incomplete_tail)
 213                         {
 214 !                           /* Avoid retrying with a different encoding when
 215 !                            * a truncated file is more likely, or attempting
 216 !                            * to read the rest of an incomplete sequence when
 217 !                            * we have already done so. */
 218 !                           if (p > ptr || filesize > 0)
 219 !                               incomplete_tail = TRUE;
 220 !                           /* Incomplete byte sequence, move it to conv_rest[]
 221 !                            * and try to read the rest of it, unless we've
 222 !                            * already done so. */
 223 !                           if (p > ptr)
 224 !                           {
 225 !                               conv_restlen = todo;
 226 !                               mch_memmove(conv_rest, p, conv_restlen);
 227 !                               size -= conv_restlen;
 228 !                               break;
 229 !                           }
 230                         }
 231 !                       if (l == 1 || l > todo)
 232                         {
 233                             /* Illegal byte.  If we can try another encoding
 234 !                            * do that, unless at EOF where a truncated
 235 !                            * file is more likely than a conversion error. */
 236 !                           if (can_retry && !incomplete_tail)
 237                                 break;
 238   # ifdef USE_ICONV
 239                             /* When we did a conversion report an error. */
 240                             if (iconv_fd != (iconv_t)-1 && conv_error == 0)
 241                                 conv_error = readfile_linenr(linecnt, ptr, p);
 242   # endif
 243 +                           /* Remember the first linenr with an illegal byte */
 244 +                           if (conv_error == 0 && illegal_byte == 0)
 245 +                               illegal_byte = readfile_linenr(linecnt, ptr, p);
 246
 247                             /* Drop, keep or replace the bad byte. */
 248                             if (bad_char_behavior == BAD_DROP)
 249                             {
 250 !                               mch_memmove(p, p + 1, todo - 1);
 251                                 --p;
 252                                 --size;
 253                             }
 254                             else if (bad_char_behavior != BAD_KEEP)
 255                                 *p = bad_char_behavior;
 256                         }
 257 !                       else
 258 !                           p += l - 1;
 259                     }
 260                 }
 261 !               if (p < ptr + size && !incomplete_tail)
 262                 {
 263                     /* Detected a UTF-8 error. */
 264   rewind_retry:
 265 *** ../vim-7.1.309/src/version.c        Wed Jun  4 15:27:43 2008
 266 --- src/version.c       Wed Jun  4 19:35:16 2008
 267 ***************
 268 *** 668,669 ****
 269 --- 673,676 ----
 270   {   /* Add new patch number below this line */
 271 + /**/
 272 +     310,
 273   /**/
 274
 275 --
 276 Normal people believe that if it ain't broke, don't fix it.  Engineers believe
 277 that if it ain't broke, it doesn't have enough features yet.
 278                                 (Scott Adams - The Dilbert principle)
 279
 280  /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net   \\\
 281 ///        sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
 282 \\\        download, build and distribute -- http://www.A-A-P.org        ///
 283  \\\            help me help AIDS victims -- http://ICCF-Holland.org    ///