1 --- coreutils-6.7/src/fmt.c.orig 2006-10-22 18:54:15.000000000 +0200
2 +++ coreutils-6.7/src/fmt.c 2007-02-13 16:51:44.000000000 +0100
4 /* Written by Ross Paterson <rap@doc.ic.ac.uk>. */
12 /* The following parameters represent the program's idea of what is
13 "best". Adjust to taste, subject to the caveats given. */
15 -/* Default longest permitted line length (max_width). */
16 +/* Default longest permitted line width (max_width). */
19 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
23 /* Costs and bonuses are expressed as the equivalent departure from the
24 - optimal line length, multiplied by 10. e.g. assigning something a
25 + optimal line width, multiplied by 10. e.g. assigning something a
26 cost of 50 means that it is as bad as a line 5 characters too short
27 or too long. The definition of SHORT_COST(n) should not be changed.
28 However, EQUIV(n) may need tuning. */
30 #define LINE_COST EQUIV (70)
32 /* Cost of breaking a line after the first word of a sentence, where
33 - the length of the word is N. */
34 + the width of the word is N. */
35 #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
37 /* Cost of breaking a line before the last word of a sentence, where
38 - the length of the word is N. */
39 + the width of the word is N. */
40 #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
42 /* Bonus for breaking a line at the end of a sentence. */
47 +/* Wide character support */
50 +xgetwc (FILE *stream)
52 + wint_t c = getwc (stream);
53 + if (c == WEOF && ferror (stream))
54 + error (EXIT_FAILURE, errno, _("read error"));
59 +xwcwidth (wchar_t wc)
61 + int w = wcwidth (wc);
62 + return w < 0 ? 0 : w;
65 /* Extra ctype(3)-style macros. */
67 -#define isopen(c) (strchr ("([`'\"", c) != NULL)
68 -#define isclose(c) (strchr (")]'\"", c) != NULL)
69 -#define isperiod(c) (strchr (".?!", c) != NULL)
71 + (wcschr (L"([`'\"\u2018\u201A\u201B\u201C\u201E\u201F", c) != NULL)
72 +#define isclose(c) (wcschr (L")]'\"\u2018\u2019\u201C\u201D", c) != NULL)
73 +#define isperiod(c) (wcschr (L".?!", c) != NULL)
75 /* Size of a tab stop, for expansion on input and re-introduction on
79 /* Static attributes determined during input. */
81 - const char *text; /* the text of the word */
82 - int length; /* length of this word */
83 + const wchar_t *text; /* the text of the word */
84 + int length; /* length of this word, in characters */
85 + int width; /* width of this word, in columns */
86 int space; /* the size of the following space */
87 unsigned int paren:1; /* starts with open paren */
88 unsigned int period:1; /* ends in [.?!])* */
91 /* The remaining fields are computed during the optimization. */
93 - int line_length; /* length of the best line starting here */
94 + int line_width; /* width of the best line starting here */
95 COST best_cost; /* cost of best paragraph starting here */
96 WORD *next_break; /* break which achieves best_cost */
99 static void set_prefix (char *p);
100 static void fmt (FILE *f);
101 static bool get_paragraph (FILE *f);
102 -static int get_line (FILE *f, int c);
103 -static int get_prefix (FILE *f);
104 -static int get_space (FILE *f, int c);
105 -static int copy_rest (FILE *f, int c);
106 -static bool same_para (int c);
107 +static wint_t get_line (FILE *f, wint_t c);
108 +static wint_t get_prefix (FILE *f);
109 +static wint_t get_space (FILE *f, wint_t c);
110 +static wint_t copy_rest (FILE *f, wint_t c);
111 +static bool same_para (wint_t c);
112 static void flush_paragraph (void);
113 static void fmt_paragraph (void);
114 static void check_punctuation (WORD *w);
115 static COST base_cost (WORD *this);
116 -static COST line_cost (WORD *next, int len);
117 +static COST line_cost (WORD *next, int wid);
118 static void put_paragraph (WORD *finish);
119 static void put_line (WORD *w, int indent);
120 static void put_word (WORD *w);
122 /* If true, don't preserve inter-word spacing (default false). */
125 +/* How many spaces to put after a sentence (1 or 2). */
126 +static int sentence_space = 2;
128 /* Prefix minus leading and trailing spaces (default ""). */
129 -static const char *prefix;
130 +static wchar_t *prefix;
132 /* User-supplied maximum line width (default WIDTH). The only output
133 lines longer than this will each comprise a single word. */
134 @@ -194,14 +218,14 @@
136 /* Values derived from the option values. */
138 -/* The length of prefix minus leading space. */
139 -static int prefix_full_length;
140 +/* The width of prefix minus leading space. */
141 +static int prefix_full_width;
143 -/* The length of the leading space trimmed from the prefix. */
144 +/* The width of the leading space trimmed from the prefix. */
145 static int prefix_lead_space;
147 -/* The length of prefix minus leading and trailing space. */
148 -static int prefix_length;
149 +/* The width of prefix minus leading and trailing space. */
150 +static int prefix_width;
152 /* The preferred width of text lines, set to LEEWAY % less than max_width. */
153 static int best_width;
154 @@ -216,10 +240,10 @@
156 /* Space for the paragraph text -- longer paragraphs are handled neatly
157 (cf. flush_paragraph()). */
158 -static char parabuf[MAXCHARS];
159 +static wchar_t parabuf[MAXCHARS];
161 /* A pointer into parabuf, indicating the first unused character position. */
163 +static wchar_t *wptr;
165 /* The words of a paragraph -- longer paragraphs are handled neatly
166 (cf. flush_paragraph()). */
167 @@ -251,16 +275,16 @@
168 prefix (next_prefix_indent). See get_paragraph() and copy_rest(). */
170 /* The last character read from the input file. */
171 -static int next_char;
172 +static wint_t next_char;
174 /* The space before the trimmed prefix (or part of it) on the next line
175 after the current paragraph. */
176 static int next_prefix_indent;
178 -/* If nonzero, the length of the last line output in the current
179 +/* If nonzero, the width of the last line output in the current
180 paragraph, used to charge for raggedness at the split point for long
181 paragraphs chosen by fmt_paragraph(). */
182 -static int last_line_length;
183 +static int last_line_width;
189 -t, --tagged-paragraph indentation of first line different from second\n\
190 -u, --uniform-spacing one space between words, two after sentences\n\
191 + -n, --single-spacing one space between words and after sentences\n\
192 -w, --width=WIDTH maximum line width (default of 75 columns)\n\
194 fputs (HELP_OPTION_DESCRIPTION, stdout);
196 {"split-only", no_argument, NULL, 's'},
197 {"tagged-paragraph", no_argument, NULL, 't'},
198 {"uniform-spacing", no_argument, NULL, 'u'},
199 + {"single-spacing", no_argument, NULL, 'n'},
200 {"width", required_argument, NULL, 'w'},
201 {GETOPT_HELP_OPTION_DECL},
202 {GETOPT_VERSION_OPTION_DECL},
205 crown = tagged = split = uniform = false;
208 - prefix_length = prefix_lead_space = prefix_full_length = 0;
210 + prefix_width = prefix_lead_space = prefix_full_width = 0;
212 if (argc > 1 && argv[1][0] == '-' && ISDIGIT (argv[1][1]))
218 - while ((optchar = getopt_long (argc, argv, "0123456789cstuw:p:",
219 + while ((optchar = getopt_long (argc, argv, "0123456789cstunw:p:",
227 + sentence_space = 2;
232 + sentence_space = 1;
236 @@ -440,26 +472,32 @@
239 /* Trim space from the front and back of the string P, yielding the prefix,
240 - and record the lengths of the prefix and the space trimmed. */
241 + and record the widths of the prefix and the space trimmed. */
250 prefix_lead_space = 0;
258 - prefix_full_length = strlen (p);
259 - s = p + prefix_full_length;
260 - while (s > p && s[-1] == ' ')
263 - prefix_length = s - p;
264 + len = mbsrtowcs (NULL, (const char **) &p, 0, NULL);
265 + prefix = xmalloc ((len + 1) * sizeof (wchar_t));
266 + mbsrtowcs (prefix, (const char **) &p, len + 1, NULL);
267 + for (s = prefix; *s; s++)
268 + prefix_full_width += xwcwidth (*s);
269 + prefix_width = prefix_full_width;
270 + while (s > prefix && s[-1] == L' ')
278 /* read file F and send formatted output to stdout. */
279 @@ -528,24 +566,24 @@
281 get_paragraph (FILE *f)
286 - last_line_length = 0;
287 + last_line_width = 0;
290 /* Scan (and copy) blank lines, and lines not introduced by the prefix. */
292 - while (c == '\n' || c == EOF
293 + while (c == L'\n' || c == WEOF
294 || next_prefix_indent < prefix_lead_space
295 - || in_column < next_prefix_indent + prefix_full_length)
296 + || in_column < next_prefix_indent + prefix_full_width)
298 c = copy_rest (f, c);
311 @@ -601,23 +639,23 @@
312 that failed to match the prefix. In the latter, C is \n or EOF.
313 Return the character (\n or EOF) ending the line. */
316 -copy_rest (FILE *f, int c)
318 +copy_rest (FILE *f, wint_t c)
324 - if (in_column > next_prefix_indent && c != '\n' && c != EOF)
325 + if (in_column > next_prefix_indent && c != L'\n' && c != WEOF)
327 put_space (next_prefix_indent);
328 for (s = prefix; out_column != in_column && *s; out_column++)
331 put_space (in_column - out_column);
333 - while (c != '\n' && c != EOF)
334 + while (c != L'\n' && c != WEOF)
343 @@ -627,11 +665,11 @@
348 +same_para (wint_t c)
350 return (next_prefix_indent == prefix_indent
351 - && in_column >= next_prefix_indent + prefix_full_length
352 - && c != '\n' && c != EOF);
353 + && in_column >= next_prefix_indent + prefix_full_width
354 + && c != L'\n' && c != WEOF);
357 /* Read a line from input file F, given first non-blank character C
358 @@ -642,11 +680,11 @@
360 Return the first non-blank character of the next line. */
363 -get_line (FILE *f, int c)
365 +get_line (FILE *f, wint_t c)
368 - char *end_of_parabuf;
369 + wchar_t *end_of_parabuf;
372 end_of_parabuf = &parabuf[MAXCHARS];
376 word_limit->text = wptr;
377 + word_limit->width = 0;
380 if (wptr == end_of_parabuf)
381 @@ -666,10 +705,12 @@
386 + word_limit->width += xwcwidth (c);
389 - while (c != EOF && !isspace (c));
390 - in_column += word_limit->length = wptr - word_limit->text;
391 + while (c != WEOF && !iswspace (c));
392 + word_limit->length = wptr - word_limit->text;
393 + in_column += word_limit->width;
394 check_punctuation (word_limit);
396 /* Scan inter-word space. */
397 @@ -677,48 +718,48 @@
399 c = get_space (f, c);
400 word_limit->space = in_column - start;
401 - word_limit->final = (c == EOF
402 + word_limit->final = (c == WEOF
403 || (word_limit->period
404 - && (c == '\n' || word_limit->space > 1)));
405 - if (c == '\n' || c == EOF || uniform)
406 - word_limit->space = word_limit->final ? 2 : 1;
407 + && (c == L'\n' || word_limit->space > 1)));
408 + if (c == L'\n' || c == WEOF || uniform)
409 + word_limit->space = word_limit->final ? sentence_space : 1;
410 if (word_limit == end_of_word)
412 set_other_indent (true);
422 + while (c != L'\n');
423 return get_prefix (f);
426 /* Read a prefix from input file F. Return either first non-matching
427 character, or first non-blank character after the prefix. */
437 - c = get_space (f, getc (f));
438 - if (prefix_length == 0)
439 + c = get_space (f, xgetwc (f));
440 + if (prefix_width == 0)
441 next_prefix_indent = prefix_lead_space < in_column ?
442 prefix_lead_space : in_column;
447 next_prefix_indent = in_column;
448 - for (p = prefix; *p != '\0'; p++)
449 + for (p = prefix; *p != L'\0'; p++)
451 - unsigned char pc = *p;
459 c = get_space (f, c);
461 @@ -728,21 +769,21 @@
462 /* Read blank characters from input file F, starting with C, and keeping
463 in_column up-to-date. Return first non-blank character. */
466 -get_space (FILE *f, int c)
468 +get_space (FILE *f, wint_t c)
475 - else if (c == '\t')
476 + else if (c == L'\t')
479 in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
490 check_punctuation (WORD *w)
492 - char const *start = w->text;
493 - char const *finish = start + (w->length - 1);
494 - unsigned char fin = *finish;
495 + wchar_t const *start = w->text;
496 + wchar_t const *finish = start + (w->length - 1);
497 + wchar_t fin = *finish;
499 w->paren = isopen (*start);
500 w->punct = !! ispunct (fin);
503 if (word_limit == word)
505 - fwrite (parabuf, sizeof *parabuf, wptr - parabuf, stdout);
507 + for (outptr = parabuf; outptr < wptr; outptr++)
508 + putwchar (*outptr);
513 /* Copy text of words down to start of parabuf -- we use memmove because
514 the source and target may overlap. */
516 - memmove (parabuf, split_point->text, wptr - split_point->text);
517 + memmove (parabuf, split_point->text,
518 + (wptr - split_point->text) * sizeof (wchar_t));
519 shift = split_point->text - parabuf;
522 @@ -833,53 +877,53 @@
532 word_limit->best_cost = 0;
533 - saved_length = word_limit->length;
534 - word_limit->length = max_width; /* sentinel */
535 + saved_width = word_limit->width;
536 + word_limit->width = max_width; /* sentinel */
538 for (start = word_limit - 1; start >= word; start--)
541 - len = start == word ? first_indent : other_indent;
542 + wid = start == word ? first_indent : other_indent;
544 /* At least one word, however long, in the line. */
553 /* Consider breaking before w. */
555 - wcost = line_cost (w, len) + w->best_cost;
556 - if (start == word && last_line_length > 0)
557 - wcost += RAGGED_COST (len - last_line_length);
558 + wcost = line_cost (w, wid) + w->best_cost;
559 + if (start == word && last_line_width > 0)
560 + wcost += RAGGED_COST (wid - last_line_width);
564 start->next_break = w;
565 - start->line_length = len;
566 + start->line_width = wid;
569 - /* This is a kludge to keep us from computing `len' as the
570 - sum of the sentinel length and some non-zero number.
571 - Since the sentinel w->length may be INT_MAX, adding
572 + /* This is a kludge to keep us from computing `wid' as the
573 + sum of the sentinel width and some non-zero number.
574 + Since the sentinel w->width may be INT_MAX, adding
575 to that would give a negative result. */
579 - len += (w - 1)->space + w->length; /* w > start >= word */
580 + wid += (w - 1)->space + w->width; /* w > start >= word */
582 - while (len < max_width);
583 + while (wid < max_width);
584 start->best_cost = best + base_cost (start);
587 - word_limit->length = saved_length;
588 + word_limit->width = saved_width;
591 /* Return the constant component of the cost of breaking before the
592 @@ -904,33 +948,33 @@
593 else if ((this - 1)->punct)
595 else if (this > word + 1 && (this - 2)->final)
596 - cost += WIDOW_COST ((this - 1)->length);
597 + cost += WIDOW_COST ((this - 1)->width);
602 else if (this->final)
603 - cost += ORPHAN_COST (this->length);
604 + cost += ORPHAN_COST (this->width);
609 /* Return the component of the cost of breaking before word NEXT that
610 - depends on LEN, the length of the line beginning there. */
611 + depends on WID, the width of the line beginning there. */
614 -line_cost (WORD *next, int len)
615 +line_cost (WORD *next, int wid)
620 if (next == word_limit)
622 - n = best_width - len;
623 + n = best_width - wid;
624 cost = SHORT_COST (n);
625 if (next->next_break != word_limit)
627 - n = len - next->line_length;
628 + n = wid - next->line_width;
629 cost += RAGGED_COST (n);
635 put_space (prefix_indent);
636 - fputs (prefix, stdout);
637 - out_column += prefix_length;
638 + fputws (prefix, stdout);
639 + out_column += prefix_width;
640 put_space (indent - out_column);
642 endline = w->next_break - 1;
644 put_space (w->space);
647 - last_line_length = out_column;
649 + last_line_width = out_column;
653 /* Output to stdout the word W. */
654 @@ -979,13 +1023,13 @@
663 for (n = w->length; n != 0; n--)
665 - out_column += w->length;
667 + out_column += w->width;
670 /* Output to stdout SPACE spaces, or equivalent tabs. */
671 @@ -1002,13 +1046,13 @@
672 if (out_column + 1 < tab_target)
673 while (out_column < tab_target)
677 out_column = (out_column / TABWIDTH + 1) * TABWIDTH;
680 while (out_column < space_target)