git.pld-linux.org Git - packages/coreutils.git/blame - coreutils-fmt-wchars.patch
- coreutils-fmt-wchars.patch: Added support for multibyte encodings
[packages/coreutils.git] / coreutils-fmt-wchars.patch
Commit Line Data
29623d34 1--- coreutils-6.7/src/fmt.c.orig 2006-10-22 18:54:15.000000000 +0200
2+++ coreutils-6.7/src/fmt.c 2007-02-13 16:51:44.000000000 +0100
3@@ -18,6 +18,7 @@
4 /* Written by Ross Paterson <rap@doc.ic.ac.uk>. */
5
6 #include <config.h>
7+#include <wchar.h>
8 #include <stdio.h>
9 #include <sys/types.h>
10 #include <getopt.h>
11@@ -39,7 +40,7 @@
12 /* The following parameters represent the program's idea of what is
13 "best". Adjust to taste, subject to the caveats given. */
14
15-/* Default longest permitted line length (max_width). */
16+/* Default longest permitted line width (max_width). */
17 #define WIDTH 75
18
19 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
20@@ -51,7 +52,7 @@
21 #define DEF_INDENT 3
22
23 /* Costs and bonuses are expressed as the equivalent departure from the
24- optimal line length, multiplied by 10. e.g. assigning something a
25+ optimal line width, multiplied by 10. e.g. assigning something a
26 cost of 50 means that it is as bad as a line 5 characters too short
27 or too long. The definition of SHORT_COST(n) should not be changed.
28 However, EQUIV(n) may need tuning. */
29@@ -78,11 +79,11 @@
30 #define LINE_COST EQUIV (70)
31
32 /* Cost of breaking a line after the first word of a sentence, where
33- the length of the word is N. */
34+ the width of the word is N. */
35 #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
36
37 /* Cost of breaking a line before the last word of a sentence, where
38- the length of the word is N. */
39+ the width of the word is N. */
40 #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
41
42 /* Bonus for breaking a line at the end of a sentence. */
43@@ -114,11 +115,30 @@
44 #define MAXWORDS 1000
45 #define MAXCHARS 5000
46
47+/* Wide character support */
48+
49+static wint_t
50+xgetwc (FILE *stream)
51+{
52+ wint_t c = getwc (stream);
53+ if (c == WEOF && ferror (stream))
54+ error (EXIT_FAILURE, errno, _("read error"));
55+ return c;
56+}
57+
58+static inline int
59+xwcwidth (wchar_t wc)
60+{
61+ int w = wcwidth (wc);
62+ return w < 0 ? 0 : w;
63+}
64+
65 /* Extra ctype(3)-style macros. */
66
67-#define isopen(c) (strchr ("([`'\"", c) != NULL)
68-#define isclose(c) (strchr (")]'\"", c) != NULL)
69-#define isperiod(c) (strchr (".?!", c) != NULL)
70+#define isopen(c) \
71+ (wcschr (L"([`'\"\u2018\u201A\u201B\u201C\u201E\u201F", c) != NULL)
72+#define isclose(c) (wcschr (L")]'\"\u2018\u2019\u201C\u201D", c) != NULL)
73+#define isperiod(c) (wcschr (L".?!", c) != NULL)
74
75 /* Size of a tab stop, for expansion on input and re-introduction on
76 output. */
77@@ -133,8 +153,9 @@
78
79 /* Static attributes determined during input. */
80
81- const char *text; /* the text of the word */
82- int length; /* length of this word */
83+ const wchar_t *text; /* the text of the word */
84+ int length; /* length of this word, in characters */
85+ int width; /* width of this word, in columns */
86 int space; /* the size of the following space */
87 unsigned int paren:1; /* starts with open paren */
88 unsigned int period:1; /* ends in [.?!])* */
89@@ -143,7 +164,7 @@
90
91 /* The remaining fields are computed during the optimization. */
92
93- int line_length; /* length of the best line starting here */
94+ int line_width; /* width of the best line starting here */
95 COST best_cost; /* cost of best paragraph starting here */
96 WORD *next_break; /* break which achieves best_cost */
97 };
98@@ -153,16 +174,16 @@
99 static void set_prefix (char *p);
100 static void fmt (FILE *f);
101 static bool get_paragraph (FILE *f);
102-static int get_line (FILE *f, int c);
103-static int get_prefix (FILE *f);
104-static int get_space (FILE *f, int c);
105-static int copy_rest (FILE *f, int c);
106-static bool same_para (int c);
107+static wint_t get_line (FILE *f, wint_t c);
108+static wint_t get_prefix (FILE *f);
109+static wint_t get_space (FILE *f, wint_t c);
110+static wint_t copy_rest (FILE *f, wint_t c);
111+static bool same_para (wint_t c);
112 static void flush_paragraph (void);
113 static void fmt_paragraph (void);
114 static void check_punctuation (WORD *w);
115 static COST base_cost (WORD *this);
116-static COST line_cost (WORD *next, int len);
117+static COST line_cost (WORD *next, int wid);
118 static void put_paragraph (WORD *finish);
119 static void put_line (WORD *w, int indent);
120 static void put_word (WORD *w);
121@@ -185,8 +206,11 @@
122 /* If true, don't preserve inter-word spacing (default false). */
123 static bool uniform;
124
125+/* Spaces to put after a sentence: 2 by default and with -u, 1 with -n. */
126+static int sentence_space = 2;
127+
128 /* Prefix minus leading and trailing spaces (default ""). */
129-static const char *prefix;
130+static wchar_t *prefix;
131
132 /* User-supplied maximum line width (default WIDTH). The only output
133 lines longer than this will each comprise a single word. */
134@@ -194,14 +218,14 @@
135
136 /* Values derived from the option values. */
137
138-/* The length of prefix minus leading space. */
139-static int prefix_full_length;
140+/* The width of prefix minus leading space. */
141+static int prefix_full_width;
142
143-/* The length of the leading space trimmed from the prefix. */
144+/* The width of the leading space trimmed from the prefix. */
145 static int prefix_lead_space;
146
147-/* The length of prefix minus leading and trailing space. */
148-static int prefix_length;
149+/* The width of prefix minus leading and trailing space. */
150+static int prefix_width;
151
152 /* The preferred width of text lines, set to LEEWAY % less than max_width. */
153 static int best_width;
154@@ -216,10 +240,10 @@
155
156 /* Space for the paragraph text -- longer paragraphs are handled neatly
157 (cf. flush_paragraph()). */
158-static char parabuf[MAXCHARS];
159+static wchar_t parabuf[MAXCHARS];
160
161 /* A pointer into parabuf, indicating the first unused character position. */
162-static char *wptr;
163+static wchar_t *wptr;
164
165 /* The words of a paragraph -- longer paragraphs are handled neatly
166 (cf. flush_paragraph()). */
167@@ -251,16 +275,16 @@
168 prefix (next_prefix_indent). See get_paragraph() and copy_rest(). */
169
170 /* The last character read from the input file. */
171-static int next_char;
172+static wint_t next_char;
173
174 /* The space before the trimmed prefix (or part of it) on the next line
175 after the current paragraph. */
176 static int next_prefix_indent;
177
178-/* If nonzero, the length of the last line output in the current
179+/* If nonzero, the width of the last line output in the current
180 paragraph, used to charge for raggedness at the split point for long
181 paragraphs chosen by fmt_paragraph(). */
182-static int last_line_length;
183+static int last_line_width;
184
185 void
186 usage (int status)
187@@ -289,6 +313,7 @@
188 fputs (_("\
189 -t, --tagged-paragraph indentation of first line different from second\n\
190 -u, --uniform-spacing one space between words, two after sentences\n\
191+ -n, --single-spacing one space between words and after sentences\n\
192 -w, --width=WIDTH maximum line width (default of 75 columns)\n\
193 "), stdout);
194 fputs (HELP_OPTION_DESCRIPTION, stdout);
195@@ -311,6 +336,7 @@
196 {"split-only", no_argument, NULL, 's'},
197 {"tagged-paragraph", no_argument, NULL, 't'},
198 {"uniform-spacing", no_argument, NULL, 'u'},
199+ {"single-spacing", no_argument, NULL, 'n'},
200 {"width", required_argument, NULL, 'w'},
201 {GETOPT_HELP_OPTION_DECL},
202 {GETOPT_VERSION_OPTION_DECL},
203@@ -334,8 +360,8 @@
204
205 crown = tagged = split = uniform = false;
206 max_width = WIDTH;
207- prefix = "";
208- prefix_length = prefix_lead_space = prefix_full_length = 0;
209+ prefix = L"";
210+ prefix_width = prefix_lead_space = prefix_full_width = 0;
211
212 if (argc > 1 && argv[1][0] == '-' && ISDIGIT (argv[1][1]))
213 {
214@@ -348,7 +374,7 @@
215 argc--;
216 }
217
218- while ((optchar = getopt_long (argc, argv, "0123456789cstuw:p:",
219+ while ((optchar = getopt_long (argc, argv, "0123456789cstunw:p:",
220 long_options, NULL))
221 != -1)
222 switch (optchar)
223@@ -374,6 +400,12 @@
224
225 case 'u':
226 uniform = true;
227+ sentence_space = 2;
228+ break;
229+
230+ case 'n':
231+ uniform = true;
232+ sentence_space = 1;
233 break;
234
235 case 'w':
236@@ -440,26 +472,32 @@
237 }
238
239 /* Trim space from the front and back of the string P, yielding the prefix,
240- and record the lengths of the prefix and the space trimmed. */
241+ and record the widths of the prefix and the space trimmed. */
242
243 static void
244 set_prefix (char *p)
245 {
246- char *s;
247+ size_t len; /* wide characters in P after its leading spaces, sans L'\0' */
248+ wchar_t *s;
249
250 prefix_lead_space = 0;
251- while (*p == ' ')
252+ while (*p == L' ')
253 {
254 prefix_lead_space++;
255 p++;
256 }
257- prefix = p;
258- prefix_full_length = strlen (p);
259- s = p + prefix_full_length;
260- while (s > p && s[-1] == ' ')
261- s--;
262- *s = '\0';
263- prefix_length = s - p;
264+ len = mbsrtowcs (NULL, (const char **) &p, 0, NULL);
265+ if (len == (size_t) -1) /* P is not a valid multibyte string */
266+ error (EXIT_FAILURE, errno, _("invalid prefix"));
267+ prefix = xmalloc ((len + 1) * sizeof (wchar_t)); /* + 1 for L'\0' */
268+ mbsrtowcs (prefix, (const char **) &p, len + 1, NULL);
269+ prefix_full_width = 0; /* do not accumulate over repeated -p options */
270+ for (s = prefix; *s; s++)
271+ prefix_full_width += xwcwidth (*s);
272+ prefix_width = prefix_full_width;
273+ for (; s > prefix && s[-1] == L' '; s--) /* trim trailing spaces */
274+ prefix_width--;
275+ *s = L'\0';
276 }
277
278 /* read file F and send formatted output to stdout. */
279@@ -528,24 +566,24 @@
280 static bool
281 get_paragraph (FILE *f)
282 {
283- int c;
284+ wint_t c;
285
286- last_line_length = 0;
287+ last_line_width = 0;
288 c = next_char;
289
290 /* Scan (and copy) blank lines, and lines not introduced by the prefix. */
291
292- while (c == '\n' || c == EOF
293+ while (c == L'\n' || c == WEOF
294 || next_prefix_indent < prefix_lead_space
295- || in_column < next_prefix_indent + prefix_full_length)
296+ || in_column < next_prefix_indent + prefix_full_width)
297 {
298 c = copy_rest (f, c);
299- if (c == EOF)
300+ if (c == WEOF)
301 {
302- next_char = EOF;
303+ next_char = WEOF;
304 return false;
305 }
306- putchar ('\n');
307+ putwchar (L'\n');
308 c = get_prefix (f);
309 }
310
311@@ -601,23 +639,23 @@
312 that failed to match the prefix. In the latter, C is \n or EOF.
313 Return the character (\n or EOF) ending the line. */
314
315-static int
316-copy_rest (FILE *f, int c)
317+static wint_t
318+copy_rest (FILE *f, wint_t c)
319 {
320- const char *s;
321+ const wchar_t *s;
322
323 out_column = 0;
324- if (in_column > next_prefix_indent && c != '\n' && c != EOF)
325+ if (in_column > next_prefix_indent && c != L'\n' && c != WEOF)
326 {
327 put_space (next_prefix_indent);
328 for (s = prefix; out_column != in_column && *s; out_column++)
329- putchar (*s++);
330+ putwchar (*s++);
331 put_space (in_column - out_column);
332 }
333- while (c != '\n' && c != EOF)
334+ while (c != L'\n' && c != WEOF)
335 {
336- putchar (c);
337- c = getc (f);
338+ putwchar (c);
339+ c = xgetwc (f);
340 }
341 return c;
342 }
343@@ -627,11 +665,11 @@
344 otherwise false. */
345
346 static bool
347-same_para (int c)
348+same_para (wint_t c)
349 {
350 return (next_prefix_indent == prefix_indent
351- && in_column >= next_prefix_indent + prefix_full_length
352- && c != '\n' && c != EOF);
353+ && in_column >= next_prefix_indent + prefix_full_width
354+ && c != L'\n' && c != WEOF);
355 }
356
357 /* Read a line from input file F, given first non-blank character C
358@@ -642,11 +680,11 @@
359
360 Return the first non-blank character of the next line. */
361
362-static int
363-get_line (FILE *f, int c)
364+static wint_t
365+get_line (FILE *f, wint_t c)
366 {
367 int start;
368- char *end_of_parabuf;
369+ wchar_t *end_of_parabuf;
370 WORD *end_of_word;
371
372 end_of_parabuf = &parabuf[MAXCHARS];
373@@ -658,6 +696,7 @@
374 /* Scan word. */
375
376 word_limit->text = wptr;
377+ word_limit->width = 0;
378 do
379 {
380 if (wptr == end_of_parabuf)
381@@ -666,10 +705,12 @@
382 flush_paragraph ();
383 }
384 *wptr++ = c;
385- c = getc (f);
386+ word_limit->width += xwcwidth (c);
387+ c = xgetwc (f);
388 }
389- while (c != EOF && !isspace (c));
390- in_column += word_limit->length = wptr - word_limit->text;
391+ while (c != WEOF && !isspace (c));
392+ word_limit->length = wptr - word_limit->text;
393+ in_column += word_limit->width;
394 check_punctuation (word_limit);
395
396 /* Scan inter-word space. */
397@@ -677,48 +718,48 @@
398 start = in_column;
399 c = get_space (f, c);
400 word_limit->space = in_column - start;
401- word_limit->final = (c == EOF
402+ word_limit->final = (c == WEOF
403 || (word_limit->period
404- && (c == '\n' || word_limit->space > 1)));
405- if (c == '\n' || c == EOF || uniform)
406- word_limit->space = word_limit->final ? 2 : 1;
407+ && (c == L'\n' || word_limit->space > 1)));
408+ if (c == L'\n' || c == WEOF || uniform)
409+ word_limit->space = word_limit->final ? sentence_space : 1;
410 if (word_limit == end_of_word)
411 {
412 set_other_indent (true);
413 flush_paragraph ();
414 }
415 word_limit++;
416- if (c == EOF)
417- return EOF;
418+ if (c == WEOF)
419+ return WEOF;
420 }
421- while (c != '\n');
422+ while (c != L'\n');
423 return get_prefix (f);
424 }
425
426 /* Read a prefix from input file F. Return either first non-matching
427 character, or first non-blank character after the prefix. */
428
429-static int
430+static wint_t
431 get_prefix (FILE *f)
432 {
433- int c;
434+ wint_t c;
435
436 in_column = 0;
437- c = get_space (f, getc (f));
438- if (prefix_length == 0)
439+ c = get_space (f, xgetwc (f));
440+ if (prefix_width == 0)
441 next_prefix_indent = prefix_lead_space < in_column ?
442 prefix_lead_space : in_column;
443 else
444 {
445- const char *p;
446+ const wchar_t *p;
447 next_prefix_indent = in_column;
448- for (p = prefix; *p != '\0'; p++)
449+ for (p = prefix; *p != L'\0'; p++)
450 {
451- unsigned char pc = *p;
452+ wchar_t pc = *p;
453 if (c != pc)
454 return c;
455 in_column++;
456- c = getc (f);
457+ c = xgetwc (f);
458 }
459 c = get_space (f, c);
460 }
461@@ -728,21 +769,21 @@
462 /* Read blank characters from input file F, starting with C, and keeping
463 in_column up-to-date. Return first non-blank character. */
464
465-static int
466-get_space (FILE *f, int c)
467+static wint_t
468+get_space (FILE *f, wint_t c)
469 {
470 for (;;)
471 {
472- if (c == ' ')
473+ if (c == L' ')
474 in_column++;
475- else if (c == '\t')
476+ else if (c == L'\t')
477 {
478 tabs = true;
479 in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
480 }
481 else
482 return c;
483- c = getc (f);
484+ c = xgetwc (f);
485 }
486 }
487
488@@ -751,9 +792,9 @@
489 static void
490 check_punctuation (WORD *w)
491 {
492- char const *start = w->text;
493- char const *finish = start + (w->length - 1);
494- unsigned char fin = *finish;
495+ wchar_t const *start = w->text;
496+ wchar_t const *finish = start + (w->length - 1);
497+ wchar_t fin = *finish;
498
499 w->paren = isopen (*start);
500 w->punct = !! ispunct (fin);
501@@ -777,7 +818,9 @@
502
503 if (word_limit == word)
504 {
505- fwrite (parabuf, sizeof *parabuf, wptr - parabuf, stdout);
506+ wchar_t *outptr;
507+ for (outptr = parabuf; outptr < wptr; outptr++)
508+ putwchar (*outptr);
509 wptr = parabuf;
510 return;
511 }
512@@ -809,7 +852,8 @@
513 /* Copy text of words down to start of parabuf -- we use memmove because
514 the source and target may overlap. */
515
516- memmove (parabuf, split_point->text, wptr - split_point->text);
517+ memmove (parabuf, split_point->text,
518+ (wptr - split_point->text) * sizeof (wchar_t));
519 shift = split_point->text - parabuf;
520 wptr -= shift;
521
522@@ -833,53 +877,53 @@
523 fmt_paragraph (void)
524 {
525 WORD *start, *w;
526- int len;
527+ int wid;
528 COST wcost, best;
529- int saved_length;
530+ int saved_width;
531
532 word_limit->best_cost = 0;
533- saved_length = word_limit->length;
534- word_limit->length = max_width; /* sentinel */
535+ saved_width = word_limit->width;
536+ word_limit->width = max_width; /* sentinel */
537
538 for (start = word_limit - 1; start >= word; start--)
539 {
540 best = MAXCOST;
541- len = start == word ? first_indent : other_indent;
542+ wid = start == word ? first_indent : other_indent;
543
544 /* At least one word, however long, in the line. */
545
546 w = start;
547- len += w->length;
548+ wid += w->width;
549 do
550 {
551 w++;
552
553 /* Consider breaking before w. */
554
555- wcost = line_cost (w, len) + w->best_cost;
556- if (start == word && last_line_length > 0)
557- wcost += RAGGED_COST (len - last_line_length);
558+ wcost = line_cost (w, wid) + w->best_cost;
559+ if (start == word && last_line_width > 0)
560+ wcost += RAGGED_COST (wid - last_line_width);
561 if (wcost < best)
562 {
563 best = wcost;
564 start->next_break = w;
565- start->line_length = len;
566+ start->line_width = wid;
567 }
568
569- /* This is a kludge to keep us from computing `len' as the
570- sum of the sentinel length and some non-zero number.
571- Since the sentinel w->length may be INT_MAX, adding
572+ /* This is a kludge to keep us from computing `wid' as the
573+ sum of the sentinel width and some non-zero number.
574+ Since the sentinel w->width may be INT_MAX, adding
575 to that would give a negative result. */
576 if (w == word_limit)
577 break;
578
579- len += (w - 1)->space + w->length; /* w > start >= word */
580+ wid += (w - 1)->space + w->width; /* w > start >= word */
581 }
582- while (len < max_width);
583+ while (wid < max_width);
584 start->best_cost = best + base_cost (start);
585 }
586
587- word_limit->length = saved_length;
588+ word_limit->width = saved_width;
589 }
590
591 /* Return the constant component of the cost of breaking before the
592@@ -904,33 +948,33 @@
593 else if ((this - 1)->punct)
594 cost -= PUNCT_BONUS;
595 else if (this > word + 1 && (this - 2)->final)
596- cost += WIDOW_COST ((this - 1)->length);
597+ cost += WIDOW_COST ((this - 1)->width);
598 }
599
600 if (this->paren)
601 cost -= PAREN_BONUS;
602 else if (this->final)
603- cost += ORPHAN_COST (this->length);
604+ cost += ORPHAN_COST (this->width);
605
606 return cost;
607 }
608
609 /* Return the component of the cost of breaking before word NEXT that
610- depends on LEN, the length of the line beginning there. */
611+ depends on WID, the width of the line beginning there. */
612
613 static COST
614-line_cost (WORD *next, int len)
615+line_cost (WORD *next, int wid)
616 {
617 int n;
618 COST cost;
619
620 if (next == word_limit)
621 return 0;
622- n = best_width - len;
623+ n = best_width - wid;
624 cost = SHORT_COST (n);
625 if (next->next_break != word_limit)
626 {
627- n = len - next->line_length;
628+ n = wid - next->line_width;
629 cost += RAGGED_COST (n);
630 }
631 return cost;
632@@ -959,8 +1003,8 @@
633
634 out_column = 0;
635 put_space (prefix_indent);
636- fputs (prefix, stdout);
637- out_column += prefix_length;
638+ fputws (prefix, stdout);
639+ out_column += prefix_width;
640 put_space (indent - out_column);
641
642 endline = w->next_break - 1;
643@@ -970,8 +1014,8 @@
644 put_space (w->space);
645 }
646 put_word (w);
647- last_line_length = out_column;
648- putchar ('\n');
649+ last_line_width = out_column;
650+ putwchar (L'\n');
651 }
652
653 /* Output to stdout the word W. */
654@@ -979,13 +1023,13 @@
655 static void
656 put_word (WORD *w)
657 {
658- const char *s;
659+ const wchar_t *s;
660 int n;
661
662 s = w->text;
663 for (n = w->length; n != 0; n--)
664- putchar (*s++);
665- out_column += w->length;
666+ putwchar (*s++);
667+ out_column += w->width;
668 }
669
670 /* Output to stdout SPACE spaces, or equivalent tabs. */
671@@ -1002,13 +1046,13 @@
672 if (out_column + 1 < tab_target)
673 while (out_column < tab_target)
674 {
675- putchar ('\t');
676+ putwchar (L'\t');
677 out_column = (out_column / TABWIDTH + 1) * TABWIDTH;
678 }
679 }
680 while (out_column < space_target)
681 {
682- putchar (' ');
683+ putwchar (L' ');
684 out_column++;
685 }
686 }
This page took 0.358942 seconds and 4 git commands to generate.