3 if [ $# -eq 3 -a "$2" = '-d' ]; then
5 elif [ $# -ne 1 ]; then
6 echo >&2 "`basename $0`: script expects -patch|-unpatch as argument"
10 -patch) patch $pdir -f --no-backup-if-mismatch -p0 -l < $0;;
11 -unpatch) patch $pdir -f --no-backup-if-mismatch -R -p0 -l < $0;;
13 echo >&2 "`basename $0`: script expects -patch|-unpatch as argument"
20 To: Gcc Patch List <gcc-patches at gcc dot gnu dot org>
21 Subject: Patch: gcj -vs- iconv
22 From: Tom Tromey <tromey at cygnus dot com>
23 Date: 06 Mar 2000 14:39:01 -0700
24 Reply-To: tromey at cygnus dot com
26 # DP: This patch changes gcj to use iconv(), when available, to read Java
27 # DP: source files. It adds a new `--encoding' option that lets the user
28 # DP: choose what encoding to use. For systems without iconv(), gcj still
29 # DP: assumes that the input is UTF-8, but it no longer ignores encoding
32 # DP: This patch does have one minor problem, which is that if --encoding is
33 # DP: not specified we default to UTF-8 instead of the encoding the user has
34 # DP: chosen (as part of his locale). I don't know how to find that
35 # DP: information. Anyway, that is an addition which shouldn't affect
36 # DP: whether or not this patch goes in, since this patch doesn't make the
37 # DP: situation any worse than it is right now.
39 Alex, I'm not sure I really understand how the parser context stack
40 works, so it is possible that some of my changes there are wrong.
41 Could you look at it? Is this ok to check in?
43 # DP: This fixes PR gcj/33; I can now compile a Latin-1 encoded file on my
44 # DP: PPC Linux box with `gcj --encoding=Latin1 ...'.
46 2000-03-06 Tom Tromey <tromey@cygnus.com>
49 * jv-scan.c (help): Document --encoding.
50 (options): Added `encoding' entry.
51 (OPT_ENCODING): New define.
52 (main): Handle --encoding.
53 * lang-options.h: Document --classpath, --CLASSPATH, --main, and
55 * jcf-parse.c (parse_source_file): Correctly call java_init_lex.
56 Added `finput' argument.
57 * java-tree.h (current_encoding): Declare.
58 * parse.y (java_parser_context_restore_global): Don't restore
60 (java_parser_context_save_global): Don't set `finput' field.
61 (java_pop_parser_context): Don't restore `finput'. Free old lexer
63 * lang.c (current_encoding): New global.
64 (lang_decode_option): Recognize `-fencoding='.
65 (finish_parse): Don't close finput.
66 * parse.h (struct parser_ctxt): Removed `finput' and
67 `unget_utf8_value' fields. Added `lexer' field.
68 (java_init_lex): Fixed declaration.
69 * lex.c (java_new_lexer): New function.
70 (java_destroy_lexer): Likewise.
71 (java_read_char): Added `lex' argument. Handle iconv case.
72 (java_read_unicode): Added `lex' argument. Count backslashes in
74 (java_init_lex): Added `finput' and `encoding' arguments. Set
75 `lexer' field in ctxp.
76 (BAD_UTF8_VALUE): Removed.
77 * lex.h: Include <iconv.h> if HAVE_ICONV defined.
78 (java_lexer): New structure.
81 (DEFAULT_ENCODING): New define.
82 (java_destroy_lexer): Declare.
86 --- gcc/java/java-tree.h.orig Sat Apr 8 23:03:03 2000
87 +++ gcc/java/java-tree.h Sat Apr 8 23:03:56 2000
89 extern int flag_not_overriding;
90 extern int flag_static_local_jdk1_1;
92 +/* Encoding used for source files. */
93 +extern char *current_encoding;
95 /* The Java .class file that provides main_class; the main input file. */
96 extern struct JCF *current_jcf;
98 --- gcc/java/jcf-parse.c.orig Sat Apr 8 23:03:03 2000
99 +++ gcc/java/jcf-parse.c Sat Apr 8 23:05:39 2000
101 static tree give_name_to_class PROTO ((JCF *jcf, int index));
102 static void parse_zip_file_entries PROTO ((void));
103 static void process_zip_dir PROTO ((void));
104 -static void parse_source_file PROTO ((tree));
105 +static void parse_source_file PARAMS ((tree, FILE *));
106 static void jcf_parse_source PROTO ((void));
107 static int jcf_figure_file_type PROTO ((JCF *));
108 static int find_in_current_zip PROTO ((char *, struct JCF **));
115 java_parser_context_save_global ();
116 java_push_parser_context ();
118 if (!(finput = fopen (input_filename, "r")))
119 fatal ("input file `%s' just disappeared - jcf_parse_source",
121 - parse_source_file (file);
122 + parse_source_file (file, finput);
124 fatal ("can't close input file `%s' stream - jcf_parse_source",
127 /* Parse a source file, as pointed by the current value of INPUT_FILENAME. */
130 -parse_source_file (file)
131 +parse_source_file (file, finput)
135 int save_error_count = java_error_count;
136 /* Mark the file as parsed */
139 lang_init_source (1); /* Error msgs have no method prototypes */
141 - java_init_lex (); /* Initialize the parser */
142 + /* Initialize the parser */
143 + java_init_lex (finput,
144 + current_encoding ? current_encoding : DEFAULT_ENCODING);
145 java_parse_abort_on_error ();
147 java_parse (); /* Parse and build partial tree nodes. */
149 int several_files = 0;
150 char *list = strdup (input_filename), *next;
151 tree node, current_file_list = NULL_TREE;
158 java_push_parser_context ();
159 java_parser_context_save_global ();
160 - parse_source_file (name);
161 + parse_source_file (name, finput);
162 java_parser_context_restore_global ();
163 java_pop_parser_context (1);
165 --- gcc/java/jv-scan.c.orig Sat Apr 8 23:03:03 2000
166 +++ gcc/java/jv-scan.c Sat Apr 8 23:11:41 2000
170 char *output_file = NULL;
171 + char *encoding = NULL;
179 + /* file encoding */
180 + else if (!strcmp (argv [i], "--encoding") && i+1 < argc)
183 + encoding = argv [++i];
187 /* Print the name of the class that contains main */
188 else if (!strcmp (argv [i], "--print-main"))
191 input_filename = argv [i];
192 if ( (finput = fopen (argv [i], "r")) )
195 + java_init_lex (finput, encoding ? encoding : DEFAULT_ENCODING);
197 if (ftell (out) != ft)
199 --- gcc/java/lang-options.h.orig Sat Apr 8 23:03:03 2000
200 +++ gcc/java/lang-options.h Sat Apr 8 23:03:56 2000
202 { "-M", "Print dependencies to stdout" },
203 { "-MM", "Print dependencies to stdout" },
204 #endif /* ! USE_CPPLIB */
205 - { "-fclasspath", "Set class path and suppress system path" },
206 - { "-fCLASSPATH", "Set class path" },
207 + { "--classpath", "Set class path and suppress system path" },
208 + { "--CLASSPATH", "Set class path" },
209 + { "--main", "Choose class whose main method should be used" },
210 + { "--encoding", "Choose input encoding (default is UTF-8)" },
211 { "-I", "Add directory to class path" },
212 { "-foutput-class-dir", "Directory where class files should be written" },
213 { "-Wredundant-modifiers",
214 --- gcc/java/lang.c.orig Sat Apr 8 23:03:03 2000
215 +++ gcc/java/lang.c Sat Apr 8 23:03:56 2000
217 /* When non zero, warns that final local are treated as non final. */
218 int flag_static_local_jdk1_1 = 0;
220 +/* The encoding of the source file. */
221 +char *current_encoding = NULL;
223 /* From gcc/flags.h, and indicates if exceptions are turned on or not. */
225 extern int flag_new_exceptions;
230 +#define ARG "-fencoding="
231 + if (strncmp (p, ARG, sizeof (ARG) - 1) == 0)
233 + current_encoding = p + sizeof (ARG) - 1;
238 if (p[0] == '-' && p[1] == 'f')
244 +/* Global open file. */
248 init_parse (filename)
255 jcf_dependency_write ();
258 --- gcc/java/lex.c.orig Sat Apr 8 23:03:03 2000
259 +++ gcc/java/lex.c Sat Apr 8 23:14:41 2000
261 The Free Software Foundation is independent of Sun Microsystems, Inc. */
263 /* It defines java_lex (yylex) that reads a Java ASCII source file
264 -possibly containing Unicode escape sequence or utf8 encoded characters
265 -and returns a token for everything found but comments, white spaces
266 -and line terminators. When necessary, it also fills the java_lval
267 -(yylval) union. It's implemented to be called by a re-entrant parser
270 -The lexical analysis conforms to the Java grammar described in "The
271 -Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
272 -Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
273 + possibly containing Unicode escape sequences or UTF-8 encoded
274 + characters and returns a token for everything found but comments,
275 + white spaces and line terminators. When necessary, it also fills
276 + the java_lval (yylval) union. It's implemented to be called by a
277 + re-entrant parser generated by Bison.
279 + The lexical analysis conforms to the Java grammar described in "The
280 + Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
281 + Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
286 static int java_parse_doc_section PROTO ((unicode_t));
287 static void java_parse_end_comment PROTO ((unicode_t));
288 static unicode_t java_get_unicode PROTO (());
289 -static unicode_t java_read_unicode PROTO ((int, int *));
290 -static void java_store_unicode PROTO ((struct java_line *, unicode_t, int));
291 -static unicode_t java_read_char PROTO (());
292 +static unicode_t java_read_unicode PARAMS ((java_lexer *, int, int *));
293 +static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
294 +static unicode_t java_read_char PARAMS ((java_lexer *));
295 static void java_allocate_new_line PROTO (());
296 static void java_unget_unicode PROTO (());
297 static unicode_t java_sneak_unicode PROTO (());
298 +java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
302 +java_init_lex (finput, encoding)
304 + const char *encoding;
307 int java_lang_imported = 0;
309 ctxp->lineno = lineno = 0;
312 - ctxp->unget_utf8_value = 0;
313 ctxp->minus_seen = 0;
314 ctxp->java_error_flag = 0;
315 + ctxp->lexer = java_new_lexer (finput, encoding);
319 @@ -188,22 +191,142 @@
320 ctxp->c_line->white_space_only = 1;
323 -#define BAD_UTF8_VALUE 0xFFFE
324 +/* Create a new lexer object. */
326 +java_new_lexer (finput, encoding)
328 + const char *encoding;
330 + java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
333 + lex->finput = finput;
335 + lex->unget_value = 0;
338 + lex->handle = iconv_open ("UCS-2", encoding);
339 + if (lex->handle == (iconv_t) -1)
341 + /* FIXME: we should give a nice error based on errno here. */
346 +#else /* HAVE_ICONV */
347 + if (strcmp (encoding, DEFAULT_ENCODING))
349 +#endif /* HAVE_ICONV */
352 + fatal ("unknown encoding: `%s'", encoding);
358 +java_destroy_lexer (lex)
361 + fclose (lex->finput);
363 + iconv_close (lex->handle);
370 +java_read_char (lex)
375 + if (lex->unget_value)
377 + unicode_t r = lex->unget_value;
378 + lex->unget_value = 0;
385 + size_t ir, inbytesleft, in_save, out_count;
388 - if (ctxp->unget_utf8_value)
391 - int to_return = ctxp->unget_utf8_value;
392 - ctxp->unget_utf8_value = 0;
393 - return (to_return);
394 + /* See if we need to read more data. If FIRST == 0 then the
395 + previous conversion attempt ended in the middle of a
396 + character at the end of the buffer. Otherwise we only have
397 + to read if the buffer is empty. */
398 + if (lex->first == 0 || lex->first >= lex->last)
402 + if (lex->first >= lex->last)
407 + if (feof (lex->finput))
409 + r = fread (&lex->buffer[lex->last], 1,
410 + sizeof (lex->buffer) - lex->last,
416 + inbytesleft = lex->last - lex->first;
418 + if (inbytesleft == 0)
420 + /* We've tried to read and there is nothing left. */
424 + in_save = inbytesleft;
426 + inp = &lex->buffer[lex->first];
428 + ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
429 + &outp, &out_count);
430 + lex->first += in_save - inbytesleft;
432 + if (out_count == 0)
434 + /* Success. We assume that UCS-2 is big-endian. This
435 + appears to be an ok assumption. */
437 + result = (((unsigned char) out[0]) << 8) | (unsigned char) out[1];
441 + if (ir == (size_t) -1)
443 + if (errno == EINVAL)
445 + /* This is ok. This means that the end of our buffer
446 + is in the middle of a character sequence. We just
447 + move the valid part of the buffer to the beginning
448 + to force a read. */
449 + /* We use bcopy() because it should handle overlapping
450 + regions. FIXME: switch to memmove() instead. */
451 + bcopy (&lex->buffer[lex->first], &lex->buffer[0],
452 + lex->last - lex->first);
453 + lex->last -= lex->first;
458 + /* A more serious error. */
459 + java_lex_error ("unrecognized character in input stream", 0);
464 +#else /* HAVE_ICONV */
467 + c = getc (lex->finput);
471 @@ -213,17 +336,17 @@
473 if ((c & 0xe0) == 0xc0)
476 + c1 = getc (lex->finput);
477 if ((c1 & 0xc0) == 0x80)
478 return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
481 else if ((c & 0xf0) == 0xe0)
484 + c1 = getc (lex->finput);
485 if ((c1 & 0xc0) == 0x80)
488 + c2 = getc (lex->finput);
489 if ((c2 & 0xc0) == 0x80)
490 return (unicode_t)(((c & 0xf) << 12) +
491 (( c1 & 0x3f) << 6) + (c2 & 0x3f));
492 @@ -233,14 +356,15 @@
496 - /* We looked for a UTF8 multi-byte sequence (since we saw an initial
497 - byte with the high bit set), but found invalid bytes instead.
498 - If the most recent byte was Ascii (and not EOF), we should
499 - unget it, in case it was a comment terminator or other delimitor. */
500 - if ((c & 0x80) == 0)
502 - return BAD_UTF8_VALUE;
504 + /* We simply don't support invalid characters. */
505 + java_lex_error ("malformed UTF-8 character", 0);
508 +#endif /* HAVE_ICONV */
510 + /* We only get here on error. */
515 @@ -261,56 +385,54 @@
519 -java_read_unicode (term_context, unicode_escape_p)
520 +java_read_unicode (lex, term_context, unicode_escape_p)
523 int *unicode_escape_p;
528 - c = java_read_char ();
529 + c = java_read_char (lex);
530 *unicode_escape_p = 0;
533 - return ((term_context ? c :
534 - java_lineterminator (c) ? '\n' : (unicode_t)c));
536 - /* Count the number of preceeding '\' */
537 - for (base = ftell (finput), i = base-2; c == '\\';)
539 - fseek (finput, i--, SEEK_SET);
540 - c = java_read_char (); /* Will fail if reading utf8 stream. FIXME */
542 + return (term_context ? c : (java_lineterminator (c)
546 - fseek (finput, base, SEEK_SET);
547 - if ((base-i-3)%2 == 0) /* If odd number of \ seen */
550 + if ((lex->bs_count) % 2 == 1)
552 - c = java_read_char ();
553 + /* Odd number of \ seen. */
554 + c = java_read_char (lex);
557 - unsigned short unicode = 0;
558 + unicode_t unicode = 0;
560 /* Next should be 4 hex digits, otherwise it's an error.
561 The hex value is converted into the unicode, pushed into
562 the Unicode stream. */
563 for (shift = 12; shift >= 0; shift -= 4)
565 - if ((c = java_read_char ()) == UEOF)
566 + if ((c = java_read_char (lex)) == UEOF)
568 if (c >= '0' && c <= '9')
569 unicode |= (unicode_t)((c-'0') << shift);
570 else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
571 unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
574 - ("Non hex digit in Unicode escape sequence", 0);
575 + java_lex_error ("Non hex digit in Unicode escape sequence", 0);
577 *unicode_escape_p = 1;
578 - return (term_context ? unicode :
579 - (java_lineterminator (c) ? '\n' : unicode));
580 + return (term_context
581 + ? unicode : (java_lineterminator (c) ? '\n' : unicode));
583 - ctxp->unget_utf8_value = c;
584 + lex->unget_value = c;
586 - return (unicode_t)'\\';
587 + return (unicode_t) '\\';
594 int unicode_escape_p;
595 - c = java_read_unicode (0, &unicode_escape_p);
596 + c = java_read_unicode (ctxp->lexer, 0, &unicode_escape_p);
597 java_store_unicode (ctxp->c_line, c, unicode_escape_p);
598 if (ctxp->c_line->white_space_only
599 && !JAVA_WHITE_SPACE_P (c) && c!='\n')
601 int unicode_escape_p;
602 if (c == '\n') /* CR */
604 - if ((c = java_read_unicode (1, &unicode_escape_p)) != '\r')
605 + if ((c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p)) != '\r')
607 ctxp->c_line->ahead [0] = c;
608 ctxp->c_line->unicode_escape_ahead_p = unicode_escape_p;
611 else if (c == '\r') /* LF */
613 - if ((c = java_read_unicode (1, &unicode_escape_p)) != '\n')
614 + if ((c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p)) != '\n')
616 ctxp->c_line->ahead [0] = c;
617 ctxp->c_line->unicode_escape_ahead_p = unicode_escape_p;
618 --- gcc/java/lex.h.orig Sat Apr 8 23:03:03 2000
619 +++ gcc/java/lex.h Sat Apr 8 23:03:56 2000
621 /* A Unicode character, as read from the input file */
622 typedef unsigned short unicode_t;
626 +#endif /* HAVE_ICONV */
628 +/* Default encoding to use if no encoding is specified. */
629 +#define DEFAULT_ENCODING "UTF-8"
631 /* Debug macro to print-out what we match */
632 #ifdef JAVA_LEX_DEBUG
633 #ifdef JAVA_LEX_DEBUG_CHAR
638 +typedef struct java_lexer
640 + /* The file from which we're reading. */
643 + /* Number of consecutive backslashes we've read. */
646 + /* If nonzero, a value that was pushed back. */
647 + unicode_t unget_value;
650 + /* The handle for the iconv converter we're using. */
653 + /* Bytes we've read from the file but have not sent to iconv. */
656 + /* Index of first valid character in buffer, -1 if no valid
660 + /* Index of last valid character in buffer, plus one. -1 if no
661 + valid characters in buffer. */
663 +#endif /* HAVE_ICONV */
666 -#define JAVA_LINE_MAX 80
667 +/* Destroy a lexer object. */
668 +extern void java_destroy_lexer PARAMS ((java_lexer *));
670 -/* Macro to read and unread bytes */
671 -#define UNGETC(c) ungetc(c, finput)
672 -#define GETC() getc(finput)
673 +#define JAVA_LINE_MAX 80
675 /* Build a location compound integer */
676 #define BUILD_LOCATION() ((ctxp->elc.line << 12) | (ctxp->elc.col & 0xfff))
677 --- gcc/java/parse.h.orig Sat Apr 8 23:03:03 2000
678 +++ gcc/java/parse.h Sat Apr 8 23:15:36 2000
679 @@ -586,12 +586,11 @@
682 char *filename; /* Current filename */
683 - FILE *finput; /* Current file input stream */
684 struct parser_ctxt *next;
686 + java_lexer *lexer; /* Current lexer state */
687 struct java_line *p_line, *c_line; /* Previous and current line */
688 java_lc elc; /* Error's line column info */
689 - unicode_t unget_utf8_value; /* An unget utf8 value */
690 int ccb_indent; /* Keep track of {} indent, lexer */
691 int first_ccb_indent1; /* First { at ident level 1 */
692 int last_ccb_indent1; /* Last } at ident level 1 */
694 /* Always in use, no matter what you compile */
695 void java_push_parser_context PROTO ((void));
696 void java_pop_parser_context PROTO ((int));
697 -void java_init_lex PROTO ((void));
698 +void java_init_lex PARAMS ((FILE *, const char *));
699 extern void java_parser_context_save_global PROTO ((void));
700 extern void java_parser_context_restore_global PROTO ((void));
701 int yyparse PROTO ((void));
702 --- gcc/java/parse.y.orig Sat Apr 8 23:03:03 2000
703 +++ gcc/java/parse.y Sat Apr 8 23:03:56 2000
704 @@ -2347,7 +2347,6 @@
705 java_push_parser_context ();
706 extra_ctxp_pushed_p = 1;
708 - ctxp->finput = finput;
709 ctxp->lineno = lineno;
710 ctxp->current_class = current_class;
711 ctxp->filename = input_filename;
712 @@ -2357,7 +2356,6 @@
714 java_parser_context_restore_global ()
716 - finput = ctxp->finput;
717 lineno = ctxp->lineno;
718 current_class = ctxp->current_class;
719 input_filename = ctxp->filename;
720 @@ -2386,9 +2384,12 @@
721 next->incomplete_class = ctxp->incomplete_class;
722 next->gclass_list = ctxp->gclass_list;
723 lineno = ctxp->lineno;
724 - finput = ctxp->finput;
725 current_class = ctxp->current_class;
728 + /* If the old and new lexers differ, then free the old one. */
729 + if (ctxp->lexer && next && ctxp->lexer != next->lexer)
730 + java_destroy_lexer (ctxp->lexer);
732 /* Set the single import class file flag to 0 for the current list
733 of imported things */