1 diff -rc perl-5.8.8/patchlevel.h perl-5.8.8.patched/patchlevel.h
2 *** perl-5.8.8/patchlevel.h Tue Jan 31 16:12:10 2006
3 --- perl-5.8.8.patched/patchlevel.h Thu Nov 15 16:49:41 2007
8 ! ,"REGEXP0 - fix for UTF-8 recoding in regexps - CVE-2007-5116"
10 diff -rc perl-5.8.8/regcomp.c perl-5.8.8.patched/regcomp.c
11 *** perl-5.8.8/regcomp.c Sun Jan 8 20:59:27 2006
12 --- perl-5.8.8.patched/regcomp.c Thu Nov 15 16:38:53 2007
20 char *starttry; /* -Dr: where regtry was called. */
21 #define RExC_starttry (pRExC_state->starttry)
26 ! I32 utf8; /* whether the pattern is utf8 or not */
27 ! I32 orig_utf8; /* whether the pattern was originally in utf8 */
28 ! /* XXX use this for future optimisation of case
29 ! * where pattern must be upgraded to utf8. */
31 char *starttry; /* -Dr: where regtry was called. */
32 #define RExC_starttry (pRExC_state->starttry)
36 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
37 #define RExC_seen_evals (pRExC_state->seen_evals)
38 #define RExC_utf8 (pRExC_state->utf8)
39 + #define RExC_orig_utf8 (pRExC_state->orig_utf8)
41 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
42 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
46 FAIL("NULL regexp argument");
48 ! RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
52 if (!PL_colorset) reginitcolors();
53 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
54 PL_colors[4],PL_colors[5],PL_colors[0],
55 ! (int)(xend - exp), RExC_precomp, PL_colors[1]);
57 RExC_flags = pm->op_pmflags;
62 FAIL("NULL regexp argument");
64 ! RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
67 if (!PL_colorset) reginitcolors();
68 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
69 PL_colors[4],PL_colors[5],PL_colors[0],
70 ! (int)(xend - exp), exp, PL_colors[1]);
75 RExC_flags = pm->op_pmflags;
81 if (reg(pRExC_state, 0, &flags) == NULL) {
82 RExC_precomp = Nullch;
85 + if (RExC_utf8 && !RExC_orig_utf8) {
86 + /* It's possible to write a regexp in ascii that represents unicode
87 + codepoints outside of the byte range, such as via \x{100}. If we
88 + detect such a sequence we have to convert the entire pattern to utf8
89 + and then recompile, as our sizing calculation will have been based
90 + on 1 byte == 1 character, but we will need to use utf8 to encode
91 + at least some part of the pattern, and therefore must convert the whole
93 + XXX: somehow figure out how to make this less expensive...
95 + STRLEN len = xend-exp;
96 + DEBUG_r(PerlIO_printf(Perl_debug_log,
97 + "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
98 + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
100 + RExC_orig_utf8 = RExC_utf8;
102 + goto redo_first_pass;
104 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
106 diff -rc perl-5.8.8/t/op/pat.t perl-5.8.8.patched/t/op/pat.t
107 *** perl-5.8.8/t/op/pat.t Sat Jan 7 12:53:32 2006
108 --- perl-5.8.8.patched/t/op/pat.t Thu Nov 15 16:45:18 2007
130 "# assigning to original string should not corrupt match vars");
136 + local $SIG{__WARN__}=sub{push @w,"@_"};
138 + ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
139 + ok(@w==0, "No warnings");