diff -ruN migrationtools-47.orig/migrate_common.ph migrationtools-47/migrate_common.ph
--- migrationtools-47.orig/migrate_common.ph	2006-01-25 05:18:16.000000000 +0100
+++ migrationtools-47/migrate_common.ph	2007-04-07 00:05:52.000000000 +0200
@@ -89,6 +89,9 @@
 # such as person.
 $EXTENDED_SCHEMA = 0;
 
+# Comment this out if your ldap server does not support UTF8 encoding
+$USE_UTF8 = 1;
+
 #
 # allow environment variables to override predefines
 #
diff -ruN migrationtools-47.orig/migrate_passwd.pl migrationtools-47/migrate_passwd.pl
--- migrationtools-47.orig/migrate_passwd.pl	2006-01-25 05:18:16.000000000 +0100
+++ migrationtools-47/migrate_passwd.pl	2007-04-07 00:06:13.000000000 +0200
@@ -36,6 +36,7 @@
 #
 # Thanks to Peter Jacob Slot .
 #
+# UTF8 support by Jonas Smedegaard .
 
 require 'migrate_common.ph';
 
@@ -53,28 +54,6 @@
 	next if /^#/;
 	next if /^\+/;
 
-	s/Ä/Ae/g;
-	s/Ë/Ee/g;
-	s/Ï/Ie/g;
-	s/Ö/Oe/g;
-	s/Ü/Ue/g;
-
-	s/ä/ae/g;
-	s/ë/ee/g;
-	s/ï/ie/g;
-	s/ö/oe/g;
-	s/ü/ue/g;
-	s/ÿ/ye/g;
-	s/ß/ss/g;
-	s/é/e/g;
-
-	s/Æ/Ae/g;
-	s/æ/ae/g;
-	s/Ø/Oe/g;
-	s/ø/oe/g;
-	s/Å/Ae/g;
-	s/å/ae/g;
-
 	local($user, $pwd, $uid, $gid, $gecos, $homedir, $shell) = split(/:/);
 
 	if ($use_stdout) {
@@ -100,25 +79,25 @@
 	$sn = $tmp[$#tmp];
 	pop(@tmp);
 	$givenname=join(' ',@tmp);
-	
+
 	print $HANDLE "dn: uid=$user,$NAMINGCONTEXT\n";
 	print $HANDLE "uid: $user\n";
-	print $HANDLE "cn: $cn\n";
+	&print_utf8($HANDLE, "cn", $cn);
 
 	if ($EXTENDED_SCHEMA) {
 		if ($wphone) {
-			print $HANDLE "telephoneNumber: $wphone\n";
+			&print_utf8($HANDLE, "telephoneNumber", $wphone);
 		}
 		if ($office) {
-			print $HANDLE "roomNumber: $office\n";
+			&print_utf8($HANDLE, "roomNumber", $office);
 		}
 		if ($hphone) {
-			print $HANDLE "homePhone: $hphone\n";
+			&print_utf8($HANDLE, "homePhone", $hphone);
 		}
 		if ($givenname) {
-			print $HANDLE "givenName: $givenname\n";
+			&print_utf8($HANDLE, "givenName", $givenname);
 		}
-		print $HANDLE "sn: $sn\n";
+		&print_utf8($HANDLE, "sn", $sn);
 		if ($DEFAULT_MAIL_DOMAIN) {
 			print $HANDLE "mail: $user\@$DEFAULT_MAIL_DOMAIN\n";
 		}
@@ -174,7 +153,7 @@
 	}
 
 	if ($gecos) {
-		print $HANDLE "gecos: $gecos\n";
+		&print_ascii($HANDLE, "gecos", $gecos);
 	}
 
 	print $HANDLE "\n";
@@ -225,3 +204,201 @@
 	}
 }
 
+# Print "attribute: content" to HANDLE as one LDIF line.
+#
+# Printable-ASCII values are written verbatim.  Other values are written
+# base64-encoded as UTF-8 when $USE_UTF8 is set (values that are not
+# already well-formed UTF-8 are assumed to be ISO-8859-1 and recoded),
+# and transliterated to ASCII via print_ascii() otherwise.
+# Dies if the value cannot be represented (e.g. control characters).
+sub print_utf8
+{
+	my($HANDLE, $attribute, $content) = @_;
+
+	if (&validate_ascii($content)) {
+		print $HANDLE "$attribute: $content\n";
+	} elsif ($USE_UTF8) {
+		# Do not recode input that already is UTF-8: that would
+		# encode every non-ASCII character twice.
+		unless (&validate_utf8($content)) {
+			$content = &recode_latin1_to_utf8($content);
+		}
+		if (&validate_utf8($content)) {
+			$content = &encode_base64($content, "");
+			print $HANDLE "$attribute\:: $content\n";
+		} else {
+			die "ERROR: Illegal character(s) in UTF-8 string: \"$content\"";
+		}
+	} else {
+		&print_ascii($HANDLE, "$attribute", "$content");
+	}
+}
+
+# Print "attribute: content" to HANDLE as one LDIF line of plain ASCII.
+#
+# Two-byte UTF-8 sequences for U+0080..U+00FF are folded to ISO-8859-1,
+# then ISO-8859-1 letters are transliterated by recode_custom_to_ascii().
+# Dies, naming the offending characters, if anything outside printable
+# ASCII remains.
+sub print_ascii
+{
+	my($HANDLE, $attribute, $content) = @_;
+
+	# Bytes that are not part of such a sequence are left untouched, so
+	# ISO-8859-1 input passes through this step unchanged.
+	$content = &recode_utf8_to_latin1($content);
+	$content = &recode_custom_to_ascii($content);
+	if (&validate_ascii($content)) {
+		print $HANDLE "$attribute: $content\n";
+	} else {
+		my $badchars = $content;
+		$badchars =~ s/[\x20-\x7E]//g;
+		die "ERROR: Illegal character(s) \"$badchars\" in ASCII string: \"$content\"";
+	}
+}
+
+# Return a copy of $content with every ISO-8859-1 byte 0x80..0xFF
+# replaced by its two-byte UTF-8 encoding.
+sub recode_latin1_to_utf8
+{
+	my ($content) = @_;
+	$content =~ s/([\x80-\xFF])/chr(0xC0 | (ord($1) >> 6)) . chr(0x80 | (ord($1) & 0x3F))/eg;
+	return ($content);
+}
+
+# Return a copy of $content with every two-byte UTF-8 sequence that
+# encodes U+0080..U+00FF replaced by the single ISO-8859-1 byte.
+# All other bytes are left as they are.
+sub recode_utf8_to_latin1
+{
+	my ($content) = @_;
+	$content =~ s/([\xC2\xC3])([\x80-\xBF])/chr(((ord($1) << 6) & 0xC0) | (ord($2) & 0x3F))/eg;
+	return ($content);
+}
+
+# Return a copy of $content with the ISO-8859-1 letters 0xC0..0xFF
+# transliterated to ASCII.  Diaeresis and ligature letters become
+# two-letter digraphs; the multiplication and division signs (0xD7,
+# 0xF7) and the range 0x80..0xBF have no mapping and are left as is.
+sub recode_custom_to_ascii
+{
+	my ($content) = @_;
+	for ($content) {
+		s/\xc0/A/g;	# latin capital letter a with grave
+		s/\xc1/A/g;	# latin capital letter a with acute
+		s/\xc2/A/g;	# latin capital letter a with circumflex
+		s/\xc3/A/g;	# latin capital letter a with tilde
+		s/\xc4/Ae/g;	# latin capital letter a with diaeresis
+		s/\xc5/Aa/g;	# latin capital letter a with ring above
+		s/\xc6/Ae/g;	# latin capital letter ae
+		s/\xc7/C/g;	# latin capital letter c with cedilla
+		s/\xc8/E/g;	# latin capital letter e with grave
+		s/\xc9/E/g;	# latin capital letter e with acute
+		s/\xca/E/g;	# latin capital letter e with circumflex
+		s/\xcb/Ee/g;	# latin capital letter e with diaeresis
+		s/\xcc/I/g;	# latin capital letter i with grave
+		s/\xcd/I/g;	# latin capital letter i with acute
+		s/\xce/I/g;	# latin capital letter i with circumflex
+		s/\xcf/Ie/g;	# latin capital letter i with diaeresis
+		s/\xd0/Dh/g;	# latin capital letter eth (icelandic)
+		s/\xd1/N/g;	# latin capital letter n with tilde
+		s/\xd2/O/g;	# latin capital letter o with grave
+		s/\xd3/O/g;	# latin capital letter o with acute
+		s/\xd4/O/g;	# latin capital letter o with circumflex
+		s/\xd5/O/g;	# latin capital letter o with tilde
+		s/\xd6/Oe/g;	# latin capital letter o with diaeresis
+		s/\xd8/Oe/g;	# latin capital letter o with stroke
+		s/\xd9/U/g;	# latin capital letter u with grave
+		s/\xda/U/g;	# latin capital letter u with acute
+		s/\xdb/U/g;	# latin capital letter u with circumflex
+		s/\xdc/Ue/g;	# latin capital letter u with diaeresis
+		s/\xdd/Y/g;	# latin capital letter y with acute
+		s/\xde/Th/g;	# latin capital letter thorn (icelandic)
+		s/\xdf/ss/g;	# latin small letter sharp s (german)
+		s/\xe0/a/g;	# latin small letter a with grave
+		s/\xe1/a/g;	# latin small letter a with acute
+		s/\xe2/a/g;	# latin small letter a with circumflex
+		s/\xe3/a/g;	# latin small letter a with tilde
+		s/\xe4/ae/g;	# latin small letter a with diaeresis
+		s/\xe5/aa/g;	# latin small letter a with ring above
+		s/\xe6/ae/g;	# latin small letter ae
+		s/\xe7/c/g;	# latin small letter c with cedilla
+		s/\xe8/e/g;	# latin small letter e with grave
+		s/\xe9/e/g;	# latin small letter e with acute
+		s/\xea/e/g;	# latin small letter e with circumflex
+		s/\xeb/ee/g;	# latin small letter e with diaeresis
+		s/\xec/i/g;	# latin small letter i with grave
+		s/\xed/i/g;	# latin small letter i with acute
+		s/\xee/i/g;	# latin small letter i with circumflex
+		s/\xef/ie/g;	# latin small letter i with diaeresis
+		s/\xf0/dh/g;	# latin small letter eth (icelandic)
+		s/\xf1/n/g;	# latin small letter n with tilde
+		s/\xf2/o/g;	# latin small letter o with grave
+		s/\xf3/o/g;	# latin small letter o with acute
+		s/\xf4/o/g;	# latin small letter o with circumflex
+		s/\xf5/o/g;	# latin small letter o with tilde
+		s/\xf6/oe/g;	# latin small letter o with diaeresis
+		s/\xf8/oe/g;	# latin small letter o with stroke
+		s/\xf9/u/g;	# latin small letter u with grave
+		s/\xfa/u/g;	# latin small letter u with acute
+		s/\xfb/u/g;	# latin small letter u with circumflex
+		s/\xfc/ue/g;	# latin small letter u with diaeresis
+		s/\xfd/y/g;	# latin small letter y with acute
+		s/\xfe/th/g;	# latin small letter thorn (icelandic)
+		s/\xff/ye/g;	# latin small letter y with diaeresis
+	}
+	return ($content);
+}
+
+# Return $_[0] base64-encoded.  Encoded output is broken into lines of
+# at most 76 characters terminated by $_[1] (default "\n"); pass "" to
+# get a single unbroken string.
+sub encode_base64
+# Found in email by Baruzzi Giovanni on openldap mailinglist
+
+# Historically this module has been implemented as pure perl code.
+# The XS implementation runs about 20 times faster, but the Perl
+# code might be more portable, so it is still here.
+{
+	my $res = "";
+	my $eol = $_[1];
+	$eol = "\n" unless defined $eol;
+	pos($_[0]) = 0;                          # ensure start at the beginning
+	while ($_[0] =~ /(.{1,45})/gs) {
+		$res .= substr(pack('u', $1), 1);
+		chop($res);
+	}
+	$res =~ tr|` -_|AA-Za-z0-9+/|;               # `# help emacs
+	# fix padding at the end
+	my $padding = (3 - length($_[0]) % 3) % 3;
+	$res =~ s/.{$padding}$/'=' x $padding/e if $padding;
+	# break encoded string into lines of no more than 76 characters each
+	if (length $eol) {
+		$res =~ s/(.{1,76})/$1$eol/g;
+	}
+	$res;
+}
+
+# True if $content consists solely of printable ASCII (0x20..0x7E).
+# Anchored with \z so that a trailing newline is rejected too.
+sub validate_ascii
+{
+	my ($content) = @_;
+	return $content =~ /\A[\x20-\x7E]*\z/;
+}
+
+# True if $content, taken as a string of bytes, is well-formed UTF-8
+# for U+0020..U+FFFF without ASCII control characters.  Overlong forms,
+# UTF-16 surrogates and stray continuation bytes are rejected, so plain
+# ISO-8859-1 text with accented letters does not pass as UTF-8.
+sub validate_utf8
+{
+	my ($content) = @_;
+	return $content =~ /\A(?:
+		  [\x20-\x7E]
+		| [\xC2-\xDF][\x80-\xBF]
+		| \xE0[\xA0-\xBF][\x80-\xBF]
+		| [\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]
+		| \xED[\x80-\x9F][\x80-\xBF]
+	)*\z/x;
+}