# # $Id: _Classic.pm,v 2.0 2005/05/16 19:08:04 dankogai Exp $ # package Jcode::_Classic; use 5.004; use Carp; use strict; use vars qw($RCSID $VERSION $DEBUG); $RCSID = q$Id: _Classic.pm,v 2.0 2005/05/16 19:08:04 dankogai Exp $; $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; $DEBUG = $Jcode::DEBUG; use vars qw($USE_CACHE $NOXS); $USE_CACHE = 1; $NOXS = 0; print $RCSID, "\n" if $DEBUG; use Jcode::Constants qw(:all); sub new { my $class = shift; my ($thingy, $icode) = @_; my $r_str = ref $thingy ? $thingy : \$thingy; my $nmatch; ($icode, $nmatch) = getcode($r_str) unless $icode; convert($r_str, 'euc', $icode); my $self = [ $r_str, $icode, $nmatch, ]; carp "Object of class $class created" if $DEBUG >= 2; bless $self, $class; } sub r_str { $_[0]->[0] } sub icode { $_[0]->[1] } sub nmatch { $_[0]->[2] } sub set { my $self = shift; my ($thingy, $icode) = @_; my $r_str = ref $thingy ? $thingy : \$thingy; my $nmatch; ($icode, $nmatch) = getcode($r_str) unless $icode; convert($r_str, 'euc', $icode); $self->[0] = $r_str; $self->[1] = $icode; $self->[2] = $nmatch; $self->[3] = "Classic"; return $self; } sub append { my $self = shift; my ($thingy, $icode) = @_; my $r_str = ref $thingy ? $thingy : \$thingy; my $nmatch; ($icode, $nmatch) = getcode($r_str) unless $icode; convert($r_str, 'euc', $icode); ${$self->[0]} .= $$r_str; $self->[1] = $icode; $self->[2] = $nmatch; return $self; } sub jcode { return Jcode->new(@_) } sub euc { return ${$_[0]->[0]} } sub jis { return &euc_jis(${$_[0]->[0]})} sub sjis { return &euc_sjis(${$_[0]->[0]})} sub iso_2022_jp{return $_[0]->h2z->jis} sub jfold{ my $self = shift; my ($bpl, $nl) = @_; $bpl ||= 72; $nl ||= "\n"; my $r_str = $self->[0]; my @lines = (); my $len = 0; my $i = 0; while ($$r_str =~ m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo) { if ($len + length($1) > $bpl){ # fold! $i++; $len = 0; } $lines[$i] .= $1; $len += length($1); } defined($lines[$i]) or pop @lines; $$r_str = join($nl, @lines); return wantarray ? @lines : $self; } sub jlength { my $self = shift; my $r_str = $self->[0]; return scalar (my @char = $$r_str =~ m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo); } sub mime_encode{ my $self = shift; my $r_str = $self->[0]; my $lf = shift || "\n"; my $bpl = shift || 76; my ($trailing_crlf) = ($$r_str =~ /(\n|\r|\x0d\x0a)$/o); my $str = _mime_unstructured_header($$r_str, $lf, $bpl); not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; $str; } # # shamelessly stolen from # http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 # sub _add_encoded_word { require MIME::Base64; my($str, $line, $bpl) = @_; my $result = ''; while (length($str)) { my $target = $str; $str = ''; if (length($line) + 22 + ($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl) { $line =~ s/[ \t\n\r]*$/\n/; $result .= $line; $line = ' '; } while (1) { my $iso_2022_jp = jcode($target, 'euc')->iso_2022_jp; if (my $count = ($iso_2022_jp =~ tr/\x80-\xff//d)){ $DEBUG and warn $count; $target = jcode($iso_2022_jp, 'iso_2022_jp')->euc; } my $encoded = '=?ISO-2022-JP?B?' . MIME::Base64::encode_base64($iso_2022_jp, '') . '?='; if (length($encoded) + length($line) > $bpl) { $target =~ s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; $str = $1 . $str; } else { $line .= $encoded; last; } } } return $result . $line; } sub _mime_unstructured_header { my ($oldheader, $lf, $bpl) = @_; my(@words, @wordstmp, $i); my $header = ''; $oldheader =~ s/\s+$//; @wordstmp = split /\s+/, $oldheader; for ($i = 0; $i < $#wordstmp; $i++) { if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) { $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]"; } else { push(@words, $wordstmp[$i]); } } push(@words, $wordstmp[-1]); for my $word (@words) { if ($word =~ /^[\x21-\x7E]+$/) { $header =~ /(?:.*\n)*(.*)/; if (length($1) + length($word) > $bpl) { $header .= "$lf $word"; } else { $header .= $word; } } else { $header = _add_encoded_word($word, $header, $bpl); } $header =~ /(?:.*\n)*(.*)/; if (length($1) == $bpl) { $header .= "$lf "; } else { $header .= ' '; } } $header =~ s/\n? $/\n/; $header; } # see http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 #$lws = '(?:(?:\x0d\x0a)?[ \t])+'; #$ew_regex = '=\?ISO-2022-JP\?B\?([A-Za-z0-9+/]+=*)\?='; #$str =~ s/($ew_regex)$lws(?=$ew_regex)/$1/gio; #$str =~ s/$lws/ /go; $str =~ s/$ew_regex/decode_base64($1)/egio; sub mime_decode{ require MIME::Base64; # not use my $self = shift; my $r_str = $self->[0]; my $re_lws = '(?:(?:\r|\n|\x0d\x0a)?[ \t])+'; my $re_ew = '=\?[Ii][Ss][Oo]-2022-[Jj][Pp]\?[Bb]\?([A-Za-z0-9+/]+=*)\?='; $$r_str =~ s/($re_ew)$re_lws(?=$re_ew)/$1/sgo; $$r_str =~ s/$re_lws/ /go; $self->[2] = ($$r_str =~ s/$re_ew/jis_euc(MIME::Base64::decode_base64($1))/ego ); $self; } sub tr{ require Jcode::Tr; # not use my $self = shift; $self->[2] = Jcode::Tr::tr($self->[0], @_); return $self; } # # load needed module depending on the configuration just once! # use vars qw(%PKG_LOADED); sub load_module{ my $pkg = shift; return $pkg if $PKG_LOADED{$pkg}++; unless ($NOXS){ eval qq( require $pkg; ); unless ($@){ carp "$pkg loaded." if $DEBUG; return $pkg; } } $pkg .= "::NoXS"; eval qq( require $pkg; ); unless ($@){ carp "$pkg loaded" if $DEBUG; }else{ croak "Loading $pkg failed!"; } $pkg; } sub ucs2{ load_module("Jcode::Unicode"); euc_ucs2(${$_[0]->[0]}); } sub utf8{ load_module("Jcode::Unicode"); euc_utf8(${$_[0]->[0]}); } sub getcode { my $thingy = shift; my $r_str = ref $thingy ? $thingy : \$thingy; my ($code, $nmatch, $sjis, $euc, $utf8) = ("", 0, 0, 0, 0); if ($$r_str =~ /$RE{BIN}/o) { # 'binary' my $ucs2; $ucs2 += length($1) while $$r_str =~ /(\x00$RE{ASCII})+/go; if ($ucs2){ # smells like raw unicode ($code, $nmatch) = ('ucs2', $ucs2); }else{ ($code, $nmatch) = ('binary', 0); } } elsif ($$r_str !~ /[\e\x80-\xff]/o) { # not Japanese ($code, $nmatch) = ('ascii', 1); } # 'jis' elsif ($$r_str =~ m[ $RE{JIS_0208}|$RE{JIS_0212}|$RE{JIS_ASC}|$RE{JIS_KANA} ]ox) { ($code, $nmatch) = ('jis', 1); } else { # should be euc|sjis|utf8 # use of (?:) by Hiroki Ohzaki $sjis += length($1) while $$r_str =~ /((?:$RE{SJIS_C})+)/go; $euc += length($1) while $$r_str =~ /((?:$RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})+)/go; $utf8 += length($1) while $$r_str =~ /((?:$RE{UTF8})+)/go; # $utf8 *= 1.5; # M. Takahashi's suggestion $nmatch = _max($utf8, $sjis, $euc); carp ">DEBUG:sjis = $sjis, euc = $euc, utf8 = $utf8" if $DEBUG >= 3; $code = ($euc > $sjis and $euc > $utf8) ? 'euc' : ($sjis > $euc and $sjis > $utf8) ? 'sjis' : ($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef; } return wantarray ? ($code, $nmatch) : $code; } sub convert{ my $thingy = shift; my $r_str = ref $thingy ? $thingy : \$thingy; my ($ocode, $icode, $opt) = @_; my $nmatch; ($icode, $nmatch) = getcode($r_str) unless $icode; return $$r_str if $icode eq $ocode and !defined $opt; # do nothin' no strict qw(refs); my $method; # convert to EUC load_module("Jcode::Unicode") if $icode =~ /ucs2|utf8/o; if ($icode and defined &{$method = "$icode" . "_euc"}){ carp "Dispatching \&$method" if $DEBUG >= 2; &{$method}($r_str) ; } # h2z or z2h if ($opt){ my $cmd = ($opt =~ /^z/o) ? "h2z" : ($opt =~ /^h/o) ? "z2h" : undef; if ($cmd){ require Jcode::H2Z; &{'Jcode::H2Z::' . $cmd}($r_str); } } # convert to $ocode load_module("Jcode::Unicode") if $ocode =~ /ucs2|utf8/o; if ($ocode and defined &{$method = "euc_" . $ocode}){ carp "Dispatching \&$method" if $DEBUG >= 2; &{$method}($r_str) ; } $$r_str; } sub h2z { require Jcode::H2Z; # not use my $self = shift; $self->[2] = Jcode::H2Z::h2z($self->[0], @_); return $self; } sub z2h { require Jcode::H2Z; # not use my $self = shift; $self->[2] = &Jcode::H2Z::z2h($self->[0], @_); return $self; } # JIS<->EUC sub jis_euc { my $thingy = shift; my $r_str = ref $thingy ? $thingy : \$thingy; $$r_str =~ s( ($RE{JIS_0212}|$RE{JIS_0208}|$RE{JIS_ASC}|$RE{JIS_KANA}) ([^\e]*) ) { my ($esc, $str) = ($1, $2); if ($esc !~ /$RE{JIS_ASC}/o) { $str =~ tr/\x21-\x7e/\xa1-\xfe/; if ($esc =~ /$RE{JIS_KANA}/o) { $str =~ s/([\xa1-\xdf])/\x8e$1/og; } elsif ($esc =~ /$RE{JIS_0212}/o) { $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; } } $str; }geox; $$r_str; } # # euc_jis # # Based upon the contribution of # Kazuto Ichimura # optimized by sub euc_jis{ my $thingy = shift; my $r_str = ref $thingy ? $thingy : \$thingy; $$r_str =~ s{ ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) }{ my $str = $1; my $esc = ( $str =~ tr/\x8E//d ) ? $ESC{KANA} : ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} : $ESC{JIS_0208}; $str =~ tr/\xA1-\xFE/\x21-\x7E/; $esc . $str . $ESC{ASC}; }geox; $$r_str =~ s/\Q$ESC{ASC}\E (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; $$r_str; } # EUC<->SJIS my %_S2E = (); my %_E2S = (); sub sjis_euc { my $thingy = shift; my $r_str = ref $thingy ? $thingy : \$thingy; $$r_str =~ s( ($RE{SJIS_C}|$RE{SJIS_KANA}) ) { my $str = $1; unless ($_S2E{$1}){ my ($c1, $c2) = unpack('CC', $str); if (0xa1 <= $c1 && $c1 <= 0xdf) { $c2 = $c1; $c1 = 0x8e; } elsif (0x9f <= $c2) { $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60); $c2 += 2; } else { $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61); $c2 += 0x60 + ($c2 < 0x7f); } $_S2E{$str} = pack('CC', $c1, $c2); } $_S2E{$str}; }geox; $$r_str; } # sub euc_sjis { my $thingy = shift; my $r_str = ref $thingy ? $thingy : \$thingy; $$r_str =~ s( ($RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212}) ) { my $str = $1; unless ($_E2S{$str}){ my ($c1, $c2) = unpack('CC', $str); if ($c1 == 0x8e) { # SS2 $_E2S{$str} = chr($c2); } elsif ($c1 == 0x8f) { # SS3 $_E2S{$str} = $CHARCODE{UNDEF_SJIS}; }else { #SS1 or X0208 if ($c1 % 2) { $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71); $c2 -= 0x60 + ($c2 < 0xe0); } else { $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70); $c2 -= 2; } $_E2S{$str} = pack('CC', $c1, $c2); } } $_E2S{$str}; }geox; $$r_str; } # # Util. Functions # sub _max { my $result = shift; for my $n (@_){ $result = $n if $n > $result; } return $result; } 1;