package TermExtract::JapanesePlainTextEUC;
use TermExtract::Calc_Imp;

use strict;
use Exporter ();
use vars qw(@ISA $VERSION @EXPORT);

@ISA = qw(TermExtract::Calc_Imp Exporter);
@EXPORT = qw();
$VERSION = "0.22";


# ========================================================================
# get_noun_frq -- Get noun frequency.
#                 The values of the hash are frequency of the noun.
#
#  Over-write TermExtract::Calc_Imp::get_noun_frq
#
#  Arguments:
#    $data -- path of the input file, or the text itself when $mode is
#             'var'.
#    $mode -- 'var' : $data holds the text to analyse.
#             anything else (default 0) : $data is a file name.
#
#  Returns:
#    Reference to a hash.  Each key is a candidate term written as its
#    units joined by single spaces; each value is the number of times
#    that term was found.
#
#  Dies when the input file cannot be opened.
#
# ========================================================================

sub get_noun_frq {
    my $self = shift;
    my $data = shift;           # input file name, or the text itself
    my $mode = shift || 0;      # 'var' means $data is the text itself
    my %cmp_noun_list = ();     # term (units joined by ' ') => count

    # Inherited from the parent class (not visible in this file).
    # NOTE(review): presumably marks the language as one written without
    # spaces between words -- confirm in TermExtract::Calc_Imp.
    $self->IsAgglutinativeLang;

    # Input is a file: slurp it.
    if ($mode ne 'var') {
        # Three-argument open on a lexical handle, so that characters in the
        # file name ('<', '>', '|', leading/trailing blanks, "-") are never
        # interpreted as an open mode or a command.
        open(my $in, '<', $data) or die "Can not open input file. $!";
        local $/ = undef;
        $data = <$in>;
        close $in;
    }
    $data = "" unless defined $data;

    # Every input line is analysed on its own, so a term never spans a
    # line break.
    foreach my $line (split /\n/, $data) {
        next if $line eq "";
        my $terms = get_katakana_kanji($line);
        foreach my $cmp_noun (@$terms) {
            # A term needs at least two units ...
            next if @$cmp_noun < 2;
            # ... and its first unit must be a true value.
            next unless $cmp_noun->[0];
            $cmp_noun_list{ join ' ', @$cmp_noun }++;
        }
    }

    return \%cmp_noun_list;
}

# Subroutine that extracts kanji, katakana and alphanumeric units from one line.
# ------------------------------------------------------------------------
# get_katakana_kanji -- Split one line of EUC-JP bytes into candidate terms.
#
#  Argument:
#    $word -- the line to analyse (byte string).
#
#  Returns:
#    Reference to an array of terms.  Each term is a reference to an array
#    of units, in input order:
#      * every kanji (two-byte \xB0-\xFE row, or three-byte sequence) is a
#        unit of its own;
#      * a run of ASCII bytes / full-width Latin letters is one unit;
#      * a run of katakana (including the long-vowel mark \xA1\xBC) is one
#        unit.
#    Any other two-byte character of rows \xA1-\xA8 ends the current term.
#    A term is stored each time such a delimiter is met, even if no unit
#    has been collected yet, so the result may contain empty terms.
#    Bytes that match none of the patterns are skipped without ending the
#    current run or term.
# ------------------------------------------------------------------------
sub get_katakana_kanji {
    my $word = shift;
    my @terms    = ();   # finished terms (array of array refs)
    my @cmp_noun = ();   # units of the term being assembled
    my $pool     = "";   # current alphanumeric or katakana run
    my $status   = 0;    # kind of the previous character:
                         #   0 = kanji / delimiter / start of line
                         #   1 = ASCII byte or full-width Latin letter
                         #   2 = katakana

    return \@terms unless defined $word && length $word;

    # Close the current run and append it to the term as one unit.
    my $end_run = sub { push @cmp_noun, $pool; $pool = ""; };
    # Store a copy of the collected units as one term and start a new one.
    my $end_term = sub { push @terms, [@cmp_noun]; @cmp_noun = (); };

    # Scan with \G and /gc so the remaining text is not copied for every
    # character; on a failed match /c keeps pos() where it was.
    my $length = length $word;
    pos($word) = 0;
    while (pos($word) < $length) {
        # ASCII byte or full-width Latin letter: extends an alphanumeric run.
        if ($word =~ /\G([\x00-\x7E]|\xA3[\xC1-\xFA])/gc) {
            my $char = $1;
            $end_run->() if $status == 2;
            $pool .= $char;
            $status = 1;
        }
        # Other two-byte characters of rows \xA1-\xA8.
        elsif ($word =~ /\G([\xA1-\xA8][\xA0-\xFE])/gc) {
            my $char = $1;
            # Long-vowel mark or katakana: extends a katakana run.
            if ($char eq "\xA1\xBC" || $char =~ /^\xA5/) {
                $end_run->() if $status == 1;
                $pool .= $char;
                $status = 2;
            }
            # Anything else in these rows is a delimiter: the term ends here.
            else {
                $end_run->() if $status != 0;
                $status = 0;
                $end_term->();
            }
        }
        # Two-byte kanji, or a three-byte sequence: one unit per character.
        # NOTE(review): EUC-JP uses only 0x8F as the lead byte of three-byte
        # characters; the wider \x8F-\xEF range is kept from the original --
        # confirm whether it is intentional.
        elsif ($word =~ /\G([\xB0-\xFE][\xA0-\xFE]|[\x8F-\xEF][\xA0-\xFE][\xA0-\xFE])/gc) {
            my $char = $1;
            $end_run->() if $status != 0;
            $status = 0;
            push @cmp_noun, $char;
        }
        # Half-width katakana (0x8E + one byte): skipped like any other
        # unsupported input, but as ONE two-byte character, so that its
        # second byte cannot be paired with the lead byte of the next
        # character and be mistaken for a kanji.
        elsif ($word =~ /\G\x8E[\xA1-\xDF]/gc) {
            # nothing to collect
        }
        # Unrecognised byte: skip it.
        else {
            pos($word) = pos($word) + 1;
        }
    }

    # Flush the run and the term that are still open at the end of the line.
    push @cmp_noun, $pool if $status != 0;
    $end_term->() if @cmp_noun;
    return \@terms;
}

# Register a compound noun (the units collected so far) as one term.
# add_cmp_noun -- Append the collected units to the term list as one term.
#
#   $terms    : reference to the array of terms; receives a reference to a
#               fresh copy of the units.
#   $cmp_noun : reference to the array of units; emptied afterwards so the
#               caller can reuse it for the next term.
sub add_cmp_noun {
    my ($terms, $cmp_noun) = @_;
    # Store a copy, not the caller's array, because that array is cleared
    # on the next line.
    push @{$terms}, [ @{$cmp_noun} ];
    @{$cmp_noun} = ();
}

1;

__END__

=head1 NAME

    TermExtract::JapanesePlainTextEUC
     -- Automatic technical-term extraction module
        (Japanese text, "katakana / kanji extraction", EUC version)

=head1 SYNOPSIS

    use TermExtract::JapanesePlainTextEUC;

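=head1 DESCRIPTION

    A program that extracts technical terms directly from Japanese text
  data (EUC).

    For how to use this module, see the parent class (TermExtract::Calc_Imp)
  or the sample script below.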

=head2 Sample Script

 #!/opt/local/bin/perl5.34 -w
 
 #
 #  ex_JPTE.pl
 #
 #  ɸϤѸȤν٤֤ץ
 #  ʸ֥ʡEUC
 #
 #   version 0.05
 #
 #
 
 use TermExtract::JapanesePlainTextEUC;
 #use strict;
 my $data = new TermExtract::JapanesePlainTextEUC;
 my $InputFile = "JPTE_out.txt";    # ϥե
 
 # ץΰ۾ｪλ
 # (åǥ쥯ȥѤΤߡ
 $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = 'sigexit';
 
 # ϥ⡼ɤ
 # 1  Ѹܽ١2  ѸΤ
 # 3  ޶ڤ
 my $output_mode = 1;
 
 #
 # ٷ׻ǡϢܸ"ٿ""ۤʤ""ѡץ쥭ƥ"Τ
 # Ȥ뤫򡣥ѡץ쥭ƥϡֳؽǽפȤʤ
 # ޤ"ϢܸξȤʤ"⤢ꡢξѸи
 # (ꤵƤIDFȤ߹碌ˤǽٷ׻Ԥ
 # ʥǥեȤ"ٿ"Ȥ $obj->use_total)
 #
 #$data->use_total;      # ٿȤ
 #$data->use_uniq;       # ۤʤȤ
 #$data->use_Perplexity; # ѡץ쥭ƥȤ(TermExtract 3.04 ʾ)
 #$data->no_LR;          # ܾȤʤ (TermExtract 4.02 ʾ)
 
 #
 # ٷ׻ǡϢܾ˳ݤ碌Ѹиپ򤹤
 # $data->no_LR; ȤȤ߹碌Ѹи٤Τߤν٤⻻вǽ
 # ʥǥեȤ "Frequency" $data->use_frq)
 # TFϤѸ줬¾Ѹΰ˻ȤƤˤ⥫
 # Frequency Ѹ줬¾Ѹΰ˻ȤƤ˥Ȥʤ
 #
 #$data->use_TF;   # TF (Term Frequency) (TermExtract 4.02 ʾ)
 #$data->use_frq;  # FrequencyˤѸ
 #$data->no_frq;   # پȤʤ
 
 #
 # ٷ׻ǡؽǽȤɤ
 # ʥǥեȤϡѤʤ $obj->no_stat)
 #
 #$data->use_stat; # ؽǽȤ
 #$data->no_stat;  # ؽǽȤʤ
 
 #
 # ٷ׻ǡ֥ɥѸ١פȡϢܸν١
 # ΤɤŤ򤪤ꤹ롣
 # ǥեͤϣ
 # ͤ礭ۤɡ֥ɥѸ١פŤޤ
 #
 #$data->average_rate(0.5);
 
 #
 # ؽǽDB˥ǡѤ뤫ɤ
 # ٷ׻ǡؽǽȤȤϡåȤƤۤ
 # ̵񡣽оݤ˳ؽǽDBϿƤʤ줬ޤޤ
 # ưʤ
 # ʥǥեȤϡѤʤ $obj->no_storage
 #
 #$data->use_storage; # Ѥ
 #$data->no_storage;  # Ѥʤ
 
 #
 # ؽǽDB˻ѤDBMSDBM_File˻
 # ʥǥեȤϡDB_FileBTREE⡼ɡ
 #
 #$data->use_SDBM;
 
 #
 # ΥɥȤפȤΥǡ١
 # ե̾򥻥å
 # ʥǥեȤ "stat.db""comb.db"
 #
 #$data->stat_db("statUC.db");
 #$data->comb_db("combUC.db");
 
 #
 # ǡ١¾åΤΰǥ쥯ȥ
 # ǥ쥯ȥ̾ʸʥǥեȡˤξϥåʤ
 #
 #$data->lock_dir("lock_dir");
 
 #
 # ǡɤ߹
 # ѸꥹȤ֤
 # DBѡɥٻѤ˥åȡ
 #
 #my @noun_list = $data->get_imp_word($str, 'var');     # Ϥѿ
 my @noun_list = $data->get_imp_word($InputFile); # Ϥե
 
 #
 # ɤ߹ƥȥե򸵤
 # ⡼ɤѤơѸꥹȤ֤
 #$data->use_stat->no_frq;
 #my @noun_list2 = $data->get_imp_word();
 # ޤη̤̤Υ⡼ɤˤ̤ȳݤ碌
 #@noun_list = $data->result_filter (\@noun_list, \@noun_list2, 30, 1000);
 
 #
 #  ѸꥹȤȷ׻٤ɸϤ˽Ф
 #
 foreach (@noun_list) {
    # աɽʤ
    next if $_->[0] =~ /^()*(ʿ)*(\d+ǯ)*(\d+)*(\d+)*()*()*(\d+)*(\d+ʬ)*(\d+)*$/;
 
    # ͤΤߤɽʤ
    next if $_->[0] =~ /^\d+$/;
 
    # ɽ$output_mode˱ơͼѹ
    printf "%-60s %16.2f\n", $_->[0], $_->[1] if $output_mode == 1;
    printf "%s\n",           $_->[0]          if $output_mode == 2;
    printf "%s,",            $_->[0]          if $output_mode == 3;
 }
 
 
=head1 Methods

    Υ⥸塼Ǥϡget_imp_word Τ߼ʳΥ᥽åɤϿ
  ⥸塼 TermExtract::Calc_Imp ǼƤ롣
    get_imp_word ϥȥåץɤˤʸϤʣñ̤ޤǤʬ䤷
  롣ʳΥ᥽åɤˤĤƤϡTermExtract::Calc_Imp PODɥ
  Ȥ򻲾Ȥ뤳ȡ

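=head2 get_imp_word

    Extracts compound words from Japanese text according to the rules
  below. The first argument is the data to process and the second argument
  is the kind of the first argument. By default the first argument is a
  Japanese text file. When the second argument is set to the string 'var',
  the first argument is interpreted as a scalar variable holding the
  Japanese text data.

    (1) Compound words are extracted from consecutive "kanji", "katakana"
        and "alphanumerics and 1-byte symbols".
    (2) A line break in the text is treated as a boundary between compound
        words.
    (3) "Kanji" form units of one character each; "katakana" and
        "alphanumerics and 1-byte symbols" form units of one word (run)
        each.
    (4) Two or more consecutive units of (3) are taken as a technical term.
    (5) Importance is calculated in the units of (3).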

=head1 SEE ALSO

    TermExtract::Calc_Imp
    TermExtract::Chasen
    TermExtract::MeCab
    TermExtract::BrillsTagger
    TermExtract::EnglishPlainText
    TermExtract::ChainesPlainTextUC
    TermExtract::ChainesPlainTextGB
    TermExtract::ICTCLAS
    TermExtract::JapanesePlainTextSJIS

=head1 COPYRIGHT

    Υץϡ ϯ (maeda@lib.u-tokyo.ac.jp)
  ΤǤ롣ѸФΥǥ ͵ֶʸ
  ȥåץˤѸٷ׻ȡɽȥƥ
  ȡޥ˥󥰡(ھԡĵɧ нŹ 2003.10ˤ˵ܤ
  ʡФˤ륭ڤФΥǥ򸵤ˤƤ롣

    ʤܥץλѤˤʤ̤˴ؤƤǤ
  Ǥʤ

=cut
