package TermExtract::ICTCLAS;
use TermExtract::Calc_Imp;

use strict;
use Exporter ();
use vars qw(@ISA $VERSION @EXPORT);

@ISA = qw(TermExtract::Calc_Imp Exporter);
@EXPORT = qw();
$VERSION = "0.23";

# ========================================================================
# get_noun_frq -- Get noun frequency.
#                 The values of the hash are frequency of the noun.
#                 (Subroutine that collects compound nouns and how often
#                 each one occurs.)
#
#  Over-write TermExtract::Calc_Imp::get_noun_frq
#
#  usage   : $hash_ref = $self->get_noun_frq($data [, $mode]);
#    $data : name of the file holding the tagged text, or (when $mode is
#            'var') the tagged text itself
#    $mode : 'var' => $data is the text; anything else => $data is a file
#  returns : reference to a hash { "word word ..." => frequency }
#  dies    : when the input file cannot be opened
#
#  The parser below expects whitespace-separated "word/tag" tokens and
#  treats every input line as a hard compound-word boundary.
# ========================================================================

sub get_noun_frq {
    my $self = shift;
    my $data = shift;           # input data (file name or text)
    my $mode = shift || 0;      # flag: is $data a file name or a variable?
    my %cmp_noun_list = ();     # compound noun => frequency (return value)

    # Words excluded from the importance calculation.
    # NOTE(review): six of these eight literals are empty strings; they look
    # like multibyte words lost in an encoding conversion -- restore them
    # from a pristine copy of the module.
    $self->IgnoreWords("","","","֮","","","ʽ","");  
    # Declare the language as agglutinative (no delimiter between words).
    $self->IsAgglutinativeLang;

    # Input is a file: slurp it.
    if ($mode ne 'var') {
        local $/ = undef;
        # Three-argument open on a lexical handle, so that a file name can
        # never be interpreted as an open mode or a pipe.
        open(my $in, '<', $data) or die "Can not open input file. $!";
        $data = <$in>;
        close $in;
    }
    $data = '' unless defined $data;

    # Tags that count as nouns.  The original test for "an" was written
    # /\an$/ -- "\a" is the BEL character, so "an" never matched.
    my $noun_tag = qr{/(?:ng|n|nr|ns|nt|nz|nx|vn|an|i|j)$};

    foreach my $morph (split /\n/, $data) {
        next if $morph =~ /^\s*$/;

        # $status = 0   no previous word (start, or just after a boundary)
        #           1   previous word is a noun
        #                 (ng n nr ns nt nz nx vn an i j)
        #           2   previous word is an adjective (ag a)
        #           3   previous word is an auxiliary (u) or suffix (k)
        #           4   previous word is a joining conjunction (c)
        #           5   previous word is a distinguishing word (b)
        my $status = 0;

        my $rest   = 0;  # number of consecutive non-noun words at the tail
        my @seg    = (); # words of the compound being built

        foreach my $term (split(/\s+/, $morph)) {
            # Strip leading full-width spaces (GB2312 bytes A1 A1).  The
            # literal had been lost, leaving the no-op pattern /^+/.
            $term =~ s/^(?:\xA1\xA1)+//;

            # Numbers and delimiter symbols end the current compound.
            if ($term =~ /^[\s\+\-\%\&\$\*\#\^\|\/\>\<\;\:\[\]]/ || $term =~ /^[\d]+\//) {
                _increase_frq(\%cmp_noun_list, \@seg, \$rest);
                # A boundary leaves no "previous word"; without this reset a
                # following auxiliary/conjunction could start a compound.
                $status = 0;
                next;
            }

            # Noun: always joins, and makes the tail valid again.
            if ($term =~ $noun_tag) {
                $status = 1;
                push(@seg, $term);
                $rest = 0;
            }
            # Adjective: joins after nothing, an adjective, an
            # auxiliary/suffix or a conjunction; otherwise it starts a
            # new compound.
            elsif ($term =~ /\/ag$/ || $term =~ /\/a$/) {
                unless ($status == 0 || $status == 2 || $status == 3 || $status == 4) {
                    _increase_frq(\%cmp_noun_list, \@seg, \$rest);
                }
                push(@seg, $term);
                $rest++;
                $status = 2;
            }
            # Auxiliary / suffix: joins only after a noun or an adjective.
            elsif ($term =~ /\/u$/ || $term =~ /\/k$/) {
                if ($status == 1 || $status == 2) {
                    push(@seg, $term);
                    $rest++;
                }
                else {
                    _increase_frq(\%cmp_noun_list, \@seg, \$rest);
                }
                $status = 3;
            }
            # Conjunction: only "he" (GB2312 BA CD) and "yu" (GB2312 D3 EB)
            # qualify, and they join only after a noun.  The two literals
            # had been lost (leaving /^/, which is always true) and were
            # combined with bitwise "|" instead of "||".
            elsif ($term =~ /\/c$/ && ($term =~ /^\xBA\xCD/ || $term =~ /^\xD3\xEB/)) {
                if ($status == 1) {
                    push(@seg, $term);
                    $rest++;
                }
                else {
                    _increase_frq(\%cmp_noun_list, \@seg, \$rest);
                }
                $status = 4;
            }
            # Distinguishing word: joins after nothing, a noun, an
            # auxiliary/suffix or a conjunction.
            elsif ($term =~ /\/b$/) {
                if ($status == 0 || $status == 1 || $status == 3 || $status == 4) {
                    push(@seg, $term);
                    $rest++;
                }
                else {
                    _increase_frq(\%cmp_noun_list, \@seg, \$rest);
                }
                $status = 5;
            }
            # Any other part of speech ends the compound.
            else {
                _increase_frq(\%cmp_noun_list, \@seg, \$rest);
                $status = 0;
            }
        }
        # A line break ends the compound as well.
        _increase_frq(\%cmp_noun_list, \@seg, \$rest);
    }
    return \%cmp_noun_list;
}


# ---------------------------------------------------------------------------
#   _increase_frq  --  Increase the frequency of the pending compound word
#
#   usage : _increase_frq(frequency_of_compnoun, segment, rest_after_noun);
#
#     frequency_of_compnoun : hash ref, compound word => occurrence count
#     segment               : array ref of "word/tag" tokens collected so far
#     rest_after_noun       : scalar ref, number of tokens at the end of
#                             segment that came after the last noun
#
#   The trailing non-noun tokens are dropped, the tags are stripped, the
#   remaining words are joined with single spaces and that key is counted
#   once.  On return the segment is empty and the rest counter is 0.
# ---------------------------------------------------------------------------

sub _increase_frq {
    my ($frq_ref, $seg, $rest) = @_;

    # A compound must end in a noun: cut off whatever followed the last one.
    if (@{$seg}) {
        $#{$seg} = $#{$seg} - ${$rest};
    }
    ${$rest} = 0;

    if (@{$seg}) {
        my $allwords = "";
        for my $word (@{$seg}) {
            $word =~ s/\/[A-Z\$]+$//i;   # remove the "/tag" suffix
            $allwords = ($allwords eq "") ? $word : $allwords . ' ' . $word;
        }
        $frq_ref->{$allwords}++;
    }

    # Start the next compound from scratch.
    @{$seg} = ();
}

1;

__END__

=head1 NAME

    TermExtract::ICTCLAS
                -- Technical term extraction module ("ICTCLAS" version)

=head1 SYNOPSIS

    use TermExtract::ICTCLAS;

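=head1 DESCRIPTION

    This program takes input text that has been run through "ICTCLAS"
  (a Chinese part-of-speech tagging program) and, based on that result,
  extracts technical terms from the input text.

    For how to use this module, see the parent class
  (TermExtract::Calc_Imp) or the sample script below.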

=head2 Sample Script

 #!/opt/local/bin/perl5.32 -w
 
 #
 #  ex_ICT.pl
 #
 #  Reads the ICTCLAS processing result from a file and writes the
 #  technical terms and their importance to standard output.
 #
 #   version 0.07
 #
 #
 
 use TermExtract::ICTCLAS;
 #use strict;
 my $data = new TermExtract::ICTCLAS;
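 my $InputFile = "ICT_out.txt";    # specify the input file
 
 # Handling of abnormal process termination
 # (only when a lock directory is used)
 $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = 'sigexit';
 
 # Specify the output mode
 # 1 = technical term + importance, 2 = technical term only
 # 3 = comma-separated
 my $output_mode = 1;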
 
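 #
 # Choose which statistic of the adjacent (connected) words the importance
 # calculation uses: the "total count", the "distinct count" or
 # "perplexity".  Perplexity cannot be used with the learning function.
 # It is also possible not to use adjacent-word information at all; in
 # that case the importance is calculated from the term frequency
 # (combined with IDF, if that is configured).
 # (Default: the total count, $obj->use_total)
 #
 #$data->use_total;      # use the total count
 #$data->use_uniq;       # use the distinct count
 #$data->use_Perplexity; # use perplexity (TermExtract 3.04 or later)
 #$data->no_LR;          # do not use adjacency information (TermExtract 4.02 or later)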
 
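 #
 # Choose the term-frequency information that the importance calculation
 # multiplies with the adjacency information.
 # Combined with $data->no_LR; the importance can be computed from the
 # term frequency alone.
 # (Default: "Frequency", $data->use_frq)
 # TF also counts a term when it is used as part of another term.
 # Frequency does not count a term when it is used as part of another term.
 #
 #$data->use_TF;   # TF (Term Frequency) (TermExtract 4.02 or later)
 #$data->use_frq;  # term frequency by Frequency
 #$data->no_frq;   # do not use frequency information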
 
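 #
 # Choose whether the importance calculation uses the learning function.
 # (Default: not used, $obj->no_stat)
 #
 #$data->use_stat; # use the learning function
 #$data->no_stat;  # do not use the learning function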
 
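 #
 # Set whether the importance calculation puts more weight on "the
 # frequency of a term within the document" or on "the importance of the
 # adjacent words".
 # The default value is 1.
 # The larger the value, the more weight "the frequency of a term within
 # the document" receives.
 #
 #$data->average_rate(0.5);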
 
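 #
 # Choose whether to accumulate data in the learning-function DB.
 # When the importance calculation uses the learning function it is safer
 # to enable this: if the processed text contains words that are not
 # registered in the learning-function DB, the calculation does not work
 # correctly.
 # (Default: do not accumulate, $obj->no_storage)
 #
 #$data->use_storage; # accumulate
 #$data->no_storage;  # do not accumulate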
 
 #
 # Use SDBM_File as the DBM for the learning-function DB.
 # (Default: DB_File in BTREE mode)
 #
 #$data->use_SDBM;
 
 #
 # Set the file names of the databases used when the cumulative
 # statistics of past documents are used.
 # (Default: "stat.db" and "comb.db")
 #
 $data->stat_db("statICT.db");
 $data->comb_db("combICT.db");
 
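 #
 # Specify the temporary directory used for exclusive locking of the databases.
 # If the directory name is the empty string (the default), no locking is done.
 #
 #$data->lock_dir("lock_dir");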
 
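 #
 # Read the data from part-of-speech tagged text and return the list of
 # technical terms as an array.
 # (Set up to use the cumulative-statistics DB and the in-document frequency.)
 #
 #my @noun_list = $data->get_imp_word($str, 'var');     # input is a variable
 my @noun_list = $data->get_imp_word($InputFile); # input is a file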
 
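 #
 # Based on the part-of-speech tagged text file read previously, change
 # the mode and return the list of technical terms as an array.
 #$data->use_stat->no_frq;
 #my @noun_list2 = $data->get_imp_word();
 # Then combine that result with the result obtained in the other mode.
 #@noun_list = $data->result_filter (\@noun_list, \@noun_list2, 30, 1000);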
 
 #
 #  Print the technical term list and the calculated importance to standard output
 #
 foreach (@noun_list) {
    # do not display entries that consist of digits only
    next if $_->[0] =~ /^\d+$/;
    # do not display entries that are a single character (GB encoding)
    next if $_->[0] =~ /^[\x00-\x7F]$/;
    next if $_->[0] =~ /^[\x81-\xFE][\x40-\xFE]$/;
    next if $_->[0] =~ /^[\x81-\xEF][\x30-\x39][\x81-\xEF][\x30-\x39]$/;

    # display the result
    printf "%-60s %16.2f\n", $_->[0], $_->[1] if $output_mode == 1;
    printf "%s\n",           $_->[0]          if $output_mode == 2;
    printf "%s,",            $_->[0]          if $output_mode == 3;
 }

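=head1 Methods

    This module implements only get_imp_word; all other methods are
  implemented in the parent module TermExtract::Calc_Imp.
    get_imp_word takes the words extracted by part-of-speech tagging and
  builds compound words from them, based on the order of the individual
  words and their part-of-speech information.  For the other methods,
  see the POD documentation of TermExtract::Calc_Imp.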

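=head2 get_imp_word

    Builds compound words from the part-of-speech tagging result
  according to the rules below.  The first argument is the data to be
  processed and the second argument is the kind of the first argument.
  By default the first argument is a part-of-speech tagged text file.
  When the second argument is set to the string 'var', the first
  argument is interpreted as a scalar variable holding part-of-speech
  tagged text data.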

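    1. Each part of speech is joined as follows.
       (1) Nouns (ng n nr ns nt nz nx vn an i j) -- "noun" below
           Joins onto a noun, an adjective, an auxiliary, a suffix
             component or a conjunction ("he" U+548C, "yu" U+4E0E).
             Can be the head of a compound word.
       (2) Adjectives (ag, a)
           Joins onto an adjective, an auxiliary / suffix component or a
             conjunction ("he" U+548C, "yu" U+4E0E).  Can be the head of
             a compound word.
       (3) Auxiliaries (u), suffix components (k)
           Joins onto a noun or an adjective.
       (4) Conjunctions (c)
           Only "he" (U+548C) and "yu" (U+4E0E); joins onto a noun.
       (5) Distinguishing words (b)
           Joins onto a noun, an auxiliary or a conjunction ("he" U+548C,
             "yu" U+4E0E).  Can be the head of a compound word.

    2. A line break is treated as the end of a compound word.

    3. Full-width spaces at the start of a word are removed.

    4. A word that starts with one of the following symbols is treated as
       the end of a compound word.
        +-%\&\$*#^|<>;:

    5. A compound word must end with a noun; anything after the last noun
       is discarded.

    6. The importance calculation ignores eight words (auxiliaries and
       conjunctions).  Only two of them are still legible in this copy of
       the file: U+4E4B ("zhi", fourth) and U+5F0F ("shi", seventh).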

=head1 SEE ALSO

    TermExtract::Calc_Imp
    TermExtract::Chasen
    TermExtract::MeCab
    TermExtract::BrillsTagger
    TermExtract::EnglishPlainText
    TermExtract::ChainesPlainTextUC
    TermExtract::ChainesPlainTextGB
    TermExtract::JapanesePlainTextEUC
    TermExtract::JapanesePlainTextSJIS

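=head1 COPYRIGHT

    This program was written by Akira Maeda of the University of Tokyo
(maeda@lib.u-tokyo.ac.jp), based on the technical term extraction ideas of
Professor Hiroshi Nakagawa of the University of Tokyo.
The patch for the Chinese version was made by Hiroyuki Kojima of the
University of Tokyo (kojime@e.u-tokyo.ac.jp).

    The authors accept no responsibility whatsoever for any result
  arising from the use of this program.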

=cut
