/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.fsm.DFSA;
import edu.stanford.nlp.fsm.DFSAState;
import edu.stanford.nlp.fsm.DFSATransition;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.wordseg.ChineseStringUtils;
import edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

public class MaxMatchSegmenter
implements WordSegmenter {
    private static final boolean DEBUG = false;
    private static Redwood.RedwoodChannels logger = Redwood.channels(MaxMatchSegmenter.class);
    private final Set<String> words = Generics.newHashSet();
    private int len = -1;
    private int edgesNb = 0;
    private static final int maxLength = 10;
    private List<DFSAState<Word, Integer>> states;
    private DFSA<Word, Integer> lattice = null;
    private static final Pattern chineseStartChars = Pattern.compile("^[\u4e00-\u9fff]");
    private static final Pattern chineseEndChars = Pattern.compile("[\u4e00-\u9fff]$");
    private static final Pattern chineseChars = Pattern.compile("[\u4e00-\u9fff]");
    private static final Pattern excludeChars = Pattern.compile("[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u96f6\u3007\u767e\u5343\u4e07\u4ebf\u5169\u25cb\u25ef\u3021-\u3029\u3038-\u303a-#$%&'*+/@_\uff0d\uff03\uff04\uff05\uff06\uff07\uff0a\uff0b\uff0f\uff20\uff3f]");
    private static final long serialVersionUID = 8263734344886904724L;

    @Override
    public void initializeTraining(double numTrees) {
    }

    @Override
    public void train(Collection<Tree> trees) {
        for (Tree tree : trees) {
            this.train(tree);
        }
    }

    @Override
    public void train(Tree tree) {
        this.train((List<TaggedWord>)tree.taggedYield());
    }

    @Override
    public void train(List<TaggedWord> sentence) {
        for (TaggedWord word : sentence) {
            if (word.word().length() > 10) continue;
            this.addStringToLexicon(word.word());
        }
    }

    @Override
    public void finishTraining() {
    }

    @Override
    public void loadSegmenter(String filename) {
        this.addLexicon(filename);
    }

    @Override
    public List<HasWord> segment(String s) {
        this.buildSegmentationLattice(s);
        ArrayList<Word> sent = this.maxMatchSegmentation();
        MaxMatchSegmenter.printlnErr("raw output: " + SentenceUtils.listToString(sent));
        ArrayList<Word> postProcessedSent = MaxMatchSegmenter.postProcessSentence(sent);
        MaxMatchSegmenter.printlnErr("processed output: " + SentenceUtils.listToString(postProcessedSent));
        ChineseStringUtils.CTPPostProcessor postProcessor = new ChineseStringUtils.CTPPostProcessor();
        String postSentString = postProcessor.postProcessingAnswer(postProcessedSent.toString(), false);
        MaxMatchSegmenter.printlnErr("Sighan2005 output: " + postSentString);
        String[] postSentArray = postSentString.split("\\s+");
        ArrayList<Word> postSent = new ArrayList<Word>();
        for (String w : postSentArray) {
            postSent.add(new Word(w));
        }
        return new ArrayList<HasWord>(postSent);
    }

    private void addStringToLexicon(String str) {
        if (str.equals("")) {
            logger.warn("WARNING: blank line in lexicon");
        } else if (str.contains(" ")) {
            logger.warn("WARNING: word with space in lexicon");
        } else {
            if (MaxMatchSegmenter.excludeChar(str)) {
                MaxMatchSegmenter.printlnErr("skipping word: " + str);
                return;
            }
            this.words.add(str);
        }
    }

    private void addLexicon(String filename) {
        try {
            String lexiconLine;
            BufferedReader lexiconReader = new BufferedReader(new InputStreamReader((InputStream)new FileInputStream(filename), "UTF-8"));
            while ((lexiconLine = lexiconReader.readLine()) != null) {
                this.addStringToLexicon(lexiconLine);
            }
        }
        catch (FileNotFoundException e) {
            logger.error("Lexicon not found: " + filename);
            System.exit(-1);
        }
        catch (IOException e) {
            logger.error("IO error while reading: " + filename, e);
            throw new RuntimeException(e);
        }
    }

    private void buildSegmentationLattice(String s) {
        this.edgesNb = 0;
        this.len = s.length();
        this.states = new ArrayList<DFSAState<Word, Integer>>();
        this.lattice = new DFSA("wordLattice");
        for (int i = 0; i <= s.length(); ++i) {
            this.states.add(new DFSAState<Word, Integer>(i, this.lattice));
        }
        this.lattice.setInitialState(this.states.get(0));
        this.states.get(this.len).setAccepting(true);
        for (int start = 0; start < this.len; ++start) {
            for (int end = this.len; end > start; --end) {
                String str = s.substring(start, end);
                assert (str.length() > 0);
                boolean isOneChar = start + 1 == end;
                boolean isInDict = this.words.contains(str);
                if (!isInDict && !isOneChar) continue;
                double cost = isInDict ? 1.0 : 100.0;
                DFSATransition<Word, Integer> trans = new DFSATransition<Word, Integer>(null, this.states.get(start), this.states.get(end), new Word(str), null, cost);
                this.states.get(start).addTransition(trans);
                ++this.edgesNb;
            }
        }
    }

    public ArrayList<Word> maxMatchSegmentation() {
        return this.segmentWords(MatchHeuristic.MINWORDS);
    }

    public ArrayList<Word> segmentWords(MatchHeuristic h) throws UnsupportedOperationException {
        int i;
        if (this.lattice == null || this.len < 0) {
            throw new UnsupportedOperationException("segmentWords must be run first");
        }
        ArrayList<Word> segmentedWords = new ArrayList<Word>();
        double[] costs = new double[this.len + 1];
        ArrayList<DFSATransition<Word, Integer>> bptrs = new ArrayList<DFSATransition<Word, Integer>>();
        for (i = 0; i < this.len + 1; ++i) {
            bptrs.add(null);
        }
        costs[0] = 0.0;
        for (i = 1; i <= this.len; ++i) {
            costs[i] = Double.MAX_VALUE;
        }
        for (int start = 0; start < this.len; ++start) {
            DFSAState<Word, Integer> fromState = this.states.get(start);
            Collection<DFSATransition<Word, Integer>> trs = fromState.transitions();
            for (DFSATransition<Word, Integer> tr : trs) {
                DFSAState<Word, Integer> toState = tr.getTarget();
                double lcost = tr.score();
                int end = toState.stateID();
                if (h == MatchHeuristic.MINWORDS) {
                    if (!(costs[start] + 1.0 < costs[end])) continue;
                    costs[end] = costs[start] + lcost;
                    bptrs.set(end, tr);
                    continue;
                }
                if (h == MatchHeuristic.MAXWORDS) {
                    if (!(costs[start] + 1.0 < costs[end])) continue;
                    costs[end] = costs[start] - lcost;
                    bptrs.set(end, tr);
                    continue;
                }
                throw new UnsupportedOperationException("unimplemented heuristic");
            }
        }
        i = this.len;
        while (i > 0) {
            DFSATransition tr = (DFSATransition)bptrs.get(i);
            DFSAState fromState = tr.getSource();
            Word word = (Word)tr.getInput();
            if (!word.word().equals(" ")) {
                segmentedWords.add(0, word);
            }
            i = (Integer)fromState.stateID();
        }
        return new ArrayList<Word>(segmentedWords);
    }

    public ArrayList<Word> greedilySegmentWords(String s) {
        ArrayList<Word> segmentedWords = new ArrayList<Word>();
        int length = s.length();
        int start = 0;
        while (start < length) {
            int end;
            for (end = Math.min(length, start + 10); end > start + 1; --end) {
                String nextWord = s.substring(start, end);
                if (!this.words.contains(nextWord)) continue;
                segmentedWords.add(new Word(nextWord));
                break;
            }
            if (end == start + 1) {
                segmentedWords.add(new Word(new String(new char[]{s.charAt(start)})));
                ++start;
                continue;
            }
            start = end;
        }
        return new ArrayList<Word>(segmentedWords);
    }

    /*
     * Loose catch block
     */
    public static void main(String[] args) {
        Properties props = StringUtils.argsToProperties(args);
        SeqClassifierFlags flags = new SeqClassifierFlags(props);
        MaxMatchSegmenter seg = new MaxMatchSegmenter();
        String lexiconFile = props.getProperty("lexicon");
        if (lexiconFile != null) {
            seg.addLexicon(lexiconFile);
        } else {
            logger.error("Error: no lexicon file!");
            System.exit(1);
        }
        Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
        sighanRW.init(flags);
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        PrintWriter stdoutW = new PrintWriter(System.out);
        int lineNb = 0;
        block2: while (true) {
            logger.info("line: " + ++lineNb);
            String line = br.readLine();
            if (line != null) {
                String outputLine = null;
                if (props.getProperty("greedy") != null) {
                    ArrayList<Word> sentence = seg.greedilySegmentWords(line);
                    outputLine = SentenceUtils.listToString(sentence);
                } else if (props.getProperty("maxwords") != null) {
                    seg.buildSegmentationLattice(line);
                    outputLine = SentenceUtils.listToString(seg.segmentWords(MatchHeuristic.MAXWORDS));
                } else {
                    seg.buildSegmentationLattice(line);
                    outputLine = SentenceUtils.listToString(seg.maxMatchSegmentation());
                }
                StringReader strR = new StringReader(outputLine);
                Iterator<List<CoreLabel>> itr = sighanRW.getIterator(strR);
                while (true) {
                    if (!itr.hasNext()) continue block2;
                    sighanRW.printAnswers(itr.next(), stdoutW);
                }
            }
            break;
        }
        catch (IOException e) {
            // empty catch block
        }
        stdoutW.flush();
    }

    private static void printlnErr(String s) {
        EncodingPrintWriter.err.println(s, "UTF-8");
    }

    private static ArrayList<Word> postProcessSentence(ArrayList<Word> sent) {
        ArrayList<Word> newSent = new ArrayList<Word>();
        for (Word word : sent) {
            if (newSent.size() > 0) {
                String prevWord = ((Word)newSent.get(newSent.size() - 1)).toString();
                String curWord = word.toString();
                String prevChar = prevWord.substring(prevWord.length() - 1);
                String curChar = curWord.substring(0, 1);
                if (!MaxMatchSegmenter.isChinese(prevChar) && !MaxMatchSegmenter.isChinese(curChar)) {
                    Word mergedWord = new Word(prevWord + curWord);
                    newSent.set(newSent.size() - 1, mergedWord);
                    continue;
                }
            }
            newSent.add(word);
        }
        return new ArrayList<Word>(newSent);
    }

    private static boolean startsWithChinese(String str) {
        return chineseStartChars.matcher(str).matches();
    }

    private static boolean endsWithChinese(String str) {
        return chineseEndChars.matcher(str).matches();
    }

    private static boolean isChinese(String str) {
        return chineseChars.matcher(str).matches();
    }

    private static boolean excludeChar(String str) {
        return excludeChars.matcher(str).matches();
    }

    public static enum MatchHeuristic {
        MINWORDS,
        MAXWORDS,
        MAXLEN;

    }
}

