/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.parser.eval;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.File;
import java.io.FileFilter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;

public class TreebankStats {
    /** Redwood channel used by {@code main} for usage and error messages. */
    private static final Redwood.RedwoodChannels log = Redwood.channels(TreebankStats.class);

    /** Smallest number of command-line arguments accepted (a language plus one path). */
    private static final int MIN_ARGS = 2;

    /**
     * Word types collected while {@code makeVocab} is set. This is static, so it is shared by
     * every instance and read by {@code ObservedCorpusStats} when computing OOV figures.
     */
    private static Set<String> trainVocab;

    /** Language whose name labels the printed statistics. */
    private final Language languageName;

    /** Language-specific parameters; supplies the {@code DiskTreebank} instances. */
    private final TreebankLangParserParams tlpp;

    /** Paths handed to {@code DiskTreebank.loadPath}. */
    private final List<String> pathNames;

    /** File names belonging to each split; populated by {@code useSplit(String)}. */
    private Map<Split, Set<String>> splitFileLists;

    /** True once {@code useSplit(String)} has loaded all split lists successfully. */
    private boolean useSplit;

    /** When true, the next {@code gatherStats} call rebuilds {@code trainVocab}. */
    private boolean makeVocab;

    /**
     * Creates a statistics collector. The arguments are stored as given; the path list is not
     * copied.
     *
     * @param langName language used to label the output
     * @param paths    treebank paths to load
     * @param tlpp     parameters object used to create {@code DiskTreebank} instances
     */
    public TreebankStats(Language langName, List<String> paths, TreebankLangParserParams tlpp) {
        this.tlpp = tlpp;
        this.pathNames = paths;
        this.languageName = langName;
    }

    /**
     * Loads the train/test/dev file lists found at {@code prefix + ".train"},
     * {@code prefix + ".test"} and {@code prefix + ".dev"}. Each line of a list file is stored
     * as one file name.
     *
     * <p>The lists are assembled in a local map and only published once all three files have
     * been read. A failed call therefore leaves any previously loaded split untouched instead
     * of replacing it with a partial one.
     *
     * @param prefix path prefix shared by the three split list files
     * @return {@code true} if all three list files exist and were loaded; {@code false} if any
     *         of them is missing, in which case this object's state is unchanged
     */
    public boolean useSplit(String prefix) {
        Map<Split, File> splitMap = Generics.newHashMap();
        splitMap.put(Split.Train, new File(prefix + ".train"));
        splitMap.put(Split.Test, new File(prefix + ".test"));
        splitMap.put(Split.Dev, new File(prefix + ".dev"));

        Map<Split, Set<String>> loaded = Generics.newHashMap();
        for (Map.Entry<Split, File> entry : splitMap.entrySet()) {
            File listFile = entry.getValue();
            if (!listFile.exists()) {
                // Bail out before touching any instance state.
                return false;
            }
            Set<String> files = Generics.newHashSet();
            for (String fileName : IOUtils.readLines(listFile)) {
                files.add(fileName);
            }
            loaded.put(entry.getKey(), files);
        }

        this.splitFileLists = loaded;
        this.useSplit = true;
        return true;
    }

    /**
     * Iterates over every tree in {@code tb}, accumulates its statistics and finalizes the
     * derived values. When {@code makeVocab} is set, the shared {@code trainVocab} is replaced
     * with a fresh set before reading, and the words of this treebank are added to it.
     *
     * <p>Progress is written to standard output: a dot whenever the running tree count is a
     * multiple of 100, otherwise a line break when it is a multiple of 8001.
     *
     * @param tb   treebank to read
     * @param name label for the resulting statistics object
     * @return the populated statistics, after {@code computeFinalValues()} has been called
     */
    private ObservedCorpusStats gatherStats(DiskTreebank tb, String name) {
        final ObservedCorpusStats corpusStats = new ObservedCorpusStats(name);
        if (this.makeVocab) {
            trainVocab = Generics.newHashSet();
        }

        System.out.println("Reading treebank:");
        for (Tree tree : tb) {
            Pair<Integer, Integer> depthAndBreadth = TreebankStats.dissectTree(tree, corpusStats, this.makeVocab);
            int yieldLength = tree.yield().size();
            corpusStats.addStatsForTree(yieldLength, depthAndBreadth.first(), depthAndBreadth.second());

            if (corpusStats.numTrees % 100 == 0) {
                System.out.print(".");
            } else if (corpusStats.numTrees % 8001 == 0) {
                System.out.println();
            }
        }

        corpusStats.computeFinalValues();
        System.out.println("done!");
        return corpusStats;
    }

    /**
     * Walks {@code t} depth-first with an explicit stack, recording node-level counts in
     * {@code ocs} and returning the tree's depth and breadth measure.
     *
     * <ul>
     *   <li>Phrasal nodes are reported through {@code ocs.addPhrasalBranch} with their label
     *       and number of children.</li>
     *   <li>Preterminal labels are counted in {@code ocs.posTags}.</li>
     *   <li>Leaf values are counted in {@code ocs.words} and, when {@code addToVocab} is set,
     *       added to the shared {@code trainVocab}.</li>
     * </ul>
     *
     * <p>Depth is the largest root distance of any node (the root is at depth 0). The previous
     * implementation overwrote the depth each time it changed, so it returned the depth of the
     * last node visited rather than the maximum.
     *
     * <p>Breadth is the largest value of (pending stack entries + 1) seen when a phrasal node
     * is popped at a depth different from the previously popped node. This is unchanged from
     * the original computation.
     *
     * @param t          tree to analyze; must not be {@code null}
     * @param ocs        statistics object receiving the per-node counts
     * @param addToVocab whether leaf values are added to {@code trainVocab}
     * @return pair of (depth, breadth)
     * @throws IllegalArgumentException if {@code t} is {@code null}
     */
    private static Pair<Integer, Integer> dissectTree(Tree t, ObservedCorpusStats ocs, boolean addToVocab) {
        if (t == null) {
            throw new IllegalArgumentException("Null tree passed to dissectTree()");
        }

        Stack<Pair<Integer, Tree>> stack = new Stack<Pair<Integer, Tree>>();
        stack.push(new Pair<Integer, Tree>(0, t));

        int maxBreadth = 0;
        int maxDepth = -1;
        // Depth of the node popped just before the current one; drives the breadth check only.
        int previousDepth = -1;

        while (!stack.isEmpty()) {
            Pair<Integer, Tree> depthNode = stack.pop();
            int nodeDepth = depthNode.first();
            Tree node = depthNode.second();

            if (nodeDepth != previousDepth) {
                previousDepth = nodeDepth;
                if (node.isPhrasal() && stack.size() + 1 > maxBreadth) {
                    maxBreadth = stack.size() + 1;
                }
            }
            maxDepth = Math.max(maxDepth, nodeDepth);

            if (node.isPhrasal()) {
                ocs.addPhrasalBranch(node.value(), node.children().length);
            } else if (node.isPreTerminal()) {
                ocs.posTags.incrementCount(node.value());
            } else if (node.isLeaf()) {
                ocs.words.incrementCount(node.value());
                if (addToVocab) {
                    trainVocab.add(node.value());
                }
            }

            for (Tree kid : node.children()) {
                stack.push(new Pair<Integer, Tree>(nodeDepth + 1, kid));
            }
        }
        return new Pair<Integer, Integer>(maxDepth, maxBreadth);
    }

    /**
     * Prints a banner containing the corpus name to standard output, followed by the full
     * report produced by {@code corpStats.display}.
     *
     * @param corpStats    statistics to print
     * @param displayWords passed through to {@code ObservedCorpusStats.display}
     * @param displayOOV   passed through to {@code ObservedCorpusStats.display}
     */
    private static void display(ObservedCorpusStats corpStats, boolean displayWords, boolean displayOOV) {
        final String rule = "####################################################################";
        System.out.println(rule);
        System.out.println("## " + corpStats.getName());
        System.out.println(rule);
        System.out.println();
        corpStats.display(displayWords, displayOOV);
    }

    /**
     * Combines several per-corpus statistics objects into one named {@code "CORPUS"}.
     *
     * <p>Tree counts, running sums, per-tree value lists and all counters are merged, and the
     * min/max bounds are widened to cover every input. {@code computeFinalValues()} is then
     * called on the combined object.
     *
     * @param allStats statistics to combine
     * @return {@code null} for an empty list, the sole element for a one-element list, and a
     *         newly built aggregate otherwise
     */
    private static ObservedCorpusStats aggregateStats(List<ObservedCorpusStats> allStats) {
        if (allStats.isEmpty()) {
            return null;
        }
        if (allStats.size() == 1) {
            return allStats.get(0);
        }

        final ObservedCorpusStats total = new ObservedCorpusStats("CORPUS");
        for (ObservedCorpusStats part : allStats) {
            // Running sums and the raw per-tree values.
            total.numTrees += part.numTrees;
            total.breadth2 += part.breadth2;
            total.breadths.addAll(part.breadths);
            total.depth2 += part.depth2;
            total.depths.addAll(part.depths);
            total.length2 += part.length2;
            total.lengths.addAll(part.lengths);

            // Widen the observed ranges.
            total.minLength = Math.min(total.minLength, part.minLength);
            total.maxLength = Math.max(total.maxLength, part.maxLength);
            total.minBreadth = Math.min(total.minBreadth, part.minBreadth);
            total.maxBreadth = Math.max(total.maxBreadth, part.maxBreadth);
            total.minDepth = Math.min(total.minDepth, part.minDepth);
            total.maxDepth = Math.max(total.maxDepth, part.maxDepth);

            // Merge the counters.
            total.words.addAll(part.words);
            total.posTags.addAll(part.posTags);
            total.phrasalBranching2.addAll(part.phrasalBranching2);
            total.phrasalBranchingNum2.addAll(part.phrasalBranchingNum2);
        }
        total.computeFinalValues();
        return total;
    }

    /**
     * Gathers statistics for the configured paths and prints them to standard output.
     *
     * <p>Three modes are supported:
     * <ul>
     *   <li><b>Split mode</b> (after a successful {@code useSplit}): one treebank is loaded per
     *       split using a {@code SplitFilter}. The aggregate report is printed first, followed
     *       by one report per split.</li>
     *   <li><b>File mode</b> ({@code pathsAreFiles}): each path is loaded and reported on its
     *       own; the first path builds the vocabulary.</li>
     *   <li><b>Default</b>: every non-directory file under all paths is loaded into a single
     *       treebank and {@code trainVocab} is reset to an empty set.</li>
     * </ul>
     *
     * <p>In split mode the splits are visited in the declaration order of {@code Split}
     * (Train, Dev, Test), and the vocabulary is built from the Train split. Iterating the
     * backing hash map instead gives no ordering guarantee, so the vocabulary could be built
     * from Dev or Test and make the OOV figures wrong.
     *
     * @param pathsAreFiles treat each path as an individual file, the first being training data
     * @param displayWords  include the word distribution in each report
     * @param displayOOV    include the OOV word types in each report
     */
    public void run(boolean pathsAreFiles, boolean displayWords, boolean displayOOV) {
        if (this.useSplit) {
            List<ObservedCorpusStats> allSplitStats = new ArrayList<ObservedCorpusStats>();
            // useSplit() only enables this mode after loading a list for every Split constant.
            for (Split split : Split.values()) {
                this.makeVocab = split == Split.Train;
                DiskTreebank tb = this.tlpp.diskTreebank();
                FileFilter splitFilter = new SplitFilter(this.splitFileLists.get(split));
                for (String path : this.pathNames) {
                    tb.loadPath(path, splitFilter);
                }
                String statsName = this.languageName.toString() + "." + split.toString();
                allSplitStats.add(this.gatherStats(tb, statsName));
            }
            this.makeVocab = false;

            TreebankStats.display(TreebankStats.aggregateStats(allSplitStats), displayWords, displayOOV);
            for (ObservedCorpusStats ocs : allSplitStats) {
                TreebankStats.display(ocs, displayWords, displayOOV);
            }
        } else if (pathsAreFiles) {
            // Only the first file (the training set) contributes to the vocabulary.
            this.makeVocab = true;
            for (String path : this.pathNames) {
                DiskTreebank tb = this.tlpp.diskTreebank();
                tb.loadPath(path, pathname -> true);
                ObservedCorpusStats stats = this.gatherStats(tb, this.languageName.toString() + "  " + path);
                TreebankStats.display(stats, displayWords, displayOOV);
                this.makeVocab = false;
            }
        } else {
            trainVocab = Generics.newHashSet();
            DiskTreebank tb = this.tlpp.diskTreebank();
            for (String path : this.pathNames) {
                tb.loadPath(path, pathname -> !pathname.isDirectory());
            }
            ObservedCorpusStats allStats = this.gatherStats(tb, this.languageName.toString());
            TreebankStats.display(allStats, displayWords, displayOOV);
        }
    }

    /**
     * Builds the command-line help text: a usage line naming this class, followed by the list
     * of languages and the supported options, each terminated by the platform line separator.
     *
     * @return the usage message
     */
    private static String usage() {
        final String eol = System.getProperty("line.separator");
        final String header = String.format("Usage: java %s [OPTS] LANG paths%n%n", TreebankStats.class.getName());
        return header
            + "Options:" + eol
            + " LANG is one of " + Language.langList + eol
            + "  -s prefix : Use a split (extensions must be dev/test/train)" + eol
            + "  -w        : Show word distribution" + eol
            + "  -f        : Path list is a set of files, and the first file is the training set" + eol
            + "  -o        : Print OOV words." + eol;
    }

    /**
     * Describes the supported option flags for {@code StringUtils.argsToProperties}: each key
     * is a flag name and each value is the number of arguments that flag consumes.
     *
     * @return map with {@code s} taking one argument and {@code w}, {@code f}, {@code o} taking none
     */
    private static Map<String, Integer> optArgDefs() {
        final Map<String, Integer> arity = Generics.newHashMap(4);
        arity.put("s", 1);
        for (String flag : new String[] {"w", "f", "o"}) {
            arity.put(flag, 0);
        }
        return arity;
    }

    /**
     * Command-line entry point. Parses the option flags, then expects a language name followed
     * by one or more treebank paths.
     *
     * <p>The positional-argument check accepts any number of paths after the language. The
     * earlier exact-count check rejected more than one path, contradicting both the usage text
     * and the {@code -f} option, which describe a list of paths.
     *
     * <p>Prints the usage text and exits with status -1 when too few arguments are given or
     * when the language name is not a known {@code Language} constant. If {@code -s} is given
     * but the split cannot be loaded, a message is logged and the run continues without it.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        if (args.length < MIN_ARGS) {
            log.info(TreebankStats.usage());
            System.exit(-1);
        }

        Properties options = StringUtils.argsToProperties(args, TreebankStats.optArgDefs());
        String splitPrefix = options.getProperty("s", null);
        boolean showWords = PropertiesUtils.getBool(options, "w", false);
        boolean pathsAreFiles = PropertiesUtils.getBool(options, "f", false);
        boolean showOov = PropertiesUtils.getBool(options, "o", false);

        // Positional arguments are stored under the empty key, separated by whitespace.
        String[] parsedArgs = options.getProperty("", "").split("\\s+");
        if (parsedArgs.length < MIN_ARGS) {
            log.info(TreebankStats.usage());
            System.exit(-1);
        }

        Language language;
        try {
            language = Language.valueOf(parsedArgs[0]);
        } catch (IllegalArgumentException e) {
            log.info("Unrecognized language: " + parsedArgs[0]);
            log.info(TreebankStats.usage());
            System.exit(-1);
            return; // unreachable; satisfies definite assignment of 'language'
        }

        List<String> corpusPaths = new ArrayList<String>(parsedArgs.length - 1);
        for (int i = 1; i < parsedArgs.length; ++i) {
            corpusPaths.add(parsedArgs[i]);
        }

        TreebankLangParserParams tlpp = language.params;
        TreebankStats cs = new TreebankStats(language, corpusPaths, tlpp);
        if (splitPrefix != null && !cs.useSplit(splitPrefix)) {
            log.info("Could not load split!");
        }
        cs.run(pathsAreFiles, showWords, showOov);
    }

    /**
     * Accumulates per-tree and per-node statistics for one corpus and derives summary values
     * (means, standard deviations, branching factors, OOV rate) from them.
     *
     * <p>Typical use: call {@link #addStatsForTree} and {@link #addPhrasalBranch} while reading
     * trees, increment {@link #words} and {@link #posTags} directly, then call
     * {@link #computeFinalValues()} once before {@link #display}.
     */
    protected static class ObservedCorpusStats {
        private final String corpusName;

        /** Count of each leaf value. */
        public final Counter<String> words;
        /** Count of each preterminal label. */
        public final Counter<String> posTags;
        /** Per phrasal label: sum of the number of children over all nodes with that label. */
        private final Counter<String> phrasalBranching2;
        /** Per phrasal label: number of nodes with that label. */
        private final Counter<String> phrasalBranchingNum2;

        public int numTrees = 0;

        // Running sums of the per-tree values; divided by numTrees to obtain the means.
        private double depth2 = 0.0;
        private double breadth2 = 0.0;
        private double length2 = 0.0;

        // Raw per-tree values, kept for the standard deviations and length percentages.
        private final List<Integer> lengths;
        private final List<Integer> breadths;
        private final List<Integer> depths;

        // Derived values, filled in by computeFinalValues().
        private Counter<String> meanBranchingByLabel;
        private double meanDepth = 0.0;
        private double stddevDepth = 0.0;
        private double meanBranchingFactor = 0.0;
        private double meanConstituents = 0.0;
        private double meanLength = 0.0;
        private double stddevLength = 0.0;
        private double meanBreadth = 0.0;
        private double stddevBreadth = 0.0;
        private double OOVRate = 0.0;
        private Set<String> oovWords;

        // Observed ranges; the sentinels mean "no tree recorded yet".
        public int minLength = Integer.MAX_VALUE;
        public int maxLength = Integer.MIN_VALUE;
        public int minDepth = Integer.MAX_VALUE;
        public int maxDepth = Integer.MIN_VALUE;
        public int minBreadth = Integer.MAX_VALUE;
        public int maxBreadth = Integer.MIN_VALUE;

        /**
         * Creates an empty statistics object.
         *
         * @param name corpus name shown in the report
         */
        public ObservedCorpusStats(String name) {
            this.corpusName = name;
            this.words = new ClassicCounter<String>();
            this.posTags = new ClassicCounter<String>();
            this.phrasalBranching2 = new ClassicCounter<String>();
            this.phrasalBranchingNum2 = new ClassicCounter<String>();
            this.lengths = new ArrayList<Integer>();
            this.depths = new ArrayList<Integer>();
            this.breadths = new ArrayList<Integer>();
            // Start with empty derived collections so display() is safe before
            // computeFinalValues() has run.
            this.meanBranchingByLabel = new ClassicCounter<String>();
            this.oovWords = Generics.newHashSet();
        }

        /** @return the corpus name given at construction */
        public String getName() {
            return this.corpusName;
        }

        /**
         * Records the measurements of one tree.
         *
         * <p>Minimum and maximum are updated independently. The earlier {@code else if} form
         * skipped the maximum whenever a value set a new minimum, so the first tree (and any
         * run of decreasing values) never reached the maximum.
         *
         * @param yieldLength number of leaves in the tree
         * @param depth       depth of the tree
         * @param breadth     breadth measure of the tree
         */
        public void addStatsForTree(int yieldLength, int depth, int breadth) {
            ++this.numTrees;

            this.breadths.add(breadth);
            this.breadth2 += (double) breadth;
            this.lengths.add(yieldLength);
            this.length2 += (double) yieldLength;
            this.depths.add(depth);
            this.depth2 += (double) depth;

            this.minDepth = Math.min(this.minDepth, depth);
            this.maxDepth = Math.max(this.maxDepth, depth);
            this.minLength = Math.min(this.minLength, yieldLength);
            this.maxLength = Math.max(this.maxLength, yieldLength);
            this.minBreadth = Math.min(this.minBreadth, breadth);
            this.maxBreadth = Math.max(this.maxBreadth, breadth);
        }

        /**
         * Returns the fraction of recorded trees whose yield length is at most {@code maxLen}.
         * Despite the method name, the bound is inclusive.
         *
         * @param maxLen inclusive upper bound on the yield length
         * @return a value in [0, 1]; 0 when no trees have been recorded
         */
        public double getPercLensLessThan(int maxLen) {
            if (this.lengths.isEmpty()) {
                return 0.0;
            }
            int withinBound = 0;
            for (int len : this.lengths) {
                if (len <= maxLen) {
                    ++withinBound;
                }
            }
            return (double) withinBound / (double) this.lengths.size();
        }

        /**
         * Records one phrasal node.
         *
         * @param label  the node's label
         * @param factor the node's number of children
         */
        public void addPhrasalBranch(String label, int factor) {
            this.phrasalBranching2.incrementCount(label, factor);
            this.phrasalBranchingNum2.incrementCount(label);
        }

        /**
         * Prints the full report to standard output. Derived values are only meaningful after
         * {@link #computeFinalValues()} has been called.
         *
         * @param displayWords also print the count of every word
         * @param displayOOV   also print every OOV word type
         */
        public void display(boolean displayWords, boolean displayOOV) {
            DecimalFormat nf = new DecimalFormat("0.00");
            final String rule = "======================================================";

            System.out.println(rule);
            System.out.println(">>> " + this.corpusName);
            System.out.println(" trees:\t\t" + this.numTrees);
            System.out.println(" words:\t\t" + this.words.keySet().size());
            System.out.println(" tokens:\t" + (int) this.words.totalCount());
            System.out.println(" tags:\t\t" + this.posTags.size());
            System.out.println(" phrasal types:\t" + this.phrasalBranchingNum2.keySet().size());
            System.out.println(" phrasal nodes:\t" + (int) this.phrasalBranchingNum2.totalCount());
            System.out.println(" OOV rate:\t" + nf.format(this.OOVRate * 100.0) + "%");
            System.out.println(rule);

            System.out.println(">>> Per tree means");
            System.out.printf(" depth:\t\t%s\t{min:%d\tmax:%d}\t\ts: %s\n", nf.format(this.meanDepth), this.minDepth, this.maxDepth, nf.format(this.stddevDepth));
            System.out.printf(" breadth:\t%s\t{min:%d\tmax:%d}\ts: %s\n", nf.format(this.meanBreadth), this.minBreadth, this.maxBreadth, nf.format(this.stddevBreadth));
            System.out.printf(" length:\t%s\t{min:%d\tmax:%d}\ts: %s\n", nf.format(this.meanLength), this.minLength, this.maxLength, nf.format(this.stddevLength));
            System.out.println(" branching:\t" + nf.format(this.meanBranchingFactor));
            System.out.println(" constituents:\t" + nf.format(this.meanConstituents));
            System.out.println(rule);

            System.out.println(">>> Branching factor means by phrasal tag:");
            List<String> sortedKeys = new ArrayList<String>(this.meanBranchingByLabel.keySet());
            Collections.sort(sortedKeys, Counters.toComparator(this.phrasalBranchingNum2, false, true));
            for (String label : sortedKeys) {
                System.out.printf(" %s:\t\t%s  /  %d instances\n", label, nf.format(this.meanBranchingByLabel.getCount(label)), (int) this.phrasalBranchingNum2.getCount(label));
            }
            System.out.println(rule);

            System.out.println(">>> Phrasal tag counts");
            sortedKeys = new ArrayList<String>(this.phrasalBranchingNum2.keySet());
            Collections.sort(sortedKeys, Counters.toComparator(this.phrasalBranchingNum2, false, true));
            for (String label : sortedKeys) {
                System.out.println(" " + label + ":\t\t" + (int) this.phrasalBranchingNum2.getCount(label));
            }
            System.out.println(rule);

            System.out.println(">>> POS tag counts");
            sortedKeys = new ArrayList<String>(this.posTags.keySet());
            Collections.sort(sortedKeys, Counters.toComparator(this.posTags, false, true));
            for (String posTag : sortedKeys) {
                System.out.println(" " + posTag + ":\t\t" + (int) this.posTags.getCount(posTag));
            }
            System.out.println(rule);

            if (displayWords) {
                System.out.println(">>> Word counts");
                sortedKeys = new ArrayList<String>(this.words.keySet());
                Collections.sort(sortedKeys, Counters.toComparator(this.words, false, true));
                for (String word : sortedKeys) {
                    System.out.println(" " + word + ":\t\t" + (int) this.words.getCount(word));
                }
                System.out.println(rule);
            }

            if (displayOOV) {
                System.out.println(">>> OOV word types");
                for (String word : this.oovWords) {
                    System.out.println(" " + word);
                }
                System.out.println(rule);
            }
        }

        /**
         * Derives the means, standard deviations, per-label branching means and OOV figures
         * from the accumulated data.
         *
         * <p>Every derived value is recomputed from scratch, so calling this more than once
         * gives the same result. Ratios with a zero denominator (for example, an empty corpus)
         * yield 0 rather than NaN. If no training vocabulary has been set, every word type
         * counts as OOV.
         */
        public void computeFinalValues() {
            final double denom = this.numTrees;

            this.meanDepth = ratio(this.depth2, denom);
            this.meanLength = ratio(this.length2, denom);
            this.meanBreadth = ratio(this.breadth2, denom);
            this.meanConstituents = ratio(this.phrasalBranchingNum2.totalCount(), denom);
            this.meanBranchingFactor = ratio(this.phrasalBranching2.totalCount(), this.phrasalBranchingNum2.totalCount());

            this.stddevDepth = stdDev(this.depths, this.meanDepth, denom);
            this.stddevLength = stdDev(this.lengths, this.meanLength, denom);
            this.stddevBreadth = stdDev(this.breadths, this.meanBreadth, denom);

            Counter<String> branchingMeans = new ClassicCounter<String>();
            for (String label : this.phrasalBranching2.keySet()) {
                double mean = ratio(this.phrasalBranching2.getCount(label), this.phrasalBranchingNum2.getCount(label));
                branchingMeans.incrementCount(label, mean);
            }
            this.meanBranchingByLabel = branchingMeans;

            Set<String> unseen = Generics.newHashSet(this.words.keySet());
            if (trainVocab != null) {
                unseen.removeAll(trainVocab);
            }
            this.oovWords = unseen;
            this.OOVRate = ratio(unseen.size(), this.words.keySet().size());
        }

        /** Returns {@code numerator / denominator}, or 0 when the denominator is 0. */
        private static double ratio(double numerator, double denominator) {
            return denominator == 0.0 ? 0.0 : numerator / denominator;
        }

        /**
         * Returns the square root of the summed squared deviations from {@code mean} divided
         * by {@code count}, or 0 when {@code count} is 0.
         */
        private static double stdDev(List<Integer> values, double mean, double count) {
            double squaredDeviations = 0.0;
            for (int value : values) {
                squaredDeviations += Math.pow((double) value - mean, 2.0);
            }
            return Math.sqrt(ratio(squaredDeviations, count));
        }
    }

    /**
     * File filter that accepts a file when its simple name (as returned by
     * {@code File.getName()}) is contained in the set given at construction. The set is held by
     * reference, not copied.
     */
    protected static class SplitFilter
    implements FileFilter {
        private final Set<String> acceptedNames;

        /**
         * @param fileList simple file names that this filter should accept
         */
        public SplitFilter(Set<String> fileList) {
            this.acceptedNames = fileList;
        }

        /**
         * @param f candidate file
         * @return {@code true} if the file's simple name is in the accepted set
         */
        @Override
        public boolean accept(File f) {
            final String simpleName = f.getName();
            return this.acceptedNames.contains(simpleName);
        }
    }

    private static enum Split {
        Train,
        Dev,
        Test;

    }
}

