/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.classify.tui;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.CharSequenceRemoveHTML;
import cc.mallet.pipe.CharSubsequence;
import cc.mallet.pipe.FeatureSequence2AugmentableFeatureVector;
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.NGramPreprocessor;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintInputAndTarget;
import cc.mallet.pipe.SaveDataInSource;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequence2FeatureSequenceWithBigrams;
import cc.mallet.pipe.TokenSequenceNGrams;
import cc.mallet.pipe.TokenSequenceRemoveNonAlpha;
import cc.mallet.pipe.TokenSequenceRemoveStopPatterns;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.Strings;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public class Text2Vectors {
    private static Logger logger = MalletLogger.getLogger(Text2Vectors.class.getName());
    public static String defaultTokenRegex = "\\p{L}[\\p{L}\\p{P}]+\\p{L}";
    static CommandOption.SpacedStrings classDirs = new CommandOption.SpacedStrings(Text2Vectors.class, "input", "DIR...", true, null, "The directories containing text files to be classified, one directory per class", null);
    static CommandOption.File outputFile = new CommandOption.File(Text2Vectors.class, "output", "FILE", true, new File("text.vectors"), "Write the instance list to this file; Using - indicates stdout.", null);
    static CommandOption.File usePipeFromVectorsFile = new CommandOption.File(Text2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"), "Use the pipe and alphabets from a previously created vectors file. Allows the creation, for example, of a test set of vectors that are compatible with a previously created set of training vectors", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(Text2Vectors.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.SpacedStrings replacementFiles = new CommandOption.SpacedStrings(Text2Vectors.class, "replacement-files", "FILE [FILE ...]", true, null, "files containing string replacements, one per line:\n    'A B [tab] C' replaces A B with C,\n    'A B' replaces A B with A_B", null);
    static CommandOption.SpacedStrings deletionFiles = new CommandOption.SpacedStrings(Text2Vectors.class, "deletion-files", "FILE [FILE ...]", true, null, "files containing strings to delete after replacements but before tokenization (ie multiword stop terms)", null);
    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(Text2Vectors.class, "remove-stopwords", "[TRUE|FALSE]", false, false, "If true, remove a default list of common English \"stop words\" from the text.", null);
    static CommandOption.File stoplistFile = new CommandOption.File(Text2Vectors.class, "stoplist-file", "FILE", true, null, "Instead of the default list, read stop words from a file, one per line. Implies --remove-stopwords", null);
    static CommandOption.File extraStopwordsFile = new CommandOption.File(Text2Vectors.class, "extra-stopwords", "FILE", true, null, "Read whitespace-separated words from this file, and add them to either\n   the default English stoplist or the list specified by --stoplist-file.", null);
    static CommandOption.File stopPatternFile = new CommandOption.File(Text2Vectors.class, "stop-pattern-file", "FILE", true, null, "Read regular expressions from a file, one per line. Tokens matching these regexps will be removed.", null);
    static CommandOption.Boolean skipHeader = new CommandOption.Boolean(Text2Vectors.class, "skip-header", "[TRUE|FALSE]", false, false, "If true, in each document, remove text occurring before a blank line.  This is useful for removing email or UseNet headers", null);
    static CommandOption.Boolean skipHtml = new CommandOption.Boolean(Text2Vectors.class, "skip-html", "[TRUE|FALSE]", false, false, "If true, remove text occurring inside <...>, as in HTML or SGML.", null);
    static CommandOption.Boolean binaryFeatures = new CommandOption.Boolean(Text2Vectors.class, "binary-features", "[TRUE|FALSE]", false, false, "If true, features will be binary.", null);
    static CommandOption.IntegerArray gramSizes = new CommandOption.IntegerArray(Text2Vectors.class, "gram-sizes", "INTEGER,[INTEGER,...]", true, new int[]{1}, "Include among the features all n-grams of sizes specified.  For example, to get all unigrams and bigrams, use --gram-sizes 1,2.  This option occurs after the removal of stop words, if removed.", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(Text2Vectors.class, "keep-sequence", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.Boolean keepSequenceBigrams = new CommandOption.Boolean(Text2Vectors.class, "keep-sequence-bigrams", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequenceWithBigrams rather than a FeatureVector.", null);
    static CommandOption.Boolean saveTextInSource = new CommandOption.Boolean(Text2Vectors.class, "save-text-in-source", "[TRUE|FALSE]", false, false, "If true, save original text of document in source.", null);
    static CommandOption.ObjectFromBean stringPipe = new CommandOption.ObjectFromBean(Text2Vectors.class, "string-pipe", "Pipe constructor", true, null, "Java code for the constructor of a Pipe to be run as soon as input becomes a CharSequence", null);
    static CommandOption.ObjectFromBean tokenPipe = new CommandOption.ObjectFromBean(Text2Vectors.class, "token-pipe", "Pipe constructor", true, null, "Java code for the constructor of a Pipe to be run as soon as input becomes a TokenSequence", null);
    static CommandOption.ObjectFromBean featureVectorPipe = new CommandOption.ObjectFromBean(Text2Vectors.class, "fv-pipe", "Pipe constructor", true, null, "Java code for the constructor of a Pipe to be run as soon as input becomes a FeatureVector", null);
    static CommandOption.String encoding = new CommandOption.String(Text2Vectors.class, "encoding", "STRING", true, Charset.defaultCharset().displayName(), "Character encoding for input file", null);
    static CommandOption.String tokenRegex = new CommandOption.String(Text2Vectors.class, "token-regex", "REGEX", true, defaultTokenRegex, "Regular expression used for tokenization.\n   Example: \"[\\p{L}\\p{N}_]+|[\\p{P}]+\" (unicode letters, numbers and underscore OR all punctuation) ", null);
    static CommandOption.Boolean printOutput = new CommandOption.Boolean(Text2Vectors.class, "print-output", "[TRUE|FALSE]", false, false, "If true, print a representation of the processed data\n   to standard output. This option is intended for debugging.", null);

    public static void main(String[] args) throws FileNotFoundException, IOException {
        Pipe instancePipe;
        CommandOption.setSummary(Text2Vectors.class, "A tool for creating instance lists of FeatureVectors or FeatureSequences from text documents.\n");
        CommandOption.process(Text2Vectors.class, args);
        if (args.length == 0) {
            CommandOption.getList(Text2Vectors.class).printUsage(false);
            System.exit(-1);
        }
        if (Text2Vectors.classDirs.value.length == 0) {
            throw new IllegalArgumentException("You must include --input DIR1 DIR2 ...' in order to specify a list of directories containing the documents for each class.");
        }
        int commonPrefixIndex = Strings.commonPrefixIndex(Text2Vectors.classDirs.value);
        logger.info("Labels = ");
        File[] directories = new File[Text2Vectors.classDirs.value.length];
        for (int i = 0; i < Text2Vectors.classDirs.value.length; ++i) {
            directories[i] = new File(Text2Vectors.classDirs.value[i]);
            if (commonPrefixIndex < Text2Vectors.classDirs.value.length) {
                logger.info("   " + Text2Vectors.classDirs.value[i].substring(commonPrefixIndex));
                continue;
            }
            logger.info("   " + Text2Vectors.classDirs.value[i]);
        }
        InstanceList previousInstanceList = null;
        if (usePipeFromVectorsFile.wasInvoked()) {
            previousInstanceList = InstanceList.load(Text2Vectors.usePipeFromVectorsFile.value);
            instancePipe = previousInstanceList.getPipe();
        } else {
            TokenSequenceRemoveStopwords stopwordFilter;
            ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
            pipeList.add(new Target2Label());
            pipeList.add(new SaveDataInSource());
            pipeList.add(new Input2CharSequence(Text2Vectors.encoding.value));
            if (saveTextInSource.wasInvoked()) {
                pipeList.add(new SaveDataInSource());
            }
            if (stringPipe.wasInvoked()) {
                pipeList.add((Pipe)Text2Vectors.stringPipe.value);
            }
            if (Text2Vectors.skipHeader.value) {
                pipeList.add(new CharSubsequence(CharSubsequence.SKIP_HEADER));
            }
            if (Text2Vectors.skipHtml.value) {
                pipeList.add(new CharSequenceRemoveHTML());
            }
            if (!preserveCase.value()) {
                pipeList.add(new CharSequenceLowercase());
            }
            if (Text2Vectors.replacementFiles.value != null || Text2Vectors.deletionFiles.value != null) {
                NGramPreprocessor preprocessor = new NGramPreprocessor();
                if (Text2Vectors.replacementFiles.value != null) {
                    for (String filename : Text2Vectors.replacementFiles.value) {
                        preprocessor.loadReplacements(filename);
                    }
                }
                if (Text2Vectors.deletionFiles.value != null) {
                    for (String filename : Text2Vectors.deletionFiles.value) {
                        preprocessor.loadDeletions(filename);
                    }
                }
                pipeList.add(preprocessor);
            }
            Pattern tokenPattern = null;
            if (Text2Vectors.keepSequenceBigrams.value) {
                tokenPattern = CharSequenceLexer.LEX_NONWHITESPACE_CLASSES;
            } else {
                try {
                    tokenPattern = Pattern.compile(Text2Vectors.tokenRegex.value);
                }
                catch (PatternSyntaxException pse) {
                    throw new IllegalArgumentException("The token regular expression (" + Text2Vectors.tokenRegex.value + ") was invalid: " + pse.getMessage());
                }
            }
            pipeList.add(new CharSequence2TokenSequence(tokenPattern));
            if (tokenPipe.wasInvoked()) {
                pipeList.add((Pipe)Text2Vectors.tokenPipe.value);
            }
            if (Text2Vectors.keepSequenceBigrams.value) {
                pipeList.add(new TokenSequenceRemoveNonAlpha(true));
            }
            if (stoplistFile.wasInvoked()) {
                stopwordFilter = new TokenSequenceRemoveStopwords(Text2Vectors.stoplistFile.value, Text2Vectors.encoding.value, false, false, Text2Vectors.keepSequenceBigrams.value);
                if (extraStopwordsFile.wasInvoked()) {
                    stopwordFilter.addStopWords(Text2Vectors.extraStopwordsFile.value);
                }
                pipeList.add(stopwordFilter);
            } else if (Text2Vectors.removeStopWords.value) {
                stopwordFilter = new TokenSequenceRemoveStopwords(false, Text2Vectors.keepSequenceBigrams.value);
                if (extraStopwordsFile.wasInvoked()) {
                    stopwordFilter.addStopWords(Text2Vectors.extraStopwordsFile.value);
                }
                pipeList.add(stopwordFilter);
            }
            if (stopPatternFile.wasInvoked()) {
                TokenSequenceRemoveStopPatterns stopPatternFilter = new TokenSequenceRemoveStopPatterns(Text2Vectors.stopPatternFile.value);
                pipeList.add(stopPatternFilter);
            }
            if (Text2Vectors.gramSizes.value.length != 1 || Text2Vectors.gramSizes.value[0] != 1) {
                pipeList.add(new TokenSequenceNGrams(Text2Vectors.gramSizes.value));
            }
            if (Text2Vectors.keepSequenceBigrams.value) {
                pipeList.add(new TokenSequence2FeatureSequenceWithBigrams());
            } else {
                pipeList.add(new TokenSequence2FeatureSequence());
            }
            if (!Text2Vectors.keepSequence.value && !Text2Vectors.keepSequenceBigrams.value) {
                pipeList.add(new FeatureSequence2AugmentableFeatureVector(Text2Vectors.binaryFeatures.value));
            }
            if (featureVectorPipe.wasInvoked()) {
                pipeList.add((Pipe)Text2Vectors.featureVectorPipe.value);
            }
            if (Text2Vectors.printOutput.value) {
                pipeList.add(new PrintInputAndTarget());
            }
            instancePipe = new SerialPipes(pipeList);
        }
        InstanceList instances = new InstanceList(instancePipe);
        boolean removeCommonPrefix = true;
        instances.addThruPipe(new FileIterator(directories, FileIterator.STARTING_DIRECTORIES, removeCommonPrefix));
        ObjectOutputStream oos = Text2Vectors.outputFile.value.toString().equals("-") ? new ObjectOutputStream(System.out) : new ObjectOutputStream(new FileOutputStream(Text2Vectors.outputFile.value));
        oos.writeObject(instances);
        oos.close();
        if (usePipeFromVectorsFile.wasInvoked()) {
            logger.info(" rewriting previous instance list, with ID = " + previousInstanceList.getPipe().getInstanceId());
            oos = new ObjectOutputStream(new FileOutputStream(Text2Vectors.usePipeFromVectorsFile.value));
            oos.writeObject(previousInstanceList);
            oos.close();
        }
    }
}

