/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.share.weili.ner.enron;

import cc.mallet.pipe.Pipe;
import cc.mallet.share.weili.ner.WordTransformation;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.HashSet;
import java.util.StringTokenizer;

/**
 * Pipe that turns an Enron e-mail file into a labeled token sequence.
 *
 * <p>The incoming {@link Instance} must carry a {@link File} as its data. The file is read in two
 * phases:
 * <ol>
 *   <li>the leading header section (everything up to the first empty line), from which person
 *       names are harvested out of {@code first.last@enron.com} style addresses, and</li>
 *   <li>the body, from which lines containing one of the {@link #skip} markers and sections
 *       introduced by one of the {@link #skipToBlankLine} markers are dropped.</li>
 * </ol>
 * The remaining text is split on {@code <}/{@code >} to find {@code ENAMEX}, {@code TIMEX} and
 * {@code NUMEX} annotation tags, and every word between them is emitted as a {@link Token} with a
 * {@code B-}/{@code I-}/{@code O} label.
 *
 * <p>Harvested header names are accumulated in {@link #headerPersonNames} across all instances
 * piped through the same object.
 */
public class EnronMessage2TokenSequence
extends Pipe
implements Serializable {
    /** When true, the words emitted for an instance are also stored (one per line) as its source. */
    boolean saveSource = false;
    /** Markers (lower case) whose line is dropped from the body when preceded only by quoting characters. */
    public static String[] skip = new String[]{"=_part_", "sent by:"};
    /** Markers (lower case) that cause the body to be dropped up to the next empty line. */
    public static String[] skipToBlankLine = new String[]{"subject:", "original message", "content-type:", "content-transfer-encoding:", "forwarded by", "from:", "sent:", "to:", "bcc:", "cc:"};
    /** Entity types that are turned into {@code B-}/{@code I-} labels; other tag types leave the label unchanged. */
    public static String[] labels = new String[]{"DATE", "TIME", "LOCATION", "PERSON", "ORGANIZATION", "ACRONYM", "PHONE", "MONEY", "PERCENT"};
    /** Lower-case name parts collected from {@code @enron.com} addresses in message headers. */
    HashSet<String> headerPersonNames = new HashSet<String>();
    private static final long serialVersionUID = 1L;
    private static final int CURRENT_SERIAL_VERSION = 0;

    /** Index range within {@link #skipToBlankLine} of the address-bearing header fields ("from:" .. "cc:"). */
    private static final int HEADER_FIELD_FIRST = 5;
    private static final int HEADER_FIELD_LAST = 9;
    /** Address suffix that marks a header token as a candidate for name extraction. */
    private static final String ENRON_DOMAIN = "@enron.com";
    /** Delimiters used to split body text into words; delimiters themselves are returned as words. */
    private static final String WORD_DELIMITERS = "~`!@#$%^&*()_-+={[}]|\\:;\"',<.>?/ \t\n\r";

    public EnronMessage2TokenSequence() {
        super(null, new LabelAlphabet());
    }

    /**
     * Reads the file held by {@code carrier}, and replaces the carrier's data with the resulting
     * {@link TokenSequence} and its target with the matching {@link LabelSequence}.
     *
     * <p>An {@link IOException} while reading is reported on {@code System.err}; whatever part of
     * the body was read before the failure is still tokenized.
     *
     * @param carrier instance whose data is the {@link File} to read
     * @return the same instance, with data, target and (if {@code saveSource}) source replaced
     */
    @Override
    public Instance pipe(Instance carrier) {
        TokenSequence data = new TokenSequence();
        LabelSequence target = new LabelSequence((LabelAlphabet)this.getTargetAlphabet());
        StringBuffer source = this.saveSource ? new StringBuffer() : null;
        File f = (File)carrier.getData();
        StringBuilder message = new StringBuilder();
        try {
            this.readMessage(f, message);
        }
        catch (IOException e) {
            System.err.println("Exception reading file " + f + ": " + e);
        }
        this.tokenize(message.toString(), data, target, source);
        carrier.setData(data);
        carrier.setTarget(target);
        if (this.saveSource) {
            carrier.setSource(source);
        }
        return carrier;
    }

    /** Opens {@code f}, processes headers and body, and always closes the reader afterwards. */
    private void readMessage(File f, StringBuilder message) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(f));
        try {
            this.readHeaders(br);
            EnronMessage2TokenSequence.readBody(br, message);
        }
        finally {
            br.close();
        }
    }

    /**
     * Consumes the header section (up to and including the first empty line, or end of file) and
     * harvests names from the address-bearing fields, including their folded continuation lines.
     */
    private void readHeaders(BufferedReader br) throws IOException {
        int lastField = Math.min(HEADER_FIELD_LAST, skipToBlankLine.length - 1);
        String line = br.readLine();
        while (line != null && !line.equals("")) {
            String lower = line.toLowerCase();
            int field = HEADER_FIELD_FIRST;
            while (field <= lastField && !lower.startsWith(skipToBlankLine[field])) {
                ++field;
            }
            if (field > lastField) {
                line = br.readLine();
                continue;
            }
            StringBuilder header = new StringBuilder(lower.substring(skipToBlankLine[field].length()));
            // Continuation lines start with whitespace. They are lower-cased like the first line,
            // because names are later looked up with word.toLowerCase().
            while ((line = br.readLine()) != null && !line.equals("") && (line.startsWith(" ") || line.startsWith("\t"))) {
                header.append(line.toLowerCase());
            }
            this.collectHeaderNames(header.toString());
            // "line" now holds the first line after this field; the outer loop re-examines it.
        }
    }

    /**
     * Extracts name parts from every {@code ...@enron.com} token of a header value.
     *
     * <p>With a single dot in the local part, the text before the dot is kept when the dot index
     * is greater than 1, and the text after it when at least two characters follow. With exactly
     * two adjacent dots, only the text after the second dot is kept (again requiring at least two
     * characters). Any other shape is ignored.
     */
    private void collectHeaderNames(String header) {
        StringTokenizer st = new StringTokenizer(header, " \t,");
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            if (!token.endsWith(ENRON_DOMAIN)) {
                continue;
            }
            String local = token.substring(0, token.length() - ENRON_DOMAIN.length());
            int firstDot = local.indexOf(".");
            if (firstDot == -1) {
                continue;
            }
            int lastDot = local.lastIndexOf(".");
            if (firstDot != lastDot) {
                if (firstDot == lastDot - 1 && lastDot + 1 < local.length() - 1) {
                    this.headerPersonNames.add(local.substring(lastDot + 1));
                }
                continue;
            }
            if (firstDot > 1) {
                this.headerPersonNames.add(local.substring(0, firstDot));
            }
            if (firstDot + 1 < local.length() - 1) {
                this.headerPersonNames.add(local.substring(firstDot + 1));
            }
        }
    }

    /**
     * Appends the body lines to {@code message}, each followed by a newline, leaving out lines
     * matched by {@link #skip} and whole sections introduced by {@link #skipToBlankLine}.
     */
    private static void readBody(BufferedReader br, StringBuilder message) throws IOException {
        String line;
        while ((line = br.readLine()) != null) {
            if (EnronMessage2TokenSequence.hasMarker(line, skip)) {
                continue;
            }
            if (EnronMessage2TokenSequence.hasMarker(line, skipToBlankLine)) {
                // Drop everything up to the next empty line (or end of file).
                while ((line = br.readLine()) != null && !line.equals("")) {
                }
                continue;
            }
            message.append(line);
            message.append("\n");
        }
    }

    /**
     * Tells whether {@code line} contains one of {@code markers} (compared in lower case) with
     * nothing in front of it except '-', '>' and space characters after trimming.
     */
    private static boolean hasMarker(String line, String[] markers) {
        String lower = line.toLowerCase();
        for (int i = 0; i < markers.length; ++i) {
            int index = lower.indexOf(markers[i]);
            if (index == -1) {
                continue;
            }
            // The prefix is taken from the lower-cased line so the index is always valid for it;
            // the characters tested below are unaffected by case.
            String prefix = lower.substring(0, index).trim();
            boolean onlyQuoting = true;
            for (int j = 0; j < prefix.length(); ++j) {
                char c = prefix.charAt(j);
                if (c != '-' && c != '>' && c != ' ') {
                    onlyQuoting = false;
                    break;
                }
            }
            if (onlyQuoting) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code message} on '<' and '>' and fills {@code data}/{@code target}.
     *
     * <p>{@code </ENAMEX>}, {@code </TIMEX>} and {@code </NUMEX>} reset the label to "O". An
     * opening tag of those kinds sets the label to {@code B-<type>} when the quoted type is one of
     * {@link #labels}. A '<' that does not start such a tag is emitted as a literal token, and the
     * text following it is pushed back so it is examined exactly once on the next iteration.
     *
     * @param source receives each emitted word followed by a newline; may be null
     */
    private void tokenize(String message, TokenSequence data, LabelSequence target, StringBuffer source) {
        WordTransformation wt = new WordTransformation();
        String currentLabel = "O";
        StringTokenizer st = new StringTokenizer(message, "<>", true);
        String pending = null;
        while (pending != null || st.hasMoreTokens()) {
            String text;
            if (pending != null) {
                text = pending;
                pending = null;
            } else {
                text = st.nextToken();
            }
            if (!text.equals("<")) {
                currentLabel = this.addWords(text, currentLabel, wt, data, target, source);
                continue;
            }
            if (!st.hasMoreTokens()) {
                // A '<' at the very end of the message cannot start a tag; keep it as a literal.
                EnronMessage2TokenSequence.addLiteralLessThan(currentLabel, data, target, source);
                break;
            }
            String tag = st.nextToken();
            if (tag.equals("/ENAMEX") || tag.equals("/TIMEX") || tag.equals("/NUMEX")) {
                EnronMessage2TokenSequence.consumeTagEnd(st);
                currentLabel = "O";
                continue;
            }
            if (tag.startsWith("ENAMEX") || tag.startsWith("TIMEX") || tag.startsWith("NUMEX")) {
                String type = tag.substring(tag.indexOf(" ") + 1);
                assert (type.startsWith("TYPE="));
                int openQuote = type.indexOf("\"");
                int closeQuote = type.lastIndexOf("\"");
                // Without a properly quoted type the label is left unchanged.
                if (openQuote != -1 && closeQuote > openQuote) {
                    type = type.substring(openQuote + 1, closeQuote);
                    for (int i = 0; i < labels.length; ++i) {
                        if (!labels[i].equals(type)) {
                            continue;
                        }
                        currentLabel = "B-" + type;
                        break;
                    }
                }
                EnronMessage2TokenSequence.consumeTagEnd(st);
                continue;
            }
            // False alarm: the '<' was ordinary text. The token after it may itself be '<', so it
            // is pushed back rather than tokenized here.
            EnronMessage2TokenSequence.addLiteralLessThan(currentLabel, data, target, source);
            pending = tag;
        }
    }

    /** Reads the '>' that is expected to close a tag, if the tokenizer has anything left. */
    private static void consumeTagEnd(StringTokenizer st) {
        if (st.hasMoreTokens()) {
            String nextToken = st.nextToken();
            assert (nextToken.equals(">"));
        }
    }

    /** Emits a plain "<" token carrying the current label; the label itself is not advanced. */
    private static void addLiteralLessThan(String currentLabel, TokenSequence data, LabelSequence target, StringBuffer source) {
        data.add(new Token("<"));
        target.add(currentLabel);
        if (source != null) {
            source.append("<");
            source.append("\n");
        }
    }

    /**
     * Splits tag-free {@code text} into words and emits one token per non-whitespace word.
     *
     * <p>Each word goes through {@code WordTransformation.transformedToken}; words whose
     * lower-case form is in {@link #headerPersonNames} additionally get the "HEADER-PERSON"
     * feature. After the first word emitted under a {@code B-} label the label becomes the
     * corresponding {@code I-} label.
     *
     * @return the label to use for the text that follows
     */
    private String addWords(String text, String currentLabel, WordTransformation wt, TokenSequence data, LabelSequence target, StringBuffer source) {
        StringTokenizer wordst = new StringTokenizer(text, WORD_DELIMITERS, true);
        while (wordst.hasMoreTokens()) {
            String word = wordst.nextToken();
            if (word.equals(" ") || word.equals("\t") || word.equals("\n") || word.equals("\r")) {
                continue;
            }
            Token token = wt.transformedToken(word);
            if (this.headerPersonNames.contains(word.toLowerCase())) {
                token.setFeatureValue("HEADER-PERSON", 1.0);
            }
            data.add(token);
            target.add(currentLabel);
            if (source != null) {
                source.append(word);
                source.append("\n");
            }
            if (currentLabel.startsWith("B-")) {
                currentLabel = "I-" + currentLabel.substring(2);
            }
        }
        return currentLabel;
    }

    /**
     * Serializes the collected header person names to {@code f}.
     *
     * <p>Failures are reported on {@code System.err}; the underlying file stream is closed in
     * every case.
     *
     * @param f destination file
     */
    public void write(File f) {
        try {
            FileOutputStream fos = new FileOutputStream(f);
            try {
                ObjectOutputStream oos = new ObjectOutputStream(fos);
                oos.writeObject(this.headerPersonNames);
                // Push buffered object data to the file stream before it is closed below.
                oos.flush();
            }
            finally {
                fos.close();
            }
        }
        catch (IOException e) {
            System.err.println("Exception writing file " + f + ": " + e);
        }
    }

    /** Custom serialized form: version number, {@code saveSource}, then the header name set. */
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeInt(CURRENT_SERIAL_VERSION);
        out.writeBoolean(this.saveSource);
        out.writeObject(this.headerPersonNames);
    }

    /** Reads the form produced by {@link #writeObject}; the version number is read but not used. */
    @SuppressWarnings("unchecked")
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.readInt();
        this.saveSource = in.readBoolean();
        // Unchecked: the stream is expected to hold the set written by writeObject.
        this.headerPersonNames = (HashSet<String>)in.readObject();
    }
}

