/*
 * Decompiled with CFR 0.152.
 */
package net.sf.okapi.steps.termextraction;

import com.ibm.icu.text.BreakIterator;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.annotation.TermsAnnotation;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.resource.AnnotatedSpan;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.steps.termextraction.Parameters;
import net.sf.okapi.steps.tokenization.Token;
import net.sf.okapi.steps.tokenization.Tokens;
import net.sf.okapi.steps.tokenization.TokensAnnotation;

/**
 * Collects candidate terms from text units and writes them to a
 * tab-delimited report.
 *
 * <p>Terms come from up to three sources, each enabled through
 * {@link Parameters}: word n-grams gathered from the source text
 * ("statistics"), {@link TermsAnnotation} entries attached to the source
 * container, and "generic" annotated spans found on the source segments.
 *
 * <p>Typical use: call {@link #initialize}, then {@link #processTextUnit}
 * for each text unit, then {@link #completeExtraction} once.
 */
public class SimpleTermExtractor {
    private Parameters params;
    // The word lists are used as sets: only the keys matter.
    private Map<String, Boolean> stopWords;
    private Map<String, Boolean> notStartWords;
    private Map<String, Boolean> notEndWords;
    private Map<String, Integer> terms;
    private Map<String, Integer> termsFromAnnotations;
    private Locale srcLocale;
    private BreakIterator breaker;
    private String rootDir;
    private String inputRootDir;

    /**
     * Prepares the extractor for a new run: stores the settings, loads the
     * three word lists and resets the collected terms.
     *
     * @param params the extraction options.
     * @param sourceLocaleId the source locale; used for lower-casing and word breaking.
     * @param rootDir value passed to {@code Util.fillRootDirectoryVariable} for the output path.
     * @param inputRootDir value passed to {@code Util.fillInputRootDirectoryVariable} for the output path.
     * @throws OkapiException if one of the word lists cannot be read.
     */
    public void initialize(Parameters params, LocaleId sourceLocaleId, String rootDir, String inputRootDir) {
        this.srcLocale = sourceLocaleId.toJavaLocale();
        this.params = params;
        this.rootDir = rootDir;
        this.inputRootDir = inputRootDir;
        this.stopWords = this.loadList(params.getStopWordsPath(), "stopWords_en.txt");
        this.notStartWords = this.loadList(params.getNotStartWordsPath(), "notStartWords_en.txt");
        this.notEndWords = this.loadList(params.getNotEndWordsPath(), "notEndWords_en.txt");
        this.terms = new LinkedHashMap<String, Integer>();
        this.termsFromAnnotations = new LinkedHashMap<String, Integer>();
        // The word breaker is created lazily, on first use.
        this.breaker = null;
    }

    /**
     * Gathers terms from one text unit, using every source enabled in the
     * parameters. Non-translatable text units are ignored.
     *
     * @param tu the text unit to process.
     */
    public void processTextUnit(ITextUnit tu) {
        if (!tu.isTranslatable()) {
            return;
        }
        if (this.params.getUseStatistics()) {
            this.gathertermsFromStatistics(tu);
        }
        if (this.params.getUseTerminologyAnnotations()) {
            TermsAnnotation ann = tu.getSource().getAnnotation(TermsAnnotation.class);
            if (ann != null) {
                for (int i = 0; i < ann.size(); ++i) {
                    this.termsFromAnnotations.merge(ann.getTerm(i), 1, Integer::sum);
                }
            }
        }
        if (this.params.getUseTextAnalysisAnnotations()) {
            this.harvestTextAnalysisAnnotations(tu);
        }
    }

    /**
     * Counts every word sequence of the text unit that qualifies as a
     * candidate term.
     *
     * <p>Words are taken from the {@link TokensAnnotation} of the text unit
     * when there is one, otherwise from the default word breaker. A candidate
     * starts at any word that is neither a stop word nor a not-start word,
     * is extended one word at a time up to the maximum number of words per
     * term, and stops at the first stop word. Each prefix that has at least
     * the minimum number of words and does not end with a not-end word is
     * counted.
     */
    private void gathertermsFromStatistics(ITextUnit tu) {
        TokensAnnotation annot = tu.getAnnotation(TokensAnnotation.class);
        List<String> words;
        if (annot != null) {
            Tokens tokens = annot.getFilteredList("WORD", "KANA", "IDEOGRAM");
            words = new ArrayList<String>();
            for (Token token : tokens) {
                this.addWord(words, token.getValue());
            }
        } else {
            words = this.getWordsFromDefaultBreaker(tu.getSource());
        }

        final int maxWords = this.params.getMaxWordsPerTerm();
        final int minWords = this.params.getMinWordsPerTerm();
        final int wordCount = words.size();

        for (int i = 0; i < wordCount; ++i) {
            if (this.stopWords.containsKey(words.get(i))) {
                continue;
            }
            StringBuilder term = new StringBuilder();
            for (int j = 0; j < maxWords && i + j < wordCount; ++j) {
                String word = words.get(i + j);
                // A stop word ends the candidate; so does a not-start word in first position.
                if (this.stopWords.containsKey(word)) {
                    break;
                }
                if (j == 0 && this.notStartWords.containsKey(word)) {
                    break;
                }
                if (j > 0) {
                    // The separator depends on the last character already in the term.
                    term.append(this.getWordSeparator(term.charAt(term.length() - 1)));
                }
                term.append(word);
                // Too short, or ends with a not-end word: keep extending without counting.
                if (j + 1 < minWords || this.notEndWords.containsKey(word)) {
                    continue;
                }
                this.terms.merge(term.toString(), 1, Integer::sum);
            }
        }
    }

    /**
     * Counts the text of every "generic" annotated span found in the source
     * segments of the text unit as a term coming from annotations.
     */
    private void harvestTextAnalysisAnnotations(ITextUnit tu) {
        for (Segment seg : tu.getSource().getSegments()) {
            if (!seg.getContent().hasAnnotation("generic")) {
                continue;
            }
            List<AnnotatedSpan> aspans = seg.getContent().getAnnotatedSpans("generic");
            for (AnnotatedSpan aspan : aspans) {
                this.termsFromAnnotations.merge(aspan.span.toText(), 1, Integer::sum);
            }
        }
    }

    /**
     * Returns the separator to place after a word ending with the given character.
     *
     * @param prevChar the last character of the text built so far.
     * @return an empty string when the character is above U+0700 and has the
     *     Unicode general category "Lo" ({@link Character#OTHER_LETTER}),
     *     a single space otherwise.
     */
    private String getWordSeparator(char prevChar) {
        if (prevChar > '\u0700' && Character.getType(prevChar) == Character.OTHER_LETTER) {
            return "";
        }
        return " ";
    }

    /**
     * Finalizes the extraction: removes statistical terms below the minimum
     * number of occurrences, optionally removes sub-terms, adds the terms
     * collected from annotations (their counts replace those of identical
     * statistical terms), sorts the result and writes the report.
     *
     * @throws OkapiException if the report cannot be written.
     */
    public void completeExtraction() {
        this.cleanupLowCounts(this.terms);
        if (this.params.getRemoveSubTerms()) {
            this.terms = this.cleanupSubStrings(this.terms);
            // Removing sub-terms lowers some counts, so filter again.
            this.cleanupLowCounts(this.terms);
        }
        this.terms.putAll(this.termsFromAnnotations);
        // Alphabetical order first; it is also the tie-breaker of the sort by occurrence.
        this.terms = new TreeMap<String, Integer>(this.terms);
        if (this.params.getSortByOccurrence()) {
            this.terms = this.sortByValues(this.terms);
        }
        this.generateReport();
    }

    /**
     * Writes the terms to the output path as UTF-8 text, one
     * {@code count<TAB>term} line per entry, in the current order of the map.
     *
     * @throws OkapiException if the file cannot be created or written.
     */
    private void generateReport() {
        try {
            String finalPath = Util.fillRootDirectoryVariable(this.params.getOutputPath(), this.rootDir);
            finalPath = Util.fillInputRootDirectoryVariable(finalPath, this.inputRootDir);
            Util.createDirectories(finalPath);
            try (PrintWriter writer = new PrintWriter(finalPath, "UTF-8")) {
                for (Map.Entry<String, Integer> entry : this.terms.entrySet()) {
                    writer.println(String.format("%d\t%s", entry.getValue(), entry.getKey()));
                }
                // PrintWriter never throws on write failures: ask explicitly so that
                // a truncated report does not go unnoticed.
                if (writer.checkError()) {
                    throw new IOException("Failed to write the term report: " + finalPath);
                }
            }
        }
        catch (IOException e) {
            throw new OkapiException("Error when writing output file.", e);
        }
    }

    /**
     * Gets the terms collected so far.
     *
     * @return the live map of terms to occurrence counts (not a copy).
     */
    public Map<String, Integer> getTerms() {
        return this.terms;
    }

    /**
     * Removes from the given map every entry whose count is below the
     * minimum number of occurrences set in the parameters.
     */
    private void cleanupLowCounts(Map<String, Integer> mapToClean) {
        final int minOccurrences = this.params.getMinOccurrences();
        mapToClean.entrySet().removeIf(entry -> entry.getValue() < minOccurrences);
    }

    /**
     * Discounts, for each term, the occurrences already accounted for by
     * longer terms that start with it followed by a word separator.
     *
     * <p>A term whose count is exactly the sum of the counts of such longer
     * terms is removed; otherwise that sum is subtracted from its count.
     *
     * @param mapToClean the terms to process; it is not modified.
     * @return a new map, sorted in reverse natural order of the terms.
     */
    private Map<String, Integer> cleanupSubStrings(Map<String, Integer> mapToClean) {
        TreeMap<String, Integer> sortedTerms = new TreeMap<String, Integer>(Collections.reverseOrder());
        sortedTerms.putAll(mapToClean);
        Iterator<Map.Entry<String, Integer>> outer = sortedTerms.entrySet().iterator();
        while (outer.hasNext()) {
            Map.Entry<String, Integer> candidate = outer.next();
            String key = candidate.getKey();
            String prefix = key + this.getWordSeparator(key.charAt(key.length() - 1));
            int count = 0;
            for (Map.Entry<String, Integer> other : sortedTerms.entrySet()) {
                // When the separator is empty the prefix equals the key itself,
                // so the candidate must be excluded explicitly.
                if (other.getKey().startsWith(prefix) && !other.getKey().equals(key)) {
                    count += other.getValue();
                }
            }
            if (candidate.getValue() == count) {
                outer.remove();
            } else {
                candidate.setValue(candidate.getValue() - count);
            }
        }
        return sortedTerms;
    }

    /**
     * Adds a token to the word list if it qualifies as a word.
     *
     * <p>Skipped: empty tokens, single-character tokens whose code point is
     * below 126, and tokens that do not start with a letter or a digit.
     * The token is lower-cased with the source locale unless the keep-case
     * option is set.
     */
    private void addWord(List<String> list, String token) {
        if (token.length() == 0 || (token.length() == 1 && token.codePointAt(0) < 126)) {
            return;
        }
        if (!Character.isLetterOrDigit(token.codePointAt(0))) {
            return;
        }
        list.add(this.params.getKeepCase() ? token : token.toLowerCase(this.srcLocale));
    }

    /**
     * Splits the plain text of the container into words using a word
     * break iterator for the source locale.
     *
     * @param tc the container to process.
     * @return the qualifying words, in text order (empty if there is no text).
     */
    private List<String> getWordsFromDefaultBreaker(TextContainer tc) {
        String content = tc.contentIsOneSegment()
            ? TextUnitUtil.getText(tc.getFirstContent())
            : TextUnitUtil.getText(tc.getUnSegmentedContentCopy());
        if (content.length() == 0) {
            return Collections.emptyList();
        }
        if (this.breaker == null) {
            this.breaker = BreakIterator.getWordInstance(this.srcLocale);
        }
        this.breaker.setText(content);
        List<String> words = new ArrayList<String>();
        int start = this.breaker.first();
        // The loop ends when next() returns -1.
        for (int end = this.breaker.next(); end != -1; end = this.breaker.next()) {
            this.addWord(words, content.substring(start, end));
            start = end;
        }
        return words;
    }

    /**
     * Loads a word list: UTF-8 text, one word per line. Lines are trimmed;
     * empty lines and lines starting with {@code #} are ignored.
     *
     * @param path path of the file to load; when empty, the class resource
     *     {@code defaultFile} is loaded instead.
     * @param defaultFile name of the fallback resource, resolved relative to this class.
     * @return a map whose keys are the words (all values are {@code false}).
     * @throws OkapiException if the list cannot be found or read.
     */
    private HashMap<String, Boolean> loadList(String path, String defaultFile) {
        HashMap<String, Boolean> map = new HashMap<String, Boolean>();
        try {
            InputStream is = Util.isEmpty(path) ? SimpleTermExtractor.class.getResourceAsStream(defaultFile) : new FileInputStream(path);
            if (is == null) {
                // getResourceAsStream() returns null instead of throwing.
                throw new IOException("Word list resource not found: " + defaultFile);
            }
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (line.length() == 0 || line.charAt(0) == '#') {
                        continue;
                    }
                    map.put(line, Boolean.FALSE);
                }
            }
        }
        catch (IOException e) {
            throw new OkapiException("Error reading word list.", e);
        }
        return map;
    }

    /**
     * Returns a copy of the map ordered by decreasing value. Entries with
     * equal values keep the relative order they have in the given map.
     *
     * <p>The result is an insertion-ordered map rather than a tree map with a
     * value-based comparator: such a comparator cannot be consistent with
     * key equality, which makes key lookups on the result fail.
     *
     * @param map the map to sort; it is not modified.
     * @return a new map iterating from the highest to the lowest value.
     */
    private <K, V extends Comparable<V>> Map<K, V> sortByValues(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Comparator<Map.Entry<K, V>> byValueDescending = (e1, e2) -> e2.getValue().compareTo(e1.getValue());
        // List.sort() is stable, so ties keep the order of the source map.
        entries.sort(byValueDescending);
        Map<K, V> sortedMap = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> entry : entries) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }
}

