/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import junit.framework.TestCase;

/**
 * Integration test that runs a Chinese CoreNLP pipeline over an XML test
 * document and checks the resulting tokens and their character offsets
 * against hard-coded expectations.
 *
 * <p>The test reads its properties file and input document from
 * {@link #RESOURCE_DIR}, so it can only run where that directory exists.
 */
public class ChineseTokenizationITest extends TestCase {
    /** Root directory holding the properties file and the XML test document. */
    private static final String RESOURCE_DIR = "/u/scr/nlp/data/stanford-corenlp-testing/";

    /** Expected token text, one inner list per expected sentence. */
    public static List<List<String>> xmlDocSentenceTokens = new ArrayList<List<String>>();

    /**
     * Expected (begin, end) character offsets, parallel to
     * {@link #xmlDocSentenceTokens}: same sentence order, same token order.
     */
    public static List<List<Pair<Integer, Integer>>> xmlDocCharOffsets = new ArrayList<List<Pair<Integer, Integer>>>();

    static {
        // Sentence 0, roughly "Barack Obama is the US president."
        xmlDocSentenceTokens.add(Arrays.asList("\u5df4\u62c9\u514b\u00b7\u5965\u5df4\u9a6c", "\u662f", "\u7f8e\u56fd", "\u603b\u7edf", "\u3002"));
        // Sentence 1, roughly "He was elected in 2008."
        xmlDocSentenceTokens.add(Arrays.asList("\u4ed6", "\u5728", "2008\u5e74", "\u5f53\u9009", "\u3002"));

        List<Pair<Integer, Integer>> firstSentenceOffsets = new ArrayList<Pair<Integer, Integer>>();
        firstSentenceOffsets.add(new Pair<Integer, Integer>(55, 62));
        firstSentenceOffsets.add(new Pair<Integer, Integer>(62, 63));
        // NOTE(review): this span is 3 characters wide for a 2-character token;
        // presumably the input document has a newline inside the word -- confirm
        // against example-chinese-basic.xml.
        firstSentenceOffsets.add(new Pair<Integer, Integer>(63, 66));
        firstSentenceOffsets.add(new Pair<Integer, Integer>(66, 68));
        firstSentenceOffsets.add(new Pair<Integer, Integer>(68, 69));
        xmlDocCharOffsets.add(firstSentenceOffsets);

        List<Pair<Integer, Integer>> secondSentenceOffsets = new ArrayList<Pair<Integer, Integer>>();
        secondSentenceOffsets.add(new Pair<Integer, Integer>(79, 80));
        secondSentenceOffsets.add(new Pair<Integer, Integer>(80, 81));
        secondSentenceOffsets.add(new Pair<Integer, Integer>(81, 86));
        secondSentenceOffsets.add(new Pair<Integer, Integer>(86, 88));
        secondSentenceOffsets.add(new Pair<Integer, Integer>(88, 89));
        xmlDocCharOffsets.add(secondSentenceOffsets);
    }

    /**
     * Annotates the XML test document with the
     * {@code tokenize,cleanxml,ssplit,pos} annotators and verifies that every
     * sentence has exactly the expected tokens with the expected character
     * offsets.
     *
     * <p>Sentence and token counts are asserted explicitly so that a pipeline
     * producing too few sentences or tokens fails the test instead of passing
     * vacuously, and one producing too many fails with a readable message
     * rather than an {@link IndexOutOfBoundsException}.
     *
     * @throws Exception if the pipeline cannot be built or the document cannot be read
     */
    public void testXMLDocWithNewlines() throws Exception {
        Properties props = StringUtils.argsToProperties("-args", RESOURCE_DIR + "test-props/kbp-2017-chinese.properties");
        // Override whatever annotator list the properties file specifies.
        props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        String xmlFilePath = RESOURCE_DIR + "test-docs/example-chinese-basic.xml";
        Annotation xmlAnnotation = new Annotation(IOUtils.stringFromFile(xmlFilePath));
        pipeline.annotate(xmlAnnotation);

        List<CoreMap> sentences = xmlAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
        assertNotNull("pipeline produced no sentences annotation", sentences);
        assertEquals("number of sentences", xmlDocSentenceTokens.size(), sentences.size());

        for (int sentNum = 0; sentNum < sentences.size(); sentNum++) {
            List<String> expectedWords = xmlDocSentenceTokens.get(sentNum);
            List<Pair<Integer, Integer>> expectedOffsets = xmlDocCharOffsets.get(sentNum);

            List<CoreLabel> tokens = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
            assertNotNull("sentence " + sentNum + " has no tokens annotation", tokens);
            assertEquals("number of tokens in sentence " + sentNum, expectedWords.size(), tokens.size());

            for (int tokenNum = 0; tokenNum < tokens.size(); tokenNum++) {
                CoreLabel token = tokens.get(tokenNum);
                assertEquals(expectedWords.get(tokenNum), token.word());

                Pair<Integer, Integer> tokenCharOffsets = new Pair<Integer, Integer>(
                        token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                        token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
                assertEquals(expectedOffsets.get(tokenNum), tokenCharOffsets);
            }
        }
    }
}

