/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.international.negra.NegraPennLanguagePack;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;

public class PTBTokenizerTest {
    private final String[] ptbInputs = new String[]{"This is a sentence.", "U.S. insurance: Conseco acquires Kemper Corp. \n</HEADLINE>\n<P>\nU.S insurance", "Based in Eugene,Ore., PakTech needs a new distributor after Sydney-based Creative Pack Pty. Ltd. went into voluntary administration.", "The Iron Age (ca. 1300 \u2013 ca. 300 BC).", "Indo\u00adnesian ship\u00adping \u00ad", "Gimme a phone, I'm gonna call.", "\"John & Mary's dog,\" Jane thought (to herself).\n\"What a #$%!\na- ``I like AT&T''.\"", "I said at 4:45pm.", "I can't believe they wanna keep 40% of that.\"\n``Whatcha think?''\n\"I don't --- think so...,\"", "You `paid' US$170,000?!\nYou should've paid only$16.75.", "1. Buy a new Chevrolet (37%-owned in the U.S..) . 15%", "I like you ;-) but do you care :(. I'm happy ^_^ but shy (x.x)!", "Diamond (``Not even the chair'') lives near Udaipur (84km). {1. A potential Palmer trade:}", "No. I like No. 24 and no.47.", "You can get a B.S. or a B. A. or a Ph.D (sometimes a Ph. D) from Stanford.", "@Harry_Styles didn`t like Mu`ammar al-Qaddafi", "Kenneth liked Windows 3.1, Windows 3.x, and Mesa A.B as I remember things.", "I like programming in F# more than C#.", "NBC Live will be available free through the Yahoo! Chat Web site. E! Entertainment said ``Jeopardy!'' is a game show.", "I lived in O\u2019Malley and read OK! Magazine.", "I lived in O\u0092Malley and read OK! Magazine.", "I like: \u2022wine, \u0095cheese, \u2023salami, & \u2043speck.", "I don't give a f**k about your sh*tty life.", "First sentence.... Second sentence.", "First sentence . . . . Second sentence.", "I wasn\u2019t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn\u2019t mean it.", "This is a url test. Here is one: http://google.com.", "This is a url test. 
Here is one: htvp://google.com.", "Download from ftp://myname@host.dom/%2Fetc/motd", "Download from svn://user@location.edu/path/to/magic/unicorns", "Download from svn+ssh://user@location.edu/path/to/magic/unicorns", "Independent Living can be reached at http://www.inlv.demon.nl/.", "We traveled from No. Korea to So. Calif. yesterday.", "I dunno.", "The o-kay was received by the anti-acquisition front on its foolishness-filled fish market.", "We ran the pre-tests through the post-scripted centrifuge.", "School-aged parents should be aware of the unique problems that they face.", "I dispute Art. 53 of the convention.", "I like Art. And I like History.", "Contact: sue@google.com, fred@stanford.edu; michael.inman@lab.rpi.cs.cmu.edu.", "Email: recruiters@marvelconsultants.com <mailto:recruiters@marvelconsultants.com>", " Jeremy Meier <jermeier@earthlink.net>", "Ram Tackett,  (mailto:rtackett@abacustech.net)", "[Jgerma5@aol.com]. Danny_Jones%ENRON@eott.com", "https://fancy.startup.ai", "mid-2015", "UK-based", "2010-2015", "20-30%", "80,000-man march", "39-yard", "60-90's", "Soft AC-styled", "3 p.m., eastern time", "Total Private\nOrders 779.5 -9.5%", "2-9.5%", "2- 9.5%", "From July 23-24. Radisson Miyako Hotel.", "23 percent-2 percent higher than today", "23 percent--2 percent higher than today", "438798-438804", "He earned eligibility by virtue of a top-35 finish.", "Witt was 2-for-34 as a hitter", "An Atlanta-bound DC-9 crashed", "weigh 1,000-1,200 pounds, ", "Imus arrived to be host for the 5:30-to-10 a.m. show.", "The .38-Magnum bullet", "a 1908 Model K Stanley with 1:01-minute time", "the 9-to-11:45 a.m. weekday shift", "Brighton Rd. Pacifica", "Walls keeping water out of the bowl-shaped city have been breached, and emergency teams are using helicopters to drop 1,350kg (3,000lb) sandbags and concrete barriers into the gaps.", "i got (89.2%) in my exams"};
    private final String[][] ptbGold = new String[][]{{"This", "is", "a", "sentence", "."}, {"U.S.", "insurance", ":", "Conseco", "acquires", "Kemper", "Corp.", ".", "</HEADLINE>", "<P>", "U.S", "insurance"}, {"Based", "in", "Eugene", ",", "Ore.", ",", "PakTech", "needs", "a", "new", "distributor", "after", "Sydney-based", "Creative", "Pack", "Pty.", "Ltd.", "went", "into", "voluntary", "administration", "."}, {"The", "Iron", "Age", "-LRB-", "ca.", "1300", "--", "ca.", "300", "BC", "-RRB-", "."}, {"Indonesian", "shipping", "-"}, {"Gim", "me", "a", "phone", ",", "I", "'m", "gon", "na", "call", "."}, {"``", "John", "&", "Mary", "'s", "dog", ",", "''", "Jane", "thought", "-LRB-", "to", "herself", "-RRB-", ".", "``", "What", "a", "#", "$", "%", "!", "a", "-", "``", "I", "like", "AT&T", "''", ".", "''"}, {"I", "said", "at", "4:45", "pm", "."}, {"I", "ca", "n't", "believe", "they", "wan", "na", "keep", "40", "%", "of", "that", ".", "''", "``", "Whatcha", "think", "?", "''", "``", "I", "do", "n't", "--", "think", "so", "...", ",", "''"}, {"You", "`", "paid", "'", "US$", "170,000", "?!", "You", "should", "'ve", "paid", "only", "$", "16.75", "."}, {"1", ".", "Buy", "a", "new", "Chevrolet", "-LRB-", "37", "%", "-", "owned", "in", "the", "U.S.", ".", "-RRB-", ".", "15", "%"}, {"I", "like", "you", ";--RRB-", "but", "do", "you", "care", ":-LRB-", ".", "I", "'m", "happy", "^_^", "but", "shy", "-LRB-x.x-RRB-", "!"}, {"Diamond", "-LRB-", "``", "Not", "even", "the", "chair", "''", "-RRB-", "lives", "near", "Udaipur", "-LRB-", "84", "km", "-RRB-", ".", "-LCB-", "1", ".", "A", "potential", "Palmer", "trade", ":", "-RCB-"}, {"No", ".", "I", "like", "No.", "24", "and", "no.", "47", "."}, {"You", "can", "get", "a", "B.S.", "or", "a", "B.", "A.", "or", "a", "Ph.D", "-LRB-", "sometimes", "a", "Ph.", "D", "-RRB-", "from", "Stanford", "."}, {"@Harry_Styles", "did", "n`t", "like", "Mu`ammar", "al-Qaddafi"}, {"Kenneth", "liked", "Windows", "3.1", ",", "Windows", "3.x", ",", "and", "Mesa", 
"A.B", "as", "I", "remember", "things", "."}, {"I", "like", "programming", "in", "F#", "more", "than", "C#", "."}, {"NBC", "Live", "will", "be", "available", "free", "through", "the", "Yahoo!", "Chat", "Web", "site", ".", "E!", "Entertainment", "said", "``", "Jeopardy!", "''", "is", "a", "game", "show", "."}, {"I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."}, {"I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."}, {"I", "like", ":", "\u2022", "wine", ",", "\u2022", "cheese", ",", "\u2023", "salami", ",", "&", "\u2043", "speck", "."}, {"I", "do", "n't", "give", "a", "f**k", "about", "your", "sh*tty", "life", "."}, {"First", "sentence", "...", ".", "Second", "sentence", "."}, {"First", "sentence", "...", ".", "Second", "sentence", "."}, {"I", "was", "n't", "really", "...", "well", ",", "what", "I", "mean", "...", "see", "...", "what", "I", "'m", "saying", ",", "the", "thing", "is", "...", "I", "did", "n't", "mean", "it", "."}, {"This", "is", "a", "url", "test", ".", "Here", "is", "one", ":", "http://google.com", "."}, {"This", "is", "a", "url", "test", ".", "Here", "is", "one", ":", "htvp", ":", "/", "/", "google.com", "."}, {"Download", "from", "ftp://myname@host.dom/%2Fetc/motd"}, {"Download", "from", "svn://user@location.edu/path/to/magic/unicorns"}, {"Download", "from", "svn+ssh://user@location.edu/path/to/magic/unicorns"}, {"Independent", "Living", "can", "be", "reached", "at", "http://www.inlv.demon.nl/", "."}, {"We", "traveled", "from", "No.", "Korea", "to", "So.", "Calif.", "yesterday", "."}, {"I", "du", "n", "no", "."}, {"The", "o-kay", "was", "received", "by", "the", "anti-acquisition", "front", "on", "its", "foolishness-filled", "fish", "market", "."}, {"We", "ran", "the", "pre-tests", "through", "the", "post-scripted", "centrifuge", "."}, {"School-aged", "parents", "should", "be", "aware", "of", "the", "unique", "problems", "that", "they", "face", "."}, {"I", "dispute", "Art.", "53", "of", "the", "convention", "."}, 
{"I", "like", "Art", ".", "And", "I", "like", "History", "."}, {"Contact", ":", "sue@google.com", ",", "fred@stanford.edu", ";", "michael.inman@lab.rpi.cs.cmu.edu", "."}, {"Email", ":", "recruiters@marvelconsultants.com", "<mailto:recruiters@marvelconsultants.com>"}, {"Jeremy", "Meier", "<jermeier@earthlink.net>"}, {"Ram", "Tackett", ",", "-LRB-", "mailto:rtackett@abacustech.net", "-RRB-"}, {"-LSB-", "Jgerma5@aol.com", "-RSB-", ".", "Danny_Jones%ENRON@eott.com"}, {"https://fancy.startup.ai"}, {"mid-2015"}, {"UK-based"}, {"2010-2015"}, {"20-30", "%"}, {"80,000-man", "march"}, {"39-yard"}, {"60-90", "'s"}, {"Soft", "AC-styled"}, {"3", "p.m.", ",", "eastern", "time"}, {"Total", "Private", "Orders", "779.5", "-9.5", "%"}, {"2-9.5", "%"}, {"2", "-", "9.5", "%"}, {"From", "July", "23-24", ".", "Radisson", "Miyako", "Hotel", "."}, {"23", "percent-2", "percent", "higher", "than", "today"}, {"23", "percent", "--", "2", "percent", "higher", "than", "today"}, {"438798-438804"}, {"He", "earned", "eligibility", "by", "virtue", "of", "a", "top-35", "finish", "."}, {"Witt", "was", "2-for-34", "as", "a", "hitter"}, {"An", "Atlanta-bound", "DC-9", "crashed"}, {"weigh", "1,000-1,200", "pounds", ","}, {"Imus", "arrived", "to", "be", "host", "for", "the", "5:30-to-10", "a.m.", "show", "."}, {"The", ".38-Magnum", "bullet"}, {"a", "1908", "Model", "K", "Stanley", "with", "1:01-minute", "time"}, {"the", "9-to-11:45", "a.m.", "weekday", "shift"}, {"Brighton", "Rd.", "Pacifica"}, {"Walls", "keeping", "water", "out", "of", "the", "bowl-shaped", "city", "have", "been", "breached", ",", "and", "emergency", "teams", "are", "using", "helicopters", "to", "drop", "1,350", "kg", "-LRB-", "3,000", "lb", "-RRB-", "sandbags", "and", "concrete", "barriers", "into", "the", "gaps", "."}, {"i", "got", "-LRB-", "89.2", "%", "-RRB-", "in", "my", "exams"}};
    private final String[][] ptbGoldSplitHyphenated = new String[][]{{"This", "is", "a", "sentence", "."}, {"U.S.", "insurance", ":", "Conseco", "acquires", "Kemper", "Corp.", ".", "</HEADLINE>", "<P>", "U.S", "insurance"}, {"Based", "in", "Eugene", ",", "Ore.", ",", "PakTech", "needs", "a", "new", "distributor", "after", "Sydney", "-", "based", "Creative", "Pack", "Pty.", "Ltd.", "went", "into", "voluntary", "administration", "."}, {"The", "Iron", "Age", "-LRB-", "ca.", "1300", "--", "ca.", "300", "BC", "-RRB-", "."}, {"Indonesian", "shipping", "-"}, {"Gim", "me", "a", "phone", ",", "I", "'m", "gon", "na", "call", "."}, {"``", "John", "&", "Mary", "'s", "dog", ",", "''", "Jane", "thought", "-LRB-", "to", "herself", "-RRB-", ".", "``", "What", "a", "#", "$", "%", "!", "a", "-", "``", "I", "like", "AT&T", "''", ".", "''"}, {"I", "said", "at", "4:45", "pm", "."}, {"I", "ca", "n't", "believe", "they", "wan", "na", "keep", "40", "%", "of", "that", ".", "''", "``", "Whatcha", "think", "?", "''", "``", "I", "do", "n't", "--", "think", "so", "...", ",", "''"}, {"You", "`", "paid", "'", "US$", "170,000", "?!", "You", "should", "'ve", "paid", "only", "$", "16.75", "."}, {"1", ".", "Buy", "a", "new", "Chevrolet", "-LRB-", "37", "%", "-", "owned", "in", "the", "U.S.", ".", "-RRB-", ".", "15", "%"}, {"I", "like", "you", ";--RRB-", "but", "do", "you", "care", ":-LRB-", ".", "I", "'m", "happy", "^_^", "but", "shy", "-LRB-x.x-RRB-", "!"}, {"Diamond", "-LRB-", "``", "Not", "even", "the", "chair", "''", "-RRB-", "lives", "near", "Udaipur", "-LRB-", "84", "km", "-RRB-", ".", "-LCB-", "1", ".", "A", "potential", "Palmer", "trade", ":", "-RCB-"}, {"No", ".", "I", "like", "No.", "24", "and", "no.", "47", "."}, {"You", "can", "get", "a", "B.S.", "or", "a", "B.", "A.", "or", "a", "Ph.D", "-LRB-", "sometimes", "a", "Ph.", "D", "-RRB-", "from", "Stanford", "."}, {"@Harry_Styles", "did", "n`t", "like", "Mu`ammar", "al", "-", "Qaddafi"}, {"Kenneth", "liked", "Windows", "3.1", ",", "Windows", 
"3.x", ",", "and", "Mesa", "A.B", "as", "I", "remember", "things", "."}, {"I", "like", "programming", "in", "F#", "more", "than", "C#", "."}, {"NBC", "Live", "will", "be", "available", "free", "through", "the", "Yahoo!", "Chat", "Web", "site", ".", "E!", "Entertainment", "said", "``", "Jeopardy!", "''", "is", "a", "game", "show", "."}, {"I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."}, {"I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."}, {"I", "like", ":", "\u2022", "wine", ",", "\u2022", "cheese", ",", "\u2023", "salami", ",", "&", "\u2043", "speck", "."}, {"I", "do", "n't", "give", "a", "f**k", "about", "your", "sh*tty", "life", "."}, {"First", "sentence", "...", ".", "Second", "sentence", "."}, {"First", "sentence", "...", ".", "Second", "sentence", "."}, {"I", "was", "n't", "really", "...", "well", ",", "what", "I", "mean", "...", "see", "...", "what", "I", "'m", "saying", ",", "the", "thing", "is", "...", "I", "did", "n't", "mean", "it", "."}, {"This", "is", "a", "url", "test", ".", "Here", "is", "one", ":", "http://google.com", "."}, {"This", "is", "a", "url", "test", ".", "Here", "is", "one", ":", "htvp", ":", "/", "/", "google.com", "."}, {"Download", "from", "ftp://myname@host.dom/%2Fetc/motd"}, {"Download", "from", "svn://user@location.edu/path/to/magic/unicorns"}, {"Download", "from", "svn+ssh://user@location.edu/path/to/magic/unicorns"}, {"Independent", "Living", "can", "be", "reached", "at", "http://www.inlv.demon.nl/", "."}, {"We", "traveled", "from", "No.", "Korea", "to", "So.", "Calif.", "yesterday", "."}, {"I", "du", "n", "no", "."}, {"The", "o-kay", "was", "received", "by", "the", "anti-acquisition", "front", "on", "its", "foolishness", "-", "filled", "fish", "market", "."}, {"We", "ran", "the", "pre-tests", "through", "the", "post-scripted", "centrifuge", "."}, {"School", "-", "aged", "parents", "should", "be", "aware", "of", "the", "unique", "problems", "that", "they", "face", "."}, {"I", "dispute", 
"Art.", "53", "of", "the", "convention", "."}, {"I", "like", "Art", ".", "And", "I", "like", "History", "."}, {"Contact", ":", "sue@google.com", ",", "fred@stanford.edu", ";", "michael.inman@lab.rpi.cs.cmu.edu", "."}, {"Email", ":", "recruiters@marvelconsultants.com", "<mailto:recruiters@marvelconsultants.com>"}, {"Jeremy", "Meier", "<jermeier@earthlink.net>"}, {"Ram", "Tackett", ",", "-LRB-", "mailto:rtackett@abacustech.net", "-RRB-"}, {"-LSB-", "Jgerma5@aol.com", "-RSB-", ".", "Danny_Jones%ENRON@eott.com"}, {"https://fancy.startup.ai"}, {"mid", "-", "2015"}, {"UK", "-", "based"}, {"2010", "-", "2015"}, {"20", "-", "30", "%"}, {"80,000", "-", "man", "march"}, {"39", "-", "yard"}, {"60", "-", "90", "'s"}, {"Soft", "AC", "-", "styled"}, {"3", "p.m.", ",", "eastern", "time"}, {"Total", "Private", "Orders", "779.5", "-9.5", "%"}, {"2", "-", "9.5", "%"}, {"2", "-", "9.5", "%"}, {"From", "July", "23", "-", "24", ".", "Radisson", "Miyako", "Hotel", "."}, {"23", "percent", "-2", "percent", "higher", "than", "today"}, {"23", "percent", "--", "2", "percent", "higher", "than", "today"}, {"438798", "-", "438804"}, {"He", "earned", "eligibility", "by", "virtue", "of", "a", "top", "-35", "finish", "."}, {"Witt", "was", "2", "-", "for", "-34", "as", "a", "hitter"}, {"An", "Atlanta", "-", "bound", "DC", "-9", "crashed"}, {"weigh", "1,000-1,200", "pounds", ","}, {"Imus", "arrived", "to", "be", "host", "for", "the", "5:30-to-10", "a.m.", "show", "."}, {"The", ".38-Magnum", "bullet"}, {"a", "1908", "Model", "K", "Stanley", "with", "1:01-minute", "time"}, {"the", "9-to-11:45", "a.m.", "weekday", "shift"}, {"Brighton", "Rd.", "Pacifica"}, {"Walls", "keeping", "water", "out", "of", "the", "bowl", "-", "shaped", "city", "have", "been", "breached", ",", "and", "emergency", "teams", "are", "using", "helicopters", "to", "drop", "1,350", "kg", "-LRB-", "3,000", "lb", "-RRB-", "sandbags", "and", "concrete", "barriers", "into", "the", "gaps", "."}, {"i", "got", "-LRB-", "89.2", "%", "-RRB-", 
"in", "my", "exams"}};
    /**
     * Additional raw input strings (abbreviations such as "fl.", "viz.", "subsp.", "Pvt.", a dash-only
     * line, and single-quoted words). {@link #testPTBTokenizerCoreLabel()} passes this array together
     * with {@code moreGold} to {@code runOnTwoArrays}.
     */
    private final String[] moreInputs = new String[]{"Joseph Someone (fl. 2050\u201375) liked the noble gases, viz. helium, neon, argon, xenon, krypton and radon.", "Sambucus nigra subsp. canadensis and Canis spp. missing", "Jim Jackon & Co. LLC replied.", "Xanadu Pvt. Ltd. replied.", " \u2010 - ___ ", "whenever one goes 'tisk tisk' at something"};
    /**
     * Expected token sequences for {@code moreInputs}; row {@code k} lists the tokens for input
     * {@code k}. Parentheses appear here in their escaped forms "-LRB-" / "-RRB-".
     */
    private final String[][] moreGold = new String[][]{{"Joseph", "Someone", "-LRB-", "fl.", "2050", "--", "75", "-RRB-", "liked", "the", "noble", "gases", ",", "viz.", "helium", ",", "neon", ",", "argon", ",", "xenon", ",", "krypton", "and", "radon", "."}, {"Sambucus", "nigra", "subsp.", "canadensis", "and", "Canis", "spp.", "missing"}, {"Jim", "Jackon", "&", "Co.", "LLC", "replied", "."}, {"Xanadu", "Pvt.", "Ltd.", "replied", "."}, {"\u2010", "-", "___"}, {"whenever", "one", "goes", "`", "tisk", "tisk", "'", "at", "something"}};
    /**
     * Inputs for {@link #testCorp()}: the same sentence ending in the abbreviation "Corp.", first with
     * nothing after it and then followed by trailing spaces. {@code testCorp} selects the input with
     * {@code sent / 2}.
     */
    private final String[] corpInputs = new String[]{"So, too, many analysts predict, will Exxon Corp., Chevron Corp. and Amoco Corp.", "So, too, many analysts predict, will Exxon Corp., Chevron Corp. and Amoco Corp.   "};
    /**
     * Expected tokens for {@link #testCorp()}. Rows are indexed by tokenizer option, NOT by input:
     * {@code testCorp} selects the row with {@code sent % 2}, so row 0 (ending "Corp", ".") is checked
     * against runs using the "strictTreebank3" option and row 1 (ending "Corp.", ".") against runs
     * using the empty option string.
     */
    private final String[][] corpGold = new String[][]{{"So", ",", "too", ",", "many", "analysts", "predict", ",", "will", "Exxon", "Corp.", ",", "Chevron", "Corp.", "and", "Amoco", "Corp", "."}, {"So", ",", "too", ",", "many", "analysts", "predict", ",", "will", "Exxon", "Corp.", ",", "Chevron", "Corp.", "and", "Amoco", "Corp.", "."}};
    /**
     * Two spellings of the same contraction input: bare, and padded with a leading and trailing space.
     * Not referenced in the visible part of this file; presumably consumed by a test further down
     * together with {@code jeOutputs} - verify against the caller.
     */
    private static final String[] jeInputs = new String[]{"it's", " it's "};
    /**
     * One list of {@link Word}s per entry of {@code jeInputs}; both entries are the two words
     * "it" and "'s". Declared as a raw {@code List[]} (generic array creation is not allowed in Java).
     */
    private static final List[] jeOutputs = new List[]{Arrays.asList(new Word("it"), new Word("'s")), Arrays.asList(new Word("it"), new Word("'s"))};
    /**
     * Space-separated token strings (punctuation and quote marks stand as separate tokens).
     * Each entry has a counterpart at the same index in {@code untokOutputs}. Not referenced in the
     * visible part of this file; presumably used by an untokenization test further down - verify.
     */
    private static final String[] untokInputs = new String[]{"London - AFP reported junk .", "Paris - Reuters reported news .", "Sydney - News said - something .", "HEADLINE - New Android phone !", "I did it 'cause I wanted to , and you 'n' me know that .", "He said that `` Luxembourg needs surface - to - air missiles . ''"};
    /**
     * Plain-text counterparts of {@code untokInputs}, index for index: punctuation is attached to the
     * preceding word, the PTB-style quote pair becomes straight double quotes, and
     * "surface - to - air" is rejoined as "surface-to-air", while the free-standing " - " dashes
     * in the first four entries are kept as they are.
     */
    private static final String[] untokOutputs = new String[]{"London - AFP reported junk.", "Paris - Reuters reported news.", "Sydney - News said - something.", "HEADLINE - New Android phone!", "I did it 'cause I wanted to, and you 'n' me know that.", "He said that \"Luxembourg needs surface-to-air missiles.\""};
    private final String[] sgmlInputs = new String[]{"Significant improvements in peak FEV1 were demonstrated with tiotropium/olodaterol 5/2 \u03bcg (p\u2009=\u20090.008), 5/5 \u03bcg (p\u2009=\u20090.012), and 5/10 \u03bcg (p\u2009<\u20090.0001) versus tiotropium monotherapy [51].", "Panasonic brand products are produced by Samsung Electronics Co. Ltd. Sanyo products aren't.", "Oesophageal acid exposure (% time <pH 4) was similar in patients with or without complications (19.2% v 19.3% p>0.05).", "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">", "Hi! <foo bar=\"baz xy = foo !$*) 422\" > <?PITarget PIContent?> <?PITarget PIContent> Hi!", "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<?xml-stylesheet type=\"text/xsl\" href=\"style.xsl\"?>\n<book xml:id=\"simple_book\" xmlns=\"http://docbook.org/ns/docbook\" version=\"5.0\">\n", "<chapter xml:id=\"chapter_1\"><?php echo $a; ?>\n<!-- This is an SGML/XML comment \"Hi!\" -->\n<p> </p> <p-fix / >", "<a href=\"http:\\\\it's\\here\"> <quote orig_author='some \"dude'/> <not sgmltag", "<quote previouspost=\"\n&gt; &gt; I really don't want to process this junk.\n&gt; No one said you did, runny. \u00a0What's got you so scared, anyway?-\n\">", "&lt;b...@canada.com&gt; funky@thedismalscience.net <myemail@where.com>", "<DOC> <DOCID> nyt960102.0516 </DOCID><STORYID cat=w pri=u> A0264 </STORYID> <SLUG fv=ttj-z> ", "<!-- copy from here --> <a href=\"http://strategis.gc.ca/epic/internet/inabc-eac.nsf/en/home\"><img src=\"id-images/ad-220x80_01e.jpg\" alt=\"Aboriginal Business Canada:\nOpening New Doors for Your Business\" width=\"220\" height=\"80\" border=\"0\"></a> <!-- copy to here --> Small ABC Graphic Instructions 1.", "We traveled from No.\nKorea to the U.S.A.\nWhy?"};
    private final String[][] sgmlGold = new String[][]{{"Significant", "improvements", "in", "peak", "FEV1", "were", "demonstrated", "with", "tiotropium/olodaterol", "5/2", "\u03bcg", "-LRB-", "p", "=", "0.008", "-RRB-", ",", "5/5", "\u03bcg", "-LRB-", "p", "=", "0.012", "-RRB-", ",", "and", "5/10", "\u03bcg", "-LRB-", "p", "<", "0.0001", "-RRB-", "versus", "tiotropium", "monotherapy", "-LSB-", "51", "-RSB-", "."}, {"Panasonic", "brand", "products", "are", "produced", "by", "Samsung", "Electronics", "Co.", "Ltd.", ".", "Sanyo", "products", "are", "n't", "."}, {"Oesophageal", "acid", "exposure", "-LRB-", "%", "time", "<", "pH", "4", "-RRB-", "was", "similar", "in", "patients", "with", "or", "without", "complications", "-LRB-", "19.2", "%", "v", "19.3", "%", "p", ">", "0.05", "-RRB-", "."}, {"<!DOCTYPE\u00a0html\u00a0PUBLIC\u00a0\"-//W3C//DTD\u00a0HTML\u00a04.01\u00a0Strict//EN\"\u00a0\"http://www.w3.org/TR/html4/strict.dtd\">"}, {"Hi", "!", "<foo\u00a0bar=\"baz\u00a0xy\u00a0=\u00a0foo\u00a0!$*)\u00a0422\"\u00a0>", "<?PITarget\u00a0PIContent?>", "<?PITarget\u00a0PIContent>", "Hi", "!"}, {"<?xml\u00a0version=\"1.0\"\u00a0encoding=\"UTF-8\"\u00a0?>", "<?xml-stylesheet\u00a0type=\"text/xsl\"\u00a0href=\"style.xsl\"?>", "<book\u00a0xml:id=\"simple_book\"\u00a0xmlns=\"http://docbook.org/ns/docbook\"\u00a0version=\"5.0\">"}, {"<chapter\u00a0xml:id=\"chapter_1\">", "<?php\u00a0echo\u00a0$a;\u00a0?>", "<!--\u00a0This\u00a0is\u00a0an\u00a0SGML/XML\u00a0comment\u00a0\"Hi!\"\u00a0-->", "<p>", "</p>", "<p-fix\u00a0/\u00a0>"}, {"<a\u00a0href=\"http:\\\\it's\\here\">", "<quote\u00a0orig_author='some\u00a0\"dude'/>", "<", "not", "sgmltag"}, {"<quote\u00a0previouspost=\"\u00a0&gt;\u00a0&gt;\u00a0I\u00a0really\u00a0don't\u00a0want\u00a0to\u00a0process\u00a0this\u00a0junk.\u00a0&gt;\u00a0No\u00a0one\u00a0said\u00a0you\u00a0did,\u00a0runny.\u00a0\u00a0What's\u00a0got\u00a0you\u00a0so\u00a0scared,\u00a0anyway?-\u00a0\">"}, {"&lt;b...@canada.com&gt;", "funky@thedismalscience.net", 
"<myemail@where.com>"}, {"<DOC>", "<DOCID>", "nyt960102", ".0516", "</DOCID>", "<STORYID\u00a0cat=w\u00a0pri=u>", "A0264", "</STORYID>", "<SLUG\u00a0fv=ttj-z>"}, {"<!--\u00a0copy\u00a0from\u00a0here\u00a0-->", "<a\u00a0href=\"http://strategis.gc.ca/epic/internet/inabc-eac.nsf/en/home\">", "<img\u00a0src=\"id-images/ad-220x80_01e.jpg\"\u00a0alt=\"Aboriginal\u00a0Business\u00a0Canada:\u00a0Opening\u00a0New\u00a0Doors\u00a0for\u00a0Your\u00a0Business\"\u00a0width=\"220\"\u00a0height=\"80\"\u00a0border=\"0\">", "</a>", "<!--\u00a0copy\u00a0to\u00a0here\u00a0-->", "Small", "ABC", "Graphic", "Instructions", "1", "."}, {"We", "traveled", "from", "No.", "Korea", "to", "the", "U.S.A.", ".", "Why", "?"}};
    private final String[][] sgmlPerLineGold = new String[][]{{"Significant", "improvements", "in", "peak", "FEV1", "were", "demonstrated", "with", "tiotropium/olodaterol", "5/2", "\u03bcg", "-LRB-", "p", "=", "0.008", "-RRB-", ",", "5/5", "\u03bcg", "-LRB-", "p", "=", "0.012", "-RRB-", ",", "and", "5/10", "\u03bcg", "-LRB-", "p", "<", "0.0001", "-RRB-", "versus", "tiotropium", "monotherapy", "-LSB-", "51", "-RSB-", "."}, {"Panasonic", "brand", "products", "are", "produced", "by", "Samsung", "Electronics", "Co.", "Ltd.", ".", "Sanyo", "products", "are", "n't", "."}, {"Oesophageal", "acid", "exposure", "-LRB-", "%", "time", "<", "pH", "4", "-RRB-", "was", "similar", "in", "patients", "with", "or", "without", "complications", "-LRB-", "19.2", "%", "v", "19.3", "%", "p", ">", "0.05", "-RRB-", "."}, {"<!DOCTYPE\u00a0html\u00a0PUBLIC\u00a0\"-//W3C//DTD\u00a0HTML\u00a04.01\u00a0Strict//EN\"\u00a0\"http://www.w3.org/TR/html4/strict.dtd\">"}, {"Hi", "!", "<foo\u00a0bar=\"baz\u00a0xy\u00a0=\u00a0foo\u00a0!$*)\u00a0422\"\u00a0>", "<?PITarget\u00a0PIContent?>", "<?PITarget\u00a0PIContent>", "Hi", "!"}, {"<?xml\u00a0version=\"1.0\"\u00a0encoding=\"UTF-8\"\u00a0?>", "<?xml-stylesheet\u00a0type=\"text/xsl\"\u00a0href=\"style.xsl\"?>", "<book\u00a0xml:id=\"simple_book\"\u00a0xmlns=\"http://docbook.org/ns/docbook\"\u00a0version=\"5.0\">"}, {"<chapter\u00a0xml:id=\"chapter_1\">", "<?php\u00a0echo\u00a0$a;\u00a0?>", "<!--\u00a0This\u00a0is\u00a0an\u00a0SGML/XML\u00a0comment\u00a0\"Hi!\"\u00a0-->", "<p>", "</p>", "<p-fix\u00a0/\u00a0>"}, {"<a\u00a0href=\"http:\\\\it's\\here\">", "<quote\u00a0orig_author='some\u00a0\"dude'/>", "<", "not", "sgmltag"}, {"<", "quote", "previouspost", "=", "''", ">", ">", "I", "really", "do", "n't", "want", "to", "process", "this", "junk", ".", ">", "No", "one", "said", "you", "did", ",", "runny", ".", "What", "'s", "got", "you", "so", "scared", ",", "anyway", "?", "-", "''", ">"}, {"&lt;b...@canada.com&gt;", "funky@thedismalscience.net", 
"<myemail@where.com>"}, {"<DOC>", "<DOCID>", "nyt960102", ".0516", "</DOCID>", "<STORYID\u00a0cat=w\u00a0pri=u>", "A0264", "</STORYID>", "<SLUG\u00a0fv=ttj-z>"}, {"<!--\u00a0copy\u00a0from\u00a0here\u00a0-->", "<a\u00a0href=\"http://strategis.gc.ca/epic/internet/inabc-eac.nsf/en/home\">", "<", "img", "src", "=", "``", "id-images/ad-220x80_01e.jpg", "''", "alt", "=", "``", "Aboriginal", "Business", "Canada", ":", "Opening", "New", "Doors", "for", "Your", "Business", "''", "width", "=", "``", "220", "''", "height", "=", "``", "80", "''", "border", "=", "``", "0", "''", ">", "</a>", "<!--\u00a0copy\u00a0to\u00a0here\u00a0-->", "Small", "ABC", "Graphic", "Instructions", "1", "."}, {"We", "traveled", "from", "No", ".", "Korea", "to", "the", "U.S.A.", "Why", "?"}};
    private final String[] mtInputs = new String[]{"Enter an option [?/Current]:{1}", "for example, {1}http://www.autodesk.com{2}, or a path", "enter {3}@{4} at the Of prompt.", "{1}block name={2}", "1202-03-04 5:32:56 2004-03-04T18:32:56", "20\u00b0C is 68\u00b0F because 0\u2103 is 32\u2109", "a.jpg a-b.jpg a.b.jpg a-b.jpg a_b.jpg a-b-c.jpg 0-1-2.jpg a-b/c-d_e.jpg a-b/c-9a9_9a.jpg\n", "\u00af\\_(\u30c4)_/\u00af", "#hashtag #Az\u0259rbaycanca #m\u00fb\u01c1ae #\u010ce\u0161tina #\u65e5\u672c\u8a9e\u30cf\u30c3\u30b7\u30e5\u30bf\u30b0 #1 #23 #Trump2016 @3 @acl_2016", "Sect. 793 of the Penal Code", "Pls. copy the text within this quote to the subject part of your email and explain wrt. the principles."};
    private final String[][] mtGold = new String[][]{{"Enter", "an", "option", "-LSB-", "?", "/", "Current", "-RSB-", ":", "-LCB-", "1", "-RCB-"}, {"for", "example", ",", "-LCB-", "1", "-RCB-", "http://www.autodesk.com", "-LCB-", "2", "-RCB-", ",", "or", "a", "path"}, {"enter", "-LCB-", "3", "-RCB-", "@", "-LCB-", "4", "-RCB-", "at", "the", "Of", "prompt", "."}, {"-LCB-", "1", "-RCB-", "block", "name", "=", "-LCB-", "2", "-RCB-"}, {"1202-03-04", "5:32:56", "2004-03-04T18:32:56"}, {"20", "\u00b0C", "is", "68", "\u00b0F", "because", "0", "\u2103", "is", "32", "\u2109"}, {"a.jpg", "a-b.jpg", "a.b.jpg", "a-b.jpg", "a_b.jpg", "a-b-c.jpg", "0-1-2.jpg", "a-b/c-d_e.jpg", "a-b/c-9a9_9a.jpg"}, {"\u00af\\_-LRB-\u30c4-RRB-_/\u00af"}, {"#hashtag", "#Az\u0259rbaycanca", "#m\u00fb\u01c1ae", "#\u010ce\u0161tina", "#\u65e5\u672c\u8a9e\u30cf\u30c3\u30b7\u30e5\u30bf\u30b0", "#", "1", "#", "23", "#Trump2016", "@", "3", "@acl_2016"}, {"Sect.", "793", "of", "the", "Penal", "Code"}, {"Pls.", "copy", "the", "text", "within", "this", "quote", "to", "the", "subject", "part", "of", "your", "email", "and", "explain", "wrt.", "the", "principles", "."}};
    private final String[] emojiInputs = new String[]{"\ud83d\ude09\ud83d\ude00\ud83d\ude02\ud83d\ude0d\ud83e\udd21\ud83c\udde6\ud83c\uddfa\ud83c\udf7a", "\ud83d\udc66\ud83c\udffb\ud83d\udc67\ud83c\udfff", "\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d\udc67\ud83e\uddc0", "\u00ae\u203c\u2198\u231a\u2328\u23f0\u2620\u26bd\u2705\u2757", "\u26a0\u26a0\ufe0f\u26a0\ufe0e\u2764\ufe0f\u2764", "\ud83d\udc69\u200d\u2696\ud83d\udc68\ud83c\udfff\u200d\ud83c\udfa4"};
    private final String[][] emojiGold = new String[][]{{"\ud83d\ude09", "\ud83d\ude00", "\ud83d\ude02", "\ud83d\ude0d", "\ud83e\udd21", "\ud83c\udde6\ud83c\uddfa", "\ud83c\udf7a"}, {"\ud83d\udc66\ud83c\udffb", "\ud83d\udc67\ud83c\udfff"}, {"\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d\udc67", "\ud83e\uddc0"}, {"\u00ae", "\u203c", "\u2198", "\u231a", "\u2328", "\u23f0", "\u2620", "\u26bd", "\u2705", "\u2757"}, {"\u26a0", "\u26a0\ufe0f", "\u26a0\ufe0e", "\u2764\ufe0f", "\u2764"}, {"\ud83d\udc69\u200d\u2696", "\ud83d\udc68\ud83c\udfff\u200d\ud83c\udfa4"}};
    private final String[] tweetInputs = new String[]{"Happy #StarWars week! Ever wonder what was going on with Uncle Owen's dad? Check out .@WHMPodcast's rant on Ep2 https://t.co/9iJMMkAokT", "RT @BiIlionaires: #TheForceAwakens inspired vehicles are a big hit in LA.", "\u201c@people: A woman built the perfect #StarWars costume for her dog https://t.co/VJRQwNZB0t https://t.co/nmNROB7diR\u201d@guacomole123", "I would like to get a 13\" MB Air with an i7@1,7GHz", "So you have audio track 1 @145bpm and global project tempo is now 145bpm", "I know that the inside of the mall opens @5am.", "I have ordered Bose Headfones worth 300USD. Not 156bpmt. FCPX MP4 playback choppy on 5k iMac", "RT @Suns: What happens when you combine @50cent, #StarWars and introductions at an @NBA game? This.", "RT @ShirleyHoman481: '#StarWars' Premiere Street Closures Are \u201cBigger Than the Oscars\": Four blocks of Hollywood Blvd. -- from Highland\u2026 ht\u2026", "In 2009, Wiesel criticized the Vatican for lifting the excommunication of controversial bishop Richard Williamson, a member of the Society of Saint Pius X.", "RM460.35 million"};
    private final String[][] tweetGold = new String[][]{{"Happy", "#StarWars", "week", "!", "Ever", "wonder", "what", "was", "going", "on", "with", "Uncle", "Owen", "'s", "dad", "?", "Check", "out", ".@WHMPodcast", "'s", "rant", "on", "Ep2", "https://t.co/9iJMMkAokT"}, {"RT", "@BiIlionaires", ":", "#TheForceAwakens", "inspired", "vehicles", "are", "a", "big", "hit", "in", "LA", "."}, {"``", "@people", ":", "A", "woman", "built", "the", "perfect", "#StarWars", "costume", "for", "her", "dog", "https://t.co/VJRQwNZB0t", "https://t.co/nmNROB7diR", "''", "@guacomole123"}, {"I", "would", "like", "to", "get", "a", "13", "''", "MB", "Air", "with", "an", "i7", "@", "1,7", "GHz"}, {"So", "you", "have", "audio", "track", "1", "@", "145", "bpm", "and", "global", "project", "tempo", "is", "now", "145", "bpm"}, {"I", "know", "that", "the", "inside", "of", "the", "mall", "opens", "@", "5", "am", "."}, {"I", "have", "ordered", "Bose", "Headfones", "worth", "300", "USD", ".", "Not", "156bpmt", ".", "FCPX", "MP4", "playback", "choppy", "on", "5k", "iMac"}, {"RT", "@Suns", ":", "What", "happens", "when", "you", "combine", "@50cent", ",", "#StarWars", "and", "introductions", "at", "an", "@NBA", "game", "?", "This", "."}, {"RT", "@ShirleyHoman481", ":", "'", "#StarWars", "'", "Premiere", "Street", "Closures", "Are", "``", "Bigger", "Than", "the", "Oscars", "''", ":", "Four", "blocks", "of", "Hollywood", "Blvd.", "--", "from", "Highland", "...", "ht", "..."}, {"In", "2009", ",", "Wiesel", "criticized", "the", "Vatican", "for", "lifting", "the", "excommunication", "of", "controversial", "bishop", "Richard", "Williamson", ",", "a", "member", "of", "the", "Society", "of", "Saint", "Pius", "X."}, {"RM", "460.35", "million"}};
    private final String[] hyphenInputs = new String[]{"\ufeffThis is hy\u00adphen\u00adated and non-breaking spaces: 3\u202f456\u202f473.89", "\u0093I need \u008080.\u0094 \u0082And \u0085 dollars.\u0092", "Charles Howard ''Charlie\u2019' Bridges and Helen Hoyle Bridges", "All energy markets close at 1 p.m. except Palo Verde electricity futures and options, closing at\n12:55.; Palladium and copper markets close at 1 p.m.; Silver markets close at 1:05 p.m.", "BHP is `` making the right noises.''", "``There's a saying nowadays,'' he said. ```The more you owe, the longer you live.' It means the mafia won't come until we have money.''\n", "\"Whereas strategic considerations have to be based on 'real- politick' and harsh facts,\" Saleem said.", "F*ck, cr-p, I met Uchenna Nnobuko yesterday.", "I\u00b4m wrong and she\u00b4s right.", "Left Duxbury Ave. and read para. 13.8 and attached 3802.doc.", "Phone:86-0832-2115188"};
    private final String[][] hyphenGold = new String[][]{{"This", "is", "hyphenated", "and", "non-breaking", "spaces", ":", "3456473.89"}, {"``", "I", "need", "\u20ac", "80", ".", "''", "`", "And", "...", "dollars", ".", "'"}, {"Charles", "Howard", "``", "Charlie", "''", "Bridges", "and", "Helen", "Hoyle", "Bridges"}, {"All", "energy", "markets", "close", "at", "1", "p.m.", "except", "Palo", "Verde", "electricity", "futures", "and", "options", ",", "closing", "at", "12:55", ".", ";", "Palladium", "and", "copper", "markets", "close", "at", "1", "p.m.", ";", "Silver", "markets", "close", "at", "1:05", "p.m."}, {"BHP", "is", "``", "making", "the", "right", "noises", ".", "''"}, {"``", "There", "'s", "a", "saying", "nowadays", ",", "''", "he", "said", ".", "``", "`", "The", "more", "you", "owe", ",", "the", "longer", "you", "live", ".", "'", "It", "means", "the", "mafia", "wo", "n't", "come", "until", "we", "have", "money", ".", "''"}, {"``", "Whereas", "strategic", "considerations", "have", "to", "be", "based", "on", "`", "real", "-", "politick", "'", "and", "harsh", "facts", ",", "''", "Saleem", "said", "."}, {"F*ck", ",", "cr-p", ",", "I", "met", "Uchenna", "Nnobuko", "yesterday", "."}, {"I", "'m", "wrong", "and", "she", "'s", "right", "."}, {"Left", "Duxbury", "Ave.", "and", "read", "para.", "13.8", "and", "attached", "3802.doc", "."}, {"Phone", ":", "86-0832-2115188"}};

    /**
     * Tokenizes every {@code ptbInputs} string with the default {@link Word}
     * factory and compares the token values with {@code ptbGold}.
     */
    @Test
    public void testPTBTokenizerWord() {
        final TokenizerFactory<Word> wordFactory = PTBTokenizer.factory();
        runOnTwoArrays(wordFactory, this.ptbInputs, this.ptbGold);
    }

    /**
     * Tokenizes every {@code moreInputs} string with the default
     * {@link CoreLabel} factory and compares the token values with
     * {@code moreGold}.
     */
    @Test
    public void testPTBTokenizerCoreLabel() {
        final TokenizerFactory<CoreLabel> coreLabelFactory = PTBTokenizer.coreLabelFactory();
        runOnTwoArrays(coreLabelFactory, this.moreInputs, this.moreGold);
    }

    /**
     * Tokenizes each {@code corpInputs} string under two option settings
     * ({@code "strictTreebank3"} and the empty option string) and compares the
     * produced words with {@code corpGold}.
     *
     * <p>The loop index encodes both choices: {@code sent / 2} selects the input
     * string, while {@code sent % 2} selects the options and the matching gold row.
     * On a token-count mismatch the gold row and the actual tokenization are
     * printed to standard output before the count assertion fails.
     */
    @Test
    public void testCorp() {
        Assert.assertEquals(2, this.corpInputs.length);
        Assert.assertEquals(2, this.corpGold.length);
        for (int sent = 0; sent < 4; ++sent) {
            String input = this.corpInputs[sent / 2];
            String options = sent % 2 == 0 ? "strictTreebank3" : "";
            String[] gold = this.corpGold[sent % 2];
            PTBTokenizer<CoreLabel> ptbTokenizer = new PTBTokenizer<CoreLabel>(new StringReader(input), new CoreLabelTokenFactory(), options);
            int i = 0;
            while (ptbTokenizer.hasNext()) {
                CoreLabel w = ptbTokenizer.next();
                // Surplus tokens are only counted here; the token-count assertion
                // below reports them after the diagnostic output has been printed.
                if (i < gold.length) {
                    Assert.assertEquals("PTBTokenizer problem", gold[i], w.word());
                }
                ++i;
            }
            if (i != gold.length) {
                System.out.print("Gold: ");
                System.out.println(Arrays.toString(gold));
                List<CoreLabel> tokens = new PTBTokenizer<CoreLabel>(new StringReader(input), new CoreLabelTokenFactory(), options).tokenize();
                System.out.print("Guess: ");
                System.out.println(SentenceUtils.listToString(tokens));
                System.out.flush();
            }
            // Expected value first, so that a failure message reads correctly.
            Assert.assertEquals("PTBTokenizer num tokens problem", gold.length, i);
        }
    }

    /**
     * Tokenizes each {@code jeInputs} string with a default tokenizer and checks
     * that the whole token list equals the corresponding {@code jeOutputs} entry.
     */
    @Test
    public void testJacobEisensteinApostropheCase() {
        Assert.assertEquals(jeInputs.length, jeOutputs.length);
        int index = 0;
        while (index < jeInputs.length) {
            PTBTokenizer<Word> wordTokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(jeInputs[index]));
            List<Word> produced = wordTokenizer.tokenize();
            Assert.assertEquals((Object) jeOutputs[index], produced);
            index++;
        }
    }

    /**
     * Checks that {@code PTBTokenizer.ptb2Text} turns each {@code untokInputs}
     * string into the corresponding {@code untokOutputs} string.
     */
    @Test
    public void testUntok() {
        // A JUnit assertion is used rather than the Java assert keyword, which is
        // skipped unless the JVM runs with assertions enabled.
        Assert.assertEquals("untok test data arrays don't match in length", untokInputs.length, untokOutputs.length);
        for (int i = 0; i < untokInputs.length; ++i) {
            Assert.assertEquals("untok gave the wrong result", untokOutputs[i], PTBTokenizer.ptb2Text(untokInputs[i]));
        }
    }

    /**
     * Tokenizes a sentence padded with irregular whitespace using the tokenizer
     * built by {@code newPTBTokenizer(reader, false, true)} and checks the
     * before/after text, character offsets and original text of the tokens.
     *
     * <p>It then verifies that concatenating the first token's before text with
     * every token's original text and after text reproduces the input exactly,
     * and that each token's after text equals the next token's before text.
     */
    @Test
    public void testInvertible() {
        String text = "  This     is     a      colourful sentence.    ";
        PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(text), false, true);
        List<CoreLabel> tokens = tokenizer.tokenize();
        Assert.assertEquals(6, tokens.size());

        CoreLabel first = tokens.get(0);
        Assert.assertEquals("  ", first.get(CoreAnnotations.BeforeAnnotation.class));
        Assert.assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
        // The Integer annotations are unboxed explicitly so that the call resolves to
        // assertEquals(String, long, long) without ambiguity.
        Assert.assertEquals("Wrong begin char offset", 2, first.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
        Assert.assertEquals("Wrong end char offset", 6, first.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());
        Assert.assertEquals("This", first.get(CoreAnnotations.OriginalTextAnnotation.class));

        Assert.assertEquals("     ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
        Assert.assertEquals("colourful", tokens.get(3).get(CoreAnnotations.TextAnnotation.class));
        Assert.assertEquals("colourful", tokens.get(3).get(CoreAnnotations.OriginalTextAnnotation.class));
        Assert.assertEquals("", tokens.get(4).after());
        Assert.assertEquals("", tokens.get(5).before());
        Assert.assertEquals("    ", tokens.get(5).get(CoreAnnotations.AfterAnnotation.class));

        // Rebuild the input from the token annotations.
        StringBuilder result = new StringBuilder();
        result.append(first.get(CoreAnnotations.BeforeAnnotation.class));
        for (CoreLabel token : tokens) {
            result.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
            String after = token.get(CoreAnnotations.AfterAnnotation.class);
            if (after != null) {
                result.append(after);
            }
        }
        Assert.assertEquals(text, result.toString());

        // Adjacent tokens must agree on the text between them.
        for (int i = 0; i < tokens.size() - 1; ++i) {
            Assert.assertEquals(tokens.get(i).get(CoreAnnotations.AfterAnnotation.class), tokens.get(i + 1).get(CoreAnnotations.BeforeAnnotation.class));
        }
    }

    /**
     * Tokenizes {@code sgmlInputs} with the {@code "invertible"} option, compares
     * the token values with {@code sgmlGold}, then checks that the original text
     * can be rebuilt from the tokens.
     */
    @Test
    public void testPTBTokenizerSGML() {
        final TokenizerFactory<CoreLabel> invertibleFactory = PTBTokenizer.coreLabelFactory("invertible");
        runOnTwoArrays(invertibleFactory, this.sgmlInputs, this.sgmlGold);
        runAgainstOrig(invertibleFactory, this.sgmlInputs);
    }

    /**
     * Same inputs as the SGML test, but tokenized with the
     * {@code "tokenizePerLine=true,invertible"} options and compared with
     * {@code sgmlPerLineGold}; the original text must also be rebuildable.
     */
    @Test
    public void testPTBTokenizerTokenizePerLineSGML() {
        final TokenizerFactory<CoreLabel> perLineFactory = PTBTokenizer.coreLabelFactory("tokenizePerLine=true,invertible");
        runOnTwoArrays(perLineFactory, this.sgmlInputs, this.sgmlPerLineGold);
        runAgainstOrig(perLineFactory, this.sgmlInputs);
    }

    /**
     * Tokenizes {@code ptbInputs} with the {@code "splitHyphenated=true,invertible"}
     * options, compares the token values with {@code ptbGoldSplitHyphenated}, then
     * checks that the original text can be rebuilt from the tokens.
     */
    @Test
    public void testPTBTokenizerTokenizeSplitHyphens() {
        final TokenizerFactory<CoreLabel> splitHyphenFactory = PTBTokenizer.coreLabelFactory("splitHyphenated=true,invertible");
        runOnTwoArrays(splitHyphenFactory, this.ptbInputs, this.ptbGoldSplitHyphenated);
        runAgainstOrig(splitHyphenFactory, this.ptbInputs);
    }

    /**
     * Tokenizes one sentence containing fractions and an abbreviation under two
     * option sets, {@code "invertible=true"} and
     * {@code "strictTreebank3=true,invertible=true"}, each with its own expected
     * token values, and checks that both tokenizations can rebuild the input.
     */
    @Test
    public void testFractions() {
        final String[] inputs = {"5-1/4 plus 2 3/16 = 7\u00a07/16 in the U.S.S.R. Why not?"};
        final String[][] expectedNormal = {{"5-1/4", "plus", "2\u00a03/16", "=", "7\u00a07/16", "in", "the", "U.S.S.R.", ".", "Why", "not", "?"}};
        final String[][] expectedStrict = {{"5-1/4", "plus", "2", "3/16", "=", "7", "7/16", "in", "the", "U.S.S.R", ".", "Why", "not", "?"}};

        final TokenizerFactory<CoreLabel> normalFactory = PTBTokenizer.coreLabelFactory("invertible=true");
        final TokenizerFactory<CoreLabel> strictFactory = PTBTokenizer.coreLabelFactory("strictTreebank3=true,invertible=true");

        // Token values first, for both option sets ...
        runOnTwoArrays(normalFactory, inputs, expectedNormal);
        runOnTwoArrays(strictFactory, inputs, expectedStrict);
        // ... then the round trip back to the original text.
        runAgainstOrig(normalFactory, inputs);
        runAgainstOrig(strictFactory, inputs);
    }

    /**
     * Tokenizes each input with a tokenizer from {@code tokFactory} and compares
     * the token values, in order, with the matching row of {@code desired}.
     *
     * <p>Fails on the first wrong token, on the first surplus token, or on the
     * first expected token that is missing.
     *
     * @param tokFactory factory used to create one tokenizer per input string
     * @param inputs     the strings to tokenize
     * @param desired    expected token values, one row per input string
     */
    private static <T extends Label> void runOnTwoArrays(TokenizerFactory<T> tokFactory, String[] inputs, String[][] desired) {
        Assert.assertEquals("Test data arrays don't match in length", inputs.length, desired.length);
        for (int sent = 0; sent < inputs.length; ++sent) {
            final String[] expected = desired[sent];
            final Tokenizer<T> tokenizer = tokFactory.getTokenizer(new StringReader(inputs[sent]));
            int seen = 0;
            while (tokenizer.hasNext()) {
                Label actual = tokenizer.next();
                if (seen >= expected.length) {
                    Assert.fail("PTBTokenizer generated too many tokens for sentence " + sent + "! Added " + actual.value());
                }
                Assert.assertEquals("PTBTokenizer got wrong token", expected[seen], actual.value());
                ++seen;
            }
            // The tokenizer ran dry before every expected token was matched.
            if (seen < expected.length) {
                Assert.fail("PTBTokenizer generated too few tokens for sentence " + sent + "! Missing " + expected[seen]);
            }
        }
    }

    /**
     * Tokenizes each input with a tokenizer from {@code tokFactory} and compares
     * the token values, in order, with the matching row of {@code desired}. For
     * every matched token it also checks that the span between its begin and end
     * positions is as long as the expected token string.
     *
     * <p>Fails on the first wrong token, wrong span length, surplus token, or
     * missing expected token.
     *
     * @param tokFactory factory used to create one tokenizer per input string
     * @param inputs     the strings to tokenize
     * @param desired    expected token values, one row per input string
     */
    private static <T extends CoreLabel> void runOnTwoArraysWithOffsets(TokenizerFactory<T> tokFactory, String[] inputs, String[][] desired) {
        Assert.assertEquals("Test data arrays don't match in length", inputs.length, desired.length);
        for (int sent = 0; sent < inputs.length; ++sent) {
            final String[] expected = desired[sent];
            final Tokenizer<T> tokenizer = tokFactory.getTokenizer(new StringReader(inputs[sent]));
            int seen = 0;
            while (tokenizer.hasNext()) {
                CoreLabel actual = tokenizer.next();
                if (seen >= expected.length) {
                    Assert.fail("PTBTokenizer generated too many tokens for sentence " + sent + "! Added " + actual.value());
                }
                final String want = expected[seen];
                Assert.assertEquals("PTBTokenizer got wrong token", want, actual.value());
                final int spanLength = actual.endPosition() - actual.beginPosition();
                Assert.assertEquals("PTBTokenizer charOffsets wrong for " + want, want.length(), spanLength);
                ++seen;
            }
            // The tokenizer ran dry before every expected token was matched.
            if (seen < expected.length) {
                Assert.fail("PTBTokenizer generated too few tokens for sentence " + sent + "! Missing " + expected[seen]);
            }
        }
    }

    /**
     * Checks that each input string can be rebuilt exactly from the text
     * annotations of its tokens.
     *
     * <p>For every input the tokens are read one by one, their text is collected
     * through {@code appendTextFrom}, and the result is compared with the input.
     *
     * @param tokFactory factory used to create one tokenizer per input string
     * @param inputs     the strings to tokenize and rebuild
     */
    private static <T extends CoreLabel> void runAgainstOrig(TokenizerFactory<T> tokFactory, String[] inputs) {
        for (String input : inputs) {
            StringBuilder origText = new StringBuilder();
            CoreLabel last = null;
            Tokenizer<T> tok = tokFactory.getTokenizer(new StringReader(input));
            while (tok.hasNext()) {
                // Each token is appended one step late: only after hasNext() has been
                // called again. The first pass appends nothing because last is null.
                // NOTE(review): presumably a token's AfterAnnotation is only final once
                // the tokenizer has looked ahead - confirm before reordering these calls.
                PTBTokenizerTest.appendTextFrom(origText, last);
                last = (CoreLabel)tok.next();
            }
            // Append the final token, which the loop above left pending.
            PTBTokenizerTest.appendTextFrom(origText, last);
            Assert.assertEquals((String)"PTBTokenizer has wrong originalText", (Object)input, (Object)origText.toString());
        }
    }

    /**
     * Appends the original text and the after text of {@code token} to
     * {@code origText}. When the buffer is still empty, the token's before text is
     * written first. A {@code null} token leaves the buffer untouched.
     *
     * @param origText buffer that accumulates the rebuilt text
     * @param token    token whose annotations are appended; may be {@code null}
     */
    private static <T extends CoreLabel> void appendTextFrom(StringBuilder origText, T token) {
        if (token == null) {
            return;
        }
        final boolean bufferEmpty = origText.length() == 0;
        if (bufferEmpty) {
            String leading = token.get(CoreAnnotations.BeforeAnnotation.class);
            origText.append(leading);
        }
        String original = token.get(CoreAnnotations.OriginalTextAnnotation.class);
        String trailing = token.get(CoreAnnotations.AfterAnnotation.class);
        origText.append(original);
        origText.append(trailing);
    }

    /**
     * Tokenizes two German sentences with the tokenizer factory supplied by
     * {@link NegraPennLanguagePack} and compares the token values with the
     * expected rows below.
     */
    @Test
    public void testPTBTokenizerGerman() {
        final String[] sentences = {"Das TV-Duell von Kanzlerin Merkel und SPD-Herausforderer Steinbr\u00fcck war eher lahm - k\u00f6nnen es die Spitzenleute der kleinen Parteien besser? ", "Die erquickende Sicherheit und Festigkeit in der Bewegung, den Vorrat von Kraft, kann ja die Versammlung nicht f\u00fchlen, h\u00f6ren will sie sie nicht, also mu\u00df sie sie sehen; und die sehe man einmal in einem Paar spitzen Schultern, zylindrischen Schenkeln, oder leeren \u00c4rmeln, oder lattenf\u00f6rmigen Beinen."};
        final String[][] expectedTokens = {{"Das", "TV-Duell", "von", "Kanzlerin", "Merkel", "und", "SPD-Herausforderer", "Steinbr\u00fcck", "war", "eher", "lahm", "-", "k\u00f6nnen", "es", "die", "Spitzenleute", "der", "kleinen", "Parteien", "besser", "?"}, {"Die", "erquickende", "Sicherheit", "und", "Festigkeit", "in", "der", "Bewegung", ",", "den", "Vorrat", "von", "Kraft", ",", "kann", "ja", "die", "Versammlung", "nicht", "f\u00fchlen", ",", "h\u00f6ren", "will", "sie", "sie", "nicht", ",", "also", "mu\u00df", "sie", "sie", "sehen", ";", "und", "die", "sehe", "man", "einmal", "in", "einem", "Paar", "spitzen", "Schultern", ",", "zylindrischen", "Schenkeln", ",", "oder", "leeren", "\u00c4rmeln", ",", "oder", "lattenf\u00f6rmigen", "Beinen", "."}};
        final NegraPennLanguagePack languagePack = new NegraPennLanguagePack();
        final TokenizerFactory<? extends HasWord> germanFactory = languagePack.getTokenizerFactory();
        runOnTwoArrays(germanFactory, sentences, expectedTokens);
    }

    /**
     * Tokenizes every {@code mtInputs} string with the default {@link Word}
     * factory and compares the token values with {@code mtGold}.
     */
    @Test
    public void testPTBTokenizerMT() {
        final TokenizerFactory<Word> wordFactory = PTBTokenizer.factory();
        runOnTwoArrays(wordFactory, this.mtInputs, this.mtGold);
    }

    /**
     * Tokenizes {@code emojiInputs} with the {@code "invertible"} option, checking
     * token values and span lengths against {@code emojiGold} and that the
     * original text can be rebuilt from the tokens.
     *
     * <p>The last two assertions check {@code String.codePointCount} on two
     * two-char literals: one is counted as a single code point, the other as two.
     */
    @Test
    public void testEmoji() {
        final TokenizerFactory<CoreLabel> invertibleFactory = PTBTokenizer.coreLabelFactory("invertible");
        runOnTwoArraysWithOffsets(invertibleFactory, this.emojiInputs, this.emojiGold);
        runAgainstOrig(invertibleFactory, this.emojiInputs);

        final int pairCodePoints = "\ud83d\udcf7".codePointCount(0, 2);
        Assert.assertEquals(1, pairCodePoints);
        final int sequenceCodePoints = "\u2764\ufe0f".codePointCount(0, 2);
        Assert.assertEquals(2, sequenceCodePoints);
    }

    /**
     * Tokenizes {@code tweetInputs} with the {@code "invertible"} option, compares
     * the token values with {@code tweetGold}, then checks that the original text
     * can be rebuilt from the tokens.
     */
    @Test
    public void testTweets() {
        final TokenizerFactory<CoreLabel> invertibleFactory = PTBTokenizer.coreLabelFactory("invertible");
        runOnTwoArrays(invertibleFactory, this.tweetInputs, this.tweetGold);
        runAgainstOrig(invertibleFactory, this.tweetInputs);
    }

    /**
     * Tokenizes {@code hyphenInputs} with the
     * {@code "normalizeCurrency=false,invertible"} options, compares the token
     * values with {@code hyphenGold}, then checks that the original text can be
     * rebuilt from the tokens.
     */
    @Test
    public void testHyphensQuoteAndBOM() {
        final TokenizerFactory<CoreLabel> rawCurrencyFactory = PTBTokenizer.coreLabelFactory("normalizeCurrency=false,invertible");
        runOnTwoArrays(rawCurrencyFactory, this.hyphenInputs, this.hyphenGold);
        runAgainstOrig(rawCurrencyFactory, this.hyphenInputs);
    }
}

