From 6ceb39031d9c56100b989fe08d1be0bfee3092dc Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Wed, 3 Jul 2019 13:45:22 -0500 Subject: [PATCH 01/16] Add a property and feature allowing NER users to discard NER labels they are not interested in from a model. Added a unit test to validate. --- big-data-utils/pom.xml | 4 +- chunker/pom.xml | 8 +-- commasrl/pom.xml | 20 +++--- core-utilities/pom.xml | 2 +- corpusreaders/pom.xml | 6 +- curator/pom.xml | 4 +- dataless-classifier/pom.xml | 6 +- depparse/pom.xml | 12 ++-- edison/pom.xml | 8 +-- external/clausie/pom.xml | 4 +- external/external-commons/pom.xml | 6 +- external/path-lstm/pom.xml | 6 +- external/stanford_3.3.1/pom.xml | 6 +- external/stanford_3.8.0/pom.xml | 6 +- inference/pom.xml | 4 +- lbjava-nlp-tools/pom.xml | 4 +- lemmatizer/pom.xml | 6 +- md/pom.xml | 14 ++--- ner/pom.xml | 12 ++-- .../cs/cogcomp/ner/LbjTagger/Parameters.java | 5 +- .../ner/LbjTagger/ParametersForLbjCode.java | 4 ++ .../illinois/cs/cogcomp/ner/NERAnnotator.java | 8 ++- .../ner/config/NerBaseConfigurator.java | 5 +- .../cs/cogcomp/ner/NerLabelPruningTest.java | 57 +++++++++++++++++ .../edu/illinois/cs/cogcomp/ner/NerTest.java | 61 +++++++++++++++++++ pipeline-client/pom.xml | 4 +- pipeline/pom.xml | 38 ++++++------ pom.xml | 2 +- pos/pom.xml | 2 +- prepsrl/pom.xml | 14 ++--- quantifier/pom.xml | 12 ++-- question-type/pom.xml | 10 +-- relation-extraction/pom.xml | 18 +++--- similarity/pom.xml | 6 +- temporal-normalizer/pom.xml | 14 ++--- tokenizer/pom.xml | 8 +-- transliteration/pom.xml | 6 +- verbsense/pom.xml | 16 ++--- 38 files changed, 281 insertions(+), 147 deletions(-) create mode 100644 ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java diff --git a/big-data-utils/pom.xml b/big-data-utils/pom.xml index 5a5cfb129..1baee004b 100644 --- a/big-data-utils/pom.xml +++ b/big-data-utils/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -23,7 +23,7 @@ 
edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 org.xeustechnologies.google-api diff --git a/chunker/pom.xml b/chunker/pom.xml index 65fa16ffb..367b6c1bc 100644 --- a/chunker/pom.xml +++ b/chunker/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 @@ -24,12 +24,12 @@ edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp diff --git a/commasrl/pom.xml b/commasrl/pom.xml index 46ecefd7b..af3ac75aa 100644 --- a/commasrl/pom.xml +++ b/commasrl/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -35,48 +35,48 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 true edu.illinois.cs.cogcomp illinois-curator - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-inference - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-ner - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-chunker - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp diff --git a/core-utilities/pom.xml b/core-utilities/pom.xml index 59a39f359..464336dfc 100644 --- a/core-utilities/pom.xml +++ b/core-utilities/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-core-utilities diff --git a/corpusreaders/pom.xml b/corpusreaders/pom.xml index b78860c78..d36356c42 100644 --- a/corpusreaders/pom.xml +++ b/corpusreaders/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-corpusreaders @@ -15,12 +15,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 
4.0.16 org.slf4j diff --git a/curator/pom.xml b/curator/pom.xml index 8263a3901..3a091244e 100644 --- a/curator/pom.xml +++ b/curator/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-curator @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 diff --git a/dataless-classifier/pom.xml b/dataless-classifier/pom.xml index e647ccd21..8cc236a9e 100644 --- a/dataless-classifier/pom.xml +++ b/dataless-classifier/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -21,12 +21,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 org.slf4j diff --git a/depparse/pom.xml b/depparse/pom.xml index cc573accf..039036e40 100644 --- a/depparse/pom.xml +++ b/depparse/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-depparse @@ -16,27 +16,27 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-chunker - 4.0.15 + 4.0.16 diff --git a/edison/pom.xml b/edison/pom.xml index 59b33a20e..1c60c666d 100644 --- a/edison/pom.xml +++ b/edison/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-edison @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 @@ -80,13 +80,13 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-curator - 4.0.15 + 4.0.16 test diff --git a/external/clausie/pom.xml b/external/clausie/pom.xml index b0b1fbc72..55b8899a1 100644 --- a/external/clausie/pom.xml +++ b/external/clausie/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 ../../pom.xml @@ -24,7 
+24,7 @@ edu.illinois.cs.cogcomp external-commons - 4.0.15 + 4.0.16 org.slf4j diff --git a/external/external-commons/pom.xml b/external/external-commons/pom.xml index bc9f5b1d2..452b8dd52 100644 --- a/external/external-commons/pom.xml +++ b/external/external-commons/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 org.cogcomp diff --git a/external/path-lstm/pom.xml b/external/path-lstm/pom.xml index 661175121..5264be26c 100644 --- a/external/path-lstm/pom.xml +++ b/external/path-lstm/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp external-commons - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 org.cogcomp diff --git a/external/stanford_3.3.1/pom.xml b/external/stanford_3.3.1/pom.xml index 30aef7555..75091903e 100644 --- a/external/stanford_3.3.1/pom.xml +++ b/external/stanford_3.3.1/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 ../../pom.xml @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 @@ -36,7 +36,7 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 diff --git a/external/stanford_3.8.0/pom.xml b/external/stanford_3.8.0/pom.xml index d3ebcd45f..0af867a59 100644 --- a/external/stanford_3.8.0/pom.xml +++ b/external/stanford_3.8.0/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp external-commons - 4.0.15 + 4.0.16 org.slf4j diff --git a/inference/pom.xml b/inference/pom.xml index 4aed0a26c..5e1a770ef 100644 --- a/inference/pom.xml +++ b/inference/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp 
edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 jar @@ -22,7 +22,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp diff --git a/lbjava-nlp-tools/pom.xml b/lbjava-nlp-tools/pom.xml index 712d456c8..b070f6bbb 100644 --- a/lbjava-nlp-tools/pom.xml +++ b/lbjava-nlp-tools/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 LBJava-NLP-tools @@ -30,7 +30,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 org.slf4j diff --git a/lemmatizer/pom.xml b/lemmatizer/pom.xml index be3fee001..cf9f5843d 100644 --- a/lemmatizer/pom.xml +++ b/lemmatizer/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-lemmatizer @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 edu.stanford.nlp diff --git a/md/pom.xml b/md/pom.xml index 93df57aab..a6f7891e8 100644 --- a/md/pom.xml +++ b/md/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -25,32 +25,32 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-ner - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.15 + 4.0.16 org.slf4j diff --git a/ner/pom.xml b/ner/pom.xml index 0dbd48a92..7fdc5890e 100644 --- a/ner/pom.xml +++ b/ner/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-ner @@ -23,12 +23,12 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 org.cogcomp @@ -39,12 +39,12 @@ edu.illinois.cs.cogcomp LBJava - 1.3.1 + 1.3.2 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.15 + 4.0.16 org.slf4j @@ -90,7 +90,7 @@ 
edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.1 + 1.3.2 ${project.basedir}/src/main/lbj/LbjTagger.lbj diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java index 491968c6b..2267748ff 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java @@ -136,13 +136,16 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean Language lang = Language.getLanguageByCode(rm.getString("language")); param.language = lang; } - if (rm.containsKey("labelsToAnonymizeInEvaluation")) { String labelsToAnonymizeInEvaluation = rm.getString("labelsToAnonymizeInEvaluation"); param.labelsToAnonymizeInEvaluation = new Vector<>(Arrays.asList(labelsToAnonymizeInEvaluation.split(" "))); } + if (rm.containsKey(NerBaseConfigurator.LABELS_TO_KEEP)) { + String labelsToKeep = rm.getString(NerBaseConfigurator.LABELS_TO_KEEP); + param.labelsToKeep = new ArrayList(Arrays.asList(labelsToKeep.split(" "))); + } if (rm.containsKey("labelsToIgnoreInEvaluation")) { String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation"); param.labelsToIgnoreInEvaluation = diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java index 6446f0682..024c51346 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java @@ -13,6 +13,7 @@ import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers; import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator; +import java.util.ArrayList; import java.util.HashMap; import java.util.Vector; @@ -35,6 +36,9 @@ public enum TokenizationScheme { // will be initialized to something like 
{"PER","ORG","LOC","MISC"}; */ public String[] labelTypes = {"PER", "ORG", "LOC", "MISC"}; + /** labels of interest if a subset of all labels, all other labels are ignored. */ + public ArrayList labelsToKeep = null; + /** Labels to ignore when evaluating model performance, e.g. "MISC" for the MUC7 dataset. */ public Vector labelsToIgnoreInEvaluation = null; diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java index 5b3e81498..081f3de61 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java @@ -121,7 +121,13 @@ public void initialize(ResourceManager nerRm) { // load the models. synchronized (LOADING_MODELS) { ModelLoader.load(nerRm, viewName, false, this.params); - } + } + if (this.params.labelsToKeep != null) { + logger.info("Kept label : "+this.params.labelsToKeep); + this.params.taggerLevel1.pruneUnusedLabels(this.params.labelsToKeep); + if (this.params.taggerLevel2 != null) + this.params.taggerLevel2.pruneUnusedLabels(this.params.labelsToKeep); + } } /** diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java index 77eab459c..02b2b3b06 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java @@ -44,7 +44,10 @@ public class NerBaseConfigurator extends AnnotatorConfigurator { // public final static String TOKENIZATION_SCHEME = "tokenizationScheme"; public final static String FORCE_NEW_SENTENCE_ON_LINE_BREAKS = "forceNewSentenceOnLineBreaks"; public final static String LABEL_TYPES = "labelTypes"; - + + /** enumerates labels of interest, other labels are discared, if null, keep them all. 
*/ + public final static String LABELS_TO_KEEP = "labelsToKeep"; + public final static String NORMALIZE_TITLE_TEXT = "normalizeTitleText"; public final static String PATH_TO_TOKEN_NORM_DATA = "pathToTokenNormalizationData"; public final static String SORT_FILES_LEXICALLY = "sortLexicallyFilesInFolders"; diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java new file mode 100644 index 000000000..f7d6af27f --- /dev/null +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java @@ -0,0 +1,57 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.org/ + */ +package edu.illinois.cs.cogcomp.ner; + +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder; +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder; +import org.junit.Test; + +import java.util.Properties; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/** + * The user may specify in their configuration file the labels they want to keep, if they choose not to + * keep all labels. 
For example if you use a 4 class model yet have no interest in MISC or ORG, you can + * specify "PER" and "LOC" in a configuration parameter ("labelsToKeep") to discard the models for MISC and + * ORG. This will result in a significant performance improvement. + * @author redman + * + */ +public class NerLabelPruningTest { + + private static final String TEST_INPUT = + "JFK has one dog and Newark has a handful, Farbstein said. 114 dollars will by you a meal in New York, but not in Bement Illinois"; + + @Test + public void testOntonotesNer() { + TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer()); + Properties props = new Properties(); + // TIME LAW GPE NORP LANGUAGE PERCENT FAC PRODUCT ORDINAL LOC PERSON WORK_OF_ART MONEY DATE EVENT QUANTITY ORG CARDINAL + props.put(NerBaseConfigurator.LABELS_TO_KEEP, "LOC GPE NORP LANGUAGE FAC PRODUCT PERSON EVENT ORG"); + NERAnnotator nerOntonotes = NerAnnotatorManager.buildNerAnnotator(new ResourceManager(props), + ViewNames.NER_ONTONOTES); + TextAnnotation taOnto = tab.createTextAnnotation("", "", TEST_INPUT); + try { + nerOntonotes.getView(taOnto); + } catch (AnnotatorException e) { + e.printStackTrace(); + fail(e.getMessage()); + } + View v = taOnto.getView(nerOntonotes.getViewName()); + assertEquals(6, v.getConstituents().size()); + } +} \ No newline at end of file diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerTest.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerTest.java index 87f0481a2..c13ecb9e7 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerTest.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerTest.java @@ -35,8 +35,16 @@ public class NerTest { private static final String TEST_OUTPUT = "[PERSON JFK] has [CARDINAL one] dog and [GPE Newark] has a handful , [PERSON Farbstein] said . "; + private static final String TEST_OUTPUT_PERSON = + "[PERSON JFK] has [CARDINAL one] dog and Newark has a handful , [PERSON Farbstein] said . 
"; + + /** the T1 model. */ private static NETaggerLevel1 t1; + + /** the second level model. */ private static NETaggerLevel2 t2 = null; + + /** the parameters configure the models. */ ParametersForLbjCode params = null; @Before public void setUp() throws Exception { @@ -69,6 +77,59 @@ public void testTaggers() { assertTrue(output.equals(TEST_OUTPUT)); } + + @Test + public void testPersonOnlyTagger() { + long start = System.currentTimeMillis(); + System.out.println("Warming up."); + try { + ArrayList sentences = PlainTextReader.parseText(TEST_INPUT, params); + Data data = new Data(new NERDocument(sentences, "input")); + NETagPlain.tagData(data, params); + } catch (Exception e) { + logger.info("Cannot annotate the test, the exception was: "); + e.printStackTrace(); + fail(); + } + String output = null; + System.out.println("Starting full models."); + final int TOT = 500; + for (int i = 0; i < TOT; i++) { + try { + ArrayList sentences = PlainTextReader.parseText(TEST_INPUT, params); + Data data = new Data(new NERDocument(sentences, "input")); + output = NETagPlain.tagData(data, params); + } catch (Exception e) { + logger.info("Cannot annotate the test, the exception was: "); + e.printStackTrace(); + fail(); + } + } + System.out.println("Took "+(System.currentTimeMillis()-start)+" for "+TOT); + + // Strip out labels we don't want + ArrayList keepers = new ArrayList<>(); + keepers.add("PERSON"); + keepers.add("CARDINAL"); + params.taggerLevel1.pruneUnusedLabels(keepers); + params.taggerLevel2.pruneUnusedLabels(keepers); + System.out.println("Starting reduced models."); + start = System.currentTimeMillis(); + for (int i = 0; i < TOT; i++) + try { + ArrayList sentences = PlainTextReader.parseText(TEST_INPUT, params); + Data data = new Data(new NERDocument(sentences, "input")); + output = NETagPlain.tagData(data, params); + } catch (Exception e) { + logger.info("Cannot annotate the test, the exception was: "); + e.printStackTrace(); + fail(); + } + + 
System.out.println("Took "+(System.currentTimeMillis()-start)+" for "+TOT); + assertTrue(output.equals(TEST_OUTPUT_PERSON)); + } + @After public void tearDown() throws Exception {} } diff --git a/pipeline-client/pom.xml b/pipeline-client/pom.xml index d91b84e89..cc1e27bc8 100644 --- a/pipeline-client/pom.xml +++ b/pipeline-client/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-pipeline-client @@ -15,7 +15,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 50583ea35..46c4023fd 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-nlp-pipeline @@ -16,57 +16,57 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-chunker - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-quantifier - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-prep-srl - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-comma - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-verbsense - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-question-typer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.15 + 4.0.16 org.cogcomp @@ -83,7 +83,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.15 + 4.0.16 org.apache.commons @@ -93,17 +93,17 @@ edu.illinois.cs.cogcomp illinois-md - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-relation-extraction - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-datalessclassification - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp @@ -131,7 +131,7 @@ edu.illinois.cs.cogcomp illinois-depparse - 4.0.15 + 4.0.16 @@ -149,12 +149,12 @@ edu.illinois.cs.cogcomp illinois-time - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp 
illinois-transliteration - 4.0.15 + 4.0.16 diff --git a/pom.xml b/pom.xml index 26b5025af..3fc1bacee 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp illinois-cogcomp-nlp pom - 4.0.15 + 4.0.16 core-utilities tokenizer diff --git a/pos/pom.xml b/pos/pom.xml index b9b2edbd0..bd5154d34 100644 --- a/pos/pom.xml +++ b/pos/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-pos diff --git a/prepsrl/pom.xml b/prepsrl/pom.xml index 651f24112..eca8c16ee 100644 --- a/prepsrl/pom.xml +++ b/prepsrl/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -15,32 +15,32 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-depparse - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.15 + 4.0.16 diff --git a/quantifier/pom.xml b/quantifier/pom.xml index 043bf2e24..09c568c4d 100644 --- a/quantifier/pom.xml +++ b/quantifier/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-quantifier @@ -35,31 +35,31 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 compile edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 compile edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 compile edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 compile edu.illinois.cs.cogcomp illinois-curator - 4.0.15 + 4.0.16 compile diff --git a/question-type/pom.xml b/question-type/pom.xml index e6b62ed9e..a0ac76db7 100644 --- a/question-type/pom.xml +++ b/question-type/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-question-typer @@ -13,22 +13,22 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp 
illinois-edison - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pipeline-client - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp diff --git a/relation-extraction/pom.xml b/relation-extraction/pom.xml index c0af70063..2b75bcbc0 100644 --- a/relation-extraction/pom.xml +++ b/relation-extraction/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -33,42 +33,42 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-ner - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-md - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-chunker - 4.0.15 + 4.0.16 joda-time diff --git a/similarity/pom.xml b/similarity/pom.xml index 2dee2001a..17fd976cd 100644 --- a/similarity/pom.xml +++ b/similarity/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-similarity @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 org.slf4j @@ -55,7 +55,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp diff --git a/temporal-normalizer/pom.xml b/temporal-normalizer/pom.xml index c141a206c..033d290c6 100644 --- a/temporal-normalizer/pom.xml +++ b/temporal-normalizer/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 illinois-time @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp @@ -28,7 +28,7 @@ edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 org.apache.uima @@ -43,12 +43,12 @@ edu.illinois.cs.cogcomp illinois-chunker - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-curator - 4.0.15 + 4.0.16 
edu.illinois.cs.cogcomp @@ -58,7 +58,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.15 + 4.0.16 test @@ -86,7 +86,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.15 + 4.0.16 diff --git a/tokenizer/pom.xml b/tokenizer/pom.xml index 4703c674b..7c885b588 100644 --- a/tokenizer/pom.xml +++ b/tokenizer/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 illinois-tokenizer @@ -15,17 +15,17 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-curator - 4.0.15 + 4.0.16 test diff --git a/transliteration/pom.xml b/transliteration/pom.xml index 42f7e3aba..7865306bd 100644 --- a/transliteration/pom.xml +++ b/transliteration/pom.xml @@ -5,7 +5,7 @@ http://www.w3.org/2001/XMLSchema-instance "> illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 @@ -20,12 +20,12 @@ http://www.w3.org/2001/XMLSchema-instance "> edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 org.apache.commons diff --git a/verbsense/pom.xml b/verbsense/pom.xml index 8832a17b1..9959893f1 100755 --- a/verbsense/pom.xml +++ b/verbsense/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.15 + 4.0.16 4.0.0 illinois-verbsense @@ -18,37 +18,37 @@ edu.illinois.cs.cogcomp illinois-edison - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-pos - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-ner - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-chunker - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp illinois-inference - 4.0.15 + 4.0.16 edu.illinois.cs.cogcomp From 2de64acb7a404f438af7b465f3309a30eb9d828f Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Wed, 3 Jul 2019 17:27:40 -0500 Subject: [PATCH 02/16] There was a bug in this test. 
--- .../java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java index f7d6af27f..ae67898d2 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java @@ -34,7 +34,7 @@ public class NerLabelPruningTest { private static final String TEST_INPUT = - "JFK has one dog and Newark has a handful, Farbstein said. 114 dollars will by you a meal in New York, but not in Bement Illinois"; + "JFK has one dog and Newark has a handful, Farbstein said."; @Test public void testOntonotesNer() { @@ -52,6 +52,6 @@ public void testOntonotesNer() { fail(e.getMessage()); } View v = taOnto.getView(nerOntonotes.getViewName()); - assertEquals(6, v.getConstituents().size()); + assertEquals(3, v.getConstituents().size()); } } \ No newline at end of file From 526f7fb4225a3012cdd78689ebb72459c0ae5f8e Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 5 Jul 2019 10:19:37 -0500 Subject: [PATCH 03/16] A score is now included with each constituent for named entity constituents.
--- .../cs/cogcomp/ner/LbjTagger/NEWord.java | 17 +++++++++++++++-- .../illinois/cs/cogcomp/ner/NERAnnotator.java | 19 ++++++++----------- .../cs/cogcomp/ner/NerLabelPruningTest.java | 4 ++++ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java index 43c72c3af..85e8f442e 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java @@ -160,7 +160,7 @@ public ArrayList resetLevel1AggregationFeatures() { * Produces a simple String representation of this word in which the * neLabel field appears followed by the word's part of speech and finally the form * (i.e., spelling) of the word all surrounded by parentheses. - **/ + */ public String toString() { return "(" + neLabel + " " + partOfSpeech + " " + form + ")"; } @@ -200,7 +200,20 @@ public void setPrediction(String label, LabelToLookAt labelType) { this.neTypeLevel2 = label; } - + /** + * This method will return the score of the chosen label. + * @return the score of the best label for this term. 
+ */ + public double getScore() { + if (predictionConfidencesLevel2Classifier == null || predictionConfidencesLevel2Classifier.topScores.size() == 0) + if (predictionConfidencesLevel1Classifier == null || predictionConfidencesLevel1Classifier.topScores.size() == 0) + throw new RuntimeException("Attempt to get label score before scores are set."); + else + return this.predictionConfidencesLevel1Classifier.topScores.elementAt(0); + else + return this.predictionConfidencesLevel2Classifier.topScores.elementAt(0); + } + public enum LabelToLookAt { PredictionLevel2Tagger, PredictionLevel1Tagger, GoldLabel } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java index 081f3de61..d247b6194 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java @@ -177,26 +177,24 @@ public void addView(TextAnnotation ta) { // the data always has a single document // each LinkedVector in data corresponds to a sentence. int tokenoffset = 0; - for (LinkedVector vector : nerSentences) { + for (LinkedVector nerWords : nerSentences) { boolean open = false; // there should be a 1:1 mapping btw sentence tokens in record and words/predictions // from NER. int startIndex = -1; String label = null; - for (int j = 0; j < vector.size(); j++, tokenoffset++) { - NEWord neWord = (NEWord) (vector.get(j)); + for (int j = 0; j < nerWords.size(); j++, tokenoffset++) { + NEWord neWord = (NEWord) (nerWords.get(j)); String prediction = neWord.neTypeLevel2; - // LAM-tlr this is not a great way to ascertain the entity type, it's a bit - // convoluted, and very - // inefficient, use enums, or nominalized indexes for this sort of thing. + // identify the label. 
if (prediction.startsWith("B-")) { startIndex = tokenoffset; label = prediction.substring(2); open = true; } else if (j > 0) { - String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2; + String previous_prediction = ((NEWord) nerWords.get(j - 1)).neTypeLevel2; if (prediction.startsWith("I-") && (!previous_prediction.endsWith(prediction.substring(2)))) { startIndex = tokenoffset; @@ -207,10 +205,10 @@ public void addView(TextAnnotation ta) { if (open) { boolean close = false; - if (j == vector.size() - 1) { + if (j == nerWords.size() - 1) { close = true; } else { - String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2; + String next_prediction = ((NEWord) nerWords.get(j + 1)).neTypeLevel2; if (next_prediction.startsWith("B-")) close = true; if (next_prediction.equals("O")) @@ -232,8 +230,7 @@ public void addView(TextAnnotation ta) { int e = tokenindices[endIndex]; if (e <= s) e = s + 1; - - nerView.addSpanLabel(s, e, label, 1d); + nerView.addSpanLabel(s, e, label, neWord.getScore()); open = false; } } diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java index ae67898d2..301330959 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java @@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.annotation.AnnotatorException; import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View; import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; @@ -52,6 +53,9 @@ public void testOntonotesNer() { fail(e.getMessage()); } View v = 
taOnto.getView(nerOntonotes.getViewName()); + for (Constituent c : v.getConstituents()) { + System.out.println(c+" = "+c.getLabel()+" : "+c.getConstituentScore()); + } assertEquals(3, v.getConstituents().size()); } } \ No newline at end of file From 7d1d39f8b88f228047e39d2fd83b94cc6f4c9245 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 5 Jul 2019 11:06:53 -0500 Subject: [PATCH 04/16] Had to up the LBJava version. --- chunker/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chunker/pom.xml b/chunker/pom.xml index 367b6c1bc..7a66ff4f5 100644 --- a/chunker/pom.xml +++ b/chunker/pom.xml @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.1 + 1.3.2 edu.illinois.cs.cogcomp From 63b7dd2ac58b35929cf1dbe5711663cddfaae975 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 5 Jul 2019 11:37:21 -0500 Subject: [PATCH 05/16] More updates, from 1.3.0 dependencies on LBJava to 1.3.2 --- edison/pom.xml | 2 +- lbjava-nlp-tools/pom.xml | 2 +- md/pom.xml | 2 +- pos/pom.xml | 2 +- prepsrl/pom.xml | 2 +- quantifier/pom.xml | 2 +- question-type/pom.xml | 2 +- relation-extraction/pom.xml | 4 ++-- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/edison/pom.xml b/edison/pom.xml index 1c60c666d..ab802c485 100644 --- a/edison/pom.xml +++ b/edison/pom.xml @@ -98,7 +98,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.0 + 1.3.2 diff --git a/lbjava-nlp-tools/pom.xml b/lbjava-nlp-tools/pom.xml index b070f6bbb..eb5650c49 100644 --- a/lbjava-nlp-tools/pom.xml +++ b/lbjava-nlp-tools/pom.xml @@ -25,7 +25,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.0 + 1.3.2 edu.illinois.cs.cogcomp diff --git a/md/pom.xml b/md/pom.xml index a6f7891e8..c46a77007 100644 --- a/md/pom.xml +++ b/md/pom.xml @@ -70,7 +70,7 @@ edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.0 + 1.3.2 ${project.basedir}/src/lbj/md.lbj diff --git a/pos/pom.xml b/pos/pom.xml index bd5154d34..bd194e715 100644 --- a/pos/pom.xml +++ b/pos/pom.xml @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp 
LBJava - 1.3.0 + 1.3.2 edu.illinois.cs.cogcomp diff --git a/prepsrl/pom.xml b/prepsrl/pom.xml index eca8c16ee..eb3ff9be3 100644 --- a/prepsrl/pom.xml +++ b/prepsrl/pom.xml @@ -53,7 +53,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.0 + 1.3.2 org.slf4j diff --git a/quantifier/pom.xml b/quantifier/pom.xml index 09c568c4d..13fe1bdf5 100644 --- a/quantifier/pom.xml +++ b/quantifier/pom.xml @@ -24,7 +24,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.0 + 1.3.2 junit diff --git a/question-type/pom.xml b/question-type/pom.xml index a0ac76db7..75d7c9c73 100644 --- a/question-type/pom.xml +++ b/question-type/pom.xml @@ -33,7 +33,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.0 + 1.3.2 diff --git a/relation-extraction/pom.xml b/relation-extraction/pom.xml index 2b75bcbc0..37123ce5f 100644 --- a/relation-extraction/pom.xml +++ b/relation-extraction/pom.xml @@ -28,7 +28,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.0 + 1.3.2 edu.illinois.cs.cogcomp @@ -94,7 +94,7 @@ edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.0 + 1.3.2 ${project.basedir}/src/lbj/re.lbj From 371aa07066bf3d14ff4db160075c7c621f12f497 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 19 Jul 2019 09:43:07 -0500 Subject: [PATCH 06/16] Some minor fixes, cleaned up the benchmark config files. 
--- ner/benchmark/CoNLL/config/reuters.config | 13 ++++++------- ner/benchmark/MUC7/config/muc7.config | 11 +++++------ .../Ontonotes/config/ontonotes.config | 5 ++--- .../ContextAggregation.java | 5 +++-- .../ner/StringStatisticsUtils/MyString.java | 19 +++++++++++++------ ner/src/main/lbj/LbjTagger.lbj | 2 +- 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/ner/benchmark/CoNLL/config/reuters.config b/ner/benchmark/CoNLL/config/reuters.config index 61c541fc0..eb0f73779 100644 --- a/ner/benchmark/CoNLL/config/reuters.config +++ b/ner/benchmark/CoNLL/config/reuters.config @@ -1,6 +1,6 @@ # Required fields -modelName CoNLL -pathToModelFile models/CoNLL +modelName reuters +pathToModelFile models/reuters # Optional fields labelTypes PER ORG LOC MISC @@ -10,9 +10,8 @@ randomNoiseLevel 0.0 omissionRate 0.0 # parameter sweep reveals these to be the best params, L2 model is best. -# These were identified as part of the L1 L2 split parameter sweep of Oct '17 -learningRatePredictionsLevel1 .04 -thicknessPredictionsLevel1 40 -learningRatePredictionsLevel2 .04 -thicknessPredictionsLevel2 40 +learningRatePredictionsLevel1 .05 +thicknessPredictionsLevel1 30 +learningRatePredictionsLevel2 .05 +thicknessPredictionsLevel2 30 diff --git a/ner/benchmark/MUC7/config/muc7.config b/ner/benchmark/MUC7/config/muc7.config index f260e0507..4e9176238 100644 --- a/ner/benchmark/MUC7/config/muc7.config +++ b/ner/benchmark/MUC7/config/muc7.config @@ -1,16 +1,15 @@ # Required fields -modelName MUC7 -pathToModelFile ner/models/MUC7 +modelName muc7 +pathToModelFile models/muc7 # Optional fields labelTypes PER ORG LOC MISC # there are no misc tags in the MUC data. labelsToIgnoreInEvaluation MISC -FeaturePruningThreshold 0.0 # parameter sweep reveals these to be the best params, L1 model is best. 
-learningRatePredictionsLevel1 .1 -thicknessPredictionsLevel1 20 +learningRatePredictionsLevel1 .08 +thicknessPredictionsLevel1 5 learningRatePredictionsLevel2 .08 -thicknessPredictionsLevel2 10 +thicknessPredictionsLevel2 5 diff --git a/ner/benchmark/Ontonotes/config/ontonotes.config b/ner/benchmark/Ontonotes/config/ontonotes.config index d86a7a7c3..9571bf0f0 100644 --- a/ner/benchmark/Ontonotes/config/ontonotes.config +++ b/ner/benchmark/Ontonotes/config/ontonotes.config @@ -1,10 +1,9 @@ # Required fields -modelName OntoNotes -pathToModelFile ner/models/OntoNotes +modelName ontonotes +pathToModelFile models/ontonotes # Optional fields labelTypes TIME LAW GPE NORP LANGUAGE PERCENT FAC PRODUCT ORDINAL LOC PERSON WORK_OF_ART MONEY DATE EVENT QUANTITY ORG CARDINAL -FeaturePruningThreshold 0.0 # parameter sweep reveals these to be the best params, L1 model is best. learningRatePredictionsLevel1 .03 diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java index 1350f01a6..c3efbaffc 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java @@ -9,6 +9,7 @@ import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode; +import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator; import java.util.HashMap; import java.util.Hashtable; @@ -20,9 +21,9 @@ public class ContextAggregation { * that the data was annotated with dictionaries etc. 
*/ public static void annotate(NEWord word) { - if (word.params.featuresToUse.containsKey("aggregateContext") + if (word.params.featuresToUse.containsKey(NerBaseConfigurator.AGGREGATE_CONTEXT) || word.params.featuresToUse - .containsKey("aggregateGazetteerMatches")) { + .containsKey(NerBaseConfigurator.AGGREGATE_GAZETTEER)) { int i = 0; NEWord w = word, last = word.nextIgnoreSentenceBoundary; diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/StringStatisticsUtils/MyString.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/StringStatisticsUtils/MyString.java index 14c51d64b..b4e3f97c1 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/StringStatisticsUtils/MyString.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/StringStatisticsUtils/MyString.java @@ -22,13 +22,20 @@ public static String cleanPunctuation(String s) { return res.toString(); } + /** + * If this is an easily identifiable date, return "*DATE*", if it is a + * combination of digits and characters, return a descriptive pattern indicating + * such, or just return the input form. + * @param s the input token term, or part of the term for DualTokenization. + * @return the feature, or just the name of the form. + */ public static String normalizeDigitsForFeatureExtraction(String s) { - String form = s; - if (MyString.isDate(form)) - form = "*DATE*"; - if (MyString.hasDigits(form)) - form = MyString.normalizeDigits(form); - return form; + if (MyString.isDate(s)) + return "*DATE*"; + else if (MyString.hasDigits(s)) + return MyString.normalizeDigits(s); + else + return s; } /** fast date formatter for identifying date instances. 
*/ diff --git a/ner/src/main/lbj/LbjTagger.lbj b/ner/src/main/lbj/LbjTagger.lbj index 1077fc480..83c95daeb 100644 --- a/ner/src/main/lbj/LbjTagger.lbj +++ b/ner/src/main/lbj/LbjTagger.lbj @@ -365,7 +365,7 @@ discrete% PreviousTagPatternLevel1(NEWord word) <- mixed% FeaturesSharedTemp(NEWord word) <- IsSentenceStart, Capitalization, nonLocalFeatures, GazetteersFeatures, FormParts, Forms, WordTypeInformation, Affixes, BrownClusterPaths, WordEmbeddingFeatures, WikifierFeatures, AffixesZH -mixed% FeaturesLevel1SharedWithLevel2(NEWord word) <- FeaturesSharedTemp /*, IsWordCaseNormalized&&FeaturesSharedTemp*/ +mixed% FeaturesLevel1SharedWithLevel2(NEWord word) <- FeaturesSharedTemp mixed% FeaturesLevel1Only(NEWord word) <- PreviousTagPatternLevel1, PreviousTag1Level1,PreviousTag2Level1, prevTagsForContextLevel1, PreviousTag1Level1&&Forms From 3441594e2913002da3fb113a175d216e58cb73c4 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 19 Jul 2019 09:57:57 -0500 Subject: [PATCH 07/16] More fixes to config files. 
--- ner/benchmark/EnronCoNLL/config/EnronCoNLL.config | 2 +- ner/benchmark/Web/config/web.config | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config b/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config index f3a68fbc6..5f7212890 100644 --- a/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config +++ b/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config @@ -2,7 +2,7 @@ # Required fields modelName EnronCoNLL -pathToModelFile ner/models/EnronCoNLL +pathToModelFile models/EnronCoNLL randomNoiseLevel 0.0 omissionRate 0.0 diff --git a/ner/benchmark/Web/config/web.config b/ner/benchmark/Web/config/web.config index 56e2c46f1..0784cd4f1 100644 --- a/ner/benchmark/Web/config/web.config +++ b/ner/benchmark/Web/config/web.config @@ -1,6 +1,6 @@ # Required fields, web data is only tested, against the reuters model -modelName EnronCoNLL_testCoNLL -pathToModelFile models/EnronCoNLL_testCoNLL +modelName web +pathToModelFile models/Web # Optional fields labelTypes PER ORG LOC MISC From 6f292b211849f1c2f75b06dda9317b53722cfa32 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Mon, 22 Jul 2019 15:28:08 -0500 Subject: [PATCH 08/16] A very small change to improve performance of brown clusters, improved NER overall by 2 - 3%. 
--- .../ner/ExpressiveFeatures/BrownClusters.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java index df0020a5e..ab7943ce7 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java @@ -188,14 +188,16 @@ final public String[] getPrefixes(String word) { if (isLowercaseBrownClustersByResource[j]) word = word.toLowerCase(); THashMap wordToPath = wordToPathByResource.get(j); - final String prefix = "resource" + j + ":"; - if (wordToPath != null && wordToPath.containsKey(word)) { - String path = wordToPath.get(word); - int pathlength = path.length(); - v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[0]))); - for (int i = 1; i < prefixLengths.length; i++) - if (prefixLengths[i - 1] < pathlength) - v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[i]))); + if (wordToPath != null) { + String path = wordToPath.get(word); + final String prefix = "resource"+j+":"; + if (path != null) { + int pathlength = path.length(); + v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[0]))); + for (int i = 1; i < prefixLengths.length; i++) + if (prefixLengths[i - 1] < pathlength) + v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[i]))); + } } } String[] res = new String[v.size()]; From 77579fffe2f4a09f73870fd8699255b836c528e3 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Tue, 23 Jul 2019 10:43:35 -0500 Subject: [PATCH 09/16] Caching paths on word proved quite effective, so I am checking that in. 
--- .../ner/ExpressiveFeatures/BrownClusters.java | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java index ab7943ce7..7d52d4396 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java @@ -48,7 +48,13 @@ private BrownClusters() { /** clusters store, keyed on catenated paths. */ static private HashMap clusters = new HashMap<>(); - + + /** Predetermined number of words in these caches. */ + final private int INITIAL_CACHE_SIZE = 40000; + + /** this maps a word to a set of feature names. */ + private THashMap cache = new THashMap (INITIAL_CACHE_SIZE); + /** * Makes a unique key based on the paths, for storage in a hashmap. * @param pathsToClusterFiles the paths. @@ -181,8 +187,16 @@ final public ArrayList getResources() { final public String[] getPrefixes(NEWord w) { return getPrefixes(w.form); } - + final public String[] getPrefixes(String word) { + + // if we have already encountered this, it's cached, try that first. + String[] cachedPath = cache.get(word); + if (cachedPath != null) { + return cachedPath; + } + + // not cached. ArrayList v = new ArrayList<>(wordToPathByResource.size()); for (int j = 0; j < wordToPathByResource.size(); j++) { if (isLowercaseBrownClustersByResource[j]) @@ -202,6 +216,9 @@ final public String[] getPrefixes(String word) { } String[] res = new String[v.size()]; res = v.toArray(res); + if (res.length > 0) { + cache.put(word, res); + } return res; } From 08dc719b3e2b03459ee1b603ae5d51f5d3bab4ea Mon Sep 17 00:00:00 2001 From: "Thomas L. 
Redman" Date: Tue, 30 Jul 2019 10:28:37 -0500 Subject: [PATCH 10/16] Added support for file type allowing the file type to be added to the NEWord object via the ColumnFileReader, but with no changes to the features. --- .../cs/cogcomp/ner/LbjTagger/NEWord.java | 39 ++++++++++++++++--- .../cs/cogcomp/ner/LbjTagger/Parameters.java | 12 +++++- .../ner/LbjTagger/ParametersForLbjCode.java | 3 ++ .../edu/illinois/cs/cogcomp/ner/Main.java | 2 +- .../ColumnFileReader.java | 18 +++++++-- .../ner/config/NerBaseConfigurator.java | 3 ++ 6 files changed, 66 insertions(+), 11 deletions(-) diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java index 85e8f442e..a353e4449 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java @@ -61,10 +61,9 @@ public class NEWord extends Word { private HashMap nonLocalFeatures = null; private String[] nonLocFeatArray = null; - /* - * This stuff was added for form normalization purposes. - */ - + /** this feature is only populate if useFileTypes feature is enabled. */ + private String fileType = null; + /** * An NEWord can be constructed from a Word object representing the * same word, an NEWord representing the previous word in the sentence, and the @@ -81,7 +80,7 @@ public NEWord(Word w, NEWord p, String type) { neLabel = type; neTypeLevel1 = null; } - + /** * Add the provided token to the sentence, for also do any additional word spliting. * @@ -95,6 +94,22 @@ public static void addTokenToSentence(LinkedVector sentence, String token, Strin addTokenToSentence(sentence, word); } + /** + * Add the provided token to the sentence, also do any additional word splitting. Additional argument + * indicates the file type which must be provided. If there is no file type, the file type is null. + * + * @param sentence the sentence to add the word to. 
+ * @param token the individual token. + * @param tag the tag to annotate the word with. + * @param fileType a string representing file type. + */ + public static void addTokenToSentence(LinkedVector sentence, String token, String tag, ParametersForLbjCode prs, String fileType) { + NEWord word = new NEWord(new Word(token), null, tag); + word.params = prs; + word.setFileType(fileType); + addTokenToSentence(sentence, word); + } + public static void addTokenToSentence(LinkedVector sentence, NEWord word) { Vector v = NEWord.splitWord(word); if (word.params.tokenizationScheme @@ -201,6 +216,20 @@ public void setPrediction(String label, LabelToLookAt labelType) { } /** + * @return the file type of this term (same for entire document). + */ + public String getFileType() { + return fileType; + } + + /** + * @param fileType the file type of this term (same for entire document). + */ + public void setFileType(String fileType) { + this.fileType = fileType; + } + + /** * This method will return the score of the chosen label. * @return the score of the best label for this term. 
*/ diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java index 2267748ff..2585bcba4 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java @@ -118,8 +118,6 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean } param.debug = rm.getDebug(); - // ParametersForLbjCode.currentParameters.debug = param.debug; - double randomNoiseLevel = rm.getDouble(NerBaseConfigurator.RANDOM_NOISE_LEVEL); double omissionRate = rm.getDouble(NerBaseConfigurator.OMISSION_RATE); @@ -146,6 +144,16 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean String labelsToKeep = rm.getString(NerBaseConfigurator.LABELS_TO_KEEP); param.labelsToKeep = new ArrayList(Arrays.asList(labelsToKeep.split(" "))); } + + // this property can be either "1" or "true" to enable file type. 
+ if (rm.containsKey(NerBaseConfigurator.USE_FILETYPE)) { + String usefiletype = rm.getString(NerBaseConfigurator.USE_FILETYPE); + if (usefiletype.equalsIgnoreCase("true") || usefiletype.equals("1")) { + logger.info("File Type information will be included in the feature set."); + param.useFileType = true; + } + } + if (rm.containsKey("labelsToIgnoreInEvaluation")) { String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation"); param.labelsToIgnoreInEvaluation = diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java index 024c51346..0e24af324 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java @@ -39,6 +39,9 @@ public enum TokenizationScheme { /** labels of interest if a subset of all labels, all other labels are ignored. */ public ArrayList labelsToKeep = null; + /** use filetype feature, greatly increasing the number of features. */ + public boolean useFileType = false; + /** Labels to ignore when evaluating model performance, e.g. "MISC" for the MUC7 dataset. */ public Vector labelsToIgnoreInEvaluation = null; diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java index f343b8741..0b7ee7233 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java @@ -266,7 +266,7 @@ else if (modelName.toLowerCase().equals("ontonotes")) } else { this.nerAnnotator = new NERAnnotator(this.resourceManager, viewName); } - System.out.println("Completed loading resources, assuming a "); + System.out.println("Completed loading resources "); } // display the command prompt depending on the mode we are in. 
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java index ab018b31e..9d526c9ea 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java @@ -15,6 +15,7 @@ import edu.illinois.cs.cogcomp.lbjava.nlp.Word; import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; +import java.io.File; import java.util.ArrayList; @@ -32,6 +33,7 @@ public Object next() { String token = null; String pos = null; String label = null; + String fileType = null; linec++; // Skip to start of next line, skip unnecessary blank lines, headers and so on. String[] line = (String[]) super.next(); @@ -50,11 +52,16 @@ public Object next() { token = line[5]; label = line[0]; pos = line[4]; + if (line.length > 9 && params.useFileType) { + fileType = line[9]; + } else { + fileType = null; + } } LinkedVector res = new LinkedVector(); NEWord w = new NEWord(new Word(token, pos), null, label); - NEWord.addTokenToSentence(res, w.form, w.neLabel, params); + NEWord.addTokenToSentence(res, w.form, w.neLabel, params, fileType); for (line = (String[]) super.next(); line != null && line.length > 0; line = (String[]) super.next()) { linec++; @@ -67,14 +74,19 @@ public Object next() { token = line[5]; label = line[0]; pos = line[4]; + if (line.length > 9 && params.useFileType) { + fileType = line[9]; + } else { + fileType = null; + } } else { - System.out.println("Line "+linec+" in "+filename+" is wrong with "+line.length); + System.out.println("Line "+linec+" in "+filename+" has wrong number of columns : "+line.length); for (String a : line) System.out.print(":"+a); System.out.println(); continue; } w = new NEWord(new Word(token, pos), null, label); - NEWord.addTokenToSentence(res, w.form, w.neLabel, params); + 
NEWord.addTokenToSentence(res, w.form, w.neLabel, params, fileType); } if (res.size() == 0) return null; diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java index 02b2b3b06..797cf5026 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/config/NerBaseConfigurator.java @@ -48,6 +48,9 @@ public class NerBaseConfigurator extends AnnotatorConfigurator { /** enumerates labels of interest, other labels are discared, if null, keep them all. */ public final static String LABELS_TO_KEEP = "labelsToKeep"; + /** enumerates labels of interest, other labels are discared, if null, keep them all. */ + public final static String USE_FILETYPE = "useFileType"; + public final static String NORMALIZE_TITLE_TEXT = "normalizeTitleText"; public final static String PATH_TO_TOKEN_NORM_DATA = "pathToTokenNormalizationData"; public final static String SORT_FILES_LEXICALLY = "sortLexicallyFilesInFolders"; From fd319747c4c8935a3e70005ea7b0a4d45f5afeae Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 13 Sep 2019 12:13:39 -0500 Subject: [PATCH 11/16] I added a NERResourceManagerFactory and it's usage so resources can be shared more easily, tests included. Also some minor performance (wallclock time) optimizations. 
--- .../illinois/cs/cogcomp/lbjava/nlp/Word.java | 5 +- ...redictionsAndEntitiesConfidenceScores.java | 15 +- .../PredictionsToProbabilities.java | 17 +- .../LbjTagger/LearningCurveMultiDataset.java | 62 ++++-- .../cs/cogcomp/ner/LbjTagger/NEWord.java | 22 ++- .../ner/NERResourceManagerFactory.java | 182 ++++++++++++++++++ .../illinois/cs/cogcomp/ner/NerBenchmark.java | 114 +++++++++-- .../ColumnFileReader.java | 14 +- .../ner/TestNERResourceManagerFactory.java | 56 ++++++ .../illinois/cs/cogcomp/ner/reuters.config | 20 ++ 10 files changed, 453 insertions(+), 54 deletions(-) create mode 100644 ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java create mode 100644 ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java create mode 100644 ner/src/test/java/edu/illinois/cs/cogcomp/ner/reuters.config diff --git a/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java b/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java index c8aa1337f..71714755e 100644 --- a/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java +++ b/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java @@ -184,7 +184,10 @@ public Word(String f, String pos, String l, String sense, Word p, int start, capitalized = f != null && f.length() > 0 && Character.isUpperCase(f.charAt(0)); partOfSpeech = pos; - if (partOfSpeech != null) POS.fromToken(partOfSpeech); + + // if assertions are enabled, this fails, so I seen no + // reason to leave it in - redman + //if (partOfSpeech != null) POS.fromToken(partOfSpeech); lemma = l; wordSense = sense; } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java index 20adf058f..779e72077 100644 --- 
a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java @@ -7,17 +7,16 @@ */ package edu.illinois.cs.cogcomp.ner.InferenceMethods; -import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.TwoLayerPredictionAggregationFeatures; -import edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel1; -import edu.illinois.cs.cogcomp.ner.LbjTagger.*; -import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord.RealFeature; -import edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords; -import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; - import java.util.ArrayList; import java.util.Vector; -/* +import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; +import edu.illinois.cs.cogcomp.ner.LbjTagger.Data; +import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; +import edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity; +import edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords; + +/** * This class is responsible for handling prediction scores of the entities. 
That is, this class can * prune the entities/predictions on which we're not confident at * diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java index 35effff2a..89c87d73e 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java @@ -31,15 +31,13 @@ public static CharacteristicWords getAndSetPredictionConfidences(SparseNetworkLe } double[] correctedScores = new double[scores.length]; double min = scores[0].score; - int maxScoreIdx = 0; - double maxScore = scores[maxScoreIdx].score; - String maxLabel = scores[maxScoreIdx].value; + double max = scores[0].score; + String maxLabel = scores[0].value; for (int i = 0; i < scores.length; i++) { if (min > scores[i].score) min = scores[i].score; - if (maxScore < scores[i].score) { - maxScore = scores[i].score; - maxScoreIdx = i; + if (max < scores[i].score) { + max = scores[i].score; maxLabel = scores[i].value; } } @@ -55,19 +53,20 @@ public static CharacteristicWords getAndSetPredictionConfidences(SparseNetworkLe correctedScores[i] /= sum; } + /* this doesn't seem necessary for (int i = 0; i < correctedScores.length; i++) - correctedScores[i] = correctedScores[i]; + correctedScores[i] = correctedScores[i];*/ CharacteristicWords res = new CharacteristicWords(scores.length); for (int i = 0; i < scores.length; i++) res.addElement(scores[i].value, correctedScores[i]); + w.setRawScore((float)max); if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel1Tagger)) { w.neTypeLevel1 = maxLabel; w.predictionConfidencesLevel1Classifier = res; - } - if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel2Tagger)) { + } else if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel2Tagger)) { w.neTypeLevel2 = maxLabel; 
w.predictionConfidencesLevel2Classifier = res; } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java index c38f9c9c7..719ed10ca 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java @@ -12,6 +12,7 @@ import edu.illinois.cs.cogcomp.lbjava.learn.BatchTrainer; import edu.illinois.cs.cogcomp.lbjava.learn.SparseAveragedPerceptron; import edu.illinois.cs.cogcomp.lbjava.learn.SparseNetworkLearner; +import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; import edu.illinois.cs.cogcomp.lbjava.parse.Parser; import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.ExpressiveFeaturesAnnotator; import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.TwoLayerPredictionAggregationFeatures; @@ -23,6 +24,7 @@ import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Vector; import static java.lang.Float.NaN; @@ -123,7 +125,7 @@ public static void getLearningCurve(int fixedNumIterations, String dataFormat, S */ public static void getLearningCurve(Vector trainDataSet, Vector testDataSet, int fixedNumIterations, boolean incremental, ParametersForLbjCode params) throws Exception { - double bestF1Level1 = -1; + double bestF1Level1 = -2; int bestRoundLevel1 = 0; // Get the directory name (.model is appended in LbjTagger/Parameters.java:139) String modelPath = params.pathToModelFile; @@ -202,14 +204,12 @@ public static void getLearningCurve(Vector trainDataSet, Vector test bestRoundLevel1 = i; saveme = (NETaggerLevel1) tagger1.clone(); saveme.beginTraining(); - - System.out.println(saveme); - System.out.println(bestF1Level1); - System.out.println(f1Level1); - - } - logger.info(i + " rounds. Best so far for Level1 : (" + bestRoundLevel1 + ")=" + logger.info(i + " rounds. 
New best for Level1 : (" + bestRoundLevel1 + ")=" + + bestF1Level1); + } else { + logger.info(i + " rounds. Best so far for Level1 : (" + bestRoundLevel1 + ")=" + bestF1Level1); + } } saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold; @@ -252,7 +252,7 @@ public static void getLearningCurve(Vector trainDataSet, Vector test if (params.featuresToUse.containsKey("PredictionsLevel1")) { logger.info("Level 2 classifier learning rate = "+params.learningRatePredictionsLevel2+ ", thickness = "+params.thicknessPredictionsLevel2); - double bestF1Level2 = -1; + double bestF1Level2 = -2; int bestRoundLevel2 = 0; logger.info("Pre-extracting the training data for Level 2 classifier, saving to "+trainPathL2); BatchTrainer bt2train = @@ -276,14 +276,19 @@ public static void getLearningCurve(Vector trainDataSet, Vector test TestDiscrete.testDiscrete(simpleTest, tagger2, null, testParser2, true, 0); double f1Level2 = simpleTest.getOverallStats()[2]; - if (f1Level2 >= bestF1Level2) { + if(Double.isNaN(f1Level2)) + f1Level2 = 0; + if (f1Level2 > bestF1Level2) { bestF1Level2 = f1Level2; bestRoundLevel2 = i; saveme = (NETaggerLevel2) tagger2.clone(); saveme.beginTraining(); - } - logger.info(i + " rounds. Best so far for Level2 : (" + bestRoundLevel2 + ") " + logger.info(i + " rounds. New best for Level2 : (" + bestRoundLevel2 + ") " + bestF1Level2); + } else { + logger.info(i + " rounds. 
Best so far for Level2 : (" + bestRoundLevel2 + ") " + + bestF1Level2); + } } saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold; saveme.doneTraining(); @@ -362,22 +367,37 @@ public void close() { } public Object next() { - if (datasetId >= dataset.size()) - return null; - // logger.debug("token = "+tokenId+"; sentence = "+sentenceId+"; dataset = "+datasetId+" --- datasets="+dataset.size()+" now sentences= "+dataset.elementAt(datasetId).sentences.size()+"; now tokens = "+dataset.elementAt(datasetId).sentences.elementAt(sentenceId).size()); - Object res = + if (datasetId >= dataset.size()) { + return null; // expected, we are just done with the dataset. + } + Data nerdata = dataset.elementAt(datasetId); + if (nerdata.documents.size() <= docid) { + logger.info("Encountered a dataset with no documents in it."); + return null; // a dataset with no documents in it is odd. + } + NERDocument nerdoc = nerdata.documents.get(docid); + if (nerdoc.sentences.size() <= sentenceId) { + logger.info("Encountered a document with no sentences in it : "+nerdoc.docname); + return null; + } + LinkedVector nersentence = nerdoc.sentences.get(sentenceId); + if (nersentence.size() <= tokenId) { + logger.info("Encountered a sentnce with no tokens in it : "+nerdoc.docname); + return null; + } + Object res = nersentence.get(tokenId); + /*Object res = dataset.elementAt(datasetId).documents.get(docid).sentences.get(sentenceId) - .get(tokenId); - if (tokenId < dataset.elementAt(datasetId).documents.get(docid).sentences.get( - sentenceId).size() - 1) + .get(tokenId);*/ + if (tokenId < nersentence.size() - 1) tokenId++; else { tokenId = 0; - if (sentenceId < dataset.elementAt(datasetId).documents.get(docid).sentences.size() - 1) { + if (sentenceId < nerdoc.sentences.size() - 1) { sentenceId++; } else { sentenceId = 0; - if (docid < dataset.elementAt(datasetId).documents.size() - 1) { + if (docid < nerdata.documents.size() - 1) { docid++; } else { docid = 0; diff --git 
a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java index a353e4449..0cefb75ac 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java @@ -26,6 +26,10 @@ public class NEWord extends Word { /** This field is used to store a computed named entity type tag. */ public String neTypeLevel1; + + /** raw score as returned by the classifier without normalization. */ + private float rawScore; + public String neTypeLevel2; public NamedEntity predictedEntity = null;// if non-null it keeps the named entity the tagger public ParametersForLbjCode params = null; @@ -243,7 +247,23 @@ public double getScore() { return this.predictionConfidencesLevel2Classifier.topScores.elementAt(0); } - public enum LabelToLookAt { + + /** + * This method will return the score of the chosen label. + * @return the unnormalized score as returned directly by classifier. + */ + public float getRawScore() { + return rawScore; + } + + /** + * @param rawScore the unnormalized score as returned directly by classifier. + */ + public void setRawScore(float rawScore) { + this.rawScore = rawScore; + } + + public enum LabelToLookAt { PredictionLevel2Tagger, PredictionLevel1Tagger, GoldLabel } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java new file mode 100644 index 000000000..c8a95fd7d --- /dev/null +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java @@ -0,0 +1,182 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package edu.illinois.cs.cogcomp.ner; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Map.Entry; +import java.util.Properties; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator; + +/** + * The factory, given a properties file reference will load the properties and + * "merge" them with the defaults. In the case of properties that reference a + * file or directory, the property in the properties file is assumed to be a + * relative path. The path will be prefixed with the directory path included in + * the static path variable before actually being saved in the resource manager. + * The resource manager is returned after this refactoring is complete. + * @author redman + */ +public class NERResourceManagerFactory { + + /** + * check first if the file is in the resource fork (a jar file), if not check if it's in a file. If the + * configuration file exists in either a jar file or on the file system, return the properties loaded from it. + * @param configFile the file to find. + * @throws IOException + */ + static private Properties checkIfExists(String configFile) throws IOException { + InputStream is = NERResourceManagerFactory.class.getClassLoader().getResourceAsStream(configFile); + if (is == null) { + is = new FileInputStream(configFile); + } + try { + Properties properties = new Properties(); + properties.load(is); + return properties; + } finally { + try { + is.close(); + } catch (IOException ignored) { + } + } + } + + /** + * This method will return a resource manager that can be used by the NER + * system, however, all paths are assumed to be relative, the + * resourcePath must be set to contain the folder on the system where all + * property files, models, gazetteers and brown cluster reside. + * + * @param propertiesFilename the name of the properties file. 
+ * @param modelsReplacementPattern pattern to replace with the model directory, or null to disable. + * @param resourcesReplacementPattern pattern to replace with the resources directory, or null to disable. + * @param modelsPath path where models are found, or null to disable. + * @param resourcesPath path where resources are found, or null to disable. + * @return the modified resources. + * @throws FileNotFoundException if a required file was not found. + * @throws IOException if a file was found but could not be read or parsed. + */ + static public ResourceManager get(String propertiesFilename, String modelsReplacementPattern, + String resourcesReplacementPattern, String modelsPath, String resourcesPath) + throws FileNotFoundException, IOException { + + // check the models path. + if (resourcesPath != null && resourcesPath.length() > 0) { + if (!resourcesPath.endsWith(File.separator)) { + resourcesPath = resourcesPath + File.separator; + } + File resourcesDirectory = new File(resourcesPath); + if (!resourcesDirectory.exists()) { + throw new FileNotFoundException("The resources directory did not exist."); + } + if (!resourcesDirectory.isDirectory()) { + throw new FileNotFoundException("The resources directory existed, but is not a directory."); + } + } + Properties properties = null; + try { + properties = checkIfExists(propertiesFilename); + } catch (IOException e) { + if (resourcesPath == null) + throw e; + // did not exist as presented in the argument, add the resourcePath, see if it's there. + propertiesFilename = resourcesPath+propertiesFilename; + properties = checkIfExists(propertiesFilename); + } + + // check the models path. 
+ if (modelsPath != null && modelsPath.length() > 0) { + if (modelsPath.length() > 0 && !modelsPath.endsWith(File.separator)) { + modelsPath = modelsPath + File.separator; + } + File modelsDirectory = new File(modelsPath); + if (!modelsDirectory.exists()) { + throw new FileNotFoundException("The models directory did not exist."); + } + if (!modelsDirectory.isDirectory()) { + throw new FileNotFoundException("The models directory existed, but is not a directory."); + } + } + + // we now have the new properties, and we have the base default props, + // merge them together modifying paths as necessary. + ResourceManager rm = new NerBaseConfigurator().getDefaultConfig(); + for (Entry entry : properties.entrySet()) { + String name = (String)entry.getKey(); + String value = (String)entry.getValue(); + if (name.equals(NerBaseConfigurator.PATH_TO_MODEL)) { + if (modelsPath != null && modelsReplacementPattern != null) + value = value.replace(modelsReplacementPattern, modelsPath); + } else if (name.equals(NerBaseConfigurator.PATH_TO_GAZETTEERS)) { + if (resourcesPath != null && resourcesReplacementPattern != null) + value = value.replace(resourcesReplacementPattern, resourcesPath); + } else if (name.equals(NerBaseConfigurator.PATHS_TO_BROWN_CLUSTERS)) { + + // trickier since this is a list of paths, split on tabs or + // spaces + if (resourcesPath != null && resourcesReplacementPattern != null) { + String [] paths = value.split("[\\t ]"); + value = ""; + for (int i = 0; i < paths.length; i++) { + if (i > 0) + value += "\t"; + value += paths[i].replace(resourcesReplacementPattern, resourcesPath); + } + } + } else if (name.equals(NerBaseConfigurator.PATH_TO_TOKEN_NORM_DATA)) { + + // trickier since this is a list of paths, split on tabs or + // spaces + if (resourcesPath != null && resourcesReplacementPattern != null) { + String [] paths = value.split("[\\t ]"); + value = ""; + for (int i = 0; i < paths.length; i++) { + if (i > 0) + value += "\t"; + value += 
paths[i].replace(resourcesReplacementPattern, resourcesPath); + } + } + } + rm.getProperties().setProperty(name, value); + } + return rm; + } + + /** + * this is for testing. It also demonstrates the contract just in case + * you were wondering how this guy works. + */ + static public void main(String[] args) throws IOException { + ResourceManager rm = NERResourceManagerFactory.get("reuters.config", "#MODELS_PATH#", + "#RESOURCES_PATH#","/Users/redman/Desktop","/Users/redman/Desktop"); + for (Entry entry : rm.getProperties().entrySet()) + System.out.println(entry.getKey()+":"+entry.getValue()); + System.out.println(); + try { + rm = NERResourceManagerFactory.get("reuters.config", null, null, null, null); + for (Entry entry : rm.getProperties().entrySet()) + System.out.println(entry.getKey()+":"+entry.getValue()); + System.err.println("This configuration file should not have been found, should have thrown a FileNotFoundException."); + } catch (FileNotFoundException fnfe) { + + } + + try { + rm = NERResourceManagerFactory.get("/Users/redman/Desktop/reuters.config", null, null, null, null); + for (Entry entry : rm.getProperties().entrySet()) + System.out.println(entry.getKey()+":"+entry.getValue()); + } catch (FileNotFoundException fnfe) { + + } + + } +} diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java index 304c64eca..3def472e4 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerBenchmark.java @@ -83,6 +83,18 @@ public class NerBenchmark { * model and continue training from there. */ private boolean incremental = false; + /** The resources replacement pattern. */ + private String resourcesReplacementPattern = null; + + /** The models replacement pattern. */ + private String modelsReplacementPattern = null; + + /** The resources directory. 
*/ + private String resourcesDirectory = null; + + /** The models directory. */ + private String modelsDirectory = null; + /** * all default. */ @@ -223,6 +235,66 @@ public NerBenchmark setIncremental(boolean incremental) { } /** + * @return the resourcesReplacementPattern + */ + public String getResourcesReplacementPattern() { + return resourcesReplacementPattern; + } + + /** + * @param resourcesReplacementPattern the resourcesReplacementPattern to set + */ + public NerBenchmark setResourcesReplacementPattern(String resourcesReplacementPattern) { + this.resourcesReplacementPattern = resourcesReplacementPattern; + return this; + } + + /** + * @return the modelsReplacementPattern + */ + public String getModelsReplacementPattern() { + return modelsReplacementPattern; + } + + /** + * @param modelsReplacementPattern the modelsReplacementPattern to set + */ + public NerBenchmark setModelsReplacementPattern(String modelsReplacementPattern) { + this.modelsReplacementPattern = modelsReplacementPattern; + return this; + } + + /** + * @return the resourcesDirectory + */ + public String getResourcesDirectory() { + return resourcesDirectory; + } + + /** + * @param resourcesDirectory the resourcesDirectory to set + */ + public NerBenchmark setResourcesDirectory(String resourcesDirectory) { + this.resourcesDirectory = resourcesDirectory; + return this; + } + + /** + * @return the modelsDirectory + */ + public String getModelsDirectory() { + return modelsDirectory; + } + + /** + * @param modelsDirectory the modelsDirectory to set + */ + public NerBenchmark setModelsDirectory(String modelsDirectory) { + this.modelsDirectory = modelsDirectory; + return this; + } + + /** * for the builder design pattern, less the factory, which I consider just a waste of * time. * @return a default instance of an NerBenchmark. 
@@ -291,7 +363,9 @@ public static void main(String[] args) throws Exception { break; } } - NerBenchmark bm = NerBenchmark.build().setIncremental(incremental).setIterations(iterations).setOutput(output).setRelease(release).setReportFeatures(reportFeatures).setSkiptraining(skiptraining).setReportLabels(reportLabels).setVerbose(verbose); + NerBenchmark bm = NerBenchmark.build().setIncremental(incremental).setIterations(iterations) + .setOutput(output).setRelease(release).setReportFeatures(reportFeatures) + .setSkiptraining(skiptraining).setReportLabels(reportLabels).setVerbose(verbose); // Loop over every directory within the benchmark directory. Each subdirectory will contain // a configuration file, and a directory with the test data at the very least. If there is @@ -309,7 +383,7 @@ public static void main(String[] args) throws Exception { continue; File configsDir = new File(dir + "/config/"); if (!configsDir.exists()) { - System.err.println("There was no config file in " + configsDir); + System.err.println("There was no config directory in " + configsDir); continue; } @@ -332,6 +406,9 @@ public boolean accept(File dir, String name) { return name.endsWith(".config"); } }); + if (configfiles.length == 0) { + System.err.println("There was no config file in " + configsDir); + } for (String confFile : configfiles) { confFile = dir + "/config/" + confFile; bm.execute(confFile, trainDirName, trainDir, devDirName, devDir, testDirName, testDir); @@ -354,11 +431,19 @@ public boolean accept(File dir, String name) { */ public Vector execute(String confFile, String trainDirName, File trainDir, String devDirName, File devDir, String testDirName, File testDir) throws Exception { if (!skiptraining) { - if (trainDir.exists() && testDir.exists() && devDir.exists()) { - return trainModel(confFile, trainDirName, trainDir, devDirName, devDir, testDirName, testDir); - } else { - System.err.println("Training requires a \"train\", \"test\" and \"dev\" subdirectory!"); - } + if 
(trainDir.exists()) { + if (testDir.exists()) { + if (devDir.exists()) { + return trainModel(confFile, trainDirName, trainDir, devDirName, devDir, testDirName, testDir); + } else { + System.err.println("Dev directory, required for training, did not exist : "+devDir); + } + } else { + System.err.println("Test directory, required for training, did not exist : "+testDir); + } + } else { + System.err.println("Train directory, required for training, did not exist : "+trainDir); + } } else if (!release) { // if not training, and not build a release model, we are just reporting the accuracy of the existing @@ -372,8 +457,9 @@ public Vector execute(String confFile, String trainDirName, File // dev data using the number of iterations to determine a stopping point rather than using the dev set // for that. if (trainDir.exists() && testDir.exists() && devDir.exists()) { - ParametersForLbjCode prms = Parameters.readConfigAndLoadExternalData(confFile, true); - ResourceManager rm = new ResourceManager(confFile); + ResourceManager rm = NERResourceManagerFactory.get(confFile, this.modelsReplacementPattern, this.resourcesReplacementPattern, + this.modelsDirectory, this.resourcesDirectory); + ParametersForLbjCode prms = Parameters.readAndLoadConfig(rm, true); ModelLoader.load(rm, rm.getString("modelName"), true, prms); System.out.println("\n\n----- Building a final model for " + confFile + " ------"); @@ -400,8 +486,9 @@ public Vector execute(String confFile, String trainDirName, File */ private void reportResults(String confFile, String testDirName) throws Exception { System.out.println("\n\n----- Reporting results from existing models for " + confFile + " ------"); - ParametersForLbjCode prms = Parameters.readConfigAndLoadExternalData(confFile, !skiptraining); - ResourceManager rm = new ResourceManager(confFile); + ResourceManager rm = NERResourceManagerFactory.get(confFile, this.modelsReplacementPattern, this.resourcesReplacementPattern, + this.modelsDirectory, 
this.resourcesDirectory); + ParametersForLbjCode prms = Parameters.readAndLoadConfig(rm, !skiptraining); ModelLoader.load(rm, rm.getString("modelName"), !skiptraining, prms); System.out.println("Benchmark against configuration : " + confFile); if (reportLabels) @@ -429,8 +516,9 @@ else if (reportFeatures) */ private Vector trainModel(String confFile, String trainDirName, File trainDir, String devDirName, File devDir, String testDirName, File testDir) throws Exception { System.out.println("\n\n----- Training models for evaluation for " + confFile + " ------"); - ParametersForLbjCode prms = Parameters.readConfigAndLoadExternalData(confFile, true); - ResourceManager rm = new ResourceManager(confFile); + ResourceManager rm = NERResourceManagerFactory.get(confFile, this.modelsReplacementPattern, this.resourcesReplacementPattern, + this.modelsDirectory, this.resourcesDirectory); + ParametersForLbjCode prms = Parameters.readAndLoadConfig(rm, true); ModelLoader.load(rm, rm.getString("modelName"), true, prms); NETaggerLevel1 taggerLevel1 = (NETaggerLevel1) prms.taggerLevel1; NETaggerLevel2 taggerLevel2 = (NETaggerLevel2) prms.taggerLevel2; diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java index 9d526c9ea..3bcbc3600 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ParsingProcessingData/ColumnFileReader.java @@ -8,6 +8,7 @@ package edu.illinois.cs.cogcomp.ner.ParsingProcessingData; +import edu.illinois.cs.cogcomp.ner.IO.ResourceUtilities; import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument; import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode; @@ -18,8 +19,12 @@ import java.io.File; import java.util.ArrayList; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + class ColumnFileReader extends ColumnFormat { + private static Logger logger = LoggerFactory.getLogger(ColumnFileReader.class); String filename = null; ParametersForLbjCode params = null; public ColumnFileReader(String file, ParametersForLbjCode params) { @@ -60,7 +65,14 @@ public Object next() { } LinkedVector res = new LinkedVector(); - NEWord w = new NEWord(new Word(token, pos), null, label); + Word word = null; + try { + word = new Word(token, pos); + } catch (Throwable t) { + logger.error("A POS tag was bad in this file : "+filename); + throw t; + } + NEWord w = new NEWord(word, null, label); NEWord.addTokenToSentence(res, w.form, w.neLabel, params, fileType); for (line = (String[]) super.next(); line != null && line.length > 0; line = (String[]) super.next()) { diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java new file mode 100644 index 000000000..9482f46f8 --- /dev/null +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java @@ -0,0 +1,56 @@ +/** + * + */ +package edu.illinois.cs.cogcomp.ner; + +import static org.junit.Assert.*; + +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.junit.Test; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator; + +/** + * The resource manager factory can replace user supplied patterns in the resource file + * with the user provided paths. In this way, models and resources can move around without regard for + * how they are qualified in the properties file. + * @author redman + */ +public class TestNERResourceManagerFactory { + + /** + * Test the NERResourceManagerFactory. 
+ */ + @Test + public void test() throws IOException { + ResourceManager rm = NERResourceManagerFactory.get("edu/illinois/cs/cogcomp/ner/reuters.config", "#MODELS_PATH#", + "#RESOURCES_PATH#","",""); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_MODEL), "models/reuters"); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_GAZETTEERS), "gazetteers"); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_TOKEN_NORM_DATA), "brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); + rm = NERResourceManagerFactory.get("edu/illinois/cs/cogcomp/ner/reuters.config", "#MODELS_PATH#", + "#RESOURCES_PATH#",".","."); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_MODEL), "./models/reuters"); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_GAZETTEERS), "./gazetteers"); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_TOKEN_NORM_DATA), "./brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); + + try { + rm = NERResourceManagerFactory.get("reuters.config", null, null, null, null); + fail("This configuration file should not have been found, should have thrown a FileNotFoundException."); + } catch (FileNotFoundException fnfe) { + } + + try { + rm = NERResourceManagerFactory.get("edu/illinois/cs/cogcomp/ner/reuters.config", null, null, null, null); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_MODEL), "#MODELS_PATH#models/reuters"); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_GAZETTEERS), "#RESOURCES_PATH#gazetteers"); + assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_TOKEN_NORM_DATA), "#RESOURCES_PATH#brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); + } catch (FileNotFoundException fnfe) { + throw fnfe; + } + } + +} diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reuters.config b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reuters.config new file mode 100644 index 000000000..0da1591b3 --- /dev/null +++ 
b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reuters.config @@ -0,0 +1,20 @@ +# Required fields +modelName reuters +pathToModelFile #MODELS_PATH#models/reuters +pathsToBrownClusters #RESOURCES_PATH#brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt #RESOURCES_PATH#brown-clusters/brownBllipClusters #RESOURCES_PATH#brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt +pathToTokenNormalizationData #RESOURCES_PATH#brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt +pathToGazetteersLists #RESOURCES_PATH#gazetteers + +# Optional fields +labelTypes PER ORG LOC MISC + +# noise level +randomNoiseLevel 0.0 +omissionRate 0.0 + +# parameter sweep reveals these to be the best params, L2 model is best. +learningRatePredictionsLevel1 .05 +thicknessPredictionsLevel1 30 +learningRatePredictionsLevel2 .05 +thicknessPredictionsLevel2 30 + From 67ed64019d1a5e6b45d1560158f4f99b16a9bdac Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Thu, 19 Sep 2019 09:51:24 -0500 Subject: [PATCH 12/16] Fixed a performance issue adding constituents to a view, also some fixes to the NER resource manager stuff. 
--- big-data-utils/pom.xml | 4 +- chunker/pom.xml | 8 ++-- commasrl/pom.xml | 20 +++++----- core-utilities/pom.xml | 2 +- .../datastructures/textannotation/View.java | 8 ++-- corpusreaders/pom.xml | 6 +-- curator/pom.xml | 4 +- dataless-classifier/pom.xml | 6 +-- depparse/pom.xml | 12 +++--- edison/pom.xml | 8 ++-- external/clausie/pom.xml | 4 +- external/external-commons/pom.xml | 6 +-- external/path-lstm/pom.xml | 6 +-- external/stanford_3.3.1/pom.xml | 6 +-- external/stanford_3.8.0/pom.xml | 6 +-- inference/pom.xml | 4 +- lbjava-nlp-tools/pom.xml | 4 +- lemmatizer/pom.xml | 6 +-- md/pom.xml | 14 +++---- ner/pom.xml | 8 ++-- .../illinois/cs/cogcomp/ner/NERAnnotator.java | 8 +++- .../cs/cogcomp/ner/NERAnnotatorTest.java | 10 ++++- .../cs/cogcomp/ner/NerLabelPruningTest.java | 3 -- .../ner/TestNERResourceManagerFactory.java | 6 +-- .../resource_manager_test.config} | 0 pipeline-client/pom.xml | 4 +- pipeline/pom.xml | 38 +++++++++---------- pom.xml | 2 +- pos/pom.xml | 2 +- prepsrl/pom.xml | 14 +++---- quantifier/pom.xml | 12 +++--- question-type/pom.xml | 10 ++--- relation-extraction/pom.xml | 18 ++++----- similarity/pom.xml | 6 +-- temporal-normalizer/pom.xml | 14 +++---- tokenizer/pom.xml | 8 ++-- transliteration/pom.xml | 6 +-- verbsense/pom.xml | 16 ++++---- 38 files changed, 165 insertions(+), 154 deletions(-) rename ner/src/test/{java/edu/illinois/cs/cogcomp/ner/reuters.config => resources/resource_manager_test.config} (100%) diff --git a/big-data-utils/pom.xml b/big-data-utils/pom.xml index 1baee004b..393f510b8 100644 --- a/big-data-utils/pom.xml +++ b/big-data-utils/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -23,7 +23,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 org.xeustechnologies.google-api diff --git a/chunker/pom.xml b/chunker/pom.xml index 7a66ff4f5..174a3ad55 100644 --- a/chunker/pom.xml +++ b/chunker/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 
4.0.16 + 4.0.17 4.0.0 @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 @@ -24,12 +24,12 @@ edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp diff --git a/commasrl/pom.xml b/commasrl/pom.xml index af3ac75aa..371dfe8f5 100644 --- a/commasrl/pom.xml +++ b/commasrl/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -35,48 +35,48 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 true edu.illinois.cs.cogcomp illinois-curator - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-inference - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-ner - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-chunker - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp diff --git a/core-utilities/pom.xml b/core-utilities/pom.xml index 464336dfc..7d0ad4328 100644 --- a/core-utilities/pom.xml +++ b/core-utilities/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-core-utilities diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java index f5bded81a..3e8e6cb08 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java @@ -137,7 +137,8 @@ private void removeAllTokenFromConstituentMapping(Constituent c) { /** * Convenience method for addConstituent(constituent, false) - * @param constituent + * + * @param constituent The new constituent to 
be added. */ public void addConstituent(Constituent constituent){ this.addConstituent(constituent, false); @@ -148,9 +149,10 @@ public void addConstituent(Constituent constituent){ * Otherwise, we return the new constituent. * * @param constituent The new constituent to be added. + * @param force if true, add constituent even if it is a duplicate */ public void addConstituent(Constituent constituent, boolean force) { - if(!constituents.contains(constituent) || force) { + if(force || this.tokensToConstituents[constituent.getStartSpan()] == null || !constituents.contains(constituent)) { constituents.add(constituent); startSpan = Math.min(this.startSpan, constituent.getStartSpan()); @@ -161,7 +163,7 @@ public void addConstituent(Constituent constituent, boolean force) { this.addTokenToConstituentMapping(token, constituent); } } - }else { + } else { System.err.println("Warning (View.java): not adding duplicate Constituent: " + constituent + ", use addConstituent(c, true) to force add."); } } diff --git a/corpusreaders/pom.xml b/corpusreaders/pom.xml index d36356c42..6589141f9 100644 --- a/corpusreaders/pom.xml +++ b/corpusreaders/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-corpusreaders @@ -15,12 +15,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 org.slf4j diff --git a/curator/pom.xml b/curator/pom.xml index 3a091244e..2ef4ef177 100644 --- a/curator/pom.xml +++ b/curator/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-curator @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 diff --git a/dataless-classifier/pom.xml b/dataless-classifier/pom.xml index 8cc236a9e..b04467c65 100644 --- a/dataless-classifier/pom.xml +++ b/dataless-classifier/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -21,12 +21,12 @@ 
edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 org.slf4j diff --git a/depparse/pom.xml b/depparse/pom.xml index 039036e40..f996a843a 100644 --- a/depparse/pom.xml +++ b/depparse/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-depparse @@ -16,27 +16,27 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-chunker - 4.0.16 + 4.0.17 diff --git a/edison/pom.xml b/edison/pom.xml index ab802c485..723d577c9 100644 --- a/edison/pom.xml +++ b/edison/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-edison @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 @@ -80,13 +80,13 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-curator - 4.0.16 + 4.0.17 test diff --git a/external/clausie/pom.xml b/external/clausie/pom.xml index 55b8899a1..ec7772a47 100644 --- a/external/clausie/pom.xml +++ b/external/clausie/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 ../../pom.xml @@ -24,7 +24,7 @@ edu.illinois.cs.cogcomp external-commons - 4.0.16 + 4.0.17 org.slf4j diff --git a/external/external-commons/pom.xml b/external/external-commons/pom.xml index 452b8dd52..f4c515377 100644 --- a/external/external-commons/pom.xml +++ b/external/external-commons/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 org.cogcomp diff --git a/external/path-lstm/pom.xml b/external/path-lstm/pom.xml index 
5264be26c..11e84c72e 100644 --- a/external/path-lstm/pom.xml +++ b/external/path-lstm/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp external-commons - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 org.cogcomp diff --git a/external/stanford_3.3.1/pom.xml b/external/stanford_3.3.1/pom.xml index 75091903e..6797bf324 100644 --- a/external/stanford_3.3.1/pom.xml +++ b/external/stanford_3.3.1/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 ../../pom.xml @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 @@ -36,7 +36,7 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 diff --git a/external/stanford_3.8.0/pom.xml b/external/stanford_3.8.0/pom.xml index 0af867a59..0de9806e3 100644 --- a/external/stanford_3.8.0/pom.xml +++ b/external/stanford_3.8.0/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp external-commons - 4.0.16 + 4.0.17 org.slf4j diff --git a/inference/pom.xml b/inference/pom.xml index 5e1a770ef..8234a9b7d 100644 --- a/inference/pom.xml +++ b/inference/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 jar @@ -22,7 +22,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp diff --git a/lbjava-nlp-tools/pom.xml b/lbjava-nlp-tools/pom.xml index eb5650c49..319bd0748 100644 --- a/lbjava-nlp-tools/pom.xml +++ b/lbjava-nlp-tools/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 LBJava-NLP-tools @@ -30,7 +30,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 org.slf4j diff --git a/lemmatizer/pom.xml b/lemmatizer/pom.xml index cf9f5843d..a42632e57 100644 --- 
a/lemmatizer/pom.xml +++ b/lemmatizer/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-lemmatizer @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.stanford.nlp diff --git a/md/pom.xml b/md/pom.xml index c46a77007..a64380b64 100644 --- a/md/pom.xml +++ b/md/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -25,32 +25,32 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-ner - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.16 + 4.0.17 org.slf4j diff --git a/ner/pom.xml b/ner/pom.xml index 7fdc5890e..e1985bfd8 100644 --- a/ner/pom.xml +++ b/ner/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-ner @@ -23,12 +23,12 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 org.cogcomp @@ -44,7 +44,7 @@ edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.16 + 4.0.17 org.slf4j diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java index d247b6194..55492d8ba 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java @@ -27,6 +27,7 @@ import edu.illinois.cs.cogcomp.annotation.Annotator; import edu.illinois.cs.cogcomp.annotation.AnnotatorConfigurator; import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence; 
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView; import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; @@ -55,6 +56,10 @@ */ public class NERAnnotator extends Annotator { + /** name of attribute containing the raw score value represented as a string. This value is + * not normalized in any way, it is the value produced by the perceptron. */ + final static public String RAW_SCORE_ATTRIBUTE = "RawScore"; + /** our specific logger. */ private final Logger logger = LoggerFactory.getLogger(NERAnnotator.class); @@ -230,7 +235,8 @@ public void addView(TextAnnotation ta) { int e = tokenindices[endIndex]; if (e <= s) e = s + 1; - nerView.addSpanLabel(s, e, label, neWord.getScore()); + Constituent tokenlabel = nerView.addSpanLabel(s, e, label, neWord.getScore()); + tokenlabel.addAttribute(RAW_SCORE_ATTRIBUTE, Float.toString(neWord.getRawScore())); open = false; } } diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NERAnnotatorTest.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NERAnnotatorTest.java index e9b957220..2dd953e4a 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NERAnnotatorTest.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NERAnnotatorTest.java @@ -139,8 +139,7 @@ public class NERAnnotatorTest { * See if we get the right entities back. 
TODO: MS removed @Test annotation as this test * currently fails, but benchmark performance is good */ - - + @Test public void testResults() { TextAnnotation ta = tab.createTextAnnotation(TEST_INPUT); View view = null; @@ -152,6 +151,13 @@ public void testResults() { } for (Constituent c : view.getConstituents()) { assertTrue("No entity named \"" + c.toString() + "\"", entities.contains(c.toString())); + String raw = c.getAttribute(NERAnnotator.RAW_SCORE_ATTRIBUTE); + assertTrue(raw != null); + try { + Float.parseFloat(raw); + } catch (NumberFormatException nfe) { + fail("The raw score was not a number."); + } } } diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java index 301330959..a8ccc2103 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/NerLabelPruningTest.java @@ -53,9 +53,6 @@ public void testOntonotesNer() { fail(e.getMessage()); } View v = taOnto.getView(nerOntonotes.getViewName()); - for (Constituent c : v.getConstituents()) { - System.out.println(c+" = "+c.getLabel()+" : "+c.getConstituentScore()); - } assertEquals(3, v.getConstituents().size()); } } \ No newline at end of file diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java index 9482f46f8..d45402c8f 100644 --- a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java +++ b/ner/src/test/java/edu/illinois/cs/cogcomp/ner/TestNERResourceManagerFactory.java @@ -26,12 +26,12 @@ public class TestNERResourceManagerFactory { */ @Test public void test() throws IOException { - ResourceManager rm = NERResourceManagerFactory.get("edu/illinois/cs/cogcomp/ner/reuters.config", "#MODELS_PATH#", + ResourceManager rm = NERResourceManagerFactory.get("resource_manager_test.config", 
"#MODELS_PATH#", "#RESOURCES_PATH#","",""); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_MODEL), "models/reuters"); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_GAZETTEERS), "gazetteers"); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_TOKEN_NORM_DATA), "brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); - rm = NERResourceManagerFactory.get("edu/illinois/cs/cogcomp/ner/reuters.config", "#MODELS_PATH#", + rm = NERResourceManagerFactory.get("resource_manager_test.config", "#MODELS_PATH#", "#RESOURCES_PATH#",".","."); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_MODEL), "./models/reuters"); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_GAZETTEERS), "./gazetteers"); @@ -44,7 +44,7 @@ public void test() throws IOException { } try { - rm = NERResourceManagerFactory.get("edu/illinois/cs/cogcomp/ner/reuters.config", null, null, null, null); + rm = NERResourceManagerFactory.get("resource_manager_test.config", null, null, null, null); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_MODEL), "#MODELS_PATH#models/reuters"); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_GAZETTEERS), "#RESOURCES_PATH#gazetteers"); assertEquals(rm.getString(NerBaseConfigurator.PATH_TO_TOKEN_NORM_DATA), "#RESOURCES_PATH#brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt"); diff --git a/ner/src/test/java/edu/illinois/cs/cogcomp/ner/reuters.config b/ner/src/test/resources/resource_manager_test.config similarity index 100% rename from ner/src/test/java/edu/illinois/cs/cogcomp/ner/reuters.config rename to ner/src/test/resources/resource_manager_test.config diff --git a/pipeline-client/pom.xml b/pipeline-client/pom.xml index cc1e27bc8..fc2cf3baa 100644 --- a/pipeline-client/pom.xml +++ b/pipeline-client/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-pipeline-client @@ -15,7 +15,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 diff 
--git a/pipeline/pom.xml b/pipeline/pom.xml index 46c4023fd..9ba34a432 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-nlp-pipeline @@ -16,57 +16,57 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-chunker - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-quantifier - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-prep-srl - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-comma - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-verbsense - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-question-typer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.16 + 4.0.17 org.cogcomp @@ -83,7 +83,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.16 + 4.0.17 org.apache.commons @@ -93,17 +93,17 @@ edu.illinois.cs.cogcomp illinois-md - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-relation-extraction - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-datalessclassification - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp @@ -131,7 +131,7 @@ edu.illinois.cs.cogcomp illinois-depparse - 4.0.16 + 4.0.17 @@ -149,12 +149,12 @@ edu.illinois.cs.cogcomp illinois-time - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-transliteration - 4.0.16 + 4.0.17 diff --git a/pom.xml b/pom.xml index 3fc1bacee..93657ffa1 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp illinois-cogcomp-nlp pom - 4.0.16 + 4.0.17 core-utilities tokenizer diff --git a/pos/pom.xml b/pos/pom.xml index bd194e715..4f5c8f2bd 100644 --- a/pos/pom.xml +++ b/pos/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-pos diff --git a/prepsrl/pom.xml b/prepsrl/pom.xml index eb3ff9be3..5cc688bba 100644 --- a/prepsrl/pom.xml 
+++ b/prepsrl/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -15,32 +15,32 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-depparse - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.16 + 4.0.17 diff --git a/quantifier/pom.xml b/quantifier/pom.xml index 13fe1bdf5..036a2f426 100644 --- a/quantifier/pom.xml +++ b/quantifier/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-quantifier @@ -35,31 +35,31 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 compile edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 compile edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 compile edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 compile edu.illinois.cs.cogcomp illinois-curator - 4.0.16 + 4.0.17 compile diff --git a/question-type/pom.xml b/question-type/pom.xml index 75d7c9c73..74b1e1eaa 100644 --- a/question-type/pom.xml +++ b/question-type/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-question-typer @@ -13,22 +13,22 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pipeline-client - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp diff --git a/relation-extraction/pom.xml b/relation-extraction/pom.xml index 37123ce5f..4636515a2 100644 --- a/relation-extraction/pom.xml +++ b/relation-extraction/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -33,42 +33,42 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp 
illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-ner - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-md - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-chunker - 4.0.16 + 4.0.17 joda-time diff --git a/similarity/pom.xml b/similarity/pom.xml index 17fd976cd..23360568d 100644 --- a/similarity/pom.xml +++ b/similarity/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-similarity @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 org.slf4j @@ -55,7 +55,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp diff --git a/temporal-normalizer/pom.xml b/temporal-normalizer/pom.xml index 033d290c6..a7429e35b 100644 --- a/temporal-normalizer/pom.xml +++ b/temporal-normalizer/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 illinois-time @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp @@ -28,7 +28,7 @@ edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 org.apache.uima @@ -43,12 +43,12 @@ edu.illinois.cs.cogcomp illinois-chunker - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-curator - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp @@ -58,7 +58,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.16 + 4.0.17 test @@ -86,7 +86,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.16 + 4.0.17 diff --git a/tokenizer/pom.xml b/tokenizer/pom.xml index 7c885b588..7fc56aea7 100644 --- a/tokenizer/pom.xml +++ b/tokenizer/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 illinois-tokenizer @@ -15,17 +15,17 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.16 + 4.0.17 
edu.illinois.cs.cogcomp illinois-curator - 4.0.16 + 4.0.17 test diff --git a/transliteration/pom.xml b/transliteration/pom.xml index 7865306bd..6c7300b09 100644 --- a/transliteration/pom.xml +++ b/transliteration/pom.xml @@ -5,7 +5,7 @@ http://www.w3.org/2001/XMLSchema-instance "> illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 @@ -20,12 +20,12 @@ http://www.w3.org/2001/XMLSchema-instance "> edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 org.apache.commons diff --git a/verbsense/pom.xml b/verbsense/pom.xml index 9959893f1..7025775ee 100755 --- a/verbsense/pom.xml +++ b/verbsense/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.16 + 4.0.17 4.0.0 illinois-verbsense @@ -18,37 +18,37 @@ edu.illinois.cs.cogcomp illinois-edison - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-pos - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-ner - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-chunker - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp illinois-inference - 4.0.16 + 4.0.17 edu.illinois.cs.cogcomp From f526509c8a781adbb626cfb79fb5895166f14e2a Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Fri, 25 Oct 2019 13:21:45 -0500 Subject: [PATCH 13/16] Caching paths was causing a race condition, and since I suspect adding the corrective synchronizations would counteract any already small performance benefits from the caching, I simply removed it. 
--- .../ner/ExpressiveFeatures/BrownClusters.java | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java index 7d52d4396..c3fe1b174 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java @@ -13,7 +13,6 @@ import edu.illinois.cs.cogcomp.ner.IO.InFile; import edu.illinois.cs.cogcomp.ner.LbjTagger.Data; import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; -import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode; import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; import gnu.trove.map.hash.THashMap; import io.minio.errors.InvalidEndpointException; @@ -48,13 +47,7 @@ private BrownClusters() { /** clusters store, keyed on catenated paths. */ static private HashMap clusters = new HashMap<>(); - - /** Predetermined number of words in these caches. */ - final private int INITIAL_CACHE_SIZE = 40000; - - /** this maps a word to a set of feature names. */ - private THashMap cache = new THashMap (INITIAL_CACHE_SIZE); - + /** * Makes a unique key based on the paths, for storage in a hashmap. * @param pathsToClusterFiles the paths. @@ -190,12 +183,6 @@ final public String[] getPrefixes(NEWord w) { final public String[] getPrefixes(String word) { - // if we have already encountered this, it's cached, try that first. - String[] cachedPath = cache.get(word); - if (cachedPath != null) { - return cachedPath; - } - // not cached. 
ArrayList v = new ArrayList<>(wordToPathByResource.size()); for (int j = 0; j < wordToPathByResource.size(); j++) { @@ -216,9 +203,6 @@ final public String[] getPrefixes(String word) { } String[] res = new String[v.size()]; res = v.toArray(res); - if (res.length > 0) { - cache.put(word, res); - } return res; } @@ -231,12 +215,6 @@ final public String getPrefixesCombined(String word) { return ret; } - private static void printArr(String[] arr) { - for (String anArr : arr) - logger.info(" " + anArr); - logger.info(""); - } - final public void printOovData(Data data) { HashMap tokensHash = new HashMap<>(); HashMap tokensHashIC = new HashMap<>(); From c527e3c27d836aa73a1d2aaeef02b43366610365 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Tue, 12 Nov 2019 17:28:28 -0600 Subject: [PATCH 14/16] Added methods to purge all memory for brown clusters and gazetteers. --- .../ner/ExpressiveFeatures/BrownClusters.java | 46 +++++++++++++++++-- .../ExpressiveFeatures/GazetteersFactory.java | 17 +++++++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java index c3fe1b174..8c7dbff73 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java @@ -9,10 +9,14 @@ import org.cogcomp.Datastore; import org.cogcomp.DatastoreException; + +import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder; import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; import edu.illinois.cs.cogcomp.ner.IO.InFile; import edu.illinois.cs.cogcomp.ner.LbjTagger.Data; import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder; import 
edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector; import gnu.trove.map.hash.THashMap; import io.minio.errors.InvalidEndpointException; @@ -47,7 +51,10 @@ private BrownClusters() { /** clusters store, keyed on catenated paths. */ static private HashMap clusters = new HashMap<>(); - + /** this is just to test the tokenizer produces same as the splitter. */ + static private TextAnnotationBuilder tokenizer = new TokenizerTextAnnotationBuilder(new StatefulTokenizer()); + + /** * Makes a unique key based on the paths, for storage in a hashmap. * @param pathsToClusterFiles the paths. @@ -115,7 +122,22 @@ public static BrownClusters get(Vector pathsToClusterFiles, Vector= thresholds.elementAt(i)) { - h.put(word, path); + if (!word.contains("-")) + h.put(word, path); + else { + try { + if (tokenizer.createTextAnnotation(word).getTokens().length == 1) { + + // the word contained a dash but tokenized to a single word. + h.put(word, path); + } else { + System.out.println("2&&& \""+line+"\" tokenized differently for brown clusters."); + } + } catch (Throwable t) { + t.printStackTrace(); + System.out.println("2&&& \""+line+"\" produced an exception."); + } + } } line = in.readLine(); } @@ -243,6 +265,24 @@ final public void printOovData(Data data) { } } } - } + + /** + * Purge all brown cluster data, clearing memory. + */ + static public void reset() { + clusters = new HashMap<>(); + } + + /** + * Purge all brown cluster data, clearing memory. + */ + static public void purge(Vector pathsToClusterFiles) { + synchronized (INIT_SYNC) { + // first check for a cluster already loaded for this data. 
+ String key = getKey(pathsToClusterFiles); + clusters.remove(key); + } + } + } diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java index 565741c15..25a502a2b 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java @@ -10,6 +10,7 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; +import java.util.Vector; import edu.illinois.cs.cogcomp.core.constants.Language; @@ -53,4 +54,20 @@ static public Gazetteers get(int maxPhraseLength, String path, boolean flatgazet return gazetteers_map.get(path); } } + + /** + * Purge all gaz data, clearing memory. + */ + static public void reset() { + gazetteers_map = new HashMap<>(); + } + + /** + * Purge a single gazetteer entry for the gaz at that path. + */ + static public void purge(String path) { + synchronized (GAZ_INIT_LOCK) { + gazetteers_map.remove(path); + } + } } From 67cf849bb74ce4229508d8ceb4d9784d68b9c616 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Tue, 3 Dec 2019 13:01:29 -0600 Subject: [PATCH 15/16] I had checked in some println with brown clusters erroneously, and there was a long-standing bug in ExceptionlessInputStream that would leave a compressed zip file open after loading the model. The ZipFile object should be closed explicitly, and was not.
--- .../vectors/ExceptionlessInputStream.java | 14 +++++++++----- .../ner/ExpressiveFeatures/BrownClusters.java | 17 +---------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java index e2e810ce8..2a34e7e60 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java @@ -34,8 +34,8 @@ public class ExceptionlessInputStream extends FilterInputStream { private char[] chars = null; /** The underlying data input stream. */ private DataInputStream dis; - - + /** if there is a zip stream, we must close it, closing the resulting stream does not close the file. */ + private ZipFile zipfile = null; /** * Opens a buffered (and uncompressed) stream for reading from the specified file. 
* @@ -70,9 +70,9 @@ public static ExceptionlessInputStream openCompressedStream(String filename) { try { ZipFile zip = new ZipFile(filename); - eis = - new ExceptionlessInputStream(new BufferedInputStream(zip.getInputStream(zip + eis = new ExceptionlessInputStream(new BufferedInputStream(zip.getInputStream(zip .getEntry(zipEntryName)))); + eis.zipfile = zip; } catch (Exception e) { System.err.println("Can't open '" + filename + "' for input:"); e.printStackTrace(); @@ -160,7 +160,11 @@ private void handleException(Exception e) { **/ public void close() { try { - dis.close(); + dis.close(); + if (zipfile != null) { + zipfile.close(); + zipfile = null; + } } catch (Exception e) { System.err.println("Can't close input stream:"); e.printStackTrace(); diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java index 8c7dbff73..49335c001 100644 --- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java @@ -122,22 +122,7 @@ public static BrownClusters get(Vector pathsToClusterFiles, Vector= thresholds.elementAt(i)) { - if (!word.contains("-")) - h.put(word, path); - else { - try { - if (tokenizer.createTextAnnotation(word).getTokens().length == 1) { - - // the word contained a dash but tokenized to a single word. - h.put(word, path); - } else { - System.out.println("2&&& \""+line+"\" tokenized differently for brown clusters."); - } - } catch (Throwable t) { - t.printStackTrace(); - System.out.println("2&&& \""+line+"\" produced an exception."); - } - } + h.put(word, path); } line = in.readLine(); } From 418452cdef55f96736ba9424c1cb9abd6d9a4758 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Tue, 10 Dec 2019 12:28:03 -0600 Subject: [PATCH 16/16] Model training cache files were not being deleted correctly, but now are. 
--- big-data-utils/pom.xml | 4 +- chunker/pom.xml | 10 ++--- commasrl/pom.xml | 22 +++++----- core-utilities/pom.xml | 2 +- corpusreaders/pom.xml | 6 +-- curator/pom.xml | 4 +- dataless-classifier/pom.xml | 6 +-- depparse/pom.xml | 12 +++--- edison/pom.xml | 10 ++--- external/clausie/pom.xml | 4 +- external/external-commons/pom.xml | 6 +-- external/path-lstm/pom.xml | 6 +-- external/stanford_3.3.1/pom.xml | 6 +-- external/stanford_3.8.0/pom.xml | 6 +-- inference/pom.xml | 4 +- lbjava-nlp-tools/pom.xml | 6 +-- lemmatizer/pom.xml | 6 +-- md/pom.xml | 18 ++++----- ner/pom.xml | 10 ++--- .../LbjTagger/LearningCurveMultiDataset.java | 40 ++++++++++++------- pipeline-client/pom.xml | 4 +- pipeline/pom.xml | 38 +++++++++--------- pom.xml | 2 +- pos/pom.xml | 4 +- prepsrl/pom.xml | 16 ++++---- quantifier/pom.xml | 14 +++---- question-type/pom.xml | 12 +++--- relation-extraction/pom.xml | 20 +++++----- similarity/pom.xml | 6 +-- temporal-normalizer/pom.xml | 16 ++++---- tokenizer/pom.xml | 8 ++-- transliteration/pom.xml | 6 +-- verbsense/pom.xml | 16 ++++---- 33 files changed, 181 insertions(+), 169 deletions(-) diff --git a/big-data-utils/pom.xml b/big-data-utils/pom.xml index 393f510b8..96a703e17 100644 --- a/big-data-utils/pom.xml +++ b/big-data-utils/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -23,7 +23,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 org.xeustechnologies.google-api diff --git a/chunker/pom.xml b/chunker/pom.xml index 174a3ad55..6e72ebc4d 100644 --- a/chunker/pom.xml +++ b/chunker/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -13,23 +13,23 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp diff --git a/commasrl/pom.xml 
b/commasrl/pom.xml index 371dfe8f5..7f07500a6 100644 --- a/commasrl/pom.xml +++ b/commasrl/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -13,7 +13,7 @@ UTF-8 UTF-8 - 1.2.26 + 1.3.3 @@ -35,48 +35,48 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 true edu.illinois.cs.cogcomp illinois-curator - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-inference - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-ner - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-chunker - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp diff --git a/core-utilities/pom.xml b/core-utilities/pom.xml index 7d0ad4328..fee1df17e 100644 --- a/core-utilities/pom.xml +++ b/core-utilities/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-core-utilities diff --git a/corpusreaders/pom.xml b/corpusreaders/pom.xml index 6589141f9..516dc30d7 100644 --- a/corpusreaders/pom.xml +++ b/corpusreaders/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-corpusreaders @@ -15,12 +15,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 org.slf4j diff --git a/curator/pom.xml b/curator/pom.xml index 2ef4ef177..3a5f9c84e 100644 --- a/curator/pom.xml +++ b/curator/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-curator @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 diff --git a/dataless-classifier/pom.xml b/dataless-classifier/pom.xml index b04467c65..5c11a9dbd 100644 --- a/dataless-classifier/pom.xml +++ b/dataless-classifier/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp 
edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -21,12 +21,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 org.slf4j diff --git a/depparse/pom.xml b/depparse/pom.xml index f996a843a..7694a46e0 100644 --- a/depparse/pom.xml +++ b/depparse/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-depparse @@ -16,27 +16,27 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-chunker - 4.0.17 + 4.0.19 diff --git a/edison/pom.xml b/edison/pom.xml index 723d577c9..6ad78978b 100644 --- a/edison/pom.xml +++ b/edison/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-edison @@ -16,7 +16,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 @@ -80,13 +80,13 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-curator - 4.0.17 + 4.0.19 test @@ -98,7 +98,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 diff --git a/external/clausie/pom.xml b/external/clausie/pom.xml index ec7772a47..8e5bcf73c 100644 --- a/external/clausie/pom.xml +++ b/external/clausie/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 ../../pom.xml @@ -24,7 +24,7 @@ edu.illinois.cs.cogcomp external-commons - 4.0.17 + 4.0.19 org.slf4j diff --git a/external/external-commons/pom.xml b/external/external-commons/pom.xml index f4c515377..2aef92289 100644 --- a/external/external-commons/pom.xml +++ b/external/external-commons/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 
edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 org.cogcomp diff --git a/external/path-lstm/pom.xml b/external/path-lstm/pom.xml index 11e84c72e..cc112a91b 100644 --- a/external/path-lstm/pom.xml +++ b/external/path-lstm/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp external-commons - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 org.cogcomp diff --git a/external/stanford_3.3.1/pom.xml b/external/stanford_3.3.1/pom.xml index 6797bf324..b89706d0f 100644 --- a/external/stanford_3.3.1/pom.xml +++ b/external/stanford_3.3.1/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 ../../pom.xml @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 @@ -36,7 +36,7 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 diff --git a/external/stanford_3.8.0/pom.xml b/external/stanford_3.8.0/pom.xml index 0de9806e3..fc029821b 100644 --- a/external/stanford_3.8.0/pom.xml +++ b/external/stanford_3.8.0/pom.xml @@ -2,7 +2,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 ../../pom.xml @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp external-commons - 4.0.17 + 4.0.19 org.slf4j diff --git a/inference/pom.xml b/inference/pom.xml index 8234a9b7d..14894c039 100644 --- a/inference/pom.xml +++ b/inference/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 jar @@ -22,7 +22,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp diff --git a/lbjava-nlp-tools/pom.xml b/lbjava-nlp-tools/pom.xml index 319bd0748..219e68c11 100644 --- a/lbjava-nlp-tools/pom.xml +++ b/lbjava-nlp-tools/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 LBJava-NLP-tools @@ -25,12 +25,12 @@ edu.illinois.cs.cogcomp 
LBJava - 1.3.2 + 1.3.3 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 org.slf4j diff --git a/lemmatizer/pom.xml b/lemmatizer/pom.xml index a42632e57..90322e575 100644 --- a/lemmatizer/pom.xml +++ b/lemmatizer/pom.xml @@ -7,7 +7,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-lemmatizer @@ -16,12 +16,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.stanford.nlp diff --git a/md/pom.xml b/md/pom.xml index a64380b64..990e6d88f 100644 --- a/md/pom.xml +++ b/md/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -20,37 +20,37 @@ edu.illinois.cs.cogcomp LBJava - 1.2.26 + 1.3.3 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-ner - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.17 + 4.0.19 org.slf4j @@ -70,7 +70,7 @@ edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.2 + 1.3.3 ${project.basedir}/src/lbj/md.lbj diff --git a/ner/pom.xml b/ner/pom.xml index e1985bfd8..d00bccc36 100644 --- a/ner/pom.xml +++ b/ner/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-ner @@ -23,12 +23,12 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 org.cogcomp @@ -39,12 +39,12 @@ edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.17 + 4.0.19 org.slf4j diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java index 719ed10ca..fac8e91cd 100644 --- 
a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java +++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java @@ -180,14 +180,15 @@ public static void getLearningCurve(Vector trainDataSet, Vector test deleteme = new File(testPathL1); if (deleteme.exists()) deleteme.delete(); - logger.info("Pre-extracting the training data for Level 1 classifier, saving to "+trainPathL1); - BatchTrainer bt1train = prefetchAndGetBatchTrainer(tagger1, trainDataSet, trainPathL1, params); - logger.info("Pre-extracting the testing data for Level 1 classifier, saving to "+testPathL1); - BatchTrainer bt1test = prefetchAndGetBatchTrainer(tagger1, testDataSet, testPathL1, params); - Parser testParser1 = bt1test.getParser(); // create the best model possible. { + logger.info("Pre-extracting the training data for Level 1 classifier, saving to "+trainPathL1); + BatchTrainer bt1train = prefetchAndGetBatchTrainer(tagger1, trainDataSet, trainPathL1, params); + logger.info("Pre-extracting the testing data for Level 1 classifier, saving to "+testPathL1); + BatchTrainer bt1test = prefetchAndGetBatchTrainer(tagger1, testDataSet, testPathL1, params); + Parser testParser1 = bt1test.getParser(); + NETaggerLevel1 saveme = null; for (int i = 0; (fixedNumIterations == -1 && i < 200 && i - bestRoundLevel1 < 10) || (fixedNumIterations > 0 && i <= fixedNumIterations); ++i) { @@ -211,13 +212,22 @@ public static void getLearningCurve(Vector trainDataSet, Vector test + bestF1Level1); } } - saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold; saveme.doneTraining(); saveme.save(); + bt1train.getParser().close(); + bt1test.getParser().close(); logger.info("Level 1; best round : " + bestRoundLevel1 + "\tbest F1 : " + bestF1Level1); } + // dispose of the L1 caching files + deleteme = new File(trainPathL1); + if (deleteme.exists()) + deleteme.delete(); + deleteme = new File(testPathL1); + if (deleteme.exists()) + 
deleteme.delete(); + // Read the best model back in, optimize by pruning useless features, then write it agains tagger1 = new NETaggerLevel1(paramLevel1, modelPath + ".level1", modelPath + ".level1.lex"); @@ -254,16 +264,16 @@ public static void getLearningCurve(Vector trainDataSet, Vector test ", thickness = "+params.thicknessPredictionsLevel2); double bestF1Level2 = -2; int bestRoundLevel2 = 0; - logger.info("Pre-extracting the training data for Level 2 classifier, saving to "+trainPathL2); - BatchTrainer bt2train = - prefetchAndGetBatchTrainer(tagger2, trainDataSet, trainPathL2, params); - logger.info("Pre-extracting the testing data for Level 2 classifier, saving to "+testPathL2); - BatchTrainer bt2test = - prefetchAndGetBatchTrainer(tagger2, testDataSet, testPathL2, params); - Parser testParser2 = bt2test.getParser(); // create the best model possible. { + logger.info("Pre-extracting the training data for Level 2 classifier, saving to "+trainPathL2); + BatchTrainer bt2train = + prefetchAndGetBatchTrainer(tagger2, trainDataSet, trainPathL2, params); + logger.info("Pre-extracting the testing data for Level 2 classifier, saving to "+testPathL2); + BatchTrainer bt2test = + prefetchAndGetBatchTrainer(tagger2, testDataSet, testPathL2, params); + Parser testParser2 = bt2test.getParser(); NETaggerLevel2 saveme = null; for (int i = 0; (fixedNumIterations == -1 && i < 200 && i - bestRoundLevel2 < 10) || (fixedNumIterations > 0 && i <= fixedNumIterations); ++i) { @@ -293,13 +303,15 @@ public static void getLearningCurve(Vector trainDataSet, Vector test saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold; saveme.doneTraining(); saveme.save(); + bt2train.getParser().close(); + bt2test.getParser().close(); } // trash the l2 prefetch data deleteme = new File(trainPathL2); if (deleteme.exists()) deleteme.delete(); - deleteme = new File(testPathL1); + deleteme = new File(testPathL2); if (deleteme.exists()) deleteme.delete(); diff --git 
a/pipeline-client/pom.xml b/pipeline-client/pom.xml index fc2cf3baa..fac687dd4 100644 --- a/pipeline-client/pom.xml +++ b/pipeline-client/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-pipeline-client @@ -15,7 +15,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 9ba34a432..04a511dc3 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-nlp-pipeline @@ -16,57 +16,57 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-chunker - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-quantifier - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-prep-srl - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-comma - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-verbsense - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-question-typer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.17 + 4.0.19 org.cogcomp @@ -83,7 +83,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.17 + 4.0.19 org.apache.commons @@ -93,17 +93,17 @@ edu.illinois.cs.cogcomp illinois-md - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-relation-extraction - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-datalessclassification - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp @@ -131,7 +131,7 @@ edu.illinois.cs.cogcomp illinois-depparse - 4.0.17 + 4.0.19 @@ -149,12 +149,12 @@ edu.illinois.cs.cogcomp illinois-time - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-transliteration - 4.0.17 + 4.0.19 diff --git a/pom.xml b/pom.xml index 93657ffa1..e2d0e1dbf 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp illinois-cogcomp-nlp pom - 4.0.17 
+ 4.0.19 core-utilities tokenizer diff --git a/pos/pom.xml b/pos/pom.xml index 4f5c8f2bd..d28216d9e 100644 --- a/pos/pom.xml +++ b/pos/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-pos @@ -19,7 +19,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 edu.illinois.cs.cogcomp diff --git a/prepsrl/pom.xml b/prepsrl/pom.xml index 5cc688bba..3dc4f762a 100644 --- a/prepsrl/pom.xml +++ b/prepsrl/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -15,32 +15,32 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-depparse - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.17 + 4.0.19 @@ -53,7 +53,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 org.slf4j diff --git a/quantifier/pom.xml b/quantifier/pom.xml index 036a2f426..195f48b49 100644 --- a/quantifier/pom.xml +++ b/quantifier/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-quantifier @@ -24,7 +24,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 junit @@ -35,31 +35,31 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 compile edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 compile edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 compile edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 compile edu.illinois.cs.cogcomp illinois-curator - 4.0.17 + 4.0.19 compile diff --git a/question-type/pom.xml b/question-type/pom.xml index 74b1e1eaa..b330b9664 100644 --- a/question-type/pom.xml +++ b/question-type/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-question-typer @@ -13,27 +13,27 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 
edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pipeline-client - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 diff --git a/relation-extraction/pom.xml b/relation-extraction/pom.xml index 4636515a2..18996525c 100644 --- a/relation-extraction/pom.xml +++ b/relation-extraction/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -28,47 +28,47 @@ edu.illinois.cs.cogcomp LBJava - 1.3.2 + 1.3.3 edu.illinois.cs.cogcomp illinois-corpusreaders - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp stanford_3.3.1 - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-ner - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-md - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-chunker - 4.0.17 + 4.0.19 joda-time diff --git a/similarity/pom.xml b/similarity/pom.xml index 23360568d..315021ec7 100644 --- a/similarity/pom.xml +++ b/similarity/pom.xml @@ -4,7 +4,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-similarity @@ -13,7 +13,7 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 org.slf4j @@ -55,7 +55,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp diff --git a/temporal-normalizer/pom.xml b/temporal-normalizer/pom.xml index a7429e35b..8cc0c0bc5 100644 --- a/temporal-normalizer/pom.xml +++ b/temporal-normalizer/pom.xml @@ -5,7 +5,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 illinois-time @@ -13,12 +13,12 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp LBJava - 1.2.24 + 1.3.3 edu.illinois.cs.cogcomp @@ -28,7 +28,7 @@ edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 org.apache.uima @@ -43,12 
+43,12 @@ edu.illinois.cs.cogcomp illinois-chunker - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-curator - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp @@ -58,7 +58,7 @@ edu.illinois.cs.cogcomp illinois-ner - 4.0.17 + 4.0.19 test @@ -86,7 +86,7 @@ edu.illinois.cs.cogcomp illinois-tokenizer - 4.0.17 + 4.0.19 diff --git a/tokenizer/pom.xml b/tokenizer/pom.xml index 7fc56aea7..a1bde7615 100644 --- a/tokenizer/pom.xml +++ b/tokenizer/pom.xml @@ -6,7 +6,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 illinois-tokenizer @@ -15,17 +15,17 @@ edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp LBJava-NLP-tools - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-curator - 4.0.17 + 4.0.19 test diff --git a/transliteration/pom.xml b/transliteration/pom.xml index 6c7300b09..c1fd89c06 100644 --- a/transliteration/pom.xml +++ b/transliteration/pom.xml @@ -5,7 +5,7 @@ http://www.w3.org/2001/XMLSchema-instance "> illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 @@ -20,12 +20,12 @@ http://www.w3.org/2001/XMLSchema-instance "> edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 org.apache.commons diff --git a/verbsense/pom.xml b/verbsense/pom.xml index 7025775ee..74a8d71e5 100755 --- a/verbsense/pom.xml +++ b/verbsense/pom.xml @@ -3,7 +3,7 @@ illinois-cogcomp-nlp edu.illinois.cs.cogcomp - 4.0.17 + 4.0.19 4.0.0 illinois-verbsense @@ -18,37 +18,37 @@ edu.illinois.cs.cogcomp illinois-edison - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-core-utilities - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-pos - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-ner - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-chunker - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-lemmatizer - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp illinois-inference - 4.0.17 + 4.0.19 edu.illinois.cs.cogcomp