diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java index f77feca47..0988ddbad 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelView.java @@ -5,9 +5,7 @@ * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign * http://cogcomp.cs.illinois.edu/ */ -/** - * - */ + package edu.illinois.cs.cogcomp.core.datastructures.textannotation; import java.util.ArrayList; @@ -60,6 +58,14 @@ public SpanLabelView(String viewName, String viewGenerator, TextAnnotation text, @Override public void addConstituent(Constituent constituent) { + + if (!allowOverlappingSpans) { + int start = constituent.getStartSpan(); + int end = constituent.getEndSpan(); + if (this.getConstituentsCoveringSpan(start, end).size() != 0) + throw new IllegalArgumentException("Span [" + start + ", " + end + "] already labeled."); + } + super.addConstituent(constituent); // this sort is grossly inefficient when appending contiguous tokens one at a time. @@ -95,9 +101,6 @@ public Constituent addSpanLabel(int start, int end, String label, double score) new Constituent(label, score, this.getViewName(), this.getTextAnnotation(), start, end); - if (!allowOverlappingSpans && this.getConstituentsCoveringSpan(start, end).size() != 0) - throw new IllegalArgumentException("Span [" + start + ", " + end + "] already labeled."); - this.addConstituent(c); return c; diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java index d03f13a3d..ec377d93c 100644 --- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java +++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TokenLabelView.java @@ -31,10 +31,19 @@ public TokenLabelView(String viewName, TextAnnotation text) { this(viewName, viewName + "-annotator", text, 1.0); } + public TokenLabelView(String viewName, TextAnnotation text, boolean allowOverlappingSpans) { + this(viewName, viewName + "-annotator", text, 1.0, allowOverlappingSpans); + } + public TokenLabelView(String viewName, String viewGenerator, TextAnnotation text, double score) { super(viewName, viewGenerator, text, score); } + public TokenLabelView(String viewName, String viewGenerator, TextAnnotation text, double score, + boolean allowOverlappingSpans) { + super(viewName, viewGenerator, text, score, allowOverlappingSpans); + } + /** * Adds a label to a token and returns the newly created constituent. * diff --git a/core-utilities/src/test/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelViewTest.java b/core-utilities/src/test/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelViewTest.java new file mode 100644 index 000000000..b086a7bd2 --- /dev/null +++ b/core-utilities/src/test/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/SpanLabelViewTest.java @@ -0,0 +1,104 @@ +package edu.illinois.cs.cogcomp.core.datastructures.textannotation; + +import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder; +import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder; +import edu.illinois.cs.cogcomp.core.datastructures.IntPair; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Test that addConstituent(Constituent) does not allow overlapping spans + */ +public class SpanLabelViewTest { + SpanLabelView overlappingSpansView; + SpanLabelView noOverlappingSpansView; + TextAnnotation ta; + Constituent baseConstituent; + Constituent overlappingConstituent; + + private Tokenizer.Tokenization tokenization; + + String viewName = "VIEWNAME"; + String viewGenerator = "VIEW-GENERATOR"; + String text = "This is a test string; do not pay it any mind."; + String corpusId = "TEST"; + String textId = "ID"; + + double score = 42.0; + int baseStart = 0; + int baseEnd = 5; + int overStart = 2; + int overEnd = 6; + + private Tokenizer.Tokenization getTokenization(String text) { + String[] tokens = text.split("\\s"); + List characterOffsets = new ArrayList<>(); + int[] sentenceEndArray = {tokens.length}; + + int charOffsetBegin = 0; + int charOffsetEnd = 0; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (Character.isWhitespace(c)) { + charOffsetEnd = i; + IntPair tokenOffsets = new IntPair(charOffsetBegin, charOffsetEnd); + characterOffsets.add(tokenOffsets); + charOffsetBegin = charOffsetEnd + 1; + } + } + IntPair tokenOffsets = new IntPair(charOffsetBegin, text.length()); + characterOffsets.add(tokenOffsets); + + IntPair[] charOffsetArray = new IntPair[characterOffsets.size()]; + + for (int i = 0; i < characterOffsets.size(); i++) { + charOffsetArray[i] = characterOffsets.get(i); + } + Tokenizer.Tokenization tokenization = + new Tokenizer.Tokenization(tokens, charOffsetArray, sentenceEndArray); + return tokenization; + } + + @Before + public void init(){ + TextAnnotationBuilder taBuilder = new BasicTextAnnotationBuilder(); + ta = taBuilder.createTextAnnotation(this.corpusId, this.textId, this.text, getTokenization(this.text)); + boolean allowOverlappingSpans = true; + overlappingSpansView = new SpanLabelView(this.viewName, this.viewGenerator, + ta, this.score, allowOverlappingSpans); + allowOverlappingSpans = false; + noOverlappingSpansView = new SpanLabelView(this.viewName, this.viewGenerator, + ta, this.score, allowOverlappingSpans); + + baseConstituent = new Constituent("BASE", this.score, this.viewName, ta, baseStart, baseEnd); + overlappingConstituent = new Constituent("OVER", this.score, this.viewName, ta, overStart, overEnd); + } + + @Test + public void testOverlappingSpans(){ + overlappingSpansView.addConstituent(baseConstituent); + overlappingSpansView.addConstituent(overlappingConstituent); + for(Constituent c : overlappingSpansView.getConstituents()){ + if(c.getLabel().equals("BASE")) { + assert c.getStartSpan() == this.baseStart; + assert c.getEndSpan() == this.baseEnd; + }else { + assert c.getStartSpan() == this.overStart; + assert c.getEndSpan() == this.overEnd; + } + } + } + + @Test(expected=IllegalArgumentException.class) + public void testNoOverlappingSpans(){ + noOverlappingSpansView.addConstituent(baseConstituent); + noOverlappingSpansView.addConstituent(overlappingConstituent); + } +} diff --git a/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java b/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java index 4c4e7e62b..1ef07bd26 100644 --- a/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java +++ b/corpusreaders/src/main/java/edu/illinois/cs/cogcomp/nlp/corpusreaders/ereReader/ERENerReader.java @@ -173,7 +173,7 @@ public List getAnnotationsFromFile(List corpusFileListE TextAnnotation ta = sourceTa.getTextAnnotation(); SpanLabelView tokens = (SpanLabelView) ta.getView(ViewNames.TOKENS); compileOffsets(tokens); - SpanLabelView nerView = new SpanLabelView(getMentionViewName(), NAME, ta, 1.0, false); + SpanLabelView nerView = new SpanLabelView(getMentionViewName(), NAME, ta, 1.0, true); // now pull all mentions we deal with. Start from file list index 1, as index 0 was source // text diff --git a/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java b/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java index 7af3c7f8f..1db6f7357 100644 --- a/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java +++ b/external/stanford_3.3.1/src/main/java/edu/illinois/cs/cogcomp/pipeline/handlers/StanfordTrueCaseHandler.java @@ -54,7 +54,7 @@ public void initialize(ResourceManager rm) { public void addView(TextAnnotation ta) throws AnnotatorException { Annotation document = new Annotation(ta.text); pipeline.annotate(document); - TokenLabelView vu = new TokenLabelView(viewName, ta); + TokenLabelView vu = new TokenLabelView(viewName, ta, true); for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) { for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { diff --git a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java index 107f4c748..3f835fdce 100644 --- a/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java +++ b/md/src/main/java/org/cogcomp/md/BIOCombinedReader.java @@ -172,7 +172,7 @@ private List getTokensFromTAs(){ mentionViewName = ViewNames.MENTION_ERE; } View mentionView = ta.getView(mentionViewName); - View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f); + View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f, true); String[] token2tags = new String[tokenView.getConstituents().size()]; for (int i = 0; i < token2tags.length; i++){ token2tags[i] = "O"; diff --git a/md/src/main/java/org/cogcomp/md/BIOReader.java b/md/src/main/java/org/cogcomp/md/BIOReader.java index 20c56617e..18971b21e 100644 --- a/md/src/main/java/org/cogcomp/md/BIOReader.java +++ b/md/src/main/java/org/cogcomp/md/BIOReader.java @@ -180,7 +180,7 @@ else if (_mode.equals("ColumnFormat")){ for (TextAnnotation ta : taList){ View tokenView = ta.getView(ViewNames.TOKENS); View mentionView = ta.getView(mentionViewName); - View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f); + View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f, true); String[] token2tags = new String[tokenView.getConstituents().size()]; for (int i = 0; i < token2tags.length; i++){ token2tags[i] = "O"; diff --git a/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java b/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java index 3290be1f0..8d6454d99 100644 --- a/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java +++ b/md/src/main/java/org/cogcomp/md/ColumnFormatReader.java @@ -118,7 +118,7 @@ public TextAnnotation readSingleFile(String file){ tokens.add(curSentenceArr); } TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(tokens); - SpanLabelView mentionView = new SpanLabelView("MENTIONS", this.getClass().getCanonicalName(), ta, 1.0f); + SpanLabelView mentionView = new SpanLabelView("MENTIONS", this.getClass().getCanonicalName(), ta, 1.0f, true); if (mentionTypes.size() != mentions.size()){ System.out.println("ERROR"); } diff --git a/md/src/main/java/org/cogcomp/md/MentionAnnotator.java b/md/src/main/java/org/cogcomp/md/MentionAnnotator.java index eaadd196e..0834e5d0b 100644 --- a/md/src/main/java/org/cogcomp/md/MentionAnnotator.java +++ b/md/src/main/java/org/cogcomp/md/MentionAnnotator.java @@ -204,7 +204,7 @@ public void addView(TextAnnotation ta) throws AnnotatorException{ throw new AnnotatorException("Missing required view POS"); } View mentionView = new SpanLabelView(ViewNames.MENTION, MentionAnnotator.class.getCanonicalName(), ta, 1.0f, true); - View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f); + View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f, true); View tokenView = ta.getView(ViewNames.TOKENS); for (int i = tokenView.getStartSpan(); i < tokenView.getEndSpan(); i++){ Constituent currentToken = tokenView.getConstituentsCoveringToken(i).get(0).cloneForNewView("BIO"); diff --git a/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java b/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java index 80ea8f250..e651eb443 100644 --- a/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java +++ b/tokenizer/src/main/java/edu/illinois/cs/cogcomp/nlp/utility/TokenizerTextAnnotationBuilder.java @@ -138,7 +138,7 @@ public TextAnnotation createTextAnnotation(String corpusId, String textId, Strin TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes()); SpanLabelView view = - new SpanLabelView(ViewNames.SENTENCE, NAME, ta, 1.0); + new SpanLabelView(ViewNames.SENTENCE, NAME, ta, 1); int start = 0; for (int s : tokenization.getSentenceEndTokenIndexes()) {