Skip to content

Commit e5696ec

Browse files
committed
Upgraded Stanford NER tool to 2012-11-11 version.
* Moved old version files into lib/nertools/stanford/stanford_old_library/
1 parent 3a9be83 commit e5696ec

39 files changed

+1026
-24
lines changed

src/SocialNetworkExtractorNihai/.classpath

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,14 @@
66
<classpathentry kind="lib" path="lib/httpclient-4.1.1.jar"/>
77
<classpathentry kind="lib" path="lib/httpcore-4.1.jar"/>
88
<classpathentry kind="lib" path="lib/json_simple-1.1.jar"/>
9-
<classpathentry kind="lib" path="lib/nertools/stanford/stanford-ner.jar"/>
109
<classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>
1110
<classpathentry kind="lib" path="lib/filterbuilder.jar"/>
1211
<classpathentry kind="lib" path="lib/sitecapturer.jar"/>
1312
<classpathentry kind="lib" path="lib/thumbelina.jar"/>
14-
<classpathentry kind="lib" path="lib/nertools/stanford/stanford-ner-2009-01-16.jar"/>
1513
<classpathentry kind="lib" path="lib/htmllexer.jar"/>
1614
<classpathentry kind="lib" path="lib/nertools/illinois/LBJ2.jar"/>
1715
<classpathentry kind="lib" path="lib/nertools/illinois/LBJ2Library.jar"/>
1816
<classpathentry kind="lib" path="lib/nertools/illinois/bin"/>
17+
<classpathentry kind="lib" path="lib/nertools/stanford/stanford-ner-2012-11-11.jar"/>
1918
<classpathentry kind="output" path="bin"/>
2019
</classpath>

src/SocialNetworkExtractorNihai/lib/nertools/stanford/NERDemo.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
import edu.stanford.nlp.ie.crf.*;
21
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
2+
import edu.stanford.nlp.ie.crf.*;
3+
import edu.stanford.nlp.io.IOUtils;
34
import edu.stanford.nlp.ling.CoreLabel;
45
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
5-
import edu.stanford.nlp.util.StringUtils;
66

77
import java.util.List;
88
import java.io.IOException;
@@ -11,7 +11,7 @@
1111

1212
/** This is a demo of calling CRFClassifier programmatically.
1313
* <p>
14-
* Usage: <code> java -cp "stanford-ner.jar:." NERDemo [serializedClassifier [fileName]]</code>
14+
* Usage: <code> java -mx400m -cp "stanford-ner.jar:." NERDemo [serializedClassifier [fileName]]</code>
1515
* <p>
1616
* If arguments aren't specified, they default to
1717
* classifiers/english.all.3class.distsim.crf.ser.gz and some hardcoded sample text.
@@ -33,13 +33,13 @@ public class NERDemo {
3333

3434
public static void main(String[] args) throws IOException {
3535

36-
String serializedClassifier = "classifiers/ner-eng-ie.crf-3-all2008.ser.gz";
36+
String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
3737

3838
if (args.length > 0) {
3939
serializedClassifier = args[0];
4040
}
4141

42-
AbstractSequenceClassifier classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
42+
AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
4343

4444
/* For either a file to annotate or for the hardcoded text example,
4545
this demo file shows two ways to process the output, for teaching
@@ -49,7 +49,7 @@ public static void main(String[] args) throws IOException {
4949
and produce an inline XML output format.
5050
*/
5151
if (args.length > 1) {
52-
String fileContents = StringUtils.slurpFile(args[1]);
52+
String fileContents = IOUtils.slurpFile(args[1]);
5353
List<List<CoreLabel>> out = classifier.classify(fileContents);
5454
for (List<CoreLabel> sentence : out) {
5555
for (CoreLabel word : sentence) {
@@ -71,6 +71,13 @@ public static void main(String[] args) throws IOException {
7171
System.out.println(classifier.classifyToString(s1));
7272
System.out.println(classifier.classifyWithInlineXML(s2));
7373
System.out.println(classifier.classifyToString(s2, "xml", true));
74+
int i=0;
75+
for (List<CoreLabel> lcl : classifier.classify(s2)) {
76+
for (CoreLabel cl : lcl) {
77+
System.out.println(i++ + ":");
78+
System.out.println(cl);
79+
}
80+
}
7481
}
7582
}
7683

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
Stanford NER - v1.2.7 - 2012-11-11
2+
----------------------------------------------
3+
4+
This package provides a high-performance machine learning based named
5+
entity recognition system, including facilities to train models from
6+
supervised training data and pre-trained models for English.
7+
8+
(c) 2002-2012. The Board of Trustees of The Leland
9+
Stanford Junior University. All Rights Reserved.
10+
11+
Original CRF code by Jenny Finkel.
12+
Additional modules, features, internationalization, compaction, and
13+
support code by Christopher Manning, Dan Klein, Christopher Cox, Huy Nguyen,
14+
Shipra Dingare, Anna Rafferty, and John Bauer.
15+
This release prepared by John Bauer.
16+
17+
LICENSE
18+
19+
The software is licensed under the full GPL. Please see the file LICENCE.txt
20+
21+
For more information, bug reports, and fixes, contact:
22+
Christopher Manning
23+
Dept of Computer Science, Gates 1A
24+
Stanford CA 94305-9010
25+
USA
26+
27+
http://www-nlp.stanford.edu/software/CRF-NER.shtml
28+
29+
CONTACT
30+
31+
For questions about this distribution, please contact Stanford's JavaNLP group
32+
at [email protected]. We provide assistance on a best-effort
33+
basis.
34+
35+
TUTORIAL
36+
37+
Quickstart guidelines, primarily for end users who wish to use the included NER
38+
models, are below. For further instructions on training your own NER model,
39+
go to http://www-nlp.stanford.edu/software/crf-faq.shtml.
40+
41+
INCLUDED SERIALIZED MODELS / TRAINING DATA
42+
43+
The basic included serialized model is a 3 class NER tagger that can
44+
label: PERSON, ORGANIZATION, and LOCATION entities. It is included as
45+
english.all.3class.distsim.crf.ser.gz. It is trained on data from
46+
CoNLL, MUC6, MUC7, and ACE. Because this model is trained on both US
47+
and UK newswire, it is fairly robust across the two domains.
48+
49+
We have also included a 4 class NER tagger trained on the CoNLL 2003
50+
Shared Task training data that labels for PERSON, ORGANIZATION,
51+
LOCATION, and MISC. It is named
52+
english.conll.4class.caseless.distsim.crf.ser.gz .
53+
54+
A third model is trained only on data from MUC and distinguishes
55+
between 7 different classes,
56+
english.muc.7class.caseless.distsim.crf.ser.gz.
57+
58+
All of the serialized classifiers come in two versions, the second of
59+
which uses a distributional similarity lexicon to improve performance
60+
(by about 1.5% F-measure). These classifiers have additional features
61+
which make them perform substantially better, but they require rather
62+
more memory. The distsim models are included in the release package,
63+
and nodistsim versions of the same models are available on the
64+
Stanford NER webpage.
65+
66+
There are also case-insensitive versions of the three models available
67+
on the webpage.
68+
69+
Finally, a package with two German models is also available for download.
70+
71+
72+
QUICKSTART INSTRUCTIONS
73+
74+
This NER system requires Java 1.6 or later. We have only tested it on
75+
the SUN JVM.
76+
77+
Providing java is on your PATH, you should just be able to run an NER
78+
GUI demonstration by just clicking. It might work to double-click on
79+
the stanford-ner.jar archive but this may well fail as the operating
80+
system does not give Java enough memory for our NER system, so it is
81+
safer to instead double click on the ner-gui.bat icon (Windows) or
82+
ner-gui.sh (Linux/Unix/MacOSX). Then, from the Classifier menu, either
83+
load a CRF classifier from the classifiers directory of the distribution
84+
or you should be able to use the Load Default CRF option. You can then
85+
either load a text file or web page from the File menu, or decide to use
86+
the default text in the window. Finally, you can now named entity tag
87+
the text by pressing the Run NER button.
88+
89+
From a command line, you need to have java on your PATH and the
90+
stanford-ner.jar file in your CLASSPATH. (The way of doing this depends on
91+
your OS/shell.) The supplied ner.bat and ner.sh should work to allow
92+
you to tag a single file. For example, for Windows:
93+
94+
ner file
95+
96+
Or on Unix/Linux you should be able to parse the test file in the distribution
97+
directory with the command:
98+
99+
java -mx600m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier classifiers/english.all.3class.distsim.crf.ser.gz -textFile sample.txt
100+
101+
When run from a jar file, you also have the option of using a serialized
102+
classifier contained in the jar file.
103+
104+
If you use the -jar command, or double-click the jar file, NERGUI is
105+
automatically started, and you will also be given the option (under the
106+
'Classifier' menu item) to load a default supplied classifier:
107+
108+
java -mx1000m -jar stanford-ner.jar
109+
110+
111+
PROGRAMMATIC USE
112+
113+
The NERDemo file illustrates a couple of ways of calling the system
114+
programmatically. You should get the same results from
115+
116+
java -mx300m NERDemo classifiers/english.all.3class.distsim.crf.ser.gz sample.txt
117+
118+
as from using CRFClassifier. For more information on API calls, look in
119+
the enclosed javadoc directory: load index.html in a browser and look
120+
first at the edu.stanford.nlp.ie.crf package and CRFClassifier class.
121+
If you wish to train your own NER systems, look also at the
122+
edu.stanford.nlp.ie package NERFeatureFactory class.
123+
124+
125+
SERVER VERSION
126+
127+
The NER code may also be run as a server listening on a socket:
128+
129+
java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer 1234
130+
131+
You can specify which model to load with flags, either one on disk:
132+
133+
java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/english.all.3class.distsim.crf.ser.gz 1234
134+
135+
Or if you have put a model inside the jar file:
136+
137+
java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadJarClassifier all.3class.crf.ser.gz 1234
138+
139+
140+
RUNNING CLASSIFIERS FROM INSIDE A JAR FILE
141+
142+
The software can run any serialized classifier from within a jar file by
143+
giving the flag -loadJarClassifier resourceName . An end user can make
144+
their own jar files with the desired NER models contained inside. The
145+
serialized classifier must be located immediately under classifiers/ in
146+
the jar file, with the name given. This allows single jar file
147+
deployment.
148+
149+
150+
PERFORMANCE GUIDELINES
151+
152+
Performance depends on many factors. Speed and memory use depend on
153+
hardware, operating system, and JVM. Accuracy depends on the data
154+
tested on. Nevertheless, in the belief that something is better than
155+
nothing, here are some statistics from one machine on one test set, in
156+
semi-realistic conditions (where the test data is somewhat varied).
157+
158+
ner-eng-ie.crf-3-all2006-distsim.ser.gz (older version of ner-eng-ie.crf-3-all2008-distsim.ser.gz)
159+
Memory: 320MB (on a 32 bit machine)
160+
PERSON ORGANIZATION LOCATION
161+
91.88 82.91 88.21
162+
163+
164+
--------------------
165+
CHANGES
166+
--------------------
167+
168+
2012-11-11 1.2.7 Improved English 3 class model, release
169+
Chinese model
170+
171+
2012-07-09 1.2.6 Minor bug fixes
172+
173+
2012-05-22 1.2.5 Fix encoding issue
174+
175+
2012-04-07 1.2.4 Caseless version of English models supported
176+
177+
2012-01-06 1.2.3 Minor bug fixes
178+
179+
2011-09-14 1.2.2 Improved thread safety
180+
181+
2011-06-19 1.2.1 Models reduced in size but on average improved
182+
in accuracy (improved distsim clusters)
183+
184+
2011-05-16 1.2 Normal download includes 3, 4, and 7
185+
class models. Updated for compatibility
186+
with other software releases.
187+
188+
2009-01-16 1.1.1 Minor bug and usability fixes, changed API
189+
190+
2008-05-07 1.1 Additional feature flags, various code updates
191+
192+
2006-09-18 1.0 Initial release
193+

src/SocialNetworkExtractorNihai/lib/nertools/stanford/build.xml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@
3939
<property name="compile.debug" value="true"/>
4040
<property name="compile.deprecation" value="false"/>
4141
<property name="compile.optimize" value="true"/>
42-
<property name="compile.source" value="1.5" />
43-
<property name="compile.target" value="1.5" />
42+
<property name="compile.source" value="1.6" />
43+
<property name="compile.target" value="1.6" />
4444

4545

4646

@@ -84,14 +84,14 @@
8484

8585
<target name="classpath" description="Sets the classpath">
8686
<path id="compile.classpath">
87-
<!--<fileset dir="${basedir}/lib">
87+
<!-- <fileset dir="${basedir}/lib">
8888
<include name="*.jar"/>
8989
<exclude name="javanlp*"/>
90-
</fileset>-->
90+
</fileset> -->
9191
</path>
9292
</target>
9393

94-
94+
9595

9696

9797

@@ -114,6 +114,7 @@
114114
<javac srcdir="${src.home}"
115115
destdir="${build.home}"
116116
debug="${compile.debug}"
117+
encoding="utf-8"
117118
deprecation="${compile.deprecation}"
118119
optimize="${compile.optimize}"
119120
source="${compile.source}"
@@ -152,14 +153,14 @@
152153
<javadoc sourcepath="${src.home}"
153154
destdir="${javadoc.home}"
154155
maxmemory="768m"
155-
author="true"
156-
source="1.5"
156+
author="true"
157+
source="1.6"
157158
Overview="${src.home}/edu/stanford/nlp/overview.html"
158-
Doctitle="Stanford JavaNLP API Documentation"
159+
Doctitle="Stanford JavaNLP API Documentation"
159160
Windowtitle="Stanford JavaNLP API"
160161
packagenames="*">
161162
<bottom><![CDATA[<FONT SIZE=2><A HREF=\"http://nlp.stanford.edu\">Stanford NLP Group</A></FONT>]]></bottom>
162-
<link href="http://java.sun.com/j2se/1.5.0/docs/api/"/>
163+
<link href="http://java.sun.com/j2se/1.6.0/docs/api/"/>
163164
</javadoc>
164165

165166
</target>
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
trainFile = /u/nlp/data/ner/column_data/all.3class.train
2+
testFile = /u/nlp/data/ner/column_data/all.3class.test
3+
serializeTo = english.all.3class.distsim.crf.ser.gz
4+
5+
type = crf
6+
7+
#distSimLexicon = /u/nlp/data/pos_tags_are_useless/englishGigaword.200.pruned
8+
#distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw.bnc.200
9+
distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters
10+
useDistSim = true
11+
12+
map = word=0,answer=1
13+
14+
saveFeatureIndexToDisk = true
15+
16+
useClassFeature=true
17+
useWord=true
18+
#useWordPairs=true
19+
useNGrams=true
20+
noMidNGrams=true
21+
maxNGramLeng=6
22+
usePrev=true
23+
useNext=true
24+
#useTags=true
25+
#useWordTag=true
26+
useLongSequences=true
27+
useSequences=true
28+
usePrevSequences=true
29+
useTypeSeqs=true
30+
useTypeSeqs2=true
31+
useTypeySequences=true
32+
useOccurrencePatterns=true
33+
useLastRealWord=true
34+
useNextRealWord=true
35+
#useReverse=false
36+
normalize=true
37+
# normalizeTimex=true
38+
wordShape=chris2useLC
39+
useDisjunctive=true
40+
disjunctionWidth=5
41+
#useDisjunctiveShapeInteraction=true
42+
43+
maxLeft=1
44+
readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
45+
46+
useObservedSequencesOnly=true
47+
48+
useQN = true
49+
QNsize = 25
50+
51+
# makes it go faster
52+
featureDiffThresh=0.05

0 commit comments

Comments
 (0)