It looks like CST have just released the resources for the lemmatizer under a GPL license, just as you said they might when we met in April.
In light of that, I decided to try to install the lemmatizer, but ran into some problems. Here is a log of what I did:
ronnie@stat:~$ METH=git
ronnie@stat:~$ METH=https
ronnie@stat:~$
ronnie@stat:~$ if [ ! -d hashmap ]; then
> mkdir hashmap
> cd hashmap
> git init
> git remote add origin $METH://github.com/kuhumcst/hashmap.git
> cd ..
> fi
Initialised empty Git repository in /home/ronnie/hashmap/.git/
ronnie@stat:~$ cd hashmap
ronnie@stat:~/hashmap$ git pull origin master
remote: Counting objects: 26, done.
remote: Total 26 (delta 0), reused 0 (delta 0), pack-reused 26
Unpacking objects: 100% (26/26), done.
From https://github.com/kuhumcst/hashmap
* branch master -> FETCH_HEAD
* [new branch] master -> origin/master
ronnie@stat:~/hashmap$ cd ..
ronnie@stat:~$
ronnie@stat:~$ if [ ! -d letterfunc ]; then
> mkdir letterfunc
> cd letterfunc
> git init
> git remote add origin $METH://github.com/kuhumcst/letterfunc.git
> cd ..
> fi
Initialised empty Git repository in /home/ronnie/letterfunc/.git/
ronnie@stat:~$ cd letterfunc
ronnie@stat:~/letterfunc$ git pull origin master
remote: Counting objects: 102, done.
remote: Total 102 (delta 0), reused 0 (delta 0), pack-reused 102
Receiving objects: 100% (102/102), 424.62 KiB | 0 bytes/s, done.
Resolving deltas: 100% (52/52), done.
From https://github.com/kuhumcst/letterfunc
* branch master -> FETCH_HEAD
* [new branch] master -> origin/master
ronnie@stat:~/letterfunc$ cd ..
ronnie@stat:~$
ronnie@stat:~$ if [ ! -d parsesgml ]; then
> mkdir parsesgml
> cd parsesgml
> git init
> git remote add origin $METH://github.com/kuhumcst/parsesgml.git
> cd ..
> fi
Initialised empty Git repository in /home/ronnie/parsesgml/.git/
ronnie@stat:~$ cd parsesgml
ronnie@stat:~/parsesgml$ git pull origin master
remote: Counting objects: 26, done.
remote: Total 26 (delta 0), reused 0 (delta 0), pack-reused 26
Unpacking objects: 100% (26/26), done.
From https://github.com/kuhumcst/parsesgml
* branch master -> FETCH_HEAD
* [new branch] master -> origin/master
ronnie@stat:~/parsesgml$ cd ..
ronnie@stat:~$
ronnie@stat:~$ if [ ! -d cstlemma ]; then
> mkdir cstlemma
> cd cstlemma
> git init
> git remote add origin $METH://github.com/kuhumcst/cstlemma.git
> cd ..
> fi
Initialised empty Git repository in /home/ronnie/cstlemma/.git/
ronnie@stat:~$ cd cstlemma
ronnie@stat:~/cstlemma$ git pull origin master
remote: Counting objects: 707, done.
remote: Total 707 (delta 0), reused 0 (delta 0), pack-reused 707
Receiving objects: 100% (707/707), 432.05 KiB | 0 bytes/s, done.
Resolving deltas: 100% (549/549), done.
From https://github.com/kuhumcst/cstlemma
* branch master -> FETCH_HEAD
* [new branch] master -> origin/master
ronnie@stat:~/cstlemma$ cd src
ronnie@stat:~/cstlemma/src$ make all
g++ -I. -I../../hashmap/src -I../../letterfunc/src -I../../parsesgml/src -O3 -Wall -pedantic -DNDEBUG -fPIC -c cstlemma.cpp applyrules.cpp argopt.cpp basefrm.cpp basefrmpntr.cpp caseconv.cpp dictionary.cpp ../../letterfunc/src/entities.cpp field.cpp flattext.cpp flex.cpp freqfile.cpp function.cpp functiontree.cpp ../../hashmap/src/hashmap.cpp lemmatise.cpp lemmatiser.cpp lemmtags.cpp ../../letterfunc/src/letter.cpp ../../letterfunc/src/letterfunc.cpp lext.cpp makedict.cpp makesuffixflex.cpp option.cpp outputclass.cpp ../../parsesgml/src/parsesgml.cpp readfreq.cpp readlemm.cpp tags.cpp text.cpp ../../letterfunc/src/utf8func.cpp word.cpp wordReader.cpp XMLtext.cpp
In file included from cstlemma.cpp:29:0:
word.h: In constructor ‘Word::Word(const char*)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:71:27: warning: ‘baseformpointer* Word::pbfD’ [-Wreorder]
baseformpointer * pbfD; // dictionary's base forms
^
word.h:244:9: warning: when initialized here [-Wreorder]
Word(const char * word)
^
word.h: In copy constructor ‘Word::Word(const Word&)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:68:16: warning: ‘char* Word::m_word’ [-Wreorder]
char * m_word;
^
word.h:256:9: warning: when initialized here [-Wreorder]
Word(const Word & w)
^
In file included from basefrm.cpp:28:0:
word.h: In constructor ‘Word::Word(const char*)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:71:27: warning: ‘baseformpointer* Word::pbfD’ [-Wreorder]
baseformpointer * pbfD; // dictionary's base forms
^
word.h:244:9: warning: when initialized here [-Wreorder]
Word(const char * word)
^
word.h: In copy constructor ‘Word::Word(const Word&)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:68:16: warning: ‘char* Word::m_word’ [-Wreorder]
char * m_word;
^
word.h:256:9: warning: when initialized here [-Wreorder]
Word(const Word & w)
^
In file included from basefrmpntr.cpp:22:0:
basefrmpntr.h: In constructor ‘baseformpointer::baseformpointer(const char*, const char*, size_t)’:
basefrmpntr.h:64:14: warning: ‘baseformpointer::owning’ will be initialized after [-Wreorder]
bool owning;
^
basefrmpntr.h:60:27: warning: ‘baseformpointer* baseformpointer::next’ [-Wreorder]
baseformpointer * next;
^
basefrmpntr.cpp:69:1: warning: when initialized here [-Wreorder]
baseformpointer::baseformpointer(const char * s,const char * t,size_t len):owning(true),next(NULL),hidden(false)
^
In file included from flattext.cpp:25:0:
word.h: In constructor ‘Word::Word(const char*)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:71:27: warning: ‘baseformpointer* Word::pbfD’ [-Wreorder]
baseformpointer * pbfD; // dictionary's base forms
^
word.h:244:9: warning: when initialized here [-Wreorder]
Word(const char * word)
^
word.h: In copy constructor ‘Word::Word(const Word&)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:68:16: warning: ‘char* Word::m_word’ [-Wreorder]
char * m_word;
^
word.h:256:9: warning: when initialized here [-Wreorder]
Word(const Word & w)
^
In file included from lemmatiser.cpp:22:0:
lemmatiser.h: In constructor ‘Lemmatiser::Lemmatiser(optionStruct&)’:
lemmatiser.h:63:14: warning: ‘Lemmatiser::SortInput’ will be initialized after [-Wreorder]
bool SortInput; // derived from other options
^
lemmatiser.h:60:24: warning: ‘optionStruct& Lemmatiser::Option’ [-Wreorder]
optionStruct & Option;
^
lemmatiser.cpp:126:1: warning: when initialized here [-Wreorder]
Lemmatiser::Lemmatiser(optionStruct & a_Option) : listLemmas(0),SortInput(false),Option(a_Option),changed(true)
^
In file included from text.cpp:27:0:
word.h: In constructor ‘Word::Word(const char*)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:71:27: warning: ‘baseformpointer* Word::pbfD’ [-Wreorder]
baseformpointer * pbfD; // dictionary's base forms
^
word.h:244:9: warning: when initialized here [-Wreorder]
Word(const char * word)
^
word.h: In copy constructor ‘Word::Word(const Word&)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:68:16: warning: ‘char* Word::m_word’ [-Wreorder]
char * m_word;
^
word.h:256:9: warning: when initialized here [-Wreorder]
Word(const Word & w)
^
In file included from text.cpp:23:0:
text.h: In constructor ‘text::text(bool, bool)’:
text.h:91:27: warning: ‘text::reducedtotal’ will be initialized after [-Wreorder]
unsigned long int reducedtotal;
^
text.h:87:17: warning: ‘field* text::fields’ [-Wreorder]
field * fields;
^
text.cpp:831:1: warning: when initialized here [-Wreorder]
text::text(bool a_InputHasTags,bool nice)
^
In file included from text.cpp:23:0:
text.h:87:17: warning: ‘text::fields’ will be initialized after [-Wreorder]
field * fields;
^
text.h:72:20: warning: ‘basefrm** text::basefrmarrD’ [-Wreorder]
basefrm ** basefrmarrD;
^
text.cpp:831:1: warning: when initialized here [-Wreorder]
text::text(bool a_InputHasTags,bool nice)
^
In file included from word.cpp:22:0:
word.h: In constructor ‘Word::Word(const char*)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:71:27: warning: ‘baseformpointer* Word::pbfD’ [-Wreorder]
baseformpointer * pbfD; // dictionary's base forms
^
word.h:244:9: warning: when initialized here [-Wreorder]
Word(const char * word)
^
word.h: In copy constructor ‘Word::Word(const Word&)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:68:16: warning: ‘char* Word::m_word’ [-Wreorder]
char * m_word;
^
word.h:256:9: warning: when initialized here [-Wreorder]
Word(const Word & w)
^
In file included from wordReader.cpp:22:0:
wordReader.h: In constructor ‘wordReader::wordReader(field*, field*, field*, bool, XMLtext*)’:
wordReader.h:42:14: warning: ‘wordReader::treatSlashAsAlternativesSeparator’ will be initialized after [-Wreorder]
bool treatSlashAsAlternativesSeparator;
^
wordReader.h:40:19: warning: ‘XMLtext* wordReader::Text’ [-Wreorder]
XMLtext * Text;
^
wordReader.cpp:165:1: warning: when initialized here [-Wreorder]
wordReader::wordReader(field * format,field * wordfield,field * tagfield,bool treatSlashAsAlternativesSeparator,XMLtext * Text)
^
In file included from XMLtext.cpp:36:0:
word.h: In constructor ‘Word::Word(const char*)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:71:27: warning: ‘baseformpointer* Word::pbfD’ [-Wreorder]
baseformpointer * pbfD; // dictionary's base forms
^
word.h:244:9: warning: when initialized here [-Wreorder]
Word(const char * word)
^
word.h: In copy constructor ‘Word::Word(const Word&)’:
word.h:84:29: warning: ‘Word::SegmentInitial’ will be initialized after [-Wreorder]
bool SegmentInitial:1; /* 20160205. If true and word starts with uppercase,
^
word.h:68:16: warning: ‘char* Word::m_word’ [-Wreorder]
char * m_word;
^
word.h:256:9: warning: when initialized here [-Wreorder]
Word(const Word & w)
^
In file included from XMLtext.cpp:32:0:
XMLtext.h: In constructor ‘XMLtext::XMLtext(FILE*, optionStruct&)’:
XMLtext.h:98:14: warning: ‘XMLtext::LemmaClassPosComing’ will be initialized after [-Wreorder]
bool LemmaClassPosComing;
^
XMLtext.h:89:16: warning: ‘char* XMLtext::alltext’ [-Wreorder]
char * alltext;
^
XMLtext.cpp:524:1: warning: when initialized here [-Wreorder]
XMLtext::XMLtext(FILE * fpi,optionStruct & Option)
^
In file included from XMLtext.cpp:32:0:
XMLtext.h:89:16: warning: ‘XMLtext::alltext’ will be initialized after [-Wreorder]
char * alltext;
^
XMLtext.h:73:22: warning: ‘const char* XMLtext::wordAttribute’ [-Wreorder]
const char * wordAttribute; // if null, word is PCDATA
^
XMLtext.cpp:524:1: warning: when initialized here [-Wreorder]
XMLtext::XMLtext(FILE * fpi,optionStruct & Option)
^
g++ -static cstlemma.o applyrules.o argopt.o basefrm.o basefrmpntr.o caseconv.o dictionary.o entities.o field.o flattext.o flex.o freqfile.o function.o functiontree.o hashmap.o lemmatise.o lemmatiser.o lemmtags.o letter.o letterfunc.o lext.o makedict.o makesuffixflex.o option.o outputclass.o parsesgml.o readfreq.o readlemm.o tags.o text.o utf8func.o word.o wordReader.o XMLtext.o -o ../cstlemmas -L/usr/local/lib -lstdc++
g++ cstlemma.o applyrules.o argopt.o basefrm.o basefrmpntr.o caseconv.o dictionary.o entities.o field.o flattext.o flex.o freqfile.o function.o functiontree.o hashmap.o lemmatise.o lemmatiser.o lemmtags.o letter.o letterfunc.o lext.o makedict.o makesuffixflex.o option.o outputclass.o parsesgml.o readfreq.o readlemm.o tags.o text.o utf8func.o word.o wordReader.o XMLtext.o -o ../cstlemma -L/usr/local/lib -lstdc++
g++ -shared -Wl,-soname,libcstlemma.so.7 -o libcstlemma.so.7.10 applyrules.o argopt.o basefrm.o basefrmpntr.o caseconv.o dictionary.o entities.o field.o flattext.o flex.o freqfile.o function.o functiontree.o hashmap.o lemmatise.o lemmatiser.o lemmtags.o letter.o letterfunc.o lext.o makedict.o makesuffixflex.o option.o outputclass.o parsesgml.o readfreq.o readlemm.o tags.o text.o utf8func.o word.o wordReader.o XMLtext.o
ln -sf libcstlemma.so.7.10 libcstlemma.so.7
ln -sf libcstlemma.so.7 libcstlemma.so
g++ cstlemma.o libcstlemma.so.7.10 -o ../cstlemmadl -L/usr/local/lib -lstdc++
ronnie@stat:~/cstlemma/src$ cd ..
ronnie@stat:~/cstlemma$ cd ..
ronnie@stat:~$
ronnie@stat:~$ cd cstlemma/
doc/ .git/ src/
ronnie@stat:~$ cd cstlemma/src/
ronnie@stat:~/cstlemma/src$ make cstlemma
g++ -I. -I../../hashmap/src -I../../letterfunc/src -I../../parsesgml/src -O3 -Wall -pedantic -DNDEBUG cstlemma.o -o cstlemma
cstlemma.o: In function `main':
cstlemma.cpp:(.text.startup+0x35): undefined reference to `optionStruct::optionStruct()'
cstlemma.cpp:(.text.startup+0x42): undefined reference to `optionStruct::readArgs(int, char**)'
cstlemma.cpp:(.text.startup+0x5e): undefined reference to `Lemmatiser::Lemmatiser(optionStruct&)'
cstlemma.cpp:(.text.startup+0x72): undefined reference to `Word::deleteStaticMembers()'
cstlemma.cpp:(.text.startup+0x7a): undefined reference to `Lemmatiser::~Lemmatiser()'
cstlemma.cpp:(.text.startup+0x86): undefined reference to `optionStruct::~optionStruct()'
cstlemma.cpp:(.text.startup+0xb3): undefined reference to `Lemmatiser::LemmatiseFile()'
cstlemma.cpp:(.text.startup+0x150): undefined reference to `Lemmatiser::~Lemmatiser()'
cstlemma.cpp:(.text.startup+0x158): undefined reference to `optionStruct::~optionStruct()'
collect2: error: ld returned 1 exit status
<builtin>: recipe for target 'cstlemma' failed
make: *** [cstlemma] Error 1
ronnie@stat:~/cstlemma/src$ make
make: '../cstlemma' is up to date.
ronnie@stat:~/cstlemma/src$ make clean
rm -f *.o
rm -f libcstlemma.so.7.10
rm -f libcstlemma.so.7
rm -f libcstlemma.so
ronnie@stat:~/cstlemma/src$ touch my_empty_rule_file
ronnie@stat:~/cstlemma/src$ cstlemma -L -f my_empty_rule_file -i my_text_file.txt
cstlemma: command not found
ronnie@stat:~/cstlemma/src$ cstlemma -H
cstlemma: command not found