1 Index: lttoolbox/lt-proc.1
2 ===================================================================
3 --- lttoolbox/lt-proc.1 (revision 21745)
4 +++ lttoolbox/lt-proc.1 (working copy)
19 +.B \-\-surf-bilingual \fR|
20 .B \-\-case-sensitive \fR|
21 +.B \-\-debugged-gen \fR|
22 +.B \-\-decompose-nouns \fR|
23 .B \-\-generation \fR|
24 .B \-\-non-marked-gen \fR|
25 .B \-\-tagged-gen \fR|
27 form in the source language. Works tipically with the output of
30 +.B \-o, \-\-surf-bilingual
31 +As with \-b, but takes input from apertium\-tagger \-p , with
32 +surface forms, and if the lexical form is not found in the bilingual
33 +dictionary, it outputs the surface form of the word.
36 .B \-c, \-\-case-sensitive
37 Use the literal case of the incoming characters
39 +.B \-d, \-\-debugged-gen
40 +Morph. generation with all the stuff
42 .B \-e, \-\-decompose-compounds
43 Try to treat unknown words as compounds, and decompose them.
47 Lots of...lurking in the dark and waiting for you!
49 -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights
51 +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante.
52 Index: lttoolbox/fst_processor.cc
53 ===================================================================
54 --- lttoolbox/fst_processor.cc (revision 21745)
55 +++ lttoolbox/fst_processor.cc (working copy)
58 caseSensitive = false;
59 dictionaryCase = false;
60 - compoundDecomposition = false;
61 + do_decomposition = false;
63 nullFlushGeneration = false;
64 + showControlSymbols = false;
65 + biltransSurfaceForms = false;
66 + compoundOnlyLSymbol = 0;
67 + compoundRSymbol = 0;
68 + compound_max_elements = 4;
70 - pool = new Pool<vector<int> >(4, vector<int>(50));
72 - initial_state = new State(pool);
73 - current_state = new State(pool);
74 + initial_state = new State();
75 + current_state = new State();
78 FSTProcessor::~FSTProcessor()
92 +FSTProcessor::readBilingual(FILE *input, FILE *output)
94 + wint_t val = fgetwc_unlocked(input);
95 + wstring symbol = L"";
99 + return pair<wstring, int>(symbol, 0x7fffffff);
106 + val = fgetwc_unlocked(input);
109 + return pair<wstring, int>(symbol, 0x7fffffff);
112 + else if(val == L'\\')
114 + fputwc_unlocked(val, output);
115 + val = fgetwc_unlocked(input);
118 + return pair<wstring, int>(symbol, 0x7fffffff);
120 + fputwc_unlocked(val,output);
121 + skipUntil(input, output, L'^');
122 + val = fgetwc_unlocked(input);
125 + return pair<wstring, int>(symbol, 0x7fffffff);
130 + fputwc_unlocked(val, output);
131 + skipUntil(input, output, L'^');
132 + val = fgetwc_unlocked(input);
135 + return pair<wstring, int>(symbol, 0x7fffffff);
143 + val = fgetwc_unlocked(input);
144 + return pair<wstring, int>(symbol, val);
146 + else if(val == L'$')
149 + return pair<wstring, int>(symbol, static_cast<int>(L'$'));
151 + else if(val == L'<')
154 + cad += static_cast<wchar_t>(val);
155 + while((val = fgetwc_unlocked(input)) != L'>')
161 + cad += static_cast<wchar_t>(val);
163 + cad += static_cast<wchar_t>(val);
165 + int res = alphabet(cad);
170 + return pair<wstring, int>(symbol, res);
172 + else if(val == L'[')
174 + fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output);
175 + return readBilingual(input, output);
179 + return pair<wstring, int>(symbol, val);
182 + return pair<wstring, int>(symbol, 0x7fffffff);
186 FSTProcessor::flushBlanks(FILE *output)
192 +FSTProcessor::writeEscapedWithTags(wstring const &str, FILE *output)
194 + for(unsigned int i = 0, limit = str.size(); i < limit; i++)
196 + if(str[i] == L'<' && i >=1 && str[i-1] != L'\\')
198 + fputws_unlocked(str.substr(i).c_str(), output);
202 + if(escaped_chars.find(str[i]) != escaped_chars.end())
204 + fputwc_unlocked(L'\\', output);
206 + fputwc_unlocked(str[i], output);
213 FSTProcessor::printWord(wstring const &sf, wstring const &lf, FILE *output)
215 fputwc_unlocked(L'^', output);
222 +FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupper) {
223 + const int MAX_COMBINATIONS = 500;
224 + //wcerr << L"compoundAnalysis(input_word = " << input_word << L")" << endl;
226 + State current_state = *initial_state;
228 + for(unsigned int i=0; i<input_word.size(); i++) {
229 + wchar_t val=input_word.at(i);
231 + //wcerr << val << L" before step " << i << L" current_state = " << current_state.getReadableString(alphabet) << endl;
232 + current_state.step_case(val, caseSensitive);
234 + if(current_state.size() > MAX_COMBINATIONS) {
235 + wcerr << L"Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << L"'" << endl;
236 + wcerr << L" gave up at char " << i << L" '" << val << L"'." << endl;
238 + wstring nullString = L"";
242 + //wcerr << val << L" after step " << i << L" current_state = " << current_state.getReadableString(alphabet) << endl;
244 + if(i < input_word.size()-1)
245 + current_state.restartFinals(all_finals, compoundOnlyLSymbol, initial_state, '+');
247 + //wcerr << val << " after restart " << i << " current_state = " << current_state.getReadableString(alphabet) << endl;
248 + //wcerr << i << " result = " << current_state.filterFinals(all_finals, alphabet, escaped_chars, uppercase, firstupper) << endl;
249 + //wcerr << i << " -- size = " << current_state.size() << endl;
251 + if(current_state.size()==0) {
252 + wstring nullString = L"";
257 + current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements);
258 + wstring result = current_state.filterFinals(all_finals, alphabet, escaped_chars, uppercase, firstupper);
259 + //wcerr << L"rrresult = " << result << endl;
267 +FSTProcessor::initDecompositionSymbols() {
268 + if ((compoundOnlyLSymbol=alphabet(L"<:co:only-L>")) == 0
269 + && (compoundOnlyLSymbol=alphabet(L"<:compound:only-L>")) == 0
270 + && (compoundOnlyLSymbol=alphabet(L"<@co:only-L>")) == 0
271 + && (compoundOnlyLSymbol=alphabet(L"<@compound:only-L>")) == 0
272 + && (compoundOnlyLSymbol=alphabet(L"<compound-only-L>")) == 0)
274 + wcerr << L"Warning: Decomposition symbol <:compound:only-L> not found" << endl;
276 + else if (!showControlSymbols)
277 + alphabet.setSymbol(compoundOnlyLSymbol, L"");
279 + if ((compoundRSymbol=alphabet(L"<:co:R>")) == 0
280 + && (compoundRSymbol=alphabet(L"<:compound:R>")) == 0
281 + && (compoundRSymbol=alphabet(L"<@co:R>")) == 0
282 + && (compoundRSymbol=alphabet(L"<@compound:R>")) == 0
283 + && (compoundRSymbol=alphabet(L"<compound-R>")) == 0)
285 + wcerr << L"Warning: Decomposition symbol <:compound:R> not found" << endl;
287 + else if (!showControlSymbols)
288 + alphabet.setSymbol(compoundRSymbol, L"");
293 +FSTProcessor::initDecomposition() {
294 + do_decomposition = true;
296 + initDecompositionSymbols();
300 FSTProcessor::decompose(wstring w)
302 State current_state = *initial_state;
305 //wcerr << L"+ decompose: " << lf << endl;
311 FSTProcessor::analysis(FILE *input, FILE *output)
312 @@ -839,6 +1035,10 @@
313 uppercase = firstupper && iswupper(sf[sf.size()-1]);
316 + if(do_decomposition && compoundOnlyLSymbol != 0)
318 + current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
320 lf = current_state.filterFinals(all_finals, alphabet,
322 uppercase, firstupper);
323 @@ -853,6 +1053,10 @@
324 uppercase = firstupper && iswupper(sf[sf.size()-1]);
327 + if(do_decomposition && compoundOnlyLSymbol != 0)
329 + current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
331 lf = current_state.filterFinals(all_finals, alphabet,
333 uppercase, firstupper);
334 @@ -867,6 +1071,10 @@
335 uppercase = firstupper && iswupper(sf[sf.size()-1]);
338 + if(do_decomposition && compoundOnlyLSymbol != 0)
340 + current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
342 lf = current_state.filterFinals(all_finals, alphabet,
344 uppercase, firstupper);
345 @@ -881,6 +1089,10 @@
346 uppercase = firstupper && iswupper(sf[sf.size()-1]);
349 + if(do_decomposition && compoundOnlyLSymbol != 0)
351 + current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
353 lf = current_state.filterFinals(all_finals, alphabet,
355 uppercase, firstupper);
356 @@ -969,16 +1181,22 @@
359 input_buffer.back(sf.size());
360 - fputwc_unlocked(sf[0], output);
361 + writeEscaped(sf.substr(0,1), output);
365 input_buffer.back(1+(size-limit));
366 wstring unknown_word = sf.substr(0, limit);
367 - if(compoundDecomposition)
368 + if(do_decomposition)
370 + if(!dictionaryCase)
372 + firstupper = iswupper(sf[0]);
373 + uppercase = firstupper && iswupper(sf[sf.size()-1]);
376 wstring compound = L"";
377 - compound = decompose(unknown_word);
378 + compound = compoundAnalysis(unknown_word, uppercase, firstupper);
381 printWord(unknown_word, compound, output);
382 @@ -1002,16 +1220,22 @@
385 input_buffer.back(sf.size());
386 - fputwc_unlocked(sf[0], output);
387 + writeEscaped(sf.substr(0,1), output);
391 input_buffer.back(1+(size-limit));
392 wstring unknown_word = sf.substr(0, limit);
393 - if(compoundDecomposition)
394 + if(do_decomposition)
396 + if(!dictionaryCase)
398 + firstupper = iswupper(sf[0]);
399 + uppercase = firstupper && iswupper(sf[sf.size()-1]);
402 wstring compound = L"";
403 - compound = decompose(unknown_word);
404 + compound = compoundAnalysis(unknown_word, uppercase, firstupper);
407 printWord(unknown_word, compound, output);
408 @@ -1296,19 +1520,27 @@
409 fputwc(L'=', output);
410 val = readGeneration(input, output);
414 if(val == L'$' && outOfWord)
416 if(sf[0] == L'*' || sf[0] == L'%')
418 - if(mode != gm_clean)
419 + if(mode != gm_clean && mode != gm_tagged_nm)
421 writeEscaped(sf, output);
424 + else if (mode == gm_clean)
426 writeEscaped(sf.substr(1), output);
428 + else if(mode == gm_tagged_nm)
430 + fputwc_unlocked(L'^', output);
431 + writeEscaped(removeTags(sf.substr(1)), output);
432 + fputwc_unlocked(L'/', output);
433 + writeEscapedWithTags(sf, output);
434 + fputwc_unlocked(L'$', output);
437 else if(sf[0] == L'@')
439 @@ -1324,6 +1556,18 @@
441 writeEscaped(removeTags(sf), output);
443 + else if(mode == gm_tagged)
445 + writeEscaped(removeTags(sf), output);
447 + else if(mode == gm_tagged_nm)
449 + fputwc_unlocked(L'^', output);
450 + writeEscaped(removeTags(sf.substr(1)), output);
451 + fputwc_unlocked(L'/', output);
452 + writeEscapedWithTags(sf, output);
453 + fputwc_unlocked(L'$', output);
456 else if(current_state.isFinal(all_finals))
458 @@ -1330,7 +1574,7 @@
459 bool uppercase = sf.size() > 1 && iswupper(sf[1]);
460 bool firstupper= iswupper(sf[0]);
462 - if(mode == gm_tagged)
463 + if(mode == gm_tagged || mode == gm_tagged_nm)
465 fputwc_unlocked(L'^', output);
467 @@ -1339,10 +1583,10 @@
469 uppercase, firstupper).substr(1).c_str(),
471 - if(mode == gm_tagged)
472 + if(mode == gm_tagged || mode == gm_tagged_nm)
474 fputwc_unlocked(L'/', output);
475 - fputws_unlocked(sf.c_str(), output);
476 + writeEscapedWithTags(sf, output);
477 fputwc_unlocked(L'$', output);
480 @@ -1360,9 +1604,26 @@
482 else if(mode == gm_unknown)
486 + fputwc_unlocked(L'#', output);
487 + writeEscaped(removeTags(sf), output);
490 + else if(mode == gm_tagged)
492 fputwc_unlocked(L'#', output);
493 writeEscaped(removeTags(sf), output);
495 + else if(mode == gm_tagged_nm)
497 + fputwc_unlocked(L'^', output);
498 + writeEscaped(removeTags(sf), output);
499 + fputwc_unlocked(L'/', output);
500 + fputwc_unlocked(L'#', output);
501 + writeEscapedWithTags(sf, output);
502 + fputwc_unlocked(L'$', output);
506 current_state = *initial_state;
507 @@ -2033,19 +2294,62 @@
510 State current_state = *initial_state;
512 - wstring queue = L"";
513 - wstring result = L"";
514 + wstring sf = L""; // source language analysis
515 + wstring queue = L""; // symbols to be added to each target
516 + wstring result = L""; // result of looking up analysis in bidix
520 skipUntil(input, output, L'^');
522 + pair<wstring,int> tr; // readBilingual return value, containing:
523 + int val; // the alphabet value of current symbol, and
524 + wstring symbol = L""; // the current symbol as a string
525 + bool seentags = false; // have we seen any tags at all in the analysis?
527 - while((val = readGeneration(input, output)) != 0x7fffffff)
528 + bool seensurface = false;
529 + wstring surface = L"";
531 + while(true) // ie. while(val != 0x7fffffff)
533 + tr = readBilingual(input, output);
537 + //fwprintf(stderr, L"> %S : %C : %d\n", tr.first.c_str(), tr.second, tr.second);
538 + if(biltransSurfaceForms && !seensurface && !outOfWord)
540 + while(val != L'/' && val != 0x7fffffff)
542 + surface = surface + symbol;
543 + alphabet.getSymbol(surface, val);
544 + tr = readBilingual(input, output);
547 + //fwprintf(stderr, L" == %S : %C : %d => %S\n", symbol.c_str(), val, val, surface.c_str());
549 + seensurface = true;
550 + tr = readBilingual(input, output);
555 + if (val == 0x7fffffff)
560 if(val == L'$' && outOfWord)
562 + if(!seentags) // if no tags: only return complete matches
564 + bool uppercase = sf.size() > 1 && iswupper(sf[1]);
565 + bool firstupper= iswupper(sf[0]);
567 + result = current_state.filterFinals(all_finals, alphabet,
569 + uppercase, firstupper, 0);
574 printWordBilingual(sf, L"/"+sf, output);
575 @@ -2055,14 +2359,23 @@
576 printWordBilingual(sf, compose(result, queue), output);
580 - printWordBilingual(sf, L"/@"+sf, output);
582 + if(biltransSurfaceForms)
584 + printWordBilingual(surface, L"/@"+surface, output);
588 + printWordBilingual(sf, L"/@"+sf, output);
592 + seensurface = false;
596 current_state = *initial_state;
600 else if(iswspace(val) && sf.size() == 0)
602 @@ -2074,7 +2387,11 @@
606 - alphabet.getSymbol(sf, val);
607 + alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic
608 + if(val == 0) // non-alphabetic, possibly unknown tag; add to sf
615 @@ -2082,7 +2399,15 @@
619 - alphabet.getSymbol(sf,val);
620 + alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic
621 + if(val == 0) // non-alphabetic, possibly unknown tag; add to sf
625 + if(alphabet.isTag(val) || val == 0)
629 if(current_state.size() != 0)
631 if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive)
632 @@ -2105,12 +2430,21 @@
634 if(current_state.size() == 0 && result != L"")
636 - if(alphabet.isTag(val))
637 + // We already have a result, but there is still more to read
638 + // of the analysis; following tags are not consumed, but
639 + // output as target language tags (added to result on
641 + if(alphabet.isTag(val)) // known tag
643 alphabet.getSymbol(queue, val);
645 + else if (val == 0) // non-alphabetic, possibly unknown tag
651 + // There are no more alive transductions and the current symbol is not a tag -- unknown word!
655 @@ -2127,6 +2461,7 @@
656 unsigned int end_point = input_word.size()-2;
659 + bool seentags = false; // have we seen any tags at all in the analysis?
661 if(with_delim == false)
663 @@ -2160,6 +2495,7 @@
665 else if(input_word[i] == L'<')
669 for(unsigned int j = i + 1; j <= end_point; j++)
671 @@ -2217,7 +2553,7 @@
674 if(current_state.size() == 0)
677 if(symbol != L"" && result != L"")
679 queue.append(symbol);
680 @@ -2224,20 +2560,39 @@
684 - // word is not present
685 + // word is not present
689 result = L"^@" + input_word.substr(1);
695 result = L"@" + input_word;
698 return pair<wstring, int>(result, 0);
704 + && L"" == current_state.filterFinals(all_finals, alphabet,
706 + uppercase, firstupper, 0))
708 + // word is not present
711 + result = L"^@" + input_word.substr(1);
715 + result = L"@" + input_word;
717 + return pair<wstring, int>(result, 0);
722 // attach unmatched queue automatically
725 @@ -2661,10 +3016,11 @@
731 -FSTProcessor::setDecompoundingMode(bool const value)
732 +FSTProcessor::setBiltransSurfaceForms(bool const value)
734 - compoundDecomposition = value;
735 + biltransSurfaceForms = value;
739 @@ -2688,7 +3044,7 @@
741 FSTProcessor::getDecompoundingMode()
743 - return compoundDecomposition;
744 + return do_decomposition;
748 Index: lttoolbox/lt_comp.cc
749 ===================================================================
750 --- lttoolbox/lt_comp.cc (revision 21745)
751 +++ lttoolbox/lt_comp.cc (working copy)
763 cout << basename(name) << " v" << PACKAGE_VERSION <<": build a letter transducer from a dictionary" << endl;
764 - cout << "USAGE: " << basename(name) << " lr | rl dictionary_file output_file [acx_file]" << endl;
765 + cout << "USAGE: " << basename(name) << " [-avh] lr | rl dictionary_file output_file [acx_file]" << endl;
766 + cout << " -v: set language variant" << endl;
767 + cout << " -a: set alternative (monodix)" << endl;
768 + cout << " -l: set left language variant (bidix)" << endl;
769 + cout << " -r: set right language variant (bidix)" << endl;
770 cout << "Modes:" << endl;
771 cout << " lr: left-to-right compilation" << endl;
772 cout << " rl: right-to-left compilation" << endl;
775 int main(int argc, char *argv[])
777 - if(argc != 4 && argc != 5)
779 + c.setVerbose(false);
781 +#if HAVE_GETOPT_LONG
782 + int option_index=0;
789 +#if HAVE_GETOPT_LONG
790 + static struct option long_options[] =
792 + {"alt", required_argument, 0, 'a'},
793 + {"var", required_argument, 0, 'v'},
794 + {"var-left", required_argument, 0, 'l'},
795 + {"var-right", required_argument, 0, 'r'},
796 + {"help", no_argument, 0, 'h'},
797 + {"verbose", no_argument, 0, 'V'},
801 + int cnt=getopt_long(argc, argv, "a:v:l:r:hV", long_options, &option_index);
803 + int cnt=getopt(argc, argv, "a:v:l:r:hV");
811 + c.setAltValue(optarg);
815 + c.setVariantValue(optarg);
820 + c.setVariantLeftValue(vl);
825 + c.setVariantRightValue(vr);
829 + c.setVerbose(true);
834 + endProgram(argv[0]);
844 + switch(argc - optind + 1)
846 - endProgram(argv[0]);
848 + opc = argv[argc-4];
849 + infile = argv[argc-3];
850 + outfile = argv[argc-2];
851 + acxfile = argv[argc-1];
855 + opc = argv[argc-3];
856 + infile = argv[argc-2];
857 + outfile = argv[argc-1];
861 + endProgram(argv[0]);
865 - string opc = argv[1];
873 + if(vr == "" && vl != "")
875 - c.parseACX(argv[4], Compiler::COMPILER_RESTRICTION_LR_VAL);
876 + cout << "Error: -l specified, but mode is lr" << endl;
877 + endProgram(argv[0]);
879 - c.parse(argv[2], Compiler::COMPILER_RESTRICTION_LR_VAL);
882 + c.parseACX(acxfile, Compiler::COMPILER_RESTRICTION_LR_VAL);
884 + c.parse(infile, Compiler::COMPILER_RESTRICTION_LR_VAL);
888 - c.parse(argv[2], Compiler::COMPILER_RESTRICTION_RL_VAL);
889 + if(vl == "" && vr != "")
891 + cout << "Error: -r specified, but mode is rl" << endl;
892 + endProgram(argv[0]);
894 + c.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL);
902 - FILE *output = fopen(argv[3], "wb");
903 + FILE *output = fopen(outfile.c_str(), "wb");
906 - cerr << "Error: Cannot open file '" << argv[2] << "'." << endl;
907 + cerr << "Error: Cannot open file '" << outfile << "'." << endl;
911 Index: lttoolbox/fst_processor.h
912 ===================================================================
913 --- lttoolbox/fst_processor.h (revision 21745)
914 +++ lttoolbox/fst_processor.h (working copy)
916 gm_clean, // clear all
917 gm_unknown, // display unknown words, clear transfer and generation tags
918 gm_all, // display all
919 - gm_tagged // tagged generation
920 + gm_tagged, // tagged generation
921 + gm_tagged_nm // clean tagged generation
927 map<wstring, TransExe, Ltstr> transducers;
929 - Pool<vector<int> > *pool;
932 * Current state of lexical analysis
938 + * true if we're automatically removing surface forms.
940 + bool biltransSurfaceForms;
944 * if true, makes always difference between uppercase and lowercase
949 * try analysing unknown words as compounds
951 - bool compoundDecomposition;
952 + bool do_decomposition;
955 + * Symbol of CompoundOnlyL
957 + int compoundOnlyLSymbol;
960 + * Symbol of CompoundR
962 + int compoundRSymbol;
965 + * Whether to show the control symbols (such as compoundRSymbol)
967 + bool showControlSymbols;
970 + * Max compound elements
971 + * Hard-coded for now, but a switch to set it may be added one day
973 + int compound_max_elements;
976 * Prints an error of input stream and exits
980 int readGeneration(FILE *input, FILE *output);
983 + * Read text from stream (biltrans version)
984 + * @param input the stream to read
985 + * @return the queue of 0-symbols, and the next symbol in the stream
987 + pair<wstring, int> readBilingual(FILE *input, FILE *output);
990 * Read text from stream (SAO version)
991 * @param input the stream to read
992 * @return the next symbol in the stream
995 void writeEscaped(wstring const &str, FILE *output);
999 + * Write a string to an output stream, escaping all escapable characters
1000 + * but writing tags (everything from the first unescaped '<' on) unescaped
1001 + * @param str the string to write, escaping characters
1002 + * @param output the stream to write in
1004 + void writeEscapedWithTags(wstring const &str, FILE *output);
1008 * Checks if an string ends with a particular suffix
1009 * @param str the string to test
1010 * @param the searched suffix
1013 void printUnknownWord(wstring const &sf, FILE *output);
1015 + void initDecompositionSymbols();
1017 vector<wstring> numbers;
1018 int readTMAnalysis(FILE *input);
1021 void printSpace(wchar_t const val, FILE *output);
1022 void skipUntil(FILE *input, FILE *output, wint_t const character);
1023 static wstring removeTags(wstring const &str);
1024 - wstring decompose(wstring str);
1025 + wstring compoundAnalysis(wstring str, bool uppercase, bool firstupper);
1026 size_t firstNotAlpha(wstring const &sf);
1028 void analysis_wrapper_null_flush(FILE *input, FILE *output);
1031 void setCaseSensitiveMode(bool const value);
1032 void setDictionaryCaseMode(bool const value);
1033 + void setBiltransSurfaceForms(bool const value);
1034 void setNullFlush(bool const value);
1035 bool getNullFlush();
1036 - void setDecompoundingMode(bool const value);
1037 bool getDecompoundingMode();
1040 Index: lttoolbox/lt_proc.cc
1041 ===================================================================
1042 --- lttoolbox/lt_proc.cc (revision 21745)
1043 +++ lttoolbox/lt_proc.cc (working copy)
1045 void endProgram(char *name)
1047 cout << basename(name) << ": process a stream with a letter transducer" << endl;
1048 - cout << "USAGE: " << basename(name) << " [-c] [-a|-g|-n|-d|-p|-s|-t|-b] fst_file [input_file [output_file]]" << endl;
1049 + cout << "USAGE: " << basename(name) << " [ -a | -b | -c | -d | -e | -g | -n | -p | -s | -t | -v | -h -z -w ] fst_file [input_file [output_file]]" << endl;
1050 cout << "Options:" << endl;
1051 #if HAVE_GETOPT_LONG
1052 cout << " -a, --analysis: morphological analysis (default behavior)" << endl;
1053 - cout << " -b, --bilingual: lexical transference" << endl;
1054 + cout << " -b, --bilingual: lexical transfer" << endl;
1055 cout << " -c, --case-sensitive: use the literal case of the incoming characters" << endl;
1056 + cout << " -d, --debugged-gen morph. generation with all the stuff" <<endl;
1057 + cout << " -e, --decompose-nouns: Try to decompound unknown words" << endl;
1058 cout << " -g, --generation: morphological generation" << endl;
1059 + cout << " -l, --tagged-gen: morphological generation keeping lexical forms" << endl;
1060 + cout << " -m, --tagged-nm-gen: same as -l but without unknown word marks" << endl;
1061 cout << " -n, --non-marked-gen morph. generation without unknown word marks" << endl;
1062 - cout << " -d, --debugged-gen morph. generation with all the stuff" <<endl;
1063 + cout << " -o, --surf-bilingual: lexical transfer with surface forms" << endl;
1064 cout << " -p, --post-generation: post-generation" << endl;
1065 - cout << " -e, --decompose-compounds: try to decompose unknown word as compounds" << endl;
1066 cout << " -s, --sao: SAO annotation system input processing" << endl;
1067 cout << " -t, --transliteration: apply transliteration dictionary" << endl;
1068 + cout << " -v, --version: version" << endl;
1069 cout << " -z, --null-flush: flush output on the null character " << endl;
1070 cout << " -w, --dictionary-case: use dictionary case instead of surface case" << endl;
1071 - cout << " -v, --version: version" << endl;
1072 cout << " -h, --help: show this help" << endl;
1074 cout << " -a: morphological analysis (default behavior)" << endl;
1075 + cout << " -b: lexical transfer" << endl;
1076 cout << " -c: use the literal case of the incoming characters" << endl;
1077 + cout << " -d: morph. generation with all the stuff" << endl;
1078 + cout << " -e: try to decompose unknown words as compounds" << endl;
1079 cout << " -g: morphological generation" << endl;
1080 + cout << " -l: morphological generation keeping lexical forms" << endl;
1081 cout << " -n: morph. generation without unknown word marks" << endl;
1082 + cout << " -o: lexical transfer with surface forms" << endl;
1083 cout << " -p: post-generation" << endl;
1084 - cout << " -e: try to decompose unknown words as compounds" << endl;
1085 cout << " -s: SAO annotation system input processing" << endl;
1086 cout << " -t: apply transliteration dictionary" << endl;
1087 + cout << " -v: version" << endl;
1088 cout << " -z: flush output on the null character " << endl;
1089 cout << " -w: use dictionary case instead of surface case" << endl;
1090 - cout << " -v: version" << endl;
1091 cout << " -h: show this help" << endl;
1096 {"analysis", 0, 0, 'a'},
1097 {"bilingual", 0, 0, 'b'},
1098 + {"surf-bilingual", 0, 0, 'o'},
1099 {"generation", 0, 0, 'g'},
1100 {"non-marked-gen", 0, 0, 'n'},
1101 {"debugged-gen", 0, 0, 'd'},
1102 {"tagged-gen", 0, 0, 'l'},
1103 + {"tagged-nm-gen", 0, 0, 'm'},
1104 {"post-generation", 0, 0, 'p'},
1106 {"transliteration", 0, 0, 't'},
1109 #if HAVE_GETOPT_LONG
1111 - int c = getopt_long(argc, argv, "abceglndpstzwvh", long_options, &option_index);
1112 + int c = getopt_long(argc, argv, "abceglmndopstzwvh", long_options, &option_index);
1114 - int c = getopt(argc, argv, "abceglndpstzwvh");
1115 + int c = getopt(argc, argv, "abceglmndopstzwvh");
1119 @@ -123,13 +132,12 @@
1120 fstp.setCaseSensitiveMode(true);
1124 - fstp.setDecompoundingMode(true);
1136 @@ -248,11 +256,19 @@
1137 fstp.initGeneration();
1138 checkValidity(fstp);
1139 fstp.generation(input, output, gm_all);
1143 fstp.initGeneration();
1144 checkValidity(fstp);
1145 fstp.generation(input, output, gm_tagged);
1149 + fstp.initGeneration();
1150 + checkValidity(fstp);
1151 + fstp.generation(input, output, gm_tagged_nm);
1155 fstp.initPostgeneration();
1156 @@ -272,11 +288,24 @@
1157 fstp.transliteration(input, output);
1161 + fstp.initBiltrans();
1162 + checkValidity(fstp);
1163 + fstp.setBiltransSurfaceForms(true);
1164 + fstp.bilingual(input, output);
1168 fstp.initBiltrans();
1169 checkValidity(fstp);
1170 fstp.bilingual(input, output);
1174 + fstp.initDecomposition();
1175 + checkValidity(fstp);
1176 + fstp.analysis(input, output);
1181 Index: lttoolbox/expander.cc
1182 ===================================================================
1183 --- lttoolbox/expander.cc (revision 21745)
1184 +++ lttoolbox/expander.cc (working copy)
1185 @@ -295,9 +295,18 @@
1187 wstring atributo=this->attrib(Compiler::COMPILER_RESTRICTION_ATTR);
1188 wstring entrname=this->attrib(Compiler::COMPILER_LEMMA_ATTR);
1189 + wstring altval = this->attrib(Compiler::COMPILER_ALT_ATTR);
1190 + wstring varval = this->attrib(Compiler::COMPILER_V_ATTR);
1191 + wstring varl = this->attrib(Compiler::COMPILER_VL_ATTR);
1192 + wstring varr = this->attrib(Compiler::COMPILER_VR_ATTR);
1194 wstring myname = L"";
1195 - if(this->attrib(Compiler::COMPILER_IGNORE_ATTR) == L"yes")
1196 + if(this->attrib(Compiler::COMPILER_IGNORE_ATTR) == L"yes"
1197 + || altval != L"" && altval != alt
1198 + || (varval != L"" && varval != variant && atributo == Compiler::COMPILER_RESTRICTION_RL_VAL)
1199 + || ((varl != L"" && varl != variant_left) && (varr != L"" && varr != variant_right))
1200 + || (varl != L"" && varl != variant_left && atributo == Compiler::COMPILER_RESTRICTION_RL_VAL)
1201 + || (varr != L"" && varr != variant_right && atributo == Compiler::COMPILER_RESTRICTION_LR_VAL))
1205 @@ -316,11 +325,14 @@
1208 EntList items, items_lr, items_rl;
1209 - if(atributo == Compiler::COMPILER_RESTRICTION_LR_VAL)
1210 + if(atributo == Compiler::COMPILER_RESTRICTION_LR_VAL
1211 + || (varval != L"" && varval != variant && atributo != Compiler::COMPILER_RESTRICTION_RL_VAL)
1212 + || varl != L"" && varl != variant_left)
1214 items_lr.push_back(pair<wstring, wstring>(L"", L""));
1216 - else if(atributo == Compiler::COMPILER_RESTRICTION_RL_VAL)
1217 + else if(atributo == Compiler::COMPILER_RESTRICTION_RL_VAL
1218 + || (varr != L"" && varr != variant_right))
1220 items_rl.push_back(pair<wstring, wstring>(L"", L""));
1222 @@ -594,3 +606,27 @@
1223 it->second.append(endings.second);
1228 +Expander::setAltValue(string const &a)
1230 + alt = XMLParseUtil::stows(a);
1234 +Expander::setVariantValue(string const &v)
1236 + variant = XMLParseUtil::stows(v);
1240 +Expander::setVariantLeftValue(string const &v)
1242 + variant_left = XMLParseUtil::stows(v);
1246 +Expander::setVariantRightValue(string const &v)
1248 + variant_right = XMLParseUtil::stows(v);
1250 Index: lttoolbox/lt-expand.1
1251 ===================================================================
1252 --- lttoolbox/lt-expand.1 (revision 21745)
1253 +++ lttoolbox/lt-expand.1 (working copy)
1255 architecture: \fBhttp://www.apertium.org\fR.
1265 dictionary_file [output_file]
1271 +.B \-\-var\-left \fR|
1272 +.B \-\-var\-right \fR|
1275 +dictionary_file [output_file]
1279 -Is the application responsible of expanding a dictionary into a
1280 +Is the application responsible for expanding a dictionary into a
1281 simple list of input string-output string pairs by eliminating
1282 paradigms through substitution and unfolding.
1285 The output goes to \fIoutput_file\fR if it is present or to standard
1286 output if it is missing.
1291 +Sets the value of the \fIalt\fR attribute to use in expansion
1294 +Sets the value of the \fIv\fR attribute to use in expansion of monodixes
1296 +.B \-l, \-\-var\-left
1297 +Sets the value of the \fIvl\fR attribute to use in expansion of bidixes
1299 +.B \-r, \-\-var\-right
1300 +Sets the value of the \fIvr\fR attribute to use in expansion of bidixes
1303 +Prints a short help message
1307 The input dictionary to expand.
1310 Lots of...lurking in the dark and waiting for you!
1312 -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights
1314 +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante.
1315 Index: lttoolbox/dix.dtd
1316 ===================================================================
1317 --- lttoolbox/dix.dtd (revision 21745)
1318 +++ lttoolbox/dix.dtd (working copy)
1321 + Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
1323 + This program is free software; you can redistribute it and/or
1324 + modify it under the terms of the GNU General Public License as
1325 + published by the Free Software Foundation; either version 2 of the
1326 + License, or (at your option) any later version.
1328 + This program is distributed in the hope that it will be useful, but
1329 + WITHOUT ANY WARRANTY; without even the implied warranty of
1330 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1331 + General Public License for more details.
1333 + You should have received a copy of the GNU General Public License
1334 + along with this program; if not, write to the Free Software
1335 + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
1338 DTD for the format of dictionaries
1340 <!ELEMENT dictionary (alphabet?, sdefs?,
1345 + alt CDATA #IMPLIED
1350 <!-- r: restriction LR: left-to-right,
1351 RL: right-to-left -->
1353 <!-- i: ignore ('yes') means ignore, otherwise it is not ignored) -->
1354 <!-- slr: translation sense when translating from left to right -->
1355 <!-- srl: translation sense when translating from right to left -->
1356 + <!-- alt: alternative entries are omitted if not selected -->
1357 + <!-- v: variant sets (monodix) direction restrictions based on language variant -->
1358 + <!-- vl: variant left sets direction restrictions based on language variant for language on left of bidix -->
1359 + <!-- vr: variant right sets direction restrictions based on language variant for language on right of bidix -->
1360 <!ELEMENT par EMPTY>
1361 <!-- reference to paradigm -->
1363 Index: lttoolbox/compiler.cc
1364 ===================================================================
1365 --- lttoolbox/compiler.cc (revision 21745)
1366 +++ lttoolbox/compiler.cc (working copy)
1368 wstring const Compiler::COMPILER_LEMMA_ATTR = L"lm";
1369 wstring const Compiler::COMPILER_IGNORE_ATTR = L"i";
1370 wstring const Compiler::COMPILER_IGNORE_YES_VAL = L"yes";
1371 +wstring const Compiler::COMPILER_ALT_ATTR = L"alt";
1372 +wstring const Compiler::COMPILER_V_ATTR = L"v";
1373 +wstring const Compiler::COMPILER_VL_ATTR = L"vl";
1374 +wstring const Compiler::COMPILER_VR_ATTR = L"vr";
1376 Compiler::Compiler()
1378 @@ -417,6 +421,12 @@
1382 + if(verbose && first_element && (both_sides.front() == (int)L' '))
1384 + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
1385 + wcerr << L"): Entry begins with space." << endl;
1387 + first_element = false;
1389 e.setSingleTransduction(both_sides, both_sides);
1391 @@ -444,6 +454,13 @@
1392 readString(lhs, name);
1396 + if(verbose && first_element && (lhs.front() == (int)L' '))
1398 + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
1399 + wcerr << L"): Entry begins with space." << endl;
1401 + first_element = false;
1403 skip(name, COMPILER_RIGHT_ELEM);
1405 @@ -480,7 +497,15 @@
1408 wstring nomparadigma = attrib(COMPILER_N_ATTR);
1409 + first_element = false;
1411 + if(current_paradigm != L"" && nomparadigma == current_paradigm)
1413 + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
1414 + wcerr << L"): Paradigm refers to itself '" << nomparadigma << L"'." <<endl;
1415 + exit(EXIT_FAILURE);
1418 if(paradigms.find(nomparadigma) == paradigms.end())
1420 wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
1421 @@ -632,9 +657,18 @@
1423 wstring atributo=this->attrib(COMPILER_RESTRICTION_ATTR);
1424 wstring ignore = this->attrib(COMPILER_IGNORE_ATTR);
1425 + wstring altval = this->attrib(COMPILER_ALT_ATTR);
1426 + wstring varval = this->attrib(COMPILER_V_ATTR);
1427 + wstring varl = this->attrib(COMPILER_VL_ATTR);
1428 + wstring varr = this->attrib(COMPILER_VR_ATTR);
1430 // if entry is masked by a restriction of direction or an ignore mark
1431 - if((atributo != L"" && atributo != direction) || ignore == COMPILER_IGNORE_YES_VAL)
1432 + if((atributo != L"" && atributo != direction)
1433 + || ignore == COMPILER_IGNORE_YES_VAL
1434 + || (altval != L"" && altval != alt)
1435 + || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant)
1436 + || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left)
1437 + || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right))
1439 // parse to the end of the entry
1441 @@ -662,6 +696,11 @@
1442 wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
1445 + if(current_paradigm == L"" && verbose)
1447 + first_element = true;
1450 int tipo = xmlTextReaderNodeType(reader);
1451 if(name == COMPILER_PAIR_ELEM)
1453 @@ -845,3 +884,33 @@
1454 it->second.write(output);
1459 +Compiler::setAltValue(string const &a)
1461 + alt = XMLParseUtil::stows(a);
1465 +Compiler::setVariantValue(string const &v)
1467 + variant = XMLParseUtil::stows(v);
1471 +Compiler::setVariantLeftValue(string const &v)
1473 + variant_left = XMLParseUtil::stows(v);
1477 +Compiler::setVariantRightValue(string const &v)
1479 + variant_right = XMLParseUtil::stows(v);
1483 +Compiler::setVerbose(bool verbosity)
1485 + verbose = verbosity;
1487 Index: lttoolbox/transducer.h
1488 ===================================================================
1489 --- lttoolbox/transducer.h (revision 21745)
1490 +++ lttoolbox/transducer.h (working copy)
1491 @@ -146,6 +146,13 @@
1492 bool isFinal(int const state) const;
1495 + * Test if a pattern is recognised by the FST
1496 + * @param patro a widestring of the pattern to be recognised
1497 + * @return true if the pattern is recognised by the transducer
1499 + bool recognise(wstring patro, Alphabet &a, FILE *err = stderr);
1502 * Set the state as a final or not, yes by default
1503 * @param state the state
1504 * @param value if true, the state is set as final state
1505 @@ -179,6 +186,12 @@
1506 void reverse(int const epsilon_tag = 0);
1509 + * Print all the transductions of a transducer in ATT format
1510 + * @param epsilon_tag the tag to take as epsilon
1512 + void show(Alphabet &a, FILE *output = stdout, int const epsilon_tag = 0);
1515 * Determinize the transducer
1516 * @param epsilon_tag the tag to take as epsilon
1518 @@ -242,6 +255,12 @@
1519 bool isEmpty(int const state) const;
1522 + * Returns the number of transitions from a given state
1523 + * @return the number of transitions
1525 + int getStateSize(int const state);
1529 * @param output the stream to write to
1530 * @param decalage offset to sum to the tags
1531 Index: lttoolbox/lt_expand.cc
1532 ===================================================================
1533 --- lttoolbox/lt_expand.cc (revision 21745)
1534 +++ lttoolbox/lt_expand.cc (working copy)
1539 +#include <getopt.h>
1546 cout << basename(name) << " v" << PACKAGE_VERSION <<": expand the contents of a dictionary file" << endl;
1547 - cout << "USAGE: " << basename(name) << " dictionary_file [output_file]" << endl;
1548 + cout << "USAGE: " << basename(name) << " [-avlrh] dictionary_file [output_file]" << endl;
1553 int main(int argc, char *argv[])
1555 FILE *input = NULL, *output = NULL;
1559 +#if HAVE_GETOPT_LONG
1560 + int option_index=0;
1564 +#if HAVE_GETOPT_LONG
1565 + static struct option long_options[] =
1567 + {"alt", required_argument, 0, 'a'},
1568 + {"var", required_argument, 0, 'v'},
1569 + {"var-left", required_argument, 0, 'l'},
1570 + {"var-right", required_argument, 0, 'r'},
1571 + {"help", no_argument, 0, 'h'},
1575 + int cnt=getopt_long(argc, argv, "a:v:l:r:h", long_options, &option_index);
1577 + int cnt=getopt(argc, argv, "a:v:l:r:h");
1585 + e.setAltValue(optarg);
1589 + e.setVariantValue(optarg);
1593 + e.setVariantLeftValue(optarg);
1597 + e.setVariantRightValue(optarg);
1602 + endProgram(argv[0]);
1610 + switch(argc - optind + 1)
1613 - input = fopen(argv[1], "rb");
1614 + infile = argv[argc-1];
1615 + input = fopen(infile.c_str(), "rb");
1618 - cerr << "Error: Cannot open file '" << argv[1] << "'." << endl;
1619 + cerr << "Error: Cannot open file '" << infile << "'." << endl;
1623 @@ -60,18 +114,20 @@
1627 - input = fopen(argv[1], "rb");
1628 + infile = argv[argc-2];
1629 + input = fopen(infile.c_str(), "rb");
1632 - cerr << "Error: Cannot open file '" << argv[1] << "'." << endl;
1633 + cerr << "Error: Cannot open file '" << infile << "'." << endl;
1638 - output = fopen(argv[2], "wb");
1639 + outfile = argv[argc-1];
1640 + output = fopen(argv[argc-1], "wb");
1643 - cerr << "Error: Cannot open file '" << argv[2] << "'." << endl;
1644 + cerr << "Error: Cannot open file '" << outfile << "'." << endl;
1649 _setmode(_fileno(output), _O_U8TEXT);
1653 - e.expand(argv[1], output);
1654 + e.expand(infile, output);
1657 return EXIT_SUCCESS;
1658 Index: lttoolbox/state.cc
1659 ===================================================================
1660 --- lttoolbox/state.cc (revision 21745)
1661 +++ lttoolbox/state.cc (working copy)
1668 -State::State(Pool<vector<int> > *p)
1670 +//#include <iostream>
1671 +//using namespace std;
1684 - // release references
1685 for(size_t i = 0, limit = state.size(); i != limit; i++)
1687 - pool->release(state[i].sequence);
1688 + delete state[i].sequence;
1693 // release references
1694 for(size_t i = 0, limit = state.size(); i != limit; i++)
1696 - pool->release(state[i].sequence);
1697 + delete state[i].sequence;
1703 for(size_t i = 0, limit = state.size(); i != limit; i++)
1705 - vector<int> *tmp = pool->get();
1706 + vector<int> *tmp = new vector<int>();
1707 *tmp = *(state[i].sequence);
1708 state[i].sequence = tmp;
1711 State::init(Node *initial)
1714 - state.push_back(TNodeState(initial,pool->get(),false));
1715 + state.push_back(TNodeState(initial, new vector<int>(), false));
1716 state[0].sequence->clear();
1721 for(int j = 0; j != it->second.size; j++)
1723 - vector<int> *new_v = pool->get();
1724 + vector<int> *new_v = new vector<int>();
1725 *new_v = *(state[i].sequence);
1729 new_state.push_back(TNodeState(it->second.dest[j], new_v, state[i].dirty||false));
1732 - pool->release(state[i].sequence);
1733 + delete state[i].sequence;
1739 for(int j = 0; j != it->second.size; j++)
1741 - vector<int> *new_v = pool->get();
1742 - *new_v = *(state[i].sequence);
1743 + vector<int> *new_v = new vector<int>();
1744 + *new_v = *(state[i].sequence);
1747 new_v->push_back(it->second.out_tag[j]);
1750 for(int j = 0; j != it->second.size; j++)
1752 - vector<int> *new_v = pool->get();
1753 + vector<int> *new_v = new vector<int>();
1754 *new_v = *(state[i].sequence);
1758 new_state.push_back(TNodeState(it->second.dest[j], new_v, true));
1761 - pool->release(state[i].sequence);
1762 + delete state[i].sequence;
1768 for(int j = 0 ; j != it2->second.size; j++)
1770 - vector<int> *tmp = pool->get();
1771 + vector<int> *tmp = new vector<int>();
1772 *tmp = *(state[i].sequence);
1773 if(it2->second.out_tag[j] != 0)
1775 @@ -199,6 +202,69 @@
1780 +State::apply(int const input, int const alt1, int const alt2)
1782 + vector<TNodeState> new_state;
1783 + if(input == 0 || alt1 == 0 || alt2 == 0)
1785 + state = new_state;
1789 + for(size_t i = 0, limit = state.size(); i != limit; i++)
1791 + map<int, Dest>::const_iterator it;
1792 + it = state[i].where->transitions.find(input);
1793 + if(it != state[i].where->transitions.end())
1795 + for(int j = 0; j != it->second.size; j++)
1797 + vector<int> *new_v = new vector<int>();
1798 + *new_v = *(state[i].sequence);
1799 + if(it->first != 0)
1801 + new_v->push_back(it->second.out_tag[j]);
1803 + new_state.push_back(TNodeState(it->second.dest[j], new_v, state[i].dirty||false));
1806 + it = state[i].where->transitions.find(alt1);
1807 + if(it != state[i].where->transitions.end())
1809 + for(int j = 0; j != it->second.size; j++)
1811 + vector<int> *new_v = new vector<int>();
1812 + *new_v = *(state[i].sequence);
1813 + if(it->first != 0)
1815 + new_v->push_back(it->second.out_tag[j]);
1817 + new_state.push_back(TNodeState(it->second.dest[j], new_v, true));
1820 + it = state[i].where->transitions.find(alt2);
1821 + if(it != state[i].where->transitions.end())
1823 + for(int j = 0; j != it->second.size; j++)
1825 + vector<int> *new_v = new vector<int>();
1826 + *new_v = *(state[i].sequence);
1827 + if(it->first != 0)
1829 + new_v->push_back(it->second.out_tag[j]);
1831 + new_state.push_back(TNodeState(it->second.dest[j], new_v, true));
1835 + delete state[i].sequence;
1838 + state = new_state;
1843 State::step(int const input)
1845 @@ -213,6 +279,37 @@
1850 +State::step(int const input, int const alt1, int const alt2)
1852 + apply(input, alt1, alt2);
1857 +State::step_case(wchar_t val, wchar_t val2, bool caseSensitive)
1859 + if (!iswupper(val) || caseSensitive) {
1861 + } else if(val != towlower(val)) {
1862 + step(val, towlower(val), val2);
1870 +State::step_case(wchar_t val, bool caseSensitive)
1872 + if (!iswupper(val) || caseSensitive) {
1875 + step(val, towlower(val));
1881 State::isFinal(set<Node *> const &finals) const
1883 @@ -282,6 +379,60 @@
1888 +set<pair<wstring, vector<wstring> > >
1889 +State::filterFinalsLRX(set<Node *> const &finals,
1890 + Alphabet const &alphabet,
1891 + set<wchar_t> const &escaped_chars,
1892 + bool uppercase, bool firstupper, int firstchar) const
1894 + set<pair<wstring, vector<wstring> > > results;
1896 + vector<wstring> current_result;
1897 + wstring rule_id = L"";
1899 + // /<$><select>station<n><ANY_TAG><$><skip><6>/<$><select>station<n><ANY_TAG><$><skip><6>
1901 + // if <$> current_result.push_back(current_word)
1902 + // if / results.insert(current_result)
1904 + for(size_t i = 0, limit = state.size(); i != limit; i++)
1906 + if(finals.find(state[i].where) != finals.end())
1908 + current_result.clear();
1910 + wstring current_word = L"";
1911 + for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++)
1913 + if(escaped_chars.find((*(state[i].sequence))[j]) != escaped_chars.end())
1915 + current_word += L'\\';
1917 + wstring sym = L"";
1918 + alphabet.getSymbol(sym, (*(state[i].sequence))[j], uppercase);
1921 + if(current_word != L"")
1923 + current_result.push_back(current_word);
1925 + current_word = L"";
1929 + current_word += sym;
1932 + rule_id = current_word;
1933 + results.insert(make_pair(rule_id, current_result));
1942 State::filterFinalsSAO(set<Node *> const &finals,
1943 Alphabet const &alphabet,
1944 @@ -438,3 +589,149 @@
1952 +State::pruneCompounds(int requiredSymbol, int separationSymbol, int compound_max_elements)
1954 + int minNoOfCompoundElements = compound_max_elements;
1955 + int *noOfCompoundElements = new int[state.size()];
1957 + //wcerr << L"pruneCompounds..." << endl;
1959 + for (unsigned int i = 0; i<state.size(); i++) {
1960 + vector<int> seq = *state.at(i).sequence;
1962 + if (lastPartHasRequiredSymbol(seq, requiredSymbol, separationSymbol)) {
1963 + int this_noOfCompoundElements = 0;
1964 + for (int j = seq.size()-2; j>0; j--) if (seq.at(j)==separationSymbol) this_noOfCompoundElements++;
1965 + noOfCompoundElements[i] = this_noOfCompoundElements;
1966 + minNoOfCompoundElements = (minNoOfCompoundElements < this_noOfCompoundElements) ?
1967 + minNoOfCompoundElements : this_noOfCompoundElements;
1970 + noOfCompoundElements[i] = INT_MAX;
1971 + //wcerr << L"Prune - No requiered symbol in state number " << i << endl;
1975 + // remove states with more than minimum number of compounds (or without the required symbol in the last part)
1976 + vector<TNodeState>::iterator it = state.begin();
1978 + while(it != state.end()) {
1979 + if (noOfCompoundElements[i] > minNoOfCompoundElements) {
1980 + delete (*it).sequence;
1981 + it = state.erase(it);
1982 + //wcerr << L"Prune - State number " << i << L" removed!" << endl;
1988 + delete[] noOfCompoundElements;
1994 +State::pruneStatesWithForbiddenSymbol(int forbiddenSymbol)
1996 + vector<TNodeState>::iterator it = state.begin();
1997 + while(it != state.end()) {
1998 + vector<int> *seq = (*it).sequence;
1999 + bool found = false;
2000 + for(int i = seq->size()-1; i>=0; i--) {
2001 + if(seq->at(i) == forbiddenSymbol) {
2003 + delete (*it).sequence;
2004 + it = state.erase(it);
2015 +State::lastPartHasRequiredSymbol(const vector<int> &seq, int requiredSymbol, int separationSymbol)
2017 + // state is final - it should be restarted with all elements in stateset restart_state, with old symbols conserved
2018 + bool restart=false;
2019 + for (int n=seq.size()-1; n>=0; n--) {
2020 + int symbol=seq.at(n);
2021 + if (symbol==requiredSymbol) {
2025 + if (symbol==separationSymbol) {
2034 +State::restartFinals(const set<Node *> &finals, int requiredSymbol, State *restart_state, int separationSymbol)
2037 + for (unsigned int i=0; i<state.size(); i++) {
2038 + TNodeState state_i = state.at(i);
2039 + // A state can be a possible final state and still have transitions
2041 + if (finals.count(state_i.where) > 0) {
2042 + bool restart = lastPartHasRequiredSymbol(*(state_i.sequence), requiredSymbol, separationSymbol);
2044 + if (restart_state != NULL) {
2045 + for (unsigned int j=0; j<restart_state->state.size(); j++) {
2046 + TNodeState initst = restart_state->state.at(j);
2047 + vector<int> *tnvec = new vector<int>;
2049 + for(unsigned int k=0; k < state_i.sequence->size(); k++) tnvec->push_back(state_i.sequence->at(k));
2050 + TNodeState tn(initst.where, tnvec, state_i.dirty);
2051 + tn.sequence->push_back(separationSymbol);
2052 + state.push_back(tn);
2063 +State::getReadableString(const Alphabet &a)
2065 + wstring retval = L"[";
2067 + for(unsigned int i=0; i<state.size(); i++) {
2068 + vector<int>* seq = state.at(i).sequence;
2069 + if(seq != NULL) for (unsigned int j=0; j<seq->size(); j++) {
2071 + a.getSymbol(ws, seq->at(j));
2072 + //if(ws == L"") ws = L"?";
2073 + retval.append(ws);
2076 + /*Node *where = state.at(i).where;
2077 + if(where == NULL) retval.append(L"→@null");
2079 + retval.append(L"→");
2080 + map<int, Dest>::iterator it;
2082 + for (it = where->transitions.begin(); it != where->transitions.end(); it++) {
2083 + int symbol = (*it).first;
2084 + a.getSymbol(ws, symbol);
2085 + retval.append(ws);
2088 + if (i+1 < state.size()) retval.append(L", ");
2090 + retval.append(L"]");
2094 Index: lttoolbox/alphabet.cc
2095 ===================================================================
2096 --- lttoolbox/alphabet.cc (revision 21745)
2097 +++ lttoolbox/alphabet.cc (working copy)
2100 return spairinv[code];
2104 +void Alphabet::setSymbol(int symbol, wstring newSymbolString) {
2105 + //Should be a special character!
2106 + if (symbol < 0) slexicinv[-symbol-1] = newSymbolString;
2108 Index: lttoolbox/lt-tmxproc.1
2109 ===================================================================
2110 --- lttoolbox/lt-tmxproc.1 (revision 21745)
2111 +++ lttoolbox/lt-tmxproc.1 (working copy)
2114 Lots of...lurking in the dark and waiting for you!
2116 -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights
2118 +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante.
2119 Index: lttoolbox/lt-comp.1
2120 ===================================================================
2121 --- lttoolbox/lt-comp.1 (revision 21745)
2122 +++ lttoolbox/lt-comp.1 (working copy)
2136 ] dictionary_file output_file
2142 +.B \-\-var\-left \fR|
2143 +.B \-\-var\-right \fR|
2149 +] dictionary_file output_file
2153 Is the application responsible of compiling dictionaries used by
2159 +Sets the value of the \fIalt\fR attribute to use in compilation.
2161 +Note that if no value is set, all entries containing an \fIalt\fR
2162 +attribute are omitted.
2165 +Sets the value of the \fIv\fR attribute to use in compilation.
2166 +This should only be used with monodixes; for bidixes, see \-l and \-r.
2168 +Note that if no value is set, all entries containing a \fIv\fR
2169 +attribute are considered to be \fIleft-to-right\fR.
2171 +.B \-l, \-\-var\-left
2172 +Sets the value of the \fIvl\fR attribute for use in compilation of bidixes.
2173 +"Left" here refers to the side of the dictionary, so this option is only valid
2176 +.B \-r, \-\-var\-right
2177 +Sets the value of the \fIvr\fR attribute for use in compilation of bidixes.
2178 +"Right" here refers to the side of the dictionary, so this option is only valid
2182 +Prints a short help message
2185 The resulting transducer will process dictionary entries
2186 \fIleft-to-right\fR.
2189 Lots of...lurking in the dark and waiting for you!
2191 -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights
2193 +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante.
2194 Index: lttoolbox/lt_locale.h
2195 ===================================================================
2196 --- lttoolbox/lt_locale.h (revision 21745)
2197 +++ lttoolbox/lt_locale.h (working copy)
2199 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
2206 Index: lttoolbox/expander.h
2207 ===================================================================
2208 --- lttoolbox/expander.h (revision 21745)
2209 +++ lttoolbox/expander.h (working copy)
2211 xmlTextReaderPtr reader;
2219 + * The variant value (monodix)
2224 + * The variant value (left side of bidix)
2226 + wstring variant_left;
2229 + * The variant value (right side of bidix)
2231 + wstring variant_right;
2234 * The paradigm being compiled
2236 wstring current_paradigm;
2237 @@ -186,6 +206,29 @@
2238 * Compile dictionary to letter transducers
2240 void expand(string const &fichero, FILE *output);
2242 + * Set the alt value to use in expansion
2243 + * @param a the value
2245 + void setAltValue(string const &a);
2248 + * Set the variant value to use in expansion
2249 + * @param v the value
2251 + void setVariantValue(string const &v);
2254 + * Set the variant_left value to use in expansion
2255 + * @param v the value
2257 + void setVariantLeftValue(string const &v);
2260 + * Set the variant_right value to use in expansion
2261 + * @param v the value
2263 + void setVariantRightValue(string const &v);
2267 Index: lttoolbox/transducer.cc
2268 ===================================================================
2269 --- lttoolbox/transducer.cc (revision 21745)
2270 +++ lttoolbox/transducer.cc (working copy)
2273 #include <lttoolbox/transducer.h>
2274 #include <lttoolbox/compression.h>
2275 +#include <lttoolbox/alphabet.h>
2276 #include <lttoolbox/lttoolbox_config.h>
2277 #include <lttoolbox/my_stdio.h>
2279 @@ -187,6 +188,13 @@
2281 Transducer::setFinal(int const state, bool valor)
2283 + int initial_copy = getInitial();
2285 + if(state == initial_copy)
2287 + wcerr << L"Setting initial state to final" << endl;
2292 finals.insert(state);
2293 @@ -609,3 +617,119 @@
2299 +Transducer::show(Alphabet &alphabet, FILE *output, int const epsilon_tag)
2301 + joinFinals(epsilon_tag);
2303 + map<int, multimap<int, int> > temporal;
2305 + for(map<int, multimap<int, int> >::iterator it = transitions.begin(); it != transitions.end(); it++)
2307 + multimap<int, int> aux = it->second;
2309 + for(multimap<int, int>::iterator it2 = aux.begin(); it2 != aux.end(); it2++)
2311 + pair<int, int> t = alphabet.decode(it2->first);
2312 + fwprintf(output, L"%d\t", it->first);
2313 + fwprintf(output, L"%d\t", it2->second);
2315 + alphabet.getSymbol(l, t.first);
2316 + if(l == L"") // If we find an epsilon
2318 + fwprintf(output, L"ε\t", l.c_str());
2322 + fwprintf(output, L"%S\t", l.c_str());
2325 + alphabet.getSymbol(r, t.second);
2326 + if(r == L"") // If we find an epsilon
2328 + fwprintf(output, L"ε\t", r.c_str());
2332 + fwprintf(output, L"%S\t", r.c_str());
2334 + fwprintf(output, L"\n");
2338 + for(set<int>::iterator it3 = finals.begin(); it3 != finals.end(); it3++)
2340 + fwprintf(output, L"%d\n", *it3);
2345 +Transducer::getStateSize(int const state)
2348 + set<int> myclosure1 = closure(state, 0);
2349 + states.insert(myclosure1.begin(), myclosure1.end());
2350 + int num_transitions = 0;
2352 + for(set<int>::iterator it2 = states.begin(); it2 != states.end(); it2++)
2354 + num_transitions += transitions[*it2].size();
2357 + return num_transitions;
2361 +Transducer::recognise(wstring patro, Alphabet &a, FILE *err)
2363 + bool accepted = false;
2366 + set<int> myclosure1 = closure(getInitial(), 0);
2367 + states.insert(myclosure1.begin(), myclosure1.end());
2368 + // For each of the characters in the input string
2369 + for(wstring::iterator it = patro.begin(); it != patro.end(); it++)
2371 + set<int> new_state; //Transducer::closure(int const state, int const epsilon_tag)
2373 + // For each of the current alive states
2374 + //fwprintf(err, L"step: %S %C (%d)\n", patro.c_str(), *it, sym);
2375 + for(set<int>::iterator it2 = states.begin(); it2 != states.end(); it2++)
2377 + multimap<int, int> p = transitions[*it2];
2378 + // For each of the transitions in the state
2380 + for(multimap<int, int>::iterator it3 = p.begin(); it3 != p.end(); it3++)
2383 + pair<int, int> t = a.decode(it3->first);
2385 + a.getSymbol(l, t.first);
2386 + //wstring r = L"";
2387 + //a.getSymbol(r, t.second);
2389 + //fwprintf(err, L" -> state: %d, trans: %S:%S, targ: %d\n", *it2, (l == L"") ? L"ε" : l.c_str(), (r == L"") ? L"ε" : r.c_str(), it3->second);
2390 + //if(l.find(*it) != wstring::npos || l == L"" )
2391 + if(l.find(*it) != wstring::npos)
2393 + set<int> myclosure = closure(it3->second, 0);
2394 + //wcerr << L"Before closure alives: " <<new_state.size() << endl;
2395 + new_state.insert(myclosure.begin(), myclosure.end());
2396 + //wcerr << L"After closure alives: " <<new_state.size() << endl;
2400 + states = new_state;
2402 + for(set<int>::iterator it4 = states.begin(); it4 != states.end(); it4++)
2413 Index: lttoolbox/pool.h
2414 ===================================================================
2415 --- lttoolbox/pool.h (revision 21745)
2416 +++ lttoolbox/pool.h (working copy)
2419 - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
2421 - * This program is free software; you can redistribute it and/or
2422 - * modify it under the terms of the GNU General Public License as
2423 - * published by the Free Software Foundation; either version 2 of the
2424 - * License, or (at your option) any later version.
2426 - * This program is distributed in the hope that it will be useful, but
2427 - * WITHOUT ANY WARRANTY; without even the implied warranty of
2428 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2429 - * General Public License for more details.
2431 - * You should have received a copy of the GNU General Public License
2432 - * along with this program; if not, write to the Free Software
2433 - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
2434 - * 02111-1307, USA.
2436 -#ifndef _GENERIC_POOL_
2437 -#define _GENERIC_POOL_
2441 -using namespace std;
2444 - * Pool of T objects
2451 - * Free pointers to objects
2456 - * Currently created objects
2462 - * @param other pool object
2464 - void copy(Pool const &p)
2466 - created = p.created;
2468 - // all new members are available
2469 - for(typename list<T>::iterator it = created.begin(), limit = created.end();
2470 - it != limit; it++)
2472 - free.push_back(&(*it));
2485 - * Allocate a pool of nelems size
2486 - * @param nelems initial size of the pool
2488 - void init(unsigned int const nelems)
2493 - for(unsigned int i = 0; i != nelems; i++)
2495 - created.push_front(tmp);
2496 - free.push_front(&(*(created.begin())));
2501 - * Allocate a pool of nelems size with objects equal to 'object'
2502 - * @param nelems initial size of the pool
2503 - * @param object initial value of the objects in the pool
2505 - void init(unsigned int const nelems, T const &object)
2509 - for(unsigned int i = 0; i != nelems; i++)
2511 - created.push_front(object);
2512 - free.push_front(&(*(created.begin())));
2528 - * Parametrized constructor
2529 - * @param nelems initial size of the pool
2530 - * @param object initial value of the objects in the pool
2532 - Pool(unsigned int const nelems, T const &object)
2534 - init(nelems, object);
2538 - * Parametrized constructor
2539 - * @param nelems initial size of the pool
2541 - Pool(unsigned int const nelems)
2555 - * Copy constructor
2557 - Pool(Pool const &p)
2563 - * Allocate a pointer to a free 'new' object.
2564 - * @return pointer to the object
2568 - if(free.size() != 0)
2570 - T *result = *(free.begin());
2571 - free.erase(free.begin());
2577 - created.push_front(tmp);
2578 - return &(*(created.begin()));
2583 - * Release a no more needed instance of a pooled object
2584 - * @param item the no more needed instance of the object
2586 - void release(T *item)
2588 - free.push_front(item);
2593 Index: lttoolbox/compiler.h
2594 ===================================================================
2595 --- lttoolbox/compiler.h (revision 21745)
2596 +++ lttoolbox/compiler.h (working copy)
2598 xmlTextReaderPtr reader;
2606 + * The variant value (monodix)
2611 + * The variant value (left side of bidix)
2613 + wstring variant_left;
2616 + * The variant value (right side of bidix)
2618 + wstring variant_right;
2621 * The paradigm being compiled
2623 wstring current_paradigm;
2628 + * Set verbose mode: warnings which may or may not be correct
2633 + * First element (of an entry)
2635 + bool first_element;
2638 * Identifier of all the symbols during the compilation
2641 @@ -264,10 +294,14 @@
2642 static wstring const COMPILER_LEMMA_ATTR;
2643 static wstring const COMPILER_IGNORE_ATTR;
2644 static wstring const COMPILER_IGNORE_YES_VAL;
2645 + static wstring const COMPILER_ALT_ATTR;
2646 + static wstring const COMPILER_V_ATTR;
2647 + static wstring const COMPILER_VL_ATTR;
2648 + static wstring const COMPILER_VR_ATTR;
2657 @@ -292,6 +326,35 @@
2658 * @param fd the stream where write the result
2660 void write(FILE *fd);
2663 + * Set verbose output
2665 + void setVerbose(bool verbosity = false);
2668 + * Set the alt value to use in compilation
2669 + * @param a the value
2671 + void setAltValue(string const &a);
2674 + * Set the variant value to use in compilation
2675 + * @param v the value
2677 + void setVariantValue(string const &v);
2680 + * Set the variant_left value to use in compilation
2681 + * @param v the value
2683 + void setVariantLeftValue(string const &v);
2686 + * Set the variant_right value to use in compilation
2687 + * @param v the value
2689 + void setVariantRightValue(string const &v);
2693 Index: lttoolbox/lt-tmxcomp.1
2694 ===================================================================
2695 --- lttoolbox/lt-tmxcomp.1 (revision 21745)
2696 +++ lttoolbox/lt-tmxcomp.1 (working copy)
2699 Lots of...lurking in the dark and waiting for you!
2701 -(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights
2703 +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante.
2704 Index: lttoolbox/alphabet.h
2705 ===================================================================
2706 --- lttoolbox/alphabet.h (revision 21745)
2707 +++ lttoolbox/alphabet.h (working copy)
2708 @@ -145,6 +145,13 @@
2710 bool isTag(int const symbol) const;
2713 + * Sets an already existing symbol to represent a new value
2714 + * @param symbol the code of the symbol to set
2715 + * @param newSymbolString the new string for this symbol
2717 + void setSymbol(int symbol, wstring newSymbolString);
2719 pair<int, int> const & decode(int const code) const;
2722 Index: lttoolbox/state.h
2723 ===================================================================
2724 --- lttoolbox/state.h (revision 21745)
2725 +++ lttoolbox/state.h (working copy)
2736 #include <lttoolbox/alphabet.h>
2737 #include <lttoolbox/node.h>
2738 -#include <lttoolbox/pool.h>
2739 +#include <lttoolbox/match_exe.h>
2740 +#include <lttoolbox/match_state.h>
2741 +#include <lttoolbox/transducer.h>
2743 using namespace std;
2748 vector<int> *sequence;
2750 + bool dirty; // What does "dirty" mean ?
2752 TNodeState(Node * const &w, vector<int> * const &s, bool const &d): where(w), sequence(s), dirty(d){}
2753 TNodeState & operator=(TNodeState const &other)
2755 vector<TNodeState> state;
2758 - * Pool of wchar_t vectors, for efficience (static class)
2760 - Pool<vector<int> > *pool;
2764 - * @param s the state to be copied
2766 - void copy(State const &s);
2774 void apply(int const input, int const alt);
2776 + void apply(int const input, int const alt1, int const alt2);
2779 * Calculate the epsilon closure over the current state, replacing
2783 void epsilonClosure();
2785 + bool lastPartHasRequiredSymbol(const vector<int> &seq, int requiredSymbol, int separationSymbol);
2791 + * @param s the state to be copied
2793 + void copy(State const &s);
2799 - State(Pool<vector<int> > *);
2804 @@ -135,6 +139,13 @@
2806 void step(int const input, int const alt);
2808 + void step(int const input, int const alt1, int const alt2);
2810 + void step_case(wchar_t val, bool caseSensitive);
2812 + void step_case(wchar_t val, wchar_t val2, bool caseSensitive);
2816 * Init the state with the initial node and empty output
2817 * @param initial the initial node of the transducer
2818 @@ -142,6 +153,21 @@
2819 void init(Node *initial);
2822 + * Remove states not containing a specific symbol in their last 'part', and states
2823 + * with more than a number of 'parts'
2824 + * @param requiredSymbol the symbol required in the last part
2825 + * @param separationSymbol the symbol that represents the separation between two parts
2826 + * @param compound_max_elements the maximum part number allowed
2828 + void pruneCompounds(int requiredSymbol, int separationSymbol, int compound_max_elements);
2831 + * Remove states containing a forbidden symbol
2832 + * @param forbiddenSymbol the symbol forbidden
2834 + void pruneStatesWithForbiddenSymbol(int forbiddenSymbol);
2837 * Print all outputs of current parsing, preceded by a bar '/',
2838 * from the final nodes of the state
2839 * @param finals the set of final nodes
2841 wstring filterFinals(set<Node *> const &finals, Alphabet const &a,
2842 set<wchar_t> const &escaped_chars,
2843 bool uppercase = false,
2844 - bool firstupper = false,
2845 - int firstchar = 0) const;
2846 + bool firstupper = false,
2847 + int firstchar = 0) const;
2850 * Same as previous one, but the output is adapted to the SAO system
2851 @@ -173,11 +199,44 @@
2852 wstring filterFinalsSAO(set<Node *> const &finals, Alphabet const &a,
2853 set<wchar_t> const &escaped_chars,
2854 bool uppercase = false,
2855 - bool firstupper = false,
2856 - int firstchar = 0) const;
2857 + bool firstupper = false,
2858 + int firstchar = 0) const;
2862 + * Same as previous one, but the output is adapted to the LRX system
2863 + * @param finals the set of final nodes
2864 + * @param a the alphabet to decode strings
2865 + * @param escaped_chars the set of chars to be preceded with one
2867 + * @param uppercase true if the word is uppercase
2868 + * @param firstupper true if the first letter of a word is uppercase
2869 + * @param firstchar first character of the word
2870 + * @return the result of the transduction
2873 + set<pair<wstring, vector<wstring> > > filterFinalsLRX(set<Node *> const &finals, Alphabet const &a,
2874 + set<wchar_t> const &escaped_chars,
2875 + bool uppercase = false,
2876 + bool firstupper = false,
2877 + int firstchar = 0) const;
2884 + * Find final states, remove those that do not have a requiredSymbol and 'restart' each of them as the
2885 + * set of initial states, but remembering the sequence and adding a separationSymbol
2887 + * @param requiredSymbol
2888 + * @param restart_state
2889 + * @param separationSymbol
2891 + void restartFinals(const set<Node *> &finals, int requiredSymbol, State *restart_state, int separationSymbol);
2895 * Returns true if at least one record of the state references a
2896 * final node of the set
2897 * @param finals set of final nodes @return
2898 @@ -185,6 +244,11 @@
2900 bool isFinal(set<Node *> const &finals) const;
2903 + * Return the full states string (to allow debugging...) using a Java ArrayList.toString style
2905 + wstring getReadableString(const Alphabet &a);
2907 wstring filterFinalsTM(set<Node *> const &finals,
2908 Alphabet const &alphabet,
2909 set<wchar_t> const &escaped_chars,
2910 Index: lttoolbox/Makefile.am
2911 ===================================================================
2912 --- lttoolbox/Makefile.am (revision 21745)
2913 +++ lttoolbox/Makefile.am (working copy)
2915 h_sources = alphabet.h buffer.h compiler.h compression.h \
2916 entry_token.h expander.h fst_processor.h lt_locale.h ltstr.h \
2917 match_exe.h match_node.h match_state.h my_stdio.h node.h \
2918 - pattern_list.h pool.h regexp_compiler.h sorted_vector.h state.h \
2919 + pattern_list.h regexp_compiler.h sorted_vector.h state.h \
2920 transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h
2921 cc_sources = alphabet.cc compiler.cc compression.cc entry_token.cc \
2922 expander.cc fst_processor.cc lt_locale.cc match_exe.cc \
2924 library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)
2925 library_include_HEADERS = $(h_sources)
2927 -bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc
2928 +bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print
2931 lib_LTLIBRARIES= liblttoolbox3.la
2934 lttoolbox_DATA = dix.dtd
2936 +lt_print_SOURCES = lt_print.cc
2937 +lt_print_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
2938 +lt_print_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)
2940 lt_comp_SOURCES = lt_comp.cc
2941 lt_comp_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
2942 lt_comp_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)
2944 lt_tmxproc_LDADD = liblttoolbox$(GENERIC_MAJOR_VERSION).la
2945 lt_tmxproc_LDFLAGS = -llttoolbox$(GENERIC_MAJOR_VERSION) $(LTTOOLBOX_LIBS)
2947 -man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1
2948 +#lt-validate-dictionary: Makefile.am validate-header.sh
2949 +# @echo "Creating lt-validate-dictionary script"
2950 +# @echo "#!$(BASH)" > $@
2951 +# @cat validate-header.sh >> $@
2952 +# @echo "$(XMLLINT) --dtdvalid $(apertiumdir)/dix.dtd --noout \$$FILE1 && exit 0;" >> $@
2953 +# @echo "exit 1;" >> $@
2958 +man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1
2960 INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS)
2963 Index: lttoolbox/lt-print.1
2964 ===================================================================
2965 --- lttoolbox/lt-print.1 (revision 0)
2966 +++ lttoolbox/lt-print.1 (revision 44914)
2968 +.TH lt-print 1 2006-03-08 "" ""
2970 +lt-print \- This application is part of the lexical processing modules
2975 +This tool is part of the apertium machine translation
2976 +architecture: \fBhttp://www.apertium.org\fR.
2983 +Is the application responsible for printing compiled dictionaries in
2987 +The compiled input file.
2990 +The transducer in ATT format.
2995 +.I lt-expand\fR(1),
2996 +.I apertium-tagger\fR(1),
2999 +Lots of...lurking in the dark and waiting for you!
3001 +(c) 2005--2012 Universitat d'Alacant / Universidad de Alicante.
3002 Index: lttoolbox/lt_print.cc
3003 ===================================================================
3004 --- lttoolbox/lt_print.cc (revision 0)
3005 +++ lttoolbox/lt_print.cc (revision 44914)
3008 + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
3010 + * This program is free software; you can redistribute it and/or
3011 + * modify it under the terms of the GNU General Public License as
3012 + * published by the Free Software Foundation; either version 2 of the
3013 + * License, or (at your option) any later version.
3015 + * This program is distributed in the hope that it will be useful, but
3016 + * WITHOUT ANY WARRANTY; without even the implied warranty of
3017 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3018 + * General Public License for more details.
3020 + * You should have received a copy of the GNU General Public License
3021 + * along with this program; if not, write to the Free Software
3022 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
3023 + * 02111-1307, USA.
3025 +#include <lttoolbox/transducer.h>
3026 +#include <lttoolbox/compression.h>
3027 +#include <lttoolbox/lttoolbox_config.h>
3029 +#include <lttoolbox/my_stdio.h>
3030 +#include <lttoolbox/lt_locale.h>
3033 +#include <iostream>
3034 +#include <libgen.h>
3037 +using namespace std;
3039 +void endProgram(char *name)
3043 + cout << basename(name) << " v" << PACKAGE_VERSION <<": dump a transducer to text in ATT format" << endl;
3044 + cout << "USAGE: " << basename(name) << " bin_file " << endl;
3046 + exit(EXIT_FAILURE);
3050 +int main(int argc, char *argv[])
3054 + endProgram(argv[0]);
3057 + LtLocale::tryToSetLocale();
3060 + FILE *input = fopen(argv[1], "r");
3062 + Alphabet new_alphabet;
3063 + set<wchar_t> alphabetic_chars;
3065 + map<wstring, Transducer> transducers;
3068 + int len = Compression::multibyte_read(input);
3071 + alphabetic_chars.insert(static_cast<wchar_t>(Compression::multibyte_read(input)));
3076 + new_alphabet.read(input);
3078 + len = Compression::multibyte_read(input);
3082 + int len2 = Compression::multibyte_read(input);
3083 + wstring name = L"";
3086 + name += static_cast<wchar_t>(Compression::multibyte_read(input));
3089 + transducers[name].read(input);
3094 + /////////////////////
3096 + FILE *output = stdout;
3097 + map<wstring, Transducer>::iterator penum = transducers.end();
3099 + for(map<wstring, Transducer>::iterator it = transducers.begin(); it != transducers.end(); it++)
3101 + //it->second.minimize();
3102 + it->second.show(new_alphabet, output);
3105 + fwprintf(output, L"--\n", it->first.c_str());