]>
Commit | Line | Data |
---|---|---|
ae08c8ad | 1 | diff -uNr postgresql-7.4/contrib/tsearch2/dict_ispell.c postgresql-7.4.fixed/contrib/tsearch2/dict_ispell.c |
2 | --- postgresql-7.4/contrib/tsearch2/dict_ispell.c 2003-08-04 02:43:11.000000000 +0200 | |
3 | +++ postgresql-7.4.fixed/contrib/tsearch2/dict_ispell.c 2003-12-18 17:46:03.000000000 +0100 | |
4 | @@ -27,7 +27,7 @@ | |
5 | static void | |
6 | freeDictISpell(DictISpell * d) | |
7 | { | |
8 | - FreeIspell(&(d->obj)); | |
9 | + NIFree(&(d->obj)); | |
10 | freestoplist(&(d->stoplist)); | |
11 | free(d); | |
12 | } | |
13 | @@ -71,7 +71,7 @@ | |
14 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), | |
15 | errmsg("dictionary already loaded"))); | |
16 | } | |
17 | - if (ImportDictionary(&(d->obj), pcfg->value)) | |
18 | + if (NIImportDictionary(&(d->obj), pcfg->value)) | |
19 | { | |
20 | freeDictISpell(d); | |
21 | ereport(ERROR, | |
22 | @@ -90,7 +90,7 @@ | |
23 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), | |
24 | errmsg("affixes already loaded"))); | |
25 | } | |
26 | - if (ImportAffixes(&(d->obj), pcfg->value)) | |
27 | + if (NIImportAffixes(&(d->obj), pcfg->value)) | |
28 | { | |
29 | freeDictISpell(d); | |
30 | ereport(ERROR, | |
31 | @@ -132,8 +132,8 @@ | |
32 | ||
33 | if (affloaded && dictloaded) | |
34 | { | |
35 | - SortDictionary(&(d->obj)); | |
36 | - SortAffixes(&(d->obj)); | |
37 | + NISortDictionary(&(d->obj)); | |
38 | + NISortAffixes(&(d->obj)); | |
39 | } | |
40 | else if (!affloaded) | |
41 | { | |
42 | @@ -168,7 +168,7 @@ | |
43 | ||
44 | res = palloc(sizeof(char *) * 2); | |
45 | txt = pnstrdup(in, PG_GETARG_INT32(2)); | |
46 | - res = NormalizeWord(&(d->obj), txt); | |
47 | + res = NINormalizeWord(&(d->obj), txt); | |
48 | pfree(txt); | |
49 | ||
50 | if (res == NULL) | |
51 | diff -uNr postgresql-7.4/contrib/tsearch2/ispell/spell.c postgresql-7.4.fixed/contrib/tsearch2/ispell/spell.c | |
52 | --- postgresql-7.4/contrib/tsearch2/ispell/spell.c 2003-08-04 02:43:11.000000000 +0200 | |
53 | +++ postgresql-7.4.fixed/contrib/tsearch2/ispell/spell.c 2003-12-18 17:46:03.000000000 +0100 | |
54 | @@ -7,15 +7,26 @@ | |
55 | ||
56 | #include "spell.h" | |
57 | ||
58 | -#define MAXNORMLEN 56 | |
59 | +#define MAX_NORM 1024 | |
60 | +#define MAXNORMLEN 256 | |
61 | ||
62 | #define STRNCASECMP(x,y) (strncasecmp(x,y,strlen(y))) | |
63 | +#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)=='p') ? (N) : ( (L) - 1 - (N) ) ] ) | |
64 | +#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T ) | |
65 | + | |
66 | + | |
67 | +#define MEMOUT(X) if ( !(X) ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))) | |
68 | ||
69 | static int | |
70 | cmpspell(const void *s1, const void *s2) | |
71 | { | |
72 | return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word)); | |
73 | } | |
74 | +static int | |
75 | +cmpspellaffix(const void *s1, const void *s2) | |
76 | +{ | |
77 | + return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag)); | |
78 | +} | |
79 | ||
80 | static void | |
81 | strlower(char *str) | |
82 | @@ -29,6 +40,13 @@ | |
83 | } | |
84 | } | |
85 | ||
86 | +static char* | |
87 | +strnduplicate(char *s, int len) { | |
88 | + char *d=(char*)palloc( len + 1 ); | |
89 | + memcpy(d, s, len ); | |
90 | + d[len]='\0'; | |
91 | + return d; | |
92 | +} | |
93 | /* backward string compaire for suffix tree operations */ | |
94 | static int | |
95 | strbcmp(const char *s1, const char *s2) | |
96 | @@ -92,7 +110,7 @@ | |
97 | } | |
98 | ||
99 | int | |
100 | -AddSpell(IspellDict * Conf, const char *word, const char *flag) | |
101 | +NIAddSpell(IspellDict * Conf, const char *word, const char *flag) | |
102 | { | |
103 | if (Conf->nspell >= Conf->mspell) | |
104 | { | |
105 | @@ -106,24 +124,18 @@ | |
106 | Conf->mspell = 1024 * 20; | |
107 | Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL)); | |
108 | } | |
109 | - if (Conf->Spell == NULL) | |
110 | - ereport(ERROR, | |
111 | - (errcode(ERRCODE_OUT_OF_MEMORY), | |
112 | - errmsg("out of memory"))); | |
113 | + MEMOUT(Conf->Spell); | |
114 | } | |
115 | Conf->Spell[Conf->nspell].word = strdup(word); | |
116 | - if (!Conf->Spell[Conf->nspell].word) | |
117 | - ereport(ERROR, | |
118 | - (errcode(ERRCODE_OUT_OF_MEMORY), | |
119 | - errmsg("out of memory"))); | |
120 | - strncpy(Conf->Spell[Conf->nspell].flag, flag, 10); | |
121 | + MEMOUT(Conf->Spell[Conf->nspell].word); | |
122 | + strncpy(Conf->Spell[Conf->nspell].p.flag, flag, 16); | |
123 | Conf->nspell++; | |
124 | return (0); | |
125 | } | |
126 | ||
127 | ||
128 | int | |
129 | -ImportDictionary(IspellDict * Conf, const char *filename) | |
130 | +NIImportDictionary(IspellDict * Conf, const char *filename) | |
131 | { | |
132 | unsigned char str[BUFSIZ]; | |
133 | FILE *dict; | |
134 | @@ -143,7 +155,7 @@ | |
135 | flag = s; | |
136 | while (*s) | |
137 | { | |
138 | - if (((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z'))) | |
139 | + if (isprint(*s) && !isspace(*s)) | |
140 | s++; | |
141 | else | |
142 | { | |
143 | @@ -166,65 +178,49 @@ | |
144 | *s = 0; | |
145 | s++; | |
146 | } | |
147 | - AddSpell(Conf, str, flag); | |
148 | + NIAddSpell(Conf, str, flag); | |
149 | } | |
150 | fclose(dict); | |
151 | return (0); | |
152 | } | |
153 | ||
154 | ||
155 | -static SPELL * | |
156 | -FindWord(IspellDict * Conf, const char *word, int affixflag) | |
157 | +static int | |
158 | +FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly) | |
159 | { | |
160 | - int l, | |
161 | - c, | |
162 | - r, | |
163 | - resc, | |
164 | - resl, | |
165 | - resr, | |
166 | - i; | |
167 | - | |
168 | - i = (int) (*word) & 255; | |
169 | - l = Conf->SpellTree.Left[i]; | |
170 | - r = Conf->SpellTree.Right[i]; | |
171 | - if (l == -1) | |
172 | - return (NULL); | |
173 | - while (l <= r) | |
174 | - { | |
175 | - c = (l + r) >> 1; | |
176 | - resc = strcmp(Conf->Spell[c].word, word); | |
177 | - if ((resc == 0) && | |
178 | - ((affixflag == 0) || (strchr(Conf->Spell[c].flag, affixflag) != NULL))) | |
179 | - return (&Conf->Spell[c]); | |
180 | - resl = strcmp(Conf->Spell[l].word, word); | |
181 | - if ((resl == 0) && | |
182 | - ((affixflag == 0) || (strchr(Conf->Spell[l].flag, affixflag) != NULL))) | |
183 | - return (&Conf->Spell[l]); | |
184 | - resr = strcmp(Conf->Spell[r].word, word); | |
185 | - if ((resr == 0) && | |
186 | - ((affixflag == 0) || (strchr(Conf->Spell[r].flag, affixflag) != NULL))) | |
187 | - return (&Conf->Spell[r]); | |
188 | - if (resc < 0) | |
189 | - { | |
190 | - l = c + 1; | |
191 | - r--; | |
192 | - } | |
193 | - else if (resc > 0) | |
194 | - { | |
195 | - r = c - 1; | |
196 | - l++; | |
197 | - } | |
198 | - else | |
199 | - { | |
200 | - l++; | |
201 | - r--; | |
202 | + SPNode *node = Conf->Dictionary; | |
203 | + SPNodeData *StopLow, *StopHigh, *StopMiddle; | |
204 | + int level=0, wrdlen=strlen(word); | |
205 | + | |
206 | + while( node && level<wrdlen) { | |
207 | + StopLow = node->data; | |
208 | + StopHigh = node->data+node->length; | |
209 | + while (StopLow < StopHigh) { | |
210 | + StopMiddle = StopLow + (StopHigh - StopLow) / 2; | |
211 | + if ( StopMiddle->val == ((uint8*)(word))[level] ) { | |
212 | + if ( wrdlen==level+1 && StopMiddle->isword ) { | |
213 | + if ( compoundonly && !StopMiddle->compoundallow ) | |
214 | + return 0; | |
215 | + if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL)) | |
216 | + return 1; | |
217 | + } | |
218 | + node=StopMiddle->node; | |
219 | + level++; | |
220 | + break; | |
221 | + } else if ( StopMiddle->val < ((uint8*)(word))[level] ) { | |
222 | + StopLow = StopMiddle + 1; | |
223 | + } else { | |
224 | + StopHigh = StopMiddle; | |
225 | + } | |
226 | } | |
227 | + if ( StopLow >= StopHigh ) | |
228 | + break; | |
229 | } | |
230 | - return (NULL); | |
231 | + return 0; | |
232 | } | |
233 | ||
234 | int | |
235 | -AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type) | |
236 | +NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type) | |
237 | { | |
238 | if (Conf->naffixes >= Conf->maffixes) | |
239 | { | |
240 | @@ -238,16 +234,14 @@ | |
241 | Conf->maffixes = 16; | |
242 | Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX)); | |
243 | } | |
244 | - if (Conf->Affix == NULL) | |
245 | - ereport(ERROR, | |
246 | - (errcode(ERRCODE_OUT_OF_MEMORY), | |
247 | - errmsg("out of memory"))); | |
248 | + MEMOUT(Conf->Affix); | |
249 | } | |
250 | if (type == 's') | |
251 | sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); | |
252 | else | |
253 | sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); | |
254 | Conf->Affix[Conf->naffixes].compile = 1; | |
255 | + Conf->Affix[Conf->naffixes].flagflags = flagflags; | |
256 | Conf->Affix[Conf->naffixes].flag = flag; | |
257 | Conf->Affix[Conf->naffixes].type = type; | |
258 | ||
259 | @@ -281,7 +275,7 @@ | |
260 | ||
261 | ||
262 | int | |
263 | -ImportAffixes(IspellDict * Conf, const char *filename) | |
264 | +NIImportAffixes(IspellDict * Conf, const char *filename) | |
265 | { | |
266 | unsigned char str[BUFSIZ]; | |
267 | unsigned char flag = 0; | |
268 | @@ -292,13 +286,24 @@ | |
269 | int i; | |
270 | int suffixes = 0; | |
271 | int prefixes = 0; | |
272 | + unsigned char flagflags = 0; | |
273 | FILE *affix; | |
274 | ||
275 | if (!(affix = fopen(filename, "r"))) | |
276 | return (1); | |
277 | + Conf->compoundcontrol='\t'; | |
278 | ||
279 | while (fgets(str, sizeof(str), affix)) | |
280 | { | |
281 | + if (STRNCASECMP(str, "compoundwords")==0) { | |
282 | + s=strchr(str, 'l'); | |
283 | + if ( s ) { | |
284 | + while( *s!=' ' ) s++; | |
285 | + while( *s==' ' ) s++; | |
286 | + Conf->compoundcontrol = *s; | |
287 | + continue; | |
288 | + } | |
289 | + } | |
290 | if (!STRNCASECMP(str, "suffixes")) | |
291 | { | |
292 | suffixes = 1; | |
293 | @@ -314,8 +319,18 @@ | |
294 | if (!STRNCASECMP(str, "flag ")) | |
295 | { | |
296 | s = str + 5; | |
297 | - while (strchr("* ", *s)) | |
298 | + flagflags=0; | |
299 | + while( *s==' ' ) s++; | |
300 | + if ( *s=='*' ) { | |
301 | + flagflags|=FF_CROSSPRODUCT; | |
302 | + s++; | |
303 | + } else if ( *s=='~' ) { | |
304 | + flagflags|=FF_COMPOUNDONLYAFX; | |
305 | s++; | |
306 | + } | |
307 | + | |
308 | + if ( *s=='\\' ) s++; | |
309 | + | |
310 | flag = *s; | |
311 | continue; | |
312 | } | |
313 | @@ -351,7 +366,7 @@ | |
314 | continue; | |
315 | } | |
316 | ||
317 | - AddAffix(Conf, (int) flag, mask, find, repl, suffixes ? 's' : 'p'); | |
318 | + NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p'); | |
319 | ||
320 | } | |
321 | fclose(affix); | |
322 | @@ -359,87 +374,266 @@ | |
323 | return (0); | |
324 | } | |
325 | ||
326 | +static int | |
327 | +MergeAffix(IspellDict *Conf, int a1, int a2) { | |
328 | + int naffix=0; | |
329 | + char **ptr=Conf->AffixData; | |
330 | + | |
331 | + while(*ptr) { | |
332 | + naffix++; | |
333 | + ptr++; | |
334 | + } | |
335 | + | |
336 | + Conf->AffixData=(char**)realloc( Conf->AffixData, (naffix+2)*sizeof(char*) ); | |
337 | + MEMOUT(Conf->AffixData); | |
338 | + ptr = Conf->AffixData + naffix; | |
339 | + *ptr=malloc( strlen(Conf->AffixData[a1]) + strlen(Conf->AffixData[a2]) + 1 /* space */ + 1 /* \0 */ ); | |
340 | + MEMOUT(ptr); | |
341 | + sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]); | |
342 | + ptr++; | |
343 | + *ptr='\0'; | |
344 | + return naffix; | |
345 | +} | |
346 | + | |
347 | + | |
348 | +static SPNode* | |
349 | +mkSPNode(IspellDict *Conf, int low, int high, int level) { | |
350 | + int i; | |
351 | + int nchar=0; | |
352 | + char lastchar='\0'; | |
353 | + SPNode *rs; | |
354 | + SPNodeData *data; | |
355 | + int lownew=low; | |
356 | + | |
357 | + for(i=low; i<high; i++) | |
358 | + if ( Conf->Spell[i].p.d.len>level && lastchar!=Conf->Spell[i].word[level] ) { | |
359 | + nchar++; | |
360 | + lastchar=Conf->Spell[i].word[level]; | |
361 | + } | |
362 | + | |
363 | + if (!nchar) | |
364 | + return NULL; | |
365 | + | |
366 | + rs=(SPNode*)malloc(SPNHRDSZ+nchar*sizeof(SPNodeData)); | |
367 | + MEMOUT(rs); | |
368 | + memset(rs,0,SPNHRDSZ+nchar*sizeof(SPNodeData)); | |
369 | + rs->length = nchar; | |
370 | + data=rs->data; | |
371 | + | |
372 | + lastchar='\0'; | |
373 | + for(i=low; i<high; i++) | |
374 | + if ( Conf->Spell[i].p.d.len>level ) { | |
375 | + if ( lastchar!=Conf->Spell[i].word[level] ) { | |
376 | + if ( lastchar ) { | |
377 | + data->node = mkSPNode(Conf, lownew, i, level+1); | |
378 | + lownew=i; | |
379 | + data++; | |
380 | + } | |
381 | + lastchar=Conf->Spell[i].word[level]; | |
382 | + } | |
383 | + data->val=((uint8*)(Conf->Spell[i].word))[level]; | |
384 | + if ( Conf->Spell[i].p.d.len == level+1 ) { | |
385 | + if ( data->isword && data->affix!=Conf->Spell[i].p.d.affix) { | |
386 | + /* | |
387 | + fprintf(stderr,"Word already exists: %s (affixes: '%s' and '%s')\n", | |
388 | + Conf->Spell[i].word, | |
389 | + Conf->AffixData[data->affix], | |
390 | + Conf->AffixData[Conf->Spell[i].p.d.affix] | |
391 | + ); | |
392 | + */ | |
393 | + /* MergeAffix called a few times */ | |
394 | + data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i].p.d.affix); | |
395 | + } else | |
396 | + data->affix = Conf->Spell[i].p.d.affix; | |
397 | + data->isword=1; | |
398 | + if ( strchr( Conf->AffixData[ data->affix ], Conf->compoundcontrol ) ) | |
399 | + data->compoundallow=1; | |
400 | + } | |
401 | + } | |
402 | + | |
403 | + data->node = mkSPNode(Conf, lownew, high, level+1); | |
404 | + | |
405 | + return rs; | |
406 | +} | |
407 | + | |
408 | + | |
409 | + | |
410 | void | |
411 | -SortDictionary(IspellDict * Conf) | |
412 | +NISortDictionary(IspellDict * Conf) | |
413 | { | |
414 | - int CurLet = -1, | |
415 | - Let; | |
416 | size_t i; | |
417 | - | |
418 | + int naffix=3; | |
419 | + | |
420 | + /* compress affixes */ | |
421 | + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspellaffix); | |
422 | + for (i = 1; i < Conf->nspell; i++) | |
423 | + if ( strcmp(Conf->Spell[i].p.flag,Conf->Spell[i-1].p.flag) ) | |
424 | + naffix++; | |
425 | + | |
426 | + Conf->AffixData=(char**)malloc( naffix*sizeof(char*) ); | |
427 | + MEMOUT(Conf->AffixData); | |
428 | + memset(Conf->AffixData, 0, naffix*sizeof(char*)); | |
429 | + naffix=1; | |
430 | + Conf->AffixData[0]=strdup(""); | |
431 | + MEMOUT(Conf->AffixData[0]); | |
432 | + Conf->AffixData[1]=strdup( Conf->Spell[0].p.flag ); | |
433 | + MEMOUT(Conf->AffixData[1]); | |
434 | + Conf->Spell[0].p.d.affix = 1; | |
435 | + Conf->Spell[0].p.d.len = strlen(Conf->Spell[0].word); | |
436 | + for (i = 1; i < Conf->nspell; i++) { | |
437 | + if ( strcmp(Conf->Spell[i].p.flag, Conf->AffixData[naffix]) ) { | |
438 | + naffix++; | |
439 | + Conf->AffixData[naffix] = strdup( Conf->Spell[i].p.flag ); | |
440 | + MEMOUT(Conf->AffixData[naffix]); | |
441 | + } | |
442 | + Conf->Spell[i].p.d.affix = naffix; | |
443 | + Conf->Spell[i].p.d.len = strlen(Conf->Spell[i].word); | |
444 | + } | |
445 | + | |
446 | qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell); | |
447 | + Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); | |
448 | + | |
449 | + for (i = 0; i < Conf->nspell; i++) | |
450 | + free( Conf->Spell[i].word ); | |
451 | + free( Conf->Spell ); | |
452 | + Conf->Spell=NULL; | |
453 | +} | |
454 | + | |
455 | +static AffixNode* | |
456 | +mkANode(IspellDict *Conf, int low, int high, int level, int type) { | |
457 | + int i; | |
458 | + int nchar=0; | |
459 | + uint8 lastchar='\0'; | |
460 | + AffixNode *rs; | |
461 | + AffixNodeData *data; | |
462 | + int lownew=low; | |
463 | + | |
464 | + for(i=low; i<high; i++) | |
465 | + if ( Conf->Affix[i].replen>level && lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) { | |
466 | + nchar++; | |
467 | + lastchar=GETCHAR( Conf->Affix + i, level, type ); | |
468 | + } | |
469 | ||
470 | - for (i = 0; i < 256; i++) | |
471 | - Conf->SpellTree.Left[i] = -1; | |
472 | + if (!nchar) | |
473 | + return NULL; | |
474 | ||
475 | - for (i = 0; i < Conf->nspell; i++) | |
476 | - { | |
477 | - Let = (int) (*(Conf->Spell[i].word)) & 255; | |
478 | - if (CurLet != Let) | |
479 | - { | |
480 | - Conf->SpellTree.Left[Let] = i; | |
481 | - CurLet = Let; | |
482 | + rs=(AffixNode*)malloc(ANHRDSZ+nchar*sizeof(AffixNodeData)); | |
483 | + MEMOUT(rs); | |
484 | + memset(rs,0,ANHRDSZ+nchar*sizeof(AffixNodeData)); | |
485 | + rs->length = nchar; | |
486 | + data=rs->data; | |
487 | + | |
488 | + lastchar='\0'; | |
489 | + for(i=low; i<high; i++) | |
490 | + if ( Conf->Affix[i].replen>level ) { | |
491 | + if ( lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) { | |
492 | + if ( lastchar ) { | |
493 | + data->node = mkANode(Conf, lownew, i, level+1, type); | |
494 | + lownew=i; | |
495 | + data++; | |
496 | + } | |
497 | + lastchar=GETCHAR( Conf->Affix + i, level, type ); | |
498 | + } | |
499 | + data->val=GETCHAR( Conf->Affix + i, level, type ); | |
500 | + if ( Conf->Affix[i].replen == level+1 ) { /* affix stopped */ | |
501 | + if ( !data->naff ) | |
502 | + data->aff=(AFFIX**)malloc(sizeof(AFFIX*)*(high-i+1)); | |
503 | + MEMOUT(data); | |
504 | + data->aff[ data->naff ] = Conf->Affix + i; | |
505 | + data->naff++; | |
506 | + } | |
507 | } | |
508 | - Conf->SpellTree.Right[Let] = i; | |
509 | - } | |
510 | + | |
511 | + data->node = mkANode(Conf, lownew, high, level+1, type); | |
512 | + | |
513 | + return rs; | |
514 | } | |
515 | ||
516 | void | |
517 | -SortAffixes(IspellDict * Conf) | |
518 | +NISortAffixes(IspellDict * Conf) | |
519 | { | |
520 | - int CurLetP = -1, | |
521 | - CurLetS = -1, | |
522 | - Let; | |
523 | AFFIX *Affix; | |
524 | size_t i; | |
525 | + CMPDAffix* ptr; | |
526 | + int firstsuffix=-1; | |
527 | ||
528 | if (Conf->naffixes > 1) | |
529 | qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); | |
530 | - for (i = 0; i < 256; i++) | |
531 | - { | |
532 | - Conf->PrefixTree.Left[i] = Conf->PrefixTree.Right[i] = -1; | |
533 | - Conf->SuffixTree.Left[i] = Conf->SuffixTree.Right[i] = -1; | |
534 | - } | |
535 | ||
536 | - for (i = 0; i < Conf->naffixes; i++) | |
537 | - { | |
538 | + Conf->CompoundAffix = ptr = (CMPDAffix*)malloc( sizeof(CMPDAffix) * Conf->naffixes ); | |
539 | + MEMOUT(Conf->CompoundAffix); | |
540 | + ptr->affix=NULL; | |
541 | + | |
542 | + for (i = 0; i < Conf->naffixes; i++) { | |
543 | Affix = &(((AFFIX *) Conf->Affix)[i]); | |
544 | - if (Affix->type == 'p') | |
545 | - { | |
546 | - Let = (int) (*(Affix->repl)) & 255; | |
547 | - if (CurLetP != Let) | |
548 | - { | |
549 | - Conf->PrefixTree.Left[Let] = i; | |
550 | - CurLetP = Let; | |
551 | + if ( Affix->type == 's' ) { | |
552 | + if ( firstsuffix<0 ) firstsuffix=i; | |
553 | + if ( Affix->flagflags & FF_COMPOUNDONLYAFX ) { | |
554 | + if ( !ptr->affix || strbncmp((ptr-1)->affix, Affix->repl, (ptr-1)->len) ) { | |
555 | + /* leave only unique and minimals suffixes */ | |
556 | + ptr->affix=Affix->repl; | |
557 | + ptr->len=Affix->replen; | |
558 | + ptr++; | |
559 | + } | |
560 | } | |
561 | - Conf->PrefixTree.Right[Let] = i; | |
562 | } | |
563 | - else | |
564 | - { | |
565 | - Let = (Affix->replen) ? (int) (Affix->repl[Affix->replen - 1]) & 255 : 0; | |
566 | - if (CurLetS != Let) | |
567 | - { | |
568 | - Conf->SuffixTree.Left[Let] = i; | |
569 | - CurLetS = Let; | |
570 | + } | |
571 | + ptr->affix = NULL; | |
572 | + Conf->CompoundAffix = (CMPDAffix*)realloc( Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr-Conf->CompoundAffix+1) ); | |
573 | + | |
574 | + Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p'); | |
575 | + Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's'); | |
576 | +} | |
577 | + | |
578 | +static AffixNodeData* | |
579 | +FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) { | |
580 | + AffixNodeData *StopLow, *StopHigh, *StopMiddle; | |
581 | + uint8 symbol; | |
582 | + | |
583 | + while( node && *level<wrdlen) { | |
584 | + StopLow = node->data; | |
585 | + StopHigh = node->data+node->length; | |
586 | + while (StopLow < StopHigh) { | |
587 | + StopMiddle = StopLow + (StopHigh - StopLow) / 2; | |
588 | + symbol = GETWCHAR(word,wrdlen,*level,type); | |
589 | + if ( StopMiddle->val == symbol ) { | |
590 | + if ( StopMiddle->naff ) | |
591 | + return StopMiddle; | |
592 | + node=StopMiddle->node; | |
593 | + (*level)++; | |
594 | + break; | |
595 | + } else if ( StopMiddle->val < symbol ) { | |
596 | + StopLow = StopMiddle + 1; | |
597 | + } else { | |
598 | + StopHigh = StopMiddle; | |
599 | } | |
600 | - Conf->SuffixTree.Right[Let] = i; | |
601 | } | |
602 | + if ( StopLow >= StopHigh ) | |
603 | + break; | |
604 | } | |
605 | + return NULL; | |
606 | } | |
607 | ||
608 | static char * | |
609 | -CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict * Conf) | |
610 | -{ | |
611 | +CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) { | |
612 | regmatch_t subs[2]; /* workaround for apache&linux */ | |
613 | - char newword[2 * MAXNORMLEN] = ""; | |
614 | int err; | |
615 | ||
616 | - *res = strbncmp(word, Affix->repl, Affix->replen); | |
617 | - if (*res < 0) | |
618 | - return NULL; | |
619 | - if (*res > 0) | |
620 | - return NULL; | |
621 | - strcpy(newword, word); | |
622 | - strcpy(newword + len - Affix->replen, Affix->find); | |
623 | + if ( flagflags & FF_COMPOUNDONLYAFX ) { | |
624 | + if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 ) | |
625 | + return NULL; | |
626 | + } else { | |
627 | + if ( Affix->flagflags & FF_COMPOUNDONLYAFX ) | |
628 | + return NULL; | |
629 | + } | |
630 | + | |
631 | + if ( Affix->type=='s' ) { | |
632 | + strcpy(newword, word); | |
633 | + strcpy(newword + len - Affix->replen, Affix->find); | |
634 | + } else { | |
635 | + strcpy(newword, Affix->find); | |
636 | + strcat(newword, word + Affix->replen); | |
637 | + } | |
638 | ||
639 | if (Affix->compile) | |
640 | { | |
641 | @@ -452,205 +646,364 @@ | |
642 | } | |
643 | Affix->compile = 0; | |
644 | } | |
645 | - if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) | |
646 | - { | |
647 | - if (FindWord(Conf, newword, Affix->flag)) | |
648 | - return pstrdup(newword); | |
649 | - } | |
650 | + if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) | |
651 | + return newword; | |
652 | return NULL; | |
653 | } | |
654 | ||
655 | -#define NS 1 | |
656 | -#define MAX_NORM 512 | |
657 | -static int | |
658 | -CheckPrefix(const char *word, size_t len, AFFIX * Affix, IspellDict * Conf, int pi, | |
659 | - char **forms, char ***cur) | |
660 | -{ | |
661 | - regmatch_t subs[NS * 2]; | |
662 | + | |
663 | +static char ** | |
664 | +NormalizeSubWord(IspellDict * Conf, char *word, char flag) { | |
665 | + AffixNodeData *suffix=NULL, *prefix=NULL; | |
666 | + int slevel=0, plevel=0; | |
667 | + int wrdlen = strlen(word), swrdlen; | |
668 | + char **forms; | |
669 | + char **cur; | |
670 | char newword[2 * MAXNORMLEN] = ""; | |
671 | - int err, | |
672 | - ls, | |
673 | - res, | |
674 | - lres; | |
675 | - size_t newlen; | |
676 | - AFFIX *CAffix = Conf->Affix; | |
677 | - | |
678 | - res = strncmp(word, Affix->repl, Affix->replen); | |
679 | - if (res != 0) | |
680 | - return res; | |
681 | - strcpy(newword, Affix->find); | |
682 | - strcat(newword, word + Affix->replen); | |
683 | + char pnewword[2 * MAXNORMLEN] = ""; | |
684 | + AffixNode *snode = Conf->Suffix, *pnode; | |
685 | + int i,j; | |
686 | + | |
687 | + if (wrdlen > MAXNORMLEN) return NULL; | |
688 | + strlower(word); | |
689 | + cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); | |
690 | + *cur = NULL; | |
691 | ||
692 | - if (Affix->compile) | |
693 | - { | |
694 | - err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB); | |
695 | - if (err) | |
696 | - { | |
697 | - /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */ | |
698 | - regfree(&(Affix->reg)); | |
699 | - return (0); | |
700 | - } | |
701 | - Affix->compile = 0; | |
702 | + | |
703 | + /* Check that the word itself is normal form */ | |
704 | + if (FindWord(Conf, word, 0, flag & FF_COMPOUNDWORD)) { | |
705 | + *cur = pstrdup(word); | |
706 | + cur++; | |
707 | + *cur = NULL; | |
708 | } | |
709 | - if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) | |
710 | - { | |
711 | - SPELL *curspell; | |
712 | ||
713 | - if ((curspell = FindWord(Conf, newword, Affix->flag))) | |
714 | - { | |
715 | - if ((*cur - forms) < (MAX_NORM - 1)) | |
716 | - { | |
717 | - **cur = pstrdup(newword); | |
718 | - (*cur)++; | |
719 | - **cur = NULL; | |
720 | + /* Find all other NORMAL forms of the 'word' (check only prefix)*/ | |
721 | + pnode=Conf->Prefix; | |
722 | + plevel=0; | |
723 | + while(pnode) { | |
724 | + prefix=FinfAffixes(pnode, word, wrdlen, &plevel,'p'); | |
725 | + if (!prefix) break; | |
726 | + for(j=0;j<prefix->naff;j++) { | |
727 | + if ( CheckAffix(word,wrdlen,prefix->aff[j], flag, newword) ) { | |
728 | + /* prefix success */ | |
729 | + if ( FindWord(Conf, newword, prefix->aff[j]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) { | |
730 | + /* word search success */ | |
731 | + *cur = pstrdup(newword); | |
732 | + cur++; | |
733 | + *cur=NULL; | |
734 | + } | |
735 | } | |
736 | } | |
737 | - newlen = strlen(newword); | |
738 | - ls = Conf->SuffixTree.Left[pi]; | |
739 | - if (ls >= 0 && ((*cur - forms) < (MAX_NORM - 1))) | |
740 | - { | |
741 | - **cur = CheckSuffix(newword, newlen, &CAffix[ls], &lres, Conf); | |
742 | - if (**cur) | |
743 | - { | |
744 | - (*cur)++; | |
745 | - **cur = NULL; | |
746 | + pnode = prefix->node; | |
747 | + plevel++; | |
748 | + } | |
749 | + | |
750 | + /* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/ | |
751 | + while( snode ) { | |
752 | + /* find possible suffix */ | |
753 | + suffix = FinfAffixes(snode, word, wrdlen, &slevel, 's'); | |
754 | + if (!suffix) break; | |
755 | + /* foreach suffix check affix */ | |
756 | + for(i=0;i<suffix->naff;i++) { | |
757 | + if ( CheckAffix(word, wrdlen, suffix->aff[i], flag, newword) ) { | |
758 | + /* suffix success */ | |
759 | + if ( FindWord(Conf, newword, suffix->aff[i]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) { | |
760 | + /* word search success */ | |
761 | + *cur = pstrdup(newword); | |
762 | + cur++; | |
763 | + *cur=NULL; | |
764 | + } | |
765 | + /* now we will look changed word with prefixes */ | |
766 | + pnode=Conf->Prefix; | |
767 | + plevel=0; | |
768 | + swrdlen=strlen(newword); | |
769 | + while(pnode) { | |
770 | + prefix=FinfAffixes(pnode, newword, swrdlen, &plevel,'p'); | |
771 | + if (!prefix) break; | |
772 | + for(j=0;j<prefix->naff;j++) { | |
773 | + if ( CheckAffix(newword,swrdlen,prefix->aff[j], flag, pnewword) ) { | |
774 | + /* prefix success */ | |
775 | + int ff=( prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT ) ? | |
776 | + 0 : prefix->aff[j]->flag; | |
777 | + if ( FindWord(Conf, pnewword, ff, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) { | |
778 | + /* word search success */ | |
779 | + *cur = pstrdup(pnewword); | |
780 | + cur++; | |
781 | + *cur=NULL; | |
782 | + } | |
783 | + } | |
784 | + } | |
785 | + pnode = prefix->node; | |
786 | + plevel++; | |
787 | + } | |
788 | } | |
789 | } | |
790 | - } | |
791 | - return 0; | |
792 | -} | |
793 | ||
794 | + snode=suffix->node; | |
795 | + slevel++; | |
796 | + } | |
797 | ||
798 | -char ** | |
799 | -NormalizeWord(IspellDict * Conf, char *word) | |
800 | -{ | |
801 | -/*regmatch_t subs[NS];*/ | |
802 | - size_t len; | |
803 | - char **forms; | |
804 | - char **cur; | |
805 | - AFFIX *Affix; | |
806 | - int ri, | |
807 | - pi, | |
808 | - ipi, | |
809 | - lp, | |
810 | - rp, | |
811 | - cp, | |
812 | - ls, | |
813 | - rs; | |
814 | - int lres, | |
815 | - rres, | |
816 | - cres = 0; | |
817 | - SPELL *spell; | |
818 | - | |
819 | - len = strlen(word); | |
820 | - if (len > MAXNORMLEN) | |
821 | + if (cur == forms) { | |
822 | + pfree(forms); | |
823 | return (NULL); | |
824 | + } | |
825 | + return (forms); | |
826 | +} | |
827 | ||
828 | - strlower(word); | |
829 | +typedef struct SplitVar { | |
830 | + int nstem; | |
831 | + char **stem; | |
832 | + struct SplitVar *next; | |
833 | +} SplitVar; | |
834 | + | |
835 | +static int | |
836 | +CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len) { | |
837 | + while( (*ptr)->affix ) { | |
838 | + if ( len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len)==0 ) { | |
839 | + len = (*ptr)->len; | |
840 | + (*ptr)++; | |
841 | + return len; | |
842 | + } | |
843 | + (*ptr)++; | |
844 | + } | |
845 | + return 0; | |
846 | +} | |
847 | ||
848 | - forms = (char **) palloc(MAX_NORM * sizeof(char **)); | |
849 | - cur = forms; | |
850 | - *cur = NULL; | |
851 | +static SplitVar* | |
852 | +CopyVar(SplitVar *s, int makedup) { | |
853 | + SplitVar *v = (SplitVar*)palloc(sizeof(SplitVar)); | |
854 | + | |
855 | + v->stem=(char**)palloc( sizeof(char*) * (MAX_NORM) ); | |
856 | + v->next=NULL; | |
857 | + if ( s ) { | |
858 | + int i; | |
859 | + v->nstem = s->nstem; | |
860 | + for(i=0;i<s->nstem;i++) | |
861 | + v->stem[i] = (makedup) ? pstrdup( s->stem[i] ) : s->stem[i]; | |
862 | + } else { | |
863 | + v->nstem=0; | |
864 | + } | |
865 | + return v; | |
866 | +} | |
867 | ||
868 | - ri = (int) (*word) & 255; | |
869 | - pi = (int) (word[strlen(word) - 1]) & 255; | |
870 | - Affix = (AFFIX *) Conf->Affix; | |
871 | ||
872 | - /* Check that the word itself is normal form */ | |
873 | - if ((spell = FindWord(Conf, word, 0))) | |
874 | - { | |
875 | - *cur = pstrdup(word); | |
876 | - cur++; | |
877 | - *cur = NULL; | |
878 | - } | |
879 | +static SplitVar* | |
880 | +SplitToVariants( IspellDict * Conf, SPNode *snode, SplitVar * orig, char *word, int wordlen, int startpos, int minpos ) { | |
881 | + SplitVar *var=NULL; | |
882 | + SPNodeData *StopLow, *StopHigh, *StopMiddle; | |
883 | + SPNode *node = (snode) ? snode : Conf->Dictionary; | |
884 | + int level=(snode) ? minpos : startpos; /* recursive minpos==level*/ | |
885 | + int lenaff; | |
886 | + CMPDAffix *caff; | |
887 | + char notprobed[wordlen]; | |
888 | + | |
889 | + memset(notprobed,1,wordlen); | |
890 | + var = CopyVar(orig,1); | |
891 | + | |
892 | + while( node && level<wordlen) { | |
893 | + StopLow = node->data; | |
894 | + StopHigh = node->data+node->length; | |
895 | + while (StopLow < StopHigh) { | |
896 | + StopMiddle = StopLow + (StopHigh - StopLow) / 2; | |
897 | + if ( StopMiddle->val == ((uint8*)(word))[level] ) { | |
898 | + break; | |
899 | + } else if ( StopMiddle->val < ((uint8*)(word))[level] ) { | |
900 | + StopLow = StopMiddle + 1; | |
901 | + } else { | |
902 | + StopHigh = StopMiddle; | |
903 | + } | |
904 | + } | |
905 | + if ( StopLow >= StopHigh ) | |
906 | + break; | |
907 | ||
908 | - /* Find all other NORMAL forms of the 'word' */ | |
909 | + /* find word with epenthetic */ | |
910 | + caff = Conf->CompoundAffix; | |
911 | + while ( level>startpos && (lenaff=CheckCompoundAffixes( &caff, word + level, wordlen - level ))>0 ) { | |
912 | + /* there is one of compound suffixes, so check word for existings */ | |
913 | + char buf[MAXNORMLEN]; | |
914 | + char **subres; | |
915 | + | |
916 | + lenaff=level-startpos+lenaff; | |
917 | + | |
918 | + if ( !notprobed[startpos+lenaff-1] ) | |
919 | + continue; | |
920 | + | |
921 | + if ( level+lenaff-1 <= minpos ) | |
922 | + continue; | |
923 | ||
924 | - for (ipi = 0; ipi <= pi; ipi += pi) | |
925 | - { | |
926 | + memcpy(buf, word+startpos, lenaff); | |
927 | + buf[lenaff]='\0'; | |
928 | ||
929 | - /* check prefix */ | |
930 | - lp = Conf->PrefixTree.Left[ri]; | |
931 | - rp = Conf->PrefixTree.Right[ri]; | |
932 | - while (lp >= 0 && lp <= rp) | |
933 | - { | |
934 | - cp = (lp + rp) >> 1; | |
935 | - cres = 0; | |
936 | - if ((cur - forms) < (MAX_NORM - 1)) | |
937 | - cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur); | |
938 | - if ((lp < cp) && ((cur - forms) < (MAX_NORM - 1))) | |
939 | - lres = CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur); | |
940 | - if ((rp > cp) && ((cur - forms) < (MAX_NORM - 1))) | |
941 | - rres = CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur); | |
942 | - if (cres < 0) | |
943 | - { | |
944 | - rp = cp - 1; | |
945 | - lp++; | |
946 | - } | |
947 | - else if (cres > 0) | |
948 | - { | |
949 | - lp = cp + 1; | |
950 | - rp--; | |
951 | - } | |
952 | - else | |
953 | - { | |
954 | - lp++; | |
955 | - rp--; | |
956 | + subres = NormalizeSubWord(Conf, buf, FF_COMPOUNDWORD | FF_COMPOUNDONLYAFX); | |
957 | + if ( subres ) { | |
958 | + /* Yes, it was a word from dictionary */ | |
959 | + SplitVar *new=CopyVar(var,0); | |
960 | + SplitVar *ptr=var; | |
961 | + char **sptr=subres; | |
962 | + | |
963 | + notprobed[startpos+lenaff-1]=0; | |
964 | + | |
965 | + while(*sptr) { | |
966 | + new->stem[ new->nstem ] = *sptr; | |
967 | + new->nstem++; | |
968 | + sptr++; | |
969 | + } | |
970 | + pfree(subres); | |
971 | + | |
972 | + while( ptr->next ) | |
973 | + ptr = ptr->next; | |
974 | + ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos+lenaff, startpos+lenaff); | |
975 | + | |
976 | + pfree(new->stem); | |
977 | + pfree(new); | |
978 | } | |
979 | } | |
980 | ||
981 | - /* check suffix */ | |
982 | - ls = Conf->SuffixTree.Left[ipi]; | |
983 | - rs = Conf->SuffixTree.Right[ipi]; | |
984 | - while (ls >= 0 && ls <= rs) | |
985 | - { | |
986 | - if (((cur - forms) < (MAX_NORM - 1))) | |
987 | - { | |
988 | - *cur = CheckSuffix(word, len, &Affix[ls], &lres, Conf); | |
989 | - if (*cur) | |
990 | - { | |
991 | - cur++; | |
992 | - *cur = NULL; | |
993 | + /* find infinitive */ | |
994 | + if ( StopMiddle->isword && StopMiddle->compoundallow && notprobed[level] ) { | |
995 | + /* ok, we found full compoundallowed word*/ | |
996 | + if ( level>minpos ) { | |
997 | + /* and its length more than minimal */ | |
998 | + if ( wordlen==level+1 ) { | |
999 | + /* well, it was last word */ | |
1000 | + var->stem[ var->nstem ] = strnduplicate(word + startpos, wordlen - startpos); | |
1001 | + var->nstem++; | |
1002 | + return var; | |
1003 | + } else { | |
1004 | + /* then we will search more big word at the same point */ | |
1005 | + SplitVar *ptr=var; | |
1006 | + while( ptr->next ) | |
1007 | + ptr = ptr->next; | |
1008 | + ptr->next=SplitToVariants(Conf, node, var, word, wordlen, startpos, level); | |
1009 | + /* we can find next word */ | |
1010 | + level++; | |
1011 | + var->stem[ var->nstem ] = strnduplicate(word + startpos, level - startpos); | |
1012 | + var->nstem++; | |
1013 | + node = Conf->Dictionary; | |
1014 | + startpos=level; | |
1015 | + continue; | |
1016 | } | |
1017 | } | |
1018 | - if ((rs > ls) && ((cur - forms) < (MAX_NORM - 1))) | |
1019 | - { | |
1020 | - *cur = CheckSuffix(word, len, &Affix[rs], &rres, Conf); | |
1021 | - if (*cur) | |
1022 | - { | |
1023 | - cur++; | |
1024 | - *cur = NULL; | |
1025 | + } | |
1026 | + level++; | |
1027 | + node=StopMiddle->node; | |
1028 | + } | |
1029 | + | |
1030 | + var->stem[ var->nstem ] = strnduplicate(word + startpos, wordlen - startpos); | |
1031 | + var->nstem++; | |
1032 | + return var; | |
1033 | +} | |
1034 | + | |
1035 | +char ** | |
1036 | +NINormalizeWord(IspellDict * Conf, char *word) { /* Normalize word as a whole and, when compound handling is enabled, also through its split variants; returns a NULL-terminated array of strings, or NULL if neither step produced a result. */ | |
1037 | + char **res= NormalizeSubWord(Conf, word, 0); /* forms of the unsplit word; may be NULL */ | |
1038 | + | |
1039 | + if ( Conf->compoundcontrol != '\t' ) { /* NOTE(review): a tab presumably means no compound flag is configured - confirm where compoundcontrol is set */ | |
1040 | + int wordlen=strlen(word); | |
1041 | + SplitVar *ptr, *var = SplitToVariants(Conf,NULL,NULL, word, wordlen, 0, -1); /* list of candidate splits, chained through ->next */ | |
1042 | + char **cur=res; /* write cursor into res; NULL while res has not been allocated */ | |
1043 | + int i; | |
1044 | + | |
1045 | + while(var) { | |
1046 | + if ( var->nstem > 1 ) { /* only variants that were actually split into several stems are used */ | |
1047 | + char **subres = NormalizeSubWord(Conf, var->stem[ var->nstem-1 ], FF_COMPOUNDWORD); /* normalize the last stem with the compound-word flag */ | |
1048 | + if ( subres ) { | |
1049 | + char **ptr=subres; /* shadows the outer SplitVar *ptr inside this block */ | |
1050 | + | |
1051 | + if ( cur ) { | |
1052 | + while(*cur) /* skip to the terminating NULL of the results collected so far */ | |
1053 | + cur++; | |
1054 | + } else { | |
1055 | + res=cur=(char **) palloc(MAX_NORM * sizeof(char *)); /* first result: allocate the array; NOTE(review): no check against MAX_NORM is visible for the appends below */ | |
1056 | + } | |
1057 | + | |
1058 | + for(i=0;i<var->nstem-1;i++) { /* hand the leading stems over to res (pointers moved, not copied) */ | |
1059 | + *cur=var->stem[ i ]; | |
1060 | + cur++; | |
1061 | + } | |
1062 | + while(*ptr) { /* then append the normalized forms of the last stem */ | |
1063 | + *cur=*ptr; | |
1064 | + cur++; ptr++; | |
1065 | + } | |
1066 | + *cur=NULL; /* keep res NULL-terminated */ | |
1067 | + pfree(subres); /* only the array is released; its strings are now referenced from res */ | |
1068 | + var->stem[ 0 ] = NULL; /* stops the pfree loop below at once because the stems now belong to res; NOTE(review): stem[nstem-1] was not handed over and is not freed here either */ | |
1069 | } | |
1070 | } | |
1071 | - ls++; | |
1072 | - rs--; | |
1073 | - } /* end while */ | |
1074 | + | |
1075 | + for(i=0;i<var->nstem && var->stem[ i ];i++) /* free the stems that were not handed over to res */ | |
1076 | + pfree( var->stem[i] ); | |
1077 | + ptr = var->next; /* remember the successor before releasing this variant */ | |
1078 | + pfree(var->stem); | |
1079 | + pfree(var); | |
1080 | + var=ptr; | |
1081 | + } | |
1082 | + } | |
1083 | + return res; | |
1084 | +} | |
1085 | ||
1086 | - } /* for ipi */ | |
1087 | ||
1088 | - if (cur == forms) | |
1089 | - { | |
1090 | - pfree(forms); | |
1091 | - return (NULL); | |
1092 | +static void freeSPNode(SPNode *node) { /* Recursively free() a dictionary tree node together with every child node reachable through its data[] entries; a NULL node is ignored. */ | |
1093 | + SPNodeData *data; | |
1094 | + | |
1095 | + if (!node) return; | |
1096 | + data=node->data; | |
1097 | + while( node->length ) { /* node->length itself is used up as the loop counter; the node is freed right after the loop */ | |
1098 | + freeSPNode(data->node); /* release the subtree below this entry first */ | |
1099 | + data++; | |
1100 | + node->length--; | |
1101 | } | |
1102 | - return (forms); | |
1103 | + free(node); /* entries live inside the node allocation, so one free() releases them all */ | |
1104 | } | |
1105 | + | |
1106 | +static void freeANode(AffixNode *node) { /* Recursively free() an affix tree node, its child nodes and the per-entry aff arrays; a NULL node is ignored. */ | |
1107 | + AffixNodeData *data; | |
1108 | + | |
1109 | + if (!node) return; | |
1110 | + data=node->data; | |
1111 | + while( node->length ) { /* node->length itself is used up as the loop counter; the node is freed right after the loop */ | |
1112 | + freeANode(data->node); /* release the subtree below this entry first */ | |
1113 | + if (data->naff) /* aff is only freed when naff is nonzero */ | |
1114 | + free(data->aff); /* frees the pointer array only, not the AFFIX objects it points to */ | |
1115 | + data++; | |
1116 | + node->length--; | |
1117 | + } | |
1118 | + free(node); | |
1119 | +} | |
1120 | + | |
1121 | ||
1122 | void | |
1123 | -FreeIspell(IspellDict * Conf) | |
1124 | +NIFree(IspellDict * Conf) /* Release everything owned by Conf (affix data strings, compiled regexes, spell list, affix arrays and the three trees), then zero the structure. */ | |
1125 | { | |
1126 | int i; | |
1127 | AFFIX *Affix = (AFFIX *) Conf->Affix; | |
1128 | + char** aff = Conf->AffixData; /* walked below as a NULL-terminated array of strings */ | |
1129 | + | |
1130 | + if ( aff ) { | |
1131 | + while(*aff) { /* free each string up to the terminating NULL */ | |
1132 | + free(*aff); | |
1133 | + aff++; | |
1134 | + } | |
1135 | + free(Conf->AffixData); /* then the array itself (aff has been advanced, so use the original pointer) */ | |
1136 | + } | |
1137 | |
1138 | + | |
1139 | for (i = 0; i < Conf->naffixes; i++) | |
1140 | { | |
1141 | if (Affix[i].compile == 0) | |
1142 | regfree(&(Affix[i].reg)); | |
1143 | } | |
1144 | - for (i = 0; i < Conf->naffixes; i++) | |
1145 | - free(Conf->Spell[i].word); | |
1146 | - free(Conf->Affix); | |
1147 | - free(Conf->Spell); | |
1148 | + if (Conf->Spell) { /* the spell list may be absent, so guard before indexing it */ | |
1149 | + for (i = 0; i < Conf->nspell; i++) /* bounded by nspell, the spell count, not by naffixes */ | |
1150 | + free(Conf->Spell[i].word); | |
1151 | + free(Conf->Spell); | |
1152 | + } | |
1153 | + | |
1154 | + if (Conf->Affix) free(Conf->Affix); | |
1155 | + if ( Conf->CompoundAffix ) free(Conf->CompoundAffix); | |
1156 | + freeSPNode(Conf->Dictionary); /* the tree helpers accept NULL, so no guard is needed */ | |
1157 | + freeANode(Conf->Suffix); | |
1158 | + freeANode(Conf->Prefix); | |
1159 | memset((void *) Conf, 0, sizeof(IspellDict)); | |
1160 | return; | |
1161 | } | |
1162 | diff -uNr postgresql-7.4/contrib/tsearch2/ispell/spell.h postgresql-7.4.fixed/contrib/tsearch2/ispell/spell.h | |
1163 | --- postgresql-7.4/contrib/tsearch2/ispell/spell.h 2003-08-04 02:43:11.000000000 +0200 | |
1164 | +++ postgresql-7.4.fixed/contrib/tsearch2/ispell/spell.h 2003-12-18 17:46:03.000000000 +0100 | |
1165 | @@ -3,16 +3,44 @@ | |
1166 | ||
1167 | #include <sys/types.h> | |
1168 | #include <regex.h> | |
1169 | +#include "c.h" | |
1170 | + | |
1171 | +struct SPNode; | |
1172 | + | |
1173 | + | |
1174 | +typedef struct { /* One entry of a dictionary tree node: a byte value, per-entry flags and a link to the child node. */ | |
1175 | + uint32 | |
1176 | + val:8, /* byte compared against the word character at the current depth during lookup */ | |
1177 | + isword:1, /* tested by SplitToVariants, together with compoundallow, to detect a complete word at this entry */ | |
1178 | + compoundallow:1, /* tested by SplitToVariants before the word is accepted as a part of a compound */ | |
1179 | + affix:22; /* NOTE(review): presumably an index into the affix data - confirm against the code that builds the tree */ | |
1180 | + struct SPNode *node; /* child node followed for the next character; released recursively by freeSPNode */ | |
1181 | +} SPNodeData; | |
1182 | + | |
1183 | +typedef struct SPNode { /* Dictionary tree node: a counted array of SPNodeData entries. */ | |
1184 | + uint32 length; /* number of entries in data[] */ | |
1185 | + SPNodeData data[1]; /* the entries; SplitToVariants binary-searches data[0..length) by val, so they are expected to be ordered by val */ | |
1186 | +} SPNode; | |
1187 | + | |
1188 | +#define SPNHRDSZ (sizeof(uint32)) /* size of the length field preceding data[]; NOTE(review): presumably used when sizing node allocations - confirm */ | |
1189 | + | |
1190 | ||
1191 | typedef struct spell_struct | |
1192 | { | |
1193 | char *word; | |
1194 | - char flag[10]; | |
1195 | + union { /* the flag string and the d pair share storage, so only one of them is valid at a time */ | |
1196 | + char flag[16]; /* flag characters of the word; room grown from 10 to 16 bytes */ | |
1197 | + struct { | |
1198 | + int affix; /* NOTE(review): presumably an index into the affix data once flags are resolved - confirm */ | |
1199 | + int len; /* NOTE(review): presumably the length of word - confirm */ | |
1200 | + } d; | |
1201 | + } p; | |
1202 | } SPELL; | |
1203 | ||
1204 | typedef struct aff_struct | |
1205 | { | |
1206 | char flag; | |
1207 | + char flagflags; | |
1208 | char type; | |
1209 | char mask[33]; | |
1210 | char find[16]; | |
1211 | @@ -22,35 +50,66 @@ | |
1212 | char compile; | |
1213 | } AFFIX; | |
1214 | ||
1215 | +#define FF_CROSSPRODUCT 0x01 | |
1216 | +#define FF_COMPOUNDWORD 0x02 | |
1217 | +#define FF_COMPOUNDONLYAFX 0x04 | |
1218 | + | |
1219 | +struct AffixNode; | |
1220 | + | |
1221 | +typedef struct { /* One entry of an affix tree node: a byte value, an array of affix pointers and a link to the child node. */ | |
1222 | + uint32 | |
1223 | + val:8, /* byte value of this entry */ | |
1224 | + naff:24; /* freeANode frees aff only when this is nonzero; NOTE(review): presumably the number of pointers in aff - confirm */ | |
1225 | + AFFIX **aff; /* separately allocated pointer array, released by freeANode */ | |
1226 | + struct AffixNode *node; /* child node, released recursively by freeANode */ | |
1227 | +} AffixNodeData; | |
1228 | + | |
1229 | +typedef struct AffixNode { /* Affix tree node: a counted array of AffixNodeData entries. */ | |
1230 | + uint32 length; /* number of entries in data[] */ | |
1231 | + AffixNodeData data[1]; /* the entries; freeANode walks data[0..length) */ | |
1232 | +} AffixNode; | |
1233 | + | |
1234 | +#define ANHRDSZ (sizeof(uint32)) /* size of the length field preceding data[]; NOTE(review): presumably used when sizing node allocations - confirm */ | |
1235 | + | |
1236 | typedef struct Tree_struct | |
1237 | { | |
1238 | int Left[256], | |
1239 | Right[256]; | |
1240 | } Tree_struct; | |
1241 | ||
1242 | +typedef struct { /* Element of the IspellDict.CompoundAffix array that SplitToVariants hands to CheckCompoundAffixes. */ | |
1243 | + char *affix; /* NOTE(review): presumably the affix text allowed between compound parts - confirm */ | |
1244 | + int len; /* NOTE(review): presumably the length of affix - confirm */ | |
1245 | +} CMPDAffix; | |
1246 | + | |
1247 | typedef struct | |
1248 | { | |
1249 | int maffixes; | |
1250 | int naffixes; | |
1251 | AFFIX *Affix; | |
1252 | + char compoundcontrol; /* NINormalizeWord tries compound splitting only when this is not a tab character */ | |
1253 | |
1254 | int nspell; | |
1255 | int mspell; | |
1256 | SPELL *Spell; | |
1257 | - Tree_struct SpellTree; | |
1258 | - Tree_struct PrefixTree; | |
1259 | - Tree_struct SuffixTree; | |
1260 | + | |
1261 | + AffixNode *Suffix; /* root of the suffix tree; released by freeANode in NIFree */ | |
1262 | + AffixNode *Prefix; /* root of the prefix tree; released by freeANode in NIFree */ | |
1263 | + | |
1264 | + SPNode *Dictionary; /* root of the word tree searched by SplitToVariants; released by freeSPNode in NIFree */ | |
1265 | + char **AffixData; /* NULL-terminated array of strings, each freed individually in NIFree */ | |
1266 | + CMPDAffix *CompoundAffix; /* array passed to CheckCompoundAffixes; freed as one allocation in NIFree */ | |
1267 | |
1268 | } IspellDict; | |
1269 | ||
1270 | -char **NormalizeWord(IspellDict * Conf, char *word); | |
1271 | -int ImportAffixes(IspellDict * Conf, const char *filename); | |
1272 | -int ImportDictionary(IspellDict * Conf, const char *filename); | |
1273 | - | |
1274 | -int AddSpell(IspellDict * Conf, const char *word, const char *flag); | |
1275 | -int AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type); | |
1276 | -void SortDictionary(IspellDict * Conf); | |
1277 | -void SortAffixes(IspellDict * Conf); | |
1278 | -void FreeIspell(IspellDict * Conf); | |
1279 | +char **NINormalizeWord(IspellDict * Conf, char *word); | |
1280 | +int NIImportAffixes(IspellDict * Conf, const char *filename); | |
1281 | +int NIImportDictionary(IspellDict * Conf, const char *filename); | |
1282 | + | |
1283 | +int NIAddSpell(IspellDict * Conf, const char *word, const char *flag); | |
1284 | +int NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type); | |
1285 | +void NISortDictionary(IspellDict * Conf); | |
1286 | +void NISortAffixes(IspellDict * Conf); | |
1287 | +void NIFree(IspellDict * Conf); | |
1288 | ||
1289 | #endif | |
1290 | diff -uNr postgresql-7.4/contrib/tsearch2/my2ispell/Makefile postgresql-7.4.fixed/contrib/tsearch2/my2ispell/Makefile | |
1291 | --- postgresql-7.4/contrib/tsearch2/my2ispell/Makefile 1970-01-01 01:00:00.000000000 +0100 | |
1292 | +++ postgresql-7.4.fixed/contrib/tsearch2/my2ispell/Makefile 2003-12-18 17:46:03.000000000 +0100 | |
1293 | @@ -0,0 +1,47 @@ | |
1294 | +ZIPFILE=nb_NO | |
1295 | +LANGUAGE=norsk | |
1296 | + | |
1297 | + | |
1298 | +UNZIP=unzip -o | |
1299 | + | |
1300 | + | |
1301 | +all: $(LANGUAGE).dict $(LANGUAGE).aff | |
1302 | + | |
1303 | +$(ZIPFILE).aff: $(ZIPFILE).zip | |
1304 | + $(UNZIP) $? $@ | |
1305 | + touch $@ | |
1306 | + | |
1307 | + | |
1308 | +# 1 Clean up the dictionary | |
1309 | +# 2 Remove the " symbol | |
1310 | +# 3 Add the "compoundwords controlled" flag to words that lack it but | |
1311 | +# have compound-only suffixes | |
1312 | + | |
1313 | +$(LANGUAGE).dict: $(ZIPFILE).zip | |
1314 | + $(UNZIP) $? $(ZIPFILE).dic | |
1315 | + grep -v -E '^[[:digit:]]+$$' < $(ZIPFILE).dic \ | |
1316 | + | grep -v '\.' \ | |
1317 | + | sed -e 's/"//g' \ | |
1318 | + | perl -pi -e 's|/(\S+)| $$q=$$1; ( $$q=~/[\\_`]/ && $$q!~/z/ ) ? "/$${q}z" : "/$${q}"|e' \ | |
1319 | + | sort \ | |
1320 | + > $@ | |
1321 | + | |
1322 | +# Just convert the affix file | |
1323 | + | |
1324 | +$(LANGUAGE).aff: $(ZIPFILE).aff | |
1325 | + grep -v -i zyzyzy $(ZIPFILE).aff \ | |
1326 | + | grep -v -i zyzyzy \ | |
1327 | + | perl -pi \ | |
1328 | + -e 's/^COMPOUNDFLAG\s+(\S+)/compoundwords controlled $$1/;' \ | |
1329 | + -e 's/^COMPOUNDMIN\s+(\d+)/compoundmin $$1/;' \ | |
1330 | + -e 's/^PFX\s+(\S+)\s+Y\s+\d+.*$$/ if ( !$$wasprf ) { $$wasprf=1; "prefixes\n\nflag $$1:" } else { "flag $$1:" } /e;' \ | |
1331 | + -e 's/^PFX\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)/ uc(" $$3 > $$2")/e;' \ | |
1332 | + -e 's/^(.*)SFX\s+(\S+)\s+([YN])\s+\d+.*$$/ $$flg=($$3 eq "Y") ? "*" : ""; $$flg="~$$flg" if length $$1; $$q=$$2; $$q="\\$$q" if $$q!~m#[a-zA-Z]#; if ( !$$wassfx ) { $$wassfx=1; "suffixes\n\nflag $$flg$$q:" } else { "flag $$flg$$q:" } /e;' \ | |
1333 | + -e 's/^.*SFX\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)/ uc(" $$3 > ".( ($$1 eq "0") ? "" : "-$$1,").( ($$2 eq "0") ? "" : "$$2") )/e;' \ | |
1334 | + -e 's/^(SET|TRY)/#$$1/' \ | |
1335 | + > $@ | |
1336 | + | |
1337 | +clean: | |
1338 | + rm -rf $(ZIPFILE).aff $(ZIPFILE).dic $(LANGUAGE).dict $(LANGUAGE).aff | |
1339 | + | |
1340 | + | |
1341 | diff -uNr postgresql-7.4/contrib/tsearch2/my2ispell/README postgresql-7.4.fixed/contrib/tsearch2/my2ispell/README | |
1342 | --- postgresql-7.4/contrib/tsearch2/my2ispell/README 1970-01-01 01:00:00.000000000 +0100 | |
1343 | +++ postgresql-7.4.fixed/contrib/tsearch2/my2ispell/README 2003-12-18 17:46:03.000000000 +0100 | |
1344 | @@ -0,0 +1,12 @@ | |
1345 | +Utility for converting a MySpell dictionary and affix file from | |
1346 | +MySpell to ispell format. | |
1347 | +The utility was tested on nb_NO.zip and nn_NO.zip from | |
1348 | +OpenOffice (http://lingucomponent.openoffice.org/download_dictionary.html) | |
1349 | + | |
1350 | +Usage: | |
1351 | +For example, to build the Norwegian dictionary and affix files: | |
1352 | +% cp nb_NO.zip my2ispell | |
1353 | +% cd my2ispell | |
1354 | +% gmake ZIPFILE=nb_NO LANGUAGE=norsk | |
1355 | + | |
1356 | +Author: Teodor Sigaev <teodor@sigaev.ru> |