[Snowball-discuss] a simple algorithm problem

ayhan peker ayhan at aramanet.com
Sun Dec 12 00:27:29 GMT 2004


Hi there
I have been working on a simple Turkish stemming algorithm, and I am having
some problems with the special Turkish characters. I am simply trying
to replace long words (more than 15 chars) with (l/w) and to strip some
Turkish suffixes.
My problem is that when a word contains Turkish chars (e.g. u") the algorithm
does not work.
When I test it with my code in Postgres:
 select lexize('tr12','asaiitılardıa');
 lexize
--------
 {l/w}
(1 row)
 
 select lexize('tr12','asaiitalardaa');
     lexize
-----------------
 {asaiitalardaa}
(1 row)
 The number of chars in the first query and in the second one is the same, but
the result of the first one is wrong whereas the second one is right.

I wonder if somebody can help me

thanks in advance.

ayhan peker

 here is my code :



// Declarations: every routine, external, integer and grouping name used
// in the definitions of this script is announced here.
externals ( stem )
routines  ( mark_regions R1 R2 common_suffix )
integers  ( p1 p2 p3 )
groupings ( v all )

// Inside string literals, {name} expands to the stringdef called `name`.
stringescapes {}

/* Special characters (in Turkish).
   NOTE(review): the hex values are single-byte codes that look like
   ISO-8859-9 (Latin-5) code points -- confirm that the text handed to the
   stemmer really is in that encoding; text in another encoding will not
   match these definitions. */

stringdef u"   hex 'FC'  // u with diaeresis
stringdef i^   hex 'FD'  // presumably dotless i -- confirm
stringdef o"   hex 'F6'  // presumably o with diaeresis -- confirm
stringdef s,   hex 'FE'  // presumably s with cedilla -- confirm
stringdef c,   hex 'E7'  // presumably c with cedilla -- confirm
stringdef g^   hex 'F0'  // presumably g with breve -- confirm

// Vowels: the five plain ones plus the three special characters above
// that are listed here.
define v 'aeiou{u"}{o"}{i^}'

// General character set: ASCII lower-case letters, digits and punctuation,
// plus all six special characters defined above. The special characters
// were previously missing, so any rule relying on this grouping treated
// them as foreign characters.
define all
'aeiouqwrtyplkjhgfdszxcvbnm1234567890!£$%^&*()-_=+[]@~;:/?><#'
+ '{u"}{o"}{i^}{s,}{c,}{g^}'

define mark_regions as (
    // Record the size of the word before anything is changed; p1 and p2
    // start out at the limit in case the scan below cannot place them.
    $p3 = size
    $p1 = limit
    $p2 = limit

    do (
        // p1: position just past the first non-vowel that follows a vowel.
        gopast v  gopast non-v  setmark p1
        // p2: the same pattern once more, continuing from p1.
        gopast v  gopast non-v  setmark p2
    )
)
backwardmode (

    // Region tests: succeed when the cursor is at or beyond mark p1 / p2.
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Looks for the longest of the listed endings at the cursor and
    // deletes it, provided the R1-or-R2 region test passes.
    define common_suffix as (
        [substring] among (
            'lar'           'ler'

            'dir'           'd{i^}r'

            'diler'         'dular'         'd{i^}lar'      'd{u"}ler'
            'tiler'         'tular'         't{i^}lar'      't{u"}ler'

            'mi{s,}'        'm{i^}{s,}'     'm{u"}{s,}'     'mu{s,}'
            'mi{s,}ler'     'm{i^}{s,}lar'  'm{u"}{s,}ler'  'mu{s,}lar'

            (R1 or R2 delete)
        )
    )
)


// Entry point. Marks the regions, strips one common suffix if possible,
// and finally replaces over-long words by the placeholder 'l/w'.
define stem as (

    do mark_regions

    backwards (
        do common_suffix

        do (
            // Only words longer than 15 are replaced. p3 is the size that
            // mark_regions recorded, i.e. the size before suffix removal.
            $p3 > 15

            // Bracket the whole word (from the current end back to the
            // start) and overwrite it in one step. The previous
            // character-by-character deletion only bracketed characters
            // belonging to the `all` grouping, so any other character
            // was stepped over and left behind in the result.
            [tolimit] <- 'l/w'
        )
    )
)

-- 
ayhan at aramanet.com
www.aramanet.com
En fazla websayfasi iceren turkce arama motoru 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.tartarus.org/mailman/private/snowball-discuss/attachments/20041212/659814b3/attachment.htm


More information about the Snowball-discuss mailing list