[Snowball-discuss] Turkish Stemmer

Wed Jan 17 14:11:40 GMT 2007

Skipped content of type multipart/alternative-------------- next part --------------
/* Stemmer for Turkish 
	* author: Evren (Kapusuz) Çilden
	* email: evren.kapusuz at gmail.com
	* version: 1.0 (15.01.2007)

	* stems nominal verb suffixes
	* stems nominal inflections
	* more than one syllable word check
	* (y,n,s,U) context check
	* vowel harmony check
	* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)

	* The stemming algorithm is based on the paper "An Affix Stripping 
	* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
	* Eşref Adalı (Proceedings of the IASTED International Conference
	* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
	* Innsbruck, Austria)

	* Turkish is an agglutinative language and has a very rich morphological
	* structure. In Turkish, you can form many different words from a single stem
	* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
	* "You had been the doctor of him". The stem of the word is "doktor" and it 
	* takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
	* the append order of suffixes can be clearly described as FSMs. 
	* The paper referenced above defines some FSMs for right to left
	* morphological analysis. I generated a method for constructing snowball
	* expressions from right to left FSMs for stemming suffixes. 
*/

// Declaration of every routine defined in this file.
// In the suffix names, capital letters stand for a set of alternative letters
// (e.g. U for the four vowels of the U grouping, A for a/e, D for d/t), as can
// be seen from the among lists in the corresponding mark_* definitions.
routines (
	append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
	check_vowel_harmony	// tests vowel harmony for suffixes
	is_reserved_word	// tests whether current string is a reserved word ('ad','soyad')
	mark_cAsInA		// nominal verb suffix
	mark_DA			// noun suffix
	mark_DAn		// noun suffix
	mark_DUr		// nominal verb suffix
	mark_ki			// noun suffix
	mark_lAr		// noun suffix, nominal verb suffix
	mark_lArI		// noun suffix
	mark_nA			// noun suffix
	mark_ncA		// noun suffix
	mark_ndA		// noun suffix
	mark_ndAn		// noun suffix
	mark_nU			// noun suffix
	mark_nUn		// noun suffix
	mark_nUz		// nominal verb suffix
	mark_sU			// noun suffix
	mark_sUn		// nominal verb suffix
	mark_sUnUz		// nominal verb suffix
	mark_possessives	// -(U)m, -(U)n, -(U)mUz, -(U)nUz
	mark_yA			// noun suffix
	mark_ylA		// noun suffix
	mark_yU			// noun suffix
	mark_yUm		// nominal verb suffix
	mark_yUz		// nominal verb suffix
	mark_yDU		// nominal verb suffix
	mark_yken		// nominal verb suffix
	mark_ymUs_		// nominal verb suffix
	mark_ysA		// nominal verb suffix

	// context checks applied to the text just before a matched suffix
	mark_suffix_with_optional_y_consonant
	mark_suffix_with_optional_U_vowel
	mark_suffix_with_optional_n_consonant
	mark_suffix_with_optional_s_consonant

	// precondition tested by stem before any stemming is attempted
	more_than_one_syllable_word

	// clean-up applied after suffix removal
	post_process_last_consonants
	postlude

	// suffix-chain drivers
	stem_nominal_verb_suffixes
	stem_noun_suffixes
	stem_suffix_chain_before_ki
)

/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef c.   	hex 'E7'	// LATIN SMALL LETTER C WITH CEDILLA
stringdef g~   	hex '011F'	// LATIN SMALL LETTER G WITH BREVE
stringdef i'   	hex '0131'	// LATIN SMALL LETTER DOTLESS I
stringdef o"  	hex 'F6'	// LATIN SMALL LETTER O WITH DIAERESIS
stringdef s.	hex '015F'	// LATIN SMALL LETTER S WITH CEDILLA
stringdef u"  	hex 'FC'	// LATIN SMALL LETTER U WITH DIAERESIS

// inside string literals, {name} is replaced by the stringdef called name
stringescapes 	{ }

integers 	( strlen )	// scratch value assigned and compared in is_reserved_word

// set and unset in stem_nominal_verb_suffixes, tested in stem
booleans	( continue_stemming_noun_suffixes )

groupings 	( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)

define vowel 	'ae{i'}io{o"}u{u"}'	// all eight vowels
define U	'{i'}iu{u"}'		// the four vowels a capital U in a suffix name stands for

// the vowel grouping definitions below are used for checking vowel harmony:
// check_vowel_harmony reads the last vowel and then searches for a member of
// the grouping paired with that vowel
define vowel1  	'a{i'}ou' 		// searched for when the last vowel is 'a'
define vowel2  	'ei{o"}{u"}' 		// searched for when the last vowel is 'e'
define vowel3  	'a{i'}' 		// searched for when the last vowel is dotless i
define vowel4  	'ei'	 		// searched for when the last vowel is 'i'
define vowel5  	'ou'	 		// searched for when the last vowel is 'o' or 'u'
define vowel6  	'{o"}{u"}' 		// searched for when the last vowel is 'o"' or 'u"'

// the only routine visible to callers of the generated stemmer
externals 	( stem )

backwardmode (
	// Vowel-harmony guard, added to limit over-stemming.
	// The whole body runs under `test`, so the cursor is left where it was.
	// It moves to the nearest vowel, reads that vowel, and then requires that a
	// character of the grouping paired with it can be found further on.
	define check_vowel_harmony as (
		test (
			goto vowel   // fails if no vowel is left
			(
				('a' goto vowel1)
				or ('e' goto vowel2)
				or ('{i'}' goto vowel3)
				or ('i' goto vowel4)
				or ('o' goto vowel5)
				or ('{o"}' goto vowel6)
				or ('u' goto vowel5)
				or ('{u"}' goto vowel6)
			)
		)
	)

	// Context check for suffixes with an optional buffer consonant 'n'.
	// Either: the character at the cursor is 'n'; step over it and require that
	//         the character after it (in the direction of travel) is a vowel.
	// Or:     the character at the cursor is not 'n'; without moving, require
	//         that the character one step beyond it is a vowel.
	// Assumption: the slice beginning has already been set by the caller.
	define mark_suffix_with_optional_n_consonant as (
		(test 'n' next test vowel)
		or
		(not test 'n' test (next test vowel))
	)

	// Context check for suffixes with an optional buffer consonant 's'.
	// Either: the character at the cursor is 's'; step over it and require that
	//         the character after it (in the direction of travel) is a vowel.
	// Or:     the character at the cursor is not 's'; without moving, require
	//         that the character one step beyond it is a vowel.
	// Assumption: the slice beginning has already been set by the caller.
	define mark_suffix_with_optional_s_consonant as (
		(test 's' next test vowel)
		or
		(not test 's' test (next test vowel))
	)

	// Context check for suffixes with an optional buffer consonant 'y'.
	// Either: the character at the cursor is 'y'; step over it and require that
	//         the character after it (in the direction of travel) is a vowel.
	// Or:     the character at the cursor is not 'y'; without moving, require
	//         that the character one step beyond it is a vowel.
	// Assumption: the slice beginning has already been set by the caller.
	define mark_suffix_with_optional_y_consonant as (
		(test 'y' next test vowel)
		or
		(not test 'y' test (next test vowel))
	)

	// Context check for suffixes with an optional leading vowel from U.
	// Either: the character at the cursor is in U; step over it and require
	//         that the character after it is not a vowel.
	// Or:     the character at the cursor is not in U; without moving, require
	//         that the character one step beyond it is not a vowel.
	define mark_suffix_with_optional_U_vowel as (
		(test U next test non-vowel)
		or
		(not test U test (next test non-vowel))
	)

	// Possessive endings -(U)m, -(U)n, -(U)mUz, -(U)nUz: match the part after
	// the optional vowel, then apply the optional-U context check.
	define mark_possessives as (
		among (
			'm{i'}z' 'miz' 'muz' 'm{u"}z'
			'n{i'}z' 'niz' 'nuz' 'n{u"}z'
			'm' 'n'
		)
		mark_suffix_with_optional_U_vowel
	)

	// -(s)U noun suffix: vowel-harmony guard, one vowel from U, then the
	// optional-'s' context check.
	define mark_sU as (
		check_vowel_harmony U mark_suffix_with_optional_s_consonant
	)

	// -lArI noun suffix: 'leri' or 'lar' followed by dotless i.
	// The dotless i is written with the {i'} string escape, as in every other
	// among list in this file. The literal '?' previously here was a
	// character-encoding casualty and could never match the real suffix.
	define mark_lArI as (
		among ('leri' 'lar{i'}')
	)

	// -(y)U noun suffix: vowel-harmony guard, one vowel from U, then the
	// optional-'y' context check.
	define mark_yU as (
		check_vowel_harmony U mark_suffix_with_optional_y_consonant
	)

	// -nU noun suffix: 'n' plus one of the four U vowels, guarded by the
	// vowel-harmony test.
	define mark_nU as (
		check_vowel_harmony
		among (
			'n{i'}'
			'ni'
			'nu'
			'n{u"}'
		)
	)

	// -(n)Un noun suffix: Un in its four forms, guarded by the vowel-harmony
	// test and followed by the optional-'n' context check.
	define mark_nUn as (
		check_vowel_harmony
		among (
			'{i'}n'
			'in'
			'un'
			'{u"}n'
		)
		mark_suffix_with_optional_n_consonant
	)

	// -(y)A noun suffix: 'a' or 'e', guarded by the vowel-harmony test and
	// followed by the optional-'y' context check.
	define mark_yA as (
		check_vowel_harmony
		among ('a' 'e')
		mark_suffix_with_optional_y_consonant
	)

	// -nA noun suffix: 'na' or 'ne', guarded by the vowel-harmony test.
	define mark_nA as (
		check_vowel_harmony among ('na' 'ne')
	)

	// -DA noun suffix: d/t followed by a/e, guarded by the vowel-harmony test.
	define mark_DA as (
		check_vowel_harmony
		among (
			'da' 'de'
			'ta' 'te'
		)
	)

	// -ndA noun suffix: 'nda' or 'nde', guarded by the vowel-harmony test.
	define mark_ndA as (
		check_vowel_harmony among ('nda' 'nde')
	)

	// -DAn noun suffix: d/t, a/e, then 'n'; guarded by the vowel-harmony test.
	define mark_DAn as (
		check_vowel_harmony
		among (
			'dan' 'den'
			'tan' 'ten'
		)
	)

	// -ndAn noun suffix: 'ndan' or 'nden', guarded by the vowel-harmony test.
	define mark_ndAn as (
		check_vowel_harmony among ('ndan' 'nden')
	)

	// -(y)lA noun suffix: 'la' or 'le', guarded by the vowel-harmony test and
	// followed by the optional-'y' context check.
	define mark_ylA as (
		check_vowel_harmony
		among ('la' 'le')
		mark_suffix_with_optional_y_consonant
	)

	// -ki noun suffix: a single fixed form, no harmony test.
	define mark_ki as ('ki')

	// -(n)cA noun suffix: 'ca' or 'ce', guarded by the vowel-harmony test and
	// followed by the optional-'n' context check.
	define mark_ncA as (
		check_vowel_harmony
		among ('ca' 'ce')
		mark_suffix_with_optional_n_consonant
	)

	// -(y)Um nominal verb suffix: Um in its four forms, guarded by the
	// vowel-harmony test and followed by the optional-'y' context check.
	define mark_yUm as (
		check_vowel_harmony
		among (
			'{i'}m'
			'im'
			'um'
			'{u"}m'
		)
		mark_suffix_with_optional_y_consonant
	)

	// -sUn nominal verb suffix in its four forms, guarded by the
	// vowel-harmony test.
	define mark_sUn as (
		check_vowel_harmony
		among (
			's{i'}n'
			'sin'
			'sun'
			's{u"}n'
		)
	)

	// -(y)Uz nominal verb suffix: Uz in its four forms, guarded by the
	// vowel-harmony test and followed by the optional-'y' context check.
	define mark_yUz as (
		check_vowel_harmony
		among (
			'{i'}z'
			'iz'
			'uz'
			'{u"}z'
		)
		mark_suffix_with_optional_y_consonant
	)

	// -sUnUz nominal verb suffix in its four forms; no harmony test is run.
	define mark_sUnUz as (
		among (
			's{i'}n{i'}z'
			'siniz'
			'sunuz'
			's{u"}n{u"}z'
		)
	)

	// -lAr suffix (listed as both a noun and a nominal verb suffix): 'ler' or
	// 'lar', guarded by the vowel-harmony test.
	define mark_lAr as (
		check_vowel_harmony among ('ler' 'lar')
	)

	// -nUz nominal verb suffix in its four forms, guarded by the
	// vowel-harmony test.
	define mark_nUz as (
		check_vowel_harmony
		among (
			'n{i'}z'
			'niz'
			'nuz'
			'n{u"}z'
		)
	)

	// -DUr nominal verb suffix: t/d, one of the four U vowels, then 'r';
	// guarded by the vowel-harmony test.
	define mark_DUr as (
		check_vowel_harmony
		among (
			't{i'}r' 'tir' 'tur' 't{u"}r'
			'd{i'}r' 'dir' 'dur' 'd{u"}r'
		)
	)

	// -cAsInA nominal verb suffix: two fixed forms; no harmony test is run.
	define mark_cAsInA as (
		among (
			'cas{i'}na'
			'cesine'
		)
	)

	// -(y)DU nominal verb suffix: t/d plus a U vowel, optionally followed by
	// 'm', 'n' or 'k'. Guarded by the vowel-harmony test and followed by the
	// optional-'y' context check.
	define mark_yDU as (
		check_vowel_harmony
		among (
			// ... + 'm'
			't{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
			// ... + 'n'
			't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
			// ... + 'k'
			't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
			// bare form
			't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}'
		)
		mark_suffix_with_optional_y_consonant
	)

	// -(y)sA nominal verb suffix: 'sa'/'se', optionally followed by 'm', 'n'
	// or 'k', then the optional-'y' context check.
	// This suffix does not fully obey vowel harmony, so no harmony test is run.
	define mark_ysA as (
		among (
			'sam' 'san' 'sak'
			'sem' 'sen' 'sek'
			'sa' 'se'
		)
		mark_suffix_with_optional_y_consonant
	)

	// -(y)mUs nominal verb suffix (final letter is s with cedilla): four forms,
	// guarded by the vowel-harmony test and followed by the optional-'y'
	// context check.
	define mark_ymUs_ as (
		check_vowel_harmony
		among (
			'm{i'}{s.}'
			'mi{s.}'
			'mu{s.}'
			'm{u"}{s.}'
		)
		mark_suffix_with_optional_y_consonant
	)

	// -(y)ken nominal verb suffix: the fixed form 'ken', then the optional-'y'
	// context check. No harmony test is run.
	define mark_yken as (
		'ken'
		mark_suffix_with_optional_y_consonant
	)

	// Removes a chain of nominal verb suffixes from the end of the word.
	// One end of the slice is marked first and continue_stemming_noun_suffixes
	// is set; then six alternative chains are tried in order. Whatever ends up
	// between the slice marks is deleted by the final "] delete".
	// Several alternatives close the slice and delete part-way through, then
	// open a new slice inside try(...) for an optional further suffix.
	define stem_nominal_verb_suffixes as (
		[	
			set continue_stemming_noun_suffixes
			// 1: a single -ymUs / -yDU / -ysA / -yken
			(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
			or
			// 2: -cAsInA, an optional person/plural suffix, then a required -ymUs
			(mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
			or
			(
				// 3: -lAr, optionally preceded by -DUr / -yDU / -ysA / -ymUs
				mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
				// this is the only alternative that clears the flag tested by stem,
				// so noun-suffix stemming is skipped after it
				unset continue_stemming_noun_suffixes 
			)
			or 
			// 4: -nUz preceded by -yDU or -ysA
			(mark_nUz (mark_yDU or mark_ysA))
			or
			// 5: a person suffix, optionally preceded by -ymUs
			((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
			or 
			// 6: -DUr, optionally preceded by [person/plural suffix] + -ymUs
			(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
		]delete
	)

	// stems noun suffix chains ending with -ki
	// Marks one end of the slice, matches 'ki' through mark_ki, and then
	// requires one of -DA, -nUn or -ndA in front of it. Each branch closes the
	// slice and deletes, then optionally continues with further suffixes.
	// The routine calls itself to handle repeated -ki chains.
	define stem_suffix_chain_before_ki as (
		[
			mark_ki
			( 
				// -DA + ki, optionally preceded by -lAr or a possessive
				(mark_DA] delete try([
					(mark_lAr] delete try(stem_suffix_chain_before_ki))
					or
					(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))

				))
				or
				// -nUn + ki, optionally preceded by -lArI, a possessive / -sU, or another -ki chain
				(mark_nUn] delete try([
					(mark_lArI] delete)
					or
					([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
					or
					(stem_suffix_chain_before_ki)
				))
				or
				// -ndA + ki: here one of the three continuations is required, not optional
				(mark_ndA (	
					(mark_lArI] delete)
					or 
					((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
					or
					(stem_suffix_chain_before_ki)
				))
			)
	)

	// Removes a chain of noun suffixes from the end of the word.
	// The alternatives are tried in order and the first one that succeeds wins.
	// Each alternative brackets a suffix with [ ... ] and deletes it, and most
	// then continue with further suffixes and/or stem_suffix_chain_before_ki.
	define stem_noun_suffixes as (
		// -lAr, optionally preceded by a -ki chain
		([mark_lAr] delete try(stem_suffix_chain_before_ki))
		or
		// -ncA, optionally preceded by -lArI, a possessive / -sU, or -lAr + -ki chain
		([mark_ncA] delete 
			try(
				([mark_lArI] delete)
				or
				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				([mark_lAr] delete stem_suffix_chain_before_ki)
			)
		)
		or
		// -ndA / -nA, which must be preceded by -lArI, -sU or a -ki chain
		([(mark_ndA or mark_nA) 
			(
		  		(mark_lArI] delete)
		  		or
		  		(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
		  		or
		  		(stem_suffix_chain_before_ki)
		  	)
		)
		or   
		// -ndAn / -nU, which must be preceded by -sU or -lArI
		// NOTE(review): the mark_lArI branch succeeds without closing the slice or
		// deleting anything - confirm this is intended
		([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
		or
		// -DAn, optionally preceded by a possessive, -lAr or a -ki chain
		( [mark_DAn] delete try ([ 
			(
		 		(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
		 		or 
		 		(mark_lAr] delete try(stem_suffix_chain_before_ki))
		 		or 
		 		(stem_suffix_chain_before_ki)
		 	))
		)
		or
		// -nUn / -ylA, optionally preceded by -lAr + -ki chain, a possessive / -sU, or a -ki chain
		([mark_nUn or mark_ylA] delete 
			try( 
				([mark_lAr] delete stem_suffix_chain_before_ki)
				or
				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				stem_suffix_chain_before_ki 
			)
		)
		or 
		// -lArI on its own
		([mark_lArI] delete)
		or	
		// a -ki chain on its own
		(stem_suffix_chain_before_ki)
		or
		// -DA / -yU / -yA, optionally preceded by a possessive and/or -lAr and then a -ki chain
		([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
		or
		// a possessive or -sU, optionally preceded by -lAr + -ki chain
		([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
	)

	// Rewrites the last letter of the stem when it is one of four consonants:
	// b -> p, c -> c-cedilla, d -> t, g-breve -> k.
	define post_process_last_consonants as (
		[substring] among (
			'b'      (<- 'p')
			'c'      (<- '{c.}')
			'd'      (<- 't')
			'{g~}'   (<- 'k')
		)
	)

	// If the stemmed word ends with 'd' or 'g', the final U vowel has most
	// probably been over-stemmed (as in 'kedim' -> 'ked'), because Turkish
	// words do not usually end with 'd' or 'g'. A few very well known words
	// ('ad', 'soyad') are excluded by the caller via is_reserved_word.
	// The vowel to insert is chosen from the nearest vowel found in the stem:
	//   a / dotless i -> dotless i,   e / i -> i,
	//   o / u -> u,                   o" / u" -> u"
	define append_U_to_stems_ending_with_d_or_g as (
		test ('d' or 'g')
		(
			(test (goto vowel ('a' or '{i'}')) <+ '{i'}')
			or
			(test (goto vowel ('e' or 'i')) <+ 'i')
			or
			(test (goto vowel ('o' or 'u')) <+ 'u')
			or
			(test (goto vowel ('{o"}' or '{u"}')) <+ '{u"}')
		)
	)

)

// Succeeds when the word has at least two vowels, leaving the cursor unmoved.
// In Turkish each vowel marks a distinct syllable, so this is a test for
// words of more than one syllable.
define more_than_one_syllable_word as (
	test (atleast 2 gopast vowel)
)

// Succeeds, without moving the cursor, for the reserved words 'ad' and 'soyad'.
// For each word: find it with gopast, store its length in strlen, and require
// strlen to equal limit - presumably this holds only when the whole word is
// exactly the reserved word.
define is_reserved_word as (
	test (
		gopast 'ad'
		($strlen = 2)
		($strlen == limit)
	)
	or
	test (
		gopast 'soyad'
		($strlen = 5)
		($strlen == limit)
	)
)

// Final clean-up, skipped entirely for reserved words.
// Working backwards from the end of the word, it first tries to re-append a
// vowel after a final 'd'/'g' and then tries the last-consonant conversion.
// Both steps run under `do`, so neither can make postlude fail.
define postlude as (
	not is_reserved_word
	backwards (
		do append_U_to_stems_ending_with_d_or_g
		do post_process_last_consonants
	)
)

// Entry point. Only words with more than one syllable are stemmed.
// Working backwards, nominal verb suffixes are removed first. Noun suffixes
// are then removed only if continue_stemming_noun_suffixes is still set; when
// it has been unset the backwards block fails and postlude is not reached.
define stem as (
	more_than_one_syllable_word
	backwards (
		do stem_nominal_verb_suffixes
		continue_stemming_noun_suffixes
		do stem_noun_suffixes
	)
	postlude
)

-------------- next part --------------
A non-text attachment was scrubbed...
Name: TurkishStemmer.java
Type: text/x-java
Size: 116404 bytes
Desc: not available
Url : http://lists.tartarus.org/mailman/private/snowball-discuss/attachments/20070117/b0429bf1/TurkishStemmer-0001.java
-------------- next part --------------
A non-text attachment was scrubbed...
Name: TestTurkishStemmer.java
Type: text/x-java
Size: 1140 bytes
Desc: not available
Url : http://lists.tartarus.org/mailman/private/snowball-discuss/attachments/20070117/b0429bf1/TestTurkishStemmer-0001.java
-------------- next part --------------
A non-text attachment was scrubbed...
Name: StemmingTurkishWords.doc
Type: application/msword
Size: 133632 bytes
Desc: not available
Url : http://lists.tartarus.org/mailman/private/snowball-discuss/attachments/20070117/b0429bf1/StemmingTurkishWords-0001.doc
-------------- next part --------------
Here are the results
çocukmuş:->çocuk
kediymiş:->kedi
balığım:->balık
soyadım:->soyad
kedim:->kedi
kalem:->kale
doktoruymuşsunuz:->doktor
kalelerimizdekilerden:->kale
çocuğuymuşumcasına:->çocuk
kedileriyle:->kedi
çocuklarımmış:->çocuk
kitabımızdı:->kitap
kelimelerin:->kelime
kayısısı:->kayısıs
eriğinin:->erik
eriğindeki:->erik
gülsüm:->güls
suyunun:->suy
yargımız:->yargı
eriğinden:->erik
eriğine:->erik
eriğinde:->erik
kayısısına:->kayısı
kayısısında:->kayısı
yargının:->yargı
saatlerimiz:->saatler
kalemimin:->kalem
ucu:->uç
eriğinin:->erik
**********************:->**********************
kalelerdekilerden:->kale
kalelerdekilerin:->kale
kalelerimizdekilerden:->kale
kalelerimizdekilerde:->kale
kaleninkininkininkininkinin:->kale
kalemizinkininkininkinin:->kale
kalelerindeki:->kale
**********************_noun_suffix_tests_**********************:->**********************_noun_suffix_tests_**********************
erikleri:->erik
erikler:->erik
eriğim:->erik
eriğimiz:->erik
eriğin:->erik
eriğiniz:->erik
eriği:->erik
eriğini:->erik
eriğinin:->erik
eriğe:->erik
eriğine:->erik
eriklerine:->erik
erikte:->erik
eriğinde:->erik
erikten:->erik
eriğinden:->erik
eriğindeki:->erik
eriğiyle:->erik
eriğinin:->erik
eriğindeki:->erik
eriğince:->erik
etkilerden:->etki
eriksi:->eriksi
kayısısı:->kayısıs
g?l?m:->g?l
kalem:->kale
erikteki:->erik
eriktekilerden:->erik
eriklerdeki:->erik
**********************_last_consonent_tests_**********************:->**********************_last_consonent_tests_**********************
kitab:->kitap
ağac:->ağaç
haluğ:->haluk
**********************_optional_y_tests_**********************:->**********************_optional_y_tests_**********************
yum:->yum
eriğim:->erik
kayısıyım:->kayıs
**********************_nominal_verb_suffix_tests_**********************:->**********************_nominal_verb_suffix_tests_**********************
eriksem:->erik
eriksen:->erik
erikse:->erik
erikseniz:->erik
erikseler:->erik
erikti:->erik
kayısıydım:->kayıs
eriktim:->erik
eriktin:->erik
erikti:->erik
eriktiniz:->erik
eriktiler:->erik
erikmiş:->erik
erikmişcesine:->erik
erikmiştir:->erik
erikmişim:->erik
erikmişsin:->erik
erikmişsindir:->erik
erikmişimdir:->erik
erikmişiz:->erik
erikmişizdir:->erik
erikmişsiniz:->erik
erikmişsinizdir:->erik
erikmişler:->erik
erikmişlerdir:->erik
erikmişimcesine:->erik
erikmişsincesine:->erik
erikmişizcesine:->erik
erikmişsinizcesine:->erik
erikmişlercesine:->erik
erikler:->erik
eriğim:->erik
eriksin:->erik
erik:->erik
eriğiz:->erik
eriksiniz:->erik
erikler:->erik
eriktir:->erik
eriktirler:->erik
erikken:->erik
kayısıyken:->kayıs
t?ym??:->t?
k?t?ym??:->k?t
tersyüz:->tersyüz
yüz:->yüz
muş:->muş