[Snowball-discuss] Evening -> Even, Marine -> Marin

Thu Jun 22 16:51:24 BST 2017

Hi All,

I help maintain a Drupal site where a database search will be replaced with
an Apache Solr search.  I'm having some trouble with stemming.  When
searching for the word *evening*, the results include excerpts with the
word *even*.  When searching for the last name *Marin*, results include
excerpts with the word *marine*.  The excerpts do not include evening or
Marin.  What is the best way to handle this?  Should I add these words to
the protected words file as we come across them?  I am using the
*SnowballPorterFilterFactory*.  Below is the configuration we are using
that comes with Apache Solr 3.6.1:

<!--
  Field type "text" (solr.TextField) with separate index-time and query-time
  analyzer chains. Both chains tokenize with solr.WhitespaceTokenizerFactory
  and end with the English SnowballPorterFilterFactory, which is given
  protwords.txt as its protected-words file.

  Restored from a hard-wrapped e-mail copy: attribute values and names that
  were broken across lines have been rejoined, missing whitespace between
  attributes has been reinstated, and e-mail emphasis asterisks have been
  removed from the stemmer class name.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <!-- Index-time chain: catenateWords and catenateNumbers are enabled ("1"). -->
  <analyzer type="index">
    <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            protected="protwords.txt"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"
            preserveOriginal="1"/>
    <filter class="solr.LengthFilterFactory" min="2" max="100"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Stemmer under discussion; words listed in protwords.txt are passed as protected. -->
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
  <!--
    Query-time chain: same as the index chain except that a SynonymFilterFactory
    is inserted after the tokenizer and catenateWords/catenateNumbers are "0".
  -->
  <analyzer type="query">
    <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            protected="protwords.txt"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"
            preserveOriginal="1"/>
    <filter class="solr.LengthFilterFactory" min="2" max="100"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Stemmer under discussion; words listed in protwords.txt are passed as protected. -->
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>

Thank you for any help,
Ann
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.tartarus.org/mailman/private/snowball-discuss/attachments/20170622/622cb9ba/attachment.html>