Create a Java Module custom-solr-hybris-components-8.11.2
CustomFrenchMinimalStemFilterFactory contains the same code as FrenchMinimalStemFilterFactory; the only difference is that it references the custom classes.
package com.sap.custom.solr.lucene.analysis.fr;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory that wraps a token stream in a {@link CustomFrenchMinimalStemFilter}.
 *
 * <p>The factory takes no configuration parameters of its own: any entry still
 * present in the argument map after the superclass constructor has run is
 * rejected.
 */
public class CustomFrenchMinimalStemFilterFactory extends TokenFilterFactory {

  /** Short name of this factory (presumably the SPI lookup name; confirm against the Lucene version in use). */
  public static final String NAME = "customFrenchMinimalStem";

  /**
   * Creates the factory.
   *
   * @param args configuration arguments; handed to the superclass first
   * @throws IllegalArgumentException if {@code args} is not empty once the
   *     superclass constructor has returned
   */
  public CustomFrenchMinimalStemFilterFactory(Map<String, String> args) {
    super(args);
    if (args.isEmpty()) {
      return;
    }
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }

  /**
   * Wraps the given stream in the custom French minimal stem filter.
   *
   * @param input upstream token stream
   * @return a new {@link CustomFrenchMinimalStemFilter} reading from {@code input}
   */
  public TokenStream create(TokenStream input) {
    return new CustomFrenchMinimalStemFilter(input);
  }
}
CustomFrenchMinimalStemFilter contains the same code as FrenchMinimalStemFilter; the only difference is that it references the custom classes.
package com.sap.custom.solr.lucene.analysis.fr;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
 * Token filter that passes each token's term buffer through
 * {@link CustomFrenchMinimalStemmer} and shortens the term to the length the
 * stemmer returns.
 *
 * <p>Tokens whose {@link KeywordAttribute} reports {@code isKeyword()} are
 * forwarded unchanged.
 */
public final class CustomFrenchMinimalStemFilter extends TokenFilter {

  private final CustomFrenchMinimalStemmer stemmer = new CustomFrenchMinimalStemmer();
  private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keyword = addAttribute(KeywordAttribute.class);

  /**
   * @param input upstream token stream to read tokens from
   */
  public CustomFrenchMinimalStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream by one token and stems it unless it is a keyword.
   *
   * @return {@code true} if a token was produced, {@code false} once the wrapped
   *     stream is exhausted
   * @throws IOException if the wrapped stream fails to advance
   */
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keyword.isKeyword()) {
      // Keyword tokens keep their original length.
      return true;
    }
    final int stemmedLength = stemmer.stem(term.buffer(), term.length());
    term.setLength(stemmedLength);
    return true;
  }
}
CustomFrenchMinimalStemmer is based on FrenchMinimalStemmer, with additional rules added to cover the customer-specific requirements.
package com.sap.custom.solr.lucene.analysis.fr;
/**
 * Custom French stemmer derived from the stock FrenchMinimalStemmer, adjusted
 * for customer-specific requirements.
 *
 * <p>Rules applied to tokens of at least five characters:
 * <ul>
 *   <li>A trailing 'x' is removed; if the token ends in "aux" but not "eaux",
 *       the ending becomes "al" (chevaux becomes cheval, bateaux becomes bateau).</li>
 *   <li>A trailing 's' (plural) is removed.</li>
 *   <li>A trailing 'r' is <b>not</b> removed (no verb stemming).</li>
 *   <li>A trailing 'e' (feminine) is removed, except when the preceding letter is
 *       's' (liasse is not turned into 'liass'),
 *       'r' (timbre is not turned into 'timbr'),
 *       'i' (monnaie is not turned into 'monnai') or
 *       't' (porte is not turned into 'port').</li>
 *   <li>A trailing 'e' with acute accent is removed.</li>
 *   <li>If the last two remaining characters are the same letter, one of them
 *       is removed. Digits and other non-letter characters are left alone so
 *       that numeric tokens such as "10011" are not truncated.</li>
 * </ul>
 *
 * <p>To be enriched with additional specific requirements.
 */
public class CustomFrenchMinimalStemmer {

  /** Tokens shorter than this are returned untouched (the stock stemmer uses 6). */
  private static final int MIN_TOKEN_LENGTH = 5;

  /** Lower-case 'e' with acute accent (code point 233). */
  private static final char E_ACUTE = '\u00e9';

  public CustomFrenchMinimalStemmer() {
  }

  /**
   * Stems the token held in the first {@code len} characters of {@code s}.
   *
   * <p>The buffer may be modified in place (the "aux" to "al" rewrite overwrites
   * one character); otherwise stemming only shortens the reported length.
   *
   * @param s   character buffer holding the token
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed token, never greater than {@code len}
   */
  public int stem(char[] s, int len) {
    if (len < MIN_TOKEN_LENGTH) {
      return len;
    }

    if (s[len - 1] == 'x') {
      // "...aux" -> "...al", except for "...eaux" which only loses its 'x'.
      if (s[len - 3] == 'a' && s[len - 2] == 'u' && s[len - 4] != 'e') {
        s[len - 2] = 'l';
      }
      return len - 1;
    }

    // Plural 's'.
    if (s[len - 1] == 's') {
      --len;
    }

    // The stock rule removing a trailing 'r' is intentionally omitted here:
    // the customization keeps the 'r' (no stemming of verbs).

    // Feminine 'e', kept when the stem would otherwise end in s, r, i or t.
    if (s[len - 1] == 'e' && !keepsFinalE(s[len - 2])) {
      --len;
    }

    if (s[len - 1] == E_ACUTE) {
      --len;
    }

    // Collapse a doubled final letter. Restricted to letters so that tokens
    // ending in repeated digits or symbols keep their full length.
    if (s[len - 1] == s[len - 2] && Character.isLetter(s[len - 1])) {
      --len;
    }
    return len;
  }

  /** Tells whether a final 'e' must be kept because of the letter in front of it. */
  private static boolean keepsFinalE(char previous) {
    return previous == 's' || previous == 'r' || previous == 'i' || previous == 't';
  }
}
<!-- French text field type. The index-time and query-time analyzers below both
     end their visible filter chain with the custom minimal stem filter; the
     stock stemmers and ASCII folding are left commented out. -->
<fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
<!-- Analysis chain applied when documents are indexed. -->
<analyzer type="index">
[...]
<!-- <filter class="solr.SnowballPorterFilterFactory" language="French" />-->
<!-- <filter class="solr.FrenchLightStemFilterFactory" /> -->
<!-- <filter class="solr.FrenchMinimalStemFilterFactory" /> -->
<!-- <filter class="solr.ASCIIFoldingFilterFactory" /> -->
<filter class="com.sap.custom.solr.lucene.analysis.fr.CustomFrenchMinimalStemFilterFactory" />
[...]
</analyzer>
<!-- Analysis chain applied to search queries; uses the same stem filter as the index chain. -->
<analyzer type="query">
[...]
<!-- <filter class="solr.SnowballPorterFilterFactory" language="French" />-->
<!-- <filter class="solr.FrenchLightStemFilterFactory" /> -->
<!-- <filter class="solr.FrenchMinimalStemFilterFactory" /> -->
<!-- <filter class="solr.ASCIIFoldingFilterFactory" /> -->
<filter class="com.sap.custom.solr.lucene.analysis.fr.CustomFrenchMinimalStemFilterFactory" />
[...]
</analyzer>
</fieldType>
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
9 | |
9 | |
2 | |
2 | |
1 | |
1 | |
1 | |
1 | |
1 | |
1 |