root/trunk/components/server/src/ome/services/fulltext/FullTextAnalyzer.java
| Revision 2487, 2.2 kB (checked in by jmoore, 7 months ago) |
|---|
| Line | |
|---|---|
| 1 | /* |
| 2 | * $Id$ |
| 3 | * |
| 4 | * Copyright 2008 Glencoe Software, Inc. All rights reserved. |
| 5 | * Use is subject to license terms supplied in LICENSE.txt |
| 6 | */ |
| 7 | |
| 8 | package ome.services.fulltext; |
| 9 | |
| 10 | import java.io.Reader; |
| 11 | |
| 12 | import org.apache.commons.logging.Log; |
| 13 | import org.apache.commons.logging.LogFactory; |
| 14 | import org.apache.lucene.analysis.Analyzer; |
| 15 | import org.apache.lucene.analysis.CharTokenizer; |
| 16 | import org.apache.lucene.analysis.LetterTokenizer; |
| 17 | import org.apache.lucene.analysis.LowerCaseTokenizer; |
| 18 | import org.apache.lucene.analysis.SimpleAnalyzer; |
| 19 | import org.apache.lucene.analysis.TokenStream; |
| 20 | |
| 21 | /** |
| 22 | * {@link Analyzer} implementation based largely on {@link SimpleAnalyzer}, but |
| 23 | * with extensions for handling scientific and OS-type strings. |
| 24 | * |
| 25 | * @author Josh Moore, josh at glencoesoftware.com |
| 26 | * @since 3.0-Beta3 |
| 27 | */ |
| 28 | public class FullTextAnalyzer extends Analyzer { |
| 29 | |
| 30 | private final static Log log = LogFactory.getLog(FullTextAnalyzer.class); |
| 31 | |
| 32 | static { |
| 33 | log.info("Initialized FullTextAnalyzer"); |
| 34 | } |
| 35 | |
| 36 | /** |
| 37 | * Based on {@link LowerCaseTokenizer}, with the same optimization. |
| 38 | * However, in order to do alphanumeric tokenizing, rather than just |
| 39 | * alphabetic, it was necessary to combine that implementation with |
| 40 | * {@link LetterTokenizer} and extend {@link CharTokenizer} directly. |
| 41 | * |
| 42 | */ |
| 43 | static class LowercaseAlphaNumericTokenizer extends CharTokenizer { |
| 44 | |
| 45 | public LowercaseAlphaNumericTokenizer(Reader input) { |
| 46 | super(input); |
| 47 | } |
| 48 | |
| 49 | /** |
| 50 | * Returns true if "c" is {@link Character#isLetter(char)} or |
| 51 | * {@link Character#isDigit(char)}. |
| 52 | */ |
| 53 | @Override |
| 54 | protected boolean isTokenChar(char c) { |
| 55 | return Character.isLetter(c) || Character.isDigit(c); |
| 56 | } |
| 57 | |
| 58 | /** |
| 59 | * Lower cases via {@link Character#toLowerCase(char)} |
| 60 | */ |
| 61 | @Override |
| 62 | protected char normalize(char c) { |
| 63 | return Character.toLowerCase(c); |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | /** |
| 68 | * Returns a {@link LowercaseAlphaNumericTokenizer} |
| 69 | */ |
| 70 | @Override |
| 71 | public TokenStream tokenStream(String fieldName, Reader reader) { |
| 72 | return new LowercaseAlphaNumericTokenizer(reader); |
| 73 | } |
| 74 | |
| 75 | } |
Note: See TracBrowser
for help on using the browser.
