Lucene分析器的實現。
Lucene分詞器Tokenizer,它的繼承子類的實現。
Tokenizer類的繼承關係,如圖所示:
ChineseTokenizer類實現中文分詞
中文分詞在Lucene中的處理很簡單,就是單個字分。它的實現類為ChineseTokenizer,在包org.apache.lucene.analysis.cn中,源代碼如下:
package org.apache.lucene.analysis.cn;
    
    
      import java.io.Reader;
      
      import org.apache.lucene.analysis.*;
    
  
public final class ChineseTokenizer extends Tokenizer {
    
      ??? public ChineseTokenizer(Reader in) {
      
      ??????? input = in;
      
      ??? }
    
  
    
      ??? private int offset = 0, bufferIndex=0, dataLen=0;
      
      ??? private final static int MAX_WORD_LEN = 255;
      
      ??? private final static int IO_BUFFER_SIZE = 1024;
      
      ??? private final char[] buffer = new char[MAX_WORD_LEN];
      
      ??? private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    
  
    
      ??? private int length;
      
      ??? private int start;
    
  
??? private final void push(char c) {??? // 對待分詞的文本進行預(yù)處理,輸入到緩沖區(qū)buffer中
    
      ??????? if (length == 0) start = offset-1;??????????? 
      
        // 根據(jù)詞條長度,設(shè)置起始位置索引
      
      
      ??????? buffer[length++] = Character.toLowerCase(c); 
      
        // 預(yù)處理:將中文Unicode碼轉(zhuǎn)化成小寫
      
    
  
??? }
??? private final Token flush() { // 根據(jù)緩沖區(qū)預(yù)處理后的文本,構(gòu)造詞條
    
      ??????? if (length>0) {
      
      ??????????? return new Token(new String(buffer, 0, length), start, start+length);
      
      ??????? }
      
      ??????? else
      
      ??????????? return null;
      
      ??? }
    
  
??? public final Token next() throws java.io.IOException {??? // 返回下一個詞條
    
      ??????? length = 0;
      
      ??????? start = offset;
    
  
??????? while (true) {
    
      ??????????? final char c;
      
      ??????????? offset++;
    
  
    
      ??????????? if (bufferIndex >= dataLen) { 
      
        // 當緩沖區(qū)沒有溢出
      
      
      ??????????????? dataLen = input.read(ioBuffer);
      
      ??????????????? bufferIndex = 0;
      
      ??????????? }
    
  
    
      ??????????? if (dataLen == -1) return flush();
      
      ??????????? else
      
      ??????????????? c = ioBuffer[bufferIndex++];
    
  
??????????? switch(Character.getType(c)) {
    
      ??????????? case Character.DECIMAL_DIGIT_NUMBER:
      
      ??????????? case Character.LOWERCASE_LETTER:
      
      ??????????? case Character.UPPERCASE_LETTER:
      
      ??????????????? push(c);
      
      ??????????????? if (length == MAX_WORD_LEN) return flush();
      
      ??????????????? break;
    
  
    
      ??????????? case Character.OTHER_LETTER:
      
      ??????????????? if (length>0) {
      
      ??????????????????? bufferIndex--;
      
      ??????????????????? offset--;
      
      ??????????????????? return flush();
      
      ??????????????? }
      
      ??????????????? push(c);
      
      ??????????????? return flush();
    
  
    
      ??????????? default:
      
      ??????????????? if (length>0) return flush();
      
      ??????????????? break;
      
      ??????????? }
      
      ??????? }
    
  
    
      ??? }
      
      }
    
  
這裡,還提及到一個CJKTokenizer分詞類,它處理分詞的時候,比ChineseTokenizer分詞處理要好一點,但是也存在弊病,源代碼給了一個例子,如下:
如果一個中文詞匯C1C2C3C4被索引,使用ChineseTokenizer分詞,返回的詞條(Token)為:C1,C2,C3,C4;使用CJKTokenizer進行分詞,則返回的詞條(Token)為:C1C2,C2C3,C3C4。
問題在於:當檢索關鍵字為C1,C1C2,C1C3,C4C2,C1C2C3……的時候,ChineseTokenizer可以對其實現分詞,而CJKTokenizer就不能實現分詞了。
CJKTokenizer類實現中文分詞
CJKTokenizer類的源代碼如下所示:
package org.apache.lucene.analysis.cjk;
    
      import org.apache.lucene.analysis.Token;
      
      import org.apache.lucene.analysis.Tokenizer;
    
  
import java.io.Reader;
    
    
      public final class CJKTokenizer extends Tokenizer {
      
      ?????
    
    
      
         /** Max word length */
        
      
      ??? private static final int MAX_WORD_LEN = 255;
    
  
    
      ??
    
    
      
         /** buffer size: */
        
      
      ??? private static final int IO_BUFFER_SIZE = 256;
    
  
    
      ????
    
    
      
         /** word offset, used to imply which character(in ) is parsed */
        
      
      ??? private int offset = 0;
    
  
    
      ???
    
    
      
         /** the index used only for ioBuffer */
        
      
      ??? private int bufferIndex = 0;
    
  
    
      ???
      
         /** data length */
      
      
      ??? private int dataLen = 0;
    
  
    
      ???
    
    
      
         /**
        
        ???? * 字符緩沖區(qū),存儲那些經(jīng)過處理后返回的詞條
        
        ???? */
        
      
      ??? private final char[] buffer = new char[MAX_WORD_LEN];
    
  
    
      ???
    
    
      
         /**
        
        ???? * I/O buffer, used to store the content of the input(one of the <br>
        
        ???? * members of Tokenizer)
        
        ???? */
        
      
      ??? private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    
  
    
      ???
    
    
      
         /** word type: single=>ASCII double=>non-ASCII word=>default */
        
      
      ??? private String tokenType = "word";
    
  
??? private boolean preIsTokened = false;
    
      ?????? public CJKTokenizer(Reader in) {
      
      ??????? input = in;
      
      ??? }
    
  
    
      ????????? public final Token next() throws java.io.IOException {
      
      
      ??????? int length = 0;
    
  
    
      ??????? 
    
    
      
        /** the position used to create Token */
        
      
      ??????? int start = offset;
    
  
    
      ??????? while (true) {
      
      ???????????
    
    
      
         /** current charactor */
        
      
      ??????????? char c;
    
  
    
      ???????????
    
    
      
         /** unicode block of current charactor for detail */
        
      
      ??????????? Character.UnicodeBlock ub;
    
  
??????????? offset++;
    
      ??????????? if (bufferIndex >= dataLen) {
      
      ??????????????? dataLen = input.read(ioBuffer);
      
      ??????????????? bufferIndex = 0;
      
      ??????????? }
    
  
    
      ??????????? if (dataLen == -1) {
      
      ??????????????? if (length > 0) {
      
      ??????????????????? if (preIsTokened == true) {
      
      ??????????????????????? length = 0;
      
      ??????????????????????? preIsTokened = false;
      
      ??????????????????? }
    
  
    
      ??????????????????? break;
      
      ??????????????? } else {
      
      ??????????????????? return null;
      
      ??????????????? }
      
      ??????????? } else {
      
      ???????????????
      
         //get current character
      
      
      ??????????????? c = ioBuffer[bufferIndex++];
    
  
    
      ??????????????? 
      
        //get the UnicodeBlock of the current character
      
      
      ??????????????? ub = Character.UnicodeBlock.of(c);
      
      ??????????? }
    
  
    
      ???????????
    
    
      
         //if the current character is ASCII or Extend ASCII
        
      
      ??????????? if ((ub == Character.UnicodeBlock.BASIC_LATIN)
      
      ??????????????????? || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
      
      ?????????????? ) {
      
      ??????????????? if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
      
      ??????????????????? 
    
    
      
        /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
        
      
      ??????????????????? int i = (int) c;
      
      ??????????????????? i = i - 65248;
      
      ??????????????????? c = (char) i;
      
      ??????????????? }
    
  
    
      ???????????????
    
    
      
         // if the current character is a letter or "_" "+" "#"
        
      
      ??????????????? if (Character.isLetterOrDigit(c)
      
      ??????????????????????? || ((c == '_') || (c == '+') || (c == '#'))
      
      ?????????????????? ) {
      
      ??????????????????? if (length == 0) {
      
      ???????????????????????
    
    
      
         // "javaC1C2C3C4linux" <br>
        
        ??????????????????????? //????? ^--: the current character begin to token the ASCII
        
        ??????????????????????? // letter
        
      
      ??????????????????????? start = offset - 1;
      
      ??????????????????? } else if (tokenType == "double") {
      
      ??????????????????????? offset--;
      
      ??????????????????????? bufferIndex--;
      
      ??????????????????????? tokenType = "single";
    
  
    
      ??????????????????????? if (preIsTokened == true) {
      
      ???????????????????????????
      
         // there is only one non-ASCII has been stored
      
      
      ??????????????????????????? length = 0;
      
      ??????????????????????????? preIsTokened = false;
    
  
    
      ??????????????????????????? break;
      
      ??????????????????????? } else {
      
      ??????????????????????????? break;
      
      ??????????????????????? }
      
      ??????????????????? }
    
  
    
      ???????????????????
      
         // store the LowerCase(c) in the buffer
      
      
      ??????????????????? buffer[length++] = Character.toLowerCase(c);
      
      ??????????????????? tokenType = "single";
    
  
    
      ???????????????????
      
         // break the procedure if buffer overflowed!
      
      
      ??????????????????? if (length == MAX_WORD_LEN) {
      
      ??????????????????????? break;
      
      ??????????????????? }
      
      ??????????????? } else if (length > 0) {
      
      ??????????????????? if (preIsTokened == true) {
      
      ??????????????????????? length = 0;
      
      ??????????????????????? preIsTokened = false;
      
      ??????????????????? } else {
      
      ??????????????????????? break;
      
      ??????????????????? }
      
      ??????????????? }
      
      ??????????? } else {
      
      ??????????????
    
    
      
         // non-ASCII letter, eg."C1C2C3C4"
        
      
      ??????????????? if (Character.isLetter(c)) {
      
      ??????????????????? if (length == 0) {
      
      ??????????????????????? start = offset - 1;
      
      ??????????????????????? buffer[length++] = c;
      
      ??????????????????????? tokenType = "double";
      
      ??????????????????? } else {
      
      ??????????????????????? if (tokenType == "single") {
      
      ??????????????????????????? offset--;
      
      ??????????????????????????? bufferIndex--;
    
  
    
      ???????????????????????????
    
    
      
         //return the previous ASCII characters
        
      
      ??????????????????????????? break;
      
      ??????????????????????? } else {
      
      ??????????????????????????? buffer[length++] = c;
      
      ??????????????????????????? tokenType = "double";
    
  
    
      ??????????????????????????? if (length == 2) {
      
      ??????????????????????????????? offset--;
      
      ??????????????????????????????? bufferIndex--;
      
      ??????????????????????????????? preIsTokened = true;
    
  
    
      ??????????????????????????????? break;
      
      ??????????????????????????? }
      
      ??????????????????????? }
      
      ??????????????????? }
      
      ??????????????? } else if (length > 0) {
      
      ??????????????????? if (preIsTokened == true) {
      
      ??????????????????????? 
    
    
      
        // empty the buffer
        
      
      ??????????????????????? length = 0;
      
      ??????????????????????? preIsTokened = false;
      
      ??????????????????? } else {
      
      ??????????????????????? break;
      
      ??????????????????? }
      
      ??????????????? }
      
      ??????????? }
      
      ??????? }
    
  
    
      ??????? return new Token(new String(buffer, 0, length), start, start + length,
      
      ???????????????????????? tokenType
      
      ??????????????????????? );
      
      ??? }
      
      }
    
  
?
更多文章、技術交流、商務合作、聯繫博主
微信掃碼或搜索:z360901061
					微信掃一掃加我為好友
QQ號聯(lián)系: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元
					
