Index: test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java
===================================================================
--- test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java	(revision 0)
+++ test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java	(revision 0)
@@ -0,0 +1,108 @@
+package org.wikimedia.lsearch.analyzers;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Tests for {@link CJKFilter}. Each case feeds a string through a
+ * {@link WhitespaceAnalyzer} wrapped in a CJKFilter and compares the
+ * resulting tokens, as printed by Arrays.toString(), with the expected list.
+ * The expectations show runs of CJK characters coming out as overlapping
+ * two-character tokens and runs of other characters as a single token.
+ */
+public class CJKFilterTest extends TestCase {
+    /** Analyzer that feeds the filter; created anew in setUp() before each test. */
+    private Analyzer a;
+
+    public CJKFilterTest(String name){
+        super(name);
+    }
+
+    /**
+     * Creates the analyzer used by every test. JUnit 3 only calls the
+     * method spelled exactly setUp(), so it must not be named setup().
+     */
+    @Override
+    protected void setUp() throws Exception {
+        super.setUp();
+        a = new WhitespaceAnalyzer();
+    }
+
+    /** Empty input produces no tokens. */
+    public void testEmpty() throws Exception {
+        assertEquals("[]",tokens(""));
+    }
+
+    /** CJK-only input: one character stays whole, longer runs become bigrams. */
+    public void testCJK() throws Exception {
+        assertEquals("[(い,0,1)]",tokens("い"));
+        assertEquals("[(いわ,0,2)]",tokens("いわ"));
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4)]",tokens("いわさき"));
+    }
+
+    /** Non-CJK input is kept as a single token whatever its length. */
+    public void testNonCJK() throws Exception {
+        assertEquals("[(i,0,1)]",tokens("i"));
+        assertEquals("[(ic,0,2)]",tokens("ic"));
+        assertEquals("[(icic,0,4)]",tokens("icic"));
+    }
+
+    /** Non-CJK prefix followed by a CJK run. */
+    public void testNonCJKandCJK() throws Exception {
+        assertEquals("[(c,0,1), (カー,1,3), (ード,2,4)]",tokens("cカード"));
+        assertEquals("[(ic,0,2), (カー,2,4), (ード,3,5)]",tokens("icカード"));
+        assertEquals("[(icic,0,4), (カー,4,6), (ード,5,7)]",tokens("icicカード"));
+    }
+
+    /** CJK prefix followed by non-CJK text. */
+    public void testCJKandNonCJK() throws Exception {
+        assertEquals("[(き,0,1), (ic,1,3)]",tokens("きic"));
+        assertEquals("[(さき,0,2), (ic,2,4)]",tokens("さきic"));
+    }
+
+    /** A longer CJK run ending in non-CJK text of varying length. */
+    public void testEndWithNonCJK() throws Exception {
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (i,4,5)]",tokens("いわさきi"));
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (ic,4,6)]",tokens("いわさきic"));
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (icic,4,8)]",tokens("いわさきicic"));
+    }
+
+    /** Non-CJK text ending in a single CJK character. */
+    public void testEndWithCJK() throws Exception{
+        assertEquals("[(ic,0,2), (カ,2,3)]",tokens("icカ"));
+    }
+
+    /**
+     * Tokenizes text with the current analyzer and CJKFilter and returns
+     * the tokens formatted by Arrays.toString(). An IOException
+     * propagates so JUnit reports it with its full stack trace.
+     */
+    private String tokens(String text) throws IOException {
+        return Arrays.toString(tokensFromAnalysis(a,text,"contents"));
+    }
+
+    /**
+     * Wraps the analyzer's stream for the field in a CJKFilter and collects
+     * every token it returns until next() yields null.
+     */
+    private static Token[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException {
+        // NOTE(review): relies on a tokenStream(String, String) overload - confirm the Lucene build here provides it
+        TokenStream stream = analyzer.tokenStream(field, text);
+        CJKFilter cjkfilter = new CJKFilter(stream);
+        ArrayList<Token> tokenList = new ArrayList<Token>();
+        Token token;
+        while ((token = cjkfilter.next()) != null) {
+            tokenList.add(token);
+        }
+        return tokenList.toArray(new Token[tokenList.size()]);
+    }
+
+}
Index: src/org/wikimedia/lsearch/analyzers/CJKFilter.java
===================================================================
--- src/org/wikimedia/lsearch/analyzers/CJKFilter.java	(revision 81083)
+++ src/org/wikimedia/lsearch/analyzers/CJKFilter.java	(working copy)
@@ -44,23 +44,29 @@
 for(i=0,offset=0,len=0;i